Working prototype of autotuning

2025-11-29 17:28:03 +01:00 · 2018-08-22 16:20:18 +02:00
parent f60ceaa316
commit 803ce8ab70
33 changed files with 992 additions and 976 deletions
--- a/GPUSimulators/Autotuner.py
+++ b/GPUSimulators/Autotuner.py
@@ -0,0 +1,277 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements the different helper functions and 
+classes
+
+Copyright (C) 2018  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import os
+import gc
+import numpy as np
+import logging
+from socket import gethostname
+
+import pycuda.driver as cuda
+
+
+from GPUSimulators import Common, LxF, FORCE, HLL, HLL2, KP07, KP07_dimsplit, WAF
+
+class Autotuner:
+    def __init__(self, 
+                nx=2048, ny=2048, 
+                block_widths=range(8, 32, 2),
+                block_heights=range(8, 32, 2)):
+        logger = logging.getLogger(__name__)
+        self.filename = "autotuning_data_" + gethostname() + ".npz"
+        self.nx = nx
+        self.ny = ny
+        self.block_widths = block_widths
+        self.block_heights = block_heights
+        self.performance = {}
+
+
+    def benchmark(self, simulator, force=False):
+        logger = logging.getLogger(__name__)
+        
+        #Run through simulators and benchmark
+        key = str(simulator.__name__)
+        logger.info("Benchmarking %s to %s", key, self.filename)
+        
+        #If this simulator has been benchmarked already, skip it
+        if (force==False and os.path.isfile(self.filename)):
+            with np.load(self.filename) as data:
+                if key in data["simulators"]:
+                    logger.info("%s already benchmarked - skipping", key)
+                    return
+    
+        # Set arguments to send to the simulators during construction
+        context = Common.CudaContext(autotuning=False)
+        g = 9.81
+        h0, hu0, hv0, dx, dy, dt = Autotuner.gen_test_data(nx=self.nx, ny=self.ny, g=g)
+        arguments = {
+            'context': context,
+            'h0': h0, 'hu0': hu0, 'hv0': hv0,
+            'nx': self.nx, 'ny': self.ny,
+            'dx': dx, 'dy': dy, 'dt': 0.9*dt,
+            'g': g
+        } 
+             
+        # Load existing data into memory
+        benchmark_data = {
+                "simulators": [],
+        }
+        if (os.path.isfile(self.filename)):
+            with np.load(self.filename) as data:
+                for k, v in data.items():
+                    benchmark_data[k] = v
+   
+        # Run benchmark
+        benchmark_data[key + "_megacells"] = Autotuner.benchmark_single_simulator(simulator, arguments, self.block_widths, self.block_heights)
+        benchmark_data[key + "_block_widths"] = self.block_widths
+        benchmark_data[key + "_block_heights"] = self.block_heights
+        benchmark_data[key + "_arguments"] = str(arguments)
+        
+        existing_sims = benchmark_data["simulators"]
+        if (isinstance(existing_sims, np.ndarray)):
+            existing_sims = existing_sims.tolist()
+        if (key not in existing_sims):
+            benchmark_data["simulators"] = existing_sims + [key]
+
+        # Save to file
+        np.savez_compressed(self.filename, **benchmark_data)
+
+
+            
+    """
+    Function which reads a numpy file with autotuning data
+    and reports the maximum performance and block size
+    """
+    def get_peak_performance(self, simulator):
+        logger = logging.getLogger(__name__)
+        
+        assert issubclass(simulator, Simulator.BaseSimulator)
+        key = simulator.__name__
+        
+        if (key in self.performance):
+            return self.performance[key]
+        else:
+            #Run simulation if required
+            if (not os.path.isfile(self.filename)):
+                logger.debug("Could not get autotuned peak performance for %s: benchmarking", key)
+                self.benchmark(simulator)
+            
+            with np.load(self.filename) as data:
+                if key not in data['simulators']:
+                    logger.debug("Could not get autotuned peak performance for %s: benchmarking", key)
+                    data.close()
+                    self.benchmark(simulator)
+                    data = np.load(self.filename)
+                
+                def find_max_index(megacells):
+                    max_index = np.nanargmax(megacells)
+                    return np.unravel_index(max_index, megacells.shape)
+                
+                megacells = data[key + '_megacells']
+                block_widths = data[key + '_block_widths']
+                block_heights = data[key + '_block_heights']
+                j, i = find_max_index(megacells)
+                
+                self.performance[key] = { "block_width": block_widths[i],
+                                         "block_height": block_heights[j],
+                                         "megacells": megacells[j, i] }
+                logger.debug("Returning %s as peak performance parameters", self.performance[key])
+                return self.performance[key]
+        
+            #This should never happen
+            raise "Something wrong: Could not get autotuning data!"
+            return None
+        
+        
+                
+    """
+    Runs a set of benchmarks for a single simulator
+    """
+    def benchmark_single_simulator(simulator, arguments, block_widths, block_heights):
+        logger = logging.getLogger(__name__)
+        
+        megacells = np.empty((len(block_heights), len(block_widths)))
+        megacells.fill(np.nan)
+
+        logger.debug("Running %d benchmarks with %s", len(block_heights)*len(block_widths), simulator.__name__)
+        
+        sim_arguments = arguments.copy()
+                    
+        with Common.Timer(simulator.__name__) as t:
+            for j, block_height in enumerate(block_heights):
+                sim_arguments.update({'block_height': block_height})
+                for i, block_width in enumerate(block_widths):
+                    sim_arguments.update({'block_width': block_width})
+                    megacells[j, i] = Autotuner.run_benchmark(simulator, sim_arguments)
+                        
+
+        logger.debug("Completed %s in %f seconds", simulator.__name__, t.secs)
+
+        return megacells
+            
+            
+    """
+    Runs a benchmark, and returns the number of megacells achieved
+    """
+    def run_benchmark(simulator, arguments, timesteps=10, warmup_timesteps=2):
+        logger = logging.getLogger(__name__)
+        
+        #Initialize simulator
+        try:
+            sim = simulator(**arguments)
+        except:
+            #An exception raised - not possible to continue
+            logger.debug("Failed creating %s with arguments %s", simulator.__name__, str(arguments))
+            return np.nan
+        
+        #Create timer events
+        start = cuda.Event()
+        end = cuda.Event()
+        
+        #Warmup
+        for i in range(warmup_timesteps):
+            sim.stepEuler(sim.dt)
+            
+        #Run simulation with timer        
+        start.record(sim.stream)
+        for i in range(timesteps):
+            sim.stepEuler(sim.dt)
+        end.record(sim.stream)
+        
+        #Synchronize end event
+        end.synchronize()
+        
+        #Compute megacells
+        gpu_elapsed = end.time_since(start)*1.0e-3
+        megacells = (sim.nx*sim.ny*timesteps / (1000*1000)) / gpu_elapsed
+
+        #Sanity check solution
+        h, hu, hv = sim.download()
+        sane = True
+        sane = sane and Autotuner.sanity_check(h, 0.3, 0.7)
+        sane = sane and Autotuner.sanity_check(hu, -0.2, 0.2)
+        sane = sane and Autotuner.sanity_check(hv, -0.2, 0.2)
+        
+        if (sane):
+            logger.debug("%s [%d x %d] succeeded: %f megacells, gpu elapsed %f", simulator.__name__, arguments["block_width"], arguments["block_height"], megacells, gpu_elapsed)
+            return megacells
+        else:
+            logger.debug("%s [%d x %d] failed: gpu elapsed %f", simulator.__name__, arguments["block_width"], arguments["block_height"], gpu_elapsed)
+            return np.nan
+        
+        
+        
+    """
+    Generates test dataset
+    """
+    def gen_test_data(nx, ny, g):
+        width = 100.0
+        height = 100.0
+        dx = width / float(nx)
+        dy = height / float(ny)
+
+        x_center = dx*nx/2.0
+        y_center = dy*ny/2.0
+
+        #Create a gaussian "dam break" that will not form shocks
+        size = width / 5.0
+        dt = 10**10
+        
+        h  = np.zeros((ny, nx), dtype=np.float32); 
+        hu = np.zeros((ny, nx), dtype=np.float32);
+        hv = np.zeros((ny, nx), dtype=np.float32);
+        
+        x = dx*(np.arange(0, nx, dtype=np.float32)+0.5) - x_center
+        y = dy*(np.arange(0, ny, dtype=np.float32)+0.5) - y_center
+        xv, yv = np.meshgrid(x, y, sparse=False, indexing='xy')
+        r = np.sqrt(xv**2 + yv**2)
+        xv = None
+        yv = None
+        gc.collect()
+            
+        #Generate highres
+        h = 0.5 + 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size)
+        hu = 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size)
+        hv = 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size)
+        
+        scale = 0.7
+        max_h_estimate = 0.6
+        max_u_estimate = 0.1*np.sqrt(2.0)
+        dx = width/nx
+        dy = height/ny
+        dt = scale * min(dx, dy) / (max_u_estimate + np.sqrt(g*max_h_estimate))
+        
+        return h, hu, hv, dx, dy, dt
+        
+    """
+    Checks that a variable is "sane"
+    """
+    def sanity_check(variable, bound_min, bound_max):
+        maxval = np.amax(variable)
+        minval = np.amin(variable)
+        if (np.isnan(maxval) 
+                or np.isnan(minval)
+                or maxval > bound_max
+                or minval < bound_min):
+            return False
+        else:
+            return True
--- a/GPUSimulators/Common.py
+++ b/GPUSimulators/Common.py
@@ -0,0 +1,424 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements the different helper functions and 
+classes
+
+Copyright (C) 2018  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import os
+
+import numpy as np
+import time
+import re
+import io
+import hashlib
+import logging
+import gc
+
+import pycuda.compiler as cuda_compiler
+import pycuda.gpuarray
+import pycuda.driver as cuda
+
+from GPUSimulators import Autotuner
+
+"""
+Class which keeps track of time spent for a section of code
+"""
+class Timer(object):
+    def __init__(self, tag, log_level=logging.DEBUG):
+        self.tag = tag
+        self.log_level = log_level
+        self.logger = logging.getLogger(__name__)
+        
+    def __enter__(self):
+        self.start = time.time()
+        return self
+    
+    def __exit__(self, *args):
+        self.end = time.time()
+        self.secs = self.end - self.start
+        self.msecs = self.secs * 1000 # millisecs
+        self.logger.log(self.log_level, "%s: %f ms", self.tag, self.msecs)
+            
+            
+            
+            
+
+"""
+Class which keeps track of the CUDA context and some helper functions
+"""
+class CudaContext(object):
+    
+    def __init__(self, blocking=False, use_cache=True, autotuning=True):
+        self.blocking = blocking
+        self.use_cache = use_cache
+        self.logger =  logging.getLogger(__name__)
+        self.kernels = {}
+        
+        self.module_path = os.path.dirname(os.path.realpath(__file__))
+        
+        #Initialize cuda (must be first call to PyCUDA)
+        cuda.init(flags=0)
+        
+        self.logger.info("PyCUDA version %s", str(pycuda.VERSION_TEXT))
+        
+        #Print some info about CUDA
+        self.logger.info("CUDA version %s", str(cuda.get_version()))
+        self.logger.info("Driver version %s",  str(cuda.get_driver_version()))
+
+        self.cuda_device = cuda.Device(0)
+        self.logger.info("Using '%s' GPU", self.cuda_device.name())
+        self.logger.debug(" => compute capability: %s", str(self.cuda_device.compute_capability()))
+        self.logger.debug(" => memory: %d MB", self.cuda_device.total_memory() / (1024*1024))
+
+        # Create the CUDA context
+        if (self.blocking):
+            self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_BLOCKING_SYNC)
+            self.logger.warning("Using blocking context")
+        else:
+            self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_AUTO)
+        
+        self.logger.info("Created context handle <%s>", str(self.cuda_context.handle))
+        
+        #Create cache dir for cubin files
+        if (self.use_cache):
+            self.cache_path = os.path.join(self.module_path, "cuda_cache") 
+            if not os.path.isdir(self.cache_path):
+                os.mkdir(self.cache_path)
+            self.logger.debug("Using CUDA cache dir %s", self.cache_path)
+            
+        self.autotuner = None
+        if (autotuning):
+            self.logger.info("Autotuning enabled. It may take several minutes to run the code the first time: have patience")
+            self.autotuner = Autotuner.Autotuner()
+            
+    
+    def __del__(self, *args):
+        self.logger.info("Cleaning up CUDA context handle <%s>", str(self.cuda_context.handle))
+            
+        # Loop over all contexts in stack, and remove "this"
+        other_contexts = []
+        while (cuda.Context.get_current() != None):
+            context = cuda.Context.get_current()
+            if (context.handle != self.cuda_context.handle):
+                self.logger.debug("<%s> Popping <%s> (*not* ours)", str(self.cuda_context.handle), str(context.handle))
+                other_contexts = [context] + other_contexts
+                cuda.Context.pop()
+            else:
+                self.logger.debug("<%s> Popping <%s> (ours)", str(self.cuda_context.handle), str(context.handle))
+                cuda.Context.pop()
+
+        # Add all the contexts we popped that were not our own
+        for context in other_contexts:
+            self.logger.debug("<%s> Pushing <%s>", str(self.cuda_context.handle), str(context.handle))
+            cuda.Context.push(context)
+            
+        self.logger.debug("<%s> Detaching", str(self.cuda_context.handle))
+        self.cuda_context.detach()
+        
+        
+    def __str__(self):
+        return "CudaContext id " + str(self.cuda_context.handle)
+        
+    
+    def hash_kernel(kernel_filename, include_dirs):        
+        # Generate a kernel ID for our caches
+        num_includes = 0
+        max_includes = 100
+        kernel_hasher = hashlib.md5()
+        logger = logging.getLogger(__name__)
+        
+        # Loop over file and includes, and check if something has changed
+        files = [kernel_filename]
+        while len(files):
+        
+            if (num_includes > max_includes):
+                raise("Maximum number of includes reached - circular include in {:}?".format(kernel_filename))
+        
+            filename = files.pop()
+            
+            #logger.debug("Hashing %s", filename)
+                
+            modified = os.path.getmtime(filename)
+                
+            # Open the file
+            with io.open(filename, "r") as file:
+            
+                # Search for #inclue <something> and also hash the file
+                file_str = file.read()
+                kernel_hasher.update(file_str.encode('utf-8'))
+                kernel_hasher.update(str(modified).encode('utf-8'))
+                
+                #Find all includes
+                includes = re.findall('^\W*#include\W+(.+?)\W*$', file_str, re.M)
+                
+            # Loop over everything that looks like an include
+            for include_file in includes:
+                
+                #Search through include directories for the file
+                file_path = os.path.dirname(filename)
+                for include_path in [file_path] + include_dirs:
+                
+                    # If we find it, add it to list of files to check
+                    temp_path = os.path.join(include_path, include_file)
+                    if (os.path.isfile(temp_path)):
+                        files = files + [temp_path]
+                        num_includes = num_includes + 1 #For circular includes...
+                        break
+            
+        return kernel_hasher.hexdigest()
+    
+    """
+    Reads a text file and creates an OpenCL kernel from that
+    """
+    def get_prepared_kernel(self, kernel_filename, kernel_function_name, \
+                    prepared_call_args, \
+                    include_dirs=[], no_extern_c=True, 
+                    **kwargs):
+        """
+        Helper function to print compilation output
+        """
+        def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
+            self.logger.debug("Compilation returned %s", str(compile_success_bool))
+            if info_str:
+                self.logger.debug("Info: %s", info_str)
+            if error_str:
+                self.logger.debug("Error: %s", error_str)
+        
+        #self.logger.debug("Getting %s", kernel_filename)
+            
+        # Create a hash of the kernel (and its includes)
+        kwargs_hasher = hashlib.md5()
+        kwargs_hasher.update(str(kwargs).encode('utf-8'));
+        kwargs_hash = kwargs_hasher.hexdigest()
+        kwargs_hasher = None
+        root, ext = os.path.splitext(kernel_filename)
+        kernel_hash = root \
+                + "_" + CudaContext.hash_kernel( \
+                    os.path.join(self.module_path, kernel_filename), \
+                    include_dirs=[self.module_path] + include_dirs) \
+                + "_" + kwargs_hash \
+                + ext
+        cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)
+        
+        # If we have the kernel in our hashmap, return it
+        if (kernel_hash in self.kernels.keys()):
+            self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash)
+            return self.kernels[kernel_hash]
+        
+        # If we have it on disk, return it
+        elif (self.use_cache and os.path.isfile(cached_kernel_filename)):
+            self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash)
+                
+            with io.open(cached_kernel_filename, "rb") as file:
+                file_str = file.read()
+                module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler)
+                
+            kernel = module.get_function(kernel_function_name)
+            kernel.prepare(prepared_call_args)
+            self.kernels[kernel_hash] = kernel
+            return kernel
+            
+        # Otherwise, compile it from source
+        else:
+            self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash)
+                
+            #Create kernel string
+            kernel_string = ""
+            for key, value in kwargs.items():
+                kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
+            kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename))
+            if (self.use_cache):
+                with io.open(cached_kernel_filename + ".txt", "w") as file:
+                    file.write(kernel_string)
+                
+            
+            with Timer("compiler") as timer:
+                cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, no_extern_c=no_extern_c, cache_dir=False)
+                module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler)
+                if (self.use_cache):
+                    with io.open(cached_kernel_filename, "wb") as file:
+                        file.write(cubin)
+                
+            kernel = module.get_function(kernel_function_name)
+            kernel.prepare(prepared_call_args)
+            self.kernels[kernel_hash] = kernel
+            
+            
+            return kernel
+    
+    """
+    Clears the kernel cache (useful for debugging & development)
+    """
+    def clear_kernel_cache(self):
+        self.logger.debug("Clearing cache")
+        self.kernels = {}
+        gc.collect()
+        
+    """
+    Synchronizes all streams etc
+    """
+    def synchronize(self):
+        self.cuda_context.synchronize()
+        
+
+
+
+
+
+
+
+
+
+
+
+"""
+Class that holds data 
+"""
+class CudaArray2D:
+    """
+    Uploads initial data to the CL device
+    """
+    def __init__(self, stream, nx, ny, x_halo, y_halo, cpu_data):
+        self.logger =  logging.getLogger(__name__)
+        self.nx = nx
+        self.ny = ny
+        self.x_halo = x_halo
+        self.y_halo = y_halo
+        
+        nx_halo = nx + 2*x_halo
+        ny_halo = ny + 2*y_halo
+        
+        #self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny)
+        
+        #Make sure data is in proper format
+        assert np.issubdtype(cpu_data.dtype, np.float32), "Wrong datatype: %s" % str(cpu_data.dtype)
+        assert cpu_data.itemsize == 4, "Wrong size of data type"
+        assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"
+
+        #Upload data to the device
+        if (cpu_data.shape == (ny_halo, nx_halo)):
+            self.data = pycuda.gpuarray.to_gpu_async(cpu_data, stream=stream)
+        elif (cpu_data.shape == (self.ny, self.nx)):
+            #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
+            self.data = pycuda.gpuarray.empty((ny_halo, nx_halo), cpu_data.dtype)
+            #self.data.fill(0.0)
+            
+            #Create copy object from host to device
+            copy = cuda.Memcpy2D()
+            copy.set_src_host(cpu_data)
+            copy.set_dst_device(self.data.gpudata)
+            
+            #Set offsets and pitch of destination
+            copy.dst_x_in_bytes = self.x_halo*self.data.strides[1]
+            copy.dst_y = self.y_halo
+            copy.dst_pitch = self.data.strides[0]
+            
+            #Set width in bytes to copy for each row and
+            #number of rows to copy
+            copy.width_in_bytes = self.nx*cpu_data.itemsize
+            copy.height = self.ny
+            
+            #Perform the copy
+            copy(stream)
+            stream.synchronize()
+
+        else:
+            assert False, "Wrong data shape: %s vs %s / %s" % (str(cpu_data.shape), str((self.ny, self.nx)), str((ny_halo, nx_halo)))
+        
+        #self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
+        
+        
+    def __del__(self, *args):
+        #self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
+        self.data.gpudata.free()
+        self.data = None
+        
+    """
+    Enables downloading data from GPU to Python
+    """
+    def download(self, stream, async=False):
+        #self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
+        #Allocate host memory
+        #cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)
+        cpu_data = np.empty((self.ny, self.nx), dtype=np.float32)
+        
+        #Create copy object from device to host
+        copy = cuda.Memcpy2D()
+        copy.set_src_device(self.data.gpudata)
+        copy.set_dst_host(cpu_data)
+        
+        #Set offsets and pitch of source
+        copy.src_x_in_bytes = self.x_halo*self.data.strides[1]
+        copy.src_y = self.y_halo
+        copy.src_pitch = self.data.strides[0]
+        
+        #Set width in bytes to copy for each row and
+        #number of rows to copy
+        copy.width_in_bytes = self.nx*cpu_data.itemsize
+        copy.height = self.ny
+        
+        copy(stream)
+        if async==False:
+            stream.synchronize()
+        
+        return cpu_data
+
+        
+        
+        
+        
+        
+        
+        
+        
+"""
+A class representing an Arakawa A type (unstaggered, logically Cartesian) grid
+"""
+class SWEDataArakawaA:
+    """
+    Uploads initial data to the CL device
+    """
+    def __init__(self, stream, nx, ny, halo_x, halo_y, h0, hu0, hv0):
+        self.h0  = CudaArray2D(stream, nx, ny, halo_x, halo_y, h0)
+        self.hu0 = CudaArray2D(stream, nx, ny, halo_x, halo_y, hu0)
+        self.hv0 = CudaArray2D(stream, nx, ny, halo_x, halo_y, hv0)
+        
+        self.h1  = CudaArray2D(stream, nx, ny, halo_x, halo_y, h0)
+        self.hu1 = CudaArray2D(stream, nx, ny, halo_x, halo_y, hu0)
+        self.hv1 = CudaArray2D(stream, nx, ny, halo_x, halo_y, hv0)
+
+    """
+    Swaps the variables after a timestep has been completed
+    """
+    def swap(self):
+        self.h1,  self.h0  = self.h0,  self.h1
+        self.hu1, self.hu0 = self.hu0, self.hu1
+        self.hv1, self.hv0 = self.hv0, self.hv1
+        
+    """
+    Enables downloading data from CL device to Python
+    """
+    def download(self, stream):
+        h_cpu  = self.h0.download(stream, async=True)
+        hu_cpu = self.hu0.download(stream, async=True)
+        hv_cpu = self.hv0.download(stream, async=False)
+        
+        return h_cpu, hu_cpu, hv_cpu
+        
+        
--- a/GPUSimulators/FORCE.py
+++ b/GPUSimulators/FORCE.py
@@ -0,0 +1,96 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements the FORCE flux
+for the shallow water equations
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+#Import packages we need
+from GPUSimulators import Simulator
+
+
+
+        
+        
+        
+        
+        
+        
+
+
+"""
+Class that solves the SW equations 
+"""
+class FORCE (Simulator.BaseSimulator):
+
+    """
+    Initialization routine
+    h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
+    hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
+    hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
+    nx: Number of cells along x-axis
+    ny: Number of cells along y-axis
+    dx: Grid cell spacing along x-axis (20 000 m)
+    dy: Grid cell spacing along y-axis (20 000 m)
+    dt: Size of each timestep (90 s)
+    g: Gravitational accelleration (9.81 m/s^2)
+    """
+    def __init__(self, \
+                 context, \
+                 h0, hu0, hv0, \
+                 nx, ny, \
+                 dx, dy, dt, \
+                 g, \
+                 block_width=16, block_height=16):
+                 
+        # Call super constructor
+        super().__init__(context, \
+            h0, hu0, hv0, \
+            nx, ny, \
+            1, 1, \
+            dx, dy, dt, \
+            g, \
+            block_width, block_height);
+
+        #Get kernels
+        self.kernel = context.get_prepared_kernel("FORCE_kernel.cu", "FORCEKernel", \
+                                        "iiffffPiPiPiPiPiPi", \
+                                        BLOCK_WIDTH=self.local_size[0], \
+                                        BLOCK_HEIGHT=self.local_size[1])
+    
+    def __str__(self):
+        return "First order centered"
+        
+    def simulate(self, t_end):
+        return super().simulateEuler(t_end)
+        
+    def stepEuler(self, dt):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                self.nx, self.ny, \
+                self.dx, self.dy, dt, \
+                self.g, \
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
+        self.data.swap()
+        self.t += dt
+        
+        
--- a/GPUSimulators/FORCE_kernel.cu
+++ b/GPUSimulators/FORCE_kernel.cu
@@ -0,0 +1,156 @@
+/*
+This OpenCL kernel implements the classical Lax-Friedrichs scheme
+for the shallow water equations, with edge fluxes.
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include "common.cu"
+#include "fluxes/FirstOrderCentered.cu"
+
+
+/**
+  * Computes the flux along the x axis for all faces
+  */
+__device__ 
+void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
+                  const float g_, const float dx_, const float dt_) {
+                      
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    //Compute fluxes along the x axis
+    {
+        int j=ty;
+        const int l = j + 1; //Skip ghost cells
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
+            const int k = i;
+            
+            // Q at interface from the right and left
+            const float3 Qp = make_float3(Q[0][l][k+1],
+                                          Q[1][l][k+1],
+                                          Q[2][l][k+1]);
+            const float3 Qm = make_float3(Q[0][l][k],
+                                          Q[1][l][k],
+                                          Q[2][l][k]);
+                                       
+            // Computed flux
+            const float3 flux = FORCE_1D_flux(Qm, Qp, g_, dx_, dt_);
+            F[0][j][i] = flux.x;
+            F[1][j][i] = flux.y;
+            F[2][j][i] = flux.z;
+        }
+    }
+}
+
+
+/**
+  * Computes the flux along the y axis for all faces
+  */
+__device__ 
+void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
+                  const float g_, const float dy_, const float dt_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    //Compute fluxes along the y axis
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
+        const int l = j;
+        {
+            int i=tx;
+            const int k = i + 1; //Skip ghost cells
+            
+            // Q at interface from the right and left
+            // Note that we swap hu and hv
+            const float3 Qp = make_float3(Q[0][l+1][k],
+                                          Q[2][l+1][k],
+                                          Q[1][l+1][k]);
+            const float3 Qm = make_float3(Q[0][l][k],
+                                          Q[2][l][k],
+                                          Q[1][l][k]);
+
+            // Computed flux
+            // Note that we swap back
+            const float3 flux = FORCE_1D_flux(Qm, Qp, g_, dy_, dt_);
+            G[0][j][i] = flux.x;
+            G[1][j][i] = flux.z;
+            G[2][j][i] = flux.y;
+        }
+    }
+}
+
+
+extern "C" {
+__global__ void FORCEKernel(
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+        float g_,
+        
+        //Input h^n
+        float* h0_ptr_, int h0_pitch_,
+        float* hu0_ptr_, int hu0_pitch_,
+        float* hv0_ptr_, int hv0_pitch_,
+        
+        //Output h^{n+1}
+        float* h1_ptr_, int h1_pitch_,
+        float* hu1_ptr_, int hu1_pitch_,
+        float* hv1_ptr_, int hv1_pitch_) {
+    
+    __shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
+    __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
+    
+    
+    //Read into shared memory
+    readBlock1(h0_ptr_, h0_pitch_,
+               hu0_ptr_, hu0_pitch_,
+               hv0_ptr_, hv0_pitch_,
+               Q, nx_, ny_);
+    __syncthreads();
+        
+    
+    //Set boundary conditions
+    noFlowBoundary1(Q, nx_, ny_);
+    __syncthreads();
+    
+    //Compute flux along x, and evolve
+    computeFluxF(Q, F, g_, dx_, dt_);
+    __syncthreads();
+    evolveF1(Q, F, nx_, ny_, dx_, dt_);
+    __syncthreads();
+    
+    //Set boundary conditions
+    noFlowBoundary1(Q, nx_, ny_);
+    __syncthreads();
+    
+    //Compute flux along y, and evolve
+    computeFluxG(Q, F, g_, dy_, dt_);
+    __syncthreads();
+    evolveG1(Q, F, nx_, ny_, dy_, dt_);
+    __syncthreads();
+    
+    //Write to main memory
+    writeBlock1(h1_ptr_, h1_pitch_,
+                hu1_ptr_, hu1_pitch_,
+                hv1_ptr_, hv1_pitch_,
+                Q, nx_, ny_);
+}
+
+} // extern "C"
--- a/GPUSimulators/HLL.py
+++ b/GPUSimulators/HLL.py
@@ -0,0 +1,91 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements the HLL flux
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+#Import packages we need
+from GPUSimulators import Simulator
+
+
+
+
+
+
+
+"""
+Class that solves the SW equations using the Harten-Lax -van Leer approximate Riemann solver
+"""
+class HLL (Simulator.BaseSimulator):
+
+    """
+    Initialization routine
+    h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
+    hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
+    hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
+    nx: Number of cells along x-axis
+    ny: Number of cells along y-axis
+    dx: Grid cell spacing along x-axis (20 000 m)
+    dy: Grid cell spacing along y-axis (20 000 m)
+    dt: Size of each timestep (90 s)
+    g: Gravitational accelleration (9.81 m/s^2)
+    """
+    def __init__(self, \
+                 context, \
+                 h0, hu0, hv0, \
+                 nx, ny, \
+                 dx, dy, dt, \
+                 g, \
+                 block_width=16, block_height=16):
+                 
+        # Call super constructor
+        super().__init__(context, \
+            h0, hu0, hv0, \
+            nx, ny, \
+            1, 1, \
+            dx, dy, dt, \
+            g, \
+            block_width, block_height);
+
+        #Get kernels
+        self.kernel = context.get_prepared_kernel("HLL_kernel.cu", "HLLKernel", \
+                                        "iiffffPiPiPiPiPiPi", \
+                                        BLOCK_WIDTH=self.local_size[0], \
+                                        BLOCK_HEIGHT=self.local_size[1])
+    
+    def __str__(self):
+        return "Harten-Lax-van Leer"
+    
+    def simulate(self, t_end):
+        return super().simulateEuler(t_end)
+        
+    def stepEuler(self, dt):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                self.nx, self.ny, \
+                self.dx, self.dy, dt, \
+                self.g, \
+                self.data.h0.data.gpudata,  self.data.h0.data.strides[0],  \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata,  self.data.h1.data.strides[0],  \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])        
+        self.data.swap()
+        self.t += dt
+        
+
--- a/GPUSimulators/HLL2.py
+++ b/GPUSimulators/HLL2.py
@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements the 2nd order HLL flux
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+#Import packages we need
+import numpy as np
+from GPUSimulators import Simulator
+
+
+        
+        
+        
+        
+        
+
+
+"""
+Class that solves the SW equations using the Forward-Backward linear scheme
+"""
+class HLL2 (Simulator.BaseSimulator):
+
+    """
+    Initialization routine
+    h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
+    hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
+    hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
+    nx: Number of cells along x-axis
+    ny: Number of cells along y-axis
+    dx: Grid cell spacing along x-axis (20 000 m)
+    dy: Grid cell spacing along y-axis (20 000 m)
+    dt: Size of each timestep (90 s)
+    g: Gravitational accelleration (9.81 m/s^2)
+    """
+    def __init__(self, \
+                 context, \
+                 h0, hu0, hv0, \
+                 nx, ny, \
+                 dx, dy, dt, \
+                 g, \
+                 theta=1.8, \
+                 block_width=16, block_height=16):
+                 
+        # Call super constructor
+        super().__init__(context, \
+            h0, hu0, hv0, \
+            nx, ny, \
+            2, 2, \
+            dx, dy, dt, \
+            g, \
+            block_width, block_height);
+            
+        self.theta = np.float32(theta)
+
+        #Get kernels
+        self.kernel = context.get_prepared_kernel("HLL2_kernel.cu", "HLL2Kernel", \
+                                        "iifffffiPiPiPiPiPiPi", \
+                                        BLOCK_WIDTH=self.local_size[0], \
+                                        BLOCK_HEIGHT=self.local_size[1])
+        
+    def __str__(self):
+        return "Harten-Lax-van Leer (2nd order)"
+    
+    def simulate(self, t_end):
+        return super().simulateDimsplit(t_end)
+        
+    def stepEuler(self, dt):
+        return self.stepDimsplitXY(dt)
+                
+    def stepDimsplitXY(self, dt):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                self.nx, self.ny, \
+                self.dx, self.dy, dt, \
+                self.g, \
+                self.theta, \
+                np.int32(0), \
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
+        self.data.swap()
+        self.t += dt
+            
+    def stepDimsplitYX(self, dt):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                self.nx, self.ny, \
+                self.dx, self.dy, dt, \
+                self.g, \
+                self.theta, \
+                np.int32(1), \
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
+        self.data.swap()
+        self.t += dt
+        
--- a/GPUSimulators/HLL2_kernel.cu
+++ b/GPUSimulators/HLL2_kernel.cu
@@ -0,0 +1,233 @@
+/*
+This OpenCL kernel implements the second order HLL flux
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include "common.cu"
+#include "limiters.cu"
+#include "fluxes/HartenLaxVanLeer.cu"
+
+
+
+
+
+
+
+/**
+  * Computes the flux along the x axis for all faces
+  */
+__device__
+void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
+                  const float g_, const float dx_, const float dt_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    {
+        const int j=ty;
+        const int l = j + 2; //Skip ghost cells
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
+            const int k = i + 1;
+            // Reconstruct point values of Q at the left and right hand side 
+            // of the cell for both the left (i) and right (i+1) cell 
+            const float3 Q_rl = make_float3(Q[0][l][k+1] - 0.5f*Qx[0][j][i+1],
+                                            Q[1][l][k+1] - 0.5f*Qx[1][j][i+1],
+                                            Q[2][l][k+1] - 0.5f*Qx[2][j][i+1]);
+            const float3 Q_rr = make_float3(Q[0][l][k+1] + 0.5f*Qx[0][j][i+1],
+                                            Q[1][l][k+1] + 0.5f*Qx[1][j][i+1],
+                                            Q[2][l][k+1] + 0.5f*Qx[2][j][i+1]);
+                                         
+            const float3 Q_ll = make_float3(Q[0][l][k] - 0.5f*Qx[0][j][i],
+                                            Q[1][l][k] - 0.5f*Qx[1][j][i],
+                                            Q[2][l][k] - 0.5f*Qx[2][j][i]);
+            const float3 Q_lr = make_float3(Q[0][l][k] + 0.5f*Qx[0][j][i],
+                                            Q[1][l][k] + 0.5f*Qx[1][j][i],
+                                            Q[2][l][k] + 0.5f*Qx[2][j][i]);
+                                        
+            //Evolve half a timestep (predictor step)
+            const float3 Q_r_bar = Q_rl + dt_/(2.0f*dx_) * (F_func(Q_rl, g_) - F_func(Q_rr, g_));
+            const float3 Q_l_bar = Q_lr + dt_/(2.0f*dx_) * (F_func(Q_ll, g_) - F_func(Q_lr, g_));
+
+            // Compute flux based on prediction
+            const float3 flux = HLL_flux(Q_l_bar, Q_r_bar, g_);
+            
+            //Write to shared memory
+            F[0][j][i] = flux.x;
+            F[1][j][i] = flux.y;
+            F[2][j][i] = flux.z;
+        }
+    }
+}
+
+
+
+
+
+/**
+  * Computes the flux along the x axis for all faces
+  */
+__device__
+void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
+                  const float g_, const float dy_, const float dt_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
+        const int l = j + 1;
+        { 
+            int i=tx;
+            const int k = i + 2; //Skip ghost cells
+            // Reconstruct point values of Q at the left and right hand side 
+            // of the cell for both the left (i) and right (i+1) cell 
+            //NOte that hu and hv are swapped ("transposing" the domain)!
+            const float3 Q_rl = make_float3(Q[0][l+1][k] - 0.5f*Qy[0][j+1][i],
+                                            Q[2][l+1][k] - 0.5f*Qy[2][j+1][i],
+                                            Q[1][l+1][k] - 0.5f*Qy[1][j+1][i]);
+            const float3 Q_rr = make_float3(Q[0][l+1][k] + 0.5f*Qy[0][j+1][i],
+                                            Q[2][l+1][k] + 0.5f*Qy[2][j+1][i],
+                                            Q[1][l+1][k] + 0.5f*Qy[1][j+1][i]);
+                                        
+            const float3 Q_ll = make_float3(Q[0][l][k] - 0.5f*Qy[0][j][i],
+                                            Q[2][l][k] - 0.5f*Qy[2][j][i],
+                                            Q[1][l][k] - 0.5f*Qy[1][j][i]);
+            const float3 Q_lr = make_float3(Q[0][l][k] + 0.5f*Qy[0][j][i],
+                                            Q[2][l][k] + 0.5f*Qy[2][j][i],
+                                            Q[1][l][k] + 0.5f*Qy[1][j][i]);
+                                     
+            //Evolve half a timestep (predictor step)
+            const float3 Q_r_bar = Q_rl + dt_/(2.0f*dy_) * (F_func(Q_rl, g_) - F_func(Q_rr, g_));
+            const float3 Q_l_bar = Q_lr + dt_/(2.0f*dy_) * (F_func(Q_ll, g_) - F_func(Q_lr, g_));
+            
+            // Compute flux based on prediction
+            const float3 flux = HLL_flux(Q_l_bar, Q_r_bar, g_);
+            
+            //Write to shared memory
+            //Note that we here swap hu and hv back to the original
+            G[0][j][i] = flux.x;
+            G[1][j][i] = flux.z;
+            G[2][j][i] = flux.y;
+        }
+    }
+}
+
+
+
+
+
+
+
+extern "C" {
+__global__ void HLL2Kernel(
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+        float g_,
+        
+        float theta_,
+        
+        int step_,
+        
+        //Input h^n
+        float* h0_ptr_, int h0_pitch_,
+        float* hu0_ptr_, int hu0_pitch_,
+        float* hv0_ptr_, int hv0_pitch_,
+        
+        //Output h^{n+1}
+        float* h1_ptr_, int h1_pitch_,
+        float* hu1_ptr_, int hu1_pitch_,
+        float* hv1_ptr_, int hv1_pitch_) {
+            
+    //Shared memory variables
+    __shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
+    __shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
+    __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
+    
+    
+    
+    
+    //Read into shared memory
+    readBlock2(h0_ptr_, h0_pitch_,
+               hu0_ptr_, hu0_pitch_,
+               hv0_ptr_, hv0_pitch_,
+               Q, nx_, ny_);
+    __syncthreads();
+    
+    //Set boundary conditions
+    noFlowBoundary2(Q, nx_, ny_);
+    __syncthreads();
+    
+    //Step 0 => evolve x first, then y
+    if (step_ == 0) {
+        //Compute fluxes along the x axis and evolve
+        minmodSlopeX(Q, Qx, theta_);
+        __syncthreads();
+        computeFluxF(Q, Qx, F, g_, dx_, dt_);
+        __syncthreads();
+        evolveF2(Q, F, nx_, ny_, dx_, dt_);
+        __syncthreads();
+        
+        //Set boundary conditions
+        noFlowBoundary2(Q, nx_, ny_);
+        __syncthreads();
+        
+        //Compute fluxes along the y axis and evolve
+        minmodSlopeY(Q, Qx, theta_);
+        __syncthreads();
+        computeFluxG(Q, Qx, F, g_, dy_, dt_);
+        __syncthreads();
+        evolveG2(Q, F, nx_, ny_, dy_, dt_);
+        __syncthreads();
+    }
+    //Step 1 => evolve y first, then x
+    else {
+        //Compute fluxes along the y axis and evolve
+        minmodSlopeY(Q, Qx, theta_);
+        __syncthreads();
+        computeFluxG(Q, Qx, F, g_, dy_, dt_);
+        __syncthreads();
+        evolveG2(Q, F, nx_, ny_, dy_, dt_);
+        __syncthreads();
+        
+        //Set boundary conditions
+        noFlowBoundary2(Q, nx_, ny_);
+        __syncthreads();
+        
+        //Compute fluxes along the x axis and evolve
+        minmodSlopeX(Q, Qx, theta_);
+        __syncthreads();
+        computeFluxF(Q, Qx, F, g_, dx_, dt_);
+        __syncthreads();
+        evolveF2(Q, F, nx_, ny_, dx_, dt_);
+        __syncthreads();
+    }
+    
+    
+    
+    
+    // Write to main memory for all internal cells
+    writeBlock2(h1_ptr_, h1_pitch_,
+                hu1_ptr_, hu1_pitch_,
+                hv1_ptr_, hv1_pitch_,
+                Q, nx_, ny_);
+}
+
+} // extern "C"
--- a/GPUSimulators/HLL_kernel.cu
+++ b/GPUSimulators/HLL_kernel.cu
@@ -0,0 +1,170 @@
+/*
+This GPU kernel implements the HLL flux
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+#include "common.cu"
+#include "fluxes/HartenLaxVanLeer.cu"
+
+
+
+
+
+/**
+  * Computes the flux along the x axis for all faces
+  */
+__device__
+void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
+                  const float g_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    {
+        const int j=ty;
+        const int l = j + 1; //Skip ghost cells     
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) { 
+            const int k = i;
+            
+            const float3 Q_l  = make_float3(Q[0][l][k  ], Q[1][l][k  ], Q[2][l][k  ]);
+            const float3 Q_r  = make_float3(Q[0][l][k+1], Q[1][l][k+1], Q[2][l][k+1]);
+            
+            const float3 flux = HLL_flux(Q_l, Q_r, g_);
+            
+            //Write to shared memory
+            F[0][j][i] = flux.x;
+            F[1][j][i] = flux.y;
+            F[2][j][i] = flux.z;
+        }
+    }
+}
+
+
+
+
+
+/**
+  * Computes the flux along the y axis for all faces
+  */
+__device__
+void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
+                  const float g_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
+        const int l = j;
+        {
+            const int i=tx;
+            const int k = i + 1; //Skip ghost cells
+            
+            //NOte that hu and hv are swapped ("transposing" the domain)!
+            const float3 Q_l = make_float3(Q[0][l  ][k], Q[2][l  ][k], Q[1][l  ][k]);
+            const float3 Q_r = make_float3(Q[0][l+1][k], Q[2][l+1][k], Q[1][l+1][k]);
+                                       
+            // Computed flux
+            const float3 flux = HLL_flux(Q_l, Q_r, g_);
+            
+            //Write to shared memory
+            //Note that we here swap hu and hv back to the original
+            G[0][j][i] = flux.x;
+            G[1][j][i] = flux.z;
+            G[2][j][i] = flux.y;
+        }
+    }
+}
+
+
+
+
+
+
+
+
+
+
+
+
+extern "C" {
+    
+__global__ void HLLKernel(
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+        float g_,
+        
+        //Input h^n
+        float* h0_ptr_, int h0_pitch_,
+        float* hu0_ptr_, int hu0_pitch_,
+        float* hv0_ptr_, int hv0_pitch_,
+        
+        //Output h^{n+1}
+        float* h1_ptr_, int h1_pitch_,
+        float* hu1_ptr_, int hu1_pitch_,
+        float* hv1_ptr_, int hv1_pitch_) {
+            
+    const int block_width = BLOCK_WIDTH;
+    const int block_height = BLOCK_HEIGHT;
+    
+    //Shared memory variables
+    __shared__ float Q[3][block_height+2][block_width+2];
+    __shared__ float F[3][block_height+1][block_width+1];
+    
+    
+    //Read into shared memory
+    readBlock1(h0_ptr_, h0_pitch_,
+               hu0_ptr_, hu0_pitch_,
+               hv0_ptr_, hv0_pitch_,
+               Q, nx_, ny_);
+    __syncthreads();
+
+    noFlowBoundary1(Q, nx_, ny_);
+    __syncthreads();
+    
+    //Compute F flux
+    computeFluxF(Q, F, g_);
+    __syncthreads();
+    evolveF1(Q, F, nx_, ny_, dx_, dt_);
+    __syncthreads();
+    
+    //Set boundary conditions
+    noFlowBoundary1(Q, nx_, ny_);
+    __syncthreads();
+    
+    //Compute G flux
+    computeFluxG(Q, F, g_);
+    __syncthreads();
+    evolveG1(Q, F, nx_, ny_, dy_, dt_);
+    __syncthreads();
+    
+    
+    //Q[0][get_local_id(1) + 1][get_local_id(0) + 1] += 0.1;
+    
+    
+    
+    // Write to main memory for all internal cells
+    writeBlock1(h1_ptr_, h1_pitch_,
+                hu1_ptr_, hu1_pitch_,
+                hv1_ptr_, hv1_pitch_,
+                Q, nx_, ny_);
+}
+
+} // extern "C"
--- a/GPUSimulators/IPythonMagic.py
+++ b/GPUSimulators/IPythonMagic.py
@@ -0,0 +1,118 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements helpers for IPython / Jupyter and CUDA
+
+Copyright (C) 2018  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import logging
+
+from IPython.core import magic_arguments
+from IPython.core.magic import line_magic, Magics, magics_class
+import pycuda.driver as cuda
+
+from GPUSimulators import Common
+
+
+@magics_class
+class MyIPythonMagic(Magics): 
+    @line_magic
+    def cuda_context_handler(self, context_name):
+        self.logger =  logging.getLogger(__name__)
+        
+        self.logger.debug("Registering %s as a global context", context_name)
+        
+        if context_name in self.shell.user_ns.keys():
+            self.logger.debug("Context already registered! Ignoring")
+            return
+        else:
+            self.logger.debug("Creating context")
+            #self.shell.ex(context_name + " = Common.CudaContext(blocking=False)")
+            self.shell.user_ns[context_name] = Common.CudaContext(blocking=False)
+        
+        # this function will be called on exceptions in any cell
+        def custom_exc(shell, etype, evalue, tb, tb_offset=None):
+            self.logger.exception("Exception caught: Resetting to CUDA context %s", context_name)
+            while (cuda.Context.get_current() != None):
+                context = cuda.Context.get_current()
+                self.logger.info("Popping <%s>", str(context.handle))
+                cuda.Context.pop()
+
+            if context_name in self.shell.user_ns.keys():
+                self.logger.info("Pushing <%s>", str(self.shell.user_ns[context_name].cuda_context.handle))
+                #self.shell.ex(context_name + ".cuda_context.push()")
+                self.shell.user_ns[context_name].cuda_context.push()
+            else:
+                self.logger.error("No CUDA context called %s found (something is wrong)", context_name)
+                self.logger.error("CUDA will not work now")
+
+            self.logger.debug("==================================================================")
+            
+            # still show the error within the notebook, don't just swallow it
+            shell.showtraceback((etype, evalue, tb), tb_offset=tb_offset)
+
+        # this registers a custom exception handler for the whole current notebook
+        get_ipython().set_custom_exc((Exception,), custom_exc)
+        
+        
+        # Handle CUDA context when exiting python
+        import atexit
+        def exitfunc():
+            self.logger.info("Exitfunc: Resetting CUDA context stack")
+            while (cuda.Context.get_current() != None):
+                context = cuda.Context.get_current()
+                self.logger.info("`-> Popping <%s>", str(context.handle))
+                cuda.Context.pop()
+            self.logger.debug("==================================================================")
+        atexit.register(exitfunc)
+        
+    @line_magic
+    @magic_arguments.magic_arguments()
+    @magic_arguments.argument(
+        '--out', '-o', type=str, default='output.log', help='The filename to store the log to')
+    @magic_arguments.argument(
+        '--level', '-l', type=int, default=20, help='The level of logging to screen [0, 50]')
+    @magic_arguments.argument(
+        '--file_level', '-f', type=int, default=10, help='The level of logging to file [0, 50]')
+    def setup_logging(self, line):
+        args = magic_arguments.parse_argstring(self.setup_logging, line)
+        import sys
+        
+        #Get root logger
+        logger = logging.getLogger('')
+        logger.setLevel(min(args.level, args.file_level))
+
+        #Add log to screen
+        ch = logging.StreamHandler()
+        ch.setLevel(args.level)
+        logger.addHandler(ch)
+        logger.log(args.level, "Console logger using level %s", logging.getLevelName(args.level))
+        
+        #Add log to file
+        logger.log(args.level, "File logger using level %s to %s", logging.getLevelName(args.file_level), args.out)
+        fh = logging.FileHandler(args.out)
+        formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s')
+        fh.setFormatter(formatter)
+        fh.setLevel(args.file_level)
+        logger.addHandler(fh)
+        
+        logger.info("Python version %s", sys.version)
+
+# Register 
+ip = get_ipython()
+ip.register_magics(MyIPythonMagic)
+
--- a/GPUSimulators/KP07.py
+++ b/GPUSimulators/KP07.py
@@ -0,0 +1,112 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements the Kurganov-Petrova numerical scheme 
+for the shallow water equations, described in 
+A. Kurganov & Guergana Petrova
+A Second-Order Well-Balanced Positivity Preserving Central-Upwind
+Scheme for the Saint-Venant System Communications in Mathematical
+Sciences, 5 (2007), 133-160. 
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+#Import packages we need
+import numpy as np
+from GPUSimulators import Simulator
+
+
+
+
+
+"""
+Class that solves the SW equations using the Forward-Backward linear scheme
+"""
+class KP07 (Simulator.BaseSimulator):
+
+    """
+    Initialization routine
+    h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
+    hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
+    hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
+    nx: Number of cells along x-axis
+    ny: Number of cells along y-axis
+    dx: Grid cell spacing along x-axis (20 000 m)
+    dy: Grid cell spacing along y-axis (20 000 m)
+    dt: Size of each timestep (90 s)
+    g: Gravitational accelleration (9.81 m/s^2)
+    """
+    def __init__(self, \
+                 context, \
+                 h0, hu0, hv0, \
+                 nx, ny, \
+                 dx, dy, dt, \
+                 g, \
+                 theta=1.3, \
+                 block_width=16, block_height=16):
+                 
+        # Call super constructor
+        super().__init__(context, \
+            h0, hu0, hv0, \
+            nx, ny, \
+            2, 2, \
+            dx, dy, dt, \
+            g, \
+            block_width, block_height);
+            
+        self.theta = np.float32(theta)
+
+        #Get kernels
+        self.kernel = context.get_prepared_kernel("KP07_kernel.cu", "KP07Kernel", \
+                                        "iifffffiPiPiPiPiPiPi", \
+                                        BLOCK_WIDTH=self.local_size[0], \
+                                        BLOCK_HEIGHT=self.local_size[1])
+        
+    def __str__(self):
+        return "Kurganov-Petrova 2007"
+    
+    def simulate(self, t_end):
+        return super().simulateRK(t_end, 2)
+        
+    def substepRK(self, dt, substep):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                self.nx, self.ny, \
+                self.dx, self.dy, dt, \
+                self.g, \
+                self.theta, \
+                np.int32(substep), \
+                self.data.h0.data.gpudata,  self.data.h0.data.strides[0],  \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata,  self.data.h1.data.strides[0],  \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
+        self.data.swap()
+        
+    def stepEuler(self, dt):
+        self.substepRK(dt, 0)
+        self.t += dt
+        
+    def stepRK(self, dt, order):
+        if (order != 2):
+            raise NotImplementedError("Only second order implemented")
+        self.substepRK(dt, 0)
+        self.substepRK(dt, 1)
+        self.t += dt
+    
+    def download(self):
+        return self.data.download(self.stream)
+
--- a/GPUSimulators/KP07_dimsplit.py
+++ b/GPUSimulators/KP07_dimsplit.py
@@ -0,0 +1,119 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements the Kurganov-Petrova numerical scheme 
+for the shallow water equations, described in 
+A. Kurganov & Guergana Petrova
+A Second-Order Well-Balanced Positivity Preserving Central-Upwind
+Scheme for the Saint-Venant System Communications in Mathematical
+Sciences, 5 (2007), 133-160. 
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+#Import packages we need
+import numpy as np
+from GPUSimulators import Simulator
+
+
+
+
+
+"""
+Class that solves the SW equations using the dimentionally split KP07 scheme
+"""
+class KP07_dimsplit (Simulator.BaseSimulator):
+
+    """
+    Initialization routine
+    h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
+    hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
+    hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
+    nx: Number of cells along x-axis
+    ny: Number of cells along y-axis
+    dx: Grid cell spacing along x-axis (20 000 m)
+    dy: Grid cell spacing along y-axis (20 000 m)
+    dt: Size of each timestep (90 s)
+    g: Gravitational accelleration (9.81 m/s^2)
+    """
+    def __init__(self, \
+                 context, \
+                 h0, hu0, hv0, \
+                 nx, ny, \
+                 dx, dy, dt, \
+                 g, \
+                 theta=1.3, \
+                 block_width=16, block_height=16):
+                 
+        # Call super constructor
+        super().__init__(context, \
+            h0, hu0, hv0, \
+            nx, ny, \
+            2, 2, \
+            dx, dy, dt, \
+            g, \
+            block_width, block_height);
+            
+        self.theta = np.float32(theta)
+
+        #Get kernels
+        self.kernel = context.get_prepared_kernel("KP07_dimsplit_kernel.cu", "KP07DimsplitKernel", \
+                                        "iifffffiPiPiPiPiPiPi", \
+                                        BLOCK_WIDTH=self.local_size[0], \
+                                        BLOCK_HEIGHT=self.local_size[1])
+    
+    def __str__(self):
+        return "Kurganov-Petrova 2007 dimensionally split"
+    
+    def simulate(self, t_end):
+        return super().simulateDimsplit(t_end)
+    
+    def stepEuler(self, dt):
+        return self.stepDimsplitXY(dt)
+    
+    def stepDimsplitXY(self, dt):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                self.nx, self.ny, \
+                self.dx, self.dy, dt, \
+                self.g, \
+                self.theta, \
+                np.int32(0), \
+                self.data.h0.data.gpudata,  self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata,  self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
+        self.data.swap()
+        self.t += dt
+    
+    def stepDimsplitYX(self, dt):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                self.nx, self.ny, \
+                self.dx, self.dy, dt, \
+                self.g, \
+                self.theta, \
+                np.int32(1), \
+                self.data.h0.data.gpudata,  self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata,  self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
+        self.data.swap()
+        self.t += dt
+        
+        
--- a/GPUSimulators/KP07_dimsplit_kernel.cu
+++ b/GPUSimulators/KP07_dimsplit_kernel.cu
@@ -0,0 +1,225 @@
+/*
+This OpenCL kernel implements the Kurganov-Petrova numerical scheme 
+for the shallow water equations, described in 
+A. Kurganov & Guergana Petrova
+A Second-Order Well-Balanced Positivity Preserving Central-Upwind
+Scheme for the Saint-Venant System Communications in Mathematical
+Sciences, 5 (2007), 133-160. 
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+#include "common.cu"
+#include "limiters.cu"
+#include "fluxes/CentralUpwind.cu"
+
+
+__device__
+void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
+                  const float g_, const float dx_, const float dt_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    {
+        int j=ty;
+        const int l = j + 2; //Skip ghost cells
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
+            const int k = i + 1;
+            // Reconstruct point values of Q at the left and right hand side 
+            // of the cell for both the left (i) and right (i+1) cell 
+            const float3 Q_rl = make_float3(Q[0][l][k+1] - 0.5f*Qx[0][j][i+1],
+                                            Q[1][l][k+1] - 0.5f*Qx[1][j][i+1],
+                                            Q[2][l][k+1] - 0.5f*Qx[2][j][i+1]);
+            const float3 Q_rr = make_float3(Q[0][l][k+1] + 0.5f*Qx[0][j][i+1],
+                                            Q[1][l][k+1] + 0.5f*Qx[1][j][i+1],
+                                            Q[2][l][k+1] + 0.5f*Qx[2][j][i+1]);
+                                         
+            const float3 Q_ll = make_float3(Q[0][l][k] - 0.5f*Qx[0][j][i],
+                                            Q[1][l][k] - 0.5f*Qx[1][j][i],
+                                            Q[2][l][k] - 0.5f*Qx[2][j][i]);
+            const float3 Q_lr = make_float3(Q[0][l][k] + 0.5f*Qx[0][j][i],
+                                            Q[1][l][k] + 0.5f*Qx[1][j][i],
+                                            Q[2][l][k] + 0.5f*Qx[2][j][i]);
+                                    
+            //Evolve half a timestep (predictor step)
+            const float3 Q_r_bar = Q_rl + dt_/(2.0f*dx_) * (F_func(Q_rl, g_) - F_func(Q_rr, g_));
+            const float3 Q_l_bar = Q_lr + dt_/(2.0f*dx_) * (F_func(Q_ll, g_) - F_func(Q_lr, g_));
+
+            // Compute flux based on prediction
+            const float3 flux = CentralUpwindFlux(Q_l_bar, Q_r_bar, g_);
+            
+            //Write to shared memory
+            F[0][j][i] = flux.x;
+            F[1][j][i] = flux.y;
+            F[2][j][i] = flux.z;
+        }
+    }    
+}
+
+__device__
+void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
+                  const float g_, const float dy_, const float dt_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
+        const int l = j + 1;
+        {
+            int i=tx;
+            const int k = i + 2; //Skip ghost cells
+            // Reconstruct point values of Q at the left and right hand side 
+            // of the cell for both the left (i) and right (i+1) cell 
+            //NOte that hu and hv are swapped ("transposing" the domain)!
+            const float3 Q_rl = make_float3(Q[0][l+1][k] - 0.5f*Qy[0][j+1][i],
+                                            Q[2][l+1][k] - 0.5f*Qy[2][j+1][i],
+                                            Q[1][l+1][k] - 0.5f*Qy[1][j+1][i]);
+            const float3 Q_rr = make_float3(Q[0][l+1][k] + 0.5f*Qy[0][j+1][i],
+                                            Q[2][l+1][k] + 0.5f*Qy[2][j+1][i],
+                                            Q[1][l+1][k] + 0.5f*Qy[1][j+1][i]);
+                                        
+            const float3 Q_ll = make_float3(Q[0][l][k] - 0.5f*Qy[0][j][i],
+                                            Q[2][l][k] - 0.5f*Qy[2][j][i],
+                                            Q[1][l][k] - 0.5f*Qy[1][j][i]);
+            const float3 Q_lr = make_float3(Q[0][l][k] + 0.5f*Qy[0][j][i],
+                                            Q[2][l][k] + 0.5f*Qy[2][j][i],
+                                            Q[1][l][k] + 0.5f*Qy[1][j][i]);
+                                     
+            //Evolve half a timestep (predictor step)
+            const float3 Q_r_bar = Q_rl + dt_/(2.0f*dy_) * (F_func(Q_rl, g_) - F_func(Q_rr, g_));
+            const float3 Q_l_bar = Q_lr + dt_/(2.0f*dy_) * (F_func(Q_ll, g_) - F_func(Q_lr, g_));
+            
+            // Compute flux based on prediction
+            const float3 flux = CentralUpwindFlux(Q_l_bar, Q_r_bar, g_);
+            
+            //Write to shared memory
+            //Note that we here swap hu and hv back to the original
+            G[0][j][i] = flux.x;
+            G[1][j][i] = flux.z;
+            G[2][j][i] = flux.y;
+        }
+    }
+}
+
+
+
+
+/**
+  * This unsplit kernel computes the 2D numerical scheme with a TVD RK2 time integration scheme
+  */
+extern "C" {
+__global__ void KP07DimsplitKernel(
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+        float g_,
+        
+        float theta_,
+        
+        int step_,
+        
+        //Input h^n
+        float* h0_ptr_, int h0_pitch_,
+        float* hu0_ptr_, int hu0_pitch_,
+        float* hv0_ptr_, int hv0_pitch_,
+        
+        //Output h^{n+1}
+        float* h1_ptr_, int h1_pitch_,
+        float* hu1_ptr_, int hu1_pitch_,
+        float* hv1_ptr_, int hv1_pitch_) {
+        
+        
+    //Shared memory variables
+    __shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
+    __shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
+    __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
+    
+    
+    
+    //Read into shared memory
+    readBlock2(h0_ptr_, h0_pitch_,
+               hu0_ptr_, hu0_pitch_,
+               hv0_ptr_, hv0_pitch_,
+               Q, nx_, ny_);
+    __syncthreads();
+    
+    
+    //Fix boundary conditions
+    noFlowBoundary2(Q, nx_, ny_);
+    __syncthreads();
+    
+    
+    
+    //Step 0 => evolve x first, then y
+    if (step_ == 0) {
+        //Compute fluxes along the x axis and evolve
+        minmodSlopeX(Q, Qx, theta_);
+        __syncthreads();
+        computeFluxF(Q, Qx, F, g_, dx_, dt_);
+        __syncthreads();
+        evolveF2(Q, F, nx_, ny_, dx_, dt_);
+        __syncthreads();
+        
+        //Set boundary conditions
+        noFlowBoundary2(Q, nx_, ny_);
+        __syncthreads();
+        
+        //Compute fluxes along the y axis and evolve
+        minmodSlopeY(Q, Qx, theta_);
+        __syncthreads();
+        computeFluxG(Q, Qx, F, g_, dy_, dt_);
+        __syncthreads();
+        evolveG2(Q, F, nx_, ny_, dy_, dt_);
+        __syncthreads();
+    }
+    //Step 1 => evolve y first, then x
+    else {
+        //Compute fluxes along the y axis and evolve
+        minmodSlopeY(Q, Qx, theta_);
+        __syncthreads();
+        computeFluxG(Q, Qx, F, g_, dy_, dt_);
+        __syncthreads();
+        evolveG2(Q, F, nx_, ny_, dy_, dt_);
+        __syncthreads();
+        
+        //Set boundary conditions
+        noFlowBoundary2(Q, nx_, ny_);
+        __syncthreads();
+        
+        //Compute fluxes along the x axis and evolve
+        minmodSlopeX(Q, Qx, theta_);
+        __syncthreads();
+        computeFluxF(Q, Qx, F, g_, dx_, dt_);
+        __syncthreads();
+        evolveF2(Q, F, nx_, ny_, dx_, dt_);
+        __syncthreads();
+    }
+    
+    
+    // Write to main memory for all internal cells
+    writeBlock2(h1_ptr_, h1_pitch_,
+                hu1_ptr_, hu1_pitch_,
+                hv1_ptr_, hv1_pitch_,
+                Q, nx_, ny_);
+}
+
+} // extern "C"
--- a/GPUSimulators/KP07_kernel.cu
+++ b/GPUSimulators/KP07_kernel.cu
@@ -0,0 +1,210 @@
+/*
+This OpenCL kernel implements the Kurganov-Petrova numerical scheme 
+for the shallow water equations, described in 
+A. Kurganov & Guergana Petrova
+A Second-Order Well-Balanced Positivity Preserving Central-Upwind
+Scheme for the Saint-Venant System Communications in Mathematical
+Sciences, 5 (2007), 133-160. 
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+#include "common.cu"
+#include "limiters.cu"
+#include "fluxes/CentralUpwind.cu"
+
+
+__device__
+void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
+                  const float g_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    {
+        int j=ty;
+        const int l = j + 2; //Skip ghost cells
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
+            const int k = i + 1;
+            // Q at interface from the right and left
+            const float3 Qp = make_float3(Q[0][l][k+1] - 0.5f*Qx[0][j][i+1],
+                                          Q[1][l][k+1] - 0.5f*Qx[1][j][i+1],
+                                          Q[2][l][k+1] - 0.5f*Qx[2][j][i+1]);
+            const float3 Qm = make_float3(Q[0][l][k  ] + 0.5f*Qx[0][j][i  ],
+                                          Q[1][l][k  ] + 0.5f*Qx[1][j][i  ],
+                                          Q[2][l][k  ] + 0.5f*Qx[2][j][i  ]);
+                                       
+            // Computed flux
+            const float3 flux = CentralUpwindFlux(Qm, Qp, g_);
+            F[0][j][i] = flux.x;
+            F[1][j][i] = flux.y;
+            F[2][j][i] = flux.z;
+        }
+    }    
+}
+
+__device__
+void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
+                  const float g_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
+        const int l = j + 1;
+        {
+            int i=tx;
+            const int k = i + 2; //Skip ghost cells
+            // Q at interface from the right and left
+            // Note that we swap hu and hv
+            const float3 Qp = make_float3(Q[0][l+1][k] - 0.5f*Qy[0][j+1][i],
+                                          Q[2][l+1][k] - 0.5f*Qy[2][j+1][i],
+                                          Q[1][l+1][k] - 0.5f*Qy[1][j+1][i]);
+            const float3 Qm = make_float3(Q[0][l  ][k] + 0.5f*Qy[0][j  ][i],
+                                          Q[2][l  ][k] + 0.5f*Qy[2][j  ][i],
+                                          Q[1][l  ][k] + 0.5f*Qy[1][j  ][i]);
+                                       
+            // Computed flux
+            // Note that we swap back
+            const float3 flux = CentralUpwindFlux(Qm, Qp, g_);
+            G[0][j][i] = flux.x;
+            G[1][j][i] = flux.z;
+            G[2][j][i] = flux.y;
+        }
+    }
+}
+
+
+
+
+/**
+  * This unsplit kernel computes the 2D numerical scheme with a TVD RK2 time integration scheme
+  */
+extern "C" {
+__global__ void KP07Kernel(
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+        float g_,
+        
+        float theta_,
+        
+        int step_,
+        
+        //Input h^n
+        float* h0_ptr_, int h0_pitch_,
+        float* hu0_ptr_, int hu0_pitch_,
+        float* hv0_ptr_, int hv0_pitch_,
+        
+        //Output h^{n+1}
+        float* h1_ptr_, int h1_pitch_,
+        float* hu1_ptr_, int hu1_pitch_,
+        float* hv1_ptr_, int hv1_pitch_) {
+        
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+
+    //Index of cell within domain
+    const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
+    const int tj = get_global_id(1) + 2;
+    
+    //Shared memory variables
+    __shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
+    
+    //The following slightly wastes memory, but enables us to reuse the 
+    //funcitons in common.opencl
+    __shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
+    __shared__ float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
+    __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
+    __shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
+    
+    
+    
+    //Read into shared memory
+    readBlock2(h0_ptr_, h0_pitch_,
+               hu0_ptr_, hu0_pitch_,
+               hv0_ptr_, hv0_pitch_,
+               Q, nx_, ny_);
+    __syncthreads();
+    
+    
+    //Fix boundary conditions
+    noFlowBoundary2(Q, nx_, ny_);
+    __syncthreads();
+    
+    
+    //Reconstruct slopes along x and axis
+    minmodSlopeX(Q, Qx, theta_);
+    minmodSlopeY(Q, Qy, theta_);
+    __syncthreads();
+    
+    
+    //Compute fluxes along the x and y axis
+    computeFluxF(Q, Qx, F, g_);
+    computeFluxG(Q, Qy, G, g_);
+    __syncthreads();
+    
+    
+    //Sum fluxes and advance in time for all internal cells
+    if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) {
+        const int i = tx + 2; //Skip local ghost cells, i.e., +2
+        const int j = ty + 2;
+        
+        const float h1  = Q[0][j][i] + (F[0][ty][tx] - F[0][ty  ][tx+1]) * dt_ / dx_ 
+                                     + (G[0][ty][tx] - G[0][ty+1][tx  ]) * dt_ / dy_;
+        const float hu1 = Q[1][j][i] + (F[1][ty][tx] - F[1][ty  ][tx+1]) * dt_ / dx_ 
+                                     + (G[1][ty][tx] - G[1][ty+1][tx  ]) * dt_ / dy_;
+        const float hv1 = Q[2][j][i] + (F[2][ty][tx] - F[2][ty  ][tx+1]) * dt_ / dx_ 
+                                     + (G[2][ty][tx] - G[2][ty+1][tx  ]) * dt_ / dy_;
+
+        float* const h_row  = (float*) ((char*) h1_ptr_ + h1_pitch_*tj);
+        float* const hu_row = (float*) ((char*) hu1_ptr_ + hu1_pitch_*tj);
+        float* const hv_row = (float*) ((char*) hv1_ptr_ + hv1_pitch_*tj);
+                    
+        if  (step_ == 0) {
+            //First step of RK2 ODE integrator
+            
+            h_row[ti] = h1;
+            hu_row[ti] = hu1;
+            hv_row[ti] = hv1;
+        }
+        else if (step_ == 1) {
+            //Second step of RK2 ODE integrator
+            
+            //First read Q^n
+            const float h_a  = h_row[ti];
+            const float hu_a = hu_row[ti];
+            const float hv_a = hv_row[ti];
+            
+            //Compute Q^n+1
+            const float h_b  = 0.5f*(h_a + h1);
+            const float hu_b = 0.5f*(hu_a + hu1);
+            const float hv_b = 0.5f*(hv_a + hv1);
+            
+            //Write to main memory
+            h_row[ti] = h_b;
+            hu_row[ti] = hu_b;
+            hv_row[ti] = hv_b;
+        }
+    }
+}
+} //extern "C"
--- a/GPUSimulators/LxF.py
+++ b/GPUSimulators/LxF.py
@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements the classical Lax-Friedrichs numerical
+scheme for the shallow water equations
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+#Import packages we need
+from GPUSimulators import Simulator
+
+
+
+
+
+
+
+"""
+Class that solves the SW equations using the Lax Friedrichs scheme
+"""
+class LxF (Simulator.BaseSimulator):
+
+    """
+    Initialization routine
+    h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
+    hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
+    hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
+    nx: Number of cells along x-axis
+    ny: Number of cells along y-axis
+    dx: Grid cell spacing along x-axis (20 000 m)
+    dy: Grid cell spacing along y-axis (20 000 m)
+    dt: Size of each timestep (90 s)
+    g: Gravitational accelleration (9.81 m/s^2)
+    """
+    def __init__(self, \
+                 context, \
+                 h0, hu0, hv0, \
+                 nx, ny, \
+                 dx, dy, dt, \
+                 g, \
+                 block_width=16, block_height=16):
+                 
+        # Call super constructor
+        super().__init__(context, \
+            h0, hu0, hv0, \
+            nx, ny, \
+            1, 1, \
+            dx, dy, dt, \
+            g, \
+            block_width, block_height);
+
+        # Get kernels
+        self.kernel = context.get_prepared_kernel("LxF_kernel.cu", "LxFKernel", \
+                                        "iiffffPiPiPiPiPiPi", \
+                                        BLOCK_WIDTH=self.local_size[0], \
+                                        BLOCK_HEIGHT=self.local_size[1])
+        
+    def __str__(self):
+        return "Lax Friedrichs"
+        
+    def simulate(self, t_end):
+        return super().simulateEuler(t_end)
+        
+    def stepEuler(self, dt):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                self.nx, self.ny, \
+                self.dx, self.dy, dt, \
+                self.g, \
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
+        self.data.swap()
+        self.t += dt
+  
+
--- a/GPUSimulators/LxF_kernel.cu
+++ b/GPUSimulators/LxF_kernel.cu
@@ -0,0 +1,173 @@
+/*
+This OpenCL kernel implements the classical Lax-Friedrichs scheme
+for the shallow water equations, with edge fluxes.
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include "common.cu"
+#include "fluxes/LaxFriedrichs.cu"
+
+
+/**
+  * Computes the flux along the x axis for all faces
+  */
+template <int block_width, int block_height>
+__device__ 
+void computeFluxF(float Q[3][block_height+2][block_width+2],
+                  float F[3][block_height][block_width+1],
+                  const float g_, const float dx_, const float dt_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    {
+        const int j=ty;
+        const int l = j + 1; //Skip ghost cells
+        for (int i=tx; i<block_width+1; i+=block_width) {
+            const int k = i;
+            
+            // Q at interface from the right and left
+            const float3 Qp = make_float3(Q[0][l][k+1],
+                                          Q[1][l][k+1],
+                                          Q[2][l][k+1]);
+            const float3 Qm = make_float3(Q[0][l][k],
+                                          Q[1][l][k],
+                                          Q[2][l][k]);
+                                       
+            // Computed flux
+            const float3 flux = LxF_2D_flux(Qm, Qp, g_, dx_, dt_);
+            F[0][j][i] = flux.x;
+            F[1][j][i] = flux.y;
+            F[2][j][i] = flux.z;
+        }
+    }
+}
+
+
+/**
+  * Computes the flux along the y axis for all faces
+  */ 
+template <int block_width, int block_height>
+__device__
+void computeFluxG(float Q[3][block_height+2][block_width+2],
+                  float G[3][block_height+1][block_width],
+                  const float g_, const float dy_, const float dt_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    for (int j=ty; j<block_height+1; j+=block_height) {
+        const int l = j;
+        {
+            const int i=tx;
+            const int k = i + 1; //Skip ghost cells
+            
+            // Q at interface from the right and left
+            // Note that we swap hu and hv
+            const float3 Qp = make_float3(Q[0][l+1][k],
+                                          Q[2][l+1][k],
+                                          Q[1][l+1][k]);
+            const float3 Qm = make_float3(Q[0][l][k],
+                                          Q[2][l][k],
+                                          Q[1][l][k]);
+
+            // Computed flux
+            // Note that we swap back
+            const float3 flux = LxF_2D_flux(Qm, Qp, g_, dy_, dt_);
+            G[0][j][i] = flux.x;
+            G[1][j][i] = flux.z;
+            G[2][j][i] = flux.y;
+        }
+    }  
+}
+
+
+extern "C" {
+__global__ 
+void LxFKernel(
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+        float g_,
+        
+        //Input h^n
+        float* h0_ptr_, int h0_pitch_,
+        float* hu0_ptr_, int hu0_pitch_,
+        float* hv0_ptr_, int hv0_pitch_,
+        
+        //Output h^{n+1}
+        float* h1_ptr_, int h1_pitch_,
+        float* hu1_ptr_, int hu1_pitch_,
+        float* hv1_ptr_, int hv1_pitch_) {
+            
+    const int block_width = BLOCK_WIDTH;
+    const int block_height = BLOCK_HEIGHT;
+            
+    //Index of cell within domain
+    const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
+    const int tj = get_global_id(1) + 1;
+    
+    __shared__ float Q[3][block_height+2][block_width+2];
+    __shared__ float F[3][block_height][block_width+1];
+    __shared__ float G[3][block_height+1][block_width];
+    
+    //Read into shared memory
+    readBlock1(h0_ptr_, h0_pitch_,
+               hu0_ptr_, hu0_pitch_,
+               hv0_ptr_, hv0_pitch_,
+               Q, nx_, ny_);
+    __syncthreads();
+    
+    //Set boundary conditions
+    noFlowBoundary1(Q, nx_, ny_);
+    __syncthreads();
+    
+    
+    //Compute fluxes along the x and y axis
+    computeFluxF<block_width, block_height>(Q, F, g_, dx_, dt_);
+    computeFluxG<block_width, block_height>(Q, G, g_, dy_, dt_);
+    __syncthreads();
+    
+    
+    //Evolve for all internal cells
+    if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) {
+        //Index of thread within block
+        const int tx = get_local_id(0);
+        const int ty = get_local_id(1);
+        
+        const int i = tx + 1; //Skip local ghost cells, i.e., +1
+        const int j = ty + 1;
+        
+        const float h1  = Q[0][j][i] + (F[0][ty][tx] - F[0][ty  ][tx+1]) * dt_ / dx_ 
+                                     + (G[0][ty][tx] - G[0][ty+1][tx  ]) * dt_ / dy_;
+        const float hu1 = Q[1][j][i] + (F[1][ty][tx] - F[1][ty  ][tx+1]) * dt_ / dx_ 
+                                     + (G[1][ty][tx] - G[1][ty+1][tx  ]) * dt_ / dy_;
+        const float hv1 = Q[2][j][i] + (F[2][ty][tx] - F[2][ty  ][tx+1]) * dt_ / dx_ 
+                                     + (G[2][ty][tx] - G[2][ty+1][tx  ]) * dt_ / dy_;
+
+        float* const h_row  = (float*) ((char*) h1_ptr_ + h1_pitch_*tj);
+        float* const hu_row = (float*) ((char*) hu1_ptr_ + hu1_pitch_*tj);
+        float* const hv_row = (float*) ((char*) hv1_ptr_ + hv1_pitch_*tj);
+        
+        h_row[ti] = h1;
+        hu_row[ti] = hu1;
+        hv_row[ti] = hv1;
+    }
+}
+
+} // extern "C"
+
--- a/GPUSimulators/Simulator.py
+++ b/GPUSimulators/Simulator.py
@@ -0,0 +1,197 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements the classical Lax-Friedrichs numerical
+scheme for the shallow water equations
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+#Import packages we need
+import numpy as np
+import logging
+
+import pycuda.compiler as cuda_compiler
+import pycuda.gpuarray
+import pycuda.driver as cuda
+
+from GPUSimulators import Common
+
+
+class BaseSimulator:
+    """
+    Initialization routine
+    context: GPU context to use
+    kernel_wrapper: wrapper function of GPU kernel
+    h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
+    hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
+    hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
+    nx: Number of cells along x-axis
+    ny: Number of cells along y-axis
+    dx: Grid cell spacing along x-axis (20 000 m)
+    dy: Grid cell spacing along y-axis (20 000 m)
+    dt: Size of each timestep (90 s)
+    g: Gravitational accelleration (9.81 m/s^2)
+    """
+    def __init__(self, \
+                 context, \
+                 h0, hu0, hv0, \
+                 nx, ny, \
+                 ghost_cells_x, ghost_cells_y, \
+                 dx, dy, dt, \
+                 g, \
+                 block_width, block_height):
+        #Get logger
+        self.logger = logging.getLogger(__name__ + "." + self.__class__.__name__)
+        
+        self.context = context
+        
+        if (self.context.autotuner):
+            peak_configuration = self.context.autotuner.get_peak_performance(self.__class__)
+            block_width = int(peak_configuration["block_width"])
+            block_height = int(peak_configuration["block_height"])
+            self.logger.debug("Used autotuning to get block size [%d x %d]", block_width, block_height)
+        
+        #Create a CUDA stream
+        self.stream = cuda.Stream()
+        
+        #Create data by uploading to device
+        self.data = Common.SWEDataArakawaA(self.stream, \
+                            nx, ny, \
+                            ghost_cells_x, ghost_cells_y, \
+                            h0, hu0, hv0)
+                           
+        #Save input parameters
+        #Notice that we need to specify them in the correct dataformat for the
+        #GPU kernel
+        self.nx = np.int32(nx)
+        self.ny = np.int32(ny)
+        self.dx = np.float32(dx)
+        self.dy = np.float32(dy)
+        self.dt = np.float32(dt)
+        self.g = np.float32(g) 
+        
+        #Keep track of simulation time
+        self.t = 0.0;
+                            
+        #Compute kernel launch parameters
+        self.local_size = (block_width, block_height, 1) 
+        self.global_size = ( \
+                       int(np.ceil(self.nx / float(self.local_size[0]))), \
+                       int(np.ceil(self.ny / float(self.local_size[1]))) \
+                      ) 
+                      
+    """
+    Function which simulates forward in time using the default simulation type
+    """
+    def simulate(self, t_end):
+        raise(exceptions.NotImplementedError("Needs to be implemented in subclass"))
+                      
+    """ 
+    Function which simulates t_end seconds using forward Euler
+    Requires that the stepEuler functionality is implemented in the subclasses
+    """
+    def simulateEuler(self, t_end):
+        with Common.Timer(self.__class__.__name__ + ".simulateEuler") as t:
+            # Compute number of timesteps to perform
+            n = int(t_end / self.dt + 1)
+            
+            for i in range(0, n):
+                # Compute timestep for "this" iteration
+                local_dt = np.float32(min(self.dt, t_end-i*self.dt))
+                
+                # Stop if end reached (should not happen)
+                if (local_dt <= 0.0):
+                    break
+            
+                # Step with forward Euler 
+                self.stepEuler(local_dt)
+            
+        self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds", self.__class__.__name__, t_end, self.t, n, t.secs)
+        return self.t, n
+        
+    """
+    Function which simulates t_end seconds using Runge-Kutta 2
+    Requires that the stepRK functionality is implemented in the subclasses
+    """
+    def simulateRK(self, t_end, order):
+        with Common.Timer(self.__class__.__name__ + ".simulateRK") as t:
+            # Compute number of timesteps to perform
+            n = int(t_end / self.dt + 1)
+            
+            for i in range(0, n):
+                # Compute timestep for "this" iteration
+                local_dt = np.float32(min(self.dt, t_end-i*self.dt))
+                
+                # Stop if end reached (should not happen)
+                if (local_dt <= 0.0):
+                    break
+            
+                # Perform all the Runge-Kutta substeps
+                self.stepRK(local_dt, order)
+            
+        self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds", self.__class__.__name__, t_end, self.t, n, t.secs)
+        return self.t, n
+        
+    """
+    Function which simulates t_end seconds using second order dimensional splitting (XYYX)
+    Requires that the stepDimsplitX and stepDimsplitY functionality is implemented in the subclasses
+    """
+    def simulateDimsplit(self, t_end):
+        with Common.Timer(self.__class__.__name__ + ".simulateDimsplit") as t:
+            # Compute number of timesteps to perform
+            n = int(t_end / (2.0*self.dt) + 1)
+            
+            for i in range(0, n):
+                # Compute timestep for "this" iteration
+                local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt))
+                
+                # Stop if end reached (should not happen)
+                if (local_dt <= 0.0):
+                    break
+                
+                # Perform the dimensional split substeps
+                self.stepDimsplitXY(local_dt)
+                self.stepDimsplitYX(local_dt)
+            
+        self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds", self.__class__.__name__, t_end, self.t, 2*n, t.secs)
+        return self.t, 2*n
+        
+    
+    """
+    Function which performs one single timestep of size dt using forward euler
+    """
+    def stepEuler(self, dt):
+        raise(NotImplementedError("Needs to be implemented in subclass"))
+        
+    def stepRK(self, dt, substep):
+        raise(NotImplementedError("Needs to be implemented in subclass"))
+    
+    def stepDimsplitXY(self, dt):
+        raise(NotImplementedError("Needs to be implemented in subclass"))
+        
+    def stepDimsplitYX(self, dt):
+        raise(NotImplementedError("Needs to be implemented in subclass"))
+        
+    def sim_time(self):
+        return self.t
+
+    def download(self):
+        return self.data.download(self.stream)
+        
+    def synchronize(self):
+        self.stream.synchronize()
+
--- a/GPUSimulators/WAF.py
+++ b/GPUSimulators/WAF.py
@@ -0,0 +1,108 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements the Weighted average flux (WAF) described in
+E. Toro, Shock-Capturing methods for free-surface shallow flows, 2001
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+#Import packages we need
+import numpy as np
+from GPUSimulators import Simulator
+
+
+
+
+
+"""
+Class that solves the SW equations using the Forward-Backward linear scheme
+"""
+class WAF (Simulator.BaseSimulator):
+
+    """
+    Initialization routine
+    h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
+    hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
+    hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
+    nx: Number of cells along x-axis
+    ny: Number of cells along y-axis
+    dx: Grid cell spacing along x-axis (20 000 m)
+    dy: Grid cell spacing along y-axis (20 000 m)
+    dt: Size of each timestep (90 s)
+    g: Gravitational accelleration (9.81 m/s^2)
+    """
+    def __init__(self, \
+                 context, \
+                 h0, hu0, hv0, \
+                 nx, ny, \
+                 dx, dy, dt, \
+                 g, \
+                 block_width=16, block_height=16):
+                 
+        # Call super constructor
+        super().__init__(context, \
+            h0, hu0, hv0, \
+            nx, ny, \
+            2, 2, \
+            dx, dy, dt, \
+            g, \
+            block_width, block_height);
+
+        #Get kernels
+        self.kernel = context.get_prepared_kernel("WAF_kernel.cu", "WAFKernel", \
+                                        "iiffffiPiPiPiPiPiPi", \
+                                        BLOCK_WIDTH=self.local_size[0], \
+                                        BLOCK_HEIGHT=self.local_size[1])
+    
+    def __str__(self):
+        return "Weighted average flux"
+    
+    def simulate(self, t_end):
+        return super().simulateDimsplit(t_end)
+        
+    def stepEuler(self, dt):
+        return self.stepDimsplitXY(dt)
+        
+    def stepDimsplitXY(self, dt):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                        self.nx, self.ny, \
+                        self.dx, self.dy, dt, \
+                        self.g, \
+                        np.int32(0), \
+                        self.data.h0.data.gpudata,  self.data.h0.data.strides[0],  \
+                        self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                        self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                        self.data.h1.data.gpudata,  self.data.h1.data.strides[0],  \
+                        self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                        self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
+        self.data.swap()
+        self.t += dt
+        
+    def stepDimsplitYX(self, dt):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                        self.nx, self.ny, \
+                        self.dx, self.dy, dt, \
+                        self.g, \
+                        np.int32(1), \
+                        self.data.h0.data.gpudata,  self.data.h0.data.strides[0],  \
+                        self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                        self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                        self.data.h1.data.gpudata,  self.data.h1.data.strides[0],  \
+                        self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                        self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
+        self.data.swap()
+        self.t += dt
--- a/GPUSimulators/WAF_kernel.cu
+++ b/GPUSimulators/WAF_kernel.cu
@@ -0,0 +1,199 @@
+/*
+This OpenCL kernel implements the Kurganov-Petrova numerical scheme 
+for the shallow water equations, described in 
+A. Kurganov & Guergana Petrova
+A Second-Order Well-Balanced Positivity Preserving Central-Upwind
+Scheme for the Saint-Venant System Communications in Mathematical
+Sciences, 5 (2007), 133-160. 
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+#include "common.cu"
+#include "fluxes/WeightedAverageFlux.cu"
+
+
+
+/**
+  * Computes the flux along the x axis for all faces
+  */
+__device__
+void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
+                  const float g_, const float dx_, const float dt_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+                      
+    {
+        int j=ty; 
+        const int l = j + 2; //Skip ghost cells
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
+            const int k = i + 1;
+            
+            // Q at interface from the right and left
+            const float3 Ql2 = make_float3(Q[0][l][k-1], Q[1][l][k-1], Q[2][l][k-1]);
+            const float3 Ql1 = make_float3(Q[0][l][k  ], Q[1][l][k  ], Q[2][l][k  ]);
+            const float3 Qr1 = make_float3(Q[0][l][k+1], Q[1][l][k+1], Q[2][l][k+1]);
+            const float3 Qr2 = make_float3(Q[0][l][k+2], Q[1][l][k+2], Q[2][l][k+2]);
+
+            // Computed flux
+            const float3 flux = WAF_1D_flux(Ql2, Ql1, Qr1, Qr2, g_, dx_, dt_);
+            F[0][j][i] = flux.x;
+            F[1][j][i] = flux.y;
+            F[2][j][i] = flux.z;
+        }
+    }
+}
+
+
+
+
+
+
+
+
+/**
+  * Computes the flux along the y axis for all faces
+  */
+__device__
+void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
+                  const float g_, const float dy_, const float dt_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    //Compute fluxes along the y axis
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
+        const int l = j + 1;
+        {
+            int i=tx;
+            const int k = i + 2; //Skip ghost cells
+            // Q at interface from the right and left
+            // Note that we swap hu and hv
+            const float3 Ql2 = make_float3(Q[0][l-1][k], Q[2][l-1][k], Q[1][l-1][k]);
+            const float3 Ql1 = make_float3(Q[0][l  ][k], Q[2][l  ][k], Q[1][l  ][k]);
+            const float3 Qr1 = make_float3(Q[0][l+1][k], Q[2][l+1][k], Q[1][l+1][k]);
+            const float3 Qr2 = make_float3(Q[0][l+2][k], Q[2][l+2][k], Q[1][l+2][k]);
+            
+            // Computed flux
+            // Note that we swap back
+            const float3 flux = WAF_1D_flux(Ql2, Ql1, Qr1, Qr2, g_, dy_, dt_);
+            G[0][j][i] = flux.x;
+            G[1][j][i] = flux.z;
+            G[2][j][i] = flux.y;
+        }
+    }
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern "C" {
+__global__ void WAFKernel(
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+        float g_, int step_,
+        
+        //Input h^n
+        float* h0_ptr_, int h0_pitch_,
+        float* hu0_ptr_, int hu0_pitch_,
+        float* hv0_ptr_, int hv0_pitch_,
+        
+        //Output h^{n+1}
+        float* h1_ptr_, int h1_pitch_,
+        float* hu1_ptr_, int hu1_pitch_,
+        float* hv1_ptr_, int hv1_pitch_) {    
+    //Shared memory variables
+    __shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
+    __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
+    
+    
+    
+    //Read into shared memory Q from global memory
+    readBlock2(h0_ptr_, h0_pitch_,
+               hu0_ptr_, hu0_pitch_,
+               hv0_ptr_, hv0_pitch_,
+               Q, nx_, ny_);
+    __syncthreads();
+    
+    
+    //Set boundary conditions
+    noFlowBoundary2(Q, nx_, ny_);
+    __syncthreads();
+    
+    
+    
+    //Step 0 => evolve x first, then y
+    if (step_ == 0) {
+        //Compute fluxes along the x axis and evolve
+        computeFluxF(Q, F, g_, dx_, dt_);
+        __syncthreads();
+        evolveF2(Q, F, nx_, ny_, dx_, dt_);
+        __syncthreads();
+        
+        //Fix boundary conditions
+        noFlowBoundary2(Q, nx_, ny_);
+        __syncthreads();
+        
+        //Compute fluxes along the y axis and evolve
+        computeFluxG(Q, F, g_, dy_, dt_);
+        __syncthreads();
+        evolveG2(Q, F, nx_, ny_, dy_, dt_);
+        __syncthreads();
+    }
+    //Step 1 => evolve y first, then x
+    else {
+        //Compute fluxes along the y axis and evolve
+        computeFluxG(Q, F, g_, dy_, dt_);
+        __syncthreads();
+        evolveG2(Q, F, nx_, ny_, dy_, dt_);
+        __syncthreads();
+        
+        //Fix boundary conditions
+        noFlowBoundary2(Q, nx_, ny_);
+        __syncthreads();
+        
+        //Compute fluxes along the x axis and evolve
+        computeFluxF(Q, F, g_, dx_, dt_);
+        __syncthreads();
+        evolveF2(Q, F, nx_, ny_, dx_, dt_);
+        __syncthreads();
+    }
+
+
+    
+    // Write to main memory for all internal cells
+    writeBlock2(h1_ptr_, h1_pitch_,
+                hu1_ptr_, hu1_pitch_,
+                hv1_ptr_, hv1_pitch_,
+                Q, nx_, ny_);
+}
+
+} // extern "C"
--- a/GPUSimulators/init.py
+++ b/GPUSimulators/init.py
@@ -0,0 +1,5 @@
+#!/bin/env python
+# -*- coding: utf-8 -*-
+
+
+# Nothing general to do
--- a/GPUSimulators/common.cu
+++ b/GPUSimulators/common.cu
@@ -0,0 +1,500 @@
+/*
+This OpenCL kernel implements the Kurganov-Petrova numerical scheme 
+for the shallow water equations, described in 
+A. Kurganov & Guergana Petrova
+A Second-Order Well-Balanced Positivity Preserving Central-Upwind
+Scheme for the Saint-Venant System Communications in Mathematical
+Sciences, 5 (2007), 133-160. 
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+/**
+  * Location of thread in block
+  */
+inline __device__ int get_local_id(int dim) {
+    switch(dim) {
+        case 0: return threadIdx.x; 
+        case 1: return threadIdx.y;
+        case 2: return threadIdx.z;
+        default: return -1;
+    }
+}
+
+
+/**
+  * Get block index
+  */
+__device__ int get_group_id(int dim) {
+    switch(dim) {
+        case 0: return blockIdx.x;
+        case 1: return blockIdx.y;
+        case 2: return blockIdx.z;
+        default: return -1;
+    }
+}
+
+/**
+  * Location of thread in global domain
+  */
+__device__ int get_global_id(int dim) {
+    switch(dim) {
+        case 0: return blockDim.x*blockIdx.x + threadIdx.x;
+        case 1: return blockDim.y*blockIdx.y + threadIdx.y;
+        case 2: return blockDim.z*blockIdx.z + threadIdx.z;
+        default: return -1;
+    }
+}
+
+
+/**
+  * Float3 operators 
+  */
+inline __device__ float3 operator*(const float a, const float3 b) {
+    return make_float3(a*b.x, a*b.y, a*b.z);
+}
+
+inline __device__ float3 operator/(const float3 a, const float b) {
+    return make_float3(a.x/b, a.y/b, a.z/b);
+}
+
+inline __device__ float3 operator-(const float3 a, const float3 b) {
+    return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
+}
+
+inline __device__ float3 operator+(const float3 a, const float3 b) {
+    return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
+}
+
+inline __device__ __host__ float clamp(const float f, const float a, const float b) {
+    return fmaxf(a, fminf(f, b));
+}
+
+
+
+
+
+/**
+  * Reads a block of data  with one ghost cell for the shallow water equations
+  */
+__device__ void readBlock1(float* h_ptr_, int h_pitch_,
+                float* hu_ptr_, int hu_pitch_,
+                float* hv_ptr_, int hv_pitch_,
+                float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], 
+                const int nx_, const int ny_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    //Index of block within domain
+    const int bx = BLOCK_WIDTH * get_group_id(0);
+    const int by = BLOCK_HEIGHT * get_group_id(1);
+    
+    //Read into shared memory
+    for (int j=ty; j<BLOCK_HEIGHT+2; j+=BLOCK_HEIGHT) {
+        const int l = clamp(by + j, 0, ny_+1); // Out of bounds
+        
+        //Compute the pointer to current row in the arrays
+        float* const h_row  = (float*) ((char*) h_ptr_  + h_pitch_*l);
+        float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*l);
+        float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*l);
+        
+        for (int i=tx; i<BLOCK_WIDTH+2; i+=BLOCK_WIDTH) {
+            const int k = clamp(bx + i, 0, nx_+1); // Out of bounds
+            
+            Q[0][j][i] = h_row[k];
+            Q[1][j][i] = hu_row[k];
+            Q[2][j][i] = hv_row[k];
+        }
+    }
+}
+
+
+
+
+
+/**
+  * Reads a block of data  with two ghost cells for the shallow water equations
+  */
+__device__ void readBlock2(float* h_ptr_, int h_pitch_,
+                float* hu_ptr_, int hu_pitch_,
+                float* hv_ptr_, int hv_pitch_,
+                float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], 
+                const int nx_, const int ny_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    //Index of block within domain
+    const int bx = BLOCK_WIDTH * get_group_id(0);
+    const int by = BLOCK_HEIGHT * get_group_id(1);
+    
+    //Read into shared memory
+    for (int j=ty; j<BLOCK_HEIGHT+4; j+=BLOCK_HEIGHT) {
+        const int l = clamp(by + j, 0, ny_+3); // Out of bounds
+        
+        //Compute the pointer to current row in the arrays
+        float* const h_row  = (float*) ((char*) h_ptr_  + h_pitch_*l);
+        float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*l);
+        float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*l);
+        
+        for (int i=tx; i<BLOCK_WIDTH+4; i+=BLOCK_WIDTH) {
+            const int k = clamp(bx + i, 0, nx_+3); // Out of bounds
+            
+            Q[0][j][i] = h_row[k];
+            Q[1][j][i] = hu_row[k];
+            Q[2][j][i] = hv_row[k];
+        }
+    }
+}
+
+
+
+
+/**
+  * Writes a block of data to global memory for the shallow water equations.
+  */
+__device__ void writeBlock1(float* h_ptr_, int h_pitch_,
+                 float* hu_ptr_, int hu_pitch_,
+                 float* hv_ptr_, int hv_pitch_,
+                 float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                 const int nx_, const int ny_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    //Index of cell within domain
+    const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
+    const int tj = get_global_id(1) + 1;
+    
+    //Only write internal cells
+    if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) {
+        const int i = tx + 1; //Skip local ghost cells, i.e., +1
+        const int j = ty + 1;
+
+        float* const h_row  = (float*) ((char*) h_ptr_  + h_pitch_*tj);
+        float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*tj);
+        float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*tj);
+        
+        h_row[ti]  = Q[0][j][i];
+        hu_row[ti] = Q[1][j][i];
+        hv_row[ti] = Q[2][j][i];
+    }
+}
+
+
+
+
+
+/**
+  * Writes a block of data to global memory for the shallow water equations.
+  */
+__device__ void writeBlock2(float* h_ptr_, int h_pitch_,
+                 float* hu_ptr_, int hu_pitch_,
+                 float* hv_ptr_, int hv_pitch_,
+                 float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], 
+                 const int nx_, const int ny_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    //Index of cell within domain
+    const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
+    const int tj = get_global_id(1) + 2;
+    
+    //Only write internal cells
+    if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) {
+        const int i = tx + 2; //Skip local ghost cells, i.e., +2
+        const int j = ty + 2;
+
+        float* const h_row  = (float*) ((char*) h_ptr_ + h_pitch_*tj);
+        float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*tj);
+        float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*tj);
+        
+        h_row[ti]  = Q[0][j][i];
+        hu_row[ti] = Q[1][j][i];
+        hv_row[ti] = Q[2][j][i];
+    }
+}
+
+
+
+
+
+
+/**
+  * No flow boundary conditions for the shallow water equations
+  * with one ghost cell in each direction
+  */
+__device__ void noFlowBoundary1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const int nx_, const int ny_) {
+    //Global index
+    const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
+    const int tj = get_global_id(1) + 1;
+    
+    //Block-local indices
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    const int i = tx + 1; //Skip local ghost cells, i.e., +1
+    const int j = ty + 1;
+    
+    //Fix boundary conditions
+    if (ti == 1) {
+        Q[0][j][i-1] =  Q[0][j][i];
+        Q[1][j][i-1] = -Q[1][j][i];
+        Q[2][j][i-1] =  Q[2][j][i];
+    }
+    if (ti == nx_) {
+        Q[0][j][i+1] =  Q[0][j][i];
+        Q[1][j][i+1] = -Q[1][j][i];
+        Q[2][j][i+1] =  Q[2][j][i];
+    }
+    if (tj == 1) {
+        Q[0][j-1][i] =  Q[0][j][i];
+        Q[1][j-1][i] =  Q[1][j][i];
+        Q[2][j-1][i] = -Q[2][j][i];
+    }
+    if (tj == ny_) {
+        Q[0][j+1][i] =  Q[0][j][i];
+        Q[1][j+1][i] =  Q[1][j][i];
+        Q[2][j+1][i] = -Q[2][j][i];
+    }
+}
+
+
+
+
+
+
+/**
+  * No flow boundary conditions for the shallow water equations
+  * with two ghost cells in each direction
+  */
+__device__ void noFlowBoundary2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const int nx_, const int ny_) {
+    //Global index
+    const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
+    const int tj = get_global_id(1) + 2;
+    
+    //Block-local indices
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    const int i = tx + 2; //Skip local ghost cells, i.e., +2
+    const int j = ty + 2;
+    
+    if (ti == 2) {
+        Q[0][j][i-1] =  Q[0][j][i];
+        Q[1][j][i-1] = -Q[1][j][i];
+        Q[2][j][i-1] =  Q[2][j][i];
+        
+        Q[0][j][i-2] =  Q[0][j][i+1];
+        Q[1][j][i-2] = -Q[1][j][i+1];
+        Q[2][j][i-2] =  Q[2][j][i+1];
+    }
+    if (ti == nx_+1) {
+        Q[0][j][i+1] =  Q[0][j][i];
+        Q[1][j][i+1] = -Q[1][j][i];
+        Q[2][j][i+1] =  Q[2][j][i];
+        
+        Q[0][j][i+2] =  Q[0][j][i-1];
+        Q[1][j][i+2] = -Q[1][j][i-1];
+        Q[2][j][i+2] =  Q[2][j][i-1];
+    }
+    if (tj == 2) {
+        Q[0][j-1][i] =  Q[0][j][i];
+        Q[1][j-1][i] =  Q[1][j][i];
+        Q[2][j-1][i] = -Q[2][j][i];
+        
+        Q[0][j-2][i] =  Q[0][j+1][i];
+        Q[1][j-2][i] =  Q[1][j+1][i];
+        Q[2][j-2][i] = -Q[2][j+1][i];
+    }
+    if (tj == ny_+1) {
+        Q[0][j+1][i] =  Q[0][j][i];
+        Q[1][j+1][i] =  Q[1][j][i];
+        Q[2][j+1][i] = -Q[2][j][i];
+        
+        Q[0][j+2][i] =  Q[0][j-1][i];
+        Q[1][j+2][i] =  Q[1][j-1][i];
+        Q[2][j+2][i] = -Q[2][j-1][i];
+    }
+}
+
+
+
+
+
+
+/**
+  * Evolves the solution in time along the x axis (dimensional splitting)
+  */
+__device__ void evolveF1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+              float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
+              const int nx_, const int ny_,
+              const float dx_, const float dt_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    //Index of cell within domain
+    const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
+    const int tj = get_global_id(1) + 1;
+    
+    if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) {
+        const int i = tx + 1; //Skip local ghost cells, i.e., +1
+        const int j = ty + 1;
+        
+        Q[0][j][i] = Q[0][j][i] + (F[0][ty][tx] - F[0][ty][tx+1]) * dt_ / dx_;
+        Q[1][j][i] = Q[1][j][i] + (F[1][ty][tx] - F[1][ty][tx+1]) * dt_ / dx_;
+        Q[2][j][i] = Q[2][j][i] + (F[2][ty][tx] - F[2][ty][tx+1]) * dt_ / dx_;
+    }
+}
+
+
+
+
+
+
+/**
+  * Evolves the solution in time along the x axis (dimensional splitting)
+  */
+__device__ void evolveF2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+              float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
+              const int nx_, const int ny_,
+              const float dx_, const float dt_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    //Index of cell within domain
+    const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
+    const int tj = get_global_id(1) + 2;
+    
+    if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) {
+        const int i = tx + 2; //Skip local ghost cells, i.e., +1
+        const int j = ty + 2;
+        
+        Q[0][j][i] = Q[0][j][i] + (F[0][ty][tx] - F[0][ty][tx+1]) * dt_ / dx_;
+        Q[1][j][i] = Q[1][j][i] + (F[1][ty][tx] - F[1][ty][tx+1]) * dt_ / dx_;
+        Q[2][j][i] = Q[2][j][i] + (F[2][ty][tx] - F[2][ty][tx+1]) * dt_ / dx_;
+    }
+}
+
+
+
+
+
+
+/**
+  * Evolves the solution in time along the y axis (dimensional splitting)
+  */
+__device__ void evolveG1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+              float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
+              const int nx_, const int ny_,
+              const float dy_, const float dt_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    //Index of cell within domain
+    const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
+    const int tj = get_global_id(1) + 1;
+    
+    if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) {
+        const int i = tx + 1; //Skip local ghost cells, i.e., +1
+        const int j = ty + 1;
+        
+        Q[0][j][i] = Q[0][j][i] + (G[0][ty][tx] - G[0][ty+1][tx]) * dt_ / dy_;
+        Q[1][j][i] = Q[1][j][i] + (G[1][ty][tx] - G[1][ty+1][tx]) * dt_ / dy_;
+        Q[2][j][i] = Q[2][j][i] + (G[2][ty][tx] - G[2][ty+1][tx]) * dt_ / dy_;
+    }
+}
+
+
+
+
+
+
+
+/**
+  * Evolves the solution in time along the y axis (dimensional splitting)
+  */
+__device__ void evolveG2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+              float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
+              const int nx_, const int ny_,
+              const float dy_, const float dt_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    //Index of cell within domain
+    const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
+    const int tj = get_global_id(1) + 2;
+    
+    if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) {
+        const int i = tx + 2; //Skip local ghost cells, i.e., +2
+        const int j = ty + 2;
+        
+        Q[0][j][i] = Q[0][j][i] + (G[0][ty][tx] - G[0][ty+1][tx]) * dt_ / dy_;
+        Q[1][j][i] = Q[1][j][i] + (G[1][ty][tx] - G[1][ty+1][tx]) * dt_ / dy_;
+        Q[2][j][i] = Q[2][j][i] + (G[2][ty][tx] - G[2][ty+1][tx]) * dt_ / dy_;
+    }
+}
+
+
+
+
+
+
+
+
+
+
+__device__ float3 F_func(const float3 Q, const float g) {
+    float3 F;
+
+    F.x = Q.y;                              //hu
+    F.y = Q.y*Q.y / Q.x + 0.5f*g*Q.x*Q.x;   //hu*hu/h + 0.5f*g*h*h;
+    F.z = Q.y*Q.z / Q.x;                    //hu*hv/h;
+
+    return F;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/GPUSimulators/fluxes/CentralUpwind.cu
+++ b/GPUSimulators/fluxes/CentralUpwind.cu
@@ -0,0 +1,39 @@
+/*
+This file implements the Central upwind flux
+
+Copyright (C) 2016, 2017, 2018 SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+/**
+  * Central upwind flux function
+  */
+__device__ float3 CentralUpwindFlux(const float3 Qm, float3 Qp, const float g) {
+    const float3 Fp = F_func(Qp, g);
+    const float up = Qp.y / Qp.x;   // hu / h
+    const float cp = sqrt(g*Qp.x); // sqrt(g*h)
+
+    const float3 Fm = F_func(Qm, g);
+    const float um = Qm.y / Qm.x;   // hu / h
+    const float cm = sqrt(g*Qm.x); // sqrt(g*h)
+    
+    const float am = min(min(um-cm, up-cp), 0.0f); // largest negative wave speed
+    const float ap = max(max(um+cm, up+cp), 0.0f); // largest positive wave speed
+    
+    return ((ap*Fm - am*Fp) + ap*am*(Qp-Qm))/(ap-am);
+}
+
--- a/GPUSimulators/fluxes/FirstOrderCentered.cu
+++ b/GPUSimulators/fluxes/FirstOrderCentered.cu
@@ -0,0 +1,33 @@
+/*
+This file implements the First ORder CEntered flux
+
+Copyright (C) 2016, 2017, 2018 SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "./LaxFriedrichs.cu"
+#include "./LaxWendroff.cu"
+
+
+    
+    
+/**
+  * First Ordered Centered (Toro 2001, p.163)
+  */
+__device__ float3 FORCE_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+    const float3 F_lf = LxF_1D_flux(Q_l, Q_r, g_, dx_, dt_);
+    const float3 F_lw2 = LxW2_1D_flux(Q_l, Q_r, g_, dx_, dt_);
+    return 0.5f*(F_lf + F_lw2);
+}
--- a/GPUSimulators/fluxes/Godunov.cu
+++ b/GPUSimulators/fluxes/Godunov.cu
@@ -0,0 +1,36 @@
+/*
+This file implements the Godunov flux
+
+Copyright (C) 2016, 2017, 2018 SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+
+/**
+  * Godunovs centered scheme (Toro 2001, p 165)
+  */
+__device__ float3 GodC_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+    const float3 F_l = F_func(Q_l, g_);
+    const float3 F_r = F_func(Q_r, g_);
+    
+    const float3 Q_godc = 0.5f*(Q_l + Q_r) + (dt_/dx_)*(F_l - F_r);
+    
+    return F_func(Q_godc, g_);
+}
+    
+
+
--- a/GPUSimulators/fluxes/HartenLaxVanLeer.cu
+++ b/GPUSimulators/fluxes/HartenLaxVanLeer.cu
@@ -0,0 +1,63 @@
+/*
+This file implements the Harten-Lax-van Leer flux
+
+Copyright (C) 2016, 2017, 2018 SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+/**
+  * Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 180)
+  */
+__device__ float3 HLL_flux(const float3 Q_l, const float3 Q_r, const float g_) {    
+    const float h_l = Q_l.x;
+    const float h_r = Q_r.x;
+    
+    // Calculate velocities
+    const float u_l = Q_l.y / h_l;
+    const float u_r = Q_r.y / h_r;
+    
+    // Estimate the potential wave speeds
+    const float c_l = sqrt(g_*h_l);
+    const float c_r = sqrt(g_*h_r);
+    
+    // Compute h in the "star region", h^dagger
+    const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
+    
+    const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) );
+    const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) );
+    
+    const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
+    const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
+    
+    // Compute wave speed estimates
+    const float S_l = u_l - c_l*q_l;
+    const float S_r = u_r + c_r*q_r;
+    
+    //Upwind selection
+    if (S_l >= 0.0f) {
+        return F_func(Q_l, g_);
+    }
+    else if (S_r <= 0.0f) {
+        return F_func(Q_r, g_);
+    }
+    //Or estimate flux in the star region
+    else {
+        const float3 F_l = F_func(Q_l, g_);
+        const float3 F_r = F_func(Q_r, g_);
+        const float3 flux = (S_r*F_l - S_l*F_r + S_r*S_l*(Q_r - Q_l)) / (S_r-S_l);
+        return flux;
+    }
+}
--- a/GPUSimulators/fluxes/HartenLaxVanLeerContact.cu
+++ b/GPUSimulators/fluxes/HartenLaxVanLeerContact.cu
@@ -0,0 +1,80 @@
+/*
+This file implements the Harten-Lax-van Leer flux with contact discontinuity
+
+Copyright (C) 2016, 2017, 2018 SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+
+/**
+  * Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 181)
+  */
+__device__ float3 HLLC_flux(const float3 Q_l, const float3 Q_r, const float g_) {    
+    const float h_l = Q_l.x;
+    const float h_r = Q_r.x;
+    
+    // Calculate velocities
+    const float u_l = Q_l.y / h_l;
+    const float u_r = Q_r.y / h_r;
+    
+    // Estimate the potential wave speeds
+    const float c_l = sqrt(g_*h_l);
+    const float c_r = sqrt(g_*h_r);
+    
+    // Compute h in the "star region", h^dagger
+    const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
+    
+    const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) );
+    const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) );
+    
+    const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
+    const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
+    
+    // Compute wave speed estimates
+    const float S_l = u_l - c_l*q_l;
+    const float S_r = u_r + c_r*q_r;
+    const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) );
+    
+    const float3 F_l = F_func(Q_l, g_);
+    const float3 F_r = F_func(Q_r, g_);
+    
+    //Upwind selection
+    if (S_l >= 0.0f) {
+        return F_l;
+    }
+    else if (S_r <= 0.0f) {
+        return F_r;
+    }
+    //Or estimate flux in the "left star" region
+    else if (S_l <= 0.0f && 0.0f <=S_star) {
+        const float v_l = Q_l.z / h_l;
+        const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1, S_star, v_l);
+        const float3 flux = F_l + S_l*(Q_star_l - Q_l);
+        return flux;
+    }
+    //Or estimate flux in the "righ star" region
+    else if (S_star <= 0.0f && 0.0f <=S_r) {
+        const float v_r = Q_r.z / h_r;
+        const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1, S_star, v_r);
+        const float3 flux = F_r + S_r*(Q_star_r - Q_r);
+        return flux;
+    }
+    else {
+        return make_float3(-99999.9f, -99999.9f, -99999.9f); //Something wrong here
+    }
+}
+
--- a/GPUSimulators/fluxes/LaxFriedrichs.cu
+++ b/GPUSimulators/fluxes/LaxFriedrichs.cu
@@ -0,0 +1,24 @@
+
+
+/**
+  * Lax-Friedrichs flux (Toro 2001, p 163)
+  */
+__device__ float3 LxF_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+    const float3 F_l = F_func(Q_l, g_);
+    const float3 F_r = F_func(Q_r, g_);
+    
+    return 0.5f*(F_l + F_r) + (dx_/(2.0f*dt_))*(Q_l - Q_r);
+}
+
+
+
+/**
+  * Lax-Friedrichs extended to 2D
+  */
+__device__ float3 LxF_2D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+    const float3 F_l = F_func(Q_l, g_);
+    const float3 F_r = F_func(Q_r, g_);
+    
+    //Note numerical diffusion for 2D here (0.25)
+    return 0.5f*(F_l + F_r) + (dx_/(4.0f*dt_))*(Q_l - Q_r);
+}
--- a/GPUSimulators/fluxes/LaxWendroff.cu
+++ b/GPUSimulators/fluxes/LaxWendroff.cu
@@ -0,0 +1,33 @@
+/*
+This file implements the Lax-Wendroff flux
+
+Copyright (C) 2016, 2017, 2018 SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+/**
+  * Richtmeyer / Two-step Lax-Wendroff flux (Toro 2001, p 164)
+  */
+__device__ float3 LxW2_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+    const float3 F_l = F_func(Q_l, g_);
+    const float3 F_r = F_func(Q_r, g_);
+    
+    const float3 Q_lw2 = 0.5f*(Q_l + Q_r) + (dt_/(2.0f*dx_))*(F_l - F_r);
+    
+    return F_func(Q_lw2, g_);
+}
+
--- a/GPUSimulators/fluxes/WeightedAverageFlux.cu
+++ b/GPUSimulators/fluxes/WeightedAverageFlux.cu
@@ -0,0 +1,187 @@
+/*
+This file implements the Weighted Average Flux
+
+Copyright (C) 2016, 2017, 2018 SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "limiters.cu"
+
+/**
+  * Superbee flux limiter for WAF.
+  * Related to superbee limiter so that WAF_superbee(r, c) = 1 - (1-|c|)*superbee(r)
+  * @param r_ the ratio of upwind change (see Toro 2001, p. 203/204)
+  * @param c_ the courant number for wave k, dt*S_k/dx
+  */
+__device__ float WAF_superbee(float r_, float c_) {
+    // r <= 0.0
+    if (r_ <= 0.0f) { 
+        return 1.0f;
+    }
+    // 0.0 <= r <= 1/2
+    else if (r_ <= 0.5f) { 
+        return 1.0f - 2.0f*(1.0f - fabsf(c_))*r_;
+    }
+    // 1/2 <= r <= 1
+    else if (r_ <= 1.0f) {
+        return fabs(c_);
+    }
+    // 1 <= r <= 2
+    else  if (r_ <= 2.0f) {
+        return 1.0f - (1.0f - fabsf(c_))*r_;
+    }
+    // r >= 2
+    else {
+        return 2.0f*fabsf(c_) - 1.0f;
+    }
+}
+
+
+
+
+__device__ float WAF_albada(float r_, float c_) {
+    if (r_ <= 0.0f) {
+        return 1.0f;
+    }
+    else {
+        return 1.0f - (1.0f - fabsf(c_)) * r_ * (1.0f + r_) / (1.0f + r_*r_);
+    }
+}
+
+__device__ float WAF_minbee(float r_, float c_) {
+    r_ = fmaxf(-1.0f, fminf(2.0f, r_));
+    if (r_ <= 0.0f) {
+        return 1.0f;
+    }
+    if (r_ >= 0.0f && r_ <= 1.0f) {
+        return 1.0f - (1.0f - fabsf(c_)) * r_;
+    }
+    else {
+        return fabsf(c_);
+    }
+}
+
+__device__ float WAF_minmod(float r_, float c_) {
+    return 1.0f - (1.0f - fabsf(c_)) * fmaxf(0.0f, fminf(1.0f, r_));
+}
+
+
+
+__device__ float limiterToWAFLimiter(float r_, float c_) {
+    return 1.0f - (1.0f - fabsf(c_))*r_;
+}
+
+__device__ float desingularize(float x_, float eps_) {
+    return copysign(1.0f, x_)*fmaxf(fabsf(x_), fminf(x_*x_/(2.0f*eps_)+0.5f*eps_, eps_));
+}
+
+// Compute h in the "star region", h^dagger
+__device__ __inline__ float computeHStar(float h_l, float h_r, float u_l, float u_r, float c_l, float c_r, float g_) {
+    
+    //This estimate for the h* gives rise to spurious oscillations. 
+    //return 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
+    
+    const float h_tmp = 0.5f * (c_l + c_r) + 0.25f * (u_l - u_r);
+    return h_tmp*h_tmp / g_;
+}
+
+/**
+  * Weighted average flux (Toro 2001, p 200) for interface {i+1/2}
+  * @param r_ The flux limiter parameter (see Toro 2001, p. 203)
+  * @param Q_l2 Q_{i-1}
+  * @param Q_l1 Q_{i}
+  * @param Q_r1 Q_{i+1}
+  * @param Q_r2 Q_{i+2}
+  */
+__device__ float3 WAF_1D_flux(const float3 Q_l2, const float3 Q_l1, const float3 Q_r1, const float3 Q_r2, const float g_, const float dx_, const float dt_) {     
+    const float h_l = Q_l1.x;
+    const float h_r = Q_r1.x;
+    
+    const float h_l2 = Q_l2.x;
+    const float h_r2 = Q_r2.x;
+    
+    // Calculate velocities
+    const float u_l = Q_l1.y / h_l;
+    const float u_r = Q_r1.y / h_r;
+    
+    const float u_l2 = Q_l2.y / h_l2;
+    const float u_r2 = Q_r2.y / h_r2;
+    
+    const float v_l = Q_l1.z / h_l;
+    const float v_r = Q_r1.z / h_r;
+    
+    const float v_l2 = Q_l2.z / h_l2;
+    const float v_r2 = Q_r2.z / h_r2;
+    
+    // Estimate the potential wave speeds
+    const float c_l = sqrt(g_*h_l);
+    const float c_r = sqrt(g_*h_r);
+    
+    const float c_l2 = sqrt(g_*h_l2);
+    const float c_r2 = sqrt(g_*h_r2);
+    
+    // Compute h in the "star region", h^dagger
+    const float h_dag_l = computeHStar(h_l2,  h_l, u_l2,  u_l, c_l2,  c_l, g_);
+    const float h_dag   = computeHStar( h_l,  h_r,  u_l,  u_r,  c_l,  c_r, g_);
+    const float h_dag_r = computeHStar( h_r, h_r2,  u_r, u_r2,  c_r, c_r2, g_);
+    
+    const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag ) ) / h_l;
+    const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag ) ) / h_r;
+    
+    const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
+    const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
+    
+    // Compute wave speed estimates
+    const float S_l = u_l - c_l*q_l; 
+    const float S_r = u_r + c_r*q_r;
+    const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) );
+    
+    const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1.0, S_star, v_l);
+    const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1.0, S_star, v_r);
+    
+    // Estimate the fluxes in the four regions
+    const float3 F_1 = F_func(Q_l1, g_);
+    const float3 F_4 = F_func(Q_r1, g_);
+    
+    const float3 F_2 = F_1 + S_l*(Q_star_l - Q_l1);
+    const float3 F_3 = F_4 + S_r*(Q_star_r - Q_r1);
+    //const float3 F_2 = F_func(Q_star_l, g_);
+    //const float3 F_3 = F_func(Q_star_r, g_);
+    
+    // Compute the courant numbers for the waves
+    const float c_1 = S_l * dt_ / dx_;
+    const float c_2 = S_star * dt_ / dx_;
+    const float c_3 = S_r * dt_ / dx_;
+    
+    // Compute the "upwind change" vectors for the i-3/2 and i+3/2 interfaces
+    const float eps = 1.0e-6f;
+    const float r_1 = desingularize( (c_1 > 0.0f) ? (h_dag_l - h_l2) : (h_dag_r - h_r), eps) / desingularize((h_dag - h_l), eps);
+    const float r_2 = desingularize( (c_2 > 0.0f) ? (v_l - v_l2) : (v_r2 - v_r), eps ) / desingularize((v_r - v_l), eps);
+    const float r_3 = desingularize( (c_3 > 0.0f) ? (h_l - h_dag_l) : (h_r2 - h_dag_r), eps ) / desingularize((h_r - h_dag), eps);
+        
+    // Compute the limiter
+    // We use h for the nonlinear waves, and v for the middle shear wave 
+    const float A_1 = copysign(1.0f, c_1) * limiterToWAFLimiter(generalized_minmod(r_1, 1.9f), c_1);
+    const float A_2 = copysign(1.0f, c_2) * limiterToWAFLimiter(generalized_minmod(r_2, 1.9f), c_2); 
+    const float A_3 = copysign(1.0f, c_3) * limiterToWAFLimiter(generalized_minmod(r_3, 1.9f), c_3);
+    
+    //Average the fluxes
+    const float3 flux = 0.5f*( F_1 + F_4 )
+                      - 0.5f*( A_1 * (F_2 - F_1)
+                             + A_2 * (F_3 - F_2)
+                             + A_3 * (F_4 - F_3) );
+
+    return flux;
+}
--- a/GPUSimulators/limiters.cu
+++ b/GPUSimulators/limiters.cu
@@ -0,0 +1,129 @@
+/*
+This file implements different flux and slope limiters
+
+Copyright (C) 2016, 2017, 2018 SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+
+
+
+
+
+/**
+  * Reconstructs a slope using the generalized minmod limiter based on three 
+  * consecutive values
+  */
+__device__ __inline__ float minmodSlope(float left, float center, float right, float theta) {
+    const float backward = (center - left) * theta;
+    const float central = (right - left) * 0.5f;
+    const float forward = (right - center) * theta;
+    
+	return 0.25f
+		*copysign(1.0f, backward)
+		*(copysign(1.0f, backward) + copysign(1.0f, central))
+		*(copysign(1.0f, central) + copysign(1.0f, forward))
+		*min( min(fabs(backward), fabs(central)), fabs(forward) );
+}
+
+
+
+
+/**
+  * Reconstructs a minmod slope for a whole block along the abscissa
+  */
+__device__ void minmodSlopeX(float  Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  const float theta_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    //Reconstruct slopes along x axis
+    {
+        const int j = ty;
+        const int l = j + 2; //Skip ghost cells
+        for (int i=tx; i<BLOCK_WIDTH+2; i+=BLOCK_WIDTH) {
+            const int k = i + 1;
+            for (int p=0; p<3; ++p) {
+                Qx[p][j][i] = minmodSlope(Q[p][l][k-1], Q[p][l][k], Q[p][l][k+1], theta_);
+            }
+        }
+    }
+}
+
+
+/**
+  * Reconstructs a minmod slope for a whole block along the ordinate
+  */
+__device__ void minmodSlopeY(float  Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  const float theta_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    for (int j=ty; j<BLOCK_HEIGHT+2; j+=BLOCK_HEIGHT) {
+        const int l = j + 1;
+        {
+            const int i = tx;
+            const int k = i + 2; //Skip ghost cells
+            for (int p=0; p<3; ++p) {
+                Qy[p][j][i] = minmodSlope(Q[p][l-1][k], Q[p][l][k], Q[p][l+1][k], theta_);
+            }
+        }
+    }
+}
+
+
+
+
+__device__ float monotonized_central(float r_) {
+    return fmaxf(0.0f, fminf(2.0f, fminf(2.0f*r_, 0.5f*(1.0f+r_))));
+}
+
+__device__ float osher(float r_, float beta_) {
+    return fmaxf(0.0f, fminf(beta_, r_));
+}
+
+__device__ float sweby(float r_, float beta_) {
+    return fmaxf(0.0f, fmaxf(fminf(r_, beta_), fminf(beta_*r_, 1.0f)));
+}
+
+__device__ float minmod(float r_) {
+    return fmaxf(0.0f, fminf(1.0f, r_));
+}
+
+__device__ float generalized_minmod(float r_, float theta_) {
+    return fmaxf(0.0f, fminf(theta_*r_, fminf( (1.0f + r_) / 2.0f, theta_)));
+}
+
+__device__ float superbee(float r_) {
+    return fmaxf(0.0f, fmaxf(fminf(2.0f*r_, 1.0f), fminf(r_, 2.0f)));
+}
+
+__device__ float vanAlbada1(float r_) {
+    return (r_*r_ + r_) / (r_*r_ + 1.0f);
+}
+
+__device__ float vanAlbada2(float r_) {
+    return 2.0f*r_ / (r_*r_* + 1.0f);
+}
+
+__device__ float vanLeer(float r_) {
+    return (r_ + fabsf(r_)) / (1.0f + fabsf(r_));
+}