From 10d8e26108c8655f65f29f2933675e3fd7d1fdc9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20R=2E=20Brodtkorb?= <Andre.Brodtkorb@sintef.no>
Date: Tue, 4 Sep 2018 16:33:50 +0200
Subject: [PATCH] Refactoring

---
 GPUSimulators/Common.py                       | 220 --------
 GPUSimulators/CudaContext.py                  | 263 ++++++++++
 GPUSimulators/FORCE.py                        |   2 +-
 GPUSimulators/HLL.py                          |   2 +-
 GPUSimulators/HLL2.py                         |   2 +-
 GPUSimulators/IPythonMagic.py                 |   4 +-
 GPUSimulators/KP07.py                         |   2 +-
 GPUSimulators/KP07_dimsplit.py                |   2 +-
 GPUSimulators/LxF.py                          |   4 +-
 GPUSimulators/SWECommon.cu                    |  66 ---
 GPUSimulators/WAF.py                          |   2 +-
 .../{EulerCommon.cu => cuda/EulerCommon.h}    |   1 +
 GPUSimulators/cuda/SWECommon.h                | 474 ++++++++++++++++++
 .../{FORCE_kernel.cu => cuda/SWE_FORCE.cu}    |   5 +-
 .../{HLL_kernel.cu => cuda/SWE_HLL.cu}        |   5 +-
 .../{HLL2_kernel.cu => cuda/SWE_HLL2.cu}      |   7 +-
 .../{KP07_kernel.cu => cuda/SWE_KP07.cu}      |   7 +-
 .../SWE_KP07_dimsplit.cu}                     |   7 +-
 .../{LxF_kernel.cu => cuda/SWE_LxF.cu}        |   5 +-
 .../{WAF_kernel.cu => cuda/SWE_WAF.cu}        |   5 +-
 GPUSimulators/{common.cu => cuda/common.h}    |  32 +-
 .../{limiters.cu => cuda/limiters.h}          |   2 +-
 GPUSimulators/fluxes/CentralUpwind.cu         |  39 --
 GPUSimulators/fluxes/FirstOrderCentered.cu    |  33 --
 GPUSimulators/fluxes/Godunov.cu               |  36 --
 GPUSimulators/fluxes/HartenLaxVanLeer.cu      |  63 ---
 .../fluxes/HartenLaxVanLeerContact.cu         |  80 ---
 GPUSimulators/fluxes/LaxFriedrichs.cu         |  24 -
 GPUSimulators/fluxes/LaxWendroff.cu           |  33 --
 GPUSimulators/fluxes/WeightedAverageFlux.cu   | 187 -------
 30 files changed, 774 insertions(+), 840 deletions(-)
 create mode 100644 GPUSimulators/CudaContext.py
 delete mode 100644 GPUSimulators/SWECommon.cu
 rename GPUSimulators/{EulerCommon.cu => cuda/EulerCommon.h} (99%)
 create mode 100644 GPUSimulators/cuda/SWECommon.h
 rename GPUSimulators/{FORCE_kernel.cu => cuda/SWE_FORCE.cu} (98%)
 rename GPUSimulators/{HLL_kernel.cu => cuda/SWE_HLL.cu} (98%)
 rename GPUSimulators/{HLL2_kernel.cu => cuda/SWE_HLL2.cu} (98%)
 rename GPUSimulators/{KP07_kernel.cu => cuda/SWE_KP07.cu} (98%)
 rename GPUSimulators/{KP07_dimsplit_kernel.cu => cuda/SWE_KP07_dimsplit.cu} (98%)
 rename GPUSimulators/{LxF_kernel.cu => cuda/SWE_LxF.cu} (98%)
 rename GPUSimulators/{WAF_kernel.cu => cuda/SWE_WAF.cu} (98%)
 rename GPUSimulators/{common.cu => cuda/common.h} (91%)
 rename GPUSimulators/{limiters.cu => cuda/limiters.h} (99%)
 delete mode 100644 GPUSimulators/fluxes/CentralUpwind.cu
 delete mode 100644 GPUSimulators/fluxes/FirstOrderCentered.cu
 delete mode 100644 GPUSimulators/fluxes/Godunov.cu
 delete mode 100644 GPUSimulators/fluxes/HartenLaxVanLeer.cu
 delete mode 100644 GPUSimulators/fluxes/HartenLaxVanLeerContact.cu
 delete mode 100644 GPUSimulators/fluxes/LaxFriedrichs.cu
 delete mode 100644 GPUSimulators/fluxes/LaxWendroff.cu
 delete mode 100644 GPUSimulators/fluxes/WeightedAverageFlux.cu

diff --git a/GPUSimulators/Common.py b/GPUSimulators/Common.py
index 17b20a4..e88069e 100644
--- a/GPUSimulators/Common.py
+++ b/GPUSimulators/Common.py
@@ -34,7 +34,6 @@ import pycuda.compiler as cuda_compiler
 import pycuda.gpuarray
 import pycuda.driver as cuda
 
-from GPUSimulators import Autotuner
 
 """
 Class which keeps track of time spent for a section of code
@@ -58,225 +57,6 @@ class Timer(object):
             
             
             
-
-"""
-Class which keeps track of the CUDA context and some helper functions
-"""
-class CudaContext(object):
-    
-    def __init__(self, blocking=False, use_cache=True, autotuning=True):
-        self.blocking = blocking
-        self.use_cache = use_cache
-        self.logger =  logging.getLogger(__name__)
-        self.kernels = {}
-        
-        self.module_path = os.path.dirname(os.path.realpath(__file__))
-        
-        #Initialize cuda (must be first call to PyCUDA)
-        cuda.init(flags=0)
-        
-        self.logger.info("PyCUDA version %s", str(pycuda.VERSION_TEXT))
-        
-        #Print some info about CUDA
-        self.logger.info("CUDA version %s", str(cuda.get_version()))
-        self.logger.info("Driver version %s",  str(cuda.get_driver_version()))
-
-        self.cuda_device = cuda.Device(0)
-        self.logger.info("Using '%s' GPU", self.cuda_device.name())
-        self.logger.debug(" => compute capability: %s", str(self.cuda_device.compute_capability()))
-
-        # Create the CUDA context
-        if (self.blocking):
-            self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_BLOCKING_SYNC)
-            self.logger.warning("Using blocking context")
-        else:
-            self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_AUTO)
-            
-        free, total = cuda.mem_get_info()
-        self.logger.debug(" => memory: %d / %d MB available", int(free/(1024*1024)), int(total/(1024*1024)))
-        
-        self.logger.info("Created context handle <%s>", str(self.cuda_context.handle))
-        
-        #Create cache dir for cubin files
-        if (self.use_cache):
-            self.cache_path = os.path.join(self.module_path, "cuda_cache") 
-            if not os.path.isdir(self.cache_path):
-                os.mkdir(self.cache_path)
-            self.logger.info("Using CUDA cache dir %s", self.cache_path)
-            
-        self.autotuner = None
-        if (autotuning):
-            self.logger.info("Autotuning enabled. It may take several minutes to run the code the first time: have patience")
-            self.autotuner = Autotuner.Autotuner()
-            
-    
-    def __del__(self, *args):
-        self.logger.info("Cleaning up CUDA context handle <%s>", str(self.cuda_context.handle))
-            
-        # Loop over all contexts in stack, and remove "this"
-        other_contexts = []
-        while (cuda.Context.get_current() != None):
-            context = cuda.Context.get_current()
-            if (context.handle != self.cuda_context.handle):
-                self.logger.debug("<%s> Popping <%s> (*not* ours)", str(self.cuda_context.handle), str(context.handle))
-                other_contexts = [context] + other_contexts
-                cuda.Context.pop()
-            else:
-                self.logger.debug("<%s> Popping <%s> (ours)", str(self.cuda_context.handle), str(context.handle))
-                cuda.Context.pop()
-
-        # Add all the contexts we popped that were not our own
-        for context in other_contexts:
-            self.logger.debug("<%s> Pushing <%s>", str(self.cuda_context.handle), str(context.handle))
-            cuda.Context.push(context)
-            
-        self.logger.debug("<%s> Detaching", str(self.cuda_context.handle))
-        self.cuda_context.detach()
-        
-        
-    def __str__(self):
-        return "CudaContext id " + str(self.cuda_context.handle)
-        
-    
-    def hash_kernel(kernel_filename, include_dirs):        
-        # Generate a kernel ID for our caches
-        num_includes = 0
-        max_includes = 100
-        kernel_hasher = hashlib.md5()
-        logger = logging.getLogger(__name__)
-        
-        # Loop over file and includes, and check if something has changed
-        files = [kernel_filename]
-        while len(files):
-        
-            if (num_includes > max_includes):
-                raise("Maximum number of includes reached - circular include in {:}?".format(kernel_filename))
-        
-            filename = files.pop()
-            
-            #logger.debug("Hashing %s", filename)
-                
-            modified = os.path.getmtime(filename)
-                
-            # Open the file
-            with io.open(filename, "r") as file:
-            
-                # Search for #inclue <something> and also hash the file
-                file_str = file.read()
-                kernel_hasher.update(file_str.encode('utf-8'))
-                kernel_hasher.update(str(modified).encode('utf-8'))
-                
-                #Find all includes
-                includes = re.findall('^\W*#include\W+(.+?)\W*$', file_str, re.M)
-                
-            # Loop over everything that looks like an include
-            for include_file in includes:
-                
-                #Search through include directories for the file
-                file_path = os.path.dirname(filename)
-                for include_path in [file_path] + include_dirs:
-                
-                    # If we find it, add it to list of files to check
-                    temp_path = os.path.join(include_path, include_file)
-                    if (os.path.isfile(temp_path)):
-                        files = files + [temp_path]
-                        num_includes = num_includes + 1 #For circular includes...
-                        break
-            
-        return kernel_hasher.hexdigest()
-    
-    """
-    Reads a text file and creates an OpenCL kernel from that
-    """
-    def get_prepared_kernel(self, kernel_filename, kernel_function_name, \
-                    prepared_call_args, \
-                    include_dirs=[], no_extern_c=True, 
-                    **kwargs):
-        """
-        Helper function to print compilation output
-        """
-        def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
-            self.logger.debug("Compilation returned %s", str(compile_success_bool))
-            if info_str:
-                self.logger.debug("Info: %s", info_str)
-            if error_str:
-                self.logger.debug("Error: %s", error_str)
-        
-        #self.logger.debug("Getting %s", kernel_filename)
-            
-        # Create a hash of the kernel (and its includes)
-        kwargs_hasher = hashlib.md5()
-        kwargs_hasher.update(str(kwargs).encode('utf-8'));
-        kwargs_hash = kwargs_hasher.hexdigest()
-        kwargs_hasher = None
-        root, ext = os.path.splitext(kernel_filename)
-        kernel_hash = root \
-                + "_" + CudaContext.hash_kernel( \
-                    os.path.join(self.module_path, kernel_filename), \
-                    include_dirs=[self.module_path] + include_dirs) \
-                + "_" + kwargs_hash \
-                + ext
-        cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)
-        
-        # If we have the kernel in our hashmap, return it
-        if (kernel_hash in self.kernels.keys()):
-            self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash)
-            return self.kernels[kernel_hash]
-        
-        # If we have it on disk, return it
-        elif (self.use_cache and os.path.isfile(cached_kernel_filename)):
-            self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash)
-                
-            with io.open(cached_kernel_filename, "rb") as file:
-                file_str = file.read()
-                module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler)
-                
-            kernel = module.get_function(kernel_function_name)
-            kernel.prepare(prepared_call_args)
-            self.kernels[kernel_hash] = kernel
-            return kernel
-            
-        # Otherwise, compile it from source
-        else:
-            self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash)
-                
-            #Create kernel string
-            kernel_string = ""
-            for key, value in kwargs.items():
-                kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
-            kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename))
-            if (self.use_cache):
-                with io.open(cached_kernel_filename + ".txt", "w") as file:
-                    file.write(kernel_string)
-                
-            
-            with Timer("compiler") as timer:
-                cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, no_extern_c=no_extern_c, cache_dir=False)
-                module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler)
-                if (self.use_cache):
-                    with io.open(cached_kernel_filename, "wb") as file:
-                        file.write(cubin)
-                
-            kernel = module.get_function(kernel_function_name)
-            kernel.prepare(prepared_call_args)
-            self.kernels[kernel_hash] = kernel
-            
-            
-            return kernel
-    
-    """
-    Clears the kernel cache (useful for debugging & development)
-    """
-    def clear_kernel_cache(self):
-        self.logger.debug("Clearing cache")
-        self.kernels = {}
-        gc.collect()
-        
-    """
-    Synchronizes all streams etc
-    """
-    def synchronize(self):
-        self.cuda_context.synchronize()
         
 
 
diff --git a/GPUSimulators/CudaContext.py b/GPUSimulators/CudaContext.py
new file mode 100644
index 0000000..fd50864
--- /dev/null
+++ b/GPUSimulators/CudaContext.py
@@ -0,0 +1,263 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements Cuda context handling
+
+Copyright (C) 2018  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+
+
+import os
+
+import numpy as np
+import time
+import re
+import io
+import hashlib
+import logging
+import gc
+
+import pycuda.compiler as cuda_compiler
+import pycuda.gpuarray
+import pycuda.driver as cuda
+
+from GPUSimulators import Autotuner, Common
+
+
+
+"""
+Class which keeps track of the CUDA context and some helper functions
+"""
+class CudaContext(object):
+    
+    def __init__(self, blocking=False, use_cache=True, autotuning=True):
+        self.blocking = blocking
+        self.use_cache = use_cache
+        self.logger =  logging.getLogger(__name__)
+        self.kernels = {}
+        
+        self.module_path = os.path.dirname(os.path.realpath(__file__))
+        
+        #Initialize cuda (must be first call to PyCUDA)
+        cuda.init(flags=0)
+        
+        self.logger.info("PyCUDA version %s", str(pycuda.VERSION_TEXT))
+        
+        #Print some info about CUDA
+        self.logger.info("CUDA version %s", str(cuda.get_version()))
+        self.logger.info("Driver version %s",  str(cuda.get_driver_version()))
+
+        self.cuda_device = cuda.Device(0)
+        self.logger.info("Using '%s' GPU", self.cuda_device.name())
+        self.logger.debug(" => compute capability: %s", str(self.cuda_device.compute_capability()))
+
+        # Create the CUDA context
+        if (self.blocking):
+            self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_BLOCKING_SYNC)
+            self.logger.warning("Using blocking context")
+        else:
+            self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_AUTO)
+            
+        free, total = cuda.mem_get_info()
+        self.logger.debug(" => memory: %d / %d MB available", int(free/(1024*1024)), int(total/(1024*1024)))
+        
+        self.logger.info("Created context handle <%s>", str(self.cuda_context.handle))
+        
+        #Create cache dir for cubin files
+        if (self.use_cache):
+            self.cache_path = os.path.join(self.module_path, "cuda_cache") 
+            if not os.path.isdir(self.cache_path):
+                os.mkdir(self.cache_path)
+            self.logger.info("Using CUDA cache dir %s", self.cache_path)
+            
+        self.autotuner = None
+        if (autotuning):
+            self.logger.info("Autotuning enabled. It may take several minutes to run the code the first time: have patience")
+            self.autotuner = Autotuner.Autotuner()
+            
+    
+    def __del__(self, *args):
+        self.logger.info("Cleaning up CUDA context handle <%s>", str(self.cuda_context.handle))
+            
+        # Loop over all contexts in stack, and remove "this"
+        other_contexts = []
+        while (cuda.Context.get_current() != None):
+            context = cuda.Context.get_current()
+            if (context.handle != self.cuda_context.handle):
+                self.logger.debug("<%s> Popping <%s> (*not* ours)", str(self.cuda_context.handle), str(context.handle))
+                other_contexts = [context] + other_contexts
+                cuda.Context.pop()
+            else:
+                self.logger.debug("<%s> Popping <%s> (ours)", str(self.cuda_context.handle), str(context.handle))
+                cuda.Context.pop()
+
+        # Add all the contexts we popped that were not our own
+        for context in other_contexts:
+            self.logger.debug("<%s> Pushing <%s>", str(self.cuda_context.handle), str(context.handle))
+            cuda.Context.push(context)
+            
+        self.logger.debug("<%s> Detaching", str(self.cuda_context.handle))
+        self.cuda_context.detach()
+        
+        
+    def __str__(self):
+        return "CudaContext id " + str(self.cuda_context.handle)
+        
+    
+    def hash_kernel(kernel_filename, include_dirs):        
+        # Generate a kernel ID for our caches
+        num_includes = 0
+        max_includes = 100
+        kernel_hasher = hashlib.md5()
+        logger = logging.getLogger(__name__)
+        
+        # Loop over file and includes, and check if something has changed
+        files = [kernel_filename]
+        while len(files):
+        
+            if (num_includes > max_includes):
+                raise("Maximum number of includes reached - circular include in {:}?".format(kernel_filename))
+        
+            filename = files.pop()
+            
+            #logger.debug("Hashing %s", filename)
+                
+            modified = os.path.getmtime(filename)
+                
+            # Open the file
+            with io.open(filename, "r") as file:
+            
+                # Search for #inclue <something> and also hash the file
+                file_str = file.read()
+                kernel_hasher.update(file_str.encode('utf-8'))
+                kernel_hasher.update(str(modified).encode('utf-8'))
+                
+                #Find all includes
+                includes = re.findall('^\W*#include\W+(.+?)\W*$', file_str, re.M)
+                
+            # Loop over everything that looks like an include
+            for include_file in includes:
+                
+                #Search through include directories for the file
+                file_path = os.path.dirname(filename)
+                for include_path in [file_path] + include_dirs:
+                
+                    # If we find it, add it to list of files to check
+                    temp_path = os.path.join(include_path, include_file)
+                    if (os.path.isfile(temp_path)):
+                        files = files + [temp_path]
+                        num_includes = num_includes + 1 #For circular includes...
+                        break
+            
+        return kernel_hasher.hexdigest()
+    
+    """
+    Reads a text file and creates an OpenCL kernel from that
+    """
+    def get_prepared_kernel(self, kernel_filename, kernel_function_name, \
+                    prepared_call_args, \
+                    include_dirs=[], no_extern_c=True, 
+                    **kwargs):
+        """
+        Helper function to print compilation output
+        """
+        def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
+            self.logger.debug("Compilation returned %s", str(compile_success_bool))
+            if info_str:
+                self.logger.debug("Info: %s", info_str)
+            if error_str:
+                self.logger.debug("Error: %s", error_str)
+        
+        kernel_filename = os.path.normpath(kernel_filename)
+        #self.logger.debug("Getting %s", kernel_filename)
+            
+        # Create a hash of the kernel (and its includes)
+        kwargs_hasher = hashlib.md5()
+        kwargs_hasher.update(str(kwargs).encode('utf-8'));
+        kwargs_hash = kwargs_hasher.hexdigest()
+        kwargs_hasher = None
+        root, ext = os.path.splitext(kernel_filename)
+        kernel_hash = root \
+                + "_" + CudaContext.hash_kernel( \
+                    os.path.join(self.module_path, kernel_filename), \
+                    include_dirs=[self.module_path] + include_dirs) \
+                + "_" + kwargs_hash \
+                + ext
+        cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)
+        
+        # If we have the kernel in our hashmap, return it
+        if (kernel_hash in self.kernels.keys()):
+            self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash)
+            return self.kernels[kernel_hash]
+        
+        # If we have it on disk, return it
+        elif (self.use_cache and os.path.isfile(cached_kernel_filename)):
+            self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash)
+                
+            with io.open(cached_kernel_filename, "rb") as file:
+                file_str = file.read()
+                module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler)
+                
+            kernel = module.get_function(kernel_function_name)
+            kernel.prepare(prepared_call_args)
+            self.kernels[kernel_hash] = kernel
+            return kernel
+            
+        # Otherwise, compile it from source
+        else:
+            self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash)
+                
+            #Create kernel string
+            kernel_string = ""
+            for key, value in kwargs.items():
+                kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
+            kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename))
+            if (self.use_cache):
+                cached_kernel_dir = os.path.dirname(cached_kernel_filename)
+                if not os.path.isdir(cached_kernel_dir):
+                    os.mkdir(cached_kernel_dir)
+                with io.open(cached_kernel_filename + ".txt", "w") as file:
+                    file.write(kernel_string)
+                
+            
+            with Common.Timer("compiler") as timer:
+                cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, no_extern_c=no_extern_c, cache_dir=False)
+                module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler)
+                if (self.use_cache):
+                    with io.open(cached_kernel_filename, "wb") as file:
+                        file.write(cubin)
+                
+            kernel = module.get_function(kernel_function_name)
+            kernel.prepare(prepared_call_args)
+            self.kernels[kernel_hash] = kernel
+            
+            
+            return kernel
+    
+    """
+    Clears the kernel cache (useful for debugging & development)
+    """
+    def clear_kernel_cache(self):
+        self.logger.debug("Clearing cache")
+        self.kernels = {}
+        gc.collect()
+        
+    """
+    Synchronizes all streams etc
+    """
+    def synchronize(self):
+        self.cuda_context.synchronize()
\ No newline at end of file
diff --git a/GPUSimulators/FORCE.py b/GPUSimulators/FORCE.py
index c384c11..989b842 100644
--- a/GPUSimulators/FORCE.py
+++ b/GPUSimulators/FORCE.py
@@ -66,7 +66,7 @@ class FORCE (Simulator.BaseSimulator):
             block_width, block_height);
 
         #Get kernels
-        self.kernel = context.get_prepared_kernel("FORCE_kernel.cu", "FORCEKernel", \
+        self.kernel = context.get_prepared_kernel("cuda/SWE_FORCE.cu", "FORCEKernel", \
                                         "iiffffPiPiPiPiPiPi", \
                                         BLOCK_WIDTH=self.local_size[0], \
                                         BLOCK_HEIGHT=self.local_size[1])
diff --git a/GPUSimulators/HLL.py b/GPUSimulators/HLL.py
index 54bb9f6..89d13ea 100644
--- a/GPUSimulators/HLL.py
+++ b/GPUSimulators/HLL.py
@@ -61,7 +61,7 @@ class HLL (Simulator.BaseSimulator):
             block_width, block_height);
 
         #Get kernels
-        self.kernel = context.get_prepared_kernel("HLL_kernel.cu", "HLLKernel", \
+        self.kernel = context.get_prepared_kernel("cuda/SWE_HLL.cu", "HLLKernel", \
                                         "iiffffPiPiPiPiPiPi", \
                                         BLOCK_WIDTH=self.local_size[0], \
                                         BLOCK_HEIGHT=self.local_size[1])
diff --git a/GPUSimulators/HLL2.py b/GPUSimulators/HLL2.py
index e16521f..db1b9e3 100644
--- a/GPUSimulators/HLL2.py
+++ b/GPUSimulators/HLL2.py
@@ -67,7 +67,7 @@ class HLL2 (Simulator.BaseSimulator):
         self.theta = np.float32(theta)
 
         #Get kernels
-        self.kernel = context.get_prepared_kernel("HLL2_kernel.cu", "HLL2Kernel", \
+        self.kernel = context.get_prepared_kernel("cuda/SWE_HLL2.cu", "HLL2Kernel", \
                                         "iifffffiPiPiPiPiPiPi", \
                                         BLOCK_WIDTH=self.local_size[0], \
                                         BLOCK_HEIGHT=self.local_size[1])
diff --git a/GPUSimulators/IPythonMagic.py b/GPUSimulators/IPythonMagic.py
index ce4c16f..35d515f 100644
--- a/GPUSimulators/IPythonMagic.py
+++ b/GPUSimulators/IPythonMagic.py
@@ -25,7 +25,7 @@ from IPython.core import magic_arguments
 from IPython.core.magic import line_magic, Magics, magics_class
 import pycuda.driver as cuda
 
-from GPUSimulators import Common
+from GPUSimulators import Common, CudaContext
 
 
 @magics_class
@@ -53,7 +53,7 @@ class MyIPythonMagic(Magics):
             self.logger.debug("Creating context")
             use_cache = False if args.no_cache else True
             use_autotuning = False if args.no_autotuning else True
-            self.shell.user_ns[args.name] = Common.CudaContext(blocking=args.blocking, use_cache=use_cache, autotuning=use_autotuning)
+            self.shell.user_ns[args.name] = CudaContext.CudaContext(blocking=args.blocking, use_cache=use_cache, autotuning=use_autotuning)
         
         # this function will be called on exceptions in any cell
         def custom_exc(shell, etype, evalue, tb, tb_offset=None):
diff --git a/GPUSimulators/KP07.py b/GPUSimulators/KP07.py
index 258b8bb..e26f58a 100644
--- a/GPUSimulators/KP07.py
+++ b/GPUSimulators/KP07.py
@@ -68,7 +68,7 @@ class KP07 (Simulator.BaseSimulator):
         self.theta = np.float32(theta)
 
         #Get kernels
-        self.kernel = context.get_prepared_kernel("KP07_kernel.cu", "KP07Kernel", \
+        self.kernel = context.get_prepared_kernel("cuda/SWE_KP07.cu", "KP07Kernel", \
                                         "iifffffiPiPiPiPiPiPi", \
                                         BLOCK_WIDTH=self.local_size[0], \
                                         BLOCK_HEIGHT=self.local_size[1])
diff --git a/GPUSimulators/KP07_dimsplit.py b/GPUSimulators/KP07_dimsplit.py
index eb715df..20556fb 100644
--- a/GPUSimulators/KP07_dimsplit.py
+++ b/GPUSimulators/KP07_dimsplit.py
@@ -68,7 +68,7 @@ class KP07_dimsplit (Simulator.BaseSimulator):
         self.theta = np.float32(theta)
 
         #Get kernels
-        self.kernel = context.get_prepared_kernel("KP07_dimsplit_kernel.cu", "KP07DimsplitKernel", \
+        self.kernel = context.get_prepared_kernel("cuda/SWE_KP07_dimsplit.cu", "KP07DimsplitKernel", \
                                         "iifffffiPiPiPiPiPiPi", \
                                         BLOCK_WIDTH=self.local_size[0], \
                                         BLOCK_HEIGHT=self.local_size[1])
diff --git a/GPUSimulators/LxF.py b/GPUSimulators/LxF.py
index c75184c..1f1019c 100644
--- a/GPUSimulators/LxF.py
+++ b/GPUSimulators/LxF.py
@@ -21,7 +21,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 """
 
 #Import packages we need
-from GPUSimulators import Simulator, Common
+from GPUSimulators import Simulator, Common, CudaContext
 
 
 
@@ -62,7 +62,7 @@ class LxF (Simulator.BaseSimulator):
             block_width, block_height);
 
         # Get kernels
-        self.kernel = context.get_prepared_kernel("LxF_kernel.cu", "LxFKernel", \
+        self.kernel = context.get_prepared_kernel("cuda/SWE_LxF.cu", "LxFKernel", \
                                         "iiffffPiPiPiPiPiPi", \
                                         BLOCK_WIDTH=self.local_size[0], \
                                         BLOCK_HEIGHT=self.local_size[1])
diff --git a/GPUSimulators/SWECommon.cu b/GPUSimulators/SWECommon.cu
deleted file mode 100644
index 389a062..0000000
--- a/GPUSimulators/SWECommon.cu
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
-This OpenCL kernel implements the Kurganov-Petrova numerical scheme 
-for the shallow water equations, described in 
-A. Kurganov & Guergana Petrova
-A Second-Order Well-Balanced Positivity Preserving Central-Upwind
-Scheme for the Saint-Venant System Communications in Mathematical
-Sciences, 5 (2007), 133-160. 
-
-Copyright (C) 2016  SINTEF ICT
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-
-
-
-
-
-
-
-
-__device__ float3 F_func(const float3 Q, const float g) {
-    float3 F;
-
-    F.x = Q.y;                              //hu
-    F.y = Q.y*Q.y / Q.x + 0.5f*g*Q.x*Q.x;   //hu*hu/h + 0.5f*g*h*h;
-    F.z = Q.y*Q.z / Q.x;                    //hu*hv/h;
-
-    return F;
-}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/GPUSimulators/WAF.py b/GPUSimulators/WAF.py
index 7f5959a..f6768b6 100644
--- a/GPUSimulators/WAF.py
+++ b/GPUSimulators/WAF.py
@@ -61,7 +61,7 @@ class WAF (Simulator.BaseSimulator):
             block_width, block_height);
 
         #Get kernels
-        self.kernel = context.get_prepared_kernel("WAF_kernel.cu", "WAFKernel", \
+        self.kernel = context.get_prepared_kernel("cuda/SWE_WAF.cu", "WAFKernel", \
                                         "iiffffiPiPiPiPiPiPi", \
                                         BLOCK_WIDTH=self.local_size[0], \
                                         BLOCK_HEIGHT=self.local_size[1])
diff --git a/GPUSimulators/EulerCommon.cu b/GPUSimulators/cuda/EulerCommon.h
similarity index 99%
rename from GPUSimulators/EulerCommon.cu
rename to GPUSimulators/cuda/EulerCommon.h
index eaaffe9..a8442a0 100644
--- a/GPUSimulators/EulerCommon.cu
+++ b/GPUSimulators/cuda/EulerCommon.h
@@ -22,6 +22,7 @@ You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
+#pragma once
 
 
 
diff --git a/GPUSimulators/cuda/SWECommon.h b/GPUSimulators/cuda/SWECommon.h
new file mode 100644
index 0000000..233e9f0
--- /dev/null
+++ b/GPUSimulators/cuda/SWECommon.h
@@ -0,0 +1,474 @@
+/*
+This OpenCL kernel implements the Kurganov-Petrova numerical scheme 
+for the shallow water equations, described in 
+A. Kurganov & Guergana Petrova
+A Second-Order Well-Balanced Positivity Preserving Central-Upwind
+Scheme for the Saint-Venant System Communications in Mathematical
+Sciences, 5 (2007), 133-160. 
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+#include "limiters.h"
+
+
+
+
+
+
+__device__ float3 F_func(const float3 Q, const float g) {
+    float3 F;
+
+    F.x = Q.y;                              //hu
+    F.y = Q.y*Q.y / Q.x + 0.5f*g*Q.x*Q.x;   //hu*hu/h + 0.5f*g*h*h;
+    F.z = Q.y*Q.z / Q.x;                    //hu*hv/h;
+
+    return F;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+/**
+  * Superbee flux limiter for WAF.
+  * Related to superbee limiter so that WAF_superbee(r, c) = 1 - (1-|c|)*superbee(r)
+  * @param r_ the ratio of upwind change (see Toro 2001, p. 203/204)
+  * @param c_ the courant number for wave k, dt*S_k/dx
+  */
+__device__ float WAF_superbee(float r_, float c_) {
+    // r <= 0.0
+    if (r_ <= 0.0f) { 
+        return 1.0f;
+    }
+    // 0.0 <= r <= 1/2
+    else if (r_ <= 0.5f) { 
+        return 1.0f - 2.0f*(1.0f - fabsf(c_))*r_;
+    }
+    // 1/2 <= r <= 1
+    else if (r_ <= 1.0f) {
+        return fabs(c_);
+    }
+    // 1 <= r <= 2
+    else  if (r_ <= 2.0f) {
+        return 1.0f - (1.0f - fabsf(c_))*r_;
+    }
+    // r >= 2
+    else {
+        return 2.0f*fabsf(c_) - 1.0f;
+    }
+}
+
+
+
+
+__device__ float WAF_albada(float r_, float c_) {
+    if (r_ <= 0.0f) {
+        return 1.0f;
+    }
+    else {
+        return 1.0f - (1.0f - fabsf(c_)) * r_ * (1.0f + r_) / (1.0f + r_*r_);
+    }
+}
+
+__device__ float WAF_minbee(float r_, float c_) {
+    r_ = fmaxf(-1.0f, fminf(2.0f, r_));
+    if (r_ <= 0.0f) {
+        return 1.0f;
+    }
+    if (r_ >= 0.0f && r_ <= 1.0f) {
+        return 1.0f - (1.0f - fabsf(c_)) * r_;
+    }
+    else {
+        return fabsf(c_);
+    }
+}
+
+__device__ float WAF_minmod(float r_, float c_) {
+    return 1.0f - (1.0f - fabsf(c_)) * fmaxf(0.0f, fminf(1.0f, r_));
+}
+
+__device__ float limiterToWAFLimiter(float r_, float c_) {
+    return 1.0f - (1.0f - fabsf(c_))*r_;
+}
+
+// Compute h in the "star region", h^dagger
+__device__ __inline__ float computeHStar(float h_l, float h_r, float u_l, float u_r, float c_l, float c_r, float g_) {
+    
+    //This estimate for the h* gives rise to spurious oscillations. 
+    //return 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
+    
+    const float h_tmp = 0.5f * (c_l + c_r) + 0.25f * (u_l - u_r);
+    return h_tmp*h_tmp / g_;
+}
+
+/**
+  * Weighted average flux (Toro 2001, p 200) for interface {i+1/2}
+  * @param r_ The flux limiter parameter (see Toro 2001, p. 203)
+  * @param Q_l2 Q_{i-1}
+  * @param Q_l1 Q_{i}
+  * @param Q_r1 Q_{i+1}
+  * @param Q_r2 Q_{i+2}
+  */
+__device__ float3 WAF_1D_flux(const float3 Q_l2, const float3 Q_l1, const float3 Q_r1, const float3 Q_r2, const float g_, const float dx_, const float dt_) {     
+    const float h_l = Q_l1.x;
+    const float h_r = Q_r1.x;
+    
+    const float h_l2 = Q_l2.x;
+    const float h_r2 = Q_r2.x;
+    
+    // Calculate velocities
+    const float u_l = Q_l1.y / h_l;
+    const float u_r = Q_r1.y / h_r;
+    
+    const float u_l2 = Q_l2.y / h_l2;
+    const float u_r2 = Q_r2.y / h_r2;
+    
+    const float v_l = Q_l1.z / h_l;
+    const float v_r = Q_r1.z / h_r;
+    
+    const float v_l2 = Q_l2.z / h_l2;
+    const float v_r2 = Q_r2.z / h_r2;
+    
+    // Estimate the potential wave speeds
+    const float c_l = sqrt(g_*h_l);
+    const float c_r = sqrt(g_*h_r);
+    
+    const float c_l2 = sqrt(g_*h_l2);
+    const float c_r2 = sqrt(g_*h_r2);
+    
+    // Compute h in the "star region", h^dagger
+    const float h_dag_l = computeHStar(h_l2,  h_l, u_l2,  u_l, c_l2,  c_l, g_);
+    const float h_dag   = computeHStar( h_l,  h_r,  u_l,  u_r,  c_l,  c_r, g_);
+    const float h_dag_r = computeHStar( h_r, h_r2,  u_r, u_r2,  c_r, c_r2, g_);
+    
+    const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag ) ) / h_l;
+    const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag ) ) / h_r;
+    
+    const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
+    const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
+    
+    // Compute wave speed estimates
+    const float S_l = u_l - c_l*q_l; 
+    const float S_r = u_r + c_r*q_r;
+    const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) );
+    
+    const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1.0, S_star, v_l);
+    const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1.0, S_star, v_r);
+    
+    // Estimate the fluxes in the four regions
+    const float3 F_1 = F_func(Q_l1, g_);
+    const float3 F_4 = F_func(Q_r1, g_);
+    
+    const float3 F_2 = F_1 + S_l*(Q_star_l - Q_l1);
+    const float3 F_3 = F_4 + S_r*(Q_star_r - Q_r1);
+    //const float3 F_2 = F_func(Q_star_l, g_);
+    //const float3 F_3 = F_func(Q_star_r, g_);
+    
+    // Compute the courant numbers for the waves
+    const float c_1 = S_l * dt_ / dx_;
+    const float c_2 = S_star * dt_ / dx_;
+    const float c_3 = S_r * dt_ / dx_;
+    
+    // Compute the "upwind change" vectors for the i-3/2 and i+3/2 interfaces
+    const float eps = 1.0e-6f;
+    const float r_1 = desingularize( (c_1 > 0.0f) ? (h_dag_l - h_l2) : (h_dag_r - h_r), eps) / desingularize((h_dag - h_l), eps);
+    const float r_2 = desingularize( (c_2 > 0.0f) ? (v_l - v_l2) : (v_r2 - v_r), eps ) / desingularize((v_r - v_l), eps);
+    const float r_3 = desingularize( (c_3 > 0.0f) ? (h_l - h_dag_l) : (h_r2 - h_dag_r), eps ) / desingularize((h_r - h_dag), eps);
+        
+    // Compute the limiter
+    // We use h for the nonlinear waves, and v for the middle shear wave 
+    const float A_1 = copysign(1.0f, c_1) * limiterToWAFLimiter(generalized_minmod(r_1, 1.9f), c_1);
+    const float A_2 = copysign(1.0f, c_2) * limiterToWAFLimiter(generalized_minmod(r_2, 1.9f), c_2); 
+    const float A_3 = copysign(1.0f, c_3) * limiterToWAFLimiter(generalized_minmod(r_3, 1.9f), c_3);
+    
+    //Average the fluxes
+    const float3 flux = 0.5f*( F_1 + F_4 )
+                      - 0.5f*( A_1 * (F_2 - F_1)
+                             + A_2 * (F_3 - F_2)
+                             + A_3 * (F_4 - F_3) );
+
+    return flux;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+/**
+  * Central upwind flux function
+  */
+__device__ float3 CentralUpwindFlux(const float3 Qm, float3 Qp, const float g) {
+    const float3 Fp = F_func(Qp, g);
+    const float up = Qp.y / Qp.x;   // hu / h
+    const float cp = sqrt(g*Qp.x); // sqrt(g*h)
+
+    const float3 Fm = F_func(Qm, g);
+    const float um = Qm.y / Qm.x;   // hu / h
+    const float cm = sqrt(g*Qm.x); // sqrt(g*h)
+    
+    const float am = min(min(um-cm, up-cp), 0.0f); // largest negative wave speed
+    const float ap = max(max(um+cm, up+cp), 0.0f); // largest positive wave speed
+    
+    return ((ap*Fm - am*Fp) + ap*am*(Qp-Qm))/(ap-am);
+}
+
+
+
+
+
+
+
+
+
+/**
+  * Godunovs centered scheme (Toro 2001, p 165)
+  */
+__device__ float3 GodC_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+    const float3 F_l = F_func(Q_l, g_);
+    const float3 F_r = F_func(Q_r, g_);
+    
+    const float3 Q_godc = 0.5f*(Q_l + Q_r) + (dt_/dx_)*(F_l - F_r);
+    
+    return F_func(Q_godc, g_);
+}
+    
+
+
+    
+    
+
+
+
+
+/**
+  * Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 180)
+  */
+__device__ float3 HLL_flux(const float3 Q_l, const float3 Q_r, const float g_) {    
+    const float h_l = Q_l.x;
+    const float h_r = Q_r.x;
+    
+    // Calculate velocities
+    const float u_l = Q_l.y / h_l;
+    const float u_r = Q_r.y / h_r;
+    
+    // Estimate the potential wave speeds
+    const float c_l = sqrt(g_*h_l);
+    const float c_r = sqrt(g_*h_r);
+    
+    // Compute h in the "star region", h^dagger
+    const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
+    
+    const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) );
+    const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) );
+    
+    const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
+    const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
+    
+    // Compute wave speed estimates
+    const float S_l = u_l - c_l*q_l;
+    const float S_r = u_r + c_r*q_r;
+    
+    //Upwind selection
+    if (S_l >= 0.0f) {
+        return F_func(Q_l, g_);
+    }
+    else if (S_r <= 0.0f) {
+        return F_func(Q_r, g_);
+    }
+    //Or estimate flux in the star region
+    else {
+        const float3 F_l = F_func(Q_l, g_);
+        const float3 F_r = F_func(Q_r, g_);
+        const float3 flux = (S_r*F_l - S_l*F_r + S_r*S_l*(Q_r - Q_l)) / (S_r-S_l);
+        return flux;
+    }
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+/**
+  * Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 181)
+  */
+__device__ float3 HLLC_flux(const float3 Q_l, const float3 Q_r, const float g_) {    
+    const float h_l = Q_l.x;
+    const float h_r = Q_r.x;
+    
+    // Calculate velocities
+    const float u_l = Q_l.y / h_l;
+    const float u_r = Q_r.y / h_r;
+    
+    // Estimate the potential wave speeds
+    const float c_l = sqrt(g_*h_l);
+    const float c_r = sqrt(g_*h_r);
+    
+    // Compute h in the "star region", h^dagger
+    const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
+    
+    const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) );
+    const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) );
+    
+    const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
+    const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
+    
+    // Compute wave speed estimates
+    const float S_l = u_l - c_l*q_l;
+    const float S_r = u_r + c_r*q_r;
+    const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) );
+    
+    const float3 F_l = F_func(Q_l, g_);
+    const float3 F_r = F_func(Q_r, g_);
+    
+    //Upwind selection
+    if (S_l >= 0.0f) {
+        return F_l;
+    }
+    else if (S_r <= 0.0f) {
+        return F_r;
+    }
+    //Or estimate flux in the "left star" region
+    else if (S_l <= 0.0f && 0.0f <=S_star) {
+        const float v_l = Q_l.z / h_l;
+        const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1, S_star, v_l);
+        const float3 flux = F_l + S_l*(Q_star_l - Q_l);
+        return flux;
+    }
+    //Or estimate flux in the "righ star" region
+    else if (S_star <= 0.0f && 0.0f <=S_r) {
+        const float v_r = Q_r.z / h_r;
+        const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1, S_star, v_r);
+        const float3 flux = F_r + S_r*(Q_star_r - Q_r);
+        return flux;
+    }
+    else {
+        return make_float3(-99999.9f, -99999.9f, -99999.9f); //Something wrong here
+    }
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+/**
+  * Lax-Friedrichs flux (Toro 2001, p 163)
+  */
+__device__ float3 LxF_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+    const float3 F_l = F_func(Q_l, g_);
+    const float3 F_r = F_func(Q_r, g_);
+    
+    return 0.5f*(F_l + F_r) + (dx_/(2.0f*dt_))*(Q_l - Q_r);
+}
+
+
+
+
+
+
+
+
+/**
+  * Lax-Friedrichs extended to 2D
+  */
+__device__ float3 LxF_2D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+    const float3 F_l = F_func(Q_l, g_);
+    const float3 F_r = F_func(Q_r, g_);
+    
+    //Note numerical diffusion for 2D here (0.25)
+    return 0.5f*(F_l + F_r) + (dx_/(4.0f*dt_))*(Q_l - Q_r);
+}
+
+
+
+
+
+
+
+
+
+
+
+
+/**
+  * Richtmeyer / Two-step Lax-Wendroff flux (Toro 2001, p 164)
+  */
+__device__ float3 LxW2_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+    const float3 F_l = F_func(Q_l, g_);
+    const float3 F_r = F_func(Q_r, g_);
+    
+    const float3 Q_lw2 = 0.5f*(Q_l + Q_r) + (dt_/(2.0f*dx_))*(F_l - F_r);
+    
+    return F_func(Q_lw2, g_);
+}
+
+
+
+
+
+
+
+
+
+
+ 
+    
+/**
+  * First Ordered Centered (Toro 2001, p.163)
+  */
+__device__ float3 FORCE_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+    const float3 F_lf = LxF_1D_flux(Q_l, Q_r, g_, dx_, dt_);
+    const float3 F_lw2 = LxW2_1D_flux(Q_l, Q_r, g_, dx_, dt_);
+    return 0.5f*(F_lf + F_lw2);
+}
diff --git a/GPUSimulators/FORCE_kernel.cu b/GPUSimulators/cuda/SWE_FORCE.cu
similarity index 98%
rename from GPUSimulators/FORCE_kernel.cu
rename to GPUSimulators/cuda/SWE_FORCE.cu
index bedb0d7..002503d 100644
--- a/GPUSimulators/FORCE_kernel.cu
+++ b/GPUSimulators/cuda/SWE_FORCE.cu
@@ -19,9 +19,8 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 
-#include "common.cu"
-#include "SWECommon.cu"
-#include "fluxes/FirstOrderCentered.cu"
+#include "common.h"
+#include "SWECommon.h"
 
 
 /**
diff --git a/GPUSimulators/HLL_kernel.cu b/GPUSimulators/cuda/SWE_HLL.cu
similarity index 98%
rename from GPUSimulators/HLL_kernel.cu
rename to GPUSimulators/cuda/SWE_HLL.cu
index a86963b..def3d04 100644
--- a/GPUSimulators/HLL_kernel.cu
+++ b/GPUSimulators/cuda/SWE_HLL.cu
@@ -19,9 +19,8 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 
 
-#include "common.cu"
-#include "SWECommon.cu"
-#include "fluxes/HartenLaxVanLeer.cu"
+#include "common.h"
+#include "SWECommon.h"
 
 
 
diff --git a/GPUSimulators/HLL2_kernel.cu b/GPUSimulators/cuda/SWE_HLL2.cu
similarity index 98%
rename from GPUSimulators/HLL2_kernel.cu
rename to GPUSimulators/cuda/SWE_HLL2.cu
index 7909a57..d51233f 100644
--- a/GPUSimulators/HLL2_kernel.cu
+++ b/GPUSimulators/cuda/SWE_HLL2.cu
@@ -18,10 +18,9 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 
-#include "common.cu"
-#include "SWECommon.cu"
-#include "limiters.cu"
-#include "fluxes/HartenLaxVanLeer.cu"
+#include "common.h"
+#include "SWECommon.h"
+#include "limiters.h"
 
 
 
diff --git a/GPUSimulators/KP07_kernel.cu b/GPUSimulators/cuda/SWE_KP07.cu
similarity index 98%
rename from GPUSimulators/KP07_kernel.cu
rename to GPUSimulators/cuda/SWE_KP07.cu
index dbd1964..41ffc54 100644
--- a/GPUSimulators/KP07_kernel.cu
+++ b/GPUSimulators/cuda/SWE_KP07.cu
@@ -24,10 +24,9 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 
 
-#include "common.cu"
-#include "SWECommon.cu"
-#include "limiters.cu"
-#include "fluxes/CentralUpwind.cu"
+#include "common.h"
+#include "SWECommon.h"
+#include "limiters.h"
 
 
 __device__
diff --git a/GPUSimulators/KP07_dimsplit_kernel.cu b/GPUSimulators/cuda/SWE_KP07_dimsplit.cu
similarity index 98%
rename from GPUSimulators/KP07_dimsplit_kernel.cu
rename to GPUSimulators/cuda/SWE_KP07_dimsplit.cu
index 7d9fd9d..a350bcb 100644
--- a/GPUSimulators/KP07_dimsplit_kernel.cu
+++ b/GPUSimulators/cuda/SWE_KP07_dimsplit.cu
@@ -24,10 +24,9 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 
 
-#include "common.cu"
-#include "SWECommon.cu"
-#include "limiters.cu"
-#include "fluxes/CentralUpwind.cu"
+#include "common.h"
+#include "SWECommon.h"
+#include "limiters.h"
 
 
 __device__
diff --git a/GPUSimulators/LxF_kernel.cu b/GPUSimulators/cuda/SWE_LxF.cu
similarity index 98%
rename from GPUSimulators/LxF_kernel.cu
rename to GPUSimulators/cuda/SWE_LxF.cu
index ff094ad..f9cbcae 100644
--- a/GPUSimulators/LxF_kernel.cu
+++ b/GPUSimulators/cuda/SWE_LxF.cu
@@ -19,9 +19,8 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 
-#include "Common.cu"
-#include "SWECommon.cu"
-#include "fluxes/LaxFriedrichs.cu"
+#include "common.h"
+#include "SWECommon.h"
 
 
 /**
diff --git a/GPUSimulators/WAF_kernel.cu b/GPUSimulators/cuda/SWE_WAF.cu
similarity index 98%
rename from GPUSimulators/WAF_kernel.cu
rename to GPUSimulators/cuda/SWE_WAF.cu
index b35e5dd..b56af1d 100644
--- a/GPUSimulators/WAF_kernel.cu
+++ b/GPUSimulators/cuda/SWE_WAF.cu
@@ -24,9 +24,8 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 
 
-#include "common.cu"
-#include "SWECommon.cu"
-#include "fluxes/WeightedAverageFlux.cu"
+#include "common.h"
+#include "SWECommon.h"
 
 
 
diff --git a/GPUSimulators/common.cu b/GPUSimulators/cuda/common.h
similarity index 91%
rename from GPUSimulators/common.cu
rename to GPUSimulators/cuda/common.h
index 2f3f057..c0af6f5 100644
--- a/GPUSimulators/common.cu
+++ b/GPUSimulators/cuda/common.h
@@ -22,6 +22,7 @@ You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
+#pragma once
 
 
 /**
@@ -50,34 +51,17 @@ inline __device__ __host__ float clamp(const float f, const float a, const float
 
 
 
-/**
-  * Reads a block of data  with one ghost cell for the shallow water equations
-  */
-template<int sm_width, int sm_height, int block_width, int block_height>
-inline __device__ void readBlock(float* ptr_, int pitch_,
-                float (&shmem)[sm_height][sm_width], 
-                const int max_x_, const int max_y_) {
-    
-    //Index of block within domain
-    const int bx = blockDim.x * blockIdx.x;
-    const int by = blockDim.y * blockIdx.y;
-    
-    //Read into shared memory
-    for (int j=threadIdx.y; j<sm_height; j+=block_height) {
-        const int l = clamp(by + j, 0, max_y_-1); // Clamp out of bounds
-        
-        //Compute the pointer to current row in the arrays
-        const float* const row  = (float*) ((char*) ptr_  + pitch_*l);
-        
-        for (int i=threadIdx.x; i<sm_width; i+=block_width) {
-            const int k = clamp(bx + i, 0, max_x_-1); // Clamp out of bounds
 
-            shmem[j][i] = row[k];
-        }
-    }
+__device__ float desingularize(float x_, float eps_) {
+    return copysign(1.0f, x_)*fmaxf(fabsf(x_), fminf(x_*x_/(2.0f*eps_)+0.5f*eps_, eps_));
 }
 
 
+
+
+
+
+
 /**
   * Reads a block of data with ghost cells
   */
diff --git a/GPUSimulators/limiters.cu b/GPUSimulators/cuda/limiters.h
similarity index 99%
rename from GPUSimulators/limiters.cu
rename to GPUSimulators/cuda/limiters.h
index e34d21f..35a0e86 100644
--- a/GPUSimulators/limiters.cu
+++ b/GPUSimulators/cuda/limiters.h
@@ -17,7 +17,7 @@ You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-
+#pragma once
 
 
 
diff --git a/GPUSimulators/fluxes/CentralUpwind.cu b/GPUSimulators/fluxes/CentralUpwind.cu
deleted file mode 100644
index e20d480..0000000
--- a/GPUSimulators/fluxes/CentralUpwind.cu
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
-This file implements the Central upwind flux
-
-Copyright (C) 2016, 2017, 2018 SINTEF ICT
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-
-
-/**
-  * Central upwind flux function
-  */
-__device__ float3 CentralUpwindFlux(const float3 Qm, float3 Qp, const float g) {
-    const float3 Fp = F_func(Qp, g);
-    const float up = Qp.y / Qp.x;   // hu / h
-    const float cp = sqrt(g*Qp.x); // sqrt(g*h)
-
-    const float3 Fm = F_func(Qm, g);
-    const float um = Qm.y / Qm.x;   // hu / h
-    const float cm = sqrt(g*Qm.x); // sqrt(g*h)
-    
-    const float am = min(min(um-cm, up-cp), 0.0f); // largest negative wave speed
-    const float ap = max(max(um+cm, up+cp), 0.0f); // largest positive wave speed
-    
-    return ((ap*Fm - am*Fp) + ap*am*(Qp-Qm))/(ap-am);
-}
-
diff --git a/GPUSimulators/fluxes/FirstOrderCentered.cu b/GPUSimulators/fluxes/FirstOrderCentered.cu
deleted file mode 100644
index 1c34760..0000000
--- a/GPUSimulators/fluxes/FirstOrderCentered.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
-This file implements the First ORder CEntered flux
-
-Copyright (C) 2016, 2017, 2018 SINTEF ICT
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include "./LaxFriedrichs.cu"
-#include "./LaxWendroff.cu"
-
-
-    
-    
-/**
-  * First Ordered Centered (Toro 2001, p.163)
-  */
-__device__ float3 FORCE_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
-    const float3 F_lf = LxF_1D_flux(Q_l, Q_r, g_, dx_, dt_);
-    const float3 F_lw2 = LxW2_1D_flux(Q_l, Q_r, g_, dx_, dt_);
-    return 0.5f*(F_lf + F_lw2);
-}
diff --git a/GPUSimulators/fluxes/Godunov.cu b/GPUSimulators/fluxes/Godunov.cu
deleted file mode 100644
index 5d1352a..0000000
--- a/GPUSimulators/fluxes/Godunov.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
-This file implements the Godunov flux
-
-Copyright (C) 2016, 2017, 2018 SINTEF ICT
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-
-
-
-/**
-  * Godunovs centered scheme (Toro 2001, p 165)
-  */
-__device__ float3 GodC_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
-    const float3 F_l = F_func(Q_l, g_);
-    const float3 F_r = F_func(Q_r, g_);
-    
-    const float3 Q_godc = 0.5f*(Q_l + Q_r) + (dt_/dx_)*(F_l - F_r);
-    
-    return F_func(Q_godc, g_);
-}
-    
-
-
diff --git a/GPUSimulators/fluxes/HartenLaxVanLeer.cu b/GPUSimulators/fluxes/HartenLaxVanLeer.cu
deleted file mode 100644
index e561781..0000000
--- a/GPUSimulators/fluxes/HartenLaxVanLeer.cu
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
-This file implements the Harten-Lax-van Leer flux
-
-Copyright (C) 2016, 2017, 2018 SINTEF ICT
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-
-/**
-  * Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 180)
-  */
-__device__ float3 HLL_flux(const float3 Q_l, const float3 Q_r, const float g_) {    
-    const float h_l = Q_l.x;
-    const float h_r = Q_r.x;
-    
-    // Calculate velocities
-    const float u_l = Q_l.y / h_l;
-    const float u_r = Q_r.y / h_r;
-    
-    // Estimate the potential wave speeds
-    const float c_l = sqrt(g_*h_l);
-    const float c_r = sqrt(g_*h_r);
-    
-    // Compute h in the "star region", h^dagger
-    const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
-    
-    const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) );
-    const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) );
-    
-    const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
-    const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
-    
-    // Compute wave speed estimates
-    const float S_l = u_l - c_l*q_l;
-    const float S_r = u_r + c_r*q_r;
-    
-    //Upwind selection
-    if (S_l >= 0.0f) {
-        return F_func(Q_l, g_);
-    }
-    else if (S_r <= 0.0f) {
-        return F_func(Q_r, g_);
-    }
-    //Or estimate flux in the star region
-    else {
-        const float3 F_l = F_func(Q_l, g_);
-        const float3 F_r = F_func(Q_r, g_);
-        const float3 flux = (S_r*F_l - S_l*F_r + S_r*S_l*(Q_r - Q_l)) / (S_r-S_l);
-        return flux;
-    }
-}
diff --git a/GPUSimulators/fluxes/HartenLaxVanLeerContact.cu b/GPUSimulators/fluxes/HartenLaxVanLeerContact.cu
deleted file mode 100644
index 9dc870e..0000000
--- a/GPUSimulators/fluxes/HartenLaxVanLeerContact.cu
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
-This file implements the Harten-Lax-van Leer flux with contact discontinuity
-
-Copyright (C) 2016, 2017, 2018 SINTEF ICT
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-
-
-
-/**
-  * Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 181)
-  */
-__device__ float3 HLLC_flux(const float3 Q_l, const float3 Q_r, const float g_) {    
-    const float h_l = Q_l.x;
-    const float h_r = Q_r.x;
-    
-    // Calculate velocities
-    const float u_l = Q_l.y / h_l;
-    const float u_r = Q_r.y / h_r;
-    
-    // Estimate the potential wave speeds
-    const float c_l = sqrt(g_*h_l);
-    const float c_r = sqrt(g_*h_r);
-    
-    // Compute h in the "star region", h^dagger
-    const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
-    
-    const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) );
-    const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) );
-    
-    const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
-    const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
-    
-    // Compute wave speed estimates
-    const float S_l = u_l - c_l*q_l;
-    const float S_r = u_r + c_r*q_r;
-    const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) );
-    
-    const float3 F_l = F_func(Q_l, g_);
-    const float3 F_r = F_func(Q_r, g_);
-    
-    //Upwind selection
-    if (S_l >= 0.0f) {
-        return F_l;
-    }
-    else if (S_r <= 0.0f) {
-        return F_r;
-    }
-    //Or estimate flux in the "left star" region
-    else if (S_l <= 0.0f && 0.0f <=S_star) {
-        const float v_l = Q_l.z / h_l;
-        const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1, S_star, v_l);
-        const float3 flux = F_l + S_l*(Q_star_l - Q_l);
-        return flux;
-    }
-    //Or estimate flux in the "righ star" region
-    else if (S_star <= 0.0f && 0.0f <=S_r) {
-        const float v_r = Q_r.z / h_r;
-        const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1, S_star, v_r);
-        const float3 flux = F_r + S_r*(Q_star_r - Q_r);
-        return flux;
-    }
-    else {
-        return make_float3(-99999.9f, -99999.9f, -99999.9f); //Something wrong here
-    }
-}
-
diff --git a/GPUSimulators/fluxes/LaxFriedrichs.cu b/GPUSimulators/fluxes/LaxFriedrichs.cu
deleted file mode 100644
index 5759b2d..0000000
--- a/GPUSimulators/fluxes/LaxFriedrichs.cu
+++ /dev/null
@@ -1,24 +0,0 @@
-
-
-/**
-  * Lax-Friedrichs flux (Toro 2001, p 163)
-  */
-__device__ float3 LxF_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
-    const float3 F_l = F_func(Q_l, g_);
-    const float3 F_r = F_func(Q_r, g_);
-    
-    return 0.5f*(F_l + F_r) + (dx_/(2.0f*dt_))*(Q_l - Q_r);
-}
-
-
-
-/**
-  * Lax-Friedrichs extended to 2D
-  */
-__device__ float3 LxF_2D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
-    const float3 F_l = F_func(Q_l, g_);
-    const float3 F_r = F_func(Q_r, g_);
-    
-    //Note numerical diffusion for 2D here (0.25)
-    return 0.5f*(F_l + F_r) + (dx_/(4.0f*dt_))*(Q_l - Q_r);
-}
diff --git a/GPUSimulators/fluxes/LaxWendroff.cu b/GPUSimulators/fluxes/LaxWendroff.cu
deleted file mode 100644
index 839763b..0000000
--- a/GPUSimulators/fluxes/LaxWendroff.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
-This file implements the Lax-Wendroff flux
-
-Copyright (C) 2016, 2017, 2018 SINTEF ICT
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-
-
-/**
-  * Richtmeyer / Two-step Lax-Wendroff flux (Toro 2001, p 164)
-  */
-__device__ float3 LxW2_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
-    const float3 F_l = F_func(Q_l, g_);
-    const float3 F_r = F_func(Q_r, g_);
-    
-    const float3 Q_lw2 = 0.5f*(Q_l + Q_r) + (dt_/(2.0f*dx_))*(F_l - F_r);
-    
-    return F_func(Q_lw2, g_);
-}
-
diff --git a/GPUSimulators/fluxes/WeightedAverageFlux.cu b/GPUSimulators/fluxes/WeightedAverageFlux.cu
deleted file mode 100644
index 0574c32..0000000
--- a/GPUSimulators/fluxes/WeightedAverageFlux.cu
+++ /dev/null
@@ -1,187 +0,0 @@
-/*
-This file implements the Weighted Average Flux
-
-Copyright (C) 2016, 2017, 2018 SINTEF ICT
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include "limiters.cu"
-
-/**
-  * Superbee flux limiter for WAF.
-  * Related to superbee limiter so that WAF_superbee(r, c) = 1 - (1-|c|)*superbee(r)
-  * @param r_ the ratio of upwind change (see Toro 2001, p. 203/204)
-  * @param c_ the courant number for wave k, dt*S_k/dx
-  */
-__device__ float WAF_superbee(float r_, float c_) {
-    // r <= 0.0
-    if (r_ <= 0.0f) { 
-        return 1.0f;
-    }
-    // 0.0 <= r <= 1/2
-    else if (r_ <= 0.5f) { 
-        return 1.0f - 2.0f*(1.0f - fabsf(c_))*r_;
-    }
-    // 1/2 <= r <= 1
-    else if (r_ <= 1.0f) {
-        return fabs(c_);
-    }
-    // 1 <= r <= 2
-    else  if (r_ <= 2.0f) {
-        return 1.0f - (1.0f - fabsf(c_))*r_;
-    }
-    // r >= 2
-    else {
-        return 2.0f*fabsf(c_) - 1.0f;
-    }
-}
-
-
-
-
-__device__ float WAF_albada(float r_, float c_) {
-    if (r_ <= 0.0f) {
-        return 1.0f;
-    }
-    else {
-        return 1.0f - (1.0f - fabsf(c_)) * r_ * (1.0f + r_) / (1.0f + r_*r_);
-    }
-}
-
-__device__ float WAF_minbee(float r_, float c_) {
-    r_ = fmaxf(-1.0f, fminf(2.0f, r_));
-    if (r_ <= 0.0f) {
-        return 1.0f;
-    }
-    if (r_ >= 0.0f && r_ <= 1.0f) {
-        return 1.0f - (1.0f - fabsf(c_)) * r_;
-    }
-    else {
-        return fabsf(c_);
-    }
-}
-
-__device__ float WAF_minmod(float r_, float c_) {
-    return 1.0f - (1.0f - fabsf(c_)) * fmaxf(0.0f, fminf(1.0f, r_));
-}
-
-
-
-__device__ float limiterToWAFLimiter(float r_, float c_) {
-    return 1.0f - (1.0f - fabsf(c_))*r_;
-}
-
-__device__ float desingularize(float x_, float eps_) {
-    return copysign(1.0f, x_)*fmaxf(fabsf(x_), fminf(x_*x_/(2.0f*eps_)+0.5f*eps_, eps_));
-}
-
-// Compute h in the "star region", h^dagger
-__device__ __inline__ float computeHStar(float h_l, float h_r, float u_l, float u_r, float c_l, float c_r, float g_) {
-    
-    //This estimate for the h* gives rise to spurious oscillations. 
-    //return 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
-    
-    const float h_tmp = 0.5f * (c_l + c_r) + 0.25f * (u_l - u_r);
-    return h_tmp*h_tmp / g_;
-}
-
-/**
-  * Weighted average flux (Toro 2001, p 200) for interface {i+1/2}
-  * @param r_ The flux limiter parameter (see Toro 2001, p. 203)
-  * @param Q_l2 Q_{i-1}
-  * @param Q_l1 Q_{i}
-  * @param Q_r1 Q_{i+1}
-  * @param Q_r2 Q_{i+2}
-  */
-__device__ float3 WAF_1D_flux(const float3 Q_l2, const float3 Q_l1, const float3 Q_r1, const float3 Q_r2, const float g_, const float dx_, const float dt_) {     
-    const float h_l = Q_l1.x;
-    const float h_r = Q_r1.x;
-    
-    const float h_l2 = Q_l2.x;
-    const float h_r2 = Q_r2.x;
-    
-    // Calculate velocities
-    const float u_l = Q_l1.y / h_l;
-    const float u_r = Q_r1.y / h_r;
-    
-    const float u_l2 = Q_l2.y / h_l2;
-    const float u_r2 = Q_r2.y / h_r2;
-    
-    const float v_l = Q_l1.z / h_l;
-    const float v_r = Q_r1.z / h_r;
-    
-    const float v_l2 = Q_l2.z / h_l2;
-    const float v_r2 = Q_r2.z / h_r2;
-    
-    // Estimate the potential wave speeds
-    const float c_l = sqrt(g_*h_l);
-    const float c_r = sqrt(g_*h_r);
-    
-    const float c_l2 = sqrt(g_*h_l2);
-    const float c_r2 = sqrt(g_*h_r2);
-    
-    // Compute h in the "star region", h^dagger
-    const float h_dag_l = computeHStar(h_l2,  h_l, u_l2,  u_l, c_l2,  c_l, g_);
-    const float h_dag   = computeHStar( h_l,  h_r,  u_l,  u_r,  c_l,  c_r, g_);
-    const float h_dag_r = computeHStar( h_r, h_r2,  u_r, u_r2,  c_r, c_r2, g_);
-    
-    const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag ) ) / h_l;
-    const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag ) ) / h_r;
-    
-    const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
-    const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
-    
-    // Compute wave speed estimates
-    const float S_l = u_l - c_l*q_l; 
-    const float S_r = u_r + c_r*q_r;
-    const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) );
-    
-    const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1.0, S_star, v_l);
-    const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1.0, S_star, v_r);
-    
-    // Estimate the fluxes in the four regions
-    const float3 F_1 = F_func(Q_l1, g_);
-    const float3 F_4 = F_func(Q_r1, g_);
-    
-    const float3 F_2 = F_1 + S_l*(Q_star_l - Q_l1);
-    const float3 F_3 = F_4 + S_r*(Q_star_r - Q_r1);
-    //const float3 F_2 = F_func(Q_star_l, g_);
-    //const float3 F_3 = F_func(Q_star_r, g_);
-    
-    // Compute the courant numbers for the waves
-    const float c_1 = S_l * dt_ / dx_;
-    const float c_2 = S_star * dt_ / dx_;
-    const float c_3 = S_r * dt_ / dx_;
-    
-    // Compute the "upwind change" vectors for the i-3/2 and i+3/2 interfaces
-    const float eps = 1.0e-6f;
-    const float r_1 = desingularize( (c_1 > 0.0f) ? (h_dag_l - h_l2) : (h_dag_r - h_r), eps) / desingularize((h_dag - h_l), eps);
-    const float r_2 = desingularize( (c_2 > 0.0f) ? (v_l - v_l2) : (v_r2 - v_r), eps ) / desingularize((v_r - v_l), eps);
-    const float r_3 = desingularize( (c_3 > 0.0f) ? (h_l - h_dag_l) : (h_r2 - h_dag_r), eps ) / desingularize((h_r - h_dag), eps);
-        
-    // Compute the limiter
-    // We use h for the nonlinear waves, and v for the middle shear wave 
-    const float A_1 = copysign(1.0f, c_1) * limiterToWAFLimiter(generalized_minmod(r_1, 1.9f), c_1);
-    const float A_2 = copysign(1.0f, c_2) * limiterToWAFLimiter(generalized_minmod(r_2, 1.9f), c_2); 
-    const float A_3 = copysign(1.0f, c_3) * limiterToWAFLimiter(generalized_minmod(r_3, 1.9f), c_3);
-    
-    //Average the fluxes
-    const float3 flux = 0.5f*( F_1 + F_4 )
-                      - 0.5f*( A_1 * (F_2 - F_1)
-                             + A_2 * (F_3 - F_2)
-                             + A_3 * (F_4 - F_3) );
-
-    return flux;
-}