diff --git a/GPUSimulators/Autotuner.py b/GPUSimulators/Autotuner.py index 44cdb00..a77447b 100644 --- a/GPUSimulators/Autotuner.py +++ b/GPUSimulators/Autotuner.py @@ -34,8 +34,8 @@ from GPUSimulators import Common, Simulator class Autotuner: def __init__(self, nx=2048, ny=2048, - block_widths=range(8, 32, 2), - block_heights=range(8, 32, 2)): + block_widths=range(8, 32, 1), + block_heights=range(8, 32, 1)): logger = logging.getLogger(__name__) self.filename = "autotuning_data_" + gethostname() + ".npz" self.nx = nx @@ -239,19 +239,21 @@ class Autotuner: h = np.zeros((ny, nx), dtype=np.float32); hu = np.zeros((ny, nx), dtype=np.float32); hv = np.zeros((ny, nx), dtype=np.float32); - - x = dx*(np.arange(0, nx, dtype=np.float32)+0.5) - x_center - y = dy*(np.arange(0, ny, dtype=np.float32)+0.5) - y_center + + extent = 1.0/np.sqrt(2.0) + x = (dx*(np.arange(0, nx, dtype=np.float32)+0.5) - x_center) / size + y = (dy*(np.arange(0, ny, dtype=np.float32)+0.5) - y_center) / size xv, yv = np.meshgrid(x, y, sparse=False, indexing='xy') - r = np.sqrt(xv**2 + yv**2) + r = np.minimum(1.0, np.sqrt(xv**2 + yv**2)) xv = None yv = None gc.collect() - + #Generate highres - h = 0.5 + 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size) - hu = 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size) - hv = 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size) + cos = np.cos(np.pi*r) + h = 0.5 + 0.1*0.5*(1.0 + cos) + hu = 0.1*0.5*(1.0 + cos) + hv = hu.copy() scale = 0.7 max_h_estimate = 0.6 diff --git a/GPUSimulators/Common.py b/GPUSimulators/Common.py index 84bdb7a..17b20a4 100644 --- a/GPUSimulators/Common.py +++ b/GPUSimulators/Common.py @@ -289,8 +289,9 @@ class CudaContext(object): + """ -Class that holds data +Class that holds 2D data """ class CudaArray2D: """ @@ -307,42 +308,41 @@ class CudaArray2D: ny_halo = ny + 2*y_halo #self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny) + #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here + self.data = pycuda.gpuarray.empty((ny_halo, nx_halo), dtype) - #Make sure data is in proper format - if cpu_data is not None: - assert cpu_data.itemsize == 4, "Wrong size of data type" - assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)" - - #Upload data to the device + #If we don't have any data, just allocate and return if cpu_data is None: - self.data = pycuda.gpuarray.empty((ny_halo, nx_halo), dtype) - elif (cpu_data.shape == (ny_halo, nx_halo)): - self.data = pycuda.gpuarray.to_gpu_async(cpu_data, stream=stream) - elif (cpu_data.shape == (self.ny, self.nx)): - #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here - self.data = pycuda.gpuarray.empty((ny_halo, nx_halo), dtype) - #self.data.fill(0.0) + return - #Create copy object from host to device - copy = cuda.Memcpy2D() - copy.set_src_host(cpu_data) - copy.set_dst_device(self.data.gpudata) + #Make sure data is in proper format + assert cpu_data.shape == (ny_halo, nx_halo) or cpu_data.shape == (self.ny, self.nx), "Wrong shape of data %s vs %s / %s" % (str(cpu_data.shape), str((self.ny, self.nx)), str((ny_halo, nx_halo))) + assert cpu_data.itemsize == 4, "Wrong size of data type" + assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)" + + #Create copy object from host to device + copy = cuda.Memcpy2D() + copy.set_src_host(cpu_data) + copy.set_dst_device(self.data.gpudata) - #Set offsets and pitch of destination - copy.dst_x_in_bytes = self.x_halo*self.data.strides[1] - copy.dst_y = self.y_halo - copy.dst_pitch = self.data.strides[0] - - #Set width in bytes to copy 
for each row and
-            #number of rows to copy
-            copy.width_in_bytes = self.nx*cpu_data.itemsize
-            copy.height = self.ny
-            
-            #Perform the copy
-            copy(stream)
-            stream.synchronize()
-        else:
-            assert False, "Wrong data shape: %s vs %s / %s" % (str(cpu_data.shape), str((self.ny, self.nx)), str((ny_halo, nx_halo)))
+        #Set offsets of upload in destination
+        x_offset = (nx_halo - cpu_data.shape[1]) // 2
+        y_offset = (ny_halo - cpu_data.shape[0]) // 2
+        copy.dst_x_in_bytes = x_offset*self.data.strides[1]
+        copy.dst_y = y_offset
+        
+        #Set destination pitch
+        copy.dst_pitch = self.data.strides[0]
+        
+        #Set width in bytes to copy for each row and
+        #number of rows to copy
+        width = max(self.nx, cpu_data.shape[1])
+        height = max(self.ny, cpu_data.shape[0])
+        copy.width_in_bytes = width*cpu_data.itemsize
+        copy.height = height
+        
+        #Perform the copy
+        copy(stream)
 
         #self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
@@ -389,6 +389,114 @@ class CudaArray2D:
+"""
+Class that holds 3D data
+"""
+class CudaArray3D:
+    """
+    Uploads initial data to the CUDA device
+    """
+    def __init__(self, stream, nx, ny, nz, x_halo, y_halo, z_halo, cpu_data=None, dtype=np.float32):
+        self.logger = logging.getLogger(__name__)
+        self.nx = nx
+        self.ny = ny
+        self.nz = nz
+        self.x_halo = x_halo
+        self.y_halo = y_halo
+        self.z_halo = z_halo
+        
+        nx_halo = nx + 2*x_halo
+        ny_halo = ny + 2*y_halo
+        nz_halo = nz + 2*z_halo
+        
+        #self.logger.debug("Allocating [%dx%dx%d] buffer", self.nx, self.ny, self.nz)
+        #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
+        self.data = pycuda.gpuarray.empty((nz_halo, ny_halo, nx_halo), dtype)
+        
+        #If we don't have any data, just allocate and return
+        if cpu_data is None:
+            return
+        
+        #Make sure data is in proper format
+        assert cpu_data.shape == (nz_halo, ny_halo, nx_halo) or cpu_data.shape == (self.nz, self.ny, self.nx), "Wrong shape of data %s vs %s / %s" % (str(cpu_data.shape), str((self.nz, self.ny, self.nx)), str((nz_halo, ny_halo, nx_halo)))
+        assert cpu_data.itemsize == 4, "Wrong size of data type"
+        assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"
+        
+        #Create copy object from host to device
+        copy = cuda.Memcpy3D()
+        copy.set_src_host(cpu_data)
+        copy.set_dst_device(self.data.gpudata)
+        
+        #Pitch and slice height of the (contiguous) source data
+        copy.src_pitch = cpu_data.strides[1]
+        copy.src_height = cpu_data.shape[1]
+        
+        #Set offsets of destination
+        x_offset = (nx_halo - cpu_data.shape[2]) // 2
+        y_offset = (ny_halo - cpu_data.shape[1]) // 2
+        z_offset = (nz_halo - cpu_data.shape[0]) // 2
+        copy.dst_x_in_bytes = x_offset*self.data.strides[2]
+        copy.dst_y = y_offset
+        copy.dst_z = z_offset
+        
+        #Set pitch and slice height of destination
+        copy.dst_pitch = self.data.strides[1]
+        copy.dst_height = ny_halo
+        
+        #Set width in bytes to copy for each row and
+        #number of rows and slices to copy
+        width = max(self.nx, cpu_data.shape[2])
+        height = max(self.ny, cpu_data.shape[1])
+        depth = max(self.nz, cpu_data.shape[0])
+        copy.width_in_bytes = width*cpu_data.itemsize
+        copy.height = height
+        copy.depth = depth
+        
+        #Perform the copy
+        copy(stream)
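+        
+        #Note: like CudaArray2D above, the upload is centred in the allocated buffer:
+        #interior-sized data (nz, ny, nx) is placed inside the halo region, while
+        #full-sized data (nz_halo, ny_halo, nx_halo) fills the entire buffer.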
+        
+        #self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
+    
+    
+    def __del__(self, *args):
+        #self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
+        self.data.gpudata.free()
+        self.data = None
+    
+    """
+    Enables downloading data from GPU to Python
+    """
+    def download(self, stream, async=False):
+        #self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
+        #Allocate host memory
+        #cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)
+        cpu_data = np.empty((self.nz, self.ny, self.nx), dtype=np.float32)
+        
+        #Create copy object from device to host
+        copy = cuda.Memcpy3D()
+        copy.set_src_device(self.data.gpudata)
+        copy.set_dst_host(cpu_data)
+        
+        #Set offsets, pitch and slice height of source
+        copy.src_x_in_bytes = self.x_halo*self.data.strides[2]
+        copy.src_y = self.y_halo
+        copy.src_z = self.z_halo
+        copy.src_pitch = self.data.strides[1]
+        copy.src_height = self.ny + 2*self.y_halo
+        
+        #Pitch and slice height of the (contiguous) destination
+        copy.dst_pitch = cpu_data.strides[1]
+        copy.dst_height = self.ny
+        
+        #Set width in bytes to copy for each row and
+        #number of rows and slices to copy
+        copy.width_in_bytes = self.nx*cpu_data.itemsize
+        copy.height = self.ny
+        copy.depth = self.nz
+        
+        copy(stream)
+        if async==False:
+            stream.synchronize()
+        
+        return cpu_data
+
+
+
+
+
+
+
 """
 A class representing an Arakawa A type (unstaggered, logically Cartesian) grid
diff --git a/GPUSimulators/EulerCommon.cu b/GPUSimulators/EulerCommon.cu
new file mode 100644
index 0000000..eaaffe9
--- /dev/null
+++ b/GPUSimulators/EulerCommon.cu
@@ -0,0 +1,58 @@
+/*
+These CUDA kernels implement common functions for the Euler equations:
+the equation of state (pressure) and the flux function along the x axis.
+
+Copyright (C) 2016 SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see .
+*/
+
+
+
+
+
+
+
+__device__ float pressure(float4 Q, float gamma) {
+    const float rho   = Q.x;
+    const float rho_u = Q.y;
+    const float rho_v = Q.z;
+    const float E     = Q.w;
+
+    return (gamma-1.0f)*(E-0.5f*(rho_u*rho_u + rho_v*rho_v)/rho);
+}
+
+
+__device__ float4 F_func(const float4 Q, float gamma) {
+    const float rho   = Q.x;
+    const float rho_u = Q.y;
+    const float rho_v = Q.z;
+    const float E     = Q.w;
+
+    const float u = rho_u/rho;
+    const float P = pressure(Q, gamma);
+
+    float4 F;
+
+    F.x = rho_u;
+    F.y = rho_u*u + P;
+    F.z = rho_v*u;
+    F.w = u*(E+P);
+
+    return F;
+}
\ No newline at end of file
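For orientation, the corresponding flux along the y axis follows the same pattern as F_func above. A minimal sketch (not part of this patch; it only mirrors the x-direction flux and reuses pressure() from EulerCommon.cu) could look like:

__device__ float4 G_func(const float4 Q, float gamma) {
    const float rho   = Q.x;
    const float rho_u = Q.y;
    const float rho_v = Q.z;
    const float E     = Q.w;

    const float v = rho_v/rho;                //velocity along y
    const float P = pressure(Q, gamma);       //ideal-gas pressure

    float4 G;

    G.x = rho_v;
    G.y = rho_u*v;
    G.z = rho_v*v + P;
    G.w = v*(E+P);

    return G;
}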
diff --git a/GPUSimulators/FORCE_kernel.cu b/GPUSimulators/FORCE_kernel.cu
index c43fd38..419c665 100644
--- a/GPUSimulators/FORCE_kernel.cu
+++ b/GPUSimulators/FORCE_kernel.cu
@@ -20,6 +20,7 @@ along with this program. If not, see .
 #include "common.cu"
+#include "SWECommon.cu"
 #include "fluxes/FirstOrderCentered.cu"
@@ -32,8 +33,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const float g_, const float dx_, const float dt_) {
     //Index of thread within block
-    const int tx = get_local_id(0);
-    const int ty = get_local_id(1);
+    const int tx = threadIdx.x;
+    const int ty = threadIdx.y;
     //Compute fluxes along the x axis
     {
@@ -68,8 +69,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1], const float g_, const float dy_, const float dt_) {
     //Index of thread within block
-    const int tx = get_local_id(0);
-    const int ty = get_local_id(1);
+    const int tx = threadIdx.x;
+    const int ty = threadIdx.y;
     //Compute fluxes along the y axis
     for (int j=ty; j.
+"""
+
+#Import packages we need
+import numpy as np
+from GPUSimulators import Simulator, Common
+
+
+
+
+
+
+
+
+"""
+Class that solves the Euler equations using the Harten-Lax-van Leer (2nd order) scheme
+"""
+class HLL2Euler (Simulator.BaseSimulator):
+
+    """
+    Initialization routine
+    rho: Density
+    rho_u: Momentum along x-axis
+    rho_v: Momentum along y-axis
+    E: Energy
+    nx: Number of cells along x-axis
+    ny: Number of cells along y-axis
+    dx: Grid cell spacing along x-axis
+    dy: Grid cell spacing along y-axis
+    dt: Size of each timestep
+    g: Gravitational acceleration
+    """
+    def __init__(self, \
+                 context, \
+                 rho, rho_u, rho_v, E, \
+                 nx, ny, \
+                 dx, dy, dt, \
+                 g, \
+                 theta=1.8, \
+                 block_width=16, block_height=16):
+
+        # Call super constructor
+        super().__init__(context, \
+                         nx, ny, \
+                         dx, dy, dt, \
+                         g, \
+                         block_width, block_height);
+
+        self.theta = np.float32(theta)
+
+        #Get kernels
+        self.kernel = context.get_prepared_kernel("HLL2_kernel.cu", "HLL2Kernel", \
+                                                  "iifffffiPiPiPiPiPiPiPiPi", \
+                                                  BLOCK_WIDTH=self.local_size[0], \
+                                                  BLOCK_HEIGHT=self.local_size[1])
+
+        #Create data by uploading to device
+        self.u0 = Common.ArakawaA2D(self.stream, \
+                                    nx, ny, \
+                                    2, 2, \
+                                    [rho, rho_u, rho_v, E])
+        self.u1 = Common.ArakawaA2D(self.stream, \
+                                    nx, ny, \
+                                    2, 2, \
+                                    [None, None, None, None])
+
+    def __str__(self):
+        return "Harten-Lax-van Leer (2nd order)"
+
+    def simulate(self, t_end):
+        return super().simulateDimsplit(t_end)
+
+    def stepEuler(self, dt):
+        return self.stepDimsplitXY(dt)
+
+    def stepDimsplitXY(self, dt):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                self.nx, self.ny, \
+                self.dx, self.dy, dt, \
+                self.g, \
+                self.theta, \
+                np.int32(0), \
+                self.u0[0].data.gpudata, self.u0[0].data.strides[0], \
+                self.u0[1].data.gpudata, self.u0[1].data.strides[0], \
+                self.u0[2].data.gpudata, self.u0[2].data.strides[0], \
+                self.u0[3].data.gpudata, self.u0[3].data.strides[0], \
+                self.u1[0].data.gpudata, self.u1[0].data.strides[0], \
+                self.u1[1].data.gpudata, self.u1[1].data.strides[0], \
+                self.u1[2].data.gpudata, self.u1[2].data.strides[0], \
+                self.u1[3].data.gpudata, self.u1[3].data.strides[0])
+        self.u0, self.u1 = self.u1, self.u0
+        self.t += dt
+
+    def stepDimsplitYX(self, dt):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                self.nx, self.ny, \
+                self.dx, self.dy, dt, \
+                self.g, \
+                self.theta, \
+                np.int32(1), \
+                self.u0[0].data.gpudata, self.u0[0].data.strides[0], \
+                self.u0[1].data.gpudata, self.u0[1].data.strides[0], \
+                self.u0[2].data.gpudata, self.u0[2].data.strides[0], \
+                self.u0[3].data.gpudata, self.u0[3].data.strides[0], \
+                self.u1[0].data.gpudata, self.u1[0].data.strides[0], \
+                self.u1[1].data.gpudata, self.u1[1].data.strides[0], \
+                self.u1[2].data.gpudata, self.u1[2].data.strides[0], \
+                self.u1[3].data.gpudata, self.u1[3].data.strides[0])
+        self.u0, self.u1 = self.u1, self.u0
+        self.t += dt
+
+    def download(self):
+        return self.u0.download(self.stream)
\ No newline at end of file
diff --git a/GPUSimulators/HLL2_kernel.cu b/GPUSimulators/HLL2_kernel.cu
index 168627f..bdaf0f8 100644
--- a/GPUSimulators/HLL2_kernel.cu
+++ b/GPUSimulators/HLL2_kernel.cu
@@ -19,6 +19,7 @@ along with this program. If not, see .
#include "common.cu" +#include "SWECommon.cu" #include "limiters.cu" #include "fluxes/HartenLaxVanLeer.cu" @@ -37,8 +38,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1], const float g_, const float dx_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; { const int j=ty; @@ -89,8 +90,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1], const float g_, const float dy_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; for (int j=ty; j. #include "common.cu" +#include "SWECommon.cu" #include "fluxes/HartenLaxVanLeer.cu" @@ -34,8 +35,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1], const float g_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; { const int j=ty; @@ -68,8 +69,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1], const float g_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; for (int j=ty; j. #include "common.cu" +#include "SWECommon.cu" #include "limiters.cu" #include "fluxes/CentralUpwind.cu" @@ -35,8 +36,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1], const float g_, const float dx_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; { int j=ty; @@ -80,8 +81,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1], const float g_, const float dy_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; for (int j=ty; j. #include "common.cu" +#include "SWECommon.cu" #include "limiters.cu" #include "fluxes/CentralUpwind.cu" @@ -35,8 +36,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1], const float g_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; { int j=ty; @@ -66,8 +67,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1], const float g_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; for (int j=ty; j. 
*/ -#include "common.cu" +#include "Common.cu" +#include "SWECommon.cu" #include "fluxes/LaxFriedrichs.cu" @@ -32,8 +33,8 @@ void computeFluxF(float Q[3][block_height+2][block_width+2], float F[3][block_height][block_width+1], const float g_, const float dx_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; { const int j=ty; @@ -68,8 +69,8 @@ void computeFluxG(float Q[3][block_height+2][block_width+2], float G[3][block_height+1][block_width], const float g_, const float dy_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; for (int j=ty; j 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; const int i = tx + 1; //Skip local ghost cells, i.e., +1 const int j = ty + 1; diff --git a/GPUSimulators/SWECommon.cu b/GPUSimulators/SWECommon.cu new file mode 100644 index 0000000..389a062 --- /dev/null +++ b/GPUSimulators/SWECommon.cu @@ -0,0 +1,66 @@ +/* +This OpenCL kernel implements the Kurganov-Petrova numerical scheme +for the shallow water equations, described in +A. Kurganov & Guergana Petrova +A Second-Order Well-Balanced Positivity Preserving Central-Upwind +Scheme for the Saint-Venant System Communications in Mathematical +Sciences, 5 (2007), 133-160. + +Copyright (C) 2016 SINTEF ICT + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +*/ + + + + + + + + + +__device__ float3 F_func(const float3 Q, const float g) { + float3 F; + + F.x = Q.y; //hu + F.y = Q.y*Q.y / Q.x + 0.5f*g*Q.x*Q.x; //hu*hu/h + 0.5f*g*h*h; + F.z = Q.y*Q.z / Q.x; //hu*hv/h; + + return F; +} + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/GPUSimulators/WAF_kernel.cu b/GPUSimulators/WAF_kernel.cu index f707aa8..3c971ae 100644 --- a/GPUSimulators/WAF_kernel.cu +++ b/GPUSimulators/WAF_kernel.cu @@ -25,6 +25,7 @@ along with this program. If not, see . #include "common.cu" +#include "SWECommon.cu" #include "fluxes/WeightedAverageFlux.cu" @@ -37,8 +38,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1], const float g_, const float dx_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; { int j=ty; @@ -76,8 +77,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1], const float g_, const float dy_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; //Compute fluxes along the y axis for (int j=ty; j. 
*/ -/** - * Location of thread in block - */ -inline __device__ int get_local_id(int dim) { - switch(dim) { - case 0: return threadIdx.x; - case 1: return threadIdx.y; - case 2: return threadIdx.z; - default: return -1; - } -} - - -/** - * Get block index - */ -__device__ int get_group_id(int dim) { - switch(dim) { - case 0: return blockIdx.x; - case 1: return blockIdx.y; - case 2: return blockIdx.z; - default: return -1; - } -} - -/** - * Location of thread in global domain - */ -__device__ int get_global_id(int dim) { - switch(dim) { - case 0: return blockDim.x*blockIdx.x + threadIdx.x; - case 1: return blockDim.y*blockIdx.y + threadIdx.y; - case 2: return blockDim.z*blockIdx.z + threadIdx.z; - default: return -1; - } -} - /** * Float3 operators @@ -97,12 +60,12 @@ __device__ void readBlock1(float* h_ptr_, int h_pitch_, float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const int nx_, const int ny_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; //Index of block within domain - const int bx = BLOCK_WIDTH * get_group_id(0); - const int by = BLOCK_HEIGHT * get_group_id(1); + const int bx = BLOCK_WIDTH * blockIdx.x; + const int by = BLOCK_HEIGHT * blockIdx.y; //Read into shared memory for (int j=ty; j 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) { @@ -209,12 +172,12 @@ __device__ void writeBlock2(float* h_ptr_, int h_pitch_, float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const int nx_, const int ny_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; //Index of cell within domain - const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2 - const int tj = get_global_id(1) + 2; + const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2 + const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2; //Only write internal cells if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) { @@ -242,12 +205,12 @@ __device__ void writeBlock2(float* h_ptr_, int h_pitch_, */ __device__ void noFlowBoundary1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const int nx_, const int ny_) { //Global index - const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1 - const int tj = get_global_id(1) + 1; + const int ti = blockDim.x*blockIdx.x + threadIdx.x + 1; //Skip global ghost cells, i.e., +1 + const int tj = blockDim.y*blockIdx.y + threadIdx.y + 1; //Block-local indices - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; const int i = tx + 1; //Skip local ghost cells, i.e., +1 const int j = ty + 1; @@ -286,12 +249,12 @@ __device__ void noFlowBoundary1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const */ __device__ void noFlowBoundary2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const int nx_, const int ny_) { //Global index - const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2 - const int tj = get_global_id(1) + 2; + const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2 + const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2; //Block-local indices - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; const int i = tx + 2; //Skip local ghost cells, i.e., +2 const int j = ty + 2; @@ -347,12 +310,12 @@ __device__ void evolveF1(float 
Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const int nx_, const int ny_, const float dx_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; //Index of cell within domain - const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1 - const int tj = get_global_id(1) + 1; + const int ti = blockDim.x*blockIdx.x + threadIdx.x + 1; //Skip global ghost cells, i.e., +1 + const int tj = blockDim.y*blockIdx.y + threadIdx.y + 1; if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) { const int i = tx + 1; //Skip local ghost cells, i.e., +1 @@ -377,12 +340,12 @@ __device__ void evolveF2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const int nx_, const int ny_, const float dx_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; //Index of cell within domain - const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2 - const int tj = get_global_id(1) + 2; + const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2 + const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2; if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) { const int i = tx + 2; //Skip local ghost cells, i.e., +1 @@ -407,12 +370,12 @@ __device__ void evolveG1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const int nx_, const int ny_, const float dy_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; //Index of cell within domain - const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1 - const int tj = get_global_id(1) + 1; + const int ti = blockDim.x*blockIdx.x + threadIdx.x + 1; //Skip global ghost cells, i.e., +1 + const int tj = blockDim.y*blockIdx.y + threadIdx.y + 1; if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) { const int i = tx + 1; //Skip local ghost cells, i.e., +1 @@ -438,12 +401,12 @@ __device__ void evolveG2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const int nx_, const int ny_, const float dy_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; //Index of cell within domain - const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2 - const int tj = get_global_id(1) + 2; + const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2 + const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2; if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) { const int i = tx + 2; //Skip local ghost cells, i.e., +2 @@ -464,23 +427,6 @@ __device__ void evolveG2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], -__device__ float3 F_func(const float3 Q, const float g) { - float3 F; - - F.x = Q.y; //hu - F.y = Q.y*Q.y / Q.x + 0.5f*g*Q.x*Q.x; //hu*hu/h + 0.5f*g*h*h; - F.z = Q.y*Q.z / Q.x; //hu*hv/h; - - return F; -} - - - - - - - - diff --git a/GPUSimulators/limiters.cu b/GPUSimulators/limiters.cu index 39ec2d9..e34d21f 100644 --- a/GPUSimulators/limiters.cu +++ b/GPUSimulators/limiters.cu @@ -50,8 +50,8 @@ __device__ void minmodSlopeX(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const float theta_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); 
+ const int tx = threadIdx.x; + const int ty = threadIdx.y; //Reconstruct slopes along x axis { @@ -74,8 +74,8 @@ __device__ void minmodSlopeY(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const float theta_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; for (int j=ty; j
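As a quick orientation for the new Euler solver introduced in this patch, a minimal host-side usage sketch could look like the following. Assumptions: the class lives in GPUSimulators/HLL2Euler.py, Common.CudaContext() can be constructed with default arguments, g is used as the adiabatic index gamma of the gas, and the constant initial state is illustrative only.

import numpy as np
from GPUSimulators import Common, HLL2Euler

nx, ny = 128, 128
dx, dy = 1.0/nx, 1.0/ny

#Constant initial state (placeholder values)
rho   = np.ones((ny, nx), dtype=np.float32)
rho_u = np.zeros((ny, nx), dtype=np.float32)
rho_v = np.zeros((ny, nx), dtype=np.float32)
E     = np.ones((ny, nx), dtype=np.float32)

#Create a CUDA context and the simulator, then advance and download the solution
context = Common.CudaContext()
sim = HLL2Euler.HLL2Euler(context, rho, rho_u, rho_v, E,
                          nx, ny, dx, dy, dt=1e-4, g=1.4)
sim.simulate(0.01)
rho, rho_u, rho_v, E = sim.download()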