Added initial version of SW code

2025-12-23 12:58:43 +01:00 · 2018-06-14 10:35:01 +02:00
parent 3767b121df
commit c5dc865c48
45 changed files with 13769 additions and 0 deletions
--- a/SWESimulators/CDKLM16.py
+++ b/SWESimulators/CDKLM16.py
@@ -0,0 +1,203 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements 
+Alina Chertock, Michael Dudzinski, A. Kurganov & Maria Lukacova-Medvidova (2016)
+Well-Balanced Schemes for the Shallow Water Equations with Coriolis Forces
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+
+#Import packages we need
+import numpy as np
+import pyopencl as cl #OpenCL in Python
+import Common
+
+
+
+
+
+        
+        
+        
+
+
+"""
+Class that solves the SW equations using the CDKLM16 scheme (Chertock et al., 2016)
+"""
+class CDKLM16:
+
+    """
+    Initialization routine
+    cl_ctx: OpenCL context used to create the command queue, kernel and device data
+    h0: Water depth incl ghost cells
+    hu0, hv0: Initial momentum along x-axis / y-axis incl ghost cells
+    nx, ny: Number of cells along x-axis / y-axis (3 ghost cells are requested in each direction)
+    dx, dy: Grid cell spacing along x-axis / y-axis (20 000 m)
+    dt: Size of each timestep (90 s)
+    g: Gravitational acceleration (9.81 m/s^2)
+    f: Coriolis parameter (1.2e-4 s^-1)
+    r: Bottom friction coefficient (2.4e-3 m/s)
+    theta: Slope limiter parameter forwarded to the kernel; use_rk2: run two kernel stages per step
+    wind_stress: Wind stress parameters; block_width, block_height: OpenCL work-group size
+    """
+    def __init__(self, \
+                 cl_ctx, \
+                 h0, hu0, hv0, \
+                 nx, ny, \
+                 dx, dy, dt, \
+                 g, f, r, \
+                 theta=1.3, use_rk2=True,
+                 wind_stress=Common.WindStressParams(), \
+                 block_width=16, block_height=16):
+        self.cl_ctx = cl_ctx
+
+        #Create an OpenCL command queue
+        self.cl_queue = cl.CommandQueue(self.cl_ctx)
+
+        #Get kernels
+        self.kernel = Common.get_kernel(self.cl_ctx, "CDKLM16_kernel.opencl", block_width, block_height)
+
+        #Create data by uploading to device
+        ghost_cells_x = 3
+        ghost_cells_y = 3
+        self.cl_data = Common.SWEDataArkawaA(self.cl_ctx, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
+
+        #Save input parameters
+        #Notice that we need to specify them in the correct dataformat for the
+        #OpenCL kernel
+        self.nx = np.int32(nx)
+        self.ny = np.int32(ny)
+        self.dx = np.float32(dx)
+        self.dy = np.float32(dy)
+        self.dt = np.float32(dt)
+        self.g = np.float32(g)
+        self.f = np.float32(f)
+        self.r = np.float32(r)
+        self.theta = np.float32(theta)
+        self.use_rk2 = use_rk2
+        self.wind_stress = wind_stress
+
+        #Initialize time
+        self.t = np.float32(0.0)
+
+        #Compute kernel launch parameters
+        self.local_size = (block_width, block_height)
+        self.global_size = ( \
+                       int(np.ceil(self.nx / float(self.local_size[0])) * self.local_size[0]), \
+                       int(np.ceil(self.ny / float(self.local_size[1])) * self.local_size[1]) \
+                      )
+
+
+
+
+    """
+    Function which advances the simulation by t_end (in steps of at most dt) and returns the new time
+    """
+    def step(self, t_end=0.0):
+        n = int(t_end / self.dt + 1)
+        #The "+ 1" leaves room for a final, shorter step when t_end is not a multiple of dt
+        for i in range(0, n):
+            local_dt = np.float32(min(self.dt, t_end-i*self.dt))
+
+            if (local_dt <= 0.0):
+                break
+            #With use_rk2: stage 0 reads the *0 buffers and writes *1, stage 1 reads *1 and writes back to *0
+            if (self.use_rk2):
+                self.kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+                        self.nx, self.ny, \
+                        self.dx, self.dy, local_dt, \
+                        self.g, \
+                        self.theta, \
+                        self.f, \
+                        self.r, \
+                        np.int32(0), \
+                        self.cl_data.h0.data, self.cl_data.h0.pitch, \
+                        self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                        self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                        self.cl_data.h1.data, self.cl_data.h1.pitch, \
+                        self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                        self.cl_data.hv1.data, self.cl_data.hv1.pitch, \
+                        self.wind_stress.type, \
+                        self.wind_stress.tau0, self.wind_stress.rho, self.wind_stress.alpha, self.wind_stress.xm, self.wind_stress.Rc, \
+                        self.wind_stress.x0, self.wind_stress.y0, \
+                        self.wind_stress.u0, self.wind_stress.v0, \
+                        self.t)
+                self.kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+                        self.nx, self.ny, \
+                        self.dx, self.dy, local_dt, \
+                        self.g, \
+                        self.theta, \
+                        self.f, \
+                        self.r, \
+                        np.int32(1), \
+                        self.cl_data.h1.data, self.cl_data.h1.pitch, \
+                        self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                        self.cl_data.hv1.data, self.cl_data.hv1.pitch, \
+                        self.cl_data.h0.data, self.cl_data.h0.pitch, \
+                        self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                        self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                        self.wind_stress.type, \
+                        self.wind_stress.tau0, self.wind_stress.rho, self.wind_stress.alpha, self.wind_stress.xm, self.wind_stress.Rc, \
+                        self.wind_stress.x0, self.wind_stress.y0, \
+                        self.wind_stress.u0, self.wind_stress.v0, \
+                        self.t)
+            else:
+                self.kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+                        self.nx, self.ny, \
+                        self.dx, self.dy, local_dt, \
+                        self.g, \
+                        self.theta, \
+                        self.f, \
+                        self.r, \
+                        np.int32(0), \
+                        self.cl_data.h0.data, self.cl_data.h0.pitch, \
+                        self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                        self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                        self.cl_data.h1.data, self.cl_data.h1.pitch, \
+                        self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                        self.cl_data.hv1.data, self.cl_data.hv1.pitch, \
+                        self.wind_stress.type, \
+                        self.wind_stress.tau0, self.wind_stress.rho, self.wind_stress.alpha, self.wind_stress.xm, self.wind_stress.Rc, \
+                        self.wind_stress.x0, self.wind_stress.y0, \
+                        self.wind_stress.u0, self.wind_stress.v0, \
+                        self.t)
+                self.cl_data.swap()
+            #Single-stage stepping wrote into the *1 buffers, hence the swap above
+            self.t += local_dt
+
+
+        return self.t
+
+    """
+    Reads a text file located next to this module and builds an OpenCL program from it
+    """
+    def get_kernel(self, kernel_filename):
+        #Local import: this module does not import os at file level
+        import os
+        module_path = os.path.dirname(os.path.realpath(__file__))
+        fullpath = os.path.join(module_path, kernel_filename)
+        with open(fullpath, "r") as kernel_file:
+            kernel_string = kernel_file.read()
+        #Build after the file has been closed; returns the built program
+        return cl.Program(self.cl_ctx, kernel_string).build()
+
+
+    #Delegates to cl_data.download using this simulator's command queue
+    def download(self):
+        return self.cl_data.download(self.cl_queue)
+
--- a/SWESimulators/CDKLM16_kernel.opencl
+++ b/SWESimulators/CDKLM16_kernel.opencl
@@ -0,0 +1,440 @@
+/*
+This OpenCL kernel implements the well-balanced scheme described in
+A. Chertock, M. Dudzinski, A. Kurganov & M. Lukacova-Medvidova (2016),
+Well-Balanced Schemes for the Shallow Water Equations with Coriolis Forces,
+using central-upwind fluxes in the style of A. Kurganov & Guergana Petrova,
+A Second-Order Well-Balanced Positivity Preserving Central-Upwind Scheme for
+the Saint-Venant System, Communications in Mathematical Sciences, 5 (2007), 133-160.
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include "common.opencl"
+
+
+
+float3 CDKLM16_F_func(const float3 Q, const float g) {
+    // Q = (h, u, v), i.e. depth and velocities; returns the flux (hu, hu*u + 0.5*g*h*h, hu*v)
+    const float hu = Q.x*Q.y;            // h*u, shared by all three components
+    float3 flux;
+    flux.x = hu;                         // h*u
+    flux.y = hu*Q.y + 0.5f*g*Q.x*Q.x;    // h*u*u + 0.5f*g*h*h
+    flux.z = hu*Q.z;                     // h*u*v
+    return flux;
+}
+
+
+
+
+
+
+
+/**
+  * Central-upwind flux across an interface with left state Qm and right state Qp.
+  * Note that the input vectors are (h, u, v), thus not the regular (h, hu, hv).
+  */
+float3 CDKLM16_flux(const float3 Qm, const float3 Qp, const float g) {
+    const float3 Fp = CDKLM16_F_func(Qp, g);
+    const float up = Qp.y;         // u
+    const float cp = sqrt(g*Qp.x); // sqrt(g*h)
+
+    const float3 Fm = CDKLM16_F_func(Qm, g);
+    const float um = Qm.y;         // u
+    const float cm = sqrt(g*Qm.x); // sqrt(g*h)
+
+    const float am = min(min(um-cm, up-cp), 0.0f); // largest negative wave speed
+    const float ap = max(max(um+cm, up+cp), 0.0f); // largest positive wave speed
+
+    // The dissipation term must use the jump in the conserved variables (h, hu).
+    // F_func's first component is exactly h*u, so (Fp.x - Fm.x) is the jump in hu.
+    float3 F;
+    F.x = ((ap*Fm.x - am*Fp.x) + ap*am*(Qp.x-Qm.x))/(ap-am);
+    F.y = ((ap*Fm.y - am*Fp.y) + ap*am*(Fp.x-Fm.x))/(ap-am);
+    F.z = (Qm.y + Qp.y > 0) ? Fm.z : Fp.z; //Upwinding to be consistent
+    return F;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+__kernel void swe_2D(
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+        float g_,
+
+        float theta_,
+
+        float f_, //< Coriolis coefficient
+        float r_, //< Bottom friction coefficient
+
+        int step_, //< 0: write the advanced state, 1: average it with the state already in the output
+
+        //Input (h, hu, hv), pitches are in bytes
+        __global float* h0_ptr_, int h0_pitch_,
+        __global float* hu0_ptr_, int hu0_pitch_,
+        __global float* hv0_ptr_, int hv0_pitch_,
+
+        //Output (h, hu, hv); also read when step_ == 1
+        __global float* h1_ptr_, int h1_pitch_,
+        __global float* hu1_ptr_, int hu1_pitch_,
+        __global float* hv1_ptr_, int hv1_pitch_,
+
+        //Wind stress parameters
+        int wind_stress_type_,
+        float tau0_, float rho_, float alpha_, float xm_, float Rc_,
+        float x0_, float y0_,
+        float u0_, float v0_,
+        float t_) {
+    // Each work-item updates one interior cell; the work-group stages its tile plus a 3-cell halo in local memory
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+
+    //Index of block within domain
+    const int bx = get_local_size(0) * get_group_id(0);
+    const int by = get_local_size(1) * get_group_id(1);
+
+    //Index of cell within domain
+    const int ti = get_global_id(0) + 3; //Skip global ghost cells, i.e., +3
+    const int tj = get_global_id(1) + 3;
+
+    // Our physical variables (h, hu, hv)
+    __local float R[3][block_height+6][block_width+6];
+
+    // Our reconstruction variables (u, v, K, L) and their slopes
+    __local float Q[4][block_height+4][block_width+4];
+    __local float Qx[4][block_height][block_width+2];
+    __local float Qy[4][block_height+2][block_width];
+
+    // Our fluxes
+    __local float F[3][block_height][block_width+1];
+    __local float G[3][block_height+1][block_width];
+
+
+
+
+
+    //Read into shared memory
+    for (int j=ty; j<block_height+6; j+=get_local_size(1)) {
+        const int l = clamp(by + j, 0, ny_+5); // Clamp rows that fall out of bounds
+
+        //Compute the pointer to current row in the arrays
+        __global float* const h_row = (__global float*) ((__global char*) h0_ptr_ + h0_pitch_*l);
+        __global float* const hu_row = (__global float*) ((__global char*) hu0_ptr_ + hu0_pitch_*l);
+        __global float* const hv_row = (__global float*) ((__global char*) hv0_ptr_ + hv0_pitch_*l);
+
+        for (int i=tx; i<block_width+6; i+=get_local_size(0)) {
+            const int k = clamp(bx + i, 0, nx_+5); // Clamp columns that fall out of bounds
+
+            R[0][j][i] = h_row[k];
+            R[1][j][i] = hu_row[k];
+            R[2][j][i] = hv_row[k];
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+
+
+
+
+
+
+
+    // Ghost cells mirror the interior: h is copied and the momentum normal to the boundary is negated
+    //Fix boundary conditions
+    {
+        const int i = tx + 3; //Skip local ghost cells, i.e., +3
+        const int j = ty + 3;
+
+        if (ti == 3) {
+            R[0][j][i-1] =  R[0][j][i];
+            R[1][j][i-1] = -R[1][j][i];
+            R[2][j][i-1] =  R[2][j][i];
+
+            R[0][j][i-2] =  R[0][j][i+1];
+            R[1][j][i-2] = -R[1][j][i+1];
+            R[2][j][i-2] =  R[2][j][i+1];
+
+            R[0][j][i-3] =  R[0][j][i+2];
+            R[1][j][i-3] = -R[1][j][i+2];
+            R[2][j][i-3] =  R[2][j][i+2];
+        }
+        if (ti == nx_+2) {
+            R[0][j][i+1] =  R[0][j][i];
+            R[1][j][i+1] = -R[1][j][i];
+            R[2][j][i+1] =  R[2][j][i];
+
+            R[0][j][i+2] =  R[0][j][i-1];
+            R[1][j][i+2] = -R[1][j][i-1];
+            R[2][j][i+2] =  R[2][j][i-1];
+
+            R[0][j][i+3] =  R[0][j][i-2];
+            R[1][j][i+3] = -R[1][j][i-2];
+            R[2][j][i+3] =  R[2][j][i-2];
+        }
+        if (tj == 3) {
+            R[0][j-1][i] =  R[0][j][i];
+            R[1][j-1][i] =  R[1][j][i];
+            R[2][j-1][i] = -R[2][j][i];
+
+            R[0][j-2][i] =  R[0][j+1][i];
+            R[1][j-2][i] =  R[1][j+1][i];
+            R[2][j-2][i] = -R[2][j+1][i];
+
+            R[0][j-3][i] =  R[0][j+2][i];
+            R[1][j-3][i] =  R[1][j+2][i];
+            R[2][j-3][i] = -R[2][j+2][i];
+        }
+        if (tj == ny_+2) {
+            R[0][j+1][i] =  R[0][j][i];
+            R[1][j+1][i] =  R[1][j][i];
+            R[2][j+1][i] = -R[2][j][i];
+
+            R[0][j+2][i] =  R[0][j-1][i];
+            R[1][j+2][i] =  R[1][j-1][i];
+            R[2][j+2][i] = -R[2][j-1][i];
+
+            R[0][j+3][i] =  R[0][j-2][i];
+            R[1][j+3][i] =  R[1][j-2][i];
+            R[2][j+3][i] = -R[2][j-2][i];
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+
+
+
+
+
+    //Create our "steady state" reconstruction variables (u, v, K, L)
+    for (int j=ty; j<block_height+4; j+=get_local_size(1)) {
+        const int l = j + 1; //Skip one "ghost cell row" of R, going from a 3-cell to a 2-cell halo
+        for (int i=tx; i<block_width+4; i+=get_local_size(0)) {
+            const int k = i + 1;
+
+            const float h = R[0][l][k];
+            const float u = R[1][l][k] / h;
+            const float v = R[2][l][k] / h;
+
+            const float B = 0.0f;
+            const float U = 0.25f * f_/g_ * (1.0f*R[1][l+1][k]/R[0][l+1][k] + 2.0f*u + 1.0f*R[1][l-1][k]/R[0][l-1][k]);
+            const float V = 0.25f * f_/g_ * (1.0f*R[2][l][k+1]/R[0][l][k+1] + 2.0f*v + 1.0f*R[2][l][k-1]/R[0][l][k-1]);
+            //const float U = f_/g_ * u;
+            //const float V = f_/g_ * v;
+            const float K = h + B - V;
+            const float L = h + B + U;
+
+            Q[0][j][i] = u;
+            Q[1][j][i] = v;
+            Q[2][j][i] = K;
+            Q[3][j][i] = L;
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+
+
+
+
+
+    //Reconstruct slopes along x axis
+    for (int j=ty; j<block_height; j+=get_local_size(1)) {
+        const int l = j + 2; //Skip ghost cells
+        for (int i=tx; i<block_width+2; i+=get_local_size(0)) {
+            const int k = i + 1;
+            for (int p=0; p<4; ++p) {
+                Qx[p][j][i] = minmodSlope(Q[p][l][k-1], Q[p][l][k], Q[p][l][k+1], theta_);
+            }
+        }
+    }
+
+    //Reconstruct slopes along y axis
+    for (int j=ty; j<block_height+2; j+=get_local_size(1)) {
+        const int l = j + 1;
+        for (int i=tx; i<block_width; i+=get_local_size(0)) {
+            const int k = i + 2; //Skip ghost cells
+            for (int p=0; p<4; ++p) {
+                Qy[p][j][i] = minmodSlope(Q[p][l-1][k], Q[p][l][k], Q[p][l+1][k], theta_);
+            }
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+
+
+
+
+
+
+    //Compute fluxes along the x axis
+    for (int j=ty; j<block_height; j+=get_local_size(1)) {
+        const int l = j + 2; //Skip ghost cells (be consistent with reconstruction offsets)
+        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
+            const int k = i + 1;
+
+            // R=(u, v, K, L) reconstructed at a cell interface from the right (p) and left (m)
+            const float4 Rp = (float4)(Q[0][l][k+1] - 0.5f*Qx[0][j][i+1],
+                                       Q[1][l][k+1] - 0.5f*Qx[1][j][i+1],
+                                       Q[2][l][k+1] - 0.5f*Qx[2][j][i+1],
+                                       Q[3][l][k+1] - 0.5f*Qx[3][j][i+1]);
+            const float4 Rm = (float4)(Q[0][l][k  ] + 0.5f*Qx[0][j][i  ],
+                                       Q[1][l][k  ] + 0.5f*Qx[1][j][i  ],
+                                       Q[2][l][k  ] + 0.5f*Qx[2][j][i  ],
+                                       Q[3][l][k  ] + 0.5f*Qx[3][j][i  ]);
+
+            // Variables to reconstruct h from u, v, K, L
+            const float vp = Q[1][l][k+1];
+            const float vm = Q[1][l][k  ];
+            const float V = 0.5f * f_/g_ * (vp + vm);
+            const float B = 0.0f;
+
+            // Reconstruct h = K + V - B (K was stored as h + B - V)
+            const float hp = Rp.z + V - B;
+            const float hm = Rm.z + V - B;
+
+            // Our flux variables Q=(h, u, v)
+            const float3 Qp = (float3)(hp, Rp.x, Rp.y);
+            const float3 Qm = (float3)(hm, Rm.x, Rm.y);
+
+            // Computed flux
+            const float3 flux = CDKLM16_flux(Qm, Qp, g_);
+            F[0][j][i] = flux.x;
+            F[1][j][i] = flux.y;
+            F[2][j][i] = flux.z;
+        }
+    }
+
+    //Compute fluxes along the y axis
+    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
+        const int l = j + 1;
+        for (int i=tx; i<block_width; i+=get_local_size(0)) {
+            const int k = i + 2; //Skip ghost cells
+            // R=(u, v, K, L) reconstructed at a cell interface from above (p) and below (m)
+            const float4 Rp = (float4)(Q[0][l+1][k] - 0.5f*Qy[0][j+1][i],
+                                       Q[1][l+1][k] - 0.5f*Qy[1][j+1][i],
+                                       Q[2][l+1][k] - 0.5f*Qy[2][j+1][i],
+                                       Q[3][l+1][k] - 0.5f*Qy[3][j+1][i]);
+            const float4 Rm = (float4)(Q[0][l  ][k] + 0.5f*Qy[0][j  ][i],
+                                       Q[1][l  ][k] + 0.5f*Qy[1][j  ][i],
+                                       Q[2][l  ][k] + 0.5f*Qy[2][j  ][i],
+                                       Q[3][l  ][k] + 0.5f*Qy[3][j  ][i]);
+
+            // Variables to reconstruct h from u, v, K, L
+            const float up = Q[0][l+1][k];
+            const float um = Q[0][l  ][k];
+            const float U = 0.5f * f_/g_ * (up + um);
+            const float B = 0.0f;
+
+            // Reconstruct h = L - U - B (L was stored as h + B + U)
+            const float hp = Rp.w - U - B;
+            const float hm = Rm.w - U - B;
+
+            // Our flux variables Q=(h, v, u)
+            // Note that we swap u and v
+            const float3 Qp = (float3)(hp, Rp.y, Rp.x);
+            const float3 Qm = (float3)(hm, Rm.y, Rm.x);
+
+            // Computed flux
+            // Note that we swap back u and v
+            const float3 flux = CDKLM16_flux(Qm, Qp, g_);
+            G[0][j][i] = flux.x;
+            G[1][j][i] = flux.z;
+            G[2][j][i] = flux.y;
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+
+
+
+
+    //Sum fluxes and advance in time for all internal cells
+    if (ti > 2 && ti < nx_+3 && tj > 2 && tj < ny_+3) {
+        const int i = tx + 3; //Skip local ghost cells, i.e., +3
+        const int j = ty + 3;
+
+        const float X = windStressX(
+            wind_stress_type_,
+            dx_, dy_, dt_,
+            tau0_, rho_, alpha_, xm_, Rc_,
+            x0_, y0_,
+            u0_, v0_,
+            t_);
+        const float Y = windStressY(
+            wind_stress_type_,
+            dx_, dy_, dt_,
+            tau0_, rho_, alpha_, xm_, Rc_,
+            x0_, y0_,
+            u0_, v0_,
+            t_);
+
+        const float h1  = R[0][j][i] + (F[0][ty][tx] - F[0][ty  ][tx+1]) * dt_ / dx_
+                                     + (G[0][ty][tx] - G[0][ty+1][tx  ]) * dt_ / dy_;
+        const float hu1 = R[1][j][i] + (F[1][ty][tx] - F[1][ty  ][tx+1]) * dt_ / dx_
+                                     + (G[1][ty][tx] - G[1][ty+1][tx  ]) * dt_ / dy_
+                                     + dt_*X - dt_*f_*R[2][j][i];
+        const float hv1 = R[2][j][i] + (F[2][ty][tx] - F[2][ty  ][tx+1]) * dt_ / dx_
+                                     + (G[2][ty][tx] - G[2][ty+1][tx  ]) * dt_ / dy_
+                                     + dt_*Y + dt_*f_*R[1][j][i];
+
+        __global float* const h_row  = (__global float*) ((__global char*) h1_ptr_ + h1_pitch_*tj);
+        __global float* const hu_row = (__global float*) ((__global char*) hu1_ptr_ + hu1_pitch_*tj);
+        __global float* const hv_row = (__global float*) ((__global char*) hv1_ptr_ + hv1_pitch_*tj);
+
+        const float C = 2.0f*r_*dt_/R[0][j][i]; // Bottom friction factor, applied as a divisor below
+
+        if  (step_ == 0) {
+            //First step of RK2 ODE integrator
+
+            h_row[ti] = h1;
+            hu_row[ti] = hu1 / (1.0f + C);
+            hv_row[ti] = hv1 / (1.0f + C);
+        }
+        else if (step_ == 1) {
+            //Second step of RK2 ODE integrator
+
+            //First read Q^n (the output buffers still hold it at this stage)
+            const float h_a  = h_row[ti];
+            const float hu_a = hu_row[ti];
+            const float hv_a = hv_row[ti];
+
+            //Compute Q^n+1 as the average of Q^n and the twice-advanced state
+            const float h_b  = 0.5f*(h_a + h1);
+            const float hu_b = 0.5f*(hu_a + hu1);
+            const float hv_b = 0.5f*(hv_a + hv1);
+
+            //Write to main memory
+            h_row[ti] = h_b;
+            hu_row[ti] = hu_b / (1.0f + 0.5f*C);
+            hv_row[ti] = hv_b / (1.0f + 0.5f*C);
+        }
+    }
+
+}
--- a/SWESimulators/CTCS.py
+++ b/SWESimulators/CTCS.py
@@ -0,0 +1,193 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements the Centered in Time, Centered in Space
+(leapfrog) numerical scheme for the shallow water equations, 
+described in 
+L. P. Røed, "Documentation of simple ocean models for use in ensemble
+predictions", Met no report 2012/3 and 2012/5 .
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+
+#Import packages we need
+import numpy as np
+import pyopencl as cl #OpenCL in Python
+import Common
+
+
+
+
+
+
+
+
+
+"""
+Class that solves the SW equations using the Centered in time centered in space scheme
+"""
+class CTCS:
+
+    """
+    Set up the simulator: command queue, the three CTCS kernels and the device data
+    H: Water depth incl ghost cells, (nx+2)*(ny+2) cells
+    eta0: Initial deviation from mean sea level incl ghost cells, (nx+2)*(ny+2) cells
+    hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+2) cells
+    hv0: Initial momentum along y-axis incl ghost cells, (nx+2)*(ny+1) cells
+    nx: Number of cells along x-axis
+    ny: Number of cells along y-axis
+    dx: Grid cell spacing along x-axis (20 000 m)
+    dy: Grid cell spacing along y-axis (20 000 m)
+    dt: Size of each timestep (90 s)
+    g: Gravitational acceleration (9.81 m/s^2)
+    f: Coriolis parameter (1.2e-4 s^-1)
+    r: Bottom friction coefficient (2.4e-3 m/s)
+    A: Eddy viscosity coefficient (O(dx))
+    wind_stress: Wind stress parameters
+    """
+    def __init__(self,
+                 cl_ctx,
+                 H, eta0, hu0, hv0,
+                 nx, ny,
+                 dx, dy, dt,
+                 g, f, r, A,
+                 wind_stress=Common.WindStressParams(),
+                 block_width=16, block_height=16):
+        self.cl_ctx = cl_ctx
+
+        # Command queue on which all kernels and downloads are issued
+        self.cl_queue = cl.CommandQueue(self.cl_ctx)
+
+        reload(Common)
+        # One program per updated quantity: U, V and eta
+        self.u_kernel = Common.get_kernel(self.cl_ctx, "CTCS_U_kernel.opencl", block_width, block_height)
+        self.v_kernel = Common.get_kernel(self.cl_ctx, "CTCS_V_kernel.opencl", block_width, block_height)
+        self.eta_kernel = Common.get_kernel(self.cl_ctx, "CTCS_eta_kernel.opencl", block_width, block_height)
+
+        # Upload the depth and the initial state to the device
+        halo_x = 1
+        halo_y = 1
+        self.H = Common.OpenCLArray2D(self.cl_ctx, nx, ny, halo_x, halo_y, H)
+        self.cl_data = Common.SWEDataArkawaC(self.cl_ctx, nx, ny, halo_x, halo_y, eta0, hu0, hv0)
+
+        # Store the scalar parameters as 32-bit numpy types,
+        # which is the data format passed on to the
+        # OpenCL kernels
+        self.nx = np.int32(nx)
+        self.ny = np.int32(ny)
+        self.dx = np.float32(dx)
+        self.dy = np.float32(dy)
+        self.dt = np.float32(dt)
+        self.g = np.float32(g)
+        self.f = np.float32(f)
+        self.r = np.float32(r)
+        self.A = np.float32(A)
+        self.wind_stress = wind_stress
+
+        # Simulation clock starts at zero
+        self.t = np.float32(0.0)
+
+        # Launch configuration: round the domain up to a whole number of work-groups
+        self.local_size = (block_width, block_height)
+        self.global_size = tuple(
+                int(np.ceil(cells / float(block)) * block)
+                for cells, block in zip((self.nx, self.ny), self.local_size)
+        )
+
+
+
+
+    """
+    Advance the simulation by t_end in steps of at most dt and return the new simulation time
+    """
+    def step(self, t_end=0.0):
+        max_steps = int(t_end / self.dt + 1)
+
+        for step_index in range(max_steps):
+            # Buffer roles during one iteration:
+            #  cl_data.hu0 holds U^{n-1} before the U kernel and U^{n+1} after it
+            #  cl_data.hu1 holds U^{n}
+            # cl_data.swap() then exchanges the two, so that
+            #  cl_data.hu0 holds U^{n}
+            #  cl_data.hu1 holds U^{n+1}
+            # and the next iteration can start (likewise for eta and V)
+
+            remaining = t_end - step_index*self.dt
+            local_dt = np.float32(min(self.dt, remaining))
+            if local_dt <= 0.0:
+                break
+
+            self.eta_kernel.computeEtaKernel(self.cl_queue, self.global_size, self.local_size,
+                    self.nx, self.ny,
+                    self.dx, self.dy, local_dt,
+                    self.g, self.f, self.r,
+                    self.cl_data.h0.data, self.cl_data.h0.pitch,     # eta^{n-1} => eta^{n+1}
+                    self.cl_data.hu1.data, self.cl_data.hu1.pitch,   # U^{n}
+                    self.cl_data.hv1.data, self.cl_data.hv1.pitch)   # V^{n}
+
+            self.u_kernel.computeUKernel(self.cl_queue, self.global_size, self.local_size,
+                    self.nx, self.ny,
+                    self.dx, self.dy, local_dt,
+                    self.g, self.f, self.r, self.A,
+                    self.H.data, self.H.pitch,
+                    self.cl_data.h1.data, self.cl_data.h1.pitch,      # eta^{n}
+                    self.cl_data.hu0.data, self.cl_data.hu0.pitch,    # U^{n-1} => U^{n+1}
+                    self.cl_data.hu1.data, self.cl_data.hu1.pitch,    # U^{n}
+                    self.cl_data.hv1.data, self.cl_data.hv1.pitch,    # V^{n}
+                    self.wind_stress.type,
+                    self.wind_stress.tau0, self.wind_stress.rho, self.wind_stress.alpha, self.wind_stress.xm, self.wind_stress.Rc,
+                    self.wind_stress.x0, self.wind_stress.y0,
+                    self.wind_stress.u0, self.wind_stress.v0,
+                    self.t)
+
+            self.v_kernel.computeVKernel(self.cl_queue, self.global_size, self.local_size,
+                    self.nx, self.ny,
+                    self.dx, self.dy, local_dt,
+                    self.g, self.f, self.r, self.A,
+                    self.H.data, self.H.pitch,
+                    self.cl_data.h1.data, self.cl_data.h1.pitch,     # eta^{n}
+                    self.cl_data.hu1.data, self.cl_data.hu1.pitch,   # U^{n}
+                    self.cl_data.hv0.data, self.cl_data.hv0.pitch,   # V^{n-1} => V^{n+1}
+                    self.cl_data.hv1.data, self.cl_data.hv1.pitch,   # V^{n}
+                    self.wind_stress.type,
+                    self.wind_stress.tau0, self.wind_stress.rho, self.wind_stress.alpha, self.wind_stress.xm, self.wind_stress.Rc,
+                    self.wind_stress.x0, self.wind_stress.y0,
+                    self.wind_stress.u0, self.wind_stress.v0,
+                    self.t)
+
+            # All three kernels have run: exchange the buffer pairs
+            self.cl_data.swap()
+
+            self.t += local_dt
+
+        return self.t
+
+
+
+    # Delegates to cl_data.download using this simulator's command queue
+    def download(self):
+        return self.cl_data.download(self.cl_queue)
+
+
+        
+
+
+
+
+
+
+
--- a/SWESimulators/CTCS2Layer.py
+++ b/SWESimulators/CTCS2Layer.py
@@ -0,0 +1,435 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements the Centered in Time, Centered in Space
+(leapfrog) numerical scheme for the shallow water equations, 
+described in 
+L. P. Røed, "Documentation of simple ocean models for use in ensemble
+predictions", Met no report 2012/3 and 2012/5 .
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+
+#Import packages we need
+import os
+import time
+import numpy as np
+import pyopencl as cl #OpenCL in Python
+
+
+
+
+
+
+
+
+
+
+
+
+
+"""
+Class that holds data for the SW equations in OpenCL
+"""
+class CTCS2LayerDataCL:
+    """
+    Uploads initial data to the CL device
+    """
+    def __init__(self, cl_ctx, h1_0, eta1_0, u1_0, v1_0, \
+                               h2_0, eta2_0, u2_0, v2_0):
+        #Make sure that the data is single precision floating point
+        if (not np.issubdtype(h1_0.dtype, np.float32) or np.isfortran(h1_0)):
+            print "Converting H_0"
+            h1_0 = h1_0.astype(np.float32, order='C')
+        if (not np.issubdtype(eta1_0.dtype, np.float32) or np.isfortran(eta1_0)):
+            print "Converting Eta_0"
+            eta1_0 = eta1_0.astype(np.float32, order='C')
+        if (not np.issubdtype(u1_0.dtype, np.float32) or np.isfortran(u1_0)):
+            print "Converting U_0"
+            u1_0 = u1_0.astype(np.float32, order='C')
+        if (not np.issubdtype(v1_0.dtype, np.float32) or np.isfortran(v1_0)):
+            print "Converting V_0"
+            v1_0 = v1_0.astype(np.float32, order='C')
+        
+        #Same for second (deepest) layer
+        if (not np.issubdtype(h2_0.dtype, np.float32) or np.isfortran(h2_0)):
+            print "Converting H2_0"
+            h2_0 = h2_0.astype(np.float32, order='C')
+        if (not np.issubdtype(eta2_0.dtype, np.float32) or np.isfortran(eta2_0)):
+            print "Converting Eta2_0"
+            eta2_0 = eta2_0.astype(np.float32, order='C')
+        if (not np.issubdtype(u2_0.dtype, np.float32) or np.isfortran(u2_0)):
+            print "Converting U2_0"
+            u2_0 = u2_0.astype(np.float32, order='C')
+        if (not np.issubdtype(v2_0.dtype, np.float32) or np.isfortran(v2_0)):
+            print "Converting V2_0"
+            v2_0 = v2_0.astype(np.float32, order='C')
+        
+        self.ny, self.nx = h1_0.shape
+        self.nx = self.nx - 2 # Ghost cells
+        self.ny = self.ny - 2
+
+        assert(h1_0.shape == (self.ny+2, self.nx+2))
+        assert(eta1_0.shape == (self.ny+2, self.nx+2))
+        assert(u1_0.shape == (self.ny+2, self.nx+1))
+        assert(v1_0.shape == (self.ny+1, self.nx+2))
+        
+        #Same for layer 2
+        assert(h2_0.shape == (self.ny+2, self.nx+2))
+        assert(eta2_0.shape == (self.ny+2, self.nx+2))
+        assert(u2_0.shape == (self.ny+2, self.nx+1))
+        assert(v2_0.shape == (self.ny+1, self.nx+2))
+
+        #Upload data to the device
+        mf = cl.mem_flags
+        self.h1_0 = cl.Buffer(cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=h1_0)
+        
+        self.eta1_0 = cl.Buffer(cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=eta1_0)
+        self.eta1_1 = cl.Buffer(cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=eta1_0)
+        
+        self.u1_0 = cl.Buffer(cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=u1_0)
+        self.u1_1 = cl.Buffer(cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=u1_0)
+        
+        self.v1_0 = cl.Buffer(cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=v1_0)
+        self.v1_1 = cl.Buffer(cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=v1_0)
+        
+        #Same for layer 2
+        self.h2_0 = cl.Buffer(cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=h2_0)
+        
+        self.eta2_0 = cl.Buffer(cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=eta2_0)
+        self.eta2_1 = cl.Buffer(cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=eta2_0)
+        
+        self.u2_0 = cl.Buffer(cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=u2_0)
+        self.u2_1 = cl.Buffer(cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=u2_0)
+        
+        self.v2_0 = cl.Buffer(cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=v2_0)
+        self.v2_1 = cl.Buffer(cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=v2_0)
+        
+        
+        
+        
+        
+        #Compute pitches
+        self.h1_0_pitch = np.int32(h1_0.shape[1]*4)
+        
+        self.eta1_0_pitch = np.int32(eta1_0.shape[1]*4)
+        self.eta1_1_pitch = np.int32(eta1_0.shape[1]*4)
+        
+        self.u1_0_pitch = np.int32(u1_0.shape[1]*4)
+        self.u1_1_pitch = np.int32(u1_0.shape[1]*4)
+        
+        self.v1_0_pitch = np.int32(v1_0.shape[1]*4)
+        self.v1_1_pitch = np.int32(v1_0.shape[1]*4)
+        
+        #Same for layer 2
+        self.h2_0_pitch = np.int32(h2_0.shape[1]*4)
+        
+        self.eta2_0_pitch = np.int32(eta2_0.shape[1]*4)
+        self.eta2_1_pitch = np.int32(eta2_0.shape[1]*4)
+        
+        self.u2_0_pitch = np.int32(u2_0.shape[1]*4)
+        self.u2_1_pitch = np.int32(u2_0.shape[1]*4)
+        
+        self.v2_0_pitch = np.int32(v2_0.shape[1]*4)
+        self.v2_1_pitch = np.int32(v2_0.shape[1]*4)
+        
+       
+    
+    """
+    Swaps the variables after a timestep has been completed
+    """
+    def swap(self):
+        self.eta1_1, self.eta1_0 = self.eta1_0, self.eta1_1
+        self.u1_1, self.u1_0 = self.u1_0, self.u1_1
+        self.v1_1, self.v1_0 = self.v1_0, self.v1_1
+        
+        #Same for layer 2
+        self.eta2_1, self.eta2_0 = self.eta2_0, self.eta2_1
+        self.u2_1, self.u2_0 = self.u2_0, self.u2_1
+        self.v2_1, self.v2_0 = self.v2_0, self.v2_1
+        
+        
+    """
+    Enables downloading data from CL device to Python
+    """
+    def download(self, cl_queue):
+        #Allocate data on the host for result
+        eta1_1 = np.empty((self.ny+2, self.nx+2), dtype=np.float32, order='C')
+        u1_1 = np.empty((self.ny+2, self.nx+1), dtype=np.float32, order='C')
+        v1_1 = np.empty((self.ny+1, self.nx+2), dtype=np.float32, order='C')
+        
+        #Same for layer 2
+        eta2_1 = np.empty((self.ny+2, self.nx+2), dtype=np.float32, order='C')
+        u2_1 = np.empty((self.ny+2, self.nx+1), dtype=np.float32, order='C')
+        v2_1 = np.empty((self.ny+1, self.nx+2), dtype=np.float32, order='C')
+        
+        #Copy data from device to host
+        cl.enqueue_copy(cl_queue, eta1_1, self.eta1_1)
+        cl.enqueue_copy(cl_queue, u1_1, self.u1_1)
+        cl.enqueue_copy(cl_queue, v1_1, self.v1_1)
+        
+        #Same for layer 2
+        cl.enqueue_copy(cl_queue, eta2_1, self.eta2_1)
+        cl.enqueue_copy(cl_queue, u2_1, self.u2_1)
+        cl.enqueue_copy(cl_queue, v2_1, self.v2_1)
+        
+        
+        #Return
+        return eta1_1, u1_1, v1_1, eta2_1, u2_1, v2_1
+        
+        
+        
+        
+        
+        
+
+
+
+
+
+
+
+
+
+
+"""
+Class that solves the SW equations using the Centered in time centered in space scheme
+"""
+class CTCS2Layer:
+
+    """
+    Initialization routine
+    h1_0: Water depth incl ghost cells, (nx+2)*(ny+2) cells
+    eta1_0: Initial deviation from mean sea level incl ghost cells, (nx+2)*(ny+2) cells
+    u1_0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+2) cells
+    v1_0: Initial momentum along y-axis incl ghost cells, (nx+2)*(ny+1) cells
+    h2_0: Water depth (layer 2) incl ghost cells, (nx+2)*(ny+2) cells
+    eta2_0: Initial deviation from mean sea level (layer 2) incl ghost cells, (nx+2)*(ny+2) cells
+    u2_0: Initial momentum (layer 2) along x-axis incl ghost cells, (nx+1)*(ny+2) cells
+    v2_0: Initial momentum (layer 2) along y-axis incl ghost cells, (nx+2)*(ny+1) cells
+    nx: Number of cells along x-axis
+    ny: Number of cells along y-axis
+    dx: Grid cell spacing along x-axis (20 000 m)
+    dy: Grid cell spacing along y-axis (20 000 m)
+    dt: Size of each timestep (90 s)
+    g: Gravitational accelleration (9.81 m/s^2)
+    f: Coriolis parameter (1.2e-4 s^1)
+    r: Bottom friction coefficient (2.4e-3 m/s)
+    r2: Inter-layer friction coefficient (m/s)
+    A: Eddy viscosity coefficient (O(dx))
+    rho1: Density of upper layer (1025.0 kg / m^3)
+    rho2: Density of lower layer (1000.0 kg / m^3)
+    wind_type: Type of wind stress, 0=Uniform along shore, 1=bell shaped along shore, 2=moving cyclone
+    wind_tau0: Amplitude of wind stress (Pa)
+    wind_alpha: Offshore e-folding length (1/(10*dx) = 5e-6 m^-1)
+    wind_xm: Maximum wind stress for bell shaped wind stress
+    wind_Rc: Distance to max wind stress from center of cyclone (10dx = 200 000 m)
+    wind_x0: Initial x position of moving cyclone (dx*(nx/2) - u0*3600.0*48.0)
+    wind_y0: Initial y position of moving cyclone (dy*(ny/2) - v0*3600.0*48.0)
+    wind_u0: Translation speed along x for moving cyclone (30.0/sqrt(5.0))
+    wind_v0: Translation speed along y for moving cyclone (-0.5*u0)
+    """
+    def __init__(self, \
+                 h1_0, eta1_0, u1_0, v1_0, \
+                 h2_0, eta2_0, u2_0, v2_0, \
+                 nx, ny, \
+                 dx, dy, dt, \
+                 g, f, r1, r2, A, \
+                 rho1, rho2,
+                 wind_type=99, # "no wind" \
+                 wind_tau0=0, wind_alpha=0, wind_xm=0, wind_Rc=0, \
+                 wind_x0=0, wind_y0=0, \
+                 wind_u0=0, wind_v0=0):
+        #Make sure we get compiler output from OpenCL
+        os.environ["PYOPENCL_COMPILER_OUTPUT"] = "1"
+
+        #Set which CL device to use
+        os.environ["PYOPENCL_CTX"] = "1"
+
+        #Create OpenCL context
+        self.cl_ctx = cl.create_some_context()
+        print "Using ", self.cl_ctx.devices[0].name
+
+        #Create an OpenCL command queue
+        self.cl_queue = cl.CommandQueue(self.cl_ctx)
+
+        #Get kernels
+        self.u_kernel = self.get_kernel("CTCS2Layer_U_kernel.opencl")
+        self.v_kernel = self.get_kernel("CTCS2Layer_V_kernel.opencl")
+        self.eta_kernel = self.get_kernel("CTCS2Layer_eta_kernel.opencl")
+        
+        #Create data by uploading to device
+        self.cl_data = CTCS2LayerDataCL(self.cl_ctx, h1_0, eta1_0, u1_0, v1_0, h2_0, eta2_0, u2_0, v2_0)
+        
+        #Save input parameters
+        #Notice that we need to specify them in the correct dataformat for the
+        #OpenCL kernel
+        self.nx = np.int32(nx)
+        self.ny = np.int32(ny)
+        self.dx = np.float32(dx)
+        self.dy = np.float32(dy)
+        self.dt = np.float32(dt)
+        self.g = np.float32(g)
+        self.f = np.float32(f)
+        self.r1 = np.float32(r1)
+        self.r2 = np.float32(r2)
+        self.A = np.float32(A)
+        assert(rho1 <= rho2)
+        self.rho1 = np.float32(rho1)
+        self.rho2 = np.float32(rho2)
+        self.wind_type = np.int32(wind_type)
+        self.wind_tau0 = np.float32(wind_tau0)
+        self.wind_alpha = np.float32(wind_alpha)
+        self.wind_xm = np.float32(wind_xm)
+        self.wind_Rc = np.float32(wind_Rc)
+        self.wind_x0 = np.float32(wind_x0)
+        self.wind_y0 = np.float32(wind_y0)
+        self.wind_u0 = np.float32(wind_u0)
+        self.wind_v0 = np.float32(wind_v0)
+        
+        #Initialize time
+        self.t = np.float32(0.0)
+        
+        #Compute kernel launch parameters
+        self.local_size = (8, 8) # WARNING::: MUST MATCH defines of block_width/height in kernels!
+        self.global_size = ( \
+                       int(np.ceil(self.nx / float(self.local_size[0])) * self.local_size[0]), \
+                       int(np.ceil(self.ny / float(self.local_size[1])) * self.local_size[1]) \
+                      ) 
+    
+    
+    
+    
+    """
+    Function which steps n timesteps
+    """
+    def step(self, t_end=0.0):
+        n = int(t_end / self.dt + 1)
+        
+        for i in range(0, n):
+            #Notation: 
+            # cl_data.u0 => U^{n-1} before U kernel, U^{n+1} after U kernel
+            # cl_data.u1 => U^{n}
+            # When we call cl_data.swap(), we swap these, so that
+            # cl_data.u0 => U^{n}
+            # cl_data.u1 => U^{n+1} (U kernel has been executed)
+            # Now we are ready for the next time step
+            
+            local_dt = np.float32(min(self.dt, t_end-i*self.dt))
+            
+            if (local_dt <= 0.0):
+                break
+            
+            self.eta_kernel.computeEtaKernel(self.cl_queue, self.global_size, self.local_size, \
+                    self.nx, self.ny, \
+                    self.dx, self.dy, local_dt, \
+                    \
+                    self.cl_data.eta1_0, self.cl_data.eta1_0_pitch,    # eta^{n-1} => eta^{n+1} \
+                    self.cl_data.u1_1, self.cl_data.u1_1_pitch,        # U^{n} \
+                    self.cl_data.v1_1, self.cl_data.v1_1_pitch,        # V^{n}
+                    \
+                    self.cl_data.eta2_0, self.cl_data.eta2_0_pitch, \
+                    self.cl_data.u2_1, self.cl_data.u2_1_pitch, \
+                    self.cl_data.v2_1, self.cl_data.v2_1_pitch)
+            
+            self.u_kernel.computeUKernel(self.cl_queue, self.global_size, self.local_size, \
+                    self.nx, self.ny, \
+                    self.dx, self.dy, local_dt, \
+                    self.g, self.f, \
+                    self.r1, self.r2, \
+                    self.A, \
+                    self.rho1, self.rho2, \
+                    \
+                    self.cl_data.h1_0, self.cl_data.h1_0_pitch, \
+                    self.cl_data.eta1_1, self.cl_data.eta1_1_pitch, # eta^{n} \
+                    self.cl_data.u1_0, self.cl_data.u1_0_pitch,     # U^{n-1} => U^{n+1} \
+                    self.cl_data.u1_1, self.cl_data.u1_1_pitch,     # U^{n} \
+                    self.cl_data.v1_1, self.cl_data.v1_1_pitch,     # V^{n} \
+                    \
+                    self.cl_data.h2_0, self.cl_data.h2_0_pitch, \
+                    self.cl_data.eta2_1, self.cl_data.eta2_1_pitch, \
+                    self.cl_data.u2_0, self.cl_data.u2_0_pitch, \
+                    self.cl_data.u2_1, self.cl_data.u2_1_pitch, \
+                    self.cl_data.v2_1, self.cl_data.v2_1_pitch, \
+                    \
+                    self.wind_type, \
+                    self.wind_tau0, self.wind_alpha, self.wind_xm, self.wind_Rc, \
+                    self.wind_x0, self.wind_y0, \
+                    self.wind_u0, self.wind_v0, \
+                    self.t)
+
+            self.v_kernel.computeVKernel(self.cl_queue, self.global_size, self.local_size, \
+                    self.nx, self.ny, \
+                    self.dx, self.dy, local_dt, \
+                    self.g, self.f, \
+                    self.r1, self.r2, \
+                    self.A, \
+                    self.rho1, self.rho2, \
+                    \
+                    self.cl_data.h1_0, self.cl_data.h1_0_pitch, \
+                    self.cl_data.eta1_1, self.cl_data.eta1_1_pitch,   # eta^{n} \
+                    self.cl_data.u1_1, self.cl_data.u1_1_pitch,       # U^{n} \
+                    self.cl_data.v1_0, self.cl_data.v1_0_pitch,       # V^{n-1} => V^{n+1} \
+                    self.cl_data.v1_1, self.cl_data.v1_1_pitch,       # V^{n} \
+                    \
+                    self.cl_data.h2_0, self.cl_data.h2_0_pitch, \
+                    self.cl_data.eta2_1, self.cl_data.eta2_1_pitch, \
+                    self.cl_data.u2_1, self.cl_data.u2_1_pitch, \
+                    self.cl_data.v2_0, self.cl_data.v2_0_pitch, \
+                    self.cl_data.v2_1, self.cl_data.v2_1_pitch, \
+                    \
+                    self.wind_type, \
+                    self.wind_tau0, self.wind_alpha, self.wind_xm, self.wind_Rc, \
+                    self.wind_x0, self.wind_y0, \
+                    self.wind_u0, self.wind_v0, \
+                    self.t)
+                    
+                    
+            #After the kernels, swap the data pointers
+            self.cl_data.swap()
+            
+            self.t += local_dt
+        
+        return self.t
+    
+    """
+    Static function which reads a text file and creates an OpenCL kernel from that
+    """
+    def get_kernel(self, kernel_filename):
+        #Read the proper program
+        module_path = os.path.dirname(os.path.realpath(__file__))
+        fullpath = os.path.join(module_path, kernel_filename)
+        with open(fullpath, "r") as kernel_file:
+            kernel_string = kernel_file.read()
+            kernel = cl.Program(self.cl_ctx, kernel_string).build()
+            
+        return kernel
+    
+    
+    
+    def download(self):
+        return self.cl_data.download(self.cl_queue)
+
+
+        
+
+
+
+
+
+
+
--- a/SWESimulators/CTCS2Layer_U_kernel.opencl
+++ b/SWESimulators/CTCS2Layer_U_kernel.opencl
@@ -0,0 +1,414 @@
+/**
+This OpenCL kernel implements part of the Centered in Time, Centered 
+in Space (leapfrog) numerical scheme for the shallow water equations, 
+described in 
+L. P. Røed, "Documentation of simple ocean models for use in ensemble
+predictions", Met no report 2012/3 and 2012/5 .
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+// Work-group (block) dimensions. The host code in CTCS2Layer.py launches the
+// kernels with local_size = (8, 8) and warns that it must match these defines.
+#define block_height 8
+#define block_width 8
+
+
+// Local memory tile types, indexed as [row][column]. Each tile is the block
+// size plus one or two extra rows/columns, so that the stencil in
+// computeUKernel can reach the neighbours of the cells along the block edge.
+// eta_shmem is used for both the H and the eta tiles.
+typedef __local float eta_shmem[block_height+2][block_width+1];
+typedef __local float u_shmem[block_height+2][block_width+2];
+typedef __local float v_shmem[block_height+1][block_width+1];
+
+
+
+/**
+  * Returns the x-component of the wind forcing for the calling work-item.
+  * The position is derived from get_global_id() and the grid spacing, and the
+  * result is tau0_/rho_ scaled by a shape factor selected by
+  * wind_stress_type_:
+  *   0: uniform along shore, decaying as exp(-alpha_*y)
+  *   1: bell shaped along shore, only for t_ <= 48 hours
+  *   2: moving cyclone, evaluated at time t_+dt_
+  * Any other type (and type 1 after 48 hours) gives 0.
+  */
+float windStressX(int wind_stress_type_,
+                float dx_, float dy_, float dt_,
+                float tau0_, float rho_, float alpha_, float xm_, float Rc_,
+                float x0_, float y0_,
+                float u0_, float v0_,
+                float t_) {
+    
+    //UNIFORM_ALONGSHORE
+    if (wind_stress_type_ == 0) {
+        const float y_pos = (get_global_id(1)+0.5f)*dy_;
+        return tau0_/rho_ * exp(-alpha_*y_pos);
+    }
+    
+    //BELL_SHAPED_ALONGSHORE
+    if (wind_stress_type_ == 1) {
+        if (t_ <= 48.0f*3600.0f) {
+            const float y_pos = (get_global_id(1)+0.5f)*dy_;
+            const float scaled_offset = alpha_*((get_global_id(0)+0.5f)*dx_-xm_);
+            const float offset_sq = scaled_offset*scaled_offset;
+            return tau0_/rho_ * exp(-offset_sq) * exp(-alpha_*y_pos);
+        }
+        //No forcing after the first 48 hours
+        return 0.0f;
+    }
+    
+    //MOVING_CYCLONE
+    if (wind_stress_type_ == 2) {
+        const float x_pos = (get_global_id(0))*dx_;
+        const float y_pos = (get_global_id(1)+0.5f)*dy_;
+        
+        //Offset from the cyclone center at time t_+dt_
+        const float delta_x = (x_pos-x0_-u0_*(t_+dt_));
+        const float delta_y = (y_pos-y0_-v0_*(t_+dt_));
+        const float dist = sqrt(delta_x*delta_x+delta_y*delta_y);
+        
+        const float rel = 1.0f - dist/Rc_;
+        const float rel_sq = rel*rel;
+        
+        return -(tau0_/rho_) * (delta_y/Rc_) * exp(-0.5f*rel_sq);
+    }
+    
+    //Unknown type: no wind
+    return 0.0f;
+}
+
+
+
+
+/**
+  * Kernel that evolves U one step in time.
+  *
+  * Both layers are updated by the same work-item: the U^{n-1} values are read
+  * from U1_0_ptr_ / U2_0_ptr_ and overwritten there with U^{n+1}. All other
+  * arrays are only read. Every array comes with a pitch, which is the row
+  * stride in bytes (rows are addressed through a char pointer below).
+  *
+  * One work-item handles U at column ti and row tj, where ti and tj are the
+  * global ids offset by one to skip the ghost cells. Results are only written
+  * for 0 < ti < nx_ and 0 < tj < ny_+1.
+  */
+__kernel void computeUKernel(
+        //Discretization parameters
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+    
+        //Physical parameters
+        float g_, //< Gravitational constant
+        float f_, //< Coriolis coefficient
+        float r1_, //< Inter-layer friction coefficient
+        float r2_, //< Bottom friction coefficient
+    
+        //Numerical diffusion
+        float A_,
+        
+        //Density of each layer
+        float rho1_,
+        float rho2_,
+    
+        //Data for layer 1
+        __global float* H1_ptr_, int H1_pitch_,
+        __global float* eta1_1_ptr_, int eta1_1_pitch_, // eta^n
+        __global float* U1_0_ptr_, int U1_0_pitch_, // U^n-1, also output, U^n+1
+        __global float* U1_1_ptr_, int U1_1_pitch_, // U^n
+        __global float* V1_1_ptr_, int V1_1_pitch_, // V^n
+        
+        //Data for layer 2
+        __global float* H2_ptr_, int H2_pitch_,
+        __global float* eta2_1_ptr_, int eta2_1_pitch_, // eta^n
+        __global float* U2_0_ptr_, int U2_0_pitch_, // U^n-1, also output, U^n+1
+        __global float* U2_1_ptr_, int U2_1_pitch_, // U^n
+        __global float* V2_1_ptr_, int V2_1_pitch_, // V^n
+    
+        // Wind stress parameters
+        int wind_stress_type_, 
+        float tau0_, float alpha_, float xm_, float Rc_,
+        float x0_, float y0_,
+        float u0_, float v0_,
+        float t_) {
+                    
+    //Local memory tiles, one set per layer
+    eta_shmem H1_shared;
+    eta_shmem eta1_shared;
+    u_shmem U1_shared;
+    v_shmem V1_shared;
+    
+    eta_shmem H2_shared;
+    eta_shmem eta2_shared;
+    u_shmem U2_shared;
+    v_shmem V2_shared;
+   
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    //Start of block within domain
+    const int bx = get_local_size(0) * get_group_id(0) + 1; //Skip global ghost cells
+    const int by = get_local_size(1) * get_group_id(1) + 1; //Skip global ghost cells
+
+    //Index of cell within domain
+    const int ti = bx + tx;
+    const int tj = by + ty;
+    
+    //Compute pointer to current row in the U array
+    //(only dereferenced inside the bounds checks below)
+    __global float* const U1_0_row = (__global float*) ((__global char*) U1_0_ptr_ + U1_0_pitch_*tj);
+    __global float* const U2_0_row = (__global float*) ((__global char*) U2_0_ptr_ + U2_0_pitch_*tj);
+
+    //Read U^{n-1} from the *_0 arrays (0.0f is used outside the interior)
+    float U1_0 = 0.0f;
+    float U2_0 = 0.0f;
+    if (ti > 0 && ti < nx_ && tj > 0 && tj < ny_+1) {
+        U1_0 = U1_0_row[ti];
+        U2_0 = U2_0_row[ti];
+    }
+
+    //Read H and eta into shared memory: (nx+1)*(ny+2) cells
+    //The tile is larger than the work-group, so each thread strides over the
+    //tile by the work-group size. Tile entry [j][i] holds row by+j-1 and
+    //column bx+i of the global array.
+    for (int j=ty; j<block_height+2; j+=get_local_size(1)) {
+        // "fake" global ghost cells by clamping
+        const int l = clamp(by + j - 1, 1, ny_);
+        
+        //Compute the pointer to current row in the H and eta arrays
+        __global float* const H1_row = (__global float*) ((__global char*) H1_ptr_ + H1_pitch_*l);
+        __global float* const H2_row = (__global float*) ((__global char*) H2_ptr_ + H2_pitch_*l);
+        
+        __global float* const eta1_1_row = (__global float*) ((__global char*) eta1_1_ptr_ + eta1_1_pitch_*l);
+        __global float* const eta2_1_row = (__global float*) ((__global char*) eta2_1_ptr_ + eta2_1_pitch_*l);
+        
+        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
+            // "fake" global ghost cells by clamping
+            const int k = clamp(bx + i, 1, nx_);
+            
+            H1_shared[j][i] = H1_row[k];
+            H2_shared[j][i] = H2_row[k];
+            
+            eta1_shared[j][i] = eta1_1_row[k];
+            eta2_shared[j][i] = eta2_1_row[k];
+        }
+    }
+
+    //Read U into shared memory: (nx+2)*(ny+2) cells
+    //Tile entry [j][i] holds row by+j-1 and column bx+i-1 of U^n
+    for (int j=ty; j<block_height+2; j+=get_local_size(1)) {
+        // "fake" ghost cells by clamping
+        const int l = clamp(by + j - 1, 1, ny_);
+        
+        //Compute the pointer to current row in the U array
+        __global float* const U1_1_row = (__global float*) ((__global char*) U1_1_ptr_ + U1_1_pitch_*l);
+        __global float* const U2_1_row = (__global float*) ((__global char*) U2_1_ptr_ + U2_1_pitch_*l);
+        
+        for (int i=tx; i<block_width+2; i+=get_local_size(0)) {
+            // Prevent out-of-bounds
+            const int k = clamp(bx + i - 1, 0, nx_);
+            
+            U1_shared[j][i] = U1_1_row[k];
+            U2_shared[j][i] = U2_1_row[k];
+        }
+    }
+    
+
+    //Read V into shared memory: (nx+1)*(ny+1) cells
+    //Tile entry [j][i] holds row by+j-1 and column bx+i of V^n
+    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
+        // Prevent out-of-bounds
+        const int l = clamp(by + j - 1, 0, ny_);
+        
+        //Compute the pointer to current row in the V array
+        __global float* const V1_1_row = (__global float*) ((__global char*) V1_1_ptr_ + V1_1_pitch_*l);
+        __global float* const V2_1_row = (__global float*) ((__global char*) V2_1_ptr_ + V2_1_pitch_*l);
+        
+        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
+            // "fake" ghost cells by clamping
+            const int k = clamp(bx + i, 1, nx_);
+            
+            V1_shared[j][i] = V1_1_row[k];
+            V2_shared[j][i] = V2_1_row[k];
+        }
+    }
+    
+    //Make sure all threads have read into shared mem
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    
+    /**
+      * Now get all our required variables as short-hands
+      * here we use the notation of
+      *  Var1_00 as var_i,j for layer 1
+      *  Var2_p0 as var_i+1,j for layer 2
+      *  Var1_0m as var_i,j-1 for layer 1
+      * etc
+      */
+    //Layer 1
+    const float U1_00 = U1_shared[ty+1][tx+1]; //U at "center"
+    const float U1_0p = U1_shared[ty+2][tx+1]; //U at "north"
+    const float U1_0m = U1_shared[ty  ][tx+1]; //U at "south"
+    const float U1_p0 = U1_shared[ty+1][tx+2]; //U at "east"
+    const float U1_m0 = U1_shared[ty+1][tx  ]; //U at "west"
+    
+    const float V1_00 = V1_shared[ty+1][tx  ];
+    const float V1_p0 = V1_shared[ty+1][tx+1];
+    const float V1_0m = V1_shared[ty  ][tx  ];
+    const float V1_pm = V1_shared[ty  ][tx+1];
+    
+    const float H1_0m = H1_shared[ty  ][tx  ]; 
+    const float H1_00 = H1_shared[ty+1][tx  ]; 
+    const float H1_0p = H1_shared[ty+2][tx  ];
+    const float H1_pm = H1_shared[ty  ][tx+1];
+    const float H1_p0 = H1_shared[ty+1][tx+1]; 
+    const float H1_pp = H1_shared[ty+2][tx+1];
+    
+    const float eta1_0m = eta1_shared[ty  ][tx  ]; 
+    const float eta1_00 = eta1_shared[ty+1][tx  ]; 
+    const float eta1_0p = eta1_shared[ty+2][tx  ];
+    const float eta1_pm = eta1_shared[ty  ][tx+1];
+    const float eta1_p0 = eta1_shared[ty+1][tx+1]; 
+    const float eta1_pp = eta1_shared[ty+2][tx+1];
+    
+    
+    //Layer 2 (bottom)
+    const float U2_00 = U2_shared[ty+1][tx+1]; 
+    const float U2_0p = U2_shared[ty+2][tx+1]; 
+    const float U2_0m = U2_shared[ty  ][tx+1]; 
+    const float U2_p0 = U2_shared[ty+1][tx+2]; 
+    const float U2_m0 = U2_shared[ty+1][tx  ]; 
+    
+    const float V2_00 = V2_shared[ty+1][tx  ];
+    const float V2_p0 = V2_shared[ty+1][tx+1];
+    const float V2_0m = V2_shared[ty  ][tx  ];
+    const float V2_pm = V2_shared[ty  ][tx+1];
+
+    const float H2_0m = H2_shared[ty  ][tx  ]; 
+    const float H2_00 = H2_shared[ty+1][tx  ]; 
+    const float H2_0p = H2_shared[ty+2][tx  ];
+    const float H2_pm = H2_shared[ty  ][tx+1];
+    const float H2_p0 = H2_shared[ty+1][tx+1]; 
+    const float H2_pp = H2_shared[ty+2][tx+1];
+    
+    const float eta2_0m = eta2_shared[ty  ][tx  ]; 
+    const float eta2_00 = eta2_shared[ty+1][tx  ]; 
+    const float eta2_0p = eta2_shared[ty+2][tx  ];
+    const float eta2_pm = eta2_shared[ty  ][tx+1];
+    const float eta2_p0 = eta2_shared[ty+1][tx+1]; 
+    const float eta2_pp = eta2_shared[ty+2][tx+1];
+
+    
+    
+    //Reconstruct Eta_bar as four-cell averages: columns i and i+1 over
+    //rows j-1 and j (_0m) and over rows j and j+1 (_00)
+    const float eta1_bar_0m = 0.25f*(eta1_0m + eta1_pm + eta1_00 + eta1_p0);
+    const float eta1_bar_00 = 0.25f*(eta1_00 + eta1_p0 + eta1_0p + eta1_pp);
+    
+    const float eta2_bar_0m = 0.25f*(eta2_0m + eta2_pm + eta2_00 + eta2_p0);
+    const float eta2_bar_00 = 0.25f*(eta2_00 + eta2_p0 + eta2_0p + eta2_pp);
+
+    
+    
+    
+    //Reconstruct H_bar (same four-cell averages as Eta_bar) and H_x, the
+    //average of cells i and i+1 (at the U position)
+    const float H1_bar_0m = 0.25f*(H1_0m + H1_pm + H1_00 + H1_p0);
+    const float H1_bar_00 = 0.25f*(H1_00 + H1_p0 + H1_0p + H1_pp);
+    const float H1_x = 0.5f*(H1_00 + H1_p0);
+    
+    const float H2_bar_0m = 0.25f*(H2_0m + H2_pm + H2_00 + H2_p0);
+    const float H2_bar_00 = 0.25f*(H2_00 + H2_p0 + H2_0p + H2_pp);
+    const float H2_x = 0.5f*(H2_00 + H2_p0);
+    
+    
+    
+    //Compute layer thicknesses: the top layer uses H1 + eta1 - eta2,
+    //the bottom layer uses H2 + eta2
+    const float h1_p0 = H1_p0 + eta1_p0 - eta2_p0;
+    const float h1_00 = H1_00 + eta1_00 - eta2_00;
+    const float h1_bar_0m = H1_bar_0m + eta1_bar_0m - eta2_bar_0m;
+    const float h1_bar_00 = H1_bar_00 + eta1_bar_00 - eta2_bar_00;
+    
+    const float h2_p0 = H2_p0 + eta2_p0;
+    const float h2_00 = H2_00 + eta2_00;
+    const float h2_bar_0m = H2_bar_0m + eta2_bar_0m;
+    const float h2_bar_00 = H2_bar_00 + eta2_bar_00;
+    
+    
+    
+    //Compute pressure components
+    //h1_x and h2_x are the layer thicknesses averaged over cells i and i+1
+    const float h1_x = 0.5f*(h1_p0 + h1_00);
+    const float h2_x = 0.5f*(h2_p0 + h2_00);
+    
+    //Earlier formulation, kept for reference (not compiled)
+    //const float epsilon = (rho2_ - rho1_)/rho2_;
+    //const float P1_x = -g_*h1_x * (eta1_p0 - eta1_00 + h2_p0 - h2_00) * (1.0f - epsilon);
+    //const float P2_x = -g_*h2_x * (eta2_p0 - eta2_00 + H2_p0 - H2_00);
+    
+    //P2_x weights the top layer contribution by rho1_/rho2_ and the bottom
+    //layer contribution by (rho2_ - rho1_)/rho2_
+    const float P1_x = - g_*h1_x*(eta1_p0 - eta1_00) - 0.5f*g_*(eta1_p0*eta1_p0 - eta1_00*eta1_00);
+    const float P2_x = - g_ * (rho1_/rho2_) * 
+                            ( //Pressure contribution from top layer
+                            h2_x*(eta1_p0 - eta1_00) + 0.5f*(eta1_p0*eta1_p0 - eta1_00*eta1_00) 
+                            )
+                       - g_ * ((rho2_ - rho1_)/rho2_) * 
+                            ( //Pressure contribution from bottom layer
+                            h2_x*(eta2_p0 - eta2_00) + 0.5f*(eta2_p0*eta2_p0 - eta2_00*eta2_00) 
+                            );
+    
+
+    
+    
+    
+    //Reconstruct V at the U position (average of the four surrounding V values)
+    const float V1_bar = 0.25f*(V1_0m + V1_00 + V1_pm + V1_p0);
+    const float V2_bar = 0.25f*(V2_0m + V2_00 + V2_pm + V2_p0);
+
+    
+    
+    
+    //Calculate the bottom and/or inter-layer friction coefficient
+    //C1 is built from r1_ (inter-layer), C2 from r2_ (bottom)
+    //FIXME: Should this be h instead of H?
+    const float C1 = r1_/H1_x;
+    const float C2 = r2_/H2_x;
+
+    
+    
+    
+    //Calculate numerical diffusion / subgrid energy loss coefficient
+    //D equals 2*A_*dt_*(1/dx_^2 + 1/dy_^2) and ends up in the denominator
+    //of the final update
+    const float D = 2.0f*A_*dt_*(dx_*dx_ + dy_*dy_)/(dx_*dx_*dy_*dy_);
+    
+    
+    
+    //Calculate nonlinear effects
+    const float N1_a = (U1_p0 + U1_00)*(U1_p0 + U1_00) / (h1_p0);
+    const float N1_b = (U1_00 + U1_m0)*(U1_00 + U1_m0) / (h1_00);
+    const float N1_c = (U1_0p + U1_00)*(V1_p0 + V1_00) / (h1_bar_00);
+    const float N1_d = (U1_00 + U1_0m)*(V1_pm + V1_0m) / (h1_bar_0m);
+    const float N1 = 0.25f*( N1_a - N1_b + (dx_/dy_)*(N1_c - N1_d) );
+    
+    const float N2_a = (U2_p0 + U2_00)*(U2_p0 + U2_00) / (h2_p0);
+    const float N2_b = (U2_00 + U2_m0)*(U2_00 + U2_m0) / (h2_00);
+    const float N2_c = (U2_0p + U2_00)*(V2_p0 + V2_00) / (h2_bar_00);
+    const float N2_d = (U2_00 + U2_0m)*(V2_pm + V2_0m) / (h2_bar_0m);
+    const float N2 = 0.25f*( N2_a - N2_b + (dx_/dy_)*(N2_c - N2_d) );
+    
+    
+    
+    
+    //Calculate eddy viscosity terms
+    //The neighbours are taken from U^n, while the center value is U^{n-1}
+    //(U1_0 / U2_0) and enters once instead of the usual 2*U^n. This appears
+    //to be paired with the (1.0f + D) denominator below, i.e. the U^{n+1}
+    //part of the center value is treated implicitly.
+    const float E1 = (U1_p0 - U1_0 + U1_m0)/(dx_*dx_) + (U1_0p - U1_0 + U1_0m)/(dy_*dy_);
+    const float E2 = (U2_p0 - U2_0 + U2_m0)/(dx_*dx_) + (U2_0p - U2_0 + U2_0m)/(dy_*dy_);
+    
+    
+    
+    //Calculate the wind shear stress for the top layer
+    //(rho1_ is passed as the density)
+    const float X = windStressX(
+        wind_stress_type_, 
+        dx_, dy_, dt_,
+        tau0_, rho1_, alpha_, xm_, Rc_,
+        x0_, y0_,
+        u0_, v0_,
+        t_);
+    
+    
+    
+    //Compute U at the next timestep
+    //Leapfrog update from U^{n-1} with a step of 2*dt_. Wind stress X only
+    //acts on layer 1. Each layer gets C1 times the U^{n-1} of the other layer
+    //as a source term, and the bottom friction C2 only appears in the
+    //denominator of the layer 2 update.
+    float U1_2 = (U1_0 + 2.0f*dt_*(f_*V1_bar + (N1 + P1_x)/dx_ + X + C1*U2_0 + A_*E1) ) / (1.0f + D);
+    float U2_2 = (U2_0 + 2.0f*dt_*(f_*V2_bar + (N2 + P2_x)/dx_     + C1*U1_0 + A_*E2) ) / (1.0f + 2.0f*dt_*C2 + D);
+
+    
+    
+    
+    //Write to main memory for internal cells
+    if (ti > 0 && ti < nx_ && tj > 0 && tj < ny_+1) {
+        U1_0_row[ti] = U1_2;
+        U2_0_row[ti] = U2_2;
+    }
+}
+
+
+
+
+
+
+
--- a/SWESimulators/CTCS2Layer_V_kernel.opencl
+++ b/SWESimulators/CTCS2Layer_V_kernel.opencl
@@ -0,0 +1,395 @@
+/**
+This OpenCL kernel implements part of the Centered in Time, Centered 
+in Space (leapfrog) numerical scheme for the shallow water equations, 
+described in 
+L. P. Røed, "Documentation of simple ocean models for use in ensemble
+predictions", Met no report 2012/3 and 2012/5 .
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+#define block_height 8 //Tile height used to size the __local arrays below (NOTE(review): presumably also the work-group height set by the host - confirm)
+#define block_width 8  //Tile width used to size the __local arrays below (NOTE(review): presumably also the work-group width set by the host - confirm)
+//Local-memory tile types: one block plus the extra rows/columns of neighbours that computeVKernel reads
+typedef __local float eta_shmem[block_height+1][block_width+2]; //Used for H and eta: one extra row, two extra columns
+typedef __local float u_shmem[block_height+1][block_width+1];   //Used for U: one extra row, one extra column
+typedef __local float v_shmem[block_height+2][block_width+2];   //Used for V: two extra rows, two extra columns
+    
+    
+    
+/**
+  * Returns the y-component of the wind stress for the calling work-item.
+  *
+  * Only wind_stress_type_ == 2 (moving cyclone) gives a non-zero value;
+  * every other type returns 0.0f. The parameters alpha_ and xm_ are
+  * accepted but not used by this function.
+  */
+float windStressY(int wind_stress_type_,
+                float dx_, float dy_, float dt_,
+                float tau0_, float rho_, float alpha_, float xm_, float Rc_,
+                float x0_, float y0_,
+                float u0_, float v0_,
+                float t_) {
+    //Every wind stress type except the moving cyclone contributes nothing
+    if (wind_stress_type_ != 2) {
+        return 0.0f;
+    }
+
+    //Position of this work-item: half a cell offset in x, no offset in y
+    const float pos_x = (get_global_id(0)+0.5f)*dx_;
+    const float pos_y = (get_global_id(1))*dy_;
+
+    //Offset from the cyclone centre, which starts at (x0_, y0_) and
+    //moves with velocity (u0_, v0_); evaluated at time t_+dt_
+    const float off_x = (pos_x-x0_-u0_*(t_+dt_));
+    const float off_x_sq = off_x*off_x;
+    const float off_y = (pos_y-y0_-v0_*(t_+dt_));
+    const float off_y_sq = off_y*off_y;
+
+    //Distance from the centre, relative to the radius Rc_
+    const float dist = sqrt(off_x_sq+off_y_sq);
+    const float rel = 1.0f - dist/Rc_;
+    const float rel_sq = rel*rel;
+
+    return (tau0_/rho_) * (off_x/Rc_) * exp(-0.5f*rel_sq);
+}
+
+
+
+
+/**
+  * Kernel that evolves V one step in time, for both layers.
+  *
+  * Inputs are H, eta^n, U^n and V^n of each layer. V^n-1 is read from the
+  * V1_0 / V2_0 buffers and the new value V^n+1 is written back to the same
+  * buffers. Every pitch argument is a row stride in bytes (rows are located
+  * by offsetting a char pointer).
+  *
+  * Each work-item handles the entry (ti, tj); the V1_0 / V2_0 buffers are
+  * only read and written where 0 < ti < nx_+1 and 0 < tj < ny_.
+  * The wind stress Y is added to layer 1 only.
+  */
+__kernel void computeVKernel(
+        //Discretization parameters
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+    
+        //Physical parameters
+        float g_, //< Gravitational constant
+        float f_, //< Coriolis coefficient
+        float r1_, //< Inter-layer friction coefficient
+        float r2_, //< Bottom friction coefficient
+    
+        //Numerical diffusion
+        float A_,
+        
+        //Density of each layer
+        float rho1_,
+        float rho2_,
+    
+        //Data for layer 1
+        __global float* H1_ptr_, int H1_pitch_,
+        __global float* eta1_1_ptr_, int eta1_1_pitch_, // eta^n
+        __global float* U1_1_ptr_, int U1_1_pitch_,     // U^n
+        __global float* V1_0_ptr_, int V1_0_pitch_,     // V^n-1, also output V^n+1
+        __global float* V1_1_ptr_, int V1_1_pitch_,     // V^n
+        
+        //Data for layer 2
+        __global float* H2_ptr_, int H2_pitch_,
+        __global float* eta2_1_ptr_, int eta2_1_pitch_, // eta^n
+        __global float* U2_1_ptr_, int U2_1_pitch_,     // U^n
+        __global float* V2_0_ptr_, int V2_0_pitch_,     // V^n-1, also output V^n+1
+        __global float* V2_1_ptr_, int V2_1_pitch_,     // V^n
+    
+        // Wind stress parameters
+        int wind_stress_type_, 
+        float tau0_, float alpha_, float xm_, float Rc_,
+        float x0_, float y0_,
+        float u0_, float v0_,
+        float t_) {
+    //Local-memory tiles for layer 1, followed by the same set for layer 2 (bottom)
+    eta_shmem H1_shared;
+    eta_shmem eta1_shared;
+    u_shmem U1_shared;
+    v_shmem V1_shared;
+    
+    eta_shmem H2_shared;
+    eta_shmem eta2_shared;
+    u_shmem U2_shared;
+    v_shmem V2_shared;
+
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    //Start of block within domain
+    const int bx = get_local_size(0) * get_group_id(0) + 1; //Skip global ghost cells
+    const int by = get_local_size(1) * get_group_id(1) + 1; //Skip global ghost cells
+
+    //Index of cell within domain
+    const int ti = bx + tx;
+    const int tj = by + ty;
+    
+    //Compute pointer to current row in the V array (pitch is a byte offset, hence the char pointer cast)
+    __global float* const V1_0_row = (__global float*) ((__global char*) V1_0_ptr_ + V1_0_pitch_*tj);
+    __global float* const V2_0_row = (__global float*) ((__global char*) V2_0_ptr_ + V2_0_pitch_*tj);
+
+    //Read current V (the V^n-1 values); work-items outside the written range keep 0.0f
+    float V1_0 = 0.0f;
+    float V2_0 = 0.0f;
+    if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_) {
+        V1_0 = V1_0_row[ti];
+        V2_0 = V2_0_row[ti];
+    }
+
+    //Read H and eta into shared memory: (nx+2)*(ny+1) cells
+    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
+        // "fake" global ghost cells by clamping
+        const int l = clamp(by + j, 1, ny_);
+        
+        //Compute the pointer to current row in the H and eta arrays
+        __global float* const H1_row = (__global float*) ((__global char*) H1_ptr_ + H1_pitch_*l);
+        __global float* const H2_row = (__global float*) ((__global char*) H2_ptr_ + H2_pitch_*l);
+        
+        __global float* const eta1_1_row = (__global float*) ((__global char*) eta1_1_ptr_ + eta1_1_pitch_*l);
+        __global float* const eta2_1_row = (__global float*) ((__global char*) eta2_1_ptr_ + eta2_1_pitch_*l);
+        
+        for (int i=tx; i<block_width+2; i+=get_local_size(0)) {
+            // "fake" global ghost cells by clamping
+            const int k = clamp(bx + i - 1, 1, nx_);
+            
+            H1_shared[j][i] = H1_row[k];
+            H2_shared[j][i] = H2_row[k];
+            
+            eta1_shared[j][i] = eta1_1_row[k];
+            eta2_shared[j][i] = eta2_1_row[k];
+        }
+    }
+
+    //Read U into shared memory: (nx+1)*(ny+1) cells
+    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
+        // "fake" ghost cells by clamping
+        const int l = clamp(by + j, 1, ny_);
+        
+        //Compute the pointer to current row in the U array
+        __global float* const U1_1_row = (__global float*) ((__global char*) U1_1_ptr_ + U1_1_pitch_*l);
+        __global float* const U2_1_row = (__global float*) ((__global char*) U2_1_ptr_ + U2_1_pitch_*l);
+        
+        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
+            // Prevent out-of-bounds
+            const int k = clamp(bx + i - 1, 0, nx_);
+            
+            U1_shared[j][i] = U1_1_row[k];
+            U2_shared[j][i] = U2_1_row[k];
+        }
+    }
+    
+
+    //Read V into shared memory: (nx+2)*(ny+2) cells
+    for (int j=ty; j<block_height+2; j+=get_local_size(1)) {
+        // Prevent out-of-bounds
+        const int l = clamp(by + j - 1, 0, ny_);
+        
+        //Compute the pointer to current row in the V array
+        __global float* const V1_1_row = (__global float*) ((__global char*) V1_1_ptr_ + V1_1_pitch_*l);
+        __global float* const V2_1_row = (__global float*) ((__global char*) V2_1_ptr_ + V2_1_pitch_*l);
+        
+        for (int i=tx; i<block_width+2; i+=get_local_size(0)) {
+            // "fake" ghost cells by clamping
+            const int k = clamp(bx + i - 1, 1, nx_);
+            
+            V1_shared[j][i] = V1_1_row[k];
+            V2_shared[j][i] = V2_1_row[k];
+        }
+    }
+
+    //Make sure all threads have read into shared mem
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    /**
+      * Now get all our required variables as short-hands
+      * here we use the notation of
+      *  Var_00 as var_i,j
+      *  Var_p0 as var_i+1,j
+      *  Var_0m as var_i,j-1
+      * etc
+      * The V tile is shifted by one row and one column relative to the
+      * block, the H, eta and U tiles by one column only, which is why the
+      * centre entry is [ty+1][tx+1] for V but [ty][tx+1] for the others.
+      */
+    //Layer 1
+    const float V1_00 = V1_shared[ty+1][tx+1]; //V at "center"
+    const float V1_0p = V1_shared[ty+2][tx+1]; //V at "north"
+    const float V1_0m = V1_shared[ty  ][tx+1]; //V at "south"
+    const float V1_p0 = V1_shared[ty+1][tx+2]; //V at "east"
+    const float V1_m0 = V1_shared[ty+1][tx  ]; //V at "west"
+
+    const float U1_00 = U1_shared[ty  ][tx+1]; //U at (ti, tj)
+    const float U1_0p = U1_shared[ty+1][tx+1]; //U at (ti, tj+1)
+    const float U1_m0 = U1_shared[ty  ][tx  ]; //U at (ti-1, tj)
+    const float U1_mp = U1_shared[ty+1][tx  ]; //U at (ti-1, tj+1)
+
+    const float H1_m0 = H1_shared[ty  ][tx  ]; 
+    const float H1_00 = H1_shared[ty  ][tx+1]; 
+    const float H1_p0 = H1_shared[ty  ][tx+2];
+    const float H1_mp = H1_shared[ty+1][tx  ];
+    const float H1_0p = H1_shared[ty+1][tx+1];
+    const float H1_pp = H1_shared[ty+1][tx+2];
+    
+    const float eta1_m0 = eta1_shared[ty  ][tx  ]; 
+    const float eta1_00 = eta1_shared[ty  ][tx+1]; 
+    const float eta1_p0 = eta1_shared[ty  ][tx+2];
+    const float eta1_mp = eta1_shared[ty+1][tx  ]; 
+    const float eta1_0p = eta1_shared[ty+1][tx+1]; 
+    const float eta1_pp = eta1_shared[ty+1][tx+2];
+    
+    
+    //Layer 2 (bottom)
+    const float V2_00 = V2_shared[ty+1][tx+1];
+    const float V2_0p = V2_shared[ty+2][tx+1];
+    const float V2_0m = V2_shared[ty  ][tx+1];
+    const float V2_p0 = V2_shared[ty+1][tx+2];
+    const float V2_m0 = V2_shared[ty+1][tx  ];
+
+    const float U2_00 = U2_shared[ty  ][tx+1];
+    const float U2_0p = U2_shared[ty+1][tx+1];
+    const float U2_m0 = U2_shared[ty  ][tx  ];
+    const float U2_mp = U2_shared[ty+1][tx  ];
+
+    const float H2_m0 = H2_shared[ty  ][tx  ]; 
+    const float H2_00 = H2_shared[ty  ][tx+1]; 
+    const float H2_p0 = H2_shared[ty  ][tx+2];
+    const float H2_mp = H2_shared[ty+1][tx  ];
+    const float H2_0p = H2_shared[ty+1][tx+1];
+    const float H2_pp = H2_shared[ty+1][tx+2];
+    
+    const float eta2_m0 = eta2_shared[ty  ][tx  ]; 
+    const float eta2_00 = eta2_shared[ty  ][tx+1]; 
+    const float eta2_p0 = eta2_shared[ty  ][tx+2];
+    const float eta2_mp = eta2_shared[ty+1][tx  ]; 
+    const float eta2_0p = eta2_shared[ty+1][tx+1]; 
+    const float eta2_pp = eta2_shared[ty+1][tx+2];
+    
+    
+    //Reconstruct Eta_bar: averages over the 2x2 cell groups to the west (_m0) and east (_00) of the V position
+    const float eta1_bar_m0 = 0.25f*(eta1_m0 + eta1_mp + eta1_00 + eta1_0p);
+    const float eta1_bar_00 = 0.25f*(eta1_00 + eta1_0p + eta1_p0 + eta1_pp);
+    
+    const float eta2_bar_m0 = 0.25f*(eta2_m0 + eta2_mp + eta2_00 + eta2_0p);
+    const float eta2_bar_00 = 0.25f*(eta2_00 + eta2_0p + eta2_p0 + eta2_pp);
+    
+    
+    
+    
+
+    //Reconstruct H_bar (same 2x2 averages as Eta_bar) and H_y (two-cell average at the V position)
+    const float H1_bar_m0 = 0.25f*(H1_m0 + H1_mp + H1_00 + H1_0p);
+    const float H1_bar_00 = 0.25f*(H1_00 + H1_0p + H1_p0 + H1_pp);
+    const float H1_y = 0.5f*(H1_00 + H1_0p);
+    
+    const float H2_bar_m0 = 0.25f*(H2_m0 + H2_mp + H2_00 + H2_0p);
+    const float H2_bar_00 = 0.25f*(H2_00 + H2_0p + H2_p0 + H2_pp);
+    const float H2_y = 0.5f*(H2_00 + H2_0p);
+    
+    
+    
+    //Compute layer thickness of the top layer (h1 = H1 + eta1 - eta2) and of the bottom layer (h2 = H2 + eta2)
+    const float h1_0p = H1_0p + eta1_0p - eta2_0p;
+    const float h1_00 = H1_00 + eta1_00 - eta2_00;
+    const float h1_bar_00 = H1_bar_00 + eta1_bar_00 - eta2_bar_00;
+    const float h1_bar_m0 = H1_bar_m0 + eta1_bar_m0 - eta2_bar_m0;
+    
+    const float h2_0p = H2_0p + eta2_0p;
+    const float h2_00 = H2_00 + eta2_00;
+    const float h2_bar_00 = H2_bar_00 + eta2_bar_00;
+    const float h2_bar_m0 = H2_bar_m0 + eta2_bar_m0;
+    
+    
+    
+    //Compute pressure components
+    const float h1_y = 0.5f*(h1_0p + h1_00);
+    const float h2_y = 0.5f*(h2_0p + h2_00);
+    
+    //const float epsilon = (rho2_ - rho1_)/rho2_;
+    //const float P1_y = -0.5f*g_*(h1_0p + h1_00) * (eta1_0p - eta1_00 + h2_0p - h2_00) * (1.0f - epsilon);
+    //const float P2_y = -0.5f*g_*(h2_0p + h2_00) * (eta2_0p - eta2_00 + H2_0p - H2_00);
+
+    const float P1_y = -g_*h1_y*(eta1_0p - eta1_00) - 0.5f*g_*(eta1_0p*eta1_0p - eta1_00*eta1_00);
+    //Layer 2 pressure: eta1 and eta2 differences, weighted by rho1/rho2 and (rho2-rho1)/rho2; both parts use h2_y
+    const float P2_y = -g_ * (rho1_/rho2_) * 
+                            ( //Pressure contribution from top layer
+                            h2_y*(eta1_0p - eta1_00) + 0.5f*(eta1_0p*eta1_0p - eta1_00*eta1_00) 
+                            )
+                       -g_ * ((rho2_ - rho1_)/rho2_) * 
+                            ( //Pressure contribution from bottom layer
+                            h2_y*(eta2_0p - eta2_00) + 0.5f*(eta2_0p*eta2_0p - eta2_00*eta2_00) 
+                            );
+  
+
+    //Reconstruct U at the V position (average of the four surrounding U values)
+    const float U1_bar = 0.25f*(U1_m0 + U1_00 + U1_mp + U1_0p);
+    const float U2_bar = 0.25f*(U2_m0 + U2_00 + U2_mp + U2_0p);
+    
+    
+    
+
+    //Calculate the friction coefficient
+    //FIXME: Should this be h instead of H?
+    const float C1 = r1_/H1_y;
+    const float C2 = r2_/H2_y;
+    
+    
+    
+    //Calculate numerical diffusion / subgrid energy loss coefficient
+    const float D = 2.0f*A_*dt_*(dx_*dx_ + dy_*dy_)/(dx_*dx_*dy_*dy_);
+
+    
+    
+    
+    //Calculate nonlinear effects
+    const float N1_a = (V1_0p + V1_00)*(V1_0p + V1_00) / (h1_0p);
+    const float N1_b = (V1_00 + V1_0m)*(V1_00 + V1_0m) / (h1_00);
+    const float N1_c = (U1_0p + U1_00)*(V1_p0 + V1_00) / (h1_bar_00);
+    const float N1_d = (U1_mp + U1_m0)*(V1_00 + V1_m0) / (h1_bar_m0);
+    const float N1 = 0.25f*( N1_a - N1_b + (dy_/dx_)*(N1_c - N1_d) );
+    
+    const float N2_a = (V2_0p + V2_00)*(V2_0p + V2_00) / (h2_0p);
+    const float N2_b = (V2_00 + V2_0m)*(V2_00 + V2_0m) / (h2_00);
+    const float N2_c = (U2_0p + U2_00)*(V2_p0 + V2_00) / (h2_bar_00);
+    const float N2_d = (U2_mp + U2_m0)*(V2_00 + V2_m0) / (h2_bar_m0);
+    const float N2 = 0.25f*( N2_a - N2_b + (dy_/dx_)*(N2_c - N2_d) );
+    
+    
+    
+    //The centre value below is V1_0 / V2_0 (time level n-1), not V1_00 / V2_00 (time level n); the neighbours are from time level n
+    //NOTE(review): together with D in the denominators of the update this looks like a Dufort-Frankel-style treatment - confirm
+    //Calculate eddy viscosity term
+    const float E1 = (V1_p0 - V1_0 + V1_m0)/(dx_*dx_) + (V1_0p - V1_0 + V1_0m)/(dy_*dy_);
+    const float E2 = (V2_p0 - V2_0 + V2_m0)/(dx_*dx_) + (V2_0p - V2_0 + V2_0m)/(dy_*dy_);
+    
+    
+
+    //Calculate the wind shear stress (uses the layer 1 density rho1_)
+    const float Y = windStressY(
+        wind_stress_type_, 
+        dx_, dy_, dt_,
+        tau0_, rho1_, alpha_, xm_, Rc_,
+        x0_, y0_,
+        u0_, v0_,
+        t_);
+    //NOTE(review): both layers use C1 for the coupling to the other layer, and only layer 2 has a friction term (C2) in its denominator - confirm this is intended
+    //Compute the V at the next timestep
+    float V1_2 = (V1_0 + 2.0f*dt_*(-f_*U1_bar + (N1 + P1_y)/dy_ + Y + C1*V2_0 + A_*E1) ) / (1.0f + D);
+    float V2_2 = (V2_0 + 2.0f*dt_*(-f_*U2_bar + (N2 + P2_y)/dy_     + C1*V1_0 + A_*E2) ) / (1.0f + 2.0f*dt_*C2 + D);
+
+    //Write to main memory for internal cells (overwrites the V^n-1 values that were read above)
+    if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_) {
+        V1_0_row[ti] = V1_2;
+        V2_0_row[ti] = V2_2;
+    }
+}
+
+
+
+
+
+
+
+
+
+
--- a/SWESimulators/CTCS2Layer_eta_kernel.opencl
+++ b/SWESimulators/CTCS2Layer_eta_kernel.opencl
@@ -0,0 +1,128 @@
+/**
+This OpenCL kernel implements part of the Centered in Time, Centered 
+in Space (leapfrog) numerical scheme for the shallow water equations, 
+described in 
+L. P. Røed, "Documentation of simple ocean models for use in ensemble
+predictions", Met no report 2012/3 and 2012/5 .
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#define block_height 8 //Tile height used to size the __local arrays below (NOTE(review): presumably also the work-group height set by the host - confirm)
+#define block_width 8  //Tile width used to size the __local arrays below (NOTE(review): presumably also the work-group width set by the host - confirm)
+//Local-memory tile types used by computeEtaKernel
+typedef __local float u_shmem[block_height][block_width+1]; //Used for U: one extra column
+typedef __local float v_shmem[block_height+1][block_width]; //Used for V: one extra row
+
+
+/**
+  * Kernel that evolves eta one step in time, for both layers.
+  *
+  * The eta1_0 / eta2_0 buffers hold eta^n-1 on entry and receive eta^n+1;
+  * the U and V buffers hold time level n. Every pitch argument is a row
+  * stride in bytes.
+  *
+  * eta1 is updated with the summed U and V differences of both layers,
+  * eta2 with the differences of layer 2 only. Only entries with
+  * 0 < ti < nx_+1 and 0 < tj < ny_+1 are read and written.
+  */
+__kernel void computeEtaKernel(
+        //Discretization parameters
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+        
+        //Data for layer 1
+        __global float* eta1_0_ptr_, int eta1_0_pitch_, //eta_1^n-1 (also used as output, that is eta_1^n+1)
+        __global float* U1_1_ptr_, int U1_1_pitch_, // U^n
+        __global float* V1_1_ptr_, int V1_1_pitch_, // V^n
+        
+        //Data for layer 2
+        __global float* eta2_0_ptr_, int eta2_0_pitch_, //eta_2^n-1 (also used as output, that is eta_2^n+1)
+        __global float* U2_1_ptr_, int U2_1_pitch_, // U^n
+        __global float* V2_1_ptr_, int V2_1_pitch_ // V^n
+        ) {
+    //Local-memory tiles of U^n and V^n, one pair per layer
+    u_shmem U1_tile;
+    v_shmem V1_tile;
+    u_shmem U2_tile;
+    v_shmem V2_tile;
+
+    //Position of this work-item inside its block, and the block size
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    const int stride_x = get_local_size(0);
+    const int stride_y = get_local_size(1);
+
+    //First entry handled by this block; the +1 skips the global ghost cells
+    const int bx = get_local_size(0) * get_group_id(0) + 1;
+    const int by = get_local_size(1) * get_group_id(1) + 1;
+
+    //Entry handled by this work-item, and whether it may be read/written
+    const int ti = bx + tx;
+    const int tj = by + ty;
+    const bool is_interior = (ti >= 1) && (ti <= nx_) && (tj >= 1) && (tj <= ny_);
+
+    //Rows of the eta arrays for this work-item (pitch is a byte offset)
+    __global float* eta1_0_row = (__global float*) ((__global char*) eta1_0_ptr_ + eta1_0_pitch_*tj);
+    __global float* eta2_0_row = (__global float*) ((__global char*) eta2_0_ptr_ + eta2_0_pitch_*tj);
+
+    //eta^n-1; work-items outside the interior use 0.0f and never write
+    const float eta1_prev = is_interior ? eta1_0_row[ti] : 0.0f;
+    const float eta2_prev = is_interior ? eta2_0_row[ti] : 0.0f;
+
+    //Load U: block_height rows, block_width+1 columns
+    for (int j = ty; j < block_height; j += stride_y) {
+        //Row index clamped to [1, ny_] to fake the ghost cells
+        const int row = clamp(by + j, 1, ny_);
+        __global float* const U1_row = (__global float*) ((__global char*) U1_1_ptr_ + U1_1_pitch_*row);
+        __global float* const U2_row = (__global float*) ((__global char*) U2_1_ptr_ + U2_1_pitch_*row);
+
+        for (int i = tx; i < block_width+1; i += stride_x) {
+            //Column index clamped to [0, nx_] to stay inside the array
+            const int col = clamp(bx + i - 1, 0, nx_);
+            U1_tile[j][i] = U1_row[col];
+            U2_tile[j][i] = U2_row[col];
+        }
+    }
+
+    //Load V: block_height+1 rows, block_width columns
+    for (int j = ty; j < block_height+1; j += stride_y) {
+        //Row index clamped to [0, ny_] to stay inside the array
+        const int row = clamp(by + j - 1, 0, ny_);
+        __global float* const V1_row = (__global float*) ((__global char*) V1_1_ptr_ + V1_1_pitch_*row);
+        __global float* const V2_row = (__global float*) ((__global char*) V2_1_ptr_ + V2_1_pitch_*row);
+
+        for (int i = tx; i < block_width; i += stride_x) {
+            //Column index clamped to [1, nx_] to fake the ghost cells
+            const int col = clamp(bx + i, 1, nx_);
+            V1_tile[j][i] = V1_row[col];
+            V2_tile[j][i] = V2_row[col];
+        }
+    }
+
+    //All tiles must be complete before any work-item reads its neighbours
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    //U on the two x-faces and V on the two y-faces of this entry
+    const float U1_west  = U1_tile[ty][tx];
+    const float U1_east  = U1_tile[ty][tx+1];
+    const float U2_west  = U2_tile[ty][tx];
+    const float U2_east  = U2_tile[ty][tx+1];
+    const float V1_south = V1_tile[ty][tx];
+    const float V1_north = V1_tile[ty+1][tx];
+    const float V2_south = V2_tile[ty][tx];
+    const float V2_north = V2_tile[ty+1][tx];
+
+    //Leapfrog factors 2*dt/dx and 2*dt/dy
+    const float two_dt_dx = 2.0f*dt_/dx_;
+    const float two_dt_dy = 2.0f*dt_/dy_;
+
+    //Compute eta at the next timestep
+    const float eta1_next = eta1_prev - two_dt_dx * (U1_east - U1_west + U2_east - U2_west)
+                                      - two_dt_dy * (V1_north - V1_south + V2_north - V2_south);
+    const float eta2_next = eta2_prev - two_dt_dx * (U2_east - U2_west)
+                                      - two_dt_dy * (V2_north - V2_south);
+
+    //Write to main memory, overwriting the eta^n-1 values
+    if (is_interior) {
+        eta1_0_row[ti] = eta1_next;
+        eta2_0_row[ti] = eta2_next;
+    }
+}
--- a/SWESimulators/CTCS_U_kernel.opencl
+++ b/SWESimulators/CTCS_U_kernel.opencl
@@ -0,0 +1,218 @@
+/**
+This OpenCL kernel implements part of the Centered in Time, Centered 
+in Space (leapfrog) numerical scheme for the shallow water equations, 
+described in 
+L. P. Røed, "Documentation of simple ocean models for use in ensemble
+predictions", Met no report 2012/3 and 2012/5 .
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "common.opencl"
+
+
+/**
+  * Kernel that evolves U one step in time.
+  *
+  * Inputs are H, eta^n, U^n and V^n; U^n-1 is read from the U0 buffer and
+  * U^n+1 is written back to the same buffer. Every pitch argument is a row
+  * stride in bytes (rows are located by offsetting a char pointer).
+  *
+  * Each work-item handles the entry (ti, tj); the U0 buffer is only read
+  * and written where 0 < ti < nx_ and 0 < tj < ny_+1.
+  *
+  * block_height, block_width and windStressX are not defined in this file;
+  * they are expected to come from common.opencl.
+  *
+  * All floating point constants carry the f suffix so that the arithmetic
+  * stays in single precision, as in the two-layer kernels.
+  */
+__kernel void computeUKernel(
+        //Discretization parameters
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+    
+        //Physical parameters
+        float g_, //< Gravitational constant
+        float f_, //< Coriolis coefficient
+        float r_, //< Bottom friction coefficient
+    
+        //Numerical diffusion
+        float A_,
+    
+        //Data
+        __global float* H_ptr_, int H_pitch_,
+        __global float* eta1_ptr_, int eta1_pitch_, // eta^n
+        __global float* U0_ptr_, int U0_pitch_, // U^n-1, also output, U^n+1
+        __global float* U1_ptr_, int U1_pitch_, // U^n
+        __global float* V1_ptr_, int V1_pitch_, // V^n
+    
+        // Wind stress parameters
+        int wind_stress_type_, 
+        float tau0_, float rho_, float alpha_, float xm_, float Rc_,
+        float x0_, float y0_,
+        float u0_, float v0_,
+        float t_) {
+        
+    //Local-memory tiles: the block plus the neighbours read by the stencil
+    __local float H_shared[block_height+2][block_width+1];
+    __local float eta1_shared[block_height+2][block_width+1];
+    __local float U1_shared[block_height+2][block_width+2];
+    __local float V1_shared[block_height+1][block_width+1];
+
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    //Start of block within domain
+    const int bx = get_local_size(0) * get_group_id(0) + 1; //Skip global ghost cells
+    const int by = get_local_size(1) * get_group_id(1) + 1; //Skip global ghost cells
+
+    //Index of cell within domain
+    const int ti = bx + tx;
+    const int tj = by + ty;
+    
+    //Compute pointer to current row in the U array (pitch is a byte offset)
+    __global float* const U0_row = (__global float*) ((__global char*) U0_ptr_ + U0_pitch_*tj);
+
+    //Read current U (the U^n-1 value); work-items outside the written range keep 0.0f
+    float U0 = 0.0f;
+    if (ti > 0 && ti < nx_ && tj > 0 && tj < ny_+1) {
+        U0 = U0_row[ti];
+    }
+
+    //Read H and eta into shared memory: (nx+1)*(ny+2) cells
+    for (int j=ty; j<block_height+2; j+=get_local_size(1)) {
+        // "fake" global ghost cells by clamping
+        const int l = clamp(by + j - 1, 1, ny_);
+        
+        //Compute the pointer to current row in the H and eta arrays
+        __global float* const H_row = (__global float*) ((__global char*) H_ptr_ + H_pitch_*l);
+        __global float* const eta1_row = (__global float*) ((__global char*) eta1_ptr_ + eta1_pitch_*l);
+        
+        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
+            // "fake" global ghost cells by clamping
+            const int k = clamp(bx + i, 1, nx_);
+            
+            H_shared[j][i] = H_row[k];
+            eta1_shared[j][i] = eta1_row[k];
+        }
+    }
+
+    //Read U into shared memory: (nx+2)*(ny+2) cells
+    for (int j=ty; j<block_height+2; j+=get_local_size(1)) {
+        // "fake" ghost cells by clamping
+        const int l = clamp(by + j - 1, 1, ny_);
+        
+        //Compute the pointer to current row in the U array
+        __global float* const U1_row = (__global float*) ((__global char*) U1_ptr_ + U1_pitch_*l);
+        
+        for (int i=tx; i<block_width+2; i+=get_local_size(0)) {
+            // Prevent out-of-bounds
+            const int k = clamp(bx + i - 1, 0, nx_);
+            
+            U1_shared[j][i] = U1_row[k];
+        }
+    }
+    
+
+    //Read V into shared memory: (nx+1)*(ny+1) cells
+    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
+        // Prevent out-of-bounds
+        const int l = clamp(by + j - 1, 0, ny_);
+        
+        //Compute the pointer to current row in the V array
+        __global float* const V1_row = (__global float*) ((__global char*) V1_ptr_ + V1_pitch_*l);
+        
+        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
+            // "fake" ghost cells by clamping
+            const int k = clamp(bx + i, 1, nx_);
+            
+            V1_shared[j][i] = V1_row[k];
+        }
+    }
+    
+    //Make sure all threads have read into shared mem
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    /**
+      * Now get all our required variables as short-hands
+      * here we use the notation of
+      *  Var_00 as var_i,j
+      *  Var_p0 as var_i+1,j
+      *  Var_0m as var_i,j-1
+      * etc
+      */
+    const float U_00 = U1_shared[ty+1][tx+1]; //U at "center"
+    const float U_0p = U1_shared[ty+2][tx+1]; //U at "north"
+    const float U_0m = U1_shared[ty  ][tx+1]; //U at "south"
+    const float U_p0 = U1_shared[ty+1][tx+2]; //U at "east"
+    const float U_m0 = U1_shared[ty+1][tx  ]; //U at "west"
+    
+    const float V_00 = V1_shared[ty+1][tx  ];
+    const float V_p0 = V1_shared[ty+1][tx+1];
+    const float V_0m = V1_shared[ty  ][tx  ];
+    const float V_pm = V1_shared[ty  ][tx+1];
+    
+    const float H_0m = H_shared[ty  ][tx  ]; 
+    const float H_00 = H_shared[ty+1][tx  ]; 
+    const float H_0p = H_shared[ty+2][tx  ];
+    const float H_pm = H_shared[ty  ][tx+1];
+    const float H_p0 = H_shared[ty+1][tx+1]; 
+    const float H_pp = H_shared[ty+2][tx+1];
+    
+    const float eta_0m = eta1_shared[ty  ][tx  ]; 
+    const float eta_00 = eta1_shared[ty+1][tx  ]; 
+    const float eta_0p = eta1_shared[ty+2][tx  ];
+    const float eta_pm = eta1_shared[ty  ][tx+1];
+    const float eta_p0 = eta1_shared[ty+1][tx+1]; 
+    const float eta_pp = eta1_shared[ty+2][tx+1];
+
+    //Reconstruct H_bar (2x2 cell averages south (_0m) and north (_00) of the U position) and H_x (at the U position)
+    const float H_bar_0m = 0.25f*(H_0m + H_pm + H_00 + H_p0);
+    const float H_bar_00 = 0.25f*(H_00 + H_p0 + H_0p + H_pp);
+    const float H_x = 0.5f*(H_00 + H_p0);
+    
+    //Reconstruct Eta_bar with the same 2x2 averages as H_bar
+    const float eta_bar_0m = 0.25f*(eta_0m + eta_pm + eta_00 + eta_p0);
+    const float eta_bar_00 = 0.25f*(eta_00 + eta_p0 + eta_0p + eta_pp);
+
+    //Reconstruct V at the U position
+    const float V_bar = 0.25f*(V_0m + V_00 + V_pm + V_p0);
+
+    //Calculate the implicit denominator: 1 + bottom friction + numerical diffusion
+    const float C = 1.0f + 2.0f*r_*dt_/H_x + 2.0f*A_*dt_*(dx_*dx_ + dy_*dy_)/(dx_*dx_*dy_*dy_);
+
+    //Calculate the pressure/gravitational effect, using the total depth h = H + eta
+    const float h_p0 = H_p0 + eta_p0;
+    const float h_00 = H_00 + eta_00;
+    const float h_x = 0.5f*(h_00 + h_p0);
+    const float P_x_hat = -0.5f*g_*(eta_p0*eta_p0 - eta_00*eta_00);
+    const float P_x = -g_*h_x*(eta_p0 - eta_00) + P_x_hat;
+    
+    //Calculate nonlinear effects
+    const float N_a = (U_p0 + U_00)*(U_p0 + U_00) / h_p0;
+    const float N_b = (U_00 + U_m0)*(U_00 + U_m0) / h_00;
+    const float N_c = (U_0p + U_00)*(V_p0 + V_00) / (H_bar_00 + eta_bar_00);
+    const float N_d = (U_00 + U_0m)*(V_pm + V_0m) / (H_bar_0m + eta_bar_0m);
+    const float N = 0.25f*( N_a - N_b + (dx_/dy_)*(N_c - N_d) );
+    
+    //Calculate eddy viscosity term; the centre value is U0 (time level n-1), the neighbours are from time level n
+    const float E = (U_p0 - U0 + U_m0)/(dx_*dx_) + (U_0p - U0 + U_0m)/(dy_*dy_);
+    
+    //Calculate the wind shear stress
+    const float X = windStressX(
+        wind_stress_type_, 
+        dx_, dy_, dt_,
+        tau0_, rho_, alpha_, xm_, Rc_,
+        x0_, y0_,
+        u0_, v0_,
+        t_);
+    
+    //Compute the U at the next timestep
+    const float U2 = (U0 + 2.0f*dt_*(f_*V_bar + (N + P_x)/dx_ + X + A_*E) ) / C;
+
+    //Write to main memory for internal cells (overwrites the U^n-1 value that was read above)
+    if (ti > 0 && ti < nx_ && tj > 0 && tj < ny_+1) {
+        U0_row[ti] = U2;
+    }
+}
--- a/SWESimulators/CTCS_V_kernel.opencl
+++ b/SWESimulators/CTCS_V_kernel.opencl
@@ -0,0 +1,222 @@
+/**
+This OpenCL kernel implements part of the Centered in Time, Centered 
+in Space (leapfrog) numerical scheme for the shallow water equations, 
+described in 
+L. P. Røed, "Documentation of simple ocean models for use in ensemble
+predictions", Met no report 2012/3 and 2012/5.
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+#include "common.opencl"
+
+
+
+/**
+  * Kernel that evolves V one leapfrog step in time: reads V^n-1 from V0 and overwrites it with V^n+1.
+  */
+__kernel void computeVKernel(
+        //Discretization parameters
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+    
+        //Physical parameters
+        float g_, //< Gravitational constant
+        float f_, //< Coriolis coefficient
+        float r_, //< Bottom friction coefficient
+    
+        //Numerical diffusion (coefficient of the eddy viscosity term)
+        float A_,
+    
+        //Data
+        __global float* H_ptr_, int H_pitch_,
+        __global float* eta1_ptr_, int eta1_pitch_, // eta^n
+        __global float* U1_ptr_, int U1_pitch_, // U^n
+        __global float* V0_ptr_, int V0_pitch_, // V^n-1, also output V^n+1
+        __global float* V1_ptr_, int V1_pitch_, // V^n
+    
+        // Wind stress parameters
+        int wind_stress_type_, 
+        float tau0_, float rho_, float alpha_, float xm_, float Rc_,
+        float x0_, float y0_,
+        float u0_, float v0_,
+        float t_) {
+        
+    __local float H_shared[block_height+1][block_width+2];
+    __local float eta1_shared[block_height+1][block_width+2];
+    __local float U1_shared[block_height+1][block_width+1];
+    __local float V1_shared[block_height+2][block_width+2];
+
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    //Start of block within domain
+    const int bx = get_local_size(0) * get_group_id(0) + 1; //Skip global ghost cells
+    const int by = get_local_size(1) * get_group_id(1) + 1; //Skip global ghost cells
+
+    //Index of cell within domain
+    const int ti = bx + tx;
+    const int tj = by + ty;
+    
+    //Compute pointer to current row in the V array
+    __global float* const V0_row = (__global float*) ((__global char*) V0_ptr_ + V0_pitch_*tj);
+
+    //Read V^n-1 (left at 0 for threads outside the range of cells written at the end)
+    float V0 = 0.0f;
+    if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_) {
+        V0 = V0_row[ti];
+    }
+
+    //Read H and eta into shared memory: (block_width+2)*(block_height+1) cells
+    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
+        // "fake" global ghost cells by clamping
+        const int l = clamp(by + j, 1, ny_);
+        
+        //Compute the pointer to current row in the H and eta arrays
+        __global float* const H_row = (__global float*) ((__global char*) H_ptr_ + H_pitch_*l);
+        __global float* const eta1_row = (__global float*) ((__global char*) eta1_ptr_ + eta1_pitch_*l);
+        
+        for (int i=tx; i<block_width+2; i+=get_local_size(0)) {
+            // "fake" global ghost cells by clamping
+            const int k = clamp(bx + i - 1, 1, nx_);
+            
+            H_shared[j][i] = H_row[k];
+            eta1_shared[j][i] = eta1_row[k];
+        }
+    }
+
+    //Read U into shared memory: (block_width+1)*(block_height+1) cells
+    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
+        // "fake" ghost cells by clamping
+        const int l = clamp(by + j, 1, ny_);
+        
+        //Compute the pointer to current row in the U array
+        __global float* const U1_row = (__global float*) ((__global char*) U1_ptr_ + U1_pitch_*l);
+        
+        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
+            // Prevent out-of-bounds
+            const int k = clamp(bx + i - 1, 0, nx_);
+            
+            U1_shared[j][i] = U1_row[k];
+        }
+    }
+    
+
+    //Read V into shared memory: (block_width+2)*(block_height+2) cells
+    for (int j=ty; j<block_height+2; j+=get_local_size(1)) {
+        // Prevent out-of-bounds
+        const int l = clamp(by + j - 1, 0, ny_);
+        
+        //Compute the pointer to current row in the V array
+        __global float* const V1_row = (__global float*) ((__global char*) V1_ptr_ + V1_pitch_*l);
+        
+        for (int i=tx; i<block_width+2; i+=get_local_size(0)) {
+            // "fake" ghost cells by clamping
+            const int k = clamp(bx + i - 1, 1, nx_);
+            
+            V1_shared[j][i] = V1_row[k];
+        }
+    }
+
+    //Make sure all threads have read into shared mem
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    /**
+      * Now get all our required variables as short-hands
+      * here we use the notation of
+      *  Var_00 as var_i,j
+      *  Var_p0 as var_i+1,j
+      *  Var_0m as var_i,j-1
+      * etc
+      */
+    const float V_00 = V1_shared[ty+1][tx+1]; //V at "center"
+    const float V_0p = V1_shared[ty+2][tx+1]; //V at "north"
+    const float V_0m = V1_shared[ty  ][tx+1]; //V at "south"
+    const float V_p0 = V1_shared[ty+1][tx+2]; //V at "east"
+    const float V_m0 = V1_shared[ty+1][tx  ]; //V at "west"
+    
+    const float U_00 = U1_shared[ty  ][tx+1];
+    const float U_0p = U1_shared[ty+1][tx+1];
+    const float U_m0 = U1_shared[ty  ][tx  ];
+    const float U_mp = U1_shared[ty+1][tx  ];
+    
+    const float H_m0 = H_shared[ty  ][tx  ]; 
+    const float H_00 = H_shared[ty  ][tx+1]; 
+    const float H_p0 = H_shared[ty  ][tx+2];
+    const float H_mp = H_shared[ty+1][tx  ];
+    const float H_0p = H_shared[ty+1][tx+1]; 
+    const float H_pp = H_shared[ty+1][tx+2];
+    
+    const float eta_m0 = eta1_shared[ty  ][tx  ]; 
+    const float eta_00 = eta1_shared[ty  ][tx+1]; 
+    const float eta_p0 = eta1_shared[ty  ][tx+2];
+    const float eta_mp = eta1_shared[ty+1][tx  ]; 
+    const float eta_0p = eta1_shared[ty+1][tx+1]; 
+    const float eta_pp = eta1_shared[ty+1][tx+2];
+    
+
+    //Reconstruct H_bar and H_y (at the V position)
+    const float H_bar_m0 = 0.25f*(H_m0 + H_mp + H_00 + H_0p);
+    const float H_bar_00 = 0.25f*(H_00 + H_0p + H_p0 + H_pp);
+    const float H_y = 0.5f*(H_00 + H_0p);
+    
+    //Reconstruct Eta_bar at the V position
+    const float eta_bar_m0 = 0.25f*(eta_m0 + eta_mp + eta_00 + eta_0p);
+    const float eta_bar_00 = 0.25f*(eta_00 + eta_0p + eta_p0 + eta_pp);
+
+    //Reconstruct U at the V position
+    const float U_bar = 0.25f*(U_m0 + U_00 + U_mp + U_0p);
+
+    //Calculate the friction coefficient: the V^n+1 parts of bottom friction and eddy viscosity (float literals keep this in single precision)
+    const float C = 1.0f + 2.0f*r_*dt_/H_y + 2.0f*A_*dt_*(dx_*dx_ + dy_*dy_)/(dx_*dx_*dy_*dy_);
+
+    //Calculate the pressure/gravitational effect
+    const float h_0p = H_0p + eta_0p;
+    const float h_00 = H_00 + eta_00;
+    const float h_y = 0.5f*(h_00 + h_0p); //Could possibly use h for pressure terms instead of H
+    const float P_y_hat = -0.5f*g_*(eta_0p*eta_0p - eta_00*eta_00);
+    const float P_y = -g_*h_y*(eta_0p - eta_00) + P_y_hat;
+    
+    //Calculate nonlinear effects
+    const float N_a = (V_0p + V_00)*(V_0p + V_00) / (H_0p + eta_0p);
+    const float N_b = (V_00 + V_0m)*(V_00 + V_0m) / (H_00 + eta_00);
+    const float N_c = (U_0p + U_00)*(V_p0 + V_00) / (H_bar_00 + eta_bar_00);
+    const float N_d = (U_mp + U_m0)*(V_00 + V_m0) / (H_bar_m0 + eta_bar_m0);
+    float N = 0.25f*( N_a - N_b + (dy_/dx_)*(N_c - N_d) );
+    
+    //Calculate eddy viscosity term (V0, i.e. V^n-1, takes the place of the centre value; the V^n+1 part sits in C)
+    float E = (V_p0 - V0 + V_m0)/(dx_*dx_) + (V_0p - V0 + V_0m)/(dy_*dy_);
+
+    //Calculate the wind shear stress
+    float Y = windStressY(
+        wind_stress_type_, 
+        dx_, dy_, dt_,
+        tau0_, rho_, alpha_, xm_, Rc_,
+        x0_, y0_,
+        u0_, v0_,
+        t_);
+
+    //Compute the V at the next timestep
+    float V2 = (V0 + 2.0f*dt_*(-f_*U_bar + (N + P_y)/dy_ + Y + A_*E) ) / C;
+
+    //Write to main memory for internal cells
+    if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_) {
+        V0_row[ti] = V2;
+    }
+}
--- a/SWESimulators/CTCS_eta_kernel.opencl
+++ b/SWESimulators/CTCS_eta_kernel.opencl
@@ -0,0 +1,109 @@
+/**
+This OpenCL kernel implements part of the Centered in Time, Centered 
+in Space (leapfrog) numerical scheme for the shallow water equations, 
+described in 
+L. P. Røed, "Documentation of simple ocean models for use in ensemble
+predictions", Met no report 2012/3 and 2012/5 .
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+
+/**
+  * Kernel that evolves eta one step in time: reads eta^n-1 from eta0 and overwrites it with eta^n+1.
+  */
+__kernel void computeEtaKernel(
+        //Discretization parameters
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+    
+        //Physical parameters (g_, f_ and r_ are not referenced in the body of this kernel)
+        float g_, //< Gravitational constant
+        float f_, //< Coriolis coefficient
+        float r_, //< Bottom friction coefficient
+    
+        //Data
+        __global float* eta0_ptr_, int eta0_pitch_, //eta^n-1 (also used as output, that is eta^n+1)
+        __global float* U1_ptr_, int U1_pitch_, // U^n
+        __global float* V1_ptr_, int V1_pitch_ // V^n
+        ) {
+    
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    //Start of block within domain
+    const int bx = get_local_size(0) * get_group_id(0) + 1; //Skip global ghost cells
+    const int by = get_local_size(1) * get_group_id(1) + 1; //Skip global ghost cells
+
+    //Index of cell within domain
+    const int ti = bx + tx;
+    const int tj = by + ty;
+    
+    __local float U1_shared[block_height][block_width+1];
+    __local float V1_shared[block_height+1][block_width];
+    
+    //Compute pointer to current row in the eta array
+    __global float* eta0_row = (__global float*) ((__global char*) eta0_ptr_ + eta0_pitch_*tj);
+
+    //Read current eta (left at 0 for threads outside the range of cells written at the end)
+    float eta0 = 0.0f;
+    if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) {
+        eta0 = eta0_row[ti];
+    }
+    
+    //Read U into shared memory: (block_width+1)*block_height values
+    for (int j=ty; j<block_height; j+=get_local_size(1)) {
+        const int l = clamp(by + j, 1, ny_); // fake ghost cells
+        
+        //Compute the pointer to current row in the U array
+        __global float* const U1_row = (__global float*) ((__global char*) U1_ptr_ + U1_pitch_*l);
+        
+        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
+            const int k = clamp(bx + i - 1, 0, nx_); // prevent out of bounds
+            
+            U1_shared[j][i] = U1_row[k];
+        }
+    }
+    
+    //Read V into shared memory: block_width*(block_height+1) values
+    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
+        const int l = clamp(by + j - 1, 0, ny_); // prevent out of bounds
+        
+        //Compute the pointer to current row in the V array
+        __global float* const V1_row = (__global float*) ((__global char*) V1_ptr_ + V1_pitch_*l);
+        
+        for (int i=tx; i<block_width; i+=get_local_size(0)) {
+            const int k = clamp(bx + i, 1, nx_); // fake ghost cells
+            
+            V1_shared[j][i] = V1_row[k];
+        }
+    }
+
+    //Make sure all threads have read into shared mem
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    //Compute eta at the next timestep from the differences of U along x and of V along y
+    float eta2 = eta0 - 2.0f*dt_/dx_ * (U1_shared[ty][tx+1] - U1_shared[ty][tx])
+                      - 2.0f*dt_/dy_ * (V1_shared[ty+1][tx] - V1_shared[ty][tx]);
+    
+    //Write to main memory (same index range as the read of eta0 above)
+    if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) {
+        eta0_row[ti] = eta2;
+    }
+}
--- a/SWESimulators/Common.py
+++ b/SWESimulators/Common.py
@@ -0,0 +1,288 @@
+import pyopencl
+import os
+import numpy as np
+
+"""
+Function which reads an OpenCL source file from this module's directory and builds an OpenCL program from it
+"""
+def get_kernel(cl_ctx, kernel_filename, block_width, block_height):
+    """Returns the program built for cl_ctx from kernel_filename, with block_width/block_height prepended as defines"""
+    import datetime
+    
+    #Create define string; the timestamped macro changes the source text (to force recompilation, going by its name)
+    define_string = "#define block_width " + str(block_width) + "\n"
+    define_string += "#define block_height " + str(block_height) + "\n\n"
+    define_string += "#ifndef my_variable_to_force_recompilation\n"
+    define_string += "#define my_variable_to_force_recompilation " + datetime.datetime.now().strftime("%Y_%m_%d-%H_%M_%S") + "\n"
+    define_string += "#undef my_variable_to_force_recompilation \n"
+    define_string += "#endif\n\n"
+    
+    #Quotes an include path for the OpenCL compiler of the (single) device in cl_ctx
+    def shellquote(s):
+        assert(cl_ctx.num_devices == 1)
+        platform_name = cl_ctx.devices[0].get_info(pyopencl.device_info.PLATFORM).name
+        platform_name = platform_name.upper()
+        if ('INTEL' in platform_name):
+            #Intel CL compiler doesn't like spaces in include paths. We have to escape them
+            return '"' + s.replace(" ", "\\ ") + '"'
+        elif ('NVIDIA' in platform_name):
+            #NVIDIA doesn't like double quoted paths...
+            return "'" + s + "'"
+        #Any other platform: pass the path on unchanged instead of returning None
+        return s
+    
+    module_path = os.path.dirname(os.path.realpath(__file__))
+    module_path_escaped = shellquote(module_path)
+    options = ['-I', module_path_escaped]
+    
+    #Read the kernel source (kernel_filename is relative to this module's directory) and build it
+    fullpath = os.path.join(module_path, kernel_filename)
+    with open(fullpath, "r") as kernel_file:
+        kernel_string = define_string + kernel_file.read()
+    return pyopencl.Program(cl_ctx, kernel_string).build(options)
+    
+    
+        
+        
+        
+        
+        
+        
+        
+
+"""
+Class that holds a 2D float32 array in an OpenCL device buffer
+"""
+class OpenCLArray2D:
+    """
+    Uploads initial data to the CL device
+    data: numpy array with shape (ny + 2*halo_y, nx + 2*halo_x), converted to float32 if needed
+    """
+    def __init__(self, cl_ctx, nx, ny, halo_x, halo_y, data):
+        host_data = self.convert_to_float32(data)
+        
+        self.nx = nx
+        self.ny = ny
+        self.nx_halo = nx + 2*halo_x
+        self.ny_halo = ny + 2*halo_y
+        assert(host_data.shape == (self.ny_halo, self.nx_halo))
+
+        #Upload data to the device
+        mf = pyopencl.mem_flags
+        self.data = pyopencl.Buffer(cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=host_data)
+        
+        #Row pitch in bytes, stored as int32
+        self.bytes_per_float = host_data.itemsize
+        assert(self.bytes_per_float == 4)
+        self.pitch = np.int32((self.nx_halo)*self.bytes_per_float)
+        
+    """
+    Enables downloading data from CL device to Python
+    Returns a newly allocated (ny_halo, nx_halo) float32 numpy array
+    """
+    def download(self, cl_queue):
+        #Allocate data on the host for result
+        host_data = np.empty((self.ny_halo, self.nx_halo), dtype=np.float32, order='C')
+        
+        #Copy data from device to host
+        pyopencl.enqueue_copy(cl_queue, host_data, self.data)
+        
+        #Return
+        return host_data
+
+    """
+    Converts to C-style float 32 array suitable for the GPU/OpenCL
+    Returns data itself if it already is C-contiguous native float32, otherwise a converted copy
+    """
+    @staticmethod
+    def convert_to_float32(data):
+        if (data.dtype != np.float32 or not data.flags['C_CONTIGUOUS']):
+            print("Converting data to C-contiguous float32")
+            return data.astype(np.float32, order='C')
+        else:
+            return data
+
+        
+        
+        
+        
+        
+        
+        
+        
+"""
+A class representing an Arakawa A type (unstaggered, logically Cartesian) grid
+"""
+class SWEDataArkawaA:
+    """
+    Uploads initial data to the CL device
+    """
+    def __init__(self, cl_ctx, nx, ny, halo_x, halo_y, h0, hu0, hv0):
+        self.h0  = OpenCLArray2D(cl_ctx, nx, ny, halo_x, halo_y, h0)
+        self.hu0 = OpenCLArray2D(cl_ctx, nx, ny, halo_x, halo_y, hu0)
+        self.hv0 = OpenCLArray2D(cl_ctx, nx, ny, halo_x, halo_y, hv0)
+        
+        self.h1  = OpenCLArray2D(cl_ctx, nx, ny, halo_x, halo_y, h0)
+        self.hu1 = OpenCLArray2D(cl_ctx, nx, ny, halo_x, halo_y, hu0)
+        self.hv1 = OpenCLArray2D(cl_ctx, nx, ny, halo_x, halo_y, hv0)
+
+    """
+    Swaps the variables after a timestep has been completed
+    """
+    def swap(self):
+        self.h1,  self.h0  = self.h0,  self.h1
+        self.hu1, self.hu0 = self.hu0, self.hu1
+        self.hv1, self.hv0 = self.hv0, self.hv1
+        
+    """
+    Enables downloading data from CL device to Python
+    """
+    def download(self, cl_queue):
+        h_cpu  = self.h0.download(cl_queue)
+        hu_cpu = self.hu0.download(cl_queue)
+        hv_cpu = self.hv0.download(cl_queue)
+        
+        return h_cpu, hu_cpu, hv_cpu
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+"""
+A class representing an Akrawa A type (unstaggered, logically Cartesian) grid
+"""
+class SWEDataArkawaA:
+    """
+    Uploads initial data to the CL device
+    """
+    def __init__(self, cl_ctx, nx, ny, halo_x, halo_y, h0, hu0, hv0):
+        self.h0  = OpenCLArray2D(cl_ctx, nx, ny, halo_x, halo_y, h0)
+        self.hu0 = OpenCLArray2D(cl_ctx, nx, ny, halo_x, halo_y, hu0)
+        self.hv0 = OpenCLArray2D(cl_ctx, nx, ny, halo_x, halo_y, hv0)
+        
+        self.h1  = OpenCLArray2D(cl_ctx, nx, ny, halo_x, halo_y, h0)
+        self.hu1 = OpenCLArray2D(cl_ctx, nx, ny, halo_x, halo_y, hu0)
+        self.hv1 = OpenCLArray2D(cl_ctx, nx, ny, halo_x, halo_y, hv0)
+
+    """
+    Swaps the variables after a timestep has been completed
+    """
+    def swap(self):
+        self.h1,  self.h0  = self.h0,  self.h1
+        self.hu1, self.hu0 = self.hu0, self.hu1
+        self.hv1, self.hv0 = self.hv0, self.hv1
+        
+    """
+    Enables downloading data from CL device to Python
+    """
+    def download(self, cl_queue):
+        h_cpu  = self.h0.download(cl_queue)
+        hu_cpu = self.hu0.download(cl_queue)
+        hv_cpu = self.hv0.download(cl_queue)
+        
+        return h_cpu, hu_cpu, hv_cpu
+        
+        
+
+
+        
+        
+        
+        
+"""
+A class representing an Arakawa C type (staggered, u fluxes on east/west faces, v fluxes on north/south faces) grid
+We use h as cell centers
+"""
+class SWEDataArkawaC:
+    """
+    Uploads the initial data to the CL device twice, giving buffer set 0 and buffer set 1
+    """
+    def __init__(self, cl_ctx, nx, ny, halo_x, halo_y, h0, hu0, hv0):
+        #FIXME: This at least works for 0 and 1 ghost cells, but not convinced it generalizes
+        assert(halo_x <= 1 and halo_y <= 1)
+        
+        #hu is created with nx+1 columns and no halo in x, hv with ny+1 rows and no halo in y
+        def upload_set():
+            centers = OpenCLArray2D(cl_ctx, nx, ny, halo_x, halo_y, h0)
+            x_faces = OpenCLArray2D(cl_ctx, nx+1, ny, 0, halo_y, hu0)
+            y_faces = OpenCLArray2D(cl_ctx, nx, ny+1, halo_x, 0, hv0)
+            return centers, x_faces, y_faces
+        
+        self.h0, self.hu0, self.hv0 = upload_set()
+        self.h1, self.hu1, self.hv1 = upload_set()
+
+    """
+    Exchanges buffer set 0 and buffer set 1 (h, hu and hv alike) once a timestep
+    has been completed; only the Python references move, no device data is copied
+    """
+    def swap(self):
+        set0 = (self.h0, self.hu0, self.hv0)
+        self.h0, self.hu0, self.hv0 = self.h1, self.hu1, self.hv1
+        self.h1, self.hu1, self.hv1 = set0
+        
+    """
+    Downloads buffer set 0 from the CL device to Python
+    Returns the tuple (h, hu, hv) of host arrays
+    """
+    def download(self, cl_queue):
+        device_arrays = (self.h0, self.hu0, self.hv0)
+        return tuple(arr.download(cl_queue) for arr in device_arrays)
+        
+        
+
+
+
+"""
+Class which represents different wind stresses
+"""
+class WindStressParams:
+
+    """
+    type: Type of wind stress, 0=Uniform along shore, 1=bell shaped along shore, 2=moving cyclone
+    tau0: Amplitude of wind stress (Pa)
+    rho: Density of sea water (1025.0 kg / m^3)
+    alpha: Offshore e-folding length (1/(10*dx) = 5e-6 m^-1)
+    xm: Maximum wind stress for bell shaped wind stress
+    Rc: Distance to max wind stress from center of cyclone (10dx = 200 000 m)
+    x0: Initial x position of moving cyclone (dx*(nx/2) - u0*3600.0*48.0)
+    y0: Initial y position of moving cyclone (dy*(ny/2) - v0*3600.0*48.0)
+    u0: Translation speed along x for moving cyclone (30.0/sqrt(5.0))
+    v0: Translation speed along y for moving cyclone (-0.5*u0)
+    The default type, 99, stands for no wind; all other parameters default to 0
+    """
+    def __init__(self,
+                 type=99, # "no wind"
+                 tau0=0, rho=0, alpha=0, xm=0, Rc=0,
+                 x0=0, y0=0,
+                 u0=0, v0=0):
+        #Every value is stored as a fixed-width numpy scalar:
+        #int32 for the type and float32 for everything else
+        as_f32 = np.float32
+        self.type = np.int32(type)
+        self.tau0, self.rho = as_f32(tau0), as_f32(rho)
+        self.alpha, self.xm = as_f32(alpha), as_f32(xm)
+        self.Rc = as_f32(Rc)
+        self.x0, self.y0 = as_f32(x0), as_f32(y0)
+        self.u0, self.v0 = as_f32(u0), as_f32(v0)
+                 
+                 
+                 
+                 
+                 
+                 
+                 
+                 
+                 
+                 
+                 
+                 
+                 
+                 
+                 
+                 
+                 
+                 
--- a/SWESimulators/DataOutput.py
+++ b/SWESimulators/DataOutput.py
@@ -0,0 +1,123 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements saving shallow water simulations to a
+netcdf file.
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import numpy as np
+from netCDF4 import Dataset
+
+class CTCSNetCDFWriter:
+    """
+    Writes time series of eta, u and v on a staggered grid to a NetCDF file.
+    Can be used as a context manager, which closes the file on exit.
+    """
+    def __init__(self, outfilename, nx, ny, dx, dy, ignore_ghostcells=True):
+        """
+        outfilename: Name of the NetCDF file to create (opened in 'w' mode)
+        nx, ny: Number of cells along the x- and y-axis
+        dx, dy: Grid cell spacing along the x- and y-axis
+        ignore_ghostcells: If True, the first and last row and column of every field are left out
+        """
+        self.ncfile = Dataset(outfilename,'w') 
+        self.ignore_ghostcells = ignore_ghostcells
+        
+        #Create dimensions 
+        self.ncfile.createDimension('time', None) #Unlimited time dimension
+        if (self.ignore_ghostcells):
+            self.ncfile.createDimension('x_eta', nx)
+            self.ncfile.createDimension('y_eta', ny)
+            self.ncfile.createDimension('x_u', nx-1)
+            self.ncfile.createDimension('y_u', ny)
+            self.ncfile.createDimension('x_v', nx)
+            self.ncfile.createDimension('y_v', ny-1)
+        else:
+            self.ncfile.createDimension('x_eta', nx+2)
+            self.ncfile.createDimension('y_eta', ny+2)
+            self.ncfile.createDimension('x_u', nx+1)
+            self.ncfile.createDimension('y_u', ny+2)
+            self.ncfile.createDimension('x_v', nx+2)
+            self.ncfile.createDimension('y_v', ny+1)
+
+        #Create axis
+        self.nc_time = self.ncfile.createVariable('time', np.dtype('float32').char, 'time')
+        x_eta = self.ncfile.createVariable('x_eta', np.dtype('float32').char, 'x_eta')
+        y_eta = self.ncfile.createVariable('y_eta', np.dtype('float32').char, 'y_eta')
+        x_u = self.ncfile.createVariable('x_u', np.dtype('float32').char, 'x_u')
+        y_u = self.ncfile.createVariable('y_u', np.dtype('float32').char, 'y_u')
+        x_v = self.ncfile.createVariable('x_v', np.dtype('float32').char, 'x_v')
+        y_v = self.ncfile.createVariable('y_v', np.dtype('float32').char, 'y_v')
+        
+        #Set axis values/ticks
+        if (self.ignore_ghostcells):
+            x_eta[:] = np.linspace(dx/2.0, nx*dx - dx/2.0, nx)
+            y_eta[:] = np.linspace(dy/2.0, ny*dy - dy/2.0, ny)
+            #u sits on the interior cell faces x = dx, 2*dx, ..., (nx-1)*dx
+            x_u[:] = np.linspace(dx, (nx-1)*dx, nx-1)
+            y_u[:] = np.linspace(dy/2.0, ny*dy - dy/2.0, ny)
+            x_v[:] = np.linspace(dx/2.0, nx*dx - dx/2.0, nx)
+            #v sits on the interior cell faces y = dy, 2*dy, ..., (ny-1)*dy
+            y_v[:] = np.linspace(dy, (ny-1)*dy, ny-1)
+        else:
+            x_eta[:] = np.linspace(-dx/2.0, nx*dx + dx/2.0, nx+2)
+            y_eta[:] = np.linspace(-dy/2.0, ny*dy + dy/2.0, ny+2)
+            x_u[:] = np.linspace(0, nx*dx, nx+1)
+            y_u[:] = np.linspace(-dy/2.0, ny*dy + dy/2.0, ny+2)
+            x_v[:] = np.linspace(-dx/2.0, nx*dx + dx/2.0, nx+2)
+            y_v[:] = np.linspace(0, ny*dy, ny+1)
+
+        #Set units
+        self.nc_time.units = 's'
+        x_eta.units = 'm'
+        y_eta.units = 'm'
+        x_u.units = 'm'
+        y_u.units = 'm'
+        x_v.units = 'm'
+        y_v.units = 'm'
+
+        #Create output data variables
+        self.nc_eta = self.ncfile.createVariable('eta', np.dtype('float32').char, ('time', 'y_eta', 'x_eta'))
+        self.nc_u = self.ncfile.createVariable('u', np.dtype('float32').char, ('time', 'y_u', 'x_u'))
+        self.nc_v = self.ncfile.createVariable('v', np.dtype('float32').char, ('time', 'y_v', 'x_v'))
+        
+        #Set units
+        #NOTE(review): u and v are given the same unit as eta ('m'); confirm the intended unit
+        self.nc_eta.units = 'm'
+        self.nc_u.units = 'm'
+        self.nc_v.units = 'm'
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        """Closes the file; exceptions raised inside the with-block are not suppressed"""
+        print("Closing '" + self.ncfile.filepath() + "'")
+        self.ncfile.close()
+
+    def write(self, i, t, eta, u, v):
+        """
+        Stores time t and the fields eta, u and v as record number i.
+        With ignore_ghostcells set, the first and last row and column of each field are dropped.
+        """
+        self.nc_time[i] = t
+        if (self.ignore_ghostcells):
+            eta, u, v = eta[1:-1, 1:-1], u[1:-1, 1:-1], v[1:-1, 1:-1]
+        self.nc_eta[i, :] = eta
+        self.nc_u[i, :] = u
+        self.nc_v[i, :] = v
--- a/SWESimulators/FBL.py
+++ b/SWESimulators/FBL.py
@@ -0,0 +1,184 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements the Forward Backward Linear numerical 
+scheme for the shallow water equations, described in 
+L. P. Røed, "Documentation of simple ocean models for use in ensemble
+predictions", Met no report 2012/3 and 2012/5 .
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+#Import packages we need
+import numpy as np
+import pyopencl as cl #OpenCL in Python
+import Common
+        
+        
+        
+        
+        
+        
+
+
+
+
+
+
+
+
+
+
+"""
+Class that solves the SW equations using the Forward-Backward linear scheme
+"""
+class FBL:
+
+    """
+    Initialization routine
+    cl_ctx: OpenCL context used for the command queue, the kernels and the device buffers
+    H: Water depth, (ny, nx) array (this class uploads its data without ghost cells)
+    eta0: Initial deviation from mean sea level, (ny, nx) array
+    hu0: Initial momentum along x-axis, (ny, nx+1) array
+    hv0: Initial momentum along y-axis, (ny+1, nx) array
+    nx: Number of cells along x-axis
+    ny: Number of cells along y-axis
+    dx: Grid cell spacing along x-axis (20 000 m)
+    dy: Grid cell spacing along y-axis (20 000 m)
+    dt: Size of each timestep (90 s)
+    g: Gravitational acceleration (9.81 m/s^2)
+    f: Coriolis parameter (1.2e-4 s^-1)
+    r: Bottom friction coefficient (2.4e-3 m/s)
+    wind_stress: Wind stress parameters
+    block_width, block_height: Work-group size; also compiled into the kernels as defines
+    """
+    def __init__(self, \
+                 cl_ctx, \
+                 H, eta0, hu0, hv0, \
+                 nx, ny, \
+                 dx, dy, dt, \
+                 g, f, r, \
+                 wind_stress=Common.WindStressParams(), \
+                 block_width=16, block_height=16):
+        self.cl_ctx = cl_ctx
+
+        #Create an OpenCL command queue
+        self.cl_queue = cl.CommandQueue(self.cl_ctx)
+
+        #Get kernels
+        self.u_kernel = Common.get_kernel(self.cl_ctx, "FBL_U_kernel.opencl", block_width, block_height)
+        self.v_kernel = Common.get_kernel(self.cl_ctx, "FBL_V_kernel.opencl", block_width, block_height)
+        self.eta_kernel = Common.get_kernel(self.cl_ctx, "FBL_eta_kernel.opencl", block_width, block_height)
+        
+        #Create data by uploading to device
+        ghost_cells_x = 0
+        ghost_cells_y = 0
+        self.H = Common.OpenCLArray2D(self.cl_ctx, nx, ny, ghost_cells_x, ghost_cells_y, H)
+        self.cl_data = Common.SWEDataArkawaC(self.cl_ctx, nx, ny, ghost_cells_x, ghost_cells_y, eta0, hu0, hv0)
+        
+        #Save input parameters
+        #Notice that we need to specify them in the correct dataformat for the
+        #OpenCL kernel
+        self.nx = np.int32(nx)
+        self.ny = np.int32(ny)
+        self.dx = np.float32(dx)
+        self.dy = np.float32(dy)
+        self.dt = np.float32(dt)
+        self.g = np.float32(g)
+        self.f = np.float32(f)
+        self.r = np.float32(r)
+        self.wind_stress = wind_stress
+        
+        #Initialize time
+        self.t = np.float32(0.0)
+        
+        #Compute kernel launch parameters
+        self.local_size = (block_width, block_height) # Must match the block_width/height defines compiled into the kernels
+        self.global_size = ( \
+                       int(np.ceil(self.nx / float(self.local_size[0])) * self.local_size[0]), \
+                       int(np.ceil(self.ny / float(self.local_size[1])) * self.local_size[1]) \
+                      ) 
+    
+    """
+    Function which advances the simulation by t_end seconds using timesteps of at most dt
+    Returns the simulation time after the last step
+    """
+    def step(self, t_end=0.0):
+        n = int(t_end / self.dt + 1)
+        
+        for i in range(0, n):        
+            local_dt = np.float32(min(self.dt, t_end-i*self.dt))
+            
+            if (local_dt <= 0.0):
+                break
+                
+            self.u_kernel.computeUKernel(self.cl_queue, self.global_size, self.local_size, \
+                    self.nx, self.ny, \
+                    self.dx, self.dy, local_dt, \
+                    self.g, self.f, self.r, \
+                    self.H.data, self.H.pitch, \
+                    self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                    self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                    self.cl_data.h0.data, self.cl_data.h0.pitch, \
+                    self.wind_stress.type, \
+                    self.wind_stress.tau0, self.wind_stress.rho, self.wind_stress.alpha, self.wind_stress.xm, self.wind_stress.Rc, \
+                    self.wind_stress.x0, self.wind_stress.y0, \
+                    self.wind_stress.u0, self.wind_stress.v0, \
+                    self.t)
+            
+            self.v_kernel.computeVKernel(self.cl_queue, self.global_size, self.local_size, \
+                    self.nx, self.ny, \
+                    self.dx, self.dy, local_dt, \
+                    self.g, self.f, self.r, \
+                    self.H.data, self.H.pitch, \
+                    self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                    self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                    self.cl_data.h0.data, self.cl_data.h0.pitch, \
+                    self.wind_stress.type, \
+                    self.wind_stress.tau0, self.wind_stress.rho, self.wind_stress.alpha, self.wind_stress.xm, self.wind_stress.Rc, \
+                    self.wind_stress.x0, self.wind_stress.y0, \
+                    self.wind_stress.u0, self.wind_stress.v0, \
+                    self.t)
+            
+            self.eta_kernel.computeEtaKernel(self.cl_queue, self.global_size, self.local_size, \
+                    self.nx, self.ny, \
+                    self.dx, self.dy, local_dt, \
+                    self.g, self.f, self.r, \
+                    self.H.data, self.H.pitch, \
+                    self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                    self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                    self.cl_data.h0.data, self.cl_data.h0.pitch)
+                
+            self.t += local_dt
+        
+        return self.t
+    
+    """
+    Downloads the current (eta, hu, hv) from the CL device as numpy arrays
+    """
+    def download(self):
+        return self.cl_data.download(self.cl_queue)
+
+
+        
+        
+
+
+
+
+
+
+
--- a/SWESimulators/FBL_U_kernel.opencl
+++ b/SWESimulators/FBL_U_kernel.opencl
@@ -0,0 +1,163 @@
+/*
+This OpenCL kernel implements part of the Forward Backward Linear 
+numerical scheme for the shallow water equations, described in 
+L. P. Røed, "Documentation of simple ocean models for use in ensemble
+predictions", Met no report 2012/3 and 2012/5 .
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "common.opencl"
+
+
+/**
+  * Kernel that evolves U one step in time; each work-item updates one U value in place.
+  */
+__kernel void computeUKernel(
+        //Discretization parameters
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+    
+        //Physical parameters
+        float g_, //< Gravitational constant
+        float f_, //< Coriolis coefficient
+        float r_, //< Bottom friction coefficient
+    
+        //Data (pitches are in bytes: rows are addressed through char pointers below)
+        __global float* H_ptr_, int H_pitch_,
+        __global float* U_ptr_, int U_pitch_,
+        __global float* V_ptr_, int V_pitch_,
+        __global float* eta_ptr_, int eta_pitch_,
+    
+        // Wind stress parameters (forwarded unchanged to windStressX)
+        int wind_stress_type_, 
+        float tau0_, float rho_, float alpha_, float xm_, float Rc_,
+        float x0_, float y0_,
+        float u0_, float v0_,
+        float t_) {
+    
+    __local float H_shared[block_height][block_width+1];
+    __local float V_shared[block_height+1][block_width+1];
+    __local float eta_shared[block_height][block_width+1];
+
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    //Index of block within domain
+    const int bx = get_local_size(0) * get_group_id(0);
+    const int by = get_local_size(1) * get_group_id(1); 
+
+    //Index of cell within domain
+    const int ti = get_global_id(0); 
+    const int tj = get_global_id(1);
+    
+    //Compute pointer to row "tj" in the U array
+    __global float* const U_row = (__global float*) ((__global char*) U_ptr_ + U_pitch_*tj);
+
+    //Read current U
+    float U_current = 0.0f;
+    if (ti < nx_ + 1 && tj < ny_) {
+        U_current = U_row[ti];
+    }
+
+    //Read H and eta into local memory, shifted one column to the left (k = bx + i - 1)
+    for (int j=ty; j<block_height; j+=get_local_size(1)) {
+        const int l = by + j;
+        
+        //Compute the pointer to row "l" in the H and eta arrays
+        __global float* const H_row = (__global float*) ((__global char*) H_ptr_ + H_pitch_*l);
+        __global float* const eta_row = (__global float*) ((__global char*) eta_ptr_ + eta_pitch_*l);
+        
+        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
+            const int k = bx + i - 1;
+            //H and eta are taken to have ny_ rows (eta is only updated for tj < ny_), so row ny_ is not read
+            if (k >= 0 && k < nx_ && l < ny_) {
+                H_shared[j][i] = H_row[k];
+                eta_shared[j][i] = eta_row[k];
+            }
+            else {
+                H_shared[j][i] = 0.0f;
+                eta_shared[j][i] = 0.0f;
+            }
+        }
+    }
+
+    //Read V into shared memory, one row more than H and eta
+    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
+        const int l = by + j;
+        
+        //Compute the pointer to current row in the V array
+        __global float* const V_row = (__global float*) ((__global char*) V_ptr_ + V_pitch_*l);
+        
+        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
+            const int k = bx + i - 1;
+            
+            if (k >= 0 && k < nx_ && l < ny_+1) {
+                V_shared[j][i] = V_row[k];
+            }
+            else {
+                V_shared[j][i] = 0.0f;
+            }
+        }
+    }
+
+    //Make sure all threads have read into shared mem
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    //Reconstruct H at the U position
+    float H_m = 0.5f*(H_shared[ty][tx] + H_shared[ty][tx+1]);
+
+    //Reconstruct V at the U position (two-point average in the first and last row)
+    float V_m = 0.0f;
+    if (tj==0) {
+        V_m = 0.5f*(V_shared[ty+1][tx] + V_shared[ty+1][tx+1]);
+    }
+    else if (tj==ny_-1) {
+        V_m = 0.5f*(V_shared[ty][tx] + V_shared[ty][tx+1]);
+    }
+    else {
+        V_m = 0.25f*(V_shared[ty][tx] + V_shared[ty][tx+1]
+                + V_shared[ty+1][tx] + V_shared[ty+1][tx+1]);
+    }
+
+    //Calculate the friction coefficient
+    float B = H_m/(H_m + r_*dt_);
+
+    //Calculate the gravitational effect
+    float P = g_*H_m*(eta_shared[ty][tx] - eta_shared[ty][tx+1])/dx_;
+    
+    //Calculate the wind shear stress
+    float X = windStressX(
+        wind_stress_type_, 
+        dx_, dy_, dt_,
+        tau0_, rho_, alpha_, xm_, Rc_,
+        x0_, y0_,
+        u0_, v0_,
+        t_);
+
+    //Compute the U at the next timestep
+    float U_next = B*(U_current + dt_*(f_*V_m + P + X) );
+
+    //Write to main memory; work-items outside the U array write nothing
+    if (ti < nx_+1 && tj < ny_) {
+        //Closed boundaries
+        if (ti == 0 || ti == nx_) {
+            U_next = 0.0f;
+        }
+        U_row[ti] = U_next;
+    }
+}
--- a/SWESimulators/FBL_V_kernel.opencl
+++ b/SWESimulators/FBL_V_kernel.opencl
@@ -0,0 +1,168 @@
+/*
+This OpenCL kernel implements part of the Forward Backward Linear 
+numerical scheme for the shallow water equations, described in 
+L. P. Røed, "Documentation of simple ocean models for use in ensemble
+predictions", Met no report 2012/3 and 2012/5 .
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include "common.opencl"
+
+
+
+
+/**
+  * Kernel that evolves V one step in time; each work-item updates one V value in place.
+  */
+__kernel void computeVKernel(
+        //Discretization parameters
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+    
+        //Physical parameters
+        float g_, //< Gravitational constant
+        float f_, //< Coriolis coefficient
+        float r_, //< Bottom friction coefficient
+    
+        //Data (pitches are in bytes: rows are addressed through char pointers below)
+        __global float* H_ptr_, int H_pitch_,
+        __global float* U_ptr_, int U_pitch_,
+        __global float* V_ptr_, int V_pitch_,
+        __global float* eta_ptr_, int eta_pitch_,
+    
+        // Wind stress parameters (forwarded unchanged to windStressY)
+        int wind_stress_type_, 
+        float tau0_, float rho_, float alpha_, float xm_, float Rc_,
+        float x0_, float y0_,
+        float u0_, float v0_,
+        float t_) {
+        
+    __local float H_shared[block_height+1][block_width];
+    __local float U_shared[block_height+1][block_width+1];
+    __local float eta_shared[block_height+1][block_width];
+
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    //Index of block within domain
+    const int bx = get_local_size(0) * get_group_id(0);
+    const int by = get_local_size(1) * get_group_id(1);
+
+    //Index of cell within domain
+    const int ti = get_global_id(0); 
+    const int tj = get_global_id(1);
+    
+    //Compute pointer to row "tj" in the V array
+    __global float* const V_row = (__global float*) ((__global char*) V_ptr_ + V_pitch_*tj);
+
+    //Read current V
+    float V_current = 0.0f;
+    if (ti < nx_ && tj < ny_+1) {
+        V_current = V_row[ti];
+    }
+
+    //Read H and eta into shared memory, shifted one row down (l = by + j - 1)
+    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
+        const int l = by + j - 1;
+        
+        //Compute the pointer to current row in the H and eta arrays
+        __global float* const H_row = (__global float*) ((__global char*) H_ptr_ + H_pitch_*l);
+        __global float* const eta_row = (__global float*) ((__global char*) eta_ptr_ + eta_pitch_*l);
+        
+        for (int i=tx; i<block_width; i+=get_local_size(0)) {
+            const int k = bx + i;
+            if (k < nx_ && l >= 0 && l < ny_) { //H and eta are taken to have ny_ rows, so row ny_ is not read
+                H_shared[j][i] = H_row[k];
+                eta_shared[j][i] = eta_row[k];
+            }
+            else {
+                H_shared[j][i] = 0.0f;
+                eta_shared[j][i] = 0.0f;
+            }
+        }
+    }
+
+    //Read U into shared memory, shifted one row down like H and eta
+    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
+        const int l = by + j - 1;
+        
+        //Compute the pointer to current row in the U array
+        __global float* const U_row = (__global float*) ((__global char*) U_ptr_ + U_pitch_*l);
+        
+        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
+            const int k = bx + i;
+            if (k < nx_+1 && l >= 0 && l < ny_) {
+                U_shared[j][i] = U_row[k];
+            }
+            else {
+                U_shared[j][i] = 0.0f;
+            }
+        }
+    }
+
+    //Make sure all threads have read into shared mem
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    //Reconstruct H at the V position
+    float H_m = 0.5f*(H_shared[ty][tx] + H_shared[ty+1][tx]);
+
+    //Reconstruct U at the V position (two-point average in the first and last column)
+    float U_m;
+    if (ti==0) {
+        U_m = 0.5f*(U_shared[ty][tx+1] + U_shared[ty+1][tx+1]);
+    }
+    else if (ti==nx_-1) {
+        U_m = 0.5f*(U_shared[ty][tx] + U_shared[ty+1][tx]);
+    }
+    else {
+        U_m = 0.25f*(U_shared[ty][tx] + U_shared[ty][tx+1]
+                + U_shared[ty+1][tx] + U_shared[ty+1][tx+1]);
+    }
+
+    //Calculate the friction coefficient
+    float B = H_m/(H_m + r_*dt_);
+
+    //Calculate the gravitational effect
+    float P = g_*H_m*(eta_shared[ty][tx] - eta_shared[ty+1][tx])/dy_;
+
+    //Calculate the wind shear stress
+    float Y = windStressY(
+        wind_stress_type_, 
+        dx_, dy_, dt_,
+        tau0_, rho_, alpha_, xm_, Rc_,
+        x0_, y0_,
+        u0_, v0_,
+        t_);
+    
+    //Compute the V at the next timestep
+    float V_next = B*(V_current + dt_*(-f_*U_m + P + Y) );
+
+    //Write to main memory; work-items outside the V array write nothing
+    if (ti < nx_ && tj < ny_+1) {
+        //Closed boundaries 
+        if (tj == 0) {
+            V_next = 0.0f;
+        }
+        else if (tj == ny_) {
+            V_next = 0.0f;
+        }
+
+        V_row[ti] = V_next;
+    }
+}
--- a/SWESimulators/FBL_eta_kernel.opencl
+++ b/SWESimulators/FBL_eta_kernel.opencl
@@ -0,0 +1,113 @@
+/*
+This OpenCL kernel implements part of the Forward Backward Linear 
+numerical scheme for the shallow water equations, described in 
+L. P. Røed, "Documentation of simple ocean models for use in ensemble
+predictions", Met no report 2012/3 and 2012/5 .
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+
+/**
+  * Kernel that evolves eta one step in time (g_, f_, r_ and H are in the signature but unused here).
+  */
+__kernel void computeEtaKernel(
+        //Discretization parameters
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+    
+        //Physical parameters
+        float g_, //< Gravitational constant
+        float f_, //< Coriolis coefficient
+        float r_, //< Bottom friction coefficient
+    
+        //Data (pitches are in bytes: rows are addressed through char pointers below)
+        __global float* H_ptr_, int H_pitch_,
+        __global float* U_ptr_, int U_pitch_,
+        __global float* V_ptr_, int V_pitch_,
+        __global float* eta_ptr_, int eta_pitch_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    //Index of block within domain
+    const int bx = get_local_size(0) * get_group_id(0);
+    const int by = get_local_size(1) * get_group_id(1);
+
+    //Index of cell within domain
+    const int ti = get_global_id(0); 
+    const int tj = get_global_id(1);
+    
+    __local float U_shared[block_height][block_width+1];
+    __local float V_shared[block_height+1][block_width];
+    
+    //Compute pointer to row "tj" in the eta array
+    __global float* const eta_row = (__global float*) ((__global char*) eta_ptr_ + eta_pitch_*tj);
+
+    //Read current eta
+    float eta_current = 0.0f;
+    if (ti < nx_ && tj < ny_) {
+        eta_current = eta_row[ti];
+    }
+    
+    //Read U into shared memory (signed indices, as in the U and V kernels)
+    for (int j=ty; j<block_height; j+=get_local_size(1)) {
+        const int l = by + j;
+        
+        //Compute the pointer to current row in the U array
+        __global float* const U_row = (__global float*) ((__global char*) U_ptr_ + U_pitch_*l);
+        
+        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
+            const int k = bx + i;
+            if (k < nx_+1 && l < ny_) {
+                U_shared[j][i] = U_row[k];
+            }
+            else {
+                U_shared[j][i] = 0.0f;
+            }
+        }
+    }
+    
+    //Read V into shared memory
+    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
+        const int l = by + j;
+        //Compute the pointer to current row in the V array
+        __global float* const V_row = (__global float*) ((__global char*) V_ptr_ + V_pitch_*l);
+        for (int i=tx; i<block_width; i+=get_local_size(0)) {
+            const int k = bx + i;
+            if (k < nx_ && l < ny_+1) {
+                V_shared[j][i] = V_row[k];
+            }
+            else {
+                V_shared[j][i] = 0.0f;
+            }
+        }
+    }
+
+    //Make sure all threads have read into shared mem
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    //Compute the eta at the next timestep from the differences of U along x and V along y
+    float eta_next = eta_current - dt_/dx_ * (U_shared[ty][tx+1] - U_shared[ty][tx])
+                                 - dt_/dy_ * (V_shared[ty+1][tx] - V_shared[ty][tx]);
+    
+    //Write to main memory; work-items outside the eta array write nothing
+    if (ti < nx_ && tj < ny_) {
+        eta_row[ti] = eta_next;
+    }
+}
--- a/SWESimulators/FORCE.py
+++ b/SWESimulators/FORCE.py
@@ -0,0 +1,133 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements the FORCE flux
+for the shallow water equations
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+#Import packages we need
+import numpy as np
+import pyopencl as cl #OpenCL in Python
+import Common
+
+
+
+        
+        
+        
+        
+        
+        
+
+
+"""
+Class that solves the SW equations using the FORCE flux
+"""
+class FORCE:
+
+    """
+    Initialization routine
+    h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
+    hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
+    hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
+    nx: Number of cells along x-axis
+    ny: Number of cells along y-axis
+    dx: Grid cell spacing along x-axis (20 000 m)
+    dy: Grid cell spacing along y-axis (20 000 m)
+    dt: Size of each timestep (90 s)
+    g: Gravitational accelleration (9.81 m/s^2)
+    """
+    def __init__(self, \
+                 cl_ctx, \
+                 h0, hu0, hv0, \
+                 nx, ny, \
+                 dx, dy, dt, \
+                 g, \
+                 block_width=16, block_height=16):
+        self.cl_ctx = cl_ctx
+
+        #Create an OpenCL command queue
+        self.cl_queue = cl.CommandQueue(self.cl_ctx)
+
+        #Get kernels
+        self.kernel = Common.get_kernel(self.cl_ctx, "FORCE_kernel.opencl", block_width, block_height)
+        
+        #Create data by uploading to device
+        ghost_cells_x = 1
+        ghost_cells_y = 1
+        self.cl_data = Common.SWEDataArkawaA(self.cl_ctx, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
+        
+        #Save input parameters
+        #Notice that we need to specify them in the correct dataformat for the
+        #OpenCL kernel
+        self.nx = np.int32(nx)
+        self.ny = np.int32(ny)
+        self.dx = np.float32(dx)
+        self.dy = np.float32(dy)
+        self.dt = np.float32(dt)
+        self.g = np.float32(g)
+        
+        #Initialize time
+        self.t = np.float32(0.0)
+        
+        #Compute kernel launch parameters
+        self.local_size = (block_width, block_height) 
+        self.global_size = ( \
+                       int(np.ceil(self.nx / float(self.local_size[0])) * self.local_size[0]), \
+                       int(np.ceil(self.ny / float(self.local_size[1])) * self.local_size[1]) \
+                      ) 
+    
+    
+    
+    
+    """
+    Function which steps n timesteps
+    """
+    def step(self, t_end=0.0):
+        n = int(t_end / self.dt + 1)
+        
+        for i in range(0, n):        
+            local_dt = np.float32(min(self.dt, t_end-i*self.dt))
+            
+            if (local_dt <= 0.0):
+                break
+        
+            self.kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+                    self.nx, self.ny, \
+                    self.dx, self.dy, local_dt, \
+                    self.g, \
+                    self.cl_data.h0.data, self.cl_data.h0.pitch, \
+                    self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                    self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                    self.cl_data.h1.data, self.cl_data.h1.pitch, \
+                    self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                    self.cl_data.hv1.data, self.cl_data.hv1.pitch)
+                
+            self.t += local_dt
+            
+            self.cl_data.swap()
+        
+        return self.t
+        
+        
+    
+    
+    
+    def download(self):
+        return self.cl_data.download(self.cl_queue)
+
--- a/SWESimulators/FORCE_kernel.opencl
+++ b/SWESimulators/FORCE_kernel.opencl
@@ -0,0 +1,168 @@
+/*
+This OpenCL kernel implements the FORCE (First-ORder CEntred) scheme
+for the shallow water equations, with edge fluxes.
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include "common.opencl"
+
+
+/**
+  * Evaluates the FORCE flux across every x-facing cell interface of the block
+  * and stores it in F; ends with a local-memory barrier.
+  */
+void computeFluxF(__local float Q[3][block_height+2][block_width+2],
+                  __local float F[3][block_height+1][block_width+1],
+                  const float g_, const float dx_, const float dt_) {
+    //Position of this work-item inside its work-group, and the work-group extent
+    const int local_x = get_local_id(0);
+    const int local_y = get_local_id(1);
+    const int stride_x = get_local_size(0);
+    const int stride_y = get_local_size(1);
+
+    //block_height rows of interfaces, block_width+1 interfaces in each row
+    for (int row = local_y; row < block_height; row += stride_y) {
+        //Row in Q: offset by one to skip the ghost cells
+        const int q_row = row + 1;
+        for (int face = local_x; face < block_width+1; face += stride_x) {
+            //State on the left hand side of the interface ...
+            const float3 Q_left = (float3)(Q[0][q_row][face],
+                                           Q[1][q_row][face],
+                                           Q[2][q_row][face]);
+            //... and on its right hand side
+            const float3 Q_right = (float3)(Q[0][q_row][face+1],
+                                            Q[1][q_row][face+1],
+                                            Q[2][q_row][face+1]);
+            const float3 flux = FORCE_1D_flux(Q_left, Q_right, g_, dx_, dt_);
+            F[0][row][face] = flux.x;
+            F[1][row][face] = flux.y;
+            F[2][row][face] = flux.z;
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+
+/**
+  * Evaluates the FORCE flux across every y-facing cell interface of the block
+  * and stores it in G; ends with a local-memory barrier.
+  */
+void computeFluxG(__local float Q[3][block_height+2][block_width+2],
+                  __local float G[3][block_height+1][block_width+1],
+                  const float g_, const float dy_, const float dt_) {
+    //Position of this work-item inside its work-group, and the work-group extent
+    const int local_x = get_local_id(0);
+    const int local_y = get_local_id(1);
+    const int stride_x = get_local_size(0);
+    const int stride_y = get_local_size(1);
+
+    //block_height+1 rows of interfaces, block_width interfaces in each row
+    for (int face = local_y; face < block_height+1; face += stride_y) {
+        for (int col = local_x; col < block_width; col += stride_x) {
+            //Column in Q: offset by one to skip the ghost cells
+            const int q_col = col + 1;
+            //hu and hv are swapped before the call to the 1D flux routine (presumably
+            //because it treats its second component as the momentum across the interface)
+            const float3 Q_lower = (float3)(Q[0][face][q_col],
+                                            Q[2][face][q_col],
+                                            Q[1][face][q_col]);
+            const float3 Q_upper = (float3)(Q[0][face+1][q_col],
+                                            Q[2][face+1][q_col],
+                                            Q[1][face+1][q_col]);
+            //The two momentum components are swapped back when the flux is stored
+            const float3 flux = FORCE_1D_flux(Q_lower, Q_upper, g_, dy_, dt_);
+            G[0][face][col] = flux.x;
+            G[1][face][col] = flux.z;
+            G[2][face][col] = flux.y;
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+
+/**
+  * Advances (h, hu, hv) by one time step dt_ using dimensional splitting:
+  * first fluxes along x and an x-update, then fluxes along y and a y-update.
+  *
+  * The work is done on a local-memory tile Q that is filled by readBlock1 from the
+  * input buffers and stored by writeBlock1 to the output buffers. readBlock1,
+  * noFlowBoundary1, evolveF1, evolveG1 and writeBlock1 are not defined in this
+  * file (presumably they are provided by common.opencl).
+  *
+  * nx_, ny_: size of the domain, passed on to the helpers
+  * dx_, dy_: cell size along x and y
+  * dt_:      time step
+  * g_:       gravitational constant
+  */
+__kernel void swe_2D(
+        //Discretization parameters
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+        
+        //Gravitational constant
+        float g_,
+        
+        //Input h^n
+        __global float* h0_ptr_, int h0_pitch_,
+        __global float* hu0_ptr_, int hu0_pitch_,
+        __global float* hv0_ptr_, int hv0_pitch_,
+        
+        //Output h^{n+1}
+        __global float* h1_ptr_, int h1_pitch_,
+        __global float* hu1_ptr_, int hu1_pitch_,
+        __global float* hv1_ptr_, int hv1_pitch_) {
+    
+    //Local-memory tile of the conserved variables, two cells wider and taller than the block
+    __local float Q[3][block_height+2][block_width+2];
+    
+    //Flux buffer, used for the x-fluxes first and reused for the y-fluxes afterwards
+    __local float F[3][block_height+1][block_width+1];
+    
+    //Read into shared memory
+    readBlock1(h0_ptr_, h0_pitch_,
+               hu0_ptr_, hu0_pitch_,
+               hv0_ptr_, hv0_pitch_,
+               Q, nx_, ny_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    //Set boundary conditions
+    noFlowBoundary1(Q, nx_, ny_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    //Compute flux along x, and evolve
+    computeFluxF(Q, F, g_, dx_, dt_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    evolveF1(Q, F, nx_, ny_, dx_, dt_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    //Set boundary conditions
+    noFlowBoundary1(Q, nx_, ny_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    //Compute flux along y, and evolve
+    computeFluxG(Q, F, g_, dy_, dt_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    evolveG1(Q, F, nx_, ny_, dy_, dt_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    //Write to main memory
+    writeBlock1(h1_ptr_, h1_pitch_,
+                hu1_ptr_, hu1_pitch_,
+                hv1_ptr_, hv1_pitch_,
+                Q, nx_, ny_);
+}
--- a/SWESimulators/HLL.py
+++ b/SWESimulators/HLL.py
@@ -0,0 +1,129 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements the HLL flux
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+#Import packages we need
+import numpy as np
+import pyopencl as cl #OpenCL in Python
+import Common
+
+
+
+        
+        
+        
+
+
+"""
+Class that solves the SW equations using the Harten-Lax-van Leer approximate Riemann solver
+"""
+class HLL:
+
+    """
+    Initialization routine
+    h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
+    u0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
+    v0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
+    nx: Number of cells along x-axis
+    ny: Number of cells along y-axis
+    dx: Grid cell spacing along x-axis (20 000 m)
+    dy: Grid cell spacing along y-axis (20 000 m)
+    dt: Size of each timestep (90 s)
+    g: Gravitational accelleration (9.81 m/s^2)
+    """
+    def __init__(self, \
+                 cl_ctx,
+                 h0, u0, v0, \
+                 nx, ny, \
+                 dx, dy, dt, \
+                 g, \
+                 block_width=16, block_height=16):
+        self.cl_ctx = cl_ctx
+
+        #Create an OpenCL command queue
+        self.cl_queue = cl.CommandQueue(self.cl_ctx)
+
+        #Get kernels
+        self.lxf_kernel = Common.get_kernel(self.cl_ctx, "HLL_kernel.opencl", block_width, block_height)
+        
+        #Create data by uploading to device
+        ghost_cells_x = 1
+        ghost_cells_y = 1
+        self.cl_data = Common.SWEDataArkawaA(self.cl_ctx, nx, ny, ghost_cells_x, ghost_cells_y, h0, u0, v0)
+        
+        #Save input parameters
+        #Notice that we need to specify them in the correct dataformat for the
+        #OpenCL kernel
+        self.nx = np.int32(nx)
+        self.ny = np.int32(ny)
+        self.dx = np.float32(dx)
+        self.dy = np.float32(dy)
+        self.dt = np.float32(dt)
+        self.g = np.float32(g)
+        
+        #Initialize time
+        self.t = np.float32(0.0)
+        
+        #Compute kernel launch parameters
+        self.local_size = (block_width, block_height) 
+        self.global_size = ( \
+                       int(np.ceil(self.nx / float(self.local_size[0])) * self.local_size[0]), \
+                       int(np.ceil(self.ny / float(self.local_size[1])) * self.local_size[1]) \
+                      ) 
+    
+    
+    
+    
+    """
+    Function which steps n timesteps
+    """
+    def step(self, t_end=0.0):
+        n = int(t_end / self.dt + 1)
+        
+        for i in range(0, n):        
+            local_dt = np.float32(min(self.dt, t_end-i*self.dt))
+            
+            if (local_dt <= 0.0):
+                break
+        
+            self.lxf_kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+                    self.nx, self.ny, \
+                    self.dx, self.dy, local_dt, \
+                    self.g, \
+                    self.cl_data.h0.data,  self.cl_data.h0.pitch,  \
+                    self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                    self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                    self.cl_data.h1.data,  self.cl_data.h1.pitch,  \
+                    self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                    self.cl_data.hv1.data, self.cl_data.hv1.pitch)
+                
+            self.t += local_dt
+            
+            self.cl_data.swap()
+        
+        return self.t
+    
+    
+    
+    
+    
+    def download(self):
+        return self.cl_data.download(self.cl_queue)
+
--- a/SWESimulators/HLL2.py
+++ b/SWESimulators/HLL2.py
@@ -0,0 +1,150 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements the 2nd order HLL flux
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+#Import packages we need
+import numpy as np
+import pyopencl as cl #OpenCL in Python
+import Common
+
+
+        
+        
+        
+        
+        
+
+
+"""
+Class that solves the SW equations using the 2nd order HLL flux
+"""
+class HLL2:
+
+    """
+    Initialization routine
+    h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
+    hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
+    hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
+    nx: Number of cells along x-axis
+    ny: Number of cells along y-axis
+    dx: Grid cell spacing along x-axis (20 000 m)
+    dy: Grid cell spacing along y-axis (20 000 m)
+    dt: Size of each timestep (90 s)
+    g: Gravitational accelleration (9.81 m/s^2)
+    """
+    def __init__(self, \
+                 cl_ctx, \
+                 h0, hu0, hv0, \
+                 nx, ny, \
+                 dx, dy, dt, \
+                 g, \
+                 theta=1.8, \
+                 block_width=16, block_height=16):
+        self.cl_ctx = cl_ctx
+
+        #Create an OpenCL command queue
+        self.cl_queue = cl.CommandQueue(self.cl_ctx)
+
+        #Get kernels
+        self.swe_kernel = Common.get_kernel(self.cl_ctx, "HLL2_kernel.opencl", block_width, block_height)
+        
+        #Create data by uploading to device
+        ghost_cells_x = 2
+        ghost_cells_y = 2
+        self.cl_data = Common.SWEDataArkawaA(self.cl_ctx, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
+        
+        #Save input parameters
+        #Notice that we need to specify them in the correct dataformat for the
+        #OpenCL kernel
+        self.nx = np.int32(nx)
+        self.ny = np.int32(ny)
+        self.dx = np.float32(dx)
+        self.dy = np.float32(dy)
+        self.dt = np.float32(dt)
+        self.g = np.float32(g)
+        self.theta = np.float32(theta)
+        
+        #Initialize time
+        self.t = np.float32(0.0)
+        
+        #Compute kernel launch parameters
+        self.local_size = (block_width, block_height)
+        self.global_size = ( \
+                       int(np.ceil(self.nx / float(self.local_size[0])) * self.local_size[0]), \
+                       int(np.ceil(self.ny / float(self.local_size[1])) * self.local_size[1]) \
+                      ) 
+    
+    
+    
+    
+    """
+    Function which steps n timesteps
+    """
+    def step(self, t_end=0.0):
+        n = int(t_end / (2.0*self.dt) + 1)
+        
+        for i in range(0, n): 
+            #Dimensional splitting: second order accurate for every other timestep,
+            #thus run two timesteps in a go
+            
+            local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt))
+            if (local_dt <= 0.0):
+                break
+                
+            #Along X, then Y
+            self.swe_kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+                    self.nx, self.ny, \
+                    self.dx, self.dy, local_dt, \
+                    self.g, \
+                    self.theta, \
+                    np.int32(0), \
+                    self.cl_data.h0.data, self.cl_data.h0.pitch, \
+                    self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                    self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                    self.cl_data.h1.data, self.cl_data.h1.pitch, \
+                    self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                    self.cl_data.hv1.data, self.cl_data.hv1.pitch)
+            self.cl_data.swap()
+            
+            #Along Y, then X
+            self.swe_kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+                    self.nx, self.ny, \
+                    self.dx, self.dy, local_dt, \
+                    self.g, \
+                    self.theta, \
+                    np.int32(1), \
+                    self.cl_data.h0.data, self.cl_data.h0.pitch, \
+                    self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                    self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                    self.cl_data.h1.data, self.cl_data.h1.pitch, \
+                    self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                    self.cl_data.hv1.data, self.cl_data.hv1.pitch)
+            self.cl_data.swap()
+            
+            self.t += local_dt
+            
+        
+        return self.t
+    
+    
+    
+    def download(self):
+        return self.cl_data.download(self.cl_queue)
+
--- a/SWESimulators/HLL2_kernel.opencl
+++ b/SWESimulators/HLL2_kernel.opencl
@@ -0,0 +1,225 @@
+/*
+This OpenCL kernel implements the second order HLL flux
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include "common.opencl"
+
+
+
+
+
+
+
+/**
+  * Computes the HLL flux along the x axis for all faces, using the slopes in Qx and a half-timestep predictor
+  */
+void computeFluxF(__local float Q[3][block_height+4][block_width+4], //in: cell values, read at rows j+2 and columns i+1, i+2
+                  __local float Qx[3][block_height+2][block_width+2], //in: x slopes, read at rows j and columns i, i+1
+                  __local float F[3][block_height+1][block_width+1], //out: one flux per face, written at [j][i]
+                  const float g_, const float dx_, const float dt_) { //g_ goes to F_func/HLL_flux; dt_/(2*dx_) scales the predictor
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    for (int j=ty; j<block_height; j+=get_local_size(1)) {
+        const int l = j + 2; //Skip ghost cells
+        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
+            const int k = i + 1; //Q column of the cell left of face i; the right cell is k+1
+            // Reconstruct point values of Q at the left and right hand side 
+            // of the cell for both the left (i) and right (i+1) cell 
+            const float3 Q_rl = (float3)(Q[0][l][k+1] - 0.5f*Qx[0][j][i+1],
+                                         Q[1][l][k+1] - 0.5f*Qx[1][j][i+1],
+                                         Q[2][l][k+1] - 0.5f*Qx[2][j][i+1]);
+            const float3 Q_rr = (float3)(Q[0][l][k+1] + 0.5f*Qx[0][j][i+1],
+                                         Q[1][l][k+1] + 0.5f*Qx[1][j][i+1],
+                                         Q[2][l][k+1] + 0.5f*Qx[2][j][i+1]);
+                                         
+            const float3 Q_ll = (float3)(Q[0][l][k] - 0.5f*Qx[0][j][i],
+                                         Q[1][l][k] - 0.5f*Qx[1][j][i],
+                                         Q[2][l][k] - 0.5f*Qx[2][j][i]);
+            const float3 Q_lr = (float3)(Q[0][l][k] + 0.5f*Qx[0][j][i],
+                                         Q[1][l][k] + 0.5f*Qx[1][j][i],
+                                         Q[2][l][k] + 0.5f*Qx[2][j][i]);
+                                        
+            //Evolve half a timestep (predictor step) for the two states adjacent to the face
+            const float3 Q_r_bar = Q_rl + dt_/(2.0f*dx_) * (F_func(Q_rl, g_) - F_func(Q_rr, g_));
+            const float3 Q_l_bar = Q_lr + dt_/(2.0f*dx_) * (F_func(Q_ll, g_) - F_func(Q_lr, g_));
+
+            // Compute flux based on prediction
+            const float3 flux = HLL_flux(Q_l_bar, Q_r_bar, g_);
+            
+            //Write to shared memory
+            F[0][j][i] = flux.x;
+            F[1][j][i] = flux.y;
+            F[2][j][i] = flux.z;
+        }
+    }
+}
+
+
+
+
+
+/**
+  * Computes the HLL flux along the y axis for all faces, using the slopes in Qy and a half-timestep predictor
+  */
+void computeFluxG(__local float Q[3][block_height+4][block_width+4], //in: cell values, read at rows j+1, j+2 and columns i+2
+                  __local float Qy[3][block_height+2][block_width+2], //in: y slopes, read at rows j, j+1 and columns i
+                  __local float G[3][block_height+1][block_width+1], //out: one flux per face, written at [j][i]
+                  const float g_, const float dy_, const float dt_) { //g_ goes to F_func/HLL_flux; dt_/(2*dy_) scales the predictor
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
+        const int l = j + 1; //Q row of the cell below face j; the cell above is l+1
+        for (int i=tx; i<block_width; i+=get_local_size(0)) {            
+            const int k = i + 2; //Skip ghost cells
+            // Reconstruct point values of Q at both sides of the cell, for both
+            // the lower (j) and upper (j+1) cell (left/right in the transposed domain)
+            //Note that hu and hv are swapped ("transposing" the domain)!
+            const float3 Q_rl = (float3)(Q[0][l+1][k] - 0.5f*Qy[0][j+1][i],
+                                         Q[2][l+1][k] - 0.5f*Qy[2][j+1][i],
+                                         Q[1][l+1][k] - 0.5f*Qy[1][j+1][i]);
+            const float3 Q_rr = (float3)(Q[0][l+1][k] + 0.5f*Qy[0][j+1][i],
+                                         Q[2][l+1][k] + 0.5f*Qy[2][j+1][i],
+                                         Q[1][l+1][k] + 0.5f*Qy[1][j+1][i]);
+                                        
+            const float3 Q_ll = (float3)(Q[0][l][k] - 0.5f*Qy[0][j][i],
+                                         Q[2][l][k] - 0.5f*Qy[2][j][i],
+                                         Q[1][l][k] - 0.5f*Qy[1][j][i]);
+            const float3 Q_lr = (float3)(Q[0][l][k] + 0.5f*Qy[0][j][i],
+                                         Q[2][l][k] + 0.5f*Qy[2][j][i],
+                                         Q[1][l][k] + 0.5f*Qy[1][j][i]);
+                                     
+            //Evolve half a timestep (predictor step); F_func is reused because of the hu/hv swap
+            const float3 Q_r_bar = Q_rl + dt_/(2.0f*dy_) * (F_func(Q_rl, g_) - F_func(Q_rr, g_));
+            const float3 Q_l_bar = Q_lr + dt_/(2.0f*dy_) * (F_func(Q_ll, g_) - F_func(Q_lr, g_));
+            
+            // Compute flux based on prediction
+            const float3 flux = HLL_flux(Q_l_bar, Q_r_bar, g_);
+            
+            //Write to shared memory
+            //Note that we here swap hu and hv back to the original
+            G[0][j][i] = flux.x;
+            G[1][j][i] = flux.z;
+            G[2][j][i] = flux.y;
+        }
+    }
+}
+
+
+
+
+
+
+
+
+__kernel void swe_2D( //One dimensionally split timestep: read block, sweep both axes in the order given by step_, write block
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+        float g_,
+        
+        float theta_, //Forwarded to minmodSlopeX / minmodSlopeY
+        
+        int step_, //0: x sweep first, then y; any other value: y sweep first, then x
+        
+        //Input h^n
+        __global float* h0_ptr_, int h0_pitch_,
+        __global float* hu0_ptr_, int hu0_pitch_,
+        __global float* hv0_ptr_, int hv0_pitch_,
+        
+        //Output h^{n+1}
+        __global float* h1_ptr_, int h1_pitch_,
+        __global float* hu1_ptr_, int hu1_pitch_,
+        __global float* hv1_ptr_, int hv1_pitch_) {
+            
+    //Shared memory variables (Qx and F are reused as slope and flux storage for the y sweep)
+    __local float Q[3][block_height+4][block_width+4];
+    __local float Qx[3][block_height+2][block_width+2];
+    __local float F[3][block_height+1][block_width+1];
+    
+    
+    
+    
+    //Read into shared memory
+    readBlock2(h0_ptr_, h0_pitch_,
+               hu0_ptr_, hu0_pitch_,
+               hv0_ptr_, hv0_pitch_,
+               Q, nx_, ny_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    //Set boundary conditions
+    noFlowBoundary2(Q, nx_, ny_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    //Step 0 => evolve x first, then y
+    if (step_ == 0) {
+        //Compute fluxes along the x axis and evolve
+        minmodSlopeX(Q, Qx, theta_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        computeFluxF(Q, Qx, F, g_, dx_, dt_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        evolveF2(Q, F, nx_, ny_, dx_, dt_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        
+        //Set boundary conditions again before the second sweep
+        noFlowBoundary2(Q, nx_, ny_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        
+        //Compute fluxes along the y axis and evolve
+        minmodSlopeY(Q, Qx, theta_); //Qx buffer is the one later passed as Qy to computeFluxG
+        barrier(CLK_LOCAL_MEM_FENCE);
+        computeFluxG(Q, Qx, F, g_, dy_, dt_); //F buffer is passed as G
+        barrier(CLK_LOCAL_MEM_FENCE);
+        evolveG2(Q, F, nx_, ny_, dy_, dt_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    //Step 1 => evolve y first, then x
+    else {
+        //Compute fluxes along the y axis and evolve
+        minmodSlopeY(Q, Qx, theta_); //Qx buffer is the one later passed as Qy to computeFluxG
+        barrier(CLK_LOCAL_MEM_FENCE);
+        computeFluxG(Q, Qx, F, g_, dy_, dt_); //F buffer is passed as G
+        barrier(CLK_LOCAL_MEM_FENCE);
+        evolveG2(Q, F, nx_, ny_, dy_, dt_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        
+        //Set boundary conditions again before the second sweep
+        noFlowBoundary2(Q, nx_, ny_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        
+        //Compute fluxes along the x axis and evolve
+        minmodSlopeX(Q, Qx, theta_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        computeFluxF(Q, Qx, F, g_, dx_, dt_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        evolveF2(Q, F, nx_, ny_, dx_, dt_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    
+    
+    
+    
+    // Write to main memory for all internal cells
+    writeBlock2(h1_ptr_, h1_pitch_,
+                hu1_ptr_, hu1_pitch_,
+                hv1_ptr_, hv1_pitch_,
+                Q, nx_, ny_);
+}
--- a/SWESimulators/HLL_kernel.opencl
+++ b/SWESimulators/HLL_kernel.opencl
@@ -0,0 +1,156 @@
+/*
+This OpenCL kernel implements the HLL flux
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+#include "common.opencl"
+
+
+
+
+
+/**
+  * Computes the HLL flux along the x axis for all faces, directly from the two neighbouring cell values
+  */
+void computeFluxF(__local float Q[3][block_height+2][block_width+2], //in: cell values, read at rows j+1 and columns i, i+1
+                  __local float F[3][block_height+1][block_width+1], //out: one flux per face, written at [j][i]
+                  const float g_) { //Forwarded to HLL_flux
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    for (int j=ty; j<block_height; j+=get_local_size(1)) {   
+        const int l = j + 1; //Skip ghost cells     
+        for (int i=tx; i<block_width+1; i+=get_local_size(0)) { 
+            const int k = i; //Q column of the cell left of face i; the right cell is k+1
+            
+            const float3 Q_l  = (float3)(Q[0][l][k  ], Q[1][l][k  ], Q[2][l][k  ]);
+            const float3 Q_r  = (float3)(Q[0][l][k+1], Q[1][l][k+1], Q[2][l][k+1]);
+            
+            const float3 flux = HLL_flux(Q_l, Q_r, g_);
+            
+            //Write to shared memory
+            F[0][j][i] = flux.x;
+            F[1][j][i] = flux.y;
+            F[2][j][i] = flux.z;
+        }
+    }
+}
+
+
+
+
+
+/**
+  * Computes the HLL flux along the y axis for all faces, directly from the two neighbouring cell values
+  */
+void computeFluxG(__local float Q[3][block_height+2][block_width+2], //in: cell values, read at rows j, j+1 and columns i+1
+                  __local float G[3][block_height+1][block_width+1], //out: one flux per face, written at [j][i]
+                  const float g_) { //Forwarded to HLL_flux
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
+        const int l = j; //Q row of the cell below face j; the cell above is l+1
+        for (int i=tx; i<block_width; i+=get_local_size(0)) {            
+            const int k = i + 1; //Skip ghost cells
+            
+            //Note that hu and hv are swapped ("transposing" the domain)!
+            const float3 Q_l = (float3)(Q[0][l  ][k], Q[2][l  ][k], Q[1][l  ][k]);
+            const float3 Q_r = (float3)(Q[0][l+1][k], Q[2][l+1][k], Q[1][l+1][k]);
+                                       
+            // Computed flux
+            const float3 flux = HLL_flux(Q_l, Q_r, g_);
+            
+            //Write to shared memory
+            //Note that we here swap hu and hv back to the original
+            G[0][j][i] = flux.x;
+            G[1][j][i] = flux.z;
+            G[2][j][i] = flux.y;
+        }
+    }
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+__kernel void swe_2D( //One timestep: read block, x sweep (F), boundary update, y sweep (G), write block
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+        float g_,
+        
+        //Input h^n
+        __global float* h0_ptr_, int h0_pitch_,
+        __global float* hu0_ptr_, int hu0_pitch_,
+        __global float* hv0_ptr_, int hv0_pitch_,
+        
+        //Output h^{n+1}
+        __global float* h1_ptr_, int h1_pitch_,
+        __global float* hu1_ptr_, int hu1_pitch_,
+        __global float* hv1_ptr_, int hv1_pitch_) {
+    //Shared memory variables (F is reused as flux storage for the y sweep)
+    __local float Q[3][block_height+2][block_width+2];
+    __local float F[3][block_height+1][block_width+1];
+    
+    
+    //Read into shared memory
+    readBlock1(h0_ptr_, h0_pitch_,
+               hu0_ptr_, hu0_pitch_,
+               hv0_ptr_, hv0_pitch_,
+               Q, nx_, ny_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    noFlowBoundary1(Q, nx_, ny_); //Set boundary conditions before the first sweep
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    //Compute F flux
+    computeFluxF(Q, F, g_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    evolveF1(Q, F, nx_, ny_, dx_, dt_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    //Set boundary conditions
+    noFlowBoundary1(Q, nx_, ny_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    //Compute G flux
+    computeFluxG(Q, F, g_); //F buffer is passed as G
+    barrier(CLK_LOCAL_MEM_FENCE);
+    evolveG1(Q, F, nx_, ny_, dy_, dt_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    
+    
+    
+    // Write to main memory for all internal cells
+    writeBlock1(h1_ptr_, h1_pitch_,
+                hu1_ptr_, hu1_pitch_,
+                hv1_ptr_, hv1_pitch_,
+                Q, nx_, ny_);
+}
--- a/SWESimulators/KP07.py
+++ b/SWESimulators/KP07.py
@@ -0,0 +1,198 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements the Kurganov-Petrova numerical scheme 
+for the shallow water equations, described in 
+A. Kurganov & Guergana Petrova
+A Second-Order Well-Balanced Positivity Preserving Central-Upwind
+Scheme for the Saint-Venant System Communications in Mathematical
+Sciences, 5 (2007), 133-160. 
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+#Import packages we need
+import numpy as np
+import pyopencl as cl #OpenCL in Python
+import Common
+
+
+
+
+
+"""
+Class that solves the SW equations using the Kurganov-Petrova (KP07) central-upwind scheme
+"""
+class KP07:
+
+    """
+    Initialization routine. NOTE(review): the wind_* entries appear to describe fields of wind_stress (step reads wind_stress.type, .tau0, ...) - confirm
+    h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells - NOTE(review): ghost_cells_x = ghost_cells_y = 2 is passed to SWEDataArkawaA; confirm this size
+    hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
+    hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
+    nx: Number of cells along x-axis
+    ny: Number of cells along y-axis
+    dx: Grid cell spacing along x-axis (20 000 m)
+    dy: Grid cell spacing along y-axis (20 000 m)
+    dt: Size of each timestep (90 s)
+    g: Gravitational acceleration (9.81 m/s^2)
+    f: Coriolis parameter (1.2e-4 s^-1)
+    r: Bottom friction coefficient (2.4e-3 m/s)
+    wind_type: Type of wind stress, 0=Uniform along shore, 1=bell shaped along shore, 2=moving cyclone
+    wind_tau0: Amplitude of wind stress (Pa)
+    wind_rho: Density of sea water (1025.0 kg / m^3)
+    wind_alpha: Offshore e-folding length (1/(10*dx) = 5e-6 m^-1)
+    wind_xm: Maximum wind stress for bell shaped wind stress
+    wind_Rc: Distance to max wind stress from center of cyclone (10dx = 200 000 m)
+    wind_x0: Initial x position of moving cyclone (dx*(nx/2) - u0*3600.0*48.0)
+    wind_y0: Initial y position of moving cyclone (dy*(ny/2) - v0*3600.0*48.0)
+    wind_u0: Translation speed along x for moving cyclone (30.0/sqrt(5.0))
+    wind_v0: Translation speed along y for moving cyclone (-0.5*u0)
+    """
+    def __init__(self, \
+                 cl_ctx, \
+                 h0, hu0, hv0, \
+                 nx, ny, \
+                 dx, dy, dt, \
+                 g, f=0.0, r=0.0, \
+                 theta=1.3, use_rk2=True, #theta is forwarded to the kernel; use_rk2 selects two kernel passes per timestep
+                 wind_stress=Common.WindStressParams(), \
+                 block_width=16, block_height=16): #Used as the kernel local size and passed to Common.get_kernel
+        self.cl_ctx = cl_ctx
+                 
+        #Create an OpenCL command queue
+        self.cl_queue = cl.CommandQueue(self.cl_ctx)
+
+        #Get kernels
+        self.kp07_kernel = Common.get_kernel(self.cl_ctx, "KP07_kernel.opencl", block_width, block_height)
+        
+        #Create data by uploading to device
+        ghost_cells_x = 2
+        ghost_cells_y = 2
+        self.cl_data = Common.SWEDataArkawaA(self.cl_ctx, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
+        
+        #Save input parameters
+        #Notice that we need to specify them in the correct dataformat for the
+        #OpenCL kernel
+        self.nx = np.int32(nx)
+        self.ny = np.int32(ny)
+        self.dx = np.float32(dx)
+        self.dy = np.float32(dy)
+        self.dt = np.float32(dt)
+        self.g = np.float32(g)
+        self.f = np.float32(f)
+        self.r = np.float32(r)
+        self.theta = np.float32(theta)
+        self.use_rk2 = use_rk2
+        self.wind_stress = wind_stress
+        
+        #Initialize time
+        self.t = np.float32(0.0)
+        
+        #Compute kernel launch parameters: global size is nx, ny rounded up to a multiple of the block size
+        self.local_size = (block_width, block_height) 
+        self.global_size = ( \
+                       int(np.ceil(self.nx / float(self.local_size[0])) * self.local_size[0]), \
+                       int(np.ceil(self.ny / float(self.local_size[1])) * self.local_size[1]) \
+                      ) 
+    
+    
+    
+    
+    """
+    Function which advances the simulation by t_end (a duration counted from this call) in steps of at most dt, and returns self.t
+    """
+    def step(self, t_end=0.0):
+        n = int(t_end / self.dt + 1) #Upper bound on the number of timesteps needed to cover t_end
+        
+        for i in range(0, n):        
+            local_dt = np.float32(min(self.dt, t_end-i*self.dt)) #The last timestep is shortened to end at t_end
+            
+            if (local_dt <= 0.0):
+                break #Nothing left to simulate
+        
+            if (self.use_rk2): #Two passes (step flag 0, then 1): *0 buffers listed before *1, then reversed; no swap afterwards
+                self.kp07_kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+                        self.nx, self.ny, \
+                        self.dx, self.dy, local_dt, \
+                        self.g, \
+                        self.theta, \
+                        self.f, \
+                        self.r, \
+                        np.int32(0), \
+                        self.cl_data.h0.data,  self.cl_data.h0.pitch,  \
+                        self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                        self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                        self.cl_data.h1.data,  self.cl_data.h1.pitch,  \
+                        self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                        self.cl_data.hv1.data, self.cl_data.hv1.pitch, \
+                        self.wind_stress.type, \
+                        self.wind_stress.tau0, self.wind_stress.rho, self.wind_stress.alpha, self.wind_stress.xm, self.wind_stress.Rc, \
+                        self.wind_stress.x0, self.wind_stress.y0, \
+                        self.wind_stress.u0, self.wind_stress.v0, \
+                        self.t)
+                self.kp07_kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+                        self.nx, self.ny, \
+                        self.dx, self.dy, local_dt, \
+                        self.g, \
+                        self.theta, \
+                        self.f, \
+                        self.r, \
+                        np.int32(1), \
+                        self.cl_data.h1.data,  self.cl_data.h1.pitch,  \
+                        self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                        self.cl_data.hv1.data, self.cl_data.hv1.pitch, \
+                        self.cl_data.h0.data,  self.cl_data.h0.pitch,  \
+                        self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                        self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                        self.wind_stress.type, \
+                        self.wind_stress.tau0, self.wind_stress.rho, self.wind_stress.alpha, self.wind_stress.xm, self.wind_stress.Rc, \
+                        self.wind_stress.x0, self.wind_stress.y0, \
+                        self.wind_stress.u0, self.wind_stress.v0, \
+                        self.t)
+            else: #Single kernel pass (step flag 0) followed by a buffer swap
+                self.kp07_kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+                        self.nx, self.ny, \
+                        self.dx, self.dy, local_dt, \
+                        self.g, \
+                        self.theta, \
+                        self.f, \
+                        self.r, \
+                        np.int32(0), \
+                        self.cl_data.h0.data,  self.cl_data.h0.pitch,  \
+                        self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                        self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                        self.cl_data.h1.data,  self.cl_data.h1.pitch,  \
+                        self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                        self.cl_data.hv1.data, self.cl_data.hv1.pitch, \
+                        self.wind_stress.type, \
+                        self.wind_stress.tau0, self.wind_stress.rho, self.wind_stress.alpha, self.wind_stress.xm, self.wind_stress.Rc, \
+                        self.wind_stress.x0, self.wind_stress.y0, \
+                        self.wind_stress.u0, self.wind_stress.v0, \
+                        self.t)
+                self.cl_data.swap()
+                
+            self.t += local_dt #Accumulated simulation time, also handed to the kernel as its last argument
+            
+        
+        return self.t
+    
+    
+    
+    
+    def download(self): #Delegates to self.cl_data.download, handing it this object's command queue
+        return self.cl_data.download(self.cl_queue)
+
--- a/SWESimulators/KP07_dimsplit.py
+++ b/SWESimulators/KP07_dimsplit.py
@@ -0,0 +1,153 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements the Kurganov-Petrova numerical scheme 
+for the shallow water equations, described in 
+A. Kurganov & Guergana Petrova
+A Second-Order Well-Balanced Positivity Preserving Central-Upwind
+Scheme for the Saint-Venant System Communications in Mathematical
+Sciences, 5 (2007), 133-160. 
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+#Import packages we need
+import numpy as np
+import pyopencl as cl #OpenCL in Python
+import Common
+
+
+
+
+
+"""
+Class that solves the SW equations using the dimensionally split KP07 scheme
+"""
+class KP07_dimsplit:
+
+    """
+    Initialization routine
+    h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells - NOTE(review): ghost_cells_x = ghost_cells_y = 2 is passed to SWEDataArkawaA; confirm this size
+    hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
+    hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
+    nx: Number of cells along x-axis
+    ny: Number of cells along y-axis
+    dx: Grid cell spacing along x-axis (20 000 m)
+    dy: Grid cell spacing along y-axis (20 000 m)
+    dt: Size of each timestep (90 s)
+    g: Gravitational acceleration (9.81 m/s^2)
+    """
+    def __init__(self, \
+                 cl_ctx, \
+                 h0, hu0, hv0, \
+                 nx, ny, \
+                 dx, dy, dt, \
+                 g, \
+                 theta=1.3, \
+                 block_width=16, block_height=16): #Used as the kernel local size and passed to Common.get_kernel
+        self.cl_ctx = cl_ctx
+                 
+        #Create an OpenCL command queue
+        self.cl_queue = cl.CommandQueue(self.cl_ctx)
+
+        #Get kernels
+        self.swe_kernel = Common.get_kernel(self.cl_ctx, "KP07_dimsplit_kernel.opencl", block_width, block_height)
+        
+        #Create data by uploading to device
+        ghost_cells_x = 2
+        ghost_cells_y = 2
+        self.cl_data = Common.SWEDataArkawaA(self.cl_ctx, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
+        
+        #Save input parameters
+        #Notice that we need to specify them in the correct dataformat for the
+        #OpenCL kernel
+        self.nx = np.int32(nx)
+        self.ny = np.int32(ny)
+        self.dx = np.float32(dx)
+        self.dy = np.float32(dy)
+        self.dt = np.float32(dt)
+        self.g = np.float32(g)
+        self.theta = np.float32(theta)
+        
+        #Initialize time
+        self.t = np.float32(0.0)
+        
+        #Compute kernel launch parameters: global size is nx, ny rounded up to a multiple of the block size
+        self.local_size = (block_width, block_height) 
+        self.global_size = ( \
+                       int(np.ceil(self.nx / float(self.local_size[0])) * self.local_size[0]), \
+                       int(np.ceil(self.ny / float(self.local_size[1])) * self.local_size[1]) \
+                      ) 
+    
+    
+    
+    
+    """
+    Function which advances the simulation by t_end (a duration counted from this call) in pairs of timesteps, and returns self.t
+    """
+    def step(self, t_end=0.0):
+        n = int(t_end / (2.0*self.dt) + 1) #Upper bound on the number of timestep pairs needed to cover t_end
+        
+        for i in range(0, n): 
+            #Dimensional splitting: second order accurate for every other timestep,
+            #thus run two timesteps in a go
+            
+            #Compute timestep: half of what remains of this pair, capped at dt; both kernel calls below use it
+            local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt))
+            if (local_dt <= 0.0):
+                break #Nothing left to simulate
+                
+            #Along X, then Y
+            self.swe_kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+                    self.nx, self.ny, \
+                    self.dx, self.dy, local_dt, \
+                    self.g, \
+                    self.theta, \
+                    np.int32(0), \
+                    self.cl_data.h0.data, self.cl_data.h0.pitch, \
+                    self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                    self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                    self.cl_data.h1.data, self.cl_data.h1.pitch, \
+                    self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                    self.cl_data.hv1.data, self.cl_data.hv1.pitch)
+            self.cl_data.swap()
+            
+            #Along Y, then X
+            self.swe_kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+                    self.nx, self.ny, \
+                    self.dx, self.dy, local_dt, \
+                    self.g, \
+                    self.theta, \
+                    np.int32(1), \
+                    self.cl_data.h0.data, self.cl_data.h0.pitch, \
+                    self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                    self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                    self.cl_data.h1.data, self.cl_data.h1.pitch, \
+                    self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                    self.cl_data.hv1.data, self.cl_data.hv1.pitch)
+            self.cl_data.swap()
+            
+            self.t += 2.0*local_dt #Two kernel calls of local_dt each were issued above
+            
+        
+        return self.t
+    
+    
+    
+    
+    def download(self): #Delegates to self.cl_data.download, handing it this object's command queue
+        return self.cl_data.download(self.cl_queue)
+
--- a/SWESimulators/KP07_dimsplit_kernel.opencl
+++ b/SWESimulators/KP07_dimsplit_kernel.opencl
@@ -0,0 +1,217 @@
+/*
+This OpenCL kernel implements the Kurganov-Petrova numerical scheme 
+for the shallow water equations, described in 
+A. Kurganov & Guergana Petrova
+A Second-Order Well-Balanced Positivity Preserving Central-Upwind
+Scheme for the Saint-Venant System Communications in Mathematical
+Sciences, 5 (2007), 133-160. 
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+#include "common.opencl"
+
+
+
+void computeFluxF(__local float Q[3][block_height+4][block_width+4],
+                  __local float Qx[3][block_height+2][block_width+2],
+                  __local float F[3][block_height+1][block_width+1],
+                  const float g_, const float dx_, const float dt_) {
+    // Computes the fluxes along the x axis. For each of the block_height rows
+    // and block_width+1 cell faces, the states on both sides of the face are
+    // reconstructed from Q and the slopes Qx, evolved half a timestep, and
+    // handed to CentralUpwindFlux. The result is written to F.
+    
+    //Loop strides: the work-group threads cooperatively cover all faces
+    const int stride_x = get_local_size(0);
+    const int stride_y = get_local_size(1);
+    
+    //Factor applied to the flux difference in the predictor step
+    const float predictor_scale = dt_/(2.0f*dx_);
+    
+    for (int row=get_local_id(1); row<block_height; row+=stride_y) {
+        const int q_row = row + 2; //Skip ghost cells
+        
+        for (int face=get_local_id(0); face<block_width+1; face+=stride_x) {
+            const int q_col = face + 1;
+            
+            //Cell averages and slopes of the cell left of the face ...
+            const float3 avg_l   = (float3)(Q[0][q_row][q_col],
+                                            Q[1][q_row][q_col],
+                                            Q[2][q_row][q_col]);
+            const float3 slope_l = (float3)(Qx[0][row][face],
+                                            Qx[1][row][face],
+                                            Qx[2][row][face]);
+            
+            // ... and of the cell right of the face
+            const float3 avg_r   = (float3)(Q[0][q_row][q_col+1],
+                                            Q[1][q_row][q_col+1],
+                                            Q[2][q_row][q_col+1]);
+            const float3 slope_r = (float3)(Qx[0][row][face+1],
+                                            Qx[1][row][face+1],
+                                            Qx[2][row][face+1]);
+            
+            //Point values at the low and high x edge of both cells
+            const float3 left_lo  = avg_l - 0.5f*slope_l;
+            const float3 left_hi  = avg_l + 0.5f*slope_l;
+            const float3 right_lo = avg_r - 0.5f*slope_r;
+            const float3 right_hi = avg_r + 0.5f*slope_r;
+            
+            //Predictor step: evolve the two values adjacent to the face half a timestep
+            const float3 face_plus  = right_lo + predictor_scale * (F_func(right_lo, g_) - F_func(right_hi, g_));
+            const float3 face_minus = left_hi  + predictor_scale * (F_func(left_lo, g_)  - F_func(left_hi, g_));
+            
+            //Flux across the face based on the predicted values
+            const float3 flux = CentralUpwindFlux(face_minus, face_plus, g_);
+            
+            //Store in local memory
+            F[0][row][face] = flux.x;
+            F[1][row][face] = flux.y;
+            F[2][row][face] = flux.z;
+        }
+    }
+}
+
+void computeFluxG(__local float Q[3][block_height+4][block_width+4],
+                  __local float Qy[3][block_height+2][block_width+2],
+                  __local float G[3][block_height+1][block_width+1],
+                  const float g_, const float dy_, const float dt_) {
+    // Computes the fluxes along the y axis. For each of the block_height+1
+    // cell faces and block_width columns, the states on both sides of the face
+    // are reconstructed from Q and the slopes Qy, evolved half a timestep, and
+    // handed to CentralUpwindFlux. The result is written to G.
+    // hu and hv are swapped on the way in ("transposing" the domain) so that
+    // the x-direction flux functions can be reused, and swapped back on output.
+    
+    //Loop strides: the work-group threads cooperatively cover all faces
+    const int stride_x = get_local_size(0);
+    const int stride_y = get_local_size(1);
+    
+    //Factor applied to the flux difference in the predictor step
+    const float predictor_scale = dt_/(2.0f*dy_);
+    
+    for (int face=get_local_id(1); face<block_height+1; face+=stride_y) {
+        const int q_row = face + 1;
+        
+        for (int col=get_local_id(0); col<block_width; col+=stride_x) {
+            const int q_col = col + 2; //Skip ghost cells
+            
+            //Cell averages and slopes of the cell below the face (hu/hv swapped) ...
+            const float3 avg_l   = (float3)(Q[0][q_row][q_col],
+                                            Q[2][q_row][q_col],
+                                            Q[1][q_row][q_col]);
+            const float3 slope_l = (float3)(Qy[0][face][col],
+                                            Qy[2][face][col],
+                                            Qy[1][face][col]);
+            
+            // ... and of the cell above the face (hu/hv swapped)
+            const float3 avg_r   = (float3)(Q[0][q_row+1][q_col],
+                                            Q[2][q_row+1][q_col],
+                                            Q[1][q_row+1][q_col]);
+            const float3 slope_r = (float3)(Qy[0][face+1][col],
+                                            Qy[2][face+1][col],
+                                            Qy[1][face+1][col]);
+            
+            //Point values at the low and high y edge of both cells
+            const float3 left_lo  = avg_l - 0.5f*slope_l;
+            const float3 left_hi  = avg_l + 0.5f*slope_l;
+            const float3 right_lo = avg_r - 0.5f*slope_r;
+            const float3 right_hi = avg_r + 0.5f*slope_r;
+            
+            //Predictor step: evolve the two values adjacent to the face half a timestep
+            const float3 face_plus  = right_lo + predictor_scale * (F_func(right_lo, g_) - F_func(right_hi, g_));
+            const float3 face_minus = left_hi  + predictor_scale * (F_func(left_lo, g_)  - F_func(left_hi, g_));
+            
+            //Flux across the face based on the predicted values
+            const float3 flux = CentralUpwindFlux(face_minus, face_plus, g_);
+            
+            //Store in local memory, swapping hu and hv back to the original order
+            G[0][face][col] = flux.x;
+            G[1][face][col] = flux.z;
+            G[2][face][col] = flux.y;
+        }
+    }
+}
+
+
+
+
+/**
+  * This dimensionally split kernel evolves the solution one step along each
+  * axis: first x then y when step_ is 0, otherwise first y then x.
+  * All intermediate results are kept in local memory; only the final state
+  * is written to the output buffers.
+  * NOTE(review): the host presumably alternates step_ = 0 and step_ = 1 in
+  * consecutive launches -- confirm against the Python driver.
+  */
+__kernel void swe_2D(
+        //Domain size, grid spacing and timestep
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+        //Gravitational acceleration (passed on to the flux functions)
+        float g_,
+        
+        //Parameter passed on to the minmod slope reconstruction
+        float theta_,
+        
+        //Selects the sweep order: 0 => x then y, otherwise y then x
+        int step_,
+        
+        //Input h^n
+        __global float* h0_ptr_, int h0_pitch_,
+        __global float* hu0_ptr_, int hu0_pitch_,
+        __global float* hv0_ptr_, int hv0_pitch_,
+        
+        //Output h^{n+1}
+        __global float* h1_ptr_, int h1_pitch_,
+        __global float* hu1_ptr_, int hu1_pitch_,
+        __global float* hv1_ptr_, int hv1_pitch_) {
+        
+        
+    //Shared memory variables
+    //Note that Qx and F are reused for the y direction as well (they are
+    //passed as Qy and G to minmodSlopeY / computeFluxG / evolveG2 below)
+    __local float Q[3][block_height+4][block_width+4];
+    __local float Qx[3][block_height+2][block_width+2];
+    __local float F[3][block_height+1][block_width+1];
+    
+    
+    
+    //Read into shared memory
+    readBlock2(h0_ptr_, h0_pitch_,
+               hu0_ptr_, hu0_pitch_,
+               hv0_ptr_, hv0_pitch_,
+               Q, nx_, ny_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    
+    //Fix boundary conditions
+    noFlowBoundary2(Q, nx_, ny_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    
+    
+    //Every stage below is followed by a barrier, since the next stage reads
+    //local memory written by other threads of the work-group
+    
+    //Step 0 => evolve x first, then y
+    if (step_ == 0) {
+        //Compute fluxes along the x axis and evolve
+        minmodSlopeX(Q, Qx, theta_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        computeFluxF(Q, Qx, F, g_, dx_, dt_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        evolveF2(Q, F, nx_, ny_, dx_, dt_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        
+        //Set boundary conditions
+        noFlowBoundary2(Q, nx_, ny_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        
+        //Compute fluxes along the y axis and evolve
+        minmodSlopeY(Q, Qx, theta_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        computeFluxG(Q, Qx, F, g_, dy_, dt_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        evolveG2(Q, F, nx_, ny_, dy_, dt_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    //Step 1 => evolve y first, then x
+    else {
+        //Compute fluxes along the y axis and evolve
+        minmodSlopeY(Q, Qx, theta_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        computeFluxG(Q, Qx, F, g_, dy_, dt_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        evolveG2(Q, F, nx_, ny_, dy_, dt_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        
+        //Set boundary conditions
+        noFlowBoundary2(Q, nx_, ny_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        
+        //Compute fluxes along the x axis and evolve
+        minmodSlopeX(Q, Qx, theta_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        computeFluxF(Q, Qx, F, g_, dx_, dt_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        evolveF2(Q, F, nx_, ny_, dx_, dt_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    
+    
+    // Write to main memory for all internal cells
+    writeBlock2(h1_ptr_, h1_pitch_,
+                hu1_ptr_, hu1_pitch_,
+                hv1_ptr_, hv1_pitch_,
+                Q, nx_, ny_);
+}
--- a/SWESimulators/KP07_kernel.opencl
+++ b/SWESimulators/KP07_kernel.opencl
@@ -0,0 +1,236 @@
+/*
+This OpenCL kernel implements the Kurganov-Petrova numerical scheme 
+for the shallow water equations, described in 
+A. Kurganov & Guergana Petrova
+A Second-Order Well-Balanced Positivity Preserving Central-Upwind
+Scheme for the Saint-Venant System Communications in Mathematical
+Sciences, 5 (2007), 133-160. 
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+#include "common.opencl"
+
+
+
+void computeFluxF(__local float Q[3][block_height+4][block_width+4],
+                  __local float Qx[3][block_height+2][block_width+2],
+                  __local float F[3][block_height+1][block_width+1],
+                  const float g_) {
+    // Computes the fluxes along the x axis: for each of the block_height rows
+    // and block_width+1 cell faces, the values on either side of the face are
+    // reconstructed from Q and the slopes Qx and handed to CentralUpwindFlux.
+    
+    //Loop strides: the work-group threads cooperatively cover all faces
+    const int stride_x = get_local_size(0);
+    const int stride_y = get_local_size(1);
+    
+    for (int row=get_local_id(1); row<block_height; row+=stride_y) {
+        const int q_row = row + 2; //Skip ghost cells
+        
+        for (int face=get_local_id(0); face<block_width+1; face+=stride_x) {
+            const int q_col = face + 1;
+            
+            //Cell average and slope of the cell left of the face
+            const float3 avg_l   = (float3)(Q[0][q_row][q_col],
+                                            Q[1][q_row][q_col],
+                                            Q[2][q_row][q_col]);
+            const float3 slope_l = (float3)(Qx[0][row][face],
+                                            Qx[1][row][face],
+                                            Qx[2][row][face]);
+            
+            //Cell average and slope of the cell right of the face
+            const float3 avg_r   = (float3)(Q[0][q_row][q_col+1],
+                                            Q[1][q_row][q_col+1],
+                                            Q[2][q_row][q_col+1]);
+            const float3 slope_r = (float3)(Qx[0][row][face+1],
+                                            Qx[1][row][face+1],
+                                            Qx[2][row][face+1]);
+            
+            //Q at the interface, seen from the left (minus) and right (plus)
+            const float3 q_minus = avg_l + 0.5f*slope_l;
+            const float3 q_plus  = avg_r - 0.5f*slope_r;
+            
+            //Compute and store the flux
+            const float3 flux = CentralUpwindFlux(q_minus, q_plus, g_);
+            F[0][row][face] = flux.x;
+            F[1][row][face] = flux.y;
+            F[2][row][face] = flux.z;
+        }
+    }
+}
+
+void computeFluxG(__local float Q[3][block_height+4][block_width+4],
+                  __local float Qy[3][block_height+2][block_width+2],
+                  __local float G[3][block_height+1][block_width+1],
+                  const float g_) {
+    // Computes the fluxes along the y axis: for each of the block_height+1
+    // cell faces and block_width columns, the values on either side of the
+    // face are reconstructed from Q and the slopes Qy and handed to
+    // CentralUpwindFlux. hu and hv are swapped on input and swapped back on
+    // output so that the x-direction flux function can be reused.
+    
+    //Loop strides: the work-group threads cooperatively cover all faces
+    const int stride_x = get_local_size(0);
+    const int stride_y = get_local_size(1);
+    
+    for (int face=get_local_id(1); face<block_height+1; face+=stride_y) {
+        const int q_row = face + 1;
+        
+        for (int col=get_local_id(0); col<block_width; col+=stride_x) {
+            const int q_col = col + 2; //Skip ghost cells
+            
+            //Cell average and slope of the cell below the face (hu/hv swapped)
+            const float3 avg_l   = (float3)(Q[0][q_row][q_col],
+                                            Q[2][q_row][q_col],
+                                            Q[1][q_row][q_col]);
+            const float3 slope_l = (float3)(Qy[0][face][col],
+                                            Qy[2][face][col],
+                                            Qy[1][face][col]);
+            
+            //Cell average and slope of the cell above the face (hu/hv swapped)
+            const float3 avg_r   = (float3)(Q[0][q_row+1][q_col],
+                                            Q[2][q_row+1][q_col],
+                                            Q[1][q_row+1][q_col]);
+            const float3 slope_r = (float3)(Qy[0][face+1][col],
+                                            Qy[2][face+1][col],
+                                            Qy[1][face+1][col]);
+            
+            //Q at the interface, seen from below (minus) and above (plus)
+            const float3 q_minus = avg_l + 0.5f*slope_l;
+            const float3 q_plus  = avg_r - 0.5f*slope_r;
+            
+            //Compute the flux and store it with hu and hv swapped back
+            const float3 flux = CentralUpwindFlux(q_minus, q_plus, g_);
+            G[0][face][col] = flux.x;
+            G[1][face][col] = flux.z;
+            G[2][face][col] = flux.y;
+        }
+    }
+}
+
+
+
+
+/**
+  * This unsplit kernel computes the 2D numerical scheme with a TVD RK2 time integration scheme.
+  *
+  * Each launch performs one stage of the integrator, selected by step_:
+  *   step_ == 0: writes the forward-Euler update of the input state to the output buffers
+  *   step_ == 1: averages the forward-Euler update of the input state with the
+  *               values already stored in the output buffers, and writes the
+  *               result back to the output buffers
+  * Coriolis force, wind stress and bottom friction are included as source terms.
+  */
+__kernel void swe_2D(
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+        float g_,
+        
+        float theta_, //< Parameter passed on to the minmod slope reconstruction
+        
+        float f_, //< Coriolis coefficient
+        float r_, //< Bottom friction coefficient
+        
+        int step_, //< RK2 stage to compute (0 or 1)
+        
+        //Input h^n
+        __global float* h0_ptr_, int h0_pitch_,
+        __global float* hu0_ptr_, int hu0_pitch_,
+        __global float* hv0_ptr_, int hv0_pitch_,
+        
+        //Output h^{n+1}
+        __global float* h1_ptr_, int h1_pitch_,
+        __global float* hu1_ptr_, int hu1_pitch_,
+        __global float* hv1_ptr_, int hv1_pitch_,
+        
+        //Wind stress parameters
+        int wind_stress_type_, 
+        float tau0_, float rho_, float alpha_, float xm_, float Rc_,
+        float x0_, float y0_,
+        float u0_, float v0_,
+        float t_) {
+        
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+
+    //Index of cell within domain
+    const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
+    const int tj = get_global_id(1) + 2;
+    
+    //Shared memory variables
+    __local float Q[3][block_height+4][block_width+4];
+    
+    //The following slightly wastes memory, but enables us to reuse the 
+    //functions in common.opencl
+    __local float Qx[3][block_height+2][block_width+2];
+    __local float Qy[3][block_height+2][block_width+2];
+    __local float F[3][block_height+1][block_width+1];
+    __local float G[3][block_height+1][block_width+1];
+    
+    
+    
+    //Read into shared memory
+    readBlock2(h0_ptr_, h0_pitch_,
+               hu0_ptr_, hu0_pitch_,
+               hv0_ptr_, hv0_pitch_,
+               Q, nx_, ny_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    
+    //Fix boundary conditions
+    noFlowBoundary2(Q, nx_, ny_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    
+    //Reconstruct slopes along the x and y axis
+    minmodSlopeX(Q, Qx, theta_);
+    minmodSlopeY(Q, Qy, theta_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    
+    //Compute fluxes along the x and y axis
+    computeFluxF(Q, Qx, F, g_);
+    computeFluxG(Q, Qy, G, g_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    
+    //Sum fluxes and advance in time for all internal cells
+    //(all barriers are above this point, so threads outside the domain can
+    //safely skip the update)
+    if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) {
+        const int i = tx + 2; //Skip local ghost cells, i.e., +2
+        const int j = ty + 2;
+        
+        //Wind stress source terms
+        const float X = windStressX(
+            wind_stress_type_, 
+            dx_, dy_, dt_,
+            tau0_, rho_, alpha_, xm_, Rc_,
+            x0_, y0_,
+            u0_, v0_,
+            t_);
+        const float Y = windStressY(
+            wind_stress_type_, 
+            dx_, dy_, dt_,
+            tau0_, rho_, alpha_, xm_, Rc_,
+            x0_, y0_,
+            u0_, v0_,
+            t_);
+        
+        //Forward-Euler update: flux differences plus wind stress and Coriolis terms
+        const float h1  = Q[0][j][i] + (F[0][ty][tx] - F[0][ty  ][tx+1]) * dt_ / dx_ 
+                                     + (G[0][ty][tx] - G[0][ty+1][tx  ]) * dt_ / dy_;
+        const float hu1 = Q[1][j][i] + (F[1][ty][tx] - F[1][ty  ][tx+1]) * dt_ / dx_ 
+                                     + (G[1][ty][tx] - G[1][ty+1][tx  ]) * dt_ / dy_
+                                     + dt_*X - dt_*f_*Q[2][j][i];
+        const float hv1 = Q[2][j][i] + (F[2][ty][tx] - F[2][ty  ][tx+1]) * dt_ / dx_ 
+                                     + (G[2][ty][tx] - G[2][ty+1][tx  ]) * dt_ / dy_
+                                     + dt_*Y + dt_*f_*Q[1][j][i];
+
+        //Rows of this cell in the (pitched) output buffers
+        __global float* const h_row  = (__global float*) ((__global char*) h1_ptr_ + h1_pitch_*tj);
+        __global float* const hu_row = (__global float*) ((__global char*) hu1_ptr_ + hu1_pitch_*tj);
+        __global float* const hv_row = (__global float*) ((__global char*) hv1_ptr_ + hv1_pitch_*tj);
+        
+        //Bottom friction factor, applied as a divisor (1 + C) on the momentum
+        //NOTE(review): divides by the water depth Q[0][j][i]; a dry cell (h == 0)
+        //would give inf/NaN here -- confirm that h > 0 is guaranteed
+        const float C = 2.0f*r_*dt_/Q[0][j][i];
+                    
+        if  (step_ == 0) {
+            //First step of RK2 ODE integrator
+            
+            h_row[ti] = h1;
+            hu_row[ti] = hu1 / (1.0f + C);
+            hv_row[ti] = hv1 / (1.0f + C);
+        }
+        else if (step_ == 1) {
+            //Second step of RK2 ODE integrator
+            
+            //First read Q^n
+            //NOTE(review): this reads the *output* buffers, so the host presumably
+            //arranges for them to hold Q^n at this point -- confirm against the driver
+            const float h_a  = h_row[ti];
+            const float hu_a = hu_row[ti];
+            const float hv_a = hv_row[ti];
+            
+            //Compute Q^n+1
+            const float h_b  = 0.5f*(h_a + h1);
+            const float hu_b = 0.5f*(hu_a + hu1);
+            const float hv_b = 0.5f*(hv_a + hv1);
+            
+            //Write to main memory
+            h_row[ti] = h_b;
+            hu_row[ti] = hu_b / (1.0f + 0.5f*C);
+            hv_row[ti] = hv_b / (1.0f + 0.5f*C);
+        }
+    }
+}
--- a/SWESimulators/LxF.py
+++ b/SWESimulators/LxF.py
@@ -0,0 +1,133 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements the classical Lax-Friedrichs numerical
+scheme for the shallow water equations
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+#Import packages we need
+import numpy as np
+import pyopencl as cl #OpenCL in Python
+import Common
+
+
+
+        
+        
+        
+        
+        
+        
+
+
+"""
+Class that solves the SW equations using the classical Lax-Friedrichs scheme
+"""
+class LxF:
+
+    """
+    Initialization routine
+    h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
+    hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
+    hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
+    nx: Number of cells along x-axis
+    ny: Number of cells along y-axis
+    dx: Grid cell spacing along x-axis (20 000 m)
+    dy: Grid cell spacing along y-axis (20 000 m)
+    dt: Size of each timestep (90 s)
+    g: Gravitational accelleration (9.81 m/s^2)
+    """
+    def __init__(self, \
+                 cl_ctx, \
+                 h0, hu0, hv0, \
+                 nx, ny, \
+                 dx, dy, dt, \
+                 g, \
+                 block_width=16, block_height=16):
+        self.cl_ctx = cl_ctx
+
+        #Create an OpenCL command queue
+        self.cl_queue = cl.CommandQueue(self.cl_ctx)
+
+        #Get kernels
+        self.lxf_kernel = Common.get_kernel(self.cl_ctx, "LxF_kernel.opencl", block_width, block_height)
+        
+        #Create data by uploading to device
+        ghost_cells_x = 1
+        ghost_cells_y = 1
+        self.cl_data = Common.SWEDataArkawaA(self.cl_ctx, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
+        
+        #Save input parameters
+        #Notice that we need to specify them in the correct dataformat for the
+        #OpenCL kernel
+        self.nx = np.int32(nx)
+        self.ny = np.int32(ny)
+        self.dx = np.float32(dx)
+        self.dy = np.float32(dy)
+        self.dt = np.float32(dt)
+        self.g = np.float32(g)
+        
+        #Initialize time
+        self.t = np.float32(0.0)
+        
+        #Compute kernel launch parameters
+        self.local_size = (block_width, block_height) 
+        self.global_size = ( \
+                       int(np.ceil(self.nx / float(self.local_size[0])) * self.local_size[0]), \
+                       int(np.ceil(self.ny / float(self.local_size[1])) * self.local_size[1]) \
+                      ) 
+    
+    
+    
+    
+    """
+    Function which steps n timesteps
+    """
+    def step(self, t_end=0.0):
+        n = int(t_end / self.dt + 1)
+        
+        for i in range(0, n):        
+            local_dt = np.float32(min(self.dt, t_end-i*self.dt))
+            
+            if (local_dt <= 0.0):
+                break
+        
+            self.lxf_kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+                    self.nx, self.ny, \
+                    self.dx, self.dy, local_dt, \
+                    self.g, \
+                    self.cl_data.h0.data, self.cl_data.h0.pitch, \
+                    self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                    self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                    self.cl_data.h1.data, self.cl_data.h1.pitch, \
+                    self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                    self.cl_data.hv1.data, self.cl_data.hv1.pitch)
+                
+            self.t += local_dt
+            
+            self.cl_data.swap()
+        
+        return self.t
+        
+        
+    
+    
+    
+    def download(self):
+        return self.cl_data.download(self.cl_queue)
+
--- a/SWESimulators/LxF_kernel.opencl
+++ b/SWESimulators/LxF_kernel.opencl
@@ -0,0 +1,158 @@
+/*
+This OpenCL kernel implements the classical Lax-Friedrichs scheme
+for the shallow water equations, with edge fluxes.
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include "common.opencl"
+
+
+/**
+  * Evaluates the Lax-Friedrichs flux across every x-direction cell face of
+  * the block (block_height rows, block_width+1 faces per row) and stores it in F
+  */
+void computeFluxF(__local float Q[3][block_height+2][block_width+2],
+                  __local float F[3][block_height][block_width+1],
+                  const float g_, const float dx_, const float dt_) {
+    //Loop strides: the work-group threads cooperatively cover all faces
+    const int stride_x = get_local_size(0);
+    const int stride_y = get_local_size(1);
+    
+    for (int row=get_local_id(1); row<block_height; row+=stride_y) {
+        const int q_row = row + 1; //Skip ghost cells
+        
+        for (int face=get_local_id(0); face<block_width+1; face+=stride_x) {
+            // Q in the cell to the left (minus) and right (plus) of the face
+            const float3 q_minus = (float3)(Q[0][q_row][face],
+                                            Q[1][q_row][face],
+                                            Q[2][q_row][face]);
+            const float3 q_plus  = (float3)(Q[0][q_row][face+1],
+                                            Q[1][q_row][face+1],
+                                            Q[2][q_row][face+1]);
+            
+            // Compute and store the flux
+            const float3 flux = LxF_2D_flux(q_minus, q_plus, g_, dx_, dt_);
+            F[0][row][face] = flux.x;
+            F[1][row][face] = flux.y;
+            F[2][row][face] = flux.z;
+        }
+    }
+}
+
+
+/**
+  * Evaluates the Lax-Friedrichs flux across every y-direction cell face of
+  * the block (block_height+1 faces, block_width columns) and stores it in G
+  */
+void computeFluxG(__local float Q[3][block_height+2][block_width+2],
+                  __local float G[3][block_height+1][block_width],
+                  const float g_, const float dy_, const float dt_) {
+    //Loop strides: the work-group threads cooperatively cover all faces
+    const int stride_x = get_local_size(0);
+    const int stride_y = get_local_size(1);
+    
+    for (int face=get_local_id(1); face<block_height+1; face+=stride_y) {
+        for (int col=get_local_id(0); col<block_width; col+=stride_x) {
+            const int q_col = col + 1; //Skip ghost cells
+            
+            // Q in the cell below (minus) and above (plus) the face
+            // Note that hu and hv are swapped so the same flux function can be used
+            const float3 q_minus = (float3)(Q[0][face][q_col],
+                                            Q[2][face][q_col],
+                                            Q[1][face][q_col]);
+            const float3 q_plus  = (float3)(Q[0][face+1][q_col],
+                                            Q[2][face+1][q_col],
+                                            Q[1][face+1][q_col]);
+
+            // Compute the flux and store it with hu and hv swapped back
+            const float3 flux = LxF_2D_flux(q_minus, q_plus, g_, dy_, dt_);
+            G[0][face][col] = flux.x;
+            G[1][face][col] = flux.z;
+            G[2][face][col] = flux.y;
+        }
+    }
+}
+
+
+__kernel void swe_2D(
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+        float g_,
+        
+        //Input h^n
+        __global float* h0_ptr_, int h0_pitch_,
+        __global float* hu0_ptr_, int hu0_pitch_,
+        __global float* hv0_ptr_, int hv0_pitch_,
+        
+        //Output h^{n+1}
+        __global float* h1_ptr_, int h1_pitch_,
+        __global float* hu1_ptr_, int hu1_pitch_,
+        __global float* hv1_ptr_, int hv1_pitch_) {
+    // One Lax-Friedrichs timestep: load the block into local memory, apply the
+    // boundary conditions, compute the face fluxes, and write the updated
+    // state of every internal cell to the output buffers.
+    
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    //Index of cell within domain, skipping the global ghost cells (+1)
+    const int ti = get_global_id(0) + 1;
+    const int tj = get_global_id(1) + 1;
+    
+    //Local memory: state, x-fluxes and y-fluxes of the block
+    __local float Q[3][block_height+2][block_width+2];
+    __local float F[3][block_height][block_width+1];
+    __local float G[3][block_height+1][block_width];
+    
+    //Read into shared memory
+    readBlock1(h0_ptr_, h0_pitch_,
+               hu0_ptr_, hu0_pitch_,
+               hv0_ptr_, hv0_pitch_,
+               Q, nx_, ny_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    //Set boundary conditions
+    noFlowBoundary1(Q, nx_, ny_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    //Compute fluxes along the x and y axis
+    computeFluxF(Q, F, g_, dx_, dt_);
+    computeFluxG(Q, G, g_, dy_, dt_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    //Only internal cells are evolved; no barriers follow, so all other
+    //threads are done
+    if (ti <= 0 || ti >= nx_+1 || tj <= 0 || tj >= ny_+1) {
+        return;
+    }
+    
+    //Position of this cell in Q, skipping the local ghost cells (+1)
+    const int i = tx + 1;
+    const int j = ty + 1;
+    
+    //Add the flux differences along both axes to the current state
+    const float h_new  = Q[0][j][i] + (F[0][ty][tx] - F[0][ty  ][tx+1]) * dt_ / dx_ 
+                                    + (G[0][ty][tx] - G[0][ty+1][tx  ]) * dt_ / dy_;
+    const float hu_new = Q[1][j][i] + (F[1][ty][tx] - F[1][ty  ][tx+1]) * dt_ / dx_ 
+                                    + (G[1][ty][tx] - G[1][ty+1][tx  ]) * dt_ / dy_;
+    const float hv_new = Q[2][j][i] + (F[2][ty][tx] - F[2][ty  ][tx+1]) * dt_ / dx_ 
+                                    + (G[2][ty][tx] - G[2][ty+1][tx  ]) * dt_ / dy_;
+
+    //Rows of this cell in the (pitched) output buffers
+    __global float* const h_out  = (__global float*) ((__global char*) h1_ptr_ + h1_pitch_*tj);
+    __global float* const hu_out = (__global float*) ((__global char*) hu1_ptr_ + hu1_pitch_*tj);
+    __global float* const hv_out = (__global float*) ((__global char*) hv1_ptr_ + hv1_pitch_*tj);
+    
+    //Write to main memory
+    h_out[ti] = h_new;
+    hu_out[ti] = hu_new;
+    hv_out[ti] = hv_new;
+}
--- a/SWESimulators/PlotHelper.py
+++ b/SWESimulators/PlotHelper.py
@@ -0,0 +1,165 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python class aids in plotting results from the numerical 
+simulations
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+
+from matplotlib import pyplot as plt
+import matplotlib.gridspec as gridspec
+import numpy as np
+import time
+
+"""
+Class that makes plotting faster by caching the plots instead of recreating them
+"""
+class PlotHelper:
+
+    def __init__(self, fig, x_coords, y_coords, radius, eta1, u1, v1, eta2=None, u2=None, v2=None, interpolation_type='spline36'):
+        self.ny, self.nx = eta1.shape
+        self.fig = fig;
+        
+        fig.set_figheight(15)
+        fig.set_figwidth(15)
+        
+        min_x = np.min(x_coords[:,0]);
+        min_y = np.min(y_coords[0,:]);
+        
+        max_x = np.max(x_coords[0,:]);
+        max_y = np.max(y_coords[:,0]);
+        
+        domain_extent = [ x_coords[0, 0], x_coords[0, -1], y_coords[0, 0], y_coords[-1, 0] ]
+        
+        if (eta2 is not None):
+            assert(u2 is not None)
+            assert(v2 is not None)
+            self.gs = gridspec.GridSpec(3, 3)
+        else:
+            self.gs = gridspec.GridSpec(2, 3)
+        
+        ax = self.fig.add_subplot(self.gs[0, 0])
+        self.sp_eta = plt.imshow(eta1, interpolation=interpolation_type, origin='bottom', vmin=-0.05, vmax=0.05, extent=domain_extent)
+        plt.axis('tight')
+        ax.set_aspect('equal')
+        plt.title('Eta')
+        plt.colorbar()
+        
+        ax = self.fig.add_subplot(self.gs[0, 1])
+        self.sp_u = plt.imshow(u1, interpolation=interpolation_type, origin='bottom', vmin=-1.5, vmax=1.5, extent=domain_extent)
+        plt.axis('tight')
+        ax.set_aspect('equal')
+        plt.title('U')
+        plt.colorbar()
+        
+        ax = self.fig.add_subplot(self.gs[0, 2])
+        self.sp_v = plt.imshow(v1, interpolation=interpolation_type, origin='bottom', vmin=-1.5, vmax=1.5, extent=domain_extent)
+        plt.axis('tight')
+        ax.set_aspect('equal')
+        plt.title('V')
+        plt.colorbar()
+            
+        ax = self.fig.add_subplot(self.gs[1, 0])
+        self.sp_radial1, = plt.plot(radius.ravel(), eta1.ravel(), '.')
+        plt.axis([0, min(max_x, max_y), -1.5, 1])
+        plt.title('Eta Radial plot')
+
+        ax = self.fig.add_subplot(self.gs[1, 1])
+        self.sp_x_axis1, = plt.plot(x_coords[self.ny/2,:], eta1[self.ny/2,:], 'k+--', label='x-axis')
+        self.sp_y_axis1, = plt.plot(y_coords[:,self.nx/2], eta1[:,self.nx/2], 'kx:', label='y-axis')
+        plt.axis([max(min_x, min_y), min(max_x, max_y), -1.5, 1])
+        plt.title('Eta along axis')
+        plt.legend()
+
+        ax = self.fig.add_subplot(self.gs[1, 2])
+        self.sp_x_diag1, = plt.plot(1.41*np.diagonal(x_coords, offset=-abs(self.nx-self.ny)/2), \
+                                   np.diagonal(eta1, offset=-abs(self.nx-self.ny)/2), \
+                                   'k+--', label='x = -y')
+        self.sp_y_diag1, = plt.plot(1.41*np.diagonal(y_coords.T, offset=abs(self.nx-self.ny)/2), \
+                                   np.diagonal(eta1.T, offset=abs(self.nx-self.ny)/2), \
+                                   'kx:', label='x = y')
+        plt.axis([max(min_x, min_y), min(max_x, max_y), -1.5, 1])
+        plt.title('Eta along diagonal')
+        plt.legend()
+        
+        
+        if (eta2 is not None):
+            ax = self.fig.add_subplot(self.gs[2, 0])
+            self.sp_radial2, = plt.plot(radius.ravel(), eta2.ravel(), '.')
+            plt.axis([0, min(max_x, max_y), -1.5, 1])
+            plt.title('Eta2 Radial plot')
+
+            ax = self.fig.add_subplot(self.gs[2, 1])
+            self.sp_x_axis2, = plt.plot(x_coords[self.ny/2,:], eta2[self.ny/2,:], 'k+--', label='x-axis')
+            self.sp_y_axis2, = plt.plot(y_coords[:,self.nx/2], eta2[:,self.nx/2], 'kx:', label='y-axis')
+            plt.axis([max(min_x, min_y), min(max_x, max_y), -1.5, 1])
+            plt.title('Eta2 along axis')
+            plt.legend()
+
+            ax = self.fig.add_subplot(self.gs[2, 2])
+            self.sp_x_diag2, = plt.plot(1.41*np.diagonal(x_coords, offset=-abs(self.nx-self.ny)/2), \
+                                       np.diagonal(eta2, offset=-abs(self.nx-self.ny)/2), \
+                                       'k+--', label='x = -y')
+            self.sp_y_diag2, = plt.plot(1.41*np.diagonal(y_coords.T, offset=abs(self.nx-self.ny)/2), \
+                                       np.diagonal(eta2.T, offset=abs(self.nx-self.ny)/2), \
+                                       'kx:', label='x = y')
+            plt.axis([max(min_x, min_y), min(max_x, max_y), -1.5, 1])
+            plt.title('Eta2 along diagonal')
+            plt.legend()
+        
+        
+        
+        
+        
+    def plot(self, eta1, u1, v1, eta2=None, u2=None, v2=None):
+        self.fig.add_subplot(self.gs[0, 0])
+        self.sp_eta.set_data(eta1)
+
+        self.fig.add_subplot(self.gs[0, 1])
+        self.sp_u.set_data(u1)
+
+        self.fig.add_subplot(self.gs[0, 2])
+        self.sp_v.set_data(v1)
+            
+        self.fig.add_subplot(self.gs[1, 0])
+        self.sp_radial1.set_ydata(eta1.ravel());
+
+        self.fig.add_subplot(self.gs[1, 1])
+        self.sp_x_axis1.set_ydata(eta1[(self.ny+2)/2,:])
+        self.sp_y_axis1.set_ydata(eta1[:,(self.nx+2)/2])
+
+        self.fig.add_subplot(self.gs[1, 2])
+        self.sp_x_diag1.set_ydata(np.diagonal(eta1, offset=-abs(self.nx-self.ny)/2))
+        self.sp_y_diag1.set_ydata(np.diagonal(eta1.T, offset=abs(self.nx-self.ny)/2))
+        
+        if (eta2 is not None):
+            self.fig.add_subplot(self.gs[2, 0])
+            self.sp_radial2.set_ydata(eta2.ravel());
+
+            self.fig.add_subplot(self.gs[2, 1])
+            self.sp_x_axis2.set_ydata(eta2[(self.ny+2)/2,:])
+            self.sp_y_axis2.set_ydata(eta2[:,(self.nx+2)/2])
+
+            self.fig.add_subplot(self.gs[2, 2])
+            self.sp_x_diag2.set_ydata(np.diagonal(eta2, offset=-abs(self.nx-self.ny)/2))
+            self.sp_y_diag2.set_ydata(np.diagonal(eta2.T, offset=abs(self.nx-self.ny)/2))
+        
+        plt.draw()
+        time.sleep(0.001)
+        
+        
--- a/SWESimulators/WAF.py
+++ b/SWESimulators/WAF.py
@@ -0,0 +1,144 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements the Weighted average flux (WAF) described in
+E. Toro, Shock-Capturing methods for free-surface shallow flows, 2001
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+#Import packages we need
+import numpy as np
+import pyopencl as cl #OpenCL in Python
+import Common
+
+
+
+
+
+"""
+Class that solves the SW equations using the Weighted Average Flux (WAF) scheme
+"""
+class WAF:
+
+    """
+    Initialization routine
+    h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
+    hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
+    hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
+    nx: Number of cells along x-axis
+    ny: Number of cells along y-axis
+    dx: Grid cell spacing along x-axis (20 000 m)
+    dy: Grid cell spacing along y-axis (20 000 m)
+    dt: Size of each timestep (90 s)
+    g: Gravitational accelleration (9.81 m/s^2)
+    """
+    def __init__(self, \
+                 cl_ctx, \
+                 h0, hu0, hv0, \
+                 nx, ny, \
+                 dx, dy, dt, \
+                 g, \
+                 block_width=16, block_height=16):
+        self.cl_ctx = cl_ctx
+                 
+        #Create an OpenCL command queue
+        self.cl_queue = cl.CommandQueue(self.cl_ctx)
+
+        #Get kernels
+        self.kernel = Common.get_kernel(self.cl_ctx, "WAF_kernel.opencl", block_width, block_height)
+        
+        #Create data by uploading to device
+        ghost_cells_x = 2
+        ghost_cells_y = 2
+        self.cl_data = Common.SWEDataArkawaA(self.cl_ctx, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
+        
+        #Save input parameters
+        #Notice that we need to specify them in the correct dataformat for the
+        #OpenCL kernel
+        self.nx = np.int32(nx)
+        self.ny = np.int32(ny)
+        self.dx = np.float32(dx)
+        self.dy = np.float32(dy)
+        self.dt = np.float32(dt)
+        self.g = np.float32(g)
+        
+        #Initialize time
+        self.t = np.float32(0.0)
+        
+        #Compute kernel launch parameters
+        self.local_size = (block_width, block_height) 
+        self.global_size = ( \
+                       int(np.ceil(self.nx / float(self.local_size[0])) * self.local_size[0]), \
+                       int(np.ceil(self.ny / float(self.local_size[1])) * self.local_size[1]) \
+                      ) 
+    
+    
+    
+    
+    """
+    Function which steps n timesteps
+    """
+    def step(self, t_end=0.0):
+        n = int(t_end / (2.0*self.dt) + 1)
+        
+        for i in range(0, n):
+            #Dimensional splitting: second order accurate for every other timestep,
+            #thus run two timesteps in a go
+            
+            local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt))
+            if (local_dt <= 0.0):
+                break
+                
+            #Along X, then Y
+            self.kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+                    self.nx, self.ny, \
+                    self.dx, self.dy, local_dt, \
+                    self.g, \
+                    np.int32(0), \
+                    self.cl_data.h0.data,  self.cl_data.h0.pitch,  \
+                    self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                    self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                    self.cl_data.h1.data,  self.cl_data.h1.pitch,  \
+                    self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                    self.cl_data.hv1.data, self.cl_data.hv1.pitch)
+            self.cl_data.swap()
+            
+            #Along Y, then X
+            self.kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+                    self.nx, self.ny, \
+                    self.dx, self.dy, local_dt, \
+                    self.g, \
+                    np.int32(1), \
+                    self.cl_data.h0.data,  self.cl_data.h0.pitch,  \
+                    self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                    self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                    self.cl_data.h1.data,  self.cl_data.h1.pitch,  \
+                    self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                    self.cl_data.hv1.data, self.cl_data.hv1.pitch)
+            self.cl_data.swap()
+                
+            self.t += local_dt
+            
+        
+        return self.t
+    
+    
+    
+    
+    def download(self):
+        return self.cl_data.download(self.cl_queue)
+
--- a/SWESimulators/WAF_kernel.opencl
+++ b/SWESimulators/WAF_kernel.opencl
@@ -0,0 +1,191 @@
+/*
+This OpenCL kernel implements the Weighted Average Flux (WAF) numerical
+scheme for the shallow water equations, described in
+E. Toro, Shock-Capturing methods for free-surface shallow flows, 2001
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+#include "common.opencl"
+
+
+
+/**
+  * Computes the flux through every x-face of the block with WAF_1D_flux.
+  * F[.][j][i] holds the flux of interior row j through the face between
+  * local columns i+1 and i+2 of Q; the stencil spans columns i .. i+3.
+  */
+void computeFluxF(__local float Q[3][block_height+4][block_width+4],
+                  __local float F[3][block_height+1][block_width+1],
+                  const float g_, const float dx_, const float dt_) {
+    //Position of this work-item inside its work-group
+    const int local_x = get_local_id(0);
+    const int local_y = get_local_id(1);
+
+    for (int row=local_y; row<block_height; row+=get_local_size(1)) {
+        const int q_row = row + 2; //Interior rows follow two ghost rows
+        for (int face=local_x; face<block_width+1; face+=get_local_size(0)) {
+            const int q_col = face + 1; //Cell immediately left of the face
+
+            //Two states on each side of the face, ordered left to right
+            const float3 far_left   = (float3)(Q[0][q_row][q_col-1], Q[1][q_row][q_col-1], Q[2][q_row][q_col-1]);
+            const float3 near_left  = (float3)(Q[0][q_row][q_col  ], Q[1][q_row][q_col  ], Q[2][q_row][q_col  ]);
+            const float3 near_right = (float3)(Q[0][q_row][q_col+1], Q[1][q_row][q_col+1], Q[2][q_row][q_col+1]);
+            const float3 far_right  = (float3)(Q[0][q_row][q_col+2], Q[1][q_row][q_col+2], Q[2][q_row][q_col+2]);
+
+            //Store the three flux components for this face
+            const float3 face_flux = WAF_1D_flux(far_left, near_left, near_right, far_right, g_, dx_, dt_);
+            F[0][row][face] = face_flux.x;
+            F[1][row][face] = face_flux.y;
+            F[2][row][face] = face_flux.z;
+        }
+    }
+}
+
+
+
+
+
+
+
+
+/**
+  * Computes the flux through every y-face of the block with WAF_1D_flux.
+  * G[.][j][i] holds the flux of interior column i through the face between
+  * local rows j+1 and j+2 of Q; the stencil spans rows j .. j+3.
+  * The 1D solver is reused by swapping hu and hv on the way in and
+  * swapping the corresponding flux components on the way out.
+  */
+void computeFluxG(__local float Q[3][block_height+4][block_width+4],
+                  __local float G[3][block_height+1][block_width+1],
+                  const float g_, const float dy_, const float dt_) {
+    //Position of this work-item inside its work-group
+    const int local_x = get_local_id(0);
+    const int local_y = get_local_id(1);
+
+    for (int face=local_y; face<block_height+1; face+=get_local_size(1)) {
+        const int q_row = face + 1; //Cell immediately below the face
+        for (int col=local_x; col<block_width; col+=get_local_size(0)) {
+            const int q_col = col + 2; //Interior columns follow two ghost columns
+
+            //Two states on each side of the face, with hu and hv swapped
+            const float3 far_lower  = (float3)(Q[0][q_row-1][q_col], Q[2][q_row-1][q_col], Q[1][q_row-1][q_col]);
+            const float3 near_lower = (float3)(Q[0][q_row  ][q_col], Q[2][q_row  ][q_col], Q[1][q_row  ][q_col]);
+            const float3 near_upper = (float3)(Q[0][q_row+1][q_col], Q[2][q_row+1][q_col], Q[1][q_row+1][q_col]);
+            const float3 far_upper  = (float3)(Q[0][q_row+2][q_col], Q[2][q_row+2][q_col], Q[1][q_row+2][q_col]);
+
+            //Swap the momentum components back when storing
+            const float3 face_flux = WAF_1D_flux(far_lower, near_lower, near_upper, far_upper, g_, dy_, dt_);
+            G[0][face][col] = face_flux.x;
+            G[1][face][col] = face_flux.z;
+            G[2][face][col] = face_flux.y;
+        }
+    }
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+/**
+  * Performs one dimensionally split timestep of size dt_ per launch.
+  *
+  * nx_, ny_:  Size of the domain, passed on to the block helpers
+  * dx_, dy_:  Grid cell spacing along x and y
+  * dt_:       Size of the timestep
+  * g_:        Gravitational constant passed to the flux functions
+  * step_:     0 evolves along x first and then y, any other value evolves
+  *            along y first and then x
+  * h0/hu0/hv0: Input arrays with their pitches
+  * h1/hu1/hv1: Output arrays with their pitches
+  */
+__kernel void swe_2D(
+        int nx_, int ny_,
+        float dx_, float dy_, float dt_,
+        float g_, int step_,
+        
+        //Input h^n
+        __global float* h0_ptr_, int h0_pitch_,
+        __global float* hu0_ptr_, int hu0_pitch_,
+        __global float* hv0_ptr_, int hv0_pitch_,
+        
+        //Output h^{n+1}
+        __global float* h1_ptr_, int h1_pitch_,
+        __global float* hu1_ptr_, int hu1_pitch_,
+        __global float* hv1_ptr_, int hv1_pitch_) {    
+    //Shared memory variables
+    //Q holds h, hu and hv for the block plus two ghost cells per side;
+    //F is reused for both the x- and the y-direction fluxes
+    __local float Q[3][block_height+4][block_width+4];
+    __local float F[3][block_height+1][block_width+1];
+    
+    
+    
+    //Read into shared memory Q from global memory
+    readBlock2(h0_ptr_, h0_pitch_,
+               hu0_ptr_, hu0_pitch_,
+               hv0_ptr_, hv0_pitch_,
+               Q, nx_, ny_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    
+    //Set boundary conditions
+    noFlowBoundary2(Q, nx_, ny_);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    
+    
+    //Step 0 => evolve x first, then y
+    //step_ is a kernel argument and therefore the same for every work-item,
+    //so all work-items take the same branch and reach the same barriers
+    if (step_ == 0) {
+        //Compute fluxes along the x axis and evolve
+        computeFluxF(Q, F, g_, dx_, dt_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        evolveF2(Q, F, nx_, ny_, dx_, dt_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        
+        //Fix boundary conditions
+        noFlowBoundary2(Q, nx_, ny_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        
+        //Compute fluxes along the y axis and evolve
+        computeFluxG(Q, F, g_, dy_, dt_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        evolveG2(Q, F, nx_, ny_, dy_, dt_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    //Step 1 => evolve y first, then x
+    else {
+        //Compute fluxes along the y axis and evolve
+        computeFluxG(Q, F, g_, dy_, dt_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        evolveG2(Q, F, nx_, ny_, dy_, dt_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        
+        //Fix boundary conditions
+        noFlowBoundary2(Q, nx_, ny_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        
+        //Compute fluxes along the x axis and evolve
+        computeFluxF(Q, F, g_, dx_, dt_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        evolveF2(Q, F, nx_, ny_, dx_, dt_);
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+
+    
+    // Write to main memory for all internal cells
+    writeBlock2(h1_ptr_, h1_pitch_,
+                hu1_ptr_, hu1_pitch_,
+                hv1_ptr_, hv1_pitch_,
+                Q, nx_, ny_);
+}
--- a/SWESimulators/init.py
+++ b/SWESimulators/init.py
@@ -0,0 +1,5 @@
+#!/bin/env python
+# -*- coding: utf-8 -*-
+
+
+# Nothing general to do
--- a/SWESimulators/common.opencl
+++ b/SWESimulators/common.opencl
@@ -0,0 +1,972 @@
+/*
+This OpenCL kernel implements the Kurganov-Petrova numerical scheme 
+for the shallow water equations, described in 
+A. Kurganov & Guergana Petrova
+A Second-Order Well-Balanced Positivity Preserving Central-Upwind
+Scheme for the Saint-Venant System Communications in Mathematical
+Sciences, 5 (2007), 133-160. 
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+  * Loads one work-group tile of h, hu and hv from global memory into Q,
+  * including one ghost cell on every side. Global indices are clamped to
+  * [0, nx_+1] x [0, ny_+1] so that tiles overlapping the edge of the
+  * domain re-read the outermost row/column instead of reading outside.
+  */
+void readBlock1(__global float* h_ptr_, int h_pitch_,
+                __global float* hu_ptr_, int hu_pitch_,
+                __global float* hv_ptr_, int hv_pitch_,
+                __local float Q[3][block_height+2][block_width+2], 
+                const int nx_, const int ny_) {
+    //Position of this work-item inside its work-group
+    const int local_x = get_local_id(0);
+    const int local_y = get_local_id(1);
+
+    //Global index of the first cell handled by this work-group
+    const int block_x0 = get_local_size(0) * get_group_id(0);
+    const int block_y0 = get_local_size(1) * get_group_id(1);
+
+    //The tile is larger than the work-group, so work-items stride over it
+    for (int row=local_y; row<block_height+2; row+=get_local_size(1)) {
+        const int src_row = clamp(block_y0 + row, 0, ny_+1);
+
+        //Pitches are byte offsets between rows, hence the char* arithmetic
+        __global float* const h_src = (__global float*) ((__global char*) h_ptr_ + h_pitch_*src_row);
+        __global float* const hu_src = (__global float*) ((__global char*) hu_ptr_ + hu_pitch_*src_row);
+        __global float* const hv_src = (__global float*) ((__global char*) hv_ptr_ + hv_pitch_*src_row);
+
+        for (int col=local_x; col<block_width+2; col+=get_local_size(0)) {
+            const int src_col = clamp(block_x0 + col, 0, nx_+1);
+
+            Q[0][row][col] = h_src[src_col];
+            Q[1][row][col] = hu_src[src_col];
+            Q[2][row][col] = hv_src[src_col];
+        }
+    }
+}
+
+
+
+
+
+/**
+  * Loads one work-group tile of h, hu and hv from global memory into Q,
+  * including two ghost cells on every side. Global indices are clamped to
+  * [0, nx_+3] x [0, ny_+3] so that tiles overlapping the edge of the
+  * domain re-read the outermost row/column instead of reading outside.
+  */
+void readBlock2(__global float* h_ptr_, int h_pitch_,
+                __global float* hu_ptr_, int hu_pitch_,
+                __global float* hv_ptr_, int hv_pitch_,
+                __local float Q[3][block_height+4][block_width+4], 
+                const int nx_, const int ny_) {
+    //Position of this work-item inside its work-group
+    const int local_x = get_local_id(0);
+    const int local_y = get_local_id(1);
+
+    //Global index of the first cell handled by this work-group
+    const int block_x0 = get_local_size(0) * get_group_id(0);
+    const int block_y0 = get_local_size(1) * get_group_id(1);
+
+    //The tile is larger than the work-group, so work-items stride over it
+    for (int row=local_y; row<block_height+4; row+=get_local_size(1)) {
+        const int src_row = clamp(block_y0 + row, 0, ny_+3);
+
+        //Pitches are byte offsets between rows, hence the char* arithmetic
+        __global float* const h_src = (__global float*) ((__global char*) h_ptr_ + h_pitch_*src_row);
+        __global float* const hu_src = (__global float*) ((__global char*) hu_ptr_ + hu_pitch_*src_row);
+        __global float* const hv_src = (__global float*) ((__global char*) hv_ptr_ + hv_pitch_*src_row);
+
+        for (int col=local_x; col<block_width+4; col+=get_local_size(0)) {
+            const int src_col = clamp(block_x0 + col, 0, nx_+3);
+
+            Q[0][row][col] = h_src[src_col];
+            Q[1][row][col] = hu_src[src_col];
+            Q[2][row][col] = hv_src[src_col];
+        }
+    }
+}
+
+
+
+
+/**
+  * Stores this work-item's cell of Q (one ghost cell per side) to the
+  * global h, hu and hv arrays. Only cells with global index 1..nx_ along
+  * x and 1..ny_ along y are written; ghost cells are left untouched.
+  */
+void writeBlock1(__global float* h_ptr_, int h_pitch_,
+                 __global float* hu_ptr_, int hu_pitch_,
+                 __global float* hv_ptr_, int hv_pitch_,
+                 __local float Q[3][block_height+2][block_width+2],
+                 const int nx_, const int ny_) {
+    //Global cell index, shifted past the one global ghost cell
+    const int gx = get_global_id(0) + 1;
+    const int gy = get_global_id(1) + 1;
+
+    //Work-items that fall outside the internal cells write nothing
+    if (gx <= 0 || gx >= nx_+1 || gy <= 0 || gy >= ny_+1) {
+        return;
+    }
+
+    //Local cell index, shifted past the one local ghost cell
+    const int lx = get_local_id(0) + 1;
+    const int ly = get_local_id(1) + 1;
+
+    //Pitches are byte offsets between rows, hence the char* arithmetic
+    __global float* const h_dst  = (__global float*) ((__global char*) h_ptr_ + h_pitch_*gy);
+    __global float* const hu_dst = (__global float*) ((__global char*) hu_ptr_ + hu_pitch_*gy);
+    __global float* const hv_dst = (__global float*) ((__global char*) hv_ptr_ + hv_pitch_*gy);
+
+    h_dst[gx]  = Q[0][ly][lx];
+    hu_dst[gx] = Q[1][ly][lx];
+    hv_dst[gx] = Q[2][ly][lx];
+}
+
+
+
+
+
+/**
+  * Stores this work-item's cell of Q (two ghost cells per side) to the
+  * global h, hu and hv arrays. Only cells with global index 2..nx_+1 along
+  * x and 2..ny_+1 along y are written; ghost cells are left untouched.
+  */
+void writeBlock2(__global float* h_ptr_, int h_pitch_,
+                 __global float* hu_ptr_, int hu_pitch_,
+                 __global float* hv_ptr_, int hv_pitch_,
+                 __local float Q[3][block_height+4][block_width+4], 
+                 const int nx_, const int ny_) {
+    //Global cell index, shifted past the two global ghost cells
+    const int gx = get_global_id(0) + 2;
+    const int gy = get_global_id(1) + 2;
+
+    //Work-items that fall outside the internal cells write nothing
+    if (gx <= 1 || gx >= nx_+2 || gy <= 1 || gy >= ny_+2) {
+        return;
+    }
+
+    //Local cell index, shifted past the two local ghost cells
+    const int lx = get_local_id(0) + 2;
+    const int ly = get_local_id(1) + 2;
+
+    //Pitches are byte offsets between rows, hence the char* arithmetic
+    __global float* const h_dst  = (__global float*) ((__global char*) h_ptr_ + h_pitch_*gy);
+    __global float* const hu_dst = (__global float*) ((__global char*) hu_ptr_ + hu_pitch_*gy);
+    __global float* const hv_dst = (__global float*) ((__global char*) hv_ptr_ + hv_pitch_*gy);
+
+    h_dst[gx]  = Q[0][ly][lx];
+    hu_dst[gx] = Q[1][ly][lx];
+    hv_dst[gx] = Q[2][ly][lx];
+}
+
+
+
+
+
+
+/**
+  * No flow boundary conditions for the shallow water equations
+  * with one ghost cell in each direction.
+  *
+  * Work-items whose cell lies next to a domain edge copy their own state
+  * into the adjacent ghost cell of Q, negating the momentum component
+  * normal to that edge (Q[1] at the x edges, Q[2] at the y edges).
+  * Corner ghost cells are not written by this function.
+  */
+void noFlowBoundary1(__local float Q[3][block_height+2][block_width+2], const int nx_, const int ny_) {
+    //Global index
+    const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
+    const int tj = get_global_id(1) + 1;
+    
+    //Block-local indices
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    const int i = tx + 1; //Skip local ghost cells, i.e., +1
+    const int j = ty + 1;
+    
+    //Fix boundary conditions
+    //First interior column: mirror into the ghost column to the left
+    if (ti == 1) {
+        Q[0][j][i-1] =  Q[0][j][i];
+        Q[1][j][i-1] = -Q[1][j][i];
+        Q[2][j][i-1] =  Q[2][j][i];
+    }
+    //Last interior column: mirror into the ghost column to the right
+    if (ti == nx_) {
+        Q[0][j][i+1] =  Q[0][j][i];
+        Q[1][j][i+1] = -Q[1][j][i];
+        Q[2][j][i+1] =  Q[2][j][i];
+    }
+    //First interior row: mirror into the ghost row at j-1
+    if (tj == 1) {
+        Q[0][j-1][i] =  Q[0][j][i];
+        Q[1][j-1][i] =  Q[1][j][i];
+        Q[2][j-1][i] = -Q[2][j][i];
+    }
+    //Last interior row: mirror into the ghost row at j+1
+    if (tj == ny_) {
+        Q[0][j+1][i] =  Q[0][j][i];
+        Q[1][j+1][i] =  Q[1][j][i];
+        Q[2][j+1][i] = -Q[2][j][i];
+    }
+}
+
+
+
+
+
+
+/**
+  * No flow boundary conditions for the shallow water equations
+  * with two ghost cells in each direction.
+  *
+  * Work-items whose cell lies next to a domain edge fill both ghost cells
+  * beyond that edge: the nearest ghost cell mirrors the work-item's own
+  * cell and the outer ghost cell mirrors the next cell inwards. The
+  * momentum component normal to the edge is negated (Q[1] at the x edges,
+  * Q[2] at the y edges). Corner ghost cells are not written here.
+  */
+void noFlowBoundary2(__local float Q[3][block_height+4][block_width+4], const int nx_, const int ny_) {
+    //Global index
+    const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
+    const int tj = get_global_id(1) + 2;
+    
+    //Block-local indices
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    const int i = tx + 2; //Skip local ghost cells, i.e., +2
+    const int j = ty + 2;
+    
+    //First interior column: fill the two ghost columns to the left
+    if (ti == 2) {
+        Q[0][j][i-1] =  Q[0][j][i];
+        Q[1][j][i-1] = -Q[1][j][i];
+        Q[2][j][i-1] =  Q[2][j][i];
+        
+        Q[0][j][i-2] =  Q[0][j][i+1];
+        Q[1][j][i-2] = -Q[1][j][i+1];
+        Q[2][j][i-2] =  Q[2][j][i+1];
+    }
+    //Last interior column: fill the two ghost columns to the right
+    if (ti == nx_+1) {
+        Q[0][j][i+1] =  Q[0][j][i];
+        Q[1][j][i+1] = -Q[1][j][i];
+        Q[2][j][i+1] =  Q[2][j][i];
+        
+        Q[0][j][i+2] =  Q[0][j][i-1];
+        Q[1][j][i+2] = -Q[1][j][i-1];
+        Q[2][j][i+2] =  Q[2][j][i-1];
+    }
+    //First interior row: fill the two ghost rows at j-1 and j-2
+    if (tj == 2) {
+        Q[0][j-1][i] =  Q[0][j][i];
+        Q[1][j-1][i] =  Q[1][j][i];
+        Q[2][j-1][i] = -Q[2][j][i];
+        
+        Q[0][j-2][i] =  Q[0][j+1][i];
+        Q[1][j-2][i] =  Q[1][j+1][i];
+        Q[2][j-2][i] = -Q[2][j+1][i];
+    }
+    //Last interior row: fill the two ghost rows at j+1 and j+2
+    if (tj == ny_+1) {
+        Q[0][j+1][i] =  Q[0][j][i];
+        Q[1][j+1][i] =  Q[1][j][i];
+        Q[2][j+1][i] = -Q[2][j][i];
+        
+        Q[0][j+2][i] =  Q[0][j-1][i];
+        Q[1][j+2][i] =  Q[1][j-1][i];
+        Q[2][j+2][i] = -Q[2][j-1][i];
+    }
+}
+
+
+
+
+
+
+/**
+  * Advances this work-item's cell of Q by dt_ along the x axis
+  * (dimensional splitting, one ghost cell per side), using the difference
+  * between the fluxes stored at F[.][ty][tx] and F[.][ty][tx+1].
+  */
+void evolveF1(__local float Q[3][block_height+2][block_width+2],
+              __local float F[3][block_height+1][block_width+1],
+              const int nx_, const int ny_,
+              const float dx_, const float dt_) {
+    //Global cell index, shifted past the one global ghost cell
+    const int gx = get_global_id(0) + 1;
+    const int gy = get_global_id(1) + 1;
+
+    //Work-items that fall outside the internal cells update nothing
+    if (gx <= 0 || gx >= nx_+1 || gy <= 0 || gy >= ny_+1) {
+        return;
+    }
+
+    //Flux index (fx, fy) and local cell index (cx, cy)
+    const int fx = get_local_id(0);
+    const int fy = get_local_id(1);
+    const int cx = fx + 1;
+    const int cy = fy + 1;
+
+    //Same update for h, hu and hv
+    for (int p=0; p<3; ++p) {
+        Q[p][cy][cx] = Q[p][cy][cx] + (F[p][fy][fx] - F[p][fy][fx+1]) * dt_ / dx_;
+    }
+}
+
+
+
+
+
+
+/**
+  * Advances this work-item's cell of Q by dt_ along the x axis
+  * (dimensional splitting, two ghost cells per side), using the difference
+  * between the fluxes stored at F[.][ty][tx] and F[.][ty][tx+1].
+  */
+void evolveF2(__local float Q[3][block_height+4][block_width+4],
+              __local float F[3][block_height+1][block_width+1],
+              const int nx_, const int ny_,
+              const float dx_, const float dt_) {
+    //Global cell index, shifted past the two global ghost cells
+    const int gx = get_global_id(0) + 2;
+    const int gy = get_global_id(1) + 2;
+
+    //Work-items that fall outside the internal cells update nothing
+    if (gx <= 1 || gx >= nx_+2 || gy <= 1 || gy >= ny_+2) {
+        return;
+    }
+
+    //Flux index (fx, fy) and local cell index (cx, cy), the latter shifted
+    //past the two local ghost cells
+    const int fx = get_local_id(0);
+    const int fy = get_local_id(1);
+    const int cx = fx + 2;
+    const int cy = fy + 2;
+
+    //Same update for h, hu and hv
+    for (int p=0; p<3; ++p) {
+        Q[p][cy][cx] = Q[p][cy][cx] + (F[p][fy][fx] - F[p][fy][fx+1]) * dt_ / dx_;
+    }
+}
+
+
+
+
+
+
+/**
+  * Advances this work-item's cell of Q by dt_ along the y axis
+  * (dimensional splitting, one ghost cell per side), using the difference
+  * between the fluxes stored at G[.][ty][tx] and G[.][ty+1][tx].
+  */
+void evolveG1(__local float Q[3][block_height+2][block_width+2],
+              __local float G[3][block_height+1][block_width+1],
+              const int nx_, const int ny_,
+              const float dy_, const float dt_) {
+    //Global cell index, shifted past the one global ghost cell
+    const int gx = get_global_id(0) + 1;
+    const int gy = get_global_id(1) + 1;
+
+    //Work-items that fall outside the internal cells update nothing
+    if (gx <= 0 || gx >= nx_+1 || gy <= 0 || gy >= ny_+1) {
+        return;
+    }
+
+    //Flux index (fx, fy) and local cell index (cx, cy)
+    const int fx = get_local_id(0);
+    const int fy = get_local_id(1);
+    const int cx = fx + 1;
+    const int cy = fy + 1;
+
+    //Same update for h, hu and hv
+    for (int p=0; p<3; ++p) {
+        Q[p][cy][cx] = Q[p][cy][cx] + (G[p][fy][fx] - G[p][fy+1][fx]) * dt_ / dy_;
+    }
+}
+
+
+
+
+
+
+
+/**
+  * Advances this work-item's cell of Q by dt_ along the y axis
+  * (dimensional splitting, two ghost cells per side), using the difference
+  * between the fluxes stored at G[.][ty][tx] and G[.][ty+1][tx].
+  */
+void evolveG2(__local float Q[3][block_height+4][block_width+4],
+              __local float G[3][block_height+1][block_width+1],
+              const int nx_, const int ny_,
+              const float dy_, const float dt_) {
+    //Global cell index, shifted past the two global ghost cells
+    const int gx = get_global_id(0) + 2;
+    const int gy = get_global_id(1) + 2;
+
+    //Work-items that fall outside the internal cells update nothing
+    if (gx <= 1 || gx >= nx_+2 || gy <= 1 || gy >= ny_+2) {
+        return;
+    }
+
+    //Flux index (fx, fy) and local cell index (cx, cy), the latter shifted
+    //past the two local ghost cells
+    const int fx = get_local_id(0);
+    const int fy = get_local_id(1);
+    const int cx = fx + 2;
+    const int cy = fy + 2;
+
+    //Same update for h, hu and hv
+    for (int p=0; p<3; ++p) {
+        Q[p][cy][cx] = Q[p][cy][cx] + (G[p][fy][fx] - G[p][fy+1][fx]) * dt_ / dy_;
+    }
+}
+
+
+
+
+
+
+
+
+
+
+/**
+  * Minmod-limited slope from three consecutive values.
+  * The backward and forward differences are scaled by theta and the central
+  * difference by 0.5. If the three candidates do not all share a sign the
+  * result is zero; otherwise it is the candidate of smallest magnitude,
+  * carrying the common sign.
+  */
+float minmodSlope(float left, float center, float right, float theta) {
+    //Candidate slopes
+    const float d_back = (center - left) * theta;
+    const float d_mid  = (right - left) * 0.5f;
+    const float d_fwd  = (right - center) * theta;
+
+    //Signs (+1 or -1) of the candidates
+    const float s_back = copysign(1.0f, d_back);
+    const float s_mid  = copysign(1.0f, d_mid);
+    const float s_fwd  = copysign(1.0f, d_fwd);
+
+    //Smallest magnitude among the candidates
+    const float smallest = min( min(fabs(d_back), fabs(d_mid)), fabs(d_fwd) );
+
+    //Each sign sum is 0 when the signs differ and +-2 when they agree,
+    //so the product of the first four factors is 0 or +-1
+    return 0.25f
+        *s_back
+        *(s_back + s_mid)
+        *(s_mid + s_fwd)
+        *smallest;
+}
+
+
+
+
+/**
+  * Fills Qx with minmod-limited slopes along x for every interior row of
+  * the block. Qx[.][j][i] is the slope of local cell (j+2, i+1) of Q.
+  */
+void minmodSlopeX(__local float  Q[3][block_height+4][block_width+4],
+                  __local float Qx[3][block_height+2][block_width+2],
+                  const float theta_) {
+    //Position of this work-item inside its work-group
+    const int local_x = get_local_id(0);
+    const int local_y = get_local_id(1);
+
+    for (int row=local_y; row<block_height; row+=get_local_size(1)) {
+        const int q_row = row + 2; //Interior rows follow two ghost rows
+        for (int col=local_x; col<block_width+2; col+=get_local_size(0)) {
+            const int q_col = col + 1; //Includes one ghost column per side
+            for (int comp=0; comp<3; ++comp) {
+                const float west   = Q[comp][q_row][q_col-1];
+                const float centre = Q[comp][q_row][q_col];
+                const float east   = Q[comp][q_row][q_col+1];
+                Qx[comp][row][col] = minmodSlope(west, centre, east, theta_);
+            }
+        }
+    }
+}
+
+
+/**
+  * Fills Qy with minmod-limited slopes along y for every interior column of
+  * the block. Qy[.][j][i] is the slope of local cell (j+1, i+2) of Q.
+  */
+void minmodSlopeY(__local float  Q[3][block_height+4][block_width+4],
+                  __local float Qy[3][block_height+2][block_width+2],
+                  const float theta_) {
+    //Position of this work-item inside its work-group
+    const int local_x = get_local_id(0);
+    const int local_y = get_local_id(1);
+
+    for (int row=local_y; row<block_height+2; row+=get_local_size(1)) {
+        const int q_row = row + 1; //Includes one ghost row per side
+        for (int col=local_x; col<block_width; col+=get_local_size(0)) {
+            const int q_col = col + 2; //Interior columns follow two ghost columns
+            for (int comp=0; comp<3; ++comp) {
+                const float south  = Q[comp][q_row-1][q_col];
+                const float centre = Q[comp][q_row][q_col];
+                const float north  = Q[comp][q_row+1][q_col];
+                Qy[comp][row][col] = minmodSlope(south, centre, north, theta_);
+            }
+        }
+    }
+}
+
+
+
+
+
+
+/**
+  * Returns the x component of the wind stress term at the position of the
+  * calling work-item, for the wind stress type given by wind_stress_type_:
+  *   0 (UNIFORM_ALONGSHORE):     decays exponentially with y
+  *   1 (BELL_SHAPED_ALONGSHORE): as type 0, with an additional Gaussian
+  *                               factor in x centred at xm_; zero once
+  *                               t_ exceeds 48 hours
+  *   2 (MOVING_CYCLONE):         cyclone whose centre starts at (x0_, y0_)
+  *                               and moves with velocity (u0_, v0_),
+  *                               evaluated at time t_+dt_
+  * Any other type yields 0.
+  */
+float windStressX(int wind_stress_type_,
+                float dx_, float dy_, float dt_,
+                float tau0_, float rho_, float alpha_, float xm_, float Rc_,
+                float x0_, float y0_,
+                float u0_, float v0_,
+                float t_) {
+    if (wind_stress_type_ == 0) { //UNIFORM_ALONGSHORE
+        const float y = (get_global_id(1)+0.5f)*dy_;
+        return tau0_/rho_ * exp(-alpha_*y);
+    }
+
+    if (wind_stress_type_ == 1) { //BELL_SHAPED_ALONGSHORE
+        //The forcing is only active during the first 48 hours
+        if (t_ > 48.0f*3600.0f) {
+            return 0.0f;
+        }
+        const float x_scaled = alpha_*((get_global_id(0)+0.5f)*dx_-xm_);
+        const float x_scaled_sq = x_scaled*x_scaled;
+        const float y = (get_global_id(1)+0.5f)*dy_;
+        return tau0_/rho_ * exp(-x_scaled_sq) * exp(-alpha_*y);
+    }
+
+    if (wind_stress_type_ == 2) { //MOVING_CYCLONE
+        //Evaluated without the half-cell shift in x
+        const float x = (get_global_id(0))*dx_;
+        const float y = (get_global_id(1)+0.5f)*dy_;
+
+        //Offset from the cyclone centre at time t_+dt_
+        const float off_x = (x-x0_-u0_*(t_+dt_));
+        const float off_x_sq = off_x*off_x;
+        const float off_y = (y-y0_-v0_*(t_+dt_));
+        const float off_y_sq = off_y*off_y;
+
+        const float radius = sqrt(off_x_sq+off_y_sq);
+        const float rel = 1.0f - radius/Rc_;
+        const float rel_sq = rel*rel;
+
+        return -(tau0_/rho_) * (off_y/Rc_) * exp(-0.5f*rel_sq);
+    }
+
+    //Unknown wind stress type: no forcing
+    return 0.0f;
+}
+
+
+
+
+
+
+/**
+  * Returns the y component of the wind stress term at the position of the
+  * calling work-item. Only wind stress type 2 (MOVING_CYCLONE) has a
+  * y component; every other type yields 0.
+  */
+float windStressY(int wind_stress_type_,
+                float dx_, float dy_, float dt_,
+                float tau0_, float rho_, float alpha_, float xm_, float Rc_,
+                float x0_, float y0_,
+                float u0_, float v0_,
+                float t_) {
+    //Everything except the moving cyclone has no forcing along y
+    if (wind_stress_type_ != 2) {
+        return 0.0f;
+    }
+
+    //Evaluated without the half-cell shift in y
+    const float x = (get_global_id(0)+0.5f)*dx_; 
+    const float y = (get_global_id(1))*dy_;
+
+    //Offset from the cyclone centre at time t_+dt_
+    const float off_x = (x-x0_-u0_*(t_+dt_));
+    const float off_x_sq = off_x*off_x;
+    const float off_y = (y-y0_-v0_*(t_+dt_));
+    const float off_y_sq = off_y*off_y;
+
+    const float radius = sqrt(off_x_sq+off_y_sq);
+    const float rel = 1.0f - radius/Rc_;
+    const float rel_sq = rel*rel;
+
+    return (tau0_/rho_) * (off_x/Rc_) * exp(-0.5f*rel_sq);
+}
+
+
+
+
+
+
+
+/**
+  * Physical flux of the shallow water equations along x for the state
+  * Q = (h, hu, hv) and gravitational constant g.
+  */
+float3 F_func(const float3 Q, const float g) {
+    const float mass_flux   = Q.y;                              //hu
+    const float x_mom_flux  = Q.y*Q.y / Q.x + 0.5f*g*Q.x*Q.x;   //hu*hu/h + 0.5f*g*h*h
+    const float y_mom_flux  = Q.y*Q.z / Q.x;                    //hu*hv/h
+
+    return (float3)(mass_flux, x_mom_flux, y_mom_flux);
+}
+
+
+
+
+
+/**
+  * Central upwind flux function.
+  * Qm and Qp are the two states (h, hu, hv) passed for the face; the
+  * one-sided wave speed bounds are taken from u-c and u+c of both states.
+  */
+float3 CentralUpwindFlux(const float3 Qm, float3 Qp, const float g) {
+    //Flux, velocity and gravity wave speed of the "minus" state
+    const float3 flux_m = F_func(Qm, g);
+    const float vel_m = Qm.y / Qm.x;      // hu / h
+    const float wave_m = sqrt(g*Qm.x);    // sqrt(g*h)
+
+    //Flux, velocity and gravity wave speed of the "plus" state
+    const float3 flux_p = F_func(Qp, g);
+    const float vel_p = Qp.y / Qp.x;      // hu / h
+    const float wave_p = sqrt(g*Qp.x);    // sqrt(g*h)
+
+    //Largest negative and largest positive wave speed, each clipped at zero
+    const float speed_neg = min(min(vel_m-wave_m, vel_p-wave_p), 0.0f);
+    const float speed_pos = max(max(vel_m+wave_m, vel_p+wave_p), 0.0f);
+
+    const float3 numerator = (speed_pos*flux_m - speed_neg*flux_p) + speed_pos*speed_neg*(Qp-Qm);
+    return numerator/(speed_pos-speed_neg);
+}
+
+
+
+
+
+
+
+
+
+
+/**
+  * Harten-Lax-van Leer (HLL) flux (Toro 2001, p 180)
+  * Q_l and Q_r are the left and right states (h, hu, hv), g_ is the
+  * gravitational constant.
+  */
+float3 HLL_flux(const float3 Q_l, const float3 Q_r, const float g_) {    
+    //Water depths
+    const float depth_l = Q_l.x;
+    const float depth_r = Q_r.x;
+
+    //Velocities normal to the face
+    const float vel_l = Q_l.y / depth_l;
+    const float vel_r = Q_r.y / depth_r;
+
+    //Gravity wave speeds
+    const float wave_l = sqrt(g_*depth_l);
+    const float wave_r = sqrt(g_*depth_r);
+
+    //Estimate of h in the "star region", h^dagger
+    const float depth_star = 0.5f * (depth_l+depth_r) - 0.25f * (vel_r-vel_l)*(depth_l+depth_r)/(wave_l+wave_r);
+
+    //Wave speed correction factors: only applied on a side where the
+    //star depth exceeds the depth of that side, otherwise 1
+    const float factor_l = (depth_star > depth_l)
+            ? sqrt(0.5f * ( (depth_star+depth_l)*depth_star / (depth_l*depth_l) ) )
+            : 1.0f;
+    const float factor_r = (depth_star > depth_r)
+            ? sqrt(0.5f * ( (depth_star+depth_r)*depth_star / (depth_r*depth_r) ) )
+            : 1.0f;
+
+    //Wave speed estimates
+    const float speed_l = vel_l - wave_l*factor_l;
+    const float speed_r = vel_r + wave_r*factor_r;
+
+    //All waves travel to the right: pure upwinding from the left state
+    if (speed_l >= 0.0f) {
+        return F_func(Q_l, g_);
+    }
+
+    //All waves travel to the left: pure upwinding from the right state
+    if (speed_r <= 0.0f) {
+        return F_func(Q_r, g_);
+    }
+
+    //Otherwise estimate the flux in the star region
+    const float3 flux_l = F_func(Q_l, g_);
+    const float3 flux_r = F_func(Q_r, g_);
+    return (speed_r*flux_l - speed_l*flux_r + speed_r*speed_l*(Q_r - Q_l)) / (speed_r-speed_l);
+}
+
+
+
+
+
+
+
+
+
+/**
+  * Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 181)
+  *
+  * Q_l and Q_r are the left and right states (h, hu, hv), g_ is the
+  * gravitational constant. Returns the flux of the left state, the right
+  * state, or the left/right star region, depending on the signs of the
+  * wave speed estimates S_l, S_star and S_r.
+  */
+float3 HLLC_flux(const float3 Q_l, const float3 Q_r, const float g_) {    
+    const float h_l = Q_l.x;
+    const float h_r = Q_r.x;
+    
+    // Calculate velocities
+    const float u_l = Q_l.y / h_l;
+    const float u_r = Q_r.y / h_r;
+    
+    // Estimate the potential wave speeds
+    const float c_l = sqrt(g_*h_l);
+    const float c_r = sqrt(g_*h_r);
+    
+    // Compute h in the "star region", h^dagger
+    const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
+    
+    // Correction factors, only used where h_dag exceeds the depth of that side
+    const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) );
+    const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) );
+    
+    const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
+    const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
+    
+    // Compute wave speed estimates
+    const float S_l = u_l - c_l*q_l;
+    const float S_r = u_r + c_r*q_r;
+    const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) );
+    
+    const float3 F_l = F_func(Q_l, g_);
+    const float3 F_r = F_func(Q_r, g_);
+    
+    //Upwind selection
+    if (S_l >= 0.0f) {
+        return F_l;
+    }
+    else if (S_r <= 0.0f) {
+        return F_r;
+    }
+    //Or estimate flux in the "left star" region
+    else if (S_l <= 0.0f && 0.0f <=S_star) {
+        // The tangential velocity v_l is carried unchanged into the star state
+        const float v_l = Q_l.z / h_l;
+        const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * (float3)(1, S_star, v_l);
+        const float3 flux = F_l + S_l*(Q_star_l - Q_l);
+        return flux;
+    }
+    //Or estimate flux in the "right star" region
+    else if (S_star <= 0.0f && 0.0f <=S_r) {
+        // The tangential velocity v_r is carried unchanged into the star state
+        const float v_r = Q_r.z / h_r;
+        const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * (float3)(1, S_star, v_r);
+        const float3 flux = F_r + S_r*(Q_star_r - Q_r);
+        return flux;
+    }
+    else {
+        // Sentinel value returned when no branch matches (presumably only
+        // when a wave speed estimate is NaN -- TODO confirm)
+        return -99999.9f; //Something wrong here
+    }
+}
+
+
+
+/**
+  * Superbee flux limiter for WAF.
+  * Related to the superbee limiter so that WAF_superbee(r, c) = 1 - (1-|c|)*superbee(r)
+  * @param r_ the ratio of upwind change (see Toro 2001, p. 203/204)
+  * @param c_ the courant number for wave k, dt*S_k/dx
+  * @return the WAF limiter value, evaluated piecewise in r_
+  */
+float WAF_superbee(float r_, float c_) {
+    // Only the magnitude of the courant number enters the limiter
+    const float abs_c = fabs(c_);
+
+    // r <= 0
+    if (r_ <= 0.0f) {
+        return 1.0f;
+    }
+
+    // 0 < r <= 1/2
+    if (r_ <= 0.5f) {
+        return 1.0f - 2.0f*(1.0f - abs_c)*r_;
+    }
+
+    // 1/2 < r <= 1
+    if (r_ <= 1.0f) {
+        return abs_c;
+    }
+
+    // 1 < r <= 2
+    if (r_ <= 2.0f) {
+        return 1.0f - (1.0f - abs_c)*r_;
+    }
+
+    // r > 2
+    return 2.0f*abs_c - 1.0f;
+}
+
+
+
+
+/**
+  * Van Albada type flux limiter for WAF.
+  * @param r_ the ratio of upwind change
+  * @param c_ the courant number for the wave (only its magnitude is used)
+  * @return 1 for r_ <= 0, otherwise 1 - (1-|c_|) * r_*(1+r_)/(1+r_^2)
+  */
+float WAF_albada(float r_, float c_) {
+    // Non-positive ratio: limiter value is fixed at one
+    if (r_ <= 0.0f) {
+        return 1.0f;
+    }
+
+    const float one_minus_abs_c = 1.0f - fabs(c_);
+    return 1.0f - one_minus_abs_c * r_ * (1.0f + r_) / (1.0f + r_*r_);
+}
+
+
+/**
+  * Minbee flux limiter for WAF.
+  * @param r_ the ratio of upwind change
+  * @param c_ the courant number for the wave (only its magnitude is used)
+  * @return 1 for r_ <= 0, 1 - (1-|c_|)*r_ for 0 < r_ <= 1, and |c_| otherwise
+  */
+float WAF_minbee(float r_, float c_) {
+    const float abs_c = fabs(c_);
+
+    // r <= 0
+    if (r_ <= 0.0f) {
+        return 1.0f;
+    }
+
+    // 0 < r <= 1
+    if (r_ <= 1.0f) {
+        return 1.0f - (1.0f - abs_c)*r_;
+    }
+
+    // r > 1
+    return abs_c;
+}
+
+
+/**
+  * Minmod flux limiter for WAF, written in the same convention as the other
+  * WAF limiters in this file: WAF_minmod(r, c) = 1 - (1-|c|)*minmod(r),
+  * with minmod(r) = max(0, min(1, r)).
+  *
+  * The previous implementation returned |c| for r <= 0, (1-r)*(1-c) for
+  * 0 < r <= 1 and 1 for r > 1. That was discontinuous at r = 0 and r = 1,
+  * used the signed courant number, and inverted the two limits.
+  *
+  * @param r_ the ratio of upwind change
+  * @param c_ the courant number for the wave (only its magnitude is used)
+  * @return 1 for r_ <= 0, 1 - (1-|c_|)*r_ for 0 < r_ <= 1, and |c_| for r_ > 1
+  */
+float WAF_minmod(float r_, float c_) {
+    const float abs_c = fabs(c_);
+
+    // r <= 0: minmod(r) = 0
+    if (r_ <= 0.0f) {
+        return 1.0f;
+    }
+    // 0 < r <= 1: minmod(r) = r
+    else if (r_ <= 1.0f) {
+        return 1.0f - (1.0f - abs_c)*r_;
+    }
+    // r > 1: minmod(r) = 1
+    else {
+        return abs_c;
+    }
+}
+
+
+
+/**
+  * Weighted average flux (Toro 2001, p 200) for interface {i+1/2}
+  * @param Q_l2 Q_{i-1}
+  * @param Q_l1 Q_{i}
+  * @param Q_r1 Q_{i+1}
+  * @param Q_r2 Q_{i+2}
+  * @param g_ Gravitational acceleration
+  * @param dx_ Grid cell spacing, used in the wave courant numbers dt_*S_k/dx_
+  * @param dt_ Time step, used in the wave courant numbers dt_*S_k/dx_
+  * @return The limited weighted average of the fluxes in the four wave regions
+  */
+float3 WAF_1D_flux(const float3 Q_l2, const float3 Q_l1, const float3 Q_r1, const float3 Q_r2, const float g_, const float dx_, const float dt_) {     
+    const float h_l = Q_l1.x;
+    const float h_r = Q_r1.x;
+    
+    const float h_l2 = Q_l2.x;
+    const float h_r2 = Q_r2.x;
+    
+    // Calculate velocities
+    const float u_l = Q_l1.y / h_l;
+    const float u_r = Q_r1.y / h_r;
+    
+    const float v_l = Q_l1.z / h_l;
+    const float v_r = Q_r1.z / h_r;
+    
+    const float v_l2 = Q_l2.z / h_l2;
+    const float v_r2 = Q_r2.z / h_r2;
+    
+    // Estimate the potential wave speeds
+    const float c_l = sqrt(g_*h_l);
+    const float c_r = sqrt(g_*h_r);
+    
+    // Compute wave speed estimates.
+    // The celerities are used unscaled here: the star-depth based scaling
+    // factors that HLL_flux/HLLC_flux apply are deliberately not used.
+    const float S_l = u_l - c_l;
+    const float S_r = u_r + c_r;
+    const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) );
+    
+    // States in the left and right star regions
+    const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * (float3)(1, S_star, v_l);
+    const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * (float3)(1, S_star, v_r);
+    
+    // Estimate the fluxes in the four regions
+    const float3 F_1 = F_func(Q_l1, g_);
+    const float3 F_4 = F_func(Q_r1, g_);
+    
+    const float3 F_2 = F_1 + S_l*(Q_star_l - Q_l1);
+    const float3 F_3 = F_4 + S_r*(Q_star_r - Q_r1);
+    
+    // Compute the courant numbers for the waves
+    const float c_1 = S_l * dt_ / dx_;
+    const float c_2 = S_star * dt_ / dx_;
+    const float c_3 = S_r * dt_ / dx_;
+    
+    // Compute the "upwind change" ratios using the neighbouring interfaces.
+    // The jump in h is used for the two outer waves (1 and 3), and the jump
+    // in v is used for the middle wave (2).
+    // NOTE(review): dh and dv are zero for locally constant h or v, so these
+    // ratios can become inf/NaN before they reach the limiter - confirm that
+    // this is acceptable for the limiter in use.
+    const float dh = h_r - h_l;
+    const float rh_m = (h_l - h_l2) / dh;
+    const float rh_p = (h_r2 - h_r) / dh;
+    
+    const float dv = v_r - v_l;
+    const float rv_m = (v_l - v_l2) / dv;
+    const float rv_p = (v_r2 - v_r) / dv;
+    
+    // Compute the r parameters for the flux limiter:
+    // pick the ratio on the upwind side of each wave
+    const float rh_1 = (c_1 > 0.0f) ? rh_m : rh_p; 
+    const float rv_2 = (c_2 > 0.0f) ? rv_m : rv_p; 
+    const float rh_3 = (c_3 > 0.0f) ? rh_m : rh_p;
+    
+    // Compute the limiter
+    const float A_1 = sign(c_1)*WAF_minbee(rh_1, c_1);
+    const float A_2 = sign(c_2)*WAF_minbee(rv_2, c_2);
+    const float A_3 = sign(c_3)*WAF_minbee(rh_3, c_3);
+        
+    //Average the fluxes
+    const float3 flux = 0.5f*( F_1 + F_4 )
+                      - 0.5f*( A_1 * (F_2 - F_1)
+                             + A_2 * (F_3 - F_2)
+                             + A_3 * (F_4 - F_3) );
+
+    return flux;
+}
+
+
+
+
+
+
+
+/**
+  * Lax-Friedrichs flux (Toro 2001, p 163)
+  * @param Q_l State on the left side of the interface
+  * @param Q_r State on the right side of the interface
+  * @param g_ Gravitational acceleration, forwarded to F_func
+  * @param dx_ Grid cell spacing
+  * @param dt_ Time step
+  * @return Average of the two fluxes plus the state jump scaled by dx_/(2*dt_)
+  */
+float3 LxF_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+    // Central average of the fluxes on both sides
+    const float3 flux_avg = 0.5f*(F_func(Q_l, g_) + F_func(Q_r, g_));
+
+    // Jump across the interface, provides the numerical diffusion
+    const float3 jump = Q_l - Q_r;
+
+    return flux_avg + jump * dx_ / (2.0f*dt_);
+}
+
+
+
+/**
+  * Lax-Friedrichs extended to 2D
+  * @param Q_l State on the left side of the interface
+  * @param Q_r State on the right side of the interface
+  * @param g_ Gravitational acceleration, forwarded to F_func
+  * @param dx_ Grid cell spacing
+  * @param dt_ Time step
+  * @return Average of the two fluxes plus the state jump scaled by dx_/(4*dt_)
+  */
+float3 LxF_2D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+    // Central average of the fluxes on both sides
+    const float3 flux_avg = 0.5f*(F_func(Q_l, g_) + F_func(Q_r, g_));
+
+    // Jump across the interface
+    const float3 jump = Q_l - Q_r;
+
+    // Note the numerical diffusion for 2D here (0.25 instead of 0.5)
+    return flux_avg + jump * dx_ / (4.0f*dt_);
+}
+
+
+
+
+/**
+  * Richtmyer / Two-step Lax-Wendroff flux (Toro 2001, p 164)
+  * @param Q_l State on the left side of the interface
+  * @param Q_r State on the right side of the interface
+  * @param g_ Gravitational acceleration, forwarded to F_func
+  * @param dx_ Grid cell spacing
+  * @param dt_ Time step
+  * @return F_func evaluated at the intermediate state
+  *         0.5*(Q_l + Q_r) + (F_l - F_r)*dt_/(2*dx_)
+  */
+float3 LxW2_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+    // Step one: intermediate state at the interface
+    const float3 flux_jump = F_func(Q_l, g_) - F_func(Q_r, g_);
+    const float3 Q_mid = 0.5f*(Q_l + Q_r) + flux_jump*dt_/(2.0f*dx_);
+
+    // Step two: flux of the intermediate state
+    return F_func(Q_mid, g_);
+}
+
+
+
+
+
+
+/**
+  * Godunov's centered scheme (Toro 2001, p 165)
+  * @param Q_l State on the left side of the interface
+  * @param Q_r State on the right side of the interface
+  * @param g_ Gravitational acceleration, forwarded to F_func
+  * @param dx_ Grid cell spacing
+  * @param dt_ Time step
+  * @return F_func evaluated at the intermediate state
+  *         0.5*(Q_l + Q_r) + (F_l - F_r)*dt_/dx_
+  */
+float3 GodC_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+    // Intermediate state at the interface
+    const float3 flux_jump = F_func(Q_l, g_) - F_func(Q_r, g_);
+    const float3 Q_mid = 0.5f*(Q_l + Q_r) + flux_jump*dt_/dx_;
+
+    // Flux of the intermediate state
+    return F_func(Q_mid, g_);
+}
+    
+
+    
+    
+/**
+  * First-Order Centered (FORCE) flux (Toro 2001, p.163)
+  * @param Q_l State on the left side of the interface
+  * @param Q_r State on the right side of the interface
+  * @param g_ Gravitational acceleration
+  * @param dx_ Grid cell spacing
+  * @param dt_ Time step
+  * @return The arithmetic mean of LxF_1D_flux and LxW2_1D_flux for the same input
+  */
+float3 FORCE_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+    // Average the Lax-Friedrichs and the two-step Lax-Wendroff fluxes
+    return 0.5f*(LxF_1D_flux(Q_l, Q_r, g_, dx_, dt_) + LxW2_1D_flux(Q_l, Q_r, g_, dx_, dt_));
+}
+
+
+
+
+