Merge pull request #1 from babrodtk/opencl_to_cuda

Opencl to cuda
2025-05-18 14:34:13 +02:00 · 2018-08-01 11:08:08 +02:00 · 2018-08-01 11:08:08 +02:00 · ed48305953
commit ed48305953
parent e5200cd200 4f0a73db33
42 changed files with 13037 additions and 6366 deletions
--- a/ConvergenceSmooth.ipynb
+++ b/ConvergenceSmooth.ipynb
--- a/ConvergenceSmooth1D.ipynb
+++ b/ConvergenceSmooth1D.ipynb
--- a/CUDA.ipynb
+++ b/CUDA.ipynb
--- a/RotatingConvergenceRates.ipynb
+++ b/RotatingConvergenceRates.ipynb
@ -1,765 +0,0 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```\n",
    "This notebook sets up and runs a set of benchmarks to compare\n",
    "different numerical discretizations of the SWEs\n",
    "\n",
    "Copyright (C) 2016  SINTEF ICT\n",
    "\n",
    "This program is free software: you can redistribute it and/or modify\n",
    "it under the terms of the GNU General Public License as published by\n",
    "the Free Software Foundation, either version 3 of the License, or\n",
    "(at your option) any later version.\n",
    "\n",
    "This program is distributed in the hope that it will be useful,\n",
    "but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
    "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n",
    "GNU General Public License for more details.\n",
    "\n",
    "You should have received a copy of the GNU General Public License\n",
    "along with this program.  If not, see <http://www.gnu.org/licenses/>.\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import modules and set up environment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Let's have matplotlib \"inline\"\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format = 'retina'\n",
    "\n",
    "#Import packages we need\n",
    "import numpy as np\n",
    "from matplotlib import animation, rc\n",
    "from matplotlib import pyplot as plt\n",
    "\n",
    "import os\n",
    "import pyopencl\n",
    "import datetime\n",
    "import sys\n",
    "\n",
    "#Set large figure sizes\n",
    "rc('figure', figsize=(6.0, 4.0))\n",
    "rc('animation', html='html5')\n",
    "\n",
    "#Import our simulator\n",
    "from SWESimulators import FBL, CTCS,KP07, CDKLM16, PlotHelper, Common\n",
    "#Import initial condition and bathymetry generating functions:\n",
    "from SWESimulators.BathymetryAndICs import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Make sure we get compiler output from OpenCL\n",
    "os.environ[\"PYOPENCL_COMPILER_OUTPUT\"] = \"1\"\n",
    "\n",
    "#Set which CL device to use, and disable kernel caching\n",
    "if (str.lower(sys.platform).startswith(\"linux\")):\n",
    "    os.environ[\"PYOPENCL_CTX\"] = \"0\"\n",
    "else:\n",
    "    os.environ[\"PYOPENCL_CTX\"] = \"1\"\n",
    "os.environ[\"CUDA_CACHE_DISABLE\"] = \"1\"\n",
    "os.environ[\"PYOPENCL_COMPILER_OUTPUT\"] = \"1\"\n",
    "os.environ[\"PYOPENCL_NO_CACHE\"] = \"1\"\n",
    "\n",
    "#Create OpenCL context\n",
    "cl_ctx = pyopencl.create_some_context()\n",
    "print \"Using \", cl_ctx.devices[0].name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Create output directory for images\n",
    "imgdir='images_convergence_' + datetime.datetime.now().strftime(\"%Y_%m_%d-%H_%M_%S\")\n",
    "os.makedirs(imgdir)\n",
    "print \"Saving images to \" + imgdir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def setBwStyles(ax):\n",
    "    from cycler import cycler\n",
    "\n",
    "    ax.set_prop_cycle( cycler('marker', ['.', 'x', 4, '+', '*', '1']) +\n",
    "                       cycler('linestyle', ['-.', '--', ':', '-.', '--', ':']) +\n",
    "                       #cycler('markersize', [15, 15, 15, 15, 15, 15]) +\n",
    "                       cycler('color', ['k', 'k', 'k', 'k', 'k', 'k']) )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def rebin(a, *args):\n",
    "    '''rebin ndarray data into a smaller ndarray of the same rank whose dimensions\n",
    "    are factors of the original dimensions. eg. An array with 6 columns and 4 rows\n",
    "    can be reduced to have 6,3,2 or 1 columns and 4,2 or 1 rows.\n",
    "    example usages:\n",
    "    >>> a=rand(6,4); b=rebin(a,3,2)\n",
    "    >>> a=rand(6); b=rebin(a,2)\n",
    "    '''\n",
    "    shape = a.shape\n",
    "    lenShape = len(shape)\n",
    "    factor = np.asarray(shape)/np.asarray(args)\n",
    "    evList = ['a.reshape('] + \\\n",
    "             ['args[%d],factor[%d],'%(i,i) for i in range(lenShape)] + \\\n",
    "             [')'] + ['.sum(%d)'%(i+1) for i in range(lenShape)] + \\\n",
    "             ['/factor[%d]'%i for i in range(lenShape)]\n",
    "    #print ''.join(evList)\n",
    "    return eval(''.join(evList))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Global parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "width = 512000\n",
    "height = 512000\n",
    "\n",
    "domain_sizes = [16, 32, 64, 128, 256]#, 512, 1024, 2048, 4096]\n",
    "reference_domain_size = 4 * max(domain_sizes)\n",
    "\n",
    "\n",
    "#schemes = [\"FBL\"] \n",
    "schemes = [\"FBL\", \"CTCS\", \"KP\", \"CDKLM\"]\n",
    "\n",
    "#Timestep size    \n",
    "dt = 8000/reference_domain_size\n",
    "    \n",
    "g = 9.81\n",
    "r = 0.0\n",
    "\n",
    "# Coriolis parameters: f + beta * y\n",
    "f = 8.0e-5\n",
    "\n",
    "timesteps = 5\n",
    "\n",
    "end_time = (timesteps - 0.01)*dt\n",
    "make_netCDF = False\n",
    "\n",
    "print(\"Timesteps = \" + str(end_time / dt))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def initDataBump(h0, eta0, u0, v0, \\\n",
    "            nx, ny, dx, dy, ghosts, \\\n",
    "            g, f):\n",
    "        \n",
    "    waterHeight = 50\n",
    "    \n",
    "    def my_exp(i, j):\n",
    "        size = 0.3\n",
    "        x = (i + 0.5 - reference_domain_size/2.0) / float(reference_domain_size)\n",
    "        y = (j + 0.5 - reference_domain_size/2.0) / float(reference_domain_size)\n",
    "        return np.exp(-10*(x*x/(size*size)+y*y/(size*size))) * (np.sqrt(x**2 + y**2) < size)\n",
    "    \n",
    "    def my_cos(i, j):\n",
    "        size = 0.6\n",
    "        x = 2*(i + 0.5 - reference_domain_size/2.0) / float(reference_domain_size)\n",
    "        y = 2*(j + 0.5 - reference_domain_size/2.0) / float(reference_domain_size)\n",
    "        r = np.sqrt(x**2 + y**2)\n",
    "        return 0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size)\n",
    "    \n",
    "    #Generate disturbance at reference scale and downsample \n",
    "    disturbance = np.fromfunction(lambda i, j: my_cos(i,j), (reference_domain_size, reference_domain_size))    \n",
    "    disturbance = rebin(disturbance, nx, ny)\n",
    "    \n",
    "    validCells = [ghosts[2], eta0.shape[0] - ghosts[0], ghosts[3], eta0.shape[1] - ghosts[1]]\n",
    "    \n",
    "    eta0.fill(0.0)\n",
    "    eta0[validCells[0]:validCells[1], validCells[2]:validCells[3]] += (0.01*disturbance)\n",
    "    h0.fill(waterHeight)\n",
    "    u0.fill(0.0)\n",
    "    v0.fill(0.0)\n",
    "\n",
    "def initDataBalancedBump(h0, eta0, u0, v0, \\\n",
    "            nx, ny, dx, dy, ghosts, \\\n",
    "            g, f):\n",
    "    bump_posx = 0.5\n",
    "    bump_posy = 0.5\n",
    "    bump_height = 0.25\n",
    "    bump_width_factor = 20*nx\n",
    "    waterHeight = 50 \n",
    "    initializeBalancedBumpOverPoint(eta0, u0, v0, # allocated buffers to be filled with data (output)\n",
    "                                    nx, ny, dx, dy, ghosts, # grid data\n",
    "                                    bump_posx, bump_posy, # relative placement of bump center\n",
    "                                    bump_height, bump_width_factor, # bump information\n",
    "                                    f, waterHeight, # parameters defined at the bump centre (coriolis force, water depth)\n",
    "                                    g)\n",
    "    \n",
    "    # Scale eta to be out of geostrophic balance\n",
    "    eta0 *= 1.1\n",
    "    h0.fill(waterHeight);\n",
    "    \n",
    "def initData(h0, eta0, u0, v0, \\\n",
    "            nx, ny, dx, dy, ghosts, \\\n",
    "            g, f):\n",
    "    initDataBump(h0, eta0, u0, v0, \\\n",
    "            nx, ny, dx, dy, ghosts, \\\n",
    "            g, f)\n",
    "    \n",
    "def testInitData(domain_size):\n",
    "    \n",
    "    nx = domain_size\n",
    "    ny = domain_size\n",
    "    \n",
    "    dx = float(width/nx)\n",
    "    dy = float(height/ny)\n",
    "    \n",
    "    ghosts = [1, 1, 1, 1] # north, east, south, west\n",
    "    dataShape = (ny + ghosts[0]+ghosts[2], \n",
    "                 nx + ghosts[1]+ghosts[3])\n",
    "\n",
    "    h0 = np.zeros(dataShape, dtype=np.float32);\n",
    "    eta0 = np.zeros(dataShape, dtype=np.float32);\n",
    "    u0 = np.zeros((dataShape[0], dataShape[1]+1), dtype=np.float32);\n",
    "    v0 = np.zeros((dataShape[0]+1, dataShape[1]), dtype=np.float32);\n",
    "    \n",
    "    initData(h0, eta0, u0, v0, nx, ny, dx, dy, ghosts, g, f)\n",
    "    \n",
    "    return eta0\n",
    "    \n",
    "plt.figure()\n",
    "for i, domain_size in enumerate(domain_sizes):\n",
    "    eta0 = testInitData(domain_size)\n",
    "    plt.subplot(1, len(domain_sizes)+1, i+1)\n",
    "    plt.imshow(eta0, interpolation='nearest')\n",
    "    print(\"Max={:.05f}, min={:.05f}, sum={:.010f}\".format(np.max(eta0), np.min(eta0), np.sum(eta0/(domain_size*domain_size))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def plotData(eta0, u0, v0, eta1, u1, v1):\n",
    "    fig, axarr = plt.subplots(2, 3)\n",
    "    axarr[0, 0].imshow(eta0, interpolation=\"nearest\")\n",
    "    axarr[0, 1].imshow(u0, interpolation=\"nearest\")\n",
    "    axarr[0, 2].imshow(v0, interpolation=\"nearest\")\n",
    "    axarr[1, 0].imshow(eta1, interpolation=\"nearest\")\n",
    "    axarr[1, 1].imshow(u1, interpolation=\"nearest\")\n",
    "    axarr[1, 2].imshow(v1, interpolation=\"nearest\")\n",
    "    print(\"Eta0: Maximum = {:.05f}, minimum = {:.05f}\".format(np.max(eta0), np.min(eta0)))\n",
    "    print(\"Eta1: Maximum = {:.05f}, minimum = {:.05f}\".format(np.max(eta1), np.min(eta1)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Forward Backward Linear"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def runFBL(domain_size):\n",
    "    #Clean up old simulator if any:\n",
    "    if 'fbl_sim' in globals():\n",
    "        fbl_sim.cleanUp()\n",
    "    \n",
    "    nx = domain_size\n",
    "    ny = domain_size\n",
    "    \n",
    "    dx = float(width/nx)\n",
    "    dy = float(height/ny)\n",
    "    \n",
    "    ghosts = [0, 0, 0, 0] # north, east, south, west\n",
    "    dataShape = (ny + ghosts[0]+ghosts[2], \n",
    "                 nx + ghosts[1]+ghosts[3])\n",
    "\n",
    "    h0 = np.zeros(dataShape, dtype=np.float32);\n",
    "    eta0 = np.zeros(dataShape, dtype=np.float32);\n",
    "    u0 = np.zeros((dataShape[0], dataShape[1]+1), dtype=np.float32);\n",
    "    v0 = np.zeros((dataShape[0]+1, dataShape[1]), dtype=np.float32);\n",
    "\n",
    "    # Generate initial conditions (initData currently forwards to initDataBump)\n",
    "    initData(h0, eta0, u0, v0, \\\n",
    "            nx, ny, dx, dy, ghosts, \\\n",
    "            g, f)\n",
    "\n",
    "    #Initialize simulator\n",
    "    reload(FBL)\n",
    "    fbl_sim = FBL.FBL(cl_ctx, \\\n",
    "                  h0, eta0, u0, v0, \\\n",
    "                  nx, ny, \\\n",
    "                  dx, dy, dt, \\\n",
    "                  g, f, r, \\\n",
    "                  write_netcdf=make_netCDF)\n",
    "\n",
    "    t = fbl_sim.step(end_time)\n",
    "    eta1, u1, v1 = fbl_sim.download()\n",
    "    print \"\\t\\tt=\" + str(t) +  \"\\tMax eta: \" + str(np.max(eta1))\n",
    "    \n",
    "    return [eta0, u0, v0, eta1, u1, v1]\n",
    "\n",
    "[eta0, u0, v0, eta1, u1, v1] = runFBL(16)\n",
    "plotData(eta0, u0, v0, eta1, u1, v1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "if make_netCDF:\n",
    "    fbl_sim.cleanUp()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Centered in time, centered in space"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "#Centered in time, centered in space\n",
    "\n",
    "def runCTCS(domain_size):\n",
    "    #Clean up old simulator if any:\n",
    "    if 'ctcs_sim' in globals():\n",
    "        ctcs_sim.cleanUp()\n",
    "    \n",
    "    nx = domain_size\n",
    "    ny = domain_size\n",
    "    \n",
    "    dx = float(width/nx)\n",
    "    dy = float(height/ny)\n",
    "    \n",
    "    ghosts = [1,1,1,1] # north, east, south, west\n",
    "    validDomain = np.array([1,1,1,1])\n",
    "    dataShape = (ny + ghosts[0]+ghosts[2], \n",
    "                 nx + ghosts[1]+ghosts[3])\n",
    "\n",
    "    h0 = np.zeros(dataShape, dtype=np.float32);\n",
    "    eta0 = np.zeros(dataShape, dtype=np.float32);\n",
    "    u0 = np.zeros((dataShape[0], dataShape[1]+1), dtype=np.float32);\n",
    "    v0 = np.zeros((dataShape[0]+1, dataShape[1]), dtype=np.float32);    \n",
    "\n",
    "    initData(h0, eta0, u0, v0, \\\n",
    "            nx, ny, dx, dy, ghosts, \\\n",
    "            g, f)\n",
    "    \n",
    "    # Eddy viscosity parameter\n",
    "    A = 0.5*dx\n",
    "    \n",
    "    reload(CTCS)\n",
    "    ctcs_sim = CTCS.CTCS(cl_ctx, \\\n",
    "                         h0, eta0, u0, v0, \\\n",
    "                         nx, ny, dx, dy, dt, \\\n",
    "                         g, f, r, A, \\\n",
    "                         write_netcdf=make_netCDF)\n",
    "\n",
    "    t = ctcs_sim.step(end_time)\n",
    "    eta1, u1, v1 = ctcs_sim.download()\n",
    "    \n",
    "    # Remove ghost cells\n",
    "    eta1 = eta1[validDomain[3]:-validDomain[1], validDomain[2]:-validDomain[0]]\n",
    "    \n",
    "    print \"\\t\\tt=\" + str(t) +  \"\\tMax eta: \" + str(np.max(eta1))\n",
    "    \n",
    "    return [eta0, u0, v0, eta1, u1, v1]\n",
    "\n",
    "[eta0, u0, v0, eta1, u1, v1] = runCTCS(16)\n",
    "plotData(eta0, u0, v0, eta1, u1, v1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "if make_netCDF:\n",
    "    ctcs_sim.cleanUp()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CDKLM 16"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "def runCDKLM(domain_size):\n",
    "    #Clean up old simulator if any:\n",
    "    if 'cdklm_sim' in globals():\n",
    "        cdklm_sim.cleanUp()\n",
    "\n",
    "    #Coriolis well balanced reconstruction scheme\n",
    "    \n",
    "    nx = domain_size\n",
    "    ny = domain_size\n",
    "    \n",
    "    dx = float(width/nx)\n",
    "    dy = float(height/ny)\n",
    "\n",
    "    ghosts = np.array([2,2,2,2]) # north, east, south, west\n",
    "    validDomain = np.array([2,2,2,2])\n",
    "    dataShape = (ny + ghosts[0]+ghosts[2], \n",
    "                 nx + ghosts[1]+ghosts[3])\n",
    "\n",
    "    Hi = np.zeros((dataShape[0]+1, dataShape[1]+1), dtype=np.float32)\n",
    "    eta0 = np.zeros(dataShape, dtype=np.float32)\n",
    "    u0   = np.zeros(dataShape, dtype=np.float32)\n",
    "    v0   = np.zeros(dataShape, dtype=np.float32)\n",
    "\n",
    "    initData(Hi, eta0, u0, v0, \\\n",
    "            nx, ny, dx, dy, ghosts, \\\n",
    "            g, f)\n",
    "\n",
    "    #Initialize simulator\n",
    "    reload(CDKLM16)\n",
    "    cdklm_sim = CDKLM16.CDKLM16(cl_ctx, \\\n",
    "                                eta0, u0, v0, Hi, \\\n",
    "                                nx, ny, dx, dy, dt, \\\n",
    "                                g, f, r, \\\n",
    "                                rk_order=2, \n",
    "                                write_netcdf=make_netCDF)\n",
    "\n",
    "\n",
    "    t = cdklm_sim.step(end_time)\n",
    "    eta1, u1, v1 = cdklm_sim.download()\n",
    "    \n",
    "    # Remove ghost cells\n",
    "    eta1 = eta1[validDomain[3]:-validDomain[1], validDomain[2]:-validDomain[0]]\n",
    "    \n",
    "    print \"\\t\\tt=\" + str(t) +  \"\\tMax eta: \" + str(np.max(eta1))\n",
    "    \n",
    "    return [eta0, u0, v0, eta1, u1, v1]\n",
    "\n",
    "[eta0, u0, v0, eta1, u1, v1] = runCDKLM(16)\n",
    "plotData(eta0, u0, v0, eta1, u1, v1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "if make_netCDF:\n",
    "     cdklm_sim.cleanUp()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Kurganov-Petrova 2007"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "def runKP(domain_size):\n",
    "    #Clean up old simulator if any:\n",
    "    if 'kp07_sim' in globals():\n",
    "        kp07_sim.cleanUp()\n",
    "    \n",
    "    # Kurganov-Petrova 2007\n",
    "    \n",
    "    nx = domain_size\n",
    "    ny = domain_size\n",
    "    \n",
    "    dx = float(width/nx)\n",
    "    dy = float(height/ny)\n",
    "    \n",
    "    ghosts = np.array([2,2,2,2]) # north, east, south, west\n",
    "    validDomain = np.array([2,2,2,2])\n",
    "    dataShape = (ny + ghosts[0]+ghosts[2], \n",
    "                 nx + ghosts[1]+ghosts[3])\n",
    "\n",
    "    Hi = np.zeros((dataShape[0]+1, dataShape[1]+1), dtype=np.float32)\n",
    "    eta0 = np.zeros(dataShape, dtype=np.float32)\n",
    "    u0 =   np.zeros(dataShape, dtype=np.float32)\n",
    "    v0 =   np.zeros(dataShape, dtype=np.float32)\n",
    "\n",
    "    initData(Hi, eta0, u0, v0, \\\n",
    "            nx, ny, dx, dy, ghosts, \\\n",
    "            g, f)\n",
    "\n",
    "    #Initialize simulator\n",
    "    reload(KP07)\n",
    "    kp07_sim = KP07.KP07(cl_ctx, \\\n",
    "                         eta0, Hi, u0, v0, \\\n",
    "                         nx, ny, dx, dy, dt, \\\n",
    "                         g, f, r, \\\n",
    "                         write_netcdf=make_netCDF,\\\n",
    "                         use_rk2=True)\n",
    "\n",
    "    t = kp07_sim.step(end_time)\n",
    "    eta1, u1, v1 = kp07_sim.download()\n",
    "    \n",
    "    # Remove ghost cells\n",
    "    eta1 = eta1[validDomain[3]:-validDomain[1], validDomain[2]:-validDomain[0]]\n",
    "    \n",
    "    print \"\\t\\tt=\" + str(t) +  \"\\tMax eta: \" + str(np.max(eta1))\n",
    "    \n",
    "    return [eta0, u0, v0, eta1, u1, v1]\n",
    "\n",
    "[eta0, u0, v0, eta1, u1, v1] = runKP(16)\n",
    "plotData(eta0, u0, v0, eta1, u1, v1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "if make_netCDF:\n",
    "    kp07_sim.cleanUp()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Control "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "for scheme in schemes:\n",
    "    print \"Scheme: \" + scheme\n",
    "    \n",
    "    data = {};\n",
    "    \n",
    "    # Make reference solution\n",
    "    print \"\\tDomain size (reference solution): \" + str(reference_domain_size)\n",
    "    [_, _, _, eta1_ref, _, _] = eval(\"run\" + scheme + \"(\" + str(reference_domain_size) + \")\")\n",
    "    \n",
    "    data[str(reference_domain_size)] = eta1_ref\n",
    "\n",
    "    # Run all domain sizes\n",
    "    for domain_size in domain_sizes:\n",
    "        print \"\\tDomain size: \" + str(domain_size)\n",
    "        [_, _, _, eta1, _, _] = eval(\"run\" + scheme + \"(\" + str(domain_size) + \")\")\n",
    "        \n",
    "        data[str(domain_size)] = eta1\n",
    "        \n",
    "    \n",
    "    out_filename = imgdir + \"/\" + scheme + \"_data.npz\"\n",
    "    np.savez(out_filename, **data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "error = np.zeros([len(schemes), len(domain_sizes)])\n",
    "\n",
    "for k, scheme in enumerate(schemes):\n",
    "    print \"Scheme: \" + scheme\n",
    "    \n",
    "    in_filename = imgdir + \"/\" + scheme + \"_data.npz\"\n",
    "    npzfile = np.load(in_filename)\n",
    "    \n",
    "    #Get reference\n",
    "    eta1_ref = npzfile[str(reference_domain_size)].astype(np.float64)\n",
    "    \n",
    "    # Run all domain sizes\n",
    "    for l, domain_size in enumerate(domain_sizes):\n",
    "        eta1 = npzfile[str(domain_size)].astype(np.float64)\n",
    "        \n",
    "        print(\"Max={:.05f}, min={:.05f}, sum={:.010f}\".format(np.max(eta1), np.min(eta1), np.sum(eta1/(domain_size*domain_size))))\n",
    "\n",
    "        #ver 1: downsample to the smallest resolution\n",
    "        \"\"\"\n",
    "        eta1_ref_downsampled = rebin(eta1_ref, min(domain_sizes), min(domain_sizes))\n",
    "        eta1_downsampled = rebin(eta1, min(domain_sizes), min(domain_sizes))\n",
    "        tmp =eta1_ref_downsampled - eta1_downsampled\n",
    "        error[k, l] = np.linalg.norm(tmp.flatten(), ord=2)\n",
    "        \"\"\"\n",
    "        \n",
    "        #\"\"\"\n",
    "        #ver 2: downsample to the current resolution\n",
    "        eta1_ref_downsampled = rebin(eta1_ref, domain_size, domain_size)\n",
    "        eta1_downsampled = eta1\n",
    "        tmp =eta1_ref_downsampled - eta1_downsampled\n",
    "        error[k, l] = np.linalg.norm(tmp, ord='fro') / (domain_size*domain_size)\n",
    "        #\"\"\"\n",
    "        \n",
    "        \"\"\"\n",
    "        #ver 3: upsample to the reference resolution\n",
    "        eta1_ref_downsampled = eta1_ref\n",
    "        upsampling = np.ones(np.divide(eta1_ref.shape, eta1.shape))\n",
    "        eta1_downsampled = np.kron(eta1, upsampling)\n",
    "        tmp =eta1_ref_downsampled - eta1_downsampled\n",
    "        error[k, l] = np.linalg.norm(tmp.flatten(), ord=2)\n",
    "        \"\"\"\n",
    "        \n",
    "        \n",
    "fig = plt.figure()\n",
    "setBwStyles(fig.gca())\n",
    "\n",
    "x = np.linspace(domain_sizes[0], domain_sizes[-1], 100);\n",
    "\n",
    "#scaling = np.min(error[:,0]) * domain_sizes[0]**0.5 * 0.5\n",
    "#plt.loglog(x, scaling/(np.sqrt(x)), '-', color='gray', label='Order 0.5')\n",
    "\n",
    "scaling = np.max(error[:,0]) * domain_sizes[0] * 2\n",
    "plt.loglog(x, scaling/x, '-', color='gray', label='Order 1')\n",
    "\n",
    "scaling = np.min(error[:,0]) * domain_sizes[0]**2 * 0.5\n",
    "plt.loglog(x, scaling/(x*x), '-', color='gray', label='Order 2')\n",
    "\n",
    "for k in range(len(schemes)):\n",
    "    print \"Scheme \" + str(schemes[k])\n",
    "    for l in range(len(domain_sizes)):\n",
    "        print \"\\tDomain size: \" + str(domain_sizes[l]) + \": \" + str(error[k,l])\n",
    "    plt.loglog(domain_sizes, error[k,:], label=schemes[k], markersize=15)\n",
    "#plt.loglog(domain_sizes, np.abs(error[0,:]-error[1,:]), label=\"Diff\", markersize=15)\n",
    "    \n",
    "plt.xlabel('Number of cells')\n",
    "plt.ylabel('Error')\n",
    "plt.legend(markerscale=0.5)\n",
    "\n",
    "plt.savefig(imgdir + \"/\" + \"convergence.pdf\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "git": {
   "suppress_outputs": true
  },
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
 }
--- a/SWESimulators/CDKLM16.py
+++ b/SWESimulators/CDKLM16.py
@ -1,205 +0,0 @@
 # -*- coding: utf-8 -*-
 """
 This python module implements 
 Alina Chertock, Michael Dudzinski, A. Kurganov & Maria Lukacova-Medvidova (2016)
 Well-Balanced Schemes for the Shallow Water Equations with Coriolis Forces
 Copyright (C) 2016  SINTEF ICT
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 """
 #Import packages we need
 import numpy as np
 import pyopencl as cl #OpenCL in Python
 from SWESimulators import Common
 """
 Class that solves the SW equations using the CDKLM16 well-balanced scheme (Chertock et al., 2016)
 """
 class CDKLM16:
    """
    Shallow water simulator that drives the "CDKLM16_kernel.opencl" kernel.

    The constructor uploads the initial state to the device, `step` advances
    the solution in time and `download` fetches the current state back to the
    host.
    """

    def __init__(self, \
                 cl_ctx, \
                 h0, hu0, hv0, \
                 nx, ny, \
                 dx, dy, dt, \
                 g, f, r, \
                 theta=1.3, use_rk2=True,
                 wind_stress=Common.WindStressParams(), \
                 block_width=16, block_height=16):
        """
        Initialization routine

        cl_ctx: OpenCL context used for the command queue, kernel and buffers
        h0: Water depth incl ghost cells
        hu0: Initial momentum along x-axis incl ghost cells
        hv0: Initial momentum along y-axis incl ghost cells
             (NOTE(review): the arrays are uploaded with 3 ghost cells per
             side in x and y; confirm the expected array shape against
             Common.SWEDataArkawaA)
        nx: Number of cells along x-axis
        ny: Number of cells along y-axis
        dx: Grid cell spacing along x-axis (e.g. 20 000 m)
        dy: Grid cell spacing along y-axis (e.g. 20 000 m)
        dt: Size of each timestep (e.g. 90 s)
        g: Gravitational acceleration (9.81 m/s^2)
        f: Coriolis parameter (e.g. 1.2e-4 s^-1)
        r: Bottom friction coefficient (e.g. 2.4e-3 m/s)
        theta: Forwarded unchanged to the kernel (presumably the slope
               limiter parameter; confirm against the kernel source)
        use_rk2: If True, every timestep launches the kernel twice (stage 0
                 and stage 1); otherwise once, followed by a buffer swap
        wind_stress: Common.WindStressParams instance whose fields are
                     forwarded to the kernel
        block_width: Local work-group size along x
        block_height: Local work-group size along y
        """
        self.cl_ctx = cl_ctx

        #Create an OpenCL command queue
        self.cl_queue = cl.CommandQueue(self.cl_ctx)

        #Get kernels
        self.kernel = Common.get_kernel(self.cl_ctx, "CDKLM16_kernel.opencl", block_width, block_height)

        #Create data by uploading to device
        ghost_cells_x = 3
        ghost_cells_y = 3
        self.cl_data = Common.SWEDataArkawaA(self.cl_ctx, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)

        #Save input parameters
        #Notice that we need to specify them in the correct dataformat for the
        #OpenCL kernel
        self.nx = np.int32(nx)
        self.ny = np.int32(ny)
        self.dx = np.float32(dx)
        self.dy = np.float32(dy)
        self.dt = np.float32(dt)
        self.g = np.float32(g)
        self.f = np.float32(f)
        self.r = np.float32(r)
        self.theta = np.float32(theta)
        self.use_rk2 = use_rk2
        self.wind_stress = wind_stress

        #Initialize time
        self.t = np.float32(0.0)

        #Compute kernel launch parameters: the global size is nx/ny rounded
        #up to the nearest multiple of the local (work-group) size
        self.local_size = (block_width, block_height)
        self.global_size = ( \
                       int(np.ceil(self.nx / float(self.local_size[0])) * self.local_size[0]), \
                       int(np.ceil(self.ny / float(self.local_size[1])) * self.local_size[1]) \
                      )

    def __str__(self):
        """Returns the names of the authors of the scheme."""
        return "Chertok, Dudzinski, Kurganov, Lukacova-Medvidova"

    def _launch_swe_2D(self, local_dt, rk_step, src, dst):
        """
        Enqueues one launch of the swe_2D kernel.

        local_dt: Timestep size passed to the kernel (np.float32)
        rk_step: Stage index passed to the kernel (0 or 1)
        src: Tuple (h, hu, hv) of device buffers passed as the first set
        dst: Tuple (h, hu, hv) of device buffers passed as the second set
        """
        h_src, hu_src, hv_src = src
        h_dst, hu_dst, hv_dst = dst
        wind = self.wind_stress
        self.kernel.swe_2D(self.cl_queue, self.global_size, self.local_size,
                self.nx, self.ny,
                self.dx, self.dy, local_dt,
                self.g,
                self.theta,
                self.f,
                self.r,
                np.int32(rk_step),
                h_src.data, h_src.pitch,
                hu_src.data, hu_src.pitch,
                hv_src.data, hv_src.pitch,
                h_dst.data, h_dst.pitch,
                hu_dst.data, hu_dst.pitch,
                hv_dst.data, hv_dst.pitch,
                wind.type,
                wind.tau0, wind.rho, wind.alpha, wind.xm, wind.Rc,
                wind.x0, wind.y0,
                wind.u0, wind.v0,
                self.t)

    def step(self, t_end=0.0):
        """
        Function which steps the simulation forward by a duration of t_end.

        Takes timesteps of size self.dt; the last one is shortened so that
        the total advanced time equals t_end. Returns the accumulated
        simulation time self.t.
        """
        n = int(t_end / self.dt + 1)

        for i in range(0, n):
            local_dt = np.float32(min(self.dt, t_end-i*self.dt))

            if (local_dt <= 0.0):
                break

            #Look the buffers up every iteration, since swap() may exchange them
            data = self.cl_data
            first = (data.h0, data.hu0, data.hv0)
            second = (data.h1, data.hu1, data.hv1)

            if (self.use_rk2):
                #Stage 0 goes first -> second, stage 1 goes second -> first,
                #so the result ends up in the "0" buffers and no swap is needed
                self._launch_swe_2D(local_dt, 0, first, second)
                self._launch_swe_2D(local_dt, 1, second, first)
            else:
                self._launch_swe_2D(local_dt, 0, first, second)
                self.cl_data.swap()

            self.t += local_dt

        return self.t

    def get_kernel(self, kernel_filename):
        """
        Reads a text file located next to this module and builds an OpenCL
        program from it using this simulator's context.

        kernel_filename: File name relative to the directory of this module
        Returns the built pyopencl program.
        """
        #The module does not import os at the top, so import it here
        import os

        #Read the proper program
        module_path = os.path.dirname(os.path.realpath(__file__))
        fullpath = os.path.join(module_path, kernel_filename)
        with open(fullpath, "r") as kernel_file:
            kernel_string = kernel_file.read()
            kernel = cl.Program(self.cl_ctx, kernel_string).build()

        return kernel

    def download(self):
        """Downloads the current simulation state via self.cl_data.download."""
        return self.cl_data.download(self.cl_queue)
--- a/SWESimulators/CDKLM16_kernel.opencl
+++ b/SWESimulators/CDKLM16_kernel.opencl
@ -1,440 +0,0 @@
 /*
 This OpenCL kernel implements the Kurganov-Petrova numerical scheme 
 for the shallow water equations, described in 
 A. Kurganov & Guergana Petrova
 A Second-Order Well-Balanced Positivity Preserving Central-Upwind
 Scheme for the Saint-Venant System Communications in Mathematical
 Sciences, 5 (2007), 133-160. 
 Copyright (C) 2016  SINTEF ICT
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include "common.opencl"
 /**
  * Evaluates the flux function for a state given as Q = (h, u, v),
  * i.e., with velocities instead of momenta.
  * Returns (h*u, h*u*u + 0.5*g*h*h, h*u*v).
  */
 float3 CDKLM16_F_func(const float3 Q, const float g) {
    // h*u is shared by all three components
    const float hu = Q.x*Q.y;

    return (float3)(hu,                        //h*u
                    hu*Q.y + 0.5f*g*Q.x*Q.x,   //h*u*u + 0.5f*g*h*h
                    hu*Q.z);                   //h*u*v
 }
 /**
  * Central-upwind numerical flux across a cell interface.
  *
  * Note that the input vectors are (h, u, v), thus not the regular
  * (h, hu, hv)
  *
  * Qm: State on the minus (left) side of the interface
  * Qp: State on the plus (right) side of the interface
  * g:  Gravitational acceleration
  * Returns the numerical flux; the third component is upwinded.
  */
 float3 CDKLM16_flux(const float3 Qm, float3 Qp, const float g) {
    const float3 Fp = CDKLM16_F_func(Qp, g);
    const float up = Qp.y;         // u
    const float cp = sqrt(g*Qp.x); // sqrt(g*h)
    const float3 Fm = CDKLM16_F_func(Qm, g);
    const float um = Qm.y;         // u
    const float cm = sqrt(g*Qm.x); // sqrt(g*h)
    const float am = min(min(um-cm, up-cp), 0.0f); // largest negative wave speed
    const float ap = max(max(um+cm, up+cp), 0.0f); // largest positive wave speed

    // am <= 0 <= ap, so they are only equal when both are zero.
    // Return a zero flux in that case instead of evaluating 0/0.
    if (ap == am) {
        return (float3)(0.0f, 0.0f, 0.0f);
    }

    float3 F;
    F.x = ((ap*Fm.x - am*Fp.x) + ap*am*(Qp.x-Qm.x))/(ap-am);
    // The diffusion term must use the jump in the conserved variable hu.
    // Since Q holds (h, u, v), hu is available as the first flux component
    // (Fp.x = h*u); Qp.y-Qm.y would be the jump in the velocity u instead.
    F.y = ((ap*Fm.y - am*Fp.y) + ap*am*(Fp.x-Fm.x))/(ap-am);
    F.z = (Qm.y + Qp.y > 0) ? Fm.z : Fp.z; //Upwinding to be consistent
    return F;
 }
 /**
  * Computes one stage of the time integration for the cells of a 2D grid.
  *
  * Each work-group reads its block of (h, hu, hv) plus a halo of three cells
  * into local memory, mirrors the data across the domain boundaries,
  * reconstructs slopes of (u, v, K, L), computes the fluxes through the cell
  * interfaces and finally writes the updated values of the interior cells.
  *
  * nx_, ny_:      Number of interior cells (the buffers hold three ghost cells per side)
  * dx_, dy_, dt_: Cell sizes and time step
  * g_:            Gravitational acceleration
  * theta_:        Parameter forwarded to minmodSlope
  * step_:         0 for the first and 1 for the second step of the RK2 integrator
  * *_ptr_:        Row-major buffers; the matching *_pitch_ is the row size in bytes
  * wind stress parameters and t_ are forwarded unchanged to windStressX/windStressY
  *
  * block_width, block_height, minmodSlope, windStressX and windStressY are not
  * defined in this file (presumably in common.opencl or on the compiler
  * command line). The indexing below assumes that the work-group size equals
  * block_width x block_height - TODO confirm against the host code.
  */
 __kernel void swe_2D(
        int nx_, int ny_,
        float dx_, float dy_, float dt_,
        float g_,
        float theta_,
        float f_, //< Coriolis coefficient
        float r_, //< Bottom friction coefficient
        int step_,
        //Input h^n
        __global float* h0_ptr_, int h0_pitch_,
        __global float* hu0_ptr_, int hu0_pitch_,
        __global float* hv0_ptr_, int hv0_pitch_,
        //Output h^{n+1}
        __global float* h1_ptr_, int h1_pitch_,
        __global float* hu1_ptr_, int hu1_pitch_,
        __global float* hv1_ptr_, int hv1_pitch_,
        //Wind stress parameters
        int wind_stress_type_, 
        float tau0_, float rho_, float alpha_, float xm_, float Rc_,
        float x0_, float y0_,
        float u0_, float v0_,
        float t_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    //Index of block within domain
    const int bx = get_local_size(0) * get_group_id(0);
    const int by = get_local_size(1) * get_group_id(1);
    //Index of cell within domain
    const int ti = get_global_id(0) + 3; //Skip global ghost cells, i.e., +3
    const int tj = get_global_id(1) + 3;
    // Our physical variables (h, hu, hv), block plus three halo cells per side
    __local float R[3][block_height+6][block_width+6];
    // Our reconstruction variables (u, v, K, L) and their slopes
    __local float Q[4][block_height+4][block_width+4];
    __local float Qx[4][block_height][block_width+2];
    __local float Qy[4][block_height+2][block_width];
    // Our fluxes
    __local float F[3][block_height][block_width+1];
    __local float G[3][block_height+1][block_width];
    //Read into shared memory
    for (int j=ty; j<block_height+6; j+=get_local_size(1)) {
        const int l = clamp(by + j, 0, ny_+5); // Out of bounds
        //Compute the pointer to current row in the arrays (pitch is in bytes)
        __global float* const h_row = (__global float*) ((__global char*) h0_ptr_ + h0_pitch_*l);
        __global float* const hu_row = (__global float*) ((__global char*) hu0_ptr_ + hu0_pitch_*l);
        __global float* const hv_row = (__global float*) ((__global char*) hv0_ptr_ + hv0_pitch_*l);
        for (int i=tx; i<block_width+6; i+=get_local_size(0)) {
            const int k = clamp(bx + i, 0, nx_+5); // Out of bounds
            R[0][j][i] = h_row[k];
            R[1][j][i] = hu_row[k];
            R[2][j][i] = hv_row[k];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    //Fix boundary conditions
    //The three ghost cells are mirror images of the three cells next to the
    //boundary, with the momentum component normal to the boundary negated
    {
        const int i = tx + 3; //Skip local ghost cells, i.e., +3
        const int j = ty + 3;
        if (ti == 3) {
            R[0][j][i-1] =  R[0][j][i];
            R[1][j][i-1] = -R[1][j][i];
            R[2][j][i-1] =  R[2][j][i];
            R[0][j][i-2] =  R[0][j][i+1];
            R[1][j][i-2] = -R[1][j][i+1];
            R[2][j][i-2] =  R[2][j][i+1];
            R[0][j][i-3] =  R[0][j][i+2];
            R[1][j][i-3] = -R[1][j][i+2];
            R[2][j][i-3] =  R[2][j][i+2];
        }
        if (ti == nx_+2) {
            R[0][j][i+1] =  R[0][j][i];
            R[1][j][i+1] = -R[1][j][i];
            R[2][j][i+1] =  R[2][j][i];
            R[0][j][i+2] =  R[0][j][i-1];
            R[1][j][i+2] = -R[1][j][i-1];
            R[2][j][i+2] =  R[2][j][i-1];
            R[0][j][i+3] =  R[0][j][i-2];
            R[1][j][i+3] = -R[1][j][i-2];
            R[2][j][i+3] =  R[2][j][i-2];
        }
        if (tj == 3) {
            R[0][j-1][i] =  R[0][j][i];
            R[1][j-1][i] =  R[1][j][i];
            R[2][j-1][i] = -R[2][j][i];
            R[0][j-2][i] =  R[0][j+1][i];
            R[1][j-2][i] =  R[1][j+1][i];
            R[2][j-2][i] = -R[2][j+1][i];
            R[0][j-3][i] =  R[0][j+2][i];
            R[1][j-3][i] =  R[1][j+2][i];
            R[2][j-3][i] = -R[2][j+2][i];
        }
        if (tj == ny_+2) {
            R[0][j+1][i] =  R[0][j][i];
            R[1][j+1][i] =  R[1][j][i];
            R[2][j+1][i] = -R[2][j][i];
            R[0][j+2][i] =  R[0][j-1][i];
            R[1][j+2][i] =  R[1][j-1][i];
            R[2][j+2][i] = -R[2][j-1][i];
            R[0][j+3][i] =  R[0][j-2][i];
            R[1][j+3][i] =  R[1][j-2][i];
            R[2][j+3][i] = -R[2][j-2][i];
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    //Create our "steady state" reconstruction variables (u, v, K, L)
    for (int j=ty; j<block_height+4; j+=get_local_size(1)) {
        const int l = j + 1; //Skip one "ghost cell row" of Q, going from 6x6 to 4x4 "halo"
        for (int i=tx; i<block_width+4; i+=get_local_size(0)) {
            const int k = i + 1;
            const float h = R[0][l][k];
            const float u = R[1][l][k] / h;
            const float v = R[2][l][k] / h;
            const float B = 0.0f;
            // (1, 2, 1)/4 weighted average of the velocity over three neighbouring cells.
            // All literals carry the f suffix so that the expression stays in single precision
            const float U = 0.25f * f_/g_ * (1.0f*R[1][l+1][k]/R[0][l+1][k] + 2.0f*u + 1.0f*R[1][l-1][k]/R[0][l-1][k]);
            const float V = 0.25f * f_/g_ * (1.0f*R[2][l][k+1]/R[0][l][k+1] + 2.0f*v + 1.0f*R[2][l][k-1]/R[0][l][k-1]);
            //const float U = f_/g_ * u;
            //const float V = f_/g_ * v;
            const float K = h + B - V;
            const float L = h + B + U;
            Q[0][j][i] = u;
            Q[1][j][i] = v;
            Q[2][j][i] = K;
            Q[3][j][i] = L;         
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    //Reconstruct slopes along x axis
    for (int j=ty; j<block_height; j+=get_local_size(1)) {
        const int l = j + 2; //Skip ghost cells
        for (int i=tx; i<block_width+2; i+=get_local_size(0)) {
            const int k = i + 1;
            for (int p=0; p<4; ++p) {
                Qx[p][j][i] = minmodSlope(Q[p][l][k-1], Q[p][l][k], Q[p][l][k+1], theta_);
            }
        }
    }
    //Reconstruct slopes along y axis
    for (int j=ty; j<block_height+2; j+=get_local_size(1)) {
        const int l = j + 1;
        for (int i=tx; i<block_width; i+=get_local_size(0)) {            
            const int k = i + 2; //Skip ghost cells
            for (int p=0; p<4; ++p) {
                Qy[p][j][i] = minmodSlope(Q[p][l-1][k], Q[p][l][k], Q[p][l+1][k], theta_);
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    //Compute fluxes along the x axis
    for (int j=ty; j<block_height; j+=get_local_size(1)) {
        const int l = j + 2; //Skip ghost cells (be consistent with reconstruction offsets)
        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
            const int k = i + 1;
            // R=(u, v, K, L) reconstructed at a cell interface from the right (p) and left (m)
            const float4 Rp = (float4)(Q[0][l][k+1] - 0.5f*Qx[0][j][i+1],
                                       Q[1][l][k+1] - 0.5f*Qx[1][j][i+1],
                                       Q[2][l][k+1] - 0.5f*Qx[2][j][i+1],
                                       Q[3][l][k+1] - 0.5f*Qx[3][j][i+1]);
            const float4 Rm = (float4)(Q[0][l][k  ] + 0.5f*Qx[0][j][i  ],
                                       Q[1][l][k  ] + 0.5f*Qx[1][j][i  ],
                                       Q[2][l][k  ] + 0.5f*Qx[2][j][i  ],
                                       Q[3][l][k  ] + 0.5f*Qx[3][j][i  ]);
            // Variables to reconstruct h from u, v, K, L
            const float vp = Q[1][l][k+1];
            const float vm = Q[1][l][k  ];
            const float V = 0.5f * f_/g_ * (vp + vm);
            const float B = 0.0f;
            // Reconstruct h = K + V - B (K was stored as h + B - V above)
            const float hp = Rp.z + V - B;
            const float hm = Rm.z + V - B;
            // Our flux variables Q=(h, u, v)
            const float3 Qp = (float3)(hp, Rp.x, Rp.y);
            const float3 Qm = (float3)(hm, Rm.x, Rm.y);
            // Computed flux
            const float3 flux = CDKLM16_flux(Qm, Qp, g_);
            F[0][j][i] = flux.x;
            F[1][j][i] = flux.y;
            F[2][j][i] = flux.z;
        }
    }
    //Compute fluxes along the y axis
    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
        const int l = j + 1;
        for (int i=tx; i<block_width; i+=get_local_size(0)) {
            const int k = i + 2; //Skip ghost cells
            // Q at interface from the right and left
            const float4 Rp = (float4)(Q[0][l+1][k] - 0.5f*Qy[0][j+1][i],
                                       Q[1][l+1][k] - 0.5f*Qy[1][j+1][i],
                                       Q[2][l+1][k] - 0.5f*Qy[2][j+1][i],
                                       Q[3][l+1][k] - 0.5f*Qy[3][j+1][i]);
            const float4 Rm = (float4)(Q[0][l  ][k] + 0.5f*Qy[0][j  ][i],
                                       Q[1][l  ][k] + 0.5f*Qy[1][j  ][i],
                                       Q[2][l  ][k] + 0.5f*Qy[2][j  ][i],
                                       Q[3][l  ][k] + 0.5f*Qy[3][j  ][i]);
            // Variables to reconstruct h from u, v, K, L
            const float up = Q[0][l+1][k];
            const float um = Q[0][l  ][k];
            const float U = 0.5f * f_/g_ * (up + um);
            const float B = 0.0f;
            // Reconstruct h = L - U - B (L was stored as h + B + U above)
            const float hp = Rp.w - U - B;
            const float hm = Rm.w - U - B;
            // Our flux variables Q=(h, v, u)
            // Note that we swap u and v
            const float3 Qp = (float3)(hp, Rp.y, Rp.x);
            const float3 Qm = (float3)(hm, Rm.y, Rm.x);
            // Computed flux
            // Note that we swap back u and v
            const float3 flux = CDKLM16_flux(Qm, Qp, g_);
            G[0][j][i] = flux.x;
            G[1][j][i] = flux.z;
            G[2][j][i] = flux.y;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    //Sum fluxes and advance in time for all internal cells
    if (ti > 2 && ti < nx_+3 && tj > 2 && tj < ny_+3) {
        const int i = tx + 3; //Skip local ghost cells, i.e., +3
        const int j = ty + 3;
        const float X = windStressX(
            wind_stress_type_, 
            dx_, dy_, dt_,
            tau0_, rho_, alpha_, xm_, Rc_,
            x0_, y0_,
            u0_, v0_,
            t_);
        const float Y = windStressY(
            wind_stress_type_, 
            dx_, dy_, dt_,
            tau0_, rho_, alpha_, xm_, Rc_,
            x0_, y0_,
            u0_, v0_,
            t_);
        // NOTE(review): the Coriolis terms enter as -f*hv (in hu1) and +f*hu (in hv1);
        // confirm that these signs are consistent with K = h + B - V and L = h + B + U above
        const float h1  = R[0][j][i] + (F[0][ty][tx] - F[0][ty  ][tx+1]) * dt_ / dx_ 
                                     + (G[0][ty][tx] - G[0][ty+1][tx  ]) * dt_ / dy_;
        const float hu1 = R[1][j][i] + (F[1][ty][tx] - F[1][ty  ][tx+1]) * dt_ / dx_ 
                                     + (G[1][ty][tx] - G[1][ty+1][tx  ]) * dt_ / dy_
                                     + dt_*X - dt_*f_*R[2][j][i];
        const float hv1 = R[2][j][i] + (F[2][ty][tx] - F[2][ty  ][tx+1]) * dt_ / dx_ 
                                     + (G[2][ty][tx] - G[2][ty+1][tx  ]) * dt_ / dy_
                                     + dt_*Y + dt_*f_*R[1][j][i];
        __global float* const h_row  = (__global float*) ((__global char*) h1_ptr_ + h1_pitch_*tj);
        __global float* const hu_row = (__global float*) ((__global char*) hu1_ptr_ + hu1_pitch_*tj);
        __global float* const hv_row = (__global float*) ((__global char*) hv1_ptr_ + hv1_pitch_*tj);
        // Bottom friction factor; the momenta are divided by (1 + C) or (1 + 0.5*C) below
        const float C = 2.0f*r_*dt_/R[0][j][i];
        if  (step_ == 0) {
            //First step of RK2 ODE integrator
            h_row[ti] = h1;
            hu_row[ti] = hu1 / (1.0f + C);
            hv_row[ti] = hv1 / (1.0f + C);
        }
        else if (step_ == 1) {
            //Second step of RK2 ODE integrator
            //First read Q^n (taken from the output buffers)
            const float h_a  = h_row[ti];
            const float hu_a = hu_row[ti];
            const float hv_a = hv_row[ti];
            //Compute Q^n+1
            const float h_b  = 0.5f*(h_a + h1);
            const float hu_b = 0.5f*(hu_a + hu1);
            const float hv_b = 0.5f*(hv_a + hv1);
            //Write to main memory
            h_row[ti] = h_b;
            hu_row[ti] = hu_b / (1.0f + 0.5f*C);
            hv_row[ti] = hv_b / (1.0f + 0.5f*C);
        }
    }
 }
--- a/SWESimulators/CTCS.py
+++ b/SWESimulators/CTCS.py
@ -1,195 +0,0 @@
 # -*- coding: utf-8 -*-
 """
 This python module implements the Centered in Time, Centered in Space
 (leapfrog) numerical scheme for the shallow water equations, 
 described in 
 L. P. Røed, "Documentation of simple ocean models for use in ensemble
 predictions", Met no report 2012/3 and 2012/5 .
 Copyright (C) 2016  SINTEF ICT
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 """
 #Import packages we need
 import numpy as np
 import pyopencl as cl #OpenCL in Python
 from SWESimulators import Common
 """
 Class that solves the SW equations using the Centered in time centered in space scheme
 """
 class CTCS:
    """
    Shallow water simulator using the centered in time, centered in space
    (leapfrog) scheme, executed through OpenCL kernels.
    """

    def __init__(self, \
                 cl_ctx, \
                 H, eta0, hu0, hv0, \
                 nx, ny, \
                 dx, dy, dt, \
                 g, f, r, A, \
                 wind_stress=Common.WindStressParams(), \
                 block_width=16, block_height=16):
        """
        Initialization routine

        cl_ctx: OpenCL context to run on
        H: Water depth incl ghost cells, (nx+2)*(ny+2) cells
        eta0: Initial deviation from mean sea level incl ghost cells, (nx+2)*(ny+2) cells
        hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+2) cells
        hv0: Initial momentum along y-axis incl ghost cells, (nx+2)*(ny+1) cells
        nx: Number of cells along x-axis
        ny: Number of cells along y-axis
        dx: Grid cell spacing along x-axis (20 000 m)
        dy: Grid cell spacing along y-axis (20 000 m)
        dt: Size of each timestep (90 s)
        g: Gravitational acceleration (9.81 m/s^2)
        f: Coriolis parameter (1.2e-4 s^-1)
        r: Bottom friction coefficient (2.4e-3 m/s)
        A: Eddy viscosity coefficient (O(dx))
        wind_stress: Wind stress parameters
        block_width, block_height: Work-group size, also passed to Common.get_kernel
        """
        self.cl_ctx = cl_ctx

        #All kernels are launched on this command queue
        self.cl_queue = cl.CommandQueue(self.cl_ctx)

        #NOTE(review): reloads the Common module on every construction,
        #presumably so that edits are picked up during development
        reload(Common)

        #One OpenCL program per updated variable
        self.u_kernel = Common.get_kernel(self.cl_ctx, "CTCS_U_kernel.opencl", block_width, block_height)
        self.v_kernel = Common.get_kernel(self.cl_ctx, "CTCS_V_kernel.opencl", block_width, block_height)
        self.eta_kernel = Common.get_kernel(self.cl_ctx, "CTCS_eta_kernel.opencl", block_width, block_height)

        #Upload the initial data with one ghost cell in each direction
        halo_x = 1
        halo_y = 1
        self.H = Common.OpenCLArray2D(self.cl_ctx, nx, ny, halo_x, halo_y, H)
        self.cl_data = Common.SWEDataArkawaC(self.cl_ctx, nx, ny, halo_x, halo_y, eta0, hu0, hv0)

        #Store the parameters in the data formats expected by the OpenCL kernels
        self.nx, self.ny = np.int32(nx), np.int32(ny)
        self.dx, self.dy, self.dt = np.float32(dx), np.float32(dy), np.float32(dt)
        self.g, self.f = np.float32(g), np.float32(f)
        self.r, self.A = np.float32(r), np.float32(A)
        self.wind_stress = wind_stress

        #Simulation time starts at zero
        self.t = np.float32(0.0)

        #Kernel launch parameters: the global size is the domain size
        #rounded up to a whole number of work-groups
        self.local_size = (block_width, block_height)
        self.global_size = tuple( \
                int(np.ceil(cells / float(block)) * block) \
                for cells, block in zip((self.nx, self.ny), self.local_size))

    def __str__(self):
        return "Centered in time, centered in space"

    def step(self, t_end=0.0):
        """
        Steps the simulation forward by t_end, using timesteps of at most
        self.dt, and returns the simulation time afterwards.
        """
        max_steps = int(t_end / self.dt + 1)
        for step_no in range(max_steps):
            #Notation:
            # cl_data.u0 => U^{n-1} before U kernel, U^{n+1} after U kernel
            # cl_data.u1 => U^{n}
            # When we call cl_data.swap(), we swap these, so that
            # cl_data.u0 => U^{n}
            # cl_data.u1 => U^{n+1} (U kernel has been executed)
            # Now we are ready for the next time step

            #The last timestep is shortened so that we end exactly at t_end
            local_dt = np.float32(min(self.dt, t_end - step_no*self.dt))
            if (local_dt <= 0.0):
                break

            data = self.cl_data
            wind = self.wind_stress

            #Arguments shared between the kernel launches
            launch = (self.cl_queue, self.global_size, self.local_size)
            grid = (self.nx, self.ny, self.dx, self.dy, local_dt)
            wind_args = (wind.type, \
                         wind.tau0, wind.rho, wind.alpha, wind.xm, wind.Rc, \
                         wind.x0, wind.y0, \
                         wind.u0, wind.v0, \
                         self.t)

            self.eta_kernel.computeEtaKernel(*(launch + grid + ( \
                    self.g, self.f, self.r, \
                    data.h0.data, data.h0.pitch,      # eta^{n-1} => eta^{n+1}
                    data.hu1.data, data.hu1.pitch,    # U^{n}
                    data.hv1.data, data.hv1.pitch)))  # V^{n}

            self.u_kernel.computeUKernel(*(launch + grid + ( \
                    self.g, self.f, self.r, self.A, \
                    self.H.data, self.H.pitch, \
                    data.h1.data, data.h1.pitch,      # eta^{n}
                    data.hu0.data, data.hu0.pitch,    # U^{n-1} => U^{n+1}
                    data.hu1.data, data.hu1.pitch,    # U^{n}
                    data.hv1.data, data.hv1.pitch)    # V^{n}
                    + wind_args))

            self.v_kernel.computeVKernel(*(launch + grid + ( \
                    self.g, self.f, self.r, self.A, \
                    self.H.data, self.H.pitch, \
                    data.h1.data, data.h1.pitch,      # eta^{n}
                    data.hu1.data, data.hu1.pitch,    # U^{n}
                    data.hv0.data, data.hv0.pitch,    # V^{n-1} => V^{n+1}
                    data.hv1.data, data.hv1.pitch)    # V^{n}
                    + wind_args))

            #After the kernels, swap the data pointers
            self.cl_data.swap()
            self.t += local_dt
        return self.t

    def download(self):
        """
        Downloads the data held by self.cl_data using our command queue
        """
        return self.cl_data.download(self.cl_queue)
--- a/SWESimulators/CTCS2Layer.py
+++ b/SWESimulators/CTCS2Layer.py
@ -1,435 +0,0 @@
 # -*- coding: utf-8 -*-
 """
 This python module implements the Centered in Time, Centered in Space
 (leapfrog) numerical scheme for the shallow water equations, 
 described in 
 L. P. Røed, "Documentation of simple ocean models for use in ensemble
 predictions", Met no report 2012/3 and 2012/5 .
 Copyright (C) 2016  SINTEF ICT
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 """
 #Import packages we need
 import os
 import time
 import numpy as np
 import pyopencl as cl #OpenCL in Python
 """
 Class that holds data for the SW equations in OpenCL
 """
 class CTCS2LayerDataCL:
    """
    Holds the OpenCL buffers for a two-layer simulation: one depth buffer
    per layer, and two buffers (suffix _0 and _1) for each of eta, u and v
    per layer, together with the row pitch in bytes of each buffer.
    """

    @staticmethod
    def _as_float32_c(data, label):
        """
        Returns data as a C-contiguous single precision array.
        The array is converted (and a message containing label is printed)
        only when it is not already float32 in C-contiguous layout, which
        covers Fortran-ordered as well as strided input.
        """
        if (not np.issubdtype(data.dtype, np.float32) or not data.flags['C_CONTIGUOUS']):
            #Single-argument call form prints identically under Python 2 and 3
            print("Converting " + label)
            data = data.astype(np.float32, order='C')
        return data

    def __init__(self, cl_ctx, h1_0, eta1_0, u1_0, v1_0, \
                               h2_0, eta2_0, u2_0, v2_0):
        """
        Uploads initial data to the CL device

        cl_ctx: OpenCL context in which the buffers are created
        h1_0, eta1_0, h2_0, eta2_0: Arrays of shape (ny+2, nx+2)
        u1_0, u2_0: Arrays of shape (ny+2, nx+1)
        v1_0, v2_0: Arrays of shape (ny+1, nx+2)
        nx and ny are derived from the shape of h1_0.
        Raises AssertionError if the shapes are inconsistent.
        """
        #Make sure that the data is single precision floating point
        h1_0 = self._as_float32_c(h1_0, "H_0")
        eta1_0 = self._as_float32_c(eta1_0, "Eta_0")
        u1_0 = self._as_float32_c(u1_0, "U_0")
        v1_0 = self._as_float32_c(v1_0, "V_0")
        #Same for second (deepest) layer
        h2_0 = self._as_float32_c(h2_0, "H2_0")
        eta2_0 = self._as_float32_c(eta2_0, "Eta2_0")
        u2_0 = self._as_float32_c(u2_0, "U2_0")
        v2_0 = self._as_float32_c(v2_0, "V2_0")

        self.ny, self.nx = h1_0.shape
        self.nx = self.nx - 2 # Ghost cells
        self.ny = self.ny - 2

        assert(h1_0.shape == (self.ny+2, self.nx+2))
        assert(eta1_0.shape == (self.ny+2, self.nx+2))
        assert(u1_0.shape == (self.ny+2, self.nx+1))
        assert(v1_0.shape == (self.ny+1, self.nx+2))
        #Same for layer 2
        assert(h2_0.shape == (self.ny+2, self.nx+2))
        assert(eta2_0.shape == (self.ny+2, self.nx+2))
        assert(u2_0.shape == (self.ny+2, self.nx+1))
        assert(v2_0.shape == (self.ny+1, self.nx+2))

        mf = cl.mem_flags

        def upload(host_array):
            #Creates a read/write device buffer initialized with host_array
            return cl.Buffer(cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=host_array)

        def pitch(host_array):
            #Size in bytes of one row (4 bytes per element for float32)
            return np.int32(host_array.shape[1]*host_array.itemsize)

        #Upload data to the device
        #The _0 and _1 buffers of a variable both start out with the initial data
        self.h1_0 = upload(h1_0)
        self.eta1_0 = upload(eta1_0)
        self.eta1_1 = upload(eta1_0)
        self.u1_0 = upload(u1_0)
        self.u1_1 = upload(u1_0)
        self.v1_0 = upload(v1_0)
        self.v1_1 = upload(v1_0)
        #Same for layer 2
        self.h2_0 = upload(h2_0)
        self.eta2_0 = upload(eta2_0)
        self.eta2_1 = upload(eta2_0)
        self.u2_0 = upload(u2_0)
        self.u2_1 = upload(u2_0)
        self.v2_0 = upload(v2_0)
        self.v2_1 = upload(v2_0)

        #Compute pitches
        self.h1_0_pitch = pitch(h1_0)
        self.eta1_0_pitch = pitch(eta1_0)
        self.eta1_1_pitch = pitch(eta1_0)
        self.u1_0_pitch = pitch(u1_0)
        self.u1_1_pitch = pitch(u1_0)
        self.v1_0_pitch = pitch(v1_0)
        self.v1_1_pitch = pitch(v1_0)
        #Same for layer 2
        self.h2_0_pitch = pitch(h2_0)
        self.eta2_0_pitch = pitch(eta2_0)
        self.eta2_1_pitch = pitch(eta2_0)
        self.u2_0_pitch = pitch(u2_0)
        self.u2_1_pitch = pitch(u2_0)
        self.v2_0_pitch = pitch(v2_0)
        self.v2_1_pitch = pitch(v2_0)

    def swap(self):
        """
        Swaps the variables after a timestep has been completed
        (the _0 and _1 buffers of eta, u and v in both layers)
        """
        self.eta1_1, self.eta1_0 = self.eta1_0, self.eta1_1
        self.u1_1, self.u1_0 = self.u1_0, self.u1_1
        self.v1_1, self.v1_0 = self.v1_0, self.v1_1
        #Same for layer 2
        self.eta2_1, self.eta2_0 = self.eta2_0, self.eta2_1
        self.u2_1, self.u2_0 = self.u2_0, self.u2_1
        self.v2_1, self.v2_0 = self.v2_0, self.v2_1

    def download(self, cl_queue):
        """
        Enables downloading data from CL device to Python

        cl_queue: Command queue used for the copies
        Returns the tuple (eta1_1, u1_1, v1_1, eta2_1, u2_1, v2_1) of
        float32 host arrays copied from the buffers of the same names.
        """
        shape_eta = (self.ny+2, self.nx+2)
        shape_u = (self.ny+2, self.nx+1)
        shape_v = (self.ny+1, self.nx+2)
        sources = ((self.eta1_1, shape_eta), (self.u1_1, shape_u), (self.v1_1, shape_v), \
                   (self.eta2_1, shape_eta), (self.u2_1, shape_u), (self.v2_1, shape_v))

        downloaded = []
        for device_buffer, shape in sources:
            #Allocate data on the host for the result, then copy from the device
            host_array = np.empty(shape, dtype=np.float32, order='C')
            cl.enqueue_copy(cl_queue, host_array, device_buffer)
            downloaded.append(host_array)
        return tuple(downloaded)
 """
 Class that solves the SW equations using the Centered in time centered in space scheme
 """
 class CTCS2Layer:
    """
    Initialization routine
    h1_0: Water depth incl ghost cells, (nx+2)*(ny+2) cells
    eta1_0: Initial deviation from mean sea level incl ghost cells, (nx+2)*(ny+2) cells
    u1_0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+2) cells
    v1_0: Initial momentum along y-axis incl ghost cells, (nx+2)*(ny+1) cells
    h2_0: Water depth (layer 2) incl ghost cells, (nx+2)*(ny+2) cells
    eta2_0: Initial deviation from mean sea level (layer 2) incl ghost cells, (nx+2)*(ny+2) cells
    u2_0: Initial momentum (layer 2) along x-axis incl ghost cells, (nx+1)*(ny+2) cells
    v2_0: Initial momentum (layer 2) along y-axis incl ghost cells, (nx+2)*(ny+1) cells
    nx: Number of cells along x-axis
    ny: Number of cells along y-axis
    dx: Grid cell spacing along x-axis (20 000 m)
    dy: Grid cell spacing along y-axis (20 000 m)
    dt: Size of each timestep (90 s)
    g: Gravitational acceleration (9.81 m/s^2)
    f: Coriolis parameter (1.2e-4 s^-1)
    r1: Bottom friction coefficient (2.4e-3 m/s)
    r2: Inter-layer friction coefficient (m/s)
    A: Eddy viscosity coefficient (O(dx))
    rho1: Density of upper layer (1000.0 kg / m^3), must not exceed rho2
    rho2: Density of lower layer (1025.0 kg / m^3)
    wind_type: Type of wind stress, 0=Uniform along shore, 1=bell shaped along shore, 2=moving cyclone
    wind_tau0: Amplitude of wind stress (Pa)
    wind_alpha: Offshore e-folding length (1/(10*dx) = 5e-6 m^-1)
    wind_xm: Maximum wind stress for bell shaped wind stress
    wind_Rc: Distance to max wind stress from center of cyclone (10dx = 200 000 m)
    wind_x0: Initial x position of moving cyclone (dx*(nx/2) - u0*3600.0*48.0)
    wind_y0: Initial y position of moving cyclone (dy*(ny/2) - v0*3600.0*48.0)
    wind_u0: Translation speed along x for moving cyclone (30.0/sqrt(5.0))
    wind_v0: Translation speed along y for moving cyclone (-0.5*u0)
    """
    def __init__(self, \
                 h1_0, eta1_0, u1_0, v1_0, \
                 h2_0, eta2_0, u2_0, v2_0, \
                 nx, ny, \
                 dx, dy, dt, \
                 g, f, r1, r2, A, \
                 rho1, rho2,
                 wind_type=99, # "no wind" \
                 wind_tau0=0, wind_alpha=0, wind_xm=0, wind_Rc=0, \
                 wind_x0=0, wind_y0=0, \
                 wind_u0=0, wind_v0=0):
        """
        Creates the OpenCL context, command queue, kernels and device data,
        and stores the simulation parameters as np.int32 / np.float32 values
        so that they can be passed directly to the OpenCL kernels.

        h1_0, eta1_0, u1_0, v1_0: Initial data for the first layer
        h2_0, eta2_0, u2_0, v2_0: Initial data for the second layer
        nx, ny: Number of cells along the x- and y-axis
        dx, dy, dt: Grid cell spacing and size of each timestep
        g, f, r1, r2, A: Physical coefficients
        rho1, rho2: Layer densities, rho1 must not exceed rho2
        wind_*: Wind stress parameters, the default wind_type=99 means "no wind"

        Raises AssertionError if rho1 > rho2.
        """
        #Make sure we get compiler output from OpenCL
        os.environ["PYOPENCL_COMPILER_OUTPUT"] = "1"

        #Set which CL device to use
        #NOTE(review): this overwrites any PYOPENCL_CTX already set in the environment
        os.environ["PYOPENCL_CTX"] = "1"

        #Create OpenCL context
        self.cl_ctx = cl.create_some_context()
        #Single-argument call form prints the same text under Python 2 and 3
        print("Using  " + str(self.cl_ctx.devices[0].name))

        #Create an OpenCL command queue
        self.cl_queue = cl.CommandQueue(self.cl_ctx)

        #Get kernels
        self.u_kernel = self.get_kernel("CTCS2Layer_U_kernel.opencl")
        self.v_kernel = self.get_kernel("CTCS2Layer_V_kernel.opencl")
        self.eta_kernel = self.get_kernel("CTCS2Layer_eta_kernel.opencl")

        #Create data by uploading to device
        self.cl_data = CTCS2LayerDataCL(self.cl_ctx, h1_0, eta1_0, u1_0, v1_0, h2_0, eta2_0, u2_0, v2_0)

        #Save input parameters
        #Notice that we need to specify them in the correct dataformat for the
        #OpenCL kernel
        self.nx = np.int32(nx)
        self.ny = np.int32(ny)
        self.dx = np.float32(dx)
        self.dy = np.float32(dy)
        self.dt = np.float32(dt)
        self.g = np.float32(g)
        self.f = np.float32(f)
        self.r1 = np.float32(r1)
        self.r2 = np.float32(r2)
        self.A = np.float32(A)

        #The first layer must not be denser than the second
        assert(rho1 <= rho2)
        self.rho1 = np.float32(rho1)
        self.rho2 = np.float32(rho2)

        self.wind_type = np.int32(wind_type)
        self.wind_tau0 = np.float32(wind_tau0)
        self.wind_alpha = np.float32(wind_alpha)
        self.wind_xm = np.float32(wind_xm)
        self.wind_Rc = np.float32(wind_Rc)
        self.wind_x0 = np.float32(wind_x0)
        self.wind_y0 = np.float32(wind_y0)
        self.wind_u0 = np.float32(wind_u0)
        self.wind_v0 = np.float32(wind_v0)

        #Initialize time
        self.t = np.float32(0.0)

        #Compute kernel launch parameters
        #The global size is the domain size rounded up to a whole number of work-groups
        self.local_size = (8, 8) # WARNING::: MUST MATCH defines of block_width/height in kernels!
        self.global_size = ( \
                       int(np.ceil(self.nx / float(self.local_size[0])) * self.local_size[0]), \
                       int(np.ceil(self.ny / float(self.local_size[1])) * self.local_size[1]) \
                      ) 
    """
    Function which advances the simulation by t_end in simulation time, using timesteps of at most dt
    """
    def step(self, t_end=0.0):
        """
        Advances the simulation by t_end (in simulation time) and returns the
        accumulated simulation time self.t.

        The interval is covered with timesteps of length self.dt, where the
        last one is shortened so that the timesteps add up to t_end. Every
        timestep launches the eta, U and V kernels (in that order) and then
        swaps the time levels held by self.cl_data.

        Buffer notation (X is eta, u or v, and L is the layer number):
          cl_data.XL_0 holds time level n-1 before its kernel has run and
                       time level n+1 afterwards
          cl_data.XL_1 holds time level n
        After cl_data.swap() the _0 buffers hold level n and the _1 buffers
        hold level n+1, which is what the next timestep expects.
        """
        max_steps = int(t_end / self.dt + 1)

        for step_index in range(max_steps):
            #Shorten the timestep if a full dt would take us past t_end
            remaining = t_end-step_index*self.dt
            local_dt = np.float32(min(self.dt, remaining))
            if (local_dt <= 0.0):
                break

            buffers = self.cl_data

            #Argument groups shared by the kernel launches of this timestep
            launch_args = (self.cl_queue, self.global_size, self.local_size)
            grid_args = (self.nx, self.ny, self.dx, self.dy, local_dt)
            physics_args = (self.g, self.f, self.r1, self.r2, self.A, self.rho1, self.rho2)
            wind_args = (self.wind_type,
                         self.wind_tau0, self.wind_alpha, self.wind_xm, self.wind_Rc,
                         self.wind_x0, self.wind_y0,
                         self.wind_u0, self.wind_v0,
                         self.t)

            eta_args = launch_args + grid_args + (
                    #Layer 1
                    buffers.eta1_0, buffers.eta1_0_pitch,    # eta^{n-1} => eta^{n+1}
                    buffers.u1_1, buffers.u1_1_pitch,        # U^{n}
                    buffers.v1_1, buffers.v1_1_pitch,        # V^{n}
                    #Layer 2
                    buffers.eta2_0, buffers.eta2_0_pitch,
                    buffers.u2_1, buffers.u2_1_pitch,
                    buffers.v2_1, buffers.v2_1_pitch)
            self.eta_kernel.computeEtaKernel(*eta_args)

            u_args = launch_args + grid_args + physics_args + (
                    #Layer 1
                    buffers.h1_0, buffers.h1_0_pitch,
                    buffers.eta1_1, buffers.eta1_1_pitch,    # eta^{n}
                    buffers.u1_0, buffers.u1_0_pitch,        # U^{n-1} => U^{n+1}
                    buffers.u1_1, buffers.u1_1_pitch,        # U^{n}
                    buffers.v1_1, buffers.v1_1_pitch,        # V^{n}
                    #Layer 2
                    buffers.h2_0, buffers.h2_0_pitch,
                    buffers.eta2_1, buffers.eta2_1_pitch,
                    buffers.u2_0, buffers.u2_0_pitch,
                    buffers.u2_1, buffers.u2_1_pitch,
                    buffers.v2_1, buffers.v2_1_pitch) + wind_args
            self.u_kernel.computeUKernel(*u_args)

            v_args = launch_args + grid_args + physics_args + (
                    #Layer 1
                    buffers.h1_0, buffers.h1_0_pitch,
                    buffers.eta1_1, buffers.eta1_1_pitch,    # eta^{n}
                    buffers.u1_1, buffers.u1_1_pitch,        # U^{n}
                    buffers.v1_0, buffers.v1_0_pitch,        # V^{n-1} => V^{n+1}
                    buffers.v1_1, buffers.v1_1_pitch,        # V^{n}
                    #Layer 2
                    buffers.h2_0, buffers.h2_0_pitch,
                    buffers.eta2_1, buffers.eta2_1_pitch,
                    buffers.u2_1, buffers.u2_1_pitch,
                    buffers.v2_0, buffers.v2_0_pitch,
                    buffers.v2_1, buffers.v2_1_pitch) + wind_args
            self.v_kernel.computeVKernel(*v_args)

            #All three kernels have run: exchange the time levels and advance the clock
            buffers.swap()
            self.t += local_dt

        return self.t
    """
    Method which reads a kernel source file located next to this module and builds an OpenCL program from it
    """
    def get_kernel(self, kernel_filename):
        """
        Builds the OpenCL program stored in kernel_filename.

        kernel_filename: Name of the kernel source file, relative to the
                         directory that contains this module
        Returns the program built for self.cl_ctx.
        """
        #Kernel sources live in the same directory as this module
        here = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(here, kernel_filename), "r") as source_file:
            program = cl.Program(self.cl_ctx, source_file.read()).build()
        return program
    def download(self):
        """
        Returns whatever self.cl_data.download yields when called with this
        simulator's command queue.
        """
        device_data = self.cl_data
        return device_data.download(self.cl_queue)
--- a/SWESimulators/CTCS2Layer_U_kernel.opencl
+++ b/SWESimulators/CTCS2Layer_U_kernel.opencl
@ -1,414 +0,0 @@
 /**
 This OpenCL kernel implements part of the Centered in Time, Centered 
 in Space (leapfrog) numerical scheme for the shallow water equations, 
 described in 
 L. P. Røed, "Documentation of simple ocean models for use in ensemble
 predictions", Met no report 2012/3 and 2012/5 .
 Copyright (C) 2016  SINTEF ICT
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 //Work-group dimensions. These must match the local (work-group) size that
 //the host uses when it launches the kernel.
 #define block_height 8
 #define block_width 8
 //Local-memory tiles, indexed [row][column], as filled by computeUKernel:
 // eta_shmem (used for H and eta): the block plus one extra row to the south
 //   and to the north, and one extra column to the east
 typedef __local float eta_shmem[block_height+2][block_width+1];
 // u_shmem: the block plus one extra row and column on every side
 typedef __local float u_shmem[block_height+2][block_width+2];
 // v_shmem: the block plus one extra row to the south and one extra column
 //   to the east
 typedef __local float v_shmem[block_height+1][block_width+1];
 /**
  * Returns the x-component of the wind stress term (already divided by rho_)
  * for the calling work-item. Positions are derived from get_global_id().
  *
  * wind_stress_type_ selects the wind field:
  *   0: UNIFORM_ALONGSHORE
  *   1: BELL_SHAPED_ALONGSHORE
  *   2: MOVING_CYCLONE
  * Any other value gives 0.
  */
 float windStressX(int wind_stress_type_,
                float dx_, float dy_, float dt_,
                float tau0_, float rho_, float alpha_, float xm_, float Rc_,
                float x0_, float y0_,
                float u0_, float v0_,
                float t_) {
    //UNIFORM_ALONGSHORE: decays exponentially with y
    if (wind_stress_type_ == 0) {
        const float y_pos = (get_global_id(1)+0.5f)*dy_;
        return tau0_/rho_ * exp(-alpha_*y_pos);
    }

    //BELL_SHAPED_ALONGSHORE: only active while t_ <= 48*3600
    if (wind_stress_type_ == 1) {
        if (!(t_ <= 48.0f*3600.0f)) {
            return 0.0f;
        }
        //Scaled distance from xm_ along x
        const float scaled_dist = alpha_*((get_global_id(0)+0.5f)*dx_-xm_);
        const float scaled_dist_sq = scaled_dist*scaled_dist;
        const float y_pos = (get_global_id(1)+0.5f)*dy_;
        return tau0_/rho_ * exp(-scaled_dist_sq) * exp(-alpha_*y_pos);
    }

    //MOVING_CYCLONE: centre starts in (x0_, y0_) and moves with velocity (u0_, v0_)
    if (wind_stress_type_ == 2) {
        const float x_pos = (get_global_id(0))*dx_;
        const float y_pos = (get_global_id(1)+0.5f)*dy_;
        //Offset from the cyclone centre at time t_+dt_
        const float off_x = (x_pos-x0_-u0_*(t_+dt_));
        const float off_x_sq = off_x*off_x;
        const float off_y = (y_pos-y0_-v0_*(t_+dt_));
        const float off_y_sq = off_y*off_y;
        const float radius = sqrt(off_x_sq+off_y_sq);
        const float rel = 1.0f - radius/Rc_;
        const float rel_sq = rel*rel;
        return -(tau0_/rho_) * (off_y/Rc_) * exp(-0.5f*rel_sq);
    }

    //Unknown type: no wind
    return 0.0f;
 }
 /**
  * Kernel that evolves U (for both layers) one step in time.
  *
  * Each work-item handles the U value at index (ti, tj): the start of its
  * work-group plus its local id, shifted by one to skip the global ghost
  * cells. The work-group first loads tiles of H, eta, U^n and V^n into local
  * memory (indices outside the valid range are clamped), synchronizes, and
  * then every work-item evaluates the update. For 0 < ti < nx_ and
  * 0 < tj < ny_+1 the result U^{n+1} overwrites U^{n-1} in U1_0 / U2_0.
  *
  * All *_pitch_ arguments are row strides in bytes (rows are addressed
  * through a char pointer).
  */
 __kernel void computeUKernel(
        //Discretization parameters
        int nx_, int ny_,
        float dx_, float dy_, float dt_,
        //Physical parameters
        float g_, //< Gravitational constant
        float f_, //< Coriolis coefficient
        float r1_, //< Inter-layer friction coefficient
        float r2_, //< Bottom friction coefficient
        //Numerical diffusion
        float A_,
        //Density of each layer
        float rho1_,
        float rho2_,
        //Data for layer 1
        __global float* H1_ptr_, int H1_pitch_,
        __global float* eta1_1_ptr_, int eta1_1_pitch_, // eta^n
        __global float* U1_0_ptr_, int U1_0_pitch_, // U^n-1, also output, U^n+1
        __global float* U1_1_ptr_, int U1_1_pitch_, // U^n
        __global float* V1_1_ptr_, int V1_1_pitch_, // V^n
        //Data for layer 2
        __global float* H2_ptr_, int H2_pitch_,
        __global float* eta2_1_ptr_, int eta2_1_pitch_, // eta^n
        __global float* U2_0_ptr_, int U2_0_pitch_, // U^n-1, also output, U^n+1
        __global float* U2_1_ptr_, int U2_1_pitch_, // U^n
        __global float* V2_1_ptr_, int V2_1_pitch_, // V^n
        // Wind stress parameters
        int wind_stress_type_,
        float tau0_, float alpha_, float xm_, float Rc_,
        float x0_, float y0_,
        float u0_, float v0_,
        float t_) {
    //Local-memory tiles shared by the work-group (sizes: see the typedefs)
    eta_shmem H1_shared;
    eta_shmem eta1_shared;
    u_shmem U1_shared;
    v_shmem V1_shared;
    eta_shmem H2_shared;
    eta_shmem eta2_shared;
    u_shmem U2_shared;
    v_shmem V2_shared;
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    //Start of block within domain
    const int bx = get_local_size(0) * get_group_id(0) + 1; //Skip global ghost cells
    const int by = get_local_size(1) * get_group_id(1) + 1; //Skip global ghost cells
    //Index of cell within domain
    const int ti = bx + tx;
    const int tj = by + ty;
    //Compute pointer to current row in the U array
    //(only pointer arithmetic here; the rows are dereferenced inside the bounds checks)
    __global float* const U1_0_row = (__global float*) ((__global char*) U1_0_ptr_ + U1_0_pitch_*tj);
    __global float* const U2_0_row = (__global float*) ((__global char*) U2_0_ptr_ + U2_0_pitch_*tj);
    //Read current U (U^{n-1}); work-items outside the written range keep 0
    float U1_0 = 0.0f;
    float U2_0 = 0.0f;
    if (ti > 0 && ti < nx_ && tj > 0 && tj < ny_+1) {
        U1_0 = U1_0_row[ti];
        U2_0 = U2_0_row[ti];
    }
    //Read H and eta into shared memory: (block_width+1)*(block_height+2) values per array
    //Rows by-1..by+block_height and columns bx..bx+block_width, clamped
    for (int j=ty; j<block_height+2; j+=get_local_size(1)) {
        // "fake" global ghost cells by clamping
        const int l = clamp(by + j - 1, 1, ny_);
        //Compute the pointer to current row in the H and eta arrays
        __global float* const H1_row = (__global float*) ((__global char*) H1_ptr_ + H1_pitch_*l);
        __global float* const H2_row = (__global float*) ((__global char*) H2_ptr_ + H2_pitch_*l);
        __global float* const eta1_1_row = (__global float*) ((__global char*) eta1_1_ptr_ + eta1_1_pitch_*l);
        __global float* const eta2_1_row = (__global float*) ((__global char*) eta2_1_ptr_ + eta2_1_pitch_*l);
        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
            // "fake" global ghost cells by clamping
            const int k = clamp(bx + i, 1, nx_);
            H1_shared[j][i] = H1_row[k];
            H2_shared[j][i] = H2_row[k];
            eta1_shared[j][i] = eta1_1_row[k];
            eta2_shared[j][i] = eta2_1_row[k];
        }
    }
    //Read U into shared memory: (block_width+2)*(block_height+2) values per array
    //Rows by-1..by+block_height and columns bx-1..bx+block_width, clamped
    for (int j=ty; j<block_height+2; j+=get_local_size(1)) {
        // "fake" ghost cells by clamping
        const int l = clamp(by + j - 1, 1, ny_);
        //Compute the pointer to current row in the U array
        __global float* const U1_1_row = (__global float*) ((__global char*) U1_1_ptr_ + U1_1_pitch_*l);
        __global float* const U2_1_row = (__global float*) ((__global char*) U2_1_ptr_ + U2_1_pitch_*l);
        for (int i=tx; i<block_width+2; i+=get_local_size(0)) {
            // Prevent out-of-bounds
            const int k = clamp(bx + i - 1, 0, nx_);
            U1_shared[j][i] = U1_1_row[k];
            U2_shared[j][i] = U2_1_row[k];
        }
    }
    //Read V into shared memory: (block_width+1)*(block_height+1) values per array
    //Rows by-1..by+block_height-1 and columns bx..bx+block_width, clamped
    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
        // Prevent out-of-bounds
        const int l = clamp(by + j - 1, 0, ny_);
        //Compute the pointer to current row in the V array
        __global float* const V1_1_row = (__global float*) ((__global char*) V1_1_ptr_ + V1_1_pitch_*l);
        __global float* const V2_1_row = (__global float*) ((__global char*) V2_1_ptr_ + V2_1_pitch_*l);
        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
            // "fake" ghost cells by clamping
            const int k = clamp(bx + i, 1, nx_);
            V1_shared[j][i] = V1_1_row[k];
            V2_shared[j][i] = V2_1_row[k];
        }
    }
    //Make sure all threads have read into shared mem
    barrier(CLK_LOCAL_MEM_FENCE);
    /**
      * Now get all our required variables as short-hands
      * here we use the notation of
      *  Var1_00 as var_i,j for layer 1
      *  Var2_p0 as var_i+1,j for layer 2
      *  Var1_0m as var_i,j-1 for layer 1
      * etc
      * The tile offsets differ per variable because the tiles start at
      * different global indices (see the load loops above).
      */
    //Layer 1
    const float U1_00 = U1_shared[ty+1][tx+1]; //U at "center"
    const float U1_0p = U1_shared[ty+2][tx+1]; //U at "north"
    const float U1_0m = U1_shared[ty  ][tx+1]; //U at "south"
    const float U1_p0 = U1_shared[ty+1][tx+2]; //U at "east"
    const float U1_m0 = U1_shared[ty+1][tx  ]; //U at "west"
    const float V1_00 = V1_shared[ty+1][tx  ];
    const float V1_p0 = V1_shared[ty+1][tx+1];
    const float V1_0m = V1_shared[ty  ][tx  ];
    const float V1_pm = V1_shared[ty  ][tx+1];
    const float H1_0m = H1_shared[ty  ][tx  ];
    const float H1_00 = H1_shared[ty+1][tx  ];
    const float H1_0p = H1_shared[ty+2][tx  ];
    const float H1_pm = H1_shared[ty  ][tx+1];
    const float H1_p0 = H1_shared[ty+1][tx+1];
    const float H1_pp = H1_shared[ty+2][tx+1];
    const float eta1_0m = eta1_shared[ty  ][tx  ];
    const float eta1_00 = eta1_shared[ty+1][tx  ];
    const float eta1_0p = eta1_shared[ty+2][tx  ];
    const float eta1_pm = eta1_shared[ty  ][tx+1];
    const float eta1_p0 = eta1_shared[ty+1][tx+1];
    const float eta1_pp = eta1_shared[ty+2][tx+1];
    //Layer 2 (bottom)
    const float U2_00 = U2_shared[ty+1][tx+1];
    const float U2_0p = U2_shared[ty+2][tx+1];
    const float U2_0m = U2_shared[ty  ][tx+1];
    const float U2_p0 = U2_shared[ty+1][tx+2];
    const float U2_m0 = U2_shared[ty+1][tx  ];
    const float V2_00 = V2_shared[ty+1][tx  ];
    const float V2_p0 = V2_shared[ty+1][tx+1];
    const float V2_0m = V2_shared[ty  ][tx  ];
    const float V2_pm = V2_shared[ty  ][tx+1];
    const float H2_0m = H2_shared[ty  ][tx  ];
    const float H2_00 = H2_shared[ty+1][tx  ];
    const float H2_0p = H2_shared[ty+2][tx  ];
    const float H2_pm = H2_shared[ty  ][tx+1];
    const float H2_p0 = H2_shared[ty+1][tx+1];
    const float H2_pp = H2_shared[ty+2][tx+1];
    const float eta2_0m = eta2_shared[ty  ][tx  ];
    const float eta2_00 = eta2_shared[ty+1][tx  ];
    const float eta2_0p = eta2_shared[ty+2][tx  ];
    const float eta2_pm = eta2_shared[ty  ][tx+1];
    const float eta2_p0 = eta2_shared[ty+1][tx+1];
    const float eta2_pp = eta2_shared[ty+2][tx+1];
    //Reconstruct Eta_bar as four-cell averages over columns i..i+1:
    //bar_0m covers rows j-1..j, bar_00 covers rows j..j+1
    const float eta1_bar_0m = 0.25f*(eta1_0m + eta1_pm + eta1_00 + eta1_p0);
    const float eta1_bar_00 = 0.25f*(eta1_00 + eta1_p0 + eta1_0p + eta1_pp);
    const float eta2_bar_0m = 0.25f*(eta2_0m + eta2_pm + eta2_00 + eta2_p0);
    const float eta2_bar_00 = 0.25f*(eta2_00 + eta2_p0 + eta2_0p + eta2_pp);
    //Reconstruct H_bar (same four-cell averages) and H_x (at the U position)
    const float H1_bar_0m = 0.25f*(H1_0m + H1_pm + H1_00 + H1_p0);
    const float H1_bar_00 = 0.25f*(H1_00 + H1_p0 + H1_0p + H1_pp);
    const float H1_x = 0.5f*(H1_00 + H1_p0);
    const float H2_bar_0m = 0.25f*(H2_0m + H2_pm + H2_00 + H2_p0);
    const float H2_bar_00 = 0.25f*(H2_00 + H2_p0 + H2_0p + H2_pp);
    const float H2_x = 0.5f*(H2_00 + H2_p0);
    //Compute layer thicknesses:
    //top layer h1 = H1 + eta1 - eta2, bottom layer h2 = H2 + eta2
    const float h1_p0 = H1_p0 + eta1_p0 - eta2_p0;
    const float h1_00 = H1_00 + eta1_00 - eta2_00;
    const float h1_bar_0m = H1_bar_0m + eta1_bar_0m - eta2_bar_0m;
    const float h1_bar_00 = H1_bar_00 + eta1_bar_00 - eta2_bar_00;
    const float h2_p0 = H2_p0 + eta2_p0;
    const float h2_00 = H2_00 + eta2_00;
    const float h2_bar_0m = H2_bar_0m + eta2_bar_0m;
    const float h2_bar_00 = H2_bar_00 + eta2_bar_00;
    //Compute pressure components
    //(layer thickness averaged to the U position, then differences of eta in x)
    const float h1_x = 0.5f*(h1_p0 + h1_00);
    const float h2_x = 0.5f*(h2_p0 + h2_00);
    //Earlier formulation, kept for reference:
    //const float epsilon = (rho2_ - rho1_)/rho2_;
    //const float P1_x = -g_*h1_x * (eta1_p0 - eta1_00 + h2_p0 - h2_00) * (1.0f - epsilon);
    //const float P2_x = -g_*h2_x * (eta2_p0 - eta2_00 + H2_p0 - H2_00);
    const float P1_x = - g_*h1_x*(eta1_p0 - eta1_00) - 0.5f*g_*(eta1_p0*eta1_p0 - eta1_00*eta1_00);
    //The bottom layer combines an eta1 term weighted by rho1/rho2 and an
    //eta2 term weighted by (rho2-rho1)/rho2
    const float P2_x = - g_ * (rho1_/rho2_) *
                            ( //Pressure contribution from top layer
                            h2_x*(eta1_p0 - eta1_00) + 0.5f*(eta1_p0*eta1_p0 - eta1_00*eta1_00)
                            )
                       - g_ * ((rho2_ - rho1_)/rho2_) *
                            ( //Pressure contribution from bottom layer
                            h2_x*(eta2_p0 - eta2_00) + 0.5f*(eta2_p0*eta2_p0 - eta2_00*eta2_00)
                            );
    //Reconstruct V at the U position
    const float V1_bar = 0.25f*(V1_0m + V1_00 + V1_pm + V1_p0);
    const float V2_bar = 0.25f*(V2_0m + V2_00 + V2_pm + V2_p0);
    //Calculate the bottom and/or inter-layer friction coefficient
    //FIXME: Should this be h instead of H?
    //NOTE(review): C1 is used as the coupling coefficient in BOTH layer
    //updates below, while C2 only enters the denominator of the layer 2
    //update - confirm that this is intended.
    const float C1 = r1_/H1_x;
    const float C2 = r2_/H2_x;
    //Calculate numerical diffusion / subgrid energy loss coefficient
    //D equals 2*A_*dt_*(1/dx_^2 + 1/dy_^2)
    const float D = 2.0f*A_*dt_*(dx_*dx_ + dy_*dy_)/(dx_*dx_*dy_*dy_);
    //Calculate nonlinear effects
    const float N1_a = (U1_p0 + U1_00)*(U1_p0 + U1_00) / (h1_p0);
    const float N1_b = (U1_00 + U1_m0)*(U1_00 + U1_m0) / (h1_00);
    const float N1_c = (U1_0p + U1_00)*(V1_p0 + V1_00) / (h1_bar_00);
    const float N1_d = (U1_00 + U1_0m)*(V1_pm + V1_0m) / (h1_bar_0m);
    const float N1 = 0.25f*( N1_a - N1_b + (dx_/dy_)*(N1_c - N1_d) );
    const float N2_a = (U2_p0 + U2_00)*(U2_p0 + U2_00) / (h2_p0);
    const float N2_b = (U2_00 + U2_m0)*(U2_00 + U2_m0) / (h2_00);
    const float N2_c = (U2_0p + U2_00)*(V2_p0 + V2_00) / (h2_bar_00);
    const float N2_d = (U2_00 + U2_0m)*(V2_pm + V2_0m) / (h2_bar_0m);
    const float N2 = 0.25f*( N2_a - N2_b + (dx_/dy_)*(N2_c - N2_d) );
    //Calculate eddy viscosity terms
    //The center value uses U^{n-1} (U*_0) with coefficient 1 instead of
    //2*U^{n}; the matching U^{n+1} part is accounted for by the division by
    //(1 + D) in the update below
    const float E1 = (U1_p0 - U1_0 + U1_m0)/(dx_*dx_) + (U1_0p - U1_0 + U1_0m)/(dy_*dy_);
    const float E2 = (U2_p0 - U2_0 + U2_m0)/(dx_*dx_) + (U2_0p - U2_0 + U2_0m)/(dy_*dy_);
    //Calculate the wind shear stress for the top layer
    const float X = windStressX(
        wind_stress_type_,
        dx_, dy_, dt_,
        tau0_, rho1_, alpha_, xm_, Rc_,
        x0_, y0_,
        u0_, v0_,
        t_);
    //Compute U at the next timestep
    //Centered in time: U^{n+1} = U^{n-1} + 2*dt*(...); the wind stress X is
    //only applied to layer 1
    float U1_2 = (U1_0 + 2.0f*dt_*(f_*V1_bar + (N1 + P1_x)/dx_ + X + C1*U2_0 + A_*E1) ) / (1.0f + D);
    float U2_2 = (U2_0 + 2.0f*dt_*(f_*V2_bar + (N2 + P2_x)/dx_     + C1*U1_0 + A_*E2) ) / (1.0f + 2.0f*dt_*C2 + D);
    //Write to main memory for internal cells
    if (ti > 0 && ti < nx_ && tj > 0 && tj < ny_+1) {
        U1_0_row[ti] = U1_2;
        U2_0_row[ti] = U2_2;
    }
 }
--- a/SWESimulators/CTCS2Layer_V_kernel.opencl
+++ b/SWESimulators/CTCS2Layer_V_kernel.opencl
@ -1,395 +0,0 @@
 /**
 This OpenCL kernel implements part of the Centered in Time, Centered 
 in Space (leapfrog) numerical scheme for the shallow water equations, 
 described in 
 L. P. Røed, "Documentation of simple ocean models for use in ensemble
 predictions", Met no report 2012/3 and 2012/5 .
 Copyright (C) 2016  SINTEF ICT
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 //Work-group dimensions. These must match the local (work-group) size that
 //the host uses when it launches the kernel.
 #define block_height 8
 #define block_width 8
 //Local-memory tiles, indexed [row][column], as filled by computeVKernel:
 // eta_shmem (used for H and eta): the block plus one extra row to the north
 //   and one extra column to the west and to the east
 typedef __local float eta_shmem[block_height+1][block_width+2];
 // u_shmem: the block plus one extra row to the north and one extra column
 //   to the west
 typedef __local float u_shmem[block_height+1][block_width+1];
 // v_shmem: the block plus one extra row and column on every side
 typedef __local float v_shmem[block_height+2][block_width+2];
 /**
  * Returns the y-component of the wind stress term (already divided by rho_)
  * for the calling work-item. Positions are derived from get_global_id().
  *
  * Only wind_stress_type_ == 2 (MOVING_CYCLONE) has a y-component; every
  * other type gives 0.
  */
 float windStressY(int wind_stress_type_,
                float dx_, float dy_, float dt_,
                float tau0_, float rho_, float alpha_, float xm_, float Rc_,
                float x0_, float y0_,
                float u0_, float v0_,
                float t_) {
    //No y-component unless the wind field is the moving cyclone
    if (wind_stress_type_ != 2) {
        return 0.0f;
    }

    //MOVING_CYCLONE: centre starts in (x0_, y0_) and moves with velocity (u0_, v0_)
    const float x_pos = (get_global_id(0)+0.5f)*dx_;
    const float y_pos = (get_global_id(1))*dy_;
    //Offset from the cyclone centre at time t_+dt_
    const float off_x = (x_pos-x0_-u0_*(t_+dt_));
    const float off_x_sq = off_x*off_x;
    const float off_y = (y_pos-y0_-v0_*(t_+dt_));
    const float off_y_sq = off_y*off_y;
    const float radius = sqrt(off_x_sq+off_y_sq);
    const float rel = 1.0f - radius/Rc_;
    const float rel_sq = rel*rel;
    return (tau0_/rho_) * (off_x/Rc_) * exp(-0.5f*rel_sq);
 }
 /**
  * Kernel that evolves V (for both layers) one step in time.
  *
  * Each work-item handles the V value at index (ti, tj): the start of its
  * work-group plus its local id, shifted by one to skip the global ghost
  * cells. The work-group first loads tiles of H, eta, U^n and V^n into local
  * memory (indices outside the valid range are clamped), synchronizes, and
  * then every work-item evaluates the update. For 0 < ti < nx_+1 and
  * 0 < tj < ny_ the result V^{n+1} overwrites V^{n-1} in V1_0 / V2_0.
  *
  * All *_pitch_ arguments are row strides in bytes (rows are addressed
  * through a char pointer).
  */
 __kernel void computeVKernel(
        //Discretization parameters
        int nx_, int ny_,
        float dx_, float dy_, float dt_,
        //Physical parameters
        float g_, //< Gravitational constant
        float f_, //< Coriolis coefficient
        float r1_, //< Inter-layer friction coefficient
        float r2_, //< Bottom friction coefficient
        //Numerical diffusion
        float A_,
        //Density of each layer
        float rho1_,
        float rho2_,
        //Data for layer 1
        __global float* H1_ptr_, int H1_pitch_,
        __global float* eta1_1_ptr_, int eta1_1_pitch_, // eta^n
        __global float* U1_1_ptr_, int U1_1_pitch_,     // U^n
        __global float* V1_0_ptr_, int V1_0_pitch_,     // V^n-1, also output V^n+1
        __global float* V1_1_ptr_, int V1_1_pitch_,     // V^n
        //Data for layer 2 (same roles as the layer 1 arguments)
        __global float* H2_ptr_, int H2_pitch_,
        __global float* eta2_1_ptr_, int eta2_1_pitch_,
        __global float* U2_1_ptr_, int U2_1_pitch_,
        __global float* V2_0_ptr_, int V2_0_pitch_,
        __global float* V2_1_ptr_, int V2_1_pitch_,
        // Wind stress parameters
        int wind_stress_type_,
        float tau0_, float alpha_, float xm_, float Rc_,
        float x0_, float y0_,
        float u0_, float v0_,
        float t_) {
    //Local-memory tiles shared by the work-group (sizes: see the typedefs)
    eta_shmem H1_shared;
    eta_shmem eta1_shared;
    u_shmem U1_shared;
    v_shmem V1_shared;
    eta_shmem H2_shared;
    eta_shmem eta2_shared;
    u_shmem U2_shared;
    v_shmem V2_shared;
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    //Start of block within domain
    const int bx = get_local_size(0) * get_group_id(0) + 1; //Skip global ghost cells
    const int by = get_local_size(1) * get_group_id(1) + 1; //Skip global ghost cells
    //Index of cell within domain
    const int ti = bx + tx;
    const int tj = by + ty;
    //Compute pointer to current row in the V array
    //(only pointer arithmetic here; the rows are dereferenced inside the bounds checks)
    __global float* const V1_0_row = (__global float*) ((__global char*) V1_0_ptr_ + V1_0_pitch_*tj);
    __global float* const V2_0_row = (__global float*) ((__global char*) V2_0_ptr_ + V2_0_pitch_*tj);
    //Read current V (V^{n-1}); work-items outside the written range keep 0
    float V1_0 = 0.0f;
    float V2_0 = 0.0f;
    if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_) {
        V1_0 = V1_0_row[ti];
        V2_0 = V2_0_row[ti];
    }
    //Read H and eta into shared memory: (block_width+2)*(block_height+1) values per array
    //Rows by..by+block_height and columns bx-1..bx+block_width, clamped
    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
        // "fake" global ghost cells by clamping
        const int l = clamp(by + j, 1, ny_);
        //Compute the pointer to current row in the H and eta arrays
        __global float* const H1_row = (__global float*) ((__global char*) H1_ptr_ + H1_pitch_*l);
        __global float* const H2_row = (__global float*) ((__global char*) H2_ptr_ + H2_pitch_*l);
        __global float* const eta1_1_row = (__global float*) ((__global char*) eta1_1_ptr_ + eta1_1_pitch_*l);
        __global float* const eta2_1_row = (__global float*) ((__global char*) eta2_1_ptr_ + eta2_1_pitch_*l);
        for (int i=tx; i<block_width+2; i+=get_local_size(0)) {
            // "fake" global ghost cells by clamping
            const int k = clamp(bx + i - 1, 1, nx_);
            H1_shared[j][i] = H1_row[k];
            H2_shared[j][i] = H2_row[k];
            eta1_shared[j][i] = eta1_1_row[k];
            eta2_shared[j][i] = eta2_1_row[k];
        }
    }
    //Read U into shared memory: (block_width+1)*(block_height+1) values per array
    //Rows by..by+block_height and columns bx-1..bx+block_width-1, clamped
    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
        // "fake" ghost cells by clamping
        const int l = clamp(by + j, 1, ny_);
        //Compute the pointer to current row in the U array
        __global float* const U1_1_row = (__global float*) ((__global char*) U1_1_ptr_ + U1_1_pitch_*l);
        __global float* const U2_1_row = (__global float*) ((__global char*) U2_1_ptr_ + U2_1_pitch_*l);
        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
            // Prevent out-of-bounds
            const int k = clamp(bx + i - 1, 0, nx_);
            U1_shared[j][i] = U1_1_row[k];
            U2_shared[j][i] = U2_1_row[k];
        }
    }
    //Read V into shared memory: (block_width+2)*(block_height+2) values per array
    //Rows by-1..by+block_height and columns bx-1..bx+block_width, clamped
    for (int j=ty; j<block_height+2; j+=get_local_size(1)) {
        // Prevent out-of-bounds
        const int l = clamp(by + j - 1, 0, ny_);
        //Compute the pointer to current row in the V array
        __global float* const V1_1_row = (__global float*) ((__global char*) V1_1_ptr_ + V1_1_pitch_*l);
        __global float* const V2_1_row = (__global float*) ((__global char*) V2_1_ptr_ + V2_1_pitch_*l);
        for (int i=tx; i<block_width+2; i+=get_local_size(0)) {
            // "fake" ghost cells by clamping
            const int k = clamp(bx + i - 1, 1, nx_);
            V1_shared[j][i] = V1_1_row[k];
            V2_shared[j][i] = V2_1_row[k];
        }
    }
    //Make sure all threads have read into shared mem
    barrier(CLK_LOCAL_MEM_FENCE);
    /**
      * Now get all our required variables as short-hands
      * here we use the notation of
      *  Var_00 as var_i,j
      *  Var_p0 as var_i+1,j
      *  Var_0m as var_i,j-1
      * etc
      * The tile offsets differ per variable because the tiles start at
      * different global indices (see the load loops above).
      */
    //Layer 1
    const float V1_00 = V1_shared[ty+1][tx+1]; //V at "center"
    const float V1_0p = V1_shared[ty+2][tx+1]; //V at "north"
    const float V1_0m = V1_shared[ty  ][tx+1]; //V at "south"
    const float V1_p0 = V1_shared[ty+1][tx+2]; //V at "east"
    const float V1_m0 = V1_shared[ty+1][tx  ]; //V at "west"
    const float U1_00 = U1_shared[ty  ][tx+1];
    const float U1_0p = U1_shared[ty+1][tx+1];
    const float U1_m0 = U1_shared[ty  ][tx  ];
    const float U1_mp = U1_shared[ty+1][tx  ];
    const float H1_m0 = H1_shared[ty  ][tx  ];
    const float H1_00 = H1_shared[ty  ][tx+1];
    const float H1_p0 = H1_shared[ty  ][tx+2];
    const float H1_mp = H1_shared[ty+1][tx  ];
    const float H1_0p = H1_shared[ty+1][tx+1];
    const float H1_pp = H1_shared[ty+1][tx+2];
    const float eta1_m0 = eta1_shared[ty  ][tx  ];
    const float eta1_00 = eta1_shared[ty  ][tx+1];
    const float eta1_p0 = eta1_shared[ty  ][tx+2];
    const float eta1_mp = eta1_shared[ty+1][tx  ];
    const float eta1_0p = eta1_shared[ty+1][tx+1];
    const float eta1_pp = eta1_shared[ty+1][tx+2];
    //Layer 2 (bottom)
    const float V2_00 = V2_shared[ty+1][tx+1];
    const float V2_0p = V2_shared[ty+2][tx+1];
    const float V2_0m = V2_shared[ty  ][tx+1];
    const float V2_p0 = V2_shared[ty+1][tx+2];
    const float V2_m0 = V2_shared[ty+1][tx  ];
    const float U2_00 = U2_shared[ty  ][tx+1];
    const float U2_0p = U2_shared[ty+1][tx+1];
    const float U2_m0 = U2_shared[ty  ][tx  ];
    const float U2_mp = U2_shared[ty+1][tx  ];
    const float H2_m0 = H2_shared[ty  ][tx  ];
    const float H2_00 = H2_shared[ty  ][tx+1];
    const float H2_p0 = H2_shared[ty  ][tx+2];
    const float H2_mp = H2_shared[ty+1][tx  ];
    const float H2_0p = H2_shared[ty+1][tx+1];
    const float H2_pp = H2_shared[ty+1][tx+2];
    const float eta2_m0 = eta2_shared[ty  ][tx  ];
    const float eta2_00 = eta2_shared[ty  ][tx+1];
    const float eta2_p0 = eta2_shared[ty  ][tx+2];
    const float eta2_mp = eta2_shared[ty+1][tx  ];
    const float eta2_0p = eta2_shared[ty+1][tx+1];
    const float eta2_pp = eta2_shared[ty+1][tx+2];
    //Reconstruct Eta_bar as four-cell averages over rows j..j+1:
    //bar_m0 covers columns i-1..i, bar_00 covers columns i..i+1
    const float eta1_bar_m0 = 0.25f*(eta1_m0 + eta1_mp + eta1_00 + eta1_0p);
    const float eta1_bar_00 = 0.25f*(eta1_00 + eta1_0p + eta1_p0 + eta1_pp);
    const float eta2_bar_m0 = 0.25f*(eta2_m0 + eta2_mp + eta2_00 + eta2_0p);
    const float eta2_bar_00 = 0.25f*(eta2_00 + eta2_0p + eta2_p0 + eta2_pp);
    //Reconstruct H_bar (same four-cell averages) and H_y (at the V position)
    const float H1_bar_m0 = 0.25f*(H1_m0 + H1_mp + H1_00 + H1_0p);
    const float H1_bar_00 = 0.25f*(H1_00 + H1_0p + H1_p0 + H1_pp);
    const float H1_y = 0.5f*(H1_00 + H1_0p);
    const float H2_bar_m0 = 0.25f*(H2_m0 + H2_mp + H2_00 + H2_0p);
    const float H2_bar_00 = 0.25f*(H2_00 + H2_0p + H2_p0 + H2_pp);
    const float H2_y = 0.5f*(H2_00 + H2_0p);
    //Compute layer thicknesses:
    //top layer h1 = H1 + eta1 - eta2, bottom layer h2 = H2 + eta2
    const float h1_0p = H1_0p + eta1_0p - eta2_0p;
    const float h1_00 = H1_00 + eta1_00 - eta2_00;
    const float h1_bar_00 = H1_bar_00 + eta1_bar_00 - eta2_bar_00;
    const float h1_bar_m0 = H1_bar_m0 + eta1_bar_m0 - eta2_bar_m0;
    const float h2_0p = H2_0p + eta2_0p;
    const float h2_00 = H2_00 + eta2_00;
    const float h2_bar_00 = H2_bar_00 + eta2_bar_00;
    const float h2_bar_m0 = H2_bar_m0 + eta2_bar_m0;
    //Compute pressure components
    //(layer thickness averaged to the V position, then differences of eta in y)
    const float h1_y = 0.5f*(h1_0p + h1_00);
    const float h2_y = 0.5f*(h2_0p + h2_00);
    //Earlier formulation, kept for reference:
    //const float epsilon = (rho2_ - rho1_)/rho2_;
    //const float P1_y = -0.5f*g_*(h1_0p + h1_00) * (eta1_0p - eta1_00 + h2_0p - h2_00) * (1.0f - epsilon);
    //const float P2_y = -0.5f*g_*(h2_0p + h2_00) * (eta2_0p - eta2_00 + H2_0p - H2_00);
    const float P1_y = -g_*h1_y*(eta1_0p - eta1_00) - 0.5f*g_*(eta1_0p*eta1_0p - eta1_00*eta1_00);
    //The bottom layer combines an eta1 term weighted by rho1/rho2 and an
    //eta2 term weighted by (rho2-rho1)/rho2
    const float P2_y = -g_ * (rho1_/rho2_) *
                            ( //Pressure contribution from top layer
                            h2_y*(eta1_0p - eta1_00) + 0.5f*(eta1_0p*eta1_0p - eta1_00*eta1_00)
                            )
                       -g_ * ((rho2_ - rho1_)/rho2_) *
                            ( //Pressure contribution from bottom layer
                            h2_y*(eta2_0p - eta2_00) + 0.5f*(eta2_0p*eta2_0p - eta2_00*eta2_00)
                            );
    //Reconstruct U at the V position
    const float U1_bar = 0.25f*(U1_m0 + U1_00 + U1_mp + U1_0p);
    const float U2_bar = 0.25f*(U2_m0 + U2_00 + U2_mp + U2_0p);
    //Calculate the friction coefficient
    //FIXME: Should this be h instead of H?
    //NOTE(review): C1 is used as the coupling coefficient in BOTH layer
    //updates below, while C2 only enters the denominator of the layer 2
    //update - confirm that this is intended.
    const float C1 = r1_/H1_y;
    const float C2 = r2_/H2_y;
    //Calculate numerical diffusion / subgrid energy loss coefficient
    //D equals 2*A_*dt_*(1/dx_^2 + 1/dy_^2)
    const float D = 2.0f*A_*dt_*(dx_*dx_ + dy_*dy_)/(dx_*dx_*dy_*dy_);
    //Calculate nonlinear effects
    const float N1_a = (V1_0p + V1_00)*(V1_0p + V1_00) / (h1_0p);
    const float N1_b = (V1_00 + V1_0m)*(V1_00 + V1_0m) / (h1_00);
    const float N1_c = (U1_0p + U1_00)*(V1_p0 + V1_00) / (h1_bar_00);
    const float N1_d = (U1_mp + U1_m0)*(V1_00 + V1_m0) / (h1_bar_m0);
    const float N1 = 0.25f*( N1_a - N1_b + (dy_/dx_)*(N1_c - N1_d) );
    const float N2_a = (V2_0p + V2_00)*(V2_0p + V2_00) / (h2_0p);
    const float N2_b = (V2_00 + V2_0m)*(V2_00 + V2_0m) / (h2_00);
    const float N2_c = (U2_0p + U2_00)*(V2_p0 + V2_00) / (h2_bar_00);
    const float N2_d = (U2_mp + U2_m0)*(V2_00 + V2_m0) / (h2_bar_m0);
    const float N2 = 0.25f*( N2_a - N2_b + (dy_/dx_)*(N2_c - N2_d) );
    //Calculate eddy viscosity term
    //The center value uses V^{n-1} (V*_0) with coefficient 1 instead of
    //2*V^{n}; the matching V^{n+1} part is accounted for by the division by
    //(1 + D) in the update below
    const float E1 = (V1_p0 - V1_0 + V1_m0)/(dx_*dx_) + (V1_0p - V1_0 + V1_0m)/(dy_*dy_);
    const float E2 = (V2_p0 - V2_0 + V2_m0)/(dx_*dx_) + (V2_0p - V2_0 + V2_0m)/(dy_*dy_);
    //Calculate the wind shear stress
    const float Y = windStressY(
        wind_stress_type_,
        dx_, dy_, dt_,
        tau0_, rho1_, alpha_, xm_, Rc_,
        x0_, y0_,
        u0_, v0_,
        t_);
    //Compute the V at the next timestep
    //Centered in time: V^{n+1} = V^{n-1} + 2*dt*(...); the wind stress Y is
    //only applied to layer 1
    float V1_2 = (V1_0 + 2.0f*dt_*(-f_*U1_bar + (N1 + P1_y)/dy_ + Y + C1*V2_0 + A_*E1) ) / (1.0f + D);
    float V2_2 = (V2_0 + 2.0f*dt_*(-f_*U2_bar + (N2 + P2_y)/dy_     + C1*V1_0 + A_*E2) ) / (1.0f + 2.0f*dt_*C2 + D);
    //Write to main memory for internal cells
    if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_) {
        V1_0_row[ti] = V1_2;
        V2_0_row[ti] = V2_2;
    }
 }
--- a/SWESimulators/CTCS2Layer_eta_kernel.opencl
+++ b/SWESimulators/CTCS2Layer_eta_kernel.opencl
@ -1,128 +0,0 @@
 /**
 This OpenCL kernel implements part of the Centered in Time, Centered 
 in Space (leapfrog) numerical scheme for the shallow water equations, 
 described in 
 L. P. Røed, "Documentation of simple ocean models for use in ensemble
 predictions", Met no report 2012/3 and 2012/5 .
 Copyright (C) 2016  SINTEF ICT
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 #define block_height 8
 #define block_width 8
 typedef __local float u_shmem[block_height][block_width+1];
 typedef __local float v_shmem[block_height+1][block_width];
 /**
  * Kernel that evolves eta one step in time for both layers.
  *
  * Every work-item handles one cell. The U^n and V^n values of both layers that
  * a work-group needs are first copied to local memory, with indices clamped at
  * the domain edge. eta_1 is then advanced with the summed flux differences of
  * layer 1 and layer 2, eta_2 with the flux differences of layer 2 only.
  *
  * The eta arrays hold eta^n-1 on entry and eta^n+1 on exit.
  * All *_pitch_ arguments are row strides in bytes.
  */
 __kernel void computeEtaKernel(
        //Discretization parameters
        int nx_, int ny_,
        float dx_, float dy_, float dt_,
        //Data for layer 1
        __global float* eta1_0_ptr_, int eta1_0_pitch_, //eta_1^n-1 (also used as output, that is eta_1^n+1)
        __global float* U1_1_ptr_, int U1_1_pitch_, // U^n
        __global float* V1_1_ptr_, int V1_1_pitch_, // V^n
        //Data for layer 2
        __global float* eta2_0_ptr_, int eta2_0_pitch_, //eta_2^n-1 (also used as output, that is eta_2^n+1)
        __global float* U2_1_ptr_, int U2_1_pitch_, // U^n
        __global float* V2_1_ptr_, int V2_1_pitch_ // V^n
        ) {
    //Local-memory tiles, one U/V pair per layer
    u_shmem U1_tile;
    v_shmem V1_tile;
    u_shmem U2_tile;
    v_shmem V2_tile;
    //Position of this work-item inside its work-group
    const int lx = get_local_id(0);
    const int ly = get_local_id(1);
    //First cell of this work-group (+1 skips the global ghost cells)
    const int first_i = get_local_size(0) * get_group_id(0) + 1;
    const int first_j = get_local_size(1) * get_group_id(1) + 1;
    //Cell handled by this work-item
    const int ci = first_i + lx;
    const int cj = first_j + ly;
    //Only cells 1..nx_ x 1..ny_ are read from and written to the eta arrays
    const bool in_domain = (ci > 0 && ci < nx_+1 && cj > 0 && cj < ny_+1);
    //Rows of the eta arrays that hold this cell
    __global float* eta1_0_row = (__global float*) ((__global char*) eta1_0_ptr_ + eta1_0_pitch_*cj);
    __global float* eta2_0_row = (__global float*) ((__global char*) eta2_0_ptr_ + eta2_0_pitch_*cj);
    //eta^n-1 of this cell; zero for work-items outside the domain
    const float eta1_prev = in_domain ? eta1_0_row[ci] : 0.0f;
    const float eta2_prev = in_domain ? eta2_0_row[ci] : 0.0f;
    //Stage U^n of both layers: block_height rows, block_width+1 columns
    for (int row=ly; row<block_height; row+=get_local_size(1)) {
        const int src_row = clamp(first_j + row, 1, ny_); // fake ghost cells
        __global float* const U1_1_row = (__global float*) ((__global char*) U1_1_ptr_ + U1_1_pitch_*src_row);
        __global float* const U2_1_row = (__global float*) ((__global char*) U2_1_ptr_ + U2_1_pitch_*src_row);
        for (int col=lx; col<block_width+1; col+=get_local_size(0)) {
            const int src_col = clamp(first_i + col - 1, 0, nx_); // prevent out of bounds
            U1_tile[row][col] = U1_1_row[src_col];
            U2_tile[row][col] = U2_1_row[src_col];
        }
    }
    //Stage V^n of both layers: block_height+1 rows, block_width columns
    for (int row=ly; row<block_height+1; row+=get_local_size(1)) {
        const int src_row = clamp(first_j + row - 1, 0, ny_); // prevent out of bounds
        __global float* const V1_1_row = (__global float*) ((__global char*) V1_1_ptr_ + V1_1_pitch_*src_row);
        __global float* const V2_1_row = (__global float*) ((__global char*) V2_1_ptr_ + V2_1_pitch_*src_row);
        for (int col=lx; col<block_width; col+=get_local_size(0)) {
            const int src_col = clamp(first_i + col, 1, nx_); // fake ghost cells
            V1_tile[row][col] = V1_1_row[src_col];
            V2_tile[row][col] = V2_1_row[src_col];
        }
    }
    //The tiles must be complete before any work-item reads its neighbours
    barrier(CLK_LOCAL_MEM_FENCE);
    //Values on the west/east and south/north faces of this cell
    const float u1_w = U1_tile[ly][lx];
    const float u1_e = U1_tile[ly][lx+1];
    const float u2_w = U2_tile[ly][lx];
    const float u2_e = U2_tile[ly][lx+1];
    const float v1_s = V1_tile[ly][lx];
    const float v1_n = V1_tile[ly+1][lx];
    const float v2_s = V2_tile[ly][lx];
    const float v2_n = V2_tile[ly+1][lx];
    //Compute eta at the next timestep (the order of the sums is significant in float)
    const float eta1_next = eta1_prev - 2.0f*dt_/dx_ * (u1_e - u1_w + u2_e - u2_w)
                                      - 2.0f*dt_/dy_ * (v1_n - v1_s + v2_n - v2_s);
    const float eta2_next = eta2_prev - 2.0f*dt_/dx_ * (u2_e - u2_w)
                                      - 2.0f*dt_/dy_ * (v2_n - v2_s);
    //Write to main memory
    if (in_domain) {
        eta1_0_row[ci] = eta1_next;
        eta2_0_row[ci] = eta2_next;
    }
 }
--- a/SWESimulators/CTCS_U_kernel.opencl
+++ b/SWESimulators/CTCS_U_kernel.opencl
@ -1,218 +0,0 @@
 /**
 This OpenCL kernel implements part of the Centered in Time, Centered 
 in Space (leapfrog) numerical scheme for the shallow water equations, 
 described in 
 L. P. Røed, "Documentation of simple ocean models for use in ensemble
 predictions", Met no report 2012/3 and 2012/5 .
 Copyright (C) 2016  SINTEF ICT
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include "common.opencl"
 /**
  * Kernel that evolves U one step in time.
  *
  * Every work-item updates one U value: U^n-1 is read from U0, combined with the
  * H, eta^n, U^n and V^n values around it, and U^n+1 is written back to U0 for
  * 0 < ti < nx_ and 0 < tj < ny_+1.
  * The values a work-group needs are first copied to local memory; the indices
  * used for those copies are clamped to the ranges given in the load loops.
  *
  * All *_pitch_ arguments are row strides in bytes.
  * block_width, block_height and windStressX are not defined in this kernel and
  * must be provided by the surrounding program (presumably via common.opencl).
  */
 __kernel void computeUKernel(
        //Discretization parameters
        int nx_, int ny_,
        float dx_, float dy_, float dt_,
        //Physical parameters
        float g_, //< Gravitational constant
        float f_, //< Coriolis coefficient
        float r_, //< Bottom friction coefficient
        //Numerical diffusion
        float A_,
        //Data
        __global float* H_ptr_, int H_pitch_,
        __global float* eta1_ptr_, int eta1_pitch_, // eta^n
        __global float* U0_ptr_, int U0_pitch_, // U^n-1, also output, U^n+1
        __global float* U1_ptr_, int U1_pitch_, // U^n
        __global float* V1_ptr_, int V1_pitch_, // V^n
        // Wind stress parameters
        int wind_stress_type_, 
        float tau0_, float rho_, float alpha_, float xm_, float Rc_,
        float x0_, float y0_,
        float u0_, float v0_,
        float t_) {
    __local float H_shared[block_height+2][block_width+1];
    __local float eta1_shared[block_height+2][block_width+1];
    __local float U1_shared[block_height+2][block_width+2];
    __local float V1_shared[block_height+1][block_width+1];
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    //Start of block within domain
    const int bx = get_local_size(0) * get_group_id(0) + 1; //Skip global ghost cells
    const int by = get_local_size(1) * get_group_id(1) + 1; //Skip global ghost cells
    //Index of cell within domain
    const int ti = bx + tx;
    const int tj = by + ty;
    //Compute pointer to current row in the U array
    __global float* const U0_row = (__global float*) ((__global char*) U0_ptr_ + U0_pitch_*tj);
    //Read current U (U^n-1); stays zero for threads outside the written range
    float U0 = 0.0f;
    if (ti > 0 && ti < nx_ && tj > 0 && tj < ny_+1) {
        U0 = U0_row[ti];
    }
    //Read H and eta into shared memory: (block_width+1)*(block_height+2) cells
    for (int j=ty; j<block_height+2; j+=get_local_size(1)) {
        // "fake" global ghost cells by clamping
        const int l = clamp(by + j - 1, 1, ny_);
        //Compute the pointer to current row in the H and eta arrays
        __global float* const H_row = (__global float*) ((__global char*) H_ptr_ + H_pitch_*l);
        __global float* const eta1_row = (__global float*) ((__global char*) eta1_ptr_ + eta1_pitch_*l);
        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
            // "fake" global ghost cells by clamping
            const int k = clamp(bx + i, 1, nx_);
            H_shared[j][i] = H_row[k];
            eta1_shared[j][i] = eta1_row[k];
        }
    }
    //Read U into shared memory: (block_width+2)*(block_height+2) cells
    for (int j=ty; j<block_height+2; j+=get_local_size(1)) {
        // "fake" ghost cells by clamping
        const int l = clamp(by + j - 1, 1, ny_);
        //Compute the pointer to current row in the U array
        __global float* const U1_row = (__global float*) ((__global char*) U1_ptr_ + U1_pitch_*l);
        for (int i=tx; i<block_width+2; i+=get_local_size(0)) {
            // Prevent out-of-bounds
            const int k = clamp(bx + i - 1, 0, nx_);
            U1_shared[j][i] = U1_row[k];
        }
    }
    //Read V into shared memory: (block_width+1)*(block_height+1) cells
    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
        // Prevent out-of-bounds
        const int l = clamp(by + j - 1, 0, ny_);
        //Compute the pointer to current row in the V array
        __global float* const V1_row = (__global float*) ((__global char*) V1_ptr_ + V1_pitch_*l);
        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
            // "fake" ghost cells by clamping
            const int k = clamp(bx + i, 1, nx_);
            V1_shared[j][i] = V1_row[k];
        }
    }
    //Make sure all threads have read into shared mem
    barrier(CLK_LOCAL_MEM_FENCE);
    /**
      * Now get all our required variables as short-hands
      * here we use the notation of
      *  Var_00 as var_i,j
      *  Var_p0 as var_i+1,j
      *  Var_0m as var_i,j-1
      * etc
      */
    const float U_00 = U1_shared[ty+1][tx+1]; //U at "center"
    const float U_0p = U1_shared[ty+2][tx+1]; //U at "north"
    const float U_0m = U1_shared[ty  ][tx+1]; //U at "south"
    const float U_p0 = U1_shared[ty+1][tx+2]; //U at "east"
    const float U_m0 = U1_shared[ty+1][tx  ]; //U at "west"
    const float V_00 = V1_shared[ty+1][tx  ];
    const float V_p0 = V1_shared[ty+1][tx+1];
    const float V_0m = V1_shared[ty  ][tx  ];
    const float V_pm = V1_shared[ty  ][tx+1];
    const float H_0m = H_shared[ty  ][tx  ];
    const float H_00 = H_shared[ty+1][tx  ];
    const float H_0p = H_shared[ty+2][tx  ];
    const float H_pm = H_shared[ty  ][tx+1];
    const float H_p0 = H_shared[ty+1][tx+1];
    const float H_pp = H_shared[ty+2][tx+1];
    const float eta_0m = eta1_shared[ty  ][tx  ];
    const float eta_00 = eta1_shared[ty+1][tx  ];
    const float eta_0p = eta1_shared[ty+2][tx  ];
    const float eta_pm = eta1_shared[ty  ][tx+1];
    const float eta_p0 = eta1_shared[ty+1][tx+1];
    const float eta_pp = eta1_shared[ty+2][tx+1];
    //Reconstruct H_bar (four-cell averages) and H_x (average of the two cells sharing this U)
    const float H_bar_0m = 0.25f*(H_0m + H_pm + H_00 + H_p0);
    const float H_bar_00 = 0.25f*(H_00 + H_p0 + H_0p + H_pp);
    const float H_x = 0.5f*(H_00 + H_p0);
    //Reconstruct eta_bar with the same four-cell averages as H_bar
    const float eta_bar_0m = 0.25f*(eta_0m + eta_pm + eta_00 + eta_p0);
    const float eta_bar_00 = 0.25f*(eta_00 + eta_p0 + eta_0p + eta_pp);
    //Reconstruct V at the U position
    const float V_bar = 0.25f*(V_0m + V_00 + V_pm + V_p0);
    //Calculate the implicit coefficient (friction + numerical diffusion).
    //All literals are single precision so the sum is not promoted to double.
    const float C = 1.0f + 2.0f*r_*dt_/H_x + 2.0f*A_*dt_*(dx_*dx_ + dy_*dy_)/(dx_*dx_*dy_*dy_);
    //Calculate the pressure/gravitational effect
    const float h_p0 = H_p0 + eta_p0;
    const float h_00 = H_00 + eta_00;
    const float h_x = 0.5f*(h_00 + h_p0); //Could possibly use h for pressure terms instead of H
    const float P_x_hat = -0.5f*g_*(eta_p0*eta_p0 - eta_00*eta_00);
    const float P_x = -g_*h_x*(eta_p0 - eta_00) + P_x_hat;
    //Calculate nonlinear effects
    const float N_a = (U_p0 + U_00)*(U_p0 + U_00) / h_p0;
    const float N_b = (U_00 + U_m0)*(U_00 + U_m0) / h_00;
    const float N_c = (U_0p + U_00)*(V_p0 + V_00) / (H_bar_00 + eta_bar_00);
    const float N_d = (U_00 + U_0m)*(V_pm + V_0m) / (H_bar_0m + eta_bar_0m);
    const float N = 0.25f*( N_a - N_b + (dx_/dy_)*(N_c - N_d) );
    //Calculate eddy viscosity term (uses U^n-1 at the center, U^n at the neighbours)
    const float E = (U_p0 - U0 + U_m0)/(dx_*dx_) + (U_0p - U0 + U_0m)/(dy_*dy_);
    //Calculate the wind shear stress
    const float X = windStressX(
        wind_stress_type_, 
        dx_, dy_, dt_,
        tau0_, rho_, alpha_, xm_, Rc_,
        x0_, y0_,
        u0_, v0_,
        t_);
    //Compute the U at the next timestep
    const float U2 = (U0 + 2.0f*dt_*(f_*V_bar + (N + P_x)/dx_ + X + A_*E) ) / C;
    //Write to main memory for internal cells
    if (ti > 0 && ti < nx_ && tj > 0 && tj < ny_+1) {
        U0_row[ti] = U2;
    }
 }
--- a/SWESimulators/CTCS_V_kernel.opencl
+++ b/SWESimulators/CTCS_V_kernel.opencl
@ -1,222 +0,0 @@
 /**
 This OpenCL kernel implements part of the Centered in Time, Centered 
 in Space (leapfrog) numerical scheme for the shallow water equations, 
 described in 
 L. P. Røed, "Documentation of simple ocean models for use in ensemble
 predictions", Met no report 2012/3 and 2012/5.
 Copyright (C) 2016  SINTEF ICT
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include "common.opencl"
 /**
  * Kernel that evolves V one step in time.
  *
  * Every work-item updates one V value: V^n-1 is read from V0, combined with the
  * H, eta^n, U^n and V^n values around it, and V^n+1 is written back to V0 for
  * 0 < ti < nx_+1 and 0 < tj < ny_.
  * The values a work-group needs are first copied to local memory; the indices
  * used for those copies are clamped to the ranges given in the load loops.
  *
  * All *_pitch_ arguments are row strides in bytes.
  * block_width, block_height and windStressY are not defined in this kernel and
  * must be provided by the surrounding program (presumably via common.opencl).
  */
 __kernel void computeVKernel(
        //Discretization parameters
        int nx_, int ny_,
        float dx_, float dy_, float dt_,
        //Physical parameters
        float g_, //< Gravitational constant
        float f_, //< Coriolis coefficient
        float r_, //< Bottom friction coefficient
        //Numerical diffusion
        float A_,
        //Data
        __global float* H_ptr_, int H_pitch_,
        __global float* eta1_ptr_, int eta1_pitch_, // eta^n
        __global float* U1_ptr_, int U1_pitch_, // U^n
        __global float* V0_ptr_, int V0_pitch_, // V^n-1, also output V^n+1
        __global float* V1_ptr_, int V1_pitch_, // V^n
        // Wind stress parameters
        int wind_stress_type_, 
        float tau0_, float rho_, float alpha_, float xm_, float Rc_,
        float x0_, float y0_,
        float u0_, float v0_,
        float t_) {
    __local float H_shared[block_height+1][block_width+2];
    __local float eta1_shared[block_height+1][block_width+2];
    __local float U1_shared[block_height+1][block_width+1];
    __local float V1_shared[block_height+2][block_width+2];
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    //Start of block within domain
    const int bx = get_local_size(0) * get_group_id(0) + 1; //Skip global ghost cells
    const int by = get_local_size(1) * get_group_id(1) + 1; //Skip global ghost cells
    //Index of cell within domain
    const int ti = bx + tx;
    const int tj = by + ty;
    //Compute pointer to current row in the V array
    __global float* const V0_row = (__global float*) ((__global char*) V0_ptr_ + V0_pitch_*tj);
    //Read current V (V^n-1); stays zero for threads outside the written range
    float V0 = 0.0f;
    if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_) {
        V0 = V0_row[ti];
    }
    //Read H and eta into shared memory: (block_width+2)*(block_height+1) cells
    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
        // "fake" global ghost cells by clamping
        const int l = clamp(by + j, 1, ny_);
        //Compute the pointer to current row in the H and eta arrays
        __global float* const H_row = (__global float*) ((__global char*) H_ptr_ + H_pitch_*l);
        __global float* const eta1_row = (__global float*) ((__global char*) eta1_ptr_ + eta1_pitch_*l);
        for (int i=tx; i<block_width+2; i+=get_local_size(0)) {
            // "fake" global ghost cells by clamping
            const int k = clamp(bx + i - 1, 1, nx_);
            H_shared[j][i] = H_row[k];
            eta1_shared[j][i] = eta1_row[k];
        }
    }
    //Read U into shared memory: (block_width+1)*(block_height+1) cells
    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
        // "fake" ghost cells by clamping
        const int l = clamp(by + j, 1, ny_);
        //Compute the pointer to current row in the U array
        __global float* const U1_row = (__global float*) ((__global char*) U1_ptr_ + U1_pitch_*l);
        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
            // Prevent out-of-bounds
            const int k = clamp(bx + i - 1, 0, nx_);
            U1_shared[j][i] = U1_row[k];
        }
    }
    //Read V into shared memory: (block_width+2)*(block_height+2) cells
    for (int j=ty; j<block_height+2; j+=get_local_size(1)) {
        // Prevent out-of-bounds
        const int l = clamp(by + j - 1, 0, ny_);
        //Compute the pointer to current row in the V array
        __global float* const V1_row = (__global float*) ((__global char*) V1_ptr_ + V1_pitch_*l);
        for (int i=tx; i<block_width+2; i+=get_local_size(0)) {
            // "fake" ghost cells by clamping
            const int k = clamp(bx + i - 1, 1, nx_);
            V1_shared[j][i] = V1_row[k];
        }
    }
    //Make sure all threads have read into shared mem
    barrier(CLK_LOCAL_MEM_FENCE);
    /**
      * Now get all our required variables as short-hands
      * here we use the notation of
      *  Var_00 as var_i,j
      *  Var_p0 as var_i+1,j
      *  Var_0m as var_i,j-1
      * etc
      */
    const float V_00 = V1_shared[ty+1][tx+1]; //V at "center"
    const float V_0p = V1_shared[ty+2][tx+1]; //V at "north"
    const float V_0m = V1_shared[ty  ][tx+1]; //V at "south"
    const float V_p0 = V1_shared[ty+1][tx+2]; //V at "east"
    const float V_m0 = V1_shared[ty+1][tx  ]; //V at "west"
    const float U_00 = U1_shared[ty  ][tx+1];
    const float U_0p = U1_shared[ty+1][tx+1];
    const float U_m0 = U1_shared[ty  ][tx  ];
    const float U_mp = U1_shared[ty+1][tx  ];
    const float H_m0 = H_shared[ty  ][tx  ];
    const float H_00 = H_shared[ty  ][tx+1];
    const float H_p0 = H_shared[ty  ][tx+2];
    const float H_mp = H_shared[ty+1][tx  ];
    const float H_0p = H_shared[ty+1][tx+1];
    const float H_pp = H_shared[ty+1][tx+2];
    const float eta_m0 = eta1_shared[ty  ][tx  ];
    const float eta_00 = eta1_shared[ty  ][tx+1];
    const float eta_p0 = eta1_shared[ty  ][tx+2];
    const float eta_mp = eta1_shared[ty+1][tx  ];
    const float eta_0p = eta1_shared[ty+1][tx+1];
    const float eta_pp = eta1_shared[ty+1][tx+2];
    //Reconstruct H_bar (four-cell averages) and H_y (average of the two cells sharing this V)
    const float H_bar_m0 = 0.25f*(H_m0 + H_mp + H_00 + H_0p);
    const float H_bar_00 = 0.25f*(H_00 + H_0p + H_p0 + H_pp);
    const float H_y = 0.5f*(H_00 + H_0p);
    //Reconstruct eta_bar with the same four-cell averages as H_bar
    const float eta_bar_m0 = 0.25f*(eta_m0 + eta_mp + eta_00 + eta_0p);
    const float eta_bar_00 = 0.25f*(eta_00 + eta_0p + eta_p0 + eta_pp);
    //Reconstruct U at the V position
    const float U_bar = 0.25f*(U_m0 + U_00 + U_mp + U_0p);
    //Calculate the implicit coefficient (friction + numerical diffusion).
    //All literals are single precision so the sum is not promoted to double.
    const float C = 1.0f + 2.0f*r_*dt_/H_y + 2.0f*A_*dt_*(dx_*dx_ + dy_*dy_)/(dx_*dx_*dy_*dy_);
    //Calculate the pressure/gravitational effect
    const float h_0p = H_0p + eta_0p;
    const float h_00 = H_00 + eta_00;
    const float h_y = 0.5f*(h_00 + h_0p); //Could possibly use h for pressure terms instead of H
    const float P_y_hat = -0.5f*g_*(eta_0p*eta_0p - eta_00*eta_00);
    const float P_y = -g_*h_y*(eta_0p - eta_00) + P_y_hat;
    //Calculate nonlinear effects
    const float N_a = (V_0p + V_00)*(V_0p + V_00) / h_0p;
    const float N_b = (V_00 + V_0m)*(V_00 + V_0m) / h_00;
    const float N_c = (U_0p + U_00)*(V_p0 + V_00) / (H_bar_00 + eta_bar_00);
    const float N_d = (U_mp + U_m0)*(V_00 + V_m0) / (H_bar_m0 + eta_bar_m0);
    const float N = 0.25f*( N_a - N_b + (dy_/dx_)*(N_c - N_d) );
    //Calculate eddy viscosity term (uses V^n-1 at the center, V^n at the neighbours)
    const float E = (V_p0 - V0 + V_m0)/(dx_*dx_) + (V_0p - V0 + V_0m)/(dy_*dy_);
    //Calculate the wind shear stress
    const float Y = windStressY(
        wind_stress_type_, 
        dx_, dy_, dt_,
        tau0_, rho_, alpha_, xm_, Rc_,
        x0_, y0_,
        u0_, v0_,
        t_);
    //Compute the V at the next timestep
    const float V2 = (V0 + 2.0f*dt_*(-f_*U_bar + (N + P_y)/dy_ + Y + A_*E) ) / C;
    //Write to main memory for internal cells
    if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_) {
        V0_row[ti] = V2;
    }
 }
--- a/SWESimulators/CTCS_eta_kernel.opencl
+++ b/SWESimulators/CTCS_eta_kernel.opencl
@ -1,109 +0,0 @@
 /**
 This OpenCL kernel implements part of the Centered in Time, Centered 
 in Space (leapfrog) numerical scheme for the shallow water equations, 
 described in 
 L. P. Røed, "Documentation of simple ocean models for use in ensemble
 predictions", Met no report 2012/3 and 2012/5 .
 Copyright (C) 2016  SINTEF ICT
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 /**
  * Kernel that evolves eta one step in time.
  *
  * Every work-item handles one cell. The U^n and V^n values a work-group needs
  * are first copied to local memory, with indices clamped at the domain edge;
  * eta^n-1 is then advanced with the differences of U across the cell in x and
  * of V across the cell in y, and the result overwrites eta0.
  *
  * All *_pitch_ arguments are row strides in bytes.
  * g_, f_ and r_ are accepted but not used by this kernel.
  * block_width and block_height are not defined here and must be provided by
  * the surrounding program.
  */
 __kernel void computeEtaKernel(
        //Discretization parameters
        int nx_, int ny_,
        float dx_, float dy_, float dt_,
        //Physical parameters
        float g_, //< Gravitational constant
        float f_, //< Coriolis coefficient
        float r_, //< Bottom friction coefficient
        //Data
        __global float* eta0_ptr_, int eta0_pitch_, //eta^n-1 (also used as output, that is eta^n+1)
        __global float* U1_ptr_, int U1_pitch_, // U^n
        __global float* V1_ptr_, int V1_pitch_ // V^n
        ) {
    //Local-memory tiles: U has one extra column, V has one extra row
    __local float U_tile[block_height][block_width+1];
    __local float V_tile[block_height+1][block_width];
    //Position of this work-item inside its work-group
    const int lx = get_local_id(0);
    const int ly = get_local_id(1);
    //First cell of this work-group (+1 skips the global ghost cells)
    const int first_i = get_local_size(0) * get_group_id(0) + 1;
    const int first_j = get_local_size(1) * get_group_id(1) + 1;
    //Cell handled by this work-item
    const int ci = first_i + lx;
    const int cj = first_j + ly;
    //Only cells 1..nx_ x 1..ny_ are read from and written to the eta array
    const bool in_domain = (ci > 0 && ci < nx_+1 && cj > 0 && cj < ny_+1);
    //Row of the eta array that holds this cell
    __global float* eta0_row = (__global float*) ((__global char*) eta0_ptr_ + eta0_pitch_*cj);
    //eta^n-1 of this cell; zero for work-items outside the domain
    const float eta_prev = in_domain ? eta0_row[ci] : 0.0f;
    //Stage U^n: block_height rows, block_width+1 columns
    for (int row=ly; row<block_height; row+=get_local_size(1)) {
        const int src_row = clamp(first_j + row, 1, ny_); // fake ghost cells
        __global float* const U1_row = (__global float*) ((__global char*) U1_ptr_ + U1_pitch_*src_row);
        for (int col=lx; col<block_width+1; col+=get_local_size(0)) {
            const int src_col = clamp(first_i + col - 1, 0, nx_); // prevent out of bounds
            U_tile[row][col] = U1_row[src_col];
        }
    }
    //Stage V^n: block_height+1 rows, block_width columns
    for (int row=ly; row<block_height+1; row+=get_local_size(1)) {
        const int src_row = clamp(first_j + row - 1, 0, ny_); // prevent out of bounds
        __global float* const V1_row = (__global float*) ((__global char*) V1_ptr_ + V1_pitch_*src_row);
        for (int col=lx; col<block_width; col+=get_local_size(0)) {
            const int src_col = clamp(first_i + col, 1, nx_); // fake ghost cells
            V_tile[row][col] = V1_row[src_col];
        }
    }
    //The tiles must be complete before any work-item reads its neighbours
    barrier(CLK_LOCAL_MEM_FENCE);
    //Differences across this cell in x (U) and in y (V)
    const float dU = U_tile[ly][lx+1] - U_tile[ly][lx];
    const float dV = V_tile[ly+1][lx] - V_tile[ly][lx];
    //Compute eta at the next timestep
    const float eta_next = eta_prev - 2.0f*dt_/dx_ * dU
                                    - 2.0f*dt_/dy_ * dV;
    //Write to main memory
    if (in_domain) {
        eta0_row[ci] = eta_next;
    }
 }
--- a/SWESimulators/Common.py
+++ b/SWESimulators/Common.py
@ -1,79 +1,159 @@
 import pyopencl
 import os
 import numpy as np
 import time
 import re
 import pycuda.compiler as cuda_compiler
 import pycuda.gpuarray
 import pycuda.driver as cuda
 """
-Static function which reads a text file and creates an OpenCL kernel from that
+Class which keeps track of the CUDA context and some helper functions
 """
-def get_kernel(cl_ctx, kernel_filename, block_width, block_height):
+class CudaContext(object):
-    import datetime
+    def __init__(self, verbose=True, blocking=False):
        self.verbose = verbose
        self.blocking = blocking
        self.kernels = {}
        cuda.init(flags=0)
        if (self.verbose):
            print("CUDA version " + str(cuda.get_version()))
            print("Driver version " + str(cuda.get_driver_version()))
        self.cuda_device = cuda.Device(0)
        if (self.verbose):
            print("Using " + self.cuda_device.name())
            print(" => compute capability: " + str(self.cuda_device.compute_capability()))
            print(" => memory: " + str(self.cuda_device.total_memory() / (1024*1024)) + " MB")
        if (self.blocking):
            self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_BLOCKING_SYNC)
            if (self.verbose):
                print("=== WARNING ===")
                print("Using blocking context")
                print("=== WARNING ===")
        else:
            self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_AUTO)
        if (self.verbose):
            print("Created context <" + str(self.cuda_context.handle) + ">")
    def __del__(self, *args):
        """
        Removes this object's CUDA context from the current context stack and detaches it.

        Every context on the stack is popped. Contexts that do not belong to this
        object are remembered and pushed back afterwards in their original order.
        Progress messages are printed only when self.verbose is set; the popping
        itself happens regardless of verbosity.
        """
        own_handle = self.cuda_context.handle
        if self.verbose:
            print("Cleaning up CUDA context <" + str(own_handle) + ">")

        # Loop over all contexts in stack, and remove "this"
        other_contexts = []
        while (cuda.Context.get_current() is not None):
            context = cuda.Context.get_current()
            if (context.handle != own_handle):
                if (self.verbose):
                    print(" `-> <" + str(own_handle) + "> Popping context <" + str(context.handle) + "> which we do not own")
                # Prepend, so that pushing in list order restores the original stacking
                other_contexts = [context] + other_contexts
            else:
                if (self.verbose):
                    print(" `-> <" + str(own_handle) + "> Popping context <" + str(context.handle) + "> (ourselves)")
            # Pop unconditionally: tying the pop to verbose output would leave the
            # stack untouched in quiet mode and this loop would never terminate
            cuda.Context.pop()

        # Add all the contexts we popped that were not our own
        for context in other_contexts:
            if (self.verbose):
                print(" `-> <" + str(own_handle) + "> Pushing <" + str(context.handle) + ">")
            cuda.Context.push(context)

        if (self.verbose):
            print(" `-> <" + str(own_handle) + "> Detaching context")
        self.cuda_context.detach()
    """
    Reads a text file and compiles a CUDA kernel module from that
    """
    def get_kernel(self, kernel_filename, block_width, block_height):
        # Generate a kernel ID for our cache
        module_path = os.path.dirname(os.path.realpath(__file__))
        kernel_hash = ""
        # Loop over file and includes, and check if something has changed
        files = [kernel_filename]
        while len(files):
            filename = os.path.join(module_path, files.pop())
            modified = os.path.getmtime(filename)
            with open(filename, "r") as file:
                file_str = file.read()
                file_hash = filename + "_" + str(hash(file_str)) + ":" + str(modified) + "--"
                includes = re.findall('^\W*#include\W+(.+?)\W*$', file_str, re.M)
                files = files + includes #WARNING FIXME This will not work with circular includes
            kernel_hash = kernel_hash + file_hash
        # Recompile kernel if file or includes have changed
        if (kernel_hash not in self.kernels.keys()):
            #Create define string
            define_string = "#define block_width " + str(block_width) + "\n"
            define_string += "#define block_height " + str(block_height) + "\n\n"
    define_string += "#ifndef my_variable_to_force_recompilation\n"
    define_string += "#define my_variable_to_force_recompilation " + datetime.datetime.now().strftime("%Y_%m_%d-%H_%M_%S") + "\n"
    define_string += "#undef my_variable_to_force_recompilation \n"
    define_string += "#endif\n\n"
            kernel_string = define_string + '#include "' + os.path.join(module_path, kernel_filename) + '"'
            self.kernels[kernel_hash] = cuda_compiler.SourceModule(kernel_string, include_dirs=[module_path])
-    def shellquote(s):
+        return self.kernels[kernel_hash]
        assert(cl_ctx.num_devices == 1)
        platform_name = cl_ctx.devices[0].get_info(pyopencl.device_info.PLATFORM).name
        platform_name = platform_name.upper()
        if ('INTEL' in platform_name):
            #Intel CL compiler doesn't like spaces in include paths. We have to escape them
            return '"' + s.replace(" ", "\\ ") + '"'
        elif ('NVIDIA' in platform_name):
            #NVIDIA doesn't like double quoted paths...
            #return "'" + s + "'"
            return s
    module_path = os.path.dirname(os.path.realpath(__file__))
    module_path_escaped = shellquote(module_path)
    options = ['-I', module_path_escaped]
    #Read the proper program
    fullpath = os.path.join(module_path, kernel_filename)
    with open(fullpath, "r") as kernel_file:
        kernel_string = define_string + kernel_file.read()
        kernel = pyopencl.Program(cl_ctx, kernel_string).build(options)
    return kernel
    """
    Clears the kernel cache (useful for debugging & development)
    """
    def clear_kernel_cache(self):
        """Forgets every cached kernel module by replacing the cache with an empty dict."""
        self.kernels = dict()
 class Timer(object):
    """
    Context manager that measures the wall-clock time spent inside a with-block.

    After the block has finished, the instance exposes start and end (values of
    time.time()), secs (their difference) and msecs (secs * 1000). When verbose
    is true the elapsed milliseconds are printed together with the tag.
    """
    def __init__(self, tag, verbose=True):
        self.tag, self.verbose = tag, verbose

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, *args):
        self.end = time.time()
        elapsed = self.end - self.start
        self.secs, self.msecs = elapsed, elapsed * 1000 # seconds, milliseconds
        if not self.verbose:
            return
        print("=> " + self.tag + ' %f ms' % self.msecs)
 """
 Class that holds data 
 """
-class OpenCLArray2D:
+class CUDAArray2D:
    """
    Uploads initial data to the CUDA device
    """
-    def __init__(self, cl_ctx, nx, ny, halo_x, halo_y, data):
+    def __init__(self, stream, nx, ny, halo_x, halo_y, data):
        host_data = self.convert_to_float32(data)
        self.nx = nx
        self.ny = ny
        self.nx_halo = nx + 2*halo_x
        self.ny_halo = ny + 2*halo_y
        assert(host_data.shape[1] == self.nx_halo)
        assert(host_data.shape[0] == self.ny_halo)
-        assert(data.shape == (self.ny_halo, self.nx_halo))
+        #Make sure data is in proper format
        assert np.issubdtype(data.dtype, np.float32), "Wrong datatype: %s" % str(data.dtype)
        assert not np.isfortran(data), "Wrong datatype (Fortran, expected C)"
        assert data.shape == (self.ny_halo, self.nx_halo), "Wrong data shape: %s" % str(data.shape)
        #Upload data to the device
-        mf = pyopencl.mem_flags
+        self.data = pycuda.gpuarray.to_gpu_async(data, stream=stream)
        self.data = pyopencl.Buffer(cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=host_data)
-        self.bytes_per_float = host_data.itemsize
+        self.bytes_per_float = data.itemsize
        assert(self.bytes_per_float == 4)
        self.pitch = np.int32((self.nx_halo)*self.bytes_per_float)
@ -81,27 +161,15 @@ class OpenCLArray2D:
    """
    Enables downloading data from CL device to Python
    """
-    def download(self, cl_queue):
+    def download(self, stream, async=False):
        #Allocate data on the host for result
        host_data = np.empty((self.ny_halo, self.nx_halo), dtype=np.float32, order='C')
        #Copy data from device to host
-        pyopencl.enqueue_copy(cl_queue, host_data, self.data)
+        if (async):
-        
+            host_data = self.data.get_async(stream=stream)
-        #Return
+            return host_data
        else:
            host_data = self.data.get(stream=stream)#, pagelocked=True) # pagelocked causes crash on windows at least
            return host_data
    """
    Converts to C-style float 32 array suitable for the GPU/OpenCL
    """
    @staticmethod
    def convert_to_float32(data):
        if (not np.issubdtype(data.dtype, np.float32) or np.isfortran(data)):
            #print("Converting H0")
            return data.astype(np.float32, order='C')
        else:
            return data
@ -111,20 +179,20 @@ class OpenCLArray2D:
 """
-A class representing an Akrawa A type (unstaggered, logically Cartesian) grid
+A class representing an Arakawa A type (unstaggered, logically Cartesian) grid
 """
-class SWEDataArkawaA:
+class SWEDataArakawaA:
    """
    Uploads initial data to the CL device
    """
-    def __init__(self, cl_ctx, nx, ny, halo_x, halo_y, h0, hu0, hv0):
+    def __init__(self, stream, nx, ny, halo_x, halo_y, h0, hu0, hv0):
-        self.h0  = OpenCLArray2D(cl_ctx, nx, ny, halo_x, halo_y, h0)
+        self.h0  = CUDAArray2D(stream, nx, ny, halo_x, halo_y, h0)
-        self.hu0 = OpenCLArray2D(cl_ctx, nx, ny, halo_x, halo_y, hu0)
+        self.hu0 = CUDAArray2D(stream, nx, ny, halo_x, halo_y, hu0)
-        self.hv0 = OpenCLArray2D(cl_ctx, nx, ny, halo_x, halo_y, hv0)
+        self.hv0 = CUDAArray2D(stream, nx, ny, halo_x, halo_y, hv0)
-        self.h1  = OpenCLArray2D(cl_ctx, nx, ny, halo_x, halo_y, h0)
+        self.h1  = CUDAArray2D(stream, nx, ny, halo_x, halo_y, h0)
-        self.hu1 = OpenCLArray2D(cl_ctx, nx, ny, halo_x, halo_y, hu0)
+        self.hu1 = CUDAArray2D(stream, nx, ny, halo_x, halo_y, hu0)
-        self.hv1 = OpenCLArray2D(cl_ctx, nx, ny, halo_x, halo_y, hv0)
+        self.hv1 = CUDAArray2D(stream, nx, ny, halo_x, halo_y, hv0)
    """
    Swaps the variables after a timestep has been completed
@ -137,153 +205,11 @@ class SWEDataArkawaA:
    """
    Enables downloading data from CL device to Python
    """
-    def download(self, cl_queue):
+    def download(self, stream):
-        h_cpu  = self.h0.download(cl_queue)
+        h_cpu  = self.h0.download(stream, async=True)
-        hu_cpu = self.hu0.download(cl_queue)
+        hu_cpu = self.hu0.download(stream, async=True)
-        hv_cpu = self.hv0.download(cl_queue)
+        hv_cpu = self.hv0.download(stream, async=False)
        return h_cpu, hu_cpu, hv_cpu
 """
 A class representing an Arakawa A type (unstaggered, logically Cartesian) grid
 """
 class SWEDataArkawaA:
    """
    Holds two sets (suffix 0 and suffix 1) of the h, hu and hv arrays,
    each wrapped in an OpenCLArray2D.
    """
    def __init__(self, cl_ctx, nx, ny, halo_x, halo_y, h0, hu0, hv0):
        """
        Uploads initial data to the CL device

        Both sets are created from the same initial data, and all six
        arrays share the same size and halo.
        """
        initial_data = ( \
            ("h0", h0), ("hu0", hu0), ("hv0", hv0), \
            ("h1", h0), ("hu1", hu0), ("hv1", hv0))
        for attribute, host_array in initial_data:
            setattr(self, attribute, OpenCLArray2D(cl_ctx, nx, ny, halo_x, halo_y, host_array))
    def swap(self):
        """
        Swaps the variables after a timestep has been completed
        """
        self.h0,  self.h1  = self.h1,  self.h0
        self.hu0, self.hu1 = self.hu1, self.hu0
        self.hv0, self.hv1 = self.hv1, self.hv0
    def download(self, cl_queue):
        """
        Enables downloading data from CL device to Python

        Returns the tuple (h, hu, hv) downloaded from set 0.
        """
        return tuple(array.download(cl_queue) for array in (self.h0, self.hu0, self.hv0))
 """
 A class representing an Arakawa C type (staggered, u fluxes on east/west faces, v fluxes on north/south faces) grid
 We use h as cell centers
 """
 class SWEDataArkawaC:
    """
    Holds two sets (suffix 0 and suffix 1) of the h, hu and hv arrays,
    each wrapped in an OpenCLArray2D. hu has one more column than h and
    no halo along x; hv has one more row than h and no halo along y.
    """
    def __init__(self, cl_ctx, nx, ny, halo_x, halo_y, h0, hu0, hv0):
        """
        Uploads initial data to the CL device

        Both sets are created from the same initial data.
        """
        #FIXME: This at least works for 0 and 1 ghost cells, but not convinced it generalizes
        assert halo_x <= 1 and halo_y <= 1
        #(attribute prefix, columns, rows, halo along x, halo along y, initial data)
        layout = ( \
            ("h",  nx,   ny,   halo_x, halo_y, h0), \
            ("hu", nx+1, ny,   0,      halo_y, hu0), \
            ("hv", nx,   ny+1, halo_x, 0,      hv0))
        for suffix in ("0", "1"):
            for prefix, columns, rows, ghost_x, ghost_y, host_array in layout:
                setattr(self, prefix + suffix, \
                        OpenCLArray2D(cl_ctx, columns, rows, ghost_x, ghost_y, host_array))
    def swap(self):
        """
        Swaps the variables after a timestep has been completed
        """
        #h is assumed to be constant (bottom topography really)
        self.h0,  self.h1  = self.h1,  self.h0
        self.hu0, self.hu1 = self.hu1, self.hu0
        self.hv0, self.hv1 = self.hv1, self.hv0
    def download(self, cl_queue):
        """
        Enables downloading data from CL device to Python

        Returns the tuple (h, hu, hv) downloaded from set 0.
        """
        return tuple(array.download(cl_queue) for array in (self.h0, self.hu0, self.hv0))
 """
 Class which represents different wind stresses
 """
 class WindStressParams:
    """
    Plain container for the wind stress parameters. The type is stored as
    numpy int32 and every other parameter as numpy float32.

    type: Type of wind stress, 0=Uniform along shore, 1=bell shaped along shore, 2=moving cyclone
          (the default, 99, means "no wind")
    tau0: Amplitude of wind stress (Pa)
    rho: Density of sea water (1025.0 kg / m^3)
    alpha: Offshore e-folding length (1/(10*dx) = 5e-6 m^-1)
    xm: Maximum wind stress for bell shaped wind stress
    Rc: Distance to max wind stress from center of cyclone (10dx = 200 000 m)
    x0: Initial x position of moving cyclone (dx*(nx/2) - u0*3600.0*48.0)
    y0: Initial y position of moving cyclone (dy*(ny/2) - v0*3600.0*48.0)
    u0: Translation speed along x for moving cyclone (30.0/sqrt(5.0))
    v0: Translation speed along y for moving cyclone (-0.5*u0)
    """
    def __init__(self,
                 type=99,
                 tau0=0, rho=0, alpha=0, xm=0, Rc=0,
                 x0=0, y0=0,
                 u0=0, v0=0):
        #The selector is the only integer parameter
        self.type = np.int32(type)
        #All remaining parameters are single precision floats
        float_parameters = ( \
            ("tau0", tau0), ("rho", rho), ("alpha", alpha), ("xm", xm), ("Rc", Rc), \
            ("x0", x0), ("y0", y0), \
            ("u0", u0), ("v0", v0))
        for attribute, value in float_parameters:
            setattr(self, attribute, np.float32(value))
--- a/SWESimulators/DataOutput.py
+++ b/SWESimulators/DataOutput.py
@ -1,123 +0,0 @@
 # -*- coding: utf-8 -*-
 """
 This python module implements saving shallow water simulations to a
 netcdf file.
 Copyright (C) 2016  SINTEF ICT
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 """
 import numpy as np
 from netCDF4 import Dataset
 class CTCSNetCDFWriter:
    """
    Writes the eta, u and v fields of a simulation to a NetCDF file, one
    record along the unlimited 'time' dimension per call to write().

    eta is stored on the ('y_eta', 'x_eta') axes, u on ('y_u', 'x_u') and
    v on ('y_v', 'x_v'). The writer can be used as a context manager, in
    which case the file is closed when the "with" block exits.
    """
    def __init__(self, outfilename, nx, ny, dx, dy, ignore_ghostcells=True):
        """
        outfilename: Name of the NetCDF file to create (opened with mode 'w')
        nx: Number of cells along x-axis (excluding ghost cells)
        ny: Number of cells along y-axis (excluding ghost cells)
        dx: Grid cell spacing along x-axis
        dy: Grid cell spacing along y-axis
        ignore_ghostcells: If True, the outermost rows and columns of every
                           field are stripped before it is written
        """
        self.ncfile = Dataset(outfilename,'w')
        self.ignore_ghostcells = ignore_ghostcells

        #Coordinates of every spatial axis. The number of coordinates is
        #also the size of the corresponding dimension.
        if (self.ignore_ghostcells):
            axes = [
                ('x_eta', np.linspace(dx/2.0, nx*dx - dx/2.0, nx)),
                ('y_eta', np.linspace(dy/2.0, ny*dy - dy/2.0, ny)),
                #Only the nx-1 interior faces remain, located at dx, 2*dx, ..., (nx-1)*dx
                #(the first face is at dx, not at the literal 1)
                ('x_u', np.linspace(dx, (nx-1)*dx, nx-1)),
                ('y_u', np.linspace(dy/2.0, ny*dy - dy/2.0, ny)),
                ('x_v', np.linspace(dx/2.0, nx*dx - dx/2.0, nx)),
                #Only the ny-1 interior faces remain, located at dy, 2*dy, ..., (ny-1)*dy
                ('y_v', np.linspace(dy, (ny-1)*dy, ny-1)),
            ]
        else:
            axes = [
                ('x_eta', np.linspace(-dx/2.0, nx*dx + dx/2.0, nx+2)),
                ('y_eta', np.linspace(-dy/2.0, ny*dy + dy/2.0, ny+2)),
                ('x_u', np.linspace(0, nx*dx, nx+1)),
                ('y_u', np.linspace(-dy/2.0, ny*dy + dy/2.0, ny+2)),
                ('x_v', np.linspace(-dx/2.0, nx*dx + dx/2.0, nx+2)),
                ('y_v', np.linspace(0, ny*dy, ny+1)),
            ]
        float_type = np.dtype('float32').char

        #Create dimensions
        self.ncfile.createDimension('time', None) #Unlimited time dimension
        for name, coordinates in axes:
            self.ncfile.createDimension(name, len(coordinates))

        #Create axis
        self.nc_time = self.ncfile.createVariable('time', float_type, 'time')
        axis_variables = [(self.ncfile.createVariable(name, float_type, name), coordinates) \
                          for name, coordinates in axes]

        #Set axis values/ticks
        for axis, coordinates in axis_variables:
            axis[:] = coordinates

        #Set units
        self.nc_time.units = 's'
        for axis, _ in axis_variables:
            axis.units = 'm'

        #Create output data variables
        self.nc_eta = self.ncfile.createVariable('eta', float_type, ('time', 'y_eta', 'x_eta'))
        self.nc_u = self.ncfile.createVariable('u', float_type, ('time', 'y_u', 'x_u'))
        self.nc_v = self.ncfile.createVariable('v', float_type, ('time', 'y_v', 'x_v'))

        #Set units
        #NOTE(review): 'm' for u and v looks like a copy of the eta unit - confirm what
        #quantity callers pass in before changing it
        self.nc_eta.units = 'm'
        self.nc_u.units = 'm'
        self.nc_v.units = 'm'
    def __enter__(self):
        return self
    def __exit__(self, exc_type, exc_value, traceback):
        #print("Closing '" + self.ncfile.filepath() + "'")
        self.ncfile.close()
    def write(self, i, t, eta, u, v):
        """
        Stores one record.

        i: Index along the time dimension to write to
        t: Simulation time of the record (s)
        eta, u, v: 2D arrays including ghost cells; the outermost rows and
                   columns are dropped first when ignore_ghostcells is set
        """
        if (self.ignore_ghostcells):
            eta, u, v = eta[1:-1, 1:-1], u[1:-1, 1:-1], v[1:-1, 1:-1]
        self.nc_time[i] = t
        self.nc_eta[i, :] = eta
        self.nc_u[i, :] = u
        self.nc_v[i, :] = v
--- a/SWESimulators/FBL.py
+++ b/SWESimulators/FBL.py
@ -1,186 +0,0 @@
 # -*- coding: utf-8 -*-
 """
 This python module implements the Forward Backward Linear numerical 
 scheme for the shallow water equations, described in 
 L. P. Røed, "Documentation of simple ocean models for use in ensemble
 predictions", Met no report 2012/3 and 2012/5 .
 Copyright (C) 2016  SINTEF ICT
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 """
 #Import packages we need
 import numpy as np
 import pyopencl as cl #OpenCL in Python
 from SWESimulators import Common
 """
 Class that solves the SW equations using the Forward-Backward linear scheme
 """
 class FBL:
    """
    Solver for the shallow water equations using the Forward-Backward
    linear scheme. Every timestep launches three kernels in sequence:
    computeUKernel, computeVKernel and computeEtaKernel.
    """
    def __init__(self, \
                 cl_ctx, \
                 H, eta0, hu0, hv0, \
                 nx, ny, \
                 dx, dy, dt, \
                 g, f, r, \
                 wind_stress=Common.WindStressParams(), \
                 block_width=16, block_height=16):
        """
        Initialization routine
        cl_ctx: OpenCL context used to create the command queue, kernels and buffers
        H: Water depth
        eta0: Initial deviation from mean sea level
        hu0: Initial momentum along x-axis
        hv0: Initial momentum along y-axis
        nx: Number of cells along x-axis
        ny: Number of cells along y-axis
        dx: Grid cell spacing along x-axis (20 000 m)
        dy: Grid cell spacing along y-axis (20 000 m)
        dt: Size of each timestep (90 s)
        g: Gravitational acceleration (9.81 m/s^2)
        f: Coriolis parameter (1.2e-4 s^-1)
        r: Bottom friction coefficient (2.4e-3 m/s)
        wind_stress: Wind stress parameters
        block_width: Work-group width, also passed on when the kernels are built
        block_height: Work-group height, also passed on when the kernels are built

        NOTE(review): the arrays are uploaded with zero ghost cells below, i.e.
        as nx*ny cells for H and eta0, (nx+1)*ny for hu0 and nx*(ny+1) for hv0.
        Earlier documentation mentioned (nx+2)*(ny+2) cells "incl ghost cells" -
        confirm which shapes callers actually pass.
        """
        self.cl_ctx = cl_ctx

        #Create an OpenCL command queue
        self.cl_queue = cl.CommandQueue(self.cl_ctx)

        #Get kernels
        self.u_kernel = Common.get_kernel(self.cl_ctx, "FBL_U_kernel.opencl", block_width, block_height)
        self.v_kernel = Common.get_kernel(self.cl_ctx, "FBL_V_kernel.opencl", block_width, block_height)
        self.eta_kernel = Common.get_kernel(self.cl_ctx, "FBL_eta_kernel.opencl", block_width, block_height)

        #Create data by uploading to device
        ghost_cells_x = 0
        ghost_cells_y = 0
        self.H = Common.OpenCLArray2D(self.cl_ctx, nx, ny, ghost_cells_x, ghost_cells_y, H)
        self.cl_data = Common.SWEDataArkawaC(self.cl_ctx, nx, ny, ghost_cells_x, ghost_cells_y, eta0, hu0, hv0)

        #Save input parameters
        #Notice that we need to specify them in the correct dataformat for the
        #OpenCL kernel
        self.nx = np.int32(nx)
        self.ny = np.int32(ny)
        self.dx = np.float32(dx)
        self.dy = np.float32(dy)
        self.dt = np.float32(dt)
        self.g = np.float32(g)
        self.f = np.float32(f)
        self.r = np.float32(r)
        self.wind_stress = wind_stress

        #Initialize time
        self.t = np.float32(0.0)

        #Compute kernel launch parameters
        #The work-group size must match the block_width/block_height the
        #kernels were built with, so it is taken from the same arguments
        self.local_size = (block_width, block_height)
        #U has nx+1 columns and V has ny+1 rows: cover (nx+1) x (ny+1)
        #work-items so that the closed boundaries at ti == nx and tj == ny
        #are also processed. The kernels guard against excess work-items.
        self.global_size = ( \
                       int(np.ceil((self.nx + 1) / float(self.local_size[0])) * self.local_size[0]), \
                       int(np.ceil((self.ny + 1) / float(self.local_size[1])) * self.local_size[1]) \
                      )
    def __str__(self):
        return "Forward Backward Linear"
    def step(self, t_end=0.0):
        """
        Advances the simulation by t_end seconds, using timesteps of at most
        self.dt (the last one is shortened to end exactly at t_end).

        Returns the simulation time after stepping.
        """
        n = int(t_end / self.dt + 1)
        for i in range(0, n):
            local_dt = np.float32(min(self.dt, t_end-i*self.dt))
            if (local_dt <= 0.0):
                break

            #Arguments shared by all three kernels
            field_args = ( \
                    self.nx, self.ny, \
                    self.dx, self.dy, local_dt, \
                    self.g, self.f, self.r, \
                    self.H.data, self.H.pitch, \
                    self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
                    self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
                    self.cl_data.h0.data, self.cl_data.h0.pitch)
            #Additional arguments of the U and V kernels; both see the time
            #at the start of the step
            wind_args = ( \
                    self.wind_stress.type, \
                    self.wind_stress.tau0, self.wind_stress.rho, self.wind_stress.alpha, self.wind_stress.xm, self.wind_stress.Rc, \
                    self.wind_stress.x0, self.wind_stress.y0, \
                    self.wind_stress.u0, self.wind_stress.v0, \
                    self.t)

            self.u_kernel.computeUKernel(self.cl_queue, self.global_size, self.local_size, \
                    *(field_args + wind_args))
            self.v_kernel.computeVKernel(self.cl_queue, self.global_size, self.local_size, \
                    *(field_args + wind_args))
            self.eta_kernel.computeEtaKernel(self.cl_queue, self.global_size, self.local_size, \
                    *field_args)

            self.t += local_dt
        return self.t
    def download(self):
        """
        Downloads the current state from the device, as returned by
        SWEDataArkawaC.download.
        """
        return self.cl_data.download(self.cl_queue)
--- a/SWESimulators/FBL_U_kernel.opencl
+++ b/SWESimulators/FBL_U_kernel.opencl
@ -1,163 +0,0 @@
 /*
 This OpenCL kernel implements part of the Forward Backward Linear 
 numerical scheme for the shallow water equations, described in 
 L. P. Røed, "Documentation of simple ocean models for use in ensemble
 predictions", Met no report 2012/3 and 2012/5 .
 Copyright (C) 2016  SINTEF ICT
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include "common.opencl"
 /**
  * Kernel that evolves U one step in time.
  *
  * One work-item updates one value of U, in place. U is accessed as
  * (nx_+1) x ny_ values, V as nx_ x (ny_+1) values, and H and eta as
  * nx_ x ny_ values. All pitches are row strides in bytes.
  * block_width and block_height are expected as defines when the kernel is built.
  */
 __kernel void computeUKernel(
        //Discretization parameters
        int nx_, int ny_,
        float dx_, float dy_, float dt_,
        //Physical parameters
        float g_, //< Gravitational constant
        float f_, //< Coriolis coefficient
        float r_, //< Bottom friction coefficient
        //Data
        __global float* H_ptr_, int H_pitch_,
        __global float* U_ptr_, int U_pitch_,
        __global float* V_ptr_, int V_pitch_,
        __global float* eta_ptr_, int eta_pitch_,
        // Wind stress parameters
        int wind_stress_type_, 
        float tau0_, float rho_, float alpha_, float xm_, float Rc_,
        float x0_, float y0_,
        float u0_, float v0_,
        float t_) {
    //Local memory tiles. Column i of a tile holds global column bx + i - 1,
    //i.e., the tiles include one extra column to the west of the block
    __local float H_shared[block_height][block_width+1];
    __local float V_shared[block_height+1][block_width+1];
    __local float eta_shared[block_height][block_width+1];
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    //Index of block within domain
    const int bx = get_local_size(0) * get_group_id(0);
    const int by = get_local_size(1) * get_group_id(1); 
    //Index of cell within domain
    const int ti = get_global_id(0); 
    const int tj = get_global_id(1);
    //Compute pointer to row "tj" in the U array
    __global float* const U_row = (__global float*) ((__global char*) U_ptr_ + U_pitch_*tj);
    //Read current U
    float U_current = 0.0f;
    if (ti < nx_ + 1 && tj < ny_) {
        U_current = U_row[ti];
    }
    //Read H and eta into local memory
    for (int j=ty; j<block_height; j+=get_local_size(1)) {
        const int l = by + j;
        //Compute the pointer to row "l" in the H and eta arrays
        __global float* const H_row = (__global float*) ((__global char*) H_ptr_ + H_pitch_*l);
        __global float* const eta_row = (__global float*) ((__global char*) eta_ptr_ + eta_pitch_*l);
        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
            const int k = bx + i - 1;
            //H and eta only have ny_ rows: row ny_ must not be read
            if (k >= 0 && k < nx_ && l < ny_) {
                H_shared[j][i] = H_row[k];
                eta_shared[j][i] = eta_row[k];
            }
            else {
                H_shared[j][i] = 0.0f;
                eta_shared[j][i] = 0.0f;
            }
        }
    }
    //Read V into local memory (V has ny_+1 rows, hence one more tile row)
    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
        const int l = by + j;
        //Compute the pointer to row "l" in the V array
        __global float* const V_row = (__global float*) ((__global char*) V_ptr_ + V_pitch_*l);
        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
            const int k = bx + i - 1;
            if (k >= 0 && k < nx_ && l < ny_+1) {
                V_shared[j][i] = V_row[k];
            }
            else {
                V_shared[j][i] = 0.0f;
            }
        }
    }
    //Make sure all threads have read into local memory
    barrier(CLK_LOCAL_MEM_FENCE);
    //Reconstruct H at the U position (mean of the cells west and east of the face)
    float H_m = 0.5f*(H_shared[ty][tx] + H_shared[ty][tx+1]);
    //Reconstruct V at the U position
    float V_m = 0.0f;
    if (tj==0) {
        //First row: only the two V values at row tj+1 are used
        V_m = 0.5f*(V_shared[ty+1][tx] + V_shared[ty+1][tx+1]);
    }
    else if (tj==ny_-1) {
        //Last row: only the two V values at row tj are used
        V_m = 0.5f*(V_shared[ty][tx] + V_shared[ty][tx+1]);
    }
    else {
        V_m = 0.25f*(V_shared[ty][tx] + V_shared[ty][tx+1]
                + V_shared[ty+1][tx] + V_shared[ty+1][tx+1]);
    }
    //Calculate the friction coefficient
    float B = H_m/(H_m + r_*dt_);
    //Calculate the gravitational effect
    float P = g_*H_m*(eta_shared[ty][tx] - eta_shared[ty][tx+1])/dx_;
    //Calculate the wind shear stress
    float X = windStressX(
        wind_stress_type_, 
        dx_, dy_, dt_,
        tau0_, rho_, alpha_, xm_, Rc_,
        x0_, y0_,
        u0_, v0_,
        t_);
    //Compute the U at the next timestep
    float U_next = B*(U_current + dt_*(f_*V_m + P + X) );
    //Write to main memory (work-items outside of the U array write nothing)
    if (ti < nx_+1 && tj < ny_) {
        //Closed boundaries
        if (ti == 0 || ti == nx_) {
            U_next = 0.0f;
        }
        U_row[ti] = U_next;
    }
 }
--- a/SWESimulators/FBL_V_kernel.opencl
+++ b/SWESimulators/FBL_V_kernel.opencl
@ -1,168 +0,0 @@
 /*
 This OpenCL kernel implements part of the Forward Backward Linear 
 numerical scheme for the shallow water equations, described in 
 L. P. Røed, "Documentation of simple ocean models for use in ensemble
 predictions", Met no report 2012/3 and 2012/5 .
 Copyright (C) 2016  SINTEF ICT
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include "common.opencl"
 /**
  * Kernel that evolves V one step in time.
  *
  * One work-item updates one value of V, in place. V is accessed as
  * nx_ x (ny_+1) values, U as (nx_+1) x ny_ values, and H and eta as
  * nx_ x ny_ values. All pitches are row strides in bytes.
  * block_width and block_height are expected as defines when the kernel is built.
  */
 __kernel void computeVKernel(
        //Discretization parameters
        int nx_, int ny_,
        float dx_, float dy_, float dt_,
        //Physical parameters
        float g_, //< Gravitational constant
        float f_, //< Coriolis coefficient
        float r_, //< Bottom friction coefficient
        //Data
        __global float* H_ptr_, int H_pitch_,
        __global float* U_ptr_, int U_pitch_,
        __global float* V_ptr_, int V_pitch_,
        __global float* eta_ptr_, int eta_pitch_,
        // Wind stress parameters
        int wind_stress_type_, 
        float tau0_, float rho_, float alpha_, float xm_, float Rc_,
        float x0_, float y0_,
        float u0_, float v0_,
        float t_) {
    //Local memory tiles. Row j of a tile holds global row by + j - 1,
    //i.e., the tiles include one extra row before the block
    __local float H_shared[block_height+1][block_width];
    __local float U_shared[block_height+1][block_width+1];
    __local float eta_shared[block_height+1][block_width];
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    //Index of block within domain
    const int bx = get_local_size(0) * get_group_id(0);
    const int by = get_local_size(1) * get_group_id(1);
    //Index of cell within domain
    const int ti = get_global_id(0); 
    const int tj = get_global_id(1);
    //Compute pointer to row "tj" in the V array
    __global float* const V_row = (__global float*) ((__global char*) V_ptr_ + V_pitch_*tj);
    //Read current V
    float V_current = 0.0f;
    if (ti < nx_ && tj < ny_+1) {
        V_current = V_row[ti];
    }
    //Read H and eta into local memory
    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
        const int l = by + j - 1;
        //Compute the pointer to row "l" in the H and eta arrays
        //(only dereferenced when "l" is a valid row)
        __global float* const H_row = (__global float*) ((__global char*) H_ptr_ + H_pitch_*l);
        __global float* const eta_row = (__global float*) ((__global char*) eta_ptr_ + eta_pitch_*l);
        for (int i=tx; i<block_width; i+=get_local_size(0)) {
            const int k = bx + i;
            //H and eta only have ny_ rows: row ny_ must not be read
            if (k < nx_ && l >= 0 && l < ny_) {
                H_shared[j][i] = H_row[k];
                eta_shared[j][i] = eta_row[k];
            }
            else {
                H_shared[j][i] = 0.0f;
                eta_shared[j][i] = 0.0f;
            }
        }
    }
    //Read U into local memory (U has nx_+1 columns, hence one more tile column)
    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
        const int l = by + j - 1;
        //Compute the pointer to row "l" in the U array
        __global float* const U_row = (__global float*) ((__global char*) U_ptr_ + U_pitch_*l);
        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
            const int k = bx + i;
            if (k < nx_+1 && l >= 0 && l < ny_) {
                U_shared[j][i] = U_row[k];
            }
            else {
                U_shared[j][i] = 0.0f;
            }
        }
    }
    //Make sure all threads have read into local memory
    barrier(CLK_LOCAL_MEM_FENCE);
    //Reconstruct H at the V position (mean of the cells on either side of the face)
    float H_m = 0.5f*(H_shared[ty][tx] + H_shared[ty+1][tx]);
    //Reconstruct U at the V position
    float U_m;
    if (ti==0) {
        //First column: only the two U values at column ti+1 are used
        U_m = 0.5f*(U_shared[ty][tx+1] + U_shared[ty+1][tx+1]);
    }
    else if (ti==nx_-1) {
        //Last column: only the two U values at column ti are used
        U_m = 0.5f*(U_shared[ty][tx] + U_shared[ty+1][tx]);
    }
    else {
        U_m = 0.25f*(U_shared[ty][tx] + U_shared[ty][tx+1]
                + U_shared[ty+1][tx] + U_shared[ty+1][tx+1]);
    }
    //Calculate the friction coefficient
    float B = H_m/(H_m + r_*dt_);
    //Calculate the gravitational effect
    float P = g_*H_m*(eta_shared[ty][tx] - eta_shared[ty+1][tx])/dy_;
    //Calculate the wind shear stress
    float Y = windStressY(
        wind_stress_type_, 
        dx_, dy_, dt_,
        tau0_, rho_, alpha_, xm_, Rc_,
        x0_, y0_,
        u0_, v0_,
        t_);
    //Compute the V at the next timestep
    float V_next = B*(V_current + dt_*(-f_*U_m + P + Y) );
    //Write to main memory (work-items outside of the V array write nothing)
    if (ti < nx_ && tj < ny_+1) {
        //Closed boundaries 
        if (tj == 0) {
            V_next = 0.0f;
        }
        else if (tj == ny_) {
            V_next = 0.0f;
        }
        V_row[ti] = V_next;
    }
 }
--- a/SWESimulators/FBL_eta_kernel.opencl
+++ b/SWESimulators/FBL_eta_kernel.opencl
@ -1,113 +0,0 @@
 /*
 This OpenCL kernel implements part of the Forward Backward Linear 
 numerical scheme for the shallow water equations, described in 
 L. P. Røed, "Documentation of simple ocean models for use in ensemble
 predictions", Met no report 2012/3 and 2012/5 .
 Copyright (C) 2016  SINTEF ICT
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 /**
  * Kernel that evolves eta one step in time.
  *
  * One work-item updates one value of eta, in place, from the differences
  * of U along x and of V along y. eta is accessed as nx_ x ny_ values,
  * U as (nx_+1) x ny_ values and V as nx_ x (ny_+1) values. All pitches
  * are row strides in bytes. g_, f_, r_, H_ptr_ and H_pitch_ are not used
  * by this kernel.
  */
 __kernel void computeEtaKernel(
        //Discretization parameters
        int nx_, int ny_,
        float dx_, float dy_, float dt_,
        //Physical parameters
        float g_, //< Gravitational constant
        float f_, //< Coriolis coefficient
        float r_, //< Bottom friction coefficient
        //Data
        __global float* H_ptr_, int H_pitch_,
        __global float* U_ptr_, int U_pitch_,
        __global float* V_ptr_, int V_pitch_,
        __global float* eta_ptr_, int eta_pitch_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    //Index of block within domain
    const int bx = get_local_size(0) * get_group_id(0);
    const int by = get_local_size(1) * get_group_id(1);
    //Index of cell within domain
    const int ti = get_global_id(0); 
    const int tj = get_global_id(1);
    //Local memory tiles: U needs one extra column and V one extra row,
    //since each cell reads the U and V values on both of its sides
    __local float U_shared[block_height][block_width+1];
    __local float V_shared[block_height+1][block_width];
    //Compute pointer to row "tj" in the eta array
    __global float* const eta_row = (__global float*) ((__global char*) eta_ptr_ + eta_pitch_*tj);
    //Read current eta
    float eta_current = 0.0f;
    if (ti < nx_ && tj < ny_) {
        eta_current = eta_row[ti];
    }
    //Read U into local memory
    for (int j=ty; j<block_height; j+=get_local_size(1)) {
        const unsigned int l = by + j;
        //Compute the pointer to row "l" in the U array
        __global float* const U_row = (__global float*) ((__global char*) U_ptr_ + U_pitch_*l);
        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
            const unsigned int k = bx + i;
            //Values outside of the U array are read as zero
            if (k < nx_+1 && l < ny_) {
                U_shared[j][i] = U_row[k];
            }
            else {
                U_shared[j][i] = 0.0f;
            }
        }
    }
    //Read V into local memory
    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
        const unsigned int l = by + j;
        //Compute the pointer to row "l" in the V array
        __global float* const V_row = (__global float*) ((__global char*) V_ptr_ + V_pitch_*l);
        for (int i=tx; i<block_width; i+=get_local_size(0)) {
            const unsigned int k = bx + i;
            //Values outside of the V array are read as zero
            if (k < nx_ && l < ny_+1) {
                V_shared[j][i] = V_row[k];
            }
            else {
                V_shared[j][i] = 0.0f;
            }
        }
    }
    //Make sure all threads have read into local memory
    barrier(CLK_LOCAL_MEM_FENCE);
    //Compute the eta at the next timestep
    float eta_next = eta_current - dt_/dx_ * (U_shared[ty][tx+1] - U_shared[ty][tx])
                                 - dt_/dy_ * (V_shared[ty+1][tx] - V_shared[ty][tx]);
    //Write to main memory (work-items outside of the eta array write nothing)
    if (ti < nx_ && tj < ny_) {
        eta_row[ti] = eta_next;
    }
 }
--- a/SWESimulators/FORCE.py
+++ b/SWESimulators/FORCE.py
@ -22,7 +22,11 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #Import packages we need
 import numpy as np
-import pyopencl as cl #OpenCL in Python
+
 import pycuda.compiler as cuda_compiler
 import pycuda.gpuarray
 import pycuda.driver as cuda
 from SWESimulators import Common
@ -53,24 +57,27 @@ class FORCE:
    g: Gravitational acceleration (9.81 m/s^2)
    """
    def __init__(self, \
-                 cl_ctx, \
+                 context, \
                 h0, hu0, hv0, \
                 nx, ny, \
                 dx, dy, dt, \
                 g, \
                 block_width=16, block_height=16):
-        self.cl_ctx = cl_ctx
+        #Create a CUDA stream
-
+        self.stream = cuda.Stream()
        #Create an OpenCL command queue
        self.cl_queue = cl.CommandQueue(self.cl_ctx)
        #Get kernels
-        self.kernel = Common.get_kernel(self.cl_ctx, "FORCE_kernel.opencl", block_width, block_height)
+        self.force_module = context.get_kernel("FORCE_kernel.cu", block_width, block_height)
        self.force_kernel = self.force_module.get_function("FORCEKernel")
        self.force_kernel.prepare("iiffffPiPiPiPiPiPi")
        #Create data by uploading to device
        ghost_cells_x = 1
        ghost_cells_y = 1
-        self.cl_data = Common.SWEDataArkawaA(self.cl_ctx, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
+        self.data = Common.SWEDataArakawaA(self.stream, \
                            nx, ny, \
                            ghost_cells_x, ghost_cells_y, \
                            h0, hu0, hv0)
        #Save input parameters
        #Notice that we need to specify them in the correct dataformat for the
@ -86,10 +93,10 @@ class FORCE:
        self.t = np.float32(0.0)
        #Compute kernel launch parameters
-        self.local_size = (block_width, block_height) 
+        self.local_size = (block_width, block_height, 1) 
        self.global_size = ( \
-                       int(np.ceil(self.nx / float(self.local_size[0])) * self.local_size[0]), \
+                       int(np.ceil(self.nx / float(self.local_size[0]))), \
-                       int(np.ceil(self.ny / float(self.local_size[1])) * self.local_size[1]) \
+                       int(np.ceil(self.ny / float(self.local_size[1]))) \
                      ) 
@ -109,20 +116,20 @@ class FORCE:
            if (local_dt <= 0.0):
                break
-            self.kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+            self.force_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
                    self.nx, self.ny, \
                    self.dx, self.dy, local_dt, \
                    self.g, \
-                    self.cl_data.h0.data, self.cl_data.h0.pitch, \
+                    self.data.h0.data.gpudata, self.data.h0.pitch, \
-                    self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                    self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                    self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                    self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                    self.cl_data.h1.data, self.cl_data.h1.pitch, \
+                    self.data.h1.data.gpudata, self.data.h1.pitch, \
-                    self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                    self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                    self.cl_data.hv1.data, self.cl_data.hv1.pitch)
+                    self.data.hv1.data.gpudata, self.data.hv1.pitch)
            self.t += local_dt
-            self.cl_data.swap()
+            self.data.swap()
        return self.t
@ -131,5 +138,5 @@ class FORCE:
    def download(self):
-        return self.cl_data.download(self.cl_queue)
+        return self.data.download(self.stream)
--- a/SWESimulators/FORCE_kernel.opencl
+++ b/SWESimulators/FORCE_kernel.opencl
@ -19,14 +19,15 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
-#include "common.opencl"
+#include "common.cu"
 /**
  * Computes the flux along the x axis for all faces
  */
-void computeFluxF(__local float Q[3][block_height+2][block_width+2],
+__device__ 
-                  __local float F[3][block_height+1][block_width+1],
+void computeFluxF(float Q[3][block_height+2][block_width+2],
                  float F[3][block_height+1][block_width+1],
                  const float g_, const float dx_, const float dt_) {
    //Index of thread within block
@ -34,16 +35,17 @@ void computeFluxF(__local float Q[3][block_height+2][block_width+2],
    const int ty = get_local_id(1);
    //Compute fluxes along the x axis
-    for (int j=ty; j<block_height; j+=get_local_size(1)) {
+    {
        int j=ty;
        const int l = j + 1; //Skip ghost cells
-        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
+        for (int i=tx; i<block_width+1; i+=block_width) {
            const int k = i;
            // Q at interface from the right and left
-            const float3 Qp = (float3)(Q[0][l][k+1],
+            const float3 Qp = make_float3(Q[0][l][k+1],
                                          Q[1][l][k+1],
                                          Q[2][l][k+1]);
-            const float3 Qm = (float3)(Q[0][l][k],
+            const float3 Qm = make_float3(Q[0][l][k],
                                          Q[1][l][k],
                                          Q[2][l][k]);
@ -54,32 +56,33 @@ void computeFluxF(__local float Q[3][block_height+2][block_width+2],
            F[2][j][i] = flux.z;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
 }
 /**
  * Computes the flux along the y axis for all faces
  */
-void computeFluxG(__local float Q[3][block_height+2][block_width+2],
+__device__ 
-                  __local float G[3][block_height+1][block_width+1],
+void computeFluxG(float Q[3][block_height+2][block_width+2],
                  float G[3][block_height+1][block_width+1],
                  const float g_, const float dy_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    //Compute fluxes along the y axis
-    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
+    for (int j=ty; j<block_height+1; j+=block_height) {
        const int l = j;
-        for (int i=tx; i<block_width; i+=get_local_size(0)) {            
+        {
            int i=tx;
            const int k = i + 1; //Skip ghost cells
            // Q at interface from the right and left
            // Note that we swap hu and hv
-            const float3 Qp = (float3)(Q[0][l+1][k],
+            const float3 Qp = make_float3(Q[0][l+1][k],
                                          Q[2][l+1][k],
                                          Q[1][l+1][k]);
-            const float3 Qm = (float3)(Q[0][l][k],
+            const float3 Qm = make_float3(Q[0][l][k],
                                          Q[2][l][k],
                                          Q[1][l][k]);
@ -91,39 +94,26 @@ void computeFluxG(__local float Q[3][block_height+2][block_width+2],
            G[2][j][i] = flux.y;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
 }
-__kernel void swe_2D(
+__global__ void FORCEKernel(
        int nx_, int ny_,
        float dx_, float dy_, float dt_,
        float g_,
        //Input h^n
-        __global float* h0_ptr_, int h0_pitch_,
+        float* h0_ptr_, int h0_pitch_,
-        __global float* hu0_ptr_, int hu0_pitch_,
+        float* hu0_ptr_, int hu0_pitch_,
-        __global float* hv0_ptr_, int hv0_pitch_,
+        float* hv0_ptr_, int hv0_pitch_,
        //Output h^{n+1}
-        __global float* h1_ptr_, int h1_pitch_,
+        float* h1_ptr_, int h1_pitch_,
-        __global float* hu1_ptr_, int hu1_pitch_,
+        float* hu1_ptr_, int hu1_pitch_,
-        __global float* hv1_ptr_, int hv1_pitch_) {
+        float* hv1_ptr_, int hv1_pitch_) {
-    //Index of thread within block
+    __shared__ float Q[3][block_height+2][block_width+2];
-    const int tx = get_local_id(0);
+    __shared__ float F[3][block_height+1][block_width+1];
    const int ty = get_local_id(1);
    //Index of block within domain
    const int bx = get_local_size(0) * get_group_id(0);
    const int by = get_local_size(1) * get_group_id(1);
    //Index of cell within domain
    const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
    const int tj = get_global_id(1) + 1;
    __local float Q[3][block_height+2][block_width+2];
    __local float F[3][block_height+1][block_width+1];
    //Read into shared memory
@ -131,34 +121,28 @@ __kernel void swe_2D(
               hu0_ptr_, hu0_pitch_,
               hv0_ptr_, hv0_pitch_,
               Q, nx_, ny_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    //Save our input variables
    const float h0  = Q[0][ty+1][tx+1];
    const float hu0 = Q[1][ty+1][tx+1];
    const float hv0 = Q[2][ty+1][tx+1];
    //Set boundary conditions
    noFlowBoundary1(Q, nx_, ny_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    //Compute flux along x, and evolve
    computeFluxF(Q, F, g_, dx_, dt_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    evolveF1(Q, F, nx_, ny_, dx_, dt_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    //Set boundary conditions
    noFlowBoundary1(Q, nx_, ny_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    //Compute flux along y, and evolve
    computeFluxG(Q, F, g_, dy_, dt_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    evolveG1(Q, F, nx_, ny_, dy_, dt_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    //Write to main memory
    writeBlock1(h1_ptr_, h1_pitch_,
--- a/SWESimulators/HLL.py
+++ b/SWESimulators/HLL.py
@ -21,9 +21,12 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #Import packages we need
 import numpy as np
 import pyopencl as cl #OpenCL in Python
 from SWESimulators import Common
 import pycuda.compiler as cuda_compiler
 import pycuda.gpuarray
 import pycuda.driver as cuda
 from SWESimulators import Common
@ -39,8 +42,8 @@ class HLL:
    """
    Initialization routine
    h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
-    u0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
+    hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
-    v0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
+    hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
    nx: Number of cells along x-axis
    ny: Number of cells along y-axis
    dx: Grid cell spacing along x-axis (20 000 m)
@ -49,24 +52,27 @@ class HLL:
    g: Gravitational acceleration (9.81 m/s^2)
    """
    def __init__(self, \
-                 cl_ctx,
+                 context, \
-                 h0, u0, v0, \
+                 h0, hu0, hv0, \
                 nx, ny, \
                 dx, dy, dt, \
                 g, \
                 block_width=16, block_height=16):
-        self.cl_ctx = cl_ctx
+        #Create a CUDA stream
-
+        self.stream = cuda.Stream()
        #Create an OpenCL command queue
        self.cl_queue = cl.CommandQueue(self.cl_ctx)
        #Get kernels
-        self.lxf_kernel = Common.get_kernel(self.cl_ctx, "HLL_kernel.opencl", block_width, block_height)
+        self.hll_module = context.get_kernel("HLL_kernel.cu", block_width, block_height)
        self.hll_kernel = self.hll_module.get_function("HLLKernel")
        self.hll_kernel.prepare("iiffffPiPiPiPiPiPi")
        #Create data by uploading to device
        ghost_cells_x = 1
        ghost_cells_y = 1
-        self.cl_data = Common.SWEDataArkawaA(self.cl_ctx, nx, ny, ghost_cells_x, ghost_cells_y, h0, u0, v0)
+        self.data = Common.SWEDataArakawaA(self.stream, \
                            nx, ny, \
                            ghost_cells_x, ghost_cells_y, \
                            h0, hu0, hv0)
        #Save input parameters
        #Notice that we need to specify them in the correct dataformat for the
@ -82,10 +88,10 @@ class HLL:
        self.t = np.float32(0.0)
        #Compute kernel launch parameters
-        self.local_size = (block_width, block_height) 
+        self.local_size = (block_width, block_height, 1) 
        self.global_size = ( \
-                       int(np.ceil(self.nx / float(self.local_size[0])) * self.local_size[0]), \
+                       int(np.ceil(self.nx / float(self.local_size[0]))), \
-                       int(np.ceil(self.ny / float(self.local_size[1])) * self.local_size[1]) \
+                       int(np.ceil(self.ny / float(self.local_size[1]))) \
                      ) 
@ -105,20 +111,20 @@ class HLL:
            if (local_dt <= 0.0):
                break
-            self.lxf_kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+            self.hll_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
                    self.nx, self.ny, \
                    self.dx, self.dy, local_dt, \
                    self.g, \
-                    self.cl_data.h0.data,  self.cl_data.h0.pitch,  \
+                    self.data.h0.data.gpudata,  self.data.h0.pitch,  \
-                    self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                    self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                    self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                    self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                    self.cl_data.h1.data,  self.cl_data.h1.pitch,  \
+                    self.data.h1.data.gpudata,  self.data.h1.pitch,  \
-                    self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                    self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                    self.cl_data.hv1.data, self.cl_data.hv1.pitch)
+                    self.data.hv1.data.gpudata, self.data.hv1.pitch)
            self.t += local_dt
-            self.cl_data.swap()
+            self.data.swap()
        return self.t
@ -127,5 +133,5 @@ class HLL:
    def download(self):
-        return self.cl_data.download(self.cl_queue)
+        return self.data.download(self.stream)
--- a/SWESimulators/HLL2.py
+++ b/SWESimulators/HLL2.py
@ -21,7 +21,11 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #Import packages we need
 import numpy as np
-import pyopencl as cl #OpenCL in Python
+
 import pycuda.compiler as cuda_compiler
 import pycuda.gpuarray
 import pycuda.driver as cuda
 from SWESimulators import Common
@ -50,25 +54,28 @@ class HLL2:
    g: Gravitational acceleration (9.81 m/s^2)
    """
    def __init__(self, \
-                 cl_ctx, \
+                 context, \
                 h0, hu0, hv0, \
                 nx, ny, \
                 dx, dy, dt, \
                 g, \
                 theta=1.8, \
                 block_width=16, block_height=16):
-        self.cl_ctx = cl_ctx
+        #Create a CUDA stream
-
+        self.stream = cuda.Stream()
        #Create an OpenCL command queue
        self.cl_queue = cl.CommandQueue(self.cl_ctx)
        #Get kernels
-        self.swe_kernel = Common.get_kernel(self.cl_ctx, "HLL2_kernel.opencl", block_width, block_height)
+        self.hll2_module = context.get_kernel("HLL2_kernel.cu", block_width, block_height)
        self.hll2_kernel = self.hll2_module.get_function("HLL2Kernel")
        self.hll2_kernel.prepare("iifffffiPiPiPiPiPiPi")
        #Create data by uploading to device
        ghost_cells_x = 2
        ghost_cells_y = 2
-        self.cl_data = Common.SWEDataArkawaA(self.cl_ctx, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
+        self.data = Common.SWEDataArakawaA(self.stream, \
                            nx, ny, \
                            ghost_cells_x, ghost_cells_y, \
                            h0, hu0, hv0)
        #Save input parameters
        #Notice that we need to specify them in the correct dataformat for the
@ -85,15 +92,15 @@ class HLL2:
        self.t = np.float32(0.0)
        #Compute kernel launch parameters
-        self.local_size = (block_width, block_height)
+        self.local_size = (block_width, block_height, 1)
        self.global_size = ( \
-                       int(np.ceil(self.nx / float(self.local_size[0])) * self.local_size[0]), \
+                       int(np.ceil(self.nx / float(self.local_size[0]))), \
-                       int(np.ceil(self.ny / float(self.local_size[1])) * self.local_size[1]) \
+                       int(np.ceil(self.ny / float(self.local_size[1]))) \
                      )
    def __str__(self):
-        return "Harten-Lax-van Leer contact discontinuity"
+        return "Harten-Lax-van Leer (2nd order)"
    """
@ -111,34 +118,34 @@ class HLL2:
                break
            #Along X, then Y
-            self.swe_kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+            self.hll2_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
                    self.nx, self.ny, \
                    self.dx, self.dy, local_dt, \
                    self.g, \
                    self.theta, \
                    np.int32(0), \
-                    self.cl_data.h0.data, self.cl_data.h0.pitch, \
+                    self.data.h0.data.gpudata, self.data.h0.pitch, \
-                    self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                    self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                    self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                    self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                    self.cl_data.h1.data, self.cl_data.h1.pitch, \
+                    self.data.h1.data.gpudata, self.data.h1.pitch, \
-                    self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                    self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                    self.cl_data.hv1.data, self.cl_data.hv1.pitch)
+                    self.data.hv1.data.gpudata, self.data.hv1.pitch)
-            self.cl_data.swap()
+            self.data.swap()
            #Along Y, then X
-            self.swe_kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+            self.hll2_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
                    self.nx, self.ny, \
                    self.dx, self.dy, local_dt, \
                    self.g, \
                    self.theta, \
                    np.int32(1), \
-                    self.cl_data.h0.data, self.cl_data.h0.pitch, \
+                    self.data.h0.data.gpudata, self.data.h0.pitch, \
-                    self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                    self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                    self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                    self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                    self.cl_data.h1.data, self.cl_data.h1.pitch, \
+                    self.data.h1.data.gpudata, self.data.h1.pitch, \
-                    self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                    self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                    self.cl_data.hv1.data, self.cl_data.hv1.pitch)
+                    self.data.hv1.data.gpudata, self.data.hv1.pitch)
-            self.cl_data.swap()
+            self.data.swap()
            self.t += local_dt
@ -148,5 +155,5 @@ class HLL2:
    def download(self):
-        return self.cl_data.download(self.cl_queue)
+        return self.data.download(self.stream)
--- a/SWESimulators/HLL2_kernel.opencl
+++ b/SWESimulators/HLL2_kernel.opencl
@ -18,7 +18,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
-#include "common.opencl"
+#include "common.cu"
@ -29,31 +29,33 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 /**
  * Computes the flux along the x axis for all faces
  */
-void computeFluxF(__local float Q[3][block_height+4][block_width+4],
+__device__
-                  __local float Qx[3][block_height+2][block_width+2],
+void computeFluxF(float Q[3][block_height+4][block_width+4],
-                  __local float F[3][block_height+1][block_width+1],
+                  float Qx[3][block_height+2][block_width+2],
                  float F[3][block_height+1][block_width+1],
                  const float g_, const float dx_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
-    for (int j=ty; j<block_height; j+=get_local_size(1)) {
+    {
        const int j=ty;
        const int l = j + 2; //Skip ghost cells
-        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
+        for (int i=tx; i<block_width+1; i+=block_width) {
            const int k = i + 1;
            // Reconstruct point values of Q at the left and right hand side 
            // of the cell for both the left (i) and right (i+1) cell 
-            const float3 Q_rl = (float3)(Q[0][l][k+1] - 0.5f*Qx[0][j][i+1],
+            const float3 Q_rl = make_float3(Q[0][l][k+1] - 0.5f*Qx[0][j][i+1],
                                            Q[1][l][k+1] - 0.5f*Qx[1][j][i+1],
                                            Q[2][l][k+1] - 0.5f*Qx[2][j][i+1]);
-            const float3 Q_rr = (float3)(Q[0][l][k+1] + 0.5f*Qx[0][j][i+1],
+            const float3 Q_rr = make_float3(Q[0][l][k+1] + 0.5f*Qx[0][j][i+1],
                                            Q[1][l][k+1] + 0.5f*Qx[1][j][i+1],
                                            Q[2][l][k+1] + 0.5f*Qx[2][j][i+1]);
-            const float3 Q_ll = (float3)(Q[0][l][k] - 0.5f*Qx[0][j][i],
+            const float3 Q_ll = make_float3(Q[0][l][k] - 0.5f*Qx[0][j][i],
                                            Q[1][l][k] - 0.5f*Qx[1][j][i],
                                            Q[2][l][k] - 0.5f*Qx[2][j][i]);
-            const float3 Q_lr = (float3)(Q[0][l][k] + 0.5f*Qx[0][j][i],
+            const float3 Q_lr = make_float3(Q[0][l][k] + 0.5f*Qx[0][j][i],
                                            Q[1][l][k] + 0.5f*Qx[1][j][i],
                                            Q[2][l][k] + 0.5f*Qx[2][j][i]);
@ -79,32 +81,34 @@ void computeFluxF(__local float Q[3][block_height+4][block_width+4],
 /**
  * Computes the flux along the y axis for all faces
  */
-void computeFluxG(__local float Q[3][block_height+4][block_width+4],
+__device__
-                  __local float Qy[3][block_height+2][block_width+2],
+void computeFluxG(float Q[3][block_height+4][block_width+4],
-                  __local float G[3][block_height+1][block_width+1],
+                  float Qy[3][block_height+2][block_width+2],
                  float G[3][block_height+1][block_width+1],
                  const float g_, const float dy_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
-    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
+    for (int j=ty; j<block_height+1; j+=block_height) {
        const int l = j + 1;
-        for (int i=tx; i<block_width; i+=get_local_size(0)) {            
+        { 
            int i=tx;
            const int k = i + 2; //Skip ghost cells
            // Reconstruct point values of Q at the left and right hand side 
            // of the cell for both the left (i) and right (i+1) cell 
            //Note that hu and hv are swapped ("transposing" the domain)!
-            const float3 Q_rl = (float3)(Q[0][l+1][k] - 0.5f*Qy[0][j+1][i],
+            const float3 Q_rl = make_float3(Q[0][l+1][k] - 0.5f*Qy[0][j+1][i],
                                            Q[2][l+1][k] - 0.5f*Qy[2][j+1][i],
                                            Q[1][l+1][k] - 0.5f*Qy[1][j+1][i]);
-            const float3 Q_rr = (float3)(Q[0][l+1][k] + 0.5f*Qy[0][j+1][i],
+            const float3 Q_rr = make_float3(Q[0][l+1][k] + 0.5f*Qy[0][j+1][i],
                                            Q[2][l+1][k] + 0.5f*Qy[2][j+1][i],
                                            Q[1][l+1][k] + 0.5f*Qy[1][j+1][i]);
-            const float3 Q_ll = (float3)(Q[0][l][k] - 0.5f*Qy[0][j][i],
+            const float3 Q_ll = make_float3(Q[0][l][k] - 0.5f*Qy[0][j][i],
                                            Q[2][l][k] - 0.5f*Qy[2][j][i],
                                            Q[1][l][k] - 0.5f*Qy[1][j][i]);
-            const float3 Q_lr = (float3)(Q[0][l][k] + 0.5f*Qy[0][j][i],
+            const float3 Q_lr = make_float3(Q[0][l][k] + 0.5f*Qy[0][j][i],
                                            Q[2][l][k] + 0.5f*Qy[2][j][i],
                                            Q[1][l][k] + 0.5f*Qy[1][j][i]);
@ -131,7 +135,7 @@ void computeFluxG(__local float Q[3][block_height+4][block_width+4],
-__kernel void swe_2D(
+__global__ void HLL2Kernel(
        int nx_, int ny_,
        float dx_, float dy_, float dt_,
        float g_,
@ -141,19 +145,19 @@ __kernel void swe_2D(
        int step_,
        //Input h^n
-        __global float* h0_ptr_, int h0_pitch_,
+        float* h0_ptr_, int h0_pitch_,
-        __global float* hu0_ptr_, int hu0_pitch_,
+        float* hu0_ptr_, int hu0_pitch_,
-        __global float* hv0_ptr_, int hv0_pitch_,
+        float* hv0_ptr_, int hv0_pitch_,
        //Output h^{n+1}
-        __global float* h1_ptr_, int h1_pitch_,
+        float* h1_ptr_, int h1_pitch_,
-        __global float* hu1_ptr_, int hu1_pitch_,
+        float* hu1_ptr_, int hu1_pitch_,
-        __global float* hv1_ptr_, int hv1_pitch_) {
+        float* hv1_ptr_, int hv1_pitch_) {
    //Shared memory variables
-    __local float Q[3][block_height+4][block_width+4];
+    __shared__ float Q[3][block_height+4][block_width+4];
-    __local float Qx[3][block_height+2][block_width+2];
+    __shared__ float Qx[3][block_height+2][block_width+2];
-    __local float F[3][block_height+1][block_width+1];
+    __shared__ float F[3][block_height+1][block_width+1];
@ -163,55 +167,55 @@ __kernel void swe_2D(
               hu0_ptr_, hu0_pitch_,
               hv0_ptr_, hv0_pitch_,
               Q, nx_, ny_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    //Set boundary conditions
    noFlowBoundary2(Q, nx_, ny_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    //Step 0 => evolve x first, then y
    if (step_ == 0) {
        //Compute fluxes along the x axis and evolve
        minmodSlopeX(Q, Qx, theta_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        computeFluxF(Q, Qx, F, g_, dx_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        evolveF2(Q, F, nx_, ny_, dx_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        //Set boundary conditions
        noFlowBoundary2(Q, nx_, ny_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        //Compute fluxes along the y axis and evolve
        minmodSlopeY(Q, Qx, theta_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        computeFluxG(Q, Qx, F, g_, dy_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        evolveG2(Q, F, nx_, ny_, dy_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
    }
    //Step 1 => evolve y first, then x
    else {
        //Compute fluxes along the y axis and evolve
        minmodSlopeY(Q, Qx, theta_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        computeFluxG(Q, Qx, F, g_, dy_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        evolveG2(Q, F, nx_, ny_, dy_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        //Set boundary conditions
        noFlowBoundary2(Q, nx_, ny_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        //Compute fluxes along the x axis and evolve
        minmodSlopeX(Q, Qx, theta_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        computeFluxF(Q, Qx, F, g_, dx_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        evolveF2(Q, F, nx_, ny_, dx_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
    }
--- a/SWESimulators/HLL_kernel.opencl
+++ b/SWESimulators/HLL_kernel.opencl
@ -19,7 +19,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
-#include "common.opencl"
+#include "common.cu"
@ -28,20 +28,22 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 /**
  * Computes the flux along the x axis for all faces
  */
-void computeFluxF(__local float Q[3][block_height+2][block_width+2],
+__device__
-                  __local float F[3][block_height+1][block_width+1],
+void computeFluxF(float Q[3][block_height+2][block_width+2],
                  float F[3][block_height+1][block_width+1],
                  const float g_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
-    for (int j=ty; j<block_height; j+=get_local_size(1)) {   
+    {
        const int j=ty;
        const int l = j + 1; //Skip ghost cells     
-        for (int i=tx; i<block_width+1; i+=get_local_size(0)) { 
+        for (int i=tx; i<block_width+1; i+=block_width) { 
            const int k = i;
-            const float3 Q_l  = (float3)(Q[0][l][k  ], Q[1][l][k  ], Q[2][l][k  ]);
+            const float3 Q_l  = make_float3(Q[0][l][k  ], Q[1][l][k  ], Q[2][l][k  ]);
-            const float3 Q_r  = (float3)(Q[0][l][k+1], Q[1][l][k+1], Q[2][l][k+1]);
+            const float3 Q_r  = make_float3(Q[0][l][k+1], Q[1][l][k+1], Q[2][l][k+1]);
            const float3 flux = HLL_flux(Q_l, Q_r, g_);
@ -58,23 +60,25 @@ void computeFluxF(__local float Q[3][block_height+2][block_width+2],
 /**
-  * Computes the flux along the x axis for all faces
+  * Computes the flux along the y axis for all faces
  */
-void computeFluxG(__local float Q[3][block_height+2][block_width+2],
+__device__
-                  __local float G[3][block_height+1][block_width+1],
+void computeFluxG(float Q[3][block_height+2][block_width+2],
                  float G[3][block_height+1][block_width+1],
                  const float g_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
-    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
+    for (int j=ty; j<block_height+1; j+=block_height) {
        const int l = j;
-        for (int i=tx; i<block_width; i+=get_local_size(0)) {            
+        {
            const int i=tx;
            const int k = i + 1; //Skip ghost cells
            //Note that hu and hv are swapped ("transposing" the domain)!
-            const float3 Q_l = (float3)(Q[0][l  ][k], Q[2][l  ][k], Q[1][l  ][k]);
+            const float3 Q_l = make_float3(Q[0][l  ][k], Q[2][l  ][k], Q[1][l  ][k]);
-            const float3 Q_r = (float3)(Q[0][l+1][k], Q[2][l+1][k], Q[1][l+1][k]);
+            const float3 Q_r = make_float3(Q[0][l+1][k], Q[2][l+1][k], Q[1][l+1][k]);
            // Computed flux
            const float3 flux = HLL_flux(Q_l, Q_r, g_);
@ -100,23 +104,23 @@ void computeFluxG(__local float Q[3][block_height+2][block_width+2],
-__kernel void swe_2D(
+__global__ void HLLKernel(
        int nx_, int ny_,
        float dx_, float dy_, float dt_,
        float g_,
        //Input h^n
-        __global float* h0_ptr_, int h0_pitch_,
+        float* h0_ptr_, int h0_pitch_,
-        __global float* hu0_ptr_, int hu0_pitch_,
+        float* hu0_ptr_, int hu0_pitch_,
-        __global float* hv0_ptr_, int hv0_pitch_,
+        float* hv0_ptr_, int hv0_pitch_,
        //Output h^{n+1}
-        __global float* h1_ptr_, int h1_pitch_,
+        float* h1_ptr_, int h1_pitch_,
-        __global float* hu1_ptr_, int hu1_pitch_,
+        float* hu1_ptr_, int hu1_pitch_,
-        __global float* hv1_ptr_, int hv1_pitch_) {
+        float* hv1_ptr_, int hv1_pitch_) {
    //Shared memory variables
-    __local float Q[3][block_height+2][block_width+2];
+    __shared__ float Q[3][block_height+2][block_width+2];
-    __local float F[3][block_height+1][block_width+1];
+    __shared__ float F[3][block_height+1][block_width+1];
    //Read into shared memory
@ -124,28 +128,30 @@ __kernel void swe_2D(
               hu0_ptr_, hu0_pitch_,
               hv0_ptr_, hv0_pitch_,
               Q, nx_, ny_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    noFlowBoundary1(Q, nx_, ny_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    //Compute F flux
    computeFluxF(Q, F, g_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    evolveF1(Q, F, nx_, ny_, dx_, dt_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    //Set boundary conditions
    noFlowBoundary1(Q, nx_, ny_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    //Compute G flux
    computeFluxG(Q, F, g_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    evolveG1(Q, F, nx_, ny_, dy_, dt_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    //Q[0][get_local_id(1) + 1][get_local_id(0) + 1] += 0.1;
    // Write to main memory for all internal cells
--- a/SWESimulators/KP07.py
+++ b/SWESimulators/KP07.py
@ -26,7 +26,11 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #Import packages we need
 import numpy as np
-import pyopencl as cl #OpenCL in Python
+
 import pycuda.compiler as cuda_compiler
 import pycuda.gpuarray
 import pycuda.driver as cuda
 from SWESimulators import Common
@ -63,26 +67,25 @@ class KP07:
    wind_v0: Translation speed along y for moving cyclone (-0.5*u0)
    """
    def __init__(self, \
-                 cl_ctx, \
+                 context, \
                 h0, hu0, hv0, \
                 nx, ny, \
                 dx, dy, dt, \
-                 g, f=0.0, r=0.0, \
+                 g, theta=1.3, \
-                 theta=1.3, use_rk2=True,
+                 r=0.0, use_rk2=True,
                 wind_stress=Common.WindStressParams(), \
                 block_width=16, block_height=16):
-        self.cl_ctx = cl_ctx
+        #Create a CUDA stream
-                 
+        self.stream = cuda.Stream()
        #Create an OpenCL command queue
        self.cl_queue = cl.CommandQueue(self.cl_ctx)
        #Get kernels
-        self.kp07_kernel = Common.get_kernel(self.cl_ctx, "KP07_kernel.opencl", block_width, block_height)
+        self.kp07_module = context.get_kernel("KP07_kernel.cu", block_width, block_height)
        self.kp07_kernel = self.kp07_module.get_function("KP07Kernel")
        self.kp07_kernel.prepare("iiffffffiPiPiPiPiPiPi")
        #Create data by uploading to device
        ghost_cells_x = 2
        ghost_cells_y = 2
-        self.cl_data = Common.SWEDataArkawaA(self.cl_ctx, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
+        self.data = Common.SWEDataArakawaA(self.stream, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
        #Save input parameters
        #Notice that we need to specify them in the correct dataformat for the
@ -93,26 +96,24 @@ class KP07:
        self.dy = np.float32(dy)
        self.dt = np.float32(dt)
        self.g = np.float32(g)
        self.f = np.float32(f)
        self.r = np.float32(r)
        self.theta = np.float32(theta)
        self.r = np.float32(r)
        self.use_rk2 = use_rk2
        self.wind_stress = wind_stress
        #Initialize time
        self.t = np.float32(0.0)
        #Compute kernel launch parameters
-        self.local_size = (block_width, block_height) 
+        self.local_size = (block_width, block_height, 1) 
        self.global_size = ( \
-                       int(np.ceil(self.nx / float(self.local_size[0])) * self.local_size[0]), \
+                       int(np.ceil(self.nx / float(self.local_size[0]))), \
-                       int(np.ceil(self.ny / float(self.local_size[1])) * self.local_size[1]) \
+                       int(np.ceil(self.ny / float(self.local_size[1]))) \
                      ) 
    def __str__(self):
-        return "Kurganov-Petrova"
+        return "Kurganov-Petrova 2007"
    """
    Function which steps n timesteps
@ -127,64 +128,47 @@ class KP07:
                break
            if (self.use_rk2):
-                self.kp07_kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+                self.kp07_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
                        self.nx, self.ny, \
                        self.dx, self.dy, local_dt, \
                        self.g, \
                        self.theta, \
                        self.f, \
                        self.r, \
                        np.int32(0), \
-                        self.cl_data.h0.data,  self.cl_data.h0.pitch,  \
+                        self.data.h0.data.gpudata,  self.data.h0.pitch,  \
-                        self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                        self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                        self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                        self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                        self.cl_data.h1.data,  self.cl_data.h1.pitch,  \
+                        self.data.h1.data.gpudata,  self.data.h1.pitch,  \
-                        self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                        self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                        self.cl_data.hv1.data, self.cl_data.hv1.pitch, \
+                        self.data.hv1.data.gpudata, self.data.hv1.pitch)
-                        self.wind_stress.type, \
+                        
-                        self.wind_stress.tau0, self.wind_stress.rho, self.wind_stress.alpha, self.wind_stress.xm, self.wind_stress.Rc, \
+                self.kp07_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
                        self.wind_stress.x0, self.wind_stress.y0, \
                        self.wind_stress.u0, self.wind_stress.v0, \
                        self.t)
                self.kp07_kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
                        self.nx, self.ny, \
                        self.dx, self.dy, local_dt, \
                        self.g, \
                        self.theta, \
                        self.f, \
                        self.r, \
                        np.int32(1), \
-                        self.cl_data.h1.data,  self.cl_data.h1.pitch,  \
+                        self.data.h1.data.gpudata,  self.data.h1.pitch,  \
-                        self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                        self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                        self.cl_data.hv1.data, self.cl_data.hv1.pitch, \
+                        self.data.hv1.data.gpudata, self.data.hv1.pitch, \
-                        self.cl_data.h0.data,  self.cl_data.h0.pitch,  \
+                        self.data.h0.data.gpudata,  self.data.h0.pitch,  \
-                        self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                        self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                        self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                        self.data.hv0.data.gpudata, self.data.hv0.pitch)
                        self.wind_stress.type, \
                        self.wind_stress.tau0, self.wind_stress.rho, self.wind_stress.alpha, self.wind_stress.xm, self.wind_stress.Rc, \
                        self.wind_stress.x0, self.wind_stress.y0, \
                        self.wind_stress.u0, self.wind_stress.v0, \
                        self.t)
            else:
-                self.kp07_kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+                self.kp07_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
                        self.nx, self.ny, \
                        self.dx, self.dy, local_dt, \
                        self.g, \
                        self.theta, \
                        self.f, \
                        self.r, \
                        np.int32(0), \
-                        self.cl_data.h0.data,  self.cl_data.h0.pitch,  \
+                        self.data.h0.data.gpudata,  self.data.h0.pitch,  \
-                        self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                        self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                        self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                        self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                        self.cl_data.h1.data,  self.cl_data.h1.pitch,  \
+                        self.data.h1.data.gpudata,  self.data.h1.pitch,  \
-                        self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                        self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                        self.cl_data.hv1.data, self.cl_data.hv1.pitch, \
+                        self.data.hv1.data.gpudata, self.data.hv1.pitch)
                        self.wind_stress.type, \
                        self.wind_stress.tau0, self.wind_stress.rho, self.wind_stress.alpha, self.wind_stress.xm, self.wind_stress.Rc, \
                        self.wind_stress.x0, self.wind_stress.y0, \
                        self.wind_stress.u0, self.wind_stress.v0, \
                        self.t)
                self.cl_data.swap()
            self.t += local_dt
@ -196,5 +180,5 @@ class KP07:
    def download(self):
-        return self.cl_data.download(self.cl_queue)
+        return self.data.download(self.stream)
--- a/SWESimulators/KP07_dimsplit.py
+++ b/SWESimulators/KP07_dimsplit.py
@ -26,7 +26,11 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #Import packages we need
 import numpy as np
-import pyopencl as cl #OpenCL in Python
+
 import pycuda.compiler as cuda_compiler
 import pycuda.gpuarray
 import pycuda.driver as cuda
 from SWESimulators import Common
@ -51,25 +55,25 @@ class KP07_dimsplit:
    g: Gravitational acceleration (9.81 m/s^2)
    """
    def __init__(self, \
-                 cl_ctx, \
+                 context, \
                 h0, hu0, hv0, \
                 nx, ny, \
                 dx, dy, dt, \
                 g, \
                 theta=1.3, \
                 block_width=16, block_height=16):
-        self.cl_ctx = cl_ctx
+        #Create a CUDA stream
-                 
+        self.stream = cuda.Stream()
        #Create an OpenCL command queue
        self.cl_queue = cl.CommandQueue(self.cl_ctx)
        #Get kernels
-        self.swe_kernel = Common.get_kernel(self.cl_ctx, "KP07_dimsplit_kernel.opencl", block_width, block_height)
+        self.kp07_dimsplit_module = context.get_kernel("KP07_dimsplit_kernel.cu", block_width, block_height)
        self.kp07_dimsplit_kernel = self.kp07_dimsplit_module.get_function("KP07DimsplitKernel")
        self.kp07_dimsplit_kernel.prepare("iifffffiPiPiPiPiPiPi")
        #Create data by uploading to device
        ghost_cells_x = 2
        ghost_cells_y = 2
-        self.cl_data = Common.SWEDataArkawaA(self.cl_ctx, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
+        self.data = Common.SWEDataArakawaA(self.stream, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
        #Save input parameters
        #Notice that we need to specify them in the correct dataformat for the
@ -86,15 +90,15 @@ class KP07_dimsplit:
        self.t = np.float32(0.0)
        #Compute kernel launch parameters
-        self.local_size = (block_width, block_height) 
+        self.local_size = (block_width, block_height, 1) 
        self.global_size = ( \
-                       int(np.ceil(self.nx / float(self.local_size[0])) * self.local_size[0]), \
+                       int(np.ceil(self.nx / float(self.local_size[0]))), \
-                       int(np.ceil(self.ny / float(self.local_size[1])) * self.local_size[1]) \
+                       int(np.ceil(self.ny / float(self.local_size[1]))) \
                      ) 
    def __str__(self):
-        return "Kurganov-Petrova dimensionally split"
+        return "Kurganov-Petrova 2007 dimensionally split"
    """
@ -113,34 +117,34 @@ class KP07_dimsplit:
                break
            #Along X, then Y
-            self.swe_kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+            self.kp07_dimsplit_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
                    self.nx, self.ny, \
                    self.dx, self.dy, local_dt, \
                    self.g, \
                    self.theta, \
                    np.int32(0), \
-                    self.cl_data.h0.data, self.cl_data.h0.pitch, \
+                    self.data.h0.data.gpudata,  self.data.h0.pitch, \
-                    self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                    self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                    self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                    self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                    self.cl_data.h1.data, self.cl_data.h1.pitch, \
+                    self.data.h1.data.gpudata,  self.data.h1.pitch, \
-                    self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                    self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                    self.cl_data.hv1.data, self.cl_data.hv1.pitch)
+                    self.data.hv1.data.gpudata, self.data.hv1.pitch)
-            self.cl_data.swap()
+            self.data.swap()
            #Along Y, then X
-            self.swe_kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+            self.kp07_dimsplit_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
                    self.nx, self.ny, \
                    self.dx, self.dy, local_dt, \
                    self.g, \
                    self.theta, \
                    np.int32(1), \
-                    self.cl_data.h0.data, self.cl_data.h0.pitch, \
+                    self.data.h0.data.gpudata,  self.data.h0.pitch, \
-                    self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                    self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                    self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                    self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                    self.cl_data.h1.data, self.cl_data.h1.pitch, \
+                    self.data.h1.data.gpudata,  self.data.h1.pitch, \
-                    self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                    self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                    self.cl_data.hv1.data, self.cl_data.hv1.pitch)
+                    self.data.hv1.data.gpudata, self.data.hv1.pitch)
-            self.cl_data.swap()
+            self.data.swap()
            self.t += 2.0*local_dt
@ -151,5 +155,5 @@ class KP07_dimsplit:
    def download(self):
-        return self.cl_data.download(self.cl_queue)
+        return self.data.download(self.stream)
--- a/SWESimulators/KP07_dimsplit_kernel.opencl
+++ b/SWESimulators/KP07_dimsplit_kernel.opencl
@ -24,35 +24,36 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
-#include "common.opencl"
+#include "common.cu"
-
+__device__
-void computeFluxF(__local float Q[3][block_height+4][block_width+4],
+void computeFluxF(float Q[3][block_height+4][block_width+4],
-                  __local float Qx[3][block_height+2][block_width+2],
+                  float Qx[3][block_height+2][block_width+2],
-                  __local float F[3][block_height+1][block_width+1],
+                  float F[3][block_height+1][block_width+1],
                  const float g_, const float dx_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
-    for (int j=ty; j<block_height; j+=get_local_size(1)) {
+    {
        int j=ty;
        const int l = j + 2; //Skip ghost cells
-        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
+        for (int i=tx; i<block_width+1; i+=block_width) {
            const int k = i + 1;
            // Reconstruct point values of Q at the left and right hand side 
            // of the cell for both the left (i) and right (i+1) cell 
-            const float3 Q_rl = (float3)(Q[0][l][k+1] - 0.5f*Qx[0][j][i+1],
+            const float3 Q_rl = make_float3(Q[0][l][k+1] - 0.5f*Qx[0][j][i+1],
                                            Q[1][l][k+1] - 0.5f*Qx[1][j][i+1],
                                            Q[2][l][k+1] - 0.5f*Qx[2][j][i+1]);
-            const float3 Q_rr = (float3)(Q[0][l][k+1] + 0.5f*Qx[0][j][i+1],
+            const float3 Q_rr = make_float3(Q[0][l][k+1] + 0.5f*Qx[0][j][i+1],
                                            Q[1][l][k+1] + 0.5f*Qx[1][j][i+1],
                                            Q[2][l][k+1] + 0.5f*Qx[2][j][i+1]);
-            const float3 Q_ll = (float3)(Q[0][l][k] - 0.5f*Qx[0][j][i],
+            const float3 Q_ll = make_float3(Q[0][l][k] - 0.5f*Qx[0][j][i],
                                            Q[1][l][k] - 0.5f*Qx[1][j][i],
                                            Q[2][l][k] - 0.5f*Qx[2][j][i]);
-            const float3 Q_lr = (float3)(Q[0][l][k] + 0.5f*Qx[0][j][i],
+            const float3 Q_lr = make_float3(Q[0][l][k] + 0.5f*Qx[0][j][i],
                                            Q[1][l][k] + 0.5f*Qx[1][j][i],
                                            Q[2][l][k] + 0.5f*Qx[2][j][i]);
@ -71,32 +72,34 @@ void computeFluxF(__local float Q[3][block_height+4][block_width+4],
    }    
 }
-void computeFluxG(__local float Q[3][block_height+4][block_width+4],
+__device__
-                  __local float Qy[3][block_height+2][block_width+2],
+void computeFluxG(float Q[3][block_height+4][block_width+4],
-                  __local float G[3][block_height+1][block_width+1],
+                  float Qy[3][block_height+2][block_width+2],
                  float G[3][block_height+1][block_width+1],
                  const float g_, const float dy_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
-    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
+    for (int j=ty; j<block_height+1; j+=block_height) {
        const int l = j + 1;
-        for (int i=tx; i<block_width; i+=get_local_size(0)) {            
+        {
            int i=tx;
            const int k = i + 2; //Skip ghost cells
            // Reconstruct point values of Q at the left and right hand side 
            // of the cell for both the left (i) and right (i+1) cell 
            //Note that hu and hv are swapped ("transposing" the domain)!
-            const float3 Q_rl = (float3)(Q[0][l+1][k] - 0.5f*Qy[0][j+1][i],
+            const float3 Q_rl = make_float3(Q[0][l+1][k] - 0.5f*Qy[0][j+1][i],
                                            Q[2][l+1][k] - 0.5f*Qy[2][j+1][i],
                                            Q[1][l+1][k] - 0.5f*Qy[1][j+1][i]);
-            const float3 Q_rr = (float3)(Q[0][l+1][k] + 0.5f*Qy[0][j+1][i],
+            const float3 Q_rr = make_float3(Q[0][l+1][k] + 0.5f*Qy[0][j+1][i],
                                            Q[2][l+1][k] + 0.5f*Qy[2][j+1][i],
                                            Q[1][l+1][k] + 0.5f*Qy[1][j+1][i]);
-            const float3 Q_ll = (float3)(Q[0][l][k] - 0.5f*Qy[0][j][i],
+            const float3 Q_ll = make_float3(Q[0][l][k] - 0.5f*Qy[0][j][i],
                                            Q[2][l][k] - 0.5f*Qy[2][j][i],
                                            Q[1][l][k] - 0.5f*Qy[1][j][i]);
-            const float3 Q_lr = (float3)(Q[0][l][k] + 0.5f*Qy[0][j][i],
+            const float3 Q_lr = make_float3(Q[0][l][k] + 0.5f*Qy[0][j][i],
                                            Q[2][l][k] + 0.5f*Qy[2][j][i],
                                            Q[1][l][k] + 0.5f*Qy[1][j][i]);
@ -122,7 +125,7 @@ void computeFluxG(__local float Q[3][block_height+4][block_width+4],
 /**
  * This dimensionally split kernel computes the 2D numerical scheme, sweeping along x then y when step_ is 0, and along y then x otherwise
  */
-__kernel void swe_2D(
+__global__ void KP07DimsplitKernel(
        int nx_, int ny_,
        float dx_, float dy_, float dt_,
        float g_,
@ -132,20 +135,20 @@ __kernel void swe_2D(
        int step_,
        //Input h^n
-        __global float* h0_ptr_, int h0_pitch_,
+        float* h0_ptr_, int h0_pitch_,
-        __global float* hu0_ptr_, int hu0_pitch_,
+        float* hu0_ptr_, int hu0_pitch_,
-        __global float* hv0_ptr_, int hv0_pitch_,
+        float* hv0_ptr_, int hv0_pitch_,
        //Output h^{n+1}
-        __global float* h1_ptr_, int h1_pitch_,
+        float* h1_ptr_, int h1_pitch_,
-        __global float* hu1_ptr_, int hu1_pitch_,
+        float* hu1_ptr_, int hu1_pitch_,
-        __global float* hv1_ptr_, int hv1_pitch_) {
+        float* hv1_ptr_, int hv1_pitch_) {
    //Shared memory variables
-    __local float Q[3][block_height+4][block_width+4];
+    __shared__ float Q[3][block_height+4][block_width+4];
-    __local float Qx[3][block_height+2][block_width+2];
+    __shared__ float Qx[3][block_height+2][block_width+2];
-    __local float F[3][block_height+1][block_width+1];
+    __shared__ float F[3][block_height+1][block_width+1];
@ -154,12 +157,12 @@ __kernel void swe_2D(
               hu0_ptr_, hu0_pitch_,
               hv0_ptr_, hv0_pitch_,
               Q, nx_, ny_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    //Fix boundary conditions
    noFlowBoundary2(Q, nx_, ny_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
@ -167,45 +170,45 @@ __kernel void swe_2D(
    if (step_ == 0) {
        //Compute fluxes along the x axis and evolve
        minmodSlopeX(Q, Qx, theta_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        computeFluxF(Q, Qx, F, g_, dx_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        evolveF2(Q, F, nx_, ny_, dx_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        //Set boundary conditions
        noFlowBoundary2(Q, nx_, ny_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        //Compute fluxes along the y axis and evolve
        minmodSlopeY(Q, Qx, theta_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        computeFluxG(Q, Qx, F, g_, dy_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        evolveG2(Q, F, nx_, ny_, dy_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
    }
    //Step 1 => evolve y first, then x
    else {
        //Compute fluxes along the y axis and evolve
        minmodSlopeY(Q, Qx, theta_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        computeFluxG(Q, Qx, F, g_, dy_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        evolveG2(Q, F, nx_, ny_, dy_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        //Set boundary conditions
        noFlowBoundary2(Q, nx_, ny_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        //Compute fluxes along the x axis and evolve
        minmodSlopeX(Q, Qx, theta_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        computeFluxF(Q, Qx, F, g_, dx_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        evolveF2(Q, F, nx_, ny_, dx_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
    }
--- a/SWESimulators/KP07_kernel.opencl
+++ b/SWESimulators/KP07_kernel.opencl
@ -24,27 +24,28 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
-#include "common.opencl"
+#include "common.cu"
-
+__device__
-void computeFluxF(__local float Q[3][block_height+4][block_width+4],
+void computeFluxF(float Q[3][block_height+4][block_width+4],
-                  __local float Qx[3][block_height+2][block_width+2],
+                  float Qx[3][block_height+2][block_width+2],
-                  __local float F[3][block_height+1][block_width+1],
+                  float F[3][block_height+1][block_width+1],
                  const float g_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
-    for (int j=ty; j<block_height; j+=get_local_size(1)) {
+    {
        int j=ty;
        const int l = j + 2; //Skip ghost cells
-        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
+        for (int i=tx; i<block_width+1; i+=block_width) {
            const int k = i + 1;
            // Q at interface from the right and left
-            const float3 Qp = (float3)(Q[0][l][k+1] - 0.5f*Qx[0][j][i+1],
+            const float3 Qp = make_float3(Q[0][l][k+1] - 0.5f*Qx[0][j][i+1],
                                          Q[1][l][k+1] - 0.5f*Qx[1][j][i+1],
                                          Q[2][l][k+1] - 0.5f*Qx[2][j][i+1]);
-            const float3 Qm = (float3)(Q[0][l][k  ] + 0.5f*Qx[0][j][i  ],
+            const float3 Qm = make_float3(Q[0][l][k  ] + 0.5f*Qx[0][j][i  ],
                                          Q[1][l][k  ] + 0.5f*Qx[1][j][i  ],
                                          Q[2][l][k  ] + 0.5f*Qx[2][j][i  ]);
@ -57,24 +58,26 @@ void computeFluxF(__local float Q[3][block_height+4][block_width+4],
    }    
 }
-void computeFluxG(__local float Q[3][block_height+4][block_width+4],
+__device__
-                  __local float Qy[3][block_height+2][block_width+2],
+void computeFluxG(float Q[3][block_height+4][block_width+4],
-                  __local float G[3][block_height+1][block_width+1],
+                  float Qy[3][block_height+2][block_width+2],
                  float G[3][block_height+1][block_width+1],
                  const float g_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
-    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
+    for (int j=ty; j<block_height+1; j+=block_height) {
        const int l = j + 1;
-        for (int i=tx; i<block_width; i+=get_local_size(0)) {            
+        {
            int i=tx;
            const int k = i + 2; //Skip ghost cells
            // Q at interface from the right and left
            // Note that we swap hu and hv
-            const float3 Qp = (float3)(Q[0][l+1][k] - 0.5f*Qy[0][j+1][i],
+            const float3 Qp = make_float3(Q[0][l+1][k] - 0.5f*Qy[0][j+1][i],
                                          Q[2][l+1][k] - 0.5f*Qy[2][j+1][i],
                                          Q[1][l+1][k] - 0.5f*Qy[1][j+1][i]);
-            const float3 Qm = (float3)(Q[0][l  ][k] + 0.5f*Qy[0][j  ][i],
+            const float3 Qm = make_float3(Q[0][l  ][k] + 0.5f*Qy[0][j  ][i],
                                          Q[2][l  ][k] + 0.5f*Qy[2][j  ][i],
                                          Q[1][l  ][k] + 0.5f*Qy[1][j  ][i]);
@ -94,56 +97,44 @@ void computeFluxG(__local float Q[3][block_height+4][block_width+4],
 /**
  * This unsplit kernel computes the 2D numerical scheme with a TVD RK2 time integration scheme
  */
-__kernel void swe_2D(
+__global__ void KP07Kernel(
        int nx_, int ny_,
        float dx_, float dy_, float dt_,
        float g_,
        float theta_,
        float f_, //< Coriolis coefficient
        float r_, //< Bottom friction coefficient
        int step_,
        //Input h^n
-        __global float* h0_ptr_, int h0_pitch_,
+        float* h0_ptr_, int h0_pitch_,
-        __global float* hu0_ptr_, int hu0_pitch_,
+        float* hu0_ptr_, int hu0_pitch_,
-        __global float* hv0_ptr_, int hv0_pitch_,
+        float* hv0_ptr_, int hv0_pitch_,
        //Output h^{n+1}
-        __global float* h1_ptr_, int h1_pitch_,
+        float* h1_ptr_, int h1_pitch_,
-        __global float* hu1_ptr_, int hu1_pitch_,
+        float* hu1_ptr_, int hu1_pitch_,
-        __global float* hv1_ptr_, int hv1_pitch_,
+        float* hv1_ptr_, int hv1_pitch_) {
        //Wind stress parameters
        int wind_stress_type_, 
        float tau0_, float rho_, float alpha_, float xm_, float Rc_,
        float x0_, float y0_,
        float u0_, float v0_,
        float t_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    //Index of block within domain
    const int bx = get_local_size(0) * get_group_id(0);
    const int by = get_local_size(1) * get_group_id(1);
    //Index of cell within domain
    const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
    const int tj = get_global_id(1) + 2;
    //Shared memory variables
-    __local float Q[3][block_height+4][block_width+4];
+    __shared__ float Q[3][block_height+4][block_width+4];
    //The following slightly wastes memory, but enables us to reuse the 
    //functions in common.opencl
-    __local float Qx[3][block_height+2][block_width+2];
+    __shared__ float Qx[3][block_height+2][block_width+2];
-    __local float Qy[3][block_height+2][block_width+2];
+    __shared__ float Qy[3][block_height+2][block_width+2];
-    __local float F[3][block_height+1][block_width+1];
+    __shared__ float F[3][block_height+1][block_width+1];
-    __local float G[3][block_height+1][block_width+1];
+    __shared__ float G[3][block_height+1][block_width+1];
@ -152,24 +143,24 @@ __kernel void swe_2D(
               hu0_ptr_, hu0_pitch_,
               hv0_ptr_, hv0_pitch_,
               Q, nx_, ny_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    //Fix boundary conditions
    noFlowBoundary2(Q, nx_, ny_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    //Reconstruct slopes along the x and y axes
    minmodSlopeX(Q, Qx, theta_);
    minmodSlopeY(Q, Qy, theta_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    //Compute fluxes along the x and y axis
    computeFluxF(Q, Qx, F, g_);
    computeFluxG(Q, Qy, G, g_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    //Sum fluxes and advance in time for all internal cells
@ -177,33 +168,16 @@ __kernel void swe_2D(
        const int i = tx + 2; //Skip local ghost cells, i.e., +2
        const int j = ty + 2;
        const float X = windStressX(
            wind_stress_type_, 
            dx_, dy_, dt_,
            tau0_, rho_, alpha_, xm_, Rc_,
            x0_, y0_,
            u0_, v0_,
            t_);
        const float Y = windStressY(
            wind_stress_type_, 
            dx_, dy_, dt_,
            tau0_, rho_, alpha_, xm_, Rc_,
            x0_, y0_,
            u0_, v0_,
            t_);
        const float h1  = Q[0][j][i] + (F[0][ty][tx] - F[0][ty  ][tx+1]) * dt_ / dx_ 
                                     + (G[0][ty][tx] - G[0][ty+1][tx  ]) * dt_ / dy_;
        const float hu1 = Q[1][j][i] + (F[1][ty][tx] - F[1][ty  ][tx+1]) * dt_ / dx_ 
-                                     + (G[1][ty][tx] - G[1][ty+1][tx  ]) * dt_ / dy_
+                                     + (G[1][ty][tx] - G[1][ty+1][tx  ]) * dt_ / dy_;
                                     + dt_*X - dt_*f_*Q[2][j][i];
        const float hv1 = Q[2][j][i] + (F[2][ty][tx] - F[2][ty  ][tx+1]) * dt_ / dx_ 
-                                     + (G[2][ty][tx] - G[2][ty+1][tx  ]) * dt_ / dy_
+                                     + (G[2][ty][tx] - G[2][ty+1][tx  ]) * dt_ / dy_;
                                     + dt_*Y + dt_*f_*Q[1][j][i];
-        __global float* const h_row  = (__global float*) ((__global char*) h1_ptr_ + h1_pitch_*tj);
+        float* const h_row  = (float*) ((char*) h1_ptr_ + h1_pitch_*tj);
-        __global float* const hu_row = (__global float*) ((__global char*) hu1_ptr_ + hu1_pitch_*tj);
+        float* const hu_row = (float*) ((char*) hu1_ptr_ + hu1_pitch_*tj);
-        __global float* const hv_row = (__global float*) ((__global char*) hv1_ptr_ + hv1_pitch_*tj);
+        float* const hv_row = (float*) ((char*) hv1_ptr_ + hv1_pitch_*tj);
        const float C = 2.0f*r_*dt_/Q[0][j][i];
--- a/SWESimulators/LxF.py
+++ b/SWESimulators/LxF.py
@ -22,7 +22,11 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #Import packages we need
 import numpy as np
-import pyopencl as cl #OpenCL in Python
+
 import pycuda.compiler as cuda_compiler
 import pycuda.gpuarray
 import pycuda.driver as cuda
 from SWESimulators import Common
@ -31,12 +35,8 @@ from SWESimulators import Common
 """
-Class that solves the SW equations using the Forward-Backward linear scheme
+Class that solves the SW equations using the Lax Friedrichs scheme
 """
 class LxF:
@ -53,24 +53,27 @@ class LxF:
    g: Gravitational acceleration (9.81 m/s^2)
    """
    def __init__(self, \
-                 cl_ctx, \
+                 context, \
                 h0, hu0, hv0, \
                 nx, ny, \
                 dx, dy, dt, \
                 g, \
                 block_width=16, block_height=16):
-        self.cl_ctx = cl_ctx
+        #Create a CUDA stream
-
+        self.stream = cuda.Stream()
        #Create an OpenCL command queue
        self.cl_queue = cl.CommandQueue(self.cl_ctx)
        #Get kernels
-        self.lxf_kernel = Common.get_kernel(self.cl_ctx, "LxF_kernel.opencl", block_width, block_height)
+        self.lxf_module = context.get_kernel("LxF_kernel.cu", block_width, block_height)
        self.lxf_kernel = self.lxf_module.get_function("LxFKernel")
        self.lxf_kernel.prepare("iiffffPiPiPiPiPiPi")
        #Create data by uploading to device
        ghost_cells_x = 1
        ghost_cells_y = 1
-        self.cl_data = Common.SWEDataArkawaA(self.cl_ctx, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
+        self.data = Common.SWEDataArakawaA(self.stream, \
                            nx, ny, \
                            ghost_cells_x, ghost_cells_y, \
                            h0, hu0, hv0)
        #Save input parameters
        #Notice that we need to specify them in the correct dataformat for the
@ -86,10 +89,10 @@ class LxF:
        self.t = np.float32(0.0)
        #Compute kernel launch parameters
-        self.local_size = (block_width, block_height) 
+        self.local_size = (block_width, block_height, 1) 
        self.global_size = ( \
-                       int(np.ceil(self.nx / float(self.local_size[0])) * self.local_size[0]), \
+                       int(np.ceil(self.nx / float(self.local_size[0]))), \
-                       int(np.ceil(self.ny / float(self.local_size[1])) * self.local_size[1]) \
+                       int(np.ceil(self.ny / float(self.local_size[1]))) \
                      ) 
@ -109,20 +112,20 @@ class LxF:
            if (local_dt <= 0.0):
                break
-            self.lxf_kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+            self.lxf_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
                    self.nx, self.ny, \
                    self.dx, self.dy, local_dt, \
                    self.g, \
-                    self.cl_data.h0.data, self.cl_data.h0.pitch, \
+                    self.data.h0.data.gpudata, self.data.h0.pitch, \
-                    self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                    self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                    self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                    self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                    self.cl_data.h1.data, self.cl_data.h1.pitch, \
+                    self.data.h1.data.gpudata, self.data.h1.pitch, \
-                    self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                    self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                    self.cl_data.hv1.data, self.cl_data.hv1.pitch)
+                    self.data.hv1.data.gpudata, self.data.hv1.pitch)
            self.t += local_dt
-            self.cl_data.swap()
+            self.data.swap()
        return self.t
@ -131,5 +134,5 @@ class LxF:
    def download(self):
-        return self.cl_data.download(self.cl_queue)
+        return self.data.download(self.stream)
--- a/SWESimulators/LxF_kernel.opencl
+++ b/SWESimulators/LxF_kernel.opencl
@ -19,29 +19,31 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
-#include "common.opencl"
+#include "common.cu"
 /**
  * Computes the flux along the x axis for all faces
  */
-void computeFluxF(__local float Q[3][block_height+2][block_width+2],
+__device__ 
-                  __local float F[3][block_height][block_width+1],
+void computeFluxF(float Q[3][block_height+2][block_width+2],
                  float F[3][block_height][block_width+1],
                  const float g_, const float dx_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
-    for (int j=ty; j<block_height; j+=get_local_size(1)) {
+    {
        const int j=ty;
        const int l = j + 1; //Skip ghost cells
-        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
+        for (int i=tx; i<block_width+1; i+=block_width) {
            const int k = i;
            // Q at interface from the right and left
-            const float3 Qp = (float3)(Q[0][l][k+1],
+            const float3 Qp = make_float3(Q[0][l][k+1],
                                          Q[1][l][k+1],
                                          Q[2][l][k+1]);
-            const float3 Qm = (float3)(Q[0][l][k],
+            const float3 Qm = make_float3(Q[0][l][k],
                                          Q[1][l][k],
                                          Q[2][l][k]);
@ -58,24 +60,26 @@ void computeFluxF(__local float Q[3][block_height+2][block_width+2],
 /**
  * Computes the flux along the y axis for all faces
  */
-void computeFluxG(__local float Q[3][block_height+2][block_width+2],
+__device__ 
-                  __local float G[3][block_height+1][block_width],
+void computeFluxG(float Q[3][block_height+2][block_width+2],
                  float G[3][block_height+1][block_width],
                  const float g_, const float dy_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
-    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
+    for (int j=ty; j<block_height+1; j+=block_height) {
        const int l = j;
-        for (int i=tx; i<block_width; i+=get_local_size(0)) {
+        {
            const int i=tx;
            const int k = i + 1; //Skip ghost cells
            // Q at interface from the right and left
            // Note that we swap hu and hv
-            const float3 Qp = (float3)(Q[0][l+1][k],
+            const float3 Qp = make_float3(Q[0][l+1][k],
                                          Q[2][l+1][k],
                                          Q[1][l+1][k]);
-            const float3 Qm = (float3)(Q[0][l][k],
+            const float3 Qm = make_float3(Q[0][l][k],
                                          Q[2][l][k],
                                          Q[1][l][k]);
@ -90,45 +94,45 @@ void computeFluxG(__local float Q[3][block_height+2][block_width+2],
 }
-__kernel void swe_2D(
+__global__ void LxFKernel(
        int nx_, int ny_,
        float dx_, float dy_, float dt_,
        float g_,
        //Input h^n
-        __global float* h0_ptr_, int h0_pitch_,
+        float* h0_ptr_, int h0_pitch_,
-        __global float* hu0_ptr_, int hu0_pitch_,
+        float* hu0_ptr_, int hu0_pitch_,
-        __global float* hv0_ptr_, int hv0_pitch_,
+        float* hv0_ptr_, int hv0_pitch_,
        //Output h^{n+1}
-        __global float* h1_ptr_, int h1_pitch_,
+        float* h1_ptr_, int h1_pitch_,
-        __global float* hu1_ptr_, int hu1_pitch_,
+        float* hu1_ptr_, int hu1_pitch_,
-        __global float* hv1_ptr_, int hv1_pitch_) {
+        float* hv1_ptr_, int hv1_pitch_) {
    //Index of cell within domain
    const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
    const int tj = get_global_id(1) + 1;
-    __local float Q[3][block_height+2][block_width+2];
+    __shared__ float Q[3][block_height+2][block_width+2];
-    __local float F[3][block_height][block_width+1];
+    __shared__ float F[3][block_height][block_width+1];
-    __local float G[3][block_height+1][block_width];
+    __shared__ float G[3][block_height+1][block_width];
    //Read into shared memory
    readBlock1(h0_ptr_, h0_pitch_,
               hu0_ptr_, hu0_pitch_,
               hv0_ptr_, hv0_pitch_,
               Q, nx_, ny_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    //Set boundary conditions
    noFlowBoundary1(Q, nx_, ny_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    //Compute fluxes along the x and y axis
    computeFluxF(Q, F, g_, dx_, dt_);
    computeFluxG(Q, G, g_, dy_, dt_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    //Evolve for all internal cells
@ -147,9 +151,9 @@ __kernel void swe_2D(
        const float hv1 = Q[2][j][i] + (F[2][ty][tx] - F[2][ty  ][tx+1]) * dt_ / dx_ 
                                     + (G[2][ty][tx] - G[2][ty+1][tx  ]) * dt_ / dy_;
-        __global float* const h_row  = (__global float*) ((__global char*) h1_ptr_ + h1_pitch_*tj);
+        float* const h_row  = (float*) ((char*) h1_ptr_ + h1_pitch_*tj);
-        __global float* const hu_row = (__global float*) ((__global char*) hu1_ptr_ + hu1_pitch_*tj);
+        float* const hu_row = (float*) ((char*) hu1_ptr_ + hu1_pitch_*tj);
-        __global float* const hv_row = (__global float*) ((__global char*) hv1_ptr_ + hv1_pitch_*tj);
+        float* const hv_row = (float*) ((char*) hv1_ptr_ + hv1_pitch_*tj);
        h_row[ti] = h1;
        hu_row[ti] = hu1;
--- a/SWESimulators/WAF.py
+++ b/SWESimulators/WAF.py
@ -22,7 +22,11 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #Import packages we need
 import numpy as np
-import pyopencl as cl #OpenCL in Python
+
 import pycuda.compiler as cuda_compiler
 import pycuda.gpuarray
 import pycuda.driver as cuda
 from SWESimulators import Common
@ -47,24 +51,24 @@ class WAF:
    g: Gravitational acceleration (9.81 m/s^2)
    """
    def __init__(self, \
-                 cl_ctx, \
+                 context, \
                 h0, hu0, hv0, \
                 nx, ny, \
                 dx, dy, dt, \
                 g, \
                 block_width=16, block_height=16):
-        self.cl_ctx = cl_ctx
+        #Create a CUDA stream
-                 
+        self.stream = cuda.Stream()
        #Create an OpenCL command queue
        self.cl_queue = cl.CommandQueue(self.cl_ctx)
        #Get kernels
-        self.kernel = Common.get_kernel(self.cl_ctx, "WAF_kernel.opencl", block_width, block_height)
+        self.waf_module = context.get_kernel("WAF_kernel.cu", block_width, block_height)
        self.waf_kernel = self.waf_module.get_function("WAFKernel")
        self.waf_kernel.prepare("iiffffiPiPiPiPiPiPi")
        #Create data by uploading to device
        ghost_cells_x = 2
        ghost_cells_y = 2
-        self.cl_data = Common.SWEDataArkawaA(self.cl_ctx, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
+        self.data = Common.SWEDataArakawaA(self.stream, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
        #Save input parameters
        #Notice that we need to specify them in the correct dataformat for the
@ -80,14 +84,16 @@ class WAF:
        self.t = np.float32(0.0)
        #Compute kernel launch parameters
-        self.local_size = (block_width, block_height) 
+        self.local_size = (block_width, block_height, 1) 
        self.global_size = ( \
-                       int(np.ceil(self.nx / float(self.local_size[0])) * self.local_size[0]), \
+                       int(np.ceil(self.nx / float(self.local_size[0]))), \
-                       int(np.ceil(self.ny / float(self.local_size[1])) * self.local_size[1]) \
+                       int(np.ceil(self.ny / float(self.local_size[1]))) \
                      ) 
    def __str__(self):
        """Return the human-readable name of this numerical scheme."""
        return "Weighted average flux"
    """
    Function which steps n timesteps
@ -104,32 +110,30 @@ class WAF:
                break
            #Along X, then Y
-            self.kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+            self.waf_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
                    self.nx, self.ny, \
                    self.dx, self.dy, local_dt, \
                    self.g, \
                    np.int32(0), \
-                    self.cl_data.h0.data,  self.cl_data.h0.pitch,  \
+                    self.data.h0.data.gpudata,  self.data.h0.pitch,  \
-                    self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                    self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                    self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                    self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                    self.cl_data.h1.data,  self.cl_data.h1.pitch,  \
+                    self.data.h1.data.gpudata,  self.data.h1.pitch,  \
-                    self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                    self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                    self.cl_data.hv1.data, self.cl_data.hv1.pitch)
+                    self.data.hv1.data.gpudata, self.data.hv1.pitch)
            self.cl_data.swap()
            #Along Y, then X
-            self.kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+            self.waf_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
                    self.nx, self.ny, \
                    self.dx, self.dy, local_dt, \
                    self.g, \
                    np.int32(1), \
-                    self.cl_data.h0.data,  self.cl_data.h0.pitch,  \
+                    self.data.h1.data.gpudata,  self.data.h1.pitch,  \
-                    self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
+                    self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                    self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
+                    self.data.hv1.data.gpudata, self.data.hv1.pitch, \
-                    self.cl_data.h1.data,  self.cl_data.h1.pitch,  \
+                    self.data.h0.data.gpudata,  self.data.h0.pitch,  \
-                    self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
+                    self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                    self.cl_data.hv1.data, self.cl_data.hv1.pitch)
+                    self.data.hv0.data.gpudata, self.data.hv0.pitch)
            self.cl_data.swap()
            self.t += local_dt
@ -140,5 +144,5 @@ class WAF:
    def download(self):
-        return self.cl_data.download(self.cl_queue)
+        return self.data.download(self.stream)
--- a/SWESimulators/WAF_kernel.opencl
+++ b/SWESimulators/WAF_kernel.opencl
@ -24,30 +24,32 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
-#include "common.opencl"
+#include "common.cu"
 /**
  * Computes the flux along the x axis for all faces
  */
-void computeFluxF(__local float Q[3][block_height+4][block_width+4],
+__device__
-                  __local float F[3][block_height+1][block_width+1],
+void computeFluxF(float Q[3][block_height+4][block_width+4],
                  float F[3][block_height+1][block_width+1],
                  const float g_, const float dx_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
-    for (int j=ty; j<block_height; j+=get_local_size(1)) {
+    {
        int j=ty; 
        const int l = j + 2; //Skip ghost cells
-        for (int i=tx; i<block_width+1; i+=get_local_size(0)) {
+        for (int i=tx; i<block_width+1; i+=block_width) {
            const int k = i + 1;
            // Q at interface from the right and left
-            const float3 Ql2 = (float3)(Q[0][l][k-1], Q[1][l][k-1], Q[2][l][k-1]);
+            const float3 Ql2 = make_float3(Q[0][l][k-1], Q[1][l][k-1], Q[2][l][k-1]);
-            const float3 Ql1 = (float3)(Q[0][l][k  ], Q[1][l][k  ], Q[2][l][k  ]);
+            const float3 Ql1 = make_float3(Q[0][l][k  ], Q[1][l][k  ], Q[2][l][k  ]);
-            const float3 Qr1 = (float3)(Q[0][l][k+1], Q[1][l][k+1], Q[2][l][k+1]);
+            const float3 Qr1 = make_float3(Q[0][l][k+1], Q[1][l][k+1], Q[2][l][k+1]);
-            const float3 Qr2 = (float3)(Q[0][l][k+2], Q[1][l][k+2], Q[2][l][k+2]);
+            const float3 Qr2 = make_float3(Q[0][l][k+2], Q[1][l][k+2], Q[2][l][k+2]);
            // Computed flux
            const float3 flux = WAF_1D_flux(Ql2, Ql1, Qr1, Qr2, g_, dx_, dt_);
@ -68,24 +70,26 @@ void computeFluxF(__local float Q[3][block_height+4][block_width+4],
 /**
  * Computes the flux along the y axis for all faces
  */
-void computeFluxG(__local float Q[3][block_height+4][block_width+4],
+__device__
-                  __local float G[3][block_height+1][block_width+1],
+void computeFluxG(float Q[3][block_height+4][block_width+4],
                  float G[3][block_height+1][block_width+1],
                  const float g_, const float dy_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    //Compute fluxes along the y axis
-    for (int j=ty; j<block_height+1; j+=get_local_size(1)) {
+    for (int j=ty; j<block_height+1; j+=block_height) {
        const int l = j + 1;
-        for (int i=tx; i<block_width; i+=get_local_size(0)) {            
+        {
            int i=tx;
            const int k = i + 2; //Skip ghost cells
            // Q at interface from the right and left
            // Note that we swap hu and hv
-            const float3 Ql2 = (float3)(Q[0][l-1][k], Q[2][l-1][k], Q[1][l-1][k]);
+            const float3 Ql2 = make_float3(Q[0][l-1][k], Q[2][l-1][k], Q[1][l-1][k]);
-            const float3 Ql1 = (float3)(Q[0][l  ][k], Q[2][l  ][k], Q[1][l  ][k]);
+            const float3 Ql1 = make_float3(Q[0][l  ][k], Q[2][l  ][k], Q[1][l  ][k]);
-            const float3 Qr1 = (float3)(Q[0][l+1][k], Q[2][l+1][k], Q[1][l+1][k]);
+            const float3 Qr1 = make_float3(Q[0][l+1][k], Q[2][l+1][k], Q[1][l+1][k]);
-            const float3 Qr2 = (float3)(Q[0][l+2][k], Q[2][l+2][k], Q[1][l+2][k]);
+            const float3 Qr2 = make_float3(Q[0][l+2][k], Q[2][l+2][k], Q[1][l+2][k]);
            // Computed flux
            // Note that we swap back
@ -110,23 +114,23 @@ void computeFluxG(__local float Q[3][block_height+4][block_width+4],
-__kernel void swe_2D(
+__global__ void WAFKernel(
        int nx_, int ny_,
        float dx_, float dy_, float dt_,
        float g_, int step_,
        //Input h^n
-        __global float* h0_ptr_, int h0_pitch_,
+        float* h0_ptr_, int h0_pitch_,
-        __global float* hu0_ptr_, int hu0_pitch_,
+        float* hu0_ptr_, int hu0_pitch_,
-        __global float* hv0_ptr_, int hv0_pitch_,
+        float* hv0_ptr_, int hv0_pitch_,
        //Output h^{n+1}
-        __global float* h1_ptr_, int h1_pitch_,
+        float* h1_ptr_, int h1_pitch_,
-        __global float* hu1_ptr_, int hu1_pitch_,
+        float* hu1_ptr_, int hu1_pitch_,
-        __global float* hv1_ptr_, int hv1_pitch_) {    
+        float* hv1_ptr_, int hv1_pitch_) {    
    //Shared memory variables
-    __local float Q[3][block_height+4][block_width+4];
+    __shared__ float Q[3][block_height+4][block_width+4];
-    __local float F[3][block_height+1][block_width+1];
+    __shared__ float F[3][block_height+1][block_width+1];
@ -135,12 +139,12 @@ __kernel void swe_2D(
               hu0_ptr_, hu0_pitch_,
               hv0_ptr_, hv0_pitch_,
               Q, nx_, ny_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
    //Set boundary conditions
    noFlowBoundary2(Q, nx_, ny_);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    __syncthreads();
@ -148,37 +152,37 @@ __kernel void swe_2D(
    if (step_ == 0) {
        //Compute fluxes along the x axis and evolve
        computeFluxF(Q, F, g_, dx_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        evolveF2(Q, F, nx_, ny_, dx_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        //Fix boundary conditions
        noFlowBoundary2(Q, nx_, ny_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        //Compute fluxes along the y axis and evolve
        computeFluxG(Q, F, g_, dy_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        evolveG2(Q, F, nx_, ny_, dy_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
    }
    //Step 1 => evolve y first, then x
    else {
        //Compute fluxes along the y axis and evolve
        computeFluxG(Q, F, g_, dy_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        evolveG2(Q, F, nx_, ny_, dy_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        //Fix boundary conditions
        noFlowBoundary2(Q, nx_, ny_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        //Compute fluxes along the x axis and evolve
        computeFluxF(Q, F, g_, dx_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
        evolveF2(Q, F, nx_, ny_, dx_, dt_);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        __syncthreads();
    }
--- a/SWESimulators/common.opencl
+++ b/SWESimulators/common.opencl
@ -22,32 +22,97 @@ You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 /**
  * OpenCL-style accessor for the index of the calling thread inside its block.
  *
  * @param dim Axis selector: 0 -> threadIdx.x, 1 -> threadIdx.y, 2 -> threadIdx.z
  * @return The thread index along the requested axis, or -1 for any other dim
  */
 inline __device__ int get_local_id(int dim) {
    if (dim == 0) {
        return threadIdx.x;
    }
    if (dim == 1) {
        return threadIdx.y;
    }
    if (dim == 2) {
        return threadIdx.z;
    }
    //Unknown axis
    return -1;
 }
 /**
  * OpenCL-style accessor for the index of the calling thread's block.
  *
  * Declared inline, like get_local_id, because this helper is defined in a
  * file that is meant to be #included by the kernel sources.
  *
  * @param dim Axis selector: 0 -> blockIdx.x, 1 -> blockIdx.y, 2 -> blockIdx.z
  * @return The block index along the requested axis, or -1 for any other dim
  */
 inline __device__ int get_group_id(int dim) {
    switch(dim) {
        case 0: return blockIdx.x;
        case 1: return blockIdx.y;
        case 2: return blockIdx.z;
        default: return -1; //Unknown axis
    }
 }
 /**
  * OpenCL-style accessor for the index of the calling thread in the whole launch,
  * computed as blockDim*blockIdx + threadIdx along the requested axis.
  *
  * Declared inline, like get_local_id, because this helper is defined in a
  * file that is meant to be #included by the kernel sources.
  *
  * @param dim Axis selector: 0 -> x, 1 -> y, 2 -> z
  * @return The global thread index along the requested axis, or -1 for any other dim
  */
 inline __device__ int get_global_id(int dim) {
    switch(dim) {
        case 0: return blockDim.x*blockIdx.x + threadIdx.x;
        case 1: return blockDim.y*blockIdx.y + threadIdx.y;
        case 2: return blockDim.z*blockIdx.z + threadIdx.z;
        default: return -1; //Unknown axis
    }
 }
 /**
  * Float3 operators
  */
 /**
  * Scalar-times-vector product: multiplies each component of b by a.
  */
 inline __device__ float3 operator*(const float a, const float3 b) {
    const float scaled_x = a*b.x;
    const float scaled_y = a*b.y;
    const float scaled_z = a*b.z;
    return make_float3(scaled_x, scaled_y, scaled_z);
 }
 /**
  * Vector-by-scalar division: divides each component of a by b.
  */
 inline __device__ float3 operator/(const float3 a, const float b) {
    const float quot_x = a.x/b;
    const float quot_y = a.y/b;
    const float quot_z = a.z/b;
    return make_float3(quot_x, quot_y, quot_z);
 }
 /**
  * Component-wise difference a - b of two float3 values.
  */
 inline __device__ float3 operator-(const float3 a, const float3 b) {
    const float diff_x = a.x-b.x;
    const float diff_y = a.y-b.y;
    const float diff_z = a.z-b.z;
    return make_float3(diff_x, diff_y, diff_z);
 }
 /**
  * Component-wise sum a + b of two float3 values.
  */
 inline __device__ float3 operator+(const float3 a, const float3 b) {
    const float sum_x = a.x+b.x;
    const float sum_y = a.y+b.y;
    const float sum_z = a.z+b.z;
    return make_float3(sum_x, sum_y, sum_z);
 }
 /**
  * Limits f to the interval given by a (lower bound) and b (upper bound).
  *
  * @param f Value to limit
  * @param a Lower bound
  * @param b Upper bound
  * @return fmaxf(a, fminf(f, b))
  */
 inline __device__ __host__ float clamp(const float f, const float a, const float b) {
    //First cap from above, then from below
    const float capped = fminf(f, b);
    return fmaxf(a, capped);
 }
 /**
  * Reads a block of data  with one ghost cell for the shallow water equations
  */
-void readBlock1(__global float* h_ptr_, int h_pitch_,
+__device__ void readBlock1(float* h_ptr_, int h_pitch_,
-                __global float* hu_ptr_, int hu_pitch_,
+                float* hu_ptr_, int hu_pitch_,
-                __global float* hv_ptr_, int hv_pitch_,
+                float* hv_ptr_, int hv_pitch_,
-                __local float Q[3][block_height+2][block_width+2], 
+                float Q[3][block_height+2][block_width+2], 
                const int nx_, const int ny_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    //Index of block within domain
-    const int bx = get_local_size(0) * get_group_id(0);
+    const int bx = block_width * get_group_id(0);
-    const int by = get_local_size(1) * get_group_id(1);
+    const int by = block_height * get_group_id(1);
    //Read into shared memory
-    for (int j=ty; j<block_height+2; j+=get_local_size(1)) {
+    for (int j=ty; j<block_height+2; j+=block_height) {
        const int l = clamp(by + j, 0, ny_+1); // Out of bounds
        //Compute the pointer to current row in the arrays
-        __global float* const h_row = (__global float*) ((__global char*) h_ptr_ + h_pitch_*l);
+        float* const h_row  = (float*) ((char*) h_ptr_  + h_pitch_*l);
-        __global float* const hu_row = (__global float*) ((__global char*) hu_ptr_ + hu_pitch_*l);
+        float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*l);
-        __global float* const hv_row = (__global float*) ((__global char*) hv_ptr_ + hv_pitch_*l);
+        float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*l);
-        for (int i=tx; i<block_width+2; i+=get_local_size(0)) {
+        for (int i=tx; i<block_width+2; i+=block_width) {
            const int k = clamp(bx + i, 0, nx_+1); // Out of bounds
            Q[0][j][i] = h_row[k];
@ -64,29 +129,29 @@ void readBlock1(__global float* h_ptr_, int h_pitch_,
 /**
  * Reads a block of data  with two ghost cells for the shallow water equations
  */
-void readBlock2(__global float* h_ptr_, int h_pitch_,
+__device__ void readBlock2(float* h_ptr_, int h_pitch_,
-                __global float* hu_ptr_, int hu_pitch_,
+                float* hu_ptr_, int hu_pitch_,
-                __global float* hv_ptr_, int hv_pitch_,
+                float* hv_ptr_, int hv_pitch_,
-                __local float Q[3][block_height+4][block_width+4], 
+                float Q[3][block_height+4][block_width+4], 
                const int nx_, const int ny_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    //Index of block within domain
-    const int bx = get_local_size(0) * get_group_id(0);
+    const int bx = block_width * get_group_id(0);
-    const int by = get_local_size(1) * get_group_id(1);
+    const int by = block_height * get_group_id(1);
    //Read into shared memory
-    for (int j=ty; j<block_height+4; j+=get_local_size(1)) {
+    for (int j=ty; j<block_height+4; j+=block_height) {
        const int l = clamp(by + j, 0, ny_+3); // Out of bounds
        //Compute the pointer to current row in the arrays
-        __global float* const h_row = (__global float*) ((__global char*) h_ptr_ + h_pitch_*l);
+        float* const h_row  = (float*) ((char*) h_ptr_  + h_pitch_*l);
-        __global float* const hu_row = (__global float*) ((__global char*) hu_ptr_ + hu_pitch_*l);
+        float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*l);
-        __global float* const hv_row = (__global float*) ((__global char*) hv_ptr_ + hv_pitch_*l);
+        float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*l);
-        for (int i=tx; i<block_width+4; i+=get_local_size(0)) {
+        for (int i=tx; i<block_width+4; i+=block_width) {
            const int k = clamp(bx + i, 0, nx_+3); // Out of bounds
            Q[0][j][i] = h_row[k];
@ -102,10 +167,10 @@ void readBlock2(__global float* h_ptr_, int h_pitch_,
 /**
  * Writes a block of data to global memory for the shallow water equations.
  */
-void writeBlock1(__global float* h_ptr_, int h_pitch_,
+__device__ void writeBlock1(float* h_ptr_, int h_pitch_,
-                 __global float* hu_ptr_, int hu_pitch_,
+                 float* hu_ptr_, int hu_pitch_,
-                 __global float* hv_ptr_, int hv_pitch_,
+                 float* hv_ptr_, int hv_pitch_,
-                 __local float Q[3][block_height+2][block_width+2],
+                 float Q[3][block_height+2][block_width+2],
                 const int nx_, const int ny_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@ -120,9 +185,9 @@ void writeBlock1(__global float* h_ptr_, int h_pitch_,
        const int i = tx + 1; //Skip local ghost cells, i.e., +1
        const int j = ty + 1;
-        __global float* const h_row  = (__global float*) ((__global char*) h_ptr_ + h_pitch_*tj);
+        float* const h_row  = (float*) ((char*) h_ptr_  + h_pitch_*tj);
-        __global float* const hu_row = (__global float*) ((__global char*) hu_ptr_ + hu_pitch_*tj);
+        float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*tj);
-        __global float* const hv_row = (__global float*) ((__global char*) hv_ptr_ + hv_pitch_*tj);
+        float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*tj);
        h_row[ti]  = Q[0][j][i];
        hu_row[ti] = Q[1][j][i];
@ -137,10 +202,10 @@ void writeBlock1(__global float* h_ptr_, int h_pitch_,
 /**
  * Writes a block of data to global memory for the shallow water equations.
  */
-void writeBlock2(__global float* h_ptr_, int h_pitch_,
+__device__ void writeBlock2(float* h_ptr_, int h_pitch_,
-                 __global float* hu_ptr_, int hu_pitch_,
+                 float* hu_ptr_, int hu_pitch_,
-                 __global float* hv_ptr_, int hv_pitch_,
+                 float* hv_ptr_, int hv_pitch_,
-                 __local float Q[3][block_height+4][block_width+4], 
+                 float Q[3][block_height+4][block_width+4], 
                 const int nx_, const int ny_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@ -155,9 +220,9 @@ void writeBlock2(__global float* h_ptr_, int h_pitch_,
        const int i = tx + 2; //Skip local ghost cells, i.e., +2
        const int j = ty + 2;
-        __global float* const h_row  = (__global float*) ((__global char*) h_ptr_ + h_pitch_*tj);
+        float* const h_row  = (float*) ((char*) h_ptr_ + h_pitch_*tj);
-        __global float* const hu_row = (__global float*) ((__global char*) hu_ptr_ + hu_pitch_*tj);
+        float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*tj);
-        __global float* const hv_row = (__global float*) ((__global char*) hv_ptr_ + hv_pitch_*tj);
+        float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*tj);
        h_row[ti]  = Q[0][j][i];
        hu_row[ti] = Q[1][j][i];
@ -174,7 +239,7 @@ void writeBlock2(__global float* h_ptr_, int h_pitch_,
  * No flow boundary conditions for the shallow water equations
  * with one ghost cell in each direction
  */
-void noFlowBoundary1(__local float Q[3][block_height+2][block_width+2], const int nx_, const int ny_) {
+__device__ void noFlowBoundary1(float Q[3][block_height+2][block_width+2], const int nx_, const int ny_) {
    //Global index
    const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
    const int tj = get_global_id(1) + 1;
@ -218,7 +283,7 @@ void noFlowBoundary1(__local float Q[3][block_height+2][block_width+2], const in
  * No flow boundary conditions for the shallow water equations
  * with two ghost cells in each direction
  */
-void noFlowBoundary2(__local float Q[3][block_height+4][block_width+4], const int nx_, const int ny_) {
+__device__ void noFlowBoundary2(float Q[3][block_height+4][block_width+4], const int nx_, const int ny_) {
    //Global index
    const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
    const int tj = get_global_id(1) + 2;
@ -276,8 +341,8 @@ void noFlowBoundary2(__local float Q[3][block_height+4][block_width+4], const in
 /**
  * Evolves the solution in time along the x axis (dimensional splitting)
  */
-void evolveF1(__local float Q[3][block_height+2][block_width+2],
+__device__ void evolveF1(float Q[3][block_height+2][block_width+2],
-              __local float F[3][block_height+1][block_width+1],
+              float F[3][block_height+1][block_width+1],
              const int nx_, const int ny_,
              const float dx_, const float dt_) {
    //Index of thread within block
@ -306,8 +371,8 @@ void evolveF1(__local float Q[3][block_height+2][block_width+2],
 /**
  * Evolves the solution in time along the x axis (dimensional splitting)
  */
-void evolveF2(__local float Q[3][block_height+4][block_width+4],
+__device__ void evolveF2(float Q[3][block_height+4][block_width+4],
-              __local float F[3][block_height+1][block_width+1],
+              float F[3][block_height+1][block_width+1],
              const int nx_, const int ny_,
              const float dx_, const float dt_) {
    //Index of thread within block
@ -336,8 +401,8 @@ void evolveF2(__local float Q[3][block_height+4][block_width+4],
 /**
  * Evolves the solution in time along the y axis (dimensional splitting)
  */
-void evolveG1(__local float Q[3][block_height+2][block_width+2],
+__device__ void evolveG1(float Q[3][block_height+2][block_width+2],
-              __local float G[3][block_height+1][block_width+1],
+              float G[3][block_height+1][block_width+1],
              const int nx_, const int ny_,
              const float dy_, const float dt_) {
    //Index of thread within block
@ -367,8 +432,8 @@ void evolveG1(__local float Q[3][block_height+2][block_width+2],
 /**
  * Evolves the solution in time along the y axis (dimensional splitting)
  */
-void evolveG2(__local float Q[3][block_height+4][block_width+4],
+__device__ void evolveG2(float Q[3][block_height+4][block_width+4],
-              __local float G[3][block_height+1][block_width+1],
+              float G[3][block_height+1][block_width+1],
              const int nx_, const int ny_,
              const float dy_, const float dt_) {
    //Index of thread within block
@ -402,7 +467,7 @@ void evolveG2(__local float Q[3][block_height+4][block_width+4],
  * Reconstructs a slope using the minmod limiter based on three 
  * consecutive values
  */
-float minmodSlope(float left, float center, float right, float theta) {
+__device__ float minmodSlope(float left, float center, float right, float theta) {
    const float backward = (center - left) * theta;
    const float central = (right - left) * 0.5f;
    const float forward = (right - center) * theta;
@ -420,17 +485,18 @@ float minmodSlope(float left, float center, float right, float theta) {
 /**
  * Reconstructs a minmod slope for a whole block along x
  */
-void minmodSlopeX(__local float  Q[3][block_height+4][block_width+4],
+__device__ void minmodSlopeX(float  Q[3][block_height+4][block_width+4],
-                  __local float Qx[3][block_height+2][block_width+2],
+                  float Qx[3][block_height+2][block_width+2],
                  const float theta_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    //Reconstruct slopes along x axis
-    for (int j=ty; j<block_height; j+=get_local_size(1)) {
+    {
        const int j = ty;
        const int l = j + 2; //Skip ghost cells
-        for (int i=tx; i<block_width+2; i+=get_local_size(0)) {
+        for (int i=tx; i<block_width+2; i+=block_width) {
            const int k = i + 1;
            for (int p=0; p<3; ++p) {
                Qx[p][j][i] = minmodSlope(Q[p][l][k-1], Q[p][l][k], Q[p][l][k+1], theta_);
@ -443,16 +509,17 @@ void minmodSlopeX(__local float  Q[3][block_height+4][block_width+4],
 /**
  * Reconstructs a minmod slope for a whole block along y
  */
-void minmodSlopeY(__local float  Q[3][block_height+4][block_width+4],
+__device__ void minmodSlopeY(float  Q[3][block_height+4][block_width+4],
-                  __local float Qy[3][block_height+2][block_width+2],
+                  float Qy[3][block_height+2][block_width+2],
                  const float theta_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
-    for (int j=ty; j<block_height+2; j+=get_local_size(1)) {
+    for (int j=ty; j<block_height+2; j+=block_height) {
        const int l = j + 1;
-        for (int i=tx; i<block_width; i+=get_local_size(0)) {            
+        {
            const int i = tx;
            const int k = i + 2; //Skip ghost cells
            for (int p=0; p<3; ++p) {
                Qy[p][j][i] = minmodSlope(Q[p][l-1][k], Q[p][l][k], Q[p][l+1][k], theta_);
@ -466,91 +533,10 @@ void minmodSlopeY(__local float  Q[3][block_height+4][block_width+4],
 /**
  * X-component of the wind stress term, evaluated at the position derived
  * from the calling work-item's get_global_id(0) / get_global_id(1).
  *
  * wind_stress_type_ selects the forcing:
  *   0 = UNIFORM_ALONGSHORE, 1 = BELL_SHAPED_ALONGSHORE, 2 = MOVING_CYCLONE.
  * Any other value yields 0.
  */
 float windStressX(int wind_stress_type_,
                 float dx_, float dy_, float dt_,
                 float tau0_, float rho_, float alpha_, float xm_, float Rc_,
                 float x0_, float y0_,
                 float u0_, float v0_,
                 float t_) {
     //UNIFORM_ALONGSHORE
     if (wind_stress_type_ == 0) {
         const float y_pos = (get_global_id(1)+0.5f)*dy_;
         return tau0_/rho_ * exp(-alpha_*y_pos);
     }
     //BELL_SHAPED_ALONGSHORE
     if (wind_stress_type_ == 1) {
         //No forcing once t_ exceeds 48*3600 (48 hours if t_ is in seconds)
         if (!(t_ <= 48.0f*3600.0f)) {
             return 0.0f;
         }
         const float bell_arg = alpha_*((get_global_id(0)+0.5f)*dx_-xm_);
         const float bell_arg_sq = bell_arg*bell_arg;
         const float y_pos = (get_global_id(1)+0.5f)*dy_;
         return tau0_/rho_ * exp(-bell_arg_sq) * exp(-alpha_*y_pos);
     }
     //MOVING_CYCLONE
     if (wind_stress_type_ == 2) {
         //Note: no half-cell offset along x here, only along y
         const float x_pos = (get_global_id(0))*dx_;
         const float y_pos = (get_global_id(1)+0.5f)*dy_;
         //Offsets from the point (x0_ + u0_*(t_+dt_), y0_ + v0_*(t_+dt_))
         const float offset_x = (x_pos-x0_-u0_*(t_+dt_));
         const float offset_x_sq = offset_x*offset_x;
         const float offset_y = (y_pos-y0_-v0_*(t_+dt_));
         const float offset_y_sq = offset_y*offset_y;
         const float radius = sqrt(offset_x_sq+offset_y_sq);
         const float rel = 1.0f - radius/Rc_;
         const float rel_sq = rel*rel;
         return -(tau0_/rho_) * (offset_y/Rc_) * exp(-0.5f*rel_sq);
     }
     //Unknown forcing type: no wind stress
     return 0.0f;
 }
-
+__device__ float3 F_func(const float3 Q, const float g) {
 /**
  * Y-component of the wind stress term, evaluated at the position derived
  * from the calling work-item's get_global_id(0) / get_global_id(1).
  *
  * Only wind_stress_type_ == 2 (MOVING_CYCLONE) produces a non-zero value;
  * every other type yields 0.
  */
 float windStressY(int wind_stress_type_,
                 float dx_, float dy_, float dt_,
                 float tau0_, float rho_, float alpha_, float xm_, float Rc_,
                 float x0_, float y0_,
                 float u0_, float v0_,
                 float t_) {
     //Everything except MOVING_CYCLONE has no y-component
     if (wind_stress_type_ != 2) {
         return 0.0f;
     }
     //Note: no half-cell offset along y here, only along x
     const float x_pos = (get_global_id(0)+0.5f)*dx_; 
     const float y_pos = (get_global_id(1))*dy_;
     //Offsets from the point (x0_ + u0_*(t_+dt_), y0_ + v0_*(t_+dt_))
     const float offset_x = (x_pos-x0_-u0_*(t_+dt_));
     const float offset_x_sq = offset_x*offset_x;
     const float offset_y = (y_pos-y0_-v0_*(t_+dt_));
     const float offset_y_sq = offset_y*offset_y;
     const float radius = sqrt(offset_x_sq+offset_y_sq);
     const float rel = 1.0f - radius/Rc_;
     const float rel_sq = rel*rel;
     return (tau0_/rho_) * (offset_x/Rc_) * exp(-0.5f*rel_sq);
 }
 float3 F_func(const float3 Q, const float g) {
    float3 F;
    F.x = Q.y;                              //hu
@ -567,7 +553,7 @@ float3 F_func(const float3 Q, const float g) {
 /**
  * Central upwind flux function
  */
-float3 CentralUpwindFlux(const float3 Qm, float3 Qp, const float g) {
+__device__ float3 CentralUpwindFlux(const float3 Qm, float3 Qp, const float g) {
    const float3 Fp = F_func(Qp, g);
    const float up = Qp.y / Qp.x;   // hu / h
    const float cp = sqrt(g*Qp.x); // sqrt(g*h)
@ -594,7 +580,7 @@ float3 CentralUpwindFlux(const float3 Qm, float3 Qp, const float g) {
 /**
  * Harten-Lax-van Leer (HLL) flux (Toro 2001, p 180)
  */
-float3 HLL_flux(const float3 Q_l, const float3 Q_r, const float g_) {    
+__device__ float3 HLL_flux(const float3 Q_l, const float3 Q_r, const float g_) {    
    const float h_l = Q_l.x;
    const float h_r = Q_r.x;
@ -646,7 +632,7 @@ float3 HLL_flux(const float3 Q_l, const float3 Q_r, const float g_) {
 /**
  * Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 181)
  */
-float3 HLLC_flux(const float3 Q_l, const float3 Q_r, const float g_) {    
+__device__ float3 HLLC_flux(const float3 Q_l, const float3 Q_r, const float g_) {    
    const float h_l = Q_l.x;
    const float h_r = Q_r.x;
@ -685,19 +671,19 @@ float3 HLLC_flux(const float3 Q_l, const float3 Q_r, const float g_) {
    //Or estimate flux in the "left star" region
    else if (S_l <= 0.0f && 0.0f <=S_star) {
        const float v_l = Q_l.z / h_l;
-        const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * (float3)(1, S_star, v_l);
+        const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1, S_star, v_l);
        const float3 flux = F_l + S_l*(Q_star_l - Q_l);
        return flux;
    }
     //Or estimate flux in the "right star" region
    else if (S_star <= 0.0f && 0.0f <=S_r) {
        const float v_r = Q_r.z / h_r;
-        const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * (float3)(1, S_star, v_r);
+        const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1, S_star, v_r);
        const float3 flux = F_r + S_r*(Q_star_r - Q_r);
        return flux;
    }
    else {
-        return -99999.9f; //Something wrong here
+        return make_float3(-99999.9f, -99999.9f, -99999.9f); //Something wrong here
    }
 }
@ -709,7 +695,7 @@ float3 HLLC_flux(const float3 Q_l, const float3 Q_r, const float g_) {
  * @param r_ the ratio of upwind change (see Toro 2001, p. 203/204)
  * @param c_ the courant number for wave k, dt*S_k/dx
  */
-float WAF_superbee(float r_, float c_) {
+__device__ float WAF_superbee(float r_, float c_) {
    // r <= 0.0
    if (r_ <= 0.0f) { 
        return 1.0f;
@ -735,7 +721,7 @@ float WAF_superbee(float r_, float c_) {
-float WAF_albada(float r_, float c_) {
+__device__ float WAF_albada(float r_, float c_) {
    if (r_ <= 0.0f) {
        return 1.0f;
    }
@ -744,32 +730,29 @@ float WAF_albada(float r_, float c_) {
    }
 }
-
+__device__ float WAF_minmod(float r_, float c_) {
-float WAF_minbee(float r_, float c_) {
+    return 1.0f - (1.0f - fabs(c_)) * fmax(0.0f, fmin(1.0f, r_));
    if (r_ <= 0.0f) {
        return 1.0f;
 }
-    else if (r_ >= 0.0f && r_ <= 1.0f) {
+
 /**
  * Minmod limiter: r_ clamped to the interval [0, 1]
  * @param r_ value to limit
  */
 __device__ float minmod(float r_) {
     //Cap from above first, then from below
     const float capped_above = fmin(1.0f, r_);
     return fmax(0.0f, capped_above);
 }
 /**
  * Superbee limiter: max(0, min(2r, 1), min(r, 2))
  * @param r_ value to limit
  */
 __device__ float superbee(float r_) {
     const float steep_branch = fmin(2.0f*r_, 1.0f);
     const float shallow_branch = fmin(r_, 2.0f);
     //Largest of the two branches, but never below zero
     return fmax(0.0f, fmax(steep_branch, shallow_branch));
 }
 /**
  * van Albada limiter: (r^2 + r) / (r^2 + 1)
  * Note: no clamping is applied for negative r_.
  * @param r_ value to limit
  */
 __device__ float vanAlbada1(float r_) {
     const float r_squared = r_*r_;
     return (r_squared + r_) / (r_squared + 1.0f);
 }
 /**
  * van Leer limiter: (r + |r|) / (1 + |r|)
  * @param r_ value to limit
  */
 __device__ float vanLeer(float r_) {
     const float magnitude = fabs(r_);
     //The numerator cancels to zero for negative r_
     return (r_ + magnitude) / (1.0f + magnitude);
 }
 /**
  * Maps a value r_ to 1 - (1 - |c_|)*r_
  * @param r_ value that is scaled by (1 - |c_|) and subtracted from one
  * @param c_ only its magnitude is used
  */
 __device__ float limiterToWAFLimiter(float r_, float c_) {
     const float scaled = (1.0f - fabs(c_))*r_;
     return 1.0f - scaled;
 }
    else {
        return fabs(c_);
    }
 }
 /**
  * WAF limiter based on minmod:
  *   1                 for r <= 0
  *   1 - (1 - |c|)*r   for 0 < r <= 1
  *   |c|               for r > 1
  * The r <= 0 and r > 1 branches were previously swapped and the middle
  * branch used (1 - r)*(1 - c); this now matches the r <= 0 behaviour of
  * WAF_superbee / WAF_albada and the closed form
  * 1 - (1 - |c|)*max(0, min(1, r)).
  * @param r_ the ratio of upwind change
  * @param c_ the courant number for wave k, dt*S_k/dx
  */
 float WAF_minmod(float r_, float c_) {
     if (r_ <= 0.0f) {
         return 1.0f;
     }
     else if (r_ <= 1.0f) {
         //Blend between 1 (at r = 0) and |c| (at r = 1)
         return 1.0f - (1.0f - fabs(c_)) * r_;
     }
     else {
         return fabs(c_);
     }
 }
 /**
@ -780,7 +763,7 @@ float WAF_minmod(float r_, float c_) {
  * @param Q_r1 Q_{i+1}
  * @param Q_r2 Q_{i+2}
  */
-float3 WAF_1D_flux(const float3 Q_l2, const float3 Q_l1, const float3 Q_r1, const float3 Q_r2, const float g_, const float dx_, const float dt_) {     
+__device__ float3 WAF_1D_flux(const float3 Q_l2, const float3 Q_l1, const float3 Q_r1, const float3 Q_r2, const float g_, const float dx_, const float dt_) {     
    const float h_l = Q_l1.x;
    const float h_r = Q_r1.x;
@ -811,12 +794,12 @@ float3 WAF_1D_flux(const float3 Q_l2, const float3 Q_l1, const float3 Q_r1, cons
    const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
    // Compute wave speed estimates
-    const float S_l = u_l - c_l;//*q_l;
+    const float S_l = u_l - c_l*q_l; //FIXME: Right wave speed estimate?
-    const float S_r = u_r + c_r;//*q_r;
+    const float S_r = u_r + c_r*q_r;
    const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) );
-    const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * (float3)(1, S_star, v_l);
+    const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1, S_star, v_l);
-    const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * (float3)(1, S_star, v_r);
+    const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1, S_star, v_r);
    // Estimate the fluxes in the four regions
    const float3 F_1 = F_func(Q_l1, g_);
@ -833,27 +816,40 @@ float3 WAF_1D_flux(const float3 Q_l2, const float3 Q_l1, const float3 Q_r1, cons
    const float c_3 = S_r * dt_ / dx_;
    // Compute the "upwind change" vectors for the i-3/2 and i+3/2 interfaces
-    const float rh_m = fmin(fmax( (h_l - h_l2) / (h_r - h_l), -1.0f ), 1.0f);
+    const float rh_m = (h_l - h_l2) / (h_r - h_l);
-    const float rh_p = fmin(fmax( (h_r2 - h_r) / (h_r - h_l), -1.0f ), 1.0f);
+    const float rh_p = (h_r2 - h_r) / (h_r - h_l);
-    const float rv_m = fmin(fmax( (v_l - v_l2) / (v_r - v_l), -1.0f ), 1.0f);
+    const float rv_m = (v_l - v_l2) / (v_r - v_l);
-    const float rv_p = fmin(fmax( (v_r2 - v_r) / (v_r - v_l), -1.0f ), 1.0f);
+    const float rv_p = (v_r2 - v_r) / (v_r - v_l);
    // Compute the r parameters for the flux limiter
    const float rh_1 = (c_1 > 0.0f) ? rh_m : rh_p; 
-    const float rv_1 = (c_1 > 0.0f) ? rv_m : rv_p; 
+    //const float rv_1 = (c_1 > 0.0f) ? rv_m : rv_p; 
-    const float rh_2 = (c_2 > 0.0f) ? rh_m : rh_p; 
+    //const float rh_2 = (c_2 > 0.0f) ? rh_m : rh_p; 
    const float rv_2 = (c_2 > 0.0f) ? rv_m : rv_p; 
    const float rh_3 = (c_3 > 0.0f) ? rh_m : rh_p;
-    const float rv_3 = (c_3 > 0.0f) ? rv_m : rv_p;
+    //const float rv_3 = (c_3 > 0.0f) ? rv_m : rv_p;
    // Compute the limiter
    // We use h for the nonlinear waves, and v for the middle shear wave 
-    const float A_1 = c_1;//sign(c_1)*WAF_minbee(rh_1, c_1);
+    ///**
-    const float A_2 = c_2;//sign(c_2)*WAF_minbee(rv_2, c_2); //Middle shear wave 
+    const float A_1 = copysign(1.0f, c_1) * WAF_minmod(rh_1, c_1);
-    const float A_3 = c_3;//sign(c_3)*WAF_minbee(rh_3, c_3); 
+    const float A_2 = copysign(1.0f, c_2) * WAF_minmod(rv_2, c_2); //Middle shear wave 
    const float A_3 = copysign(1.0f, c_3) * WAF_minmod(rh_3, c_3); 
    //*/
    /**
    //2nd order for smooth cases (unstable for shocks)
    const float A_1 = c_1;
    const float A_2 = c_2;
    const float A_3 = c_3;
    */
    /*
    const float A_1 = sign(c_1) * limiterToWAFLimiter(minmod(rh_1), c_1);
    const float A_2 = sign(c_2) * limiterToWAFLimiter(minmod(rv_2), c_2);
    const float A_3 = sign(c_3) * limiterToWAFLimiter(minmod(rh_3), c_3);
    */
    //Average the fluxes
    const float3 flux = 0.5f*( F_1 + F_4 )
@ -899,11 +895,11 @@ float3 WAF_1D_flux(const float3 Q_l2, const float3 Q_l1, const float3 Q_r1, cons
 /**
  * Lax-Friedrichs flux (Toro 2001, p 163)
  *
  * Returns the average of F_func(Q_l) and F_func(Q_r) plus the state
  * difference (Q_l - Q_r) scaled by dx_/(2*dt_).
  *
  * @param Q_l first state (presumably left of the interface)
  * @param Q_r second state (presumably right of the interface)
  * @param g_ forwarded unchanged to F_func
  * @param dx_ numerator of the dx_/(2*dt_) factor (presumably the grid spacing)
  * @param dt_ denominator of the dx_/(2*dt_) factor (presumably the time step)
  */
-float3 LxF_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+__device__ float3 LxF_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
     //Flux function evaluated for each of the two states
     const float3 F_l = F_func(Q_l, g_);
     const float3 F_r = F_func(Q_r, g_);
-    return 0.5f*(F_l + F_r) + (Q_l - Q_r) * dx_ / (2.0f*dt_);
+    return 0.5f*(F_l + F_r) + (dx_/(2.0f*dt_))*(Q_l - Q_r);
 }
@ -911,12 +907,12 @@ float3 LxF_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const flo
 /**
  * Lax-Friedrichs extended to 2D
  *
  * Same form as LxF_1D_flux, but the state difference (Q_l - Q_r) is
  * scaled by dx_/(4*dt_) instead of dx_/(2*dt_).
  *
  * @param Q_l first state (presumably left of the interface)
  * @param Q_r second state (presumably right of the interface)
  * @param g_ forwarded unchanged to F_func
  * @param dx_ numerator of the dx_/(4*dt_) factor (presumably the grid spacing)
  * @param dt_ denominator of the dx_/(4*dt_) factor (presumably the time step)
  */
-float3 LxF_2D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+__device__ float3 LxF_2D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
     //Flux function evaluated for each of the two states
     const float3 F_l = F_func(Q_l, g_);
     const float3 F_r = F_func(Q_r, g_);
     //Note numerical diffusion for 2D here (0.25)
-    return 0.5f*(F_l + F_r) + (Q_l - Q_r) * dx_ / (4.0f*dt_);
+    return 0.5f*(F_l + F_r) + (dx_/(4.0f*dt_))*(Q_l - Q_r);
 }
@ -925,11 +921,11 @@ float3 LxF_2D_flux(const float3 Q_l, const float3 Q_r, const float g_, const flo
 /**
  * Richtmyer / Two-step Lax-Wendroff flux (Toro 2001, p 164)
  *
  * First forms the intermediate state
  *   Q_lw2 = 0.5*(Q_l + Q_r) + dt_/(2*dx_) * (F_func(Q_l) - F_func(Q_r)),
  * then returns F_func evaluated at that state.
  *
  * @param Q_l first state (presumably left of the interface)
  * @param Q_r second state (presumably right of the interface)
  * @param g_ forwarded unchanged to F_func
  * @param dx_ denominator of the dt_/(2*dx_) factor (presumably the grid spacing)
  * @param dt_ numerator of the dt_/(2*dx_) factor (presumably the time step)
  */
-float3 LxW2_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+__device__ float3 LxW2_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
     //Flux function evaluated for each of the two states
     const float3 F_l = F_func(Q_l, g_);
     const float3 F_r = F_func(Q_r, g_);
     //Intermediate state built from the two input states and their fluxes
-    const float3 Q_lw2 = 0.5f*(Q_l + Q_r) + (F_l - F_r)*dt_/(2.0f*dx_);
+    const float3 Q_lw2 = 0.5f*(Q_l + Q_r) + (dt_/(2.0f*dx_))*(F_l - F_r);
     return F_func(Q_lw2, g_);
 }
@ -942,11 +938,11 @@ float3 LxW2_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const fl
 /**
  * Godunov's centred scheme (Toro 2001, p 165)
  *
  * First forms the intermediate state
  *   Q_godc = 0.5*(Q_l + Q_r) + dt_/dx_ * (F_func(Q_l) - F_func(Q_r)),
  * then returns F_func evaluated at that state.
  *
  * @param Q_l first state (presumably left of the interface)
  * @param Q_r second state (presumably right of the interface)
  * @param g_ forwarded unchanged to F_func
  * @param dx_ denominator of the dt_/dx_ factor (presumably the grid spacing)
  * @param dt_ numerator of the dt_/dx_ factor (presumably the time step)
  */
-float3 GodC_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+__device__ float3 GodC_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
     //Flux function evaluated for each of the two states
     const float3 F_l = F_func(Q_l, g_);
     const float3 F_r = F_func(Q_r, g_);
     //Intermediate state built from the two input states and their fluxes
-    const float3 Q_godc = 0.5f*(Q_l + Q_r) + (F_l - F_r)*dt_/dx_;
+    const float3 Q_godc = 0.5f*(Q_l + Q_r) + (dt_/dx_)*(F_l - F_r);
     return F_func(Q_godc, g_);
 }
@ -957,7 +953,7 @@ float3 GodC_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const fl
 /**
  * First-Order Centred (FORCE) flux (Toro 2001, p.163)
  */
-float3 FORCE_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+__device__ float3 FORCE_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
    const float3 F_lf = LxF_1D_flux(Q_l, Q_r, g_, dx_, dt_);
    const float3 F_lw2 = LxW2_1D_flux(Q_l, Q_r, g_, dx_, dt_);
    return 0.5f*(F_lf + F_lw2);
--- a/WAFExp.ipynb
+++ b/WAFExp.ipynb
--- a/shock1d_ref_nx=1024.csv
+++ b/shock1d_ref_nx=1024.csv
--- a/shock1d_ref_nx=128.csv
+++ b/shock1d_ref_nx=128.csv
@ -0,0 +1,146 @@
 ##############################################################################
 # Generated by SWASHES version 1.03.00, 2016-01-29
 ##############################################################################
 # Dimension: 1
 # Type: 3 (=Dam break)
 # Domain: 1
 # Choice: 1 (=on a wet domain without friction (Stoker's solution))
 ##############################################################################
 # PARAMETERS OF THE SOLUTION
 # 
 # Length of the domain: 10 meters
 # Space step: 0.078125 meters
 # Number of cells: 128
 # Position of the dam: x=5 meters
 # Time value: 6 seconds
 ##############################################################################
 # 
 #(i-0.5)*dx 	    h[i] 	    u[i] 	 topo[i] 	    q[i] 	 topo[i]+h[i] 	Fr[i]=Froude	 topo[i]+hc[i] 	
 0.0390625	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.117188	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.195312	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.273438	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.351562	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.429688	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.507812	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.585938	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.664062	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.742188	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.820312	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.898438	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.976562	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.05469	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.13281	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.21094	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.28906	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.36719	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.44531	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.52344	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.60156	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.67969	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.75781	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.83594	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.91406	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.99219	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.07031	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.14844	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.22656	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.30469	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.38281	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.46094	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.53906	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.61719	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.69531	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.77344	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.85156	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.92969	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.00781	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.08594	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.16406	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.24219	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.32031	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.39844	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.47656	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.55469	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.63281	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.71094	0.00490073	0.00441906	        0	2.16566e-005	0.00490073	0.0201542	0.000362943	
 3.78906	0.00470863	0.0130996	        0	6.16813e-005	0.00470863	0.0609504	0.000729255	
 3.86719	0.00452038	0.0217802	        0	9.84546e-005	0.00452038	 0.103428	0.000996019	
 3.94531	0.00433596	0.0304607	        0	0.000132076	0.00433596	 0.147694	0.00121151	
 4.02344	0.00415538	0.0391413	        0	0.000162647	0.00415538	 0.193863	0.0013919	
 4.10156	0.00397865	0.0478218	        0	0.000190266	0.00397865	 0.242061	0.00154532	
 4.17969	0.00380575	0.0565024	        0	0.000215034	0.00380575	 0.292423	0.00167667	
 4.25781	0.0036367	 0.065183	        0	0.000237051	0.0036367	 0.345101	0.00178925	
 4.33594	0.00347148	0.0738635	        0	0.000256416	0.00347148	 0.400256	0.00188541	
 4.41406	0.00331011	0.0825441	        0	0.00027323	0.00331011	 0.458068	0.00196696	
 4.49219	0.00315257	0.0912246	        0	0.000287592	0.00315257	 0.518734	0.0020353	
 4.57031	0.00299888	0.0999052	        0	0.000299604	0.00299888	  0.58247	0.00209158	
 4.64844	0.00284903	 0.108586	        0	0.000309364	0.00284903	 0.649516	0.00213677	
 4.72656	0.00270302	 0.117266	        0	0.000316973	0.00270302	 0.720135	0.00217166	
 4.80469	0.00256085	 0.125947	        0	0.000322531	0.00256085	 0.794623	0.00219697	
 4.88281	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 4.96094	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.03906	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.11719	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.19531	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.27344	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.35156	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.42969	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.50781	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.58594	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.66406	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.74219	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.82031	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.89844	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.97656	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.05469	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.13281	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.21094	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.28906	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.36719	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.44531	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.52344	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.60156	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.67969	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.75781	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.83594	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.91406	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.99219	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.07031	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.14844	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.22656	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.30469	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.38281	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.46094	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.53906	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.61719	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.69531	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.77344	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.85156	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.92969	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.00781	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.08594	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.16406	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.24219	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.32031	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.39844	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.47656	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.55469	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.63281	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.71094	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.78906	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.86719	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.94531	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.02344	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.10156	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.17969	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.25781	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.33594	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.41406	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.49219	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.57031	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.64844	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.72656	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.80469	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.88281	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.96094	    0.001	        0	        0	        0	    0.001	        0	        0	
--- a/shock1d_ref_nx=2048.csv
+++ b/shock1d_ref_nx=2048.csv
--- a/shock1d_ref_nx=256.csv
+++ b/shock1d_ref_nx=256.csv
@ -0,0 +1,274 @@
 ##############################################################################
 # Generated by SWASHES version 1.03.00, 2016-01-29
 ##############################################################################
 # Dimension: 1
 # Type: 3 (=Dam break)
 # Domain: 1
 # Choice: 1 (=on a wet domain without friction (Stoker's solution))
 ##############################################################################
 # PARAMETERS OF THE SOLUTION
 # 
 # Length of the domain: 10 meters
 # Space step: 0.0390625 meters
 # Number of cells: 256
 # Position of the dam: x=5 meters
 # Time value: 6 seconds
 ##############################################################################
 # 
 #(i-0.5)*dx 	    h[i] 	    u[i] 	 topo[i] 	    q[i] 	 topo[i]+h[i] 	Fr[i]=Froude	 topo[i]+hc[i] 	
 0.0195312	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.0585938	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.0976562	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.136719	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.175781	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.214844	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.253906	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.292969	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.332031	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.371094	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.410156	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.449219	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.488281	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.527344	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.566406	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.605469	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.644531	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.683594	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.722656	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.761719	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.800781	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.839844	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.878906	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.917969	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.957031	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.996094	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.03516	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.07422	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.11328	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.15234	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.19141	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.23047	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.26953	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.30859	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.34766	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.38672	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.42578	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.46484	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.50391	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.54297	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.58203	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.62109	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.66016	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.69922	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.73828	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.77734	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.81641	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.85547	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.89453	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.93359	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.97266	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.01172	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.05078	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.08984	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.12891	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.16797	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.20703	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.24609	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.28516	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.32422	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.36328	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.40234	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.44141	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.48047	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.51953	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.55859	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.59766	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.63672	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.67578	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.71484	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.75391	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.79297	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.83203	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.87109	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.91016	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.94922	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.98828	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.02734	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.06641	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.10547	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.14453	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.18359	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.22266	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.26172	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.30078	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.33984	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.37891	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.41797	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.45703	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.49609	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.53516	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.57422	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.61328	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.65234	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.69141	0.00494936	0.00224893	        0	1.11307e-005	0.00494936	0.0102062	0.000232877	
 3.73047	0.00485235	0.0065892	        0	3.19731e-005	0.00485235	0.0302011	0.00047058	
 3.76953	0.0047563	0.0109295	        0	5.19839e-005	0.0047563	0.0505977	0.000650663	
 3.80859	0.00466121	0.0152698	        0	7.11755e-005	0.00466121	0.0714082	0.000802289	
 3.84766	0.00456708	  0.01961	        0	8.95606e-005	0.00456708	0.0926456	0.000935093	
 3.88672	0.00447391	0.0239503	        0	0.000107152	0.00447391	 0.114323	0.00105384	
 3.92578	0.0043817	0.0282906	        0	0.000123961	0.0043817	 0.136454	0.00116136	
 3.96484	0.00429045	0.0326309	        0	0.000140001	0.00429045	 0.159053	0.0012595	
 4.00391	0.00420017	0.0369711	        0	0.000155285	0.00420017	 0.182136	0.00134957	
 4.04297	0.00411084	0.0413114	        0	0.000169825	0.00411084	 0.205717	0.00143255	
 4.08203	0.00402247	0.0456517	        0	0.000183633	0.00402247	 0.229814	0.00150919	
 4.12109	0.00393506	 0.049992	        0	0.000196722	0.00393506	 0.254443	0.00158008	
 4.16016	0.00384861	0.0543323	        0	0.000209104	0.00384861	 0.279622	0.0016457	
 4.19922	0.00376313	0.0586725	        0	0.000220792	0.00376313	  0.30537	0.00170647	
 4.23828	0.0036786	0.0630128	        0	0.000231799	0.0036786	 0.331706	0.00176272	
 4.27734	0.00359503	0.0673531	        0	0.000242137	0.00359503	 0.358651	0.00181475	
 4.31641	0.00351242	0.0716934	        0	0.000251818	0.00351242	 0.386226	0.00186281	
 4.35547	0.00343078	0.0760336	        0	0.000260855	0.00343078	 0.414453	0.00190711	
 4.39453	0.00335009	0.0803739	        0	0.00026926	0.00335009	 0.443356	0.00194786	
 4.43359	0.00327036	0.0847142	        0	0.000277046	0.00327036	 0.472959	0.00198523	
 4.47266	0.0031916	0.0890545	        0	0.000284226	0.0031916	 0.503289	0.00201939	
 4.51172	0.00311379	0.0933948	        0	0.000290812	0.00311379	 0.534371	0.00205046	
 4.55078	0.00303694	 0.097735	        0	0.000296816	0.00303694	 0.566236	0.00207859	
 4.58984	0.00296106	 0.102075	        0	0.000302251	0.00296106	 0.598912	0.00210389	
 4.62891	0.00288613	 0.106416	        0	0.000307129	0.00288613	  0.63243	0.00212646	
 4.66797	0.00281217	 0.110756	        0	0.000311464	0.00281217	 0.666825	0.00214642	
 4.70703	0.00273916	 0.115096	        0	0.000315267	0.00273916	  0.70213	0.00216386	
 4.74609	0.00266712	 0.119436	        0	0.000318551	0.00266712	 0.738383	0.00217886	
 4.78516	0.00259603	 0.123777	        0	0.000321328	0.00259603	 0.775621	0.00219151	
 4.82422	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 4.86328	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 4.90234	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 4.94141	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 4.98047	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.01953	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.05859	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.09766	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.13672	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.17578	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.21484	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.25391	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.29297	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.33203	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.37109	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.41016	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.44922	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.48828	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.52734	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.56641	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.60547	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.64453	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.68359	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.72266	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.76172	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.80078	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.83984	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.87891	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.91797	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.95703	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.99609	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.03516	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.07422	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.11328	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.15234	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.19141	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.23047	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.26953	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.30859	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.34766	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.38672	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.42578	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.46484	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.50391	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.54297	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.58203	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.62109	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.66016	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.69922	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.73828	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.77734	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.81641	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.85547	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.89453	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.93359	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.97266	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.01172	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.05078	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.08984	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.12891	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.16797	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.20703	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.24609	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.28516	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.32422	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.36328	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.40234	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.44141	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.48047	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.51953	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.55859	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.59766	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.63672	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.67578	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.71484	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.75391	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.79297	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.83203	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.87109	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.91016	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.94922	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.98828	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.02734	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.06641	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.10547	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.14453	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.18359	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.22266	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.26172	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.30078	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.33984	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.37891	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.41797	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.45703	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.49609	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.53516	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.57422	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.61328	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.65234	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.69141	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.73047	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.76953	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.80859	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.84766	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.88672	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.92578	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.96484	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.00391	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.04297	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.08203	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.12109	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.16016	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.19922	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.23828	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.27734	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.31641	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.35547	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.39453	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.43359	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.47266	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.51172	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.55078	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.58984	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.62891	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.66797	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.70703	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.74609	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.78516	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.82422	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.86328	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.90234	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.94141	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.98047	    0.001	        0	        0	        0	    0.001	        0	        0	
--- a/shock1d_ref_nx=4096.csv
+++ b/shock1d_ref_nx=4096.csv
--- a/shock1d_ref_nx=512.csv
+++ b/shock1d_ref_nx=512.csv
@ -0,0 +1,530 @@
 ##############################################################################
 # Generated by SWASHES version 1.03.00, 2016-01-29
 ##############################################################################
 # Dimension: 1
 # Type: 3 (=Dam break)
 # Domain: 1
 # Choice: 1 (=on a wet domain without friction (Stoker's solution))
 ##############################################################################
 # PARAMETERS OF THE SOLUTION
 # 
 # Length of the domain: 10 meters
 # Space step: 0.0195312 meters
 # Number of cells: 512
 # Position of the dam: x=5 meters
 # Time value: 6 seconds
 ##############################################################################
 # 
 #(i-0.5)*dx 	    h[i] 	    u[i] 	 topo[i] 	    q[i] 	 topo[i]+h[i] 	Fr[i]=Froude	 topo[i]+hc[i] 	
 0.00976562	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.0292969	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.0488281	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.0683594	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.0878906	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.107422	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.126953	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.146484	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.166016	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.185547	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.205078	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.224609	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.244141	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.263672	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.283203	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.302734	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.322266	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.341797	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.361328	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.380859	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.400391	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.419922	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.439453	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.458984	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.478516	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.498047	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.517578	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.537109	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.556641	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.576172	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.595703	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.615234	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.634766	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.654297	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.673828	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.693359	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.712891	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.732422	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.751953	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.771484	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.791016	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.810547	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.830078	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.849609	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.869141	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.888672	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.908203	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.927734	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.947266	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.966797	    0.005	        0	        0	        0	    0.005	        0	        0	
 0.986328	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.00586	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.02539	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.04492	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.06445	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.08398	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.10352	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.12305	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.14258	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.16211	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.18164	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.20117	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.2207	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.24023	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.25977	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.2793	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.29883	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.31836	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.33789	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.35742	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.37695	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.39648	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.41602	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.43555	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.45508	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.47461	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.49414	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.51367	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.5332	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.55273	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.57227	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.5918	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.61133	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.63086	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.65039	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.66992	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.68945	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.70898	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.72852	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.74805	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.76758	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.78711	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.80664	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.82617	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.8457	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.86523	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.88477	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.9043	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.92383	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.94336	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.96289	    0.005	        0	        0	        0	    0.005	        0	        0	
 1.98242	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.00195	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.02148	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.04102	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.06055	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.08008	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.09961	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.11914	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.13867	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.1582	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.17773	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.19727	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.2168	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.23633	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.25586	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.27539	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.29492	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.31445	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.33398	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.35352	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.37305	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.39258	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.41211	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.43164	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.45117	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.4707	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.49023	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.50977	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.5293	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.54883	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.56836	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.58789	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.60742	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.62695	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.64648	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.66602	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.68555	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.70508	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.72461	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.74414	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.76367	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.7832	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.80273	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.82227	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.8418	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.86133	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.88086	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.90039	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.91992	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.93945	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.95898	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.97852	    0.005	        0	        0	        0	    0.005	        0	        0	
 2.99805	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.01758	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.03711	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.05664	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.07617	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.0957	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.11523	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.13477	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.1543	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.17383	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.19336	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.21289	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.23242	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.25195	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.27148	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.29102	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.31055	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.33008	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.34961	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.36914	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.38867	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.4082	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.42773	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.44727	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.4668	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.48633	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.50586	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.52539	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.54492	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.56445	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.58398	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.60352	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.62305	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.64258	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.66211	    0.005	        0	        0	        0	    0.005	        0	        0	
 3.68164	0.00497376	0.00116386	        0	5.78874e-006	0.00497376	0.00526893	0.000150603	
 3.70117	0.00492501	0.00333399	        0	1.642e-005	0.00492501	0.0151679	0.000301781	
 3.7207	0.00487651	0.00550413	        0	2.6841e-005	0.00487651	0.0251652	0.00041877	
 3.74023	0.00482825	0.00767427	        0	3.70533e-005	0.00482825	0.0352621	0.000519192	
 3.75977	0.00478022	0.00984441	        0	4.70585e-005	0.00478022	0.0454602	0.000608885	
 3.7793	0.00473244	0.0120146	        0	5.68581e-005	0.00473244	 0.055761	0.000690725	
 3.79883	0.00468489	0.0141847	        0	6.64537e-005	0.00468489	0.0661661	0.000766402	
 3.81836	0.00463759	0.0163548	        0	7.58469e-005	0.00463759	0.0766771	0.00083702	
 3.83789	0.00459052	 0.018525	        0	8.50393e-005	0.00459052	0.0872955	0.000903351	
 3.85742	0.0045437	0.0206951	        0	9.40323e-005	0.0045437	0.0980231	0.000965966	
 3.87695	0.00449711	0.0228652	        0	0.000102828	0.00449711	 0.108862	0.0010253	
 3.89648	0.00445077	0.0250354	        0	0.000111427	0.00445077	 0.119813	0.00108169	
 3.91602	0.00440467	0.0272055	        0	0.000119831	0.00440467	 0.130878	0.00113542	
 3.93555	0.0043588	0.0293757	        0	0.000128043	0.0043588	 0.142059	0.00118672	
 3.95508	0.00431318	0.0315458	        0	0.000136063	0.00431318	 0.153359	0.00123577	
 3.97461	0.00426779	0.0337159	        0	0.000143893	0.00426779	 0.164778	0.00128273	
 3.99414	0.00422265	0.0358861	        0	0.000151534	0.00422265	 0.176319	0.00132775	
 4.01367	0.00417774	0.0380562	        0	0.000158989	0.00417774	 0.187984	0.00137095	
 4.0332	0.00413308	0.0402264	        0	0.000166259	0.00413308	 0.199774	0.00141243	
 4.05273	0.00408866	0.0423965	        0	0.000173345	0.00408866	 0.211692	0.00145228	
 4.07227	0.00404447	0.0445666	        0	0.000180248	0.00404447	  0.22374	0.00149059	
 4.0918	0.00400053	0.0467368	        0	0.000186972	0.00400053	  0.23592	0.00152743	
 4.11133	0.00395682	0.0489069	        0	0.000193516	0.00395682	 0.248235	0.00156287	
 4.13086	0.00391336	0.0510771	        0	0.000199883	0.00391336	 0.260685	0.00159696	
 4.15039	0.00387014	0.0532472	        0	0.000206074	0.00387014	 0.273274	0.00162977	
 4.16992	0.00382715	0.0554173	        0	0.000212091	0.00382715	 0.286005	0.00166134	
 4.18945	0.00378441	0.0575875	        0	0.000217935	0.00378441	 0.298878	0.00169172	
 4.20898	0.0037419	0.0597576	        0	0.000223607	0.0037419	 0.311898	0.00172095	
 4.22852	0.00369964	0.0619277	        0	0.00022911	0.00369964	 0.325066	0.00174907	
 4.24805	0.00365762	0.0640979	        0	0.000234446	0.00365762	 0.338384	0.00177612	
 4.26758	0.00361583	 0.066268	        0	0.000239614	0.00361583	 0.351856	0.00180213	
 4.28711	0.00357429	0.0684382	        0	0.000244618	0.00357429	 0.365484	0.00182713	
 4.30664	0.00353299	0.0706083	        0	0.000249458	0.00353299	 0.379272	0.00185115	
 4.32617	0.00349192	0.0727784	        0	0.000254137	0.00349192	  0.39322	0.00187423	
 4.3457	0.0034511	0.0749486	        0	0.000258655	0.0034511	 0.407334	0.00189638	
 4.36523	0.00341052	0.0771187	        0	0.000263015	0.00341052	 0.421614	0.00191762	
 4.38477	0.00337017	0.0792889	        0	0.000267217	0.00337017	 0.436065	 0.001938	
 4.4043	0.00333007	 0.081459	        0	0.000271264	0.00333007	  0.45069	0.00195752	
 4.42383	0.00329021	0.0836291	        0	0.000275157	0.00329021	 0.465491	0.0019762	
 4.44336	0.00325058	0.0857993	        0	0.000278898	0.00325058	 0.480472	0.00199407	
 4.46289	0.0032112	0.0879694	        0	0.000282487	0.0032112	 0.495637	0.00201114	
 4.48242	0.00317206	0.0901396	        0	0.000285928	0.00317206	 0.510988	0.00202744	
 4.50195	0.00313315	0.0923097	        0	0.00028922	0.00313315	 0.526529	0.00204297	
 4.52148	0.00309449	0.0944798	        0	0.000292367	0.00309449	 0.542263	0.00205776	
 4.54102	0.00305607	  0.09665	        0	0.000295369	0.00305607	 0.558195	0.00207183	
 4.56055	0.00301788	0.0988201	        0	0.000298228	0.00301788	 0.574327	0.00208517	
 4.58008	0.00297994	  0.10099	        0	0.000300945	0.00297994	 0.590665	0.00209782	
 4.59961	0.00294224	  0.10316	        0	0.000303522	0.00294224	 0.607211	0.00210978	
 4.61914	0.00290477	 0.105331	        0	0.000305961	0.00290477	  0.62397	0.00212107	
 4.63867	0.00286755	 0.107501	        0	0.000308264	0.00286755	 0.640945	0.0021317	
 4.6582	0.00283057	 0.109671	        0	0.000310431	0.00283057	 0.658142	0.00214167	
 4.67773	0.00279383	 0.111841	        0	0.000312464	0.00279383	 0.675564	0.00215102	
 4.69727	0.00275732	 0.114011	        0	0.000314365	0.00275732	 0.693216	0.00215973	
 4.7168	0.00272106	 0.116181	        0	0.000316136	0.00272106	 0.711103	0.00216784	
 4.73633	0.00268504	 0.118351	        0	0.000317778	0.00268504	 0.729228	0.00217533	
 4.75586	0.00264925	 0.120521	        0	0.000319292	0.00264925	 0.747598	0.00218224	
 4.77539	0.00261371	 0.122692	        0	0.00032068	0.00261371	 0.766217	0.00218856	
 4.79492	0.00257841	 0.124862	        0	0.000321945	0.00257841	 0.785089	0.00219431	
 4.81445	0.00254335	 0.127032	        0	0.000323086	0.00254335	 0.804221	0.00219949	
 4.83398	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 4.85352	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 4.87305	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 4.89258	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 4.91211	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 4.93164	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 4.95117	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 4.9707	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 4.99023	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.00977	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.0293	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.04883	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.06836	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.08789	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.10742	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.12695	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.14648	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.16602	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.18555	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.20508	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.22461	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.24414	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.26367	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.2832	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.30273	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.32227	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.3418	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.36133	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.38086	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.40039	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.41992	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.43945	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.45898	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.47852	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.49805	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.51758	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.53711	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.55664	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.57617	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.5957	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.61523	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.63477	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.6543	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.67383	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.69336	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.71289	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.73242	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.75195	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.77148	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.79102	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.81055	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.83008	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.84961	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.86914	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.88867	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.9082	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.92773	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.94727	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.9668	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 5.98633	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.00586	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.02539	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.04492	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.06445	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.08398	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.10352	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.12305	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.14258	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.16211	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.18164	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.20117	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.2207	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.24023	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.25977	0.00253936	 0.127279	        0	0.000323208	0.00253936	 0.806419	0.00220005	
 6.2793	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.29883	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.31836	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.33789	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.35742	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.37695	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.39648	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.41602	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.43555	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.45508	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.47461	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.49414	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.51367	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.5332	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.55273	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.57227	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.5918	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.61133	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.63086	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.65039	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.66992	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.68945	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.70898	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.72852	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.74805	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.76758	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.78711	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.80664	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.82617	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.8457	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.86523	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.88477	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.9043	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.92383	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.94336	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.96289	    0.001	        0	        0	        0	    0.001	        0	        0	
 6.98242	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.00195	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.02148	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.04102	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.06055	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.08008	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.09961	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.11914	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.13867	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.1582	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.17773	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.19727	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.2168	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.23633	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.25586	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.27539	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.29492	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.31445	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.33398	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.35352	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.37305	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.39258	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.41211	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.43164	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.45117	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.4707	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.49023	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.50977	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.5293	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.54883	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.56836	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.58789	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.60742	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.62695	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.64648	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.66602	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.68555	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.70508	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.72461	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.74414	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.76367	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.7832	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.80273	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.82227	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.8418	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.86133	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.88086	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.90039	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.91992	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.93945	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.95898	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.97852	    0.001	        0	        0	        0	    0.001	        0	        0	
 7.99805	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.01758	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.03711	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.05664	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.07617	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.0957	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.11523	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.13477	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.1543	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.17383	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.19336	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.21289	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.23242	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.25195	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.27148	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.29102	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.31055	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.33008	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.34961	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.36914	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.38867	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.4082	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.42773	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.44727	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.4668	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.48633	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.50586	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.52539	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.54492	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.56445	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.58398	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.60352	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.62305	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.64258	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.66211	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.68164	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.70117	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.7207	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.74023	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.75977	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.7793	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.79883	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.81836	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.83789	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.85742	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.87695	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.89648	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.91602	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.93555	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.95508	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.97461	    0.001	        0	        0	        0	    0.001	        0	        0	
 8.99414	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.01367	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.0332	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.05273	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.07227	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.0918	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.11133	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.13086	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.15039	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.16992	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.18945	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.20898	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.22852	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.24805	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.26758	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.28711	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.30664	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.32617	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.3457	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.36523	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.38477	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.4043	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.42383	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.44336	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.46289	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.48242	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.50195	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.52148	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.54102	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.56055	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.58008	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.59961	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.61914	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.63867	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.6582	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.67773	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.69727	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.7168	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.73633	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.75586	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.77539	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.79492	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.81445	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.83398	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.85352	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.87305	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.89258	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.91211	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.93164	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.95117	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.9707	    0.001	        0	        0	        0	    0.001	        0	        0	
 9.99023	    0.001	        0	        0	        0	    0.001	        0	        0