Refactoring CudaArray and ArakawaA grid

2025-11-29 17:28:03 +01:00 · 2018-08-23 20:44:49 +02:00
parent 5668e28f99
commit 918d22b257
10 changed files with 452 additions and 159 deletions
--- a/Autotuning.ipynb
+++ b/Autotuning.ipynb
@@ -228,7 +228,7 @@
      "CUDA version (9, 1, 0)\n",
      "Driver version 9010\n",
      "Using 'GeForce 840M' GPU\n",
-      "Created context handle <879048629408>\n",
+      "Created context handle <694827722560>\n",
      "Using CUDA cache dir c:\\Users\\anbro\\Documents\\projects\\ShallowWaterGPU\\GPUSimulators\\cuda_cache\n",
      "Autotuning enabled. It may take several minutes to run the code the first time: have patience\n"
     ]
@@ -247,13 +247,13 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "gen_data: 3115.227938 ms\n"
+      "gen_data: 1647.211552 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
-       "<matplotlib.image.AxesImage at 0xccab2d4c18>"
+       "<matplotlib.image.AxesImage at 0xa1c91aa390>"
      ]
     },
     "execution_count": 8,
@@ -328,14 +328,238 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 9,
   "metadata": {
    "scrolled": false
   },
-   "outputs": [],
+   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LxF\n",
      "[63x63] => 107.3 (0.000185)\n",
      "[127x127] => 165.6 (0.000487)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\anbro\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:22: RuntimeWarning: invalid value encountered in sqrt\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[191x191] => 183.4 (0.000995)\n",
      "[255x255] => 180.0 (0.001806)\n",
      "[319x319] => 185.8 (0.002738)\n",
      "[383x383] => 187.3 (0.003915)\n",
      "[447x447] => 189.7 (0.005266)\n",
      "[511x511] => 191.8 (0.006806)\n",
      "[639x639] => 193.6 (0.010548)\n",
      "[767x767] => 193.7 (0.015182)\n",
      "[895x895] => 195.6 (0.020481)\n",
      "[1023x1023] => 195.0 (0.026839)\n",
      "[1151x1151] => 195.8 (0.033822)\n",
      "[1279x1279] => 196.1 (0.041711)\n",
      "[1407x1407] => 196.2 (0.050439)\n",
      "[1535x1535] => 196.4 (0.059986)\n",
      "[1663x1663] => 196.6 (0.070330)\n",
      "[1791x1791] => 196.7 (0.081546)\n",
      "[1919x1919] => 196.9 (0.093511)\n",
      "[2047x2047] => 202.9 (0.103257)\n",
      "[2303x2303] => 210.7 (0.125838)\n",
      "[2559x2559] => 208.0 (0.157417)\n",
      "[2815x2815] => 211.6 (0.187229)\n",
      "[3071x3071] => 208.7 (0.225954)\n",
      "[3327x3327] => 214.2 (0.258395)\n",
      "[3583x3583] => 214.2 (0.299629)\n",
      "[3839x3839] => 214.2 (0.343982)\n",
      "[4095x4095] => 214.9 (0.390088)\n",
      "FORCE\n",
      "[63x63] => 94.3 (0.000210)\n",
      "[127x127] => 136.5 (0.000591)\n",
      "[191x191] => 147.0 (0.001241)\n",
      "[255x255] => 148.5 (0.002189)\n",
      "[319x319] => 151.6 (0.003357)\n",
      "[383x383] => 153.0 (0.004793)\n",
      "[447x447] => 153.9 (0.006494)\n",
      "[511x511] => 155.0 (0.008421)\n",
      "[639x639] => 156.4 (0.013056)\n",
      "[767x767] => 156.5 (0.018790)\n",
      "[895x895] => 157.0 (0.025514)\n",
      "[1023x1023] => 143.6 (0.036450)\n",
      "[1151x1151] => 143.6 (0.046115)\n",
      "[1279x1279] => 143.8 (0.056865)\n",
      "[1407x1407] => 143.9 (0.068797)\n",
      "[1535x1535] => 144.0 (0.081832)\n",
      "[1663x1663] => 144.0 (0.096007)\n",
      "[1791x1791] => 144.0 (0.111343)\n",
      "[1919x1919] => 144.2 (0.127712)\n",
      "[2047x2047] => 151.7 (0.138153)\n",
      "[2303x2303] => 147.3 (0.180021)\n",
      "[2559x2559] => 154.3 (0.212248)\n",
      "[2815x2815] => 158.3 (0.250279)\n",
      "[3071x3071] => 156.9 (0.300547)\n",
      "[3327x3327] => 158.4 (0.349353)\n",
      "[3583x3583] => 158.4 (0.405175)\n",
      "[3839x3839] => 158.4 (0.465201)\n",
      "[4095x4095] => 158.4 (0.529337)\n",
      "HLL\n",
      "[63x63] => 65.7 (0.000302)\n",
      "[127x127] => 98.6 (0.000818)\n",
      "[191x191] => 108.1 (0.001688)\n",
      "[255x255] => 109.2 (0.002977)\n",
      "[319x319] => 111.9 (0.004546)\n",
      "[383x383] => 113.2 (0.006482)\n",
      "[447x447] => 113.7 (0.008785)\n",
      "[511x511] => 114.4 (0.011411)\n",
      "[639x639] => 115.3 (0.017713)\n",
      "[767x767] => 115.6 (0.025454)\n",
      "[895x895] => 105.7 (0.037888)\n",
      "[1023x1023] => 105.8 (0.049473)\n",
      "[1151x1151] => 105.9 (0.062558)\n",
      "[1279x1279] => 106.0 (0.077148)\n",
      "[1407x1407] => 106.1 (0.093290)\n",
      "[1535x1535] => 109.8 (0.107271)\n",
      "[1663x1663] => 106.2 (0.130195)\n",
      "[1791x1791] => 107.7 (0.148973)\n",
      "[1919x1919] => 115.0 (0.160104)\n",
      "[2047x2047] => 113.3 (0.184913)\n",
      "[2303x2303] => 111.9 (0.236908)\n",
      "[2559x2559] => 116.6 (0.280840)\n",
      "[2815x2815] => 116.6 (0.339777)\n",
      "[3071x3071] => 116.6 (0.404268)\n",
      "[3327x3327] => 116.6 (0.474572)\n",
      "[3583x3583] => 116.7 (0.550240)\n",
      "[3839x3839] => 116.7 (0.631563)\n",
      "[4095x4095] => 116.7 (0.718161)\n",
      "HLL2\n",
      "[63x63] => 44.2 (0.000449)\n",
      "[127x127] => 63.0 (0.001280)\n",
      "[191x191] => 68.4 (0.002666)\n",
      "[255x255] => 69.2 (0.004698)\n",
      "[319x319] => 70.6 (0.007204)\n",
      "[383x383] => 71.1 (0.010314)\n",
      "[447x447] => 71.6 (0.013956)\n",
      "[511x511] => 72.0 (0.018146)\n",
      "[639x639] => 72.4 (0.028204)\n",
      "[767x767] => 72.5 (0.040545)\n",
      "[895x895] => 72.8 (0.055047)\n",
      "[1023x1023] => 72.8 (0.071828)\n",
      "[1151x1151] => 66.5 (0.099652)\n",
      "[1279x1279] => 69.8 (0.117195)\n",
      "[1407x1407] => 67.0 (0.147833)\n",
      "[1535x1535] => 71.3 (0.165185)\n",
      "[1663x1663] => 71.2 (0.194123)\n",
      "[1791x1791] => 72.1 (0.222351)\n",
      "[1919x1919] => 70.3 (0.261847)\n",
      "[2047x2047] => 73.2 (0.286228)\n",
      "[2303x2303] => 72.0 (0.368479)\n",
      "[2559x2559] => 73.2 (0.447096)\n",
      "[2815x2815] => 73.2 (0.541084)\n",
      "[3071x3071] => 73.2 (0.643925)\n",
      "[3327x3327] => 73.2 (0.755588)\n",
      "[3583x3583] => 73.3 (0.876222)\n",
      "[3839x3839] => 73.3 (1.005958)\n",
      "[4095x4095] => 73.3 (1.144158)\n",
      "KP07\n",
      "[63x63] => 69.9 (0.000284)\n",
      "[127x127] => 95.0 (0.000849)\n",
      "[191x191] => 101.7 (0.001794)\n",
      "[255x255] => 101.3 (0.003209)\n",
      "[319x319] => 106.9 (0.004760)\n",
      "[383x383] => 107.1 (0.006850)\n",
      "[447x447] => 109.2 (0.009150)\n",
      "[511x511] => 108.0 (0.012088)\n",
      "[639x639] => 111.6 (0.018295)\n",
      "[767x767] => 111.6 (0.026361)\n",
      "[895x895] => 102.4 (0.039123)\n",
      "[1023x1023] => 102.2 (0.051186)\n",
      "[1151x1151] => 102.3 (0.064764)\n",
      "[1279x1279] => 103.4 (0.079074)\n",
      "[1407x1407] => 103.2 (0.095876)\n",
      "[1535x1535] => 106.3 (0.110860)\n",
      "[1663x1663] => 103.1 (0.134182)\n",
      "[1791x1791] => 107.7 (0.148853)\n",
      "[1919x1919] => 105.5 (0.174575)\n",
      "[2047x2047] => 111.4 (0.188084)\n",
      "[2303x2303] => 113.5 (0.233650)\n",
      "[2559x2559] => 114.0 (0.287327)\n",
      "[2815x2815] => 113.7 (0.348536)\n",
      "[3071x3071] => 113.2 (0.416533)\n",
      "[3327x3327] => 113.7 (0.486893)\n",
      "[3583x3583] => 113.5 (0.565573)\n",
      "[3839x3839] => 113.5 (0.649058)\n",
      "[4095x4095] => 113.6 (0.738275)\n",
      "KP07_dimsplit\n",
      "[63x63] => 49.9 (0.000397)\n",
      "[127x127] => 71.7 (0.001125)\n",
      "[191x191] => 76.8 (0.002374)\n",
      "[255x255] => 77.5 (0.004197)\n",
      "[319x319] => 79.0 (0.006437)\n",
      "[383x383] => 79.8 (0.009189)\n",
      "[447x447] => 80.3 (0.012449)\n",
      "[511x511] => 80.6 (0.016191)\n",
      "[639x639] => 81.1 (0.025171)\n",
      "[767x767] => 81.3 (0.036181)\n",
      "[895x895] => 74.3 (0.053902)\n",
      "[1023x1023] => 74.4 (0.070335)\n",
      "[1151x1151] => 76.2 (0.086896)\n",
      "[1279x1279] => 74.5 (0.109725)\n",
      "[1407x1407] => 74.6 (0.132712)\n",
      "[1535x1535] => 79.4 (0.148342)\n",
      "[1663x1663] => 78.3 (0.176547)\n",
      "[1791x1791] => 81.3 (0.197279)\n",
      "[1919x1919] => 78.5 (0.234550)\n",
      "[2047x2047] => 82.0 (0.255396)\n",
      "[2303x2303] => 81.0 (0.327297)\n",
      "[2559x2559] => 82.0 (0.399197)\n",
      "[2815x2815] => 82.0 (0.483034)\n",
      "[3071x3071] => 82.0 (0.574737)\n",
      "[3327x3327] => 82.1 (0.674395)\n",
      "[3583x3583] => 82.1 (0.782180)\n",
      "[3839x3839] => 82.1 (0.897551)\n",
      "[4095x4095] => 82.1 (1.020911)\n",
      "WAF\n",
      "[63x63] => 32.8 (0.000605)\n",
      "[127x127] => 45.6 (0.001768)\n",
      "[191x191] => 53.9 (0.003381)\n",
      "[255x255] => 54.3 (0.005985)\n",
      "[319x319] => 57.7 (0.008821)\n",
      "[383x383] => 56.9 (0.012893)\n",
      "[447x447] => 59.3 (0.016840)\n",
      "[511x511] => 58.8 (0.022214)\n",
      "[639x639] => 59.6 (0.034278)\n",
      "[767x767] => 60.1 (0.048942)\n",
      "[895x895] => 55.3 (0.072483)\n",
      "[1023x1023] => 55.4 (0.094402)\n",
      "[1151x1151] => 55.7 (0.119006)\n",
      "[1279x1279] => 55.0 (0.148746)\n",
      "[1407x1407] => 55.8 (0.177399)\n",
      "[1535x1535] => 58.7 (0.200663)\n",
      "[1663x1663] => 57.8 (0.239299)\n",
      "[1791x1791] => 59.6 (0.269144)\n",
      "[1919x1919] => 61.1 (0.301218)\n",
      "[2047x2047] => 61.2 (0.342070)\n",
      "[2303x2303] => 61.3 (0.432280)\n",
      "[2559x2559] => 61.0 (0.537125)\n",
      "[2815x2815] => 61.1 (0.648336)\n",
      "[3071x3071] => 61.3 (0.769734)\n",
      "[3327x3327] => 61.4 (0.901199)\n",
      "[3583x3583] => 61.1 (1.049726)\n",
      "[3839x3839] => 61.3 (1.202961)\n",
      "[4095x4095] => 61.4 (1.366446)\n"
     ]
    }
   ],
   "source": [
-    "run_simulation = False\n",
+    "run_simulation = True\n",
    "sizes = list(range(64, 512, 64)) + list(range(512, 2048, 128)) + list(range(2048, 4096, 256)) + [4096]\n",
    "simulators = [LxF.LxF, FORCE.FORCE, HLL.HLL, HLL2.HLL2, KP07.KP07, KP07_dimsplit.KP07_dimsplit, WAF.WAF]\n",
    "if (run_simulation):\n",
    "    megacells = {}\n",
    "    for simulator in simulators:\n",
@@ -388,7 +612,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
@@ -412,7 +636,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 11,
   "metadata": {
    "scrolled": false
   },
@@ -423,7 +647,7 @@
       "Text(0.5,0,'nx')"
      ]
     },
-     "execution_count": 14,
+     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
@@ -450,7 +674,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
@@ -459,7 +683,7 @@
       "Text(0.5,0,'nx')"
      ]
     },
-     "execution_count": 15,
+     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    },
@@ -487,6 +711,23 @@
    "plt.xlabel(\"nx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "False\n"
     ]
    }
   ],
   "source": [
    "print(type(None) == None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
--- a/GPUSimulators/Common.py
+++ b/GPUSimulators/Common.py
@@ -84,7 +84,6 @@ class CudaContext(object):
        self.cuda_device = cuda.Device(0)
        self.logger.info("Using '%s' GPU", self.cuda_device.name())
        self.logger.debug(" => compute capability: %s", str(self.cuda_device.compute_capability()))
        self.logger.debug(" => memory: %d MB", self.cuda_device.total_memory() / (1024*1024))
        # Create the CUDA context
        if (self.blocking):
@@ -92,6 +91,9 @@ class CudaContext(object):
            self.logger.warning("Using blocking context")
        else:
            self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_AUTO)
        free, total = cuda.mem_get_info()
        self.logger.debug(" => memory: %d / %d MB available", int(free/(1024*1024)), int(total/(1024*1024)))
        self.logger.info("Created context handle <%s>", str(self.cuda_context.handle))
@@ -294,7 +296,7 @@ class CudaArray2D:
    """
    Uploads initial data to the CL device
    """
-    def __init__(self, stream, nx, ny, x_halo, y_halo, cpu_data):
+    def __init__(self, stream, nx, ny, x_halo, y_halo, cpu_data=None, dtype=np.float32):
        self.logger =  logging.getLogger(__name__)
        self.nx = nx
        self.ny = ny
@@ -307,16 +309,18 @@ class CudaArray2D:
        #self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny)
        #Make sure data is in proper format
-        assert np.issubdtype(cpu_data.dtype, np.float32), "Wrong datatype: %s" % str(cpu_data.dtype)
+        if cpu_data is not None:
-        assert cpu_data.itemsize == 4, "Wrong size of data type"
+            assert cpu_data.itemsize == 4, "Wrong size of data type"
-        assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"
+            assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"
        #Upload data to the device
-        if (cpu_data.shape == (ny_halo, nx_halo)):
+        if cpu_data is None:
            self.data = pycuda.gpuarray.empty((ny_halo, nx_halo), dtype)
        elif (cpu_data.shape == (ny_halo, nx_halo)):
            self.data = pycuda.gpuarray.to_gpu_async(cpu_data, stream=stream)
        elif (cpu_data.shape == (self.ny, self.nx)):
            #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
-            self.data = pycuda.gpuarray.empty((ny_halo, nx_halo), cpu_data.dtype)
+            self.data = pycuda.gpuarray.empty((ny_halo, nx_halo), dtype)
            #self.data.fill(0.0)
            #Create copy object from host to device
@@ -337,7 +341,6 @@ class CudaArray2D:
            #Perform the copy
            copy(stream)
            stream.synchronize()
        else:
            assert False, "Wrong data shape: %s vs %s / %s" % (str(cpu_data.shape), str((self.ny, self.nx)), str((ny_halo, nx_halo)))
@@ -390,36 +393,31 @@ class CudaArray2D:
 """
 A class representing an Arakawa A type (unstaggered, logically Cartesian) grid
 """
-class SWEDataArakawaA:
+class ArakawaA2D:
    """
    Uploads initial data to the CL device
    """
-    def __init__(self, stream, nx, ny, halo_x, halo_y, h0, hu0, hv0):
+    def __init__(self, stream, nx, ny, halo_x, halo_y, cpu_variables):
        self.logger =  logging.getLogger(__name__)
-        self.h0  = CudaArray2D(stream, nx, ny, halo_x, halo_y, h0)
+        self.gpu_variables = []
-        self.hu0 = CudaArray2D(stream, nx, ny, halo_x, halo_y, hu0)
+        for cpu_variable in cpu_variables:
-        self.hv0 = CudaArray2D(stream, nx, ny, halo_x, halo_y, hv0)
+            self.gpu_variables += [CudaArray2D(stream, nx, ny, halo_x, halo_y, cpu_variable)]
        self.h1  = CudaArray2D(stream, nx, ny, halo_x, halo_y, h0)
        self.hu1 = CudaArray2D(stream, nx, ny, halo_x, halo_y, hu0)
        self.hv1 = CudaArray2D(stream, nx, ny, halo_x, halo_y, hv0)
    """
    Swaps the variables after a timestep has been completed
    """
    def swap(self):
        self.h1,  self.h0  = self.h0,  self.h1
        self.hu1, self.hu0 = self.hu0, self.hu1
        self.hv1, self.hv0 = self.hv0, self.hv1
    def __getitem__(self, key):
        assert type(key) == int, "Indexing is int based"
        if (key > len(self.gpu_variables) or key < 0):
            raise IndexError("Out of bounds")
        return self.gpu_variables[key]
    """
    Enables downloading data from CL device to Python
    """
    def download(self, stream):
-        h_cpu  = self.h0.download(stream, async=True)
+        cpu_variables = []
-        hu_cpu = self.hu0.download(stream, async=True)
+        for gpu_variable in self.gpu_variables:
-        hv_cpu = self.hv0.download(stream, async=False)
+            cpu_variables += [gpu_variable.download(stream, async=True)]
-        
+        stream.synchronize()
-        return h_cpu, hu_cpu, hv_cpu
+        return cpu_variables
--- a/GPUSimulators/FORCE.py
+++ b/GPUSimulators/FORCE.py
@@ -21,7 +21,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 """
 #Import packages we need
-from GPUSimulators import Simulator
+from GPUSimulators import Simulator, Common
@@ -60,9 +60,7 @@ class FORCE (Simulator.BaseSimulator):
        # Call super constructor
        super().__init__(context, \
            h0, hu0, hv0, \
            nx, ny, \
            1, 1, \
            dx, dy, dt, \
            g, \
            block_width, block_height);
@@ -73,6 +71,16 @@ class FORCE (Simulator.BaseSimulator):
                                        BLOCK_WIDTH=self.local_size[0], \
                                        BLOCK_HEIGHT=self.local_size[1])
        #Create data by uploading to device
        self.u0 = Common.ArakawaA2D(self.stream, \
                        nx, ny, \
                        1, 1, \
                        [h0, hu0, hv0])
        self.u1 = Common.ArakawaA2D(self.stream, \
                        nx, ny, \
                        1, 1, \
                        [None, None, None])
    def __str__(self):
        return "First order centered"
@@ -84,13 +92,14 @@ class FORCE (Simulator.BaseSimulator):
                self.nx, self.ny, \
                self.dx, self.dy, dt, \
                self.g, \
-                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.u0[0].data.gpudata, self.u0[0].data.strides[0], \
-                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.u0[1].data.gpudata, self.u0[1].data.strides[0], \
-                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.u0[2].data.gpudata, self.u0[2].data.strides[0], \
-                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.u1[0].data.gpudata, self.u1[0].data.strides[0], \
-                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.u1[1].data.gpudata, self.u1[1].data.strides[0], \
-                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
+                self.u1[2].data.gpudata, self.u1[2].data.strides[0])
-        self.data.swap()
+        self.u0, self.u1 = self.u1, self.u0
        self.t += dt
-        
+    def download(self):
        return self.u0.download(self.stream)
--- a/GPUSimulators/HLL.py
+++ b/GPUSimulators/HLL.py
@@ -20,7 +20,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 """
 #Import packages we need
-from GPUSimulators import Simulator
+from GPUSimulators import Simulator, Common
@@ -55,9 +55,7 @@ class HLL (Simulator.BaseSimulator):
        # Call super constructor
        super().__init__(context, \
            h0, hu0, hv0, \
            nx, ny, \
            1, 1, \
            dx, dy, dt, \
            g, \
            block_width, block_height);
@@ -68,6 +66,16 @@ class HLL (Simulator.BaseSimulator):
                                        BLOCK_WIDTH=self.local_size[0], \
                                        BLOCK_HEIGHT=self.local_size[1])
        #Create data by uploading to device
        self.u0 = Common.ArakawaA2D(self.stream, \
                        nx, ny, \
                        1, 1, \
                        [h0, hu0, hv0])
        self.u1 = Common.ArakawaA2D(self.stream, \
                        nx, ny, \
                        1, 1, \
                        [None, None, None])
    def __str__(self):
        return "Harten-Lax-van Leer"
@@ -79,13 +87,14 @@ class HLL (Simulator.BaseSimulator):
                self.nx, self.ny, \
                self.dx, self.dy, dt, \
                self.g, \
-                self.data.h0.data.gpudata,  self.data.h0.data.strides[0],  \
+                self.u0[0].data.gpudata, self.u0[0].data.strides[0], \
-                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.u0[1].data.gpudata, self.u0[1].data.strides[0], \
-                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.u0[2].data.gpudata, self.u0[2].data.strides[0], \
-                self.data.h1.data.gpudata,  self.data.h1.data.strides[0],  \
+                self.u1[0].data.gpudata, self.u1[0].data.strides[0], \
-                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.u1[1].data.gpudata, self.u1[1].data.strides[0], \
-                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])        
+                self.u1[2].data.gpudata, self.u1[2].data.strides[0])
-        self.data.swap()
+        self.u0, self.u1 = self.u1, self.u0
        self.t += dt
-
+    def download(self):
        return self.u0.download(self.stream)
--- a/GPUSimulators/HLL2.py
+++ b/GPUSimulators/HLL2.py
@@ -21,7 +21,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #Import packages we need
 import numpy as np
-from GPUSimulators import Simulator
+from GPUSimulators import Simulator, Common
@@ -59,9 +59,7 @@ class HLL2 (Simulator.BaseSimulator):
        # Call super constructor
        super().__init__(context, \
            h0, hu0, hv0, \
            nx, ny, \
            2, 2, \
            dx, dy, dt, \
            g, \
            block_width, block_height);
@@ -74,6 +72,16 @@ class HLL2 (Simulator.BaseSimulator):
                                        BLOCK_WIDTH=self.local_size[0], \
                                        BLOCK_HEIGHT=self.local_size[1])
        #Create data by uploading to device
        self.u0 = Common.ArakawaA2D(self.stream, \
                        nx, ny, \
                        2, 2, \
                        [h0, hu0, hv0])
        self.u1 = Common.ArakawaA2D(self.stream, \
                        nx, ny, \
                        2, 2, \
                        [None, None, None])
    def __str__(self):
        return "Harten-Lax-van Leer (2nd order)"
@@ -90,13 +98,13 @@ class HLL2 (Simulator.BaseSimulator):
                self.g, \
                self.theta, \
                np.int32(0), \
-                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.u0[0].data.gpudata, self.u0[0].data.strides[0], \
-                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.u0[1].data.gpudata, self.u0[1].data.strides[0], \
-                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.u0[2].data.gpudata, self.u0[2].data.strides[0], \
-                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.u1[0].data.gpudata, self.u1[0].data.strides[0], \
-                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.u1[1].data.gpudata, self.u1[1].data.strides[0], \
-                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
+                self.u1[2].data.gpudata, self.u1[2].data.strides[0])
-        self.data.swap()
+        self.u0, self.u1 = self.u1, self.u0
        self.t += dt
    def stepDimsplitYX(self, dt):
@@ -106,12 +114,14 @@ class HLL2 (Simulator.BaseSimulator):
                self.g, \
                self.theta, \
                np.int32(1), \
-                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.u0[0].data.gpudata, self.u0[0].data.strides[0], \
-                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.u0[1].data.gpudata, self.u0[1].data.strides[0], \
-                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.u0[2].data.gpudata, self.u0[2].data.strides[0], \
-                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.u1[0].data.gpudata, self.u1[0].data.strides[0], \
-                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.u1[1].data.gpudata, self.u1[1].data.strides[0], \
-                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
+                self.u1[2].data.gpudata, self.u1[2].data.strides[0])
-        self.data.swap()
+        self.u0, self.u1 = self.u1, self.u0
        self.t += dt
-        
+        
    def download(self):
        return self.u0.download(self.stream)
--- a/GPUSimulators/KP07.py
+++ b/GPUSimulators/KP07.py
@@ -26,7 +26,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #Import packages we need
 import numpy as np
-from GPUSimulators import Simulator
+from GPUSimulators import Simulator, Common
@@ -60,9 +60,7 @@ class KP07 (Simulator.BaseSimulator):
        # Call super constructor
        super().__init__(context, \
            h0, hu0, hv0, \
            nx, ny, \
            2, 2, \
            dx, dy, dt, \
            g, \
            block_width, block_height);
@@ -75,6 +73,16 @@ class KP07 (Simulator.BaseSimulator):
                                        BLOCK_WIDTH=self.local_size[0], \
                                        BLOCK_HEIGHT=self.local_size[1])
        #Create data by uploading to device
        self.u0 = Common.ArakawaA2D(self.stream, \
                        nx, ny, \
                        2, 2, \
                        [h0, hu0, hv0])
        self.u1 = Common.ArakawaA2D(self.stream, \
                        nx, ny, \
                        2, 2, \
                        [None, None, None])
    def __str__(self):
        return "Kurganov-Petrova 2007"
@@ -88,13 +96,13 @@ class KP07 (Simulator.BaseSimulator):
                self.g, \
                self.theta, \
                np.int32(substep), \
-                self.data.h0.data.gpudata,  self.data.h0.data.strides[0],  \
+                self.u0[0].data.gpudata, self.u0[0].data.strides[0], \
-                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.u0[1].data.gpudata, self.u0[1].data.strides[0], \
-                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.u0[2].data.gpudata, self.u0[2].data.strides[0], \
-                self.data.h1.data.gpudata,  self.data.h1.data.strides[0],  \
+                self.u1[0].data.gpudata, self.u1[0].data.strides[0], \
-                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.u1[1].data.gpudata, self.u1[1].data.strides[0], \
-                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
+                self.u1[2].data.gpudata, self.u1[2].data.strides[0])
-        self.data.swap()
+        self.u0, self.u1 = self.u1, self.u0
    def stepEuler(self, dt):
        self.substepRK(dt, 0)
@@ -108,5 +116,4 @@ class KP07 (Simulator.BaseSimulator):
        self.t += dt
    def download(self):
-        return self.data.download(self.stream)
+        return self.u0.download(self.stream)
--- a/GPUSimulators/KP07_dimsplit.py
+++ b/GPUSimulators/KP07_dimsplit.py
@@ -26,7 +26,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #Import packages we need
 import numpy as np
-from GPUSimulators import Simulator
+from GPUSimulators import Simulator, Common
@@ -60,9 +60,7 @@ class KP07_dimsplit (Simulator.BaseSimulator):
        # Call super constructor
        super().__init__(context, \
            h0, hu0, hv0, \
            nx, ny, \
            2, 2, \
            dx, dy, dt, \
            g, \
            block_width, block_height);
@@ -75,6 +73,16 @@ class KP07_dimsplit (Simulator.BaseSimulator):
                                        BLOCK_WIDTH=self.local_size[0], \
                                        BLOCK_HEIGHT=self.local_size[1])
        #Create data by uploading to device
        self.u0 = Common.ArakawaA2D(self.stream, \
                        nx, ny, \
                        2, 2, \
                        [h0, hu0, hv0])
        self.u1 = Common.ArakawaA2D(self.stream, \
                        nx, ny, \
                        2, 2, \
                        [None, None, None])
    def __str__(self):
        return "Kurganov-Petrova 2007 dimensionally split"
@@ -91,13 +99,13 @@ class KP07_dimsplit (Simulator.BaseSimulator):
                self.g, \
                self.theta, \
                np.int32(0), \
-                self.data.h0.data.gpudata,  self.data.h0.data.strides[0], \
+                self.u0[0].data.gpudata, self.u0[0].data.strides[0], \
-                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.u0[1].data.gpudata, self.u0[1].data.strides[0], \
-                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.u0[2].data.gpudata, self.u0[2].data.strides[0], \
-                self.data.h1.data.gpudata,  self.data.h1.data.strides[0], \
+                self.u1[0].data.gpudata, self.u1[0].data.strides[0], \
-                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.u1[1].data.gpudata, self.u1[1].data.strides[0], \
-                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
+                self.u1[2].data.gpudata, self.u1[2].data.strides[0])
-        self.data.swap()
+        self.u0, self.u1 = self.u1, self.u0
        self.t += dt
    def stepDimsplitYX(self, dt):
@@ -107,13 +115,14 @@ class KP07_dimsplit (Simulator.BaseSimulator):
                self.g, \
                self.theta, \
                np.int32(1), \
-                self.data.h0.data.gpudata,  self.data.h0.data.strides[0], \
+                self.u0[0].data.gpudata, self.u0[0].data.strides[0], \
-                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.u0[1].data.gpudata, self.u0[1].data.strides[0], \
-                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.u0[2].data.gpudata, self.u0[2].data.strides[0], \
-                self.data.h1.data.gpudata,  self.data.h1.data.strides[0], \
+                self.u1[0].data.gpudata, self.u1[0].data.strides[0], \
-                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.u1[1].data.gpudata, self.u1[1].data.strides[0], \
-                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
+                self.u1[2].data.gpudata, self.u1[2].data.strides[0])
-        self.data.swap()
+        self.u0, self.u1 = self.u1, self.u0
        self.t += dt
-        
+    def download(self):
        return self.u0.download(self.stream)
--- a/GPUSimulators/LxF.py
+++ b/GPUSimulators/LxF.py
@@ -21,7 +21,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 """
 #Import packages we need
-from GPUSimulators import Simulator
+from GPUSimulators import Simulator, Common
@@ -56,9 +56,7 @@ class LxF (Simulator.BaseSimulator):
        # Call super constructor
        super().__init__(context, \
            h0, hu0, hv0, \
            nx, ny, \
            1, 1, \
            dx, dy, dt, \
            g, \
            block_width, block_height);
@@ -68,6 +66,16 @@ class LxF (Simulator.BaseSimulator):
                                        "iiffffPiPiPiPiPiPi", \
                                        BLOCK_WIDTH=self.local_size[0], \
                                        BLOCK_HEIGHT=self.local_size[1])
        #Create data by uploading to device
        self.u0 = Common.ArakawaA2D(self.stream, \
                        nx, ny, \
                        1, 1, \
                        [h0, hu0, hv0])
        self.u1 = Common.ArakawaA2D(self.stream, \
                        nx, ny, \
                        1, 1, \
                        [None, None, None])
    def __str__(self):
        return "Lax Friedrichs"
@@ -80,13 +88,14 @@ class LxF (Simulator.BaseSimulator):
                self.nx, self.ny, \
                self.dx, self.dy, dt, \
                self.g, \
-                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.u0[0].data.gpudata, self.u0[0].data.strides[0], \
-                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.u0[1].data.gpudata, self.u0[1].data.strides[0], \
-                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.u0[2].data.gpudata, self.u0[2].data.strides[0], \
-                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.u1[0].data.gpudata, self.u1[0].data.strides[0], \
-                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.u1[1].data.gpudata, self.u1[1].data.strides[0], \
-                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
+                self.u1[2].data.gpudata, self.u1[2].data.strides[0])
-        self.data.swap()
+        self.u0, self.u1 = self.u1, self.u0
        self.t += dt
-
+    def download(self):
        return self.u0.download(self.stream)
--- a/GPUSimulators/Simulator.py
+++ b/GPUSimulators/Simulator.py
@@ -48,9 +48,7 @@ class BaseSimulator:
    """
    def __init__(self, \
                 context, \
                 h0, hu0, hv0, \
                 nx, ny, \
                 ghost_cells_x, ghost_cells_y, \
                 dx, dy, dt, \
                 g, \
                 block_width, block_height):
@@ -67,14 +65,6 @@ class BaseSimulator:
        #Create a CUDA stream
        self.stream = cuda.Stream()
        #Create data by uploading to device
        free, total = cuda.mem_get_info()
        self.logger.debug("GPU memory: %d / %d MB available", int(free/(1024*1024)), int(total/(1024*1024)))
        self.data = Common.SWEDataArakawaA(self.stream, \
                            nx, ny, \
                            ghost_cells_x, ghost_cells_y, \
                            h0, hu0, hv0)
        #Save input parameters
        #Notice that we need to specify them in the correct dataformat for the
@@ -94,7 +84,7 @@ class BaseSimulator:
        self.global_size = ( \
                       int(np.ceil(self.nx / float(self.local_size[0]))), \
                       int(np.ceil(self.ny / float(self.local_size[1]))) \
-                      ) 
+                      )
    """
    Function which simulates forward in time using the default simulation type
@@ -192,7 +182,7 @@ class BaseSimulator:
        return self.t
    def download(self):
-        return self.data.download(self.stream)
+        raise(NotImplementedError("Needs to be implemented in subclass"))
    def synchronize(self):
        self.stream.synchronize()
--- a/GPUSimulators/WAF.py
+++ b/GPUSimulators/WAF.py
@@ -22,7 +22,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #Import packages we need
 import numpy as np
-from GPUSimulators import Simulator
+from GPUSimulators import Simulator, Common
@@ -55,9 +55,7 @@ class WAF (Simulator.BaseSimulator):
        # Call super constructor
        super().__init__(context, \
            h0, hu0, hv0, \
            nx, ny, \
            2, 2, \
            dx, dy, dt, \
            g, \
            block_width, block_height);
@@ -68,6 +66,16 @@ class WAF (Simulator.BaseSimulator):
                                        BLOCK_WIDTH=self.local_size[0], \
                                        BLOCK_HEIGHT=self.local_size[1])
        #Create data by uploading to device
        self.u0 = Common.ArakawaA2D(self.stream, \
                        nx, ny, \
                        2, 2, \
                        [h0, hu0, hv0])
        self.u1 = Common.ArakawaA2D(self.stream, \
                        nx, ny, \
                        2, 2, \
                        [None, None, None])
    def __str__(self):
        return "Weighted average flux"
@@ -79,30 +87,33 @@ class WAF (Simulator.BaseSimulator):
    def stepDimsplitXY(self, dt):
        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
-                        self.nx, self.ny, \
+                self.nx, self.ny, \
-                        self.dx, self.dy, dt, \
+                self.dx, self.dy, dt, \
-                        self.g, \
+                self.g, \
-                        np.int32(0), \
+                np.int32(0), \
-                        self.data.h0.data.gpudata,  self.data.h0.data.strides[0],  \
+                self.u0[0].data.gpudata, self.u0[0].data.strides[0], \
-                        self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.u0[1].data.gpudata, self.u0[1].data.strides[0], \
-                        self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.u0[2].data.gpudata, self.u0[2].data.strides[0], \
-                        self.data.h1.data.gpudata,  self.data.h1.data.strides[0],  \
+                self.u1[0].data.gpudata, self.u1[0].data.strides[0], \
-                        self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.u1[1].data.gpudata, self.u1[1].data.strides[0], \
-                        self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
+                self.u1[2].data.gpudata, self.u1[2].data.strides[0])
-        self.data.swap()
+        self.u0, self.u1 = self.u1, self.u0
        self.t += dt
    def stepDimsplitYX(self, dt):
        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
-                        self.nx, self.ny, \
+                self.nx, self.ny, \
-                        self.dx, self.dy, dt, \
+                self.dx, self.dy, dt, \
-                        self.g, \
+                self.g, \
-                        np.int32(1), \
+                np.int32(1), \
-                        self.data.h0.data.gpudata,  self.data.h0.data.strides[0],  \
+                self.u0[0].data.gpudata, self.u0[0].data.strides[0], \
-                        self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.u0[1].data.gpudata, self.u0[1].data.strides[0], \
-                        self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.u0[2].data.gpudata, self.u0[2].data.strides[0], \
-                        self.data.h1.data.gpudata,  self.data.h1.data.strides[0],  \
+                self.u1[0].data.gpudata, self.u1[0].data.strides[0], \
-                        self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.u1[1].data.gpudata, self.u1[1].data.strides[0], \
-                        self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
+                self.u1[2].data.gpudata, self.u1[2].data.strides[0])
-        self.data.swap()
+        self.u0, self.u1 = self.u1, self.u0
        self.t += dt
    def download(self):
        return self.u0.download(self.stream)