Mirror of https://github.com/smyalygames/FiniteVolumeGPU.git (synced 2025-05-18 06:24:13 +02:00)
Working prototype of autotuning

This commit is contained in:
  parent f60ceaa316
  commit 803ce8ab70

Autotuning.ipynb (387): file diff suppressed because one or more lines are too long
GPUSimulators/Autotuner.py (277, new file)
@@ -0,0 +1,277 @@
# -*- coding: utf-8 -*-

"""
This python module implements the different helper functions and classes

Copyright (C) 2018 SINTEF ICT

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import os
import gc
import numpy as np
import logging
from socket import gethostname

import pycuda.driver as cuda

from GPUSimulators import Common, Simulator, LxF, FORCE, HLL, HLL2, KP07, KP07_dimsplit, WAF


class Autotuner:
    def __init__(self,
                 nx=2048, ny=2048,
                 block_widths=range(8, 32, 2),
                 block_heights=range(8, 32, 2)):
        logger = logging.getLogger(__name__)
        self.filename = "autotuning_data_" + gethostname() + ".npz"
        self.nx = nx
        self.ny = ny
        self.block_widths = block_widths
        self.block_heights = block_heights
        self.performance = {}


    def benchmark(self, simulator, force=False):
        logger = logging.getLogger(__name__)

        #Run through simulators and benchmark
        key = str(simulator.__name__)
        logger.info("Benchmarking %s to %s", key, self.filename)

        #If this simulator has been benchmarked already, skip it
        if (force==False and os.path.isfile(self.filename)):
            with np.load(self.filename) as data:
                if key in data["simulators"]:
                    logger.info("%s already benchmarked - skipping", key)
                    return

        # Set arguments to send to the simulators during construction
        context = Common.CudaContext(autotuning=False)
        g = 9.81
        h0, hu0, hv0, dx, dy, dt = Autotuner.gen_test_data(nx=self.nx, ny=self.ny, g=g)
        arguments = {
            'context': context,
            'h0': h0, 'hu0': hu0, 'hv0': hv0,
            'nx': self.nx, 'ny': self.ny,
            'dx': dx, 'dy': dy, 'dt': 0.9*dt,
            'g': g
        }

        # Load existing data into memory
        benchmark_data = {
            "simulators": [],
        }
        if (os.path.isfile(self.filename)):
            with np.load(self.filename) as data:
                for k, v in data.items():
                    benchmark_data[k] = v

        # Run benchmark
        benchmark_data[key + "_megacells"] = Autotuner.benchmark_single_simulator(simulator, arguments, self.block_widths, self.block_heights)
        benchmark_data[key + "_block_widths"] = self.block_widths
        benchmark_data[key + "_block_heights"] = self.block_heights
        benchmark_data[key + "_arguments"] = str(arguments)

        existing_sims = benchmark_data["simulators"]
        if (isinstance(existing_sims, np.ndarray)):
            existing_sims = existing_sims.tolist()
        if (key not in existing_sims):
            benchmark_data["simulators"] = existing_sims + [key]

        # Save to file
        np.savez_compressed(self.filename, **benchmark_data)


    """
    Function which reads a numpy file with autotuning data
    and reports the maximum performance and block size
    """
    def get_peak_performance(self, simulator):
        logger = logging.getLogger(__name__)

        assert issubclass(simulator, Simulator.BaseSimulator)
        key = simulator.__name__

        if (key in self.performance):
            return self.performance[key]
        else:
            #Run simulation if required
            if (not os.path.isfile(self.filename)):
                logger.debug("Could not get autotuned peak performance for %s: benchmarking", key)
                self.benchmark(simulator)

            with np.load(self.filename) as data:
                if key not in data['simulators']:
                    logger.debug("Could not get autotuned peak performance for %s: benchmarking", key)
                    data.close()
                    self.benchmark(simulator)
                    data = np.load(self.filename)

                def find_max_index(megacells):
                    max_index = np.nanargmax(megacells)
                    return np.unravel_index(max_index, megacells.shape)

                megacells = data[key + '_megacells']
                block_widths = data[key + '_block_widths']
                block_heights = data[key + '_block_heights']
                j, i = find_max_index(megacells)

                self.performance[key] = { "block_width": block_widths[i],
                                          "block_height": block_heights[j],
                                          "megacells": megacells[j, i] }
                logger.debug("Returning %s as peak performance parameters", self.performance[key])
                return self.performance[key]

        #This should never happen
        raise RuntimeError("Something wrong: Could not get autotuning data!")
        return None


    """
    Runs a set of benchmarks for a single simulator
    """
    def benchmark_single_simulator(simulator, arguments, block_widths, block_heights):
        logger = logging.getLogger(__name__)

        megacells = np.empty((len(block_heights), len(block_widths)))
        megacells.fill(np.nan)

        logger.debug("Running %d benchmarks with %s", len(block_heights)*len(block_widths), simulator.__name__)

        sim_arguments = arguments.copy()

        with Common.Timer(simulator.__name__) as t:
            for j, block_height in enumerate(block_heights):
                sim_arguments.update({'block_height': block_height})
                for i, block_width in enumerate(block_widths):
                    sim_arguments.update({'block_width': block_width})
                    megacells[j, i] = Autotuner.run_benchmark(simulator, sim_arguments)

        logger.debug("Completed %s in %f seconds", simulator.__name__, t.secs)

        return megacells


    """
    Runs a benchmark, and returns the number of megacells achieved
    """
    def run_benchmark(simulator, arguments, timesteps=10, warmup_timesteps=2):
        logger = logging.getLogger(__name__)

        #Initialize simulator
        try:
            sim = simulator(**arguments)
        except:
            #An exception raised - not possible to continue
            logger.debug("Failed creating %s with arguments %s", simulator.__name__, str(arguments))
            return np.nan

        #Create timer events
        start = cuda.Event()
        end = cuda.Event()

        #Warmup
        for i in range(warmup_timesteps):
            sim.stepEuler(sim.dt)

        #Run simulation with timer
        start.record(sim.stream)
        for i in range(timesteps):
            sim.stepEuler(sim.dt)
        end.record(sim.stream)

        #Synchronize end event
        end.synchronize()

        #Compute megacells
        gpu_elapsed = end.time_since(start)*1.0e-3
        megacells = (sim.nx*sim.ny*timesteps / (1000*1000)) / gpu_elapsed

        #Sanity check solution
        h, hu, hv = sim.download()
        sane = True
        sane = sane and Autotuner.sanity_check(h, 0.3, 0.7)
        sane = sane and Autotuner.sanity_check(hu, -0.2, 0.2)
        sane = sane and Autotuner.sanity_check(hv, -0.2, 0.2)

        if (sane):
            logger.debug("%s [%d x %d] succeeded: %f megacells, gpu elapsed %f", simulator.__name__, arguments["block_width"], arguments["block_height"], megacells, gpu_elapsed)
            return megacells
        else:
            logger.debug("%s [%d x %d] failed: gpu elapsed %f", simulator.__name__, arguments["block_width"], arguments["block_height"], gpu_elapsed)
            return np.nan


    """
    Generates test dataset
    """
    def gen_test_data(nx, ny, g):
        width = 100.0
        height = 100.0
        dx = width / float(nx)
        dy = height / float(ny)

        x_center = dx*nx/2.0
        y_center = dy*ny/2.0

        #Create a gaussian "dam break" that will not form shocks
        size = width / 5.0
        dt = 10**10

        h = np.zeros((ny, nx), dtype=np.float32)
        hu = np.zeros((ny, nx), dtype=np.float32)
        hv = np.zeros((ny, nx), dtype=np.float32)

        x = dx*(np.arange(0, nx, dtype=np.float32)+0.5) - x_center
        y = dy*(np.arange(0, ny, dtype=np.float32)+0.5) - y_center
        xv, yv = np.meshgrid(x, y, sparse=False, indexing='xy')
        r = np.sqrt(xv**2 + yv**2)
        xv = None
        yv = None
        gc.collect()

        #Generate highres
        h = 0.5 + 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size)
        hu = 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size)
        hv = 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size)

        scale = 0.7
        max_h_estimate = 0.6
        max_u_estimate = 0.1*np.sqrt(2.0)
        dx = width/nx
        dy = height/ny
        dt = scale * min(dx, dy) / (max_u_estimate + np.sqrt(g*max_h_estimate))

        return h, hu, hv, dx, dy, dt


    """
    Checks that a variable is "sane"
    """
    def sanity_check(variable, bound_min, bound_max):
        maxval = np.amax(variable)
        minval = np.amin(variable)
        if (np.isnan(maxval)
                or np.isnan(minval)
                or maxval > bound_max
                or minval < bound_min):
            return False
        else:
            return True
@@ -34,7 +34,7 @@ import pycuda.compiler as cuda_compiler
 import pycuda.gpuarray
 import pycuda.driver as cuda
 
-
+from GPUSimulators import Autotuner
 
 """
 Class which keeps track of time spent for a section of code
@@ -64,8 +64,7 @@ Class which keeps track of the CUDA context and some helper functions
 """
 class CudaContext(object):
     
-    def __init__(self, verbose=True, blocking=False, use_cache=True):
-        self.verbose = verbose
+    def __init__(self, blocking=False, use_cache=True, autotuning=True):
         self.blocking = blocking
         self.use_cache = use_cache
         self.logger = logging.getLogger(__name__)
@@ -76,12 +75,13 @@ class CudaContext(object):
         #Initialize cuda (must be first call to PyCUDA)
         cuda.init(flags=0)
         
+        self.logger.info("PyCUDA version %s", str(pycuda.VERSION_TEXT))
         
         #Print some info about CUDA
         self.logger.info("CUDA version %s", str(cuda.get_version()))
         self.logger.info("Driver version %s", str(cuda.get_driver_version()))
 
         self.cuda_device = cuda.Device(0)
-        if (self.verbose):
         self.logger.info("Using '%s' GPU", self.cuda_device.name())
         self.logger.debug(" => compute capability: %s", str(self.cuda_device.compute_capability()))
         self.logger.debug(" => memory: %d MB", self.cuda_device.total_memory() / (1024*1024))
@@ -102,6 +102,11 @@ class CudaContext(object):
             os.mkdir(self.cache_path)
         self.logger.debug("Using CUDA cache dir %s", self.cache_path)
         
+        self.autotuner = None
+        if (autotuning):
+            self.logger.info("Autotuning enabled. It may take several minutes to run the code the first time: have patience")
+            self.autotuner = Autotuner.Autotuner()
+        
     
     def __del__(self, *args):
         self.logger.info("Cleaning up CUDA context handle <%s>", str(self.cuda_context.handle))
@@ -131,7 +136,7 @@ class CudaContext(object):
         return "CudaContext id " + str(self.cuda_context.handle)
     
     
-    def hash_kernel(kernel_filename, include_dirs, verbose=False):
+    def hash_kernel(kernel_filename, include_dirs):
         # Generate a kernel ID for our caches
         num_includes = 0
         max_includes = 100
@@ -147,7 +152,7 @@ class CudaContext(object):
 
             filename = files.pop()
             
-            logger.debug("Hashing %s", filename)
+            #logger.debug("Hashing %s", filename)
             
             modified = os.path.getmtime(filename)
             
@@ -183,7 +188,7 @@ class CudaContext(object):
     """
     def get_prepared_kernel(self, kernel_filename, kernel_function_name, \
                     prepared_call_args, \
-                    include_dirs=[], verbose=False, no_extern_c=True,
+                    include_dirs=[], no_extern_c=True,
                     **kwargs):
         """
         Helper function to print compilation output
@@ -195,7 +200,7 @@ class CudaContext(object):
             if error_str:
                 self.logger.debug("Error: %s", error_str)
         
-        self.logger.debug("Getting %s", kernel_filename)
+        #self.logger.debug("Getting %s", kernel_filename)
         
         # Create a hash of the kernel (and its includes)
         kwargs_hasher = hashlib.md5()
@@ -206,8 +211,7 @@ class CudaContext(object):
         kernel_hash = root \
                 + "_" + CudaContext.hash_kernel( \
                     os.path.join(self.module_path, kernel_filename), \
-                    include_dirs=[self.module_path] + include_dirs, \
-                    verbose=verbose) \
+                    include_dirs=[self.module_path] + include_dirs) \
                 + "_" + kwargs_hash \
                 + ext
         cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)
@@ -278,55 +282,102 @@ class CudaContext(object):
 
 
 
 """
 Class that holds data
 """
-class CUDAArray2D:
+class CudaArray2D:
     """
     Uploads initial data to the CL device
    """
-    def __init__(self, stream, nx, ny, halo_x, halo_y, data):
+    def __init__(self, stream, nx, ny, x_halo, y_halo, cpu_data):
        self.logger = logging.getLogger(__name__)
        self.nx = nx
        self.ny = ny
-        self.nx_halo = nx + 2*halo_x
-        self.ny_halo = ny + 2*halo_y
+        self.x_halo = x_halo
+        self.y_halo = y_halo
        
-        self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny)
+        nx_halo = nx + 2*x_halo
+        ny_halo = ny + 2*y_halo
+        
+        #self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny)
        
        #Make sure data is in proper format
-        assert np.issubdtype(data.dtype, np.float32), "Wrong datatype: %s" % str(data.dtype)
-        assert not np.isfortran(data), "Wrong datatype (Fortran, expected C)"
-        assert data.shape == (self.ny_halo, self.nx_halo), "Wrong data shape: %s vs %s" % (str(data.shape), str((self.ny_halo, self.nx_halo)))
+        assert np.issubdtype(cpu_data.dtype, np.float32), "Wrong datatype: %s" % str(cpu_data.dtype)
+        assert cpu_data.itemsize == 4, "Wrong size of data type"
+        assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"
 
        #Upload data to the device
-        self.data = pycuda.gpuarray.to_gpu_async(data, stream=stream)
-        
-        self.bytes_per_float = data.itemsize
-        assert(self.bytes_per_float == 4)
-        self.pitch = np.int32((self.nx_halo)*self.bytes_per_float)
-        self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
+        if (cpu_data.shape == (ny_halo, nx_halo)):
+            self.data = pycuda.gpuarray.to_gpu_async(cpu_data, stream=stream)
+        elif (cpu_data.shape == (self.ny, self.nx)):
+            #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
+            self.data = pycuda.gpuarray.empty((ny_halo, nx_halo), cpu_data.dtype)
+            #self.data.fill(0.0)
+            
+            #Create copy object from host to device
+            copy = cuda.Memcpy2D()
+            copy.set_src_host(cpu_data)
+            copy.set_dst_device(self.data.gpudata)
+            
+            #Set offsets and pitch of destination
+            copy.dst_x_in_bytes = self.x_halo*self.data.strides[1]
+            copy.dst_y = self.y_halo
+            copy.dst_pitch = self.data.strides[0]
+            
+            #Set width in bytes to copy for each row and
+            #number of rows to copy
+            copy.width_in_bytes = self.nx*cpu_data.itemsize
+            copy.height = self.ny
+            
+            #Perform the copy
+            copy(stream)
+            stream.synchronize()
+            
+        else:
+            assert False, "Wrong data shape: %s vs %s / %s" % (str(cpu_data.shape), str((self.ny, self.nx)), str((ny_halo, nx_halo)))
+        
+        #self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
        
        
    def __del__(self, *args):
-        self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
+        #self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
        self.data.gpudata.free()
        self.data = None
        
    """
-    Enables downloading data from CL device to Python
+    Enables downloading data from GPU to Python
    """
    def download(self, stream, async=False):
-        #Copy data from device to host
-        if (async):
-            self.logger.debug("Buffer <%s> [%dx%d]: Downloading async ", int(self.data.gpudata), self.nx, self.ny)
-            host_data = self.data.get_async(stream=stream)
-            return host_data
-        else:
-            self.logger.debug("Buffer <%s> [%dx%d]: Downloading synchronously", int(self.data.gpudata), self.nx, self.ny)
-            host_data = self.data.get(stream=stream)#, pagelocked=True) # pagelocked causes crash on windows at least
-            return host_data
+        #self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
+        #Allocate host memory
+        #cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)
+        cpu_data = np.empty((self.ny, self.nx), dtype=np.float32)
+        
+        #Create copy object from device to host
+        copy = cuda.Memcpy2D()
+        copy.set_src_device(self.data.gpudata)
+        copy.set_dst_host(cpu_data)
+        
+        #Set offsets and pitch of source
+        copy.src_x_in_bytes = self.x_halo*self.data.strides[1]
+        copy.src_y = self.y_halo
+        copy.src_pitch = self.data.strides[0]
+        
+        #Set width in bytes to copy for each row and
+        #number of rows to copy
+        copy.width_in_bytes = self.nx*cpu_data.itemsize
+        copy.height = self.ny
+        
+        copy(stream)
+        if async==False:
+            stream.synchronize()
+        
+        return cpu_data
@@ -344,13 +395,13 @@ class SWEDataArakawaA:
     Uploads initial data to the CL device
     """
     def __init__(self, stream, nx, ny, halo_x, halo_y, h0, hu0, hv0):
-        self.h0 = CUDAArray2D(stream, nx, ny, halo_x, halo_y, h0)
-        self.hu0 = CUDAArray2D(stream, nx, ny, halo_x, halo_y, hu0)
-        self.hv0 = CUDAArray2D(stream, nx, ny, halo_x, halo_y, hv0)
+        self.h0 = CudaArray2D(stream, nx, ny, halo_x, halo_y, h0)
+        self.hu0 = CudaArray2D(stream, nx, ny, halo_x, halo_y, hu0)
+        self.hv0 = CudaArray2D(stream, nx, ny, halo_x, halo_y, hv0)
        
-        self.h1 = CUDAArray2D(stream, nx, ny, halo_x, halo_y, h0)
-        self.hu1 = CUDAArray2D(stream, nx, ny, halo_x, halo_y, hu0)
-        self.hv1 = CUDAArray2D(stream, nx, ny, halo_x, halo_y, hv0)
+        self.h1 = CudaArray2D(stream, nx, ny, halo_x, halo_y, h0)
+        self.hu1 = CudaArray2D(stream, nx, ny, halo_x, halo_y, hu0)
+        self.hv1 = CudaArray2D(stream, nx, ny, halo_x, halo_y, hv0)
        
     """
     Swaps the variables after a timestep has been completed
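The rewritten CudaArray2D accepts either a pre-padded (halo included) array, which is uploaded directly with to_gpu_async, or an interior-only array, which is copied into a padded device buffer with cuda.Memcpy2D. A hedged sketch of the two call paths (hypothetical, not part of the commit; assumes a CUDA-capable GPU and that the CudaContext constructor creates and pushes the context):

# Hypothetical sketch of the two upload shapes the new CudaArray2D accepts.
import numpy as np
import pycuda.driver as cuda
from GPUSimulators import Common

ctx = Common.CudaContext(autotuning=False)      # context without autotuning for this sketch
stream = cuda.Stream()
nx, ny, halo = 64, 64, 1

padded   = np.zeros((ny + 2*halo, nx + 2*halo), dtype=np.float32)  # uploaded as-is
interior = np.zeros((ny, nx), dtype=np.float32)                    # copied into a padded buffer

a = Common.CudaArray2D(stream, nx, ny, halo, halo, padded)
b = Common.CudaArray2D(stream, nx, ny, halo, halo, interior)
h = b.download(stream)          # returns the (ny, nx) interior as a numpy array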
@@ -21,7 +21,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 """
 
 #Import packages we need
-from SWESimulators import Simulator
+from GPUSimulators import Simulator
 
 
 
@@ -70,8 +70,8 @@ class FORCE (Simulator.BaseSimulator):
         #Get kernels
         self.kernel = context.get_prepared_kernel("FORCE_kernel.cu", "FORCEKernel", \
                 "iiffffPiPiPiPiPiPi", \
-                BLOCK_WIDTH=block_width, \
-                BLOCK_HEIGHT=block_height)
+                BLOCK_WIDTH=self.local_size[0], \
+                BLOCK_HEIGHT=self.local_size[1])
     
     def __str__(self):
         return "First order centered"
@@ -84,12 +84,12 @@ class FORCE (Simulator.BaseSimulator):
                 self.nx, self.ny, \
                 self.dx, self.dy, dt, \
                 self.g, \
-                self.data.h0.data.gpudata, self.data.h0.pitch, \
-                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                self.data.h1.data.gpudata, self.data.h1.pitch, \
-                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
         self.data.swap()
         self.t += dt
         
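Every kernel launch in this commit now passes data.strides[0] where it used to pass the precomputed pitch. For a C-contiguous float32 array the two are the same number of bytes per row, which this small, purely illustrative numpy check shows:

# Illustrative only: the row stride in bytes equals the old pitch computation.
import numpy as np

nx_halo, ny_halo = 1024 + 2, 1024 + 2
a = np.zeros((ny_halo, nx_halo), dtype=np.float32)
assert a.strides[0] == nx_halo * a.itemsize   # itemsize is 4 for float32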
@@ -20,7 +20,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 """
 
 #Import packages we need
-from SWESimulators import Simulator
+from GPUSimulators import Simulator
 
 
 
@@ -65,8 +65,8 @@ class HLL (Simulator.BaseSimulator):
         #Get kernels
         self.kernel = context.get_prepared_kernel("HLL_kernel.cu", "HLLKernel", \
                 "iiffffPiPiPiPiPiPi", \
-                BLOCK_WIDTH=block_width, \
-                BLOCK_HEIGHT=block_height)
+                BLOCK_WIDTH=self.local_size[0], \
+                BLOCK_HEIGHT=self.local_size[1])
     
     def __str__(self):
         return "Harten-Lax-van Leer"
@@ -79,12 +79,12 @@ class HLL (Simulator.BaseSimulator):
                 self.nx, self.ny, \
                 self.dx, self.dy, dt, \
                 self.g, \
-                self.data.h0.data.gpudata, self.data.h0.pitch, \
-                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                self.data.h1.data.gpudata, self.data.h1.pitch, \
-                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
         self.data.swap()
         self.t += dt
         
@@ -21,7 +21,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 #Import packages we need
 import numpy as np
-from SWESimulators import Simulator
+from GPUSimulators import Simulator
 
 
 
@@ -71,8 +71,8 @@ class HLL2 (Simulator.BaseSimulator):
         #Get kernels
         self.kernel = context.get_prepared_kernel("HLL2_kernel.cu", "HLL2Kernel", \
                 "iifffffiPiPiPiPiPiPi", \
-                BLOCK_WIDTH=block_width, \
-                BLOCK_HEIGHT=block_height)
+                BLOCK_WIDTH=self.local_size[0], \
+                BLOCK_HEIGHT=self.local_size[1])
     
     def __str__(self):
         return "Harten-Lax-van Leer (2nd order)"
@@ -90,12 +90,12 @@ class HLL2 (Simulator.BaseSimulator):
                 self.g, \
                 self.theta, \
                 np.int32(0), \
-                self.data.h0.data.gpudata, self.data.h0.pitch, \
-                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                self.data.h1.data.gpudata, self.data.h1.pitch, \
-                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
         self.data.swap()
         self.t += dt
         
@@ -106,12 +106,12 @@ class HLL2 (Simulator.BaseSimulator):
                 self.g, \
                 self.theta, \
                 np.int32(1), \
-                self.data.h0.data.gpudata, self.data.h0.pitch, \
-                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                self.data.h1.data.gpudata, self.data.h1.pitch, \
-                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
         self.data.swap()
         self.t += dt
         
@@ -21,13 +21,15 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 import logging
 
+from IPython.core import magic_arguments
 from IPython.core.magic import line_magic, Magics, magics_class
 import pycuda.driver as cuda
 
+from GPUSimulators import Common
 
 
 @magics_class
-class CudaContextHandler(Magics):
+class MyIPythonMagic(Magics):
     @line_magic
     def cuda_context_handler(self, context_name):
         self.logger = logging.getLogger(__name__)
@@ -39,7 +41,8 @@ class CudaContextHandler(Magics):
             return
         else:
             self.logger.debug("Creating context")
-            self.shell.ex(context_name + " = Common.CudaContext(verbose=True, blocking=False)")
+            #self.shell.ex(context_name + " = Common.CudaContext(blocking=False)")
+            self.shell.user_ns[context_name] = Common.CudaContext(blocking=False)
         
         # this function will be called on exceptions in any cell
         def custom_exc(shell, etype, evalue, tb, tb_offset=None):
@@ -51,11 +54,14 @@ class CudaContextHandler(Magics):
 
             if context_name in self.shell.user_ns.keys():
                 self.logger.info("Pushing <%s>", str(self.shell.user_ns[context_name].cuda_context.handle))
-                self.shell.ex(context_name + ".cuda_context.push()")
+                #self.shell.ex(context_name + ".cuda_context.push()")
+                self.shell.user_ns[context_name].cuda_context.push()
             else:
                 self.logger.error("No CUDA context called %s found (something is wrong)", context_name)
                 self.logger.error("CUDA will not work now")
             
+            self.logger.debug("==================================================================")
             
             # still show the error within the notebook, don't just swallow it
             shell.showtraceback((etype, evalue, tb), tb_offset=tb_offset)
 
@@ -71,10 +77,42 @@ class CudaContextHandler(Magics):
             context = cuda.Context.get_current()
             self.logger.info("`-> Popping <%s>", str(context.handle))
             cuda.Context.pop()
+            self.logger.debug("==================================================================")
         atexit.register(exitfunc)
         
-logger = logging.getLogger(__name__)
-logger.info("Registering automatic CUDA context handling")
-logger.debug("(use %cuda_context_handler my_context to create a context called my_context")
+    @line_magic
+    @magic_arguments.magic_arguments()
+    @magic_arguments.argument(
+        '--out', '-o', type=str, default='output.log', help='The filename to store the log to')
+    @magic_arguments.argument(
+        '--level', '-l', type=int, default=20, help='The level of logging to screen [0, 50]')
+    @magic_arguments.argument(
+        '--file_level', '-f', type=int, default=10, help='The level of logging to file [0, 50]')
+    def setup_logging(self, line):
+        args = magic_arguments.parse_argstring(self.setup_logging, line)
+        import sys
+        
+        #Get root logger
+        logger = logging.getLogger('')
+        logger.setLevel(min(args.level, args.file_level))
+        
+        #Add log to screen
+        ch = logging.StreamHandler()
+        ch.setLevel(args.level)
+        logger.addHandler(ch)
+        logger.log(args.level, "Console logger using level %s", logging.getLevelName(args.level))
+        
+        #Add log to file
+        logger.log(args.level, "File logger using level %s to %s", logging.getLevelName(args.file_level), args.out)
+        fh = logging.FileHandler(args.out)
+        formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s')
+        fh.setFormatter(formatter)
+        fh.setLevel(args.file_level)
+        logger.addHandler(fh)
+        
+        logger.info("Python version %s", sys.version)
+
+# Register
 ip = get_ipython()
-ip.register_magics(CudaContextHandler)
+ip.register_magics(MyIPythonMagic)
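In a notebook, the renamed magic class exposes two line magics: %cuda_context_handler (unchanged) and the new %setup_logging. A hypothetical cell, assuming the module has been imported or %run so that ip.register_magics(MyIPythonMagic) has already executed:

# Hypothetical IPython cell, not part of the commit.
%setup_logging --level 20 --file_level 10 --out output.log
%cuda_context_handler my_context
# after the second line, my_context is a Common.CudaContext in the user namespace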
@@ -26,7 +26,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 #Import packages we need
 import numpy as np
-from SWESimulators import Simulator
+from GPUSimulators import Simulator
 
 
 
@@ -72,8 +72,8 @@ class KP07 (Simulator.BaseSimulator):
         #Get kernels
         self.kernel = context.get_prepared_kernel("KP07_kernel.cu", "KP07Kernel", \
                 "iifffffiPiPiPiPiPiPi", \
-                BLOCK_WIDTH=block_width, \
-                BLOCK_HEIGHT=block_height)
+                BLOCK_WIDTH=self.local_size[0], \
+                BLOCK_HEIGHT=self.local_size[1])
     
     def __str__(self):
         return "Kurganov-Petrova 2007"
@@ -88,12 +88,12 @@ class KP07 (Simulator.BaseSimulator):
                 self.g, \
                 self.theta, \
                 np.int32(substep), \
-                self.data.h0.data.gpudata, self.data.h0.pitch, \
-                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                self.data.h1.data.gpudata, self.data.h1.pitch, \
-                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
         self.data.swap()
     
     def stepEuler(self, dt):
@@ -26,7 +26,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 #Import packages we need
 import numpy as np
-from SWESimulators import Simulator
+from GPUSimulators import Simulator
 
 
 
@@ -72,8 +72,8 @@ class KP07_dimsplit (Simulator.BaseSimulator):
         #Get kernels
         self.kernel = context.get_prepared_kernel("KP07_dimsplit_kernel.cu", "KP07DimsplitKernel", \
                 "iifffffiPiPiPiPiPiPi", \
-                BLOCK_WIDTH=block_width, \
-                BLOCK_HEIGHT=block_height)
+                BLOCK_WIDTH=self.local_size[0], \
+                BLOCK_HEIGHT=self.local_size[1])
     
     def __str__(self):
         return "Kurganov-Petrova 2007 dimensionally split"
@@ -91,12 +91,12 @@ class KP07_dimsplit (Simulator.BaseSimulator):
                 self.g, \
                 self.theta, \
                 np.int32(0), \
-                self.data.h0.data.gpudata, self.data.h0.pitch, \
-                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                self.data.h1.data.gpudata, self.data.h1.pitch, \
-                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
         self.data.swap()
         self.t += dt
         
@@ -107,12 +107,12 @@ class KP07_dimsplit (Simulator.BaseSimulator):
                 self.g, \
                 self.theta, \
                 np.int32(1), \
-                self.data.h0.data.gpudata, self.data.h0.pitch, \
-                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                self.data.h1.data.gpudata, self.data.h1.pitch, \
-                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
         self.data.swap()
         self.t += dt
         
@@ -21,7 +21,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 """
 
 #Import packages we need
-from SWESimulators import Simulator
+from GPUSimulators import Simulator
 
 
 
@@ -66,8 +66,8 @@ class LxF (Simulator.BaseSimulator):
         # Get kernels
         self.kernel = context.get_prepared_kernel("LxF_kernel.cu", "LxFKernel", \
                 "iiffffPiPiPiPiPiPi", \
-                BLOCK_WIDTH=block_width, \
-                BLOCK_HEIGHT=block_height)
+                BLOCK_WIDTH=self.local_size[0], \
+                BLOCK_HEIGHT=self.local_size[1])
     
     def __str__(self):
         return "Lax Friedrichs"
@@ -80,12 +80,12 @@ class LxF (Simulator.BaseSimulator):
                 self.nx, self.ny, \
                 self.dx, self.dy, dt, \
                 self.g, \
-                self.data.h0.data.gpudata, self.data.h0.pitch, \
-                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                self.data.h1.data.gpudata, self.data.h1.pitch, \
-                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
         self.data.swap()
         self.t += dt
         
@@ -28,7 +28,7 @@ import pycuda.compiler as cuda_compiler
 import pycuda.gpuarray
 import pycuda.driver as cuda
 
-from SWESimulators import Common
+from GPUSimulators import Common
 
 
 class BaseSimulator:
@@ -57,6 +57,14 @@ class BaseSimulator:
         #Get logger
         self.logger = logging.getLogger(__name__ + "." + self.__class__.__name__)
         
+        self.context = context
+        
+        if (self.context.autotuner):
+            peak_configuration = self.context.autotuner.get_peak_performance(self.__class__)
+            block_width = int(peak_configuration["block_width"])
+            block_height = int(peak_configuration["block_height"])
+            self.logger.debug("Used autotuning to get block size [%d x %d]", block_width, block_height)
+        
         #Create a CUDA stream
         self.stream = cuda.Stream()
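With this change, a context created with autotuning enabled is all a simulator needs in order to pick its block size: BaseSimulator asks the context's autotuner for the peak configuration of its own class. A hedged sketch (hypothetical, not part of the commit; assumes a CUDA-capable GPU, and the first run may take several minutes while the benchmarks execute):

# Hypothetical sketch, not part of the commit.
from GPUSimulators import Common

ctx = Common.CudaContext(autotuning=True)   # builds or loads autotuning_data_<hostname>.npz
# Any Simulator.BaseSimulator subclass constructed with context=ctx will call
# ctx.autotuner.get_peak_performance(<scheme class>) and use that block size.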
@@ -22,7 +22,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 #Import packages we need
 import numpy as np
-from SWESimulators import Simulator
+from GPUSimulators import Simulator
 
 
 
@@ -65,8 +65,8 @@ class WAF (Simulator.BaseSimulator):
         #Get kernels
         self.kernel = context.get_prepared_kernel("WAF_kernel.cu", "WAFKernel", \
                 "iiffffiPiPiPiPiPiPi", \
-                BLOCK_WIDTH=block_width, \
-                BLOCK_HEIGHT=block_height)
+                BLOCK_WIDTH=self.local_size[0], \
+                BLOCK_HEIGHT=self.local_size[1])
     
     def __str__(self):
         return "Weighted average flux"
@@ -83,12 +83,12 @@ class WAF (Simulator.BaseSimulator):
                 self.dx, self.dy, dt, \
                 self.g, \
                 np.int32(0), \
-                self.data.h0.data.gpudata, self.data.h0.pitch, \
-                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                self.data.h1.data.gpudata, self.data.h1.pitch, \
-                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
         self.data.swap()
         self.t += dt
         
@@ -98,11 +98,11 @@ class WAF (Simulator.BaseSimulator):
                 self.dx, self.dy, dt, \
                 self.g, \
                 np.int32(1), \
-                self.data.h0.data.gpudata, self.data.h0.pitch, \
-                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                self.data.h1.data.gpudata, self.data.h1.pitch, \
-                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
         self.data.swap()
         self.t += dt