diff --git a/GPUSimulators/FORCE_kernel.cu b/GPUSimulators/FORCE_kernel.cu
index 017a05c..bedb0d7 100644
--- a/GPUSimulators/FORCE_kernel.cu
+++ b/GPUSimulators/FORCE_kernel.cu
@@ -120,9 +120,9 @@ __global__ void FORCEKernel(
     
     
     //Read into shared memory
-    readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>( h0_ptr_,  h0_pitch_, Q[0], nx_+2, ny_+2);
-    readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2);
-    readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2);
+    float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
+    int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
+    readBlock<3, BLOCK_WIDTH+2, BLOCK_HEIGHT+2, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+2, ny_+2);
     __syncthreads();
     
     //Set boundary conditions
diff --git a/GPUSimulators/HLL2_kernel.cu b/GPUSimulators/HLL2_kernel.cu
index a5d3156..7909a57 100644
--- a/GPUSimulators/HLL2_kernel.cu
+++ b/GPUSimulators/HLL2_kernel.cu
@@ -166,9 +166,9 @@ __global__ void HLL2Kernel(
     
     
     //Read into shared memory
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>( h0_ptr_,  h0_pitch_, Q[0], nx_+4, ny_+4);
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hu0_ptr_, hu0_pitch_, Q[1], nx_+4, ny_+4);
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hv0_ptr_, hv0_pitch_, Q[2], nx_+4, ny_+4);
+    float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
+    int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
+    readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4);
     __syncthreads();
     
     //Set boundary conditions
diff --git a/GPUSimulators/HLL_kernel.cu b/GPUSimulators/HLL_kernel.cu
index 88fffd5..a86963b 100644
--- a/GPUSimulators/HLL_kernel.cu
+++ b/GPUSimulators/HLL_kernel.cu
@@ -128,9 +128,9 @@ __global__ void HLLKernel(
     
     
     //Read into shared memory
-    readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>( h0_ptr_,  h0_pitch_, Q[0], nx_+2, ny_+2);
-    readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2);
-    readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2);
+    float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
+    int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
+    readBlock<3, BLOCK_WIDTH+2, BLOCK_HEIGHT+2, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+2, ny_+2);
     __syncthreads();
 
     //Set boundary conditions
diff --git a/GPUSimulators/KP07_dimsplit_kernel.cu b/GPUSimulators/KP07_dimsplit_kernel.cu
index 4ab3c78..7d9fd9d 100644
--- a/GPUSimulators/KP07_dimsplit_kernel.cu
+++ b/GPUSimulators/KP07_dimsplit_kernel.cu
@@ -157,9 +157,9 @@ __global__ void KP07DimsplitKernel(
     
     
     //Read into shared memory
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>( h0_ptr_,  h0_pitch_, Q[0], nx_+4, ny_+4);
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hu0_ptr_, hu0_pitch_, Q[1], nx_+4, ny_+4);
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hv0_ptr_, hv0_pitch_, Q[2], nx_+4, ny_+4);
+    float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
+    int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
+    readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4);
     __syncthreads();
     
     
diff --git a/GPUSimulators/KP07_kernel.cu b/GPUSimulators/KP07_kernel.cu
index 8a22a55..dbd1964 100644
--- a/GPUSimulators/KP07_kernel.cu
+++ b/GPUSimulators/KP07_kernel.cu
@@ -141,9 +141,9 @@ __global__ void KP07Kernel(
     
     
     //Read into shared memory
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>( h0_ptr_,  h0_pitch_, Q[0], nx_+4, ny_+4);
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hu0_ptr_, hu0_pitch_, Q[1], nx_+4, ny_+4);
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hv0_ptr_, hv0_pitch_, Q[2], nx_+4, ny_+4);
+    float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
+    int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
+    readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4);
     __syncthreads();
     
     
diff --git a/GPUSimulators/LxF_kernel.cu b/GPUSimulators/LxF_kernel.cu
index 63dc0f8..ff094ad 100644
--- a/GPUSimulators/LxF_kernel.cu
+++ b/GPUSimulators/LxF_kernel.cu
@@ -98,6 +98,7 @@ void computeFluxG(float Q[3][block_height+2][block_width+2],
 }
 
 
+
 extern "C" {
 __global__ 
 void LxFKernel(
@@ -122,10 +123,10 @@ void LxFKernel(
     __shared__ float F[3][BLOCK_HEIGHT][BLOCK_WIDTH+1];
     __shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH];
     
-    //Read into shared memory including ghost cells
-    readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>( h0_ptr_,  h0_pitch_, Q[0], nx_+2, ny_+2);
-    readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2);
-    readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2);
+    float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
+    int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
+    
+    readBlock<3, BLOCK_WIDTH+2, BLOCK_HEIGHT+2, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+2, ny_+2);
     __syncthreads();
     
     //Set boundary conditions
@@ -137,6 +138,7 @@ void LxFKernel(
     computeFluxG<BLOCK_WIDTH, BLOCK_HEIGHT>(Q, G, g_, dy_, dt_);
     __syncthreads();
     
+
     //Evolve for all cells
     const int i = tx + 1; //Skip local ghost cells, i.e., +1
     const int j = ty + 1;
diff --git a/GPUSimulators/WAF_kernel.cu b/GPUSimulators/WAF_kernel.cu
index 20c42da..b35e5dd 100644
--- a/GPUSimulators/WAF_kernel.cu
+++ b/GPUSimulators/WAF_kernel.cu
@@ -138,9 +138,9 @@ __global__ void WAFKernel(
     
     
     //Read into shared memory Q from global memory
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(h0_ptr_, h0_pitch_, Q[0], nx_+3, ny_+3);
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hu0_ptr_, hu0_pitch_, Q[1], nx_+3, ny_+3);
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hv0_ptr_, hv0_pitch_, Q[2], nx_+3, ny_+3);
+    float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
+    int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
+    readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4);
     __syncthreads();
     
     
diff --git a/GPUSimulators/common.cu b/GPUSimulators/common.cu
index ce96dff..2f3f057 100644
--- a/GPUSimulators/common.cu
+++ b/GPUSimulators/common.cu
@@ -53,9 +53,9 @@ inline __device__ __host__ float clamp(const float f, const float a, const float
 /**
   * Reads a block of data  with one ghost cell for the shallow water equations
   */
-template<int sm_width, int sm_height>
-__device__ void readBlock(float* ptr_, int pitch_,
-                float shmem[sm_height][sm_width], 
+template<int sm_width, int sm_height, int block_width, int block_height>
+inline __device__ void readBlock(float* ptr_, int pitch_,
+                float (&shmem)[sm_height][sm_width], 
                 const int max_x_, const int max_y_) {
     
     //Index of block within domain
@@ -63,13 +63,13 @@ __device__ void readBlock(float* ptr_, int pitch_,
     const int by = blockDim.y * blockIdx.y;
     
     //Read into shared memory
-    for (int j=threadIdx.y; j<sm_height; j+=blockDim.y) {
+    for (int j=threadIdx.y; j<sm_height; j+=block_height) {
         const int l = clamp(by + j, 0, max_y_-1); // Clamp out of bounds
         
         //Compute the pointer to current row in the arrays
         const float* const row  = (float*) ((char*) ptr_  + pitch_*l);
         
-        for (int i=threadIdx.x; i<sm_width; i+=blockDim.x) {
+        for (int i=threadIdx.x; i<sm_width; i+=block_width) {
             const int k = clamp(bx + i, 0, max_x_-1); // Clamp out of bounds
 
             shmem[j][i] = row[k];
@@ -78,6 +78,40 @@ __device__ void readBlock(float* ptr_, int pitch_,
 }
 
 
+/**
+  * Reads a block of data with ghost cells
+  */
+template<int vars, int sm_width, int sm_height, int block_width, int block_height>
+inline __device__ void readBlock(float* ptr_[vars], int pitch_[vars],
+                float shmem[vars][sm_height][sm_width], 
+                const int max_x_, const int max_y_) {
+    
+    //Index of block within domain
+    const int bx = blockDim.x * blockIdx.x;
+    const int by = blockDim.y * blockIdx.y;
+    
+    float* rows[3];
+    
+    //Read into shared memory
+    for (int j=threadIdx.y; j<sm_height; j+=block_height) {
+        const int l = clamp(by + j, 0, max_y_-1); // Clamp out of bounds
+        
+        //Compute the pointer to current row in the arrays
+        for (int m=0; m<vars; ++m) {
+            rows[m] = (float*) ((char*) ptr_[m]  + pitch_[m]*l);
+        }
+        
+        for (int i=threadIdx.x; i<sm_width; i+=block_width) {
+            const int k = clamp(bx + i, 0, max_x_-1); // Clamp out of bounds
+
+            for (int m=0; m<vars; ++m) {
+                shmem[m][j][i] = rows[m][k];
+            }
+        }
+    }
+}
+
+
 
 
 
@@ -85,7 +119,7 @@ __device__ void readBlock(float* ptr_, int pitch_,
   * Writes a block of data to global memory for the shallow water equations.
   */
 template<int sm_width, int sm_height, int offset_x=0, int offset_y=0>
-__device__ void writeBlock(float* ptr_, int pitch_,
+inline __device__ void writeBlock(float* ptr_, int pitch_,
                  float shmem[sm_height][sm_width],
                  const int width, const int height) {