diff --git a/GPUSimulators/cuda/SWE2D_FORCE.cu b/GPUSimulators/cuda/SWE2D_FORCE.cu
index 0762029..1db7b17 100644
--- a/GPUSimulators/cuda/SWE2D_FORCE.cu
+++ b/GPUSimulators/cuda/SWE2D_FORCE.cu
@@ -117,6 +117,7 @@ __global__ void FORCEKernel(
     const unsigned int w = BLOCK_WIDTH;
     const unsigned int h = BLOCK_HEIGHT;
     const unsigned int gc = 1;
+    const unsigned int vars = 3;
     
     __shared__ float Q[3][h+2][w+2];
     __shared__ float F[3][h+1][w+1];
@@ -136,10 +137,7 @@ __global__ void FORCEKernel(
     //Compute flux along x, and evolve
     computeFluxF(Q, F, g_, dx_, dt_);
     __syncthreads();
-    
-    evolveF<w, h, gc>(Q[0], F[0], dx_, dt_);
-    evolveF<w, h, gc>(Q[1], F[1], dx_, dt_);
-    evolveF<w, h, gc>(Q[2], F[2], dx_, dt_);
+    evolveF<w, h, gc, vars>(Q, F, dx_, dt_);
     __syncthreads();
     
     //Set boundary conditions
@@ -151,10 +149,7 @@ __global__ void FORCEKernel(
     //Compute flux along y, and evolve
     computeFluxG(Q, F, g_, dy_, dt_);
     __syncthreads();
-    
-    evolveG<w, h, gc>(Q[0], F[0], dy_, dt_);
-    evolveG<w, h, gc>(Q[1], F[1], dy_, dt_);
-    evolveG<w, h, gc>(Q[2], F[2], dy_, dt_);
+    evolveG<w, h, gc, vars>(Q, F, dy_, dt_);
     __syncthreads();
     
     //Write to main memory
diff --git a/GPUSimulators/cuda/SWE2D_HLL.cu b/GPUSimulators/cuda/SWE2D_HLL.cu
index 8b0c2cb..c05552c 100644
--- a/GPUSimulators/cuda/SWE2D_HLL.cu
+++ b/GPUSimulators/cuda/SWE2D_HLL.cu
@@ -124,6 +124,7 @@ __global__ void HLLKernel(
     const unsigned int w = BLOCK_WIDTH;
     const unsigned int h = BLOCK_HEIGHT;
     const unsigned int gc = 1;
+    const unsigned int vars = 3;
     
     //Shared memory variables
     __shared__ float Q[3][h+2][w+2];
@@ -145,9 +146,7 @@ __global__ void HLLKernel(
     computeFluxF(Q, F, g_);
     __syncthreads();
     
-    evolveF<w, h, gc>(Q[0], F[0], dx_, dt_);
-    evolveF<w, h, gc>(Q[1], F[1], dx_, dt_);
-    evolveF<w, h, gc>(Q[2], F[2], dx_, dt_);
+    evolveF<w, h, gc, vars>(Q, F, dx_, dt_);
     __syncthreads();
     
     //Set boundary conditions
@@ -160,9 +159,7 @@ __global__ void HLLKernel(
     computeFluxG(Q, F, g_);
     __syncthreads();
     
-    evolveG<w, h, gc>(Q[0], F[0], dy_, dt_);
-    evolveG<w, h, gc>(Q[1], F[1], dy_, dt_);
-    evolveG<w, h, gc>(Q[2], F[2], dy_, dt_);
+    evolveG<w, h, gc, vars>(Q, F, dy_, dt_);
     __syncthreads();
     
     // Write to main memory for all internal cells
diff --git a/GPUSimulators/cuda/SWE2D_HLL2.cu b/GPUSimulators/cuda/SWE2D_HLL2.cu
index 40910cc..dc39089 100644
--- a/GPUSimulators/cuda/SWE2D_HLL2.cu
+++ b/GPUSimulators/cuda/SWE2D_HLL2.cu
@@ -159,6 +159,7 @@ __global__ void HLL2Kernel(
     const unsigned int w = BLOCK_WIDTH;
     const unsigned int h = BLOCK_HEIGHT;
     const unsigned int gc = 2;
+    const unsigned int vars = 3;
             
     //Shared memory variables
     __shared__ float  Q[3][h+4][w+4];
@@ -184,10 +185,7 @@ __global__ void HLL2Kernel(
         __syncthreads();
         computeFluxF(Q, Qx, F, g_, dx_, dt_);
         __syncthreads();
-        
-        evolveF<w, h, gc>(Q[0], F[0], dx_, dt_);
-        evolveF<w, h, gc>(Q[1], F[1], dx_, dt_);
-        evolveF<w, h, gc>(Q[2], F[2], dx_, dt_);
+        evolveF<w, h, gc, vars>(Q, F, dx_, dt_);
         __syncthreads();
         
         //Set boundary conditions
@@ -201,10 +199,7 @@ __global__ void HLL2Kernel(
         __syncthreads();
         computeFluxG(Q, Qx, F, g_, dy_, dt_);
         __syncthreads();
-        
-        evolveG<w, h, gc>(Q[0], F[0], dy_, dt_);
-        evolveG<w, h, gc>(Q[1], F[1], dy_, dt_);
-        evolveG<w, h, gc>(Q[2], F[2], dy_, dt_);
+        evolveG<w, h, gc, vars>(Q, F, dy_, dt_);
         __syncthreads();
     }
     //Step 1 => evolve y first, then x
@@ -214,10 +209,7 @@ __global__ void HLL2Kernel(
         __syncthreads();
         computeFluxG(Q, Qx, F, g_, dy_, dt_);
         __syncthreads();
-        
-        evolveG<w, h, gc>(Q[0], F[0], dy_, dt_);
-        evolveG<w, h, gc>(Q[1], F[1], dy_, dt_);
-        evolveG<w, h, gc>(Q[2], F[2], dy_, dt_);
+        evolveG<w, h, gc, vars>(Q, F, dy_, dt_);
         __syncthreads();
         
         //Set boundary conditions
@@ -231,10 +223,7 @@ __global__ void HLL2Kernel(
         __syncthreads();
         computeFluxF(Q, Qx, F, g_, dx_, dt_);
         __syncthreads();
-        
-        evolveF<w, h, gc>(Q[0], F[0], dx_, dt_);
-        evolveF<w, h, gc>(Q[1], F[1], dx_, dt_);
-        evolveF<w, h, gc>(Q[2], F[2], dx_, dt_);
+        evolveF<w, h, gc, vars>(Q, F, dx_, dt_);
         __syncthreads();
     }
     
diff --git a/GPUSimulators/cuda/SWE2D_KP07.cu b/GPUSimulators/cuda/SWE2D_KP07.cu
index 09b44a5..98d2c5b 100644
--- a/GPUSimulators/cuda/SWE2D_KP07.cu
+++ b/GPUSimulators/cuda/SWE2D_KP07.cu
@@ -122,6 +122,7 @@ __global__ void KP07Kernel(
     const unsigned int w = BLOCK_WIDTH;
     const unsigned int h = BLOCK_HEIGHT;
     const unsigned int gc = 2;
+    const unsigned int vars = 3;
         
     //Index of thread within block
     const int tx = threadIdx.x;
diff --git a/GPUSimulators/cuda/SWE2D_KP07_dimsplit.cu b/GPUSimulators/cuda/SWE2D_KP07_dimsplit.cu
index 2957dcc..2ab0c70 100644
--- a/GPUSimulators/cuda/SWE2D_KP07_dimsplit.cu
+++ b/GPUSimulators/cuda/SWE2D_KP07_dimsplit.cu
@@ -150,6 +150,7 @@ __global__ void KP07DimsplitKernel(
     const unsigned int w = BLOCK_WIDTH;
     const unsigned int h = BLOCK_HEIGHT;
     const unsigned int gc = 2;
+    const unsigned int vars = 3;
         
         
     //Shared memory variables
@@ -181,9 +182,7 @@ __global__ void KP07DimsplitKernel(
         __syncthreads();
         computeFluxF(Q, Qx, F, g_, dx_, dt_);
         __syncthreads();
-        evolveF<w, h, gc>(Q[0], F[0], dx_, dt_);
-        evolveF<w, h, gc>(Q[1], F[1], dx_, dt_);
-        evolveF<w, h, gc>(Q[2], F[2], dx_, dt_);
+        evolveF<w, h, gc, vars>(Q, F, dx_, dt_);
         __syncthreads();
         
         //Set boundary conditions
@@ -200,10 +199,7 @@ __global__ void KP07DimsplitKernel(
         
         computeFluxG(Q, Qx, F, g_, dy_, dt_);
         __syncthreads();
-        
-        evolveG<w, h, gc>(Q[0], F[0], dy_, dt_);
-        evolveG<w, h, gc>(Q[1], F[1], dy_, dt_);
-        evolveG<w, h, gc>(Q[2], F[2], dy_, dt_);
+        evolveG<w, h, gc, vars>(Q, F, dy_, dt_);
         __syncthreads();
     }
     //Step 1 => evolve y first, then x
@@ -213,10 +209,7 @@ __global__ void KP07DimsplitKernel(
         __syncthreads();
         computeFluxG(Q, Qx, F, g_, dy_, dt_);
         __syncthreads();
-        
-        evolveG<w, h, gc>(Q[0], F[0], dy_, dt_);
-        evolveG<w, h, gc>(Q[1], F[1], dy_, dt_);
-        evolveG<w, h, gc>(Q[2], F[2], dy_, dt_);
+        evolveG<w, h, gc, vars>(Q, F, dy_, dt_);
         __syncthreads();
         
         //Set boundary conditions
@@ -230,9 +223,7 @@ __global__ void KP07DimsplitKernel(
         __syncthreads();
         computeFluxF(Q, Qx, F, g_, dx_, dt_);
         __syncthreads();
-        evolveF<w, h, gc>(Q[0], F[0], dx_, dt_);
-        evolveF<w, h, gc>(Q[1], F[1], dx_, dt_);
-        evolveF<w, h, gc>(Q[2], F[2], dx_, dt_);
+        evolveF<w, h, gc, vars>(Q, F, dx_, dt_);
         __syncthreads();
     }
     
diff --git a/GPUSimulators/cuda/SWE2D_WAF.cu b/GPUSimulators/cuda/SWE2D_WAF.cu
index 1b80e4d..7a56d8e 100644
--- a/GPUSimulators/cuda/SWE2D_WAF.cu
+++ b/GPUSimulators/cuda/SWE2D_WAF.cu
@@ -134,6 +134,7 @@ __global__ void WAFKernel(
     const unsigned int w = BLOCK_WIDTH;
     const unsigned int h = BLOCK_HEIGHT;
     const unsigned int gc = 2;
+    const unsigned int vars = 3;
          
     //Shared memory variables
     __shared__ float Q[3][h+4][w+4];
@@ -161,10 +162,7 @@ __global__ void WAFKernel(
         //Compute fluxes along the x axis and evolve
         computeFluxF(Q, F, g_, dx_, dt_);
         __syncthreads();
-        
-        evolveF<w, h, gc>(Q[0], F[0], dx_, dt_);
-        evolveF<w, h, gc>(Q[1], F[1], dx_, dt_);
-        evolveF<w, h, gc>(Q[2], F[2], dx_, dt_);
+        evolveF<w, h, gc, vars>(Q, F, dx_, dt_);
         __syncthreads();
         
         //Fix boundary conditions
@@ -176,10 +174,7 @@ __global__ void WAFKernel(
         //Compute fluxes along the y axis and evolve
         computeFluxG(Q, F, g_, dy_, dt_);
         __syncthreads();
-        
-        evolveG<w, h, gc>(Q[0], F[0], dy_, dt_);
-        evolveG<w, h, gc>(Q[1], F[1], dy_, dt_);
-        evolveG<w, h, gc>(Q[2], F[2], dy_, dt_);
+        evolveG<w, h, gc, vars>(Q, F, dy_, dt_);
         __syncthreads();
     }
     //Step 1 => evolve y first, then x
@@ -187,10 +182,7 @@ __global__ void WAFKernel(
         //Compute fluxes along the y axis and evolve
         computeFluxG(Q, F, g_, dy_, dt_);
         __syncthreads();
-        
-        evolveG<w, h, gc>(Q[0], F[0], dy_, dt_);
-        evolveG<w, h, gc>(Q[1], F[1], dy_, dt_);
-        evolveG<w, h, gc>(Q[2], F[2], dy_, dt_);
+        evolveG<w, h, gc, vars>(Q, F, dy_, dt_);
         __syncthreads();
         
         //Fix boundary conditions
@@ -202,10 +194,7 @@ __global__ void WAFKernel(
         //Compute fluxes along the x axis and evolve
         computeFluxF(Q, F, g_, dx_, dt_);
         __syncthreads();
-        
-        evolveF<w, h, gc>(Q[0], F[0], dx_, dt_);
-        evolveF<w, h, gc>(Q[1], F[1], dx_, dt_);
-        evolveF<w, h, gc>(Q[2], F[2], dx_, dt_);
+        evolveF<w, h, gc, vars>(Q, F, dx_, dt_);
         __syncthreads();
     }
 
diff --git a/GPUSimulators/cuda/common.h b/GPUSimulators/cuda/common.h
index 9fe08ff..d0c53bc 100644
--- a/GPUSimulators/cuda/common.h
+++ b/GPUSimulators/cuda/common.h
@@ -245,9 +245,9 @@ __device__ void noFlowBoundary(float Q[block_height+2*ghost_cells][block_width+2
 
 
 
-template<int block_width, int block_height, int ghost_cells>
-__device__ void evolveF(float Q[block_height+2*ghost_cells][block_width+2*ghost_cells],
-              float F[block_height+1][block_width+1],
+template<int block_width, int block_height, int ghost_cells, int vars>
+__device__ void evolveF(float Q[vars][block_height+2*ghost_cells][block_width+2*ghost_cells],
+              float F[vars][block_height+1][block_width+1],
               const float dx_, const float dt_) {
     //Index of thread within block
     const int tx = threadIdx.x;
@@ -260,8 +260,9 @@ __device__ void evolveF(float Q[block_height+2*ghost_cells][block_width+2*ghost_
     //const int ti = blockDim.x*blockIdx.x + threadIdx.x + ghost_cells; //Skip global ghost cells, i.e., +1
     //const int tj = blockDim.y*blockIdx.y + threadIdx.y + ghost_cells;
     //if (ti > ghost_cells-1 && ti < nx_+ghost_cells && tj > ghost_cells-1 && tj < ny_+ghost_cells) {
-    Q[j][i] = Q[j][i] + (F[ty][tx] - F[ty][tx+1]) * dt_ / dx_;
-    
+    for (int var=0; var < vars; ++var) {
+        Q[var][j][i] = Q[var][j][i] + (F[var][ty][tx] - F[var][ty][tx+1]) * dt_ / dx_;
+    }
 }
 
 
@@ -272,9 +273,9 @@ __device__ void evolveF(float Q[block_height+2*ghost_cells][block_width+2*ghost_
 /**
   * Evolves the solution in time along the y axis (dimensional splitting)
   */
-template<int block_width, int block_height, int ghost_cells>
-__device__ void evolveG(float Q[block_height+2*ghost_cells][block_width+2*ghost_cells],
-              float G[block_height+1][block_width+1],
+template<int block_width, int block_height, int ghost_cells, int vars>
+__device__ void evolveG(float Q[vars][block_height+2*ghost_cells][block_width+2*ghost_cells],
+              float G[vars][block_height+1][block_width+1],
               const float dy_, const float dt_) {
     //Index of thread within block
     const int tx = threadIdx.x;
@@ -283,7 +284,9 @@ __device__ void evolveG(float Q[block_height+2*ghost_cells][block_width+2*ghost_
     const int i = tx + ghost_cells; //Skip local ghost cells, i.e., +1
     const int j = ty + ghost_cells;
     
-    Q[j][i] = Q[j][i] + (G[ty][tx] - G[ty+1][tx]) * dt_ / dy_;
+    for (int var=0; var < vars; ++var) {
+        Q[var][j][i] = Q[var][j][i] + (G[var][ty][tx] - G[var][ty+1][tx]) * dt_ / dy_;
+    }
 }