diff --git a/GPUSimulators/FORCE.py b/GPUSimulators/FORCE.py
index 616d965..8df04b1 100644
--- a/GPUSimulators/FORCE.py
+++ b/GPUSimulators/FORCE.py
@@ -41,7 +41,9 @@ class FORCE (Simulator.BaseSimulator):
                  g, 
                  cfl_scale=0.9,
                  boundary_conditions=BoundaryCondition(), 
-                 block_width=16, block_height=16):
+                 block_width=16, block_height=16,
+                 dt: float=None,
+                 compile_opts: list[str]=[]):
         """
         Initialization routine
         
@@ -55,6 +57,7 @@ class FORCE (Simulator.BaseSimulator):
             dy: Grid cell spacing along y-axis (20 000 m)
             dt: Size of each timestep (90 s)
             g: Gravitational accelleration (9.81 m/s^2)
+            compile_opts: Extra nvcc compiler options, appended after the default "--use_fast_math"
         """
                  
         # Call super constructor
@@ -75,11 +78,11 @@ class FORCE (Simulator.BaseSimulator):
                                         }, 
                                         compile_args={
                                             'no_extern_c': True,
-                                            'options': ["--use_fast_math"], 
+                                            'options': ["--use_fast_math"] + compile_opts, 
                                         }, 
                                         jit_compile_args={})
         self.kernel = module.get_function("FORCEKernel")
-        self.kernel.prepare("iiffffiPiPiPiPiPiPiP")
+        self.kernel.prepare("iiffffiPiPiPiPiPiPiPiiii")
     
         #Create data by uploading to device
         self.u0 = Common.ArakawaA2D(self.stream, 
@@ -91,10 +94,15 @@ class FORCE (Simulator.BaseSimulator):
                         1, 1, 
                         [None, None, None])
         self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
-        dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
-        dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
-        dt = min(dt_x, dt_y)
-        self.cfl_data.fill(dt, stream=self.stream)
+
+        if dt == None:
+            dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
+            dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
+            self.dt = min(dt_x, dt_y)
+        else:
+            self.dt = dt
+
+        self.cfl_data.fill(self.dt, stream=self.stream)
         
     def substep(self, dt, step_number):
         self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, 
@@ -108,7 +116,10 @@ class FORCE (Simulator.BaseSimulator):
                 self.u1[0].data.gpudata, self.u1[0].data.strides[0], 
                 self.u1[1].data.gpudata, self.u1[1].data.strides[0], 
                 self.u1[2].data.gpudata, self.u1[2].data.strides[0],
-                self.cfl_data.gpudata)
+                self.cfl_data.gpudata,
+                0, 0,
+                self.nx, self.ny)
+
         self.u0, self.u1 = self.u1, self.u0
         
     def getOutput(self):
diff --git a/GPUSimulators/HLL.py b/GPUSimulators/HLL.py
index 52fc0ca..840cd83 100644
--- a/GPUSimulators/HLL.py
+++ b/GPUSimulators/HLL.py
@@ -40,7 +40,9 @@ class HLL (Simulator.BaseSimulator):
                  g, 
                  cfl_scale=0.9,
                  boundary_conditions=BoundaryCondition(), 
-                 block_width=16, block_height=16):
+                 block_width=16, block_height=16,
+                 dt: float=None,
+                 compile_opts: list[str]=[]):
         """
         Initialization routine
 
@@ -54,6 +56,7 @@ class HLL (Simulator.BaseSimulator):
             dy: Grid cell spacing along y-axis (20 000 m)
             dt: Size of each timestep (90 s)
             g: Gravitational accelleration (9.81 m/s^2)
+            compile_opts: Extra nvcc compiler options, appended after the default "--use_fast_math"
         """
                  
         # Call super constructor
@@ -74,11 +77,11 @@ class HLL (Simulator.BaseSimulator):
                                         }, 
                                         compile_args={
                                             'no_extern_c': True,
-                                            'options': ["--use_fast_math"], 
+                                            'options': ["--use_fast_math"] + compile_opts, 
                                         }, 
                                         jit_compile_args={})
         self.kernel = module.get_function("HLLKernel")
-        self.kernel.prepare("iiffffiPiPiPiPiPiPiP")
+        self.kernel.prepare("iiffffiPiPiPiPiPiPiPiiii")
     
         #Create data by uploading to device
         self.u0 = Common.ArakawaA2D(self.stream, 
@@ -90,10 +93,14 @@ class HLL (Simulator.BaseSimulator):
                         1, 1, 
                         [None, None, None])
         self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
-        dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
-        dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
-        dt = min(dt_x, dt_y)
-        self.cfl_data.fill(dt, stream=self.stream)
+        if dt == None:
+            dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
+            dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
+            self.dt = min(dt_x, dt_y)
+        else:
+            self.dt = dt
+        
+        self.cfl_data.fill(self.dt, stream=self.stream)
         
     def substep(self, dt, step_number):
         self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, 
@@ -107,7 +114,9 @@ class HLL (Simulator.BaseSimulator):
                 self.u1[0].data.gpudata, self.u1[0].data.strides[0], 
                 self.u1[1].data.gpudata, self.u1[1].data.strides[0], 
                 self.u1[2].data.gpudata, self.u1[2].data.strides[0],
-                self.cfl_data.gpudata)
+                self.cfl_data.gpudata,
+                0, 0,
+                self.nx, self.ny)
         self.u0, self.u1 = self.u1, self.u0
         
     def getOutput(self):
diff --git a/GPUSimulators/HLL2.py b/GPUSimulators/HLL2.py
index 9fdcdd8..12f8dcf 100644
--- a/GPUSimulators/HLL2.py
+++ b/GPUSimulators/HLL2.py
@@ -41,7 +41,9 @@ class HLL2 (Simulator.BaseSimulator):
                  theta=1.8, 
                  cfl_scale=0.9,
                  boundary_conditions=BoundaryCondition(), 
-                 block_width=16, block_height=16):
+                 block_width=16, block_height=16,
+                 dt: float=None,
+                 compile_opts: list[str]=[]):
         """
         Initialization routine
 
@@ -55,6 +57,7 @@ class HLL2 (Simulator.BaseSimulator):
             dy: Grid cell spacing along y-axis (20 000 m)
             dt: Size of each timestep (90 s)
             g: Gravitational accelleration (9.81 m/s^2)
+            compile_opts: Extra nvcc compiler options, appended after the default "--use_fast_math"
         """
                  
         # Call super constructor
@@ -76,11 +79,11 @@ class HLL2 (Simulator.BaseSimulator):
                                         }, 
                                         compile_args={
                                             'no_extern_c': True,
-                                            'options': ["--use_fast_math"], 
+                                            'options': ["--use_fast_math"] + compile_opts, 
                                         }, 
                                         jit_compile_args={})
         self.kernel = module.get_function("HLL2Kernel")
-        self.kernel.prepare("iifffffiiPiPiPiPiPiPiP")
+        self.kernel.prepare("iifffffiiPiPiPiPiPiPiPiiii")
         
         #Create data by uploading to device
         self.u0 = Common.ArakawaA2D(self.stream, 
@@ -92,10 +95,15 @@ class HLL2 (Simulator.BaseSimulator):
                         2, 2, 
                         [None, None, None])
         self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
-        dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
-        dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
-        dt = min(dt_x, dt_y)
-        self.cfl_data.fill(dt, stream=self.stream)
+        
+        if dt == None:
+            dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
+            dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
+            self.dt = min(dt_x, dt_y)
+        else:
+            self.dt = dt
+        
+        self.cfl_data.fill(self.dt, stream=self.stream)
         
     def substep(self, dt, step_number):
         self.substepDimsplit(dt*0.5, step_number)
@@ -114,7 +122,9 @@ class HLL2 (Simulator.BaseSimulator):
                 self.u1[0].data.gpudata, self.u1[0].data.strides[0], 
                 self.u1[1].data.gpudata, self.u1[1].data.strides[0], 
                 self.u1[2].data.gpudata, self.u1[2].data.strides[0],
-                self.cfl_data.gpudata)
+                self.cfl_data.gpudata,
+                0, 0,
+                self.nx, self.ny)
         self.u0, self.u1 = self.u1, self.u0
     
     def getOutput(self):
diff --git a/GPUSimulators/KP07.py b/GPUSimulators/KP07.py
index 37ca7ec..ff255e7 100644
--- a/GPUSimulators/KP07.py
+++ b/GPUSimulators/KP07.py
@@ -47,7 +47,9 @@ class KP07 (Simulator.BaseSimulator):
                  cfl_scale=0.9,
                  order=2,
                  boundary_conditions=BoundaryCondition(), 
-                 block_width=16, block_height=16):
+                 block_width=16, block_height=16,
+                 dt: float=None,
+                 compile_opts: list[str]=[]):
         """
         Initialization routine
         
@@ -61,6 +63,7 @@ class KP07 (Simulator.BaseSimulator):
             dy: Grid cell spacing along y-axis (20 000 m)
             dt: Size of each timestep (90 s)
             g: Gravitational accelleration (9.81 m/s^2)
+            compile_opts: Extra nvcc compiler options, appended after the default "--use_fast_math"
         """
                  
         # Call super constructor
@@ -83,11 +86,11 @@ class KP07 (Simulator.BaseSimulator):
                                         }, 
                                         compile_args={
                                             'no_extern_c': True,
-                                            'options': ["--use_fast_math"], 
+                                            'options': ["--use_fast_math"] + compile_opts, 
                                         }, 
                                         jit_compile_args={})
         self.kernel = module.get_function("KP07Kernel")
-        self.kernel.prepare("iifffffiiPiPiPiPiPiPiP")
+        self.kernel.prepare("iifffffiiPiPiPiPiPiPiPiiii")
         
         #Create data by uploading to device
         self.u0 = Common.ArakawaA2D(self.stream, 
@@ -99,10 +102,15 @@ class KP07 (Simulator.BaseSimulator):
                         2, 2, 
                         [None, None, None])
         self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
-        dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
-        dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
-        dt = min(dt_x, dt_y)
-        self.cfl_data.fill(dt, stream=self.stream)
+        
+        if dt == None:
+            dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
+            dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
+            self.dt = min(dt_x, dt_y)
+        else:
+            self.dt = dt
+        
+        self.cfl_data.fill(self.dt, stream=self.stream)
                         
         
     def substep(self, dt, step_number):
@@ -123,7 +131,9 @@ class KP07 (Simulator.BaseSimulator):
                 self.u1[0].data.gpudata, self.u1[0].data.strides[0], 
                 self.u1[1].data.gpudata, self.u1[1].data.strides[0], 
                 self.u1[2].data.gpudata, self.u1[2].data.strides[0],
-                self.cfl_data.gpudata)
+                self.cfl_data.gpudata,
+                0, 0,
+                self.nx, self.ny)
         self.u0, self.u1 = self.u1, self.u0
 
     def getOutput(self):
diff --git a/GPUSimulators/KP07_dimsplit.py b/GPUSimulators/KP07_dimsplit.py
index 680adb7..b5eb7aa 100644
--- a/GPUSimulators/KP07_dimsplit.py
+++ b/GPUSimulators/KP07_dimsplit.py
@@ -46,7 +46,9 @@ class KP07_dimsplit(Simulator.BaseSimulator):
                  theta=1.3, 
                  cfl_scale=0.9,
                  boundary_conditions=BoundaryCondition(), 
-                 block_width=16, block_height=16):
+                 block_width=16, block_height=16,
+                 dt: float=None,
+                 compile_opts: list[str]=[]):
         """
         Initialization routine
         
@@ -60,6 +62,7 @@ class KP07_dimsplit(Simulator.BaseSimulator):
             dy: Grid cell spacing along y-axis (20 000 m)
             dt: Size of each timestep (90 s)
             g: Gravitational accelleration (9.81 m/s^2)
+            compile_opts: Extra nvcc compiler options, appended after the default "--use_fast_math"
         """
                  
         # Call super constructor
@@ -83,11 +86,11 @@ class KP07_dimsplit(Simulator.BaseSimulator):
                                         }, 
                                         compile_args={
                                             'no_extern_c': True,
-                                            'options': ["--use_fast_math"], 
+                                            'options': ["--use_fast_math"] + compile_opts, 
                                         }, 
                                         jit_compile_args={})
         self.kernel = module.get_function("KP07DimsplitKernel")
-        self.kernel.prepare("iifffffiiPiPiPiPiPiPiP")
+        self.kernel.prepare("iifffffiiPiPiPiPiPiPiPiiii")
     
         #Create data by uploading to device
         self.u0 = Common.ArakawaA2D(self.stream, 
@@ -99,10 +102,15 @@ class KP07_dimsplit(Simulator.BaseSimulator):
                         self.gc_x, self.gc_y, 
                         [None, None, None])
         self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
-        dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
-        dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
-        dt = min(dt_x, dt_y)
-        self.cfl_data.fill(dt, stream=self.stream)
+
+        if dt == None:
+            dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
+            dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
+            self.dt = min(dt_x, dt_y)
+        else:
+            self.dt = dt
+        
+        self.cfl_data.fill(self.dt, stream=self.stream)
     
     def substep(self, dt, step_number):
         self.substepDimsplit(dt*0.5, step_number)
@@ -121,7 +129,9 @@ class KP07_dimsplit(Simulator.BaseSimulator):
                 self.u1[0].data.gpudata, self.u1[0].data.strides[0], 
                 self.u1[1].data.gpudata, self.u1[1].data.strides[0], 
                 self.u1[2].data.gpudata, self.u1[2].data.strides[0],
-                self.cfl_data.gpudata)
+                self.cfl_data.gpudata,
+                0, 0,
+                self.nx, self.ny)
         self.u0, self.u1 = self.u1, self.u0
 
     def getOutput(self):
diff --git a/GPUSimulators/LxF.py b/GPUSimulators/LxF.py
index e5833a9..614b514 100644
--- a/GPUSimulators/LxF.py
+++ b/GPUSimulators/LxF.py
@@ -41,7 +41,9 @@ class LxF (Simulator.BaseSimulator):
                  g: float, 
                  cfl_scale: float=0.9,
                  boundary_conditions=BoundaryCondition(),
-                 block_width: int=16, block_height: int=16):
+                 block_width: int=16, block_height: int=16,
+                 dt: float=None,
+                 compile_opts: list[str]=[]):
         """
         Initialization routine
 
@@ -55,6 +57,7 @@ class LxF (Simulator.BaseSimulator):
             dy: Grid cell spacing along y-axis (20 000 m)
             dt: Size of each timestep (90 s)
             g: Gravitational accelleration (9.81 m/s^2)
+            compile_opts: Extra nvcc compiler options, appended after the default "--use_fast_math"
         """
                  
         # Call super constructor
@@ -75,11 +78,11 @@ class LxF (Simulator.BaseSimulator):
                                         }, 
                                         compile_args={
                                             'no_extern_c': True,
-                                            'options': ["--use_fast_math"], 
+                                            'options': ["--use_fast_math"] + compile_opts, 
                                         }, 
                                         jit_compile_args={})
         self.kernel = module.get_function("LxFKernel")
-        self.kernel.prepare("iiffffiPiPiPiPiPiPiP")
+        self.kernel.prepare("iiffffiPiPiPiPiPiPiPiiii")
 
         #Create data by uploading to device
         self.u0 = Common.ArakawaA2D(self.stream, 
@@ -91,10 +94,15 @@ class LxF (Simulator.BaseSimulator):
                         1, 1, 
                         [None, None, None])
         self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
-        dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
-        dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
-        dt = min(dt_x, dt_y)
-        self.cfl_data.fill(dt, stream=self.stream)
+
+        if dt == None:  
+            dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
+            dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) 
+            self.dt = min(dt_x, dt_y)
+        else:
+            self.dt = dt
+        
+        self.cfl_data.fill(self.dt, stream=self.stream)
         
     def substep(self, dt, step_number):
         """
@@ -113,7 +121,9 @@ class LxF (Simulator.BaseSimulator):
                 self.u1[0].data.gpudata, self.u1[0].data.strides[0], 
                 self.u1[1].data.gpudata, self.u1[1].data.strides[0], 
                 self.u1[2].data.gpudata, self.u1[2].data.strides[0],
-                self.cfl_data.gpudata)
+                self.cfl_data.gpudata,
+                0, 0,
+                self.nx, self.ny)
         self.u0, self.u1 = self.u1, self.u0
   
     def getOutput(self):
diff --git a/GPUSimulators/WAF.py b/GPUSimulators/WAF.py
index 21fd484..25c0d6a 100644
--- a/GPUSimulators/WAF.py
+++ b/GPUSimulators/WAF.py
@@ -41,7 +41,9 @@ class WAF (Simulator.BaseSimulator):
                  g, 
                  cfl_scale=0.9,
                  boundary_conditions=BoundaryCondition(), 
-                 block_width=16, block_height=16):
+                 block_width=16, block_height=16,
+                 dt: float=None,
+                 compile_opts: list[str]=[]):
         """
         Initialization routine
 
@@ -55,6 +57,7 @@ class WAF (Simulator.BaseSimulator):
             dy: Grid cell spacing along y-axis (20 000 m)
             dt: Size of each timestep (90 s)
             g: Gravitational accelleration (9.81 m/s^2)
+            compile_opts: Extra nvcc compiler options, appended after the default "--use_fast_math"
         """
                  
         # Call super constructor
@@ -75,11 +78,11 @@ class WAF (Simulator.BaseSimulator):
                                         }, 
                                         compile_args={
                                             'no_extern_c': True,
-                                            'options': ["--use_fast_math"], 
+                                            'options': ["--use_fast_math"] + compile_opts, 
                                         }, 
                                         jit_compile_args={})
         self.kernel = module.get_function("WAFKernel")
-        self.kernel.prepare("iiffffiiPiPiPiPiPiPiP")
+        self.kernel.prepare("iiffffiiPiPiPiPiPiPiPiiii")
     
         #Create data by uploading to device
         self.u0 = Common.ArakawaA2D(self.stream, 
@@ -91,10 +94,15 @@ class WAF (Simulator.BaseSimulator):
                         2, 2, 
                         [None, None, None])
         self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
-        dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
-        dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
-        dt = min(dt_x, dt_y)
-        self.cfl_data.fill(dt, stream=self.stream)
+        
+        if dt == None:
+            dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
+            dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
+            self.dt = min(dt_x, dt_y)
+        else:
+            self.dt = dt
+        
+        self.cfl_data.fill(self.dt, stream=self.stream)
     
     def substep(self, dt, step_number):
         self.substepDimsplit(dt*0.5, step_number)
@@ -112,7 +120,9 @@ class WAF (Simulator.BaseSimulator):
                 self.u1[0].data.gpudata, self.u1[0].data.strides[0], 
                 self.u1[1].data.gpudata, self.u1[1].data.strides[0], 
                 self.u1[2].data.gpudata, self.u1[2].data.strides[0],
-                self.cfl_data.gpudata)
+                self.cfl_data.gpudata,
+                0, 0,
+                self.nx, self.ny)
         self.u0, self.u1 = self.u1, self.u0
 
     def getOutput(self):
diff --git a/GPUSimulators/cuda/SWE2D_FORCE.cu b/GPUSimulators/cuda/SWE2D_FORCE.cu
index dac46be..787fbf4 100644
--- a/GPUSimulators/cuda/SWE2D_FORCE.cu
+++ b/GPUSimulators/cuda/SWE2D_FORCE.cu
@@ -100,8 +100,18 @@ __global__ void FORCEKernel(
         float* hv1_ptr_, int hv1_pitch_,
         
         //Output CFL
-        float* cfl_) {
-    
+        float* cfl_,
+
+        //Subarea of internal domain to compute
+        int x0=0, int y0=0,
+        int x1=0, int y1=0) {
+
+    if(x1 == 0)
+        x1 = nx_;
+
+    if(y1 == 0)
+        y1 = ny_; 
+
     const unsigned int w = BLOCK_WIDTH;
     const unsigned int h = BLOCK_HEIGHT;
     const unsigned int gc_x = 1;
@@ -112,9 +122,9 @@ __global__ void FORCEKernel(
     __shared__ float F[vars][h+2*gc_y][w+2*gc_x];
     
     //Read into shared memory
-    readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_);
-    readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_);
-    readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_);
+    readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
+    readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
+    readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
     __syncthreads();
     
     //Compute flux along x, and evolve
@@ -130,9 +140,9 @@ __global__ void FORCEKernel(
     __syncthreads();
     
     //Write to main memory
-    writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1);
-    writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1);
-    writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1);
+    writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1);
+    writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1);
+    writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1);
     
     //Compute the CFL for this block
     if (cfl_ != NULL) {
diff --git a/GPUSimulators/cuda/SWE2D_HLL.cu b/GPUSimulators/cuda/SWE2D_HLL.cu
index 3ed6b35..07c5cec 100644
--- a/GPUSimulators/cuda/SWE2D_HLL.cu
+++ b/GPUSimulators/cuda/SWE2D_HLL.cu
@@ -116,7 +116,17 @@ __global__ void HLLKernel(
         float* hv1_ptr_, int hv1_pitch_,
         
         //Output CFL
-        float* cfl_) {
+        float* cfl_,
+
+        //Subarea of internal domain to compute
+        int x0=0, int y0=0,
+        int x1=0, int y1=0) {
+
+    if(x1 == 0)
+        x1 = nx_;
+
+    if(y1 == 0)
+        y1 = ny_;
     
     const unsigned int w = BLOCK_WIDTH;
     const unsigned int h = BLOCK_HEIGHT;
@@ -129,9 +139,9 @@ __global__ void HLLKernel(
     __shared__ float F[vars][h+2*gc_y][w+2*gc_x];
     
     //Read into shared memory
-    readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_);
-    readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_);
-    readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_);
+    readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
+    readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
+    readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
     
     //Compute F flux
     computeFluxF(Q, F, g_);
@@ -148,9 +158,9 @@ __global__ void HLLKernel(
     __syncthreads();
     
     // Write to main memory for all internal cells
-    writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1);
-    writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1);
-    writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1);
+    writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1);
+    writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1);
+    writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1);
     
     //Compute the CFL for this block
     if (cfl_ != NULL) {
diff --git a/GPUSimulators/cuda/SWE2D_HLL2.cu b/GPUSimulators/cuda/SWE2D_HLL2.cu
index 94f92b5..e920948 100644
--- a/GPUSimulators/cuda/SWE2D_HLL2.cu
+++ b/GPUSimulators/cuda/SWE2D_HLL2.cu
@@ -144,7 +144,17 @@ __global__ void HLL2Kernel(
         float* hv1_ptr_, int hv1_pitch_,
         
         //Output CFL
-        float* cfl_) {
+        float* cfl_,
+
+        //Subarea of internal domain to compute
+        int x0=0, int y0=0,
+        int x1=0, int y1=0) {
+
+    if(x1 == 0)
+        x1 = nx_;
+
+    if(y1 == 0)
+        y1 = ny_;
     
     const unsigned int w = BLOCK_WIDTH;
     const unsigned int h = BLOCK_HEIGHT;
@@ -158,9 +168,9 @@ __global__ void HLL2Kernel(
     __shared__ float  F[3][h+4][w+4];
     
     //Read into shared memory
-    readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_);
-    readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_);
-    readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_);
+    readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
+    readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
+    readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
     
     //Step 0 => evolve x first, then y
     if (step_ == 0) {
@@ -203,9 +213,9 @@ __global__ void HLL2Kernel(
     
     
     // Write to main memory for all internal cells
-    writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1);
-    writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1);
-    writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1);
+    writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1);
+    writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1);
+    writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1);
     
     //Compute the CFL for this block
     if (cfl_ != NULL) {
diff --git a/GPUSimulators/cuda/SWE2D_KP07.cu b/GPUSimulators/cuda/SWE2D_KP07.cu
index 6fa6154..f39ead4 100644
--- a/GPUSimulators/cuda/SWE2D_KP07.cu
+++ b/GPUSimulators/cuda/SWE2D_KP07.cu
@@ -154,7 +154,18 @@ __global__ void KP07Kernel(
         float* hv1_ptr_, int hv1_pitch_,
         
         //Output CFL
-        float* cfl_) {
+        float* cfl_,
+
+        //Subarea of internal domain to compute
+        int x0=0, int y0=0,
+        int x1=0, int y1=0) {
+
+    if(x1 == 0)
+        x1 = nx_;
+
+    if(y1 == 0)
+        y1 = ny_;
+
     const unsigned int w = BLOCK_WIDTH;
     const unsigned int h = BLOCK_HEIGHT;
     const unsigned int gc_x = 2;
@@ -179,9 +190,9 @@ __global__ void KP07Kernel(
     
     
     //Read into shared memory
-    readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_);
-    readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_);
-    readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_);
+    readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
+    readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
+    readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
     
     
     //Reconstruct slopes along x and axis
diff --git a/GPUSimulators/cuda/SWE2D_KP07_dimsplit.cu b/GPUSimulators/cuda/SWE2D_KP07_dimsplit.cu
index ac256e3..a8bfae8 100644
--- a/GPUSimulators/cuda/SWE2D_KP07_dimsplit.cu
+++ b/GPUSimulators/cuda/SWE2D_KP07_dimsplit.cu
@@ -141,7 +141,18 @@ __global__ void KP07DimsplitKernel(
         float* hv1_ptr_, int hv1_pitch_, 
         
         //Output CFL
-        float* cfl_) {
+        float* cfl_,
+
+        //Subarea of internal domain to compute
+        int x0=0, int y0=0,
+        int x1=0, int y1=0) {
+
+    if(x1 == 0)
+        x1 = nx_;
+
+    if(y1 == 0)
+        y1 = ny_;
+
     const unsigned int w = BLOCK_WIDTH;
     const unsigned int h = BLOCK_HEIGHT;
     const unsigned int gc_x = 2;
@@ -154,9 +165,9 @@ __global__ void KP07DimsplitKernel(
     __shared__ float  F[vars][h+2*gc_y][w+2*gc_x];
     
     //Read into shared memory
-    readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_);
-    readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_);
-    readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_);
+    readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
+    readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
+    readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
     
     if (step_ == 0) {
         //Along X
@@ -194,9 +205,9 @@ __global__ void KP07DimsplitKernel(
     }
     
     // Write to main memory for all internal cells
-    writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1);
-    writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1);
-    writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1);
+    writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1);
+    writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1);
+    writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1);
     
     //Compute the CFL for this block
     if (cfl_ != NULL) {
diff --git a/GPUSimulators/cuda/SWE2D_LxF.cu b/GPUSimulators/cuda/SWE2D_LxF.cu
index 1f197fd..a68cd29 100644
--- a/GPUSimulators/cuda/SWE2D_LxF.cu
+++ b/GPUSimulators/cuda/SWE2D_LxF.cu
@@ -116,8 +116,18 @@ void LxFKernel(
         float* hu1_ptr_, int hu1_pitch_,
         float* hv1_ptr_, int hv1_pitch_,
         
-        //Output CFL
-        float* cfl_) {
+       //Output CFL
+       float* cfl_,
+
+       //Subarea of internal domain to compute
+       int x0=0, int y0=0,
+       int x1=0, int y1=0) {
+
+    if(x1 == 0)
+        x1 = nx_;
+
+    if(y1 == 0)
+        y1 = ny_;
     
     const unsigned int w = BLOCK_WIDTH;
     const unsigned int h = BLOCK_HEIGHT;
@@ -130,9 +140,9 @@ void LxFKernel(
     __shared__ float G[vars][h+1][w  ];
     
     //Read from global memory
-    readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_);
-    readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_);
-    readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_);
+    readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
+    readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
+    readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
     
     //Compute fluxes along the x and y axis
     computeFluxF<w, h>(Q, F, g_, dx_, dt_);
@@ -154,9 +164,9 @@ void LxFKernel(
     __syncthreads();
 
     //Write to main memory
-    writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1);
-    writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1);
-    writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1);
+    writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1);
+    writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1);
+    writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1);
     
     //Compute the CFL for this block
     if (cfl_ != NULL) {
diff --git a/GPUSimulators/cuda/SWE2D_WAF.cu b/GPUSimulators/cuda/SWE2D_WAF.cu
index 2c38cdf..cfcbd79 100644
--- a/GPUSimulators/cuda/SWE2D_WAF.cu
+++ b/GPUSimulators/cuda/SWE2D_WAF.cu
@@ -116,7 +116,17 @@ __global__ void WAFKernel(
         //Output h^{n+1}
         float* h1_ptr_, int h1_pitch_,
         float* hu1_ptr_, int hu1_pitch_,
-        float* hv1_ptr_, int hv1_pitch_) {   
+        float* hv1_ptr_, int hv1_pitch_,
+ 
+        //Subarea of internal domain to compute
+        int x0=0, int y0=0,
+        int x1=0, int y1=0) {
+ 
+    if(x1 == 0)
+        x1 = nx_;
+
+    if(y1 == 0)
+        y1 = ny_;  
             
     const unsigned int w = BLOCK_WIDTH;
     const unsigned int h = BLOCK_HEIGHT;
@@ -131,9 +141,9 @@ __global__ void WAFKernel(
     
     
     //Read into shared memory Q from global memory
-    readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_);
-    readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_);
-    readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_);
+    readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
+    readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
+    readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
     __syncthreads();
     
     
@@ -170,9 +180,9 @@ __global__ void WAFKernel(
 
     
     // Write to main memory for all internal cells
-    writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1);
-    writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1);
-    writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1);
+    writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1);
+    writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1);
+    writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1);
 }
 
 } // extern "C"
\ No newline at end of file
diff --git a/GPUSimulators/cuda/SWECommon.h b/GPUSimulators/cuda/SWECommon.h
index 52f8b31..72eff5c 100644
--- a/GPUSimulators/cuda/SWECommon.h
+++ b/GPUSimulators/cuda/SWECommon.h
@@ -477,10 +477,11 @@ __device__ float3 FORCE_1D_flux(const float3 Q_l, const float3 Q_r, const float
 
 
 
-
-template<int w, int h, int gc_x, int gc_y, int vars>
-__device__ void writeCfl(float Q[vars][h+2*gc_y][w+2*gc_x],
-        float shmem[h+2*gc_y][w+2*gc_x],
+// TODO: choose better names for the `Q_w` and `Q_h` template parameters,
+// as the current names probably do not convey well what they represent.
+template<int Q_w, int Q_h, int gc_x, int gc_y, int vars>
+__device__ void writeCfl(float Q[vars][Q_h+2*gc_y][Q_w+2*gc_x],
+        float shmem[Q_h+2*gc_y][Q_w+2*gc_x],
         const int nx_, const int ny_,
         const float dx_, const float dy_, const float g_,
         float* output_) {
@@ -509,7 +510,7 @@ __device__ void writeCfl(float Q[vars][h+2*gc_y][w+2*gc_x],
     if (ti < nx_+gc_x && tj < ny_+gc_y) {
         if (ty == gc_y) {
             float min_val = shmem[ty][tx];
-            const int max_y = min(h, ny_+gc_y - tj);
+            const int max_y = min(Q_h, ny_+gc_y - tj);
             for (int j=gc_y; j<max_y+gc_y; j++) {
                 min_val = fminf(min_val, shmem[j][tx]);
             }
@@ -521,7 +522,7 @@ __device__ void writeCfl(float Q[vars][h+2*gc_y][w+2*gc_x],
     //One thread loops over first row to find global max
     if (tx == gc_x && ty == gc_y) {
         float min_val = shmem[ty][tx];
-        const int max_x = min(w, nx_+gc_x - ti);
+        const int max_x = min(Q_w, nx_+gc_x - ti);
         for (int i=gc_x; i<max_x+gc_x; ++i) {
             min_val = fminf(min_val, shmem[ty][i]);
         }
diff --git a/GPUSimulators/cuda/common.h b/GPUSimulators/cuda/common.h
index 5463294..11d1c8e 100644
--- a/GPUSimulators/cuda/common.h
+++ b/GPUSimulators/cuda/common.h
@@ -322,8 +322,8 @@ inline __device__ void readBlock(float* ptr_, int pitch_,
                 float Q[h+2*gc_y][w+2*gc_x], 
                 const int nx_, const int ny_,
                 const int boundary_conditions_,
-                 int x0, int y0,
-                 int x1, int y1) {
+                int x0, int y0,
+                int x1, int y1) {
     //Index of block within domain
     const int bx = blockDim.x * blockIdx.x;
     const int by = blockDim.y * blockIdx.y;