diff --git a/GPUSimulators/EE2D_KP07_dimsplit.py b/GPUSimulators/EE2D_KP07_dimsplit.py index f1b5e1c..ccf826a 100644 --- a/GPUSimulators/EE2D_KP07_dimsplit.py +++ b/GPUSimulators/EE2D_KP07_dimsplit.py @@ -58,7 +58,7 @@ class EE2D_KP07_dimsplit (BaseSimulator): context, rho, rho_u, rho_v, E, nx, ny, - dx, dy, dt, + dx, dy, g, gamma, theta=1.3, @@ -67,14 +67,14 @@ class EE2D_KP07_dimsplit (BaseSimulator): block_width=16, block_height=8): # Call super constructor - super().__init__(context, \ - nx, ny, \ - dx, dy, 2*dt, \ + super().__init__(context, + nx, ny, + dx, dy, + cfl_scale, block_width, block_height) self.g = np.float32(g) self.gamma = np.float32(gamma) self.theta = np.float32(theta) - self.cfl_scale = cfl_scale self.boundary_conditions = boundary_conditions.asCodedInt() #Get kernels @@ -102,7 +102,12 @@ class EE2D_KP07_dimsplit (BaseSimulator): 2, 2, [None, None, None, None]) self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) - self.cfl_data.fill(self.dt, stream=self.stream) + P = (gamma - 1.0)*(E - 0.5*(rho_u**2 + rho_v**2)/rho) + c = np.sqrt(gamma*P/rho) + dt_x = np.min(self.dx / (np.abs(rho_u/rho) + c)) + dt_y = np.min(self.dy / (np.abs(rho_v/rho) + c)) + dt = min(dt_x, dt_y) + self.cfl_data.fill(dt, stream=self.stream) def step(self, dt): @@ -140,4 +143,4 @@ class EE2D_KP07_dimsplit (BaseSimulator): def computeDt(self): max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get(); - return max_dt*0.5*self.cfl_scale \ No newline at end of file + return max_dt*0.5 \ No newline at end of file diff --git a/GPUSimulators/FORCE.py b/GPUSimulators/FORCE.py index dc66b38..e6d7e94 100644 --- a/GPUSimulators/FORCE.py +++ b/GPUSimulators/FORCE.py @@ -57,7 +57,7 @@ class FORCE (Simulator.BaseSimulator): context, h0, hu0, hv0, nx, ny, - dx, dy, dt, + dx, dy, g, cfl_scale=0.9, boundary_conditions=BoundaryCondition(), @@ -66,10 +66,10 @@ class FORCE (Simulator.BaseSimulator): # Call super constructor super().__init__(context, nx, ny, - dx, dy, dt, + dx, dy, + cfl_scale, block_width, block_height) self.g = 
np.float32(g) - self.cfl_scale = cfl_scale self.boundary_conditions = boundary_conditions.asCodedInt() #Get kernels @@ -96,7 +96,10 @@ class FORCE (Simulator.BaseSimulator): 1, 1, [None, None, None]) self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) - self.cfl_data.fill(self.dt, stream=self.stream) + dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) + dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) + dt = min(dt_x, dt_y) + self.cfl_data.fill(dt, stream=self.stream) def step(self, dt): self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, @@ -124,4 +127,4 @@ class FORCE (Simulator.BaseSimulator): def computeDt(self): max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get(); - return max_dt*0.5*self.cfl_scale \ No newline at end of file + return max_dt*0.5 \ No newline at end of file diff --git a/GPUSimulators/HLL.py b/GPUSimulators/HLL.py index c76074a..26bfa11 100644 --- a/GPUSimulators/HLL.py +++ b/GPUSimulators/HLL.py @@ -52,7 +52,7 @@ class HLL (Simulator.BaseSimulator): context, h0, hu0, hv0, nx, ny, - dx, dy, dt, + dx, dy, g, cfl_scale=0.9, boundary_conditions=BoundaryCondition(), @@ -61,10 +61,10 @@ class HLL (Simulator.BaseSimulator): # Call super constructor super().__init__(context, nx, ny, - dx, dy, dt, + dx, dy, + cfl_scale, block_width, block_height); self.g = np.float32(g) - self.cfl_scale = cfl_scale self.boundary_conditions = boundary_conditions.asCodedInt() #Get kernels @@ -91,7 +91,10 @@ class HLL (Simulator.BaseSimulator): 1, 1, [None, None, None]) self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) - self.cfl_data.fill(self.dt, stream=self.stream) + dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) + dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) + dt = min(dt_x, dt_y) + self.cfl_data.fill(dt, stream=self.stream) def step(self, dt): self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, @@ -119,4 +122,4 @@ class HLL 
(Simulator.BaseSimulator): def computeDt(self): max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get(); - return max_dt*0.5*self.cfl_scale \ No newline at end of file + return max_dt*0.5 \ No newline at end of file diff --git a/GPUSimulators/HLL2.py b/GPUSimulators/HLL2.py index 3894116..02edaf7 100644 --- a/GPUSimulators/HLL2.py +++ b/GPUSimulators/HLL2.py @@ -54,7 +54,7 @@ class HLL2 (Simulator.BaseSimulator): context, h0, hu0, hv0, nx, ny, - dx, dy, dt, + dx, dy, g, theta=1.8, cfl_scale=0.9, @@ -64,7 +64,8 @@ class HLL2 (Simulator.BaseSimulator): # Call super constructor super().__init__(context, nx, ny, - dx, dy, dt*2, + dx, dy, + cfl_scale, block_width, block_height); self.g = np.float32(g) self.theta = np.float32(theta) @@ -95,7 +96,10 @@ class HLL2 (Simulator.BaseSimulator): 2, 2, [None, None, None]) self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) - self.cfl_data.fill(self.dt, stream=self.stream) + dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) + dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) + dt = min(dt_x, dt_y) + self.cfl_data.fill(dt, stream=self.stream) def step(self, dt): self.substepDimsplit(dt*0.5, 0) @@ -130,4 +134,4 @@ class HLL2 (Simulator.BaseSimulator): def computeDt(self): max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get(); - return max_dt*0.5*self.cfl_scale \ No newline at end of file + return max_dt*0.5 \ No newline at end of file diff --git a/GPUSimulators/KP07.py b/GPUSimulators/KP07.py index ad4faef..a9d5912 100644 --- a/GPUSimulators/KP07.py +++ b/GPUSimulators/KP07.py @@ -55,7 +55,7 @@ class KP07 (Simulator.BaseSimulator): context, h0, hu0, hv0, nx, ny, - dx, dy, dt, + dx, dy, g, theta=1.3, cfl_scale=0.9, @@ -66,11 +66,11 @@ class KP07 (Simulator.BaseSimulator): # Call super constructor super().__init__(context, nx, ny, - dx, dy, dt, + dx, dy, + cfl_scale, block_width, block_height); self.g = np.float32(g) self.theta = np.float32(theta) - self.cfl_scale = cfl_scale self.order 
= np.int32(order) self.boundary_conditions = boundary_conditions.asCodedInt() @@ -98,7 +98,10 @@ class KP07 (Simulator.BaseSimulator): 2, 2, [None, None, None]) self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) - self.cfl_data.fill(self.dt, stream=self.stream) + dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) + dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) + dt = min(dt_x, dt_y) + self.cfl_data.fill(dt, stream=self.stream) def step(self, dt): @@ -140,4 +143,4 @@ class KP07 (Simulator.BaseSimulator): def computeDt(self): max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get(); - return max_dt*0.5**self.order*self.cfl_scale \ No newline at end of file + return max_dt*0.5**self.order \ No newline at end of file diff --git a/GPUSimulators/KP07_dimsplit.py b/GPUSimulators/KP07_dimsplit.py index fd52f8a..000e6c7 100644 --- a/GPUSimulators/KP07_dimsplit.py +++ b/GPUSimulators/KP07_dimsplit.py @@ -38,7 +38,7 @@ from pycuda import gpuarray """ Class that solves the SW equations using the dimentionally split KP07 scheme """ -class KP07_dimsplit (Simulator.BaseSimulator): +class KP07_dimsplit(Simulator.BaseSimulator): """ Initialization routine @@ -56,7 +56,7 @@ class KP07_dimsplit (Simulator.BaseSimulator): context, h0, hu0, hv0, nx, ny, - dx, dy, dt, + dx, dy, g, theta=1.3, cfl_scale=0.9, @@ -66,13 +66,13 @@ class KP07_dimsplit (Simulator.BaseSimulator): # Call super constructor super().__init__(context, nx, ny, - dx, dy, dt*2, + dx, dy, + cfl_scale, block_width, block_height) self.gc_x = 2 self.gc_y = 2 self.g = np.float32(g) self.theta = np.float32(theta) - self.cfl_scale = cfl_scale self.boundary_conditions = boundary_conditions.asCodedInt() #Get kernels @@ -99,7 +99,10 @@ class KP07_dimsplit (Simulator.BaseSimulator): self.gc_x, self.gc_y, [None, None, None]) self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) - self.cfl_data.fill(self.dt, stream=self.stream) + dt_x = np.min(self.dx / (np.abs(hu0/h0) + 
np.sqrt(g*h0))) + dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) + dt = min(dt_x, dt_y) + self.cfl_data.fill(dt, stream=self.stream) def step(self, dt): self.substepDimsplit(dt*0.5, 0) @@ -135,4 +138,4 @@ class KP07_dimsplit (Simulator.BaseSimulator): def computeDt(self): max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get(); - return max_dt*0.5*self.cfl_scale \ No newline at end of file + return max_dt*0.5 \ No newline at end of file diff --git a/GPUSimulators/LxF.py b/GPUSimulators/LxF.py index aa71281..62ea04b 100644 --- a/GPUSimulators/LxF.py +++ b/GPUSimulators/LxF.py @@ -53,7 +53,7 @@ class LxF (Simulator.BaseSimulator): context, h0, hu0, hv0, nx, ny, - dx, dy, dt, + dx, dy, g, cfl_scale=0.9, boundary_conditions=BoundaryCondition(), @@ -62,10 +62,10 @@ class LxF (Simulator.BaseSimulator): # Call super constructor super().__init__(context, nx, ny, - dx, dy, dt, + dx, dy, + cfl_scale, block_width, block_height); self.g = np.float32(g) - self.cfl_scale = cfl_scale self.boundary_conditions = boundary_conditions.asCodedInt() # Get kernels @@ -92,7 +92,10 @@ class LxF (Simulator.BaseSimulator): 1, 1, [None, None, None]) self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) - self.cfl_data.fill(self.dt, stream=self.stream) + dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) + dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) + dt = min(dt_x, dt_y) + self.cfl_data.fill(dt, stream=self.stream) def step(self, dt): self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, @@ -116,4 +119,4 @@ class LxF (Simulator.BaseSimulator): def computeDt(self): max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get(); - return max_dt*0.5*self.cfl_scale \ No newline at end of file + return max_dt*0.5 \ No newline at end of file diff --git a/GPUSimulators/Simulator.py b/GPUSimulators/Simulator.py index bfeedf6..6928d36 100644 --- a/GPUSimulators/Simulator.py +++ b/GPUSimulators/Simulator.py @@ -104,7 +104,8 @@ 
class BaseSimulator(object): def __init__(self, context, nx, ny, - dx, dy, dt, + dx, dy, + cfl_scale, block_width, block_height): """ Initialization routine @@ -130,7 +131,7 @@ class BaseSimulator(object): self.ny = np.int32(ny) self.dx = np.float32(dx) self.dy = np.float32(dy) - self.dt = np.float32(dt) + self.cfl_scale = cfl_scale #Handle autotuning block size if (self.context.autotuner): @@ -168,20 +169,22 @@ class BaseSimulator(object): t_end = self.simTime() + t + dt = self.computeDt()*self.cfl_scale + while(self.simTime() < t_end): if (self.simSteps() % 100 == 0): - self.dt = self.computeDt() + dt = self.computeDt()*self.cfl_scale # Compute timestep for "this" iteration (i.e., shorten last timestep) - local_dt = np.float32(min(self.dt, t_end-self.simTime())) + dt = np.float32(min(dt, t_end-self.simTime())) # Stop if end reached (should not happen) - if (local_dt <= 0.0): + if (dt <= 0.0): self.logger.warning("Timestep size {:d} is less than or equal to zero!".format(self.simSteps())) break # Step forward in time - self.step(local_dt) + self.step(dt) #Print info print_string = printer.getPrintString(t_end - self.simTime()) diff --git a/GPUSimulators/WAF.py b/GPUSimulators/WAF.py index 3de3b58..b7382e4 100644 --- a/GPUSimulators/WAF.py +++ b/GPUSimulators/WAF.py @@ -51,7 +51,7 @@ class WAF (Simulator.BaseSimulator): context, h0, hu0, hv0, nx, ny, - dx, dy, dt, + dx, dy, g, cfl_scale=0.9, boundary_conditions=BoundaryCondition(), @@ -60,10 +60,10 @@ class WAF (Simulator.BaseSimulator): # Call super constructor super().__init__(context, nx, ny, - dx, dy, dt*2, + dx, dy, + cfl_scale, block_width, block_height); self.g = np.float32(g) - self.cfl_scale = cfl_scale self.boundary_conditions = boundary_conditions.asCodedInt() #Get kernels @@ -78,7 +78,7 @@ class WAF (Simulator.BaseSimulator): }, jit_compile_args={}) self.kernel = module.get_function("WAFKernel") - self.kernel.prepare("iiffffiiPiPiPiPiPiPi") + self.kernel.prepare("iiffffiiPiPiPiPiPiPiP") #Create data by uploading to device self.u0 
= Common.ArakawaA2D(self.stream, @@ -90,7 +90,10 @@ class WAF (Simulator.BaseSimulator): 2, 2, [None, None, None]) self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) - self.cfl_data.fill(self.dt, stream=self.stream) + dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) + dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) + dt = min(dt_x, dt_y) + self.cfl_data.fill(dt, stream=self.stream) def step(self, dt): self.substepDimsplit(dt*0.5, substep=0) @@ -110,7 +113,8 @@ class WAF (Simulator.BaseSimulator): self.u0[2].data.gpudata, self.u0[2].data.strides[0], self.u1[0].data.gpudata, self.u1[0].data.strides[0], self.u1[1].data.gpudata, self.u1[1].data.strides[0], - self.u1[2].data.gpudata, self.u1[2].data.strides[0]) + self.u1[2].data.gpudata, self.u1[2].data.strides[0], + self.cfl_data.gpudata) self.u0, self.u1 = self.u1, self.u0 def download(self): @@ -122,4 +126,4 @@ class WAF (Simulator.BaseSimulator): def computeDt(self): max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get(); - return max_dt*0.5*self.cfl_scale \ No newline at end of file + return max_dt*0.5 \ No newline at end of file