Implemented variable timestep

André R. Brodtkorb 2018-11-15 16:48:47 +01:00
parent 7592ad5b9f
commit ddac53271c
15 changed files with 224 additions and 29 deletions
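
With this change each simulator allocates a per-block CFL buffer (cfl_data) on the device, every kernel writes its block-local minimum admissible timestep into that buffer, and computeDt() reduces it with gpuarray.min and applies cfl_scale plus a scheme-dependent safety factor. A minimal sketch of a driver loop built on that interface (simulate_to is a hypothetical helper, not part of this commit):

# Hypothetical driver loop: advance a simulator to t_end using the
# variable timestep returned by computeDt().
def simulate_to(sim, t_end):
    while sim.t < t_end:
        dt = sim.computeDt()          # reduce per-block CFL minima on the GPU
        dt = min(dt, t_end - sim.t)   # do not overshoot the requested end time
        sim.step(dt)                  # one (possibly dimensionally split) step
    return sim.t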

View File

@ -62,7 +62,7 @@ class EE2D_KP07_dimsplit (BaseSimulator):
g,
gamma,
theta=1.3,
cfl_scale=0.25*0.9,
cfl_scale=0.9,
boundary_conditions=BoundaryCondition(),
block_width=16, block_height=8):
@ -140,4 +140,4 @@ class EE2D_KP07_dimsplit (BaseSimulator):
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt*self.cfl_scale
return max_dt*0.5*self.cfl_scale

View File

@ -25,6 +25,7 @@ from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
import numpy as np
from pycuda import gpuarray
@ -58,6 +59,7 @@ class FORCE (Simulator.BaseSimulator):
nx, ny,
dx, dy, dt,
g,
cfl_scale=0.9,
boundary_conditions=BoundaryCondition(),
block_width=16, block_height=16):
@ -65,8 +67,9 @@ class FORCE (Simulator.BaseSimulator):
super().__init__(context,
nx, ny,
dx, dy, dt,
block_width, block_height);
block_width, block_height)
self.g = np.float32(g)
self.cfl_scale = cfl_scale
self.boundary_conditions = boundary_conditions.asCodedInt()
#Get kernels
@ -81,7 +84,7 @@ class FORCE (Simulator.BaseSimulator):
},
jit_compile_args={})
self.kernel = module.get_function("FORCEKernel")
self.kernel.prepare("iiffffiPiPiPiPiPiPi")
self.kernel.prepare("iiffffiPiPiPiPiPiPiP")
#Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream,
@ -92,6 +95,8 @@ class FORCE (Simulator.BaseSimulator):
nx, ny,
1, 1,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
self.cfl_data.fill(self.dt, stream=self.stream)
def step(self, dt):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
@ -104,7 +109,8 @@ class FORCE (Simulator.BaseSimulator):
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0])
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata)
self.u0, self.u1 = self.u1, self.u0
self.t += dt
self.nt += 1
@ -115,4 +121,7 @@ class FORCE (Simulator.BaseSimulator):
def check(self):
self.u0.check()
self.u1.check()
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt*0.5*self.cfl_scale
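
The trailing "P" appended to the prepare string corresponds to the new CFL output pointer: in PyCUDA's prepared calls, "i" packs a 32-bit int, "f" a 32-bit float, and "P" a device pointer. A minimal, self-contained sketch of the same pattern with a toy kernel (names and arguments are made up for illustration):

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule

# Toy kernel taking an int, a float, and two device pointers.
module = SourceModule("""
__global__ void scale(int n, float g, float* data, float* cfl) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) {
        cfl[i] = g*data[i];
    }
}
""")
kernel = module.get_function("scale")
kernel.prepare("ifPP")  # int32, float32, pointer, pointer

data = gpuarray.to_gpu(np.ones(32, dtype=np.float32))
cfl = gpuarray.zeros(32, dtype=np.float32)
kernel.prepared_call((1, 1), (32, 1, 1),
                     32, 9.81,
                     data.gpudata, cfl.gpudata)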

View File

@ -24,6 +24,7 @@ from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
import numpy as np
from pycuda import gpuarray
@ -53,6 +54,7 @@ class HLL (Simulator.BaseSimulator):
nx, ny,
dx, dy, dt,
g,
cfl_scale=0.9,
boundary_conditions=BoundaryCondition(),
block_width=16, block_height=16):
@ -62,6 +64,7 @@ class HLL (Simulator.BaseSimulator):
dx, dy, dt,
block_width, block_height);
self.g = np.float32(g)
self.cfl_scale = cfl_scale
self.boundary_conditions = boundary_conditions.asCodedInt()
#Get kernels
@ -76,7 +79,7 @@ class HLL (Simulator.BaseSimulator):
},
jit_compile_args={})
self.kernel = module.get_function("HLLKernel")
self.kernel.prepare("iiffffiPiPiPiPiPiPi")
self.kernel.prepare("iiffffiPiPiPiPiPiPiP")
#Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream,
@ -87,6 +90,8 @@ class HLL (Simulator.BaseSimulator):
nx, ny,
1, 1,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
self.cfl_data.fill(self.dt, stream=self.stream)
def step(self, dt):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
@ -99,10 +104,19 @@ class HLL (Simulator.BaseSimulator):
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0])
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata)
self.u0, self.u1 = self.u1, self.u0
self.t += dt
self.nt += 1
def download(self):
return self.u0.download(self.stream)
return self.u0.download(self.stream)
def check(self):
self.u0.check()
self.u1.check()
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt*0.5*self.cfl_scale

View File

@ -24,6 +24,7 @@ from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
import numpy as np
from pycuda import gpuarray
@ -56,6 +57,7 @@ class HLL2 (Simulator.BaseSimulator):
dx, dy, dt,
g,
theta=1.8,
cfl_scale=0.9,
boundary_conditions=BoundaryCondition(),
block_width=16, block_height=16):
@ -66,6 +68,7 @@ class HLL2 (Simulator.BaseSimulator):
block_width, block_height);
self.g = np.float32(g)
self.theta = np.float32(theta)
self.cfl_scale = cfl_scale
self.boundary_conditions = boundary_conditions.asCodedInt()
#Get kernels
@ -80,7 +83,7 @@ class HLL2 (Simulator.BaseSimulator):
},
jit_compile_args={})
self.kernel = module.get_function("HLL2Kernel")
self.kernel.prepare("iifffffiiPiPiPiPiPiPi")
self.kernel.prepare("iifffffiiPiPiPiPiPiPiP")
#Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream,
@ -91,6 +94,8 @@ class HLL2 (Simulator.BaseSimulator):
nx, ny,
2, 2,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
self.cfl_data.fill(self.dt, stream=self.stream)
def step(self, dt):
self.substepDimsplit(dt*0.5, 0)
@ -112,8 +117,17 @@ class HLL2 (Simulator.BaseSimulator):
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0])
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata)
self.u0, self.u1 = self.u1, self.u0
def download(self):
return self.u0.download(self.stream)
return self.u0.download(self.stream)
def check(self):
self.u0.check()
self.u1.check()
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt*0.5*self.cfl_scale

View File

@ -29,6 +29,7 @@ from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
import numpy as np
from pycuda import gpuarray
@ -57,6 +58,7 @@ class KP07 (Simulator.BaseSimulator):
dx, dy, dt,
g,
theta=1.3,
cfl_scale=0.9,
order=2,
boundary_conditions=BoundaryCondition(),
block_width=16, block_height=16):
@ -67,7 +69,8 @@ class KP07 (Simulator.BaseSimulator):
dx, dy, dt,
block_width, block_height);
self.g = np.float32(g)
self.theta = np.float32(theta)
self.theta = np.float32(theta)
self.cfl_scale = cfl_scale
self.order = np.int32(order)
self.boundary_conditions = boundary_conditions.asCodedInt()
@ -83,7 +86,7 @@ class KP07 (Simulator.BaseSimulator):
},
jit_compile_args={})
self.kernel = module.get_function("KP07Kernel")
self.kernel.prepare("iifffffiiPiPiPiPiPiPi")
self.kernel.prepare("iifffffiiPiPiPiPiPiPiP")
#Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream,
@ -94,6 +97,8 @@ class KP07 (Simulator.BaseSimulator):
nx, ny,
2, 2,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
self.cfl_data.fill(self.dt, stream=self.stream)
def step(self, dt):
@ -121,9 +126,18 @@ class KP07 (Simulator.BaseSimulator):
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0])
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata)
self.u0, self.u1 = self.u1, self.u0
def download(self):
return self.u0.download(self.stream)
return self.u0.download(self.stream)
def check(self):
self.u0.check()
self.u1.check()
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt*0.5**self.order*self.cfl_scale
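
Since ** binds tighter than *, this returns max_dt * 0.5**order * cfl_scale, so the second-order scheme halves the CFL-limited timestep twice. A small worked example with assumed values:

# Worked example of the order-dependent scaling (numbers are illustrative).
max_dt = 0.02      # smallest per-block dt reduced from cfl_data
cfl_scale = 0.9
for order in (1, 2):
    dt = max_dt * 0.5**order * cfl_scale
    print(order, dt)  # order 1 -> 0.009, order 2 -> 0.0045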

View File

@ -29,6 +29,8 @@ from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
import numpy as np
from pycuda import gpuarray
@ -57,6 +59,7 @@ class KP07_dimsplit (Simulator.BaseSimulator):
dx, dy, dt,
g,
theta=1.3,
cfl_scale=0.9,
boundary_conditions=BoundaryCondition(),
block_width=16, block_height=16):
@ -69,6 +72,7 @@ class KP07_dimsplit (Simulator.BaseSimulator):
self.gc_y = 2
self.g = np.float32(g)
self.theta = np.float32(theta)
self.cfl_scale = cfl_scale
self.boundary_conditions = boundary_conditions.asCodedInt()
#Get kernels
@ -83,7 +87,7 @@ class KP07_dimsplit (Simulator.BaseSimulator):
},
jit_compile_args={})
self.kernel = module.get_function("KP07DimsplitKernel")
self.kernel.prepare("iifffffiiPiPiPiPiPiPi")
self.kernel.prepare("iifffffiiPiPiPiPiPiPiP")
#Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream,
@ -94,6 +98,8 @@ class KP07_dimsplit (Simulator.BaseSimulator):
nx, ny,
self.gc_x, self.gc_y,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
self.cfl_data.fill(self.dt, stream=self.stream)
def step(self, dt):
self.substepDimsplit(dt*0.5, 0)
@ -115,7 +121,8 @@ class KP07_dimsplit (Simulator.BaseSimulator):
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0])
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata)
self.u0, self.u1 = self.u1, self.u0
@ -124,4 +131,8 @@ class KP07_dimsplit (Simulator.BaseSimulator):
def check(self):
self.u0.check()
self.u1.check()
self.u1.check()
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt*0.5*self.cfl_scale

View File

@ -25,6 +25,7 @@ from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
import numpy as np
from pycuda import gpuarray
@ -54,6 +55,7 @@ class LxF (Simulator.BaseSimulator):
nx, ny,
dx, dy, dt,
g,
cfl_scale=0.9,
boundary_conditions=BoundaryCondition(),
block_width=16, block_height=16):
@ -63,6 +65,7 @@ class LxF (Simulator.BaseSimulator):
dx, dy, dt,
block_width, block_height);
self.g = np.float32(g)
self.cfl_scale = cfl_scale
self.boundary_conditions = boundary_conditions.asCodedInt()
# Get kernels
@ -77,7 +80,7 @@ class LxF (Simulator.BaseSimulator):
},
jit_compile_args={})
self.kernel = module.get_function("LxFKernel")
self.kernel.prepare("iiffffiPiPiPiPiPiPi")
self.kernel.prepare("iiffffiPiPiPiPiPiPiP")
#Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream,
@ -88,6 +91,8 @@ class LxF (Simulator.BaseSimulator):
nx, ny,
1, 1,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
self.cfl_data.fill(self.dt, stream=self.stream)
def step(self, dt):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
@ -100,10 +105,15 @@ class LxF (Simulator.BaseSimulator):
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0])
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata)
self.u0, self.u1 = self.u1, self.u0
self.t += dt
self.nt += 1
def download(self):
return self.u0.download(self.stream)
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt*0.5*self.cfl_scale

View File

@ -25,6 +25,7 @@ from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
import numpy as np
from pycuda import gpuarray
@ -52,6 +53,7 @@ class WAF (Simulator.BaseSimulator):
nx, ny,
dx, dy, dt,
g,
cfl_scale=0.9,
boundary_conditions=BoundaryCondition(),
block_width=16, block_height=16):
@ -61,6 +63,7 @@ class WAF (Simulator.BaseSimulator):
dx, dy, dt*2,
block_width, block_height);
self.g = np.float32(g)
self.cfl_scale = cfl_scale
self.boundary_conditions = boundary_conditions.asCodedInt()
#Get kernels
@ -86,6 +89,8 @@ class WAF (Simulator.BaseSimulator):
nx, ny,
2, 2,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
self.cfl_data.fill(self.dt, stream=self.stream)
def step(self, dt):
self.substepDimsplit(dt*0.5, substep=0)
@ -109,4 +114,12 @@ class WAF (Simulator.BaseSimulator):
self.u0, self.u1 = self.u1, self.u0
def download(self):
return self.u0.download(self.stream)
return self.u0.download(self.stream)
def check(self):
self.u0.check()
self.u1.check()
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt*0.5*self.cfl_scale

View File

@ -97,7 +97,10 @@ __global__ void FORCEKernel(
//Output h^{n+1}
float* h1_ptr_, int h1_pitch_,
float* hu1_ptr_, int hu1_pitch_,
float* hv1_ptr_, int hv1_pitch_) {
float* hv1_ptr_, int hv1_pitch_,
//Output CFL
float* cfl_) {
const unsigned int w = BLOCK_WIDTH;
const unsigned int h = BLOCK_HEIGHT;
@ -130,6 +133,11 @@ __global__ void FORCEKernel(
writeBlock<w, h, gc_x, gc_y>( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1);
writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1);
writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1);
//Compute the CFL for this block
if (cfl_ != NULL) {
writeCfl<w, h, gc_x, gc_y, vars>(Q, F[0], nx_, ny_, dx_, dy_, g_, cfl_);
}
}
} // extern "C"

View File

@ -113,7 +113,10 @@ __global__ void HLLKernel(
//Output h^{n+1}
float* h1_ptr_, int h1_pitch_,
float* hu1_ptr_, int hu1_pitch_,
float* hv1_ptr_, int hv1_pitch_) {
float* hv1_ptr_, int hv1_pitch_,
//Output CFL
float* cfl_) {
const unsigned int w = BLOCK_WIDTH;
const unsigned int h = BLOCK_HEIGHT;
@ -148,6 +151,11 @@ __global__ void HLLKernel(
writeBlock<w, h, gc_x, gc_y>( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1);
writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1);
writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1);
//Compute the CFL for this block
if (cfl_ != NULL) {
writeCfl<w, h, gc_x, gc_y, vars>(Q, F[0], nx_, ny_, dx_, dy_, g_, cfl_);
}
}
} // extern "C"

View File

@ -141,7 +141,10 @@ __global__ void HLL2Kernel(
//Output h^{n+1}
float* h1_ptr_, int h1_pitch_,
float* hu1_ptr_, int hu1_pitch_,
float* hv1_ptr_, int hv1_pitch_) {
float* hv1_ptr_, int hv1_pitch_,
//Output CFL
float* cfl_) {
const unsigned int w = BLOCK_WIDTH;
const unsigned int h = BLOCK_HEIGHT;
@ -203,6 +206,11 @@ __global__ void HLL2Kernel(
writeBlock<w, h, gc_x, gc_y>( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1);
writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1);
writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1);
//Compute the CFL for this block
if (cfl_ != NULL) {
writeCfl<w, h, gc_x, gc_y, vars>(Q, F[0], nx_, ny_, dx_, dy_, g_, cfl_);
}
}
} // extern "C"

View File

@ -151,8 +151,10 @@ __global__ void KP07Kernel(
//Output h^{n+1}
float* h1_ptr_, int h1_pitch_,
float* hu1_ptr_, int hu1_pitch_,
float* hv1_ptr_, int hv1_pitch_) {
float* hv1_ptr_, int hv1_pitch_,
//Output CFL
float* cfl_) {
const unsigned int w = BLOCK_WIDTH;
const unsigned int h = BLOCK_HEIGHT;
const unsigned int gc_x = 2;
@ -222,5 +224,10 @@ __global__ void KP07Kernel(
hv_row[ti] = hv1;
}
}
//Compute the CFL for this block
if (cfl_ != NULL) {
writeCfl<w, h, gc_x, gc_y, vars>(Q, Q[0], nx_, ny_, dx_, dy_, g_, cfl_);
}
}
} //extern "C"

View File

@ -138,7 +138,10 @@ __global__ void KP07DimsplitKernel(
//Output h^{n+1}
float* h1_ptr_, int h1_pitch_,
float* hu1_ptr_, int hu1_pitch_,
float* hv1_ptr_, int hv1_pitch_) {
float* hv1_ptr_, int hv1_pitch_,
//Output CFL
float* cfl_) {
const unsigned int w = BLOCK_WIDTH;
const unsigned int h = BLOCK_HEIGHT;
const unsigned int gc_x = 2;
@ -194,6 +197,11 @@ __global__ void KP07DimsplitKernel(
writeBlock<w, h, gc_x, gc_y>( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1);
writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1);
writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1);
//Compute the CFL for this block
if (cfl_ != NULL) {
writeCfl<w, h, gc_x, gc_y, vars>(Q, F[0], nx_, ny_, dx_, dy_, g_, cfl_);
}
}

View File

@ -114,7 +114,10 @@ void LxFKernel(
//Output h^{n+1}
float* h1_ptr_, int h1_pitch_,
float* hu1_ptr_, int hu1_pitch_,
float* hv1_ptr_, int hv1_pitch_) {
float* hv1_ptr_, int hv1_pitch_,
//Output CFL
float* cfl_) {
const unsigned int w = BLOCK_WIDTH;
const unsigned int h = BLOCK_HEIGHT;
@ -154,6 +157,11 @@ void LxFKernel(
writeBlock<w, h, gc_x, gc_y>( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1);
writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1);
writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1);
//Compute the CFL for this block
if (cfl_ != NULL) {
writeCfl<w, h, gc_x, gc_y, vars>(Q, Q[0], nx_, ny_, dx_, dy_, g_, cfl_);
}
}
} // extern "C"

View File

@ -468,3 +468,66 @@ __device__ float3 FORCE_1D_flux(const float3 Q_l, const float3 Q_r, const float
const float3 F_lw2 = LxW2_1D_flux(Q_l, Q_r, g_, dx_, dt_);
return 0.5f*(F_lf + F_lw2);
}
template<int w, int h, int gc_x, int gc_y, int vars>
__device__ void writeCfl(float Q[vars][h+2*gc_y][w+2*gc_x],
float shmem[h+2*gc_y][w+2*gc_x],
const int nx_, const int ny_,
const float dx_, const float dy_, const float g_,
float* output_) {
//Index of thread within block
const int tx = threadIdx.x + gc_x;
const int ty = threadIdx.y + gc_y;
//Index of cell within domain
const int ti = blockDim.x*blockIdx.x + tx;
const int tj = blockDim.y*blockIdx.y + ty;
//Only internal cells
if (ti < nx_+gc_x && tj < ny_+gc_y) {
const float h = Q[0][ty][tx];
const float u = Q[1][ty][tx] / h;
const float v = Q[2][ty][tx] / h;
const float max_u = dx_ / (fabsf(u) + sqrtf(g_*h));
const float max_v = dy_ / (fabsf(v) + sqrtf(g_*h));
shmem[ty][tx] = fminf(max_u, max_v);
}
__syncthreads();
//One row of threads reduces each column to its minimum
if (ti < nx_+gc_x && tj < ny_+gc_y) {
if (ty == gc_y) {
float min_val = shmem[ty][tx];
const int max_y = min(h, ny_+gc_y - tj);
for (int j=gc_y; j<max_y+gc_y; j++) {
min_val = fminf(min_val, shmem[j][tx]);
}
shmem[ty][tx] = min_val;
}
}
__syncthreads();
//One thread loops over the first row to find the minimum for this block (the largest admissible dt)
if (tx == gc_x && ty == gc_y) {
float min_val = shmem[ty][tx];
const int max_x = min(w, nx_+gc_x - ti);
for (int i=gc_x; i<max_x+gc_x; ++i) {
min_val = fminf(min_val, shmem[ty][i]);
}
const int idx = gridDim.x*blockIdx.y + blockIdx.x;
output_[idx] = min_val;
}
}
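
The per-cell bound above is the usual shallow-water CFL condition, dt <= min(dx/(|u| + sqrt(g*h)), dy/(|v| + sqrt(g*h))), reduced first per block (output_ holds one value per block, indexed by gridDim.x*blockIdx.y + blockIdx.x) and then globally by gpuarray.min in computeDt. A minimal NumPy sketch of the same per-cell bound, useful as a host-side reference when checking the kernel's cfl_data output (function and array names are assumptions):

import numpy as np

def cfl_dt(h, hu, hv, dx, dy, g=9.81):
    # Host-side reference for the per-cell bound used in writeCfl above.
    u = hu / h
    v = hv / h
    c = np.sqrt(g * h)                   # gravity wave speed
    max_u = dx / (np.abs(u) + c)         # x-direction limit
    max_v = dy / (np.abs(v) + c)         # y-direction limit
    return float(np.min(np.minimum(max_u, max_v)))  # tightest bound over all cells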