Euler appears to work now

2025-11-29 17:28:03 +01:00 · 2018-11-05 16:46:37 +01:00
parent 0671bd747a
commit e38885d39b
13 changed files with 702 additions and 19 deletions
--- a/GPUSimulators/Common.py
+++ b/GPUSimulators/Common.py
@@ -29,16 +29,17 @@ import io
 import hashlib
 import logging
 import gc
+import netCDF4

 import pycuda.compiler as cuda_compiler
 import pycuda.gpuarray
 import pycuda.driver as cuda


-"""
-Class which keeps track of time spent for a section of code
-"""
 class Timer(object):
+    """
+    Class which keeps track of time spent for a section of code
+    """
    def __init__(self, tag, log_level=logging.DEBUG):
        self.tag = tag
        self.log_level = log_level
@@ -63,9 +64,116 @@ class Timer(object):
        


+class DataDumper(object):
+    """
+    Simple class for holding a netCDF4 object
+    (handles opening and closing in a nice way)
+    Use as 
+    with DataDumper("filename") as data:
+        ...
+    """
+    def __init__(self, filename, *args, **kwargs):
+        self.logger = logging.getLogger(__name__)
+        
+        #Create directory if needed
+        dirname = os.path.dirname(filename)
+        if not os.path.isdir(dirname):
+            self.logger.info("Creating directory " + dirname)
+            os.makedirs(dirname)
+        
+        #Get mode of file if we have that
+        mode = None
+        if (args):
+            mode = args[0]
+        elif (kwargs and 'mode' in kwargs.keys()):
+            mode = kwargs['mode']
+            
+        #Create new unique file if writing
+        if (mode):
+            if (("w" in mode) or ("+" in mode) or ("a" in mode)):
+                i = 0
+                stem, ext = os.path.splitext(filename)
+                while (os.path.isfile(filename)):
+                    filename = "{:s}_{:04d}{:s}".format(stem, i, ext)
+                    i = i+1
+        self.filename = os.path.abspath(filename)
+        
+        #Save arguments
+        self.args = args
+        self.kwargs = kwargs
+                
+        #Log output
+        self.logger.info("Writing output to " + self.filename)
+        
+        
+    def __enter__(self):
+        self.logger.info("Opening " + self.filename)
+        if (self.args):
+            self.logger.info("Arguments: " + str(self.args))
+        if (self.kwargs):
+            self.logger.info("Keyword arguments: " + str(self.kwargs))
+        self.ncfile = netCDF4.Dataset(self.filename, *self.args, **self.kwargs)
+        return self
+        
+    def __exit__(self, *args):
+        self.logger.info("Closing " + self.filename)
+        self.ncfile.close()


+        
+        
+class ProgressPrinter(object):
+    """
+    Small helper class for 
+    """
+    def __init__(self, total_steps, print_every=5):
+        self.logger = logging.getLogger(__name__)
+        self.start = time.time()
+        self.total_steps = total_steps
+        self.print_every = print_every
+        self.next_print_time = self.print_every
+        self.last_step = 0
+        self.secs_per_iter = None
+        
+    def getPrintString(self, step):
+        elapsed =  time.time() - self.start
+        if (elapsed > self.next_print_time):            
+            dt = elapsed - (self.next_print_time - self.print_every)
+            dsteps = step - self.last_step
+            steps_remaining = self.total_steps - step
+                        
+            if (dsteps == 0):
+                return
+                
+            self.last_step = step
+            self.next_print_time = elapsed + self.print_every
+            
+            if not self.secs_per_iter:
+                self.secs_per_iter = dt / dsteps
+            self.secs_per_iter = 0.2*self.secs_per_iter + 0.8*(dt / dsteps)
+            
+            remaining_time = steps_remaining * self.secs_per_iter

+            return "{:s}. Total: {:s}, elapsed: {:s}, remaining: {:s}".format(
+                ProgressPrinter.progressBar(step, self.total_steps), 
+                ProgressPrinter.timeString(elapsed + remaining_time), 
+                ProgressPrinter.timeString(elapsed), 
+                ProgressPrinter.timeString(remaining_time))
+
+    def timeString(seconds):
+        seconds = int(max(seconds, 1))
+        minutes, seconds = divmod(seconds, 60)
+        hours, minutes = divmod(minutes, 60)
+        periods = [('h', hours), ('m', minutes), ('s', seconds)]
+        time_string = ' '.join('{}{}'.format(value, name)
+                                for name, value in periods
+                                if value)
+        return time_string
+
+    def progressBar(step, total_steps, width=30):
+        progress = np.round(width * step / total_steps).astype(np.int32)
+        progressbar = "0% [" + "#"*(progress) + "="*(width-progress) + "] 100%"
+        return progressbar



--- a/GPUSimulators/EE2D_KP07_dimsplit.py
+++ b/GPUSimulators/EE2D_KP07_dimsplit.py
@@ -113,6 +113,7 @@ class EE2D_KP07_dimsplit (Simulator.BaseSimulator):
                self.u1[3].data.gpudata, self.u1[3].data.strides[0])
        self.u0, self.u1 = self.u1, self.u0
        self.t += dt
+        self.nt += 1
            
    def stepDimsplitYX(self, dt):
        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
@@ -131,6 +132,7 @@ class EE2D_KP07_dimsplit (Simulator.BaseSimulator):
                self.u1[3].data.gpudata, self.u1[3].data.strides[0])
        self.u0, self.u1 = self.u1, self.u0
        self.t += dt
+        self.nt += 1
        
    def download(self):
        return self.u0.download(self.stream)
--- a/GPUSimulators/FORCE.py
+++ b/GPUSimulators/FORCE.py
@@ -105,6 +105,7 @@ class FORCE (Simulator.BaseSimulator):
                self.u1[2].data.gpudata, self.u1[2].data.strides[0])
        self.u0, self.u1 = self.u1, self.u0
        self.t += dt
+        self.nt += 1
        
    def download(self):
        return self.u0.download(self.stream)
--- a/GPUSimulators/HLL.py
+++ b/GPUSimulators/HLL.py
@@ -100,6 +100,7 @@ class HLL (Simulator.BaseSimulator):
                self.u1[2].data.gpudata, self.u1[2].data.strides[0])
        self.u0, self.u1 = self.u1, self.u0
        self.t += dt
+        self.nt += 1
        
    def download(self):
        return self.u0.download(self.stream)
--- a/GPUSimulators/HLL2.py
+++ b/GPUSimulators/HLL2.py
@@ -110,6 +110,7 @@ class HLL2 (Simulator.BaseSimulator):
                self.u1[2].data.gpudata, self.u1[2].data.strides[0])
        self.u0, self.u1 = self.u1, self.u0
        self.t += dt
+        self.nt += 1
            
    def stepDimsplitYX(self, dt):
        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
@@ -126,6 +127,7 @@ class HLL2 (Simulator.BaseSimulator):
                self.u1[2].data.gpudata, self.u1[2].data.strides[0])
        self.u0, self.u1 = self.u1, self.u0
        self.t += dt
+        self.nt += 1
        
    def download(self):
        return self.u0.download(self.stream)
--- a/GPUSimulators/KP07.py
+++ b/GPUSimulators/KP07.py
@@ -110,6 +110,7 @@ class KP07 (Simulator.BaseSimulator):
    def stepEuler(self, dt):
        self.substepRK(dt, 0)
        self.t += dt
+        self.nt += 1
        
    def stepRK(self, dt, order):
        if (order != 2):
@@ -117,6 +118,7 @@ class KP07 (Simulator.BaseSimulator):
        self.substepRK(dt, 0)
        self.substepRK(dt, 1)
        self.t += dt
+        self.nt += 1
    
    def download(self):
        return self.u0.download(self.stream)
--- a/GPUSimulators/KP07_dimsplit.py
+++ b/GPUSimulators/KP07_dimsplit.py
@@ -110,6 +110,7 @@ class KP07_dimsplit (Simulator.BaseSimulator):
                self.u1[2].data.gpudata, self.u1[2].data.strides[0])
        self.u0, self.u1 = self.u1, self.u0
        self.t += dt
+        self.nt += 1
    
    def stepDimsplitYX(self, dt):
        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
@@ -126,6 +127,7 @@ class KP07_dimsplit (Simulator.BaseSimulator):
                self.u1[2].data.gpudata, self.u1[2].data.strides[0])
        self.u0, self.u1 = self.u1, self.u0
        self.t += dt
+        self.nt += 1
        
    def download(self):
        return self.u0.download(self.stream)
--- a/GPUSimulators/LxF.py
+++ b/GPUSimulators/LxF.py
@@ -101,6 +101,7 @@ class LxF (Simulator.BaseSimulator):
                self.u1[2].data.gpudata, self.u1[2].data.strides[0])
        self.u0, self.u1 = self.u1, self.u0
        self.t += dt
+        self.nt += 1
  
    def download(self):
        return self.u0.download(self.stream)
--- a/GPUSimulators/Simulator.py
+++ b/GPUSimulators/Simulator.py
@@ -80,8 +80,9 @@ class BaseSimulator:
        #Create a CUDA stream
        self.stream = cuda.Stream()
        
-        #Keep track of simulation time
-        self.t = 0.0;
+        #Keep track of simulation time and number of timesteps
+        self.t = 0.0
+        self.nt = 0

        #Log progress every n seconds during simulation
        self.log_every = 5
@@ -209,14 +210,16 @@ class BaseSimulator:

    def download(self):
        raise(NotImplementedError("Needs to be implemented in subclass"))
-
-    def check(self):
-        raise(NotImplementedError("Needs to be implemented in subclass"))
-        
-    def sim_time(self):
-        return self.t
        
    def synchronize(self):
        self.stream.synchronize()

+    def check(self):
+        raise(NotImplementedError("Needs to be implemented in subclass"))
+        
+    def simTime(self):
+        return self.t
+
+    def simSteps(self):
+        return self.nt
        
--- a/GPUSimulators/WAF.py
+++ b/GPUSimulators/WAF.py
@@ -103,6 +103,7 @@ class WAF (Simulator.BaseSimulator):
                self.u1[2].data.gpudata, self.u1[2].data.strides[0])
        self.u0, self.u1 = self.u1, self.u0
        self.t += dt
+        self.nt += 1
        
    def stepDimsplitYX(self, dt):
        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
@@ -118,6 +119,7 @@ class WAF (Simulator.BaseSimulator):
                self.u1[2].data.gpudata, self.u1[2].data.strides[0])
        self.u0, self.u1 = self.u1, self.u0
        self.t += dt
+        self.nt += 1

    def download(self):
        return self.u0.download(self.stream)
--- a/GPUSimulators/cuda/EE2D_KP07_dimsplit.cu
+++ b/GPUSimulators/cuda/EE2D_KP07_dimsplit.cu
@@ -154,10 +154,10 @@ __global__ void KP07DimsplitKernel(
    
    
    //Read into shared memory
-    readBlock<w, h, gc>(  rho0_ptr_,   rho0_pitch_, Q[0], nx_+4, ny_+4);
-    readBlock<w, h, gc>(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_+4, ny_+4);
-    readBlock<w, h, gc>(rho_v0_ptr_, rho_v0_pitch_, Q[2], nx_+4, ny_+4);
-    readBlock<w, h, gc>(    E0_ptr_,     E0_pitch_, Q[3], nx_+4, ny_+4);
+    readBlock<w, h, gc>(  rho0_ptr_,   rho0_pitch_, Q[0], nx_, ny_);
+    readBlock<w, h, gc>(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_, ny_);
+    readBlock<w, h, gc>(rho_v0_ptr_, rho_v0_pitch_, Q[2], nx_, ny_);
+    readBlock<w, h, gc>(    E0_ptr_,     E0_pitch_, Q[3], nx_, ny_);
    __syncthreads();
    
    //Fix boundary conditions
@@ -226,6 +226,26 @@ __global__ void KP07DimsplitKernel(

        evolveF<w, h, gc, vars>(Q, F, dx_, dt_);
        __syncthreads();
+        
+        //This is the RK2-part
+        const int tx = threadIdx.x + gc;
+        const int ty = threadIdx.y + gc;
+        const float q1 = Q[0][ty][tx];
+        const float q2 = Q[1][ty][tx];
+        const float q3 = Q[2][ty][tx];
+        const float q4 = Q[3][ty][tx];
+        __syncthreads();
+        
+        readBlock<w, h, gc>(  rho1_ptr_,   rho1_pitch_, Q[0], nx_, ny_);
+        readBlock<w, h, gc>(rho_u1_ptr_, rho_u1_pitch_, Q[1], nx_, ny_);
+        readBlock<w, h, gc>(rho_v1_ptr_, rho_v1_pitch_, Q[2], nx_, ny_);
+        readBlock<w, h, gc>(    E1_ptr_,     E1_pitch_, Q[3], nx_, ny_);
+        __syncthreads();
+        
+        Q[0][ty][tx] = 0.5f*( Q[0][ty][tx] + q1 );
+        Q[1][ty][tx] = 0.5f*( Q[1][ty][tx] + q2 );
+        Q[2][ty][tx] = 0.5f*( Q[2][ty][tx] + q3 );
+        Q[3][ty][tx] = 0.5f*( Q[3][ty][tx] + q4 );
    }

    
--- a/GPUSimulators/cuda/common.h
+++ b/GPUSimulators/cuda/common.h
@@ -92,19 +92,29 @@ __device__ float desingularize(float x_, float eps_) {
 template<int block_width, int block_height, int ghost_cells>
 inline __device__ void readBlock(float* ptr_, int pitch_,
                float shmem[block_height+2*ghost_cells][block_width+2*ghost_cells], 
-                const int max_x_, const int max_y_) {
+                const int nx_, const int ny_) {
    //Index of block within domain
    const int bx = blockDim.x * blockIdx.x;
    const int by = blockDim.y * blockIdx.y;
-        
+
+    const int gc_pad = 4;
+    
    //Read into shared memory
    //Loop over all variables
    for (int j=threadIdx.y; j<block_height+2*ghost_cells; j+=block_height) {
-        const int l = min(by + j, max_y_-1);
+        //const int l = min(by + j, ny_+2*ghost_cells-1);
+        const int y = by + j;
+        const int y_offset = ( (int) (y < gc_pad) - (int) (y >= ny_+2*ghost_cells-gc_pad) ) * (ny_+2*ghost_cells - 2*gc_pad); 
+        const int l = y + y_offset;
        float* row = (float*) ((char*) ptr_  + pitch_*l);
        
        for (int i=threadIdx.x; i<block_width+2*ghost_cells; i+=block_width) {
-            const int k = min(bx + i, max_x_-1);
+            //const int k = min(bx + i, nx_+2*ghost_cells-1);
+            
+            const int x = bx + i;
+            const int gc_pad = 4;
+            const int x_offset = ( (int) (x < gc_pad) - (int) (x >= nx_+2*ghost_cells-gc_pad) ) * (nx_+2*ghost_cells - 2*gc_pad); 
+            const int k = x + x_offset;
            
            shmem[j][i] = row[k];
        }
@@ -248,6 +258,16 @@ __device__ void noFlowBoundary(float Q[block_height+2*ghost_cells][block_width+2



+
+
+
+
+
+
+
+
+
+


 template<int block_width, int block_height, int ghost_cells, int vars>