Euler appears to work now

2025-11-29 17:28:03 +01:00 · 2018-11-05 16:46:37 +01:00
parent 0671bd747a
commit e38885d39b
13 changed files with 702 additions and 19 deletions
--- a/GPUSimulators/cuda/EE2D_KP07_dimsplit.cu
+++ b/GPUSimulators/cuda/EE2D_KP07_dimsplit.cu
@@ -154,10 +154,10 @@ __global__ void KP07DimsplitKernel(
    
    
    //Read into shared memory
-    readBlock<w, h, gc>(  rho0_ptr_,   rho0_pitch_, Q[0], nx_+4, ny_+4);
-    readBlock<w, h, gc>(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_+4, ny_+4);
-    readBlock<w, h, gc>(rho_v0_ptr_, rho_v0_pitch_, Q[2], nx_+4, ny_+4);
-    readBlock<w, h, gc>(    E0_ptr_,     E0_pitch_, Q[3], nx_+4, ny_+4);
+    readBlock<w, h, gc>(  rho0_ptr_,   rho0_pitch_, Q[0], nx_, ny_);
+    readBlock<w, h, gc>(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_, ny_);
+    readBlock<w, h, gc>(rho_v0_ptr_, rho_v0_pitch_, Q[2], nx_, ny_);
+    readBlock<w, h, gc>(    E0_ptr_,     E0_pitch_, Q[3], nx_, ny_);
    __syncthreads();
    
    //Fix boundary conditions
@@ -226,6 +226,26 @@ __global__ void KP07DimsplitKernel(

        evolveF<w, h, gc, vars>(Q, F, dx_, dt_);
        __syncthreads();
+        
+        //This is the RK2-part
+        const int tx = threadIdx.x + gc;
+        const int ty = threadIdx.y + gc;
+        const float q1 = Q[0][ty][tx];
+        const float q2 = Q[1][ty][tx];
+        const float q3 = Q[2][ty][tx];
+        const float q4 = Q[3][ty][tx];
+        __syncthreads();
+        
+        readBlock<w, h, gc>(  rho1_ptr_,   rho1_pitch_, Q[0], nx_, ny_);
+        readBlock<w, h, gc>(rho_u1_ptr_, rho_u1_pitch_, Q[1], nx_, ny_);
+        readBlock<w, h, gc>(rho_v1_ptr_, rho_v1_pitch_, Q[2], nx_, ny_);
+        readBlock<w, h, gc>(    E1_ptr_,     E1_pitch_, Q[3], nx_, ny_);
+        __syncthreads();
+        
+        Q[0][ty][tx] = 0.5f*( Q[0][ty][tx] + q1 );
+        Q[1][ty][tx] = 0.5f*( Q[1][ty][tx] + q2 );
+        Q[2][ty][tx] = 0.5f*( Q[2][ty][tx] + q3 );
+        Q[3][ty][tx] = 0.5f*( Q[3][ty][tx] + q4 );
    }

    
--- a/GPUSimulators/cuda/common.h
+++ b/GPUSimulators/cuda/common.h
@@ -92,19 +92,29 @@ __device__ float desingularize(float x_, float eps_) {
 template<int block_width, int block_height, int ghost_cells>
 inline __device__ void readBlock(float* ptr_, int pitch_,
                float shmem[block_height+2*ghost_cells][block_width+2*ghost_cells], 
-                const int max_x_, const int max_y_) {
+                const int nx_, const int ny_) {
    //Index of block within domain
    const int bx = blockDim.x * blockIdx.x;
    const int by = blockDim.y * blockIdx.y;
-        
+
+    const int gc_pad = 4;
+    
    //Read into shared memory
    //Loop over all variables
    for (int j=threadIdx.y; j<block_height+2*ghost_cells; j+=block_height) {
-        const int l = min(by + j, max_y_-1);
+        //const int l = min(by + j, ny_+2*ghost_cells-1);
+        const int y = by + j;
+        const int y_offset = ( (int) (y < gc_pad) - (int) (y >= ny_+2*ghost_cells-gc_pad) ) * (ny_+2*ghost_cells - 2*gc_pad); 
+        const int l = y + y_offset;
        float* row = (float*) ((char*) ptr_  + pitch_*l);
        
        for (int i=threadIdx.x; i<block_width+2*ghost_cells; i+=block_width) {
-            const int k = min(bx + i, max_x_-1);
+            //const int k = min(bx + i, nx_+2*ghost_cells-1);
+            
+            const int x = bx + i;
+            const int gc_pad = 4;
+            const int x_offset = ( (int) (x < gc_pad) - (int) (x >= nx_+2*ghost_cells-gc_pad) ) * (nx_+2*ghost_cells - 2*gc_pad); 
+            const int k = x + x_offset;
            
            shmem[j][i] = row[k];
        }
@@ -248,6 +258,16 @@ __device__ void noFlowBoundary(float Q[block_height+2*ghost_cells][block_width+2



+
+
+
+
+
+
+
+
+
+


 template<int block_width, int block_height, int ghost_cells, int vars>