Added more efficient function to read data from global memory!

2025-05-18 14:34:13 +02:00 · 2018-09-04 15:12:40 +02:00 · 2018-09-04 15:12:40 +02:00 · 58f281d724
commit 58f281d724
parent a1902a1330
8 changed files with 64 additions and 28 deletions
--- a/GPUSimulators/FORCE_kernel.cu
+++ b/GPUSimulators/FORCE_kernel.cu
@ -120,9 +120,9 @@ __global__ void FORCEKernel(
    //Read into shared memory
-    readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>( h0_ptr_,  h0_pitch_, Q[0], nx_+2, ny_+2);
+    float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
-    readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2);
+    int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
-    readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2);
+    readBlock<3, BLOCK_WIDTH+2, BLOCK_HEIGHT+2, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+2, ny_+2);
    __syncthreads();
    //Set boundary conditions
--- a/GPUSimulators/HLL2_kernel.cu
+++ b/GPUSimulators/HLL2_kernel.cu
@ -166,9 +166,9 @@ __global__ void HLL2Kernel(
    //Read into shared memory
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>( h0_ptr_,  h0_pitch_, Q[0], nx_+4, ny_+4);
+    float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hu0_ptr_, hu0_pitch_, Q[1], nx_+4, ny_+4);
+    int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hv0_ptr_, hv0_pitch_, Q[2], nx_+4, ny_+4);
+    readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4);
    __syncthreads();
    //Set boundary conditions
--- a/GPUSimulators/HLL_kernel.cu
+++ b/GPUSimulators/HLL_kernel.cu
@ -128,9 +128,9 @@ __global__ void HLLKernel(
    //Read into shared memory
-    readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>( h0_ptr_,  h0_pitch_, Q[0], nx_+2, ny_+2);
+    float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
-    readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2);
+    int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
-    readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2);
+    readBlock<3, BLOCK_WIDTH+2, BLOCK_HEIGHT+2, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+2, ny_+2);
    __syncthreads();
    //Set boundary conditions
--- a/GPUSimulators/KP07_dimsplit_kernel.cu
+++ b/GPUSimulators/KP07_dimsplit_kernel.cu
@ -157,9 +157,9 @@ __global__ void KP07DimsplitKernel(
    //Read into shared memory
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>( h0_ptr_,  h0_pitch_, Q[0], nx_+4, ny_+4);
+    float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hu0_ptr_, hu0_pitch_, Q[1], nx_+4, ny_+4);
+    int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hv0_ptr_, hv0_pitch_, Q[2], nx_+4, ny_+4);
+    readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4);
    __syncthreads();
--- a/GPUSimulators/KP07_kernel.cu
+++ b/GPUSimulators/KP07_kernel.cu
@ -141,9 +141,9 @@ __global__ void KP07Kernel(
    //Read into shared memory
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>( h0_ptr_,  h0_pitch_, Q[0], nx_+4, ny_+4);
+    float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hu0_ptr_, hu0_pitch_, Q[1], nx_+4, ny_+4);
+    int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hv0_ptr_, hv0_pitch_, Q[2], nx_+4, ny_+4);
+    readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4);
    __syncthreads();
--- a/GPUSimulators/LxF_kernel.cu
+++ b/GPUSimulators/LxF_kernel.cu
@ -98,6 +98,7 @@ void computeFluxG(float Q[3][block_height+2][block_width+2],
 }
 extern "C" {
 __global__ 
 void LxFKernel(
@ -122,10 +123,10 @@ void LxFKernel(
    __shared__ float F[3][BLOCK_HEIGHT][BLOCK_WIDTH+1];
    __shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH];
-    //Read into shared memory including ghost cells
+    float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
-    readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>( h0_ptr_,  h0_pitch_, Q[0], nx_+2, ny_+2);
+    int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
-    readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2);
+    
-    readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2);
+    readBlock<3, BLOCK_WIDTH+2, BLOCK_HEIGHT+2, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+2, ny_+2);
    __syncthreads();
    //Set boundary conditions
@ -137,6 +138,7 @@ void LxFKernel(
    computeFluxG<BLOCK_WIDTH, BLOCK_HEIGHT>(Q, G, g_, dy_, dt_);
    __syncthreads();
    //Evolve for all cells
    const int i = tx + 1; //Skip local ghost cells, i.e., +1
    const int j = ty + 1;
--- a/GPUSimulators/WAF_kernel.cu
+++ b/GPUSimulators/WAF_kernel.cu
@ -138,9 +138,9 @@ __global__ void WAFKernel(
    //Read into shared memory Q from global memory
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(h0_ptr_, h0_pitch_, Q[0], nx_+3, ny_+3);
+    float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hu0_ptr_, hu0_pitch_, Q[1], nx_+3, ny_+3);
+    int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
-    readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hv0_ptr_, hv0_pitch_, Q[2], nx_+3, ny_+3);
+    readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4);
    __syncthreads();
--- a/GPUSimulators/common.cu
+++ b/GPUSimulators/common.cu
@ -53,9 +53,9 @@ inline __device__ __host__ float clamp(const float f, const float a, const float
 /**
  * Reads a block of data  with one ghost cell for the shallow water equations
  */
-template<int sm_width, int sm_height>
+template<int sm_width, int sm_height, int block_width, int block_height>
-__device__ void readBlock(float* ptr_, int pitch_,
+inline __device__ void readBlock(float* ptr_, int pitch_,
-                float shmem[sm_height][sm_width], 
+                float (&shmem)[sm_height][sm_width], 
                const int max_x_, const int max_y_) {
    //Index of block within domain
@ -63,13 +63,13 @@ __device__ void readBlock(float* ptr_, int pitch_,
    const int by = blockDim.y * blockIdx.y;
    //Read into shared memory
-    for (int j=threadIdx.y; j<sm_height; j+=blockDim.y) {
+    for (int j=threadIdx.y; j<sm_height; j+=block_height) {
        const int l = clamp(by + j, 0, max_y_-1); // Clamp out of bounds
        //Compute the pointer to current row in the arrays
        const float* const row  = (float*) ((char*) ptr_  + pitch_*l);
-        for (int i=threadIdx.x; i<sm_width; i+=blockDim.x) {
+        for (int i=threadIdx.x; i<sm_width; i+=block_width) {
            const int k = clamp(bx + i, 0, max_x_-1); // Clamp out of bounds
            shmem[j][i] = row[k];
@ -78,6 +78,40 @@ __device__ void readBlock(float* ptr_, int pitch_,
 }
 /**
  * Reads a block of data with ghost cells
  */
 template<int vars, int sm_width, int sm_height, int block_width, int block_height>
 inline __device__ void readBlock(float* ptr_[vars], int pitch_[vars],
                float shmem[vars][sm_height][sm_width], 
                const int max_x_, const int max_y_) {
    //Index of block within domain
    const int bx = blockDim.x * blockIdx.x;
    const int by = blockDim.y * blockIdx.y;
    float* rows[3];
    //Read into shared memory
    for (int j=threadIdx.y; j<sm_height; j+=block_height) {
        const int l = clamp(by + j, 0, max_y_-1); // Clamp out of bounds
        //Compute the pointer to current row in the arrays
        for (int m=0; m<vars; ++m) {
            rows[m] = (float*) ((char*) ptr_[m]  + pitch_[m]*l);
        }
        for (int i=threadIdx.x; i<sm_width; i+=block_width) {
            const int k = clamp(bx + i, 0, max_x_-1); // Clamp out of bounds
            for (int m=0; m<vars; ++m) {
                shmem[m][j][i] = rows[m][k];
            }
        }
    }
 }
@ -85,7 +119,7 @@ __device__ void readBlock(float* ptr_, int pitch_,
  * Writes a block of data to global memory for the shallow water equations.
  */
 template<int sm_width, int sm_height, int offset_x=0, int offset_y=0>
-__device__ void writeBlock(float* ptr_, int pitch_,
+inline __device__ void writeBlock(float* ptr_, int pitch_,
                 float shmem[sm_height][sm_width],
                 const int width, const int height) {