From a1902a13305b1b1a0308dc800c5c3bcf9c7aca1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20R=2E=20Brodtkorb?= Date: Fri, 24 Aug 2018 15:44:51 +0200 Subject: [PATCH] Updated writeblock also --- GPUSimulators/FORCE_kernel.cu | 14 ++--- GPUSimulators/HLL2_kernel.cu | 6 +- GPUSimulators/HLL_kernel.cu | 26 +++------ GPUSimulators/KP07_dimsplit_kernel.cu | 6 +- GPUSimulators/KP07_kernel.cu | 6 +- GPUSimulators/LxF_kernel.cu | 64 ++++++++------------- GPUSimulators/common.cu | 83 +++++++-------------------- 7 files changed, 69 insertions(+), 136 deletions(-) diff --git a/GPUSimulators/FORCE_kernel.cu b/GPUSimulators/FORCE_kernel.cu index f3212e4..017a05c 100644 --- a/GPUSimulators/FORCE_kernel.cu +++ b/GPUSimulators/FORCE_kernel.cu @@ -120,11 +120,10 @@ __global__ void FORCEKernel( //Read into shared memory - readBlock(h0_ptr_, h0_pitch_, Q[0], nx_+1, ny_+1); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+1, ny_+1); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+1, ny_+1); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_+2, ny_+2); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2); __syncthreads(); - //Set boundary conditions noFlowBoundary1(Q, nx_, ny_); @@ -147,10 +146,9 @@ __global__ void FORCEKernel( __syncthreads(); //Write to main memory - writeBlock1(h1_ptr_, h1_pitch_, - hu1_ptr_, hu1_pitch_, - hv1_ptr_, hv1_pitch_, - Q, nx_, ny_); + writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_); + writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_); + writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_); } } // extern "C" \ No newline at end of file diff --git a/GPUSimulators/HLL2_kernel.cu b/GPUSimulators/HLL2_kernel.cu index 4439d49..a5d3156 100644 --- a/GPUSimulators/HLL2_kernel.cu +++ b/GPUSimulators/HLL2_kernel.cu @@ -166,9 +166,9 @@ __global__ void HLL2Kernel( //Read into shared memory - readBlock(h0_ptr_, h0_pitch_, Q[0], nx_+3, ny_+3); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+3, ny_+3); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+3, ny_+3); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_+4, ny_+4); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+4, ny_+4); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+4, ny_+4); __syncthreads(); //Set boundary conditions diff --git a/GPUSimulators/HLL_kernel.cu b/GPUSimulators/HLL_kernel.cu index 0cc483f..88fffd5 100644 --- a/GPUSimulators/HLL_kernel.cu +++ b/GPUSimulators/HLL_kernel.cu @@ -121,21 +121,19 @@ __global__ void HLLKernel( float* h1_ptr_, int h1_pitch_, float* hu1_ptr_, int hu1_pitch_, float* hv1_ptr_, int hv1_pitch_) { - - const int block_width = BLOCK_WIDTH; - const int block_height = BLOCK_HEIGHT; //Shared memory variables - __shared__ float Q[3][block_height+2][block_width+2]; - __shared__ float F[3][block_height+1][block_width+1]; + __shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2]; + __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1]; //Read into shared memory - readBlock(h0_ptr_, h0_pitch_, Q[0], nx_+1, ny_+1); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+1, ny_+1); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+1, ny_+1); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_+2, ny_+2); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2); __syncthreads(); + //Set boundary conditions noFlowBoundary1(Q, nx_, ny_); __syncthreads(); @@ -155,16 +153,10 @@ __global__ void HLLKernel( evolveG1(Q, F, nx_, ny_, dy_, dt_); __syncthreads(); - - //Q[0][threadIdx.y + 1][threadIdx.x + 1] += 0.1; - - - // Write to main memory for all internal cells - writeBlock1(h1_ptr_, h1_pitch_, - hu1_ptr_, hu1_pitch_, - hv1_ptr_, hv1_pitch_, - Q, nx_, ny_); + writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_); + writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_); + writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_); } } // extern "C" \ No newline at end of file diff --git a/GPUSimulators/KP07_dimsplit_kernel.cu b/GPUSimulators/KP07_dimsplit_kernel.cu index 5eb8165..4ab3c78 100644 --- a/GPUSimulators/KP07_dimsplit_kernel.cu +++ b/GPUSimulators/KP07_dimsplit_kernel.cu @@ -157,9 +157,9 @@ __global__ void KP07DimsplitKernel( //Read into shared memory - readBlock(h0_ptr_, h0_pitch_, Q[0], nx_+3, ny_+3); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+3, ny_+3); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+3, ny_+3); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_+4, ny_+4); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+4, ny_+4); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+4, ny_+4); __syncthreads(); diff --git a/GPUSimulators/KP07_kernel.cu b/GPUSimulators/KP07_kernel.cu index 096c535..8a22a55 100644 --- a/GPUSimulators/KP07_kernel.cu +++ b/GPUSimulators/KP07_kernel.cu @@ -141,9 +141,9 @@ __global__ void KP07Kernel( //Read into shared memory - readBlock(h0_ptr_, h0_pitch_, Q[0], nx_+3, ny_+3); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+3, ny_+3); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+3, ny_+3); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_+4, ny_+4); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+4, ny_+4); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+4, ny_+4); __syncthreads(); diff --git a/GPUSimulators/LxF_kernel.cu b/GPUSimulators/LxF_kernel.cu index 427a521..63dc0f8 100644 --- a/GPUSimulators/LxF_kernel.cu +++ b/GPUSimulators/LxF_kernel.cu @@ -115,58 +115,42 @@ void LxFKernel( float* hu1_ptr_, int hu1_pitch_, float* hv1_ptr_, int hv1_pitch_) { - const int block_width = BLOCK_WIDTH; - const int block_height = BLOCK_HEIGHT; - - //Index of cell within domain - const int ti = blockDim.x*blockIdx.x + threadIdx.x + 1; //Skip global ghost cells, i.e., +1 - const int tj = blockDim.y*blockIdx.y + threadIdx.y + 1; + const int tx = threadIdx.x; + const int ty = threadIdx.y; - __shared__ float Q[3][block_height+2][block_width+2]; - __shared__ float F[3][block_height][block_width+1]; - __shared__ float G[3][block_height+1][block_width]; + __shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2]; + __shared__ float F[3][BLOCK_HEIGHT][BLOCK_WIDTH+1]; + __shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH]; - //Read into shared memory - readBlock(h0_ptr_, h0_pitch_, Q[0], nx_+1, ny_+1); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+1, ny_+1); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+1, ny_+1); + //Read into shared memory including ghost cells + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_+2, ny_+2); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2); __syncthreads(); //Set boundary conditions noFlowBoundary1(Q, nx_, ny_); __syncthreads(); - //Compute fluxes along the x and y axis - computeFluxF(Q, F, g_, dx_, dt_); - computeFluxG(Q, G, g_, dy_, dt_); + computeFluxF(Q, F, g_, dx_, dt_); + computeFluxG(Q, G, g_, dy_, dt_); __syncthreads(); - - //Evolve for all internal cells - if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) { - //Index of thread within block - const int tx = threadIdx.x; - const int ty = threadIdx.y; - - const int i = tx + 1; //Skip local ghost cells, i.e., +1 - const int j = ty + 1; - - const float h1 = Q[0][j][i] + (F[0][ty][tx] - F[0][ty ][tx+1]) * dt_ / dx_ - + (G[0][ty][tx] - G[0][ty+1][tx ]) * dt_ / dy_; - const float hu1 = Q[1][j][i] + (F[1][ty][tx] - F[1][ty ][tx+1]) * dt_ / dx_ - + (G[1][ty][tx] - G[1][ty+1][tx ]) * dt_ / dy_; - const float hv1 = Q[2][j][i] + (F[2][ty][tx] - F[2][ty ][tx+1]) * dt_ / dx_ - + (G[2][ty][tx] - G[2][ty+1][tx ]) * dt_ / dy_; + //Evolve for all cells + const int i = tx + 1; //Skip local ghost cells, i.e., +1 + const int j = ty + 1; + Q[0][j][i] += (F[0][ty][tx] - F[0][ty ][tx+1]) * dt_ / dx_ + + (G[0][ty][tx] - G[0][ty+1][tx ]) * dt_ / dy_; + Q[1][j][i] += (F[1][ty][tx] - F[1][ty ][tx+1]) * dt_ / dx_ + + (G[1][ty][tx] - G[1][ty+1][tx ]) * dt_ / dy_; + Q[2][j][i] += (F[2][ty][tx] - F[2][ty ][tx+1]) * dt_ / dx_ + + (G[2][ty][tx] - G[2][ty+1][tx ]) * dt_ / dy_; - float* const h_row = (float*) ((char*) h1_ptr_ + h1_pitch_*tj); - float* const hu_row = (float*) ((char*) hu1_ptr_ + hu1_pitch_*tj); - float* const hv_row = (float*) ((char*) hv1_ptr_ + hv1_pitch_*tj); - - h_row[ti] = h1; - hu_row[ti] = hu1; - hv_row[ti] = hv1; - } + //Write to main memory + writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_); + writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_); + writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_); } } // extern "C" diff --git a/GPUSimulators/common.cu b/GPUSimulators/common.cu index 54dce8d..ce96dff 100644 --- a/GPUSimulators/common.cu +++ b/GPUSimulators/common.cu @@ -56,7 +56,7 @@ inline __device__ __host__ float clamp(const float f, const float a, const float template __device__ void readBlock(float* ptr_, int pitch_, float shmem[sm_height][sm_width], - const int max_x, const int max_y) { + const int max_x_, const int max_y_) { //Index of block within domain const int bx = blockDim.x * blockIdx.x; @@ -64,13 +64,13 @@ __device__ void readBlock(float* ptr_, int pitch_, //Read into shared memory for (int j=threadIdx.y; j(h_ptr_, h_pitch_, Q[0], nx_+3, ny_+3); - readBlock(hu_ptr_, hu_pitch_, Q[1], nx_+3, ny_+3); - readBlock(hv_ptr_, hv_pitch_, Q[2], nx_+3, ny_+3); -} - - - - - - /** * Writes a block of data to global memory for the shallow water equations. */ -__device__ void writeBlock1(float* h_ptr_, int h_pitch_, - float* hu_ptr_, int hu_pitch_, - float* hv_ptr_, int hv_pitch_, - float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], - const int nx_, const int ny_) { - //Index of thread within block - const int tx = threadIdx.x; - const int ty = threadIdx.y; +template +__device__ void writeBlock(float* ptr_, int pitch_, + float shmem[sm_height][sm_width], + const int width, const int height) { //Index of cell within domain - const int ti = blockDim.x*blockIdx.x + threadIdx.x + 1; //Skip global ghost cells, i.e., +1 - const int tj = blockDim.y*blockIdx.y + threadIdx.y + 1; + const int ti = blockDim.x*blockIdx.x + threadIdx.x + offset_x; + const int tj = blockDim.y*blockIdx.y + threadIdx.y + offset_y; //Only write internal cells - if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) { - const int i = tx + 1; //Skip local ghost cells, i.e., +1 - const int j = ty + 1; - - float* const h_row = (float*) ((char*) h_ptr_ + h_pitch_*tj); - float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*tj); - float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*tj); + if (ti < width+offset_x && tj < height+offset_y) { + //Index of thread within block + const int tx = threadIdx.x + offset_x; + const int ty = threadIdx.y + offset_y; - h_row[ti] = Q[0][j][i]; - hu_row[ti] = Q[1][j][i]; - hv_row[ti] = Q[2][j][i]; + float* const row = (float*) ((char*) ptr_ + pitch_*tj); + row[ti] = shmem[ty][tx]; } } @@ -134,6 +108,9 @@ __device__ void writeBlock1(float* h_ptr_, int h_pitch_, + + + /** * Writes a block of data to global memory for the shallow water equations. */ @@ -142,27 +119,9 @@ __device__ void writeBlock2(float* h_ptr_, int h_pitch_, float* hv_ptr_, int hv_pitch_, float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const int nx_, const int ny_) { - //Index of thread within block - const int tx = threadIdx.x; - const int ty = threadIdx.y; - - //Index of cell within domain - const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2 - const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2; - - //Only write internal cells - if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) { - const int i = tx + 2; //Skip local ghost cells, i.e., +2 - const int j = ty + 2; - - float* const h_row = (float*) ((char*) h_ptr_ + h_pitch_*tj); - float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*tj); - float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*tj); - - h_row[ti] = Q[0][j][i]; - hu_row[ti] = Q[1][j][i]; - hv_row[ti] = Q[2][j][i]; - } + writeBlock( h_ptr_, h_pitch_, Q[0], nx_, ny_); + writeBlock(hu_ptr_, hu_pitch_, Q[1], nx_, ny_); + writeBlock(hv_ptr_, hv_pitch_, Q[2], nx_, ny_); }