diff --git a/GPUSimulators/FORCE_kernel.cu b/GPUSimulators/FORCE_kernel.cu index 419c665..f3212e4 100644 --- a/GPUSimulators/FORCE_kernel.cu +++ b/GPUSimulators/FORCE_kernel.cu @@ -120,10 +120,9 @@ __global__ void FORCEKernel( //Read into shared memory - readBlock1(h0_ptr_, h0_pitch_, - hu0_ptr_, hu0_pitch_, - hv0_ptr_, hv0_pitch_, - Q, nx_, ny_); + readBlock(h0_ptr_, h0_pitch_, Q[0], nx_+1, ny_+1); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+1, ny_+1); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+1, ny_+1); __syncthreads(); diff --git a/GPUSimulators/HLL2_kernel.cu b/GPUSimulators/HLL2_kernel.cu index bdaf0f8..4439d49 100644 --- a/GPUSimulators/HLL2_kernel.cu +++ b/GPUSimulators/HLL2_kernel.cu @@ -166,10 +166,9 @@ __global__ void HLL2Kernel( //Read into shared memory - readBlock2(h0_ptr_, h0_pitch_, - hu0_ptr_, hu0_pitch_, - hv0_ptr_, hv0_pitch_, - Q, nx_, ny_); + readBlock(h0_ptr_, h0_pitch_, Q[0], nx_+3, ny_+3); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+3, ny_+3); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+3, ny_+3); __syncthreads(); //Set boundary conditions diff --git a/GPUSimulators/HLL_kernel.cu b/GPUSimulators/HLL_kernel.cu index 42d34e1..0cc483f 100644 --- a/GPUSimulators/HLL_kernel.cu +++ b/GPUSimulators/HLL_kernel.cu @@ -131,10 +131,9 @@ __global__ void HLLKernel( //Read into shared memory - readBlock1(h0_ptr_, h0_pitch_, - hu0_ptr_, hu0_pitch_, - hv0_ptr_, hv0_pitch_, - Q, nx_, ny_); + readBlock(h0_ptr_, h0_pitch_, Q[0], nx_+1, ny_+1); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+1, ny_+1); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+1, ny_+1); __syncthreads(); noFlowBoundary1(Q, nx_, ny_); diff --git a/GPUSimulators/KP07_dimsplit_kernel.cu b/GPUSimulators/KP07_dimsplit_kernel.cu index 1dd31d0..5eb8165 100644 --- a/GPUSimulators/KP07_dimsplit_kernel.cu +++ b/GPUSimulators/KP07_dimsplit_kernel.cu @@ -157,10 +157,9 @@ __global__ void KP07DimsplitKernel( //Read into shared memory - readBlock2(h0_ptr_, h0_pitch_, - hu0_ptr_, hu0_pitch_, - hv0_ptr_, hv0_pitch_, - Q, nx_, ny_); + readBlock(h0_ptr_, h0_pitch_, Q[0], nx_+3, ny_+3); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+3, ny_+3); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+3, ny_+3); __syncthreads(); diff --git a/GPUSimulators/KP07_kernel.cu b/GPUSimulators/KP07_kernel.cu index 250a9a8..096c535 100644 --- a/GPUSimulators/KP07_kernel.cu +++ b/GPUSimulators/KP07_kernel.cu @@ -141,10 +141,9 @@ __global__ void KP07Kernel( //Read into shared memory - readBlock2(h0_ptr_, h0_pitch_, - hu0_ptr_, hu0_pitch_, - hv0_ptr_, hv0_pitch_, - Q, nx_, ny_); + readBlock(h0_ptr_, h0_pitch_, Q[0], nx_+3, ny_+3); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+3, ny_+3); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+3, ny_+3); __syncthreads(); diff --git a/GPUSimulators/LxF_kernel.cu b/GPUSimulators/LxF_kernel.cu index c6ceeec..427a521 100644 --- a/GPUSimulators/LxF_kernel.cu +++ b/GPUSimulators/LxF_kernel.cu @@ -127,10 +127,9 @@ void LxFKernel( __shared__ float G[3][block_height+1][block_width]; //Read into shared memory - readBlock1(h0_ptr_, h0_pitch_, - hu0_ptr_, hu0_pitch_, - hv0_ptr_, hv0_pitch_, - Q, nx_, ny_); + readBlock(h0_ptr_, h0_pitch_, Q[0], nx_+1, ny_+1); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+1, ny_+1); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+1, ny_+1); __syncthreads(); //Set boundary conditions diff --git a/GPUSimulators/WAF_kernel.cu b/GPUSimulators/WAF_kernel.cu index 3c971ae..20c42da 100644 --- a/GPUSimulators/WAF_kernel.cu +++ b/GPUSimulators/WAF_kernel.cu @@ -138,10 +138,9 @@ __global__ void WAFKernel( //Read into shared memory Q from global memory - readBlock2(h0_ptr_, h0_pitch_, - hu0_ptr_, hu0_pitch_, - hv0_ptr_, hv0_pitch_, - Q, nx_, ny_); + readBlock(h0_ptr_, h0_pitch_, Q[0], nx_+3, ny_+3); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+3, ny_+3); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+3, ny_+3); __syncthreads(); diff --git a/GPUSimulators/common.cu b/GPUSimulators/common.cu index 5f5d66f..54dce8d 100644 --- a/GPUSimulators/common.cu +++ b/GPUSimulators/common.cu @@ -50,38 +50,29 @@ inline __device__ __host__ float clamp(const float f, const float a, const float - /** * Reads a block of data with one ghost cell for the shallow water equations */ -__device__ void readBlock1(float* h_ptr_, int h_pitch_, - float* hu_ptr_, int hu_pitch_, - float* hv_ptr_, int hv_pitch_, - float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], - const int nx_, const int ny_) { - //Index of thread within block - const int tx = threadIdx.x; - const int ty = threadIdx.y; +template +__device__ void readBlock(float* ptr_, int pitch_, + float shmem[sm_height][sm_width], + const int max_x, const int max_y) { //Index of block within domain - const int bx = BLOCK_WIDTH * blockIdx.x; - const int by = BLOCK_HEIGHT * blockIdx.y; + const int bx = blockDim.x * blockIdx.x; + const int by = blockDim.y * blockIdx.y; //Read into shared memory - for (int j=ty; j(h_ptr_, h_pitch_, Q[0], nx_+3, ny_+3); + readBlock(hu_ptr_, hu_pitch_, Q[1], nx_+3, ny_+3); + readBlock(hv_ptr_, hv_pitch_, Q[2], nx_+3, ny_+3); } + + /** * Writes a block of data to global memory for the shallow water equations. */