diff --git a/GPUSimulators/FORCE_kernel.cu b/GPUSimulators/FORCE_kernel.cu index 017a05c..bedb0d7 100644 --- a/GPUSimulators/FORCE_kernel.cu +++ b/GPUSimulators/FORCE_kernel.cu @@ -120,9 +120,9 @@ __global__ void FORCEKernel( //Read into shared memory - readBlock( h0_ptr_, h0_pitch_, Q[0], nx_+2, ny_+2); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2); + float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_}; + int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_}; + readBlock<3, BLOCK_WIDTH+2, BLOCK_HEIGHT+2, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+2, ny_+2); __syncthreads(); //Set boundary conditions diff --git a/GPUSimulators/HLL2_kernel.cu b/GPUSimulators/HLL2_kernel.cu index a5d3156..7909a57 100644 --- a/GPUSimulators/HLL2_kernel.cu +++ b/GPUSimulators/HLL2_kernel.cu @@ -166,9 +166,9 @@ __global__ void HLL2Kernel( //Read into shared memory - readBlock( h0_ptr_, h0_pitch_, Q[0], nx_+4, ny_+4); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+4, ny_+4); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+4, ny_+4); + float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_}; + int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_}; + readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4); __syncthreads(); //Set boundary conditions diff --git a/GPUSimulators/HLL_kernel.cu b/GPUSimulators/HLL_kernel.cu index 88fffd5..a86963b 100644 --- a/GPUSimulators/HLL_kernel.cu +++ b/GPUSimulators/HLL_kernel.cu @@ -128,9 +128,9 @@ __global__ void HLLKernel( //Read into shared memory - readBlock( h0_ptr_, h0_pitch_, Q[0], nx_+2, ny_+2); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2); + float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_}; + int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_}; + readBlock<3, BLOCK_WIDTH+2, BLOCK_HEIGHT+2, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+2, ny_+2); __syncthreads(); //Set boundary conditions diff --git a/GPUSimulators/KP07_dimsplit_kernel.cu b/GPUSimulators/KP07_dimsplit_kernel.cu index 4ab3c78..7d9fd9d 100644 --- a/GPUSimulators/KP07_dimsplit_kernel.cu +++ b/GPUSimulators/KP07_dimsplit_kernel.cu @@ -157,9 +157,9 @@ __global__ void KP07DimsplitKernel( //Read into shared memory - readBlock( h0_ptr_, h0_pitch_, Q[0], nx_+4, ny_+4); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+4, ny_+4); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+4, ny_+4); + float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_}; + int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_}; + readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4); __syncthreads(); diff --git a/GPUSimulators/KP07_kernel.cu b/GPUSimulators/KP07_kernel.cu index 8a22a55..dbd1964 100644 --- a/GPUSimulators/KP07_kernel.cu +++ b/GPUSimulators/KP07_kernel.cu @@ -141,9 +141,9 @@ __global__ void KP07Kernel( //Read into shared memory - readBlock( h0_ptr_, h0_pitch_, Q[0], nx_+4, ny_+4); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+4, ny_+4); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+4, ny_+4); + float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_}; + int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_}; + readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4); __syncthreads(); diff --git a/GPUSimulators/LxF_kernel.cu b/GPUSimulators/LxF_kernel.cu index 63dc0f8..ff094ad 100644 --- a/GPUSimulators/LxF_kernel.cu +++ b/GPUSimulators/LxF_kernel.cu @@ -98,6 +98,7 @@ void computeFluxG(float Q[3][block_height+2][block_width+2], } + extern "C" { __global__ void LxFKernel( @@ -122,10 +123,10 @@ void LxFKernel( __shared__ float F[3][BLOCK_HEIGHT][BLOCK_WIDTH+1]; __shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH]; - //Read into shared memory including ghost cells - readBlock( h0_ptr_, h0_pitch_, Q[0], nx_+2, ny_+2); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2); + float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_}; + int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_}; + + readBlock<3, BLOCK_WIDTH+2, BLOCK_HEIGHT+2, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+2, ny_+2); __syncthreads(); //Set boundary conditions @@ -137,6 +138,7 @@ void LxFKernel( computeFluxG(Q, G, g_, dy_, dt_); __syncthreads(); + //Evolve for all cells const int i = tx + 1; //Skip local ghost cells, i.e., +1 const int j = ty + 1; diff --git a/GPUSimulators/WAF_kernel.cu b/GPUSimulators/WAF_kernel.cu index 20c42da..b35e5dd 100644 --- a/GPUSimulators/WAF_kernel.cu +++ b/GPUSimulators/WAF_kernel.cu @@ -138,9 +138,9 @@ __global__ void WAFKernel( //Read into shared memory Q from global memory - readBlock(h0_ptr_, h0_pitch_, Q[0], nx_+3, ny_+3); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+3, ny_+3); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+3, ny_+3); + float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_}; + int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_}; + readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4); __syncthreads(); diff --git a/GPUSimulators/common.cu b/GPUSimulators/common.cu index ce96dff..2f3f057 100644 --- a/GPUSimulators/common.cu +++ b/GPUSimulators/common.cu @@ -53,9 +53,9 @@ inline __device__ __host__ float clamp(const float f, const float a, const float /** * Reads a block of data with one ghost cell for the shallow water equations */ -template -__device__ void readBlock(float* ptr_, int pitch_, - float shmem[sm_height][sm_width], +template +inline __device__ void readBlock(float* ptr_, int pitch_, + float (&shmem)[sm_height][sm_width], const int max_x_, const int max_y_) { //Index of block within domain @@ -63,13 +63,13 @@ __device__ void readBlock(float* ptr_, int pitch_, const int by = blockDim.y * blockIdx.y; //Read into shared memory - for (int j=threadIdx.y; j +inline __device__ void readBlock(float* ptr_[vars], int pitch_[vars], + float shmem[vars][sm_height][sm_width], + const int max_x_, const int max_y_) { + + //Index of block within domain + const int bx = blockDim.x * blockIdx.x; + const int by = blockDim.y * blockIdx.y; + + float* rows[3]; + + //Read into shared memory + for (int j=threadIdx.y; j -__device__ void writeBlock(float* ptr_, int pitch_, +inline __device__ void writeBlock(float* ptr_, int pitch_, float shmem[sm_height][sm_width], const int width, const int height) {