Added more efficient function to read data from global memory!

This commit is contained in:
André R. Brodtkorb 2018-09-04 15:12:40 +02:00
parent a1902a1330
commit 58f281d724
8 changed files with 64 additions and 28 deletions

View File

@ -120,9 +120,9 @@ __global__ void FORCEKernel(
//Read into shared memory //Read into shared memory
readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>( h0_ptr_, h0_pitch_, Q[0], nx_+2, ny_+2); float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2); int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2); readBlock<3, BLOCK_WIDTH+2, BLOCK_HEIGHT+2, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+2, ny_+2);
__syncthreads(); __syncthreads();
//Set boundary conditions //Set boundary conditions

View File

@ -166,9 +166,9 @@ __global__ void HLL2Kernel(
//Read into shared memory //Read into shared memory
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>( h0_ptr_, h0_pitch_, Q[0], nx_+4, ny_+4); float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hu0_ptr_, hu0_pitch_, Q[1], nx_+4, ny_+4); int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hv0_ptr_, hv0_pitch_, Q[2], nx_+4, ny_+4); readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4);
__syncthreads(); __syncthreads();
//Set boundary conditions //Set boundary conditions

View File

@ -128,9 +128,9 @@ __global__ void HLLKernel(
//Read into shared memory //Read into shared memory
readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>( h0_ptr_, h0_pitch_, Q[0], nx_+2, ny_+2); float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2); int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2); readBlock<3, BLOCK_WIDTH+2, BLOCK_HEIGHT+2, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+2, ny_+2);
__syncthreads(); __syncthreads();
//Set boundary conditions //Set boundary conditions

View File

@ -157,9 +157,9 @@ __global__ void KP07DimsplitKernel(
//Read into shared memory //Read into shared memory
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>( h0_ptr_, h0_pitch_, Q[0], nx_+4, ny_+4); float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hu0_ptr_, hu0_pitch_, Q[1], nx_+4, ny_+4); int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hv0_ptr_, hv0_pitch_, Q[2], nx_+4, ny_+4); readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4);
__syncthreads(); __syncthreads();

View File

@ -141,9 +141,9 @@ __global__ void KP07Kernel(
//Read into shared memory //Read into shared memory
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>( h0_ptr_, h0_pitch_, Q[0], nx_+4, ny_+4); float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hu0_ptr_, hu0_pitch_, Q[1], nx_+4, ny_+4); int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hv0_ptr_, hv0_pitch_, Q[2], nx_+4, ny_+4); readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4);
__syncthreads(); __syncthreads();

View File

@ -98,6 +98,7 @@ void computeFluxG(float Q[3][block_height+2][block_width+2],
} }
extern "C" { extern "C" {
__global__ __global__
void LxFKernel( void LxFKernel(
@ -122,10 +123,10 @@ void LxFKernel(
__shared__ float F[3][BLOCK_HEIGHT][BLOCK_WIDTH+1]; __shared__ float F[3][BLOCK_HEIGHT][BLOCK_WIDTH+1];
__shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH]; __shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH];
//Read into shared memory including ghost cells float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>( h0_ptr_, h0_pitch_, Q[0], nx_+2, ny_+2); int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2);
readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2); readBlock<3, BLOCK_WIDTH+2, BLOCK_HEIGHT+2, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+2, ny_+2);
__syncthreads(); __syncthreads();
//Set boundary conditions //Set boundary conditions
@ -137,6 +138,7 @@ void LxFKernel(
computeFluxG<BLOCK_WIDTH, BLOCK_HEIGHT>(Q, G, g_, dy_, dt_); computeFluxG<BLOCK_WIDTH, BLOCK_HEIGHT>(Q, G, g_, dy_, dt_);
__syncthreads(); __syncthreads();
//Evolve for all cells //Evolve for all cells
const int i = tx + 1; //Skip local ghost cells, i.e., +1 const int i = tx + 1; //Skip local ghost cells, i.e., +1
const int j = ty + 1; const int j = ty + 1;

View File

@ -138,9 +138,9 @@ __global__ void WAFKernel(
//Read into shared memory Q from global memory //Read into shared memory Q from global memory
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(h0_ptr_, h0_pitch_, Q[0], nx_+3, ny_+3); float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hu0_ptr_, hu0_pitch_, Q[1], nx_+3, ny_+3); int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hv0_ptr_, hv0_pitch_, Q[2], nx_+3, ny_+3); readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4);
__syncthreads(); __syncthreads();

View File

@ -53,9 +53,9 @@ inline __device__ __host__ float clamp(const float f, const float a, const float
/** /**
* Reads a block of data with one ghost cell for the shallow water equations * Reads a block of data with one ghost cell for the shallow water equations
*/ */
template<int sm_width, int sm_height> template<int sm_width, int sm_height, int block_width, int block_height>
__device__ void readBlock(float* ptr_, int pitch_, inline __device__ void readBlock(float* ptr_, int pitch_,
float shmem[sm_height][sm_width], float (&shmem)[sm_height][sm_width],
const int max_x_, const int max_y_) { const int max_x_, const int max_y_) {
//Index of block within domain //Index of block within domain
@ -63,13 +63,13 @@ __device__ void readBlock(float* ptr_, int pitch_,
const int by = blockDim.y * blockIdx.y; const int by = blockDim.y * blockIdx.y;
//Read into shared memory //Read into shared memory
for (int j=threadIdx.y; j<sm_height; j+=blockDim.y) { for (int j=threadIdx.y; j<sm_height; j+=block_height) {
const int l = clamp(by + j, 0, max_y_-1); // Clamp out of bounds const int l = clamp(by + j, 0, max_y_-1); // Clamp out of bounds
//Compute the pointer to current row in the arrays //Compute the pointer to current row in the arrays
const float* const row = (float*) ((char*) ptr_ + pitch_*l); const float* const row = (float*) ((char*) ptr_ + pitch_*l);
for (int i=threadIdx.x; i<sm_width; i+=blockDim.x) { for (int i=threadIdx.x; i<sm_width; i+=block_width) {
const int k = clamp(bx + i, 0, max_x_-1); // Clamp out of bounds const int k = clamp(bx + i, 0, max_x_-1); // Clamp out of bounds
shmem[j][i] = row[k]; shmem[j][i] = row[k];
@ -78,6 +78,40 @@ __device__ void readBlock(float* ptr_, int pitch_,
} }
/**
* Reads a block of data with ghost cells
*/
template<int vars, int sm_width, int sm_height, int block_width, int block_height>
inline __device__ void readBlock(float* ptr_[vars], int pitch_[vars],
float shmem[vars][sm_height][sm_width],
const int max_x_, const int max_y_) {
//Index of block within domain
const int bx = blockDim.x * blockIdx.x;
const int by = blockDim.y * blockIdx.y;
float* rows[3];
//Read into shared memory
for (int j=threadIdx.y; j<sm_height; j+=block_height) {
const int l = clamp(by + j, 0, max_y_-1); // Clamp out of bounds
//Compute the pointer to current row in the arrays
for (int m=0; m<vars; ++m) {
rows[m] = (float*) ((char*) ptr_[m] + pitch_[m]*l);
}
for (int i=threadIdx.x; i<sm_width; i+=block_width) {
const int k = clamp(bx + i, 0, max_x_-1); // Clamp out of bounds
for (int m=0; m<vars; ++m) {
shmem[m][j][i] = rows[m][k];
}
}
}
}
@ -85,7 +119,7 @@ __device__ void readBlock(float* ptr_, int pitch_,
* Writes a block of data to global memory for the shallow water equations. * Writes a block of data to global memory for the shallow water equations.
*/ */
template<int sm_width, int sm_height, int offset_x=0, int offset_y=0> template<int sm_width, int sm_height, int offset_x=0, int offset_y=0>
__device__ void writeBlock(float* ptr_, int pitch_, inline __device__ void writeBlock(float* ptr_, int pitch_,
float shmem[sm_height][sm_width], float shmem[sm_height][sm_width],
const int width, const int height) { const int width, const int height) {