mirror of
https://github.com/smyalygames/FiniteVolumeGPU.git
synced 2025-05-18 14:34:13 +02:00
Added more efficient function to read data from global memory!
This commit is contained in:
parent
a1902a1330
commit
58f281d724
@ -120,9 +120,9 @@ __global__ void FORCEKernel(
|
||||
|
||||
|
||||
//Read into shared memory
|
||||
readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>( h0_ptr_, h0_pitch_, Q[0], nx_+2, ny_+2);
|
||||
readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2);
|
||||
readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2);
|
||||
float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
|
||||
int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
|
||||
readBlock<3, BLOCK_WIDTH+2, BLOCK_HEIGHT+2, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+2, ny_+2);
|
||||
__syncthreads();
|
||||
|
||||
//Set boundary conditions
|
||||
|
@ -166,9 +166,9 @@ __global__ void HLL2Kernel(
|
||||
|
||||
|
||||
//Read into shared memory
|
||||
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>( h0_ptr_, h0_pitch_, Q[0], nx_+4, ny_+4);
|
||||
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hu0_ptr_, hu0_pitch_, Q[1], nx_+4, ny_+4);
|
||||
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hv0_ptr_, hv0_pitch_, Q[2], nx_+4, ny_+4);
|
||||
float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
|
||||
int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
|
||||
readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4);
|
||||
__syncthreads();
|
||||
|
||||
//Set boundary conditions
|
||||
|
@ -128,9 +128,9 @@ __global__ void HLLKernel(
|
||||
|
||||
|
||||
//Read into shared memory
|
||||
readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>( h0_ptr_, h0_pitch_, Q[0], nx_+2, ny_+2);
|
||||
readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2);
|
||||
readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2);
|
||||
float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
|
||||
int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
|
||||
readBlock<3, BLOCK_WIDTH+2, BLOCK_HEIGHT+2, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+2, ny_+2);
|
||||
__syncthreads();
|
||||
|
||||
//Set boundary conditions
|
||||
|
@ -157,9 +157,9 @@ __global__ void KP07DimsplitKernel(
|
||||
|
||||
|
||||
//Read into shared memory
|
||||
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>( h0_ptr_, h0_pitch_, Q[0], nx_+4, ny_+4);
|
||||
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hu0_ptr_, hu0_pitch_, Q[1], nx_+4, ny_+4);
|
||||
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hv0_ptr_, hv0_pitch_, Q[2], nx_+4, ny_+4);
|
||||
float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
|
||||
int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
|
||||
readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4);
|
||||
__syncthreads();
|
||||
|
||||
|
||||
|
@ -141,9 +141,9 @@ __global__ void KP07Kernel(
|
||||
|
||||
|
||||
//Read into shared memory
|
||||
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>( h0_ptr_, h0_pitch_, Q[0], nx_+4, ny_+4);
|
||||
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hu0_ptr_, hu0_pitch_, Q[1], nx_+4, ny_+4);
|
||||
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hv0_ptr_, hv0_pitch_, Q[2], nx_+4, ny_+4);
|
||||
float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
|
||||
int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
|
||||
readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4);
|
||||
__syncthreads();
|
||||
|
||||
|
||||
|
@ -98,6 +98,7 @@ void computeFluxG(float Q[3][block_height+2][block_width+2],
|
||||
}
|
||||
|
||||
|
||||
|
||||
extern "C" {
|
||||
__global__
|
||||
void LxFKernel(
|
||||
@ -122,10 +123,10 @@ void LxFKernel(
|
||||
__shared__ float F[3][BLOCK_HEIGHT][BLOCK_WIDTH+1];
|
||||
__shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH];
|
||||
|
||||
//Read into shared memory including ghost cells
|
||||
readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>( h0_ptr_, h0_pitch_, Q[0], nx_+2, ny_+2);
|
||||
readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2);
|
||||
readBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2>(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2);
|
||||
float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
|
||||
int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
|
||||
|
||||
readBlock<3, BLOCK_WIDTH+2, BLOCK_HEIGHT+2, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+2, ny_+2);
|
||||
__syncthreads();
|
||||
|
||||
//Set boundary conditions
|
||||
@ -137,6 +138,7 @@ void LxFKernel(
|
||||
computeFluxG<BLOCK_WIDTH, BLOCK_HEIGHT>(Q, G, g_, dy_, dt_);
|
||||
__syncthreads();
|
||||
|
||||
|
||||
//Evolve for all cells
|
||||
const int i = tx + 1; //Skip local ghost cells, i.e., +1
|
||||
const int j = ty + 1;
|
||||
|
@ -138,9 +138,9 @@ __global__ void WAFKernel(
|
||||
|
||||
|
||||
//Read into shared memory Q from global memory
|
||||
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(h0_ptr_, h0_pitch_, Q[0], nx_+3, ny_+3);
|
||||
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hu0_ptr_, hu0_pitch_, Q[1], nx_+3, ny_+3);
|
||||
readBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4>(hv0_ptr_, hv0_pitch_, Q[2], nx_+3, ny_+3);
|
||||
float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
|
||||
int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
|
||||
readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4);
|
||||
__syncthreads();
|
||||
|
||||
|
||||
|
@ -53,9 +53,9 @@ inline __device__ __host__ float clamp(const float f, const float a, const float
|
||||
/**
|
||||
* Reads a block of data with one ghost cell for the shallow water equations
|
||||
*/
|
||||
template<int sm_width, int sm_height>
|
||||
__device__ void readBlock(float* ptr_, int pitch_,
|
||||
float shmem[sm_height][sm_width],
|
||||
template<int sm_width, int sm_height, int block_width, int block_height>
|
||||
inline __device__ void readBlock(float* ptr_, int pitch_,
|
||||
float (&shmem)[sm_height][sm_width],
|
||||
const int max_x_, const int max_y_) {
|
||||
|
||||
//Index of block within domain
|
||||
@ -63,13 +63,13 @@ __device__ void readBlock(float* ptr_, int pitch_,
|
||||
const int by = blockDim.y * blockIdx.y;
|
||||
|
||||
//Read into shared memory
|
||||
for (int j=threadIdx.y; j<sm_height; j+=blockDim.y) {
|
||||
for (int j=threadIdx.y; j<sm_height; j+=block_height) {
|
||||
const int l = clamp(by + j, 0, max_y_-1); // Clamp out of bounds
|
||||
|
||||
//Compute the pointer to current row in the arrays
|
||||
const float* const row = (float*) ((char*) ptr_ + pitch_*l);
|
||||
|
||||
for (int i=threadIdx.x; i<sm_width; i+=blockDim.x) {
|
||||
for (int i=threadIdx.x; i<sm_width; i+=block_width) {
|
||||
const int k = clamp(bx + i, 0, max_x_-1); // Clamp out of bounds
|
||||
|
||||
shmem[j][i] = row[k];
|
||||
@ -78,6 +78,40 @@ __device__ void readBlock(float* ptr_, int pitch_,
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Reads a block of data with ghost cells
|
||||
*/
|
||||
template<int vars, int sm_width, int sm_height, int block_width, int block_height>
|
||||
inline __device__ void readBlock(float* ptr_[vars], int pitch_[vars],
|
||||
float shmem[vars][sm_height][sm_width],
|
||||
const int max_x_, const int max_y_) {
|
||||
|
||||
//Index of block within domain
|
||||
const int bx = blockDim.x * blockIdx.x;
|
||||
const int by = blockDim.y * blockIdx.y;
|
||||
|
||||
float* rows[3];
|
||||
|
||||
//Read into shared memory
|
||||
for (int j=threadIdx.y; j<sm_height; j+=block_height) {
|
||||
const int l = clamp(by + j, 0, max_y_-1); // Clamp out of bounds
|
||||
|
||||
//Compute the pointer to current row in the arrays
|
||||
for (int m=0; m<vars; ++m) {
|
||||
rows[m] = (float*) ((char*) ptr_[m] + pitch_[m]*l);
|
||||
}
|
||||
|
||||
for (int i=threadIdx.x; i<sm_width; i+=block_width) {
|
||||
const int k = clamp(bx + i, 0, max_x_-1); // Clamp out of bounds
|
||||
|
||||
for (int m=0; m<vars; ++m) {
|
||||
shmem[m][j][i] = rows[m][k];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@ -85,7 +119,7 @@ __device__ void readBlock(float* ptr_, int pitch_,
|
||||
* Writes a block of data to global memory for the shallow water equations.
|
||||
*/
|
||||
template<int sm_width, int sm_height, int offset_x=0, int offset_y=0>
|
||||
__device__ void writeBlock(float* ptr_, int pitch_,
|
||||
inline __device__ void writeBlock(float* ptr_, int pitch_,
|
||||
float shmem[sm_height][sm_width],
|
||||
const int width, const int height) {
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user