Async mem ops

This commit is contained in:
Martin Lilleeng Sætra
2022-04-26 11:34:29 +00:00
parent dcef56a1b9
commit 6c8bac6f7b
6 changed files with 314 additions and 57 deletions

View File

@@ -147,23 +147,34 @@ __global__ void KP07DimsplitKernel(
float* E1_ptr_, int E1_pitch_,
//Output CFL
float* cfl_) {
float* cfl_,
//Subarea of internal domain to compute
int x0=0, int y0=0,
int x1=0, int y1=0) {
if(x1 == 0)
x1 = nx_;
if(y1 == 0)
y1 = ny_;
const unsigned int w = BLOCK_WIDTH;
const unsigned int h = BLOCK_HEIGHT;
const unsigned int gc_x = 2;
const unsigned int gc_y = 2;
const unsigned int vars = 4;
//Shared memory variables
__shared__ float Q[4][h+2*gc_y][w+2*gc_x];
__shared__ float Qx[4][h+2*gc_y][w+2*gc_x];
__shared__ float F[4][h+2*gc_y][w+2*gc_x];
//Read into shared memory
readBlock<w, h, gc_x, gc_y, 1, 1>( rho0_ptr_, rho0_pitch_, Q[0], nx_, ny_, boundary_conditions_);
readBlock<w, h, gc_x, gc_y, -1, 1>(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_, ny_, boundary_conditions_);
readBlock<w, h, gc_x, gc_y, 1, -1>(rho_v0_ptr_, rho_v0_pitch_, Q[2], nx_, ny_, boundary_conditions_);
readBlock<w, h, gc_x, gc_y, 1, 1>( E0_ptr_, E0_pitch_, Q[3], nx_, ny_, boundary_conditions_);
readBlock<w, h, gc_x, gc_y, 1, 1>( rho0_ptr_, rho0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
readBlock<w, h, gc_x, gc_y, -1, 1>(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
readBlock<w, h, gc_x, gc_y, 1, -1>(rho_v0_ptr_, rho_v0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
readBlock<w, h, gc_x, gc_y, 1, 1>( E0_ptr_, E0_pitch_, Q[3], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
//Step 0 => evolve x first, then y
if (step_ == 0) {
@@ -224,10 +235,10 @@ __global__ void KP07DimsplitKernel(
// Write to main memory for all internal cells
writeBlock<w, h, gc_x, gc_y>( rho1_ptr_, rho1_pitch_, Q[0], nx_, ny_, 0, 1);
writeBlock<w, h, gc_x, gc_y>(rho_u1_ptr_, rho_u1_pitch_, Q[1], nx_, ny_, 0, 1);
writeBlock<w, h, gc_x, gc_y>(rho_v1_ptr_, rho_v1_pitch_, Q[2], nx_, ny_, 0, 1);
writeBlock<w, h, gc_x, gc_y>( E1_ptr_, E1_pitch_, Q[3], nx_, ny_, 0, 1);
writeBlock<w, h, gc_x, gc_y>( rho1_ptr_, rho1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1);
writeBlock<w, h, gc_x, gc_y>(rho_u1_ptr_, rho_u1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1);
writeBlock<w, h, gc_x, gc_y>(rho_v1_ptr_, rho_v1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1);
writeBlock<w, h, gc_x, gc_y>( E1_ptr_, E1_pitch_, Q[3], nx_, ny_, 0, 1, x0, y0, x1, y1);
//Compute the CFL for this block
if (cfl_ != NULL) {

View File

@@ -321,7 +321,9 @@ template<int w, int h, int gc_x, int gc_y, int sign_x, int sign_y>
inline __device__ void readBlock(float* ptr_, int pitch_,
float Q[h+2*gc_y][w+2*gc_x],
const int nx_, const int ny_,
const int boundary_conditions_) {
const int boundary_conditions_,
int x0, int y0,
int x1, int y1) {
//Index of block within domain
const int bx = blockDim.x * blockIdx.x;
const int by = blockDim.y * blockIdx.y;
@@ -330,14 +332,14 @@ inline __device__ void readBlock(float* ptr_, int pitch_,
//Loop over all variables
for (int j=threadIdx.y; j<h+2*gc_y; j+=h) {
//Handle periodic boundary conditions here
int l = handlePeriodicBoundaryY<gc_y>(by + j, ny_, boundary_conditions_);
l = min(l, ny_+2*gc_y-1);
int l = handlePeriodicBoundaryY<gc_y>(by + j + y0, ny_, boundary_conditions_);
l = min(l, min(ny_+2*gc_y-1, y1+2*gc_y-1));
float* row = (float*) ((char*) ptr_ + pitch_*l);
for (int i=threadIdx.x; i<w+2*gc_x; i+=w) {
//Handle periodic boundary conditions here
int k = handlePeriodicBoundaryX<gc_x>(bx + i, nx_, boundary_conditions_);
k = min(k, nx_+2*gc_x-1);
int k = handlePeriodicBoundaryX<gc_x>(bx + i + x0, nx_, boundary_conditions_);
k = min(k, min(nx_+2*gc_x-1, x1+2*gc_x-1));
//Read from global memory
Q[j][i] = row[k];
@@ -358,14 +360,20 @@ template<int w, int h, int gc_x, int gc_y>
inline __device__ void writeBlock(float* ptr_, int pitch_,
float shmem[h+2*gc_y][w+2*gc_x],
const int nx_, const int ny_,
int rk_step_, int rk_order_) {
int rk_step_, int rk_order_,
int x0, int y0,
int x1, int y1) {
//Index of cell within domain
const int ti = blockDim.x*blockIdx.x + threadIdx.x + gc_x;
const int tj = blockDim.y*blockIdx.y + threadIdx.y + gc_y;
const int ti = blockDim.x*blockIdx.x + threadIdx.x + gc_x + x0;
const int tj = blockDim.y*blockIdx.y + threadIdx.y + gc_y + y0;
//In case we are writing only to a subarea given by (x0, y0) x (x1, y1)
const int max_ti = min(nx_+gc_x, x1+gc_x);
const int max_tj = min(ny_+gc_y, y1+gc_y);
//Only write internal cells
if (ti < nx_+gc_x && tj < ny_+gc_y) {
if ((x0+gc_x <= ti) && (ti < max_ti) && (y0+gc_y <= tj) && (tj < max_tj)) {
//Index of thread within block
const int tx = threadIdx.x + gc_x;
const int ty = threadIdx.y + gc_y;
@@ -416,6 +424,9 @@ inline __device__ void writeBlock(float* ptr_, int pitch_,
row[ti] = t*row[ti] + (1.0f-t)*shmem[ty][tx];
}
}
// DEBUG
//row[ti] = 99.0;
}
}