From 9c20930c11956780265a6befcf9a93b81e297776 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20R=2E=20Brodtkorb?= Date: Fri, 10 Aug 2018 15:26:48 +0200 Subject: [PATCH] Templetized LxF kernel --- SWESimulators/LxF.py | 3 +- SWESimulators/LxF_kernel.cu | 64 ++++++++++++++++++++++++++++--------- 2 files changed, 51 insertions(+), 16 deletions(-) diff --git a/SWESimulators/LxF.py b/SWESimulators/LxF.py index 218bd69..0030971 100644 --- a/SWESimulators/LxF.py +++ b/SWESimulators/LxF.py @@ -66,7 +66,8 @@ class LxF (Simulator.BaseSimulator): # Get kernels self.kernel = context.get_prepared_kernel("LxF_kernel.cu", "LxFKernel", \ "iiffffPiPiPiPiPiPi", \ - block_width, block_height) + block_width, block_height, \ + no_extern_c=True) def __str__(self): return "Lax Friedrichs" diff --git a/SWESimulators/LxF_kernel.cu b/SWESimulators/LxF_kernel.cu index 9d7ab22..237e3b0 100644 --- a/SWESimulators/LxF_kernel.cu +++ b/SWESimulators/LxF_kernel.cu @@ -26,9 +26,10 @@ along with this program. If not, see . /** * Computes the flux along the x axis for all faces */ +template __device__ -void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], - float F[3][BLOCK_HEIGHT][BLOCK_WIDTH+1], +void computeFluxF(float Q[3][block_height+2][block_width+2], + float F[3][block_height][block_width+1], const float g_, const float dx_, const float dt_) { //Index of thread within block const int tx = get_local_id(0); @@ -37,7 +38,7 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], { const int j=ty; const int l = j + 1; //Skip ghost cells - for (int i=tx; i +__device__ +void computeFluxG(float Q[3][block_height+2][block_width+2], + float G[3][block_height+1][block_width], const float g_, const float dy_, const float dt_) { //Index of thread within block const int tx = get_local_id(0); const int ty = get_local_id(1); - for (int j=ty; j +__device__ +void LxFKernelHelper( int nx_, int ny_, float dx_, float dy_, float dt_, float g_, @@ -114,9 +118,9 @@ __global__ void LxFKernel( const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1 const int tj = get_global_id(1) + 1; - __shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2]; - __shared__ float F[3][BLOCK_HEIGHT][BLOCK_WIDTH+1]; - __shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH]; + __shared__ float Q[3][block_height+2][block_width+2]; + __shared__ float F[3][block_height][block_width+1]; + __shared__ float G[3][block_height+1][block_width]; //Read into shared memory readBlock1(h0_ptr_, h0_pitch_, @@ -131,8 +135,8 @@ __global__ void LxFKernel( //Compute fluxes along the x and y axis - computeFluxF(Q, F, g_, dx_, dt_); - computeFluxG(Q, G, g_, dy_, dt_); + computeFluxF(Q, F, g_, dx_, dt_); + computeFluxG(Q, G, g_, dy_, dt_); __syncthreads(); @@ -160,4 +164,34 @@ __global__ void LxFKernel( hu_row[ti] = hu1; hv_row[ti] = hv1; } -} \ No newline at end of file +} + +extern "C" { +__global__ +void LxFKernel( + int nx_, int ny_, + float dx_, float dy_, float dt_, + float g_, + + //Input h^n + float* h0_ptr_, int h0_pitch_, + float* hu0_ptr_, int hu0_pitch_, + float* hv0_ptr_, int hv0_pitch_, + + //Output h^{n+1} + float* h1_ptr_, int h1_pitch_, + float* hu1_ptr_, int hu1_pitch_, + float* hv1_ptr_, int hv1_pitch_) { + LxFKernelHelper( + nx_, ny_, + dx_, dy_, dt_, + g_, + h0_ptr_, h0_pitch_, + hu0_ptr_, hu0_pitch_, + hv0_ptr_, hv0_pitch_, + h1_ptr_, h1_pitch_, + hu1_ptr_, hu1_pitch_, + hv1_ptr_, hv1_pitch_); +} +} // extern "C" +