diff --git a/SWESimulators/WAF.py b/SWESimulators/WAF.py index a043e0b..8ba4ddf 100644 --- a/SWESimulators/WAF.py +++ b/SWESimulators/WAF.py @@ -22,7 +22,11 @@ along with this program. If not, see . #Import packages we need import numpy as np -import pyopencl as cl #OpenCL in Python + +import pycuda.compiler as cuda_compiler +import pycuda.gpuarray +import pycuda.driver as cuda + from SWESimulators import Common @@ -47,24 +51,24 @@ class WAF: g: Gravitational accelleration (9.81 m/s^2) """ def __init__(self, \ - cl_ctx, \ + context, \ h0, hu0, hv0, \ nx, ny, \ dx, dy, dt, \ g, \ block_width=16, block_height=16): - self.cl_ctx = cl_ctx - - #Create an OpenCL command queue - self.cl_queue = cl.CommandQueue(self.cl_ctx) + #Create a CUDA stream + self.stream = cuda.Stream() #Get kernels - self.kernel = Common.get_kernel(self.cl_ctx, "WAF_kernel.opencl", block_width, block_height) + self.waf_module = context.get_kernel("WAF_kernel.cu", block_width, block_height) + self.waf_kernel = self.waf_module.get_function("WAFKernel") + self.waf_kernel.prepare("iiffffiPiPiPiPiPiPi") #Create data by uploading to device ghost_cells_x = 2 ghost_cells_y = 2 - self.cl_data = Common.SWEDataArkawaA(self.cl_ctx, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0) + self.data = Common.SWEDataArakawaA(self.stream, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0) #Save input parameters #Notice that we need to specify them in the correct dataformat for the @@ -80,14 +84,16 @@ class WAF: self.t = np.float32(0.0) #Compute kernel launch parameters - self.local_size = (block_width, block_height) + self.local_size = (block_width, block_height, 1) self.global_size = ( \ - int(np.ceil(self.nx / float(self.local_size[0])) * self.local_size[0]), \ - int(np.ceil(self.ny / float(self.local_size[1])) * self.local_size[1]) \ + int(np.ceil(self.nx / float(self.local_size[0]))), \ + int(np.ceil(self.ny / float(self.local_size[1]))) \ ) + def __str__(self): + return "Weighted average flux" """ Function which steps n timesteps @@ -104,32 +110,30 @@ class WAF: break #Along X, then Y - self.kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \ + self.waf_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ self.nx, self.ny, \ self.dx, self.dy, local_dt, \ self.g, \ np.int32(0), \ - self.cl_data.h0.data, self.cl_data.h0.pitch, \ - self.cl_data.hu0.data, self.cl_data.hu0.pitch, \ - self.cl_data.hv0.data, self.cl_data.hv0.pitch, \ - self.cl_data.h1.data, self.cl_data.h1.pitch, \ - self.cl_data.hu1.data, self.cl_data.hu1.pitch, \ - self.cl_data.hv1.data, self.cl_data.hv1.pitch) - self.cl_data.swap() + self.data.h0.data.gpudata, self.data.h0.pitch, \ + self.data.hu0.data.gpudata, self.data.hu0.pitch, \ + self.data.hv0.data.gpudata, self.data.hv0.pitch, \ + self.data.h1.data.gpudata, self.data.h1.pitch, \ + self.data.hu1.data.gpudata, self.data.hu1.pitch, \ + self.data.hv1.data.gpudata, self.data.hv1.pitch) #Along Y, then X - self.kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \ + self.waf_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ self.nx, self.ny, \ self.dx, self.dy, local_dt, \ self.g, \ np.int32(1), \ - self.cl_data.h0.data, self.cl_data.h0.pitch, \ - self.cl_data.hu0.data, self.cl_data.hu0.pitch, \ - self.cl_data.hv0.data, self.cl_data.hv0.pitch, \ - self.cl_data.h1.data, self.cl_data.h1.pitch, \ - self.cl_data.hu1.data, self.cl_data.hu1.pitch, \ - self.cl_data.hv1.data, self.cl_data.hv1.pitch) - self.cl_data.swap() + self.data.h1.data.gpudata, self.data.h1.pitch, \ + self.data.hu1.data.gpudata, self.data.hu1.pitch, \ + self.data.hv1.data.gpudata, self.data.hv1.pitch, \ + self.data.h0.data.gpudata, self.data.h0.pitch, \ + self.data.hu0.data.gpudata, self.data.hu0.pitch, \ + self.data.hv0.data.gpudata, self.data.hv0.pitch) self.t += local_dt @@ -140,5 +144,5 @@ class WAF: def download(self): - return self.cl_data.download(self.cl_queue) + return self.data.download(self.stream) diff --git a/SWESimulators/WAF_kernel.opencl b/SWESimulators/WAF_kernel.cu similarity index 64% rename from SWESimulators/WAF_kernel.opencl rename to SWESimulators/WAF_kernel.cu index a35c49e..2ff39fa 100644 --- a/SWESimulators/WAF_kernel.opencl +++ b/SWESimulators/WAF_kernel.cu @@ -24,30 +24,32 @@ along with this program. If not, see . -#include "common.opencl" +#include "common.cu" /** * Computes the flux along the x axis for all faces */ -void computeFluxF(__local float Q[3][block_height+4][block_width+4], - __local float F[3][block_height+1][block_width+1], +__device__ +void computeFluxF(float Q[3][block_height+4][block_width+4], + float F[3][block_height+1][block_width+1], const float g_, const float dx_, const float dt_) { //Index of thread within block const int tx = get_local_id(0); const int ty = get_local_id(1); - for (int j=ty; j evolve y first, then x else { //Compute fluxes along the y axis and evolve computeFluxG(Q, F, g_, dy_, dt_); - barrier(CLK_LOCAL_MEM_FENCE); + __syncthreads(); evolveG2(Q, F, nx_, ny_, dy_, dt_); - barrier(CLK_LOCAL_MEM_FENCE); + __syncthreads(); //Fix boundary conditions noFlowBoundary2(Q, nx_, ny_); - barrier(CLK_LOCAL_MEM_FENCE); + __syncthreads(); //Compute fluxes along the x axis and evolve computeFluxF(Q, F, g_, dx_, dt_); - barrier(CLK_LOCAL_MEM_FENCE); + __syncthreads(); evolveF2(Q, F, nx_, ny_, dx_, dt_); - barrier(CLK_LOCAL_MEM_FENCE); + __syncthreads(); }