diff --git a/SWESimulators/WAF.py b/SWESimulators/WAF.py
index a043e0b..8ba4ddf 100644
--- a/SWESimulators/WAF.py
+++ b/SWESimulators/WAF.py
@@ -22,7 +22,11 @@ along with this program. If not, see .
#Import packages we need
import numpy as np
-import pyopencl as cl #OpenCL in Python
+
+import pycuda.compiler as cuda_compiler
+import pycuda.gpuarray
+import pycuda.driver as cuda
+
from SWESimulators import Common
@@ -47,24 +51,24 @@ class WAF:
g: Gravitational accelleration (9.81 m/s^2)
"""
def __init__(self, \
- cl_ctx, \
+ context, \
h0, hu0, hv0, \
nx, ny, \
dx, dy, dt, \
g, \
block_width=16, block_height=16):
- self.cl_ctx = cl_ctx
-
- #Create an OpenCL command queue
- self.cl_queue = cl.CommandQueue(self.cl_ctx)
+ #Create a CUDA stream
+ self.stream = cuda.Stream()
#Get kernels
- self.kernel = Common.get_kernel(self.cl_ctx, "WAF_kernel.opencl", block_width, block_height)
+ self.waf_module = context.get_kernel("WAF_kernel.cu", block_width, block_height)
+ self.waf_kernel = self.waf_module.get_function("WAFKernel")
+ self.waf_kernel.prepare("iiffffiPiPiPiPiPiPi")
#Create data by uploading to device
ghost_cells_x = 2
ghost_cells_y = 2
- self.cl_data = Common.SWEDataArkawaA(self.cl_ctx, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
+ self.data = Common.SWEDataArakawaA(self.stream, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
#Save input parameters
#Notice that we need to specify them in the correct dataformat for the
@@ -80,14 +84,16 @@ class WAF:
self.t = np.float32(0.0)
#Compute kernel launch parameters
- self.local_size = (block_width, block_height)
+ self.local_size = (block_width, block_height, 1)
self.global_size = ( \
- int(np.ceil(self.nx / float(self.local_size[0])) * self.local_size[0]), \
- int(np.ceil(self.ny / float(self.local_size[1])) * self.local_size[1]) \
+ int(np.ceil(self.nx / float(self.local_size[0]))), \
+ int(np.ceil(self.ny / float(self.local_size[1]))) \
)
+ def __str__(self):
+ return "Weighted average flux"
"""
Function which steps n timesteps
@@ -104,32 +110,30 @@ class WAF:
break
#Along X, then Y
- self.kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+ self.waf_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, local_dt, \
self.g, \
np.int32(0), \
- self.cl_data.h0.data, self.cl_data.h0.pitch, \
- self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
- self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
- self.cl_data.h1.data, self.cl_data.h1.pitch, \
- self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
- self.cl_data.hv1.data, self.cl_data.hv1.pitch)
- self.cl_data.swap()
+ self.data.h0.data.gpudata, self.data.h0.pitch, \
+ self.data.hu0.data.gpudata, self.data.hu0.pitch, \
+ self.data.hv0.data.gpudata, self.data.hv0.pitch, \
+ self.data.h1.data.gpudata, self.data.h1.pitch, \
+ self.data.hu1.data.gpudata, self.data.hu1.pitch, \
+ self.data.hv1.data.gpudata, self.data.hv1.pitch)
#Along Y, then X
- self.kernel.swe_2D(self.cl_queue, self.global_size, self.local_size, \
+ self.waf_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, local_dt, \
self.g, \
np.int32(1), \
- self.cl_data.h0.data, self.cl_data.h0.pitch, \
- self.cl_data.hu0.data, self.cl_data.hu0.pitch, \
- self.cl_data.hv0.data, self.cl_data.hv0.pitch, \
- self.cl_data.h1.data, self.cl_data.h1.pitch, \
- self.cl_data.hu1.data, self.cl_data.hu1.pitch, \
- self.cl_data.hv1.data, self.cl_data.hv1.pitch)
- self.cl_data.swap()
+ self.data.h1.data.gpudata, self.data.h1.pitch, \
+ self.data.hu1.data.gpudata, self.data.hu1.pitch, \
+ self.data.hv1.data.gpudata, self.data.hv1.pitch, \
+ self.data.h0.data.gpudata, self.data.h0.pitch, \
+ self.data.hu0.data.gpudata, self.data.hu0.pitch, \
+ self.data.hv0.data.gpudata, self.data.hv0.pitch)
self.t += local_dt
@@ -140,5 +144,5 @@ class WAF:
def download(self):
- return self.cl_data.download(self.cl_queue)
+ return self.data.download(self.stream)
diff --git a/SWESimulators/WAF_kernel.opencl b/SWESimulators/WAF_kernel.cu
similarity index 64%
rename from SWESimulators/WAF_kernel.opencl
rename to SWESimulators/WAF_kernel.cu
index a35c49e..2ff39fa 100644
--- a/SWESimulators/WAF_kernel.opencl
+++ b/SWESimulators/WAF_kernel.cu
@@ -24,30 +24,32 @@ along with this program. If not, see .
-#include "common.opencl"
+#include "common.cu"
/**
* Computes the flux along the x axis for all faces
*/
-void computeFluxF(__local float Q[3][block_height+4][block_width+4],
- __local float F[3][block_height+1][block_width+1],
+__device__
+void computeFluxF(float Q[3][block_height+4][block_width+4],
+ float F[3][block_height+1][block_width+1],
const float g_, const float dx_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
- for (int j=ty; j evolve y first, then x
else {
//Compute fluxes along the y axis and evolve
computeFluxG(Q, F, g_, dy_, dt_);
- barrier(CLK_LOCAL_MEM_FENCE);
+ __syncthreads();
evolveG2(Q, F, nx_, ny_, dy_, dt_);
- barrier(CLK_LOCAL_MEM_FENCE);
+ __syncthreads();
//Fix boundary conditions
noFlowBoundary2(Q, nx_, ny_);
- barrier(CLK_LOCAL_MEM_FENCE);
+ __syncthreads();
//Compute fluxes along the x axis and evolve
computeFluxF(Q, F, g_, dx_, dt_);
- barrier(CLK_LOCAL_MEM_FENCE);
+ __syncthreads();
evolveF2(Q, F, nx_, ny_, dx_, dt_);
- barrier(CLK_LOCAL_MEM_FENCE);
+ __syncthreads();
}