diff --git a/GPUSimulators/Autotuner.py b/GPUSimulators/Autotuner.py
index 44cdb00..a77447b 100644
--- a/GPUSimulators/Autotuner.py
+++ b/GPUSimulators/Autotuner.py
@@ -34,8 +34,8 @@ from GPUSimulators import Common, Simulator
class Autotuner:
def __init__(self,
nx=2048, ny=2048,
- block_widths=range(8, 32, 2),
- block_heights=range(8, 32, 2)):
+ block_widths=range(8, 32, 1),
+ block_heights=range(8, 32, 1)):
logger = logging.getLogger(__name__)
self.filename = "autotuning_data_" + gethostname() + ".npz"
self.nx = nx
@@ -239,19 +239,21 @@ class Autotuner:
h = np.zeros((ny, nx), dtype=np.float32);
hu = np.zeros((ny, nx), dtype=np.float32);
hv = np.zeros((ny, nx), dtype=np.float32);
-
- x = dx*(np.arange(0, nx, dtype=np.float32)+0.5) - x_center
- y = dy*(np.arange(0, ny, dtype=np.float32)+0.5) - y_center
+
+ extent = 1.0/np.sqrt(2.0)
+ x = (dx*(np.arange(0, nx, dtype=np.float32)+0.5) - x_center) / size
+ y = (dy*(np.arange(0, ny, dtype=np.float32)+0.5) - y_center) / size
xv, yv = np.meshgrid(x, y, sparse=False, indexing='xy')
- r = np.sqrt(xv**2 + yv**2)
+ r = np.minimum(1.0, np.sqrt(xv**2 + yv**2))
xv = None
yv = None
gc.collect()
-
+
#Generate highres
- h = 0.5 + 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size)
- hu = 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size)
- hv = 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size)
+ cos = np.cos(np.pi*r)
+ h = 0.5 + 0.1*0.5*(1.0 + cos)
+ hu = 0.1*0.5*(1.0 + cos)
+ hv = hu.copy()
scale = 0.7
max_h_estimate = 0.6
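
For reference, the rewritten initial condition amounts to the NumPy sketch below: the radius is normalized by size and clamped to 1, so cos(pi*r) is constant (-1) outside the bump and the explicit (r < size) mask is no longer needed. The helper name and signature are illustrative, not part of Autotuner.py.

    import numpy as np

    def radial_bump(nx, ny, dx, dy, x_center, y_center, size):
        # Normalized, clamped radius: r == 1 everywhere outside the bump
        x = (dx*(np.arange(nx, dtype=np.float32) + 0.5) - x_center) / size
        y = (dy*(np.arange(ny, dtype=np.float32) + 0.5) - y_center) / size
        xv, yv = np.meshgrid(x, y, indexing='xy')
        r = np.minimum(1.0, np.sqrt(xv**2 + yv**2))

        # Smooth cosine bump on top of a lake at rest
        cos = np.cos(np.pi*r)
        h = 0.5 + 0.1*0.5*(1.0 + cos)
        hu = 0.1*0.5*(1.0 + cos)
        hv = hu.copy()
        return h, hu, hv
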
diff --git a/GPUSimulators/Common.py b/GPUSimulators/Common.py
index 84bdb7a..17b20a4 100644
--- a/GPUSimulators/Common.py
+++ b/GPUSimulators/Common.py
@@ -289,8 +289,9 @@ class CudaContext(object):
+
"""
-Class that holds data
+Class that holds 2D data
"""
class CudaArray2D:
"""
@@ -307,42 +308,41 @@ class CudaArray2D:
ny_halo = ny + 2*y_halo
#self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny)
+ #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
+ self.data = pycuda.gpuarray.empty((ny_halo, nx_halo), dtype)
- #Make sure data is in proper format
- if cpu_data is not None:
- assert cpu_data.itemsize == 4, "Wrong size of data type"
- assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"
-
- #Upload data to the device
+ #If we don't have any data, just allocate and return
if cpu_data is None:
- self.data = pycuda.gpuarray.empty((ny_halo, nx_halo), dtype)
- elif (cpu_data.shape == (ny_halo, nx_halo)):
- self.data = pycuda.gpuarray.to_gpu_async(cpu_data, stream=stream)
- elif (cpu_data.shape == (self.ny, self.nx)):
- #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
- self.data = pycuda.gpuarray.empty((ny_halo, nx_halo), dtype)
- #self.data.fill(0.0)
+ return
- #Create copy object from host to device
- copy = cuda.Memcpy2D()
- copy.set_src_host(cpu_data)
- copy.set_dst_device(self.data.gpudata)
+ #Make sure data is in proper format
+ assert cpu_data.shape == (ny_halo, nx_halo) or cpu_data.shape == (self.ny, self.nx), "Wrong shape of data %s vs %s / %s" % (str(cpu_data.shape), str((self.ny, self.nx)), str((ny_halo, nx_halo)))
+ assert cpu_data.itemsize == 4, "Wrong size of data type"
+ assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"
+
+ #Create copy object from host to device
+ copy = cuda.Memcpy2D()
+ copy.set_src_host(cpu_data)
+ copy.set_dst_device(self.data.gpudata)
- #Set offsets and pitch of destination
- copy.dst_x_in_bytes = self.x_halo*self.data.strides[1]
- copy.dst_y = self.y_halo
- copy.dst_pitch = self.data.strides[0]
-
- #Set width in bytes to copy for each row and
- #number of rows to copy
- copy.width_in_bytes = self.nx*cpu_data.itemsize
- copy.height = self.ny
-
- #Perform the copy
- copy(stream)
- stream.synchronize()
- else:
- assert False, "Wrong data shape: %s vs %s / %s" % (str(cpu_data.shape), str((self.ny, self.nx)), str((ny_halo, nx_halo)))
+ #Set offsets of upload in destination
+ x_offset = (nx_halo - cpu_data.shape[1]) // 2
+ y_offset = (ny_halo - cpu_data.shape[0]) // 2
+ copy.dst_x_in_bytes = x_offset*self.data.strides[1]
+ copy.dst_y = y_offset
+
+ #Set destination pitch
+ copy.dst_pitch = self.data.strides[0]
+
+ #Set width in bytes to copy for each row and
+ #number of rows to copy
+ width = max(self.nx, cpu_data.shape[1])
+ height = max(self.ny, cpu_data.shape[0])
+ copy.width_in_bytes = width*cpu_data.itemsize
+ copy.height = height
+
+ #Perform the copy
+ copy(stream)
#self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
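
The rewritten upload accepts host data either with or without halo cells and centers it inside the halo-padded device buffer; when the shapes match exactly, the offsets collapse to zero. A small sketch of just the offset arithmetic (the helper is illustrative, not part of Common.py):

    def upload_offsets(nx, ny, x_halo, y_halo, cpu_shape):
        # Where the host data lands in the (ny_halo, nx_halo) device buffer,
        # and how many elements per row / rows are copied
        nx_halo, ny_halo = nx + 2*x_halo, ny + 2*y_halo
        x_offset = (nx_halo - cpu_shape[1]) // 2
        y_offset = (ny_halo - cpu_shape[0]) // 2
        width = max(nx, cpu_shape[1])
        height = max(ny, cpu_shape[0])
        return x_offset, y_offset, width, height

    # Interior-sized data is placed at (x_halo, y_halo) ...
    assert upload_offsets(16, 8, 2, 2, (8, 16)) == (2, 2, 16, 8)
    # ... while halo-sized data fills the whole buffer from (0, 0)
    assert upload_offsets(16, 8, 2, 2, (12, 20)) == (0, 0, 20, 12)
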
@@ -389,6 +389,114 @@ class CudaArray2D:
+"""
+Class that holds 3D data
+"""
+class CudaArray3D:
+ """
+ Uploads initial data to the CUDA device
+ """
+ def __init__(self, stream, nx, ny, nz, x_halo, y_halo, z_halo, cpu_data=None, dtype=np.float32):
+ self.logger = logging.getLogger(__name__)
+ self.nx = nx
+ self.ny = ny
+ self.nz = nz
+ self.x_halo = x_halo
+ self.y_halo = y_halo
+ self.z_halo = z_halo
+
+ nx_halo = nx + 2*x_halo
+ ny_halo = ny + 2*y_halo
+ nz_halo = nz + 2*z_halo
+
+ #self.logger.debug("Allocating [%dx%dx%d] buffer", self.nx, self.ny, self.nz)
+ #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
+ self.data = pycuda.gpuarray.empty((nz_halo, ny_halo, nx_halo), dtype)
+
+ #If we don't have any data, just allocate and return
+ if cpu_data is None:
+ return
+
+ #Make sure data is in proper format
+ assert cpu_data.shape == (nz_halo, ny_halo, nx_halo) or cpu_data.shape == (self.nz, self.ny, self.nx), "Wrong shape of data %s vs %s / %s" % (str(cpu_data.shape), str((self.nz, self.ny, self.nx)), str((nz_halo, ny_halo, nx_halo)))
+ assert cpu_data.itemsize == 4, "Wrong size of data type"
+ assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"
+
+ #Create copy object from host to device
+ copy = cuda.Memcpy3D()
+ copy.set_src_host(cpu_data)
+ copy.set_dst_device(self.data.gpudata)
+
+ #Set offsets of destination
+ x_offset = (nx_halo - cpu_data.shape[2]) // 2
+ y_offset = (ny_halo - cpu_data.shape[1]) // 2
+ z_offset = (nz_halo - cpu_data.shape[0]) // 2
+ copy.dst_x_in_bytes = x_offset*self.data.strides[2]
+ copy.dst_y = y_offset
+ copy.dst_z = z_offset
+
+ #Set pitch (bytes per row) and height (rows per slice) of destination,
+ #and the row pitch/height of the source in host memory
+ copy.dst_pitch = self.data.strides[1]
+ copy.dst_height = ny_halo
+ copy.src_pitch = cpu_data.strides[1]
+ copy.src_height = cpu_data.shape[1]
+
+ #Set width in bytes to copy for each row, and the
+ #number of rows and slices to copy
+ width = max(self.nx, cpu_data.shape[2])
+ height = max(self.ny, cpu_data.shape[1])
+ depth = max(self.nz, cpu_data.shape[0])
+ copy.width_in_bytes = width*cpu_data.itemsize
+ copy.height = height
+ copy.depth = depth
+
+ #Perform the copy
+ copy(stream)
+
+ #self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
+
+
+ def __del__(self, *args):
+ #self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
+ self.data.gpudata.free()
+ self.data = None
+
+ """
+ Enables downloading data from GPU to Python
+ """
+ def download(self, stream, async=False):
+ #self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
+ #Allocate host memory
+ #cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)
+ cpu_data = np.empty((self.nz, self.ny, self.nx), dtype=np.float32)
+
+ #Create copy object from device to host
+ copy = cuda.Memcpy3D()
+ copy.set_src_device(self.data.gpudata)
+ copy.set_dst_host(cpu_data)
+
+ #Set offsets and layout of source
+ copy.src_x_in_bytes = self.x_halo*self.data.strides[2]
+ copy.src_y = self.y_halo
+ copy.src_z = self.z_halo
+ copy.src_pitch = self.data.strides[1]
+ copy.src_height = self.ny + 2*self.y_halo
+
+ #Set layout of destination (tightly packed host array)
+ copy.dst_pitch = self.nx*cpu_data.itemsize
+ copy.dst_height = self.ny
+
+ #Set width in bytes to copy for each row, and the
+ #number of rows and slices to copy
+ copy.width_in_bytes = self.nx*cpu_data.itemsize
+ copy.height = self.ny
+ copy.depth = self.nz
+
+ copy(stream)
+ if async==False:
+ stream.synchronize()
+
+ return cpu_data
+
+
+
+
+
+
+
+
"""
A class representing an Arakawa A type (unstaggered, logically Cartesian) grid
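
A minimal usage sketch for the new CudaArray3D, assuming a CUDA context has already been created (the pycuda.autoinit setup below is illustrative); interior-sized host data is centered inside the halo-padded buffer and can be read back with download():

    import numpy as np
    import pycuda.autoinit  # creates a default context (illustrative setup)
    import pycuda.driver as cuda
    from GPUSimulators import Common

    stream = cuda.Stream()
    nx, ny, nz = 64, 32, 16
    cpu_data = np.random.rand(nz, ny, nx).astype(np.float32)

    # Upload interior-sized data into a buffer with one ghost cell per side
    arr = Common.CudaArray3D(stream, nx, ny, nz, 1, 1, 1, cpu_data=cpu_data)

    # Synchronous download of the interior (halo excluded)
    interior = arr.download(stream)
    print(interior.shape)  # (16, 32, 64)
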
diff --git a/GPUSimulators/EulerCommon.cu b/GPUSimulators/EulerCommon.cu
new file mode 100644
index 0000000..eaaffe9
--- /dev/null
+++ b/GPUSimulators/EulerCommon.cu
@@ -0,0 +1,58 @@
+/*
+These CUDA device functions implement common helper functions for the
+two-dimensional Euler equations of gas dynamics: the ideal-gas pressure
+
+  p = (gamma - 1)*(E - 0.5*(rho_u^2 + rho_v^2)/rho)
+
+and the flux function F(Q) along the x axis.
+
+Copyright (C) 2016 SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+
+
+
+
+__device__ float pressure(float4 Q, float gamma) {
+ const float rho = Q.x;
+ const float rho_u = Q.y;
+ const float rho_v = Q.z;
+ const float E = Q.w;
+
+ return (gamma-1.0f)*(E-0.5f*(rho_u*rho_u + rho_v*rho_v)/rho);
+}
+
+
+__device__ float4 F_func(const float4 Q, float gamma) {
+ const float rho = Q.x;
+ const float rho_u = Q.y;
+ const float rho_v = Q.z;
+ const float E = Q.w;
+
+ const float u = rho_u/rho;
+ const float P = pressure(Q, gamma);
+
+ float4 F;
+
+ F.x = rho_u;
+ F.y = rho_u*u + P;
+ F.z = rho_v*u;
+ F.w = u*(E+P);
+
+ return F;
+}
\ No newline at end of file
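
The helpers above assume an ideal gas, p = (gamma-1)*(E - 0.5*rho*(u^2 + v^2)), combined with the standard Euler flux along x. A NumPy transcription for reference (function names are illustrative):

    import numpy as np

    def euler_pressure(Q, gamma):
        # Ideal-gas pressure from conserved variables Q = (rho, rho_u, rho_v, E)
        rho, rho_u, rho_v, E = Q
        return (gamma - 1.0) * (E - 0.5*(rho_u**2 + rho_v**2)/rho)

    def euler_flux_x(Q, gamma):
        # x-direction flux, mirrors F_func in EulerCommon.cu
        rho, rho_u, rho_v, E = Q
        u = rho_u/rho
        P = euler_pressure(Q, gamma)
        return np.array([rho_u, rho_u*u + P, rho_v*u, u*(E + P)])

    # Quiescent gas: only the pressure term survives in the momentum flux
    print(euler_flux_x(np.array([1.0, 0.0, 0.0, 2.5]), gamma=1.4))  # [0. 1. 0. 0.]
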
diff --git a/GPUSimulators/FORCE_kernel.cu b/GPUSimulators/FORCE_kernel.cu
index c43fd38..419c665 100644
--- a/GPUSimulators/FORCE_kernel.cu
+++ b/GPUSimulators/FORCE_kernel.cu
@@ -20,6 +20,7 @@ along with this program. If not, see .
#include "common.cu"
+#include "SWECommon.cu"
#include "fluxes/FirstOrderCentered.cu"
@@ -32,8 +33,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
const float g_, const float dx_, const float dt_) {
//Index of thread within block
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
//Compute fluxes along the x axis
{
@@ -68,8 +69,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dy_, const float dt_) {
//Index of thread within block
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
//Compute fluxes along the y axis
for (int j=ty; j.
+"""
+
+#Import packages we need
+import numpy as np
+from GPUSimulators import Simulator, Common
+
+
+
+
+
+
+
+
+
+"""
+Class that solves the Euler equations using a dimensionally split second-order
+Harten-Lax-van Leer (HLL2) scheme
+"""
+class HLL2Euler (Simulator.BaseSimulator):
+
+ """
+ Initialization routine
+ rho: Density
+ rho_u: Momentum along x-axis
+ rho_v: Momentum along y-axis
+ E: energy
+ nx: Number of cells along x-axis
+ ny: Number of cells along y-axis
+ dx: Grid cell spacing along x-axis
+ dy: Grid cell spacing along y-axis
+ dt: Size of each timestep
+ g: Gravitational acceleration
+ """
+ def __init__(self, \
+ context, \
+ rho, rho_u, rho_v, E, \
+ nx, ny, \
+ dx, dy, dt, \
+ g, \
+ theta=1.8, \
+ block_width=16, block_height=16):
+
+ # Call super constructor
+ super().__init__(context, \
+ nx, ny, \
+ dx, dy, dt, \
+ g, \
+ block_width, block_height);
+
+ self.theta = np.float32(theta)
+
+ #Get kernels
+ self.kernel = context.get_prepared_kernel("HLL2_kernel.cu", "HLL2Kernel", \
+ "iifffffiPiPiPiPiPiPi", \
+ BLOCK_WIDTH=self.local_size[0], \
+ BLOCK_HEIGHT=self.local_size[1])
+
+ #Create data by uploading to device
+ self.u0 = Common.ArakawaA2D(self.stream, \
+ nx, ny, \
+ 2, 2, \
+ [rho, rho_u, rho_v, E])
+ self.u1 = Common.ArakawaA2D(self.stream, \
+ nx, ny, \
+ 2, 2, \
+ [None, None, None, None])
+
+ def __str__(self):
+ return "Harten-Lax-van Leer (2nd order)"
+
+ def simulate(self, t_end):
+ return super().simulateDimsplit(t_end)
+
+ def stepEuler(self, dt):
+ return self.stepDimsplitXY(dt)
+
+ def stepDimsplitXY(self, dt):
+ self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+ self.nx, self.ny, \
+ self.dx, self.dy, dt, \
+ self.g, \
+ self.theta, \
+ np.int32(0), \
+ self.u0[0].data.gpudata, self.u0[0].data.strides[0], \
+ self.u0[1].data.gpudata, self.u0[1].data.strides[0], \
+ self.u0[2].data.gpudata, self.u0[2].data.strides[0], \
+ self.u0[3].data.gpudata, self.u0[3].data.strides[0], \
+ self.u1[0].data.gpudata, self.u1[0].data.strides[0], \
+ self.u1[1].data.gpudata, self.u1[1].data.strides[0], \
+ self.u1[2].data.gpudata, self.u1[2].data.strides[0], \
+ self.u1[3].data.gpudata, self.u1[3].data.strides[0])
+ self.u0, self.u1 = self.u1, self.u0
+ self.t += dt
+
+ def stepDimsplitYX(self, dt):
+ self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+ self.nx, self.ny, \
+ self.dx, self.dy, dt, \
+ self.g, \
+ self.theta, \
+ np.int32(1), \
+ self.u0[0].data.gpudata, self.u0[0].data.strides[0], \
+ self.u0[1].data.gpudata, self.u0[1].data.strides[0], \
+ self.u0[2].data.gpudata, self.u0[2].data.strides[0], \
+ self.u0[3].data.gpudata, self.u0[3].data.strides[0], \
+ self.u1[0].data.gpudata, self.u1[0].data.strides[0], \
+ self.u1[1].data.gpudata, self.u1[1].data.strides[0], \
+ self.u1[2].data.gpudata, self.u1[2].data.strides[0], \
+ self.u1[3].data.gpudata, self.u1[3].data.strides[0])
+ self.u0, self.u1 = self.u1, self.u0
+ self.t += dt
+
+ def download(self):
+ return self.u0.download(self.stream)
\ No newline at end of file
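
Assuming the new class follows the same calling convention as the other simulators in the package, a driver script might look like the sketch below; the module path, the CudaContext construction, and the unpacking of download() are assumptions, not verified against the rest of the code.

    import numpy as np
    from GPUSimulators import Common
    from GPUSimulators.HLL2Euler import HLL2Euler  # module name assumed

    nx, ny = 256, 256
    dx = dy = 1.0 / nx
    rho   = np.ones((ny, nx), dtype=np.float32)
    rho_u = np.zeros((ny, nx), dtype=np.float32)
    rho_v = np.zeros((ny, nx), dtype=np.float32)
    E     = np.full((ny, nx), 2.5, dtype=np.float32)

    context = Common.CudaContext()            # assumed default constructor
    sim = HLL2Euler(context, rho, rho_u, rho_v, E,
                    nx, ny, dx, dy, dt=0.25*dx, g=9.81)
    sim.simulate(0.1)
    rho, rho_u, rho_v, E = sim.download()     # assumes one array per conserved variable
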
diff --git a/GPUSimulators/HLL2_kernel.cu b/GPUSimulators/HLL2_kernel.cu
index 168627f..bdaf0f8 100644
--- a/GPUSimulators/HLL2_kernel.cu
+++ b/GPUSimulators/HLL2_kernel.cu
@@ -19,6 +19,7 @@ along with this program. If not, see .
#include "common.cu"
+#include "SWECommon.cu"
#include "limiters.cu"
#include "fluxes/HartenLaxVanLeer.cu"
@@ -37,8 +38,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dx_, const float dt_) {
//Index of thread within block
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
{
const int j=ty;
@@ -89,8 +90,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dy_, const float dt_) {
//Index of thread within block
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
for (int j=ty; j.
#include "common.cu"
+#include "SWECommon.cu"
#include "fluxes/HartenLaxVanLeer.cu"
@@ -34,8 +35,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_) {
//Index of thread within block
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
{
const int j=ty;
@@ -68,8 +69,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_) {
//Index of thread within block
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
for (int j=ty; j.
#include "common.cu"
+#include "SWECommon.cu"
#include "limiters.cu"
#include "fluxes/CentralUpwind.cu"
@@ -35,8 +36,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dx_, const float dt_) {
//Index of thread within block
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
{
int j=ty;
@@ -80,8 +81,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dy_, const float dt_) {
//Index of thread within block
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
for (int j=ty; j.
#include "common.cu"
+#include "SWECommon.cu"
#include "limiters.cu"
#include "fluxes/CentralUpwind.cu"
@@ -35,8 +36,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_) {
//Index of thread within block
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
{
int j=ty;
@@ -66,8 +67,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_) {
//Index of thread within block
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
for (int j=ty; j.
*/
-#include "common.cu"
+#include "Common.cu"
+#include "SWECommon.cu"
#include "fluxes/LaxFriedrichs.cu"
@@ -32,8 +33,8 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
float F[3][block_height][block_width+1],
const float g_, const float dx_, const float dt_) {
//Index of thread within block
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
{
const int j=ty;
@@ -68,8 +69,8 @@ void computeFluxG(float Q[3][block_height+2][block_width+2],
float G[3][block_height+1][block_width],
const float g_, const float dy_, const float dt_) {
//Index of thread within block
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
for (int j=ty; j 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) {
//Index of thread within block
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
const int i = tx + 1; //Skip local ghost cells, i.e., +1
const int j = ty + 1;
diff --git a/GPUSimulators/SWECommon.cu b/GPUSimulators/SWECommon.cu
new file mode 100644
index 0000000..389a062
--- /dev/null
+++ b/GPUSimulators/SWECommon.cu
@@ -0,0 +1,66 @@
+/*
+These CUDA device functions implement common functionality for the
+shallow water equations, based on the Kurganov-Petrova scheme described in
+A. Kurganov & Guergana Petrova,
+A Second-Order Well-Balanced Positivity Preserving Central-Upwind
+Scheme for the Saint-Venant System, Communications in Mathematical
+Sciences, 5 (2007), 133-160.
+
+Copyright (C) 2016 SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+
+
+
+
+
+
+__device__ float3 F_func(const float3 Q, const float g) {
+ float3 F;
+
+ F.x = Q.y; //hu
+ F.y = Q.y*Q.y / Q.x + 0.5f*g*Q.x*Q.x; //hu*hu/h + 0.5f*g*h*h;
+ F.z = Q.y*Q.z / Q.x; //hu*hv/h;
+
+ return F;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
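
F_func above is the x-directional flux of the shallow water equations, F(Q) = (hu, hu^2/h + 0.5*g*h^2, hu*hv/h); it is moved here from common.cu (see the removal further down) so that SWE-specific and Euler-specific flux definitions can live in separate headers. A NumPy transcription for reference:

    import numpy as np

    def swe_flux_x(Q, g):
        # Q = (h, hu, hv); mirrors F_func in SWECommon.cu
        h, hu, hv = Q
        return np.array([hu,
                         hu*hu/h + 0.5*g*h*h,
                         hu*hv/h])

    # Lake at rest: only the hydrostatic pressure term 0.5*g*h^2 survives
    print(swe_flux_x(np.array([1.0, 0.0, 0.0]), g=9.81))  # [0.    4.905 0.   ]
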
diff --git a/GPUSimulators/WAF_kernel.cu b/GPUSimulators/WAF_kernel.cu
index f707aa8..3c971ae 100644
--- a/GPUSimulators/WAF_kernel.cu
+++ b/GPUSimulators/WAF_kernel.cu
@@ -25,6 +25,7 @@ along with this program. If not, see .
#include "common.cu"
+#include "SWECommon.cu"
#include "fluxes/WeightedAverageFlux.cu"
@@ -37,8 +38,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dx_, const float dt_) {
//Index of thread within block
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
{
int j=ty;
@@ -76,8 +77,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dy_, const float dt_) {
//Index of thread within block
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
//Compute fluxes along the y axis
for (int j=ty; j.
*/
-/**
- * Location of thread in block
- */
-inline __device__ int get_local_id(int dim) {
- switch(dim) {
- case 0: return threadIdx.x;
- case 1: return threadIdx.y;
- case 2: return threadIdx.z;
- default: return -1;
- }
-}
-
-
-/**
- * Get block index
- */
-__device__ int get_group_id(int dim) {
- switch(dim) {
- case 0: return blockIdx.x;
- case 1: return blockIdx.y;
- case 2: return blockIdx.z;
- default: return -1;
- }
-}
-
-/**
- * Location of thread in global domain
- */
-__device__ int get_global_id(int dim) {
- switch(dim) {
- case 0: return blockDim.x*blockIdx.x + threadIdx.x;
- case 1: return blockDim.y*blockIdx.y + threadIdx.y;
- case 2: return blockDim.z*blockIdx.z + threadIdx.z;
- default: return -1;
- }
-}
-
/**
* Float3 operators
@@ -97,12 +60,12 @@ __device__ void readBlock1(float* h_ptr_, int h_pitch_,
float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
const int nx_, const int ny_) {
//Index of thread within block
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
//Index of block within domain
- const int bx = BLOCK_WIDTH * get_group_id(0);
- const int by = BLOCK_HEIGHT * get_group_id(1);
+ const int bx = BLOCK_WIDTH * blockIdx.x;
+ const int by = BLOCK_HEIGHT * blockIdx.y;
//Read into shared memory
for (int j=ty; j 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) {
@@ -209,12 +172,12 @@ __device__ void writeBlock2(float* h_ptr_, int h_pitch_,
float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
const int nx_, const int ny_) {
//Index of thread within block
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
//Index of cell within domain
- const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
- const int tj = get_global_id(1) + 2;
+ const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2
+ const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2;
//Only write internal cells
if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) {
@@ -242,12 +205,12 @@ __device__ void writeBlock2(float* h_ptr_, int h_pitch_,
*/
__device__ void noFlowBoundary1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const int nx_, const int ny_) {
//Global index
- const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
- const int tj = get_global_id(1) + 1;
+ const int ti = blockDim.x*blockIdx.x + threadIdx.x + 1; //Skip global ghost cells, i.e., +1
+ const int tj = blockDim.y*blockIdx.y + threadIdx.y + 1;
//Block-local indices
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
const int i = tx + 1; //Skip local ghost cells, i.e., +1
const int j = ty + 1;
@@ -286,12 +249,12 @@ __device__ void noFlowBoundary1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const
*/
__device__ void noFlowBoundary2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const int nx_, const int ny_) {
//Global index
- const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
- const int tj = get_global_id(1) + 2;
+ const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2
+ const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2;
//Block-local indices
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
const int i = tx + 2; //Skip local ghost cells, i.e., +2
const int j = ty + 2;
@@ -347,12 +310,12 @@ __device__ void evolveF1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
const int nx_, const int ny_,
const float dx_, const float dt_) {
//Index of thread within block
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
//Index of cell within domain
- const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
- const int tj = get_global_id(1) + 1;
+ const int ti = blockDim.x*blockIdx.x + threadIdx.x + 1; //Skip global ghost cells, i.e., +1
+ const int tj = blockDim.y*blockIdx.y + threadIdx.y + 1;
if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) {
const int i = tx + 1; //Skip local ghost cells, i.e., +1
@@ -377,12 +340,12 @@ __device__ void evolveF2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
const int nx_, const int ny_,
const float dx_, const float dt_) {
//Index of thread within block
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
//Index of cell within domain
- const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
- const int tj = get_global_id(1) + 2;
+ const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2
+ const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2;
if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) {
const int i = tx + 2; //Skip local ghost cells, i.e., +2
@@ -407,12 +370,12 @@ __device__ void evolveG1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
const int nx_, const int ny_,
const float dy_, const float dt_) {
//Index of thread within block
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
//Index of cell within domain
- const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
- const int tj = get_global_id(1) + 1;
+ const int ti = blockDim.x*blockIdx.x + threadIdx.x + 1; //Skip global ghost cells, i.e., +1
+ const int tj = blockDim.y*blockIdx.y + threadIdx.y + 1;
if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) {
const int i = tx + 1; //Skip local ghost cells, i.e., +1
@@ -438,12 +401,12 @@ __device__ void evolveG2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
const int nx_, const int ny_,
const float dy_, const float dt_) {
//Index of thread within block
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
//Index of cell within domain
- const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
- const int tj = get_global_id(1) + 2;
+ const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2
+ const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2;
if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) {
const int i = tx + 2; //Skip local ghost cells, i.e., +2
@@ -464,23 +427,6 @@ __device__ void evolveG2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
-__device__ float3 F_func(const float3 Q, const float g) {
- float3 F;
-
- F.x = Q.y; //hu
- F.y = Q.y*Q.y / Q.x + 0.5f*g*Q.x*Q.x; //hu*hu/h + 0.5f*g*h*h;
- F.z = Q.y*Q.z / Q.x; //hu*hv/h;
-
- return F;
-}
-
-
-
-
-
-
-
-
diff --git a/GPUSimulators/limiters.cu b/GPUSimulators/limiters.cu
index 39ec2d9..e34d21f 100644
--- a/GPUSimulators/limiters.cu
+++ b/GPUSimulators/limiters.cu
@@ -50,8 +50,8 @@ __device__ void minmodSlopeX(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
const float theta_) {
//Index of thread within block
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
//Reconstruct slopes along x axis
{
@@ -74,8 +74,8 @@ __device__ void minmodSlopeY(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
const float theta_) {
//Index of thread within block
- const int tx = get_local_id(0);
- const int ty = get_local_id(1);
+ const int tx = threadIdx.x;
+ const int ty = threadIdx.y;
for (int j=ty; j