Checkpoint

This commit is contained in:
André R. Brodtkorb 2018-08-24 11:48:30 +02:00
parent 918d22b257
commit daa175ef32
14 changed files with 498 additions and 179 deletions

View File

@ -34,8 +34,8 @@ from GPUSimulators import Common, Simulator
class Autotuner:
def __init__(self,
nx=2048, ny=2048,
block_widths=range(8, 32, 2),
block_heights=range(8, 32, 2)):
block_widths=range(8, 32, 1),
block_heights=range(8, 32, 1)):
logger = logging.getLogger(__name__)
self.filename = "autotuning_data_" + gethostname() + ".npz"
self.nx = nx
@ -240,18 +240,20 @@ class Autotuner:
hu = np.zeros((ny, nx), dtype=np.float32);
hv = np.zeros((ny, nx), dtype=np.float32);
x = dx*(np.arange(0, nx, dtype=np.float32)+0.5) - x_center
y = dy*(np.arange(0, ny, dtype=np.float32)+0.5) - y_center
extent = 1.0/np.sqrt(2.0)
x = (dx*(np.arange(0, nx, dtype=np.float32)+0.5) - x_center) / size
y = (dy*(np.arange(0, ny, dtype=np.float32)+0.5) - y_center) / size
xv, yv = np.meshgrid(x, y, sparse=False, indexing='xy')
r = np.sqrt(xv**2 + yv**2)
r = np.minimum(1.0, np.sqrt(xv**2 + yv**2))
xv = None
yv = None
gc.collect()
#Generate highres
h = 0.5 + 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size)
hu = 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size)
hv = 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size)
cos = np.cos(np.pi*r)
h = 0.5 + 0.1*0.5*(1.0 + cos)
hu = 0.1*0.5*(1.0 + cos)
hv = hu.copy()
scale = 0.7
max_h_estimate = 0.6

View File

@ -289,8 +289,9 @@ class CudaContext(object):
"""
Class that holds data
Class that holds 2D data
"""
class CudaArray2D:
"""
@ -307,42 +308,41 @@ class CudaArray2D:
ny_halo = ny + 2*y_halo
#self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny)
#Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
self.data = pycuda.gpuarray.empty((ny_halo, nx_halo), dtype)
#If we don't have any data, just allocate and return
if cpu_data is None:
return
#Make sure data is in proper format
if cpu_data is not None:
assert cpu_data.itemsize == 4, "Wrong size of data type"
assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"
assert cpu_data.shape == (ny_halo, nx_halo) or cpu_data.shape == (self.ny, self.nx), "Wrong shape of data %s vs %s / %s" % (str(cpu_data.shape), str((self.ny, self.nx)), str((ny_halo, nx_halo)))
assert cpu_data.itemsize == 4, "Wrong size of data type"
assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"
#Upload data to the device
if cpu_data is None:
self.data = pycuda.gpuarray.empty((ny_halo, nx_halo), dtype)
elif (cpu_data.shape == (ny_halo, nx_halo)):
self.data = pycuda.gpuarray.to_gpu_async(cpu_data, stream=stream)
elif (cpu_data.shape == (self.ny, self.nx)):
#Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
self.data = pycuda.gpuarray.empty((ny_halo, nx_halo), dtype)
#self.data.fill(0.0)
#Create copy object from host to device
copy = cuda.Memcpy2D()
copy.set_src_host(cpu_data)
copy.set_dst_device(self.data.gpudata)
#Create copy object from host to device
copy = cuda.Memcpy2D()
copy.set_src_host(cpu_data)
copy.set_dst_device(self.data.gpudata)
#Set offsets of upload in destination
x_offset = (nx_halo - cpu_data.shape[1]) // 2
y_offset = (ny_halo - cpu_data.shape[0]) // 2
copy.dst_x_in_bytes = x_offset*self.data.strides[1]
copy.dst_y = y_offset
#Set offsets and pitch of destination
copy.dst_x_in_bytes = self.x_halo*self.data.strides[1]
copy.dst_y = self.y_halo
copy.dst_pitch = self.data.strides[0]
#Set destination pitch
copy.dst_pitch = self.data.strides[0]
#Set width in bytes to copy for each row and
#number of rows to copy
copy.width_in_bytes = self.nx*cpu_data.itemsize
copy.height = self.ny
#Set width in bytes to copy for each row and
#number of rows to copy
width = max(self.nx, cpu_data.shape[1])
height = max(self.ny, cpu_data.shape[0])
copy.width_in_bytes = width*cpu_data.itemsize
copy.height = height
#Perform the copy
copy(stream)
stream.synchronize()
else:
assert False, "Wrong data shape: %s vs %s / %s" % (str(cpu_data.shape), str((self.ny, self.nx)), str((ny_halo, nx_halo)))
#Perform the copy
copy(stream)
#self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
@ -389,6 +389,114 @@ class CudaArray2D:
"""
Class that holds 2D data
"""
class CudaArray3D:
"""
Uploads initial data to the CL device
"""
def __init__(self, stream, nx, ny, nz, x_halo, y_halo, z_halo, cpu_data=None, dtype=np.float32):
self.logger = logging.getLogger(__name__)
self.nx = nx
self.ny = ny
self.nz = nz
self.x_halo = x_halo
self.y_halo = y_halo
self.z_halo = z_halo
nx_halo = nx + 2*x_halo
ny_halo = ny + 2*y_halo
nz_halo = nz + 2*z_halo
#self.logger.debug("Allocating [%dx%dx%d] buffer", self.nx, self.ny, self.nz)
#Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
self.data = pycuda.gpuarray.empty((nz_halo, ny_halo, nx_halo), dtype)
#If we don't have any data, just allocate and return
if cpu_data is None:
return
#Make sure data is in proper format
assert cpu_data.shape == (nz_halo, ny_halo, nx_halo) or cpu_data.shape == (self.nz, self.ny, self.nx), "Wrong shape of data %s vs %s / %s" % (str(cpu_data.shape), str((self.nz, self.ny, self.nx)), str((nz_halo, ny_halo, nx_halo)))
assert cpu_data.itemsize == 4, "Wrong size of data type"
assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"
#Create copy object from host to device
copy = cuda.Memcpy3D()
copy.set_src_host(cpu_data)
copy.set_dst_device(self.data.gpudata)
#Set offsets of destination
x_offset = (nx_halo - cpu_data.shape[2]) // 2
y_offset = (ny_halo - cpu_data.shape[1]) // 2
z_offset = (nz_halo - cpu_data.shape[0]) // 2
copy.dst_x_in_bytes = x_offset*self.data.strides[1]
copy.dst_y = y_offset
copy.dst_z = z_offset
#Set pitch of destination
copy.dst_pitch = self.data.strides[0]
#Set width in bytes to copy for each row and
#number of rows to copy
width = max(self.nx, cpu_data.shape[2])
height = max(self.ny, cpu_data.shape[1])
depth = max(self.nz, cpu-data.shape[0])
copy.width_in_bytes = width*cpu_data.itemsize
copy.height = height
copy.depth = depth
#Perform the copy
copy(stream)
#self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
def __del__(self, *args):
#self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
self.data.gpudata.free()
self.data = None
"""
Enables downloading data from GPU to Python
"""
def download(self, stream, async=False):
#self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
#Allocate host memory
#cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)
cpu_data = np.empty((self.nz, self.ny, self.nx), dtype=np.float32)
#Create copy object from device to host
copy = cuda.Memcpy2D()
copy.set_src_device(self.data.gpudata)
copy.set_dst_host(cpu_data)
#Set offsets and pitch of source
copy.src_x_in_bytes = self.x_halo*self.data.strides[1]
copy.src_y = self.y_halo
copy.src_z = self.z_halo
copy.src_pitch = self.data.strides[0]
#Set width in bytes to copy for each row and
#number of rows to copy
copy.width_in_bytes = self.nx*cpu_data.itemsize
copy.height = self.ny
copy.depth = self.nz
copy(stream)
if async==False:
stream.synchronize()
return cpu_data
"""
A class representing an Arakawa A type (unstaggered, logically Cartesian) grid

View File

@ -0,0 +1,58 @@
/*
This OpenCL kernel implements the Kurganov-Petrova numerical scheme
for the shallow water equations, described in
A. Kurganov & Guergana Petrova
A Second-Order Well-Balanced Positivity Preserving Central-Upwind
Scheme for the Saint-Venant System Communications in Mathematical
Sciences, 5 (2007), 133-160.
Copyright (C) 2016 SINTEF ICT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
__device__ float pressure(float4 Q, float gamma) {
const float rho = Q.x;
const float rho_u = Q.y;
const float rho_v = Q.z;
const float E = Q.w;
return (gamma-1.0f)*(E-0.5f*(rho_u*rho_u + rho_v*rho_v)/rho);
}
__device__ float4 F_func(const float4 Q, float gamma) {
const float rho = Q.x;
const float rho_u = Q.y;
const float rho_v = Q.z;
const float E = Q.w;
const float u = rho_u/rho;
const float P = pressure(Q, gamma);
float4 F;
F.x = rho_u;
F.y = rho_u*u + P;
F.z = rho_v*u;
F.w = u*(E+P);
return F;
}

View File

@ -20,6 +20,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "common.cu"
#include "SWECommon.cu"
#include "fluxes/FirstOrderCentered.cu"
@ -32,8 +33,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
const float g_, const float dx_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
//Compute fluxes along the x axis
{
@ -68,8 +69,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dy_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
//Compute fluxes along the y axis
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {

132
GPUSimulators/HLL2Euler.py Normal file
View File

@ -0,0 +1,132 @@
# -*- coding: utf-8 -*-
"""
This python module implements the 2nd order HLL flux
Copyright (C) 2016 SINTEF ICT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
import numpy as np
from GPUSimulators import Simulator, Common
"""
Class that solves the SW equations using the Forward-Backward linear scheme
"""
class HLL2Euler (Simulator.BaseSimulator):
"""
Initialization routine
rho: Density
rho_u: Momentum along x-axis
rho_v: Momentum along y-axis
E: energy
nx: Number of cells along x-axis
ny: Number of cells along y-axis
dx: Grid cell spacing along x-axis
dy: Grid cell spacing along y-axis
dt: Size of each timestep
g: Gravitational accelleration
"""
def __init__(self, \
context, \
rho, rho_u, rho_v, E, \
nx, ny, \
dx, dy, dt, \
g, \
theta=1.8, \
block_width=16, block_height=16):
# Call super constructor
super().__init__(context, \
nx, ny, \
dx, dy, dt, \
g, \
block_width, block_height);
self.theta = np.float32(theta)
#Get kernels
self.kernel = context.get_prepared_kernel("HLL2_kernel.cu", "HLL2Kernel", \
"iifffffiPiPiPiPiPiPi", \
BLOCK_WIDTH=self.local_size[0], \
BLOCK_HEIGHT=self.local_size[1])
#Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream, \
nx, ny, \
2, 2, \
[rho, rho_u, rho_v, E])
self.u1 = Common.ArakawaA2D(self.stream, \
nx, ny, \
2, 2, \
[None, None, None, None])
def __str__(self):
return "Harten-Lax-van Leer (2nd order)"
def simulate(self, t_end):
return super().simulateDimsplit(t_end)
def stepEuler(self, dt):
return self.stepDimsplitXY(dt)
def stepDimsplitXY(self, dt):
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, dt, \
self.g, \
self.theta, \
np.int32(0), \
self.u0[0].data.gpudata, self.u0[0].data.strides[0], \
self.u0[1].data.gpudata, self.u0[1].data.strides[0], \
self.u0[2].data.gpudata, self.u0[2].data.strides[0], \
self.u0[3].data.gpudata, self.u0[3].data.strides[0], \
self.u1[0].data.gpudata, self.u1[0].data.strides[0], \
self.u1[1].data.gpudata, self.u1[1].data.strides[0], \
self.u1[2].data.gpudata, self.u1[2].data.strides[0]
self.u1[3].data.gpudata, self.u1[3].data.strides[0])
self.u0, self.u1 = self.u1, self.u0
self.t += dt
def stepDimsplitYX(self, dt):
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, dt, \
self.g, \
self.theta, \
np.int32(1), \
self.u0[0].data.gpudata, self.u0[0].data.strides[0], \
self.u0[1].data.gpudata, self.u0[1].data.strides[0], \
self.u0[2].data.gpudata, self.u0[2].data.strides[0], \
self.u0[3].data.gpudata, self.u0[3].data.strides[0], \
self.u1[0].data.gpudata, self.u1[0].data.strides[0], \
self.u1[1].data.gpudata, self.u1[1].data.strides[0], \
self.u1[2].data.gpudata, self.u1[2].data.strides[0]
self.u1[3].data.gpudata, self.u1[3].data.strides[0])
self.u0, self.u1 = self.u1, self.u0
self.t += dt
def download(self):
return self.u0.download(self.stream)

View File

@ -19,6 +19,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "common.cu"
#include "SWECommon.cu"
#include "limiters.cu"
#include "fluxes/HartenLaxVanLeer.cu"
@ -37,8 +38,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dx_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
{
const int j=ty;
@ -89,8 +90,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dy_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
const int l = j + 1;

View File

@ -20,6 +20,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "common.cu"
#include "SWECommon.cu"
#include "fluxes/HartenLaxVanLeer.cu"
@ -34,8 +35,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
{
const int j=ty;
@ -68,8 +69,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
const int l = j;
@ -156,7 +157,7 @@ __global__ void HLLKernel(
__syncthreads();
//Q[0][get_local_id(1) + 1][get_local_id(0) + 1] += 0.1;
//Q[0][threadIdx.y + 1][threadIdx.x + 1] += 0.1;

View File

@ -25,6 +25,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "common.cu"
#include "SWECommon.cu"
#include "limiters.cu"
#include "fluxes/CentralUpwind.cu"
@ -35,8 +36,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dx_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
{
int j=ty;
@ -80,8 +81,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dy_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
const int l = j + 1;

View File

@ -25,6 +25,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "common.cu"
#include "SWECommon.cu"
#include "limiters.cu"
#include "fluxes/CentralUpwind.cu"
@ -35,8 +36,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
{
int j=ty;
@ -66,8 +67,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
const int l = j + 1;
@ -120,12 +121,12 @@ __global__ void KP07Kernel(
float* hv1_ptr_, int hv1_pitch_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
//Index of cell within domain
const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
const int tj = get_global_id(1) + 2;
const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2
const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2;
//Shared memory variables
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];

View File

@ -19,7 +19,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "common.cu"
#include "Common.cu"
#include "SWECommon.cu"
#include "fluxes/LaxFriedrichs.cu"
@ -32,8 +33,8 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
float F[3][block_height][block_width+1],
const float g_, const float dx_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
{
const int j=ty;
@ -68,8 +69,8 @@ void computeFluxG(float Q[3][block_height+2][block_width+2],
float G[3][block_height+1][block_width],
const float g_, const float dy_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
for (int j=ty; j<block_height+1; j+=block_height) {
const int l = j;
@ -118,8 +119,8 @@ void LxFKernel(
const int block_height = BLOCK_HEIGHT;
//Index of cell within domain
const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
const int tj = get_global_id(1) + 1;
const int ti = blockDim.x*blockIdx.x + threadIdx.x + 1; //Skip global ghost cells, i.e., +1
const int tj = blockDim.y*blockIdx.y + threadIdx.y + 1;
__shared__ float Q[3][block_height+2][block_width+2];
__shared__ float F[3][block_height][block_width+1];
@ -146,8 +147,8 @@ void LxFKernel(
//Evolve for all internal cells
if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int i = tx + 1; //Skip local ghost cells, i.e., +1
const int j = ty + 1;

View File

@ -0,0 +1,66 @@
/*
This OpenCL kernel implements the Kurganov-Petrova numerical scheme
for the shallow water equations, described in
A. Kurganov & Guergana Petrova
A Second-Order Well-Balanced Positivity Preserving Central-Upwind
Scheme for the Saint-Venant System Communications in Mathematical
Sciences, 5 (2007), 133-160.
Copyright (C) 2016 SINTEF ICT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
__device__ float3 F_func(const float3 Q, const float g) {
float3 F;
F.x = Q.y; //hu
F.y = Q.y*Q.y / Q.x + 0.5f*g*Q.x*Q.x; //hu*hu/h + 0.5f*g*h*h;
F.z = Q.y*Q.z / Q.x; //hu*hv/h;
return F;
}

View File

@ -25,6 +25,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "common.cu"
#include "SWECommon.cu"
#include "fluxes/WeightedAverageFlux.cu"
@ -37,8 +38,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dx_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
{
int j=ty;
@ -76,8 +77,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dy_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
//Compute fluxes along the y axis
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {

View File

@ -23,43 +23,6 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* Location of thread in block
*/
inline __device__ int get_local_id(int dim) {
switch(dim) {
case 0: return threadIdx.x;
case 1: return threadIdx.y;
case 2: return threadIdx.z;
default: return -1;
}
}
/**
* Get block index
*/
__device__ int get_group_id(int dim) {
switch(dim) {
case 0: return blockIdx.x;
case 1: return blockIdx.y;
case 2: return blockIdx.z;
default: return -1;
}
}
/**
* Location of thread in global domain
*/
__device__ int get_global_id(int dim) {
switch(dim) {
case 0: return blockDim.x*blockIdx.x + threadIdx.x;
case 1: return blockDim.y*blockIdx.y + threadIdx.y;
case 2: return blockDim.z*blockIdx.z + threadIdx.z;
default: return -1;
}
}
/**
* Float3 operators
@ -97,12 +60,12 @@ __device__ void readBlock1(float* h_ptr_, int h_pitch_,
float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
const int nx_, const int ny_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
//Index of block within domain
const int bx = BLOCK_WIDTH * get_group_id(0);
const int by = BLOCK_HEIGHT * get_group_id(1);
const int bx = BLOCK_WIDTH * blockIdx.x;
const int by = BLOCK_HEIGHT * blockIdx.y;
//Read into shared memory
for (int j=ty; j<BLOCK_HEIGHT+2; j+=BLOCK_HEIGHT) {
@ -136,12 +99,12 @@ __device__ void readBlock2(float* h_ptr_, int h_pitch_,
float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
const int nx_, const int ny_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
//Index of block within domain
const int bx = BLOCK_WIDTH * get_group_id(0);
const int by = BLOCK_HEIGHT * get_group_id(1);
const int bx = BLOCK_WIDTH * blockIdx.x;
const int by = BLOCK_HEIGHT * blockIdx.y;
//Read into shared memory
for (int j=ty; j<BLOCK_HEIGHT+4; j+=BLOCK_HEIGHT) {
@ -174,12 +137,12 @@ __device__ void writeBlock1(float* h_ptr_, int h_pitch_,
float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
const int nx_, const int ny_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
//Index of cell within domain
const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
const int tj = get_global_id(1) + 1;
const int ti = blockDim.x*blockIdx.x + threadIdx.x + 1; //Skip global ghost cells, i.e., +1
const int tj = blockDim.y*blockIdx.y + threadIdx.y + 1;
//Only write internal cells
if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) {
@ -209,12 +172,12 @@ __device__ void writeBlock2(float* h_ptr_, int h_pitch_,
float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
const int nx_, const int ny_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
//Index of cell within domain
const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
const int tj = get_global_id(1) + 2;
const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2
const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2;
//Only write internal cells
if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) {
@ -242,12 +205,12 @@ __device__ void writeBlock2(float* h_ptr_, int h_pitch_,
*/
__device__ void noFlowBoundary1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const int nx_, const int ny_) {
//Global index
const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
const int tj = get_global_id(1) + 1;
const int ti = blockDim.x*blockIdx.x + threadIdx.x + 1; //Skip global ghost cells, i.e., +1
const int tj = blockDim.y*blockIdx.y + threadIdx.y + 1;
//Block-local indices
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int i = tx + 1; //Skip local ghost cells, i.e., +1
const int j = ty + 1;
@ -286,12 +249,12 @@ __device__ void noFlowBoundary1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const
*/
__device__ void noFlowBoundary2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const int nx_, const int ny_) {
//Global index
const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
const int tj = get_global_id(1) + 2;
const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2
const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2;
//Block-local indices
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int i = tx + 2; //Skip local ghost cells, i.e., +2
const int j = ty + 2;
@ -347,12 +310,12 @@ __device__ void evolveF1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
const int nx_, const int ny_,
const float dx_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
//Index of cell within domain
const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
const int tj = get_global_id(1) + 1;
const int ti = blockDim.x*blockIdx.x + threadIdx.x + 1; //Skip global ghost cells, i.e., +1
const int tj = blockDim.y*blockIdx.y + threadIdx.y + 1;
if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) {
const int i = tx + 1; //Skip local ghost cells, i.e., +1
@ -377,12 +340,12 @@ __device__ void evolveF2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
const int nx_, const int ny_,
const float dx_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
//Index of cell within domain
const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
const int tj = get_global_id(1) + 2;
const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2
const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2;
if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) {
const int i = tx + 2; //Skip local ghost cells, i.e., +1
@ -407,12 +370,12 @@ __device__ void evolveG1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
const int nx_, const int ny_,
const float dy_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
//Index of cell within domain
const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
const int tj = get_global_id(1) + 1;
const int ti = blockDim.x*blockIdx.x + threadIdx.x + 1; //Skip global ghost cells, i.e., +1
const int tj = blockDim.y*blockIdx.y + threadIdx.y + 1;
if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) {
const int i = tx + 1; //Skip local ghost cells, i.e., +1
@ -438,12 +401,12 @@ __device__ void evolveG2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
const int nx_, const int ny_,
const float dy_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
//Index of cell within domain
const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
const int tj = get_global_id(1) + 2;
const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2
const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2;
if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) {
const int i = tx + 2; //Skip local ghost cells, i.e., +2
@ -464,23 +427,6 @@ __device__ void evolveG2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
__device__ float3 F_func(const float3 Q, const float g) {
float3 F;
F.x = Q.y; //hu
F.y = Q.y*Q.y / Q.x + 0.5f*g*Q.x*Q.x; //hu*hu/h + 0.5f*g*h*h;
F.z = Q.y*Q.z / Q.x; //hu*hv/h;
return F;
}

View File

@ -50,8 +50,8 @@ __device__ void minmodSlopeX(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
const float theta_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
//Reconstruct slopes along x axis
{
@ -74,8 +74,8 @@ __device__ void minmodSlopeY(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
const float theta_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
const int tx = threadIdx.x;
const int ty = threadIdx.y;
for (int j=ty; j<BLOCK_HEIGHT+2; j+=BLOCK_HEIGHT) {
const int l = j + 1;