mirror of
https://github.com/smyalygames/FiniteVolumeGPU.git
synced 2025-05-18 06:24:13 +02:00
Checkpoint
This commit is contained in:
parent
918d22b257
commit
daa175ef32
@ -34,8 +34,8 @@ from GPUSimulators import Common, Simulator
|
||||
class Autotuner:
|
||||
def __init__(self,
|
||||
nx=2048, ny=2048,
|
||||
block_widths=range(8, 32, 2),
|
||||
block_heights=range(8, 32, 2)):
|
||||
block_widths=range(8, 32, 1),
|
||||
block_heights=range(8, 32, 1)):
|
||||
logger = logging.getLogger(__name__)
|
||||
self.filename = "autotuning_data_" + gethostname() + ".npz"
|
||||
self.nx = nx
|
||||
@ -239,19 +239,21 @@ class Autotuner:
|
||||
h = np.zeros((ny, nx), dtype=np.float32);
|
||||
hu = np.zeros((ny, nx), dtype=np.float32);
|
||||
hv = np.zeros((ny, nx), dtype=np.float32);
|
||||
|
||||
x = dx*(np.arange(0, nx, dtype=np.float32)+0.5) - x_center
|
||||
y = dy*(np.arange(0, ny, dtype=np.float32)+0.5) - y_center
|
||||
|
||||
extent = 1.0/np.sqrt(2.0)
|
||||
x = (dx*(np.arange(0, nx, dtype=np.float32)+0.5) - x_center) / size
|
||||
y = (dy*(np.arange(0, ny, dtype=np.float32)+0.5) - y_center) / size
|
||||
xv, yv = np.meshgrid(x, y, sparse=False, indexing='xy')
|
||||
r = np.sqrt(xv**2 + yv**2)
|
||||
r = np.minimum(1.0, np.sqrt(xv**2 + yv**2))
|
||||
xv = None
|
||||
yv = None
|
||||
gc.collect()
|
||||
|
||||
|
||||
#Generate highres
|
||||
h = 0.5 + 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size)
|
||||
hu = 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size)
|
||||
hv = 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size)
|
||||
cos = np.cos(np.pi*r)
|
||||
h = 0.5 + 0.1*0.5*(1.0 + cos)
|
||||
hu = 0.1*0.5*(1.0 + cos)
|
||||
hv = hu.copy()
|
||||
|
||||
scale = 0.7
|
||||
max_h_estimate = 0.6
|
||||
|
@ -289,8 +289,9 @@ class CudaContext(object):
|
||||
|
||||
|
||||
|
||||
|
||||
"""
|
||||
Class that holds data
|
||||
Class that holds 2D data
|
||||
"""
|
||||
class CudaArray2D:
|
||||
"""
|
||||
@ -307,42 +308,41 @@ class CudaArray2D:
|
||||
ny_halo = ny + 2*y_halo
|
||||
|
||||
#self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny)
|
||||
#Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
|
||||
self.data = pycuda.gpuarray.empty((ny_halo, nx_halo), dtype)
|
||||
|
||||
#Make sure data is in proper format
|
||||
if cpu_data is not None:
|
||||
assert cpu_data.itemsize == 4, "Wrong size of data type"
|
||||
assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"
|
||||
|
||||
#Upload data to the device
|
||||
#If we don't have any data, just allocate and return
|
||||
if cpu_data is None:
|
||||
self.data = pycuda.gpuarray.empty((ny_halo, nx_halo), dtype)
|
||||
elif (cpu_data.shape == (ny_halo, nx_halo)):
|
||||
self.data = pycuda.gpuarray.to_gpu_async(cpu_data, stream=stream)
|
||||
elif (cpu_data.shape == (self.ny, self.nx)):
|
||||
#Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
|
||||
self.data = pycuda.gpuarray.empty((ny_halo, nx_halo), dtype)
|
||||
#self.data.fill(0.0)
|
||||
return
|
||||
|
||||
#Create copy object from host to device
|
||||
copy = cuda.Memcpy2D()
|
||||
copy.set_src_host(cpu_data)
|
||||
copy.set_dst_device(self.data.gpudata)
|
||||
#Make sure data is in proper format
|
||||
assert cpu_data.shape == (ny_halo, nx_halo) or cpu_data.shape == (self.ny, self.nx), "Wrong shape of data %s vs %s / %s" % (str(cpu_data.shape), str((self.ny, self.nx)), str((ny_halo, nx_halo)))
|
||||
assert cpu_data.itemsize == 4, "Wrong size of data type"
|
||||
assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"
|
||||
|
||||
#Create copy object from host to device
|
||||
copy = cuda.Memcpy2D()
|
||||
copy.set_src_host(cpu_data)
|
||||
copy.set_dst_device(self.data.gpudata)
|
||||
|
||||
#Set offsets and pitch of destination
|
||||
copy.dst_x_in_bytes = self.x_halo*self.data.strides[1]
|
||||
copy.dst_y = self.y_halo
|
||||
copy.dst_pitch = self.data.strides[0]
|
||||
|
||||
#Set width in bytes to copy for each row and
|
||||
#number of rows to copy
|
||||
copy.width_in_bytes = self.nx*cpu_data.itemsize
|
||||
copy.height = self.ny
|
||||
|
||||
#Perform the copy
|
||||
copy(stream)
|
||||
stream.synchronize()
|
||||
else:
|
||||
assert False, "Wrong data shape: %s vs %s / %s" % (str(cpu_data.shape), str((self.ny, self.nx)), str((ny_halo, nx_halo)))
|
||||
#Set offsets of upload in destination
|
||||
x_offset = (nx_halo - cpu_data.shape[1]) // 2
|
||||
y_offset = (ny_halo - cpu_data.shape[0]) // 2
|
||||
copy.dst_x_in_bytes = x_offset*self.data.strides[1]
|
||||
copy.dst_y = y_offset
|
||||
|
||||
#Set destination pitch
|
||||
copy.dst_pitch = self.data.strides[0]
|
||||
|
||||
#Set width in bytes to copy for each row and
|
||||
#number of rows to copy
|
||||
width = max(self.nx, cpu_data.shape[1])
|
||||
height = max(self.ny, cpu_data.shape[0])
|
||||
copy.width_in_bytes = width*cpu_data.itemsize
|
||||
copy.height = height
|
||||
|
||||
#Perform the copy
|
||||
copy(stream)
|
||||
|
||||
#self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
|
||||
|
||||
@ -389,6 +389,114 @@ class CudaArray2D:
|
||||
|
||||
|
||||
|
||||
"""
|
||||
Class that holds 2D data
|
||||
"""
|
||||
class CudaArray3D:
|
||||
"""
|
||||
Uploads initial data to the CL device
|
||||
"""
|
||||
def __init__(self, stream, nx, ny, nz, x_halo, y_halo, z_halo, cpu_data=None, dtype=np.float32):
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.nx = nx
|
||||
self.ny = ny
|
||||
self.nz = nz
|
||||
self.x_halo = x_halo
|
||||
self.y_halo = y_halo
|
||||
self.z_halo = z_halo
|
||||
|
||||
nx_halo = nx + 2*x_halo
|
||||
ny_halo = ny + 2*y_halo
|
||||
nz_halo = nz + 2*z_halo
|
||||
|
||||
#self.logger.debug("Allocating [%dx%dx%d] buffer", self.nx, self.ny, self.nz)
|
||||
#Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
|
||||
self.data = pycuda.gpuarray.empty((nz_halo, ny_halo, nx_halo), dtype)
|
||||
|
||||
#If we don't have any data, just allocate and return
|
||||
if cpu_data is None:
|
||||
return
|
||||
|
||||
#Make sure data is in proper format
|
||||
assert cpu_data.shape == (nz_halo, ny_halo, nx_halo) or cpu_data.shape == (self.nz, self.ny, self.nx), "Wrong shape of data %s vs %s / %s" % (str(cpu_data.shape), str((self.nz, self.ny, self.nx)), str((nz_halo, ny_halo, nx_halo)))
|
||||
assert cpu_data.itemsize == 4, "Wrong size of data type"
|
||||
assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"
|
||||
|
||||
#Create copy object from host to device
|
||||
copy = cuda.Memcpy3D()
|
||||
copy.set_src_host(cpu_data)
|
||||
copy.set_dst_device(self.data.gpudata)
|
||||
|
||||
#Set offsets of destination
|
||||
x_offset = (nx_halo - cpu_data.shape[2]) // 2
|
||||
y_offset = (ny_halo - cpu_data.shape[1]) // 2
|
||||
z_offset = (nz_halo - cpu_data.shape[0]) // 2
|
||||
copy.dst_x_in_bytes = x_offset*self.data.strides[1]
|
||||
copy.dst_y = y_offset
|
||||
copy.dst_z = z_offset
|
||||
|
||||
#Set pitch of destination
|
||||
copy.dst_pitch = self.data.strides[0]
|
||||
|
||||
#Set width in bytes to copy for each row and
|
||||
#number of rows to copy
|
||||
width = max(self.nx, cpu_data.shape[2])
|
||||
height = max(self.ny, cpu_data.shape[1])
|
||||
depth = max(self.nz, cpu-data.shape[0])
|
||||
copy.width_in_bytes = width*cpu_data.itemsize
|
||||
copy.height = height
|
||||
copy.depth = depth
|
||||
|
||||
#Perform the copy
|
||||
copy(stream)
|
||||
|
||||
#self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
|
||||
|
||||
|
||||
def __del__(self, *args):
|
||||
#self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
|
||||
self.data.gpudata.free()
|
||||
self.data = None
|
||||
|
||||
"""
|
||||
Enables downloading data from GPU to Python
|
||||
"""
|
||||
def download(self, stream, async=False):
|
||||
#self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
|
||||
#Allocate host memory
|
||||
#cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)
|
||||
cpu_data = np.empty((self.nz, self.ny, self.nx), dtype=np.float32)
|
||||
|
||||
#Create copy object from device to host
|
||||
copy = cuda.Memcpy2D()
|
||||
copy.set_src_device(self.data.gpudata)
|
||||
copy.set_dst_host(cpu_data)
|
||||
|
||||
#Set offsets and pitch of source
|
||||
copy.src_x_in_bytes = self.x_halo*self.data.strides[1]
|
||||
copy.src_y = self.y_halo
|
||||
copy.src_z = self.z_halo
|
||||
copy.src_pitch = self.data.strides[0]
|
||||
|
||||
#Set width in bytes to copy for each row and
|
||||
#number of rows to copy
|
||||
copy.width_in_bytes = self.nx*cpu_data.itemsize
|
||||
copy.height = self.ny
|
||||
copy.depth = self.nz
|
||||
|
||||
copy(stream)
|
||||
if async==False:
|
||||
stream.synchronize()
|
||||
|
||||
return cpu_data
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
"""
|
||||
A class representing an Arakawa A type (unstaggered, logically Cartesian) grid
|
||||
|
58
GPUSimulators/EulerCommon.cu
Normal file
58
GPUSimulators/EulerCommon.cu
Normal file
@ -0,0 +1,58 @@
|
||||
/*
|
||||
This OpenCL kernel implements the Kurganov-Petrova numerical scheme
|
||||
for the shallow water equations, described in
|
||||
A. Kurganov & Guergana Petrova
|
||||
A Second-Order Well-Balanced Positivity Preserving Central-Upwind
|
||||
Scheme for the Saint-Venant System Communications in Mathematical
|
||||
Sciences, 5 (2007), 133-160.
|
||||
|
||||
Copyright (C) 2016 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
__device__ float pressure(float4 Q, float gamma) {
|
||||
const float rho = Q.x;
|
||||
const float rho_u = Q.y;
|
||||
const float rho_v = Q.z;
|
||||
const float E = Q.w;
|
||||
|
||||
return (gamma-1.0f)*(E-0.5f*(rho_u*rho_u + rho_v*rho_v)/rho);
|
||||
}
|
||||
|
||||
|
||||
__device__ float4 F_func(const float4 Q, float gamma) {
|
||||
const float rho = Q.x;
|
||||
const float rho_u = Q.y;
|
||||
const float rho_v = Q.z;
|
||||
const float E = Q.w;
|
||||
|
||||
const float u = rho_u/rho;
|
||||
const float P = pressure(Q, gamma);
|
||||
|
||||
float4 F;
|
||||
|
||||
F.x = rho_u;
|
||||
F.y = rho_u*u + P;
|
||||
F.z = rho_v*u;
|
||||
F.w = u*(E+P);
|
||||
|
||||
return F;
|
||||
}
|
@ -20,6 +20,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
#include "common.cu"
|
||||
#include "SWECommon.cu"
|
||||
#include "fluxes/FirstOrderCentered.cu"
|
||||
|
||||
|
||||
@ -32,8 +33,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
const float g_, const float dx_, const float dt_) {
|
||||
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
//Compute fluxes along the x axis
|
||||
{
|
||||
@ -68,8 +69,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
//Compute fluxes along the y axis
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
|
132
GPUSimulators/HLL2Euler.py
Normal file
132
GPUSimulators/HLL2Euler.py
Normal file
@ -0,0 +1,132 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
This python module implements the 2nd order HLL flux
|
||||
|
||||
Copyright (C) 2016 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
"""
|
||||
|
||||
#Import packages we need
|
||||
import numpy as np
|
||||
from GPUSimulators import Simulator, Common
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
"""
|
||||
Class that solves the SW equations using the Forward-Backward linear scheme
|
||||
"""
|
||||
class HLL2Euler (Simulator.BaseSimulator):
|
||||
|
||||
"""
|
||||
Initialization routine
|
||||
rho: Density
|
||||
rho_u: Momentum along x-axis
|
||||
rho_v: Momentum along y-axis
|
||||
E: energy
|
||||
nx: Number of cells along x-axis
|
||||
ny: Number of cells along y-axis
|
||||
dx: Grid cell spacing along x-axis
|
||||
dy: Grid cell spacing along y-axis
|
||||
dt: Size of each timestep
|
||||
g: Gravitational accelleration
|
||||
"""
|
||||
def __init__(self, \
|
||||
context, \
|
||||
rho, rho_u, rho_v, E, \
|
||||
nx, ny, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
theta=1.8, \
|
||||
block_width=16, block_height=16):
|
||||
|
||||
# Call super constructor
|
||||
super().__init__(context, \
|
||||
nx, ny, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width, block_height);
|
||||
|
||||
self.theta = np.float32(theta)
|
||||
|
||||
#Get kernels
|
||||
self.kernel = context.get_prepared_kernel("HLL2_kernel.cu", "HLL2Kernel", \
|
||||
"iifffffiPiPiPiPiPiPi", \
|
||||
BLOCK_WIDTH=self.local_size[0], \
|
||||
BLOCK_HEIGHT=self.local_size[1])
|
||||
|
||||
#Create data by uploading to device
|
||||
self.u0 = Common.ArakawaA2D(self.stream, \
|
||||
nx, ny, \
|
||||
2, 2, \
|
||||
[rho, rho_u, rho_v, E])
|
||||
self.u1 = Common.ArakawaA2D(self.stream, \
|
||||
nx, ny, \
|
||||
2, 2, \
|
||||
[None, None, None, None])
|
||||
|
||||
def __str__(self):
|
||||
return "Harten-Lax-van Leer (2nd order)"
|
||||
|
||||
def simulate(self, t_end):
|
||||
return super().simulateDimsplit(t_end)
|
||||
|
||||
def stepEuler(self, dt):
|
||||
return self.stepDimsplitXY(dt)
|
||||
|
||||
def stepDimsplitXY(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
self.theta, \
|
||||
np.int32(0), \
|
||||
self.u0[0].data.gpudata, self.u0[0].data.strides[0], \
|
||||
self.u0[1].data.gpudata, self.u0[1].data.strides[0], \
|
||||
self.u0[2].data.gpudata, self.u0[2].data.strides[0], \
|
||||
self.u0[3].data.gpudata, self.u0[3].data.strides[0], \
|
||||
self.u1[0].data.gpudata, self.u1[0].data.strides[0], \
|
||||
self.u1[1].data.gpudata, self.u1[1].data.strides[0], \
|
||||
self.u1[2].data.gpudata, self.u1[2].data.strides[0]
|
||||
self.u1[3].data.gpudata, self.u1[3].data.strides[0])
|
||||
self.u0, self.u1 = self.u1, self.u0
|
||||
self.t += dt
|
||||
|
||||
def stepDimsplitYX(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
self.theta, \
|
||||
np.int32(1), \
|
||||
self.u0[0].data.gpudata, self.u0[0].data.strides[0], \
|
||||
self.u0[1].data.gpudata, self.u0[1].data.strides[0], \
|
||||
self.u0[2].data.gpudata, self.u0[2].data.strides[0], \
|
||||
self.u0[3].data.gpudata, self.u0[3].data.strides[0], \
|
||||
self.u1[0].data.gpudata, self.u1[0].data.strides[0], \
|
||||
self.u1[1].data.gpudata, self.u1[1].data.strides[0], \
|
||||
self.u1[2].data.gpudata, self.u1[2].data.strides[0]
|
||||
self.u1[3].data.gpudata, self.u1[3].data.strides[0])
|
||||
self.u0, self.u1 = self.u1, self.u0
|
||||
self.t += dt
|
||||
|
||||
def download(self):
|
||||
return self.u0.download(self.stream)
|
@ -19,6 +19,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
#include "common.cu"
|
||||
#include "SWECommon.cu"
|
||||
#include "limiters.cu"
|
||||
#include "fluxes/HartenLaxVanLeer.cu"
|
||||
|
||||
@ -37,8 +38,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dx_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
{
|
||||
const int j=ty;
|
||||
@ -89,8 +90,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
const int l = j + 1;
|
||||
|
@ -20,6 +20,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
#include "common.cu"
|
||||
#include "SWECommon.cu"
|
||||
#include "fluxes/HartenLaxVanLeer.cu"
|
||||
|
||||
|
||||
@ -34,8 +35,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
{
|
||||
const int j=ty;
|
||||
@ -68,8 +69,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
const int l = j;
|
||||
@ -156,7 +157,7 @@ __global__ void HLLKernel(
|
||||
__syncthreads();
|
||||
|
||||
|
||||
//Q[0][get_local_id(1) + 1][get_local_id(0) + 1] += 0.1;
|
||||
//Q[0][threadIdx.y + 1][threadIdx.x + 1] += 0.1;
|
||||
|
||||
|
||||
|
||||
|
@ -25,6 +25,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
#include "common.cu"
|
||||
#include "SWECommon.cu"
|
||||
#include "limiters.cu"
|
||||
#include "fluxes/CentralUpwind.cu"
|
||||
|
||||
@ -35,8 +36,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dx_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
{
|
||||
int j=ty;
|
||||
@ -80,8 +81,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
const int l = j + 1;
|
||||
|
@ -25,6 +25,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
#include "common.cu"
|
||||
#include "SWECommon.cu"
|
||||
#include "limiters.cu"
|
||||
#include "fluxes/CentralUpwind.cu"
|
||||
|
||||
@ -35,8 +36,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
{
|
||||
int j=ty;
|
||||
@ -66,8 +67,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
const int l = j + 1;
|
||||
@ -120,12 +121,12 @@ __global__ void KP07Kernel(
|
||||
float* hv1_ptr_, int hv1_pitch_) {
|
||||
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
//Index of cell within domain
|
||||
const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
|
||||
const int tj = get_global_id(1) + 2;
|
||||
const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2
|
||||
const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2;
|
||||
|
||||
//Shared memory variables
|
||||
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
|
||||
|
@ -19,7 +19,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
#include "common.cu"
|
||||
#include "Common.cu"
|
||||
#include "SWECommon.cu"
|
||||
#include "fluxes/LaxFriedrichs.cu"
|
||||
|
||||
|
||||
@ -32,8 +33,8 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
|
||||
float F[3][block_height][block_width+1],
|
||||
const float g_, const float dx_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
{
|
||||
const int j=ty;
|
||||
@ -68,8 +69,8 @@ void computeFluxG(float Q[3][block_height+2][block_width+2],
|
||||
float G[3][block_height+1][block_width],
|
||||
const float g_, const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
for (int j=ty; j<block_height+1; j+=block_height) {
|
||||
const int l = j;
|
||||
@ -118,8 +119,8 @@ void LxFKernel(
|
||||
const int block_height = BLOCK_HEIGHT;
|
||||
|
||||
//Index of cell within domain
|
||||
const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
|
||||
const int tj = get_global_id(1) + 1;
|
||||
const int ti = blockDim.x*blockIdx.x + threadIdx.x + 1; //Skip global ghost cells, i.e., +1
|
||||
const int tj = blockDim.y*blockIdx.y + threadIdx.y + 1;
|
||||
|
||||
__shared__ float Q[3][block_height+2][block_width+2];
|
||||
__shared__ float F[3][block_height][block_width+1];
|
||||
@ -146,8 +147,8 @@ void LxFKernel(
|
||||
//Evolve for all internal cells
|
||||
if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
const int i = tx + 1; //Skip local ghost cells, i.e., +1
|
||||
const int j = ty + 1;
|
||||
|
66
GPUSimulators/SWECommon.cu
Normal file
66
GPUSimulators/SWECommon.cu
Normal file
@ -0,0 +1,66 @@
|
||||
/*
|
||||
This OpenCL kernel implements the Kurganov-Petrova numerical scheme
|
||||
for the shallow water equations, described in
|
||||
A. Kurganov & Guergana Petrova
|
||||
A Second-Order Well-Balanced Positivity Preserving Central-Upwind
|
||||
Scheme for the Saint-Venant System Communications in Mathematical
|
||||
Sciences, 5 (2007), 133-160.
|
||||
|
||||
Copyright (C) 2016 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
__device__ float3 F_func(const float3 Q, const float g) {
|
||||
float3 F;
|
||||
|
||||
F.x = Q.y; //hu
|
||||
F.y = Q.y*Q.y / Q.x + 0.5f*g*Q.x*Q.x; //hu*hu/h + 0.5f*g*h*h;
|
||||
F.z = Q.y*Q.z / Q.x; //hu*hv/h;
|
||||
|
||||
return F;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -25,6 +25,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
#include "common.cu"
|
||||
#include "SWECommon.cu"
|
||||
#include "fluxes/WeightedAverageFlux.cu"
|
||||
|
||||
|
||||
@ -37,8 +38,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dx_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
{
|
||||
int j=ty;
|
||||
@ -76,8 +77,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
//Compute fluxes along the y axis
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
|
@ -23,43 +23,6 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* Location of thread in block
|
||||
*/
|
||||
inline __device__ int get_local_id(int dim) {
|
||||
switch(dim) {
|
||||
case 0: return threadIdx.x;
|
||||
case 1: return threadIdx.y;
|
||||
case 2: return threadIdx.z;
|
||||
default: return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get block index
|
||||
*/
|
||||
__device__ int get_group_id(int dim) {
|
||||
switch(dim) {
|
||||
case 0: return blockIdx.x;
|
||||
case 1: return blockIdx.y;
|
||||
case 2: return blockIdx.z;
|
||||
default: return -1;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Location of thread in global domain
|
||||
*/
|
||||
__device__ int get_global_id(int dim) {
|
||||
switch(dim) {
|
||||
case 0: return blockDim.x*blockIdx.x + threadIdx.x;
|
||||
case 1: return blockDim.y*blockIdx.y + threadIdx.y;
|
||||
case 2: return blockDim.z*blockIdx.z + threadIdx.z;
|
||||
default: return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Float3 operators
|
||||
@ -97,12 +60,12 @@ __device__ void readBlock1(float* h_ptr_, int h_pitch_,
|
||||
float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
const int nx_, const int ny_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
//Index of block within domain
|
||||
const int bx = BLOCK_WIDTH * get_group_id(0);
|
||||
const int by = BLOCK_HEIGHT * get_group_id(1);
|
||||
const int bx = BLOCK_WIDTH * blockIdx.x;
|
||||
const int by = BLOCK_HEIGHT * blockIdx.y;
|
||||
|
||||
//Read into shared memory
|
||||
for (int j=ty; j<BLOCK_HEIGHT+2; j+=BLOCK_HEIGHT) {
|
||||
@ -136,12 +99,12 @@ __device__ void readBlock2(float* h_ptr_, int h_pitch_,
|
||||
float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
const int nx_, const int ny_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
//Index of block within domain
|
||||
const int bx = BLOCK_WIDTH * get_group_id(0);
|
||||
const int by = BLOCK_HEIGHT * get_group_id(1);
|
||||
const int bx = BLOCK_WIDTH * blockIdx.x;
|
||||
const int by = BLOCK_HEIGHT * blockIdx.y;
|
||||
|
||||
//Read into shared memory
|
||||
for (int j=ty; j<BLOCK_HEIGHT+4; j+=BLOCK_HEIGHT) {
|
||||
@ -174,12 +137,12 @@ __device__ void writeBlock1(float* h_ptr_, int h_pitch_,
|
||||
float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
const int nx_, const int ny_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
//Index of cell within domain
|
||||
const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
|
||||
const int tj = get_global_id(1) + 1;
|
||||
const int ti = blockDim.x*blockIdx.x + threadIdx.x + 1; //Skip global ghost cells, i.e., +1
|
||||
const int tj = blockDim.y*blockIdx.y + threadIdx.y + 1;
|
||||
|
||||
//Only write internal cells
|
||||
if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) {
|
||||
@ -209,12 +172,12 @@ __device__ void writeBlock2(float* h_ptr_, int h_pitch_,
|
||||
float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
const int nx_, const int ny_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
//Index of cell within domain
|
||||
const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
|
||||
const int tj = get_global_id(1) + 2;
|
||||
const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2
|
||||
const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2;
|
||||
|
||||
//Only write internal cells
|
||||
if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) {
|
||||
@ -242,12 +205,12 @@ __device__ void writeBlock2(float* h_ptr_, int h_pitch_,
|
||||
*/
|
||||
__device__ void noFlowBoundary1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const int nx_, const int ny_) {
|
||||
//Global index
|
||||
const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
|
||||
const int tj = get_global_id(1) + 1;
|
||||
const int ti = blockDim.x*blockIdx.x + threadIdx.x + 1; //Skip global ghost cells, i.e., +1
|
||||
const int tj = blockDim.y*blockIdx.y + threadIdx.y + 1;
|
||||
|
||||
//Block-local indices
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
const int i = tx + 1; //Skip local ghost cells, i.e., +1
|
||||
const int j = ty + 1;
|
||||
@ -286,12 +249,12 @@ __device__ void noFlowBoundary1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const
|
||||
*/
|
||||
__device__ void noFlowBoundary2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const int nx_, const int ny_) {
|
||||
//Global index
|
||||
const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
|
||||
const int tj = get_global_id(1) + 2;
|
||||
const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2
|
||||
const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2;
|
||||
|
||||
//Block-local indices
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
const int i = tx + 2; //Skip local ghost cells, i.e., +2
|
||||
const int j = ty + 2;
|
||||
@ -347,12 +310,12 @@ __device__ void evolveF1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
const int nx_, const int ny_,
|
||||
const float dx_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
//Index of cell within domain
|
||||
const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
|
||||
const int tj = get_global_id(1) + 1;
|
||||
const int ti = blockDim.x*blockIdx.x + threadIdx.x + 1; //Skip global ghost cells, i.e., +1
|
||||
const int tj = blockDim.y*blockIdx.y + threadIdx.y + 1;
|
||||
|
||||
if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) {
|
||||
const int i = tx + 1; //Skip local ghost cells, i.e., +1
|
||||
@ -377,12 +340,12 @@ __device__ void evolveF2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
const int nx_, const int ny_,
|
||||
const float dx_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
//Index of cell within domain
|
||||
const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
|
||||
const int tj = get_global_id(1) + 2;
|
||||
const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2
|
||||
const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2;
|
||||
|
||||
if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) {
|
||||
const int i = tx + 2; //Skip local ghost cells, i.e., +1
|
||||
@ -407,12 +370,12 @@ __device__ void evolveG1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
const int nx_, const int ny_,
|
||||
const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
//Index of cell within domain
|
||||
const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
|
||||
const int tj = get_global_id(1) + 1;
|
||||
const int ti = blockDim.x*blockIdx.x + threadIdx.x + 1; //Skip global ghost cells, i.e., +1
|
||||
const int tj = blockDim.y*blockIdx.y + threadIdx.y + 1;
|
||||
|
||||
if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) {
|
||||
const int i = tx + 1; //Skip local ghost cells, i.e., +1
|
||||
@ -438,12 +401,12 @@ __device__ void evolveG2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
const int nx_, const int ny_,
|
||||
const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
//Index of cell within domain
|
||||
const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
|
||||
const int tj = get_global_id(1) + 2;
|
||||
const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2
|
||||
const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2;
|
||||
|
||||
if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) {
|
||||
const int i = tx + 2; //Skip local ghost cells, i.e., +2
|
||||
@ -464,23 +427,6 @@ __device__ void evolveG2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
|
||||
|
||||
|
||||
__device__ float3 F_func(const float3 Q, const float g) {
|
||||
float3 F;
|
||||
|
||||
F.x = Q.y; //hu
|
||||
F.y = Q.y*Q.y / Q.x + 0.5f*g*Q.x*Q.x; //hu*hu/h + 0.5f*g*h*h;
|
||||
F.z = Q.y*Q.z / Q.x; //hu*hv/h;
|
||||
|
||||
return F;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -50,8 +50,8 @@ __device__ void minmodSlopeX(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
const float theta_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
//Reconstruct slopes along x axis
|
||||
{
|
||||
@ -74,8 +74,8 @@ __device__ void minmodSlopeY(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
const float theta_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
const int tx = threadIdx.x;
|
||||
const int ty = threadIdx.y;
|
||||
|
||||
for (int j=ty; j<BLOCK_HEIGHT+2; j+=BLOCK_HEIGHT) {
|
||||
const int l = j + 1;
|
||||
|
Loading…
x
Reference in New Issue
Block a user