diff --git a/GPUSimulators/Autotuner.py b/GPUSimulators/Autotuner.py index 44cdb00..a77447b 100644 --- a/GPUSimulators/Autotuner.py +++ b/GPUSimulators/Autotuner.py @@ -34,8 +34,8 @@ from GPUSimulators import Common, Simulator class Autotuner: def __init__(self, nx=2048, ny=2048, - block_widths=range(8, 32, 2), - block_heights=range(8, 32, 2)): + block_widths=range(8, 32, 1), + block_heights=range(8, 32, 1)): logger = logging.getLogger(__name__) self.filename = "autotuning_data_" + gethostname() + ".npz" self.nx = nx @@ -239,19 +239,21 @@ class Autotuner: h = np.zeros((ny, nx), dtype=np.float32); hu = np.zeros((ny, nx), dtype=np.float32); hv = np.zeros((ny, nx), dtype=np.float32); - - x = dx*(np.arange(0, nx, dtype=np.float32)+0.5) - x_center - y = dy*(np.arange(0, ny, dtype=np.float32)+0.5) - y_center + + extent = 1.0/np.sqrt(2.0) + x = (dx*(np.arange(0, nx, dtype=np.float32)+0.5) - x_center) / size + y = (dy*(np.arange(0, ny, dtype=np.float32)+0.5) - y_center) / size xv, yv = np.meshgrid(x, y, sparse=False, indexing='xy') - r = np.sqrt(xv**2 + yv**2) + r = np.minimum(1.0, np.sqrt(xv**2 + yv**2)) xv = None yv = None gc.collect() - + #Generate highres - h = 0.5 + 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size) - hu = 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size) - hv = 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size) + cos = np.cos(np.pi*r) + h = 0.5 + 0.1*0.5*(1.0 + cos) + hu = 0.1*0.5*(1.0 + cos) + hv = hu.copy() scale = 0.7 max_h_estimate = 0.6 diff --git a/GPUSimulators/Common.py b/GPUSimulators/Common.py index 84bdb7a..17b20a4 100644 --- a/GPUSimulators/Common.py +++ b/GPUSimulators/Common.py @@ -289,8 +289,9 @@ class CudaContext(object): + """ -Class that holds data +Class that holds 2D data """ class CudaArray2D: """ @@ -307,42 +308,41 @@ class CudaArray2D: ny_halo = ny + 2*y_halo #self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny) + #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here + self.data = pycuda.gpuarray.empty((ny_halo, nx_halo), dtype) - #Make sure data is in proper format - if cpu_data is not None: - assert cpu_data.itemsize == 4, "Wrong size of data type" - assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)" - - #Upload data to the device + #If we don't have any data, just allocate and return if cpu_data is None: - self.data = pycuda.gpuarray.empty((ny_halo, nx_halo), dtype) - elif (cpu_data.shape == (ny_halo, nx_halo)): - self.data = pycuda.gpuarray.to_gpu_async(cpu_data, stream=stream) - elif (cpu_data.shape == (self.ny, self.nx)): - #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here - self.data = pycuda.gpuarray.empty((ny_halo, nx_halo), dtype) - #self.data.fill(0.0) + return - #Create copy object from host to device - copy = cuda.Memcpy2D() - copy.set_src_host(cpu_data) - copy.set_dst_device(self.data.gpudata) + #Make sure data is in proper format + assert cpu_data.shape == (ny_halo, nx_halo) or cpu_data.shape == (self.ny, self.nx), "Wrong shape of data %s vs %s / %s" % (str(cpu_data.shape), str((self.ny, self.nx)), str((ny_halo, nx_halo))) + assert cpu_data.itemsize == 4, "Wrong size of data type" + assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)" + + #Create copy object from host to device + copy = cuda.Memcpy2D() + copy.set_src_host(cpu_data) + copy.set_dst_device(self.data.gpudata) - #Set offsets and pitch of destination - copy.dst_x_in_bytes = self.x_halo*self.data.strides[1] - copy.dst_y = self.y_halo - copy.dst_pitch = self.data.strides[0] - - #Set width in bytes to copy 
for each row and
-            #number of rows to copy
-            copy.width_in_bytes = self.nx*cpu_data.itemsize
-            copy.height = self.ny
-            
-            #Perform the copy
-            copy(stream)
-            stream.synchronize()
-        else:
-            assert False, "Wrong data shape: %s vs %s / %s" % (str(cpu_data.shape), str((self.ny, self.nx)), str((ny_halo, nx_halo)))
+        #Set offsets of upload in destination
+        x_offset = (nx_halo - cpu_data.shape[1]) // 2
+        y_offset = (ny_halo - cpu_data.shape[0]) // 2
+        copy.dst_x_in_bytes = x_offset*self.data.strides[1]
+        copy.dst_y = y_offset
+        
+        #Set destination pitch
+        copy.dst_pitch = self.data.strides[0]
+        
+        #Set width in bytes to copy for each row and
+        #number of rows to copy
+        width = max(self.nx, cpu_data.shape[1])
+        height = max(self.ny, cpu_data.shape[0])
+        copy.width_in_bytes = width*cpu_data.itemsize
+        copy.height = height
+        
+        #Perform the copy
+        copy(stream)
 
         #self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
@@ -389,6 +389,114 @@ class CudaArray2D:
+"""
+Class that holds 3D data
+"""
+class CudaArray3D:
+    """
+    Uploads initial data to the CUDA device
+    """
+    def __init__(self, stream, nx, ny, nz, x_halo, y_halo, z_halo, cpu_data=None, dtype=np.float32):
+        self.logger = logging.getLogger(__name__)
+        self.nx = nx
+        self.ny = ny
+        self.nz = nz
+        self.x_halo = x_halo
+        self.y_halo = y_halo
+        self.z_halo = z_halo
+        
+        nx_halo = nx + 2*x_halo
+        ny_halo = ny + 2*y_halo
+        nz_halo = nz + 2*z_halo
+        
+        #self.logger.debug("Allocating [%dx%dx%d] buffer", self.nx, self.ny, self.nz)
+        #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
+        self.data = pycuda.gpuarray.empty((nz_halo, ny_halo, nx_halo), dtype)
+        
+        #If we don't have any data, just allocate and return
+        if cpu_data is None:
+            return
+        
+        #Make sure data is in proper format
+        assert cpu_data.shape == (nz_halo, ny_halo, nx_halo) or cpu_data.shape == (self.nz, self.ny, self.nx), "Wrong shape of data %s vs %s / %s" % (str(cpu_data.shape), str((self.nz, self.ny, self.nx)), str((nz_halo, ny_halo, nx_halo)))
+        assert cpu_data.itemsize == 4, "Wrong size of data type"
+        assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"
+        
+        #Create copy object from host to device
+        copy = cuda.Memcpy3D()
+        copy.set_src_host(cpu_data)
+        copy.set_dst_device(self.data.gpudata)
+        
+        #Pitch and slice height of the (contiguous) source data
+        copy.src_pitch = cpu_data.strides[1]
+        copy.src_height = cpu_data.shape[1]
+        
+        #Set offsets of destination
+        x_offset = (nx_halo - cpu_data.shape[2]) // 2
+        y_offset = (ny_halo - cpu_data.shape[1]) // 2
+        z_offset = (nz_halo - cpu_data.shape[0]) // 2
+        copy.dst_x_in_bytes = x_offset*self.data.strides[2]
+        copy.dst_y = y_offset
+        copy.dst_z = z_offset
+        
+        #Set pitch and slice height of destination
+        copy.dst_pitch = self.data.strides[1]
+        copy.dst_height = ny_halo
+        
+        #Set width in bytes to copy for each row and
+        #number of rows and slices to copy
+        width = max(self.nx, cpu_data.shape[2])
+        height = max(self.ny, cpu_data.shape[1])
+        depth = max(self.nz, cpu_data.shape[0])
+        copy.width_in_bytes = width*cpu_data.itemsize
+        copy.height = height
+        copy.depth = depth
+        
+        #Perform the copy
+        copy(stream)
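+        
+        #Note: like CudaArray2D above, the upload is centred in the allocated buffer:
+        #interior-sized data (nz, ny, nx) is placed inside the halo region, while
+        #full-sized data (nz_halo, ny_halo, nx_halo) fills the entire buffer.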
+        
+        #self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
+    
+    
+    def __del__(self, *args):
+        #self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
+        self.data.gpudata.free()
+        self.data = None
+    
+    """
+    Enables downloading data from GPU to Python
+    """
+    def download(self, stream, async=False):
+        #self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
+        #Allocate host memory
+        #cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)
+        cpu_data = np.empty((self.nz, self.ny, self.nx), dtype=np.float32)
+        
+        #Create copy object from device to host
+        copy = cuda.Memcpy3D()
+        copy.set_src_device(self.data.gpudata)
+        copy.set_dst_host(cpu_data)
+        
+        #Set offsets, pitch and slice height of source
+        copy.src_x_in_bytes = self.x_halo*self.data.strides[2]
+        copy.src_y = self.y_halo
+        copy.src_z = self.z_halo
+        copy.src_pitch = self.data.strides[1]
+        copy.src_height = self.ny + 2*self.y_halo
+        
+        #Pitch and slice height of the (contiguous) destination
+        copy.dst_pitch = cpu_data.strides[1]
+        copy.dst_height = self.ny
+        
+        #Set width in bytes to copy for each row and
+        #number of rows and slices to copy
+        copy.width_in_bytes = self.nx*cpu_data.itemsize
+        copy.height = self.ny
+        copy.depth = self.nz
+        
+        copy(stream)
+        if async==False:
+            stream.synchronize()
+        
+        return cpu_data
+
+
+
+
+
+
+
 """
 A class representing an Arakawa A type (unstaggered, logically Cartesian) grid
diff --git a/GPUSimulators/EulerCommon.cu b/GPUSimulators/EulerCommon.cu
new file mode 100644
index 0000000..eaaffe9
--- /dev/null
+++ b/GPUSimulators/EulerCommon.cu
@@ -0,0 +1,58 @@
+/*
+These CUDA kernels implement common functions for the Euler equations:
+the equation of state (pressure) and the flux function along the x axis.
+
+Copyright (C) 2016 SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see .
+*/
+
+
+
+
+
+
+
+__device__ float pressure(float4 Q, float gamma) {
+    const float rho   = Q.x;
+    const float rho_u = Q.y;
+    const float rho_v = Q.z;
+    const float E     = Q.w;
+
+    return (gamma-1.0f)*(E-0.5f*(rho_u*rho_u + rho_v*rho_v)/rho);
+}
+
+
+__device__ float4 F_func(const float4 Q, float gamma) {
+    const float rho   = Q.x;
+    const float rho_u = Q.y;
+    const float rho_v = Q.z;
+    const float E     = Q.w;
+
+    const float u = rho_u/rho;
+    const float P = pressure(Q, gamma);
+
+    float4 F;
+
+    F.x = rho_u;
+    F.y = rho_u*u + P;
+    F.z = rho_v*u;
+    F.w = u*(E+P);
+
+    return F;
+}
\ No newline at end of file
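For orientation, the corresponding flux along the y axis follows the same pattern as F_func above. A minimal sketch (not part of this patch; it only mirrors the x-direction flux and reuses pressure() from EulerCommon.cu) could look like:

__device__ float4 G_func(const float4 Q, float gamma) {
    const float rho   = Q.x;
    const float rho_u = Q.y;
    const float rho_v = Q.z;
    const float E     = Q.w;

    const float v = rho_v/rho;                //velocity along y
    const float P = pressure(Q, gamma);       //ideal-gas pressure

    float4 G;

    G.x = rho_v;
    G.y = rho_u*v;
    G.z = rho_v*v + P;
    G.w = v*(E+P);

    return G;
}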
diff --git a/GPUSimulators/FORCE_kernel.cu b/GPUSimulators/FORCE_kernel.cu
index c43fd38..419c665 100644
--- a/GPUSimulators/FORCE_kernel.cu
+++ b/GPUSimulators/FORCE_kernel.cu
@@ -20,6 +20,7 @@ along with this program. If not, see .
 #include "common.cu"
+#include "SWECommon.cu"
 #include "fluxes/FirstOrderCentered.cu"
@@ -32,8 +33,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const float g_, const float dx_, const float dt_) {
     //Index of thread within block
-    const int tx = get_local_id(0);
-    const int ty = get_local_id(1);
+    const int tx = threadIdx.x;
+    const int ty = threadIdx.y;
     //Compute fluxes along the x axis
     {
@@ -68,8 +69,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1], const float g_, const float dy_, const float dt_) {
     //Index of thread within block
-    const int tx = get_local_id(0);
-    const int ty = get_local_id(1);
+    const int tx = threadIdx.x;
+    const int ty = threadIdx.y;
     //Compute fluxes along the y axis
     for (int j=ty; j.
+"""
+
+#Import packages we need
+import numpy as np
+from GPUSimulators import Simulator, Common
+
+
+
+
+
+
+
+
+"""
+Class that solves the Euler equations using the Harten-Lax-van Leer (2nd order) scheme
+"""
+class HLL2Euler (Simulator.BaseSimulator):
+
+    """
+    Initialization routine
+    rho: Density
+    rho_u: Momentum along x-axis
+    rho_v: Momentum along y-axis
+    E: Energy
+    nx: Number of cells along x-axis
+    ny: Number of cells along y-axis
+    dx: Grid cell spacing along x-axis
+    dy: Grid cell spacing along y-axis
+    dt: Size of each timestep
+    g: Gravitational acceleration
+    """
+    def __init__(self, \
+                 context, \
+                 rho, rho_u, rho_v, E, \
+                 nx, ny, \
+                 dx, dy, dt, \
+                 g, \
+                 theta=1.8, \
+                 block_width=16, block_height=16):
+
+        # Call super constructor
+        super().__init__(context, \
+                         nx, ny, \
+                         dx, dy, dt, \
+                         g, \
+                         block_width, block_height);
+
+        self.theta = np.float32(theta)
+
+        #Get kernels
+        self.kernel = context.get_prepared_kernel("HLL2_kernel.cu", "HLL2Kernel", \
+                                                  "iifffffiPiPiPiPiPiPiPiPi", \
+                                                  BLOCK_WIDTH=self.local_size[0], \
+                                                  BLOCK_HEIGHT=self.local_size[1])
+
+        #Create data by uploading to device
+        self.u0 = Common.ArakawaA2D(self.stream, \
+                                    nx, ny, \
+                                    2, 2, \
+                                    [rho, rho_u, rho_v, E])
+        self.u1 = Common.ArakawaA2D(self.stream, \
+                                    nx, ny, \
+                                    2, 2, \
+                                    [None, None, None, None])
+
+    def __str__(self):
+        return "Harten-Lax-van Leer (2nd order)"
+
+    def simulate(self, t_end):
+        return super().simulateDimsplit(t_end)
+
+    def stepEuler(self, dt):
+        return self.stepDimsplitXY(dt)
+
+    def stepDimsplitXY(self, dt):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                self.nx, self.ny, \
+                self.dx, self.dy, dt, \
+                self.g, \
+                self.theta, \
+                np.int32(0), \
+                self.u0[0].data.gpudata, self.u0[0].data.strides[0], \
+                self.u0[1].data.gpudata, self.u0[1].data.strides[0], \
+                self.u0[2].data.gpudata, self.u0[2].data.strides[0], \
+                self.u0[3].data.gpudata, self.u0[3].data.strides[0], \
+                self.u1[0].data.gpudata, self.u1[0].data.strides[0], \
+                self.u1[1].data.gpudata, self.u1[1].data.strides[0], \
+                self.u1[2].data.gpudata, self.u1[2].data.strides[0], \
+                self.u1[3].data.gpudata, self.u1[3].data.strides[0])
+        self.u0, self.u1 = self.u1, self.u0
+        self.t += dt
+
+    def stepDimsplitYX(self, dt):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                self.nx, self.ny, \
+                self.dx, self.dy, dt, \
+                self.g, \
+                self.theta, \
+                np.int32(1), \
+                self.u0[0].data.gpudata, self.u0[0].data.strides[0], \
+                self.u0[1].data.gpudata, self.u0[1].data.strides[0], \
+                self.u0[2].data.gpudata, self.u0[2].data.strides[0], \
+                self.u0[3].data.gpudata, self.u0[3].data.strides[0], \
+                self.u1[0].data.gpudata, self.u1[0].data.strides[0], \
+                self.u1[1].data.gpudata, self.u1[1].data.strides[0], \
+                self.u1[2].data.gpudata, self.u1[2].data.strides[0], \
+                self.u1[3].data.gpudata, self.u1[3].data.strides[0])
+        self.u0, self.u1 = self.u1, self.u0
+        self.t += dt
+
+    def download(self):
+        return self.u0.download(self.stream)
\ No newline at end of file
diff --git a/GPUSimulators/HLL2_kernel.cu b/GPUSimulators/HLL2_kernel.cu
index 168627f..bdaf0f8 100644
--- a/GPUSimulators/HLL2_kernel.cu
+++ b/GPUSimulators/HLL2_kernel.cu
@@ -19,6 +19,7 @@ along with this program. If not, see .
#include "common.cu" +#include "SWECommon.cu" #include "limiters.cu" #include "fluxes/HartenLaxVanLeer.cu" @@ -37,8 +38,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1], const float g_, const float dx_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; { const int j=ty; @@ -89,8 +90,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1], const float g_, const float dy_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; for (int j=ty; j. #include "common.cu" +#include "SWECommon.cu" #include "fluxes/HartenLaxVanLeer.cu" @@ -34,8 +35,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1], const float g_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; { const int j=ty; @@ -68,8 +69,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1], const float g_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; for (int j=ty; j. #include "common.cu" +#include "SWECommon.cu" #include "limiters.cu" #include "fluxes/CentralUpwind.cu" @@ -35,8 +36,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1], const float g_, const float dx_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; { int j=ty; @@ -80,8 +81,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1], const float g_, const float dy_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; for (int j=ty; j. #include "common.cu" +#include "SWECommon.cu" #include "limiters.cu" #include "fluxes/CentralUpwind.cu" @@ -35,8 +36,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1], const float g_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; { int j=ty; @@ -66,8 +67,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1], const float g_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; for (int j=ty; j. 
*/ -#include "common.cu" +#include "Common.cu" +#include "SWECommon.cu" #include "fluxes/LaxFriedrichs.cu" @@ -32,8 +33,8 @@ void computeFluxF(float Q[3][block_height+2][block_width+2], float F[3][block_height][block_width+1], const float g_, const float dx_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; { const int j=ty; @@ -68,8 +69,8 @@ void computeFluxG(float Q[3][block_height+2][block_width+2], float G[3][block_height+1][block_width], const float g_, const float dy_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; for (int j=ty; j 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; const int i = tx + 1; //Skip local ghost cells, i.e., +1 const int j = ty + 1; diff --git a/GPUSimulators/SWECommon.cu b/GPUSimulators/SWECommon.cu new file mode 100644 index 0000000..389a062 --- /dev/null +++ b/GPUSimulators/SWECommon.cu @@ -0,0 +1,66 @@ +/* +This OpenCL kernel implements the Kurganov-Petrova numerical scheme +for the shallow water equations, described in +A. Kurganov & Guergana Petrova +A Second-Order Well-Balanced Positivity Preserving Central-Upwind +Scheme for the Saint-Venant System Communications in Mathematical +Sciences, 5 (2007), 133-160. + +Copyright (C) 2016 SINTEF ICT + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +*/ + + + + + + + + + +__device__ float3 F_func(const float3 Q, const float g) { + float3 F; + + F.x = Q.y; //hu + F.y = Q.y*Q.y / Q.x + 0.5f*g*Q.x*Q.x; //hu*hu/h + 0.5f*g*h*h; + F.z = Q.y*Q.z / Q.x; //hu*hv/h; + + return F; +} + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/GPUSimulators/WAF_kernel.cu b/GPUSimulators/WAF_kernel.cu index f707aa8..3c971ae 100644 --- a/GPUSimulators/WAF_kernel.cu +++ b/GPUSimulators/WAF_kernel.cu @@ -25,6 +25,7 @@ along with this program. If not, see . #include "common.cu" +#include "SWECommon.cu" #include "fluxes/WeightedAverageFlux.cu" @@ -37,8 +38,8 @@ void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1], const float g_, const float dx_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; { int j=ty; @@ -76,8 +77,8 @@ void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1], const float g_, const float dy_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; //Compute fluxes along the y axis for (int j=ty; j. 
*/ -/** - * Location of thread in block - */ -inline __device__ int get_local_id(int dim) { - switch(dim) { - case 0: return threadIdx.x; - case 1: return threadIdx.y; - case 2: return threadIdx.z; - default: return -1; - } -} - - -/** - * Get block index - */ -__device__ int get_group_id(int dim) { - switch(dim) { - case 0: return blockIdx.x; - case 1: return blockIdx.y; - case 2: return blockIdx.z; - default: return -1; - } -} - -/** - * Location of thread in global domain - */ -__device__ int get_global_id(int dim) { - switch(dim) { - case 0: return blockDim.x*blockIdx.x + threadIdx.x; - case 1: return blockDim.y*blockIdx.y + threadIdx.y; - case 2: return blockDim.z*blockIdx.z + threadIdx.z; - default: return -1; - } -} - /** * Float3 operators @@ -97,12 +60,12 @@ __device__ void readBlock1(float* h_ptr_, int h_pitch_, float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const int nx_, const int ny_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; //Index of block within domain - const int bx = BLOCK_WIDTH * get_group_id(0); - const int by = BLOCK_HEIGHT * get_group_id(1); + const int bx = BLOCK_WIDTH * blockIdx.x; + const int by = BLOCK_HEIGHT * blockIdx.y; //Read into shared memory for (int j=ty; j 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) { @@ -209,12 +172,12 @@ __device__ void writeBlock2(float* h_ptr_, int h_pitch_, float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const int nx_, const int ny_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; //Index of cell within domain - const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2 - const int tj = get_global_id(1) + 2; + const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2 + const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2; //Only write internal cells if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) { @@ -242,12 +205,12 @@ __device__ void writeBlock2(float* h_ptr_, int h_pitch_, */ __device__ void noFlowBoundary1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const int nx_, const int ny_) { //Global index - const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1 - const int tj = get_global_id(1) + 1; + const int ti = blockDim.x*blockIdx.x + threadIdx.x + 1; //Skip global ghost cells, i.e., +1 + const int tj = blockDim.y*blockIdx.y + threadIdx.y + 1; //Block-local indices - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; const int i = tx + 1; //Skip local ghost cells, i.e., +1 const int j = ty + 1; @@ -286,12 +249,12 @@ __device__ void noFlowBoundary1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const */ __device__ void noFlowBoundary2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const int nx_, const int ny_) { //Global index - const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2 - const int tj = get_global_id(1) + 2; + const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2 + const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2; //Block-local indices - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; const int i = tx + 2; //Skip local ghost cells, i.e., +2 const int j = ty + 2; @@ -347,12 +310,12 @@ __device__ void evolveF1(float 
Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const int nx_, const int ny_, const float dx_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; //Index of cell within domain - const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1 - const int tj = get_global_id(1) + 1; + const int ti = blockDim.x*blockIdx.x + threadIdx.x + 1; //Skip global ghost cells, i.e., +1 + const int tj = blockDim.y*blockIdx.y + threadIdx.y + 1; if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) { const int i = tx + 1; //Skip local ghost cells, i.e., +1 @@ -377,12 +340,12 @@ __device__ void evolveF2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const int nx_, const int ny_, const float dx_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; //Index of cell within domain - const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2 - const int tj = get_global_id(1) + 2; + const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2 + const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2; if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) { const int i = tx + 2; //Skip local ghost cells, i.e., +1 @@ -407,12 +370,12 @@ __device__ void evolveG1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const int nx_, const int ny_, const float dy_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; //Index of cell within domain - const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1 - const int tj = get_global_id(1) + 1; + const int ti = blockDim.x*blockIdx.x + threadIdx.x + 1; //Skip global ghost cells, i.e., +1 + const int tj = blockDim.y*blockIdx.y + threadIdx.y + 1; if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) { const int i = tx + 1; //Skip local ghost cells, i.e., +1 @@ -438,12 +401,12 @@ __device__ void evolveG2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const int nx_, const int ny_, const float dy_, const float dt_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; //Index of cell within domain - const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2 - const int tj = get_global_id(1) + 2; + const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2 + const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2; if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) { const int i = tx + 2; //Skip local ghost cells, i.e., +2 @@ -464,23 +427,6 @@ __device__ void evolveG2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], -__device__ float3 F_func(const float3 Q, const float g) { - float3 F; - - F.x = Q.y; //hu - F.y = Q.y*Q.y / Q.x + 0.5f*g*Q.x*Q.x; //hu*hu/h + 0.5f*g*h*h; - F.z = Q.y*Q.z / Q.x; //hu*hv/h; - - return F; -} - - - - - - - - diff --git a/GPUSimulators/limiters.cu b/GPUSimulators/limiters.cu index 39ec2d9..e34d21f 100644 --- a/GPUSimulators/limiters.cu +++ b/GPUSimulators/limiters.cu @@ -50,8 +50,8 @@ __device__ void minmodSlopeX(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const float theta_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); 
+ const int tx = threadIdx.x; + const int ty = threadIdx.y; //Reconstruct slopes along x axis { @@ -74,8 +74,8 @@ __device__ void minmodSlopeY(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const float theta_) { //Index of thread within block - const int tx = get_local_id(0); - const int ty = get_local_id(1); + const int tx = threadIdx.x; + const int ty = threadIdx.y; for (int j=ty; j
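As a quick orientation for the new Euler solver introduced in this patch, a minimal host-side usage sketch could look like the following. Assumptions: the class lives in GPUSimulators/HLL2Euler.py, Common.CudaContext() can be constructed with default arguments, g is used as the adiabatic index gamma of the gas, and the constant initial state is illustrative only.

import numpy as np
from GPUSimulators import Common, HLL2Euler

nx, ny = 128, 128
dx, dy = 1.0/nx, 1.0/ny

#Constant initial state (placeholder values)
rho   = np.ones((ny, nx), dtype=np.float32)
rho_u = np.zeros((ny, nx), dtype=np.float32)
rho_v = np.zeros((ny, nx), dtype=np.float32)
E     = np.ones((ny, nx), dtype=np.float32)

#Create a CUDA context and the simulator, then advance and download the solution
context = Common.CudaContext()
sim = HLL2Euler.HLL2Euler(context, rho, rho_u, rho_v, E,
                          nx, ny, dx, dy, dt=1e-4, g=1.4)
sim.simulate(0.01)
rho, rho_u, rho_v, E = sim.download()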