mirror of
https://github.com/smyalygames/FiniteVolumeGPU.git
synced 2025-05-18 06:24:13 +02:00
Renamed macro and added iPython magic
This commit is contained in:
parent
426b8dba5c
commit
3e401e3fe1
@ -37,6 +37,7 @@ class Timer(object):
|
||||
Class which keeps track of the CUDA context and some helper functions
|
||||
"""
|
||||
class CudaContext(object):
|
||||
|
||||
def __init__(self, verbose=True, blocking=False):
|
||||
self.verbose = verbose
|
||||
self.blocking = blocking
|
||||
@ -93,7 +94,10 @@ class CudaContext(object):
|
||||
if (self.verbose):
|
||||
print(" `-> <" + str(self.cuda_context.handle) + "> Detaching context")
|
||||
self.cuda_context.detach()
|
||||
|
||||
|
||||
|
||||
def __str__(self):
|
||||
return "CudaContext id " + str(self.cuda_context.handle)
|
||||
|
||||
def hash_kernel(kernel_filename, include_dirs, verbose=False):
|
||||
# Generate a kernel ID for our cache
|
||||
@ -172,8 +176,8 @@ class CudaContext(object):
|
||||
print("`-> Kernel changed or not in hash => compiling " + kernel_filename)
|
||||
|
||||
#Create define string
|
||||
define_string = "#define block_width " + str(block_width) + "\n"
|
||||
define_string += "#define block_height " + str(block_height) + "\n\n"
|
||||
define_string = "#define BLOCK_WIDTH " + str(block_width) + "\n"
|
||||
define_string += "#define BLOCK_HEIGHT " + str(block_height) + "\n\n"
|
||||
|
||||
kernel_string = define_string + '#include "' + os.path.join(module_path, kernel_filename) + '"'
|
||||
|
||||
|
@ -27,8 +27,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
* Computes the flux along the x axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxF(float Q[3][block_height+2][block_width+2],
|
||||
float F[3][block_height+1][block_width+1],
|
||||
void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dx_, const float dt_) {
|
||||
|
||||
//Index of thread within block
|
||||
@ -39,7 +39,7 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
|
||||
{
|
||||
int j=ty;
|
||||
const int l = j + 1; //Skip ghost cells
|
||||
for (int i=tx; i<block_width+1; i+=block_width) {
|
||||
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
|
||||
const int k = i;
|
||||
|
||||
// Q at interface from the right and left
|
||||
@ -64,15 +64,15 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
|
||||
* Computes the flux along the y axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxG(float Q[3][block_height+2][block_width+2],
|
||||
float G[3][block_height+1][block_width+1],
|
||||
void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
//Compute fluxes along the y axis
|
||||
for (int j=ty; j<block_height+1; j+=block_height) {
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
const int l = j;
|
||||
{
|
||||
int i=tx;
|
||||
@ -113,8 +113,8 @@ __global__ void FORCEKernel(
|
||||
float* hu1_ptr_, int hu1_pitch_,
|
||||
float* hv1_ptr_, int hv1_pitch_) {
|
||||
|
||||
__shared__ float Q[3][block_height+2][block_width+2];
|
||||
__shared__ float F[3][block_height+1][block_width+1];
|
||||
__shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
|
||||
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
|
||||
|
||||
|
||||
//Read into shared memory
|
||||
|
@ -32,9 +32,9 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
* Computes the flux along the x axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
float Qx[3][block_height+2][block_width+2],
|
||||
float F[3][block_height+1][block_width+1],
|
||||
void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dx_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
@ -43,7 +43,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
{
|
||||
const int j=ty;
|
||||
const int l = j + 2; //Skip ghost cells
|
||||
for (int i=tx; i<block_width+1; i+=block_width) {
|
||||
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
|
||||
const int k = i + 1;
|
||||
// Reconstruct point values of Q at the left and right hand side
|
||||
// of the cell for both the left (i) and right (i+1) cell
|
||||
@ -84,15 +84,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
* Computes the flux along the x axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxG(float Q[3][block_height+4][block_width+4],
|
||||
float Qy[3][block_height+2][block_width+2],
|
||||
float G[3][block_height+1][block_width+1],
|
||||
void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
for (int j=ty; j<block_height+1; j+=block_height) {
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
const int l = j + 1;
|
||||
{
|
||||
int i=tx;
|
||||
@ -157,9 +157,9 @@ __global__ void HLL2Kernel(
|
||||
float* hv1_ptr_, int hv1_pitch_) {
|
||||
|
||||
//Shared memory variables
|
||||
__shared__ float Q[3][block_height+4][block_width+4];
|
||||
__shared__ float Qx[3][block_height+2][block_width+2];
|
||||
__shared__ float F[3][block_height+1][block_width+1];
|
||||
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
|
||||
__shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
|
||||
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
|
||||
|
||||
|
||||
|
||||
|
@ -30,8 +30,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
* Computes the flux along the x axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxF(float Q[3][block_height+2][block_width+2],
|
||||
float F[3][block_height+1][block_width+1],
|
||||
void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
@ -40,7 +40,7 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
|
||||
{
|
||||
const int j=ty;
|
||||
const int l = j + 1; //Skip ghost cells
|
||||
for (int i=tx; i<block_width+1; i+=block_width) {
|
||||
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
|
||||
const int k = i;
|
||||
|
||||
const float3 Q_l = make_float3(Q[0][l][k ], Q[1][l][k ], Q[2][l][k ]);
|
||||
@ -64,14 +64,14 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
|
||||
* Computes the flux along the y axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxG(float Q[3][block_height+2][block_width+2],
|
||||
float G[3][block_height+1][block_width+1],
|
||||
void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
for (int j=ty; j<block_height+1; j+=block_height) {
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
const int l = j;
|
||||
{
|
||||
const int i=tx;
|
||||
@ -120,8 +120,8 @@ __global__ void HLLKernel(
|
||||
float* hu1_ptr_, int hu1_pitch_,
|
||||
float* hv1_ptr_, int hv1_pitch_) {
|
||||
//Shared memory variables
|
||||
__shared__ float Q[3][block_height+2][block_width+2];
|
||||
__shared__ float F[3][block_height+1][block_width+1];
|
||||
__shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
|
||||
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
|
||||
|
||||
|
||||
//Read into shared memory
|
||||
|
75
SWESimulators/IPythonMagic.py
Normal file
75
SWESimulators/IPythonMagic.py
Normal file
@ -0,0 +1,75 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
This python module implements helpers for IPython / Jupyter and CUDA
|
||||
|
||||
Copyright (C) 2018 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
"""
|
||||
|
||||
from IPython.core.magic import line_magic, Magics, magics_class
|
||||
import pycuda.driver as cuda
|
||||
|
||||
|
||||
|
||||
@magics_class
|
||||
class CudaContextHandler(Magics):
|
||||
@line_magic
|
||||
def cuda_context_handler(self, context_name):
|
||||
print("Registering " + context_name + " as a global context")
|
||||
|
||||
if context_name in self.shell.user_ns.keys():
|
||||
print("`-> Context already registered! Ignoring")
|
||||
return
|
||||
else:
|
||||
print("`-> Creating context")
|
||||
self.shell.ex(context_name + " = Common.CudaContext(verbose=True, blocking=False)")
|
||||
|
||||
# this function will be called on exceptions in any cell
|
||||
def custom_exc(shell, etype, evalue, tb, tb_offset=None):
|
||||
print("Exception caught: Resetting to CUDA context " + context_name)
|
||||
while (cuda.Context.get_current() != None):
|
||||
context = cuda.Context.get_current()
|
||||
print("`-> popping " + str(context.handle))
|
||||
cuda.Context.pop()
|
||||
|
||||
if context_name in self.shell.user_ns.keys():
|
||||
print("`-> pushing " + str(self.shell.user_ns[context_name].cuda_context.handle))
|
||||
self.shell.ex(context_name + ".cuda_context.push()")
|
||||
else:
|
||||
print("No CUDA context called " + context_name + " found (something is wrong)!")
|
||||
print("CUDA will not work now")
|
||||
|
||||
# still show the error within the notebook, don't just swallow it
|
||||
shell.showtraceback((etype, evalue, tb), tb_offset=tb_offset)
|
||||
|
||||
# this registers a custom exception handler for the whole current notebook
|
||||
get_ipython().set_custom_exc((Exception,), custom_exc)
|
||||
|
||||
|
||||
# Handle CUDA context when exiting python
|
||||
import atexit
|
||||
def exitfunc():
|
||||
print("Exitfunc: Resetting CUDA context stack")
|
||||
while (cuda.Context.get_current() != None):
|
||||
context = cuda.Context.get_current()
|
||||
print("`-> popping " + str(context.handle))
|
||||
cuda.Context.pop()
|
||||
atexit.register(exitfunc)
|
||||
|
||||
print("Registering automatic CUDA context handling")
|
||||
print("(use %cuda_context_handler my_context to create a context called my_context")
|
||||
ip = get_ipython()
|
||||
ip.register_magics(CudaContextHandler)
|
@ -30,9 +30,9 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
__device__
|
||||
void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
float Qx[3][block_height+2][block_width+2],
|
||||
float F[3][block_height+1][block_width+1],
|
||||
void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dx_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
@ -41,7 +41,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
{
|
||||
int j=ty;
|
||||
const int l = j + 2; //Skip ghost cells
|
||||
for (int i=tx; i<block_width+1; i+=block_width) {
|
||||
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
|
||||
const int k = i + 1;
|
||||
// Reconstruct point values of Q at the left and right hand side
|
||||
// of the cell for both the left (i) and right (i+1) cell
|
||||
@ -75,15 +75,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
}
|
||||
|
||||
__device__
|
||||
void computeFluxG(float Q[3][block_height+4][block_width+4],
|
||||
float Qy[3][block_height+2][block_width+2],
|
||||
float G[3][block_height+1][block_width+1],
|
||||
void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
for (int j=ty; j<block_height+1; j+=block_height) {
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
const int l = j + 1;
|
||||
{
|
||||
int i=tx;
|
||||
@ -148,9 +148,9 @@ __global__ void KP07DimsplitKernel(
|
||||
|
||||
|
||||
//Shared memory variables
|
||||
__shared__ float Q[3][block_height+4][block_width+4];
|
||||
__shared__ float Qx[3][block_height+2][block_width+2];
|
||||
__shared__ float F[3][block_height+1][block_width+1];
|
||||
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
|
||||
__shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
|
||||
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
|
||||
|
||||
|
||||
|
||||
|
@ -30,9 +30,9 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
__device__
|
||||
void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
float Qx[3][block_height+2][block_width+2],
|
||||
float F[3][block_height+1][block_width+1],
|
||||
void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
@ -41,7 +41,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
{
|
||||
int j=ty;
|
||||
const int l = j + 2; //Skip ghost cells
|
||||
for (int i=tx; i<block_width+1; i+=block_width) {
|
||||
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
|
||||
const int k = i + 1;
|
||||
// Q at interface from the right and left
|
||||
const float3 Qp = make_float3(Q[0][l][k+1] - 0.5f*Qx[0][j][i+1],
|
||||
@ -61,15 +61,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
}
|
||||
|
||||
__device__
|
||||
void computeFluxG(float Q[3][block_height+4][block_width+4],
|
||||
float Qy[3][block_height+2][block_width+2],
|
||||
float G[3][block_height+1][block_width+1],
|
||||
void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
for (int j=ty; j<block_height+1; j+=block_height) {
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
const int l = j + 1;
|
||||
{
|
||||
int i=tx;
|
||||
@ -129,14 +129,14 @@ __global__ void KP07Kernel(
|
||||
const int tj = get_global_id(1) + 2;
|
||||
|
||||
//Shared memory variables
|
||||
__shared__ float Q[3][block_height+4][block_width+4];
|
||||
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
|
||||
|
||||
//The following slightly wastes memory, but enables us to reuse the
|
||||
//funcitons in common.opencl
|
||||
__shared__ float Qx[3][block_height+2][block_width+2];
|
||||
__shared__ float Qy[3][block_height+2][block_width+2];
|
||||
__shared__ float F[3][block_height+1][block_width+1];
|
||||
__shared__ float G[3][block_height+1][block_width+1];
|
||||
__shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
|
||||
__shared__ float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
|
||||
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
|
||||
__shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
|
||||
|
||||
|
||||
|
||||
|
@ -27,8 +27,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
* Computes the flux along the x axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxF(float Q[3][block_height+2][block_width+2],
|
||||
float F[3][block_height][block_width+1],
|
||||
void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float F[3][BLOCK_HEIGHT][BLOCK_WIDTH+1],
|
||||
const float g_, const float dx_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
@ -37,7 +37,7 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
|
||||
{
|
||||
const int j=ty;
|
||||
const int l = j + 1; //Skip ghost cells
|
||||
for (int i=tx; i<block_width+1; i+=block_width) {
|
||||
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
|
||||
const int k = i;
|
||||
|
||||
// Q at interface from the right and left
|
||||
@ -62,14 +62,14 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
|
||||
* Computes the flux along the y axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxG(float Q[3][block_height+2][block_width+2],
|
||||
float G[3][block_height+1][block_width],
|
||||
void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH],
|
||||
const float g_, const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
for (int j=ty; j<block_height+1; j+=block_height) {
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
const int l = j;
|
||||
{
|
||||
const int i=tx;
|
||||
@ -114,9 +114,9 @@ __global__ void LxFKernel(
|
||||
const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
|
||||
const int tj = get_global_id(1) + 1;
|
||||
|
||||
__shared__ float Q[3][block_height+2][block_width+2];
|
||||
__shared__ float F[3][block_height][block_width+1];
|
||||
__shared__ float G[3][block_height+1][block_width];
|
||||
__shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
|
||||
__shared__ float F[3][BLOCK_HEIGHT][BLOCK_WIDTH+1];
|
||||
__shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH];
|
||||
|
||||
//Read into shared memory
|
||||
readBlock1(h0_ptr_, h0_pitch_,
|
||||
|
@ -33,8 +33,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
* Computes the flux along the x axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
float F[3][block_height+1][block_width+1],
|
||||
void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dx_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
@ -43,7 +43,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
{
|
||||
int j=ty;
|
||||
const int l = j + 2; //Skip ghost cells
|
||||
for (int i=tx; i<block_width+1; i+=block_width) {
|
||||
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
|
||||
const int k = i + 1;
|
||||
|
||||
// Q at interface from the right and left
|
||||
@ -72,15 +72,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
* Computes the flux along the y axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxG(float Q[3][block_height+4][block_width+4],
|
||||
float G[3][block_height+1][block_width+1],
|
||||
void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
//Compute fluxes along the y axis
|
||||
for (int j=ty; j<block_height+1; j+=block_height) {
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
const int l = j + 1;
|
||||
{
|
||||
int i=tx;
|
||||
@ -130,8 +130,8 @@ __global__ void WAFKernel(
|
||||
float* hu1_ptr_, int hu1_pitch_,
|
||||
float* hv1_ptr_, int hv1_pitch_) {
|
||||
//Shared memory variables
|
||||
__shared__ float Q[3][block_height+4][block_width+4];
|
||||
__shared__ float F[3][block_height+1][block_width+1];
|
||||
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
|
||||
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
|
||||
|
||||
|
||||
|
||||
|
@ -94,18 +94,18 @@ inline __device__ __host__ float clamp(const float f, const float a, const float
|
||||
__device__ void readBlock1(float* h_ptr_, int h_pitch_,
|
||||
float* hu_ptr_, int hu_pitch_,
|
||||
float* hv_ptr_, int hv_pitch_,
|
||||
float Q[3][block_height+2][block_width+2],
|
||||
float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
const int nx_, const int ny_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
//Index of block within domain
|
||||
const int bx = block_width * get_group_id(0);
|
||||
const int by = block_height * get_group_id(1);
|
||||
const int bx = BLOCK_WIDTH * get_group_id(0);
|
||||
const int by = BLOCK_HEIGHT * get_group_id(1);
|
||||
|
||||
//Read into shared memory
|
||||
for (int j=ty; j<block_height+2; j+=block_height) {
|
||||
for (int j=ty; j<BLOCK_HEIGHT+2; j+=BLOCK_HEIGHT) {
|
||||
const int l = clamp(by + j, 0, ny_+1); // Out of bounds
|
||||
|
||||
//Compute the pointer to current row in the arrays
|
||||
@ -113,7 +113,7 @@ __device__ void readBlock1(float* h_ptr_, int h_pitch_,
|
||||
float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*l);
|
||||
float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*l);
|
||||
|
||||
for (int i=tx; i<block_width+2; i+=block_width) {
|
||||
for (int i=tx; i<BLOCK_WIDTH+2; i+=BLOCK_WIDTH) {
|
||||
const int k = clamp(bx + i, 0, nx_+1); // Out of bounds
|
||||
|
||||
Q[0][j][i] = h_row[k];
|
||||
@ -133,18 +133,18 @@ __device__ void readBlock1(float* h_ptr_, int h_pitch_,
|
||||
__device__ void readBlock2(float* h_ptr_, int h_pitch_,
|
||||
float* hu_ptr_, int hu_pitch_,
|
||||
float* hv_ptr_, int hv_pitch_,
|
||||
float Q[3][block_height+4][block_width+4],
|
||||
float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
const int nx_, const int ny_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
//Index of block within domain
|
||||
const int bx = block_width * get_group_id(0);
|
||||
const int by = block_height * get_group_id(1);
|
||||
const int bx = BLOCK_WIDTH * get_group_id(0);
|
||||
const int by = BLOCK_HEIGHT * get_group_id(1);
|
||||
|
||||
//Read into shared memory
|
||||
for (int j=ty; j<block_height+4; j+=block_height) {
|
||||
for (int j=ty; j<BLOCK_HEIGHT+4; j+=BLOCK_HEIGHT) {
|
||||
const int l = clamp(by + j, 0, ny_+3); // Out of bounds
|
||||
|
||||
//Compute the pointer to current row in the arrays
|
||||
@ -152,7 +152,7 @@ __device__ void readBlock2(float* h_ptr_, int h_pitch_,
|
||||
float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*l);
|
||||
float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*l);
|
||||
|
||||
for (int i=tx; i<block_width+4; i+=block_width) {
|
||||
for (int i=tx; i<BLOCK_WIDTH+4; i+=BLOCK_WIDTH) {
|
||||
const int k = clamp(bx + i, 0, nx_+3); // Out of bounds
|
||||
|
||||
Q[0][j][i] = h_row[k];
|
||||
@ -171,7 +171,7 @@ __device__ void readBlock2(float* h_ptr_, int h_pitch_,
|
||||
__device__ void writeBlock1(float* h_ptr_, int h_pitch_,
|
||||
float* hu_ptr_, int hu_pitch_,
|
||||
float* hv_ptr_, int hv_pitch_,
|
||||
float Q[3][block_height+2][block_width+2],
|
||||
float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
const int nx_, const int ny_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
@ -206,7 +206,7 @@ __device__ void writeBlock1(float* h_ptr_, int h_pitch_,
|
||||
__device__ void writeBlock2(float* h_ptr_, int h_pitch_,
|
||||
float* hu_ptr_, int hu_pitch_,
|
||||
float* hv_ptr_, int hv_pitch_,
|
||||
float Q[3][block_height+4][block_width+4],
|
||||
float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
const int nx_, const int ny_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
@ -240,7 +240,7 @@ __device__ void writeBlock2(float* h_ptr_, int h_pitch_,
|
||||
* No flow boundary conditions for the shallow water equations
|
||||
* with one ghost cell in each direction
|
||||
*/
|
||||
__device__ void noFlowBoundary1(float Q[3][block_height+2][block_width+2], const int nx_, const int ny_) {
|
||||
__device__ void noFlowBoundary1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const int nx_, const int ny_) {
|
||||
//Global index
|
||||
const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
|
||||
const int tj = get_global_id(1) + 1;
|
||||
@ -284,7 +284,7 @@ __device__ void noFlowBoundary1(float Q[3][block_height+2][block_width+2], const
|
||||
* No flow boundary conditions for the shallow water equations
|
||||
* with two ghost cells in each direction
|
||||
*/
|
||||
__device__ void noFlowBoundary2(float Q[3][block_height+4][block_width+4], const int nx_, const int ny_) {
|
||||
__device__ void noFlowBoundary2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const int nx_, const int ny_) {
|
||||
//Global index
|
||||
const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
|
||||
const int tj = get_global_id(1) + 2;
|
||||
@ -342,8 +342,8 @@ __device__ void noFlowBoundary2(float Q[3][block_height+4][block_width+4], const
|
||||
/**
|
||||
* Evolves the solution in time along the x axis (dimensional splitting)
|
||||
*/
|
||||
__device__ void evolveF1(float Q[3][block_height+2][block_width+2],
|
||||
float F[3][block_height+1][block_width+1],
|
||||
__device__ void evolveF1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const int nx_, const int ny_,
|
||||
const float dx_, const float dt_) {
|
||||
//Index of thread within block
|
||||
@ -372,8 +372,8 @@ __device__ void evolveF1(float Q[3][block_height+2][block_width+2],
|
||||
/**
|
||||
* Evolves the solution in time along the x axis (dimensional splitting)
|
||||
*/
|
||||
__device__ void evolveF2(float Q[3][block_height+4][block_width+4],
|
||||
float F[3][block_height+1][block_width+1],
|
||||
__device__ void evolveF2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const int nx_, const int ny_,
|
||||
const float dx_, const float dt_) {
|
||||
//Index of thread within block
|
||||
@ -402,8 +402,8 @@ __device__ void evolveF2(float Q[3][block_height+4][block_width+4],
|
||||
/**
|
||||
* Evolves the solution in time along the y axis (dimensional splitting)
|
||||
*/
|
||||
__device__ void evolveG1(float Q[3][block_height+2][block_width+2],
|
||||
float G[3][block_height+1][block_width+1],
|
||||
__device__ void evolveG1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const int nx_, const int ny_,
|
||||
const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
@ -433,8 +433,8 @@ __device__ void evolveG1(float Q[3][block_height+2][block_width+2],
|
||||
/**
|
||||
* Evolves the solution in time along the y axis (dimensional splitting)
|
||||
*/
|
||||
__device__ void evolveG2(float Q[3][block_height+4][block_width+4],
|
||||
float G[3][block_height+1][block_width+1],
|
||||
__device__ void evolveG2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const int nx_, const int ny_,
|
||||
const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
|
@ -46,8 +46,8 @@ __device__ __inline__ float minmodSlope(float left, float center, float right, f
|
||||
/**
|
||||
* Reconstructs a minmod slope for a whole block along the abscissa
|
||||
*/
|
||||
__device__ void minmodSlopeX(float Q[3][block_height+4][block_width+4],
|
||||
float Qx[3][block_height+2][block_width+2],
|
||||
__device__ void minmodSlopeX(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
const float theta_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
@ -57,7 +57,7 @@ __device__ void minmodSlopeX(float Q[3][block_height+4][block_width+4],
|
||||
{
|
||||
const int j = ty;
|
||||
const int l = j + 2; //Skip ghost cells
|
||||
for (int i=tx; i<block_width+2; i+=block_width) {
|
||||
for (int i=tx; i<BLOCK_WIDTH+2; i+=BLOCK_WIDTH) {
|
||||
const int k = i + 1;
|
||||
for (int p=0; p<3; ++p) {
|
||||
Qx[p][j][i] = minmodSlope(Q[p][l][k-1], Q[p][l][k], Q[p][l][k+1], theta_);
|
||||
@ -70,14 +70,14 @@ __device__ void minmodSlopeX(float Q[3][block_height+4][block_width+4],
|
||||
/**
|
||||
* Reconstructs a minmod slope for a whole block along the ordinate
|
||||
*/
|
||||
__device__ void minmodSlopeY(float Q[3][block_height+4][block_width+4],
|
||||
float Qy[3][block_height+2][block_width+2],
|
||||
__device__ void minmodSlopeY(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
const float theta_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
for (int j=ty; j<block_height+2; j+=block_height) {
|
||||
for (int j=ty; j<BLOCK_HEIGHT+2; j+=BLOCK_HEIGHT) {
|
||||
const int l = j + 1;
|
||||
{
|
||||
const int i = tx;
|
||||
|
Loading…
x
Reference in New Issue
Block a user