Merge pull request #4 from babrodtk/cleanup

Cleanup
André R. Brodtkorb 2018-08-13 16:14:07 +02:00 committed by GitHub
commit 90df79fa9b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
32 changed files with 2137 additions and 5163 deletions

File diff suppressed because one or more lines are too long (4 files)


@ -1,108 +1,261 @@
# -*- coding: utf-8 -*-
"""
This python module implements the different helper functions and
classes
Copyright (C) 2018 SINTEF ICT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import os
import numpy as np
import time
import re
import io
import hashlib
import logging
import pycuda.compiler as cuda_compiler
import pycuda.gpuarray
import pycuda.driver as cuda
"""
Class which keeps track of time spent for a section of code
"""
class Timer(object):
def __init__(self, tag, log_level=logging.DEBUG):
self.tag = tag
self.log_level = log_level
self.logger = logging.getLogger(__name__)
def __enter__(self):
self.start = time.time()
return self
def __exit__(self, *args):
self.end = time.time()
self.secs = self.end - self.start
self.msecs = self.secs * 1000 # millisecs
self.logger.log(self.log_level, "%s: %f ms", self.tag, self.msecs)
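A minimal usage sketch of the Timer context manager above (illustrative, not part of the diff), assuming the module is importable as SWESimulators.Common and logging is configured so the DEBUG line is visible:

import logging
import time
from SWESimulators import Common   # assumed import path for the module shown here

logging.basicConfig(level=logging.DEBUG)   # make the timing message visible

with Common.Timer("demo_sweep") as t:      # the tag appears in the log message
    time.sleep(0.25)                       # stand-in for real work
print("elapsed: %.1f ms" % t.msecs)        # secs/msecs remain set after the block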
"""
Class which keeps track of the CUDA context and some helper functions
"""
class CudaContext(object):
def __init__(self, verbose=True, blocking=False):
def __init__(self, verbose=True, blocking=False, use_cache=True):
self.verbose = verbose
self.blocking = blocking
self.use_cache = use_cache
self.logger = logging.getLogger(__name__)
self.kernels = {}
self.module_path = os.path.dirname(os.path.realpath(__file__))
#Initialize cuda (must be first call to PyCUDA)
cuda.init(flags=0)
if (self.verbose):
print("CUDA version " + str(cuda.get_version()))
print("Driver version " + str(cuda.get_driver_version()))
#Print some info about CUDA
self.logger.info("CUDA version %s", str(cuda.get_version()))
self.logger.info("Driver version %s", str(cuda.get_driver_version()))
self.cuda_device = cuda.Device(0)
if (self.verbose):
print("Using " + self.cuda_device.name())
print(" => compute capability: " + str(self.cuda_device.compute_capability()))
print(" => memory: " + str(self.cuda_device.total_memory() / (1024*1024)) + " MB")
self.logger.info("Using '%s' GPU", self.cuda_device.name())
self.logger.debug(" => compute capability: %s", str(self.cuda_device.compute_capability()))
self.logger.debug(" => memory: %d MB", self.cuda_device.total_memory() / (1024*1024))
# Create the CUDA context
if (self.blocking):
self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_BLOCKING_SYNC)
if (self.verbose):
print("=== WARNING ===")
print("Using blocking context")
print("=== WARNING ===")
self.logger.warning("Using blocking context")
else:
self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_AUTO)
if (self.verbose):
print("Created context <" + str(self.cuda_context.handle) + ">")
self.logger.info("Created context handle <%s>", str(self.cuda_context.handle))
#Create cache dir for cubin files
if (self.use_cache):
self.cache_path = os.path.join(self.module_path, "cuda_cache")
if not os.path.isdir(self.cache_path):
os.mkdir(self.cache_path)
self.logger.debug("Using CUDA cache dir %s", self.cache_path)
def __del__(self, *args):
if self.verbose:
print("Cleaning up CUDA context <" + str(self.cuda_context.handle) + ">")
self.logger.info("Cleaning up CUDA context handle <%s>", str(self.cuda_context.handle))
# Loop over all contexts in stack, and remove "this"
other_contexts = []
while (cuda.Context.get_current() != None):
context = cuda.Context.get_current()
if (self.verbose):
if (context.handle != self.cuda_context.handle):
print(" `-> <" + str(self.cuda_context.handle) + "> Popping context <" + str(context.handle) + "> which we do not own")
other_contexts = [context] + other_contexts
cuda.Context.pop()
else:
print(" `-> <" + str(self.cuda_context.handle) + "> Popping context <" + str(context.handle) + "> (ourselves)")
cuda.Context.pop()
if (context.handle != self.cuda_context.handle):
self.logger.debug("<%s> Popping <%s> (*not* ours)", str(self.cuda_context.handle), str(context.handle))
other_contexts = [context] + other_contexts
cuda.Context.pop()
else:
self.logger.debug("<%s> Popping <%s> (ours)", str(self.cuda_context.handle), str(context.handle))
cuda.Context.pop()
# Add all the contexts we popped that were not our own
for context in other_contexts:
if (self.verbose):
print(" `-> <" + str(self.cuda_context.handle) + "> Pushing <" + str(context.handle) + ">")
self.logger.debug("<%s> Pushing <%s>", str(self.cuda_context.handle), str(context.handle))
cuda.Context.push(context)
if (self.verbose):
print(" `-> <" + str(self.cuda_context.handle) + "> Detaching context")
self.logger.debug("<%s> Detaching", str(self.cuda_context.handle))
self.cuda_context.detach()
def __str__(self):
return "CudaContext id " + str(self.cuda_context.handle)
"""
Reads a text file and creates an OpenCL kernel from that
"""
def get_kernel(self, kernel_filename, block_width, block_height):
# Generate a kernel ID for our cache
module_path = os.path.dirname(os.path.realpath(__file__))
kernel_hash = ""
def hash_kernel(kernel_filename, include_dirs, verbose=False):
# Generate a kernel ID for our caches
num_includes = 0
max_includes = 100
kernel_hasher = hashlib.md5()
logger = logging.getLogger(__name__)
# Loop over file and includes, and check if something has changed
files = [kernel_filename]
while len(files):
filename = os.path.join(module_path, files.pop())
if (num_includes > max_includes):
raise("Maximum number of includes reached - circular include in {:}?".format(kernel_filename))
filename = files.pop()
logger.debug("Hashing %s", filename)
modified = os.path.getmtime(filename)
with open(filename, "r") as file:
# Open the file
with io.open(filename, "r") as file:
# Search for #include <something> and also hash the file
file_str = file.read()
file_hash = filename + "_" + str(hash(file_str)) + ":" + str(modified) + "--"
kernel_hasher.update(file_str.encode('utf-8'))
kernel_hasher.update(str(modified).encode('utf-8'))
#Find all includes
includes = re.findall('^\W*#include\W+(.+?)\W*$', file_str, re.M)
files = files + includes #WARNING FIXME This will not work with circular includes
kernel_hash = kernel_hash + file_hash
# Loop over everything that looks like an include
for include_file in includes:
# Recompile kernel if file or includes have changed
if (kernel_hash not in self.kernels.keys()):
#Create define string
define_string = "#define block_width " + str(block_width) + "\n"
define_string += "#define block_height " + str(block_height) + "\n\n"
#Search through include directories for the file
file_path = os.path.dirname(filename)
for include_path in [file_path] + include_dirs:
kernel_string = define_string + '#include "' + os.path.join(module_path, kernel_filename) + '"'
self.kernels[kernel_hash] = cuda_compiler.SourceModule(kernel_string, include_dirs=[module_path])
# If we find it, add it to list of files to check
temp_path = os.path.join(include_path, include_file)
if (os.path.isfile(temp_path)):
files = files + [temp_path]
num_includes = num_includes + 1 #For circular includes...
break
return self.kernels[kernel_hash]
return kernel_hasher.hexdigest()
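A short sketch of how hash_kernel is used as a cache key (paths below are illustrative only; the function folds the kernel file and everything reachable through #include into one digest):

from SWESimulators import Common   # assumed import path

digest = Common.CudaContext.hash_kernel("SWESimulators/FORCE_kernel.cu",   # hypothetical path
                                        include_dirs=["SWESimulators"])    # searched for #include "..." files
print(digest)   # md5 hex digest; editing the kernel or any include produces a new digest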
"""
Reads a text file and creates a CUDA kernel from that
"""
def get_prepared_kernel(self, kernel_filename, kernel_function_name, \
prepared_call_args, \
include_dirs=[], verbose=False, no_extern_c=True,
**kwargs):
"""
Helper function to print compilation output
"""
def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
self.logger.debug("Compilation returned %s", str(compile_success_bool))
if info_str:
self.logger.debug("Info: %s", info_str)
if error_str:
self.logger.debug("Error: %s", error_str)
self.logger.debug("Getting %s", kernel_filename)
# Create a hash of the kernel (and its includes)
kwargs_hasher = hashlib.md5()
kwargs_hasher.update(str(kwargs).encode('utf-8'));
kwargs_hash = kwargs_hasher.hexdigest()
kwargs_hasher = None
root, ext = os.path.splitext(kernel_filename)
kernel_hash = root \
+ "_" + CudaContext.hash_kernel( \
os.path.join(self.module_path, kernel_filename), \
include_dirs=[self.module_path] + include_dirs, \
verbose=verbose) \
+ "_" + kwargs_hash \
+ ext
cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)
# If we have the kernel in our hashmap, return it
if (kernel_hash in self.kernels.keys()):
self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash)
return self.kernels[kernel_hash]
# If we have it on disk, return it
elif (self.use_cache and os.path.isfile(cached_kernel_filename)):
self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash)
with io.open(cached_kernel_filename, "rb") as file:
file_str = file.read()
module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler)
kernel = module.get_function(kernel_function_name)
kernel.prepare(prepared_call_args)
self.kernels[kernel_hash] = kernel
return kernel
# Otherwise, compile it from source
else:
self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash)
#Create kernel string
kernel_string = ""
for key, value in kwargs.items():
kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename))
if (self.use_cache):
with io.open(cached_kernel_filename + ".txt", "w") as file:
file.write(kernel_string)
with Timer("compiler") as timer:
cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, no_extern_c=no_extern_c, cache_dir=False)
module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler)
if (self.use_cache):
with io.open(cached_kernel_filename, "wb") as file:
file.write(cubin)
kernel = module.get_function(kernel_function_name)
kernel.prepare(prepared_call_args)
self.kernels[kernel_hash] = kernel
return kernel
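Assuming ctx is the CudaContext created earlier, a typical call mirrors the simulators further down in this diff; each keyword argument becomes a #define prepended to the kernel source before compilation:

kernel = ctx.get_prepared_kernel("FORCE_kernel.cu", "FORCEKernel",
                                 "iiffffPiPiPiPiPiPi",        # prepare() argument format
                                 BLOCK_WIDTH=16, BLOCK_HEIGHT=16)
# The generated source starts roughly as:
#   #define BLOCK_WIDTH 16
#   #define BLOCK_HEIGHT 16
#   #include "<module_path>/FORCE_kernel.cu"
# The compiled cubin is cached in memory (self.kernels) and, with use_cache=True,
# on disk under cuda_cache/, keyed by hash_kernel() plus a hash of the kwargs.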
"""
Clears the kernel cache (useful for debugging & development)
@ -113,21 +266,6 @@ class CudaContext(object):
class Timer(object):
def __init__(self, tag, verbose=True):
self.verbose = verbose
self.tag = tag
def __enter__(self):
self.start = time.time()
return self
def __exit__(self, *args):
self.end = time.time()
self.secs = self.end - self.start
self.msecs = self.secs * 1000 # millisecs
if self.verbose:
print("=> " + self.tag + ' %f ms' % self.msecs)
@ -139,12 +277,14 @@ class CUDAArray2D:
Uploads initial data to the CUDA device
"""
def __init__(self, stream, nx, ny, halo_x, halo_y, data):
self.logger = logging.getLogger(__name__)
self.nx = nx
self.ny = ny
self.nx_halo = nx + 2*halo_x
self.ny_halo = ny + 2*halo_y
self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny)
#Make sure data is in proper format
assert np.issubdtype(data.dtype, np.float32), "Wrong datatype: %s" % str(data.dtype)
assert not np.isfortran(data), "Wrong datatype (Fortran, expected C)"
@ -156,17 +296,26 @@ class CUDAArray2D:
self.bytes_per_float = data.itemsize
assert(self.bytes_per_float == 4)
self.pitch = np.int32((self.nx_halo)*self.bytes_per_float)
self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
def __del__(self, *args):
self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
self.data.gpudata.free()
self.data = None
"""
Enables downloading data from the CUDA device to Python
"""
def download(self, stream, async=False):
#Copy data from device to host
if (async):
self.logger.debug("Buffer <%s> [%dx%d]: Downloading async ", int(self.data.gpudata), self.nx, self.ny)
host_data = self.data.get_async(stream=stream)
return host_data
else:
self.logger.debug("Buffer <%s> [%dx%d]: Downloading synchronously", int(self.data.gpudata), self.nx, self.ny)
host_data = self.data.get(stream=stream)#, pagelocked=True) # pagelocked causes crash on windows at least
return host_data
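A usage sketch of the buffer class and the download above (not part of the commit; sizes, the one-cell halo, and the import path are assumptions, and a CudaContext must already be current):

import numpy as np
import pycuda.driver as cuda
from SWESimulators import Common   # assumed import path

stream = cuda.Stream()
nx, ny = 64, 64
h0 = np.zeros((ny + 2, nx + 2), dtype=np.float32)    # float32, C-order, one-cell halo
buf = Common.CUDAArray2D(stream, nx, ny, 1, 1, h0)   # uploads to the device
h_host = buf.download(stream)                        # synchronous get(); async=True uses get_async()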


@ -21,13 +21,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
import numpy as np
import pycuda.compiler as cuda_compiler
import pycuda.gpuarray
import pycuda.driver as cuda
from SWESimulators import Common
from SWESimulators import Simulator
@ -42,7 +36,7 @@ from SWESimulators import Common
"""
Class that solves the SW equations
"""
class FORCE:
class FORCE (Simulator.BaseSimulator):
"""
Initialization routine
@ -63,80 +57,40 @@ class FORCE:
dx, dy, dt, \
g, \
block_width=16, block_height=16):
#Create a CUDA stream
self.stream = cuda.Stream()
# Call super constructor
super().__init__(context, \
h0, hu0, hv0, \
nx, ny, \
1, 1, \
dx, dy, dt, \
g, \
block_width, block_height);
#Get kernels
self.force_module = context.get_kernel("FORCE_kernel.cu", block_width, block_height)
self.force_kernel = self.force_module.get_function("FORCEKernel")
self.force_kernel.prepare("iiffffPiPiPiPiPiPi")
#Create data by uploading to device
ghost_cells_x = 1
ghost_cells_y = 1
self.data = Common.SWEDataArakawaA(self.stream, \
nx, ny, \
ghost_cells_x, ghost_cells_y, \
h0, hu0, hv0)
#Save input parameters
#Notice that we need to specify them in the correct dataformat for the
#OpenCL kernel
self.nx = np.int32(nx)
self.ny = np.int32(ny)
self.dx = np.float32(dx)
self.dy = np.float32(dy)
self.dt = np.float32(dt)
self.g = np.float32(g)
#Initialize time
self.t = np.float32(0.0)
#Compute kernel launch parameters
self.local_size = (block_width, block_height, 1)
self.global_size = ( \
int(np.ceil(self.nx / float(self.local_size[0]))), \
int(np.ceil(self.ny / float(self.local_size[1]))) \
)
self.kernel = context.get_prepared_kernel("FORCE_kernel.cu", "FORCEKernel", \
"iiffffPiPiPiPiPiPi", \
BLOCK_WIDTH=block_width, \
BLOCK_HEIGHT=block_height)
def __str__(self):
return "First order centered"
def simulate(self, t_end):
return super().simulateEuler(t_end)
"""
Function which steps n timesteps
"""
def step(self, t_end=0.0):
n = int(t_end / self.dt + 1)
for i in range(0, n):
local_dt = np.float32(min(self.dt, t_end-i*self.dt))
if (local_dt <= 0.0):
break
self.force_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, local_dt, \
self.g, \
self.data.h0.data.gpudata, self.data.h0.pitch, \
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
self.data.h1.data.gpudata, self.data.h1.pitch, \
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
self.data.hv1.data.gpudata, self.data.hv1.pitch)
self.t += local_dt
self.data.swap()
return self.t, n
def stepEuler(self, dt):
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, dt, \
self.g, \
self.data.h0.data.gpudata, self.data.h0.pitch, \
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
self.data.h1.data.gpudata, self.data.h1.pitch, \
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
self.data.hv1.data.gpudata, self.data.hv1.pitch)
self.data.swap()
self.t += dt
def download(self):
return self.data.download(self.stream)
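simulate() now defers to simulateEuler() in the new Simulator.BaseSimulator, whose body is not visible in this diff. Going by the loop that was removed above, a plausible sketch (an assumption, not the actual base-class code) is:

def simulateEuler(self, t_end):
    # Assumed driver: repeated Euler steps of at most self.dt until t_end is reached.
    n = int(t_end / self.dt + 1)
    for i in range(n):
        local_dt = np.float32(min(self.dt, t_end - i * self.dt))
        if local_dt <= 0.0:
            break
        self.stepEuler(local_dt)
    return self.t, n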


@ -20,14 +20,15 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "common.cu"
#include "fluxes/FirstOrderCentered.cu"
/**
* Computes the flux along the x axis for all faces
*/
__device__
void computeFluxF(float Q[3][block_height+2][block_width+2],
float F[3][block_height+1][block_width+1],
void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dx_, const float dt_) {
//Index of thread within block
@ -38,7 +39,7 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
{
int j=ty;
const int l = j + 1; //Skip ghost cells
for (int i=tx; i<block_width+1; i+=block_width) {
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
const int k = i;
// Q at interface from the right and left
@ -63,15 +64,15 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
* Computes the flux along the y axis for all faces
*/
__device__
void computeFluxG(float Q[3][block_height+2][block_width+2],
float G[3][block_height+1][block_width+1],
void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dy_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
//Compute fluxes along the y axis
for (int j=ty; j<block_height+1; j+=block_height) {
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
const int l = j;
{
int i=tx;
@ -97,6 +98,7 @@ void computeFluxG(float Q[3][block_height+2][block_width+2],
}
extern "C" {
__global__ void FORCEKernel(
int nx_, int ny_,
float dx_, float dy_, float dt_,
@ -112,8 +114,8 @@ __global__ void FORCEKernel(
float* hu1_ptr_, int hu1_pitch_,
float* hv1_ptr_, int hv1_pitch_) {
__shared__ float Q[3][block_height+2][block_width+2];
__shared__ float F[3][block_height+1][block_width+1];
__shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
//Read into shared memory
@ -150,3 +152,5 @@ __global__ void FORCEKernel(
hv1_ptr_, hv1_pitch_,
Q, nx_, ny_);
}
} // extern "C"


@ -20,13 +20,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
import numpy as np
import pycuda.compiler as cuda_compiler
import pycuda.gpuarray
import pycuda.driver as cuda
from SWESimulators import Common
from SWESimulators import Simulator
@ -37,7 +31,7 @@ from SWESimulators import Common
"""
Class that solves the SW equations using the Harten-Lax-van Leer approximate Riemann solver
"""
class HLL:
class HLL (Simulator.BaseSimulator):
"""
Initialization routine
@ -58,80 +52,40 @@ class HLL:
dx, dy, dt, \
g, \
block_width=16, block_height=16):
#Create a CUDA stream
self.stream = cuda.Stream()
# Call super constructor
super().__init__(context, \
h0, hu0, hv0, \
nx, ny, \
1, 1, \
dx, dy, dt, \
g, \
block_width, block_height);
#Get kernels
self.hll_module = context.get_kernel("HLL_kernel.cu", block_width, block_height)
self.hll_kernel = self.hll_module.get_function("HLLKernel")
self.hll_kernel.prepare("iiffffPiPiPiPiPiPi")
#Create data by uploading to device
ghost_cells_x = 1
ghost_cells_y = 1
self.data = Common.SWEDataArakawaA(self.stream, \
nx, ny, \
ghost_cells_x, ghost_cells_y, \
h0, hu0, hv0)
#Save input parameters
#Notice that we need to specify them in the correct dataformat for the
#OpenCL kernel
self.nx = np.int32(nx)
self.ny = np.int32(ny)
self.dx = np.float32(dx)
self.dy = np.float32(dy)
self.dt = np.float32(dt)
self.g = np.float32(g)
#Initialize time
self.t = np.float32(0.0)
#Compute kernel launch parameters
self.local_size = (block_width, block_height, 1)
self.global_size = ( \
int(np.ceil(self.nx / float(self.local_size[0]))), \
int(np.ceil(self.ny / float(self.local_size[1]))) \
)
self.kernel = context.get_prepared_kernel("HLL_kernel.cu", "HLLKernel", \
"iiffffPiPiPiPiPiPi", \
BLOCK_WIDTH=block_width, \
BLOCK_HEIGHT=block_height)
def __str__(self):
return "Harten-Lax-van Leer"
def simulate(self, t_end):
return super().simulateEuler(t_end)
"""
Function which steps n timesteps
"""
def step(self, t_end=0.0):
n = int(t_end / self.dt + 1)
for i in range(0, n):
local_dt = np.float32(min(self.dt, t_end-i*self.dt))
if (local_dt <= 0.0):
break
self.hll_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, local_dt, \
self.g, \
self.data.h0.data.gpudata, self.data.h0.pitch, \
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
self.data.h1.data.gpudata, self.data.h1.pitch, \
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
self.data.hv1.data.gpudata, self.data.hv1.pitch)
self.t += local_dt
self.data.swap()
return self.t, n
def stepEuler(self, dt):
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, dt, \
self.g, \
self.data.h0.data.gpudata, self.data.h0.pitch, \
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
self.data.h1.data.gpudata, self.data.h1.pitch, \
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
self.data.hv1.data.gpudata, self.data.hv1.pitch)
self.data.swap()
self.t += dt
def download(self):
return self.data.download(self.stream)


@ -21,12 +21,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#Import packages we need
import numpy as np
import pycuda.compiler as cuda_compiler
import pycuda.gpuarray
import pycuda.driver as cuda
from SWESimulators import Common
from SWESimulators import Simulator
@ -39,7 +34,7 @@ from SWESimulators import Common
"""
Class that solves the SW equations using the second-order Harten-Lax-van Leer scheme
"""
class HLL2:
class HLL2 (Simulator.BaseSimulator):
"""
Initialization routine
@ -61,99 +56,62 @@ class HLL2:
g, \
theta=1.8, \
block_width=16, block_height=16):
#Create a CUDA stream
self.stream = cuda.Stream()
#Get kernels
self.hll2_module = context.get_kernel("HLL2_kernel.cu", block_width, block_height)
self.hll2_kernel = self.hll2_module.get_function("HLL2Kernel")
self.hll2_kernel.prepare("iifffffiPiPiPiPiPiPi")
# Call super constructor
super().__init__(context, \
h0, hu0, hv0, \
nx, ny, \
2, 2, \
dx, dy, dt, \
g, \
block_width, block_height);
#Create data by uploading to device
ghost_cells_x = 2
ghost_cells_y = 2
self.data = Common.SWEDataArakawaA(self.stream, \
nx, ny, \
ghost_cells_x, ghost_cells_y, \
h0, hu0, hv0)
#Save input parameters
#Notice that we need to specify them in the correct dataformat for the
#OpenCL kernel
self.nx = np.int32(nx)
self.ny = np.int32(ny)
self.dx = np.float32(dx)
self.dy = np.float32(dy)
self.dt = np.float32(dt)
self.g = np.float32(g)
self.theta = np.float32(theta)
#Initialize time
self.t = np.float32(0.0)
#Compute kernel launch parameters
self.local_size = (block_width, block_height, 1)
self.global_size = ( \
int(np.ceil(self.nx / float(self.local_size[0]))), \
int(np.ceil(self.ny / float(self.local_size[1]))) \
)
#Get kernels
self.kernel = context.get_prepared_kernel("HLL2_kernel.cu", "HLL2Kernel", \
"iifffffiPiPiPiPiPiPi", \
BLOCK_WIDTH=block_width, \
BLOCK_HEIGHT=block_height)
def __str__(self):
return "Harten-Lax-van Leer (2nd order)"
def simulate(self, t_end):
return super().simulateDimsplit(t_end)
"""
Function which steps n timesteps
"""
def step(self, t_end=0.0):
n = int(t_end / (2.0*self.dt) + 1)
def stepEuler(self, dt):
return self.stepDimsplitXY(dt)
for i in range(0, n):
#Dimensional splitting: second order accurate for every other timestep,
#thus run two timesteps in a go
def stepDimsplitXY(self, dt):
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, dt, \
self.g, \
self.theta, \
np.int32(0), \
self.data.h0.data.gpudata, self.data.h0.pitch, \
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
self.data.h1.data.gpudata, self.data.h1.pitch, \
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
self.data.hv1.data.gpudata, self.data.hv1.pitch)
self.data.swap()
self.t += dt
local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt))
if (local_dt <= 0.0):
break
#Along X, then Y
self.hll2_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, local_dt, \
self.g, \
self.theta, \
np.int32(0), \
self.data.h0.data.gpudata, self.data.h0.pitch, \
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
self.data.h1.data.gpudata, self.data.h1.pitch, \
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
self.data.hv1.data.gpudata, self.data.hv1.pitch)
self.data.swap()
#Along Y, then X
self.hll2_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, local_dt, \
self.g, \
self.theta, \
np.int32(1), \
self.data.h0.data.gpudata, self.data.h0.pitch, \
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
self.data.h1.data.gpudata, self.data.h1.pitch, \
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
self.data.hv1.data.gpudata, self.data.hv1.pitch)
self.data.swap()
self.t += local_dt
return self.t, 2*n
def download(self):
return self.data.download(self.stream)
def stepDimsplitYX(self, dt):
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, dt, \
self.g, \
self.theta, \
np.int32(1), \
self.data.h0.data.gpudata, self.data.h0.pitch, \
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
self.data.h1.data.gpudata, self.data.h1.pitch, \
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
self.data.hv1.data.gpudata, self.data.hv1.pitch)
self.data.swap()
self.t += dt
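Here simulate() defers to simulateDimsplit(), also defined in the base class and not shown in this diff. Judging by the removed loop, it presumably alternates the XY and YX sweeps so the scheme stays second order; a hedged sketch:

def simulateDimsplit(self, t_end):
    # Assumed driver: pairs of X-then-Y and Y-then-X substeps of length dt.
    n = int(t_end / (2.0 * self.dt) + 1)
    for i in range(n):
        local_dt = np.float32(0.5 * min(2.0 * self.dt, t_end - 2.0 * i * self.dt))
        if local_dt <= 0.0:
            break
        self.stepDimsplitXY(local_dt)
        self.stepDimsplitYX(local_dt)
    return self.t, 2 * n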


@ -19,6 +19,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "common.cu"
#include "limiters.cu"
#include "fluxes/HartenLaxVanLeer.cu"
@ -30,9 +32,9 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
* Computes the flux along the x axis for all faces
*/
__device__
void computeFluxF(float Q[3][block_height+4][block_width+4],
float Qx[3][block_height+2][block_width+2],
float F[3][block_height+1][block_width+1],
void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dx_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
@ -41,7 +43,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
{
const int j=ty;
const int l = j + 2; //Skip ghost cells
for (int i=tx; i<block_width+1; i+=block_width) {
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
const int k = i + 1;
// Reconstruct point values of Q at the left and right hand side
// of the cell for both the left (i) and right (i+1) cell
@ -64,7 +66,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
const float3 Q_l_bar = Q_lr + dt_/(2.0f*dx_) * (F_func(Q_ll, g_) - F_func(Q_lr, g_));
// Compute flux based on prediction
const float3 flux = HLLC_flux(Q_l_bar, Q_r_bar, g_);
const float3 flux = HLL_flux(Q_l_bar, Q_r_bar, g_);
//Write to shared memory
F[0][j][i] = flux.x;
@ -82,15 +84,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
 * Computes the flux along the y axis for all faces
*/
__device__
void computeFluxG(float Q[3][block_height+4][block_width+4],
float Qy[3][block_height+2][block_width+2],
float G[3][block_height+1][block_width+1],
void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dy_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
for (int j=ty; j<block_height+1; j+=block_height) {
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
const int l = j + 1;
{
int i=tx;
@ -134,7 +136,7 @@ void computeFluxG(float Q[3][block_height+4][block_width+4],
extern "C" {
__global__ void HLL2Kernel(
int nx_, int ny_,
float dx_, float dy_, float dt_,
@ -155,9 +157,9 @@ __global__ void HLL2Kernel(
float* hv1_ptr_, int hv1_pitch_) {
//Shared memory variables
__shared__ float Q[3][block_height+4][block_width+4];
__shared__ float Qx[3][block_height+2][block_width+2];
__shared__ float F[3][block_height+1][block_width+1];
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
__shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
@ -227,3 +229,5 @@ __global__ void HLL2Kernel(
hv1_ptr_, hv1_pitch_,
Q, nx_, ny_);
}
} // extern "C"


@ -1,5 +1,5 @@
/*
This OpenCL kernel implements the HLL flux
This GPU kernel implements the HLL flux
Copyright (C) 2016 SINTEF ICT
@ -20,6 +20,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "common.cu"
#include "fluxes/HartenLaxVanLeer.cu"
@ -29,8 +30,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
* Computes the flux along the x axis for all faces
*/
__device__
void computeFluxF(float Q[3][block_height+2][block_width+2],
float F[3][block_height+1][block_width+1],
void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_) {
//Index of thread within block
const int tx = get_local_id(0);
@ -39,7 +40,7 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
{
const int j=ty;
const int l = j + 1; //Skip ghost cells
for (int i=tx; i<block_width+1; i+=block_width) {
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
const int k = i;
const float3 Q_l = make_float3(Q[0][l][k ], Q[1][l][k ], Q[2][l][k ]);
@ -63,14 +64,14 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
* Computes the flux along the y axis for all faces
*/
__device__
void computeFluxG(float Q[3][block_height+2][block_width+2],
float G[3][block_height+1][block_width+1],
void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
for (int j=ty; j<block_height+1; j+=block_height) {
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
const int l = j;
{
const int i=tx;
@ -103,6 +104,7 @@ void computeFluxG(float Q[3][block_height+2][block_width+2],
extern "C" {
__global__ void HLLKernel(
int nx_, int ny_,
@ -118,6 +120,10 @@ __global__ void HLLKernel(
float* h1_ptr_, int h1_pitch_,
float* hu1_ptr_, int hu1_pitch_,
float* hv1_ptr_, int hv1_pitch_) {
const int block_width = BLOCK_WIDTH;
const int block_height = BLOCK_HEIGHT;
//Shared memory variables
__shared__ float Q[3][block_height+2][block_width+2];
__shared__ float F[3][block_height+1][block_width+1];
@ -160,3 +166,5 @@ __global__ void HLLKernel(
hv1_ptr_, hv1_pitch_,
Q, nx_, ny_);
}
} // extern "C"


@ -0,0 +1,80 @@
# -*- coding: utf-8 -*-
"""
This python module implements helpers for IPython / Jupyter and CUDA
Copyright (C) 2018 SINTEF ICT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import logging
from IPython.core.magic import line_magic, Magics, magics_class
import pycuda.driver as cuda
@magics_class
class CudaContextHandler(Magics):
@line_magic
def cuda_context_handler(self, context_name):
self.logger = logging.getLogger(__name__)
self.logger.debug("Registering %s as a global context", context_name)
if context_name in self.shell.user_ns.keys():
self.logger.debug("Context already registered! Ignoring")
return
else:
self.logger.debug("Creating context")
self.shell.ex(context_name + " = Common.CudaContext(verbose=True, blocking=False)")
# this function will be called on exceptions in any cell
def custom_exc(shell, etype, evalue, tb, tb_offset=None):
self.logger.exception("Exception caught: Resetting to CUDA context %s", context_name)
while (cuda.Context.get_current() != None):
context = cuda.Context.get_current()
self.logger.info("Popping <%s>", str(context.handle))
cuda.Context.pop()
if context_name in self.shell.user_ns.keys():
self.logger.info("Pushing <%s>", str(self.shell.user_ns[context_name].cuda_context.handle))
self.shell.ex(context_name + ".cuda_context.push()")
else:
self.logger.error("No CUDA context called %s found (something is wrong)", context_name)
self.logger.error("CUDA will not work now")
# still show the error within the notebook, don't just swallow it
shell.showtraceback((etype, evalue, tb), tb_offset=tb_offset)
# this registers a custom exception handler for the whole current notebook
get_ipython().set_custom_exc((Exception,), custom_exc)
# Handle CUDA context when exiting python
import atexit
def exitfunc():
self.logger.info("Exitfunc: Resetting CUDA context stack")
while (cuda.Context.get_current() != None):
context = cuda.Context.get_current()
self.logger.info("`-> Popping <%s>", str(context.handle))
cuda.Context.pop()
atexit.register(exitfunc)
logger = logging.getLogger(__name__)
logger.info("Registering automatic CUDA context handling")
logger.debug("(use %cuda_context_handler my_context to create a context called my_context")
ip = get_ipython()
ip.register_magics(CudaContextHandler)
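A sketch of the intended notebook usage (the module name below is an assumption; importing the module runs register_magics):

import SWESimulators.IPythonMagic    # assumed module name; registers the magic on import
from SWESimulators import Common     # the magic instantiates Common.CudaContext by name
# In a notebook cell:
#   %cuda_context_handler my_context
# my_context then exists as a notebook global, and any exception in a later cell runs
# custom_exc, which pops stray CUDA contexts and pushes my_context back on the stack.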


@ -26,12 +26,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#Import packages we need
import numpy as np
import pycuda.compiler as cuda_compiler
import pycuda.gpuarray
import pycuda.driver as cuda
from SWESimulators import Common
from SWESimulators import Simulator
@ -40,7 +35,7 @@ from SWESimulators import Common
"""
Class that solves the SW equations using the Kurganov-Petrova 2007 scheme
"""
class KP07:
class KP07 (Simulator.BaseSimulator):
"""
Initialization routine
@ -53,131 +48,64 @@ class KP07:
dy: Grid cell spacing along y-axis (20 000 m)
dt: Size of each timestep (90 s)
g: Gravitational acceleration (9.81 m/s^2)
f: Coriolis parameter (1.2e-4 s^-1)
r: Bottom friction coefficient (2.4e-3 m/s)
wind_type: Type of wind stress, 0=Uniform along shore, 1=bell shaped along shore, 2=moving cyclone
wind_tau0: Amplitude of wind stress (Pa)
wind_rho: Density of sea water (1025.0 kg / m^3)
wind_alpha: Offshore e-folding length (1/(10*dx) = 5e-6 m^-1)
wind_xm: Maximum wind stress for bell shaped wind stress
wind_Rc: Distance to max wind stress from center of cyclone (10dx = 200 000 m)
wind_x0: Initial x position of moving cyclone (dx*(nx/2) - u0*3600.0*48.0)
wind_y0: Initial y position of moving cyclone (dy*(ny/2) - v0*3600.0*48.0)
wind_u0: Translation speed along x for moving cyclone (30.0/sqrt(5.0))
wind_v0: Translation speed along y for moving cyclone (-0.5*u0)
"""
def __init__(self, \
context, \
h0, hu0, hv0, \
nx, ny, \
dx, dy, dt, \
g, theta=1.3, \
r=0.0, use_rk2=True,
g, \
theta=1.3, \
block_width=16, block_height=16):
#Create a CUDA stream
self.stream = cuda.Stream()
# Call super constructor
super().__init__(context, \
h0, hu0, hv0, \
nx, ny, \
2, 2, \
dx, dy, dt, \
g, \
block_width, block_height);
self.theta = np.float32(theta)
#Get kernels
self.kp07_module = context.get_kernel("KP07_kernel.cu", block_width, block_height)
self.kp07_kernel = self.kp07_module.get_function("KP07Kernel")
self.kp07_kernel.prepare("iiffffffiPiPiPiPiPiPi")
#Create data by uploading to device
ghost_cells_x = 2
ghost_cells_y = 2
self.data = Common.SWEDataArakawaA(self.stream, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
#Save input parameters
#Notice that we need to specify them in the correct dataformat for the
#OpenCL kernel
self.nx = np.int32(nx)
self.ny = np.int32(ny)
self.dx = np.float32(dx)
self.dy = np.float32(dy)
self.dt = np.float32(dt)
self.g = np.float32(g)
self.theta = np.float32(theta)
self.r = np.float32(r)
self.use_rk2 = use_rk2
#Initialize time
self.t = np.float32(0.0)
#Compute kernel launch parameters
self.local_size = (block_width, block_height, 1)
self.global_size = ( \
int(np.ceil(self.nx / float(self.local_size[0]))), \
int(np.ceil(self.ny / float(self.local_size[1]))) \
)
self.kernel = context.get_prepared_kernel("KP07_kernel.cu", "KP07Kernel", \
"iifffffiPiPiPiPiPiPi", \
BLOCK_WIDTH=block_width, \
BLOCK_HEIGHT=block_height)
def __str__(self):
return "Kurganov-Petrova 2007"
"""
Function which steps n timesteps
"""
def step(self, t_end=0.0):
n = int(t_end / self.dt + 1)
for i in range(0, n):
local_dt = np.float32(min(self.dt, t_end-i*self.dt))
if (local_dt <= 0.0):
break
if (self.use_rk2):
self.kp07_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, local_dt, \
self.g, \
self.theta, \
self.r, \
np.int32(0), \
self.data.h0.data.gpudata, self.data.h0.pitch, \
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
self.data.h1.data.gpudata, self.data.h1.pitch, \
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
self.data.hv1.data.gpudata, self.data.hv1.pitch)
self.kp07_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, local_dt, \
self.g, \
self.theta, \
self.r, \
np.int32(1), \
self.data.h1.data.gpudata, self.data.h1.pitch, \
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
self.data.hv1.data.gpudata, self.data.hv1.pitch, \
self.data.h0.data.gpudata, self.data.h0.pitch, \
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
self.data.hv0.data.gpudata, self.data.hv0.pitch)
else:
self.kp07_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, local_dt, \
self.g, \
self.theta, \
self.r, \
np.int32(0), \
self.data.h0.data.gpudata, self.data.h0.pitch, \
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
self.data.h1.data.gpudata, self.data.h1.pitch, \
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
self.data.hv1.data.gpudata, self.data.hv1.pitch)
self.cl_data.swap()
self.t += local_dt
return self.t, n
def simulate(self, t_end):
return super().simulateRK(t_end, 2)
def substepRK(self, dt, substep):
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, dt, \
self.g, \
self.theta, \
np.int32(substep), \
self.data.h0.data.gpudata, self.data.h0.pitch, \
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
self.data.h1.data.gpudata, self.data.h1.pitch, \
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
self.data.hv1.data.gpudata, self.data.hv1.pitch)
self.data.swap()
def stepEuler(self, dt):
self.substepRK(dt, 0)
self.t += dt
def stepRK(self, dt, order):
if (order != 2):
raise NotImplementedError("Only second order implemented")
self.substepRK(dt, 0)
self.substepRK(dt, 1)
self.t += dt
def download(self):
return self.data.download(self.stream)
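simulate() calls simulateRK(t_end, 2) on the base class, which is again not shown in this diff. Mirroring the removed loop, a minimal assumed sketch:

def simulateRK(self, t_end, order):
    # Assumed driver: repeated RK steps of at most self.dt until t_end is reached.
    n = int(t_end / self.dt + 1)
    for i in range(n):
        local_dt = np.float32(min(self.dt, t_end - i * self.dt))
        if local_dt <= 0.0:
            break
        self.stepRK(local_dt, order)
    return self.t, n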


@ -26,12 +26,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#Import packages we need
import numpy as np
import pycuda.compiler as cuda_compiler
import pycuda.gpuarray
import pycuda.driver as cuda
from SWESimulators import Common
from SWESimulators import Simulator
@ -40,7 +35,7 @@ from SWESimulators import Common
"""
Class that solves the SW equations using the dimensionally split KP07 scheme
"""
class KP07_dimsplit:
class KP07_dimsplit (Simulator.BaseSimulator):
"""
Initialization routine
@ -62,98 +57,63 @@ class KP07_dimsplit:
g, \
theta=1.3, \
block_width=16, block_height=16):
#Create a CUDA stream
self.stream = cuda.Stream()
#Get kernels
self.kp07_dimsplit_module = context.get_kernel("KP07_dimsplit_kernel.cu", block_width, block_height)
self.kp07_dimsplit_kernel = self.kp07_dimsplit_module.get_function("KP07DimsplitKernel")
self.kp07_dimsplit_kernel.prepare("iifffffiPiPiPiPiPiPi")
# Call super constructor
super().__init__(context, \
h0, hu0, hv0, \
nx, ny, \
2, 2, \
dx, dy, dt, \
g, \
block_width, block_height);
#Create data by uploading to device
ghost_cells_x = 2
ghost_cells_y = 2
self.data = Common.SWEDataArakawaA(self.stream, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
#Save input parameters
#Notice that we need to specify them in the correct dataformat for the
#OpenCL kernel
self.nx = np.int32(nx)
self.ny = np.int32(ny)
self.dx = np.float32(dx)
self.dy = np.float32(dy)
self.dt = np.float32(dt)
self.g = np.float32(g)
self.theta = np.float32(theta)
#Initialize time
self.t = np.float32(0.0)
#Compute kernel launch parameters
self.local_size = (block_width, block_height, 1)
self.global_size = ( \
int(np.ceil(self.nx / float(self.local_size[0]))), \
int(np.ceil(self.ny / float(self.local_size[1]))) \
)
#Get kernels
self.kernel = context.get_prepared_kernel("KP07_dimsplit_kernel.cu", "KP07DimsplitKernel", \
"iifffffiPiPiPiPiPiPi", \
BLOCK_WIDTH=block_width, \
BLOCK_HEIGHT=block_height)
def __str__(self):
return "Kurganov-Petrova 2007 dimensionally split"
def simulate(self, t_end):
return super().simulateDimsplit(t_end)
"""
Function which steps n timesteps
"""
def step(self, t_end=0.0):
n = int(t_end / (2.0*self.dt) + 1)
def stepEuler(self, dt):
return self.stepDimsplitXY(dt)
for i in range(0, n):
#Dimensional splitting: second order accurate for every other timestep,
#thus run two timesteps in a go
def stepDimsplitXY(self, dt):
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, dt, \
self.g, \
self.theta, \
np.int32(0), \
self.data.h0.data.gpudata, self.data.h0.pitch, \
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
self.data.h1.data.gpudata, self.data.h1.pitch, \
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
self.data.hv1.data.gpudata, self.data.hv1.pitch)
self.data.swap()
self.t += dt
#Compute timestep
local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt))
if (local_dt <= 0.0):
break
#Along X, then Y
self.kp07_dimsplit_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, local_dt, \
self.g, \
self.theta, \
np.int32(0), \
self.data.h0.data.gpudata, self.data.h0.pitch, \
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
self.data.h1.data.gpudata, self.data.h1.pitch, \
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
self.data.hv1.data.gpudata, self.data.hv1.pitch)
self.data.swap()
#Along Y, then X
self.kp07_dimsplit_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, local_dt, \
self.g, \
self.theta, \
np.int32(1), \
self.data.h0.data.gpudata, self.data.h0.pitch, \
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
self.data.h1.data.gpudata, self.data.h1.pitch, \
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
self.data.hv1.data.gpudata, self.data.hv1.pitch)
self.data.swap()
self.t += 2.0*local_dt
def stepDimsplitYX(self, dt):
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, dt, \
self.g, \
self.theta, \
np.int32(1), \
self.data.h0.data.gpudata, self.data.h0.pitch, \
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
self.data.h1.data.gpudata, self.data.h1.pitch, \
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
self.data.hv1.data.gpudata, self.data.hv1.pitch)
self.data.swap()
self.t += dt
return self.t, 2*n
def download(self):
return self.data.download(self.stream)


@ -25,12 +25,14 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "common.cu"
#include "limiters.cu"
#include "fluxes/CentralUpwind.cu"
__device__
void computeFluxF(float Q[3][block_height+4][block_width+4],
float Qx[3][block_height+2][block_width+2],
float F[3][block_height+1][block_width+1],
void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dx_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
@ -39,7 +41,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
{
int j=ty;
const int l = j + 2; //Skip ghost cells
for (int i=tx; i<block_width+1; i+=block_width) {
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
const int k = i + 1;
// Reconstruct point values of Q at the left and right hand side
// of the cell for both the left (i) and right (i+1) cell
@ -73,15 +75,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
}
__device__
void computeFluxG(float Q[3][block_height+4][block_width+4],
float Qy[3][block_height+2][block_width+2],
float G[3][block_height+1][block_width+1],
void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dy_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
for (int j=ty; j<block_height+1; j+=block_height) {
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
const int l = j + 1;
{
int i=tx;
@ -125,6 +127,7 @@ void computeFluxG(float Q[3][block_height+4][block_width+4],
/**
 * This kernel computes the dimensionally split 2D numerical scheme
*/
extern "C" {
__global__ void KP07DimsplitKernel(
int nx_, int ny_,
float dx_, float dy_, float dt_,
@ -146,9 +149,9 @@ __global__ void KP07DimsplitKernel(
//Shared memory variables
__shared__ float Q[3][block_height+4][block_width+4];
__shared__ float Qx[3][block_height+2][block_width+2];
__shared__ float F[3][block_height+1][block_width+1];
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
__shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
@ -218,3 +221,5 @@ __global__ void KP07DimsplitKernel(
hv1_ptr_, hv1_pitch_,
Q, nx_, ny_);
}
} // extern "C"


@ -25,12 +25,14 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "common.cu"
#include "limiters.cu"
#include "fluxes/CentralUpwind.cu"
__device__
void computeFluxF(float Q[3][block_height+4][block_width+4],
float Qx[3][block_height+2][block_width+2],
float F[3][block_height+1][block_width+1],
void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_) {
//Index of thread within block
const int tx = get_local_id(0);
@ -39,7 +41,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
{
int j=ty;
const int l = j + 2; //Skip ghost cells
for (int i=tx; i<block_width+1; i+=block_width) {
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
const int k = i + 1;
// Q at interface from the right and left
const float3 Qp = make_float3(Q[0][l][k+1] - 0.5f*Qx[0][j][i+1],
@ -59,15 +61,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
}
__device__
void computeFluxG(float Q[3][block_height+4][block_width+4],
float Qy[3][block_height+2][block_width+2],
float G[3][block_height+1][block_width+1],
void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
for (int j=ty; j<block_height+1; j+=block_height) {
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
const int l = j + 1;
{
int i=tx;
@ -104,8 +106,6 @@ __global__ void KP07Kernel(
float theta_,
float r_, //< Bottom friction coefficient
int step_,
//Input h^n
@ -127,14 +127,14 @@ __global__ void KP07Kernel(
const int tj = get_global_id(1) + 2;
//Shared memory variables
__shared__ float Q[3][block_height+4][block_width+4];
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
//The following slightly wastes memory, but enables us to reuse the
//functions in common.cu
__shared__ float Qx[3][block_height+2][block_width+2];
__shared__ float Qy[3][block_height+2][block_width+2];
__shared__ float F[3][block_height+1][block_width+1];
__shared__ float G[3][block_height+1][block_width+1];
__shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
__shared__ float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
__shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
@ -179,14 +179,12 @@ __global__ void KP07Kernel(
float* const hu_row = (float*) ((char*) hu1_ptr_ + hu1_pitch_*tj);
float* const hv_row = (float*) ((char*) hv1_ptr_ + hv1_pitch_*tj);
const float C = 2.0f*r_*dt_/Q[0][j][i];
if (step_ == 0) {
//First step of RK2 ODE integrator
h_row[ti] = h1;
hu_row[ti] = hu1 / (1.0f + C);
hv_row[ti] = hv1 / (1.0f + C);
hu_row[ti] = hu1;
hv_row[ti] = hv1;
}
else if (step_ == 1) {
//Second step of RK2 ODE integrator
@ -203,8 +201,8 @@ __global__ void KP07Kernel(
//Write to main memory
h_row[ti] = h_b;
hu_row[ti] = hu_b / (1.0f + 0.5f*C);
hv_row[ti] = hv_b / (1.0f + 0.5f*C);
hu_row[ti] = hu_b;
hv_row[ti] = hv_b;
}
}
}
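The step_ branches above implement a standard two-stage (Heun/SSP) RK2 update. As a NumPy-style sketch, with L denoting the spatial operator the kernel evaluates:

def rk2_update(Q0, dt, L):
    # step_ == 0: forward Euler predictor, written to the *1 buffers
    Q1 = Q0 + dt * L(Q0)
    # step_ == 1: second Euler step from Q1, averaged with Q0 and written back to the *0 buffers
    return 0.5 * (Q0 + Q1 + dt * L(Q1))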


@ -21,13 +21,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
import numpy as np
import pycuda.compiler as cuda_compiler
import pycuda.gpuarray
import pycuda.driver as cuda
from SWESimulators import Common
from SWESimulators import Simulator
@ -38,7 +32,7 @@ from SWESimulators import Common
"""
Class that solves the SW equations using the Lax Friedrichs scheme
"""
class LxF:
class LxF (Simulator.BaseSimulator):
"""
Initialization routine
@ -59,80 +53,40 @@ class LxF:
dx, dy, dt, \
g, \
block_width=16, block_height=16):
#Create a CUDA stream
self.stream = cuda.Stream()
#Get kernels
self.lxf_module = context.get_kernel("LxF_kernel.cu", block_width, block_height)
self.lxf_kernel = self.lxf_module.get_function("LxFKernel")
self.lxf_kernel.prepare("iiffffPiPiPiPiPiPi")
#Create data by uploading to device
ghost_cells_x = 1
ghost_cells_y = 1
self.data = Common.SWEDataArakawaA(self.stream, \
nx, ny, \
ghost_cells_x, ghost_cells_y, \
h0, hu0, hv0)
#Save input parameters
#Notice that we need to specify them in the correct dataformat for the
#OpenCL kernel
self.nx = np.int32(nx)
self.ny = np.int32(ny)
self.dx = np.float32(dx)
self.dy = np.float32(dy)
self.dt = np.float32(dt)
self.g = np.float32(g)
#Initialize time
self.t = np.float32(0.0)
#Compute kernel launch parameters
self.local_size = (block_width, block_height, 1)
self.global_size = ( \
int(np.ceil(self.nx / float(self.local_size[0]))), \
int(np.ceil(self.ny / float(self.local_size[1]))) \
)
# Call super constructor
super().__init__(context, \
h0, hu0, hv0, \
nx, ny, \
1, 1, \
dx, dy, dt, \
g, \
block_width, block_height);
# Get kernels
self.kernel = context.get_prepared_kernel("LxF_kernel.cu", "LxFKernel", \
"iiffffPiPiPiPiPiPi", \
BLOCK_WIDTH=block_width, \
BLOCK_HEIGHT=block_height)
def __str__(self):
return "Lax Friedrichs"
"""
Function which steps n timesteps
"""
def step(self, t_end=0.0):
n = int(t_end / self.dt + 1)
def simulate(self, t_end):
return super().simulateEuler(t_end)
for i in range(0, n):
local_dt = np.float32(min(self.dt, t_end-i*self.dt))
if (local_dt <= 0.0):
break
self.lxf_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, local_dt, \
self.g, \
self.data.h0.data.gpudata, self.data.h0.pitch, \
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
self.data.h1.data.gpudata, self.data.h1.pitch, \
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
self.data.hv1.data.gpudata, self.data.hv1.pitch)
self.t += local_dt
self.data.swap()
return self.t, n
def stepEuler(self, dt):
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, dt, \
self.g, \
self.data.h0.data.gpudata, self.data.h0.pitch, \
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
self.data.h1.data.gpudata, self.data.h1.pitch, \
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
self.data.hv1.data.gpudata, self.data.hv1.pitch)
self.data.swap()
self.t += dt
def download(self):
return self.data.download(self.stream)


@ -20,11 +20,13 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "common.cu"
#include "fluxes/LaxFriedrichs.cu"
/**
* Computes the flux along the x axis for all faces
*/
template <int block_width, int block_height>
__device__
void computeFluxF(float Q[3][block_height+2][block_width+2],
float F[3][block_height][block_width+1],
@ -60,6 +62,7 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
/**
* Computes the flux along the y axis for all faces
*/
template <int block_width, int block_height>
__device__
void computeFluxG(float Q[3][block_height+2][block_width+2],
float G[3][block_height+1][block_width],
@ -94,7 +97,9 @@ void computeFluxG(float Q[3][block_height+2][block_width+2],
}
__global__ void LxFKernel(
extern "C" {
__global__
void LxFKernel(
int nx_, int ny_,
float dx_, float dy_, float dt_,
float g_,
@ -109,6 +114,9 @@ __global__ void LxFKernel(
float* hu1_ptr_, int hu1_pitch_,
float* hv1_ptr_, int hv1_pitch_) {
const int block_width = BLOCK_WIDTH;
const int block_height = BLOCK_HEIGHT;
//Index of cell within domain
const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
const int tj = get_global_id(1) + 1;
@ -130,8 +138,8 @@ __global__ void LxFKernel(
//Compute fluxes along the x and y axis
computeFluxF(Q, F, g_, dx_, dt_);
computeFluxG(Q, G, g_, dy_, dt_);
computeFluxF<block_width, block_height>(Q, F, g_, dx_, dt_);
computeFluxG<block_width, block_height>(Q, G, g_, dy_, dt_);
__syncthreads();
@ -160,3 +168,6 @@ __global__ void LxFKernel(
hv_row[ti] = hv1;
}
}
} // extern "C"


@ -1,165 +0,0 @@
# -*- coding: utf-8 -*-
"""
This python class aids in plotting results from the numerical
simulations
Copyright (C) 2016 SINTEF ICT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
from matplotlib import pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
import time
"""
Class that makes plotting faster by caching the plots instead of recreating them
"""
class PlotHelper:
def __init__(self, fig, x_coords, y_coords, radius, eta1, u1, v1, eta2=None, u2=None, v2=None, interpolation_type='spline36'):
self.ny, self.nx = eta1.shape
self.fig = fig;
fig.set_figheight(15)
fig.set_figwidth(15)
min_x = np.min(x_coords[:,0]);
min_y = np.min(y_coords[0,:]);
max_x = np.max(x_coords[0,:]);
max_y = np.max(y_coords[:,0]);
domain_extent = [ x_coords[0, 0], x_coords[0, -1], y_coords[0, 0], y_coords[-1, 0] ]
if (eta2 is not None):
assert(u2 is not None)
assert(v2 is not None)
self.gs = gridspec.GridSpec(3, 3)
else:
self.gs = gridspec.GridSpec(2, 3)
ax = self.fig.add_subplot(self.gs[0, 0])
self.sp_eta = plt.imshow(eta1, interpolation=interpolation_type, origin='bottom', vmin=-0.05, vmax=0.05, extent=domain_extent)
plt.axis('tight')
ax.set_aspect('equal')
plt.title('Eta')
plt.colorbar()
ax = self.fig.add_subplot(self.gs[0, 1])
self.sp_u = plt.imshow(u1, interpolation=interpolation_type, origin='bottom', vmin=-1.5, vmax=1.5, extent=domain_extent)
plt.axis('tight')
ax.set_aspect('equal')
plt.title('U')
plt.colorbar()
ax = self.fig.add_subplot(self.gs[0, 2])
self.sp_v = plt.imshow(v1, interpolation=interpolation_type, origin='bottom', vmin=-1.5, vmax=1.5, extent=domain_extent)
plt.axis('tight')
ax.set_aspect('equal')
plt.title('V')
plt.colorbar()
ax = self.fig.add_subplot(self.gs[1, 0])
self.sp_radial1, = plt.plot(radius.ravel(), eta1.ravel(), '.')
plt.axis([0, min(max_x, max_y), -1.5, 1])
plt.title('Eta Radial plot')
ax = self.fig.add_subplot(self.gs[1, 1])
self.sp_x_axis1, = plt.plot(x_coords[self.ny/2,:], eta1[self.ny/2,:], 'k+--', label='x-axis')
self.sp_y_axis1, = plt.plot(y_coords[:,self.nx/2], eta1[:,self.nx/2], 'kx:', label='y-axis')
plt.axis([max(min_x, min_y), min(max_x, max_y), -1.5, 1])
plt.title('Eta along axis')
plt.legend()
ax = self.fig.add_subplot(self.gs[1, 2])
self.sp_x_diag1, = plt.plot(1.41*np.diagonal(x_coords, offset=-abs(self.nx-self.ny)/2), \
np.diagonal(eta1, offset=-abs(self.nx-self.ny)/2), \
'k+--', label='x = -y')
self.sp_y_diag1, = plt.plot(1.41*np.diagonal(y_coords.T, offset=abs(self.nx-self.ny)/2), \
np.diagonal(eta1.T, offset=abs(self.nx-self.ny)/2), \
'kx:', label='x = y')
plt.axis([max(min_x, min_y), min(max_x, max_y), -1.5, 1])
plt.title('Eta along diagonal')
plt.legend()
if (eta2 is not None):
ax = self.fig.add_subplot(self.gs[2, 0])
self.sp_radial2, = plt.plot(radius.ravel(), eta2.ravel(), '.')
plt.axis([0, min(max_x, max_y), -1.5, 1])
plt.title('Eta2 Radial plot')
ax = self.fig.add_subplot(self.gs[2, 1])
self.sp_x_axis2, = plt.plot(x_coords[self.ny/2,:], eta2[self.ny/2,:], 'k+--', label='x-axis')
self.sp_y_axis2, = plt.plot(y_coords[:,self.nx/2], eta2[:,self.nx/2], 'kx:', label='y-axis')
plt.axis([max(min_x, min_y), min(max_x, max_y), -1.5, 1])
plt.title('Eta2 along axis')
plt.legend()
ax = self.fig.add_subplot(self.gs[2, 2])
self.sp_x_diag2, = plt.plot(1.41*np.diagonal(x_coords, offset=-abs(self.nx-self.ny)/2), \
np.diagonal(eta2, offset=-abs(self.nx-self.ny)/2), \
'k+--', label='x = -y')
self.sp_y_diag2, = plt.plot(1.41*np.diagonal(y_coords.T, offset=abs(self.nx-self.ny)/2), \
np.diagonal(eta2.T, offset=abs(self.nx-self.ny)/2), \
'kx:', label='x = y')
plt.axis([max(min_x, min_y), min(max_x, max_y), -1.5, 1])
plt.title('Eta2 along diagonal')
plt.legend()
def plot(self, eta1, u1, v1, eta2=None, u2=None, v2=None):
self.fig.add_subplot(self.gs[0, 0])
self.sp_eta.set_data(eta1)
self.fig.add_subplot(self.gs[0, 1])
self.sp_u.set_data(u1)
self.fig.add_subplot(self.gs[0, 2])
self.sp_v.set_data(v1)
self.fig.add_subplot(self.gs[1, 0])
self.sp_radial1.set_ydata(eta1.ravel());
self.fig.add_subplot(self.gs[1, 1])
self.sp_x_axis1.set_ydata(eta1[(self.ny+2)/2,:])
self.sp_y_axis1.set_ydata(eta1[:,(self.nx+2)/2])
self.fig.add_subplot(self.gs[1, 2])
self.sp_x_diag1.set_ydata(np.diagonal(eta1, offset=-abs(self.nx-self.ny)/2))
self.sp_y_diag1.set_ydata(np.diagonal(eta1.T, offset=abs(self.nx-self.ny)/2))
if (eta2 is not None):
self.fig.add_subplot(self.gs[2, 0])
self.sp_radial2.set_ydata(eta2.ravel());
self.fig.add_subplot(self.gs[2, 1])
self.sp_x_axis2.set_ydata(eta2[(self.ny+2)/2,:])
self.sp_y_axis2.set_ydata(eta2[:,(self.nx+2)/2])
self.fig.add_subplot(self.gs[2, 2])
self.sp_x_diag2.set_ydata(np.diagonal(eta2, offset=-abs(self.nx-self.ny)/2))
self.sp_y_diag2.set_ydata(np.diagonal(eta2.T, offset=abs(self.nx-self.ny)/2))
plt.draw()
time.sleep(0.001)

186
SWESimulators/Simulator.py Normal file
View File

@ -0,0 +1,186 @@
# -*- coding: utf-8 -*-
"""
This python module implements the base simulator class used by the
numerical schemes for the shallow water equations
Copyright (C) 2016 SINTEF ICT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
import numpy as np
import logging
import pycuda.compiler as cuda_compiler
import pycuda.gpuarray
import pycuda.driver as cuda
from SWESimulators import Common
class BaseSimulator:
"""
Initialization routine
context: GPU context to use
h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
nx: Number of cells along x-axis
ny: Number of cells along y-axis
ghost_cells_x: Number of ghost cells along x-axis
ghost_cells_y: Number of ghost cells along y-axis
dx: Grid cell spacing along x-axis (20 000 m)
dy: Grid cell spacing along y-axis (20 000 m)
dt: Size of each timestep (90 s)
g: Gravitational acceleration (9.81 m/s^2)
block_width: Width of the CUDA thread block used for the kernel launch
block_height: Height of the CUDA thread block used for the kernel launch
"""
def __init__(self, \
context, \
h0, hu0, hv0, \
nx, ny, \
ghost_cells_x, ghost_cells_y, \
dx, dy, dt, \
g, \
block_width, block_height):
#Get logger
self.logger = logging.getLogger(__name__ + "." + self.__class__.__name__)
#Create a CUDA stream
self.stream = cuda.Stream()
#Create data by uploading to device
self.data = Common.SWEDataArakawaA(self.stream, \
nx, ny, \
ghost_cells_x, ghost_cells_y, \
h0, hu0, hv0)
#Save input parameters
#Notice that we need to specify them in the correct dataformat for the
#GPU kernel
self.nx = np.int32(nx)
self.ny = np.int32(ny)
self.dx = np.float32(dx)
self.dy = np.float32(dy)
self.dt = np.float32(dt)
self.g = np.float32(g)
#Keep track of simulation time
self.t = 0.0;
#Compute kernel launch parameters
self.local_size = (block_width, block_height, 1)
self.global_size = ( \
int(np.ceil(self.nx / float(self.local_size[0]))), \
int(np.ceil(self.ny / float(self.local_size[1]))) \
)
"""
Function which simulates forward in time using the default simulation type
"""
def simulate(self, t_end):
raise(NotImplementedError("Needs to be implemented in subclass"))
"""
Function which simulates t_end seconds using forward Euler
Requires that the stepEuler functionality is implemented in the subclasses
"""
def simulateEuler(self, t_end):
with Common.Timer(self.__class__.__name__ + ".simulateEuler") as t:
# Compute number of timesteps to perform
n = int(t_end / self.dt + 1)
for i in range(0, n):
# Compute timestep for "this" iteration
local_dt = np.float32(min(self.dt, t_end-i*self.dt))
# Stop if end reached (should not happen)
if (local_dt <= 0.0):
break
# Step with forward Euler
self.stepEuler(local_dt)
self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds", self.__class__.__name__, t_end, self.t, n, t.secs)
return self.t, n
"""
Function which simulates t_end seconds using Runge-Kutta 2
Requires that the stepRK functionality is implemented in the subclasses
"""
def simulateRK(self, t_end, order):
with Common.Timer(self.__class__.__name__ + ".simulateRK") as t:
# Compute number of timesteps to perform
n = int(t_end / self.dt + 1)
for i in range(0, n):
# Compute timestep for "this" iteration
local_dt = np.float32(min(self.dt, t_end-i*self.dt))
# Stop if end reached (should not happen)
if (local_dt <= 0.0):
break
# Perform all the Runge-Kutta substeps
self.stepRK(local_dt, order)
self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds", self.__class__.__name__, t_end, self.t, n, t.secs)
return self.t, n
"""
Function which simulates t_end seconds using second order dimensional splitting (XYYX)
Requires that the stepDimsplitX and stepDimsplitY functionality is implemented in the subclasses
"""
def simulateDimsplit(self, t_end):
with Common.Timer(self.__class__.__name__ + ".simulateDimsplit") as t:
# Compute number of timesteps to perform
n = int(t_end / (2.0*self.dt) + 1)
for i in range(0, n):
# Compute timestep for "this" iteration
local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt))
# Stop if end reached (should not happen)
if (local_dt <= 0.0):
break
# Perform the dimensional split substeps
self.stepDimsplitXY(local_dt)
self.stepDimsplitYX(local_dt)
self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds", self.__class__.__name__, t_end, self.t, 2*n, t.secs)
return self.t, 2*n
"""
Function which performs one single timestep of size dt using forward euler
"""
def stepEuler(self, dt):
raise(NotImplementedError("Needs to be implemented in subclass"))
def stepRK(self, dt, substep):
raise(NotImplementedError("Needs to be implemented in subclass"))
def stepDimsplitXY(self, dt):
raise(NotImplementedError("Needs to be implemented in subclass"))
def stepDimsplitYX(self, dt):
raise(NotImplementedError("Needs to be implemented in subclass"))
def sim_time(self):
return self.t
def download(self):
return self.data.download(self.stream)
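As a quick orientation, the following minimal sketch shows how the new BaseSimulator is intended to be specialized. The subclass name ToySimulator and its trivial stepEuler body are hypothetical and only illustrate the contract; a real subclass would launch a CUDA kernel on self.stream and swap the ping-pong buffers, as the WAF class below does.

from SWESimulators import Simulator

class ToySimulator(Simulator.BaseSimulator):
    """Hypothetical subclass: implementing stepEuler is enough for the
    inherited simulateEuler time loop to work."""
    def __init__(self, context, h0, hu0, hv0, nx, ny, dx, dy, dt, g):
        # One ghost cell in each direction, 16x16 CUDA blocks (assumed values)
        super().__init__(context, h0, hu0, hv0, nx, ny, 1, 1, dx, dy, dt, g, 16, 16)

    def stepEuler(self, dt):
        # A real scheme would enqueue its kernel here; the sketch only advances time
        self.t += dt

# Usage (requires a CUDA-capable setup):
#   t, n = ToySimulator(ctx, h0, hu0, hv0, nx, ny, dx, dy, dt, g).simulateEuler(t_end)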

View File

@ -22,12 +22,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#Import packages we need
import numpy as np
import pycuda.compiler as cuda_compiler
import pycuda.gpuarray
import pycuda.driver as cuda
from SWESimulators import Common
from SWESimulators import Simulator
@ -36,7 +31,7 @@ from SWESimulators import Common
"""
Class that solves the SW equations using the weighted average flux (WAF) scheme
"""
class WAF:
class WAF (Simulator.BaseSimulator):
"""
Initialization routine
@ -57,92 +52,57 @@ class WAF:
dx, dy, dt, \
g, \
block_width=16, block_height=16):
#Create a CUDA stream
self.stream = cuda.Stream()
# Call super constructor
super().__init__(context, \
h0, hu0, hv0, \
nx, ny, \
2, 2, \
dx, dy, dt, \
g, \
block_width, block_height);
#Get kernels
self.waf_module = context.get_kernel("WAF_kernel.cu", block_width, block_height)
self.waf_kernel = self.waf_module.get_function("WAFKernel")
self.waf_kernel.prepare("iiffffiPiPiPiPiPiPi")
#Create data by uploading to device
ghost_cells_x = 2
ghost_cells_y = 2
self.data = Common.SWEDataArakawaA(self.stream, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
#Save input parameters
#Notice that we need to specify them in the correct dataformat for the
#OpenCL kernel
self.nx = np.int32(nx)
self.ny = np.int32(ny)
self.dx = np.float32(dx)
self.dy = np.float32(dy)
self.dt = np.float32(dt)
self.g = np.float32(g)
#Initialize time
self.t = np.float32(0.0)
#Compute kernel launch parameters
self.local_size = (block_width, block_height, 1)
self.global_size = ( \
int(np.ceil(self.nx / float(self.local_size[0]))), \
int(np.ceil(self.ny / float(self.local_size[1]))) \
)
self.kernel = context.get_prepared_kernel("WAF_kernel.cu", "WAFKernel", \
"iiffffiPiPiPiPiPiPi", \
BLOCK_WIDTH=block_width, \
BLOCK_HEIGHT=block_height)
def __str__(self):
return "Weighted average flux"
"""
Function which steps n timesteps
"""
def step(self, t_end=0.0):
n = int(t_end / (2.0*self.dt) + 1)
def simulate(self, t_end):
return super().simulateDimsplit(t_end)
for i in range(0, n):
#Dimensional splitting: second order accurate for every other timestep,
#thus run two timesteps in a go
def stepEuler(self, dt):
return self.stepDimsplitXY(dt)
local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt))
if (local_dt <= 0.0):
break
#Along X, then Y
self.waf_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, local_dt, \
self.g, \
np.int32(0), \
self.data.h0.data.gpudata, self.data.h0.pitch, \
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
self.data.h1.data.gpudata, self.data.h1.pitch, \
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
self.data.hv1.data.gpudata, self.data.hv1.pitch)
#Along Y, then X
self.waf_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, local_dt, \
self.g, \
np.int32(1), \
self.data.h1.data.gpudata, self.data.h1.pitch, \
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
self.data.hv1.data.gpudata, self.data.hv1.pitch, \
self.data.h0.data.gpudata, self.data.h0.pitch, \
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
self.data.hv0.data.gpudata, self.data.hv0.pitch)
self.t += local_dt
return self.t, 2*n
def download(self):
return self.data.download(self.stream)
def stepDimsplitXY(self, dt):
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, dt, \
self.g, \
np.int32(0), \
self.data.h0.data.gpudata, self.data.h0.pitch, \
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
self.data.h1.data.gpudata, self.data.h1.pitch, \
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
self.data.hv1.data.gpudata, self.data.hv1.pitch)
self.data.swap()
self.t += dt
def stepDimsplitYX(self, dt):
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
self.nx, self.ny, \
self.dx, self.dy, dt, \
self.g, \
np.int32(1), \
self.data.h0.data.gpudata, self.data.h0.pitch, \
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
self.data.h1.data.gpudata, self.data.h1.pitch, \
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
self.data.hv1.data.gpudata, self.data.hv1.pitch)
self.data.swap()
self.t += dt
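For completeness, a hypothetical end-to-end use of the refactored WAF simulator could look as follows. The grid size, spacings and time step are placeholders, and the shape of the initial-condition arrays (two ghost cells on each side) as well as the tuple returned by download() are assumptions based on the code above.

import numpy as np
from SWESimulators import Common, WAF

ctx = Common.CudaContext()                 # CUDA context (see Common.py)

nx, ny, ghosts = 100, 100, 2               # placeholder domain size
shape = (ny + 2*ghosts, nx + 2*ghosts)
h0  = np.ones(shape, dtype=np.float32)     # placeholder initial conditions
hu0 = np.zeros(shape, dtype=np.float32)
hv0 = np.zeros(shape, dtype=np.float32)

sim = WAF.WAF(ctx, h0, hu0, hv0, nx, ny, dx=200.0, dy=200.0, dt=0.05, g=9.81)
t, n_steps = sim.simulate(10.0)            # dimensional-splitting loop from BaseSimulator
h, hu, hv = sim.download()                 # assumed to return the current fields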

View File

@ -25,6 +25,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "common.cu"
#include "fluxes/WeightedAverageFlux.cu"
@ -32,8 +33,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
* Computes the flux along the x axis for all faces
*/
__device__
void computeFluxF(float Q[3][block_height+4][block_width+4],
float F[3][block_height+1][block_width+1],
void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dx_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
@ -42,7 +43,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
{
int j=ty;
const int l = j + 2; //Skip ghost cells
for (int i=tx; i<block_width+1; i+=block_width) {
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
const int k = i + 1;
// Q at interface from the right and left
@ -71,15 +72,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
* Computes the flux along the y axis for all faces
*/
__device__
void computeFluxG(float Q[3][block_height+4][block_width+4],
float G[3][block_height+1][block_width+1],
void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dy_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
//Compute fluxes along the y axis
for (int j=ty; j<block_height+1; j+=block_height) {
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
const int l = j + 1;
{
int i=tx;
@ -114,6 +115,7 @@ void computeFluxG(float Q[3][block_height+4][block_width+4],
extern "C" {
__global__ void WAFKernel(
int nx_, int ny_,
float dx_, float dy_, float dt_,
@ -129,8 +131,8 @@ __global__ void WAFKernel(
float* hu1_ptr_, int hu1_pitch_,
float* hv1_ptr_, int hv1_pitch_) {
//Shared memory variables
__shared__ float Q[3][block_height+4][block_width+4];
__shared__ float F[3][block_height+1][block_width+1];
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
@ -193,3 +195,5 @@ __global__ void WAFKernel(
hv1_ptr_, hv1_pitch_,
Q, nx_, ny_);
}
} // extern "C"

View File

@ -94,18 +94,18 @@ inline __device__ __host__ float clamp(const float f, const float a, const float
__device__ void readBlock1(float* h_ptr_, int h_pitch_,
float* hu_ptr_, int hu_pitch_,
float* hv_ptr_, int hv_pitch_,
float Q[3][block_height+2][block_width+2],
float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
const int nx_, const int ny_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
//Index of block within domain
const int bx = block_width * get_group_id(0);
const int by = block_height * get_group_id(1);
const int bx = BLOCK_WIDTH * get_group_id(0);
const int by = BLOCK_HEIGHT * get_group_id(1);
//Read into shared memory
for (int j=ty; j<block_height+2; j+=block_height) {
for (int j=ty; j<BLOCK_HEIGHT+2; j+=BLOCK_HEIGHT) {
const int l = clamp(by + j, 0, ny_+1); // Out of bounds
//Compute the pointer to current row in the arrays
@ -113,7 +113,7 @@ __device__ void readBlock1(float* h_ptr_, int h_pitch_,
float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*l);
float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*l);
for (int i=tx; i<block_width+2; i+=block_width) {
for (int i=tx; i<BLOCK_WIDTH+2; i+=BLOCK_WIDTH) {
const int k = clamp(bx + i, 0, nx_+1); // Out of bounds
Q[0][j][i] = h_row[k];
@ -133,18 +133,18 @@ __device__ void readBlock1(float* h_ptr_, int h_pitch_,
__device__ void readBlock2(float* h_ptr_, int h_pitch_,
float* hu_ptr_, int hu_pitch_,
float* hv_ptr_, int hv_pitch_,
float Q[3][block_height+4][block_width+4],
float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
const int nx_, const int ny_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
//Index of block within domain
const int bx = block_width * get_group_id(0);
const int by = block_height * get_group_id(1);
const int bx = BLOCK_WIDTH * get_group_id(0);
const int by = BLOCK_HEIGHT * get_group_id(1);
//Read into shared memory
for (int j=ty; j<block_height+4; j+=block_height) {
for (int j=ty; j<BLOCK_HEIGHT+4; j+=BLOCK_HEIGHT) {
const int l = clamp(by + j, 0, ny_+3); // Out of bounds
//Compute the pointer to current row in the arrays
@ -152,7 +152,7 @@ __device__ void readBlock2(float* h_ptr_, int h_pitch_,
float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*l);
float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*l);
for (int i=tx; i<block_width+4; i+=block_width) {
for (int i=tx; i<BLOCK_WIDTH+4; i+=BLOCK_WIDTH) {
const int k = clamp(bx + i, 0, nx_+3); // Out of bounds
Q[0][j][i] = h_row[k];
@ -171,7 +171,7 @@ __device__ void readBlock2(float* h_ptr_, int h_pitch_,
__device__ void writeBlock1(float* h_ptr_, int h_pitch_,
float* hu_ptr_, int hu_pitch_,
float* hv_ptr_, int hv_pitch_,
float Q[3][block_height+2][block_width+2],
float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
const int nx_, const int ny_) {
//Index of thread within block
const int tx = get_local_id(0);
@ -206,7 +206,7 @@ __device__ void writeBlock1(float* h_ptr_, int h_pitch_,
__device__ void writeBlock2(float* h_ptr_, int h_pitch_,
float* hu_ptr_, int hu_pitch_,
float* hv_ptr_, int hv_pitch_,
float Q[3][block_height+4][block_width+4],
float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
const int nx_, const int ny_) {
//Index of thread within block
const int tx = get_local_id(0);
@ -240,7 +240,7 @@ __device__ void writeBlock2(float* h_ptr_, int h_pitch_,
* No flow boundary conditions for the shallow water equations
* with one ghost cell in each direction
*/
__device__ void noFlowBoundary1(float Q[3][block_height+2][block_width+2], const int nx_, const int ny_) {
__device__ void noFlowBoundary1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const int nx_, const int ny_) {
//Global index
const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
const int tj = get_global_id(1) + 1;
@ -284,7 +284,7 @@ __device__ void noFlowBoundary1(float Q[3][block_height+2][block_width+2], const
* No flow boundary conditions for the shallow water equations
* with two ghost cells in each direction
*/
__device__ void noFlowBoundary2(float Q[3][block_height+4][block_width+4], const int nx_, const int ny_) {
__device__ void noFlowBoundary2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const int nx_, const int ny_) {
//Global index
const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
const int tj = get_global_id(1) + 2;
@ -342,8 +342,8 @@ __device__ void noFlowBoundary2(float Q[3][block_height+4][block_width+4], const
/**
* Evolves the solution in time along the x axis (dimensional splitting)
*/
__device__ void evolveF1(float Q[3][block_height+2][block_width+2],
float F[3][block_height+1][block_width+1],
__device__ void evolveF1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const int nx_, const int ny_,
const float dx_, const float dt_) {
//Index of thread within block
@ -372,8 +372,8 @@ __device__ void evolveF1(float Q[3][block_height+2][block_width+2],
/**
* Evolves the solution in time along the x axis (dimensional splitting)
*/
__device__ void evolveF2(float Q[3][block_height+4][block_width+4],
float F[3][block_height+1][block_width+1],
__device__ void evolveF2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const int nx_, const int ny_,
const float dx_, const float dt_) {
//Index of thread within block
@ -402,8 +402,8 @@ __device__ void evolveF2(float Q[3][block_height+4][block_width+4],
/**
* Evolves the solution in time along the y axis (dimensional splitting)
*/
__device__ void evolveG1(float Q[3][block_height+2][block_width+2],
float G[3][block_height+1][block_width+1],
__device__ void evolveG1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const int nx_, const int ny_,
const float dy_, const float dt_) {
//Index of thread within block
@ -433,8 +433,8 @@ __device__ void evolveG1(float Q[3][block_height+2][block_width+2],
/**
* Evolves the solution in time along the y axis (dimensional splitting)
*/
__device__ void evolveG2(float Q[3][block_height+4][block_width+4],
float G[3][block_height+1][block_width+1],
__device__ void evolveG2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const int nx_, const int ny_,
const float dy_, const float dt_) {
//Index of thread within block
@ -464,79 +464,6 @@ __device__ void evolveG2(float Q[3][block_height+4][block_width+4],
/**
* Reconstructs a slope using the minmod limiter based on three
* consecutive values
*/
__device__ float minmodSlope(float left, float center, float right, float theta) {
const float backward = (center - left) * theta;
const float central = (right - left) * 0.5f;
const float forward = (right - center) * theta;
return 0.25f
*copysign(1.0f, backward)
*(copysign(1.0f, backward) + copysign(1.0f, central))
*(copysign(1.0f, central) + copysign(1.0f, forward))
*min( min(fabs(backward), fabs(central)), fabs(forward) );
}
/**
* Reconstructs a minmod slope for a whole block along x
*/
__device__ void minmodSlopeX(float Q[3][block_height+4][block_width+4],
float Qx[3][block_height+2][block_width+2],
const float theta_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
//Reconstruct slopes along x axis
{
const int j = ty;
const int l = j + 2; //Skip ghost cells
for (int i=tx; i<block_width+2; i+=block_width) {
const int k = i + 1;
for (int p=0; p<3; ++p) {
Qx[p][j][i] = minmodSlope(Q[p][l][k-1], Q[p][l][k], Q[p][l][k+1], theta_);
}
}
}
}
/**
* Reconstructs a minmod slope for a whole block along y
*/
__device__ void minmodSlopeY(float Q[3][block_height+4][block_width+4],
float Qy[3][block_height+2][block_width+2],
const float theta_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
for (int j=ty; j<block_height+2; j+=block_height) {
const int l = j + 1;
{
const int i = tx;
const int k = i + 2; //Skip ghost cells
for (int p=0; p<3; ++p) {
Qy[p][j][i] = minmodSlope(Q[p][l-1][k], Q[p][l][k], Q[p][l+1][k], theta_);
}
}
}
}
__device__ float3 F_func(const float3 Q, const float g) {
float3 F;
@ -551,23 +478,9 @@ __device__ float3 F_func(const float3 Q, const float g) {
/**
* Central upwind flux function
*/
__device__ float3 CentralUpwindFlux(const float3 Qm, float3 Qp, const float g) {
const float3 Fp = F_func(Qp, g);
const float up = Qp.y / Qp.x; // hu / h
const float cp = sqrt(g*Qp.x); // sqrt(g*h)
const float3 Fm = F_func(Qm, g);
const float um = Qm.y / Qm.x; // hu / h
const float cm = sqrt(g*Qm.x); // sqrt(g*h)
const float am = min(min(um-cm, up-cp), 0.0f); // largest negative wave speed
const float ap = max(max(um+cm, up+cp), 0.0f); // largest positive wave speed
return ((ap*Fm - am*Fp) + ap*am*(Qp-Qm))/(ap-am);
}
@ -578,397 +491,8 @@ __device__ float3 CentralUpwindFlux(const float3 Qm, float3 Qp, const float g) {
/**
* Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 180)
*/
__device__ float3 HLL_flux(const float3 Q_l, const float3 Q_r, const float g_) {
const float h_l = Q_l.x;
const float h_r = Q_r.x;
// Calculate velocities
const float u_l = Q_l.y / h_l;
const float u_r = Q_r.y / h_r;
// Estimate the potential wave speeds
const float c_l = sqrt(g_*h_l);
const float c_r = sqrt(g_*h_r);
// Compute h in the "star region", h^dagger
const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) );
const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) );
const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
// Compute wave speed estimates
const float S_l = u_l - c_l*q_l;
const float S_r = u_r + c_r*q_r;
//Upwind selection
if (S_l >= 0.0f) {
return F_func(Q_l, g_);
}
else if (S_r <= 0.0f) {
return F_func(Q_r, g_);
}
//Or estimate flux in the star region
else {
const float3 F_l = F_func(Q_l, g_);
const float3 F_r = F_func(Q_r, g_);
const float3 flux = (S_r*F_l - S_l*F_r + S_r*S_l*(Q_r - Q_l)) / (S_r-S_l);
return flux;
}
}
/**
* Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 181)
*/
__device__ float3 HLLC_flux(const float3 Q_l, const float3 Q_r, const float g_) {
const float h_l = Q_l.x;
const float h_r = Q_r.x;
// Calculate velocities
const float u_l = Q_l.y / h_l;
const float u_r = Q_r.y / h_r;
// Estimate the potential wave speeds
const float c_l = sqrt(g_*h_l);
const float c_r = sqrt(g_*h_r);
// Compute h in the "star region", h^dagger
const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) );
const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) );
const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
// Compute wave speed estimates
const float S_l = u_l - c_l*q_l;
const float S_r = u_r + c_r*q_r;
const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) );
const float3 F_l = F_func(Q_l, g_);
const float3 F_r = F_func(Q_r, g_);
//Upwind selection
if (S_l >= 0.0f) {
return F_l;
}
else if (S_r <= 0.0f) {
return F_r;
}
//Or estimate flux in the "left star" region
else if (S_l <= 0.0f && 0.0f <=S_star) {
const float v_l = Q_l.z / h_l;
const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1, S_star, v_l);
const float3 flux = F_l + S_l*(Q_star_l - Q_l);
return flux;
}
//Or estimate flux in the "righ star" region
else if (S_star <= 0.0f && 0.0f <=S_r) {
const float v_r = Q_r.z / h_r;
const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1, S_star, v_r);
const float3 flux = F_r + S_r*(Q_star_r - Q_r);
return flux;
}
else {
return make_float3(-99999.9f, -99999.9f, -99999.9f); //Something wrong here
}
}
/**
* Superbee flux limiter for WAF.
* Related to superbee limiter so that WAF_superbee(r, c) = 1 - (1-|c|)*superbee(r)
* @param r_ the ratio of upwind change (see Toro 2001, p. 203/204)
* @param c_ the courant number for wave k, dt*S_k/dx
*/
__device__ float WAF_superbee(float r_, float c_) {
// r <= 0.0
if (r_ <= 0.0f) {
return 1.0f;
}
// 0.0 <= r <= 1/2
else if (r_ <= 0.5f) {
return 1.0f - 2.0f*(1.0f - fabsf(c_))*r_;
}
// 1/2 <= r <= 1
else if (r_ <= 1.0f) {
return fabs(c_);
}
// 1 <= r <= 2
else if (r_ <= 2.0f) {
return 1.0f - (1.0f - fabsf(c_))*r_;
}
// r >= 2
else {
return 2.0f*fabsf(c_) - 1.0f;
}
}
__device__ float WAF_albada(float r_, float c_) {
if (r_ <= 0.0f) {
return 1.0f;
}
else {
return 1.0f - (1.0f - fabsf(c_)) * r_ * (1.0f + r_) / (1.0f + r_*r_);
}
}
__device__ float WAF_minbee(float r_, float c_) {
r_ = fmaxf(-1.0f, fminf(2.0f, r_));
if (r_ <= 0.0f) {
return 1.0f;
}
if (r_ >= 0.0f && r_ <= 1.0f) {
return 1.0f - (1.0f - fabsf(c_)) * r_;
}
else {
return fabsf(c_);
}
}
__device__ float WAF_minmod(float r_, float c_) {
return 1.0f - (1.0f - fabsf(c_)) * fmaxf(0.0f, fminf(1.0f, r_));
}
__device__ float minmod(float r_) {
return fmaxf(0.0f, fminf(1.0f, r_));
}
__device__ float generalized_minmod(float r_, float theta_) {
return fmaxf(0.0f, fminf(theta_*r_, fminf( (1.0f + r_) / 2.0f, theta_)));
}
__device__ float superbee(float r_) {
return fmaxf(0.0f, fmaxf(fminf(2.0f*r_, 1.0f), fminf(r_, 2.0f)));
}
__device__ float vanAlbada1(float r_) {
return (r_*r_ + r_) / (r_*r_ + 1.0f);
}
__device__ float vanLeer(float r_) {
return (r_ + fabsf(r_)) / (1.0f + fabsf(r_));
}
__device__ float limiterToWAFLimiter(float r_, float c_) {
return 1.0f - (1.0f - fabsf(c_))*r_;
}
__device__ float desingularize(float x_, float eps_) {
return copysign(1.0f, x_)*fmaxf(fabsf(x_), fminf(x_*x_/(2.0f*eps_)+0.5f*eps_, eps_));
}
// Compute h in the "star region", h^dagger
__device__ float computeHStar(float h_l, float h_r, float u_l, float u_r, float c_l, float c_r, float g_) {
//return 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
const float h_tmp = 0.5f * (c_l + c_r) + 0.25f * (u_l - u_r);
return h_tmp*h_tmp / g_;
}
/**
* Weighted average flux (Toro 2001, p 200) for interface {i+1/2}
* @param r_ The flux limiter parameter (see Toro 2001, p. 203)
* @param Q_l2 Q_{i-1}
* @param Q_l1 Q_{i}
* @param Q_r1 Q_{i+1}
* @param Q_r2 Q_{i+2}
*/
__device__ float3 WAF_1D_flux(const float3 Q_l2, const float3 Q_l1, const float3 Q_r1, const float3 Q_r2, const float g_, const float dx_, const float dt_) {
const float h_l = Q_l1.x;
const float h_r = Q_r1.x;
const float h_l2 = Q_l2.x;
const float h_r2 = Q_r2.x;
// Calculate velocities
const float u_l = Q_l1.y / h_l;
const float u_r = Q_r1.y / h_r;
const float u_l2 = Q_l2.y / h_l2;
const float u_r2 = Q_r2.y / h_r2;
const float v_l = Q_l1.z / h_l;
const float v_r = Q_r1.z / h_r;
const float v_l2 = Q_l2.z / h_l2;
const float v_r2 = Q_r2.z / h_r2;
// Estimate the potential wave speeds
const float c_l = sqrt(g_*h_l);
const float c_r = sqrt(g_*h_r);
const float c_l2 = sqrt(g_*h_l2);
const float c_r2 = sqrt(g_*h_r2);
// Compute h in the "star region", h^dagger
const float h_dag_l = computeHStar(h_l2, h_l, u_l2, u_l, c_l2, c_l, g_);
const float h_dag = computeHStar( h_l, h_r, u_l, u_r, c_l, c_r, g_);
const float h_dag_r = computeHStar( h_r, h_r2, u_r, u_r2, c_r, c_r2, g_);
const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag ) ) / h_l;
const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag ) ) / h_r;
const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
// Compute wave speed estimates
const float S_l = u_l - c_l*q_l;
const float S_r = u_r + c_r*q_r;
const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) );
const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1.0, S_star, v_l);
const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1.0, S_star, v_r);
// Estimate the fluxes in the four regions
const float3 F_1 = F_func(Q_l1, g_);
const float3 F_4 = F_func(Q_r1, g_);
const float3 F_2 = F_1 + S_l*(Q_star_l - Q_l1);
const float3 F_3 = F_4 + S_r*(Q_star_r - Q_r1);
//const float3 F_2 = F_func(Q_star_l, g_);
//const float3 F_3 = F_func(Q_star_r, g_);
// Compute the courant numbers for the waves
const float c_1 = S_l * dt_ / dx_;
const float c_2 = S_star * dt_ / dx_;
const float c_3 = S_r * dt_ / dx_;
// Compute the "upwind change" vectors for the i-3/2 and i+3/2 interfaces
const float eps = 1.0e-6f;
const float r_1 = desingularize( (c_1 > 0.0f) ? (h_dag_l - h_l2) : (h_dag_r - h_r), eps) / desingularize((h_dag - h_l), eps);
const float r_2 = desingularize( (c_2 > 0.0f) ? (v_l - v_l2) : (v_r2 - v_r), eps ) / desingularize((v_r - v_l), eps);
const float r_3 = desingularize( (c_3 > 0.0f) ? (h_l - h_dag_l) : (h_r2 - h_dag_r), eps ) / desingularize((h_r - h_dag), eps);
/*
const float eps1 = 1.0e-6f;
const float eps2 = eps1;//*1.0e-1f;
const float rh_m = desingularize((h_l - h_l2), eps1) / desingularize((h_r - h_l), eps2);
const float rh_p = desingularize((h_r2 - h_r), eps1) / desingularize((h_r - h_l), eps2);
const float rv_m = desingularize((v_l - v_l2), eps1) / desingularize((v_r - v_l), eps2);
const float rv_p = desingularize((v_r2 - v_r), eps1) / desingularize((v_r - v_l), eps2);
// Compute the r parameters for the flux limiter
const float rh_1 = (c_1 > 0.0f) ? rh_m : rh_p;
const float rv_1 = (c_1 > 0.0f) ? rv_m : rv_p;
const float rh_2 = (c_2 > 0.0f) ? rh_m : rh_p;
const float rv_2 = (c_2 > 0.0f) ? rv_m : rv_p;
const float rh_3 = (c_3 > 0.0f) ? rh_m : rh_p;
const float rv_3 = (c_3 > 0.0f) ? rv_m : rv_p;
const float r_1 = rh_1;
const float r_2 = rv_2;
const float r_3 = rh_3;
*/
// Compute the limiter
// We use h for the nonlinear waves, and v for the middle shear wave
const float A_1 = copysign(1.0f, c_1) * limiterToWAFLimiter(generalized_minmod(r_1, 1.9f), c_1);
const float A_2 = copysign(1.0f, c_2) * limiterToWAFLimiter(generalized_minmod(r_2, 1.9f), c_2); //Middle shear wave
const float A_3 = copysign(1.0f, c_3) * limiterToWAFLimiter(generalized_minmod(r_3, 1.9f), c_3);
//Average the fluxes
const float3 flux = 0.5f*( F_1 + F_4 )
- 0.5f*( A_1 * (F_2 - F_1)
+ A_2 * (F_3 - F_2)
+ A_3 * (F_4 - F_3) );
return flux;
}
/**
* Lax-Friedrichs flux (Toro 2001, p 163)
*/
__device__ float3 LxF_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
const float3 F_l = F_func(Q_l, g_);
const float3 F_r = F_func(Q_r, g_);
return 0.5f*(F_l + F_r) + (dx_/(2.0f*dt_))*(Q_l - Q_r);
}
/**
* Lax-Friedrichs extended to 2D
*/
__device__ float3 LxF_2D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
const float3 F_l = F_func(Q_l, g_);
const float3 F_r = F_func(Q_r, g_);
//Note numerical diffusion for 2D here (0.25)
return 0.5f*(F_l + F_r) + (dx_/(4.0f*dt_))*(Q_l - Q_r);
}
/**
* Richtmeyer / Two-step Lax-Wendroff flux (Toro 2001, p 164)
*/
__device__ float3 LxW2_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
const float3 F_l = F_func(Q_l, g_);
const float3 F_r = F_func(Q_r, g_);
const float3 Q_lw2 = 0.5f*(Q_l + Q_r) + (dt_/(2.0f*dx_))*(F_l - F_r);
return F_func(Q_lw2, g_);
}
/**
* Godunovs centered scheme (Toro 2001, p 165)
*/
__device__ float3 GodC_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
const float3 F_l = F_func(Q_l, g_);
const float3 F_r = F_func(Q_r, g_);
const float3 Q_godc = 0.5f*(Q_l + Q_r) + (dt_/dx_)*(F_l - F_r);
return F_func(Q_godc, g_);
}
/**
* First Ordered Centered (Toro 2001, p.163)
*/
__device__ float3 FORCE_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
const float3 F_lf = LxF_1D_flux(Q_l, Q_r, g_, dx_, dt_);
const float3 F_lw2 = LxW2_1D_flux(Q_l, Q_r, g_, dx_, dt_);
return 0.5f*(F_lf + F_lw2);
}

View File

@ -0,0 +1,39 @@
/*
This file implements the Central upwind flux
Copyright (C) 2016, 2017, 2018 SINTEF ICT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* Central upwind flux function
*/
__device__ float3 CentralUpwindFlux(const float3 Qm, float3 Qp, const float g) {
const float3 Fp = F_func(Qp, g);
const float up = Qp.y / Qp.x; // hu / h
const float cp = sqrt(g*Qp.x); // sqrt(g*h)
const float3 Fm = F_func(Qm, g);
const float um = Qm.y / Qm.x; // hu / h
const float cm = sqrt(g*Qm.x); // sqrt(g*h)
const float am = min(min(um-cm, up-cp), 0.0f); // largest negative wave speed
const float ap = max(max(um+cm, up+cp), 0.0f); // largest positive wave speed
return ((ap*Fm - am*Fp) + ap*am*(Qp-Qm))/(ap-am);
}
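For reference, the return statement above is the standard central-upwind flux; in the notation of the code (Qm/Qp for the left/right reconstructed states, am/ap for the wave-speed bounds) it reads

$$
F_{i+1/2} \;=\; \frac{a^{+} F(Q^{-}) - a^{-} F(Q^{+})}{a^{+} - a^{-}}
\;+\; \frac{a^{+} a^{-}}{a^{+} - a^{-}}\,\bigl(Q^{+} - Q^{-}\bigr),
\qquad
a^{-} = \min(u^{-} - c^{-},\, u^{+} - c^{+},\, 0),
\quad
a^{+} = \max(u^{-} + c^{-},\, u^{+} + c^{+},\, 0).
$$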

View File

@ -0,0 +1,33 @@
/*
This file implements the First ORder CEntered flux
Copyright (C) 2016, 2017, 2018 SINTEF ICT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "./LaxFriedrichs.cu"
#include "./LaxWendroff.cu"
/**
* First ORder CEntered (FORCE) flux (Toro 2001, p. 163)
*/
__device__ float3 FORCE_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
const float3 F_lf = LxF_1D_flux(Q_l, Q_r, g_, dx_, dt_);
const float3 F_lw2 = LxW2_1D_flux(Q_l, Q_r, g_, dx_, dt_);
return 0.5f*(F_lf + F_lw2);
}
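In formula form, the FORCE flux introduced here is the arithmetic mean of the two fluxes it includes:

$$
F^{\mathrm{FORCE}}_{i+1/2} \;=\; \tfrac{1}{2}\,\bigl(F^{\mathrm{LxF}}_{i+1/2} + F^{\mathrm{LW2}}_{i+1/2}\bigr).
$$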

View File

@ -0,0 +1,36 @@
/*
This file implements the Godunov flux
Copyright (C) 2016, 2017, 2018 SINTEF ICT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* Godunov's centered scheme (Toro 2001, p 165)
*/
__device__ float3 GodC_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
const float3 F_l = F_func(Q_l, g_);
const float3 F_r = F_func(Q_r, g_);
const float3 Q_godc = 0.5f*(Q_l + Q_r) + (dt_/dx_)*(F_l - F_r);
return F_func(Q_godc, g_);
}
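Written out, the flux above evaluates the physical flux in Godunov's centred intermediate state:

$$
Q^{\mathrm{GodC}} \;=\; \tfrac{1}{2}\,(Q_l + Q_r) + \frac{\Delta t}{\Delta x}\,\bigl(F(Q_l) - F(Q_r)\bigr),
\qquad
F^{\mathrm{GodC}}_{i+1/2} \;=\; F\bigl(Q^{\mathrm{GodC}}\bigr).
$$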

View File

@ -0,0 +1,63 @@
/*
This file implements the Harten-Lax-van Leer flux
Copyright (C) 2016, 2017, 2018 SINTEF ICT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 180)
*/
__device__ float3 HLL_flux(const float3 Q_l, const float3 Q_r, const float g_) {
const float h_l = Q_l.x;
const float h_r = Q_r.x;
// Calculate velocities
const float u_l = Q_l.y / h_l;
const float u_r = Q_r.y / h_r;
// Estimate the potential wave speeds
const float c_l = sqrt(g_*h_l);
const float c_r = sqrt(g_*h_r);
// Compute h in the "star region", h^dagger
const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) );
const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) );
const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
// Compute wave speed estimates
const float S_l = u_l - c_l*q_l;
const float S_r = u_r + c_r*q_r;
//Upwind selection
if (S_l >= 0.0f) {
return F_func(Q_l, g_);
}
else if (S_r <= 0.0f) {
return F_func(Q_r, g_);
}
//Or estimate flux in the star region
else {
const float3 F_l = F_func(Q_l, g_);
const float3 F_r = F_func(Q_r, g_);
const float3 flux = (S_r*F_l - S_l*F_r + S_r*S_l*(Q_r - Q_l)) / (S_r-S_l);
return flux;
}
}
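The three branches above implement the usual HLL upwind selection; in the star region the returned flux is

$$
F^{\mathrm{HLL}} \;=\; \frac{S_r\,F(Q_l) - S_l\,F(Q_r) + S_l S_r\,(Q_r - Q_l)}{S_r - S_l},
\qquad S_l \le 0 \le S_r,
$$

while $F(Q_l)$ is used when $S_l \ge 0$ and $F(Q_r)$ when $S_r \le 0$.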

View File

@ -0,0 +1,80 @@
/*
This file implements the Harten-Lax-van Leer flux with contact discontinuity
Copyright (C) 2016, 2017, 2018 SINTEF ICT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 181)
*/
__device__ float3 HLLC_flux(const float3 Q_l, const float3 Q_r, const float g_) {
const float h_l = Q_l.x;
const float h_r = Q_r.x;
// Calculate velocities
const float u_l = Q_l.y / h_l;
const float u_r = Q_r.y / h_r;
// Estimate the potential wave speeds
const float c_l = sqrt(g_*h_l);
const float c_r = sqrt(g_*h_r);
// Compute h in the "star region", h^dagger
const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) );
const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) );
const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
// Compute wave speed estimates
const float S_l = u_l - c_l*q_l;
const float S_r = u_r + c_r*q_r;
const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) );
const float3 F_l = F_func(Q_l, g_);
const float3 F_r = F_func(Q_r, g_);
//Upwind selection
if (S_l >= 0.0f) {
return F_l;
}
else if (S_r <= 0.0f) {
return F_r;
}
//Or estimate flux in the "left star" region
else if (S_l <= 0.0f && 0.0f <= S_star) {
const float v_l = Q_l.z / h_l;
const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1, S_star, v_l);
const float3 flux = F_l + S_l*(Q_star_l - Q_l);
return flux;
}
//Or estimate flux in the "right star" region
else if (S_star <= 0.0f && 0.0f <= S_r) {
const float v_r = Q_r.z / h_r;
const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1, S_star, v_r);
const float3 flux = F_r + S_r*(Q_star_r - Q_r);
return flux;
}
else {
return make_float3(-99999.9f, -99999.9f, -99999.9f); //Something wrong here
}
}
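The two middle branches use Toro's HLLC star states, which in the notation of the code are

$$
Q^{*}_{K} \;=\; h_K\,\frac{S_K - u_K}{S_K - S^{*}}
\begin{pmatrix} 1 \\ S^{*} \\ v_K \end{pmatrix},
\qquad
F^{*}_{K} \;=\; F(Q_K) + S_K\,\bigl(Q^{*}_{K} - Q_K\bigr),
\qquad K \in \{l, r\},
$$

with the contact wave speed $S^{*}$ computed as in the line defining S_star above.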

View File

@ -0,0 +1,24 @@
/**
* Lax-Friedrichs flux (Toro 2001, p 163)
*/
__device__ float3 LxF_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
const float3 F_l = F_func(Q_l, g_);
const float3 F_r = F_func(Q_r, g_);
return 0.5f*(F_l + F_r) + (dx_/(2.0f*dt_))*(Q_l - Q_r);
}
/**
* Lax-Friedrichs extended to 2D
*/
__device__ float3 LxF_2D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
const float3 F_l = F_func(Q_l, g_);
const float3 F_r = F_func(Q_r, g_);
//Note numerical diffusion for 2D here (0.25)
return 0.5f*(F_l + F_r) + (dx_/(4.0f*dt_))*(Q_l - Q_r);
}
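For reference, the 1D variant above is

$$
F^{\mathrm{LxF}}_{i+1/2} \;=\; \tfrac{1}{2}\,\bigl(F(Q_l) + F(Q_r)\bigr) + \frac{\Delta x}{2\,\Delta t}\,\bigl(Q_l - Q_r\bigr),
$$

and the 2D variant replaces the factor $\Delta x/(2\,\Delta t)$ by $\Delta x/(4\,\Delta t)$, as noted in the comment on numerical diffusion.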

View File

@ -0,0 +1,33 @@
/*
This file implements the Lax-Wendroff flux
Copyright (C) 2016, 2017, 2018 SINTEF ICT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* Richtmyer / two-step Lax-Wendroff flux (Toro 2001, p 164)
*/
__device__ float3 LxW2_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
const float3 F_l = F_func(Q_l, g_);
const float3 F_r = F_func(Q_r, g_);
const float3 Q_lw2 = 0.5f*(Q_l + Q_r) + (dt_/(2.0f*dx_))*(F_l - F_r);
return F_func(Q_lw2, g_);
}
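The two-step construction above first forms the Richtmyer half-step state and then evaluates the physical flux in it:

$$
Q^{\mathrm{LW2}} \;=\; \tfrac{1}{2}\,(Q_l + Q_r) + \frac{\Delta t}{2\,\Delta x}\,\bigl(F(Q_l) - F(Q_r)\bigr),
\qquad
F^{\mathrm{LW2}}_{i+1/2} \;=\; F\bigl(Q^{\mathrm{LW2}}\bigr).
$$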

View File

@ -0,0 +1,187 @@
/*
This file implements the Weighted Average Flux
Copyright (C) 2016, 2017, 2018 SINTEF ICT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "limiters.cu"
/**
* Superbee flux limiter for WAF.
* Related to superbee limiter so that WAF_superbee(r, c) = 1 - (1-|c|)*superbee(r)
* @param r_ the ratio of upwind change (see Toro 2001, p. 203/204)
* @param c_ the courant number for wave k, dt*S_k/dx
*/
__device__ float WAF_superbee(float r_, float c_) {
// r <= 0.0
if (r_ <= 0.0f) {
return 1.0f;
}
// 0.0 <= r <= 1/2
else if (r_ <= 0.5f) {
return 1.0f - 2.0f*(1.0f - fabsf(c_))*r_;
}
// 1/2 <= r <= 1
else if (r_ <= 1.0f) {
return fabs(c_);
}
// 1 <= r <= 2
else if (r_ <= 2.0f) {
return 1.0f - (1.0f - fabsf(c_))*r_;
}
// r >= 2
else {
return 2.0f*fabsf(c_) - 1.0f;
}
}
__device__ float WAF_albada(float r_, float c_) {
if (r_ <= 0.0f) {
return 1.0f;
}
else {
return 1.0f - (1.0f - fabsf(c_)) * r_ * (1.0f + r_) / (1.0f + r_*r_);
}
}
__device__ float WAF_minbee(float r_, float c_) {
r_ = fmaxf(-1.0f, fminf(2.0f, r_));
if (r_ <= 0.0f) {
return 1.0f;
}
if (r_ >= 0.0f && r_ <= 1.0f) {
return 1.0f - (1.0f - fabsf(c_)) * r_;
}
else {
return fabsf(c_);
}
}
__device__ float WAF_minmod(float r_, float c_) {
return 1.0f - (1.0f - fabsf(c_)) * fmaxf(0.0f, fminf(1.0f, r_));
}
__device__ float limiterToWAFLimiter(float r_, float c_) {
return 1.0f - (1.0f - fabsf(c_))*r_;
}
__device__ float desingularize(float x_, float eps_) {
return copysign(1.0f, x_)*fmaxf(fabsf(x_), fminf(x_*x_/(2.0f*eps_)+0.5f*eps_, eps_));
}
// Compute h in the "star region", h^dagger
__device__ __inline__ float computeHStar(float h_l, float h_r, float u_l, float u_r, float c_l, float c_r, float g_) {
//This estimate for the h* gives rise to spurious oscillations.
//return 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
const float h_tmp = 0.5f * (c_l + c_r) + 0.25f * (u_l - u_r);
return h_tmp*h_tmp / g_;
}
/**
* Weighted average flux (Toro 2001, p 200) for interface {i+1/2}
* @param r_ The flux limiter parameter (see Toro 2001, p. 203)
* @param Q_l2 Q_{i-1}
* @param Q_l1 Q_{i}
* @param Q_r1 Q_{i+1}
* @param Q_r2 Q_{i+2}
*/
__device__ float3 WAF_1D_flux(const float3 Q_l2, const float3 Q_l1, const float3 Q_r1, const float3 Q_r2, const float g_, const float dx_, const float dt_) {
const float h_l = Q_l1.x;
const float h_r = Q_r1.x;
const float h_l2 = Q_l2.x;
const float h_r2 = Q_r2.x;
// Calculate velocities
const float u_l = Q_l1.y / h_l;
const float u_r = Q_r1.y / h_r;
const float u_l2 = Q_l2.y / h_l2;
const float u_r2 = Q_r2.y / h_r2;
const float v_l = Q_l1.z / h_l;
const float v_r = Q_r1.z / h_r;
const float v_l2 = Q_l2.z / h_l2;
const float v_r2 = Q_r2.z / h_r2;
// Estimate the potential wave speeds
const float c_l = sqrt(g_*h_l);
const float c_r = sqrt(g_*h_r);
const float c_l2 = sqrt(g_*h_l2);
const float c_r2 = sqrt(g_*h_r2);
// Compute h in the "star region", h^dagger
const float h_dag_l = computeHStar(h_l2, h_l, u_l2, u_l, c_l2, c_l, g_);
const float h_dag = computeHStar( h_l, h_r, u_l, u_r, c_l, c_r, g_);
const float h_dag_r = computeHStar( h_r, h_r2, u_r, u_r2, c_r, c_r2, g_);
const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag ) ) / h_l;
const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag ) ) / h_r;
const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
// Compute wave speed estimates
const float S_l = u_l - c_l*q_l;
const float S_r = u_r + c_r*q_r;
const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) );
const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1.0, S_star, v_l);
const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1.0, S_star, v_r);
// Estimate the fluxes in the four regions
const float3 F_1 = F_func(Q_l1, g_);
const float3 F_4 = F_func(Q_r1, g_);
const float3 F_2 = F_1 + S_l*(Q_star_l - Q_l1);
const float3 F_3 = F_4 + S_r*(Q_star_r - Q_r1);
//const float3 F_2 = F_func(Q_star_l, g_);
//const float3 F_3 = F_func(Q_star_r, g_);
// Compute the courant numbers for the waves
const float c_1 = S_l * dt_ / dx_;
const float c_2 = S_star * dt_ / dx_;
const float c_3 = S_r * dt_ / dx_;
// Compute the "upwind change" vectors for the i-3/2 and i+3/2 interfaces
const float eps = 1.0e-6f;
const float r_1 = desingularize( (c_1 > 0.0f) ? (h_dag_l - h_l2) : (h_dag_r - h_r), eps) / desingularize((h_dag - h_l), eps);
const float r_2 = desingularize( (c_2 > 0.0f) ? (v_l - v_l2) : (v_r2 - v_r), eps ) / desingularize((v_r - v_l), eps);
const float r_3 = desingularize( (c_3 > 0.0f) ? (h_l - h_dag_l) : (h_r2 - h_dag_r), eps ) / desingularize((h_r - h_dag), eps);
// Compute the limiter
// We use h for the nonlinear waves, and v for the middle shear wave
const float A_1 = copysign(1.0f, c_1) * limiterToWAFLimiter(generalized_minmod(r_1, 1.9f), c_1);
const float A_2 = copysign(1.0f, c_2) * limiterToWAFLimiter(generalized_minmod(r_2, 1.9f), c_2);
const float A_3 = copysign(1.0f, c_3) * limiterToWAFLimiter(generalized_minmod(r_3, 1.9f), c_3);
//Average the fluxes
const float3 flux = 0.5f*( F_1 + F_4 )
- 0.5f*( A_1 * (F_2 - F_1)
+ A_2 * (F_3 - F_2)
+ A_3 * (F_4 - F_3) );
return flux;
}
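Collecting the pieces, the flux returned by WAF_1D_flux is the weighted average of Toro (2001, p. 200),

$$
F^{\mathrm{WAF}}_{i+1/2} \;=\; \tfrac{1}{2}\,(F_1 + F_4)
\;-\; \tfrac{1}{2}\sum_{k=1}^{3} \operatorname{sgn}(c_k)\,\phi_k\,\bigl(F_{k+1} - F_k\bigr),
\qquad
c_k = \frac{S_k\,\Delta t}{\Delta x},
$$

where $\phi_k$ is the WAF-transformed limiter, here limiterToWAFLimiter(generalized_minmod(r_k, 1.9f), c_k), so that the code's $A_k = \operatorname{sgn}(c_k)\,\phi_k$.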

129
SWESimulators/limiters.cu Normal file
View File

@ -0,0 +1,129 @@
/*
This file implements different flux and slope limiters
Copyright (C) 2016, 2017, 2018 SINTEF ICT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* Reconstructs a slope using the generalized minmod limiter based on three
* consecutive values
*/
__device__ __inline__ float minmodSlope(float left, float center, float right, float theta) {
const float backward = (center - left) * theta;
const float central = (right - left) * 0.5f;
const float forward = (right - center) * theta;
return 0.25f
*copysign(1.0f, backward)
*(copysign(1.0f, backward) + copysign(1.0f, central))
*(copysign(1.0f, central) + copysign(1.0f, forward))
*min( min(fabs(backward), fabs(central)), fabs(forward) );
}
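The compact sign/min expression above is equivalent to the generalized minmod slope

$$
\sigma_i \;=\; \operatorname{minmod}\!\bigl(\theta\,(q_i - q_{i-1}),\; \tfrac{1}{2}\,(q_{i+1} - q_{i-1}),\; \theta\,(q_{i+1} - q_i)\bigr),
$$

which returns the argument of smallest magnitude when all three slopes have the same sign, and zero otherwise.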
/**
* Reconstructs a minmod slope for a whole block along the abscissa
*/
__device__ void minmodSlopeX(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
const float theta_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
//Reconstruct slopes along x axis
{
const int j = ty;
const int l = j + 2; //Skip ghost cells
for (int i=tx; i<BLOCK_WIDTH+2; i+=BLOCK_WIDTH) {
const int k = i + 1;
for (int p=0; p<3; ++p) {
Qx[p][j][i] = minmodSlope(Q[p][l][k-1], Q[p][l][k], Q[p][l][k+1], theta_);
}
}
}
}
/**
* Reconstructs a minmod slope for a whole block along the ordinate
*/
__device__ void minmodSlopeY(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
const float theta_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
for (int j=ty; j<BLOCK_HEIGHT+2; j+=BLOCK_HEIGHT) {
const int l = j + 1;
{
const int i = tx;
const int k = i + 2; //Skip ghost cells
for (int p=0; p<3; ++p) {
Qy[p][j][i] = minmodSlope(Q[p][l-1][k], Q[p][l][k], Q[p][l+1][k], theta_);
}
}
}
}
__device__ float monotonized_central(float r_) {
return fmaxf(0.0f, fminf(2.0f, fminf(2.0f*r_, 0.5f*(1.0f+r_))));
}
__device__ float osher(float r_, float beta_) {
return fmaxf(0.0f, fminf(beta_, r_));
}
__device__ float sweby(float r_, float beta_) {
return fmaxf(0.0f, fmaxf(fminf(r_, beta_), fminf(beta_*r_, 1.0f)));
}
__device__ float minmod(float r_) {
return fmaxf(0.0f, fminf(1.0f, r_));
}
__device__ float generalized_minmod(float r_, float theta_) {
return fmaxf(0.0f, fminf(theta_*r_, fminf( (1.0f + r_) / 2.0f, theta_)));
}
__device__ float superbee(float r_) {
return fmaxf(0.0f, fmaxf(fminf(2.0f*r_, 1.0f), fminf(r_, 2.0f)));
}
__device__ float vanAlbada1(float r_) {
return (r_*r_ + r_) / (r_*r_ + 1.0f);
}
__device__ float vanAlbada2(float r_) {
return 2.0f*r_ / (r_*r_ + 1.0f);
}
__device__ float vanLeer(float r_) {
return (r_ + fabsf(r_)) / (1.0f + fabsf(r_));
}