mirror of
https://github.com/smyalygames/FiniteVolumeGPU.git
synced 2025-05-18 06:24:13 +02:00
commit
90df79fa9b
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -1,108 +1,261 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
This python module implements the different helper functions and
|
||||
classes
|
||||
|
||||
Copyright (C) 2018 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import time
|
||||
import re
|
||||
import io
|
||||
import hashlib
|
||||
import logging
|
||||
|
||||
import pycuda.compiler as cuda_compiler
|
||||
import pycuda.gpuarray
|
||||
import pycuda.driver as cuda
|
||||
|
||||
|
||||
|
||||
"""
|
||||
Class which keeps track of time spent for a section of code
|
||||
"""
|
||||
class Timer(object):
|
||||
def __init__(self, tag, log_level=logging.DEBUG):
|
||||
self.tag = tag
|
||||
self.log_level = log_level
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
def __enter__(self):
|
||||
self.start = time.time()
|
||||
return self
|
||||
|
||||
def __exit__(self, *args):
|
||||
self.end = time.time()
|
||||
self.secs = self.end - self.start
|
||||
self.msecs = self.secs * 1000 # millisecs
|
||||
self.logger.log(self.log_level, "%s: %f ms", self.tag, self.msecs)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
"""
|
||||
Class which keeps track of the CUDA context and some helper functions
|
||||
"""
|
||||
class CudaContext(object):
|
||||
def __init__(self, verbose=True, blocking=False):
|
||||
|
||||
def __init__(self, verbose=True, blocking=False, use_cache=True):
|
||||
self.verbose = verbose
|
||||
self.blocking = blocking
|
||||
self.use_cache = use_cache
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.kernels = {}
|
||||
|
||||
self.module_path = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
#Initialize cuda (must be first call to PyCUDA)
|
||||
cuda.init(flags=0)
|
||||
|
||||
if (self.verbose):
|
||||
print("CUDA version " + str(cuda.get_version()))
|
||||
print("Driver version " + str(cuda.get_driver_version()))
|
||||
#Print some info about CUDA
|
||||
self.logger.info("CUDA version %s", str(cuda.get_version()))
|
||||
self.logger.info("Driver version %s", str(cuda.get_driver_version()))
|
||||
|
||||
self.cuda_device = cuda.Device(0)
|
||||
if (self.verbose):
|
||||
print("Using " + self.cuda_device.name())
|
||||
print(" => compute capability: " + str(self.cuda_device.compute_capability()))
|
||||
print(" => memory: " + str(self.cuda_device.total_memory() / (1024*1024)) + " MB")
|
||||
self.logger.info("Using '%s' GPU", self.cuda_device.name())
|
||||
self.logger.debug(" => compute capability: %s", str(self.cuda_device.compute_capability()))
|
||||
self.logger.debug(" => memory: %d MB", self.cuda_device.total_memory() / (1024*1024))
|
||||
|
||||
# Create the CUDA context
|
||||
if (self.blocking):
|
||||
self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_BLOCKING_SYNC)
|
||||
if (self.verbose):
|
||||
print("=== WARNING ===")
|
||||
print("Using blocking context")
|
||||
print("=== WARNING ===")
|
||||
self.logger.warning("Using blocking context")
|
||||
else:
|
||||
self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_AUTO)
|
||||
|
||||
if (self.verbose):
|
||||
print("Created context <" + str(self.cuda_context.handle) + ">")
|
||||
self.logger.info("Created context handle <%s>", str(self.cuda_context.handle))
|
||||
|
||||
#Create cache dir for cubin files
|
||||
if (self.use_cache):
|
||||
self.cache_path = os.path.join(self.module_path, "cuda_cache")
|
||||
if not os.path.isdir(self.cache_path):
|
||||
os.mkdir(self.cache_path)
|
||||
self.logger.debug("Using CUDA cache dir %s", self.cache_path)
|
||||
|
||||
|
||||
def __del__(self, *args):
|
||||
if self.verbose:
|
||||
print("Cleaning up CUDA context <" + str(self.cuda_context.handle) + ">")
|
||||
self.logger.info("Cleaning up CUDA context handle <%s>", str(self.cuda_context.handle))
|
||||
|
||||
# Loop over all contexts in stack, and remove "this"
|
||||
other_contexts = []
|
||||
while (cuda.Context.get_current() != None):
|
||||
context = cuda.Context.get_current()
|
||||
if (self.verbose):
|
||||
if (context.handle != self.cuda_context.handle):
|
||||
print(" `-> <" + str(self.cuda_context.handle) + "> Popping context <" + str(context.handle) + "> which we do not own")
|
||||
other_contexts = [context] + other_contexts
|
||||
cuda.Context.pop()
|
||||
else:
|
||||
print(" `-> <" + str(self.cuda_context.handle) + "> Popping context <" + str(context.handle) + "> (ourselves)")
|
||||
cuda.Context.pop()
|
||||
if (context.handle != self.cuda_context.handle):
|
||||
self.logger.debug("<%s> Popping <%s> (*not* ours)", str(self.cuda_context.handle), str(context.handle))
|
||||
other_contexts = [context] + other_contexts
|
||||
cuda.Context.pop()
|
||||
else:
|
||||
self.logger.debug("<%s> Popping <%s> (ours)", str(self.cuda_context.handle), str(context.handle))
|
||||
cuda.Context.pop()
|
||||
|
||||
# Add all the contexts we popped that were not our own
|
||||
for context in other_contexts:
|
||||
if (self.verbose):
|
||||
print(" `-> <" + str(self.cuda_context.handle) + "> Pushing <" + str(context.handle) + ">")
|
||||
self.logger.debug("<%s> Pushing <%s>", str(self.cuda_context.handle), str(context.handle))
|
||||
cuda.Context.push(context)
|
||||
|
||||
if (self.verbose):
|
||||
print(" `-> <" + str(self.cuda_context.handle) + "> Detaching context")
|
||||
self.logger.debug("<%s> Detaching", str(self.cuda_context.handle))
|
||||
self.cuda_context.detach()
|
||||
|
||||
|
||||
|
||||
def __str__(self):
|
||||
return "CudaContext id " + str(self.cuda_context.handle)
|
||||
|
||||
|
||||
"""
|
||||
Reads a text file and creates an OpenCL kernel from that
|
||||
"""
|
||||
def get_kernel(self, kernel_filename, block_width, block_height):
|
||||
# Generate a kernel ID for our cache
|
||||
module_path = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
kernel_hash = ""
|
||||
def hash_kernel(kernel_filename, include_dirs, verbose=False):
|
||||
# Generate a kernel ID for our caches
|
||||
num_includes = 0
|
||||
max_includes = 100
|
||||
kernel_hasher = hashlib.md5()
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Loop over file and includes, and check if something has changed
|
||||
files = [kernel_filename]
|
||||
while len(files):
|
||||
filename = os.path.join(module_path, files.pop())
|
||||
modified = os.path.getmtime(filename)
|
||||
with open(filename, "r") as file:
|
||||
file_str = file.read()
|
||||
file_hash = filename + "_" + str(hash(file_str)) + ":" + str(modified) + "--"
|
||||
includes = re.findall('^\W*#include\W+(.+?)\W*$', file_str, re.M)
|
||||
files = files + includes #WARNING FIXME This will not work with circular includes
|
||||
|
||||
if (num_includes > max_includes):
|
||||
raise("Maximum number of includes reached - circular include in {:}?".format(kernel_filename))
|
||||
|
||||
filename = files.pop()
|
||||
|
||||
logger.debug("Hashing %s", filename)
|
||||
|
||||
kernel_hash = kernel_hash + file_hash
|
||||
modified = os.path.getmtime(filename)
|
||||
|
||||
# Open the file
|
||||
with io.open(filename, "r") as file:
|
||||
|
||||
# Search for #inclue <something> and also hash the file
|
||||
file_str = file.read()
|
||||
kernel_hasher.update(file_str.encode('utf-8'))
|
||||
kernel_hasher.update(str(modified).encode('utf-8'))
|
||||
|
||||
#Find all includes
|
||||
includes = re.findall('^\W*#include\W+(.+?)\W*$', file_str, re.M)
|
||||
|
||||
# Loop over everything that looks like an include
|
||||
for include_file in includes:
|
||||
|
||||
#Search through include directories for the file
|
||||
file_path = os.path.dirname(filename)
|
||||
for include_path in [file_path] + include_dirs:
|
||||
|
||||
# If we find it, add it to list of files to check
|
||||
temp_path = os.path.join(include_path, include_file)
|
||||
if (os.path.isfile(temp_path)):
|
||||
files = files + [temp_path]
|
||||
num_includes = num_includes + 1 #For circular includes...
|
||||
break
|
||||
|
||||
return kernel_hasher.hexdigest()
|
||||
|
||||
# Recompile kernel if file or includes have changed
|
||||
if (kernel_hash not in self.kernels.keys()):
|
||||
#Create define string
|
||||
define_string = "#define block_width " + str(block_width) + "\n"
|
||||
define_string += "#define block_height " + str(block_height) + "\n\n"
|
||||
"""
|
||||
Reads a text file and creates an OpenCL kernel from that
|
||||
"""
|
||||
def get_prepared_kernel(self, kernel_filename, kernel_function_name, \
|
||||
prepared_call_args, \
|
||||
include_dirs=[], verbose=False, no_extern_c=True,
|
||||
**kwargs):
|
||||
"""
|
||||
Helper function to print compilation output
|
||||
"""
|
||||
def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
|
||||
self.logger.debug("Compilation returned %s", str(compile_success_bool))
|
||||
if info_str:
|
||||
self.logger.debug("Info: %s", info_str)
|
||||
if error_str:
|
||||
self.logger.debug("Error: %s", error_str)
|
||||
|
||||
self.logger.debug("Getting %s", kernel_filename)
|
||||
|
||||
kernel_string = define_string + '#include "' + os.path.join(module_path, kernel_filename) + '"'
|
||||
self.kernels[kernel_hash] = cuda_compiler.SourceModule(kernel_string, include_dirs=[module_path])
|
||||
# Create a hash of the kernel (and its includes)
|
||||
kwargs_hasher = hashlib.md5()
|
||||
kwargs_hasher.update(str(kwargs).encode('utf-8'));
|
||||
kwargs_hash = kwargs_hasher.hexdigest()
|
||||
kwargs_hasher = None
|
||||
root, ext = os.path.splitext(kernel_filename)
|
||||
kernel_hash = root \
|
||||
+ "_" + CudaContext.hash_kernel( \
|
||||
os.path.join(self.module_path, kernel_filename), \
|
||||
include_dirs=[self.module_path] + include_dirs, \
|
||||
verbose=verbose) \
|
||||
+ "_" + kwargs_hash \
|
||||
+ ext
|
||||
cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)
|
||||
|
||||
# If we have the kernel in our hashmap, return it
|
||||
if (kernel_hash in self.kernels.keys()):
|
||||
self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash)
|
||||
return self.kernels[kernel_hash]
|
||||
|
||||
# If we have it on disk, return it
|
||||
elif (self.use_cache and os.path.isfile(cached_kernel_filename)):
|
||||
self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash)
|
||||
|
||||
with io.open(cached_kernel_filename, "rb") as file:
|
||||
file_str = file.read()
|
||||
module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler)
|
||||
|
||||
kernel = module.get_function(kernel_function_name)
|
||||
kernel.prepare(prepared_call_args)
|
||||
self.kernels[kernel_hash] = kernel
|
||||
return kernel
|
||||
|
||||
return self.kernels[kernel_hash]
|
||||
# Otherwise, compile it from source
|
||||
else:
|
||||
self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash)
|
||||
|
||||
#Create kernel string
|
||||
kernel_string = ""
|
||||
for key, value in kwargs.items():
|
||||
kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
|
||||
kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename))
|
||||
if (self.use_cache):
|
||||
with io.open(cached_kernel_filename + ".txt", "w") as file:
|
||||
file.write(kernel_string)
|
||||
|
||||
|
||||
with Timer("compiler") as timer:
|
||||
cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, no_extern_c=no_extern_c, cache_dir=False)
|
||||
module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler)
|
||||
if (self.use_cache):
|
||||
with io.open(cached_kernel_filename, "wb") as file:
|
||||
file.write(cubin)
|
||||
|
||||
kernel = module.get_function(kernel_function_name)
|
||||
kernel.prepare(prepared_call_args)
|
||||
self.kernels[kernel_hash] = kernel
|
||||
|
||||
|
||||
return kernel
|
||||
|
||||
"""
|
||||
Clears the kernel cache (useful for debugging & development)
|
||||
@ -113,21 +266,6 @@ class CudaContext(object):
|
||||
|
||||
|
||||
|
||||
class Timer(object):
|
||||
def __init__(self, tag, verbose=True):
|
||||
self.verbose = verbose
|
||||
self.tag = tag
|
||||
|
||||
def __enter__(self):
|
||||
self.start = time.time()
|
||||
return self
|
||||
|
||||
def __exit__(self, *args):
|
||||
self.end = time.time()
|
||||
self.secs = self.end - self.start
|
||||
self.msecs = self.secs * 1000 # millisecs
|
||||
if self.verbose:
|
||||
print("=> " + self.tag + ' %f ms' % self.msecs)
|
||||
|
||||
|
||||
|
||||
@ -139,12 +277,14 @@ class CUDAArray2D:
|
||||
Uploads initial data to the CL device
|
||||
"""
|
||||
def __init__(self, stream, nx, ny, halo_x, halo_y, data):
|
||||
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.nx = nx
|
||||
self.ny = ny
|
||||
self.nx_halo = nx + 2*halo_x
|
||||
self.ny_halo = ny + 2*halo_y
|
||||
|
||||
self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny)
|
||||
|
||||
#Make sure data is in proper format
|
||||
assert np.issubdtype(data.dtype, np.float32), "Wrong datatype: %s" % str(data.dtype)
|
||||
assert not np.isfortran(data), "Wrong datatype (Fortran, expected C)"
|
||||
@ -156,17 +296,26 @@ class CUDAArray2D:
|
||||
self.bytes_per_float = data.itemsize
|
||||
assert(self.bytes_per_float == 4)
|
||||
self.pitch = np.int32((self.nx_halo)*self.bytes_per_float)
|
||||
self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
|
||||
|
||||
|
||||
def __del__(self, *args):
|
||||
self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
|
||||
self.data.gpudata.free()
|
||||
self.data = None
|
||||
|
||||
"""
|
||||
Enables downloading data from CL device to Python
|
||||
"""
|
||||
def download(self, stream, async=False):
|
||||
|
||||
#Copy data from device to host
|
||||
if (async):
|
||||
self.logger.debug("Buffer <%s> [%dx%d]: Downloading async ", int(self.data.gpudata), self.nx, self.ny)
|
||||
host_data = self.data.get_async(stream=stream)
|
||||
return host_data
|
||||
else:
|
||||
self.logger.debug("Buffer <%s> [%dx%d]: Downloading synchronously", int(self.data.gpudata), self.nx, self.ny)
|
||||
host_data = self.data.get(stream=stream)#, pagelocked=True) # pagelocked causes crash on windows at least
|
||||
return host_data
|
||||
|
||||
|
@ -21,13 +21,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
"""
|
||||
|
||||
#Import packages we need
|
||||
import numpy as np
|
||||
|
||||
import pycuda.compiler as cuda_compiler
|
||||
import pycuda.gpuarray
|
||||
import pycuda.driver as cuda
|
||||
|
||||
from SWESimulators import Common
|
||||
from SWESimulators import Simulator
|
||||
|
||||
|
||||
|
||||
@ -42,7 +36,7 @@ from SWESimulators import Common
|
||||
"""
|
||||
Class that solves the SW equations
|
||||
"""
|
||||
class FORCE:
|
||||
class FORCE (Simulator.BaseSimulator):
|
||||
|
||||
"""
|
||||
Initialization routine
|
||||
@ -63,80 +57,40 @@ class FORCE:
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width=16, block_height=16):
|
||||
#Create a CUDA stream
|
||||
self.stream = cuda.Stream()
|
||||
|
||||
# Call super constructor
|
||||
super().__init__(context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
1, 1, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width, block_height);
|
||||
|
||||
#Get kernels
|
||||
self.force_module = context.get_kernel("FORCE_kernel.cu", block_width, block_height)
|
||||
self.force_kernel = self.force_module.get_function("FORCEKernel")
|
||||
self.force_kernel.prepare("iiffffPiPiPiPiPiPi")
|
||||
|
||||
#Create data by uploading to device
|
||||
ghost_cells_x = 1
|
||||
ghost_cells_y = 1
|
||||
self.data = Common.SWEDataArakawaA(self.stream, \
|
||||
nx, ny, \
|
||||
ghost_cells_x, ghost_cells_y, \
|
||||
h0, hu0, hv0)
|
||||
|
||||
#Save input parameters
|
||||
#Notice that we need to specify them in the correct dataformat for the
|
||||
#OpenCL kernel
|
||||
self.nx = np.int32(nx)
|
||||
self.ny = np.int32(ny)
|
||||
self.dx = np.float32(dx)
|
||||
self.dy = np.float32(dy)
|
||||
self.dt = np.float32(dt)
|
||||
self.g = np.float32(g)
|
||||
|
||||
#Initialize time
|
||||
self.t = np.float32(0.0)
|
||||
|
||||
#Compute kernel launch parameters
|
||||
self.local_size = (block_width, block_height, 1)
|
||||
self.global_size = ( \
|
||||
int(np.ceil(self.nx / float(self.local_size[0]))), \
|
||||
int(np.ceil(self.ny / float(self.local_size[1]))) \
|
||||
)
|
||||
|
||||
self.kernel = context.get_prepared_kernel("FORCE_kernel.cu", "FORCEKernel", \
|
||||
"iiffffPiPiPiPiPiPi", \
|
||||
BLOCK_WIDTH=block_width, \
|
||||
BLOCK_HEIGHT=block_height)
|
||||
|
||||
def __str__(self):
|
||||
return "First order centered"
|
||||
|
||||
|
||||
"""
|
||||
Function which steps n timesteps
|
||||
"""
|
||||
def step(self, t_end=0.0):
|
||||
n = int(t_end / self.dt + 1)
|
||||
|
||||
for i in range(0, n):
|
||||
local_dt = np.float32(min(self.dt, t_end-i*self.dt))
|
||||
|
||||
if (local_dt <= 0.0):
|
||||
break
|
||||
def simulate(self, t_end):
|
||||
return super().simulateEuler(t_end)
|
||||
|
||||
self.force_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, local_dt, \
|
||||
self.g, \
|
||||
self.data.h0.data.gpudata, self.data.h0.pitch, \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
|
||||
self.data.h1.data.gpudata, self.data.h1.pitch, \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.pitch)
|
||||
|
||||
self.t += local_dt
|
||||
|
||||
self.data.swap()
|
||||
def stepEuler(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
self.data.h0.data.gpudata, self.data.h0.pitch, \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
|
||||
self.data.h1.data.gpudata, self.data.h1.pitch, \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.pitch)
|
||||
self.data.swap()
|
||||
self.t += dt
|
||||
|
||||
return self.t, n
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def download(self):
|
||||
return self.data.download(self.stream)
|
||||
|
||||
|
@ -20,14 +20,15 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
#include "common.cu"
|
||||
#include "fluxes/FirstOrderCentered.cu"
|
||||
|
||||
|
||||
/**
|
||||
* Computes the flux along the x axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxF(float Q[3][block_height+2][block_width+2],
|
||||
float F[3][block_height+1][block_width+1],
|
||||
void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dx_, const float dt_) {
|
||||
|
||||
//Index of thread within block
|
||||
@ -38,7 +39,7 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
|
||||
{
|
||||
int j=ty;
|
||||
const int l = j + 1; //Skip ghost cells
|
||||
for (int i=tx; i<block_width+1; i+=block_width) {
|
||||
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
|
||||
const int k = i;
|
||||
|
||||
// Q at interface from the right and left
|
||||
@ -63,15 +64,15 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
|
||||
* Computes the flux along the y axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxG(float Q[3][block_height+2][block_width+2],
|
||||
float G[3][block_height+1][block_width+1],
|
||||
void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
//Compute fluxes along the y axis
|
||||
for (int j=ty; j<block_height+1; j+=block_height) {
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
const int l = j;
|
||||
{
|
||||
int i=tx;
|
||||
@ -97,6 +98,7 @@ void computeFluxG(float Q[3][block_height+2][block_width+2],
|
||||
}
|
||||
|
||||
|
||||
extern "C" {
|
||||
__global__ void FORCEKernel(
|
||||
int nx_, int ny_,
|
||||
float dx_, float dy_, float dt_,
|
||||
@ -112,8 +114,8 @@ __global__ void FORCEKernel(
|
||||
float* hu1_ptr_, int hu1_pitch_,
|
||||
float* hv1_ptr_, int hv1_pitch_) {
|
||||
|
||||
__shared__ float Q[3][block_height+2][block_width+2];
|
||||
__shared__ float F[3][block_height+1][block_width+1];
|
||||
__shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
|
||||
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
|
||||
|
||||
|
||||
//Read into shared memory
|
||||
@ -149,4 +151,6 @@ __global__ void FORCEKernel(
|
||||
hu1_ptr_, hu1_pitch_,
|
||||
hv1_ptr_, hv1_pitch_,
|
||||
Q, nx_, ny_);
|
||||
}
|
||||
}
|
||||
|
||||
} // extern "C"
|
@ -20,13 +20,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
"""
|
||||
|
||||
#Import packages we need
|
||||
import numpy as np
|
||||
|
||||
import pycuda.compiler as cuda_compiler
|
||||
import pycuda.gpuarray
|
||||
import pycuda.driver as cuda
|
||||
|
||||
from SWESimulators import Common
|
||||
from SWESimulators import Simulator
|
||||
|
||||
|
||||
|
||||
@ -37,7 +31,7 @@ from SWESimulators import Common
|
||||
"""
|
||||
Class that solves the SW equations using the Harten-Lax -van Leer approximate Riemann solver
|
||||
"""
|
||||
class HLL:
|
||||
class HLL (Simulator.BaseSimulator):
|
||||
|
||||
"""
|
||||
Initialization routine
|
||||
@ -58,80 +52,40 @@ class HLL:
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width=16, block_height=16):
|
||||
#Create a CUDA stream
|
||||
self.stream = cuda.Stream()
|
||||
|
||||
# Call super constructor
|
||||
super().__init__(context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
1, 1, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width, block_height);
|
||||
|
||||
#Get kernels
|
||||
self.hll_module = context.get_kernel("HLL_kernel.cu", block_width, block_height)
|
||||
self.hll_kernel = self.hll_module.get_function("HLLKernel")
|
||||
self.hll_kernel.prepare("iiffffPiPiPiPiPiPi")
|
||||
|
||||
#Create data by uploading to device
|
||||
ghost_cells_x = 1
|
||||
ghost_cells_y = 1
|
||||
self.data = Common.SWEDataArakawaA(self.stream, \
|
||||
nx, ny, \
|
||||
ghost_cells_x, ghost_cells_y, \
|
||||
h0, hu0, hv0)
|
||||
|
||||
#Save input parameters
|
||||
#Notice that we need to specify them in the correct dataformat for the
|
||||
#OpenCL kernel
|
||||
self.nx = np.int32(nx)
|
||||
self.ny = np.int32(ny)
|
||||
self.dx = np.float32(dx)
|
||||
self.dy = np.float32(dy)
|
||||
self.dt = np.float32(dt)
|
||||
self.g = np.float32(g)
|
||||
|
||||
#Initialize time
|
||||
self.t = np.float32(0.0)
|
||||
|
||||
#Compute kernel launch parameters
|
||||
self.local_size = (block_width, block_height, 1)
|
||||
self.global_size = ( \
|
||||
int(np.ceil(self.nx / float(self.local_size[0]))), \
|
||||
int(np.ceil(self.ny / float(self.local_size[1]))) \
|
||||
)
|
||||
|
||||
self.kernel = context.get_prepared_kernel("HLL_kernel.cu", "HLLKernel", \
|
||||
"iiffffPiPiPiPiPiPi", \
|
||||
BLOCK_WIDTH=block_width, \
|
||||
BLOCK_HEIGHT=block_height)
|
||||
|
||||
def __str__(self):
|
||||
return "Harten-Lax-van Leer"
|
||||
|
||||
|
||||
"""
|
||||
Function which steps n timesteps
|
||||
"""
|
||||
def step(self, t_end=0.0):
|
||||
n = int(t_end / self.dt + 1)
|
||||
def simulate(self, t_end):
|
||||
return super().simulateEuler(t_end)
|
||||
|
||||
for i in range(0, n):
|
||||
local_dt = np.float32(min(self.dt, t_end-i*self.dt))
|
||||
|
||||
if (local_dt <= 0.0):
|
||||
break
|
||||
def stepEuler(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
self.data.h0.data.gpudata, self.data.h0.pitch, \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
|
||||
self.data.h1.data.gpudata, self.data.h1.pitch, \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.pitch)
|
||||
self.data.swap()
|
||||
self.t += dt
|
||||
|
||||
self.hll_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, local_dt, \
|
||||
self.g, \
|
||||
self.data.h0.data.gpudata, self.data.h0.pitch, \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
|
||||
self.data.h1.data.gpudata, self.data.h1.pitch, \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.pitch)
|
||||
|
||||
self.t += local_dt
|
||||
|
||||
self.data.swap()
|
||||
|
||||
return self.t, n
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def download(self):
|
||||
return self.data.download(self.stream)
|
||||
|
||||
|
@ -21,12 +21,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
#Import packages we need
|
||||
import numpy as np
|
||||
|
||||
import pycuda.compiler as cuda_compiler
|
||||
import pycuda.gpuarray
|
||||
import pycuda.driver as cuda
|
||||
|
||||
from SWESimulators import Common
|
||||
from SWESimulators import Simulator
|
||||
|
||||
|
||||
|
||||
@ -39,7 +34,7 @@ from SWESimulators import Common
|
||||
"""
|
||||
Class that solves the SW equations using the Forward-Backward linear scheme
|
||||
"""
|
||||
class HLL2:
|
||||
class HLL2 (Simulator.BaseSimulator):
|
||||
|
||||
"""
|
||||
Initialization routine
|
||||
@ -61,99 +56,62 @@ class HLL2:
|
||||
g, \
|
||||
theta=1.8, \
|
||||
block_width=16, block_height=16):
|
||||
#Create a CUDA stream
|
||||
self.stream = cuda.Stream()
|
||||
|
||||
# Call super constructor
|
||||
super().__init__(context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
2, 2, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width, block_height);
|
||||
|
||||
self.theta = np.float32(theta)
|
||||
|
||||
#Get kernels
|
||||
self.hll2_module = context.get_kernel("HLL2_kernel.cu", block_width, block_height)
|
||||
self.hll2_kernel = self.hll2_module.get_function("HLL2Kernel")
|
||||
self.hll2_kernel.prepare("iifffffiPiPiPiPiPiPi")
|
||||
self.kernel = context.get_prepared_kernel("HLL2_kernel.cu", "HLL2Kernel", \
|
||||
"iifffffiPiPiPiPiPiPi", \
|
||||
BLOCK_WIDTH=block_width, \
|
||||
BLOCK_HEIGHT=block_height)
|
||||
|
||||
#Create data by uploading to device
|
||||
ghost_cells_x = 2
|
||||
ghost_cells_y = 2
|
||||
self.data = Common.SWEDataArakawaA(self.stream, \
|
||||
nx, ny, \
|
||||
ghost_cells_x, ghost_cells_y, \
|
||||
h0, hu0, hv0)
|
||||
|
||||
#Save input parameters
|
||||
#Notice that we need to specify them in the correct dataformat for the
|
||||
#OpenCL kernel
|
||||
self.nx = np.int32(nx)
|
||||
self.ny = np.int32(ny)
|
||||
self.dx = np.float32(dx)
|
||||
self.dy = np.float32(dy)
|
||||
self.dt = np.float32(dt)
|
||||
self.g = np.float32(g)
|
||||
self.theta = np.float32(theta)
|
||||
|
||||
#Initialize time
|
||||
self.t = np.float32(0.0)
|
||||
|
||||
#Compute kernel launch parameters
|
||||
self.local_size = (block_width, block_height, 1)
|
||||
self.global_size = ( \
|
||||
int(np.ceil(self.nx / float(self.local_size[0]))), \
|
||||
int(np.ceil(self.ny / float(self.local_size[1]))) \
|
||||
)
|
||||
|
||||
|
||||
def __str__(self):
|
||||
return "Harten-Lax-van Leer (2nd order)"
|
||||
|
||||
|
||||
"""
|
||||
Function which steps n timesteps
|
||||
"""
|
||||
def step(self, t_end=0.0):
|
||||
n = int(t_end / (2.0*self.dt) + 1)
|
||||
def simulate(self, t_end):
|
||||
return super().simulateDimsplit(t_end)
|
||||
|
||||
for i in range(0, n):
|
||||
#Dimensional splitting: second order accurate for every other timestep,
|
||||
#thus run two timesteps in a go
|
||||
|
||||
local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt))
|
||||
if (local_dt <= 0.0):
|
||||
break
|
||||
def stepEuler(self, dt):
|
||||
return self.stepDimsplitXY(dt)
|
||||
|
||||
#Along X, then Y
|
||||
self.hll2_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, local_dt, \
|
||||
self.g, \
|
||||
self.theta, \
|
||||
np.int32(0), \
|
||||
self.data.h0.data.gpudata, self.data.h0.pitch, \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
|
||||
self.data.h1.data.gpudata, self.data.h1.pitch, \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.pitch)
|
||||
self.data.swap()
|
||||
def stepDimsplitXY(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
self.theta, \
|
||||
np.int32(0), \
|
||||
self.data.h0.data.gpudata, self.data.h0.pitch, \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
|
||||
self.data.h1.data.gpudata, self.data.h1.pitch, \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.pitch)
|
||||
self.data.swap()
|
||||
self.t += dt
|
||||
|
||||
#Along Y, then X
|
||||
self.hll2_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, local_dt, \
|
||||
self.g, \
|
||||
self.theta, \
|
||||
np.int32(1), \
|
||||
self.data.h0.data.gpudata, self.data.h0.pitch, \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
|
||||
self.data.h1.data.gpudata, self.data.h1.pitch, \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.pitch)
|
||||
self.data.swap()
|
||||
|
||||
self.t += local_dt
|
||||
|
||||
|
||||
return self.t, 2*n
|
||||
|
||||
|
||||
|
||||
def download(self):
|
||||
return self.data.download(self.stream)
|
||||
|
||||
def stepDimsplitYX(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
self.theta, \
|
||||
np.int32(1), \
|
||||
self.data.h0.data.gpudata, self.data.h0.pitch, \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
|
||||
self.data.h1.data.gpudata, self.data.h1.pitch, \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.pitch)
|
||||
self.data.swap()
|
||||
self.t += dt
|
||||
|
@ -19,6 +19,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
#include "common.cu"
|
||||
#include "limiters.cu"
|
||||
#include "fluxes/HartenLaxVanLeer.cu"
|
||||
|
||||
|
||||
|
||||
@ -30,9 +32,9 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
* Computes the flux along the x axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
float Qx[3][block_height+2][block_width+2],
|
||||
float F[3][block_height+1][block_width+1],
|
||||
void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dx_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
@ -41,7 +43,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
{
|
||||
const int j=ty;
|
||||
const int l = j + 2; //Skip ghost cells
|
||||
for (int i=tx; i<block_width+1; i+=block_width) {
|
||||
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
|
||||
const int k = i + 1;
|
||||
// Reconstruct point values of Q at the left and right hand side
|
||||
// of the cell for both the left (i) and right (i+1) cell
|
||||
@ -64,7 +66,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
const float3 Q_l_bar = Q_lr + dt_/(2.0f*dx_) * (F_func(Q_ll, g_) - F_func(Q_lr, g_));
|
||||
|
||||
// Compute flux based on prediction
|
||||
const float3 flux = HLLC_flux(Q_l_bar, Q_r_bar, g_);
|
||||
const float3 flux = HLL_flux(Q_l_bar, Q_r_bar, g_);
|
||||
|
||||
//Write to shared memory
|
||||
F[0][j][i] = flux.x;
|
||||
@ -82,15 +84,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
* Computes the flux along the x axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxG(float Q[3][block_height+4][block_width+4],
|
||||
float Qy[3][block_height+2][block_width+2],
|
||||
float G[3][block_height+1][block_width+1],
|
||||
void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
for (int j=ty; j<block_height+1; j+=block_height) {
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
const int l = j + 1;
|
||||
{
|
||||
int i=tx;
|
||||
@ -134,7 +136,7 @@ void computeFluxG(float Q[3][block_height+4][block_width+4],
|
||||
|
||||
|
||||
|
||||
|
||||
extern "C" {
|
||||
__global__ void HLL2Kernel(
|
||||
int nx_, int ny_,
|
||||
float dx_, float dy_, float dt_,
|
||||
@ -155,9 +157,9 @@ __global__ void HLL2Kernel(
|
||||
float* hv1_ptr_, int hv1_pitch_) {
|
||||
|
||||
//Shared memory variables
|
||||
__shared__ float Q[3][block_height+4][block_width+4];
|
||||
__shared__ float Qx[3][block_height+2][block_width+2];
|
||||
__shared__ float F[3][block_height+1][block_width+1];
|
||||
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
|
||||
__shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
|
||||
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
|
||||
|
||||
|
||||
|
||||
@ -226,4 +228,6 @@ __global__ void HLL2Kernel(
|
||||
hu1_ptr_, hu1_pitch_,
|
||||
hv1_ptr_, hv1_pitch_,
|
||||
Q, nx_, ny_);
|
||||
}
|
||||
}
|
||||
|
||||
} // extern "C"
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
This OpenCL kernel implements the HLL flux
|
||||
This GPU kernel implements the HLL flux
|
||||
|
||||
Copyright (C) 2016 SINTEF ICT
|
||||
|
||||
@ -20,6 +20,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
#include "common.cu"
|
||||
#include "fluxes/HartenLaxVanLeer.cu"
|
||||
|
||||
|
||||
|
||||
@ -29,8 +30,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
* Computes the flux along the x axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxF(float Q[3][block_height+2][block_width+2],
|
||||
float F[3][block_height+1][block_width+1],
|
||||
void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
@ -39,7 +40,7 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
|
||||
{
|
||||
const int j=ty;
|
||||
const int l = j + 1; //Skip ghost cells
|
||||
for (int i=tx; i<block_width+1; i+=block_width) {
|
||||
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
|
||||
const int k = i;
|
||||
|
||||
const float3 Q_l = make_float3(Q[0][l][k ], Q[1][l][k ], Q[2][l][k ]);
|
||||
@ -63,14 +64,14 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
|
||||
* Computes the flux along the y axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxG(float Q[3][block_height+2][block_width+2],
|
||||
float G[3][block_height+1][block_width+1],
|
||||
void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
for (int j=ty; j<block_height+1; j+=block_height) {
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
const int l = j;
|
||||
{
|
||||
const int i=tx;
|
||||
@ -103,7 +104,8 @@ void computeFluxG(float Q[3][block_height+2][block_width+2],
|
||||
|
||||
|
||||
|
||||
|
||||
extern "C" {
|
||||
|
||||
__global__ void HLLKernel(
|
||||
int nx_, int ny_,
|
||||
float dx_, float dy_, float dt_,
|
||||
@ -118,6 +120,10 @@ __global__ void HLLKernel(
|
||||
float* h1_ptr_, int h1_pitch_,
|
||||
float* hu1_ptr_, int hu1_pitch_,
|
||||
float* hv1_ptr_, int hv1_pitch_) {
|
||||
|
||||
const int block_width = BLOCK_WIDTH;
|
||||
const int block_height = BLOCK_HEIGHT;
|
||||
|
||||
//Shared memory variables
|
||||
__shared__ float Q[3][block_height+2][block_width+2];
|
||||
__shared__ float F[3][block_height+1][block_width+1];
|
||||
@ -159,4 +165,6 @@ __global__ void HLLKernel(
|
||||
hu1_ptr_, hu1_pitch_,
|
||||
hv1_ptr_, hv1_pitch_,
|
||||
Q, nx_, ny_);
|
||||
}
|
||||
}
|
||||
|
||||
} // extern "C"
|
80
SWESimulators/IPythonMagic.py
Normal file
80
SWESimulators/IPythonMagic.py
Normal file
@ -0,0 +1,80 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
This python module implements helpers for IPython / Jupyter and CUDA
|
||||
|
||||
Copyright (C) 2018 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from IPython.core.magic import line_magic, Magics, magics_class
|
||||
import pycuda.driver as cuda
|
||||
|
||||
|
||||
|
||||
@magics_class
|
||||
class CudaContextHandler(Magics):
|
||||
@line_magic
|
||||
def cuda_context_handler(self, context_name):
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
self.logger.debug("Registering %s as a global context", context_name)
|
||||
|
||||
if context_name in self.shell.user_ns.keys():
|
||||
self.logger.debug("Context already registered! Ignoring")
|
||||
return
|
||||
else:
|
||||
self.logger.debug("Creating context")
|
||||
self.shell.ex(context_name + " = Common.CudaContext(verbose=True, blocking=False)")
|
||||
|
||||
# this function will be called on exceptions in any cell
|
||||
def custom_exc(shell, etype, evalue, tb, tb_offset=None):
|
||||
self.logger.exception("Exception caught: Resetting to CUDA context %s", context_name)
|
||||
while (cuda.Context.get_current() != None):
|
||||
context = cuda.Context.get_current()
|
||||
self.logger.info("Popping <%s>", str(context.handle))
|
||||
cuda.Context.pop()
|
||||
|
||||
if context_name in self.shell.user_ns.keys():
|
||||
self.logger.info("Pushing <%s>", str(self.shell.user_ns[context_name].cuda_context.handle))
|
||||
self.shell.ex(context_name + ".cuda_context.push()")
|
||||
else:
|
||||
self.logger.error("No CUDA context called %s found (something is wrong)", context_name)
|
||||
self.logger.error("CUDA will not work now")
|
||||
|
||||
# still show the error within the notebook, don't just swallow it
|
||||
shell.showtraceback((etype, evalue, tb), tb_offset=tb_offset)
|
||||
|
||||
# this registers a custom exception handler for the whole current notebook
|
||||
get_ipython().set_custom_exc((Exception,), custom_exc)
|
||||
|
||||
|
||||
# Handle CUDA context when exiting python
|
||||
import atexit
|
||||
def exitfunc():
|
||||
self.logger.info("Exitfunc: Resetting CUDA context stack")
|
||||
while (cuda.Context.get_current() != None):
|
||||
context = cuda.Context.get_current()
|
||||
self.logger.info("`-> Popping <%s>", str(context.handle))
|
||||
cuda.Context.pop()
|
||||
atexit.register(exitfunc)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.info("Registering automatic CUDA context handling")
|
||||
logger.debug("(use %cuda_context_handler my_context to create a context called my_context")
|
||||
ip = get_ipython()
|
||||
ip.register_magics(CudaContextHandler)
|
@ -26,12 +26,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
#Import packages we need
|
||||
import numpy as np
|
||||
|
||||
import pycuda.compiler as cuda_compiler
|
||||
import pycuda.gpuarray
|
||||
import pycuda.driver as cuda
|
||||
|
||||
from SWESimulators import Common
|
||||
from SWESimulators import Simulator
|
||||
|
||||
|
||||
|
||||
@ -40,7 +35,7 @@ from SWESimulators import Common
|
||||
"""
|
||||
Class that solves the SW equations using the Forward-Backward linear scheme
|
||||
"""
|
||||
class KP07:
|
||||
class KP07 (Simulator.BaseSimulator):
|
||||
|
||||
"""
|
||||
Initialization routine
|
||||
@ -53,131 +48,64 @@ class KP07:
|
||||
dy: Grid cell spacing along y-axis (20 000 m)
|
||||
dt: Size of each timestep (90 s)
|
||||
g: Gravitational accelleration (9.81 m/s^2)
|
||||
f: Coriolis parameter (1.2e-4 s^1)
|
||||
r: Bottom friction coefficient (2.4e-3 m/s)
|
||||
wind_type: Type of wind stress, 0=Uniform along shore, 1=bell shaped along shore, 2=moving cyclone
|
||||
wind_tau0: Amplitude of wind stress (Pa)
|
||||
wind_rho: Density of sea water (1025.0 kg / m^3)
|
||||
wind_alpha: Offshore e-folding length (1/(10*dx) = 5e-6 m^-1)
|
||||
wind_xm: Maximum wind stress for bell shaped wind stress
|
||||
wind_Rc: Distance to max wind stress from center of cyclone (10dx = 200 000 m)
|
||||
wind_x0: Initial x position of moving cyclone (dx*(nx/2) - u0*3600.0*48.0)
|
||||
wind_y0: Initial y position of moving cyclone (dy*(ny/2) - v0*3600.0*48.0)
|
||||
wind_u0: Translation speed along x for moving cyclone (30.0/sqrt(5.0))
|
||||
wind_v0: Translation speed along y for moving cyclone (-0.5*u0)
|
||||
"""
|
||||
def __init__(self, \
|
||||
context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
dx, dy, dt, \
|
||||
g, theta=1.3, \
|
||||
r=0.0, use_rk2=True,
|
||||
g, \
|
||||
theta=1.3, \
|
||||
block_width=16, block_height=16):
|
||||
#Create a CUDA stream
|
||||
self.stream = cuda.Stream()
|
||||
|
||||
# Call super constructor
|
||||
super().__init__(context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
2, 2, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width, block_height);
|
||||
|
||||
self.theta = np.float32(theta)
|
||||
|
||||
#Get kernels
|
||||
self.kp07_module = context.get_kernel("KP07_kernel.cu", block_width, block_height)
|
||||
self.kp07_kernel = self.kp07_module.get_function("KP07Kernel")
|
||||
self.kp07_kernel.prepare("iiffffffiPiPiPiPiPiPi")
|
||||
self.kernel = context.get_prepared_kernel("KP07_kernel.cu", "KP07Kernel", \
|
||||
"iifffffiPiPiPiPiPiPi", \
|
||||
BLOCK_WIDTH=block_width, \
|
||||
BLOCK_HEIGHT=block_height)
|
||||
|
||||
#Create data by uploading to device
|
||||
ghost_cells_x = 2
|
||||
ghost_cells_y = 2
|
||||
self.data = Common.SWEDataArakawaA(self.stream, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
|
||||
|
||||
#Save input parameters
|
||||
#Notice that we need to specify them in the correct dataformat for the
|
||||
#OpenCL kernel
|
||||
self.nx = np.int32(nx)
|
||||
self.ny = np.int32(ny)
|
||||
self.dx = np.float32(dx)
|
||||
self.dy = np.float32(dy)
|
||||
self.dt = np.float32(dt)
|
||||
self.g = np.float32(g)
|
||||
self.theta = np.float32(theta)
|
||||
self.r = np.float32(r)
|
||||
self.use_rk2 = use_rk2
|
||||
|
||||
#Initialize time
|
||||
self.t = np.float32(0.0)
|
||||
|
||||
#Compute kernel launch parameters
|
||||
self.local_size = (block_width, block_height, 1)
|
||||
self.global_size = ( \
|
||||
int(np.ceil(self.nx / float(self.local_size[0]))), \
|
||||
int(np.ceil(self.ny / float(self.local_size[1]))) \
|
||||
)
|
||||
|
||||
|
||||
|
||||
def __str__(self):
|
||||
return "Kurganov-Petrova 2007"
|
||||
|
||||
"""
|
||||
Function which steps n timesteps
|
||||
"""
|
||||
def step(self, t_end=0.0):
|
||||
n = int(t_end / self.dt + 1)
|
||||
def simulate(self, t_end):
|
||||
return super().simulateRK(t_end, 2)
|
||||
|
||||
for i in range(0, n):
|
||||
local_dt = np.float32(min(self.dt, t_end-i*self.dt))
|
||||
|
||||
if (local_dt <= 0.0):
|
||||
break
|
||||
def substepRK(self, dt, substep):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
self.theta, \
|
||||
np.int32(substep), \
|
||||
self.data.h0.data.gpudata, self.data.h0.pitch, \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
|
||||
self.data.h1.data.gpudata, self.data.h1.pitch, \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.pitch)
|
||||
self.data.swap()
|
||||
|
||||
if (self.use_rk2):
|
||||
self.kp07_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, local_dt, \
|
||||
self.g, \
|
||||
self.theta, \
|
||||
self.r, \
|
||||
np.int32(0), \
|
||||
self.data.h0.data.gpudata, self.data.h0.pitch, \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
|
||||
self.data.h1.data.gpudata, self.data.h1.pitch, \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.pitch)
|
||||
|
||||
self.kp07_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, local_dt, \
|
||||
self.g, \
|
||||
self.theta, \
|
||||
self.r, \
|
||||
np.int32(1), \
|
||||
self.data.h1.data.gpudata, self.data.h1.pitch, \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.pitch, \
|
||||
self.data.h0.data.gpudata, self.data.h0.pitch, \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.pitch)
|
||||
else:
|
||||
self.kp07_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, local_dt, \
|
||||
self.g, \
|
||||
self.theta, \
|
||||
self.r, \
|
||||
np.int32(0), \
|
||||
self.data.h0.data.gpudata, self.data.h0.pitch, \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
|
||||
self.data.h1.data.gpudata, self.data.h1.pitch, \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.pitch)
|
||||
self.cl_data.swap()
|
||||
|
||||
self.t += local_dt
|
||||
|
||||
def stepEuler(self, dt):
|
||||
self.substepRK(dt, 0)
|
||||
self.t += dt
|
||||
|
||||
return self.t, n
|
||||
|
||||
|
||||
|
||||
def stepRK(self, dt, order):
|
||||
if (order != 2):
|
||||
raise NotImplementedError("Only second order implemented")
|
||||
self.substepRK(dt, 0)
|
||||
self.substepRK(dt, 1)
|
||||
self.t += dt
|
||||
|
||||
def download(self):
|
||||
return self.data.download(self.stream)
|
||||
|
@ -26,12 +26,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
#Import packages we need
|
||||
import numpy as np
|
||||
|
||||
import pycuda.compiler as cuda_compiler
|
||||
import pycuda.gpuarray
|
||||
import pycuda.driver as cuda
|
||||
|
||||
from SWESimulators import Common
|
||||
from SWESimulators import Simulator
|
||||
|
||||
|
||||
|
||||
@ -40,7 +35,7 @@ from SWESimulators import Common
|
||||
"""
|
||||
Class that solves the SW equations using the dimentionally split KP07 scheme
|
||||
"""
|
||||
class KP07_dimsplit:
|
||||
class KP07_dimsplit (Simulator.BaseSimulator):
|
||||
|
||||
"""
|
||||
Initialization routine
|
||||
@ -62,98 +57,63 @@ class KP07_dimsplit:
|
||||
g, \
|
||||
theta=1.3, \
|
||||
block_width=16, block_height=16):
|
||||
#Create a CUDA stream
|
||||
self.stream = cuda.Stream()
|
||||
|
||||
# Call super constructor
|
||||
super().__init__(context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
2, 2, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width, block_height);
|
||||
|
||||
self.theta = np.float32(theta)
|
||||
|
||||
#Get kernels
|
||||
self.kp07_dimsplit_module = context.get_kernel("KP07_dimsplit_kernel.cu", block_width, block_height)
|
||||
self.kp07_dimsplit_kernel = self.kp07_dimsplit_module.get_function("KP07DimsplitKernel")
|
||||
self.kp07_dimsplit_kernel.prepare("iifffffiPiPiPiPiPiPi")
|
||||
|
||||
#Create data by uploading to device
|
||||
ghost_cells_x = 2
|
||||
ghost_cells_y = 2
|
||||
self.data = Common.SWEDataArakawaA(self.stream, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
|
||||
|
||||
#Save input parameters
|
||||
#Notice that we need to specify them in the correct dataformat for the
|
||||
#OpenCL kernel
|
||||
self.nx = np.int32(nx)
|
||||
self.ny = np.int32(ny)
|
||||
self.dx = np.float32(dx)
|
||||
self.dy = np.float32(dy)
|
||||
self.dt = np.float32(dt)
|
||||
self.g = np.float32(g)
|
||||
self.theta = np.float32(theta)
|
||||
|
||||
#Initialize time
|
||||
self.t = np.float32(0.0)
|
||||
|
||||
#Compute kernel launch parameters
|
||||
self.local_size = (block_width, block_height, 1)
|
||||
self.global_size = ( \
|
||||
int(np.ceil(self.nx / float(self.local_size[0]))), \
|
||||
int(np.ceil(self.ny / float(self.local_size[1]))) \
|
||||
)
|
||||
|
||||
self.kernel = context.get_prepared_kernel("KP07_dimsplit_kernel.cu", "KP07DimsplitKernel", \
|
||||
"iifffffiPiPiPiPiPiPi", \
|
||||
BLOCK_WIDTH=block_width, \
|
||||
BLOCK_HEIGHT=block_height)
|
||||
|
||||
def __str__(self):
|
||||
return "Kurganov-Petrova 2007 dimensionally split"
|
||||
|
||||
def simulate(self, t_end):
|
||||
return super().simulateDimsplit(t_end)
|
||||
|
||||
"""
|
||||
Function which steps n timesteps
|
||||
"""
|
||||
def step(self, t_end=0.0):
|
||||
n = int(t_end / (2.0*self.dt) + 1)
|
||||
def stepEuler(self, dt):
|
||||
return self.stepDimsplitXY(dt)
|
||||
|
||||
def stepDimsplitXY(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
self.theta, \
|
||||
np.int32(0), \
|
||||
self.data.h0.data.gpudata, self.data.h0.pitch, \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
|
||||
self.data.h1.data.gpudata, self.data.h1.pitch, \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.pitch)
|
||||
self.data.swap()
|
||||
self.t += dt
|
||||
|
||||
def stepDimsplitYX(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
self.theta, \
|
||||
np.int32(1), \
|
||||
self.data.h0.data.gpudata, self.data.h0.pitch, \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
|
||||
self.data.h1.data.gpudata, self.data.h1.pitch, \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.pitch)
|
||||
self.data.swap()
|
||||
self.t += dt
|
||||
|
||||
for i in range(0, n):
|
||||
#Dimensional splitting: second order accurate for every other timestep,
|
||||
#thus run two timesteps in a go
|
||||
|
||||
#Compute timestep
|
||||
local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt))
|
||||
if (local_dt <= 0.0):
|
||||
break
|
||||
|
||||
#Along X, then Y
|
||||
self.kp07_dimsplit_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, local_dt, \
|
||||
self.g, \
|
||||
self.theta, \
|
||||
np.int32(0), \
|
||||
self.data.h0.data.gpudata, self.data.h0.pitch, \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
|
||||
self.data.h1.data.gpudata, self.data.h1.pitch, \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.pitch)
|
||||
self.data.swap()
|
||||
|
||||
#Along Y, then X
|
||||
self.kp07_dimsplit_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, local_dt, \
|
||||
self.g, \
|
||||
self.theta, \
|
||||
np.int32(1), \
|
||||
self.data.h0.data.gpudata, self.data.h0.pitch, \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
|
||||
self.data.h1.data.gpudata, self.data.h1.pitch, \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.pitch)
|
||||
self.data.swap()
|
||||
|
||||
self.t += 2.0*local_dt
|
||||
|
||||
|
||||
return self.t, 2*n
|
||||
|
||||
|
||||
|
||||
|
||||
def download(self):
|
||||
return self.data.download(self.stream)
|
||||
|
||||
|
@ -25,12 +25,14 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
#include "common.cu"
|
||||
#include "limiters.cu"
|
||||
#include "fluxes/CentralUpwind.cu"
|
||||
|
||||
|
||||
__device__
|
||||
void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
float Qx[3][block_height+2][block_width+2],
|
||||
float F[3][block_height+1][block_width+1],
|
||||
void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dx_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
@ -39,7 +41,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
{
|
||||
int j=ty;
|
||||
const int l = j + 2; //Skip ghost cells
|
||||
for (int i=tx; i<block_width+1; i+=block_width) {
|
||||
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
|
||||
const int k = i + 1;
|
||||
// Reconstruct point values of Q at the left and right hand side
|
||||
// of the cell for both the left (i) and right (i+1) cell
|
||||
@ -73,15 +75,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
}
|
||||
|
||||
__device__
|
||||
void computeFluxG(float Q[3][block_height+4][block_width+4],
|
||||
float Qy[3][block_height+2][block_width+2],
|
||||
float G[3][block_height+1][block_width+1],
|
||||
void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
for (int j=ty; j<block_height+1; j+=block_height) {
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
const int l = j + 1;
|
||||
{
|
||||
int i=tx;
|
||||
@ -125,6 +127,7 @@ void computeFluxG(float Q[3][block_height+4][block_width+4],
|
||||
/**
|
||||
* This unsplit kernel computes the 2D numerical scheme with a TVD RK2 time integration scheme
|
||||
*/
|
||||
extern "C" {
|
||||
__global__ void KP07DimsplitKernel(
|
||||
int nx_, int ny_,
|
||||
float dx_, float dy_, float dt_,
|
||||
@ -146,9 +149,9 @@ __global__ void KP07DimsplitKernel(
|
||||
|
||||
|
||||
//Shared memory variables
|
||||
__shared__ float Q[3][block_height+4][block_width+4];
|
||||
__shared__ float Qx[3][block_height+2][block_width+2];
|
||||
__shared__ float F[3][block_height+1][block_width+1];
|
||||
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
|
||||
__shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
|
||||
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
|
||||
|
||||
|
||||
|
||||
@ -217,4 +220,6 @@ __global__ void KP07DimsplitKernel(
|
||||
hu1_ptr_, hu1_pitch_,
|
||||
hv1_ptr_, hv1_pitch_,
|
||||
Q, nx_, ny_);
|
||||
}
|
||||
}
|
||||
|
||||
} // extern "C"
|
@ -25,12 +25,14 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
#include "common.cu"
|
||||
#include "limiters.cu"
|
||||
#include "fluxes/CentralUpwind.cu"
|
||||
|
||||
|
||||
__device__
|
||||
void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
float Qx[3][block_height+2][block_width+2],
|
||||
float F[3][block_height+1][block_width+1],
|
||||
void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
@ -39,7 +41,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
{
|
||||
int j=ty;
|
||||
const int l = j + 2; //Skip ghost cells
|
||||
for (int i=tx; i<block_width+1; i+=block_width) {
|
||||
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
|
||||
const int k = i + 1;
|
||||
// Q at interface from the right and left
|
||||
const float3 Qp = make_float3(Q[0][l][k+1] - 0.5f*Qx[0][j][i+1],
|
||||
@ -59,15 +61,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
}
|
||||
|
||||
__device__
|
||||
void computeFluxG(float Q[3][block_height+4][block_width+4],
|
||||
float Qy[3][block_height+2][block_width+2],
|
||||
float G[3][block_height+1][block_width+1],
|
||||
void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
for (int j=ty; j<block_height+1; j+=block_height) {
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
const int l = j + 1;
|
||||
{
|
||||
int i=tx;
|
||||
@ -104,8 +106,6 @@ __global__ void KP07Kernel(
|
||||
|
||||
float theta_,
|
||||
|
||||
float r_, //< Bottom friction coefficient
|
||||
|
||||
int step_,
|
||||
|
||||
//Input h^n
|
||||
@ -127,14 +127,14 @@ __global__ void KP07Kernel(
|
||||
const int tj = get_global_id(1) + 2;
|
||||
|
||||
//Shared memory variables
|
||||
__shared__ float Q[3][block_height+4][block_width+4];
|
||||
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
|
||||
|
||||
//The following slightly wastes memory, but enables us to reuse the
|
||||
//funcitons in common.opencl
|
||||
__shared__ float Qx[3][block_height+2][block_width+2];
|
||||
__shared__ float Qy[3][block_height+2][block_width+2];
|
||||
__shared__ float F[3][block_height+1][block_width+1];
|
||||
__shared__ float G[3][block_height+1][block_width+1];
|
||||
__shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
|
||||
__shared__ float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
|
||||
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
|
||||
__shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
|
||||
|
||||
|
||||
|
||||
@ -178,15 +178,13 @@ __global__ void KP07Kernel(
|
||||
float* const h_row = (float*) ((char*) h1_ptr_ + h1_pitch_*tj);
|
||||
float* const hu_row = (float*) ((char*) hu1_ptr_ + hu1_pitch_*tj);
|
||||
float* const hv_row = (float*) ((char*) hv1_ptr_ + hv1_pitch_*tj);
|
||||
|
||||
const float C = 2.0f*r_*dt_/Q[0][j][i];
|
||||
|
||||
if (step_ == 0) {
|
||||
//First step of RK2 ODE integrator
|
||||
|
||||
h_row[ti] = h1;
|
||||
hu_row[ti] = hu1 / (1.0f + C);
|
||||
hv_row[ti] = hv1 / (1.0f + C);
|
||||
hu_row[ti] = hu1;
|
||||
hv_row[ti] = hv1;
|
||||
}
|
||||
else if (step_ == 1) {
|
||||
//Second step of RK2 ODE integrator
|
||||
@ -203,8 +201,8 @@ __global__ void KP07Kernel(
|
||||
|
||||
//Write to main memory
|
||||
h_row[ti] = h_b;
|
||||
hu_row[ti] = hu_b / (1.0f + 0.5f*C);
|
||||
hv_row[ti] = hv_b / (1.0f + 0.5f*C);
|
||||
hu_row[ti] = hu_b;
|
||||
hv_row[ti] = hv_b;
|
||||
}
|
||||
}
|
||||
}
|
@ -21,13 +21,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
"""
|
||||
|
||||
#Import packages we need
|
||||
import numpy as np
|
||||
|
||||
import pycuda.compiler as cuda_compiler
|
||||
import pycuda.gpuarray
|
||||
import pycuda.driver as cuda
|
||||
|
||||
from SWESimulators import Common
|
||||
from SWESimulators import Simulator
|
||||
|
||||
|
||||
|
||||
@ -38,7 +32,7 @@ from SWESimulators import Common
|
||||
"""
|
||||
Class that solves the SW equations using the Lax Friedrichs scheme
|
||||
"""
|
||||
class LxF:
|
||||
class LxF (Simulator.BaseSimulator):
|
||||
|
||||
"""
|
||||
Initialization routine
|
||||
@ -59,80 +53,40 @@ class LxF:
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width=16, block_height=16):
|
||||
#Create a CUDA stream
|
||||
self.stream = cuda.Stream()
|
||||
|
||||
# Call super constructor
|
||||
super().__init__(context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
1, 1, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width, block_height);
|
||||
|
||||
#Get kernels
|
||||
self.lxf_module = context.get_kernel("LxF_kernel.cu", block_width, block_height)
|
||||
self.lxf_kernel = self.lxf_module.get_function("LxFKernel")
|
||||
self.lxf_kernel.prepare("iiffffPiPiPiPiPiPi")
|
||||
# Get kernels
|
||||
self.kernel = context.get_prepared_kernel("LxF_kernel.cu", "LxFKernel", \
|
||||
"iiffffPiPiPiPiPiPi", \
|
||||
BLOCK_WIDTH=block_width, \
|
||||
BLOCK_HEIGHT=block_height)
|
||||
|
||||
#Create data by uploading to device
|
||||
ghost_cells_x = 1
|
||||
ghost_cells_y = 1
|
||||
self.data = Common.SWEDataArakawaA(self.stream, \
|
||||
nx, ny, \
|
||||
ghost_cells_x, ghost_cells_y, \
|
||||
h0, hu0, hv0)
|
||||
|
||||
#Save input parameters
|
||||
#Notice that we need to specify them in the correct dataformat for the
|
||||
#OpenCL kernel
|
||||
self.nx = np.int32(nx)
|
||||
self.ny = np.int32(ny)
|
||||
self.dx = np.float32(dx)
|
||||
self.dy = np.float32(dy)
|
||||
self.dt = np.float32(dt)
|
||||
self.g = np.float32(g)
|
||||
|
||||
#Initialize time
|
||||
self.t = np.float32(0.0)
|
||||
|
||||
#Compute kernel launch parameters
|
||||
self.local_size = (block_width, block_height, 1)
|
||||
self.global_size = ( \
|
||||
int(np.ceil(self.nx / float(self.local_size[0]))), \
|
||||
int(np.ceil(self.ny / float(self.local_size[1]))) \
|
||||
)
|
||||
|
||||
|
||||
|
||||
def __str__(self):
|
||||
return "Lax Friedrichs"
|
||||
|
||||
"""
|
||||
Function which steps n timesteps
|
||||
"""
|
||||
def step(self, t_end=0.0):
|
||||
n = int(t_end / self.dt + 1)
|
||||
|
||||
for i in range(0, n):
|
||||
local_dt = np.float32(min(self.dt, t_end-i*self.dt))
|
||||
|
||||
if (local_dt <= 0.0):
|
||||
break
|
||||
def simulate(self, t_end):
|
||||
return super().simulateEuler(t_end)
|
||||
|
||||
self.lxf_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, local_dt, \
|
||||
self.g, \
|
||||
self.data.h0.data.gpudata, self.data.h0.pitch, \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
|
||||
self.data.h1.data.gpudata, self.data.h1.pitch, \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.pitch)
|
||||
|
||||
self.t += local_dt
|
||||
|
||||
self.data.swap()
|
||||
|
||||
return self.t, n
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def download(self):
|
||||
return self.data.download(self.stream)
|
||||
def stepEuler(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
self.data.h0.data.gpudata, self.data.h0.pitch, \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
|
||||
self.data.h1.data.gpudata, self.data.h1.pitch, \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.pitch)
|
||||
self.data.swap()
|
||||
self.t += dt
|
||||
|
||||
|
||||
|
@ -20,11 +20,13 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
#include "common.cu"
|
||||
#include "fluxes/LaxFriedrichs.cu"
|
||||
|
||||
|
||||
/**
|
||||
* Computes the flux along the x axis for all faces
|
||||
*/
|
||||
template <int block_width, int block_height>
|
||||
__device__
|
||||
void computeFluxF(float Q[3][block_height+2][block_width+2],
|
||||
float F[3][block_height][block_width+1],
|
||||
@ -59,8 +61,9 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
|
||||
|
||||
/**
|
||||
* Computes the flux along the y axis for all faces
|
||||
*/
|
||||
__device__
|
||||
*/
|
||||
template <int block_width, int block_height>
|
||||
__device__
|
||||
void computeFluxG(float Q[3][block_height+2][block_width+2],
|
||||
float G[3][block_height+1][block_width],
|
||||
const float g_, const float dy_, const float dt_) {
|
||||
@ -94,7 +97,9 @@ void computeFluxG(float Q[3][block_height+2][block_width+2],
|
||||
}
|
||||
|
||||
|
||||
__global__ void LxFKernel(
|
||||
extern "C" {
|
||||
__global__
|
||||
void LxFKernel(
|
||||
int nx_, int ny_,
|
||||
float dx_, float dy_, float dt_,
|
||||
float g_,
|
||||
@ -109,6 +114,9 @@ __global__ void LxFKernel(
|
||||
float* hu1_ptr_, int hu1_pitch_,
|
||||
float* hv1_ptr_, int hv1_pitch_) {
|
||||
|
||||
const int block_width = BLOCK_WIDTH;
|
||||
const int block_height = BLOCK_HEIGHT;
|
||||
|
||||
//Index of cell within domain
|
||||
const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
|
||||
const int tj = get_global_id(1) + 1;
|
||||
@ -130,8 +138,8 @@ __global__ void LxFKernel(
|
||||
|
||||
|
||||
//Compute fluxes along the x and y axis
|
||||
computeFluxF(Q, F, g_, dx_, dt_);
|
||||
computeFluxG(Q, G, g_, dy_, dt_);
|
||||
computeFluxF<block_width, block_height>(Q, F, g_, dx_, dt_);
|
||||
computeFluxG<block_width, block_height>(Q, G, g_, dy_, dt_);
|
||||
__syncthreads();
|
||||
|
||||
|
||||
@ -159,4 +167,7 @@ __global__ void LxFKernel(
|
||||
hu_row[ti] = hu1;
|
||||
hv_row[ti] = hv1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // extern "C"
|
||||
|
||||
|
@ -1,165 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
This python class aids in plotting results from the numerical
|
||||
simulations
|
||||
|
||||
Copyright (C) 2016 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
"""
|
||||
|
||||
|
||||
from matplotlib import pyplot as plt
|
||||
import matplotlib.gridspec as gridspec
|
||||
import numpy as np
|
||||
import time
|
||||
|
||||
"""
|
||||
Class that makes plotting faster by caching the plots instead of recreating them
|
||||
"""
|
||||
class PlotHelper:
|
||||
|
||||
def __init__(self, fig, x_coords, y_coords, radius, eta1, u1, v1, eta2=None, u2=None, v2=None, interpolation_type='spline36'):
|
||||
self.ny, self.nx = eta1.shape
|
||||
self.fig = fig;
|
||||
|
||||
fig.set_figheight(15)
|
||||
fig.set_figwidth(15)
|
||||
|
||||
min_x = np.min(x_coords[:,0]);
|
||||
min_y = np.min(y_coords[0,:]);
|
||||
|
||||
max_x = np.max(x_coords[0,:]);
|
||||
max_y = np.max(y_coords[:,0]);
|
||||
|
||||
domain_extent = [ x_coords[0, 0], x_coords[0, -1], y_coords[0, 0], y_coords[-1, 0] ]
|
||||
|
||||
if (eta2 is not None):
|
||||
assert(u2 is not None)
|
||||
assert(v2 is not None)
|
||||
self.gs = gridspec.GridSpec(3, 3)
|
||||
else:
|
||||
self.gs = gridspec.GridSpec(2, 3)
|
||||
|
||||
ax = self.fig.add_subplot(self.gs[0, 0])
|
||||
self.sp_eta = plt.imshow(eta1, interpolation=interpolation_type, origin='bottom', vmin=-0.05, vmax=0.05, extent=domain_extent)
|
||||
plt.axis('tight')
|
||||
ax.set_aspect('equal')
|
||||
plt.title('Eta')
|
||||
plt.colorbar()
|
||||
|
||||
ax = self.fig.add_subplot(self.gs[0, 1])
|
||||
self.sp_u = plt.imshow(u1, interpolation=interpolation_type, origin='bottom', vmin=-1.5, vmax=1.5, extent=domain_extent)
|
||||
plt.axis('tight')
|
||||
ax.set_aspect('equal')
|
||||
plt.title('U')
|
||||
plt.colorbar()
|
||||
|
||||
ax = self.fig.add_subplot(self.gs[0, 2])
|
||||
self.sp_v = plt.imshow(v1, interpolation=interpolation_type, origin='bottom', vmin=-1.5, vmax=1.5, extent=domain_extent)
|
||||
plt.axis('tight')
|
||||
ax.set_aspect('equal')
|
||||
plt.title('V')
|
||||
plt.colorbar()
|
||||
|
||||
ax = self.fig.add_subplot(self.gs[1, 0])
|
||||
self.sp_radial1, = plt.plot(radius.ravel(), eta1.ravel(), '.')
|
||||
plt.axis([0, min(max_x, max_y), -1.5, 1])
|
||||
plt.title('Eta Radial plot')
|
||||
|
||||
ax = self.fig.add_subplot(self.gs[1, 1])
|
||||
self.sp_x_axis1, = plt.plot(x_coords[self.ny/2,:], eta1[self.ny/2,:], 'k+--', label='x-axis')
|
||||
self.sp_y_axis1, = plt.plot(y_coords[:,self.nx/2], eta1[:,self.nx/2], 'kx:', label='y-axis')
|
||||
plt.axis([max(min_x, min_y), min(max_x, max_y), -1.5, 1])
|
||||
plt.title('Eta along axis')
|
||||
plt.legend()
|
||||
|
||||
ax = self.fig.add_subplot(self.gs[1, 2])
|
||||
self.sp_x_diag1, = plt.plot(1.41*np.diagonal(x_coords, offset=-abs(self.nx-self.ny)/2), \
|
||||
np.diagonal(eta1, offset=-abs(self.nx-self.ny)/2), \
|
||||
'k+--', label='x = -y')
|
||||
self.sp_y_diag1, = plt.plot(1.41*np.diagonal(y_coords.T, offset=abs(self.nx-self.ny)/2), \
|
||||
np.diagonal(eta1.T, offset=abs(self.nx-self.ny)/2), \
|
||||
'kx:', label='x = y')
|
||||
plt.axis([max(min_x, min_y), min(max_x, max_y), -1.5, 1])
|
||||
plt.title('Eta along diagonal')
|
||||
plt.legend()
|
||||
|
||||
|
||||
if (eta2 is not None):
|
||||
ax = self.fig.add_subplot(self.gs[2, 0])
|
||||
self.sp_radial2, = plt.plot(radius.ravel(), eta2.ravel(), '.')
|
||||
plt.axis([0, min(max_x, max_y), -1.5, 1])
|
||||
plt.title('Eta2 Radial plot')
|
||||
|
||||
ax = self.fig.add_subplot(self.gs[2, 1])
|
||||
self.sp_x_axis2, = plt.plot(x_coords[self.ny/2,:], eta2[self.ny/2,:], 'k+--', label='x-axis')
|
||||
self.sp_y_axis2, = plt.plot(y_coords[:,self.nx/2], eta2[:,self.nx/2], 'kx:', label='y-axis')
|
||||
plt.axis([max(min_x, min_y), min(max_x, max_y), -1.5, 1])
|
||||
plt.title('Eta2 along axis')
|
||||
plt.legend()
|
||||
|
||||
ax = self.fig.add_subplot(self.gs[2, 2])
|
||||
self.sp_x_diag2, = plt.plot(1.41*np.diagonal(x_coords, offset=-abs(self.nx-self.ny)/2), \
|
||||
np.diagonal(eta2, offset=-abs(self.nx-self.ny)/2), \
|
||||
'k+--', label='x = -y')
|
||||
self.sp_y_diag2, = plt.plot(1.41*np.diagonal(y_coords.T, offset=abs(self.nx-self.ny)/2), \
|
||||
np.diagonal(eta2.T, offset=abs(self.nx-self.ny)/2), \
|
||||
'kx:', label='x = y')
|
||||
plt.axis([max(min_x, min_y), min(max_x, max_y), -1.5, 1])
|
||||
plt.title('Eta2 along diagonal')
|
||||
plt.legend()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def plot(self, eta1, u1, v1, eta2=None, u2=None, v2=None):
|
||||
self.fig.add_subplot(self.gs[0, 0])
|
||||
self.sp_eta.set_data(eta1)
|
||||
|
||||
self.fig.add_subplot(self.gs[0, 1])
|
||||
self.sp_u.set_data(u1)
|
||||
|
||||
self.fig.add_subplot(self.gs[0, 2])
|
||||
self.sp_v.set_data(v1)
|
||||
|
||||
self.fig.add_subplot(self.gs[1, 0])
|
||||
self.sp_radial1.set_ydata(eta1.ravel());
|
||||
|
||||
self.fig.add_subplot(self.gs[1, 1])
|
||||
self.sp_x_axis1.set_ydata(eta1[(self.ny+2)/2,:])
|
||||
self.sp_y_axis1.set_ydata(eta1[:,(self.nx+2)/2])
|
||||
|
||||
self.fig.add_subplot(self.gs[1, 2])
|
||||
self.sp_x_diag1.set_ydata(np.diagonal(eta1, offset=-abs(self.nx-self.ny)/2))
|
||||
self.sp_y_diag1.set_ydata(np.diagonal(eta1.T, offset=abs(self.nx-self.ny)/2))
|
||||
|
||||
if (eta2 is not None):
|
||||
self.fig.add_subplot(self.gs[2, 0])
|
||||
self.sp_radial2.set_ydata(eta2.ravel());
|
||||
|
||||
self.fig.add_subplot(self.gs[2, 1])
|
||||
self.sp_x_axis2.set_ydata(eta2[(self.ny+2)/2,:])
|
||||
self.sp_y_axis2.set_ydata(eta2[:,(self.nx+2)/2])
|
||||
|
||||
self.fig.add_subplot(self.gs[2, 2])
|
||||
self.sp_x_diag2.set_ydata(np.diagonal(eta2, offset=-abs(self.nx-self.ny)/2))
|
||||
self.sp_y_diag2.set_ydata(np.diagonal(eta2.T, offset=abs(self.nx-self.ny)/2))
|
||||
|
||||
plt.draw()
|
||||
time.sleep(0.001)
|
||||
|
||||
|
186
SWESimulators/Simulator.py
Normal file
186
SWESimulators/Simulator.py
Normal file
@ -0,0 +1,186 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
This python module implements the classical Lax-Friedrichs numerical
|
||||
scheme for the shallow water equations
|
||||
|
||||
Copyright (C) 2016 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
"""
|
||||
|
||||
#Import packages we need
|
||||
import numpy as np
|
||||
import logging
|
||||
|
||||
import pycuda.compiler as cuda_compiler
|
||||
import pycuda.gpuarray
|
||||
import pycuda.driver as cuda
|
||||
|
||||
from SWESimulators import Common
|
||||
|
||||
|
||||
class BaseSimulator:
|
||||
"""
|
||||
Initialization routine
|
||||
context: GPU context to use
|
||||
kernel_wrapper: wrapper function of GPU kernel
|
||||
h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
|
||||
hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
|
||||
hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
|
||||
nx: Number of cells along x-axis
|
||||
ny: Number of cells along y-axis
|
||||
dx: Grid cell spacing along x-axis (20 000 m)
|
||||
dy: Grid cell spacing along y-axis (20 000 m)
|
||||
dt: Size of each timestep (90 s)
|
||||
g: Gravitational accelleration (9.81 m/s^2)
|
||||
"""
|
||||
def __init__(self, \
|
||||
context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
ghost_cells_x, ghost_cells_y, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width, block_height):
|
||||
#Get logger
|
||||
self.logger = logging.getLogger(__name__ + "." + self.__class__.__name__)
|
||||
|
||||
#Create a CUDA stream
|
||||
self.stream = cuda.Stream()
|
||||
|
||||
#Create data by uploading to device
|
||||
self.data = Common.SWEDataArakawaA(self.stream, \
|
||||
nx, ny, \
|
||||
ghost_cells_x, ghost_cells_y, \
|
||||
h0, hu0, hv0)
|
||||
|
||||
#Save input parameters
|
||||
#Notice that we need to specify them in the correct dataformat for the
|
||||
#GPU kernel
|
||||
self.nx = np.int32(nx)
|
||||
self.ny = np.int32(ny)
|
||||
self.dx = np.float32(dx)
|
||||
self.dy = np.float32(dy)
|
||||
self.dt = np.float32(dt)
|
||||
self.g = np.float32(g)
|
||||
|
||||
#Keep track of simulation time
|
||||
self.t = 0.0;
|
||||
|
||||
#Compute kernel launch parameters
|
||||
self.local_size = (block_width, block_height, 1)
|
||||
self.global_size = ( \
|
||||
int(np.ceil(self.nx / float(self.local_size[0]))), \
|
||||
int(np.ceil(self.ny / float(self.local_size[1]))) \
|
||||
)
|
||||
|
||||
"""
|
||||
Function which simulates forward in time using the default simulation type
|
||||
"""
|
||||
def simulate(self, t_end):
|
||||
raise(exceptions.NotImplementedError("Needs to be implemented in subclass"))
|
||||
|
||||
"""
|
||||
Function which simulates t_end seconds using forward Euler
|
||||
Requires that the stepEuler functionality is implemented in the subclasses
|
||||
"""
|
||||
def simulateEuler(self, t_end):
|
||||
with Common.Timer(self.__class__.__name__ + ".simulateEuler") as t:
|
||||
# Compute number of timesteps to perform
|
||||
n = int(t_end / self.dt + 1)
|
||||
|
||||
for i in range(0, n):
|
||||
# Compute timestep for "this" iteration
|
||||
local_dt = np.float32(min(self.dt, t_end-i*self.dt))
|
||||
|
||||
# Stop if end reached (should not happen)
|
||||
if (local_dt <= 0.0):
|
||||
break
|
||||
|
||||
# Step with forward Euler
|
||||
self.stepEuler(local_dt)
|
||||
|
||||
self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds", self.__class__.__name__, t_end, self.t, n, t.secs)
|
||||
return self.t, n
|
||||
|
||||
"""
|
||||
Function which simulates t_end seconds using Runge-Kutta 2
|
||||
Requires that the stepRK functionality is implemented in the subclasses
|
||||
"""
|
||||
def simulateRK(self, t_end, order):
|
||||
with Common.Timer(self.__class__.__name__ + ".simulateRK") as t:
|
||||
# Compute number of timesteps to perform
|
||||
n = int(t_end / self.dt + 1)
|
||||
|
||||
for i in range(0, n):
|
||||
# Compute timestep for "this" iteration
|
||||
local_dt = np.float32(min(self.dt, t_end-i*self.dt))
|
||||
|
||||
# Stop if end reached (should not happen)
|
||||
if (local_dt <= 0.0):
|
||||
break
|
||||
|
||||
# Perform all the Runge-Kutta substeps
|
||||
self.stepRK(local_dt, order)
|
||||
|
||||
self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds", self.__class__.__name__, t_end, self.t, n, t.secs)
|
||||
return self.t, n
|
||||
|
||||
"""
|
||||
Function which simulates t_end seconds using second order dimensional splitting (XYYX)
|
||||
Requires that the stepDimsplitX and stepDimsplitY functionality is implemented in the subclasses
|
||||
"""
|
||||
def simulateDimsplit(self, t_end):
|
||||
with Common.Timer(self.__class__.__name__ + ".simulateDimsplit") as t:
|
||||
# Compute number of timesteps to perform
|
||||
n = int(t_end / (2.0*self.dt) + 1)
|
||||
|
||||
for i in range(0, n):
|
||||
# Compute timestep for "this" iteration
|
||||
local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt))
|
||||
|
||||
# Stop if end reached (should not happen)
|
||||
if (local_dt <= 0.0):
|
||||
break
|
||||
|
||||
# Perform the dimensional split substeps
|
||||
self.stepDimsplitXY(local_dt)
|
||||
self.stepDimsplitYX(local_dt)
|
||||
|
||||
self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds", self.__class__.__name__, t_end, self.t, 2*n, t.secs)
|
||||
return self.t, 2*n
|
||||
|
||||
|
||||
"""
|
||||
Function which performs one single timestep of size dt using forward euler
|
||||
"""
|
||||
def stepEuler(self, dt):
|
||||
raise(NotImplementedError("Needs to be implemented in subclass"))
|
||||
|
||||
def stepRK(self, dt, substep):
|
||||
raise(NotImplementedError("Needs to be implemented in subclass"))
|
||||
|
||||
def stepDimsplitXY(self, dt):
|
||||
raise(NotImplementedError("Needs to be implemented in subclass"))
|
||||
|
||||
def stepDimsplitYX(self, dt):
|
||||
raise(NotImplementedError("Needs to be implemented in subclass"))
|
||||
|
||||
def sim_time(self):
|
||||
return self.t
|
||||
|
||||
def download(self):
|
||||
return self.data.download(self.stream)
|
||||
|
@ -22,12 +22,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
#Import packages we need
|
||||
import numpy as np
|
||||
|
||||
import pycuda.compiler as cuda_compiler
|
||||
import pycuda.gpuarray
|
||||
import pycuda.driver as cuda
|
||||
|
||||
from SWESimulators import Common
|
||||
from SWESimulators import Simulator
|
||||
|
||||
|
||||
|
||||
@ -36,7 +31,7 @@ from SWESimulators import Common
|
||||
"""
|
||||
Class that solves the SW equations using the Forward-Backward linear scheme
|
||||
"""
|
||||
class WAF:
|
||||
class WAF (Simulator.BaseSimulator):
|
||||
|
||||
"""
|
||||
Initialization routine
|
||||
@ -57,92 +52,57 @@ class WAF:
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width=16, block_height=16):
|
||||
#Create a CUDA stream
|
||||
self.stream = cuda.Stream()
|
||||
|
||||
# Call super constructor
|
||||
super().__init__(context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
2, 2, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width, block_height);
|
||||
|
||||
#Get kernels
|
||||
self.waf_module = context.get_kernel("WAF_kernel.cu", block_width, block_height)
|
||||
self.waf_kernel = self.waf_module.get_function("WAFKernel")
|
||||
self.waf_kernel.prepare("iiffffiPiPiPiPiPiPi")
|
||||
|
||||
#Create data by uploading to device
|
||||
ghost_cells_x = 2
|
||||
ghost_cells_y = 2
|
||||
self.data = Common.SWEDataArakawaA(self.stream, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
|
||||
|
||||
#Save input parameters
|
||||
#Notice that we need to specify them in the correct dataformat for the
|
||||
#OpenCL kernel
|
||||
self.nx = np.int32(nx)
|
||||
self.ny = np.int32(ny)
|
||||
self.dx = np.float32(dx)
|
||||
self.dy = np.float32(dy)
|
||||
self.dt = np.float32(dt)
|
||||
self.g = np.float32(g)
|
||||
|
||||
#Initialize time
|
||||
self.t = np.float32(0.0)
|
||||
|
||||
#Compute kernel launch parameters
|
||||
self.local_size = (block_width, block_height, 1)
|
||||
self.global_size = ( \
|
||||
int(np.ceil(self.nx / float(self.local_size[0]))), \
|
||||
int(np.ceil(self.ny / float(self.local_size[1]))) \
|
||||
)
|
||||
|
||||
|
||||
self.kernel = context.get_prepared_kernel("WAF_kernel.cu", "WAFKernel", \
|
||||
"iiffffiPiPiPiPiPiPi", \
|
||||
BLOCK_WIDTH=block_width, \
|
||||
BLOCK_HEIGHT=block_height)
|
||||
|
||||
def __str__(self):
|
||||
return "Weighted average flux"
|
||||
|
||||
"""
|
||||
Function which steps n timesteps
|
||||
"""
|
||||
def step(self, t_end=0.0):
|
||||
n = int(t_end / (2.0*self.dt) + 1)
|
||||
def simulate(self, t_end):
|
||||
return super().simulateDimsplit(t_end)
|
||||
|
||||
for i in range(0, n):
|
||||
#Dimensional splitting: second order accurate for every other timestep,
|
||||
#thus run two timesteps in a go
|
||||
|
||||
local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt))
|
||||
if (local_dt <= 0.0):
|
||||
break
|
||||
|
||||
#Along X, then Y
|
||||
self.waf_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, local_dt, \
|
||||
self.g, \
|
||||
np.int32(0), \
|
||||
self.data.h0.data.gpudata, self.data.h0.pitch, \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
|
||||
self.data.h1.data.gpudata, self.data.h1.pitch, \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.pitch)
|
||||
|
||||
#Along Y, then X
|
||||
self.waf_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, local_dt, \
|
||||
self.g, \
|
||||
np.int32(1), \
|
||||
self.data.h1.data.gpudata, self.data.h1.pitch, \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.pitch, \
|
||||
self.data.h0.data.gpudata, self.data.h0.pitch, \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.pitch)
|
||||
|
||||
self.t += local_dt
|
||||
|
||||
def stepEuler(self, dt):
|
||||
return self.stepDimsplitXY(dt)
|
||||
|
||||
return self.t, 2*n
|
||||
|
||||
|
||||
|
||||
|
||||
def download(self):
|
||||
return self.data.download(self.stream)
|
||||
|
||||
def stepDimsplitXY(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
np.int32(0), \
|
||||
self.data.h0.data.gpudata, self.data.h0.pitch, \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
|
||||
self.data.h1.data.gpudata, self.data.h1.pitch, \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.pitch)
|
||||
self.data.swap()
|
||||
self.t += dt
|
||||
|
||||
def stepDimsplitYX(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
np.int32(1), \
|
||||
self.data.h0.data.gpudata, self.data.h0.pitch, \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.pitch, \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.pitch, \
|
||||
self.data.h1.data.gpudata, self.data.h1.pitch, \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.pitch, \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.pitch)
|
||||
self.data.swap()
|
||||
self.t += dt
|
||||
|
@ -25,6 +25,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
#include "common.cu"
|
||||
#include "fluxes/WeightedAverageFlux.cu"
|
||||
|
||||
|
||||
|
||||
@ -32,8 +33,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
* Computes the flux along the x axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
float F[3][block_height+1][block_width+1],
|
||||
void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dx_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
@ -42,7 +43,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
{
|
||||
int j=ty;
|
||||
const int l = j + 2; //Skip ghost cells
|
||||
for (int i=tx; i<block_width+1; i+=block_width) {
|
||||
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
|
||||
const int k = i + 1;
|
||||
|
||||
// Q at interface from the right and left
|
||||
@ -71,15 +72,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
|
||||
* Computes the flux along the y axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxG(float Q[3][block_height+4][block_width+4],
|
||||
float G[3][block_height+1][block_width+1],
|
||||
void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
//Compute fluxes along the y axis
|
||||
for (int j=ty; j<block_height+1; j+=block_height) {
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
const int l = j + 1;
|
||||
{
|
||||
int i=tx;
|
||||
@ -114,6 +115,7 @@ void computeFluxG(float Q[3][block_height+4][block_width+4],
|
||||
|
||||
|
||||
|
||||
extern "C" {
|
||||
__global__ void WAFKernel(
|
||||
int nx_, int ny_,
|
||||
float dx_, float dy_, float dt_,
|
||||
@ -129,8 +131,8 @@ __global__ void WAFKernel(
|
||||
float* hu1_ptr_, int hu1_pitch_,
|
||||
float* hv1_ptr_, int hv1_pitch_) {
|
||||
//Shared memory variables
|
||||
__shared__ float Q[3][block_height+4][block_width+4];
|
||||
__shared__ float F[3][block_height+1][block_width+1];
|
||||
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
|
||||
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
|
||||
|
||||
|
||||
|
||||
@ -192,4 +194,6 @@ __global__ void WAFKernel(
|
||||
hu1_ptr_, hu1_pitch_,
|
||||
hv1_ptr_, hv1_pitch_,
|
||||
Q, nx_, ny_);
|
||||
}
|
||||
}
|
||||
|
||||
} // extern "C"
|
@ -94,18 +94,18 @@ inline __device__ __host__ float clamp(const float f, const float a, const float
|
||||
__device__ void readBlock1(float* h_ptr_, int h_pitch_,
|
||||
float* hu_ptr_, int hu_pitch_,
|
||||
float* hv_ptr_, int hv_pitch_,
|
||||
float Q[3][block_height+2][block_width+2],
|
||||
float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
const int nx_, const int ny_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
//Index of block within domain
|
||||
const int bx = block_width * get_group_id(0);
|
||||
const int by = block_height * get_group_id(1);
|
||||
const int bx = BLOCK_WIDTH * get_group_id(0);
|
||||
const int by = BLOCK_HEIGHT * get_group_id(1);
|
||||
|
||||
//Read into shared memory
|
||||
for (int j=ty; j<block_height+2; j+=block_height) {
|
||||
for (int j=ty; j<BLOCK_HEIGHT+2; j+=BLOCK_HEIGHT) {
|
||||
const int l = clamp(by + j, 0, ny_+1); // Out of bounds
|
||||
|
||||
//Compute the pointer to current row in the arrays
|
||||
@ -113,7 +113,7 @@ __device__ void readBlock1(float* h_ptr_, int h_pitch_,
|
||||
float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*l);
|
||||
float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*l);
|
||||
|
||||
for (int i=tx; i<block_width+2; i+=block_width) {
|
||||
for (int i=tx; i<BLOCK_WIDTH+2; i+=BLOCK_WIDTH) {
|
||||
const int k = clamp(bx + i, 0, nx_+1); // Out of bounds
|
||||
|
||||
Q[0][j][i] = h_row[k];
|
||||
@ -133,18 +133,18 @@ __device__ void readBlock1(float* h_ptr_, int h_pitch_,
|
||||
__device__ void readBlock2(float* h_ptr_, int h_pitch_,
|
||||
float* hu_ptr_, int hu_pitch_,
|
||||
float* hv_ptr_, int hv_pitch_,
|
||||
float Q[3][block_height+4][block_width+4],
|
||||
float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
const int nx_, const int ny_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
//Index of block within domain
|
||||
const int bx = block_width * get_group_id(0);
|
||||
const int by = block_height * get_group_id(1);
|
||||
const int bx = BLOCK_WIDTH * get_group_id(0);
|
||||
const int by = BLOCK_HEIGHT * get_group_id(1);
|
||||
|
||||
//Read into shared memory
|
||||
for (int j=ty; j<block_height+4; j+=block_height) {
|
||||
for (int j=ty; j<BLOCK_HEIGHT+4; j+=BLOCK_HEIGHT) {
|
||||
const int l = clamp(by + j, 0, ny_+3); // Out of bounds
|
||||
|
||||
//Compute the pointer to current row in the arrays
|
||||
@ -152,7 +152,7 @@ __device__ void readBlock2(float* h_ptr_, int h_pitch_,
|
||||
float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*l);
|
||||
float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*l);
|
||||
|
||||
for (int i=tx; i<block_width+4; i+=block_width) {
|
||||
for (int i=tx; i<BLOCK_WIDTH+4; i+=BLOCK_WIDTH) {
|
||||
const int k = clamp(bx + i, 0, nx_+3); // Out of bounds
|
||||
|
||||
Q[0][j][i] = h_row[k];
|
||||
@ -171,7 +171,7 @@ __device__ void readBlock2(float* h_ptr_, int h_pitch_,
|
||||
__device__ void writeBlock1(float* h_ptr_, int h_pitch_,
|
||||
float* hu_ptr_, int hu_pitch_,
|
||||
float* hv_ptr_, int hv_pitch_,
|
||||
float Q[3][block_height+2][block_width+2],
|
||||
float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
const int nx_, const int ny_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
@ -206,7 +206,7 @@ __device__ void writeBlock1(float* h_ptr_, int h_pitch_,
|
||||
__device__ void writeBlock2(float* h_ptr_, int h_pitch_,
|
||||
float* hu_ptr_, int hu_pitch_,
|
||||
float* hv_ptr_, int hv_pitch_,
|
||||
float Q[3][block_height+4][block_width+4],
|
||||
float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
const int nx_, const int ny_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
@ -240,7 +240,7 @@ __device__ void writeBlock2(float* h_ptr_, int h_pitch_,
|
||||
* No flow boundary conditions for the shallow water equations
|
||||
* with one ghost cell in each direction
|
||||
*/
|
||||
__device__ void noFlowBoundary1(float Q[3][block_height+2][block_width+2], const int nx_, const int ny_) {
|
||||
__device__ void noFlowBoundary1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const int nx_, const int ny_) {
|
||||
//Global index
|
||||
const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
|
||||
const int tj = get_global_id(1) + 1;
|
||||
@ -284,7 +284,7 @@ __device__ void noFlowBoundary1(float Q[3][block_height+2][block_width+2], const
|
||||
* No flow boundary conditions for the shallow water equations
|
||||
* with two ghost cells in each direction
|
||||
*/
|
||||
__device__ void noFlowBoundary2(float Q[3][block_height+4][block_width+4], const int nx_, const int ny_) {
|
||||
__device__ void noFlowBoundary2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const int nx_, const int ny_) {
|
||||
//Global index
|
||||
const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
|
||||
const int tj = get_global_id(1) + 2;
|
||||
@ -342,8 +342,8 @@ __device__ void noFlowBoundary2(float Q[3][block_height+4][block_width+4], const
|
||||
/**
|
||||
* Evolves the solution in time along the x axis (dimensional splitting)
|
||||
*/
|
||||
__device__ void evolveF1(float Q[3][block_height+2][block_width+2],
|
||||
float F[3][block_height+1][block_width+1],
|
||||
__device__ void evolveF1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const int nx_, const int ny_,
|
||||
const float dx_, const float dt_) {
|
||||
//Index of thread within block
|
||||
@ -372,8 +372,8 @@ __device__ void evolveF1(float Q[3][block_height+2][block_width+2],
|
||||
/**
|
||||
* Evolves the solution in time along the x axis (dimensional splitting)
|
||||
*/
|
||||
__device__ void evolveF2(float Q[3][block_height+4][block_width+4],
|
||||
float F[3][block_height+1][block_width+1],
|
||||
__device__ void evolveF2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const int nx_, const int ny_,
|
||||
const float dx_, const float dt_) {
|
||||
//Index of thread within block
|
||||
@ -402,8 +402,8 @@ __device__ void evolveF2(float Q[3][block_height+4][block_width+4],
|
||||
/**
|
||||
* Evolves the solution in time along the y axis (dimensional splitting)
|
||||
*/
|
||||
__device__ void evolveG1(float Q[3][block_height+2][block_width+2],
|
||||
float G[3][block_height+1][block_width+1],
|
||||
__device__ void evolveG1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const int nx_, const int ny_,
|
||||
const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
@ -433,8 +433,8 @@ __device__ void evolveG1(float Q[3][block_height+2][block_width+2],
|
||||
/**
|
||||
* Evolves the solution in time along the y axis (dimensional splitting)
|
||||
*/
|
||||
__device__ void evolveG2(float Q[3][block_height+4][block_width+4],
|
||||
float G[3][block_height+1][block_width+1],
|
||||
__device__ void evolveG2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const int nx_, const int ny_,
|
||||
const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
@ -464,79 +464,6 @@ __device__ void evolveG2(float Q[3][block_height+4][block_width+4],
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Reconstructs a slope using the minmod limiter based on three
|
||||
* consecutive values
|
||||
*/
|
||||
__device__ float minmodSlope(float left, float center, float right, float theta) {
|
||||
const float backward = (center - left) * theta;
|
||||
const float central = (right - left) * 0.5f;
|
||||
const float forward = (right - center) * theta;
|
||||
|
||||
return 0.25f
|
||||
*copysign(1.0f, backward)
|
||||
*(copysign(1.0f, backward) + copysign(1.0f, central))
|
||||
*(copysign(1.0f, central) + copysign(1.0f, forward))
|
||||
*min( min(fabs(backward), fabs(central)), fabs(forward) );
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Reconstructs a minmod slope for a whole block along x
|
||||
*/
|
||||
__device__ void minmodSlopeX(float Q[3][block_height+4][block_width+4],
|
||||
float Qx[3][block_height+2][block_width+2],
|
||||
const float theta_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
//Reconstruct slopes along x axis
|
||||
{
|
||||
const int j = ty;
|
||||
const int l = j + 2; //Skip ghost cells
|
||||
for (int i=tx; i<block_width+2; i+=block_width) {
|
||||
const int k = i + 1;
|
||||
for (int p=0; p<3; ++p) {
|
||||
Qx[p][j][i] = minmodSlope(Q[p][l][k-1], Q[p][l][k], Q[p][l][k+1], theta_);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Reconstructs a minmod slope for a whole block along y
|
||||
*/
|
||||
__device__ void minmodSlopeY(float Q[3][block_height+4][block_width+4],
|
||||
float Qy[3][block_height+2][block_width+2],
|
||||
const float theta_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
for (int j=ty; j<block_height+2; j+=block_height) {
|
||||
const int l = j + 1;
|
||||
{
|
||||
const int i = tx;
|
||||
const int k = i + 2; //Skip ghost cells
|
||||
for (int p=0; p<3; ++p) {
|
||||
Qy[p][j][i] = minmodSlope(Q[p][l-1][k], Q[p][l][k], Q[p][l+1][k], theta_);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
__device__ float3 F_func(const float3 Q, const float g) {
|
||||
float3 F;
|
||||
|
||||
@ -551,23 +478,7 @@ __device__ float3 F_func(const float3 Q, const float g) {
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Central upwind flux function
|
||||
*/
|
||||
__device__ float3 CentralUpwindFlux(const float3 Qm, float3 Qp, const float g) {
|
||||
const float3 Fp = F_func(Qp, g);
|
||||
const float up = Qp.y / Qp.x; // hu / h
|
||||
const float cp = sqrt(g*Qp.x); // sqrt(g*h)
|
||||
|
||||
const float3 Fm = F_func(Qm, g);
|
||||
const float um = Qm.y / Qm.x; // hu / h
|
||||
const float cm = sqrt(g*Qm.x); // sqrt(g*h)
|
||||
|
||||
const float am = min(min(um-cm, up-cp), 0.0f); // largest negative wave speed
|
||||
const float ap = max(max(um+cm, up+cp), 0.0f); // largest positive wave speed
|
||||
|
||||
return ((ap*Fm - am*Fp) + ap*am*(Qp-Qm))/(ap-am);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -578,49 +489,6 @@ __device__ float3 CentralUpwindFlux(const float3 Qm, float3 Qp, const float g) {
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 180)
|
||||
*/
|
||||
__device__ float3 HLL_flux(const float3 Q_l, const float3 Q_r, const float g_) {
|
||||
const float h_l = Q_l.x;
|
||||
const float h_r = Q_r.x;
|
||||
|
||||
// Calculate velocities
|
||||
const float u_l = Q_l.y / h_l;
|
||||
const float u_r = Q_r.y / h_r;
|
||||
|
||||
// Estimate the potential wave speeds
|
||||
const float c_l = sqrt(g_*h_l);
|
||||
const float c_r = sqrt(g_*h_r);
|
||||
|
||||
// Compute h in the "star region", h^dagger
|
||||
const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
|
||||
|
||||
const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) );
|
||||
const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) );
|
||||
|
||||
const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
|
||||
const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
|
||||
|
||||
// Compute wave speed estimates
|
||||
const float S_l = u_l - c_l*q_l;
|
||||
const float S_r = u_r + c_r*q_r;
|
||||
|
||||
//Upwind selection
|
||||
if (S_l >= 0.0f) {
|
||||
return F_func(Q_l, g_);
|
||||
}
|
||||
else if (S_r <= 0.0f) {
|
||||
return F_func(Q_r, g_);
|
||||
}
|
||||
//Or estimate flux in the star region
|
||||
else {
|
||||
const float3 F_l = F_func(Q_l, g_);
|
||||
const float3 F_r = F_func(Q_r, g_);
|
||||
const float3 flux = (S_r*F_l - S_l*F_r + S_r*S_l*(Q_r - Q_l)) / (S_r-S_l);
|
||||
return flux;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -630,347 +498,3 @@ __device__ float3 HLL_flux(const float3 Q_l, const float3 Q_r, const float g_) {
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 181)
|
||||
*/
|
||||
__device__ float3 HLLC_flux(const float3 Q_l, const float3 Q_r, const float g_) {
|
||||
const float h_l = Q_l.x;
|
||||
const float h_r = Q_r.x;
|
||||
|
||||
// Calculate velocities
|
||||
const float u_l = Q_l.y / h_l;
|
||||
const float u_r = Q_r.y / h_r;
|
||||
|
||||
// Estimate the potential wave speeds
|
||||
const float c_l = sqrt(g_*h_l);
|
||||
const float c_r = sqrt(g_*h_r);
|
||||
|
||||
// Compute h in the "star region", h^dagger
|
||||
const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
|
||||
|
||||
const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) );
|
||||
const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) );
|
||||
|
||||
const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
|
||||
const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
|
||||
|
||||
// Compute wave speed estimates
|
||||
const float S_l = u_l - c_l*q_l;
|
||||
const float S_r = u_r + c_r*q_r;
|
||||
const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) );
|
||||
|
||||
const float3 F_l = F_func(Q_l, g_);
|
||||
const float3 F_r = F_func(Q_r, g_);
|
||||
|
||||
//Upwind selection
|
||||
if (S_l >= 0.0f) {
|
||||
return F_l;
|
||||
}
|
||||
else if (S_r <= 0.0f) {
|
||||
return F_r;
|
||||
}
|
||||
//Or estimate flux in the "left star" region
|
||||
else if (S_l <= 0.0f && 0.0f <=S_star) {
|
||||
const float v_l = Q_l.z / h_l;
|
||||
const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1, S_star, v_l);
|
||||
const float3 flux = F_l + S_l*(Q_star_l - Q_l);
|
||||
return flux;
|
||||
}
|
||||
//Or estimate flux in the "righ star" region
|
||||
else if (S_star <= 0.0f && 0.0f <=S_r) {
|
||||
const float v_r = Q_r.z / h_r;
|
||||
const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1, S_star, v_r);
|
||||
const float3 flux = F_r + S_r*(Q_star_r - Q_r);
|
||||
return flux;
|
||||
}
|
||||
else {
|
||||
return make_float3(-99999.9f, -99999.9f, -99999.9f); //Something wrong here
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Superbee flux limiter for WAF.
|
||||
* Related to superbee limiter so that WAF_superbee(r, c) = 1 - (1-|c|)*superbee(r)
|
||||
* @param r_ the ratio of upwind change (see Toro 2001, p. 203/204)
|
||||
* @param c_ the courant number for wave k, dt*S_k/dx
|
||||
*/
|
||||
__device__ float WAF_superbee(float r_, float c_) {
|
||||
// r <= 0.0
|
||||
if (r_ <= 0.0f) {
|
||||
return 1.0f;
|
||||
}
|
||||
// 0.0 <= r <= 1/2
|
||||
else if (r_ <= 0.5f) {
|
||||
return 1.0f - 2.0f*(1.0f - fabsf(c_))*r_;
|
||||
}
|
||||
// 1/2 <= r <= 1
|
||||
else if (r_ <= 1.0f) {
|
||||
return fabs(c_);
|
||||
}
|
||||
// 1 <= r <= 2
|
||||
else if (r_ <= 2.0f) {
|
||||
return 1.0f - (1.0f - fabsf(c_))*r_;
|
||||
}
|
||||
// r >= 2
|
||||
else {
|
||||
return 2.0f*fabsf(c_) - 1.0f;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
__device__ float WAF_albada(float r_, float c_) {
|
||||
if (r_ <= 0.0f) {
|
||||
return 1.0f;
|
||||
}
|
||||
else {
|
||||
return 1.0f - (1.0f - fabsf(c_)) * r_ * (1.0f + r_) / (1.0f + r_*r_);
|
||||
}
|
||||
}
|
||||
|
||||
__device__ float WAF_minbee(float r_, float c_) {
|
||||
r_ = fmaxf(-1.0f, fminf(2.0f, r_));
|
||||
if (r_ <= 0.0f) {
|
||||
return 1.0f;
|
||||
}
|
||||
if (r_ >= 0.0f && r_ <= 1.0f) {
|
||||
return 1.0f - (1.0f - fabsf(c_)) * r_;
|
||||
}
|
||||
else {
|
||||
return fabsf(c_);
|
||||
}
|
||||
}
|
||||
|
||||
__device__ float WAF_minmod(float r_, float c_) {
|
||||
return 1.0f - (1.0f - fabsf(c_)) * fmaxf(0.0f, fminf(1.0f, r_));
|
||||
}
|
||||
|
||||
|
||||
__device__ float minmod(float r_) {
|
||||
return fmaxf(0.0f, fminf(1.0f, r_));
|
||||
}
|
||||
|
||||
__device__ float generalized_minmod(float r_, float theta_) {
|
||||
return fmaxf(0.0f, fminf(theta_*r_, fminf( (1.0f + r_) / 2.0f, theta_)));
|
||||
}
|
||||
|
||||
__device__ float superbee(float r_) {
|
||||
return fmaxf(0.0f, fmaxf(fminf(2.0f*r_, 1.0f), fminf(r_, 2.0f)));
|
||||
}
|
||||
|
||||
__device__ float vanAlbada1(float r_) {
|
||||
return (r_*r_ + r_) / (r_*r_ + 1.0f);
|
||||
}
|
||||
|
||||
__device__ float vanLeer(float r_) {
|
||||
return (r_ + fabsf(r_)) / (1.0f + fabsf(r_));
|
||||
}
|
||||
|
||||
__device__ float limiterToWAFLimiter(float r_, float c_) {
|
||||
return 1.0f - (1.0f - fabsf(c_))*r_;
|
||||
}
|
||||
|
||||
__device__ float desingularize(float x_, float eps_) {
|
||||
return copysign(1.0f, x_)*fmaxf(fabsf(x_), fminf(x_*x_/(2.0f*eps_)+0.5f*eps_, eps_));
|
||||
}
|
||||
|
||||
// Compute h in the "star region", h^dagger
|
||||
__device__ float computeHStar(float h_l, float h_r, float u_l, float u_r, float c_l, float c_r, float g_) {
|
||||
//return 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
|
||||
|
||||
const float h_tmp = 0.5f * (c_l + c_r) + 0.25f * (u_l - u_r);
|
||||
return h_tmp*h_tmp / g_;
|
||||
}
|
||||
|
||||
/**
|
||||
* Weighted average flux (Toro 2001, p 200) for interface {i+1/2}
|
||||
* @param r_ The flux limiter parameter (see Toro 2001, p. 203)
|
||||
* @param Q_l2 Q_{i-1}
|
||||
* @param Q_l1 Q_{i}
|
||||
* @param Q_r1 Q_{i+1}
|
||||
* @param Q_r2 Q_{i+2}
|
||||
*/
|
||||
__device__ float3 WAF_1D_flux(const float3 Q_l2, const float3 Q_l1, const float3 Q_r1, const float3 Q_r2, const float g_, const float dx_, const float dt_) {
|
||||
const float h_l = Q_l1.x;
|
||||
const float h_r = Q_r1.x;
|
||||
|
||||
const float h_l2 = Q_l2.x;
|
||||
const float h_r2 = Q_r2.x;
|
||||
|
||||
// Calculate velocities
|
||||
const float u_l = Q_l1.y / h_l;
|
||||
const float u_r = Q_r1.y / h_r;
|
||||
|
||||
const float u_l2 = Q_l2.y / h_l2;
|
||||
const float u_r2 = Q_r2.y / h_r2;
|
||||
|
||||
const float v_l = Q_l1.z / h_l;
|
||||
const float v_r = Q_r1.z / h_r;
|
||||
|
||||
const float v_l2 = Q_l2.z / h_l2;
|
||||
const float v_r2 = Q_r2.z / h_r2;
|
||||
|
||||
// Estimate the potential wave speeds
|
||||
const float c_l = sqrt(g_*h_l);
|
||||
const float c_r = sqrt(g_*h_r);
|
||||
|
||||
const float c_l2 = sqrt(g_*h_l2);
|
||||
const float c_r2 = sqrt(g_*h_r2);
|
||||
|
||||
// Compute h in the "star region", h^dagger
|
||||
const float h_dag_l = computeHStar(h_l2, h_l, u_l2, u_l, c_l2, c_l, g_);
|
||||
const float h_dag = computeHStar( h_l, h_r, u_l, u_r, c_l, c_r, g_);
|
||||
const float h_dag_r = computeHStar( h_r, h_r2, u_r, u_r2, c_r, c_r2, g_);
|
||||
|
||||
const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag ) ) / h_l;
|
||||
const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag ) ) / h_r;
|
||||
|
||||
const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
|
||||
const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
|
||||
|
||||
// Compute wave speed estimates
|
||||
const float S_l = u_l - c_l*q_l;
|
||||
const float S_r = u_r + c_r*q_r;
|
||||
const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) );
|
||||
|
||||
const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1.0, S_star, v_l);
|
||||
const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1.0, S_star, v_r);
|
||||
|
||||
// Estimate the fluxes in the four regions
|
||||
const float3 F_1 = F_func(Q_l1, g_);
|
||||
const float3 F_4 = F_func(Q_r1, g_);
|
||||
|
||||
const float3 F_2 = F_1 + S_l*(Q_star_l - Q_l1);
|
||||
const float3 F_3 = F_4 + S_r*(Q_star_r - Q_r1);
|
||||
//const float3 F_2 = F_func(Q_star_l, g_);
|
||||
//const float3 F_3 = F_func(Q_star_r, g_);
|
||||
|
||||
// Compute the courant numbers for the waves
|
||||
const float c_1 = S_l * dt_ / dx_;
|
||||
const float c_2 = S_star * dt_ / dx_;
|
||||
const float c_3 = S_r * dt_ / dx_;
|
||||
|
||||
// Compute the "upwind change" vectors for the i-3/2 and i+3/2 interfaces
|
||||
const float eps = 1.0e-6f;
|
||||
const float r_1 = desingularize( (c_1 > 0.0f) ? (h_dag_l - h_l2) : (h_dag_r - h_r), eps) / desingularize((h_dag - h_l), eps);
|
||||
const float r_2 = desingularize( (c_2 > 0.0f) ? (v_l - v_l2) : (v_r2 - v_r), eps ) / desingularize((v_r - v_l), eps);
|
||||
const float r_3 = desingularize( (c_3 > 0.0f) ? (h_l - h_dag_l) : (h_r2 - h_dag_r), eps ) / desingularize((h_r - h_dag), eps);
|
||||
|
||||
/*
|
||||
const float eps1 = 1.0e-6f;
|
||||
const float eps2 = eps1;//*1.0e-1f;
|
||||
const float rh_m = desingularize((h_l - h_l2), eps1) / desingularize((h_r - h_l), eps2);
|
||||
const float rh_p = desingularize((h_r2 - h_r), eps1) / desingularize((h_r - h_l), eps2);
|
||||
const float rv_m = desingularize((v_l - v_l2), eps1) / desingularize((v_r - v_l), eps2);
|
||||
const float rv_p = desingularize((v_r2 - v_r), eps1) / desingularize((v_r - v_l), eps2);
|
||||
|
||||
// Compute the r parameters for the flux limiter
|
||||
const float rh_1 = (c_1 > 0.0f) ? rh_m : rh_p;
|
||||
const float rv_1 = (c_1 > 0.0f) ? rv_m : rv_p;
|
||||
|
||||
const float rh_2 = (c_2 > 0.0f) ? rh_m : rh_p;
|
||||
const float rv_2 = (c_2 > 0.0f) ? rv_m : rv_p;
|
||||
|
||||
const float rh_3 = (c_3 > 0.0f) ? rh_m : rh_p;
|
||||
const float rv_3 = (c_3 > 0.0f) ? rv_m : rv_p;
|
||||
|
||||
const float r_1 = rh_1;
|
||||
const float r_2 = rv_2;
|
||||
const float r_3 = rh_3;
|
||||
*/
|
||||
|
||||
// Compute the limiter
|
||||
// We use h for the nonlinear waves, and v for the middle shear wave
|
||||
const float A_1 = copysign(1.0f, c_1) * limiterToWAFLimiter(generalized_minmod(r_1, 1.9f), c_1);
|
||||
const float A_2 = copysign(1.0f, c_2) * limiterToWAFLimiter(generalized_minmod(r_2, 1.9f), c_2); //Middle shear wave
|
||||
const float A_3 = copysign(1.0f, c_3) * limiterToWAFLimiter(generalized_minmod(r_3, 1.9f), c_3);
|
||||
|
||||
//Average the fluxes
|
||||
const float3 flux = 0.5f*( F_1 + F_4 )
|
||||
- 0.5f*( A_1 * (F_2 - F_1)
|
||||
+ A_2 * (F_3 - F_2)
|
||||
+ A_3 * (F_4 - F_3) );
|
||||
|
||||
return flux;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Lax-Friedrichs flux (Toro 2001, p 163)
|
||||
*/
|
||||
__device__ float3 LxF_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
|
||||
const float3 F_l = F_func(Q_l, g_);
|
||||
const float3 F_r = F_func(Q_r, g_);
|
||||
|
||||
return 0.5f*(F_l + F_r) + (dx_/(2.0f*dt_))*(Q_l - Q_r);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Lax-Friedrichs extended to 2D
|
||||
*/
|
||||
__device__ float3 LxF_2D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
|
||||
const float3 F_l = F_func(Q_l, g_);
|
||||
const float3 F_r = F_func(Q_r, g_);
|
||||
|
||||
//Note numerical diffusion for 2D here (0.25)
|
||||
return 0.5f*(F_l + F_r) + (dx_/(4.0f*dt_))*(Q_l - Q_r);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Richtmeyer / Two-step Lax-Wendroff flux (Toro 2001, p 164)
|
||||
*/
|
||||
__device__ float3 LxW2_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
|
||||
const float3 F_l = F_func(Q_l, g_);
|
||||
const float3 F_r = F_func(Q_r, g_);
|
||||
|
||||
const float3 Q_lw2 = 0.5f*(Q_l + Q_r) + (dt_/(2.0f*dx_))*(F_l - F_r);
|
||||
|
||||
return F_func(Q_lw2, g_);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Godunovs centered scheme (Toro 2001, p 165)
|
||||
*/
|
||||
__device__ float3 GodC_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
|
||||
const float3 F_l = F_func(Q_l, g_);
|
||||
const float3 F_r = F_func(Q_r, g_);
|
||||
|
||||
const float3 Q_godc = 0.5f*(Q_l + Q_r) + (dt_/dx_)*(F_l - F_r);
|
||||
|
||||
return F_func(Q_godc, g_);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* First Ordered Centered (Toro 2001, p.163)
|
||||
*/
|
||||
__device__ float3 FORCE_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
|
||||
const float3 F_lf = LxF_1D_flux(Q_l, Q_r, g_, dx_, dt_);
|
||||
const float3 F_lw2 = LxW2_1D_flux(Q_l, Q_r, g_, dx_, dt_);
|
||||
return 0.5f*(F_lf + F_lw2);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
39
SWESimulators/fluxes/CentralUpwind.cu
Normal file
39
SWESimulators/fluxes/CentralUpwind.cu
Normal file
@ -0,0 +1,39 @@
|
||||
/*
|
||||
This file implements the Central upwind flux
|
||||
|
||||
Copyright (C) 2016, 2017, 2018 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Central upwind flux function
|
||||
*/
|
||||
__device__ float3 CentralUpwindFlux(const float3 Qm, float3 Qp, const float g) {
|
||||
const float3 Fp = F_func(Qp, g);
|
||||
const float up = Qp.y / Qp.x; // hu / h
|
||||
const float cp = sqrt(g*Qp.x); // sqrt(g*h)
|
||||
|
||||
const float3 Fm = F_func(Qm, g);
|
||||
const float um = Qm.y / Qm.x; // hu / h
|
||||
const float cm = sqrt(g*Qm.x); // sqrt(g*h)
|
||||
|
||||
const float am = min(min(um-cm, up-cp), 0.0f); // largest negative wave speed
|
||||
const float ap = max(max(um+cm, up+cp), 0.0f); // largest positive wave speed
|
||||
|
||||
return ((ap*Fm - am*Fp) + ap*am*(Qp-Qm))/(ap-am);
|
||||
}
|
||||
|
33
SWESimulators/fluxes/FirstOrderCentered.cu
Normal file
33
SWESimulators/fluxes/FirstOrderCentered.cu
Normal file
@ -0,0 +1,33 @@
|
||||
/*
|
||||
This file implements the First ORder CEntered flux
|
||||
|
||||
Copyright (C) 2016, 2017, 2018 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "./LaxFriedrichs.cu"
|
||||
#include "./LaxWendroff.cu"
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* First Ordered Centered (Toro 2001, p.163)
|
||||
*/
|
||||
__device__ float3 FORCE_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
|
||||
const float3 F_lf = LxF_1D_flux(Q_l, Q_r, g_, dx_, dt_);
|
||||
const float3 F_lw2 = LxW2_1D_flux(Q_l, Q_r, g_, dx_, dt_);
|
||||
return 0.5f*(F_lf + F_lw2);
|
||||
}
|
36
SWESimulators/fluxes/Godunov.cu
Normal file
36
SWESimulators/fluxes/Godunov.cu
Normal file
@ -0,0 +1,36 @@
|
||||
/*
|
||||
This file implements the Godunov flux
|
||||
|
||||
Copyright (C) 2016, 2017, 2018 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Godunovs centered scheme (Toro 2001, p 165)
|
||||
*/
|
||||
__device__ float3 GodC_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
|
||||
const float3 F_l = F_func(Q_l, g_);
|
||||
const float3 F_r = F_func(Q_r, g_);
|
||||
|
||||
const float3 Q_godc = 0.5f*(Q_l + Q_r) + (dt_/dx_)*(F_l - F_r);
|
||||
|
||||
return F_func(Q_godc, g_);
|
||||
}
|
||||
|
||||
|
||||
|
63
SWESimulators/fluxes/HartenLaxVanLeer.cu
Normal file
63
SWESimulators/fluxes/HartenLaxVanLeer.cu
Normal file
@ -0,0 +1,63 @@
|
||||
/*
|
||||
This file implements the Harten-Lax-van Leer flux
|
||||
|
||||
Copyright (C) 2016, 2017, 2018 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 180)
|
||||
*/
|
||||
__device__ float3 HLL_flux(const float3 Q_l, const float3 Q_r, const float g_) {
|
||||
const float h_l = Q_l.x;
|
||||
const float h_r = Q_r.x;
|
||||
|
||||
// Calculate velocities
|
||||
const float u_l = Q_l.y / h_l;
|
||||
const float u_r = Q_r.y / h_r;
|
||||
|
||||
// Estimate the potential wave speeds
|
||||
const float c_l = sqrt(g_*h_l);
|
||||
const float c_r = sqrt(g_*h_r);
|
||||
|
||||
// Compute h in the "star region", h^dagger
|
||||
const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
|
||||
|
||||
const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) );
|
||||
const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) );
|
||||
|
||||
const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
|
||||
const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
|
||||
|
||||
// Compute wave speed estimates
|
||||
const float S_l = u_l - c_l*q_l;
|
||||
const float S_r = u_r + c_r*q_r;
|
||||
|
||||
//Upwind selection
|
||||
if (S_l >= 0.0f) {
|
||||
return F_func(Q_l, g_);
|
||||
}
|
||||
else if (S_r <= 0.0f) {
|
||||
return F_func(Q_r, g_);
|
||||
}
|
||||
//Or estimate flux in the star region
|
||||
else {
|
||||
const float3 F_l = F_func(Q_l, g_);
|
||||
const float3 F_r = F_func(Q_r, g_);
|
||||
const float3 flux = (S_r*F_l - S_l*F_r + S_r*S_l*(Q_r - Q_l)) / (S_r-S_l);
|
||||
return flux;
|
||||
}
|
||||
}
|
80
SWESimulators/fluxes/HartenLaxVanLeerContact.cu
Normal file
80
SWESimulators/fluxes/HartenLaxVanLeerContact.cu
Normal file
@ -0,0 +1,80 @@
|
||||
/*
|
||||
This file implements the Harten-Lax-van Leer flux with contact discontinuity
|
||||
|
||||
Copyright (C) 2016, 2017, 2018 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 181)
|
||||
*/
|
||||
__device__ float3 HLLC_flux(const float3 Q_l, const float3 Q_r, const float g_) {
|
||||
const float h_l = Q_l.x;
|
||||
const float h_r = Q_r.x;
|
||||
|
||||
// Calculate velocities
|
||||
const float u_l = Q_l.y / h_l;
|
||||
const float u_r = Q_r.y / h_r;
|
||||
|
||||
// Estimate the potential wave speeds
|
||||
const float c_l = sqrt(g_*h_l);
|
||||
const float c_r = sqrt(g_*h_r);
|
||||
|
||||
// Compute h in the "star region", h^dagger
|
||||
const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
|
||||
|
||||
const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) );
|
||||
const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) );
|
||||
|
||||
const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
|
||||
const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
|
||||
|
||||
// Compute wave speed estimates
|
||||
const float S_l = u_l - c_l*q_l;
|
||||
const float S_r = u_r + c_r*q_r;
|
||||
const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) );
|
||||
|
||||
const float3 F_l = F_func(Q_l, g_);
|
||||
const float3 F_r = F_func(Q_r, g_);
|
||||
|
||||
//Upwind selection
|
||||
if (S_l >= 0.0f) {
|
||||
return F_l;
|
||||
}
|
||||
else if (S_r <= 0.0f) {
|
||||
return F_r;
|
||||
}
|
||||
//Or estimate flux in the "left star" region
|
||||
else if (S_l <= 0.0f && 0.0f <=S_star) {
|
||||
const float v_l = Q_l.z / h_l;
|
||||
const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1, S_star, v_l);
|
||||
const float3 flux = F_l + S_l*(Q_star_l - Q_l);
|
||||
return flux;
|
||||
}
|
||||
//Or estimate flux in the "righ star" region
|
||||
else if (S_star <= 0.0f && 0.0f <=S_r) {
|
||||
const float v_r = Q_r.z / h_r;
|
||||
const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1, S_star, v_r);
|
||||
const float3 flux = F_r + S_r*(Q_star_r - Q_r);
|
||||
return flux;
|
||||
}
|
||||
else {
|
||||
return make_float3(-99999.9f, -99999.9f, -99999.9f); //Something wrong here
|
||||
}
|
||||
}
|
||||
|
24
SWESimulators/fluxes/LaxFriedrichs.cu
Normal file
24
SWESimulators/fluxes/LaxFriedrichs.cu
Normal file
@ -0,0 +1,24 @@
|
||||
|
||||
|
||||
/**
|
||||
* Lax-Friedrichs flux (Toro 2001, p 163)
|
||||
*/
|
||||
__device__ float3 LxF_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
|
||||
const float3 F_l = F_func(Q_l, g_);
|
||||
const float3 F_r = F_func(Q_r, g_);
|
||||
|
||||
return 0.5f*(F_l + F_r) + (dx_/(2.0f*dt_))*(Q_l - Q_r);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Lax-Friedrichs extended to 2D
|
||||
*/
|
||||
__device__ float3 LxF_2D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
|
||||
const float3 F_l = F_func(Q_l, g_);
|
||||
const float3 F_r = F_func(Q_r, g_);
|
||||
|
||||
//Note numerical diffusion for 2D here (0.25)
|
||||
return 0.5f*(F_l + F_r) + (dx_/(4.0f*dt_))*(Q_l - Q_r);
|
||||
}
|
33
SWESimulators/fluxes/LaxWendroff.cu
Normal file
33
SWESimulators/fluxes/LaxWendroff.cu
Normal file
@ -0,0 +1,33 @@
|
||||
/*
|
||||
This file implements the Lax-Wendroff flux
|
||||
|
||||
Copyright (C) 2016, 2017, 2018 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Richtmeyer / Two-step Lax-Wendroff flux (Toro 2001, p 164)
|
||||
*/
|
||||
__device__ float3 LxW2_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
|
||||
const float3 F_l = F_func(Q_l, g_);
|
||||
const float3 F_r = F_func(Q_r, g_);
|
||||
|
||||
const float3 Q_lw2 = 0.5f*(Q_l + Q_r) + (dt_/(2.0f*dx_))*(F_l - F_r);
|
||||
|
||||
return F_func(Q_lw2, g_);
|
||||
}
|
||||
|
187
SWESimulators/fluxes/WeightedAverageFlux.cu
Normal file
187
SWESimulators/fluxes/WeightedAverageFlux.cu
Normal file
@ -0,0 +1,187 @@
|
||||
/*
|
||||
This file implements the Weighted Average Flux
|
||||
|
||||
Copyright (C) 2016, 2017, 2018 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "limiters.cu"
|
||||
|
||||
/**
|
||||
* Superbee flux limiter for WAF.
|
||||
* Related to superbee limiter so that WAF_superbee(r, c) = 1 - (1-|c|)*superbee(r)
|
||||
* @param r_ the ratio of upwind change (see Toro 2001, p. 203/204)
|
||||
* @param c_ the courant number for wave k, dt*S_k/dx
|
||||
*/
|
||||
__device__ float WAF_superbee(float r_, float c_) {
|
||||
// r <= 0.0
|
||||
if (r_ <= 0.0f) {
|
||||
return 1.0f;
|
||||
}
|
||||
// 0.0 <= r <= 1/2
|
||||
else if (r_ <= 0.5f) {
|
||||
return 1.0f - 2.0f*(1.0f - fabsf(c_))*r_;
|
||||
}
|
||||
// 1/2 <= r <= 1
|
||||
else if (r_ <= 1.0f) {
|
||||
return fabs(c_);
|
||||
}
|
||||
// 1 <= r <= 2
|
||||
else if (r_ <= 2.0f) {
|
||||
return 1.0f - (1.0f - fabsf(c_))*r_;
|
||||
}
|
||||
// r >= 2
|
||||
else {
|
||||
return 2.0f*fabsf(c_) - 1.0f;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
__device__ float WAF_albada(float r_, float c_) {
|
||||
if (r_ <= 0.0f) {
|
||||
return 1.0f;
|
||||
}
|
||||
else {
|
||||
return 1.0f - (1.0f - fabsf(c_)) * r_ * (1.0f + r_) / (1.0f + r_*r_);
|
||||
}
|
||||
}
|
||||
|
||||
__device__ float WAF_minbee(float r_, float c_) {
|
||||
r_ = fmaxf(-1.0f, fminf(2.0f, r_));
|
||||
if (r_ <= 0.0f) {
|
||||
return 1.0f;
|
||||
}
|
||||
if (r_ >= 0.0f && r_ <= 1.0f) {
|
||||
return 1.0f - (1.0f - fabsf(c_)) * r_;
|
||||
}
|
||||
else {
|
||||
return fabsf(c_);
|
||||
}
|
||||
}
|
||||
|
||||
__device__ float WAF_minmod(float r_, float c_) {
|
||||
return 1.0f - (1.0f - fabsf(c_)) * fmaxf(0.0f, fminf(1.0f, r_));
|
||||
}
|
||||
|
||||
|
||||
|
||||
__device__ float limiterToWAFLimiter(float r_, float c_) {
|
||||
return 1.0f - (1.0f - fabsf(c_))*r_;
|
||||
}
|
||||
|
||||
__device__ float desingularize(float x_, float eps_) {
|
||||
return copysign(1.0f, x_)*fmaxf(fabsf(x_), fminf(x_*x_/(2.0f*eps_)+0.5f*eps_, eps_));
|
||||
}
|
||||
|
||||
// Compute h in the "star region", h^dagger
|
||||
__device__ __inline__ float computeHStar(float h_l, float h_r, float u_l, float u_r, float c_l, float c_r, float g_) {
|
||||
|
||||
//This estimate for the h* gives rise to spurious oscillations.
|
||||
//return 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
|
||||
|
||||
const float h_tmp = 0.5f * (c_l + c_r) + 0.25f * (u_l - u_r);
|
||||
return h_tmp*h_tmp / g_;
|
||||
}
|
||||
|
||||
/**
|
||||
* Weighted average flux (Toro 2001, p 200) for interface {i+1/2}
|
||||
* @param r_ The flux limiter parameter (see Toro 2001, p. 203)
|
||||
* @param Q_l2 Q_{i-1}
|
||||
* @param Q_l1 Q_{i}
|
||||
* @param Q_r1 Q_{i+1}
|
||||
* @param Q_r2 Q_{i+2}
|
||||
*/
|
||||
__device__ float3 WAF_1D_flux(const float3 Q_l2, const float3 Q_l1, const float3 Q_r1, const float3 Q_r2, const float g_, const float dx_, const float dt_) {
|
||||
const float h_l = Q_l1.x;
|
||||
const float h_r = Q_r1.x;
|
||||
|
||||
const float h_l2 = Q_l2.x;
|
||||
const float h_r2 = Q_r2.x;
|
||||
|
||||
// Calculate velocities
|
||||
const float u_l = Q_l1.y / h_l;
|
||||
const float u_r = Q_r1.y / h_r;
|
||||
|
||||
const float u_l2 = Q_l2.y / h_l2;
|
||||
const float u_r2 = Q_r2.y / h_r2;
|
||||
|
||||
const float v_l = Q_l1.z / h_l;
|
||||
const float v_r = Q_r1.z / h_r;
|
||||
|
||||
const float v_l2 = Q_l2.z / h_l2;
|
||||
const float v_r2 = Q_r2.z / h_r2;
|
||||
|
||||
// Estimate the potential wave speeds
|
||||
const float c_l = sqrt(g_*h_l);
|
||||
const float c_r = sqrt(g_*h_r);
|
||||
|
||||
const float c_l2 = sqrt(g_*h_l2);
|
||||
const float c_r2 = sqrt(g_*h_r2);
|
||||
|
||||
// Compute h in the "star region", h^dagger
|
||||
const float h_dag_l = computeHStar(h_l2, h_l, u_l2, u_l, c_l2, c_l, g_);
|
||||
const float h_dag = computeHStar( h_l, h_r, u_l, u_r, c_l, c_r, g_);
|
||||
const float h_dag_r = computeHStar( h_r, h_r2, u_r, u_r2, c_r, c_r2, g_);
|
||||
|
||||
const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag ) ) / h_l;
|
||||
const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag ) ) / h_r;
|
||||
|
||||
const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
|
||||
const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
|
||||
|
||||
// Compute wave speed estimates
|
||||
const float S_l = u_l - c_l*q_l;
|
||||
const float S_r = u_r + c_r*q_r;
|
||||
const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) );
|
||||
|
||||
const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1.0, S_star, v_l);
|
||||
const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1.0, S_star, v_r);
|
||||
|
||||
// Estimate the fluxes in the four regions
|
||||
const float3 F_1 = F_func(Q_l1, g_);
|
||||
const float3 F_4 = F_func(Q_r1, g_);
|
||||
|
||||
const float3 F_2 = F_1 + S_l*(Q_star_l - Q_l1);
|
||||
const float3 F_3 = F_4 + S_r*(Q_star_r - Q_r1);
|
||||
//const float3 F_2 = F_func(Q_star_l, g_);
|
||||
//const float3 F_3 = F_func(Q_star_r, g_);
|
||||
|
||||
// Compute the courant numbers for the waves
|
||||
const float c_1 = S_l * dt_ / dx_;
|
||||
const float c_2 = S_star * dt_ / dx_;
|
||||
const float c_3 = S_r * dt_ / dx_;
|
||||
|
||||
// Compute the "upwind change" vectors for the i-3/2 and i+3/2 interfaces
|
||||
const float eps = 1.0e-6f;
|
||||
const float r_1 = desingularize( (c_1 > 0.0f) ? (h_dag_l - h_l2) : (h_dag_r - h_r), eps) / desingularize((h_dag - h_l), eps);
|
||||
const float r_2 = desingularize( (c_2 > 0.0f) ? (v_l - v_l2) : (v_r2 - v_r), eps ) / desingularize((v_r - v_l), eps);
|
||||
const float r_3 = desingularize( (c_3 > 0.0f) ? (h_l - h_dag_l) : (h_r2 - h_dag_r), eps ) / desingularize((h_r - h_dag), eps);
|
||||
|
||||
// Compute the limiter
|
||||
// We use h for the nonlinear waves, and v for the middle shear wave
|
||||
const float A_1 = copysign(1.0f, c_1) * limiterToWAFLimiter(generalized_minmod(r_1, 1.9f), c_1);
|
||||
const float A_2 = copysign(1.0f, c_2) * limiterToWAFLimiter(generalized_minmod(r_2, 1.9f), c_2);
|
||||
const float A_3 = copysign(1.0f, c_3) * limiterToWAFLimiter(generalized_minmod(r_3, 1.9f), c_3);
|
||||
|
||||
//Average the fluxes
|
||||
const float3 flux = 0.5f*( F_1 + F_4 )
|
||||
- 0.5f*( A_1 * (F_2 - F_1)
|
||||
+ A_2 * (F_3 - F_2)
|
||||
+ A_3 * (F_4 - F_3) );
|
||||
|
||||
return flux;
|
||||
}
|
129
SWESimulators/limiters.cu
Normal file
129
SWESimulators/limiters.cu
Normal file
@ -0,0 +1,129 @@
|
||||
/*
|
||||
This file implements different flux and slope limiters
|
||||
|
||||
Copyright (C) 2016, 2017, 2018 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Reconstructs a slope using the generalized minmod limiter based on three
|
||||
* consecutive values
|
||||
*/
|
||||
__device__ __inline__ float minmodSlope(float left, float center, float right, float theta) {
|
||||
const float backward = (center - left) * theta;
|
||||
const float central = (right - left) * 0.5f;
|
||||
const float forward = (right - center) * theta;
|
||||
|
||||
return 0.25f
|
||||
*copysign(1.0f, backward)
|
||||
*(copysign(1.0f, backward) + copysign(1.0f, central))
|
||||
*(copysign(1.0f, central) + copysign(1.0f, forward))
|
||||
*min( min(fabs(backward), fabs(central)), fabs(forward) );
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Reconstructs a minmod slope for a whole block along the abscissa
|
||||
*/
|
||||
__device__ void minmodSlopeX(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
const float theta_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
//Reconstruct slopes along x axis
|
||||
{
|
||||
const int j = ty;
|
||||
const int l = j + 2; //Skip ghost cells
|
||||
for (int i=tx; i<BLOCK_WIDTH+2; i+=BLOCK_WIDTH) {
|
||||
const int k = i + 1;
|
||||
for (int p=0; p<3; ++p) {
|
||||
Qx[p][j][i] = minmodSlope(Q[p][l][k-1], Q[p][l][k], Q[p][l][k+1], theta_);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Reconstructs a minmod slope for a whole block along the ordinate
|
||||
*/
|
||||
__device__ void minmodSlopeY(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
const float theta_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
for (int j=ty; j<BLOCK_HEIGHT+2; j+=BLOCK_HEIGHT) {
|
||||
const int l = j + 1;
|
||||
{
|
||||
const int i = tx;
|
||||
const int k = i + 2; //Skip ghost cells
|
||||
for (int p=0; p<3; ++p) {
|
||||
Qy[p][j][i] = minmodSlope(Q[p][l-1][k], Q[p][l][k], Q[p][l+1][k], theta_);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
__device__ float monotonized_central(float r_) {
|
||||
return fmaxf(0.0f, fminf(2.0f, fminf(2.0f*r_, 0.5f*(1.0f+r_))));
|
||||
}
|
||||
|
||||
__device__ float osher(float r_, float beta_) {
|
||||
return fmaxf(0.0f, fminf(beta_, r_));
|
||||
}
|
||||
|
||||
__device__ float sweby(float r_, float beta_) {
|
||||
return fmaxf(0.0f, fmaxf(fminf(r_, beta_), fminf(beta_*r_, 1.0f)));
|
||||
}
|
||||
|
||||
__device__ float minmod(float r_) {
|
||||
return fmaxf(0.0f, fminf(1.0f, r_));
|
||||
}
|
||||
|
||||
__device__ float generalized_minmod(float r_, float theta_) {
|
||||
return fmaxf(0.0f, fminf(theta_*r_, fminf( (1.0f + r_) / 2.0f, theta_)));
|
||||
}
|
||||
|
||||
__device__ float superbee(float r_) {
|
||||
return fmaxf(0.0f, fmaxf(fminf(2.0f*r_, 1.0f), fminf(r_, 2.0f)));
|
||||
}
|
||||
|
||||
__device__ float vanAlbada1(float r_) {
|
||||
return (r_*r_ + r_) / (r_*r_ + 1.0f);
|
||||
}
|
||||
|
||||
__device__ float vanAlbada2(float r_) {
|
||||
return 2.0f*r_ / (r_*r_* + 1.0f);
|
||||
}
|
||||
|
||||
__device__ float vanLeer(float r_) {
|
||||
return (r_ + fabsf(r_)) / (1.0f + fabsf(r_));
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user