mirror of
https://github.com/smyalygames/FiniteVolumeGPU.git
synced 2025-11-29 17:28:03 +01:00
Working prototype of autotuning
This commit is contained in:
277
GPUSimulators/Autotuner.py
Normal file
277
GPUSimulators/Autotuner.py
Normal file
@@ -0,0 +1,277 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
This python module implements the different helper functions and
|
||||
classes
|
||||
|
||||
Copyright (C) 2018 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
"""
|
||||
|
||||
import os
|
||||
import gc
|
||||
import numpy as np
|
||||
import logging
|
||||
from socket import gethostname
|
||||
|
||||
import pycuda.driver as cuda
|
||||
|
||||
|
||||
from GPUSimulators import Common, LxF, FORCE, HLL, HLL2, KP07, KP07_dimsplit, WAF
|
||||
|
||||
class Autotuner:
|
||||
def __init__(self,
|
||||
nx=2048, ny=2048,
|
||||
block_widths=range(8, 32, 2),
|
||||
block_heights=range(8, 32, 2)):
|
||||
logger = logging.getLogger(__name__)
|
||||
self.filename = "autotuning_data_" + gethostname() + ".npz"
|
||||
self.nx = nx
|
||||
self.ny = ny
|
||||
self.block_widths = block_widths
|
||||
self.block_heights = block_heights
|
||||
self.performance = {}
|
||||
|
||||
|
||||
def benchmark(self, simulator, force=False):
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
#Run through simulators and benchmark
|
||||
key = str(simulator.__name__)
|
||||
logger.info("Benchmarking %s to %s", key, self.filename)
|
||||
|
||||
#If this simulator has been benchmarked already, skip it
|
||||
if (force==False and os.path.isfile(self.filename)):
|
||||
with np.load(self.filename) as data:
|
||||
if key in data["simulators"]:
|
||||
logger.info("%s already benchmarked - skipping", key)
|
||||
return
|
||||
|
||||
# Set arguments to send to the simulators during construction
|
||||
context = Common.CudaContext(autotuning=False)
|
||||
g = 9.81
|
||||
h0, hu0, hv0, dx, dy, dt = Autotuner.gen_test_data(nx=self.nx, ny=self.ny, g=g)
|
||||
arguments = {
|
||||
'context': context,
|
||||
'h0': h0, 'hu0': hu0, 'hv0': hv0,
|
||||
'nx': self.nx, 'ny': self.ny,
|
||||
'dx': dx, 'dy': dy, 'dt': 0.9*dt,
|
||||
'g': g
|
||||
}
|
||||
|
||||
# Load existing data into memory
|
||||
benchmark_data = {
|
||||
"simulators": [],
|
||||
}
|
||||
if (os.path.isfile(self.filename)):
|
||||
with np.load(self.filename) as data:
|
||||
for k, v in data.items():
|
||||
benchmark_data[k] = v
|
||||
|
||||
# Run benchmark
|
||||
benchmark_data[key + "_megacells"] = Autotuner.benchmark_single_simulator(simulator, arguments, self.block_widths, self.block_heights)
|
||||
benchmark_data[key + "_block_widths"] = self.block_widths
|
||||
benchmark_data[key + "_block_heights"] = self.block_heights
|
||||
benchmark_data[key + "_arguments"] = str(arguments)
|
||||
|
||||
existing_sims = benchmark_data["simulators"]
|
||||
if (isinstance(existing_sims, np.ndarray)):
|
||||
existing_sims = existing_sims.tolist()
|
||||
if (key not in existing_sims):
|
||||
benchmark_data["simulators"] = existing_sims + [key]
|
||||
|
||||
# Save to file
|
||||
np.savez_compressed(self.filename, **benchmark_data)
|
||||
|
||||
|
||||
|
||||
"""
|
||||
Function which reads a numpy file with autotuning data
|
||||
and reports the maximum performance and block size
|
||||
"""
|
||||
def get_peak_performance(self, simulator):
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
assert issubclass(simulator, Simulator.BaseSimulator)
|
||||
key = simulator.__name__
|
||||
|
||||
if (key in self.performance):
|
||||
return self.performance[key]
|
||||
else:
|
||||
#Run simulation if required
|
||||
if (not os.path.isfile(self.filename)):
|
||||
logger.debug("Could not get autotuned peak performance for %s: benchmarking", key)
|
||||
self.benchmark(simulator)
|
||||
|
||||
with np.load(self.filename) as data:
|
||||
if key not in data['simulators']:
|
||||
logger.debug("Could not get autotuned peak performance for %s: benchmarking", key)
|
||||
data.close()
|
||||
self.benchmark(simulator)
|
||||
data = np.load(self.filename)
|
||||
|
||||
def find_max_index(megacells):
|
||||
max_index = np.nanargmax(megacells)
|
||||
return np.unravel_index(max_index, megacells.shape)
|
||||
|
||||
megacells = data[key + '_megacells']
|
||||
block_widths = data[key + '_block_widths']
|
||||
block_heights = data[key + '_block_heights']
|
||||
j, i = find_max_index(megacells)
|
||||
|
||||
self.performance[key] = { "block_width": block_widths[i],
|
||||
"block_height": block_heights[j],
|
||||
"megacells": megacells[j, i] }
|
||||
logger.debug("Returning %s as peak performance parameters", self.performance[key])
|
||||
return self.performance[key]
|
||||
|
||||
#This should never happen
|
||||
raise "Something wrong: Could not get autotuning data!"
|
||||
return None
|
||||
|
||||
|
||||
|
||||
"""
|
||||
Runs a set of benchmarks for a single simulator
|
||||
"""
|
||||
def benchmark_single_simulator(simulator, arguments, block_widths, block_heights):
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
megacells = np.empty((len(block_heights), len(block_widths)))
|
||||
megacells.fill(np.nan)
|
||||
|
||||
logger.debug("Running %d benchmarks with %s", len(block_heights)*len(block_widths), simulator.__name__)
|
||||
|
||||
sim_arguments = arguments.copy()
|
||||
|
||||
with Common.Timer(simulator.__name__) as t:
|
||||
for j, block_height in enumerate(block_heights):
|
||||
sim_arguments.update({'block_height': block_height})
|
||||
for i, block_width in enumerate(block_widths):
|
||||
sim_arguments.update({'block_width': block_width})
|
||||
megacells[j, i] = Autotuner.run_benchmark(simulator, sim_arguments)
|
||||
|
||||
|
||||
logger.debug("Completed %s in %f seconds", simulator.__name__, t.secs)
|
||||
|
||||
return megacells
|
||||
|
||||
|
||||
"""
|
||||
Runs a benchmark, and returns the number of megacells achieved
|
||||
"""
|
||||
def run_benchmark(simulator, arguments, timesteps=10, warmup_timesteps=2):
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
#Initialize simulator
|
||||
try:
|
||||
sim = simulator(**arguments)
|
||||
except:
|
||||
#An exception raised - not possible to continue
|
||||
logger.debug("Failed creating %s with arguments %s", simulator.__name__, str(arguments))
|
||||
return np.nan
|
||||
|
||||
#Create timer events
|
||||
start = cuda.Event()
|
||||
end = cuda.Event()
|
||||
|
||||
#Warmup
|
||||
for i in range(warmup_timesteps):
|
||||
sim.stepEuler(sim.dt)
|
||||
|
||||
#Run simulation with timer
|
||||
start.record(sim.stream)
|
||||
for i in range(timesteps):
|
||||
sim.stepEuler(sim.dt)
|
||||
end.record(sim.stream)
|
||||
|
||||
#Synchronize end event
|
||||
end.synchronize()
|
||||
|
||||
#Compute megacells
|
||||
gpu_elapsed = end.time_since(start)*1.0e-3
|
||||
megacells = (sim.nx*sim.ny*timesteps / (1000*1000)) / gpu_elapsed
|
||||
|
||||
#Sanity check solution
|
||||
h, hu, hv = sim.download()
|
||||
sane = True
|
||||
sane = sane and Autotuner.sanity_check(h, 0.3, 0.7)
|
||||
sane = sane and Autotuner.sanity_check(hu, -0.2, 0.2)
|
||||
sane = sane and Autotuner.sanity_check(hv, -0.2, 0.2)
|
||||
|
||||
if (sane):
|
||||
logger.debug("%s [%d x %d] succeeded: %f megacells, gpu elapsed %f", simulator.__name__, arguments["block_width"], arguments["block_height"], megacells, gpu_elapsed)
|
||||
return megacells
|
||||
else:
|
||||
logger.debug("%s [%d x %d] failed: gpu elapsed %f", simulator.__name__, arguments["block_width"], arguments["block_height"], gpu_elapsed)
|
||||
return np.nan
|
||||
|
||||
|
||||
|
||||
"""
|
||||
Generates test dataset
|
||||
"""
|
||||
def gen_test_data(nx, ny, g):
|
||||
width = 100.0
|
||||
height = 100.0
|
||||
dx = width / float(nx)
|
||||
dy = height / float(ny)
|
||||
|
||||
x_center = dx*nx/2.0
|
||||
y_center = dy*ny/2.0
|
||||
|
||||
#Create a gaussian "dam break" that will not form shocks
|
||||
size = width / 5.0
|
||||
dt = 10**10
|
||||
|
||||
h = np.zeros((ny, nx), dtype=np.float32);
|
||||
hu = np.zeros((ny, nx), dtype=np.float32);
|
||||
hv = np.zeros((ny, nx), dtype=np.float32);
|
||||
|
||||
x = dx*(np.arange(0, nx, dtype=np.float32)+0.5) - x_center
|
||||
y = dy*(np.arange(0, ny, dtype=np.float32)+0.5) - y_center
|
||||
xv, yv = np.meshgrid(x, y, sparse=False, indexing='xy')
|
||||
r = np.sqrt(xv**2 + yv**2)
|
||||
xv = None
|
||||
yv = None
|
||||
gc.collect()
|
||||
|
||||
#Generate highres
|
||||
h = 0.5 + 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size)
|
||||
hu = 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size)
|
||||
hv = 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size)
|
||||
|
||||
scale = 0.7
|
||||
max_h_estimate = 0.6
|
||||
max_u_estimate = 0.1*np.sqrt(2.0)
|
||||
dx = width/nx
|
||||
dy = height/ny
|
||||
dt = scale * min(dx, dy) / (max_u_estimate + np.sqrt(g*max_h_estimate))
|
||||
|
||||
return h, hu, hv, dx, dy, dt
|
||||
|
||||
"""
|
||||
Checks that a variable is "sane"
|
||||
"""
|
||||
def sanity_check(variable, bound_min, bound_max):
|
||||
maxval = np.amax(variable)
|
||||
minval = np.amin(variable)
|
||||
if (np.isnan(maxval)
|
||||
or np.isnan(minval)
|
||||
or maxval > bound_max
|
||||
or minval < bound_min):
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
424
GPUSimulators/Common.py
Normal file
424
GPUSimulators/Common.py
Normal file
@@ -0,0 +1,424 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
This python module implements the different helper functions and
|
||||
classes
|
||||
|
||||
Copyright (C) 2018 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import time
|
||||
import re
|
||||
import io
|
||||
import hashlib
|
||||
import logging
|
||||
import gc
|
||||
|
||||
import pycuda.compiler as cuda_compiler
|
||||
import pycuda.gpuarray
|
||||
import pycuda.driver as cuda
|
||||
|
||||
from GPUSimulators import Autotuner
|
||||
|
||||
"""
|
||||
Class which keeps track of time spent for a section of code
|
||||
"""
|
||||
class Timer(object):
|
||||
def __init__(self, tag, log_level=logging.DEBUG):
|
||||
self.tag = tag
|
||||
self.log_level = log_level
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
def __enter__(self):
|
||||
self.start = time.time()
|
||||
return self
|
||||
|
||||
def __exit__(self, *args):
|
||||
self.end = time.time()
|
||||
self.secs = self.end - self.start
|
||||
self.msecs = self.secs * 1000 # millisecs
|
||||
self.logger.log(self.log_level, "%s: %f ms", self.tag, self.msecs)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
"""
|
||||
Class which keeps track of the CUDA context and some helper functions
|
||||
"""
|
||||
class CudaContext(object):
|
||||
|
||||
def __init__(self, blocking=False, use_cache=True, autotuning=True):
|
||||
self.blocking = blocking
|
||||
self.use_cache = use_cache
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.kernels = {}
|
||||
|
||||
self.module_path = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
#Initialize cuda (must be first call to PyCUDA)
|
||||
cuda.init(flags=0)
|
||||
|
||||
self.logger.info("PyCUDA version %s", str(pycuda.VERSION_TEXT))
|
||||
|
||||
#Print some info about CUDA
|
||||
self.logger.info("CUDA version %s", str(cuda.get_version()))
|
||||
self.logger.info("Driver version %s", str(cuda.get_driver_version()))
|
||||
|
||||
self.cuda_device = cuda.Device(0)
|
||||
self.logger.info("Using '%s' GPU", self.cuda_device.name())
|
||||
self.logger.debug(" => compute capability: %s", str(self.cuda_device.compute_capability()))
|
||||
self.logger.debug(" => memory: %d MB", self.cuda_device.total_memory() / (1024*1024))
|
||||
|
||||
# Create the CUDA context
|
||||
if (self.blocking):
|
||||
self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_BLOCKING_SYNC)
|
||||
self.logger.warning("Using blocking context")
|
||||
else:
|
||||
self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_AUTO)
|
||||
|
||||
self.logger.info("Created context handle <%s>", str(self.cuda_context.handle))
|
||||
|
||||
#Create cache dir for cubin files
|
||||
if (self.use_cache):
|
||||
self.cache_path = os.path.join(self.module_path, "cuda_cache")
|
||||
if not os.path.isdir(self.cache_path):
|
||||
os.mkdir(self.cache_path)
|
||||
self.logger.debug("Using CUDA cache dir %s", self.cache_path)
|
||||
|
||||
self.autotuner = None
|
||||
if (autotuning):
|
||||
self.logger.info("Autotuning enabled. It may take several minutes to run the code the first time: have patience")
|
||||
self.autotuner = Autotuner.Autotuner()
|
||||
|
||||
|
||||
def __del__(self, *args):
|
||||
self.logger.info("Cleaning up CUDA context handle <%s>", str(self.cuda_context.handle))
|
||||
|
||||
# Loop over all contexts in stack, and remove "this"
|
||||
other_contexts = []
|
||||
while (cuda.Context.get_current() != None):
|
||||
context = cuda.Context.get_current()
|
||||
if (context.handle != self.cuda_context.handle):
|
||||
self.logger.debug("<%s> Popping <%s> (*not* ours)", str(self.cuda_context.handle), str(context.handle))
|
||||
other_contexts = [context] + other_contexts
|
||||
cuda.Context.pop()
|
||||
else:
|
||||
self.logger.debug("<%s> Popping <%s> (ours)", str(self.cuda_context.handle), str(context.handle))
|
||||
cuda.Context.pop()
|
||||
|
||||
# Add all the contexts we popped that were not our own
|
||||
for context in other_contexts:
|
||||
self.logger.debug("<%s> Pushing <%s>", str(self.cuda_context.handle), str(context.handle))
|
||||
cuda.Context.push(context)
|
||||
|
||||
self.logger.debug("<%s> Detaching", str(self.cuda_context.handle))
|
||||
self.cuda_context.detach()
|
||||
|
||||
|
||||
def __str__(self):
|
||||
return "CudaContext id " + str(self.cuda_context.handle)
|
||||
|
||||
|
||||
def hash_kernel(kernel_filename, include_dirs):
|
||||
# Generate a kernel ID for our caches
|
||||
num_includes = 0
|
||||
max_includes = 100
|
||||
kernel_hasher = hashlib.md5()
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Loop over file and includes, and check if something has changed
|
||||
files = [kernel_filename]
|
||||
while len(files):
|
||||
|
||||
if (num_includes > max_includes):
|
||||
raise("Maximum number of includes reached - circular include in {:}?".format(kernel_filename))
|
||||
|
||||
filename = files.pop()
|
||||
|
||||
#logger.debug("Hashing %s", filename)
|
||||
|
||||
modified = os.path.getmtime(filename)
|
||||
|
||||
# Open the file
|
||||
with io.open(filename, "r") as file:
|
||||
|
||||
# Search for #inclue <something> and also hash the file
|
||||
file_str = file.read()
|
||||
kernel_hasher.update(file_str.encode('utf-8'))
|
||||
kernel_hasher.update(str(modified).encode('utf-8'))
|
||||
|
||||
#Find all includes
|
||||
includes = re.findall('^\W*#include\W+(.+?)\W*$', file_str, re.M)
|
||||
|
||||
# Loop over everything that looks like an include
|
||||
for include_file in includes:
|
||||
|
||||
#Search through include directories for the file
|
||||
file_path = os.path.dirname(filename)
|
||||
for include_path in [file_path] + include_dirs:
|
||||
|
||||
# If we find it, add it to list of files to check
|
||||
temp_path = os.path.join(include_path, include_file)
|
||||
if (os.path.isfile(temp_path)):
|
||||
files = files + [temp_path]
|
||||
num_includes = num_includes + 1 #For circular includes...
|
||||
break
|
||||
|
||||
return kernel_hasher.hexdigest()
|
||||
|
||||
"""
|
||||
Reads a text file and creates an OpenCL kernel from that
|
||||
"""
|
||||
def get_prepared_kernel(self, kernel_filename, kernel_function_name, \
|
||||
prepared_call_args, \
|
||||
include_dirs=[], no_extern_c=True,
|
||||
**kwargs):
|
||||
"""
|
||||
Helper function to print compilation output
|
||||
"""
|
||||
def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
|
||||
self.logger.debug("Compilation returned %s", str(compile_success_bool))
|
||||
if info_str:
|
||||
self.logger.debug("Info: %s", info_str)
|
||||
if error_str:
|
||||
self.logger.debug("Error: %s", error_str)
|
||||
|
||||
#self.logger.debug("Getting %s", kernel_filename)
|
||||
|
||||
# Create a hash of the kernel (and its includes)
|
||||
kwargs_hasher = hashlib.md5()
|
||||
kwargs_hasher.update(str(kwargs).encode('utf-8'));
|
||||
kwargs_hash = kwargs_hasher.hexdigest()
|
||||
kwargs_hasher = None
|
||||
root, ext = os.path.splitext(kernel_filename)
|
||||
kernel_hash = root \
|
||||
+ "_" + CudaContext.hash_kernel( \
|
||||
os.path.join(self.module_path, kernel_filename), \
|
||||
include_dirs=[self.module_path] + include_dirs) \
|
||||
+ "_" + kwargs_hash \
|
||||
+ ext
|
||||
cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)
|
||||
|
||||
# If we have the kernel in our hashmap, return it
|
||||
if (kernel_hash in self.kernels.keys()):
|
||||
self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash)
|
||||
return self.kernels[kernel_hash]
|
||||
|
||||
# If we have it on disk, return it
|
||||
elif (self.use_cache and os.path.isfile(cached_kernel_filename)):
|
||||
self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash)
|
||||
|
||||
with io.open(cached_kernel_filename, "rb") as file:
|
||||
file_str = file.read()
|
||||
module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler)
|
||||
|
||||
kernel = module.get_function(kernel_function_name)
|
||||
kernel.prepare(prepared_call_args)
|
||||
self.kernels[kernel_hash] = kernel
|
||||
return kernel
|
||||
|
||||
# Otherwise, compile it from source
|
||||
else:
|
||||
self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash)
|
||||
|
||||
#Create kernel string
|
||||
kernel_string = ""
|
||||
for key, value in kwargs.items():
|
||||
kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
|
||||
kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename))
|
||||
if (self.use_cache):
|
||||
with io.open(cached_kernel_filename + ".txt", "w") as file:
|
||||
file.write(kernel_string)
|
||||
|
||||
|
||||
with Timer("compiler") as timer:
|
||||
cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, no_extern_c=no_extern_c, cache_dir=False)
|
||||
module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler)
|
||||
if (self.use_cache):
|
||||
with io.open(cached_kernel_filename, "wb") as file:
|
||||
file.write(cubin)
|
||||
|
||||
kernel = module.get_function(kernel_function_name)
|
||||
kernel.prepare(prepared_call_args)
|
||||
self.kernels[kernel_hash] = kernel
|
||||
|
||||
|
||||
return kernel
|
||||
|
||||
"""
|
||||
Clears the kernel cache (useful for debugging & development)
|
||||
"""
|
||||
def clear_kernel_cache(self):
|
||||
self.logger.debug("Clearing cache")
|
||||
self.kernels = {}
|
||||
gc.collect()
|
||||
|
||||
"""
|
||||
Synchronizes all streams etc
|
||||
"""
|
||||
def synchronize(self):
|
||||
self.cuda_context.synchronize()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
"""
|
||||
Class that holds data
|
||||
"""
|
||||
class CudaArray2D:
|
||||
"""
|
||||
Uploads initial data to the CL device
|
||||
"""
|
||||
def __init__(self, stream, nx, ny, x_halo, y_halo, cpu_data):
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.nx = nx
|
||||
self.ny = ny
|
||||
self.x_halo = x_halo
|
||||
self.y_halo = y_halo
|
||||
|
||||
nx_halo = nx + 2*x_halo
|
||||
ny_halo = ny + 2*y_halo
|
||||
|
||||
#self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny)
|
||||
|
||||
#Make sure data is in proper format
|
||||
assert np.issubdtype(cpu_data.dtype, np.float32), "Wrong datatype: %s" % str(cpu_data.dtype)
|
||||
assert cpu_data.itemsize == 4, "Wrong size of data type"
|
||||
assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"
|
||||
|
||||
#Upload data to the device
|
||||
if (cpu_data.shape == (ny_halo, nx_halo)):
|
||||
self.data = pycuda.gpuarray.to_gpu_async(cpu_data, stream=stream)
|
||||
elif (cpu_data.shape == (self.ny, self.nx)):
|
||||
#Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
|
||||
self.data = pycuda.gpuarray.empty((ny_halo, nx_halo), cpu_data.dtype)
|
||||
#self.data.fill(0.0)
|
||||
|
||||
#Create copy object from host to device
|
||||
copy = cuda.Memcpy2D()
|
||||
copy.set_src_host(cpu_data)
|
||||
copy.set_dst_device(self.data.gpudata)
|
||||
|
||||
#Set offsets and pitch of destination
|
||||
copy.dst_x_in_bytes = self.x_halo*self.data.strides[1]
|
||||
copy.dst_y = self.y_halo
|
||||
copy.dst_pitch = self.data.strides[0]
|
||||
|
||||
#Set width in bytes to copy for each row and
|
||||
#number of rows to copy
|
||||
copy.width_in_bytes = self.nx*cpu_data.itemsize
|
||||
copy.height = self.ny
|
||||
|
||||
#Perform the copy
|
||||
copy(stream)
|
||||
stream.synchronize()
|
||||
|
||||
else:
|
||||
assert False, "Wrong data shape: %s vs %s / %s" % (str(cpu_data.shape), str((self.ny, self.nx)), str((ny_halo, nx_halo)))
|
||||
|
||||
#self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
|
||||
|
||||
|
||||
def __del__(self, *args):
|
||||
#self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
|
||||
self.data.gpudata.free()
|
||||
self.data = None
|
||||
|
||||
"""
|
||||
Enables downloading data from GPU to Python
|
||||
"""
|
||||
def download(self, stream, async=False):
|
||||
#self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
|
||||
#Allocate host memory
|
||||
#cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)
|
||||
cpu_data = np.empty((self.ny, self.nx), dtype=np.float32)
|
||||
|
||||
#Create copy object from device to host
|
||||
copy = cuda.Memcpy2D()
|
||||
copy.set_src_device(self.data.gpudata)
|
||||
copy.set_dst_host(cpu_data)
|
||||
|
||||
#Set offsets and pitch of source
|
||||
copy.src_x_in_bytes = self.x_halo*self.data.strides[1]
|
||||
copy.src_y = self.y_halo
|
||||
copy.src_pitch = self.data.strides[0]
|
||||
|
||||
#Set width in bytes to copy for each row and
|
||||
#number of rows to copy
|
||||
copy.width_in_bytes = self.nx*cpu_data.itemsize
|
||||
copy.height = self.ny
|
||||
|
||||
copy(stream)
|
||||
if async==False:
|
||||
stream.synchronize()
|
||||
|
||||
return cpu_data
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
"""
|
||||
A class representing an Arakawa A type (unstaggered, logically Cartesian) grid
|
||||
"""
|
||||
class SWEDataArakawaA:
|
||||
"""
|
||||
Uploads initial data to the CL device
|
||||
"""
|
||||
def __init__(self, stream, nx, ny, halo_x, halo_y, h0, hu0, hv0):
|
||||
self.h0 = CudaArray2D(stream, nx, ny, halo_x, halo_y, h0)
|
||||
self.hu0 = CudaArray2D(stream, nx, ny, halo_x, halo_y, hu0)
|
||||
self.hv0 = CudaArray2D(stream, nx, ny, halo_x, halo_y, hv0)
|
||||
|
||||
self.h1 = CudaArray2D(stream, nx, ny, halo_x, halo_y, h0)
|
||||
self.hu1 = CudaArray2D(stream, nx, ny, halo_x, halo_y, hu0)
|
||||
self.hv1 = CudaArray2D(stream, nx, ny, halo_x, halo_y, hv0)
|
||||
|
||||
"""
|
||||
Swaps the variables after a timestep has been completed
|
||||
"""
|
||||
def swap(self):
|
||||
self.h1, self.h0 = self.h0, self.h1
|
||||
self.hu1, self.hu0 = self.hu0, self.hu1
|
||||
self.hv1, self.hv0 = self.hv0, self.hv1
|
||||
|
||||
"""
|
||||
Enables downloading data from CL device to Python
|
||||
"""
|
||||
def download(self, stream):
|
||||
h_cpu = self.h0.download(stream, async=True)
|
||||
hu_cpu = self.hu0.download(stream, async=True)
|
||||
hv_cpu = self.hv0.download(stream, async=False)
|
||||
|
||||
return h_cpu, hu_cpu, hv_cpu
|
||||
|
||||
|
||||
96
GPUSimulators/FORCE.py
Normal file
96
GPUSimulators/FORCE.py
Normal file
@@ -0,0 +1,96 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
This python module implements the FORCE flux
|
||||
for the shallow water equations
|
||||
|
||||
Copyright (C) 2016 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
"""
|
||||
|
||||
#Import packages we need
|
||||
from GPUSimulators import Simulator
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
"""
|
||||
Class that solves the SW equations
|
||||
"""
|
||||
class FORCE (Simulator.BaseSimulator):
|
||||
|
||||
"""
|
||||
Initialization routine
|
||||
h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
|
||||
hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
|
||||
hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
|
||||
nx: Number of cells along x-axis
|
||||
ny: Number of cells along y-axis
|
||||
dx: Grid cell spacing along x-axis (20 000 m)
|
||||
dy: Grid cell spacing along y-axis (20 000 m)
|
||||
dt: Size of each timestep (90 s)
|
||||
g: Gravitational accelleration (9.81 m/s^2)
|
||||
"""
|
||||
def __init__(self, \
|
||||
context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width=16, block_height=16):
|
||||
|
||||
# Call super constructor
|
||||
super().__init__(context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
1, 1, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width, block_height);
|
||||
|
||||
#Get kernels
|
||||
self.kernel = context.get_prepared_kernel("FORCE_kernel.cu", "FORCEKernel", \
|
||||
"iiffffPiPiPiPiPiPi", \
|
||||
BLOCK_WIDTH=self.local_size[0], \
|
||||
BLOCK_HEIGHT=self.local_size[1])
|
||||
|
||||
def __str__(self):
|
||||
return "First order centered"
|
||||
|
||||
def simulate(self, t_end):
|
||||
return super().simulateEuler(t_end)
|
||||
|
||||
def stepEuler(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
|
||||
self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
|
||||
self.data.swap()
|
||||
self.t += dt
|
||||
|
||||
|
||||
156
GPUSimulators/FORCE_kernel.cu
Normal file
156
GPUSimulators/FORCE_kernel.cu
Normal file
@@ -0,0 +1,156 @@
|
||||
/*
|
||||
This OpenCL kernel implements the classical Lax-Friedrichs scheme
|
||||
for the shallow water equations, with edge fluxes.
|
||||
|
||||
Copyright (C) 2016 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
#include "common.cu"
|
||||
#include "fluxes/FirstOrderCentered.cu"
|
||||
|
||||
|
||||
/**
|
||||
* Computes the flux along the x axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dx_, const float dt_) {
|
||||
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
//Compute fluxes along the x axis
|
||||
{
|
||||
int j=ty;
|
||||
const int l = j + 1; //Skip ghost cells
|
||||
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
|
||||
const int k = i;
|
||||
|
||||
// Q at interface from the right and left
|
||||
const float3 Qp = make_float3(Q[0][l][k+1],
|
||||
Q[1][l][k+1],
|
||||
Q[2][l][k+1]);
|
||||
const float3 Qm = make_float3(Q[0][l][k],
|
||||
Q[1][l][k],
|
||||
Q[2][l][k]);
|
||||
|
||||
// Computed flux
|
||||
const float3 flux = FORCE_1D_flux(Qm, Qp, g_, dx_, dt_);
|
||||
F[0][j][i] = flux.x;
|
||||
F[1][j][i] = flux.y;
|
||||
F[2][j][i] = flux.z;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Computes the flux along the y axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
//Compute fluxes along the y axis
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
const int l = j;
|
||||
{
|
||||
int i=tx;
|
||||
const int k = i + 1; //Skip ghost cells
|
||||
|
||||
// Q at interface from the right and left
|
||||
// Note that we swap hu and hv
|
||||
const float3 Qp = make_float3(Q[0][l+1][k],
|
||||
Q[2][l+1][k],
|
||||
Q[1][l+1][k]);
|
||||
const float3 Qm = make_float3(Q[0][l][k],
|
||||
Q[2][l][k],
|
||||
Q[1][l][k]);
|
||||
|
||||
// Computed flux
|
||||
// Note that we swap back
|
||||
const float3 flux = FORCE_1D_flux(Qm, Qp, g_, dy_, dt_);
|
||||
G[0][j][i] = flux.x;
|
||||
G[1][j][i] = flux.z;
|
||||
G[2][j][i] = flux.y;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
extern "C" {
|
||||
__global__ void FORCEKernel(
|
||||
int nx_, int ny_,
|
||||
float dx_, float dy_, float dt_,
|
||||
float g_,
|
||||
|
||||
//Input h^n
|
||||
float* h0_ptr_, int h0_pitch_,
|
||||
float* hu0_ptr_, int hu0_pitch_,
|
||||
float* hv0_ptr_, int hv0_pitch_,
|
||||
|
||||
//Output h^{n+1}
|
||||
float* h1_ptr_, int h1_pitch_,
|
||||
float* hu1_ptr_, int hu1_pitch_,
|
||||
float* hv1_ptr_, int hv1_pitch_) {
|
||||
|
||||
__shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
|
||||
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
|
||||
|
||||
|
||||
//Read into shared memory
|
||||
readBlock1(h0_ptr_, h0_pitch_,
|
||||
hu0_ptr_, hu0_pitch_,
|
||||
hv0_ptr_, hv0_pitch_,
|
||||
Q, nx_, ny_);
|
||||
__syncthreads();
|
||||
|
||||
|
||||
//Set boundary conditions
|
||||
noFlowBoundary1(Q, nx_, ny_);
|
||||
__syncthreads();
|
||||
|
||||
//Compute flux along x, and evolve
|
||||
computeFluxF(Q, F, g_, dx_, dt_);
|
||||
__syncthreads();
|
||||
evolveF1(Q, F, nx_, ny_, dx_, dt_);
|
||||
__syncthreads();
|
||||
|
||||
//Set boundary conditions
|
||||
noFlowBoundary1(Q, nx_, ny_);
|
||||
__syncthreads();
|
||||
|
||||
//Compute flux along y, and evolve
|
||||
computeFluxG(Q, F, g_, dy_, dt_);
|
||||
__syncthreads();
|
||||
evolveG1(Q, F, nx_, ny_, dy_, dt_);
|
||||
__syncthreads();
|
||||
|
||||
//Write to main memory
|
||||
writeBlock1(h1_ptr_, h1_pitch_,
|
||||
hu1_ptr_, hu1_pitch_,
|
||||
hv1_ptr_, hv1_pitch_,
|
||||
Q, nx_, ny_);
|
||||
}
|
||||
|
||||
} // extern "C"
|
||||
91
GPUSimulators/HLL.py
Normal file
91
GPUSimulators/HLL.py
Normal file
@@ -0,0 +1,91 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
This python module implements the HLL flux
|
||||
|
||||
Copyright (C) 2016 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
"""
|
||||
|
||||
#Import packages we need
|
||||
from GPUSimulators import Simulator
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
"""
|
||||
Class that solves the SW equations using the Harten-Lax -van Leer approximate Riemann solver
|
||||
"""
|
||||
class HLL (Simulator.BaseSimulator):
|
||||
|
||||
"""
|
||||
Initialization routine
|
||||
h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
|
||||
hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
|
||||
hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
|
||||
nx: Number of cells along x-axis
|
||||
ny: Number of cells along y-axis
|
||||
dx: Grid cell spacing along x-axis (20 000 m)
|
||||
dy: Grid cell spacing along y-axis (20 000 m)
|
||||
dt: Size of each timestep (90 s)
|
||||
g: Gravitational accelleration (9.81 m/s^2)
|
||||
"""
|
||||
def __init__(self, \
|
||||
context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width=16, block_height=16):
|
||||
|
||||
# Call super constructor
|
||||
super().__init__(context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
1, 1, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width, block_height);
|
||||
|
||||
#Get kernels
|
||||
self.kernel = context.get_prepared_kernel("HLL_kernel.cu", "HLLKernel", \
|
||||
"iiffffPiPiPiPiPiPi", \
|
||||
BLOCK_WIDTH=self.local_size[0], \
|
||||
BLOCK_HEIGHT=self.local_size[1])
|
||||
|
||||
def __str__(self):
|
||||
return "Harten-Lax-van Leer"
|
||||
|
||||
def simulate(self, t_end):
|
||||
return super().simulateEuler(t_end)
|
||||
|
||||
def stepEuler(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
|
||||
self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
|
||||
self.data.swap()
|
||||
self.t += dt
|
||||
|
||||
|
||||
117
GPUSimulators/HLL2.py
Normal file
117
GPUSimulators/HLL2.py
Normal file
@@ -0,0 +1,117 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
This python module implements the 2nd order HLL flux
|
||||
|
||||
Copyright (C) 2016 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
"""
|
||||
|
||||
#Import packages we need
|
||||
import numpy as np
|
||||
from GPUSimulators import Simulator
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
"""
|
||||
Class that solves the SW equations using the Forward-Backward linear scheme
|
||||
"""
|
||||
class HLL2 (Simulator.BaseSimulator):
|
||||
|
||||
"""
|
||||
Initialization routine
|
||||
h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
|
||||
hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
|
||||
hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
|
||||
nx: Number of cells along x-axis
|
||||
ny: Number of cells along y-axis
|
||||
dx: Grid cell spacing along x-axis (20 000 m)
|
||||
dy: Grid cell spacing along y-axis (20 000 m)
|
||||
dt: Size of each timestep (90 s)
|
||||
g: Gravitational accelleration (9.81 m/s^2)
|
||||
"""
|
||||
def __init__(self, \
|
||||
context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
theta=1.8, \
|
||||
block_width=16, block_height=16):
|
||||
|
||||
# Call super constructor
|
||||
super().__init__(context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
2, 2, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width, block_height);
|
||||
|
||||
self.theta = np.float32(theta)
|
||||
|
||||
#Get kernels
|
||||
self.kernel = context.get_prepared_kernel("HLL2_kernel.cu", "HLL2Kernel", \
|
||||
"iifffffiPiPiPiPiPiPi", \
|
||||
BLOCK_WIDTH=self.local_size[0], \
|
||||
BLOCK_HEIGHT=self.local_size[1])
|
||||
|
||||
def __str__(self):
|
||||
return "Harten-Lax-van Leer (2nd order)"
|
||||
|
||||
def simulate(self, t_end):
|
||||
return super().simulateDimsplit(t_end)
|
||||
|
||||
def stepEuler(self, dt):
|
||||
return self.stepDimsplitXY(dt)
|
||||
|
||||
def stepDimsplitXY(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
self.theta, \
|
||||
np.int32(0), \
|
||||
self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
|
||||
self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
|
||||
self.data.swap()
|
||||
self.t += dt
|
||||
|
||||
def stepDimsplitYX(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
self.theta, \
|
||||
np.int32(1), \
|
||||
self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
|
||||
self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
|
||||
self.data.swap()
|
||||
self.t += dt
|
||||
|
||||
233
GPUSimulators/HLL2_kernel.cu
Normal file
233
GPUSimulators/HLL2_kernel.cu
Normal file
@@ -0,0 +1,233 @@
|
||||
/*
|
||||
This OpenCL kernel implements the second order HLL flux
|
||||
|
||||
Copyright (C) 2016 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
#include "common.cu"
|
||||
#include "limiters.cu"
|
||||
#include "fluxes/HartenLaxVanLeer.cu"
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Computes the flux along the x axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dx_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
{
|
||||
const int j=ty;
|
||||
const int l = j + 2; //Skip ghost cells
|
||||
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
|
||||
const int k = i + 1;
|
||||
// Reconstruct point values of Q at the left and right hand side
|
||||
// of the cell for both the left (i) and right (i+1) cell
|
||||
const float3 Q_rl = make_float3(Q[0][l][k+1] - 0.5f*Qx[0][j][i+1],
|
||||
Q[1][l][k+1] - 0.5f*Qx[1][j][i+1],
|
||||
Q[2][l][k+1] - 0.5f*Qx[2][j][i+1]);
|
||||
const float3 Q_rr = make_float3(Q[0][l][k+1] + 0.5f*Qx[0][j][i+1],
|
||||
Q[1][l][k+1] + 0.5f*Qx[1][j][i+1],
|
||||
Q[2][l][k+1] + 0.5f*Qx[2][j][i+1]);
|
||||
|
||||
const float3 Q_ll = make_float3(Q[0][l][k] - 0.5f*Qx[0][j][i],
|
||||
Q[1][l][k] - 0.5f*Qx[1][j][i],
|
||||
Q[2][l][k] - 0.5f*Qx[2][j][i]);
|
||||
const float3 Q_lr = make_float3(Q[0][l][k] + 0.5f*Qx[0][j][i],
|
||||
Q[1][l][k] + 0.5f*Qx[1][j][i],
|
||||
Q[2][l][k] + 0.5f*Qx[2][j][i]);
|
||||
|
||||
//Evolve half a timestep (predictor step)
|
||||
const float3 Q_r_bar = Q_rl + dt_/(2.0f*dx_) * (F_func(Q_rl, g_) - F_func(Q_rr, g_));
|
||||
const float3 Q_l_bar = Q_lr + dt_/(2.0f*dx_) * (F_func(Q_ll, g_) - F_func(Q_lr, g_));
|
||||
|
||||
// Compute flux based on prediction
|
||||
const float3 flux = HLL_flux(Q_l_bar, Q_r_bar, g_);
|
||||
|
||||
//Write to shared memory
|
||||
F[0][j][i] = flux.x;
|
||||
F[1][j][i] = flux.y;
|
||||
F[2][j][i] = flux.z;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Computes the flux along the x axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
const int l = j + 1;
|
||||
{
|
||||
int i=tx;
|
||||
const int k = i + 2; //Skip ghost cells
|
||||
// Reconstruct point values of Q at the left and right hand side
|
||||
// of the cell for both the left (i) and right (i+1) cell
|
||||
//NOte that hu and hv are swapped ("transposing" the domain)!
|
||||
const float3 Q_rl = make_float3(Q[0][l+1][k] - 0.5f*Qy[0][j+1][i],
|
||||
Q[2][l+1][k] - 0.5f*Qy[2][j+1][i],
|
||||
Q[1][l+1][k] - 0.5f*Qy[1][j+1][i]);
|
||||
const float3 Q_rr = make_float3(Q[0][l+1][k] + 0.5f*Qy[0][j+1][i],
|
||||
Q[2][l+1][k] + 0.5f*Qy[2][j+1][i],
|
||||
Q[1][l+1][k] + 0.5f*Qy[1][j+1][i]);
|
||||
|
||||
const float3 Q_ll = make_float3(Q[0][l][k] - 0.5f*Qy[0][j][i],
|
||||
Q[2][l][k] - 0.5f*Qy[2][j][i],
|
||||
Q[1][l][k] - 0.5f*Qy[1][j][i]);
|
||||
const float3 Q_lr = make_float3(Q[0][l][k] + 0.5f*Qy[0][j][i],
|
||||
Q[2][l][k] + 0.5f*Qy[2][j][i],
|
||||
Q[1][l][k] + 0.5f*Qy[1][j][i]);
|
||||
|
||||
//Evolve half a timestep (predictor step)
|
||||
const float3 Q_r_bar = Q_rl + dt_/(2.0f*dy_) * (F_func(Q_rl, g_) - F_func(Q_rr, g_));
|
||||
const float3 Q_l_bar = Q_lr + dt_/(2.0f*dy_) * (F_func(Q_ll, g_) - F_func(Q_lr, g_));
|
||||
|
||||
// Compute flux based on prediction
|
||||
const float3 flux = HLL_flux(Q_l_bar, Q_r_bar, g_);
|
||||
|
||||
//Write to shared memory
|
||||
//Note that we here swap hu and hv back to the original
|
||||
G[0][j][i] = flux.x;
|
||||
G[1][j][i] = flux.z;
|
||||
G[2][j][i] = flux.y;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
extern "C" {
|
||||
__global__ void HLL2Kernel(
|
||||
int nx_, int ny_,
|
||||
float dx_, float dy_, float dt_,
|
||||
float g_,
|
||||
|
||||
float theta_,
|
||||
|
||||
int step_,
|
||||
|
||||
//Input h^n
|
||||
float* h0_ptr_, int h0_pitch_,
|
||||
float* hu0_ptr_, int hu0_pitch_,
|
||||
float* hv0_ptr_, int hv0_pitch_,
|
||||
|
||||
//Output h^{n+1}
|
||||
float* h1_ptr_, int h1_pitch_,
|
||||
float* hu1_ptr_, int hu1_pitch_,
|
||||
float* hv1_ptr_, int hv1_pitch_) {
|
||||
|
||||
//Shared memory variables
|
||||
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
|
||||
__shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
|
||||
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
|
||||
|
||||
|
||||
|
||||
|
||||
//Read into shared memory
|
||||
readBlock2(h0_ptr_, h0_pitch_,
|
||||
hu0_ptr_, hu0_pitch_,
|
||||
hv0_ptr_, hv0_pitch_,
|
||||
Q, nx_, ny_);
|
||||
__syncthreads();
|
||||
|
||||
//Set boundary conditions
|
||||
noFlowBoundary2(Q, nx_, ny_);
|
||||
__syncthreads();
|
||||
|
||||
//Step 0 => evolve x first, then y
|
||||
if (step_ == 0) {
|
||||
//Compute fluxes along the x axis and evolve
|
||||
minmodSlopeX(Q, Qx, theta_);
|
||||
__syncthreads();
|
||||
computeFluxF(Q, Qx, F, g_, dx_, dt_);
|
||||
__syncthreads();
|
||||
evolveF2(Q, F, nx_, ny_, dx_, dt_);
|
||||
__syncthreads();
|
||||
|
||||
//Set boundary conditions
|
||||
noFlowBoundary2(Q, nx_, ny_);
|
||||
__syncthreads();
|
||||
|
||||
//Compute fluxes along the y axis and evolve
|
||||
minmodSlopeY(Q, Qx, theta_);
|
||||
__syncthreads();
|
||||
computeFluxG(Q, Qx, F, g_, dy_, dt_);
|
||||
__syncthreads();
|
||||
evolveG2(Q, F, nx_, ny_, dy_, dt_);
|
||||
__syncthreads();
|
||||
}
|
||||
//Step 1 => evolve y first, then x
|
||||
else {
|
||||
//Compute fluxes along the y axis and evolve
|
||||
minmodSlopeY(Q, Qx, theta_);
|
||||
__syncthreads();
|
||||
computeFluxG(Q, Qx, F, g_, dy_, dt_);
|
||||
__syncthreads();
|
||||
evolveG2(Q, F, nx_, ny_, dy_, dt_);
|
||||
__syncthreads();
|
||||
|
||||
//Set boundary conditions
|
||||
noFlowBoundary2(Q, nx_, ny_);
|
||||
__syncthreads();
|
||||
|
||||
//Compute fluxes along the x axis and evolve
|
||||
minmodSlopeX(Q, Qx, theta_);
|
||||
__syncthreads();
|
||||
computeFluxF(Q, Qx, F, g_, dx_, dt_);
|
||||
__syncthreads();
|
||||
evolveF2(Q, F, nx_, ny_, dx_, dt_);
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// Write to main memory for all internal cells
|
||||
writeBlock2(h1_ptr_, h1_pitch_,
|
||||
hu1_ptr_, hu1_pitch_,
|
||||
hv1_ptr_, hv1_pitch_,
|
||||
Q, nx_, ny_);
|
||||
}
|
||||
|
||||
} // extern "C"
|
||||
170
GPUSimulators/HLL_kernel.cu
Normal file
170
GPUSimulators/HLL_kernel.cu
Normal file
@@ -0,0 +1,170 @@
|
||||
/*
|
||||
This GPU kernel implements the HLL flux
|
||||
|
||||
Copyright (C) 2016 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include "common.cu"
|
||||
#include "fluxes/HartenLaxVanLeer.cu"
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Computes the flux along the x axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
{
|
||||
const int j=ty;
|
||||
const int l = j + 1; //Skip ghost cells
|
||||
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
|
||||
const int k = i;
|
||||
|
||||
const float3 Q_l = make_float3(Q[0][l][k ], Q[1][l][k ], Q[2][l][k ]);
|
||||
const float3 Q_r = make_float3(Q[0][l][k+1], Q[1][l][k+1], Q[2][l][k+1]);
|
||||
|
||||
const float3 flux = HLL_flux(Q_l, Q_r, g_);
|
||||
|
||||
//Write to shared memory
|
||||
F[0][j][i] = flux.x;
|
||||
F[1][j][i] = flux.y;
|
||||
F[2][j][i] = flux.z;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Computes the flux along the y axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
const int l = j;
|
||||
{
|
||||
const int i=tx;
|
||||
const int k = i + 1; //Skip ghost cells
|
||||
|
||||
//NOte that hu and hv are swapped ("transposing" the domain)!
|
||||
const float3 Q_l = make_float3(Q[0][l ][k], Q[2][l ][k], Q[1][l ][k]);
|
||||
const float3 Q_r = make_float3(Q[0][l+1][k], Q[2][l+1][k], Q[1][l+1][k]);
|
||||
|
||||
// Computed flux
|
||||
const float3 flux = HLL_flux(Q_l, Q_r, g_);
|
||||
|
||||
//Write to shared memory
|
||||
//Note that we here swap hu and hv back to the original
|
||||
G[0][j][i] = flux.x;
|
||||
G[1][j][i] = flux.z;
|
||||
G[2][j][i] = flux.y;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
extern "C" {
|
||||
|
||||
__global__ void HLLKernel(
|
||||
int nx_, int ny_,
|
||||
float dx_, float dy_, float dt_,
|
||||
float g_,
|
||||
|
||||
//Input h^n
|
||||
float* h0_ptr_, int h0_pitch_,
|
||||
float* hu0_ptr_, int hu0_pitch_,
|
||||
float* hv0_ptr_, int hv0_pitch_,
|
||||
|
||||
//Output h^{n+1}
|
||||
float* h1_ptr_, int h1_pitch_,
|
||||
float* hu1_ptr_, int hu1_pitch_,
|
||||
float* hv1_ptr_, int hv1_pitch_) {
|
||||
|
||||
const int block_width = BLOCK_WIDTH;
|
||||
const int block_height = BLOCK_HEIGHT;
|
||||
|
||||
//Shared memory variables
|
||||
__shared__ float Q[3][block_height+2][block_width+2];
|
||||
__shared__ float F[3][block_height+1][block_width+1];
|
||||
|
||||
|
||||
//Read into shared memory
|
||||
readBlock1(h0_ptr_, h0_pitch_,
|
||||
hu0_ptr_, hu0_pitch_,
|
||||
hv0_ptr_, hv0_pitch_,
|
||||
Q, nx_, ny_);
|
||||
__syncthreads();
|
||||
|
||||
noFlowBoundary1(Q, nx_, ny_);
|
||||
__syncthreads();
|
||||
|
||||
//Compute F flux
|
||||
computeFluxF(Q, F, g_);
|
||||
__syncthreads();
|
||||
evolveF1(Q, F, nx_, ny_, dx_, dt_);
|
||||
__syncthreads();
|
||||
|
||||
//Set boundary conditions
|
||||
noFlowBoundary1(Q, nx_, ny_);
|
||||
__syncthreads();
|
||||
|
||||
//Compute G flux
|
||||
computeFluxG(Q, F, g_);
|
||||
__syncthreads();
|
||||
evolveG1(Q, F, nx_, ny_, dy_, dt_);
|
||||
__syncthreads();
|
||||
|
||||
|
||||
//Q[0][get_local_id(1) + 1][get_local_id(0) + 1] += 0.1;
|
||||
|
||||
|
||||
|
||||
// Write to main memory for all internal cells
|
||||
writeBlock1(h1_ptr_, h1_pitch_,
|
||||
hu1_ptr_, hu1_pitch_,
|
||||
hv1_ptr_, hv1_pitch_,
|
||||
Q, nx_, ny_);
|
||||
}
|
||||
|
||||
} // extern "C"
|
||||
118
GPUSimulators/IPythonMagic.py
Normal file
118
GPUSimulators/IPythonMagic.py
Normal file
@@ -0,0 +1,118 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
This python module implements helpers for IPython / Jupyter and CUDA
|
||||
|
||||
Copyright (C) 2018 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from IPython.core import magic_arguments
|
||||
from IPython.core.magic import line_magic, Magics, magics_class
|
||||
import pycuda.driver as cuda
|
||||
|
||||
from GPUSimulators import Common
|
||||
|
||||
|
||||
@magics_class
|
||||
class MyIPythonMagic(Magics):
|
||||
@line_magic
|
||||
def cuda_context_handler(self, context_name):
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
self.logger.debug("Registering %s as a global context", context_name)
|
||||
|
||||
if context_name in self.shell.user_ns.keys():
|
||||
self.logger.debug("Context already registered! Ignoring")
|
||||
return
|
||||
else:
|
||||
self.logger.debug("Creating context")
|
||||
#self.shell.ex(context_name + " = Common.CudaContext(blocking=False)")
|
||||
self.shell.user_ns[context_name] = Common.CudaContext(blocking=False)
|
||||
|
||||
# this function will be called on exceptions in any cell
|
||||
def custom_exc(shell, etype, evalue, tb, tb_offset=None):
|
||||
self.logger.exception("Exception caught: Resetting to CUDA context %s", context_name)
|
||||
while (cuda.Context.get_current() != None):
|
||||
context = cuda.Context.get_current()
|
||||
self.logger.info("Popping <%s>", str(context.handle))
|
||||
cuda.Context.pop()
|
||||
|
||||
if context_name in self.shell.user_ns.keys():
|
||||
self.logger.info("Pushing <%s>", str(self.shell.user_ns[context_name].cuda_context.handle))
|
||||
#self.shell.ex(context_name + ".cuda_context.push()")
|
||||
self.shell.user_ns[context_name].cuda_context.push()
|
||||
else:
|
||||
self.logger.error("No CUDA context called %s found (something is wrong)", context_name)
|
||||
self.logger.error("CUDA will not work now")
|
||||
|
||||
self.logger.debug("==================================================================")
|
||||
|
||||
# still show the error within the notebook, don't just swallow it
|
||||
shell.showtraceback((etype, evalue, tb), tb_offset=tb_offset)
|
||||
|
||||
# this registers a custom exception handler for the whole current notebook
|
||||
get_ipython().set_custom_exc((Exception,), custom_exc)
|
||||
|
||||
|
||||
# Handle CUDA context when exiting python
|
||||
import atexit
|
||||
def exitfunc():
|
||||
self.logger.info("Exitfunc: Resetting CUDA context stack")
|
||||
while (cuda.Context.get_current() != None):
|
||||
context = cuda.Context.get_current()
|
||||
self.logger.info("`-> Popping <%s>", str(context.handle))
|
||||
cuda.Context.pop()
|
||||
self.logger.debug("==================================================================")
|
||||
atexit.register(exitfunc)
|
||||
|
||||
@line_magic
|
||||
@magic_arguments.magic_arguments()
|
||||
@magic_arguments.argument(
|
||||
'--out', '-o', type=str, default='output.log', help='The filename to store the log to')
|
||||
@magic_arguments.argument(
|
||||
'--level', '-l', type=int, default=20, help='The level of logging to screen [0, 50]')
|
||||
@magic_arguments.argument(
|
||||
'--file_level', '-f', type=int, default=10, help='The level of logging to file [0, 50]')
|
||||
def setup_logging(self, line):
|
||||
args = magic_arguments.parse_argstring(self.setup_logging, line)
|
||||
import sys
|
||||
|
||||
#Get root logger
|
||||
logger = logging.getLogger('')
|
||||
logger.setLevel(min(args.level, args.file_level))
|
||||
|
||||
#Add log to screen
|
||||
ch = logging.StreamHandler()
|
||||
ch.setLevel(args.level)
|
||||
logger.addHandler(ch)
|
||||
logger.log(args.level, "Console logger using level %s", logging.getLevelName(args.level))
|
||||
|
||||
#Add log to file
|
||||
logger.log(args.level, "File logger using level %s to %s", logging.getLevelName(args.file_level), args.out)
|
||||
fh = logging.FileHandler(args.out)
|
||||
formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s')
|
||||
fh.setFormatter(formatter)
|
||||
fh.setLevel(args.file_level)
|
||||
logger.addHandler(fh)
|
||||
|
||||
logger.info("Python version %s", sys.version)
|
||||
|
||||
# Register
|
||||
ip = get_ipython()
|
||||
ip.register_magics(MyIPythonMagic)
|
||||
|
||||
112
GPUSimulators/KP07.py
Normal file
112
GPUSimulators/KP07.py
Normal file
@@ -0,0 +1,112 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
This python module implements the Kurganov-Petrova numerical scheme
|
||||
for the shallow water equations, described in
|
||||
A. Kurganov & Guergana Petrova
|
||||
A Second-Order Well-Balanced Positivity Preserving Central-Upwind
|
||||
Scheme for the Saint-Venant System Communications in Mathematical
|
||||
Sciences, 5 (2007), 133-160.
|
||||
|
||||
Copyright (C) 2016 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
"""
|
||||
|
||||
#Import packages we need
|
||||
import numpy as np
|
||||
from GPUSimulators import Simulator
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
"""
|
||||
Class that solves the SW equations using the Forward-Backward linear scheme
|
||||
"""
|
||||
class KP07 (Simulator.BaseSimulator):
|
||||
|
||||
"""
|
||||
Initialization routine
|
||||
h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
|
||||
hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
|
||||
hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
|
||||
nx: Number of cells along x-axis
|
||||
ny: Number of cells along y-axis
|
||||
dx: Grid cell spacing along x-axis (20 000 m)
|
||||
dy: Grid cell spacing along y-axis (20 000 m)
|
||||
dt: Size of each timestep (90 s)
|
||||
g: Gravitational accelleration (9.81 m/s^2)
|
||||
"""
|
||||
def __init__(self, \
|
||||
context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
theta=1.3, \
|
||||
block_width=16, block_height=16):
|
||||
|
||||
# Call super constructor
|
||||
super().__init__(context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
2, 2, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width, block_height);
|
||||
|
||||
self.theta = np.float32(theta)
|
||||
|
||||
#Get kernels
|
||||
self.kernel = context.get_prepared_kernel("KP07_kernel.cu", "KP07Kernel", \
|
||||
"iifffffiPiPiPiPiPiPi", \
|
||||
BLOCK_WIDTH=self.local_size[0], \
|
||||
BLOCK_HEIGHT=self.local_size[1])
|
||||
|
||||
def __str__(self):
|
||||
return "Kurganov-Petrova 2007"
|
||||
|
||||
def simulate(self, t_end):
|
||||
return super().simulateRK(t_end, 2)
|
||||
|
||||
def substepRK(self, dt, substep):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
self.theta, \
|
||||
np.int32(substep), \
|
||||
self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
|
||||
self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
|
||||
self.data.swap()
|
||||
|
||||
def stepEuler(self, dt):
|
||||
self.substepRK(dt, 0)
|
||||
self.t += dt
|
||||
|
||||
def stepRK(self, dt, order):
|
||||
if (order != 2):
|
||||
raise NotImplementedError("Only second order implemented")
|
||||
self.substepRK(dt, 0)
|
||||
self.substepRK(dt, 1)
|
||||
self.t += dt
|
||||
|
||||
def download(self):
|
||||
return self.data.download(self.stream)
|
||||
|
||||
119
GPUSimulators/KP07_dimsplit.py
Normal file
119
GPUSimulators/KP07_dimsplit.py
Normal file
@@ -0,0 +1,119 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
This python module implements the Kurganov-Petrova numerical scheme
|
||||
for the shallow water equations, described in
|
||||
A. Kurganov & Guergana Petrova
|
||||
A Second-Order Well-Balanced Positivity Preserving Central-Upwind
|
||||
Scheme for the Saint-Venant System Communications in Mathematical
|
||||
Sciences, 5 (2007), 133-160.
|
||||
|
||||
Copyright (C) 2016 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
"""
|
||||
|
||||
#Import packages we need
|
||||
import numpy as np
|
||||
from GPUSimulators import Simulator
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
"""
|
||||
Class that solves the SW equations using the dimentionally split KP07 scheme
|
||||
"""
|
||||
class KP07_dimsplit (Simulator.BaseSimulator):
|
||||
|
||||
"""
|
||||
Initialization routine
|
||||
h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
|
||||
hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
|
||||
hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
|
||||
nx: Number of cells along x-axis
|
||||
ny: Number of cells along y-axis
|
||||
dx: Grid cell spacing along x-axis (20 000 m)
|
||||
dy: Grid cell spacing along y-axis (20 000 m)
|
||||
dt: Size of each timestep (90 s)
|
||||
g: Gravitational accelleration (9.81 m/s^2)
|
||||
"""
|
||||
def __init__(self, \
|
||||
context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
theta=1.3, \
|
||||
block_width=16, block_height=16):
|
||||
|
||||
# Call super constructor
|
||||
super().__init__(context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
2, 2, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width, block_height);
|
||||
|
||||
self.theta = np.float32(theta)
|
||||
|
||||
#Get kernels
|
||||
self.kernel = context.get_prepared_kernel("KP07_dimsplit_kernel.cu", "KP07DimsplitKernel", \
|
||||
"iifffffiPiPiPiPiPiPi", \
|
||||
BLOCK_WIDTH=self.local_size[0], \
|
||||
BLOCK_HEIGHT=self.local_size[1])
|
||||
|
||||
def __str__(self):
|
||||
return "Kurganov-Petrova 2007 dimensionally split"
|
||||
|
||||
def simulate(self, t_end):
|
||||
return super().simulateDimsplit(t_end)
|
||||
|
||||
def stepEuler(self, dt):
|
||||
return self.stepDimsplitXY(dt)
|
||||
|
||||
def stepDimsplitXY(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
self.theta, \
|
||||
np.int32(0), \
|
||||
self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
|
||||
self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
|
||||
self.data.swap()
|
||||
self.t += dt
|
||||
|
||||
def stepDimsplitYX(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
self.theta, \
|
||||
np.int32(1), \
|
||||
self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
|
||||
self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
|
||||
self.data.swap()
|
||||
self.t += dt
|
||||
|
||||
|
||||
225
GPUSimulators/KP07_dimsplit_kernel.cu
Normal file
225
GPUSimulators/KP07_dimsplit_kernel.cu
Normal file
@@ -0,0 +1,225 @@
|
||||
/*
|
||||
This OpenCL kernel implements the Kurganov-Petrova numerical scheme
|
||||
for the shallow water equations, described in
|
||||
A. Kurganov & Guergana Petrova
|
||||
A Second-Order Well-Balanced Positivity Preserving Central-Upwind
|
||||
Scheme for the Saint-Venant System Communications in Mathematical
|
||||
Sciences, 5 (2007), 133-160.
|
||||
|
||||
Copyright (C) 2016 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include "common.cu"
|
||||
#include "limiters.cu"
|
||||
#include "fluxes/CentralUpwind.cu"
|
||||
|
||||
|
||||
__device__
|
||||
void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dx_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
{
|
||||
int j=ty;
|
||||
const int l = j + 2; //Skip ghost cells
|
||||
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
|
||||
const int k = i + 1;
|
||||
// Reconstruct point values of Q at the left and right hand side
|
||||
// of the cell for both the left (i) and right (i+1) cell
|
||||
const float3 Q_rl = make_float3(Q[0][l][k+1] - 0.5f*Qx[0][j][i+1],
|
||||
Q[1][l][k+1] - 0.5f*Qx[1][j][i+1],
|
||||
Q[2][l][k+1] - 0.5f*Qx[2][j][i+1]);
|
||||
const float3 Q_rr = make_float3(Q[0][l][k+1] + 0.5f*Qx[0][j][i+1],
|
||||
Q[1][l][k+1] + 0.5f*Qx[1][j][i+1],
|
||||
Q[2][l][k+1] + 0.5f*Qx[2][j][i+1]);
|
||||
|
||||
const float3 Q_ll = make_float3(Q[0][l][k] - 0.5f*Qx[0][j][i],
|
||||
Q[1][l][k] - 0.5f*Qx[1][j][i],
|
||||
Q[2][l][k] - 0.5f*Qx[2][j][i]);
|
||||
const float3 Q_lr = make_float3(Q[0][l][k] + 0.5f*Qx[0][j][i],
|
||||
Q[1][l][k] + 0.5f*Qx[1][j][i],
|
||||
Q[2][l][k] + 0.5f*Qx[2][j][i]);
|
||||
|
||||
//Evolve half a timestep (predictor step)
|
||||
const float3 Q_r_bar = Q_rl + dt_/(2.0f*dx_) * (F_func(Q_rl, g_) - F_func(Q_rr, g_));
|
||||
const float3 Q_l_bar = Q_lr + dt_/(2.0f*dx_) * (F_func(Q_ll, g_) - F_func(Q_lr, g_));
|
||||
|
||||
// Compute flux based on prediction
|
||||
const float3 flux = CentralUpwindFlux(Q_l_bar, Q_r_bar, g_);
|
||||
|
||||
//Write to shared memory
|
||||
F[0][j][i] = flux.x;
|
||||
F[1][j][i] = flux.y;
|
||||
F[2][j][i] = flux.z;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__device__
|
||||
void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
const int l = j + 1;
|
||||
{
|
||||
int i=tx;
|
||||
const int k = i + 2; //Skip ghost cells
|
||||
// Reconstruct point values of Q at the left and right hand side
|
||||
// of the cell for both the left (i) and right (i+1) cell
|
||||
//NOte that hu and hv are swapped ("transposing" the domain)!
|
||||
const float3 Q_rl = make_float3(Q[0][l+1][k] - 0.5f*Qy[0][j+1][i],
|
||||
Q[2][l+1][k] - 0.5f*Qy[2][j+1][i],
|
||||
Q[1][l+1][k] - 0.5f*Qy[1][j+1][i]);
|
||||
const float3 Q_rr = make_float3(Q[0][l+1][k] + 0.5f*Qy[0][j+1][i],
|
||||
Q[2][l+1][k] + 0.5f*Qy[2][j+1][i],
|
||||
Q[1][l+1][k] + 0.5f*Qy[1][j+1][i]);
|
||||
|
||||
const float3 Q_ll = make_float3(Q[0][l][k] - 0.5f*Qy[0][j][i],
|
||||
Q[2][l][k] - 0.5f*Qy[2][j][i],
|
||||
Q[1][l][k] - 0.5f*Qy[1][j][i]);
|
||||
const float3 Q_lr = make_float3(Q[0][l][k] + 0.5f*Qy[0][j][i],
|
||||
Q[2][l][k] + 0.5f*Qy[2][j][i],
|
||||
Q[1][l][k] + 0.5f*Qy[1][j][i]);
|
||||
|
||||
//Evolve half a timestep (predictor step)
|
||||
const float3 Q_r_bar = Q_rl + dt_/(2.0f*dy_) * (F_func(Q_rl, g_) - F_func(Q_rr, g_));
|
||||
const float3 Q_l_bar = Q_lr + dt_/(2.0f*dy_) * (F_func(Q_ll, g_) - F_func(Q_lr, g_));
|
||||
|
||||
// Compute flux based on prediction
|
||||
const float3 flux = CentralUpwindFlux(Q_l_bar, Q_r_bar, g_);
|
||||
|
||||
//Write to shared memory
|
||||
//Note that we here swap hu and hv back to the original
|
||||
G[0][j][i] = flux.x;
|
||||
G[1][j][i] = flux.z;
|
||||
G[2][j][i] = flux.y;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* This unsplit kernel computes the 2D numerical scheme with a TVD RK2 time integration scheme
|
||||
*/
|
||||
extern "C" {
|
||||
__global__ void KP07DimsplitKernel(
|
||||
int nx_, int ny_,
|
||||
float dx_, float dy_, float dt_,
|
||||
float g_,
|
||||
|
||||
float theta_,
|
||||
|
||||
int step_,
|
||||
|
||||
//Input h^n
|
||||
float* h0_ptr_, int h0_pitch_,
|
||||
float* hu0_ptr_, int hu0_pitch_,
|
||||
float* hv0_ptr_, int hv0_pitch_,
|
||||
|
||||
//Output h^{n+1}
|
||||
float* h1_ptr_, int h1_pitch_,
|
||||
float* hu1_ptr_, int hu1_pitch_,
|
||||
float* hv1_ptr_, int hv1_pitch_) {
|
||||
|
||||
|
||||
//Shared memory variables
|
||||
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
|
||||
__shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
|
||||
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
|
||||
|
||||
|
||||
|
||||
//Read into shared memory
|
||||
readBlock2(h0_ptr_, h0_pitch_,
|
||||
hu0_ptr_, hu0_pitch_,
|
||||
hv0_ptr_, hv0_pitch_,
|
||||
Q, nx_, ny_);
|
||||
__syncthreads();
|
||||
|
||||
|
||||
//Fix boundary conditions
|
||||
noFlowBoundary2(Q, nx_, ny_);
|
||||
__syncthreads();
|
||||
|
||||
|
||||
|
||||
//Step 0 => evolve x first, then y
|
||||
if (step_ == 0) {
|
||||
//Compute fluxes along the x axis and evolve
|
||||
minmodSlopeX(Q, Qx, theta_);
|
||||
__syncthreads();
|
||||
computeFluxF(Q, Qx, F, g_, dx_, dt_);
|
||||
__syncthreads();
|
||||
evolveF2(Q, F, nx_, ny_, dx_, dt_);
|
||||
__syncthreads();
|
||||
|
||||
//Set boundary conditions
|
||||
noFlowBoundary2(Q, nx_, ny_);
|
||||
__syncthreads();
|
||||
|
||||
//Compute fluxes along the y axis and evolve
|
||||
minmodSlopeY(Q, Qx, theta_);
|
||||
__syncthreads();
|
||||
computeFluxG(Q, Qx, F, g_, dy_, dt_);
|
||||
__syncthreads();
|
||||
evolveG2(Q, F, nx_, ny_, dy_, dt_);
|
||||
__syncthreads();
|
||||
}
|
||||
//Step 1 => evolve y first, then x
|
||||
else {
|
||||
//Compute fluxes along the y axis and evolve
|
||||
minmodSlopeY(Q, Qx, theta_);
|
||||
__syncthreads();
|
||||
computeFluxG(Q, Qx, F, g_, dy_, dt_);
|
||||
__syncthreads();
|
||||
evolveG2(Q, F, nx_, ny_, dy_, dt_);
|
||||
__syncthreads();
|
||||
|
||||
//Set boundary conditions
|
||||
noFlowBoundary2(Q, nx_, ny_);
|
||||
__syncthreads();
|
||||
|
||||
//Compute fluxes along the x axis and evolve
|
||||
minmodSlopeX(Q, Qx, theta_);
|
||||
__syncthreads();
|
||||
computeFluxF(Q, Qx, F, g_, dx_, dt_);
|
||||
__syncthreads();
|
||||
evolveF2(Q, F, nx_, ny_, dx_, dt_);
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
|
||||
// Write to main memory for all internal cells
|
||||
writeBlock2(h1_ptr_, h1_pitch_,
|
||||
hu1_ptr_, hu1_pitch_,
|
||||
hv1_ptr_, hv1_pitch_,
|
||||
Q, nx_, ny_);
|
||||
}
|
||||
|
||||
} // extern "C"
|
||||
210
GPUSimulators/KP07_kernel.cu
Normal file
210
GPUSimulators/KP07_kernel.cu
Normal file
@@ -0,0 +1,210 @@
|
||||
/*
|
||||
This OpenCL kernel implements the Kurganov-Petrova numerical scheme
|
||||
for the shallow water equations, described in
|
||||
A. Kurganov & Guergana Petrova
|
||||
A Second-Order Well-Balanced Positivity Preserving Central-Upwind
|
||||
Scheme for the Saint-Venant System Communications in Mathematical
|
||||
Sciences, 5 (2007), 133-160.
|
||||
|
||||
Copyright (C) 2016 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include "common.cu"
|
||||
#include "limiters.cu"
|
||||
#include "fluxes/CentralUpwind.cu"
|
||||
|
||||
|
||||
__device__
|
||||
void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
{
|
||||
int j=ty;
|
||||
const int l = j + 2; //Skip ghost cells
|
||||
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
|
||||
const int k = i + 1;
|
||||
// Q at interface from the right and left
|
||||
const float3 Qp = make_float3(Q[0][l][k+1] - 0.5f*Qx[0][j][i+1],
|
||||
Q[1][l][k+1] - 0.5f*Qx[1][j][i+1],
|
||||
Q[2][l][k+1] - 0.5f*Qx[2][j][i+1]);
|
||||
const float3 Qm = make_float3(Q[0][l][k ] + 0.5f*Qx[0][j][i ],
|
||||
Q[1][l][k ] + 0.5f*Qx[1][j][i ],
|
||||
Q[2][l][k ] + 0.5f*Qx[2][j][i ]);
|
||||
|
||||
// Computed flux
|
||||
const float3 flux = CentralUpwindFlux(Qm, Qp, g_);
|
||||
F[0][j][i] = flux.x;
|
||||
F[1][j][i] = flux.y;
|
||||
F[2][j][i] = flux.z;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__device__
|
||||
void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
const int l = j + 1;
|
||||
{
|
||||
int i=tx;
|
||||
const int k = i + 2; //Skip ghost cells
|
||||
// Q at interface from the right and left
|
||||
// Note that we swap hu and hv
|
||||
const float3 Qp = make_float3(Q[0][l+1][k] - 0.5f*Qy[0][j+1][i],
|
||||
Q[2][l+1][k] - 0.5f*Qy[2][j+1][i],
|
||||
Q[1][l+1][k] - 0.5f*Qy[1][j+1][i]);
|
||||
const float3 Qm = make_float3(Q[0][l ][k] + 0.5f*Qy[0][j ][i],
|
||||
Q[2][l ][k] + 0.5f*Qy[2][j ][i],
|
||||
Q[1][l ][k] + 0.5f*Qy[1][j ][i]);
|
||||
|
||||
// Computed flux
|
||||
// Note that we swap back
|
||||
const float3 flux = CentralUpwindFlux(Qm, Qp, g_);
|
||||
G[0][j][i] = flux.x;
|
||||
G[1][j][i] = flux.z;
|
||||
G[2][j][i] = flux.y;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* This unsplit kernel computes the 2D numerical scheme with a TVD RK2 time integration scheme
|
||||
*/
|
||||
extern "C" {
|
||||
__global__ void KP07Kernel(
|
||||
int nx_, int ny_,
|
||||
float dx_, float dy_, float dt_,
|
||||
float g_,
|
||||
|
||||
float theta_,
|
||||
|
||||
int step_,
|
||||
|
||||
//Input h^n
|
||||
float* h0_ptr_, int h0_pitch_,
|
||||
float* hu0_ptr_, int hu0_pitch_,
|
||||
float* hv0_ptr_, int hv0_pitch_,
|
||||
|
||||
//Output h^{n+1}
|
||||
float* h1_ptr_, int h1_pitch_,
|
||||
float* hu1_ptr_, int hu1_pitch_,
|
||||
float* hv1_ptr_, int hv1_pitch_) {
|
||||
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
//Index of cell within domain
|
||||
const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
|
||||
const int tj = get_global_id(1) + 2;
|
||||
|
||||
//Shared memory variables
|
||||
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
|
||||
|
||||
//The following slightly wastes memory, but enables us to reuse the
|
||||
//funcitons in common.opencl
|
||||
__shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
|
||||
__shared__ float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
|
||||
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
|
||||
__shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
|
||||
|
||||
|
||||
|
||||
//Read into shared memory
|
||||
readBlock2(h0_ptr_, h0_pitch_,
|
||||
hu0_ptr_, hu0_pitch_,
|
||||
hv0_ptr_, hv0_pitch_,
|
||||
Q, nx_, ny_);
|
||||
__syncthreads();
|
||||
|
||||
|
||||
//Fix boundary conditions
|
||||
noFlowBoundary2(Q, nx_, ny_);
|
||||
__syncthreads();
|
||||
|
||||
|
||||
//Reconstruct slopes along x and axis
|
||||
minmodSlopeX(Q, Qx, theta_);
|
||||
minmodSlopeY(Q, Qy, theta_);
|
||||
__syncthreads();
|
||||
|
||||
|
||||
//Compute fluxes along the x and y axis
|
||||
computeFluxF(Q, Qx, F, g_);
|
||||
computeFluxG(Q, Qy, G, g_);
|
||||
__syncthreads();
|
||||
|
||||
|
||||
//Sum fluxes and advance in time for all internal cells
|
||||
if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) {
|
||||
const int i = tx + 2; //Skip local ghost cells, i.e., +2
|
||||
const int j = ty + 2;
|
||||
|
||||
const float h1 = Q[0][j][i] + (F[0][ty][tx] - F[0][ty ][tx+1]) * dt_ / dx_
|
||||
+ (G[0][ty][tx] - G[0][ty+1][tx ]) * dt_ / dy_;
|
||||
const float hu1 = Q[1][j][i] + (F[1][ty][tx] - F[1][ty ][tx+1]) * dt_ / dx_
|
||||
+ (G[1][ty][tx] - G[1][ty+1][tx ]) * dt_ / dy_;
|
||||
const float hv1 = Q[2][j][i] + (F[2][ty][tx] - F[2][ty ][tx+1]) * dt_ / dx_
|
||||
+ (G[2][ty][tx] - G[2][ty+1][tx ]) * dt_ / dy_;
|
||||
|
||||
float* const h_row = (float*) ((char*) h1_ptr_ + h1_pitch_*tj);
|
||||
float* const hu_row = (float*) ((char*) hu1_ptr_ + hu1_pitch_*tj);
|
||||
float* const hv_row = (float*) ((char*) hv1_ptr_ + hv1_pitch_*tj);
|
||||
|
||||
if (step_ == 0) {
|
||||
//First step of RK2 ODE integrator
|
||||
|
||||
h_row[ti] = h1;
|
||||
hu_row[ti] = hu1;
|
||||
hv_row[ti] = hv1;
|
||||
}
|
||||
else if (step_ == 1) {
|
||||
//Second step of RK2 ODE integrator
|
||||
|
||||
//First read Q^n
|
||||
const float h_a = h_row[ti];
|
||||
const float hu_a = hu_row[ti];
|
||||
const float hv_a = hv_row[ti];
|
||||
|
||||
//Compute Q^n+1
|
||||
const float h_b = 0.5f*(h_a + h1);
|
||||
const float hu_b = 0.5f*(hu_a + hu1);
|
||||
const float hv_b = 0.5f*(hv_a + hv1);
|
||||
|
||||
//Write to main memory
|
||||
h_row[ti] = h_b;
|
||||
hu_row[ti] = hu_b;
|
||||
hv_row[ti] = hv_b;
|
||||
}
|
||||
}
|
||||
}
|
||||
} //extern "C"
|
||||
92
GPUSimulators/LxF.py
Normal file
92
GPUSimulators/LxF.py
Normal file
@@ -0,0 +1,92 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
This python module implements the classical Lax-Friedrichs numerical
|
||||
scheme for the shallow water equations
|
||||
|
||||
Copyright (C) 2016 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
"""
|
||||
|
||||
#Import packages we need
|
||||
from GPUSimulators import Simulator
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
"""
|
||||
Class that solves the SW equations using the Lax Friedrichs scheme
|
||||
"""
|
||||
class LxF (Simulator.BaseSimulator):
|
||||
|
||||
"""
|
||||
Initialization routine
|
||||
h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
|
||||
hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
|
||||
hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
|
||||
nx: Number of cells along x-axis
|
||||
ny: Number of cells along y-axis
|
||||
dx: Grid cell spacing along x-axis (20 000 m)
|
||||
dy: Grid cell spacing along y-axis (20 000 m)
|
||||
dt: Size of each timestep (90 s)
|
||||
g: Gravitational accelleration (9.81 m/s^2)
|
||||
"""
|
||||
def __init__(self, \
|
||||
context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width=16, block_height=16):
|
||||
|
||||
# Call super constructor
|
||||
super().__init__(context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
1, 1, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width, block_height);
|
||||
|
||||
# Get kernels
|
||||
self.kernel = context.get_prepared_kernel("LxF_kernel.cu", "LxFKernel", \
|
||||
"iiffffPiPiPiPiPiPi", \
|
||||
BLOCK_WIDTH=self.local_size[0], \
|
||||
BLOCK_HEIGHT=self.local_size[1])
|
||||
|
||||
def __str__(self):
|
||||
return "Lax Friedrichs"
|
||||
|
||||
def simulate(self, t_end):
|
||||
return super().simulateEuler(t_end)
|
||||
|
||||
def stepEuler(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
|
||||
self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
|
||||
self.data.swap()
|
||||
self.t += dt
|
||||
|
||||
|
||||
173
GPUSimulators/LxF_kernel.cu
Normal file
173
GPUSimulators/LxF_kernel.cu
Normal file
@@ -0,0 +1,173 @@
|
||||
/*
|
||||
This OpenCL kernel implements the classical Lax-Friedrichs scheme
|
||||
for the shallow water equations, with edge fluxes.
|
||||
|
||||
Copyright (C) 2016 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
#include "common.cu"
|
||||
#include "fluxes/LaxFriedrichs.cu"
|
||||
|
||||
|
||||
/**
|
||||
* Computes the flux along the x axis for all faces
|
||||
*/
|
||||
template <int block_width, int block_height>
|
||||
__device__
|
||||
void computeFluxF(float Q[3][block_height+2][block_width+2],
|
||||
float F[3][block_height][block_width+1],
|
||||
const float g_, const float dx_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
{
|
||||
const int j=ty;
|
||||
const int l = j + 1; //Skip ghost cells
|
||||
for (int i=tx; i<block_width+1; i+=block_width) {
|
||||
const int k = i;
|
||||
|
||||
// Q at interface from the right and left
|
||||
const float3 Qp = make_float3(Q[0][l][k+1],
|
||||
Q[1][l][k+1],
|
||||
Q[2][l][k+1]);
|
||||
const float3 Qm = make_float3(Q[0][l][k],
|
||||
Q[1][l][k],
|
||||
Q[2][l][k]);
|
||||
|
||||
// Computed flux
|
||||
const float3 flux = LxF_2D_flux(Qm, Qp, g_, dx_, dt_);
|
||||
F[0][j][i] = flux.x;
|
||||
F[1][j][i] = flux.y;
|
||||
F[2][j][i] = flux.z;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Computes the flux along the y axis for all faces
|
||||
*/
|
||||
template <int block_width, int block_height>
|
||||
__device__
|
||||
void computeFluxG(float Q[3][block_height+2][block_width+2],
|
||||
float G[3][block_height+1][block_width],
|
||||
const float g_, const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
for (int j=ty; j<block_height+1; j+=block_height) {
|
||||
const int l = j;
|
||||
{
|
||||
const int i=tx;
|
||||
const int k = i + 1; //Skip ghost cells
|
||||
|
||||
// Q at interface from the right and left
|
||||
// Note that we swap hu and hv
|
||||
const float3 Qp = make_float3(Q[0][l+1][k],
|
||||
Q[2][l+1][k],
|
||||
Q[1][l+1][k]);
|
||||
const float3 Qm = make_float3(Q[0][l][k],
|
||||
Q[2][l][k],
|
||||
Q[1][l][k]);
|
||||
|
||||
// Computed flux
|
||||
// Note that we swap back
|
||||
const float3 flux = LxF_2D_flux(Qm, Qp, g_, dy_, dt_);
|
||||
G[0][j][i] = flux.x;
|
||||
G[1][j][i] = flux.z;
|
||||
G[2][j][i] = flux.y;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
extern "C" {
|
||||
__global__
|
||||
void LxFKernel(
|
||||
int nx_, int ny_,
|
||||
float dx_, float dy_, float dt_,
|
||||
float g_,
|
||||
|
||||
//Input h^n
|
||||
float* h0_ptr_, int h0_pitch_,
|
||||
float* hu0_ptr_, int hu0_pitch_,
|
||||
float* hv0_ptr_, int hv0_pitch_,
|
||||
|
||||
//Output h^{n+1}
|
||||
float* h1_ptr_, int h1_pitch_,
|
||||
float* hu1_ptr_, int hu1_pitch_,
|
||||
float* hv1_ptr_, int hv1_pitch_) {
|
||||
|
||||
const int block_width = BLOCK_WIDTH;
|
||||
const int block_height = BLOCK_HEIGHT;
|
||||
|
||||
//Index of cell within domain
|
||||
const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
|
||||
const int tj = get_global_id(1) + 1;
|
||||
|
||||
__shared__ float Q[3][block_height+2][block_width+2];
|
||||
__shared__ float F[3][block_height][block_width+1];
|
||||
__shared__ float G[3][block_height+1][block_width];
|
||||
|
||||
//Read into shared memory
|
||||
readBlock1(h0_ptr_, h0_pitch_,
|
||||
hu0_ptr_, hu0_pitch_,
|
||||
hv0_ptr_, hv0_pitch_,
|
||||
Q, nx_, ny_);
|
||||
__syncthreads();
|
||||
|
||||
//Set boundary conditions
|
||||
noFlowBoundary1(Q, nx_, ny_);
|
||||
__syncthreads();
|
||||
|
||||
|
||||
//Compute fluxes along the x and y axis
|
||||
computeFluxF<block_width, block_height>(Q, F, g_, dx_, dt_);
|
||||
computeFluxG<block_width, block_height>(Q, G, g_, dy_, dt_);
|
||||
__syncthreads();
|
||||
|
||||
|
||||
//Evolve for all internal cells
|
||||
if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
const int i = tx + 1; //Skip local ghost cells, i.e., +1
|
||||
const int j = ty + 1;
|
||||
|
||||
const float h1 = Q[0][j][i] + (F[0][ty][tx] - F[0][ty ][tx+1]) * dt_ / dx_
|
||||
+ (G[0][ty][tx] - G[0][ty+1][tx ]) * dt_ / dy_;
|
||||
const float hu1 = Q[1][j][i] + (F[1][ty][tx] - F[1][ty ][tx+1]) * dt_ / dx_
|
||||
+ (G[1][ty][tx] - G[1][ty+1][tx ]) * dt_ / dy_;
|
||||
const float hv1 = Q[2][j][i] + (F[2][ty][tx] - F[2][ty ][tx+1]) * dt_ / dx_
|
||||
+ (G[2][ty][tx] - G[2][ty+1][tx ]) * dt_ / dy_;
|
||||
|
||||
float* const h_row = (float*) ((char*) h1_ptr_ + h1_pitch_*tj);
|
||||
float* const hu_row = (float*) ((char*) hu1_ptr_ + hu1_pitch_*tj);
|
||||
float* const hv_row = (float*) ((char*) hv1_ptr_ + hv1_pitch_*tj);
|
||||
|
||||
h_row[ti] = h1;
|
||||
hu_row[ti] = hu1;
|
||||
hv_row[ti] = hv1;
|
||||
}
|
||||
}
|
||||
|
||||
} // extern "C"
|
||||
|
||||
197
GPUSimulators/Simulator.py
Normal file
197
GPUSimulators/Simulator.py
Normal file
@@ -0,0 +1,197 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
This python module implements the classical Lax-Friedrichs numerical
|
||||
scheme for the shallow water equations
|
||||
|
||||
Copyright (C) 2016 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
"""
|
||||
|
||||
#Import packages we need
|
||||
import numpy as np
|
||||
import logging
|
||||
|
||||
import pycuda.compiler as cuda_compiler
|
||||
import pycuda.gpuarray
|
||||
import pycuda.driver as cuda
|
||||
|
||||
from GPUSimulators import Common
|
||||
|
||||
|
||||
class BaseSimulator:
|
||||
"""
|
||||
Initialization routine
|
||||
context: GPU context to use
|
||||
kernel_wrapper: wrapper function of GPU kernel
|
||||
h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
|
||||
hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
|
||||
hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
|
||||
nx: Number of cells along x-axis
|
||||
ny: Number of cells along y-axis
|
||||
dx: Grid cell spacing along x-axis (20 000 m)
|
||||
dy: Grid cell spacing along y-axis (20 000 m)
|
||||
dt: Size of each timestep (90 s)
|
||||
g: Gravitational accelleration (9.81 m/s^2)
|
||||
"""
|
||||
def __init__(self, \
|
||||
context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
ghost_cells_x, ghost_cells_y, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width, block_height):
|
||||
#Get logger
|
||||
self.logger = logging.getLogger(__name__ + "." + self.__class__.__name__)
|
||||
|
||||
self.context = context
|
||||
|
||||
if (self.context.autotuner):
|
||||
peak_configuration = self.context.autotuner.get_peak_performance(self.__class__)
|
||||
block_width = int(peak_configuration["block_width"])
|
||||
block_height = int(peak_configuration["block_height"])
|
||||
self.logger.debug("Used autotuning to get block size [%d x %d]", block_width, block_height)
|
||||
|
||||
#Create a CUDA stream
|
||||
self.stream = cuda.Stream()
|
||||
|
||||
#Create data by uploading to device
|
||||
self.data = Common.SWEDataArakawaA(self.stream, \
|
||||
nx, ny, \
|
||||
ghost_cells_x, ghost_cells_y, \
|
||||
h0, hu0, hv0)
|
||||
|
||||
#Save input parameters
|
||||
#Notice that we need to specify them in the correct dataformat for the
|
||||
#GPU kernel
|
||||
self.nx = np.int32(nx)
|
||||
self.ny = np.int32(ny)
|
||||
self.dx = np.float32(dx)
|
||||
self.dy = np.float32(dy)
|
||||
self.dt = np.float32(dt)
|
||||
self.g = np.float32(g)
|
||||
|
||||
#Keep track of simulation time
|
||||
self.t = 0.0;
|
||||
|
||||
#Compute kernel launch parameters
|
||||
self.local_size = (block_width, block_height, 1)
|
||||
self.global_size = ( \
|
||||
int(np.ceil(self.nx / float(self.local_size[0]))), \
|
||||
int(np.ceil(self.ny / float(self.local_size[1]))) \
|
||||
)
|
||||
|
||||
"""
|
||||
Function which simulates forward in time using the default simulation type
|
||||
"""
|
||||
def simulate(self, t_end):
|
||||
raise(exceptions.NotImplementedError("Needs to be implemented in subclass"))
|
||||
|
||||
"""
|
||||
Function which simulates t_end seconds using forward Euler
|
||||
Requires that the stepEuler functionality is implemented in the subclasses
|
||||
"""
|
||||
def simulateEuler(self, t_end):
|
||||
with Common.Timer(self.__class__.__name__ + ".simulateEuler") as t:
|
||||
# Compute number of timesteps to perform
|
||||
n = int(t_end / self.dt + 1)
|
||||
|
||||
for i in range(0, n):
|
||||
# Compute timestep for "this" iteration
|
||||
local_dt = np.float32(min(self.dt, t_end-i*self.dt))
|
||||
|
||||
# Stop if end reached (should not happen)
|
||||
if (local_dt <= 0.0):
|
||||
break
|
||||
|
||||
# Step with forward Euler
|
||||
self.stepEuler(local_dt)
|
||||
|
||||
self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds", self.__class__.__name__, t_end, self.t, n, t.secs)
|
||||
return self.t, n
|
||||
|
||||
"""
|
||||
Function which simulates t_end seconds using Runge-Kutta 2
|
||||
Requires that the stepRK functionality is implemented in the subclasses
|
||||
"""
|
||||
def simulateRK(self, t_end, order):
|
||||
with Common.Timer(self.__class__.__name__ + ".simulateRK") as t:
|
||||
# Compute number of timesteps to perform
|
||||
n = int(t_end / self.dt + 1)
|
||||
|
||||
for i in range(0, n):
|
||||
# Compute timestep for "this" iteration
|
||||
local_dt = np.float32(min(self.dt, t_end-i*self.dt))
|
||||
|
||||
# Stop if end reached (should not happen)
|
||||
if (local_dt <= 0.0):
|
||||
break
|
||||
|
||||
# Perform all the Runge-Kutta substeps
|
||||
self.stepRK(local_dt, order)
|
||||
|
||||
self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds", self.__class__.__name__, t_end, self.t, n, t.secs)
|
||||
return self.t, n
|
||||
|
||||
"""
|
||||
Function which simulates t_end seconds using second order dimensional splitting (XYYX)
|
||||
Requires that the stepDimsplitX and stepDimsplitY functionality is implemented in the subclasses
|
||||
"""
|
||||
def simulateDimsplit(self, t_end):
|
||||
with Common.Timer(self.__class__.__name__ + ".simulateDimsplit") as t:
|
||||
# Compute number of timesteps to perform
|
||||
n = int(t_end / (2.0*self.dt) + 1)
|
||||
|
||||
for i in range(0, n):
|
||||
# Compute timestep for "this" iteration
|
||||
local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt))
|
||||
|
||||
# Stop if end reached (should not happen)
|
||||
if (local_dt <= 0.0):
|
||||
break
|
||||
|
||||
# Perform the dimensional split substeps
|
||||
self.stepDimsplitXY(local_dt)
|
||||
self.stepDimsplitYX(local_dt)
|
||||
|
||||
self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds", self.__class__.__name__, t_end, self.t, 2*n, t.secs)
|
||||
return self.t, 2*n
|
||||
|
||||
|
||||
"""
|
||||
Function which performs one single timestep of size dt using forward euler
|
||||
"""
|
||||
def stepEuler(self, dt):
|
||||
raise(NotImplementedError("Needs to be implemented in subclass"))
|
||||
|
||||
def stepRK(self, dt, substep):
|
||||
raise(NotImplementedError("Needs to be implemented in subclass"))
|
||||
|
||||
def stepDimsplitXY(self, dt):
|
||||
raise(NotImplementedError("Needs to be implemented in subclass"))
|
||||
|
||||
def stepDimsplitYX(self, dt):
|
||||
raise(NotImplementedError("Needs to be implemented in subclass"))
|
||||
|
||||
def sim_time(self):
|
||||
return self.t
|
||||
|
||||
def download(self):
|
||||
return self.data.download(self.stream)
|
||||
|
||||
def synchronize(self):
|
||||
self.stream.synchronize()
|
||||
|
||||
108
GPUSimulators/WAF.py
Normal file
108
GPUSimulators/WAF.py
Normal file
@@ -0,0 +1,108 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
This python module implements the Weighted average flux (WAF) described in
|
||||
E. Toro, Shock-Capturing methods for free-surface shallow flows, 2001
|
||||
|
||||
Copyright (C) 2016 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
"""
|
||||
|
||||
#Import packages we need
|
||||
import numpy as np
|
||||
from GPUSimulators import Simulator
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
"""
|
||||
Class that solves the SW equations using the Forward-Backward linear scheme
|
||||
"""
|
||||
class WAF (Simulator.BaseSimulator):
|
||||
|
||||
"""
|
||||
Initialization routine
|
||||
h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
|
||||
hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
|
||||
hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
|
||||
nx: Number of cells along x-axis
|
||||
ny: Number of cells along y-axis
|
||||
dx: Grid cell spacing along x-axis (20 000 m)
|
||||
dy: Grid cell spacing along y-axis (20 000 m)
|
||||
dt: Size of each timestep (90 s)
|
||||
g: Gravitational accelleration (9.81 m/s^2)
|
||||
"""
|
||||
def __init__(self, \
|
||||
context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width=16, block_height=16):
|
||||
|
||||
# Call super constructor
|
||||
super().__init__(context, \
|
||||
h0, hu0, hv0, \
|
||||
nx, ny, \
|
||||
2, 2, \
|
||||
dx, dy, dt, \
|
||||
g, \
|
||||
block_width, block_height);
|
||||
|
||||
#Get kernels
|
||||
self.kernel = context.get_prepared_kernel("WAF_kernel.cu", "WAFKernel", \
|
||||
"iiffffiPiPiPiPiPiPi", \
|
||||
BLOCK_WIDTH=self.local_size[0], \
|
||||
BLOCK_HEIGHT=self.local_size[1])
|
||||
|
||||
def __str__(self):
|
||||
return "Weighted average flux"
|
||||
|
||||
def simulate(self, t_end):
|
||||
return super().simulateDimsplit(t_end)
|
||||
|
||||
def stepEuler(self, dt):
|
||||
return self.stepDimsplitXY(dt)
|
||||
|
||||
def stepDimsplitXY(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
np.int32(0), \
|
||||
self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
|
||||
self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
|
||||
self.data.swap()
|
||||
self.t += dt
|
||||
|
||||
def stepDimsplitYX(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
np.int32(1), \
|
||||
self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
|
||||
self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
|
||||
self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
|
||||
self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
|
||||
self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
|
||||
self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
|
||||
self.data.swap()
|
||||
self.t += dt
|
||||
199
GPUSimulators/WAF_kernel.cu
Normal file
199
GPUSimulators/WAF_kernel.cu
Normal file
@@ -0,0 +1,199 @@
|
||||
/*
|
||||
This OpenCL kernel implements the Kurganov-Petrova numerical scheme
|
||||
for the shallow water equations, described in
|
||||
A. Kurganov & Guergana Petrova
|
||||
A Second-Order Well-Balanced Positivity Preserving Central-Upwind
|
||||
Scheme for the Saint-Venant System Communications in Mathematical
|
||||
Sciences, 5 (2007), 133-160.
|
||||
|
||||
Copyright (C) 2016 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include "common.cu"
|
||||
#include "fluxes/WeightedAverageFlux.cu"
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Computes the flux along the x axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dx_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
{
|
||||
int j=ty;
|
||||
const int l = j + 2; //Skip ghost cells
|
||||
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
|
||||
const int k = i + 1;
|
||||
|
||||
// Q at interface from the right and left
|
||||
const float3 Ql2 = make_float3(Q[0][l][k-1], Q[1][l][k-1], Q[2][l][k-1]);
|
||||
const float3 Ql1 = make_float3(Q[0][l][k ], Q[1][l][k ], Q[2][l][k ]);
|
||||
const float3 Qr1 = make_float3(Q[0][l][k+1], Q[1][l][k+1], Q[2][l][k+1]);
|
||||
const float3 Qr2 = make_float3(Q[0][l][k+2], Q[1][l][k+2], Q[2][l][k+2]);
|
||||
|
||||
// Computed flux
|
||||
const float3 flux = WAF_1D_flux(Ql2, Ql1, Qr1, Qr2, g_, dx_, dt_);
|
||||
F[0][j][i] = flux.x;
|
||||
F[1][j][i] = flux.y;
|
||||
F[2][j][i] = flux.z;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Computes the flux along the y axis for all faces
|
||||
*/
|
||||
__device__
|
||||
void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const float g_, const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
//Compute fluxes along the y axis
|
||||
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
|
||||
const int l = j + 1;
|
||||
{
|
||||
int i=tx;
|
||||
const int k = i + 2; //Skip ghost cells
|
||||
// Q at interface from the right and left
|
||||
// Note that we swap hu and hv
|
||||
const float3 Ql2 = make_float3(Q[0][l-1][k], Q[2][l-1][k], Q[1][l-1][k]);
|
||||
const float3 Ql1 = make_float3(Q[0][l ][k], Q[2][l ][k], Q[1][l ][k]);
|
||||
const float3 Qr1 = make_float3(Q[0][l+1][k], Q[2][l+1][k], Q[1][l+1][k]);
|
||||
const float3 Qr2 = make_float3(Q[0][l+2][k], Q[2][l+2][k], Q[1][l+2][k]);
|
||||
|
||||
// Computed flux
|
||||
// Note that we swap back
|
||||
const float3 flux = WAF_1D_flux(Ql2, Ql1, Qr1, Qr2, g_, dy_, dt_);
|
||||
G[0][j][i] = flux.x;
|
||||
G[1][j][i] = flux.z;
|
||||
G[2][j][i] = flux.y;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
extern "C" {
|
||||
__global__ void WAFKernel(
|
||||
int nx_, int ny_,
|
||||
float dx_, float dy_, float dt_,
|
||||
float g_, int step_,
|
||||
|
||||
//Input h^n
|
||||
float* h0_ptr_, int h0_pitch_,
|
||||
float* hu0_ptr_, int hu0_pitch_,
|
||||
float* hv0_ptr_, int hv0_pitch_,
|
||||
|
||||
//Output h^{n+1}
|
||||
float* h1_ptr_, int h1_pitch_,
|
||||
float* hu1_ptr_, int hu1_pitch_,
|
||||
float* hv1_ptr_, int hv1_pitch_) {
|
||||
//Shared memory variables
|
||||
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
|
||||
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
|
||||
|
||||
|
||||
|
||||
//Read into shared memory Q from global memory
|
||||
readBlock2(h0_ptr_, h0_pitch_,
|
||||
hu0_ptr_, hu0_pitch_,
|
||||
hv0_ptr_, hv0_pitch_,
|
||||
Q, nx_, ny_);
|
||||
__syncthreads();
|
||||
|
||||
|
||||
//Set boundary conditions
|
||||
noFlowBoundary2(Q, nx_, ny_);
|
||||
__syncthreads();
|
||||
|
||||
|
||||
|
||||
//Step 0 => evolve x first, then y
|
||||
if (step_ == 0) {
|
||||
//Compute fluxes along the x axis and evolve
|
||||
computeFluxF(Q, F, g_, dx_, dt_);
|
||||
__syncthreads();
|
||||
evolveF2(Q, F, nx_, ny_, dx_, dt_);
|
||||
__syncthreads();
|
||||
|
||||
//Fix boundary conditions
|
||||
noFlowBoundary2(Q, nx_, ny_);
|
||||
__syncthreads();
|
||||
|
||||
//Compute fluxes along the y axis and evolve
|
||||
computeFluxG(Q, F, g_, dy_, dt_);
|
||||
__syncthreads();
|
||||
evolveG2(Q, F, nx_, ny_, dy_, dt_);
|
||||
__syncthreads();
|
||||
}
|
||||
//Step 1 => evolve y first, then x
|
||||
else {
|
||||
//Compute fluxes along the y axis and evolve
|
||||
computeFluxG(Q, F, g_, dy_, dt_);
|
||||
__syncthreads();
|
||||
evolveG2(Q, F, nx_, ny_, dy_, dt_);
|
||||
__syncthreads();
|
||||
|
||||
//Fix boundary conditions
|
||||
noFlowBoundary2(Q, nx_, ny_);
|
||||
__syncthreads();
|
||||
|
||||
//Compute fluxes along the x axis and evolve
|
||||
computeFluxF(Q, F, g_, dx_, dt_);
|
||||
__syncthreads();
|
||||
evolveF2(Q, F, nx_, ny_, dx_, dt_);
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Write to main memory for all internal cells
|
||||
writeBlock2(h1_ptr_, h1_pitch_,
|
||||
hu1_ptr_, hu1_pitch_,
|
||||
hv1_ptr_, hv1_pitch_,
|
||||
Q, nx_, ny_);
|
||||
}
|
||||
|
||||
} // extern "C"
|
||||
5
GPUSimulators/__init__.py
Normal file
5
GPUSimulators/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
#!/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
|
||||
# Nothing general to do
|
||||
500
GPUSimulators/common.cu
Normal file
500
GPUSimulators/common.cu
Normal file
@@ -0,0 +1,500 @@
|
||||
/*
|
||||
This OpenCL kernel implements the Kurganov-Petrova numerical scheme
|
||||
for the shallow water equations, described in
|
||||
A. Kurganov & Guergana Petrova
|
||||
A Second-Order Well-Balanced Positivity Preserving Central-Upwind
|
||||
Scheme for the Saint-Venant System Communications in Mathematical
|
||||
Sciences, 5 (2007), 133-160.
|
||||
|
||||
Copyright (C) 2016 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* Location of thread in block
|
||||
*/
|
||||
inline __device__ int get_local_id(int dim) {
|
||||
switch(dim) {
|
||||
case 0: return threadIdx.x;
|
||||
case 1: return threadIdx.y;
|
||||
case 2: return threadIdx.z;
|
||||
default: return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get block index
|
||||
*/
|
||||
__device__ int get_group_id(int dim) {
|
||||
switch(dim) {
|
||||
case 0: return blockIdx.x;
|
||||
case 1: return blockIdx.y;
|
||||
case 2: return blockIdx.z;
|
||||
default: return -1;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Location of thread in global domain
|
||||
*/
|
||||
__device__ int get_global_id(int dim) {
|
||||
switch(dim) {
|
||||
case 0: return blockDim.x*blockIdx.x + threadIdx.x;
|
||||
case 1: return blockDim.y*blockIdx.y + threadIdx.y;
|
||||
case 2: return blockDim.z*blockIdx.z + threadIdx.z;
|
||||
default: return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Float3 operators
|
||||
*/
|
||||
inline __device__ float3 operator*(const float a, const float3 b) {
|
||||
return make_float3(a*b.x, a*b.y, a*b.z);
|
||||
}
|
||||
|
||||
inline __device__ float3 operator/(const float3 a, const float b) {
|
||||
return make_float3(a.x/b, a.y/b, a.z/b);
|
||||
}
|
||||
|
||||
inline __device__ float3 operator-(const float3 a, const float3 b) {
|
||||
return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
|
||||
}
|
||||
|
||||
inline __device__ float3 operator+(const float3 a, const float3 b) {
|
||||
return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
|
||||
}
|
||||
|
||||
inline __device__ __host__ float clamp(const float f, const float a, const float b) {
|
||||
return fmaxf(a, fminf(f, b));
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Reads a block of data with one ghost cell for the shallow water equations
|
||||
*/
|
||||
__device__ void readBlock1(float* h_ptr_, int h_pitch_,
|
||||
float* hu_ptr_, int hu_pitch_,
|
||||
float* hv_ptr_, int hv_pitch_,
|
||||
float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
const int nx_, const int ny_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
//Index of block within domain
|
||||
const int bx = BLOCK_WIDTH * get_group_id(0);
|
||||
const int by = BLOCK_HEIGHT * get_group_id(1);
|
||||
|
||||
//Read into shared memory
|
||||
for (int j=ty; j<BLOCK_HEIGHT+2; j+=BLOCK_HEIGHT) {
|
||||
const int l = clamp(by + j, 0, ny_+1); // Out of bounds
|
||||
|
||||
//Compute the pointer to current row in the arrays
|
||||
float* const h_row = (float*) ((char*) h_ptr_ + h_pitch_*l);
|
||||
float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*l);
|
||||
float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*l);
|
||||
|
||||
for (int i=tx; i<BLOCK_WIDTH+2; i+=BLOCK_WIDTH) {
|
||||
const int k = clamp(bx + i, 0, nx_+1); // Out of bounds
|
||||
|
||||
Q[0][j][i] = h_row[k];
|
||||
Q[1][j][i] = hu_row[k];
|
||||
Q[2][j][i] = hv_row[k];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Reads a block of data with two ghost cells for the shallow water equations
|
||||
*/
|
||||
__device__ void readBlock2(float* h_ptr_, int h_pitch_,
|
||||
float* hu_ptr_, int hu_pitch_,
|
||||
float* hv_ptr_, int hv_pitch_,
|
||||
float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
const int nx_, const int ny_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
//Index of block within domain
|
||||
const int bx = BLOCK_WIDTH * get_group_id(0);
|
||||
const int by = BLOCK_HEIGHT * get_group_id(1);
|
||||
|
||||
//Read into shared memory
|
||||
for (int j=ty; j<BLOCK_HEIGHT+4; j+=BLOCK_HEIGHT) {
|
||||
const int l = clamp(by + j, 0, ny_+3); // Out of bounds
|
||||
|
||||
//Compute the pointer to current row in the arrays
|
||||
float* const h_row = (float*) ((char*) h_ptr_ + h_pitch_*l);
|
||||
float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*l);
|
||||
float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*l);
|
||||
|
||||
for (int i=tx; i<BLOCK_WIDTH+4; i+=BLOCK_WIDTH) {
|
||||
const int k = clamp(bx + i, 0, nx_+3); // Out of bounds
|
||||
|
||||
Q[0][j][i] = h_row[k];
|
||||
Q[1][j][i] = hu_row[k];
|
||||
Q[2][j][i] = hv_row[k];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Writes a block of data to global memory for the shallow water equations.
|
||||
*/
|
||||
__device__ void writeBlock1(float* h_ptr_, int h_pitch_,
|
||||
float* hu_ptr_, int hu_pitch_,
|
||||
float* hv_ptr_, int hv_pitch_,
|
||||
float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
const int nx_, const int ny_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
//Index of cell within domain
|
||||
const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
|
||||
const int tj = get_global_id(1) + 1;
|
||||
|
||||
//Only write internal cells
|
||||
if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) {
|
||||
const int i = tx + 1; //Skip local ghost cells, i.e., +1
|
||||
const int j = ty + 1;
|
||||
|
||||
float* const h_row = (float*) ((char*) h_ptr_ + h_pitch_*tj);
|
||||
float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*tj);
|
||||
float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*tj);
|
||||
|
||||
h_row[ti] = Q[0][j][i];
|
||||
hu_row[ti] = Q[1][j][i];
|
||||
hv_row[ti] = Q[2][j][i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Writes a block of data to global memory for the shallow water equations.
|
||||
*/
|
||||
__device__ void writeBlock2(float* h_ptr_, int h_pitch_,
|
||||
float* hu_ptr_, int hu_pitch_,
|
||||
float* hv_ptr_, int hv_pitch_,
|
||||
float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
const int nx_, const int ny_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
//Index of cell within domain
|
||||
const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
|
||||
const int tj = get_global_id(1) + 2;
|
||||
|
||||
//Only write internal cells
|
||||
if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) {
|
||||
const int i = tx + 2; //Skip local ghost cells, i.e., +2
|
||||
const int j = ty + 2;
|
||||
|
||||
float* const h_row = (float*) ((char*) h_ptr_ + h_pitch_*tj);
|
||||
float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*tj);
|
||||
float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*tj);
|
||||
|
||||
h_row[ti] = Q[0][j][i];
|
||||
hu_row[ti] = Q[1][j][i];
|
||||
hv_row[ti] = Q[2][j][i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* No flow boundary conditions for the shallow water equations
|
||||
* with one ghost cell in each direction
|
||||
*/
|
||||
__device__ void noFlowBoundary1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const int nx_, const int ny_) {
|
||||
//Global index
|
||||
const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
|
||||
const int tj = get_global_id(1) + 1;
|
||||
|
||||
//Block-local indices
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
const int i = tx + 1; //Skip local ghost cells, i.e., +1
|
||||
const int j = ty + 1;
|
||||
|
||||
//Fix boundary conditions
|
||||
if (ti == 1) {
|
||||
Q[0][j][i-1] = Q[0][j][i];
|
||||
Q[1][j][i-1] = -Q[1][j][i];
|
||||
Q[2][j][i-1] = Q[2][j][i];
|
||||
}
|
||||
if (ti == nx_) {
|
||||
Q[0][j][i+1] = Q[0][j][i];
|
||||
Q[1][j][i+1] = -Q[1][j][i];
|
||||
Q[2][j][i+1] = Q[2][j][i];
|
||||
}
|
||||
if (tj == 1) {
|
||||
Q[0][j-1][i] = Q[0][j][i];
|
||||
Q[1][j-1][i] = Q[1][j][i];
|
||||
Q[2][j-1][i] = -Q[2][j][i];
|
||||
}
|
||||
if (tj == ny_) {
|
||||
Q[0][j+1][i] = Q[0][j][i];
|
||||
Q[1][j+1][i] = Q[1][j][i];
|
||||
Q[2][j+1][i] = -Q[2][j][i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* No flow boundary conditions for the shallow water equations
|
||||
* with two ghost cells in each direction
|
||||
*/
|
||||
__device__ void noFlowBoundary2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const int nx_, const int ny_) {
|
||||
//Global index
|
||||
const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
|
||||
const int tj = get_global_id(1) + 2;
|
||||
|
||||
//Block-local indices
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
const int i = tx + 2; //Skip local ghost cells, i.e., +2
|
||||
const int j = ty + 2;
|
||||
|
||||
if (ti == 2) {
|
||||
Q[0][j][i-1] = Q[0][j][i];
|
||||
Q[1][j][i-1] = -Q[1][j][i];
|
||||
Q[2][j][i-1] = Q[2][j][i];
|
||||
|
||||
Q[0][j][i-2] = Q[0][j][i+1];
|
||||
Q[1][j][i-2] = -Q[1][j][i+1];
|
||||
Q[2][j][i-2] = Q[2][j][i+1];
|
||||
}
|
||||
if (ti == nx_+1) {
|
||||
Q[0][j][i+1] = Q[0][j][i];
|
||||
Q[1][j][i+1] = -Q[1][j][i];
|
||||
Q[2][j][i+1] = Q[2][j][i];
|
||||
|
||||
Q[0][j][i+2] = Q[0][j][i-1];
|
||||
Q[1][j][i+2] = -Q[1][j][i-1];
|
||||
Q[2][j][i+2] = Q[2][j][i-1];
|
||||
}
|
||||
if (tj == 2) {
|
||||
Q[0][j-1][i] = Q[0][j][i];
|
||||
Q[1][j-1][i] = Q[1][j][i];
|
||||
Q[2][j-1][i] = -Q[2][j][i];
|
||||
|
||||
Q[0][j-2][i] = Q[0][j+1][i];
|
||||
Q[1][j-2][i] = Q[1][j+1][i];
|
||||
Q[2][j-2][i] = -Q[2][j+1][i];
|
||||
}
|
||||
if (tj == ny_+1) {
|
||||
Q[0][j+1][i] = Q[0][j][i];
|
||||
Q[1][j+1][i] = Q[1][j][i];
|
||||
Q[2][j+1][i] = -Q[2][j][i];
|
||||
|
||||
Q[0][j+2][i] = Q[0][j-1][i];
|
||||
Q[1][j+2][i] = Q[1][j-1][i];
|
||||
Q[2][j+2][i] = -Q[2][j-1][i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Evolves the solution in time along the x axis (dimensional splitting)
|
||||
*/
|
||||
__device__ void evolveF1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const int nx_, const int ny_,
|
||||
const float dx_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
//Index of cell within domain
|
||||
const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
|
||||
const int tj = get_global_id(1) + 1;
|
||||
|
||||
if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) {
|
||||
const int i = tx + 1; //Skip local ghost cells, i.e., +1
|
||||
const int j = ty + 1;
|
||||
|
||||
Q[0][j][i] = Q[0][j][i] + (F[0][ty][tx] - F[0][ty][tx+1]) * dt_ / dx_;
|
||||
Q[1][j][i] = Q[1][j][i] + (F[1][ty][tx] - F[1][ty][tx+1]) * dt_ / dx_;
|
||||
Q[2][j][i] = Q[2][j][i] + (F[2][ty][tx] - F[2][ty][tx+1]) * dt_ / dx_;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Evolves the solution in time along the x axis (dimensional splitting)
|
||||
*/
|
||||
__device__ void evolveF2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const int nx_, const int ny_,
|
||||
const float dx_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
//Index of cell within domain
|
||||
const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
|
||||
const int tj = get_global_id(1) + 2;
|
||||
|
||||
if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) {
|
||||
const int i = tx + 2; //Skip local ghost cells, i.e., +1
|
||||
const int j = ty + 2;
|
||||
|
||||
Q[0][j][i] = Q[0][j][i] + (F[0][ty][tx] - F[0][ty][tx+1]) * dt_ / dx_;
|
||||
Q[1][j][i] = Q[1][j][i] + (F[1][ty][tx] - F[1][ty][tx+1]) * dt_ / dx_;
|
||||
Q[2][j][i] = Q[2][j][i] + (F[2][ty][tx] - F[2][ty][tx+1]) * dt_ / dx_;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Evolves the solution in time along the y axis (dimensional splitting)
|
||||
*/
|
||||
__device__ void evolveG1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const int nx_, const int ny_,
|
||||
const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
//Index of cell within domain
|
||||
const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
|
||||
const int tj = get_global_id(1) + 1;
|
||||
|
||||
if (ti > 0 && ti < nx_+1 && tj > 0 && tj < ny_+1) {
|
||||
const int i = tx + 1; //Skip local ghost cells, i.e., +1
|
||||
const int j = ty + 1;
|
||||
|
||||
Q[0][j][i] = Q[0][j][i] + (G[0][ty][tx] - G[0][ty+1][tx]) * dt_ / dy_;
|
||||
Q[1][j][i] = Q[1][j][i] + (G[1][ty][tx] - G[1][ty+1][tx]) * dt_ / dy_;
|
||||
Q[2][j][i] = Q[2][j][i] + (G[2][ty][tx] - G[2][ty+1][tx]) * dt_ / dy_;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Evolves the solution in time along the y axis (dimensional splitting)
|
||||
*/
|
||||
__device__ void evolveG2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
|
||||
const int nx_, const int ny_,
|
||||
const float dy_, const float dt_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
//Index of cell within domain
|
||||
const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
|
||||
const int tj = get_global_id(1) + 2;
|
||||
|
||||
if (ti > 1 && ti < nx_+2 && tj > 1 && tj < ny_+2) {
|
||||
const int i = tx + 2; //Skip local ghost cells, i.e., +2
|
||||
const int j = ty + 2;
|
||||
|
||||
Q[0][j][i] = Q[0][j][i] + (G[0][ty][tx] - G[0][ty+1][tx]) * dt_ / dy_;
|
||||
Q[1][j][i] = Q[1][j][i] + (G[1][ty][tx] - G[1][ty+1][tx]) * dt_ / dy_;
|
||||
Q[2][j][i] = Q[2][j][i] + (G[2][ty][tx] - G[2][ty+1][tx]) * dt_ / dy_;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
__device__ float3 F_func(const float3 Q, const float g) {
|
||||
float3 F;
|
||||
|
||||
F.x = Q.y; //hu
|
||||
F.y = Q.y*Q.y / Q.x + 0.5f*g*Q.x*Q.x; //hu*hu/h + 0.5f*g*h*h;
|
||||
F.z = Q.y*Q.z / Q.x; //hu*hv/h;
|
||||
|
||||
return F;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
39
GPUSimulators/fluxes/CentralUpwind.cu
Normal file
39
GPUSimulators/fluxes/CentralUpwind.cu
Normal file
@@ -0,0 +1,39 @@
|
||||
/*
|
||||
This file implements the Central upwind flux
|
||||
|
||||
Copyright (C) 2016, 2017, 2018 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Central upwind flux function
|
||||
*/
|
||||
__device__ float3 CentralUpwindFlux(const float3 Qm, float3 Qp, const float g) {
|
||||
const float3 Fp = F_func(Qp, g);
|
||||
const float up = Qp.y / Qp.x; // hu / h
|
||||
const float cp = sqrt(g*Qp.x); // sqrt(g*h)
|
||||
|
||||
const float3 Fm = F_func(Qm, g);
|
||||
const float um = Qm.y / Qm.x; // hu / h
|
||||
const float cm = sqrt(g*Qm.x); // sqrt(g*h)
|
||||
|
||||
const float am = min(min(um-cm, up-cp), 0.0f); // largest negative wave speed
|
||||
const float ap = max(max(um+cm, up+cp), 0.0f); // largest positive wave speed
|
||||
|
||||
return ((ap*Fm - am*Fp) + ap*am*(Qp-Qm))/(ap-am);
|
||||
}
|
||||
|
||||
33
GPUSimulators/fluxes/FirstOrderCentered.cu
Normal file
33
GPUSimulators/fluxes/FirstOrderCentered.cu
Normal file
@@ -0,0 +1,33 @@
|
||||
/*
|
||||
This file implements the First ORder CEntered flux
|
||||
|
||||
Copyright (C) 2016, 2017, 2018 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "./LaxFriedrichs.cu"
|
||||
#include "./LaxWendroff.cu"
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* First Ordered Centered (Toro 2001, p.163)
|
||||
*/
|
||||
__device__ float3 FORCE_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
|
||||
const float3 F_lf = LxF_1D_flux(Q_l, Q_r, g_, dx_, dt_);
|
||||
const float3 F_lw2 = LxW2_1D_flux(Q_l, Q_r, g_, dx_, dt_);
|
||||
return 0.5f*(F_lf + F_lw2);
|
||||
}
|
||||
36
GPUSimulators/fluxes/Godunov.cu
Normal file
36
GPUSimulators/fluxes/Godunov.cu
Normal file
@@ -0,0 +1,36 @@
|
||||
/*
|
||||
This file implements the Godunov flux
|
||||
|
||||
Copyright (C) 2016, 2017, 2018 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Godunovs centered scheme (Toro 2001, p 165)
|
||||
*/
|
||||
__device__ float3 GodC_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
|
||||
const float3 F_l = F_func(Q_l, g_);
|
||||
const float3 F_r = F_func(Q_r, g_);
|
||||
|
||||
const float3 Q_godc = 0.5f*(Q_l + Q_r) + (dt_/dx_)*(F_l - F_r);
|
||||
|
||||
return F_func(Q_godc, g_);
|
||||
}
|
||||
|
||||
|
||||
|
||||
63
GPUSimulators/fluxes/HartenLaxVanLeer.cu
Normal file
63
GPUSimulators/fluxes/HartenLaxVanLeer.cu
Normal file
@@ -0,0 +1,63 @@
|
||||
/*
|
||||
This file implements the Harten-Lax-van Leer flux
|
||||
|
||||
Copyright (C) 2016, 2017, 2018 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 180)
|
||||
*/
|
||||
__device__ float3 HLL_flux(const float3 Q_l, const float3 Q_r, const float g_) {
|
||||
const float h_l = Q_l.x;
|
||||
const float h_r = Q_r.x;
|
||||
|
||||
// Calculate velocities
|
||||
const float u_l = Q_l.y / h_l;
|
||||
const float u_r = Q_r.y / h_r;
|
||||
|
||||
// Estimate the potential wave speeds
|
||||
const float c_l = sqrt(g_*h_l);
|
||||
const float c_r = sqrt(g_*h_r);
|
||||
|
||||
// Compute h in the "star region", h^dagger
|
||||
const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
|
||||
|
||||
const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) );
|
||||
const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) );
|
||||
|
||||
const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
|
||||
const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
|
||||
|
||||
// Compute wave speed estimates
|
||||
const float S_l = u_l - c_l*q_l;
|
||||
const float S_r = u_r + c_r*q_r;
|
||||
|
||||
//Upwind selection
|
||||
if (S_l >= 0.0f) {
|
||||
return F_func(Q_l, g_);
|
||||
}
|
||||
else if (S_r <= 0.0f) {
|
||||
return F_func(Q_r, g_);
|
||||
}
|
||||
//Or estimate flux in the star region
|
||||
else {
|
||||
const float3 F_l = F_func(Q_l, g_);
|
||||
const float3 F_r = F_func(Q_r, g_);
|
||||
const float3 flux = (S_r*F_l - S_l*F_r + S_r*S_l*(Q_r - Q_l)) / (S_r-S_l);
|
||||
return flux;
|
||||
}
|
||||
}
|
||||
80
GPUSimulators/fluxes/HartenLaxVanLeerContact.cu
Normal file
80
GPUSimulators/fluxes/HartenLaxVanLeerContact.cu
Normal file
@@ -0,0 +1,80 @@
|
||||
/*
|
||||
This file implements the Harten-Lax-van Leer flux with contact discontinuity
|
||||
|
||||
Copyright (C) 2016, 2017, 2018 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 181)
|
||||
*/
|
||||
__device__ float3 HLLC_flux(const float3 Q_l, const float3 Q_r, const float g_) {
|
||||
const float h_l = Q_l.x;
|
||||
const float h_r = Q_r.x;
|
||||
|
||||
// Calculate velocities
|
||||
const float u_l = Q_l.y / h_l;
|
||||
const float u_r = Q_r.y / h_r;
|
||||
|
||||
// Estimate the potential wave speeds
|
||||
const float c_l = sqrt(g_*h_l);
|
||||
const float c_r = sqrt(g_*h_r);
|
||||
|
||||
// Compute h in the "star region", h^dagger
|
||||
const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
|
||||
|
||||
const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) );
|
||||
const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) );
|
||||
|
||||
const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
|
||||
const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
|
||||
|
||||
// Compute wave speed estimates
|
||||
const float S_l = u_l - c_l*q_l;
|
||||
const float S_r = u_r + c_r*q_r;
|
||||
const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) );
|
||||
|
||||
const float3 F_l = F_func(Q_l, g_);
|
||||
const float3 F_r = F_func(Q_r, g_);
|
||||
|
||||
//Upwind selection
|
||||
if (S_l >= 0.0f) {
|
||||
return F_l;
|
||||
}
|
||||
else if (S_r <= 0.0f) {
|
||||
return F_r;
|
||||
}
|
||||
//Or estimate flux in the "left star" region
|
||||
else if (S_l <= 0.0f && 0.0f <=S_star) {
|
||||
const float v_l = Q_l.z / h_l;
|
||||
const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1, S_star, v_l);
|
||||
const float3 flux = F_l + S_l*(Q_star_l - Q_l);
|
||||
return flux;
|
||||
}
|
||||
//Or estimate flux in the "righ star" region
|
||||
else if (S_star <= 0.0f && 0.0f <=S_r) {
|
||||
const float v_r = Q_r.z / h_r;
|
||||
const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1, S_star, v_r);
|
||||
const float3 flux = F_r + S_r*(Q_star_r - Q_r);
|
||||
return flux;
|
||||
}
|
||||
else {
|
||||
return make_float3(-99999.9f, -99999.9f, -99999.9f); //Something wrong here
|
||||
}
|
||||
}
|
||||
|
||||
24
GPUSimulators/fluxes/LaxFriedrichs.cu
Normal file
24
GPUSimulators/fluxes/LaxFriedrichs.cu
Normal file
@@ -0,0 +1,24 @@
|
||||
|
||||
|
||||
/**
|
||||
* Lax-Friedrichs flux (Toro 2001, p 163)
|
||||
*/
|
||||
__device__ float3 LxF_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
|
||||
const float3 F_l = F_func(Q_l, g_);
|
||||
const float3 F_r = F_func(Q_r, g_);
|
||||
|
||||
return 0.5f*(F_l + F_r) + (dx_/(2.0f*dt_))*(Q_l - Q_r);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Lax-Friedrichs extended to 2D
|
||||
*/
|
||||
__device__ float3 LxF_2D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
|
||||
const float3 F_l = F_func(Q_l, g_);
|
||||
const float3 F_r = F_func(Q_r, g_);
|
||||
|
||||
//Note numerical diffusion for 2D here (0.25)
|
||||
return 0.5f*(F_l + F_r) + (dx_/(4.0f*dt_))*(Q_l - Q_r);
|
||||
}
|
||||
33
GPUSimulators/fluxes/LaxWendroff.cu
Normal file
33
GPUSimulators/fluxes/LaxWendroff.cu
Normal file
@@ -0,0 +1,33 @@
|
||||
/*
|
||||
This file implements the Lax-Wendroff flux
|
||||
|
||||
Copyright (C) 2016, 2017, 2018 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Richtmeyer / Two-step Lax-Wendroff flux (Toro 2001, p 164)
|
||||
*/
|
||||
__device__ float3 LxW2_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
|
||||
const float3 F_l = F_func(Q_l, g_);
|
||||
const float3 F_r = F_func(Q_r, g_);
|
||||
|
||||
const float3 Q_lw2 = 0.5f*(Q_l + Q_r) + (dt_/(2.0f*dx_))*(F_l - F_r);
|
||||
|
||||
return F_func(Q_lw2, g_);
|
||||
}
|
||||
|
||||
187
GPUSimulators/fluxes/WeightedAverageFlux.cu
Normal file
187
GPUSimulators/fluxes/WeightedAverageFlux.cu
Normal file
@@ -0,0 +1,187 @@
|
||||
/*
|
||||
This file implements the Weighted Average Flux
|
||||
|
||||
Copyright (C) 2016, 2017, 2018 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "limiters.cu"
|
||||
|
||||
/**
|
||||
* Superbee flux limiter for WAF.
|
||||
* Related to superbee limiter so that WAF_superbee(r, c) = 1 - (1-|c|)*superbee(r)
|
||||
* @param r_ the ratio of upwind change (see Toro 2001, p. 203/204)
|
||||
* @param c_ the courant number for wave k, dt*S_k/dx
|
||||
*/
|
||||
__device__ float WAF_superbee(float r_, float c_) {
|
||||
// r <= 0.0
|
||||
if (r_ <= 0.0f) {
|
||||
return 1.0f;
|
||||
}
|
||||
// 0.0 <= r <= 1/2
|
||||
else if (r_ <= 0.5f) {
|
||||
return 1.0f - 2.0f*(1.0f - fabsf(c_))*r_;
|
||||
}
|
||||
// 1/2 <= r <= 1
|
||||
else if (r_ <= 1.0f) {
|
||||
return fabs(c_);
|
||||
}
|
||||
// 1 <= r <= 2
|
||||
else if (r_ <= 2.0f) {
|
||||
return 1.0f - (1.0f - fabsf(c_))*r_;
|
||||
}
|
||||
// r >= 2
|
||||
else {
|
||||
return 2.0f*fabsf(c_) - 1.0f;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
__device__ float WAF_albada(float r_, float c_) {
|
||||
if (r_ <= 0.0f) {
|
||||
return 1.0f;
|
||||
}
|
||||
else {
|
||||
return 1.0f - (1.0f - fabsf(c_)) * r_ * (1.0f + r_) / (1.0f + r_*r_);
|
||||
}
|
||||
}
|
||||
|
||||
__device__ float WAF_minbee(float r_, float c_) {
|
||||
r_ = fmaxf(-1.0f, fminf(2.0f, r_));
|
||||
if (r_ <= 0.0f) {
|
||||
return 1.0f;
|
||||
}
|
||||
if (r_ >= 0.0f && r_ <= 1.0f) {
|
||||
return 1.0f - (1.0f - fabsf(c_)) * r_;
|
||||
}
|
||||
else {
|
||||
return fabsf(c_);
|
||||
}
|
||||
}
|
||||
|
||||
__device__ float WAF_minmod(float r_, float c_) {
|
||||
return 1.0f - (1.0f - fabsf(c_)) * fmaxf(0.0f, fminf(1.0f, r_));
|
||||
}
|
||||
|
||||
|
||||
|
||||
__device__ float limiterToWAFLimiter(float r_, float c_) {
|
||||
return 1.0f - (1.0f - fabsf(c_))*r_;
|
||||
}
|
||||
|
||||
__device__ float desingularize(float x_, float eps_) {
|
||||
return copysign(1.0f, x_)*fmaxf(fabsf(x_), fminf(x_*x_/(2.0f*eps_)+0.5f*eps_, eps_));
|
||||
}
|
||||
|
||||
// Compute h in the "star region", h^dagger
|
||||
__device__ __inline__ float computeHStar(float h_l, float h_r, float u_l, float u_r, float c_l, float c_r, float g_) {
|
||||
|
||||
//This estimate for the h* gives rise to spurious oscillations.
|
||||
//return 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
|
||||
|
||||
const float h_tmp = 0.5f * (c_l + c_r) + 0.25f * (u_l - u_r);
|
||||
return h_tmp*h_tmp / g_;
|
||||
}
|
||||
|
||||
/**
|
||||
* Weighted average flux (Toro 2001, p 200) for interface {i+1/2}
|
||||
* @param r_ The flux limiter parameter (see Toro 2001, p. 203)
|
||||
* @param Q_l2 Q_{i-1}
|
||||
* @param Q_l1 Q_{i}
|
||||
* @param Q_r1 Q_{i+1}
|
||||
* @param Q_r2 Q_{i+2}
|
||||
*/
|
||||
__device__ float3 WAF_1D_flux(const float3 Q_l2, const float3 Q_l1, const float3 Q_r1, const float3 Q_r2, const float g_, const float dx_, const float dt_) {
|
||||
const float h_l = Q_l1.x;
|
||||
const float h_r = Q_r1.x;
|
||||
|
||||
const float h_l2 = Q_l2.x;
|
||||
const float h_r2 = Q_r2.x;
|
||||
|
||||
// Calculate velocities
|
||||
const float u_l = Q_l1.y / h_l;
|
||||
const float u_r = Q_r1.y / h_r;
|
||||
|
||||
const float u_l2 = Q_l2.y / h_l2;
|
||||
const float u_r2 = Q_r2.y / h_r2;
|
||||
|
||||
const float v_l = Q_l1.z / h_l;
|
||||
const float v_r = Q_r1.z / h_r;
|
||||
|
||||
const float v_l2 = Q_l2.z / h_l2;
|
||||
const float v_r2 = Q_r2.z / h_r2;
|
||||
|
||||
// Estimate the potential wave speeds
|
||||
const float c_l = sqrt(g_*h_l);
|
||||
const float c_r = sqrt(g_*h_r);
|
||||
|
||||
const float c_l2 = sqrt(g_*h_l2);
|
||||
const float c_r2 = sqrt(g_*h_r2);
|
||||
|
||||
// Compute h in the "star region", h^dagger
|
||||
const float h_dag_l = computeHStar(h_l2, h_l, u_l2, u_l, c_l2, c_l, g_);
|
||||
const float h_dag = computeHStar( h_l, h_r, u_l, u_r, c_l, c_r, g_);
|
||||
const float h_dag_r = computeHStar( h_r, h_r2, u_r, u_r2, c_r, c_r2, g_);
|
||||
|
||||
const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag ) ) / h_l;
|
||||
const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag ) ) / h_r;
|
||||
|
||||
const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
|
||||
const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
|
||||
|
||||
// Compute wave speed estimates
|
||||
const float S_l = u_l - c_l*q_l;
|
||||
const float S_r = u_r + c_r*q_r;
|
||||
const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) );
|
||||
|
||||
const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1.0, S_star, v_l);
|
||||
const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1.0, S_star, v_r);
|
||||
|
||||
// Estimate the fluxes in the four regions
|
||||
const float3 F_1 = F_func(Q_l1, g_);
|
||||
const float3 F_4 = F_func(Q_r1, g_);
|
||||
|
||||
const float3 F_2 = F_1 + S_l*(Q_star_l - Q_l1);
|
||||
const float3 F_3 = F_4 + S_r*(Q_star_r - Q_r1);
|
||||
//const float3 F_2 = F_func(Q_star_l, g_);
|
||||
//const float3 F_3 = F_func(Q_star_r, g_);
|
||||
|
||||
// Compute the courant numbers for the waves
|
||||
const float c_1 = S_l * dt_ / dx_;
|
||||
const float c_2 = S_star * dt_ / dx_;
|
||||
const float c_3 = S_r * dt_ / dx_;
|
||||
|
||||
// Compute the "upwind change" vectors for the i-3/2 and i+3/2 interfaces
|
||||
const float eps = 1.0e-6f;
|
||||
const float r_1 = desingularize( (c_1 > 0.0f) ? (h_dag_l - h_l2) : (h_dag_r - h_r), eps) / desingularize((h_dag - h_l), eps);
|
||||
const float r_2 = desingularize( (c_2 > 0.0f) ? (v_l - v_l2) : (v_r2 - v_r), eps ) / desingularize((v_r - v_l), eps);
|
||||
const float r_3 = desingularize( (c_3 > 0.0f) ? (h_l - h_dag_l) : (h_r2 - h_dag_r), eps ) / desingularize((h_r - h_dag), eps);
|
||||
|
||||
// Compute the limiter
|
||||
// We use h for the nonlinear waves, and v for the middle shear wave
|
||||
const float A_1 = copysign(1.0f, c_1) * limiterToWAFLimiter(generalized_minmod(r_1, 1.9f), c_1);
|
||||
const float A_2 = copysign(1.0f, c_2) * limiterToWAFLimiter(generalized_minmod(r_2, 1.9f), c_2);
|
||||
const float A_3 = copysign(1.0f, c_3) * limiterToWAFLimiter(generalized_minmod(r_3, 1.9f), c_3);
|
||||
|
||||
//Average the fluxes
|
||||
const float3 flux = 0.5f*( F_1 + F_4 )
|
||||
- 0.5f*( A_1 * (F_2 - F_1)
|
||||
+ A_2 * (F_3 - F_2)
|
||||
+ A_3 * (F_4 - F_3) );
|
||||
|
||||
return flux;
|
||||
}
|
||||
129
GPUSimulators/limiters.cu
Normal file
129
GPUSimulators/limiters.cu
Normal file
@@ -0,0 +1,129 @@
|
||||
/*
|
||||
This file implements different flux and slope limiters
|
||||
|
||||
Copyright (C) 2016, 2017, 2018 SINTEF ICT
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Reconstructs a slope using the generalized minmod limiter based on three
|
||||
* consecutive values
|
||||
*/
|
||||
__device__ __inline__ float minmodSlope(float left, float center, float right, float theta) {
|
||||
const float backward = (center - left) * theta;
|
||||
const float central = (right - left) * 0.5f;
|
||||
const float forward = (right - center) * theta;
|
||||
|
||||
return 0.25f
|
||||
*copysign(1.0f, backward)
|
||||
*(copysign(1.0f, backward) + copysign(1.0f, central))
|
||||
*(copysign(1.0f, central) + copysign(1.0f, forward))
|
||||
*min( min(fabs(backward), fabs(central)), fabs(forward) );
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Reconstructs a minmod slope for a whole block along the abscissa
|
||||
*/
|
||||
__device__ void minmodSlopeX(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
const float theta_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
//Reconstruct slopes along x axis
|
||||
{
|
||||
const int j = ty;
|
||||
const int l = j + 2; //Skip ghost cells
|
||||
for (int i=tx; i<BLOCK_WIDTH+2; i+=BLOCK_WIDTH) {
|
||||
const int k = i + 1;
|
||||
for (int p=0; p<3; ++p) {
|
||||
Qx[p][j][i] = minmodSlope(Q[p][l][k-1], Q[p][l][k], Q[p][l][k+1], theta_);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Reconstructs a minmod slope for a whole block along the ordinate
|
||||
*/
|
||||
__device__ void minmodSlopeY(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
|
||||
float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
|
||||
const float theta_) {
|
||||
//Index of thread within block
|
||||
const int tx = get_local_id(0);
|
||||
const int ty = get_local_id(1);
|
||||
|
||||
for (int j=ty; j<BLOCK_HEIGHT+2; j+=BLOCK_HEIGHT) {
|
||||
const int l = j + 1;
|
||||
{
|
||||
const int i = tx;
|
||||
const int k = i + 2; //Skip ghost cells
|
||||
for (int p=0; p<3; ++p) {
|
||||
Qy[p][j][i] = minmodSlope(Q[p][l-1][k], Q[p][l][k], Q[p][l+1][k], theta_);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
__device__ float monotonized_central(float r_) {
|
||||
return fmaxf(0.0f, fminf(2.0f, fminf(2.0f*r_, 0.5f*(1.0f+r_))));
|
||||
}
|
||||
|
||||
__device__ float osher(float r_, float beta_) {
|
||||
return fmaxf(0.0f, fminf(beta_, r_));
|
||||
}
|
||||
|
||||
__device__ float sweby(float r_, float beta_) {
|
||||
return fmaxf(0.0f, fmaxf(fminf(r_, beta_), fminf(beta_*r_, 1.0f)));
|
||||
}
|
||||
|
||||
__device__ float minmod(float r_) {
|
||||
return fmaxf(0.0f, fminf(1.0f, r_));
|
||||
}
|
||||
|
||||
__device__ float generalized_minmod(float r_, float theta_) {
|
||||
return fmaxf(0.0f, fminf(theta_*r_, fminf( (1.0f + r_) / 2.0f, theta_)));
|
||||
}
|
||||
|
||||
__device__ float superbee(float r_) {
|
||||
return fmaxf(0.0f, fmaxf(fminf(2.0f*r_, 1.0f), fminf(r_, 2.0f)));
|
||||
}
|
||||
|
||||
__device__ float vanAlbada1(float r_) {
|
||||
return (r_*r_ + r_) / (r_*r_ + 1.0f);
|
||||
}
|
||||
|
||||
__device__ float vanAlbada2(float r_) {
|
||||
return 2.0f*r_ / (r_*r_* + 1.0f);
|
||||
}
|
||||
|
||||
__device__ float vanLeer(float r_) {
|
||||
return (r_ + fabsf(r_)) / (1.0f + fabsf(r_));
|
||||
}
|
||||
Reference in New Issue
Block a user