From 10d8e26108c8655f65f29f2933675e3fd7d1fdc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20R=2E=20Brodtkorb?= Date: Tue, 4 Sep 2018 16:33:50 +0200 Subject: [PATCH] Refactoring --- GPUSimulators/Common.py | 220 -------- GPUSimulators/CudaContext.py | 263 ++++++++++ GPUSimulators/FORCE.py | 2 +- GPUSimulators/HLL.py | 2 +- GPUSimulators/HLL2.py | 2 +- GPUSimulators/IPythonMagic.py | 4 +- GPUSimulators/KP07.py | 2 +- GPUSimulators/KP07_dimsplit.py | 2 +- GPUSimulators/LxF.py | 4 +- GPUSimulators/SWECommon.cu | 66 --- GPUSimulators/WAF.py | 2 +- .../{EulerCommon.cu => cuda/EulerCommon.h} | 1 + GPUSimulators/cuda/SWECommon.h | 474 ++++++++++++++++++ .../{FORCE_kernel.cu => cuda/SWE_FORCE.cu} | 5 +- .../{HLL_kernel.cu => cuda/SWE_HLL.cu} | 5 +- .../{HLL2_kernel.cu => cuda/SWE_HLL2.cu} | 7 +- .../{KP07_kernel.cu => cuda/SWE_KP07.cu} | 7 +- .../SWE_KP07_dimsplit.cu} | 7 +- .../{LxF_kernel.cu => cuda/SWE_LxF.cu} | 5 +- .../{WAF_kernel.cu => cuda/SWE_WAF.cu} | 5 +- GPUSimulators/{common.cu => cuda/common.h} | 32 +- .../{limiters.cu => cuda/limiters.h} | 2 +- GPUSimulators/fluxes/CentralUpwind.cu | 39 -- GPUSimulators/fluxes/FirstOrderCentered.cu | 33 -- GPUSimulators/fluxes/Godunov.cu | 36 -- GPUSimulators/fluxes/HartenLaxVanLeer.cu | 63 --- .../fluxes/HartenLaxVanLeerContact.cu | 80 --- GPUSimulators/fluxes/LaxFriedrichs.cu | 24 - GPUSimulators/fluxes/LaxWendroff.cu | 33 -- GPUSimulators/fluxes/WeightedAverageFlux.cu | 187 ------- 30 files changed, 774 insertions(+), 840 deletions(-) create mode 100644 GPUSimulators/CudaContext.py delete mode 100644 GPUSimulators/SWECommon.cu rename GPUSimulators/{EulerCommon.cu => cuda/EulerCommon.h} (99%) create mode 100644 GPUSimulators/cuda/SWECommon.h rename GPUSimulators/{FORCE_kernel.cu => cuda/SWE_FORCE.cu} (98%) rename GPUSimulators/{HLL_kernel.cu => cuda/SWE_HLL.cu} (98%) rename GPUSimulators/{HLL2_kernel.cu => cuda/SWE_HLL2.cu} (98%) rename GPUSimulators/{KP07_kernel.cu => cuda/SWE_KP07.cu} (98%) rename GPUSimulators/{KP07_dimsplit_kernel.cu => cuda/SWE_KP07_dimsplit.cu} (98%) rename GPUSimulators/{LxF_kernel.cu => cuda/SWE_LxF.cu} (98%) rename GPUSimulators/{WAF_kernel.cu => cuda/SWE_WAF.cu} (98%) rename GPUSimulators/{common.cu => cuda/common.h} (91%) rename GPUSimulators/{limiters.cu => cuda/limiters.h} (99%) delete mode 100644 GPUSimulators/fluxes/CentralUpwind.cu delete mode 100644 GPUSimulators/fluxes/FirstOrderCentered.cu delete mode 100644 GPUSimulators/fluxes/Godunov.cu delete mode 100644 GPUSimulators/fluxes/HartenLaxVanLeer.cu delete mode 100644 GPUSimulators/fluxes/HartenLaxVanLeerContact.cu delete mode 100644 GPUSimulators/fluxes/LaxFriedrichs.cu delete mode 100644 GPUSimulators/fluxes/LaxWendroff.cu delete mode 100644 GPUSimulators/fluxes/WeightedAverageFlux.cu diff --git a/GPUSimulators/Common.py b/GPUSimulators/Common.py index 17b20a4..e88069e 100644 --- a/GPUSimulators/Common.py +++ b/GPUSimulators/Common.py @@ -34,7 +34,6 @@ import pycuda.compiler as cuda_compiler import pycuda.gpuarray import pycuda.driver as cuda -from GPUSimulators import Autotuner """ Class which keeps track of time spent for a section of code @@ -58,225 +57,6 @@ class Timer(object): - -""" -Class which keeps track of the CUDA context and some helper functions -""" -class CudaContext(object): - - def __init__(self, blocking=False, use_cache=True, autotuning=True): - self.blocking = blocking - self.use_cache = use_cache - self.logger = logging.getLogger(__name__) - self.kernels = {} - - self.module_path = os.path.dirname(os.path.realpath(__file__)) - - #Initialize cuda (must be first call to PyCUDA) - cuda.init(flags=0) - - self.logger.info("PyCUDA version %s", str(pycuda.VERSION_TEXT)) - - #Print some info about CUDA - self.logger.info("CUDA version %s", str(cuda.get_version())) - self.logger.info("Driver version %s", str(cuda.get_driver_version())) - - self.cuda_device = cuda.Device(0) - self.logger.info("Using '%s' GPU", self.cuda_device.name()) - self.logger.debug(" => compute capability: %s", str(self.cuda_device.compute_capability())) - - # Create the CUDA context - if (self.blocking): - self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_BLOCKING_SYNC) - self.logger.warning("Using blocking context") - else: - self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_AUTO) - - free, total = cuda.mem_get_info() - self.logger.debug(" => memory: %d / %d MB available", int(free/(1024*1024)), int(total/(1024*1024))) - - self.logger.info("Created context handle <%s>", str(self.cuda_context.handle)) - - #Create cache dir for cubin files - if (self.use_cache): - self.cache_path = os.path.join(self.module_path, "cuda_cache") - if not os.path.isdir(self.cache_path): - os.mkdir(self.cache_path) - self.logger.info("Using CUDA cache dir %s", self.cache_path) - - self.autotuner = None - if (autotuning): - self.logger.info("Autotuning enabled. It may take several minutes to run the code the first time: have patience") - self.autotuner = Autotuner.Autotuner() - - - def __del__(self, *args): - self.logger.info("Cleaning up CUDA context handle <%s>", str(self.cuda_context.handle)) - - # Loop over all contexts in stack, and remove "this" - other_contexts = [] - while (cuda.Context.get_current() != None): - context = cuda.Context.get_current() - if (context.handle != self.cuda_context.handle): - self.logger.debug("<%s> Popping <%s> (*not* ours)", str(self.cuda_context.handle), str(context.handle)) - other_contexts = [context] + other_contexts - cuda.Context.pop() - else: - self.logger.debug("<%s> Popping <%s> (ours)", str(self.cuda_context.handle), str(context.handle)) - cuda.Context.pop() - - # Add all the contexts we popped that were not our own - for context in other_contexts: - self.logger.debug("<%s> Pushing <%s>", str(self.cuda_context.handle), str(context.handle)) - cuda.Context.push(context) - - self.logger.debug("<%s> Detaching", str(self.cuda_context.handle)) - self.cuda_context.detach() - - - def __str__(self): - return "CudaContext id " + str(self.cuda_context.handle) - - - def hash_kernel(kernel_filename, include_dirs): - # Generate a kernel ID for our caches - num_includes = 0 - max_includes = 100 - kernel_hasher = hashlib.md5() - logger = logging.getLogger(__name__) - - # Loop over file and includes, and check if something has changed - files = [kernel_filename] - while len(files): - - if (num_includes > max_includes): - raise("Maximum number of includes reached - circular include in {:}?".format(kernel_filename)) - - filename = files.pop() - - #logger.debug("Hashing %s", filename) - - modified = os.path.getmtime(filename) - - # Open the file - with io.open(filename, "r") as file: - - # Search for #inclue and also hash the file - file_str = file.read() - kernel_hasher.update(file_str.encode('utf-8')) - kernel_hasher.update(str(modified).encode('utf-8')) - - #Find all includes - includes = re.findall('^\W*#include\W+(.+?)\W*$', file_str, re.M) - - # Loop over everything that looks like an include - for include_file in includes: - - #Search through include directories for the file - file_path = os.path.dirname(filename) - for include_path in [file_path] + include_dirs: - - # If we find it, add it to list of files to check - temp_path = os.path.join(include_path, include_file) - if (os.path.isfile(temp_path)): - files = files + [temp_path] - num_includes = num_includes + 1 #For circular includes... - break - - return kernel_hasher.hexdigest() - - """ - Reads a text file and creates an OpenCL kernel from that - """ - def get_prepared_kernel(self, kernel_filename, kernel_function_name, \ - prepared_call_args, \ - include_dirs=[], no_extern_c=True, - **kwargs): - """ - Helper function to print compilation output - """ - def cuda_compile_message_handler(compile_success_bool, info_str, error_str): - self.logger.debug("Compilation returned %s", str(compile_success_bool)) - if info_str: - self.logger.debug("Info: %s", info_str) - if error_str: - self.logger.debug("Error: %s", error_str) - - #self.logger.debug("Getting %s", kernel_filename) - - # Create a hash of the kernel (and its includes) - kwargs_hasher = hashlib.md5() - kwargs_hasher.update(str(kwargs).encode('utf-8')); - kwargs_hash = kwargs_hasher.hexdigest() - kwargs_hasher = None - root, ext = os.path.splitext(kernel_filename) - kernel_hash = root \ - + "_" + CudaContext.hash_kernel( \ - os.path.join(self.module_path, kernel_filename), \ - include_dirs=[self.module_path] + include_dirs) \ - + "_" + kwargs_hash \ - + ext - cached_kernel_filename = os.path.join(self.cache_path, kernel_hash) - - # If we have the kernel in our hashmap, return it - if (kernel_hash in self.kernels.keys()): - self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash) - return self.kernels[kernel_hash] - - # If we have it on disk, return it - elif (self.use_cache and os.path.isfile(cached_kernel_filename)): - self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash) - - with io.open(cached_kernel_filename, "rb") as file: - file_str = file.read() - module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler) - - kernel = module.get_function(kernel_function_name) - kernel.prepare(prepared_call_args) - self.kernels[kernel_hash] = kernel - return kernel - - # Otherwise, compile it from source - else: - self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash) - - #Create kernel string - kernel_string = "" - for key, value in kwargs.items(): - kernel_string += "#define {:s} {:s}\n".format(str(key), str(value)) - kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename)) - if (self.use_cache): - with io.open(cached_kernel_filename + ".txt", "w") as file: - file.write(kernel_string) - - - with Timer("compiler") as timer: - cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, no_extern_c=no_extern_c, cache_dir=False) - module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler) - if (self.use_cache): - with io.open(cached_kernel_filename, "wb") as file: - file.write(cubin) - - kernel = module.get_function(kernel_function_name) - kernel.prepare(prepared_call_args) - self.kernels[kernel_hash] = kernel - - - return kernel - - """ - Clears the kernel cache (useful for debugging & development) - """ - def clear_kernel_cache(self): - self.logger.debug("Clearing cache") - self.kernels = {} - gc.collect() - - """ - Synchronizes all streams etc - """ - def synchronize(self): - self.cuda_context.synchronize() diff --git a/GPUSimulators/CudaContext.py b/GPUSimulators/CudaContext.py new file mode 100644 index 0000000..fd50864 --- /dev/null +++ b/GPUSimulators/CudaContext.py @@ -0,0 +1,263 @@ +# -*- coding: utf-8 -*- + +""" +This python module implements Cuda context handling + +Copyright (C) 2018 SINTEF ICT + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +""" + + + +import os + +import numpy as np +import time +import re +import io +import hashlib +import logging +import gc + +import pycuda.compiler as cuda_compiler +import pycuda.gpuarray +import pycuda.driver as cuda + +from GPUSimulators import Autotuner, Common + + + +""" +Class which keeps track of the CUDA context and some helper functions +""" +class CudaContext(object): + + def __init__(self, blocking=False, use_cache=True, autotuning=True): + self.blocking = blocking + self.use_cache = use_cache + self.logger = logging.getLogger(__name__) + self.kernels = {} + + self.module_path = os.path.dirname(os.path.realpath(__file__)) + + #Initialize cuda (must be first call to PyCUDA) + cuda.init(flags=0) + + self.logger.info("PyCUDA version %s", str(pycuda.VERSION_TEXT)) + + #Print some info about CUDA + self.logger.info("CUDA version %s", str(cuda.get_version())) + self.logger.info("Driver version %s", str(cuda.get_driver_version())) + + self.cuda_device = cuda.Device(0) + self.logger.info("Using '%s' GPU", self.cuda_device.name()) + self.logger.debug(" => compute capability: %s", str(self.cuda_device.compute_capability())) + + # Create the CUDA context + if (self.blocking): + self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_BLOCKING_SYNC) + self.logger.warning("Using blocking context") + else: + self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_AUTO) + + free, total = cuda.mem_get_info() + self.logger.debug(" => memory: %d / %d MB available", int(free/(1024*1024)), int(total/(1024*1024))) + + self.logger.info("Created context handle <%s>", str(self.cuda_context.handle)) + + #Create cache dir for cubin files + if (self.use_cache): + self.cache_path = os.path.join(self.module_path, "cuda_cache") + if not os.path.isdir(self.cache_path): + os.mkdir(self.cache_path) + self.logger.info("Using CUDA cache dir %s", self.cache_path) + + self.autotuner = None + if (autotuning): + self.logger.info("Autotuning enabled. It may take several minutes to run the code the first time: have patience") + self.autotuner = Autotuner.Autotuner() + + + def __del__(self, *args): + self.logger.info("Cleaning up CUDA context handle <%s>", str(self.cuda_context.handle)) + + # Loop over all contexts in stack, and remove "this" + other_contexts = [] + while (cuda.Context.get_current() != None): + context = cuda.Context.get_current() + if (context.handle != self.cuda_context.handle): + self.logger.debug("<%s> Popping <%s> (*not* ours)", str(self.cuda_context.handle), str(context.handle)) + other_contexts = [context] + other_contexts + cuda.Context.pop() + else: + self.logger.debug("<%s> Popping <%s> (ours)", str(self.cuda_context.handle), str(context.handle)) + cuda.Context.pop() + + # Add all the contexts we popped that were not our own + for context in other_contexts: + self.logger.debug("<%s> Pushing <%s>", str(self.cuda_context.handle), str(context.handle)) + cuda.Context.push(context) + + self.logger.debug("<%s> Detaching", str(self.cuda_context.handle)) + self.cuda_context.detach() + + + def __str__(self): + return "CudaContext id " + str(self.cuda_context.handle) + + + def hash_kernel(kernel_filename, include_dirs): + # Generate a kernel ID for our caches + num_includes = 0 + max_includes = 100 + kernel_hasher = hashlib.md5() + logger = logging.getLogger(__name__) + + # Loop over file and includes, and check if something has changed + files = [kernel_filename] + while len(files): + + if (num_includes > max_includes): + raise("Maximum number of includes reached - circular include in {:}?".format(kernel_filename)) + + filename = files.pop() + + #logger.debug("Hashing %s", filename) + + modified = os.path.getmtime(filename) + + # Open the file + with io.open(filename, "r") as file: + + # Search for #inclue and also hash the file + file_str = file.read() + kernel_hasher.update(file_str.encode('utf-8')) + kernel_hasher.update(str(modified).encode('utf-8')) + + #Find all includes + includes = re.findall('^\W*#include\W+(.+?)\W*$', file_str, re.M) + + # Loop over everything that looks like an include + for include_file in includes: + + #Search through include directories for the file + file_path = os.path.dirname(filename) + for include_path in [file_path] + include_dirs: + + # If we find it, add it to list of files to check + temp_path = os.path.join(include_path, include_file) + if (os.path.isfile(temp_path)): + files = files + [temp_path] + num_includes = num_includes + 1 #For circular includes... + break + + return kernel_hasher.hexdigest() + + """ + Reads a text file and creates an OpenCL kernel from that + """ + def get_prepared_kernel(self, kernel_filename, kernel_function_name, \ + prepared_call_args, \ + include_dirs=[], no_extern_c=True, + **kwargs): + """ + Helper function to print compilation output + """ + def cuda_compile_message_handler(compile_success_bool, info_str, error_str): + self.logger.debug("Compilation returned %s", str(compile_success_bool)) + if info_str: + self.logger.debug("Info: %s", info_str) + if error_str: + self.logger.debug("Error: %s", error_str) + + kernel_filename = os.path.normpath(kernel_filename) + #self.logger.debug("Getting %s", kernel_filename) + + # Create a hash of the kernel (and its includes) + kwargs_hasher = hashlib.md5() + kwargs_hasher.update(str(kwargs).encode('utf-8')); + kwargs_hash = kwargs_hasher.hexdigest() + kwargs_hasher = None + root, ext = os.path.splitext(kernel_filename) + kernel_hash = root \ + + "_" + CudaContext.hash_kernel( \ + os.path.join(self.module_path, kernel_filename), \ + include_dirs=[self.module_path] + include_dirs) \ + + "_" + kwargs_hash \ + + ext + cached_kernel_filename = os.path.join(self.cache_path, kernel_hash) + + # If we have the kernel in our hashmap, return it + if (kernel_hash in self.kernels.keys()): + self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash) + return self.kernels[kernel_hash] + + # If we have it on disk, return it + elif (self.use_cache and os.path.isfile(cached_kernel_filename)): + self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash) + + with io.open(cached_kernel_filename, "rb") as file: + file_str = file.read() + module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler) + + kernel = module.get_function(kernel_function_name) + kernel.prepare(prepared_call_args) + self.kernels[kernel_hash] = kernel + return kernel + + # Otherwise, compile it from source + else: + self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash) + + #Create kernel string + kernel_string = "" + for key, value in kwargs.items(): + kernel_string += "#define {:s} {:s}\n".format(str(key), str(value)) + kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename)) + if (self.use_cache): + cached_kernel_dir = os.path.dirname(cached_kernel_filename) + if not os.path.isdir(cached_kernel_dir): + os.mkdir(cached_kernel_dir) + with io.open(cached_kernel_filename + ".txt", "w") as file: + file.write(kernel_string) + + + with Common.Timer("compiler") as timer: + cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, no_extern_c=no_extern_c, cache_dir=False) + module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler) + if (self.use_cache): + with io.open(cached_kernel_filename, "wb") as file: + file.write(cubin) + + kernel = module.get_function(kernel_function_name) + kernel.prepare(prepared_call_args) + self.kernels[kernel_hash] = kernel + + + return kernel + + """ + Clears the kernel cache (useful for debugging & development) + """ + def clear_kernel_cache(self): + self.logger.debug("Clearing cache") + self.kernels = {} + gc.collect() + + """ + Synchronizes all streams etc + """ + def synchronize(self): + self.cuda_context.synchronize() \ No newline at end of file diff --git a/GPUSimulators/FORCE.py b/GPUSimulators/FORCE.py index c384c11..989b842 100644 --- a/GPUSimulators/FORCE.py +++ b/GPUSimulators/FORCE.py @@ -66,7 +66,7 @@ class FORCE (Simulator.BaseSimulator): block_width, block_height); #Get kernels - self.kernel = context.get_prepared_kernel("FORCE_kernel.cu", "FORCEKernel", \ + self.kernel = context.get_prepared_kernel("cuda/SWE_FORCE.cu", "FORCEKernel", \ "iiffffPiPiPiPiPiPi", \ BLOCK_WIDTH=self.local_size[0], \ BLOCK_HEIGHT=self.local_size[1]) diff --git a/GPUSimulators/HLL.py b/GPUSimulators/HLL.py index 54bb9f6..89d13ea 100644 --- a/GPUSimulators/HLL.py +++ b/GPUSimulators/HLL.py @@ -61,7 +61,7 @@ class HLL (Simulator.BaseSimulator): block_width, block_height); #Get kernels - self.kernel = context.get_prepared_kernel("HLL_kernel.cu", "HLLKernel", \ + self.kernel = context.get_prepared_kernel("cuda/SWE_HLL.cu", "HLLKernel", \ "iiffffPiPiPiPiPiPi", \ BLOCK_WIDTH=self.local_size[0], \ BLOCK_HEIGHT=self.local_size[1]) diff --git a/GPUSimulators/HLL2.py b/GPUSimulators/HLL2.py index e16521f..db1b9e3 100644 --- a/GPUSimulators/HLL2.py +++ b/GPUSimulators/HLL2.py @@ -67,7 +67,7 @@ class HLL2 (Simulator.BaseSimulator): self.theta = np.float32(theta) #Get kernels - self.kernel = context.get_prepared_kernel("HLL2_kernel.cu", "HLL2Kernel", \ + self.kernel = context.get_prepared_kernel("cuda/SWE_HLL2.cu", "HLL2Kernel", \ "iifffffiPiPiPiPiPiPi", \ BLOCK_WIDTH=self.local_size[0], \ BLOCK_HEIGHT=self.local_size[1]) diff --git a/GPUSimulators/IPythonMagic.py b/GPUSimulators/IPythonMagic.py index ce4c16f..35d515f 100644 --- a/GPUSimulators/IPythonMagic.py +++ b/GPUSimulators/IPythonMagic.py @@ -25,7 +25,7 @@ from IPython.core import magic_arguments from IPython.core.magic import line_magic, Magics, magics_class import pycuda.driver as cuda -from GPUSimulators import Common +from GPUSimulators import Common, CudaContext @magics_class @@ -53,7 +53,7 @@ class MyIPythonMagic(Magics): self.logger.debug("Creating context") use_cache = False if args.no_cache else True use_autotuning = False if args.no_autotuning else True - self.shell.user_ns[args.name] = Common.CudaContext(blocking=args.blocking, use_cache=use_cache, autotuning=use_autotuning) + self.shell.user_ns[args.name] = CudaContext.CudaContext(blocking=args.blocking, use_cache=use_cache, autotuning=use_autotuning) # this function will be called on exceptions in any cell def custom_exc(shell, etype, evalue, tb, tb_offset=None): diff --git a/GPUSimulators/KP07.py b/GPUSimulators/KP07.py index 258b8bb..e26f58a 100644 --- a/GPUSimulators/KP07.py +++ b/GPUSimulators/KP07.py @@ -68,7 +68,7 @@ class KP07 (Simulator.BaseSimulator): self.theta = np.float32(theta) #Get kernels - self.kernel = context.get_prepared_kernel("KP07_kernel.cu", "KP07Kernel", \ + self.kernel = context.get_prepared_kernel("cuda/SWE_KP07.cu", "KP07Kernel", \ "iifffffiPiPiPiPiPiPi", \ BLOCK_WIDTH=self.local_size[0], \ BLOCK_HEIGHT=self.local_size[1]) diff --git a/GPUSimulators/KP07_dimsplit.py b/GPUSimulators/KP07_dimsplit.py index eb715df..20556fb 100644 --- a/GPUSimulators/KP07_dimsplit.py +++ b/GPUSimulators/KP07_dimsplit.py @@ -68,7 +68,7 @@ class KP07_dimsplit (Simulator.BaseSimulator): self.theta = np.float32(theta) #Get kernels - self.kernel = context.get_prepared_kernel("KP07_dimsplit_kernel.cu", "KP07DimsplitKernel", \ + self.kernel = context.get_prepared_kernel("cuda/SWE_KP07_dimsplit.cu", "KP07DimsplitKernel", \ "iifffffiPiPiPiPiPiPi", \ BLOCK_WIDTH=self.local_size[0], \ BLOCK_HEIGHT=self.local_size[1]) diff --git a/GPUSimulators/LxF.py b/GPUSimulators/LxF.py index c75184c..1f1019c 100644 --- a/GPUSimulators/LxF.py +++ b/GPUSimulators/LxF.py @@ -21,7 +21,7 @@ along with this program. If not, see . """ #Import packages we need -from GPUSimulators import Simulator, Common +from GPUSimulators import Simulator, Common, CudaContext @@ -62,7 +62,7 @@ class LxF (Simulator.BaseSimulator): block_width, block_height); # Get kernels - self.kernel = context.get_prepared_kernel("LxF_kernel.cu", "LxFKernel", \ + self.kernel = context.get_prepared_kernel("cuda/SWE_LxF.cu", "LxFKernel", \ "iiffffPiPiPiPiPiPi", \ BLOCK_WIDTH=self.local_size[0], \ BLOCK_HEIGHT=self.local_size[1]) diff --git a/GPUSimulators/SWECommon.cu b/GPUSimulators/SWECommon.cu deleted file mode 100644 index 389a062..0000000 --- a/GPUSimulators/SWECommon.cu +++ /dev/null @@ -1,66 +0,0 @@ -/* -This OpenCL kernel implements the Kurganov-Petrova numerical scheme -for the shallow water equations, described in -A. Kurganov & Guergana Petrova -A Second-Order Well-Balanced Positivity Preserving Central-Upwind -Scheme for the Saint-Venant System Communications in Mathematical -Sciences, 5 (2007), 133-160. - -Copyright (C) 2016 SINTEF ICT - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -*/ - - - - - - - - - -__device__ float3 F_func(const float3 Q, const float g) { - float3 F; - - F.x = Q.y; //hu - F.y = Q.y*Q.y / Q.x + 0.5f*g*Q.x*Q.x; //hu*hu/h + 0.5f*g*h*h; - F.z = Q.y*Q.z / Q.x; //hu*hv/h; - - return F; -} - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/GPUSimulators/WAF.py b/GPUSimulators/WAF.py index 7f5959a..f6768b6 100644 --- a/GPUSimulators/WAF.py +++ b/GPUSimulators/WAF.py @@ -61,7 +61,7 @@ class WAF (Simulator.BaseSimulator): block_width, block_height); #Get kernels - self.kernel = context.get_prepared_kernel("WAF_kernel.cu", "WAFKernel", \ + self.kernel = context.get_prepared_kernel("cuda/SWE_WAF.cu", "WAFKernel", \ "iiffffiPiPiPiPiPiPi", \ BLOCK_WIDTH=self.local_size[0], \ BLOCK_HEIGHT=self.local_size[1]) diff --git a/GPUSimulators/EulerCommon.cu b/GPUSimulators/cuda/EulerCommon.h similarity index 99% rename from GPUSimulators/EulerCommon.cu rename to GPUSimulators/cuda/EulerCommon.h index eaaffe9..a8442a0 100644 --- a/GPUSimulators/EulerCommon.cu +++ b/GPUSimulators/cuda/EulerCommon.h @@ -22,6 +22,7 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . */ +#pragma once diff --git a/GPUSimulators/cuda/SWECommon.h b/GPUSimulators/cuda/SWECommon.h new file mode 100644 index 0000000..233e9f0 --- /dev/null +++ b/GPUSimulators/cuda/SWECommon.h @@ -0,0 +1,474 @@ +/* +This OpenCL kernel implements the Kurganov-Petrova numerical scheme +for the shallow water equations, described in +A. Kurganov & Guergana Petrova +A Second-Order Well-Balanced Positivity Preserving Central-Upwind +Scheme for the Saint-Venant System Communications in Mathematical +Sciences, 5 (2007), 133-160. + +Copyright (C) 2016 SINTEF ICT + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +*/ + +#pragma once +#include "limiters.h" + + + + + + +__device__ float3 F_func(const float3 Q, const float g) { + float3 F; + + F.x = Q.y; //hu + F.y = Q.y*Q.y / Q.x + 0.5f*g*Q.x*Q.x; //hu*hu/h + 0.5f*g*h*h; + F.z = Q.y*Q.z / Q.x; //hu*hv/h; + + return F; +} + + + + + + + + + + + + + +/** + * Superbee flux limiter for WAF. + * Related to superbee limiter so that WAF_superbee(r, c) = 1 - (1-|c|)*superbee(r) + * @param r_ the ratio of upwind change (see Toro 2001, p. 203/204) + * @param c_ the courant number for wave k, dt*S_k/dx + */ +__device__ float WAF_superbee(float r_, float c_) { + // r <= 0.0 + if (r_ <= 0.0f) { + return 1.0f; + } + // 0.0 <= r <= 1/2 + else if (r_ <= 0.5f) { + return 1.0f - 2.0f*(1.0f - fabsf(c_))*r_; + } + // 1/2 <= r <= 1 + else if (r_ <= 1.0f) { + return fabs(c_); + } + // 1 <= r <= 2 + else if (r_ <= 2.0f) { + return 1.0f - (1.0f - fabsf(c_))*r_; + } + // r >= 2 + else { + return 2.0f*fabsf(c_) - 1.0f; + } +} + + + + +__device__ float WAF_albada(float r_, float c_) { + if (r_ <= 0.0f) { + return 1.0f; + } + else { + return 1.0f - (1.0f - fabsf(c_)) * r_ * (1.0f + r_) / (1.0f + r_*r_); + } +} + +__device__ float WAF_minbee(float r_, float c_) { + r_ = fmaxf(-1.0f, fminf(2.0f, r_)); + if (r_ <= 0.0f) { + return 1.0f; + } + if (r_ >= 0.0f && r_ <= 1.0f) { + return 1.0f - (1.0f - fabsf(c_)) * r_; + } + else { + return fabsf(c_); + } +} + +__device__ float WAF_minmod(float r_, float c_) { + return 1.0f - (1.0f - fabsf(c_)) * fmaxf(0.0f, fminf(1.0f, r_)); +} + +__device__ float limiterToWAFLimiter(float r_, float c_) { + return 1.0f - (1.0f - fabsf(c_))*r_; +} + +// Compute h in the "star region", h^dagger +__device__ __inline__ float computeHStar(float h_l, float h_r, float u_l, float u_r, float c_l, float c_r, float g_) { + + //This estimate for the h* gives rise to spurious oscillations. + //return 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r); + + const float h_tmp = 0.5f * (c_l + c_r) + 0.25f * (u_l - u_r); + return h_tmp*h_tmp / g_; +} + +/** + * Weighted average flux (Toro 2001, p 200) for interface {i+1/2} + * @param r_ The flux limiter parameter (see Toro 2001, p. 203) + * @param Q_l2 Q_{i-1} + * @param Q_l1 Q_{i} + * @param Q_r1 Q_{i+1} + * @param Q_r2 Q_{i+2} + */ +__device__ float3 WAF_1D_flux(const float3 Q_l2, const float3 Q_l1, const float3 Q_r1, const float3 Q_r2, const float g_, const float dx_, const float dt_) { + const float h_l = Q_l1.x; + const float h_r = Q_r1.x; + + const float h_l2 = Q_l2.x; + const float h_r2 = Q_r2.x; + + // Calculate velocities + const float u_l = Q_l1.y / h_l; + const float u_r = Q_r1.y / h_r; + + const float u_l2 = Q_l2.y / h_l2; + const float u_r2 = Q_r2.y / h_r2; + + const float v_l = Q_l1.z / h_l; + const float v_r = Q_r1.z / h_r; + + const float v_l2 = Q_l2.z / h_l2; + const float v_r2 = Q_r2.z / h_r2; + + // Estimate the potential wave speeds + const float c_l = sqrt(g_*h_l); + const float c_r = sqrt(g_*h_r); + + const float c_l2 = sqrt(g_*h_l2); + const float c_r2 = sqrt(g_*h_r2); + + // Compute h in the "star region", h^dagger + const float h_dag_l = computeHStar(h_l2, h_l, u_l2, u_l, c_l2, c_l, g_); + const float h_dag = computeHStar( h_l, h_r, u_l, u_r, c_l, c_r, g_); + const float h_dag_r = computeHStar( h_r, h_r2, u_r, u_r2, c_r, c_r2, g_); + + const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag ) ) / h_l; + const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag ) ) / h_r; + + const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f; + const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f; + + // Compute wave speed estimates + const float S_l = u_l - c_l*q_l; + const float S_r = u_r + c_r*q_r; + const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) ); + + const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1.0, S_star, v_l); + const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1.0, S_star, v_r); + + // Estimate the fluxes in the four regions + const float3 F_1 = F_func(Q_l1, g_); + const float3 F_4 = F_func(Q_r1, g_); + + const float3 F_2 = F_1 + S_l*(Q_star_l - Q_l1); + const float3 F_3 = F_4 + S_r*(Q_star_r - Q_r1); + //const float3 F_2 = F_func(Q_star_l, g_); + //const float3 F_3 = F_func(Q_star_r, g_); + + // Compute the courant numbers for the waves + const float c_1 = S_l * dt_ / dx_; + const float c_2 = S_star * dt_ / dx_; + const float c_3 = S_r * dt_ / dx_; + + // Compute the "upwind change" vectors for the i-3/2 and i+3/2 interfaces + const float eps = 1.0e-6f; + const float r_1 = desingularize( (c_1 > 0.0f) ? (h_dag_l - h_l2) : (h_dag_r - h_r), eps) / desingularize((h_dag - h_l), eps); + const float r_2 = desingularize( (c_2 > 0.0f) ? (v_l - v_l2) : (v_r2 - v_r), eps ) / desingularize((v_r - v_l), eps); + const float r_3 = desingularize( (c_3 > 0.0f) ? (h_l - h_dag_l) : (h_r2 - h_dag_r), eps ) / desingularize((h_r - h_dag), eps); + + // Compute the limiter + // We use h for the nonlinear waves, and v for the middle shear wave + const float A_1 = copysign(1.0f, c_1) * limiterToWAFLimiter(generalized_minmod(r_1, 1.9f), c_1); + const float A_2 = copysign(1.0f, c_2) * limiterToWAFLimiter(generalized_minmod(r_2, 1.9f), c_2); + const float A_3 = copysign(1.0f, c_3) * limiterToWAFLimiter(generalized_minmod(r_3, 1.9f), c_3); + + //Average the fluxes + const float3 flux = 0.5f*( F_1 + F_4 ) + - 0.5f*( A_1 * (F_2 - F_1) + + A_2 * (F_3 - F_2) + + A_3 * (F_4 - F_3) ); + + return flux; +} + + + + + + + + + + + + + + +/** + * Central upwind flux function + */ +__device__ float3 CentralUpwindFlux(const float3 Qm, float3 Qp, const float g) { + const float3 Fp = F_func(Qp, g); + const float up = Qp.y / Qp.x; // hu / h + const float cp = sqrt(g*Qp.x); // sqrt(g*h) + + const float3 Fm = F_func(Qm, g); + const float um = Qm.y / Qm.x; // hu / h + const float cm = sqrt(g*Qm.x); // sqrt(g*h) + + const float am = min(min(um-cm, up-cp), 0.0f); // largest negative wave speed + const float ap = max(max(um+cm, up+cp), 0.0f); // largest positive wave speed + + return ((ap*Fm - am*Fp) + ap*am*(Qp-Qm))/(ap-am); +} + + + + + + + + + +/** + * Godunovs centered scheme (Toro 2001, p 165) + */ +__device__ float3 GodC_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) { + const float3 F_l = F_func(Q_l, g_); + const float3 F_r = F_func(Q_r, g_); + + const float3 Q_godc = 0.5f*(Q_l + Q_r) + (dt_/dx_)*(F_l - F_r); + + return F_func(Q_godc, g_); +} + + + + + + + + + +/** + * Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 180) + */ +__device__ float3 HLL_flux(const float3 Q_l, const float3 Q_r, const float g_) { + const float h_l = Q_l.x; + const float h_r = Q_r.x; + + // Calculate velocities + const float u_l = Q_l.y / h_l; + const float u_r = Q_r.y / h_r; + + // Estimate the potential wave speeds + const float c_l = sqrt(g_*h_l); + const float c_r = sqrt(g_*h_r); + + // Compute h in the "star region", h^dagger + const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r); + + const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) ); + const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) ); + + const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f; + const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f; + + // Compute wave speed estimates + const float S_l = u_l - c_l*q_l; + const float S_r = u_r + c_r*q_r; + + //Upwind selection + if (S_l >= 0.0f) { + return F_func(Q_l, g_); + } + else if (S_r <= 0.0f) { + return F_func(Q_r, g_); + } + //Or estimate flux in the star region + else { + const float3 F_l = F_func(Q_l, g_); + const float3 F_r = F_func(Q_r, g_); + const float3 flux = (S_r*F_l - S_l*F_r + S_r*S_l*(Q_r - Q_l)) / (S_r-S_l); + return flux; + } +} + + + + + + + + + + + + + + + + + +/** + * Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 181) + */ +__device__ float3 HLLC_flux(const float3 Q_l, const float3 Q_r, const float g_) { + const float h_l = Q_l.x; + const float h_r = Q_r.x; + + // Calculate velocities + const float u_l = Q_l.y / h_l; + const float u_r = Q_r.y / h_r; + + // Estimate the potential wave speeds + const float c_l = sqrt(g_*h_l); + const float c_r = sqrt(g_*h_r); + + // Compute h in the "star region", h^dagger + const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r); + + const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) ); + const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) ); + + const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f; + const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f; + + // Compute wave speed estimates + const float S_l = u_l - c_l*q_l; + const float S_r = u_r + c_r*q_r; + const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) ); + + const float3 F_l = F_func(Q_l, g_); + const float3 F_r = F_func(Q_r, g_); + + //Upwind selection + if (S_l >= 0.0f) { + return F_l; + } + else if (S_r <= 0.0f) { + return F_r; + } + //Or estimate flux in the "left star" region + else if (S_l <= 0.0f && 0.0f <=S_star) { + const float v_l = Q_l.z / h_l; + const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1, S_star, v_l); + const float3 flux = F_l + S_l*(Q_star_l - Q_l); + return flux; + } + //Or estimate flux in the "righ star" region + else if (S_star <= 0.0f && 0.0f <=S_r) { + const float v_r = Q_r.z / h_r; + const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1, S_star, v_r); + const float3 flux = F_r + S_r*(Q_star_r - Q_r); + return flux; + } + else { + return make_float3(-99999.9f, -99999.9f, -99999.9f); //Something wrong here + } +} + + + + + + + + + + + + + +/** + * Lax-Friedrichs flux (Toro 2001, p 163) + */ +__device__ float3 LxF_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) { + const float3 F_l = F_func(Q_l, g_); + const float3 F_r = F_func(Q_r, g_); + + return 0.5f*(F_l + F_r) + (dx_/(2.0f*dt_))*(Q_l - Q_r); +} + + + + + + + + +/** + * Lax-Friedrichs extended to 2D + */ +__device__ float3 LxF_2D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) { + const float3 F_l = F_func(Q_l, g_); + const float3 F_r = F_func(Q_r, g_); + + //Note numerical diffusion for 2D here (0.25) + return 0.5f*(F_l + F_r) + (dx_/(4.0f*dt_))*(Q_l - Q_r); +} + + + + + + + + + + + + +/** + * Richtmeyer / Two-step Lax-Wendroff flux (Toro 2001, p 164) + */ +__device__ float3 LxW2_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) { + const float3 F_l = F_func(Q_l, g_); + const float3 F_r = F_func(Q_r, g_); + + const float3 Q_lw2 = 0.5f*(Q_l + Q_r) + (dt_/(2.0f*dx_))*(F_l - F_r); + + return F_func(Q_lw2, g_); +} + + + + + + + + + + + + +/** + * First Ordered Centered (Toro 2001, p.163) + */ +__device__ float3 FORCE_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) { + const float3 F_lf = LxF_1D_flux(Q_l, Q_r, g_, dx_, dt_); + const float3 F_lw2 = LxW2_1D_flux(Q_l, Q_r, g_, dx_, dt_); + return 0.5f*(F_lf + F_lw2); +} diff --git a/GPUSimulators/FORCE_kernel.cu b/GPUSimulators/cuda/SWE_FORCE.cu similarity index 98% rename from GPUSimulators/FORCE_kernel.cu rename to GPUSimulators/cuda/SWE_FORCE.cu index bedb0d7..002503d 100644 --- a/GPUSimulators/FORCE_kernel.cu +++ b/GPUSimulators/cuda/SWE_FORCE.cu @@ -19,9 +19,8 @@ along with this program. If not, see . */ -#include "common.cu" -#include "SWECommon.cu" -#include "fluxes/FirstOrderCentered.cu" +#include "common.h" +#include "SWECommon.h" /** diff --git a/GPUSimulators/HLL_kernel.cu b/GPUSimulators/cuda/SWE_HLL.cu similarity index 98% rename from GPUSimulators/HLL_kernel.cu rename to GPUSimulators/cuda/SWE_HLL.cu index a86963b..def3d04 100644 --- a/GPUSimulators/HLL_kernel.cu +++ b/GPUSimulators/cuda/SWE_HLL.cu @@ -19,9 +19,8 @@ along with this program. If not, see . -#include "common.cu" -#include "SWECommon.cu" -#include "fluxes/HartenLaxVanLeer.cu" +#include "common.h" +#include "SWECommon.h" diff --git a/GPUSimulators/HLL2_kernel.cu b/GPUSimulators/cuda/SWE_HLL2.cu similarity index 98% rename from GPUSimulators/HLL2_kernel.cu rename to GPUSimulators/cuda/SWE_HLL2.cu index 7909a57..d51233f 100644 --- a/GPUSimulators/HLL2_kernel.cu +++ b/GPUSimulators/cuda/SWE_HLL2.cu @@ -18,10 +18,9 @@ along with this program. If not, see . */ -#include "common.cu" -#include "SWECommon.cu" -#include "limiters.cu" -#include "fluxes/HartenLaxVanLeer.cu" +#include "common.h" +#include "SWECommon.h" +#include "limiters.h" diff --git a/GPUSimulators/KP07_kernel.cu b/GPUSimulators/cuda/SWE_KP07.cu similarity index 98% rename from GPUSimulators/KP07_kernel.cu rename to GPUSimulators/cuda/SWE_KP07.cu index dbd1964..41ffc54 100644 --- a/GPUSimulators/KP07_kernel.cu +++ b/GPUSimulators/cuda/SWE_KP07.cu @@ -24,10 +24,9 @@ along with this program. If not, see . -#include "common.cu" -#include "SWECommon.cu" -#include "limiters.cu" -#include "fluxes/CentralUpwind.cu" +#include "common.h" +#include "SWECommon.h" +#include "limiters.h" __device__ diff --git a/GPUSimulators/KP07_dimsplit_kernel.cu b/GPUSimulators/cuda/SWE_KP07_dimsplit.cu similarity index 98% rename from GPUSimulators/KP07_dimsplit_kernel.cu rename to GPUSimulators/cuda/SWE_KP07_dimsplit.cu index 7d9fd9d..a350bcb 100644 --- a/GPUSimulators/KP07_dimsplit_kernel.cu +++ b/GPUSimulators/cuda/SWE_KP07_dimsplit.cu @@ -24,10 +24,9 @@ along with this program. If not, see . -#include "common.cu" -#include "SWECommon.cu" -#include "limiters.cu" -#include "fluxes/CentralUpwind.cu" +#include "common.h" +#include "SWECommon.h" +#include "limiters.h" __device__ diff --git a/GPUSimulators/LxF_kernel.cu b/GPUSimulators/cuda/SWE_LxF.cu similarity index 98% rename from GPUSimulators/LxF_kernel.cu rename to GPUSimulators/cuda/SWE_LxF.cu index ff094ad..f9cbcae 100644 --- a/GPUSimulators/LxF_kernel.cu +++ b/GPUSimulators/cuda/SWE_LxF.cu @@ -19,9 +19,8 @@ along with this program. If not, see . */ -#include "Common.cu" -#include "SWECommon.cu" -#include "fluxes/LaxFriedrichs.cu" +#include "common.h" +#include "SWECommon.h" /** diff --git a/GPUSimulators/WAF_kernel.cu b/GPUSimulators/cuda/SWE_WAF.cu similarity index 98% rename from GPUSimulators/WAF_kernel.cu rename to GPUSimulators/cuda/SWE_WAF.cu index b35e5dd..b56af1d 100644 --- a/GPUSimulators/WAF_kernel.cu +++ b/GPUSimulators/cuda/SWE_WAF.cu @@ -24,9 +24,8 @@ along with this program. If not, see . -#include "common.cu" -#include "SWECommon.cu" -#include "fluxes/WeightedAverageFlux.cu" +#include "common.h" +#include "SWECommon.h" diff --git a/GPUSimulators/common.cu b/GPUSimulators/cuda/common.h similarity index 91% rename from GPUSimulators/common.cu rename to GPUSimulators/cuda/common.h index 2f3f057..c0af6f5 100644 --- a/GPUSimulators/common.cu +++ b/GPUSimulators/cuda/common.h @@ -22,6 +22,7 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . */ +#pragma once /** @@ -50,34 +51,17 @@ inline __device__ __host__ float clamp(const float f, const float a, const float -/** - * Reads a block of data with one ghost cell for the shallow water equations - */ -template -inline __device__ void readBlock(float* ptr_, int pitch_, - float (&shmem)[sm_height][sm_width], - const int max_x_, const int max_y_) { - - //Index of block within domain - const int bx = blockDim.x * blockIdx.x; - const int by = blockDim.y * blockIdx.y; - - //Read into shared memory - for (int j=threadIdx.y; j. */ - +#pragma once diff --git a/GPUSimulators/fluxes/CentralUpwind.cu b/GPUSimulators/fluxes/CentralUpwind.cu deleted file mode 100644 index e20d480..0000000 --- a/GPUSimulators/fluxes/CentralUpwind.cu +++ /dev/null @@ -1,39 +0,0 @@ -/* -This file implements the Central upwind flux - -Copyright (C) 2016, 2017, 2018 SINTEF ICT - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -*/ - - - -/** - * Central upwind flux function - */ -__device__ float3 CentralUpwindFlux(const float3 Qm, float3 Qp, const float g) { - const float3 Fp = F_func(Qp, g); - const float up = Qp.y / Qp.x; // hu / h - const float cp = sqrt(g*Qp.x); // sqrt(g*h) - - const float3 Fm = F_func(Qm, g); - const float um = Qm.y / Qm.x; // hu / h - const float cm = sqrt(g*Qm.x); // sqrt(g*h) - - const float am = min(min(um-cm, up-cp), 0.0f); // largest negative wave speed - const float ap = max(max(um+cm, up+cp), 0.0f); // largest positive wave speed - - return ((ap*Fm - am*Fp) + ap*am*(Qp-Qm))/(ap-am); -} - diff --git a/GPUSimulators/fluxes/FirstOrderCentered.cu b/GPUSimulators/fluxes/FirstOrderCentered.cu deleted file mode 100644 index 1c34760..0000000 --- a/GPUSimulators/fluxes/FirstOrderCentered.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* -This file implements the First ORder CEntered flux - -Copyright (C) 2016, 2017, 2018 SINTEF ICT - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -*/ - -#include "./LaxFriedrichs.cu" -#include "./LaxWendroff.cu" - - - - -/** - * First Ordered Centered (Toro 2001, p.163) - */ -__device__ float3 FORCE_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) { - const float3 F_lf = LxF_1D_flux(Q_l, Q_r, g_, dx_, dt_); - const float3 F_lw2 = LxW2_1D_flux(Q_l, Q_r, g_, dx_, dt_); - return 0.5f*(F_lf + F_lw2); -} diff --git a/GPUSimulators/fluxes/Godunov.cu b/GPUSimulators/fluxes/Godunov.cu deleted file mode 100644 index 5d1352a..0000000 --- a/GPUSimulators/fluxes/Godunov.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* -This file implements the Godunov flux - -Copyright (C) 2016, 2017, 2018 SINTEF ICT - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -*/ - - - - -/** - * Godunovs centered scheme (Toro 2001, p 165) - */ -__device__ float3 GodC_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) { - const float3 F_l = F_func(Q_l, g_); - const float3 F_r = F_func(Q_r, g_); - - const float3 Q_godc = 0.5f*(Q_l + Q_r) + (dt_/dx_)*(F_l - F_r); - - return F_func(Q_godc, g_); -} - - - diff --git a/GPUSimulators/fluxes/HartenLaxVanLeer.cu b/GPUSimulators/fluxes/HartenLaxVanLeer.cu deleted file mode 100644 index e561781..0000000 --- a/GPUSimulators/fluxes/HartenLaxVanLeer.cu +++ /dev/null @@ -1,63 +0,0 @@ -/* -This file implements the Harten-Lax-van Leer flux - -Copyright (C) 2016, 2017, 2018 SINTEF ICT - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -*/ - - -/** - * Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 180) - */ -__device__ float3 HLL_flux(const float3 Q_l, const float3 Q_r, const float g_) { - const float h_l = Q_l.x; - const float h_r = Q_r.x; - - // Calculate velocities - const float u_l = Q_l.y / h_l; - const float u_r = Q_r.y / h_r; - - // Estimate the potential wave speeds - const float c_l = sqrt(g_*h_l); - const float c_r = sqrt(g_*h_r); - - // Compute h in the "star region", h^dagger - const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r); - - const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) ); - const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) ); - - const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f; - const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f; - - // Compute wave speed estimates - const float S_l = u_l - c_l*q_l; - const float S_r = u_r + c_r*q_r; - - //Upwind selection - if (S_l >= 0.0f) { - return F_func(Q_l, g_); - } - else if (S_r <= 0.0f) { - return F_func(Q_r, g_); - } - //Or estimate flux in the star region - else { - const float3 F_l = F_func(Q_l, g_); - const float3 F_r = F_func(Q_r, g_); - const float3 flux = (S_r*F_l - S_l*F_r + S_r*S_l*(Q_r - Q_l)) / (S_r-S_l); - return flux; - } -} diff --git a/GPUSimulators/fluxes/HartenLaxVanLeerContact.cu b/GPUSimulators/fluxes/HartenLaxVanLeerContact.cu deleted file mode 100644 index 9dc870e..0000000 --- a/GPUSimulators/fluxes/HartenLaxVanLeerContact.cu +++ /dev/null @@ -1,80 +0,0 @@ -/* -This file implements the Harten-Lax-van Leer flux with contact discontinuity - -Copyright (C) 2016, 2017, 2018 SINTEF ICT - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -*/ - - - - -/** - * Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 181) - */ -__device__ float3 HLLC_flux(const float3 Q_l, const float3 Q_r, const float g_) { - const float h_l = Q_l.x; - const float h_r = Q_r.x; - - // Calculate velocities - const float u_l = Q_l.y / h_l; - const float u_r = Q_r.y / h_r; - - // Estimate the potential wave speeds - const float c_l = sqrt(g_*h_l); - const float c_r = sqrt(g_*h_r); - - // Compute h in the "star region", h^dagger - const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r); - - const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) ); - const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) ); - - const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f; - const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f; - - // Compute wave speed estimates - const float S_l = u_l - c_l*q_l; - const float S_r = u_r + c_r*q_r; - const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) ); - - const float3 F_l = F_func(Q_l, g_); - const float3 F_r = F_func(Q_r, g_); - - //Upwind selection - if (S_l >= 0.0f) { - return F_l; - } - else if (S_r <= 0.0f) { - return F_r; - } - //Or estimate flux in the "left star" region - else if (S_l <= 0.0f && 0.0f <=S_star) { - const float v_l = Q_l.z / h_l; - const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1, S_star, v_l); - const float3 flux = F_l + S_l*(Q_star_l - Q_l); - return flux; - } - //Or estimate flux in the "righ star" region - else if (S_star <= 0.0f && 0.0f <=S_r) { - const float v_r = Q_r.z / h_r; - const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1, S_star, v_r); - const float3 flux = F_r + S_r*(Q_star_r - Q_r); - return flux; - } - else { - return make_float3(-99999.9f, -99999.9f, -99999.9f); //Something wrong here - } -} - diff --git a/GPUSimulators/fluxes/LaxFriedrichs.cu b/GPUSimulators/fluxes/LaxFriedrichs.cu deleted file mode 100644 index 5759b2d..0000000 --- a/GPUSimulators/fluxes/LaxFriedrichs.cu +++ /dev/null @@ -1,24 +0,0 @@ - - -/** - * Lax-Friedrichs flux (Toro 2001, p 163) - */ -__device__ float3 LxF_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) { - const float3 F_l = F_func(Q_l, g_); - const float3 F_r = F_func(Q_r, g_); - - return 0.5f*(F_l + F_r) + (dx_/(2.0f*dt_))*(Q_l - Q_r); -} - - - -/** - * Lax-Friedrichs extended to 2D - */ -__device__ float3 LxF_2D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) { - const float3 F_l = F_func(Q_l, g_); - const float3 F_r = F_func(Q_r, g_); - - //Note numerical diffusion for 2D here (0.25) - return 0.5f*(F_l + F_r) + (dx_/(4.0f*dt_))*(Q_l - Q_r); -} diff --git a/GPUSimulators/fluxes/LaxWendroff.cu b/GPUSimulators/fluxes/LaxWendroff.cu deleted file mode 100644 index 839763b..0000000 --- a/GPUSimulators/fluxes/LaxWendroff.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* -This file implements the Lax-Wendroff flux - -Copyright (C) 2016, 2017, 2018 SINTEF ICT - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -*/ - - - -/** - * Richtmeyer / Two-step Lax-Wendroff flux (Toro 2001, p 164) - */ -__device__ float3 LxW2_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) { - const float3 F_l = F_func(Q_l, g_); - const float3 F_r = F_func(Q_r, g_); - - const float3 Q_lw2 = 0.5f*(Q_l + Q_r) + (dt_/(2.0f*dx_))*(F_l - F_r); - - return F_func(Q_lw2, g_); -} - diff --git a/GPUSimulators/fluxes/WeightedAverageFlux.cu b/GPUSimulators/fluxes/WeightedAverageFlux.cu deleted file mode 100644 index 0574c32..0000000 --- a/GPUSimulators/fluxes/WeightedAverageFlux.cu +++ /dev/null @@ -1,187 +0,0 @@ -/* -This file implements the Weighted Average Flux - -Copyright (C) 2016, 2017, 2018 SINTEF ICT - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -*/ - -#include "limiters.cu" - -/** - * Superbee flux limiter for WAF. - * Related to superbee limiter so that WAF_superbee(r, c) = 1 - (1-|c|)*superbee(r) - * @param r_ the ratio of upwind change (see Toro 2001, p. 203/204) - * @param c_ the courant number for wave k, dt*S_k/dx - */ -__device__ float WAF_superbee(float r_, float c_) { - // r <= 0.0 - if (r_ <= 0.0f) { - return 1.0f; - } - // 0.0 <= r <= 1/2 - else if (r_ <= 0.5f) { - return 1.0f - 2.0f*(1.0f - fabsf(c_))*r_; - } - // 1/2 <= r <= 1 - else if (r_ <= 1.0f) { - return fabs(c_); - } - // 1 <= r <= 2 - else if (r_ <= 2.0f) { - return 1.0f - (1.0f - fabsf(c_))*r_; - } - // r >= 2 - else { - return 2.0f*fabsf(c_) - 1.0f; - } -} - - - - -__device__ float WAF_albada(float r_, float c_) { - if (r_ <= 0.0f) { - return 1.0f; - } - else { - return 1.0f - (1.0f - fabsf(c_)) * r_ * (1.0f + r_) / (1.0f + r_*r_); - } -} - -__device__ float WAF_minbee(float r_, float c_) { - r_ = fmaxf(-1.0f, fminf(2.0f, r_)); - if (r_ <= 0.0f) { - return 1.0f; - } - if (r_ >= 0.0f && r_ <= 1.0f) { - return 1.0f - (1.0f - fabsf(c_)) * r_; - } - else { - return fabsf(c_); - } -} - -__device__ float WAF_minmod(float r_, float c_) { - return 1.0f - (1.0f - fabsf(c_)) * fmaxf(0.0f, fminf(1.0f, r_)); -} - - - -__device__ float limiterToWAFLimiter(float r_, float c_) { - return 1.0f - (1.0f - fabsf(c_))*r_; -} - -__device__ float desingularize(float x_, float eps_) { - return copysign(1.0f, x_)*fmaxf(fabsf(x_), fminf(x_*x_/(2.0f*eps_)+0.5f*eps_, eps_)); -} - -// Compute h in the "star region", h^dagger -__device__ __inline__ float computeHStar(float h_l, float h_r, float u_l, float u_r, float c_l, float c_r, float g_) { - - //This estimate for the h* gives rise to spurious oscillations. - //return 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r); - - const float h_tmp = 0.5f * (c_l + c_r) + 0.25f * (u_l - u_r); - return h_tmp*h_tmp / g_; -} - -/** - * Weighted average flux (Toro 2001, p 200) for interface {i+1/2} - * @param r_ The flux limiter parameter (see Toro 2001, p. 203) - * @param Q_l2 Q_{i-1} - * @param Q_l1 Q_{i} - * @param Q_r1 Q_{i+1} - * @param Q_r2 Q_{i+2} - */ -__device__ float3 WAF_1D_flux(const float3 Q_l2, const float3 Q_l1, const float3 Q_r1, const float3 Q_r2, const float g_, const float dx_, const float dt_) { - const float h_l = Q_l1.x; - const float h_r = Q_r1.x; - - const float h_l2 = Q_l2.x; - const float h_r2 = Q_r2.x; - - // Calculate velocities - const float u_l = Q_l1.y / h_l; - const float u_r = Q_r1.y / h_r; - - const float u_l2 = Q_l2.y / h_l2; - const float u_r2 = Q_r2.y / h_r2; - - const float v_l = Q_l1.z / h_l; - const float v_r = Q_r1.z / h_r; - - const float v_l2 = Q_l2.z / h_l2; - const float v_r2 = Q_r2.z / h_r2; - - // Estimate the potential wave speeds - const float c_l = sqrt(g_*h_l); - const float c_r = sqrt(g_*h_r); - - const float c_l2 = sqrt(g_*h_l2); - const float c_r2 = sqrt(g_*h_r2); - - // Compute h in the "star region", h^dagger - const float h_dag_l = computeHStar(h_l2, h_l, u_l2, u_l, c_l2, c_l, g_); - const float h_dag = computeHStar( h_l, h_r, u_l, u_r, c_l, c_r, g_); - const float h_dag_r = computeHStar( h_r, h_r2, u_r, u_r2, c_r, c_r2, g_); - - const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag ) ) / h_l; - const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag ) ) / h_r; - - const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f; - const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f; - - // Compute wave speed estimates - const float S_l = u_l - c_l*q_l; - const float S_r = u_r + c_r*q_r; - const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) ); - - const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1.0, S_star, v_l); - const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1.0, S_star, v_r); - - // Estimate the fluxes in the four regions - const float3 F_1 = F_func(Q_l1, g_); - const float3 F_4 = F_func(Q_r1, g_); - - const float3 F_2 = F_1 + S_l*(Q_star_l - Q_l1); - const float3 F_3 = F_4 + S_r*(Q_star_r - Q_r1); - //const float3 F_2 = F_func(Q_star_l, g_); - //const float3 F_3 = F_func(Q_star_r, g_); - - // Compute the courant numbers for the waves - const float c_1 = S_l * dt_ / dx_; - const float c_2 = S_star * dt_ / dx_; - const float c_3 = S_r * dt_ / dx_; - - // Compute the "upwind change" vectors for the i-3/2 and i+3/2 interfaces - const float eps = 1.0e-6f; - const float r_1 = desingularize( (c_1 > 0.0f) ? (h_dag_l - h_l2) : (h_dag_r - h_r), eps) / desingularize((h_dag - h_l), eps); - const float r_2 = desingularize( (c_2 > 0.0f) ? (v_l - v_l2) : (v_r2 - v_r), eps ) / desingularize((v_r - v_l), eps); - const float r_3 = desingularize( (c_3 > 0.0f) ? (h_l - h_dag_l) : (h_r2 - h_dag_r), eps ) / desingularize((h_r - h_dag), eps); - - // Compute the limiter - // We use h for the nonlinear waves, and v for the middle shear wave - const float A_1 = copysign(1.0f, c_1) * limiterToWAFLimiter(generalized_minmod(r_1, 1.9f), c_1); - const float A_2 = copysign(1.0f, c_2) * limiterToWAFLimiter(generalized_minmod(r_2, 1.9f), c_2); - const float A_3 = copysign(1.0f, c_3) * limiterToWAFLimiter(generalized_minmod(r_3, 1.9f), c_3); - - //Average the fluxes - const float3 flux = 0.5f*( F_1 + F_4 ) - - 0.5f*( A_1 * (F_2 - F_1) - + A_2 * (F_3 - F_2) - + A_3 * (F_4 - F_3) ); - - return flux; -}