From 7592ad5b9f9c01be9882e4c9770d3672ec02798e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20R=2E=20Brodtkorb?= Date: Thu, 15 Nov 2018 16:47:13 +0100 Subject: [PATCH] Fixed order again --- GPUSimulators/Autotuner.py | 2 +- GPUSimulators/CudaContext.py | 28 +-- GPUSimulators/EE2D_KP07_dimsplit.py | 100 ++++---- GPUSimulators/FORCE.py | 68 +++--- GPUSimulators/HLL.py | 61 ++--- GPUSimulators/HLL2.py | 84 +++---- GPUSimulators/KP07.py | 72 +++--- GPUSimulators/KP07_dimsplit.py | 90 +++---- GPUSimulators/LxF.py | 59 ++--- GPUSimulators/Simulator.py | 60 ++--- GPUSimulators/WAF.py | 76 +++--- GPUSimulators/cuda/EE2D_KP07_dimsplit.cu | 80 +++---- GPUSimulators/cuda/EulerCommon.h | 54 +++++ GPUSimulators/cuda/SWE2D_FORCE.cu | 23 +- GPUSimulators/cuda/SWE2D_HLL.cu | 23 +- GPUSimulators/cuda/SWE2D_HLL2.cu | 37 ++- GPUSimulators/cuda/SWE2D_KP07.cu | 9 +- GPUSimulators/cuda/SWE2D_KP07_dimsplit.cu | 103 ++++---- GPUSimulators/cuda/SWE2D_LxF.cu | 22 +- GPUSimulators/cuda/SWE2D_WAF.cu | 29 ++- GPUSimulators/cuda/common.h | 277 ++++++++++++++-------- GPUSimulators/cuda/limiters.h | 20 +- 22 files changed, 758 insertions(+), 619 deletions(-) diff --git a/GPUSimulators/Autotuner.py b/GPUSimulators/Autotuner.py index 763aa43..47a3a41 100644 --- a/GPUSimulators/Autotuner.py +++ b/GPUSimulators/Autotuner.py @@ -29,7 +29,7 @@ from socket import gethostname import pycuda.driver as cuda -from GPUSimulators import Common, Simulator +from GPUSimulators import Common, Simulator, CudaContext class Autotuner: def __init__(self, diff --git a/GPUSimulators/CudaContext.py b/GPUSimulators/CudaContext.py index 15772b6..243469f 100644 --- a/GPUSimulators/CudaContext.py +++ b/GPUSimulators/CudaContext.py @@ -48,7 +48,7 @@ class CudaContext(object): self.blocking = blocking self.use_cache = use_cache self.logger = logging.getLogger(__name__) - self.kernels = {} + self.modules = {} self.module_path = os.path.dirname(os.path.realpath(__file__)) @@ -164,12 +164,12 @@ class CudaContext(object): break return kernel_hasher.hexdigest() - + + """ Reads a text file and creates an OpenCL kernel from that """ - def get_prepared_kernel(self, kernel_filename, kernel_function_name, \ - prepared_call_args, \ + def get_module(self, kernel_filename, include_dirs=[], \ defines={}, \ compile_args={'no_extern_c', True}, jit_compile_args={}): @@ -206,9 +206,9 @@ class CudaContext(object): cached_kernel_filename = os.path.join(self.cache_path, kernel_hash) # If we have the kernel in our hashmap, return it - if (kernel_hash in self.kernels.keys()): + if (kernel_hash in self.modules.keys()): self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash) - return self.kernels[kernel_hash] + return self.modules[kernel_hash] # If we have it on disk, return it elif (self.use_cache and os.path.isfile(cached_kernel_filename)): @@ -218,10 +218,8 @@ class CudaContext(object): file_str = file.read() module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler, **jit_compile_args) - kernel = module.get_function(kernel_function_name) - kernel.prepare(prepared_call_args) - self.kernels[kernel_hash] = kernel - return kernel + self.modules[kernel_hash] = module + return module # Otherwise, compile it from source else: @@ -250,19 +248,15 @@ class CudaContext(object): with io.open(cached_kernel_filename, "wb") as file: file.write(cubin) - kernel = module.get_function(kernel_function_name) - kernel.prepare(prepared_call_args) - self.kernels[kernel_hash] = kernel - - - return kernel + self.modules[kernel_hash] = module + return module """ Clears the kernel cache (useful for debugging & development) """ def clear_kernel_cache(self): self.logger.debug("Clearing cache") - self.kernels = {} + self.modules = {} gc.collect() """ diff --git a/GPUSimulators/EE2D_KP07_dimsplit.py b/GPUSimulators/EE2D_KP07_dimsplit.py index fca10b4..7726cec 100644 --- a/GPUSimulators/EE2D_KP07_dimsplit.py +++ b/GPUSimulators/EE2D_KP07_dimsplit.py @@ -24,6 +24,8 @@ from GPUSimulators import Simulator, Common from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition import numpy as np +from pycuda import gpuarray + @@ -52,80 +54,81 @@ class EE2D_KP07_dimsplit (BaseSimulator): gamma: Gas constant p: pressure """ - def __init__(self, \ - context, \ - rho, rho_u, rho_v, E, \ - nx, ny, \ - dx, dy, dt, \ - g, \ - gamma, \ - theta=1.3, \ - order=2, \ - boundary_conditions=BoundaryCondition(), \ + def __init__(self, + context, + rho, rho_u, rho_v, E, + nx, ny, + dx, dy, dt, + g, + gamma, + theta=1.3, + cfl_scale=0.25*0.9, + boundary_conditions=BoundaryCondition(), block_width=16, block_height=8): # Call super constructor super().__init__(context, \ nx, ny, \ - dx, dy, dt, \ + dx, dy, 2*dt, \ block_width, block_height) self.g = np.float32(g) self.gamma = np.float32(gamma) self.theta = np.float32(theta) - self.order = np.int32(order) + self.cfl_scale = cfl_scale self.boundary_conditions = boundary_conditions.asCodedInt() #Get kernels - self.kernel = context.get_prepared_kernel("cuda/EE2D_KP07_dimsplit.cu", "KP07DimsplitKernel", \ - "iiffffffiiPiPiPiPiPiPiPiPi", \ + module = context.get_module("cuda/EE2D_KP07_dimsplit.cu", defines={ 'BLOCK_WIDTH': self.block_size[0], 'BLOCK_HEIGHT': self.block_size[1] - }, \ + }, compile_args={ 'no_extern_c': True, 'options': ["--use_fast_math"], - }, \ + }, jit_compile_args={}) + self.kernel = module.get_function("KP07DimsplitKernel") + self.kernel.prepare("iiffffffiiPiPiPiPiPiPiPiPiP") + #Create data by uploading to device - self.u0 = Common.ArakawaA2D(self.stream, \ - nx, ny, \ - 2, 2, \ + self.u0 = Common.ArakawaA2D(self.stream, + nx, ny, + 2, 2, [rho, rho_u, rho_v, E]) - self.u1 = Common.ArakawaA2D(self.stream, \ - nx, ny, \ - 2, 2, \ + self.u1 = Common.ArakawaA2D(self.stream, + nx, ny, + 2, 2, [None, None, None, None]) + self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) + self.cfl_data.fill(self.dt, stream=self.stream) + def step(self, dt): - if (self.order == 1): - self.substepDimsplit(dt, substep=(self.nt % 2)) - elif (self.order == 2): - self.substepDimsplit(dt, substep=0) - self.substepDimsplit(dt, substep=1) - else: - raise(NotImplementedError("Order {:d} is not implemented".format(self.order))) + self.substepDimsplit(0.5*dt, 0) + self.substepDimsplit(0.5*dt, 1) self.t += dt - self.nt += 1 + self.nt += 2 def substepDimsplit(self, dt, substep): - self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ - self.nx, self.ny, \ - self.dx, self.dy, dt, \ - self.g, \ - self.gamma, \ - self.theta, \ - Simulator.stepOrderToCodedInt(step=substep, order=self.order), \ - self.boundary_conditions, \ - self.u0[0].data.gpudata, self.u0[0].data.strides[0], \ - self.u0[1].data.gpudata, self.u0[1].data.strides[0], \ - self.u0[2].data.gpudata, self.u0[2].data.strides[0], \ - self.u0[3].data.gpudata, self.u0[3].data.strides[0], \ - self.u1[0].data.gpudata, self.u1[0].data.strides[0], \ - self.u1[1].data.gpudata, self.u1[1].data.strides[0], \ - self.u1[2].data.gpudata, self.u1[2].data.strides[0], \ - self.u1[3].data.gpudata, self.u1[3].data.strides[0]) + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, + self.nx, self.ny, + self.dx, self.dy, dt, + self.g, + self.gamma, + self.theta, + substep, + self.boundary_conditions, + self.u0[0].data.gpudata, self.u0[0].data.strides[0], + self.u0[1].data.gpudata, self.u0[1].data.strides[0], + self.u0[2].data.gpudata, self.u0[2].data.strides[0], + self.u0[3].data.gpudata, self.u0[3].data.strides[0], + self.u1[0].data.gpudata, self.u1[0].data.strides[0], + self.u1[1].data.gpudata, self.u1[1].data.strides[0], + self.u1[2].data.gpudata, self.u1[2].data.strides[0], + self.u1[3].data.gpudata, self.u1[3].data.strides[0], + self.cfl_data.gpudata) self.u0, self.u1 = self.u1, self.u0 def download(self): @@ -134,4 +137,7 @@ class EE2D_KP07_dimsplit (BaseSimulator): def check(self): self.u0.check() self.u1.check() - pass \ No newline at end of file + + def computeDt(self): + max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get(); + return max_dt*self.cfl_scale \ No newline at end of file diff --git a/GPUSimulators/FORCE.py b/GPUSimulators/FORCE.py index 7a86a7c..304bd81 100644 --- a/GPUSimulators/FORCE.py +++ b/GPUSimulators/FORCE.py @@ -52,61 +52,67 @@ class FORCE (Simulator.BaseSimulator): dt: Size of each timestep (90 s) g: Gravitational accelleration (9.81 m/s^2) """ - def __init__(self, \ - context, \ - h0, hu0, hv0, \ - nx, ny, \ - dx, dy, dt, \ - g, \ - boundary_conditions=BoundaryCondition(), \ + def __init__(self, + context, + h0, hu0, hv0, + nx, ny, + dx, dy, dt, + g, + boundary_conditions=BoundaryCondition(), block_width=16, block_height=16): # Call super constructor - super().__init__(context, \ - nx, ny, \ - dx, dy, dt, \ + super().__init__(context, + nx, ny, + dx, dy, dt, block_width, block_height); self.g = np.float32(g) self.boundary_conditions = boundary_conditions.asCodedInt() #Get kernels - self.kernel = context.get_prepared_kernel("cuda/SWE2D_FORCE.cu", "FORCEKernel", \ - "iiffffiPiPiPiPiPiPi", \ + module = context.get_module("cuda/SWE2D_FORCE.cu", defines={ 'BLOCK_WIDTH': self.block_size[0], 'BLOCK_HEIGHT': self.block_size[1] - }, \ + }, compile_args={ 'no_extern_c': True, 'options': ["--use_fast_math"], - }, \ + }, jit_compile_args={}) + self.kernel = module.get_function("FORCEKernel") + self.kernel.prepare("iiffffiPiPiPiPiPiPi") #Create data by uploading to device - self.u0 = Common.ArakawaA2D(self.stream, \ - nx, ny, \ - 1, 1, \ + self.u0 = Common.ArakawaA2D(self.stream, + nx, ny, + 1, 1, [h0, hu0, hv0]) - self.u1 = Common.ArakawaA2D(self.stream, \ - nx, ny, \ - 1, 1, \ + self.u1 = Common.ArakawaA2D(self.stream, + nx, ny, + 1, 1, [None, None, None]) def step(self, dt): - self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ - self.nx, self.ny, \ - self.dx, self.dy, dt, \ - self.g, \ - self.boundary_conditions, \ - self.u0[0].data.gpudata, self.u0[0].data.strides[0], \ - self.u0[1].data.gpudata, self.u0[1].data.strides[0], \ - self.u0[2].data.gpudata, self.u0[2].data.strides[0], \ - self.u1[0].data.gpudata, self.u1[0].data.strides[0], \ - self.u1[1].data.gpudata, self.u1[1].data.strides[0], \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, + self.nx, self.ny, + self.dx, self.dy, dt, + self.g, + self.boundary_conditions, + self.u0[0].data.gpudata, self.u0[0].data.strides[0], + self.u0[1].data.gpudata, self.u0[1].data.strides[0], + self.u0[2].data.gpudata, self.u0[2].data.strides[0], + self.u1[0].data.gpudata, self.u1[0].data.strides[0], + self.u1[1].data.gpudata, self.u1[1].data.strides[0], self.u1[2].data.gpudata, self.u1[2].data.strides[0]) self.u0, self.u1 = self.u1, self.u0 self.t += dt self.nt += 1 def download(self): - return self.u0.download(self.stream) \ No newline at end of file + return self.u0.download(self.stream) + + def check(self): + self.u0.check() + self.u1.check() + \ No newline at end of file diff --git a/GPUSimulators/HLL.py b/GPUSimulators/HLL.py index f2fcb6c..0a62955 100644 --- a/GPUSimulators/HLL.py +++ b/GPUSimulators/HLL.py @@ -47,57 +47,58 @@ class HLL (Simulator.BaseSimulator): dt: Size of each timestep (90 s) g: Gravitational accelleration (9.81 m/s^2) """ - def __init__(self, \ - context, \ - h0, hu0, hv0, \ - nx, ny, \ - dx, dy, dt, \ - g, \ - boundary_conditions=BoundaryCondition(), \ + def __init__(self, + context, + h0, hu0, hv0, + nx, ny, + dx, dy, dt, + g, + boundary_conditions=BoundaryCondition(), block_width=16, block_height=16): # Call super constructor - super().__init__(context, \ - nx, ny, \ - dx, dy, dt, \ + super().__init__(context, + nx, ny, + dx, dy, dt, block_width, block_height); self.g = np.float32(g) self.boundary_conditions = boundary_conditions.asCodedInt() #Get kernels - self.kernel = context.get_prepared_kernel("cuda/SWE2D_HLL.cu", "HLLKernel", \ - "iiffffiPiPiPiPiPiPi", \ + module = context.get_module("cuda/SWE2D_HLL.cu", defines={ 'BLOCK_WIDTH': self.block_size[0], 'BLOCK_HEIGHT': self.block_size[1] - }, \ + }, compile_args={ 'no_extern_c': True, 'options': ["--use_fast_math"], - }, \ + }, jit_compile_args={}) + self.kernel = module.get_function("HLLKernel") + self.kernel.prepare("iiffffiPiPiPiPiPiPi") #Create data by uploading to device - self.u0 = Common.ArakawaA2D(self.stream, \ - nx, ny, \ - 1, 1, \ + self.u0 = Common.ArakawaA2D(self.stream, + nx, ny, + 1, 1, [h0, hu0, hv0]) - self.u1 = Common.ArakawaA2D(self.stream, \ - nx, ny, \ - 1, 1, \ + self.u1 = Common.ArakawaA2D(self.stream, + nx, ny, + 1, 1, [None, None, None]) def step(self, dt): - self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ - self.nx, self.ny, \ - self.dx, self.dy, dt, \ - self.g, \ - self.boundary_conditions, \ - self.u0[0].data.gpudata, self.u0[0].data.strides[0], \ - self.u0[1].data.gpudata, self.u0[1].data.strides[0], \ - self.u0[2].data.gpudata, self.u0[2].data.strides[0], \ - self.u1[0].data.gpudata, self.u1[0].data.strides[0], \ - self.u1[1].data.gpudata, self.u1[1].data.strides[0], \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, + self.nx, self.ny, + self.dx, self.dy, dt, + self.g, + self.boundary_conditions, + self.u0[0].data.gpudata, self.u0[0].data.strides[0], + self.u0[1].data.gpudata, self.u0[1].data.strides[0], + self.u0[2].data.gpudata, self.u0[2].data.strides[0], + self.u1[0].data.gpudata, self.u1[0].data.strides[0], + self.u1[1].data.gpudata, self.u1[1].data.strides[0], self.u1[2].data.gpudata, self.u1[2].data.strides[0]) self.u0, self.u1 = self.u1, self.u0 self.t += dt diff --git a/GPUSimulators/HLL2.py b/GPUSimulators/HLL2.py index 95ff09c..ee047da 100644 --- a/GPUSimulators/HLL2.py +++ b/GPUSimulators/HLL2.py @@ -49,79 +49,69 @@ class HLL2 (Simulator.BaseSimulator): dt: Size of each timestep (90 s) g: Gravitational accelleration (9.81 m/s^2) """ - def __init__(self, \ - context, \ - h0, hu0, hv0, \ - nx, ny, \ - dx, dy, dt, \ - g, \ - theta=1.8, \ - order=2, \ - boundary_conditions=BoundaryCondition(), \ + def __init__(self, + context, + h0, hu0, hv0, + nx, ny, + dx, dy, dt, + g, + theta=1.8, + boundary_conditions=BoundaryCondition(), block_width=16, block_height=16): # Call super constructor - super().__init__(context, \ - nx, ny, \ - dx, dy, dt, \ + super().__init__(context, + nx, ny, + dx, dy, dt*2, block_width, block_height); self.g = np.float32(g) self.theta = np.float32(theta) - self.order = np.int32(order) self.boundary_conditions = boundary_conditions.asCodedInt() - #This kernel is dimensionally split, and therefore only second order every other - #dimsplit timestep. Therefore, step always runs two substeps - self.dt = 2*self.dt - #Get kernels - self.kernel = context.get_prepared_kernel("cuda/SWE2D_HLL2.cu", "HLL2Kernel", \ - "iifffffiiPiPiPiPiPiPi", \ + module = context.get_module("cuda/SWE2D_HLL2.cu", defines={ 'BLOCK_WIDTH': self.block_size[0], 'BLOCK_HEIGHT': self.block_size[1] - }, \ + }, compile_args={ 'no_extern_c': True, 'options': ["--use_fast_math"], - }, \ + }, jit_compile_args={}) + self.kernel = module.get_function("HLL2Kernel") + self.kernel.prepare("iifffffiiPiPiPiPiPiPi") #Create data by uploading to device - self.u0 = Common.ArakawaA2D(self.stream, \ - nx, ny, \ - 2, 2, \ + self.u0 = Common.ArakawaA2D(self.stream, + nx, ny, + 2, 2, [h0, hu0, hv0]) - self.u1 = Common.ArakawaA2D(self.stream, \ - nx, ny, \ - 2, 2, \ + self.u1 = Common.ArakawaA2D(self.stream, + nx, ny, + 2, 2, [None, None, None]) def step(self, dt): - if (self.order == 1): - self.substepDimsplit(0.5*dt, 0) - self.substepDimsplit(0.5*dt, 1) - elif (self.order == 2): - self.substepDimsplit(0.5*dt, 0) - self.substepDimsplit(0.5*dt, 1) - else: - raise(NotImplementedError("Order {:d} is not implemented".format(self.order))) + self.substepDimsplit(dt*0.5, 0) + self.substepDimsplit(dt*0.5, 1) + self.t += dt self.nt += 2 def substepDimsplit(self, dt, substep): - self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ - self.nx, self.ny, \ - self.dx, self.dy, dt, \ - self.g, \ - self.theta, \ - Simulator.stepOrderToCodedInt(step=substep, order=self.order), \ - self.boundary_conditions, \ - self.u0[0].data.gpudata, self.u0[0].data.strides[0], \ - self.u0[1].data.gpudata, self.u0[1].data.strides[0], \ - self.u0[2].data.gpudata, self.u0[2].data.strides[0], \ - self.u1[0].data.gpudata, self.u1[0].data.strides[0], \ - self.u1[1].data.gpudata, self.u1[1].data.strides[0], \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, + self.nx, self.ny, + self.dx, self.dy, dt, + self.g, + self.theta, + substep, + self.boundary_conditions, + self.u0[0].data.gpudata, self.u0[0].data.strides[0], + self.u0[1].data.gpudata, self.u0[1].data.strides[0], + self.u0[2].data.gpudata, self.u0[2].data.strides[0], + self.u1[0].data.gpudata, self.u1[0].data.strides[0], + self.u1[1].data.gpudata, self.u1[1].data.strides[0], self.u1[2].data.gpudata, self.u1[2].data.strides[0]) self.u0, self.u1 = self.u1, self.u0 diff --git a/GPUSimulators/KP07.py b/GPUSimulators/KP07.py index 6710a97..31ae928 100644 --- a/GPUSimulators/KP07.py +++ b/GPUSimulators/KP07.py @@ -50,21 +50,21 @@ class KP07 (Simulator.BaseSimulator): dt: Size of each timestep (90 s) g: Gravitational accelleration (9.81 m/s^2) """ - def __init__(self, \ - context, \ - h0, hu0, hv0, \ - nx, ny, \ - dx, dy, dt, \ - g, \ - theta=1.3, \ - order=2, \ - boundary_conditions=BoundaryCondition(), \ + def __init__(self, + context, + h0, hu0, hv0, + nx, ny, + dx, dy, dt, + g, + theta=1.3, + order=2, + boundary_conditions=BoundaryCondition(), block_width=16, block_height=16): # Call super constructor - super().__init__(context, \ - nx, ny, \ - dx, dy, dt, \ + super().__init__(context, + nx, ny, + dx, dy, dt, block_width, block_height); self.g = np.float32(g) self.theta = np.float32(theta) @@ -72,26 +72,27 @@ class KP07 (Simulator.BaseSimulator): self.boundary_conditions = boundary_conditions.asCodedInt() #Get kernels - self.kernel = context.get_prepared_kernel("cuda/SWE2D_KP07.cu", "KP07Kernel", \ - "iifffffiiPiPiPiPiPiPi", \ + module = context.get_module("cuda/SWE2D_KP07.cu", defines={ 'BLOCK_WIDTH': self.block_size[0], 'BLOCK_HEIGHT': self.block_size[1] - }, \ + }, compile_args={ 'no_extern_c': True, 'options': ["--use_fast_math"], - }, \ + }, jit_compile_args={}) + self.kernel = module.get_function("KP07Kernel") + self.kernel.prepare("iifffffiiPiPiPiPiPiPi") #Create data by uploading to device - self.u0 = Common.ArakawaA2D(self.stream, \ - nx, ny, \ - 2, 2, \ + self.u0 = Common.ArakawaA2D(self.stream, + nx, ny, + 2, 2, [h0, hu0, hv0]) - self.u1 = Common.ArakawaA2D(self.stream, \ - nx, ny, \ - 2, 2, \ + self.u1 = Common.ArakawaA2D(self.stream, + nx, ny, + 2, 2, [None, None, None]) @@ -108,20 +109,21 @@ class KP07 (Simulator.BaseSimulator): def substepRK(self, dt, substep): - self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ - self.nx, self.ny, \ - self.dx, self.dy, dt, \ - self.g, \ - self.theta, \ - Simulator.stepOrderToCodedInt(step=substep, order=self.order), \ - self.boundary_conditions, \ - self.u0[0].data.gpudata, self.u0[0].data.strides[0], \ - self.u0[1].data.gpudata, self.u0[1].data.strides[0], \ - self.u0[2].data.gpudata, self.u0[2].data.strides[0], \ - self.u1[0].data.gpudata, self.u1[0].data.strides[0], \ - self.u1[1].data.gpudata, self.u1[1].data.strides[0], \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, + self.nx, self.ny, + self.dx, self.dy, dt, + self.g, + self.theta, + Simulator.stepOrderToCodedInt(step=substep, order=self.order), + self.boundary_conditions, + self.u0[0].data.gpudata, self.u0[0].data.strides[0], + self.u0[1].data.gpudata, self.u0[1].data.strides[0], + self.u0[2].data.gpudata, self.u0[2].data.strides[0], + self.u1[0].data.gpudata, self.u1[0].data.strides[0], + self.u1[1].data.gpudata, self.u1[1].data.strides[0], self.u1[2].data.gpudata, self.u1[2].data.strides[0]) self.u0, self.u1 = self.u1, self.u0 - + + def download(self): return self.u0.download(self.stream) \ No newline at end of file diff --git a/GPUSimulators/KP07_dimsplit.py b/GPUSimulators/KP07_dimsplit.py index 876ddc5..58535d3 100644 --- a/GPUSimulators/KP07_dimsplit.py +++ b/GPUSimulators/KP07_dimsplit.py @@ -50,76 +50,78 @@ class KP07_dimsplit (Simulator.BaseSimulator): dt: Size of each timestep (90 s) g: Gravitational accelleration (9.81 m/s^2) """ - def __init__(self, \ - context, \ - h0, hu0, hv0, \ - nx, ny, \ - dx, dy, dt, \ - g, \ - theta=1.3, \ - order=2, \ - boundary_conditions=BoundaryCondition(), \ + def __init__(self, + context, + h0, hu0, hv0, + nx, ny, + dx, dy, dt, + g, + theta=1.3, + boundary_conditions=BoundaryCondition(), block_width=16, block_height=16): # Call super constructor - super().__init__(context, \ - nx, ny, \ - dx, dy, dt, \ + super().__init__(context, + nx, ny, + dx, dy, dt*2, block_width, block_height) + self.gc_x = 2 + self.gc_y = 2 self.g = np.float32(g) self.theta = np.float32(theta) - self.order = np.int32(order) self.boundary_conditions = boundary_conditions.asCodedInt() #Get kernels - self.kernel = context.get_prepared_kernel("cuda/SWE2D_KP07_dimsplit.cu", "KP07DimsplitKernel", \ - "iifffffiiPiPiPiPiPiPi", \ + module = context.get_module("cuda/SWE2D_KP07_dimsplit.cu", defines={ 'BLOCK_WIDTH': self.block_size[0], 'BLOCK_HEIGHT': self.block_size[1] - }, \ + }, compile_args={ 'no_extern_c': True, 'options': ["--use_fast_math"], - }, \ + }, jit_compile_args={}) + self.kernel = module.get_function("KP07DimsplitKernel") + self.kernel.prepare("iifffffiiPiPiPiPiPiPi") #Create data by uploading to device - self.u0 = Common.ArakawaA2D(self.stream, \ - nx, ny, \ - 2, 2, \ + self.u0 = Common.ArakawaA2D(self.stream, + nx, ny, + self.gc_x, self.gc_y, [h0, hu0, hv0]) - self.u1 = Common.ArakawaA2D(self.stream, \ - nx, ny, \ - 2, 2, \ + self.u1 = Common.ArakawaA2D(self.stream, + nx, ny, + self.gc_x, self.gc_y, [None, None, None]) def step(self, dt): - if (self.order == 1): - self.substepDimsplit(dt, substep=(self.nt % 2)) - elif (self.order == 2): - self.substepDimsplit(dt, substep=0) - self.substepDimsplit(dt, substep=1) - else: - raise(NotImplementedError("Order {:d} is not implemented".format(self.order))) + self.substepDimsplit(dt*0.5, 0) + self.substepDimsplit(dt*0.5, 1) + self.t += dt - self.nt += 1 + self.nt += 2 def substepDimsplit(self, dt, substep): - self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ - self.nx, self.ny, \ - self.dx, self.dy, dt, \ - self.g, \ - self.theta, \ - Simulator.stepOrderToCodedInt(step=substep, order=self.order), \ - self.boundary_conditions, \ - self.u0[0].data.gpudata, self.u0[0].data.strides[0], \ - self.u0[1].data.gpudata, self.u0[1].data.strides[0], \ - self.u0[2].data.gpudata, self.u0[2].data.strides[0], \ - self.u1[0].data.gpudata, self.u1[0].data.strides[0], \ - self.u1[1].data.gpudata, self.u1[1].data.strides[0], \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, + self.nx, self.ny, + self.dx, self.dy, dt, + self.g, + self.theta, + substep, + self.boundary_conditions, + self.u0[0].data.gpudata, self.u0[0].data.strides[0], + self.u0[1].data.gpudata, self.u0[1].data.strides[0], + self.u0[2].data.gpudata, self.u0[2].data.strides[0], + self.u1[0].data.gpudata, self.u1[0].data.strides[0], + self.u1[1].data.gpudata, self.u1[1].data.strides[0], self.u1[2].data.gpudata, self.u1[2].data.strides[0]) self.u0, self.u1 = self.u1, self.u0 + def download(self): - return self.u0.download(self.stream) \ No newline at end of file + return self.u0.download(self.stream) + + def check(self): + self.u0.check() + self.u1.check() \ No newline at end of file diff --git a/GPUSimulators/LxF.py b/GPUSimulators/LxF.py index 0801209..0ad7743 100644 --- a/GPUSimulators/LxF.py +++ b/GPUSimulators/LxF.py @@ -48,57 +48,58 @@ class LxF (Simulator.BaseSimulator): dt: Size of each timestep (90 s) g: Gravitational accelleration (9.81 m/s^2) """ - def __init__(self, \ - context, \ - h0, hu0, hv0, \ - nx, ny, \ - dx, dy, dt, \ - g, \ + def __init__(self, + context, + h0, hu0, hv0, + nx, ny, + dx, dy, dt, + g, boundary_conditions=BoundaryCondition(), block_width=16, block_height=16): # Call super constructor - super().__init__(context, \ - nx, ny, \ - dx, dy, dt, \ + super().__init__(context, + nx, ny, + dx, dy, dt, block_width, block_height); self.g = np.float32(g) self.boundary_conditions = boundary_conditions.asCodedInt() # Get kernels - self.kernel = context.get_prepared_kernel("cuda/SWE2D_LxF.cu", "LxFKernel", \ - "iiffffiPiPiPiPiPiPi", \ + module = context.get_module("cuda/SWE2D_LxF.cu", defines={ 'BLOCK_WIDTH': self.block_size[0], 'BLOCK_HEIGHT': self.block_size[1] - }, \ + }, compile_args={ 'no_extern_c': True, 'options': ["--use_fast_math"], - }, \ + }, jit_compile_args={}) + self.kernel = module.get_function("LxFKernel") + self.kernel.prepare("iiffffiPiPiPiPiPiPi") #Create data by uploading to device - self.u0 = Common.ArakawaA2D(self.stream, \ - nx, ny, \ - 1, 1, \ + self.u0 = Common.ArakawaA2D(self.stream, + nx, ny, + 1, 1, [h0, hu0, hv0]) - self.u1 = Common.ArakawaA2D(self.stream, \ - nx, ny, \ - 1, 1, \ + self.u1 = Common.ArakawaA2D(self.stream, + nx, ny, + 1, 1, [None, None, None]) def step(self, dt): - self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ - self.nx, self.ny, \ - self.dx, self.dy, dt, \ - self.g, \ - self.boundary_conditions, \ - self.u0[0].data.gpudata, self.u0[0].data.strides[0], \ - self.u0[1].data.gpudata, self.u0[1].data.strides[0], \ - self.u0[2].data.gpudata, self.u0[2].data.strides[0], \ - self.u1[0].data.gpudata, self.u1[0].data.strides[0], \ - self.u1[1].data.gpudata, self.u1[1].data.strides[0], \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, + self.nx, self.ny, + self.dx, self.dy, dt, + self.g, + self.boundary_conditions, + self.u0[0].data.gpudata, self.u0[0].data.strides[0], + self.u0[1].data.gpudata, self.u0[1].data.strides[0], + self.u0[2].data.gpudata, self.u0[2].data.strides[0], + self.u1[0].data.gpudata, self.u1[0].data.strides[0], + self.u1[1].data.gpudata, self.u1[1].data.strides[0], self.u1[2].data.gpudata, self.u1[2].data.strides[0]) self.u0, self.u1 = self.u1, self.u0 self.t += dt diff --git a/GPUSimulators/Simulator.py b/GPUSimulators/Simulator.py index 2140173..bfeedf6 100644 --- a/GPUSimulators/Simulator.py +++ b/GPUSimulators/Simulator.py @@ -50,9 +50,9 @@ class BoundaryCondition(object): Neumann = 1, Periodic = 2, Reflective = 3 - - - + + + def __init__(self, types={ \ 'north': Type.Reflective, \ 'south': Type.Reflective, \ @@ -85,13 +85,13 @@ class BoundaryCondition(object): bc = bc | (self.north & 0x0000000F) << 24 bc = bc | (self.south & 0x0000000F) << 16 bc = bc | (self.east & 0x0000000F) << 8 - bc = bc | (self.west & 0x0000000F) + bc = bc | (self.west & 0x0000000F) #for t in types: # print("{0:s}, {1:d}, {1:032b}, {1:08b}".format(t, types[t])) #print("bc: {0:032b}".format(bc)) - return np.int32(bc) + return np.int32(bc) @@ -101,10 +101,10 @@ class BoundaryCondition(object): class BaseSimulator(object): - def __init__(self, \ - context, \ - nx, ny, \ - dx, dy, dt, \ + def __init__(self, + context, + nx, ny, + dx, dy, dt, block_width, block_height): """ Initialization routine @@ -141,9 +141,9 @@ class BaseSimulator(object): #Compute kernel launch parameters self.block_size = (block_width, block_height, 1) - self.grid_size = ( \ - int(np.ceil(self.nx / float(self.block_size[0]))), \ - int(np.ceil(self.ny / float(self.block_size[1]))) \ + self.grid_size = ( + int(np.ceil(self.nx / float(self.block_size[0]))), + int(np.ceil(self.ny / float(self.block_size[1]))) ) #Create a CUDA stream @@ -158,43 +158,42 @@ class BaseSimulator(object): return "{:s} [{:d}x{:d}]".format(self.__class__.__name__, self.nx, self.ny) - def simulate(self, t_end): + def simulate(self, t): """ Function which simulates t_end seconds using the step function Requires that the step() function is implemented in the subclasses """ - # Compute number of timesteps to perform - n = int(t_end / self.dt + 1) - printer = Common.ProgressPrinter(n) + printer = Common.ProgressPrinter(t) + + t_end = self.simTime() + t + + while(self.simTime() < t_end): + if (self.simSteps() % 100 == 0): + self.dt = self.computeDt() - for i in range(0, n): # Compute timestep for "this" iteration (i.e., shorten last timestep) - local_dt = np.float32(min(self.dt, t_end-i*self.dt)) - + local_dt = np.float32(min(self.dt, t_end-self.simTime())) + # Stop if end reached (should not happen) if (local_dt <= 0.0): - self.logger.warning("Timestep size {:d} is less than or equal to zero!".format(self.nt + i)) + self.logger.warning("Timestep size {:d} is less than or equal to zero!".format(self.simSteps())) break # Step forward in time self.step(local_dt) #Print info - print_string = printer.getPrintString(i) + print_string = printer.getPrintString(t_end - self.simTime()) if (print_string): - self.logger.info("%s (Euler): %s", self, print_string) + self.logger.info("%s: %s", self, print_string) try: self.check() except AssertionError as e: e.args += ("Step={:d}, time={:f}".format(self.simSteps(), self.simTime())) raise - - - #self.logger.info("%s simulated %f seconds to %f with %d steps (Euler)", self, t_end, self.t, n) - return self.t, n - - + + def step(self, dt): """ Function which performs one single timestep of size dt @@ -208,7 +207,8 @@ class BaseSimulator(object): self.stream.synchronize() def check(self): - raise(NotImplementedError("Needs to be implemented in subclass")) + self.logger.warning("check() is not implemented - please implement") + #raise(NotImplementedError("Needs to be implemented in subclass")) def simTime(self): return self.t @@ -216,6 +216,8 @@ class BaseSimulator(object): def simSteps(self): return self.nt + def computeDt(self): + raise(NotImplementedError("Needs to be implemented in subclass")) diff --git a/GPUSimulators/WAF.py b/GPUSimulators/WAF.py index d33ec82..903e0d1 100644 --- a/GPUSimulators/WAF.py +++ b/GPUSimulators/WAF.py @@ -46,71 +46,65 @@ class WAF (Simulator.BaseSimulator): dt: Size of each timestep (90 s) g: Gravitational accelleration (9.81 m/s^2) """ - def __init__(self, \ - context, \ - h0, hu0, hv0, \ - nx, ny, \ - dx, dy, dt, \ - g, \ - order=2, \ - boundary_conditions=BoundaryCondition(), \ + def __init__(self, + context, + h0, hu0, hv0, + nx, ny, + dx, dy, dt, + g, + boundary_conditions=BoundaryCondition(), block_width=16, block_height=16): # Call super constructor - super().__init__(context, \ - nx, ny, \ - dx, dy, dt, \ + super().__init__(context, + nx, ny, + dx, dy, dt*2, block_width, block_height); self.g = np.float32(g) - self.order = np.int32(order) self.boundary_conditions = boundary_conditions.asCodedInt() #Get kernels - self.kernel = context.get_prepared_kernel("cuda/SWE2D_WAF.cu", "WAFKernel", \ - "iiffffiiPiPiPiPiPiPi", \ + module = context.get_module("cuda/SWE2D_WAF.cu", defines={ 'BLOCK_WIDTH': self.block_size[0], 'BLOCK_HEIGHT': self.block_size[1] - }, \ + }, compile_args={ 'no_extern_c': True, 'options': ["--use_fast_math"], - }, \ + }, jit_compile_args={}) + self.kernel = module.get_function("WAFKernel") + self.kernel.prepare("iiffffiiPiPiPiPiPiPi") #Create data by uploading to device - self.u0 = Common.ArakawaA2D(self.stream, \ - nx, ny, \ - 2, 2, \ + self.u0 = Common.ArakawaA2D(self.stream, + nx, ny, + 2, 2, [h0, hu0, hv0]) - self.u1 = Common.ArakawaA2D(self.stream, \ - nx, ny, \ - 2, 2, \ + self.u1 = Common.ArakawaA2D(self.stream, + nx, ny, + 2, 2, [None, None, None]) def step(self, dt): - if (self.order == 1): - self.substepDimsplit(dt, substep=(self.nt % 2)) - elif (self.order == 2): - self.substepDimsplit(dt, substep=0) - self.substepDimsplit(dt, substep=1) - else: - raise(NotImplementedError("Order {:d} is not implemented".format(self.order))) + self.substepDimsplit(dt*0.5, substep=0) + self.substepDimsplit(dt*0.5, substep=1) self.t += dt - self.nt += 1 + self.nt += 2 def substepDimsplit(self, dt, substep): - self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ - self.nx, self.ny, \ - self.dx, self.dy, dt, \ - self.g, \ - Simulator.stepOrderToCodedInt(step=substep, order=self.order), \ - self.boundary_conditions, \ - self.u0[0].data.gpudata, self.u0[0].data.strides[0], \ - self.u0[1].data.gpudata, self.u0[1].data.strides[0], \ - self.u0[2].data.gpudata, self.u0[2].data.strides[0], \ - self.u1[0].data.gpudata, self.u1[0].data.strides[0], \ - self.u1[1].data.gpudata, self.u1[1].data.strides[0], \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, + self.nx, self.ny, + self.dx, self.dy, dt, + self.g, + substep, + self.boundary_conditions, + self.u0[0].data.gpudata, self.u0[0].data.strides[0], + self.u0[1].data.gpudata, self.u0[1].data.strides[0], + self.u0[2].data.gpudata, self.u0[2].data.strides[0], + self.u1[0].data.gpudata, self.u1[0].data.strides[0], + self.u1[1].data.gpudata, self.u1[1].data.strides[0], self.u1[2].data.gpudata, self.u1[2].data.strides[0]) self.u0, self.u1 = self.u1, self.u0 diff --git a/GPUSimulators/cuda/EE2D_KP07_dimsplit.cu b/GPUSimulators/cuda/EE2D_KP07_dimsplit.cu index b353364..9ccdd40 100644 --- a/GPUSimulators/cuda/EE2D_KP07_dimsplit.cu +++ b/GPUSimulators/cuda/EE2D_KP07_dimsplit.cu @@ -58,8 +58,8 @@ void computeFluxF(float Q[4][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const float4 Q_l_bar = Q_lr + dt_/(2.0f*dx_) * (F_func(Q_ll, gamma_) - F_func(Q_lr, gamma_)); // Compute flux based on prediction - const float4 flux = CentralUpwindFlux(Q_l_bar, Q_r_bar, gamma_); - //const float4 flux = HLL_flux(Q_l_bar, Q_r_bar, gamma_); + //const float4 flux = CentralUpwindFlux(Q_l_bar, Q_r_bar, gamma_); + const float4 flux = HLL_flux(Q_l_bar, Q_r_bar, gamma_); //Write to shared memory F[0][j][i] = flux.x; @@ -131,7 +131,7 @@ __global__ void KP07DimsplitKernel( float theta_, - int step_order_, + int step_, int boundary_conditions_, //Input h^n @@ -144,51 +144,49 @@ __global__ void KP07DimsplitKernel( float* rho1_ptr_, int rho1_pitch_, float* rho_u1_ptr_, int rho_u1_pitch_, float* rho_v1_ptr_, int rho_v1_pitch_, - float* E1_ptr_, int E1_pitch_) { + float* E1_ptr_, int E1_pitch_, + + //Output CFL + float* cfl_) { const unsigned int w = BLOCK_WIDTH; const unsigned int h = BLOCK_HEIGHT; - const unsigned int gc = 2; + const unsigned int gc_x = 2; + const unsigned int gc_y = 2; const unsigned int vars = 4; //Shared memory variables - __shared__ float Q[4][h+4][w+4]; - __shared__ float Qx[4][h+4][w+4]; - __shared__ float F[4][h+4][w+4]; + __shared__ float Q[4][h+2*gc_y][w+2*gc_x]; + __shared__ float Qx[4][h+2*gc_y][w+2*gc_x]; + __shared__ float F[4][h+2*gc_y][w+2*gc_x]; //Read into shared memory - readBlock( rho0_ptr_, rho0_pitch_, Q[0], nx_, ny_, boundary_conditions_); - readBlock(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_, ny_, boundary_conditions_); - readBlock(rho_v0_ptr_, rho_v0_pitch_, Q[2], nx_, ny_, boundary_conditions_); - readBlock( E0_ptr_, E0_pitch_, Q[3], nx_, ny_, boundary_conditions_); - __syncthreads(); - + readBlock( rho0_ptr_, rho0_pitch_, Q[0], nx_, ny_, boundary_conditions_); + readBlock(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_, ny_, boundary_conditions_); + readBlock(rho_v0_ptr_, rho_v0_pitch_, Q[2], nx_, ny_, boundary_conditions_); + readBlock( E0_ptr_, E0_pitch_, Q[3], nx_, ny_, boundary_conditions_); //Step 0 => evolve x first, then y - if (getStep(step_order_) == 0) { + if (step_ == 0) { //Compute fluxes along the x axis and evolve - minmodSlopeX(Q, Qx, theta_); + minmodSlopeX(Q, Qx, theta_); __syncthreads(); - computeFluxF(Q, Qx, F, gamma_, dx_, dt_); __syncthreads(); - - evolveF(Q, F, dx_, dt_); + evolveF(Q, F, dx_, dt_); __syncthreads(); //Compute fluxes along the y axis and evolve - minmodSlopeY(Q, Qx, theta_); + minmodSlopeY(Q, Qx, theta_); __syncthreads(); - computeFluxG(Q, Qx, F, gamma_, dy_, dt_); __syncthreads(); - - evolveG(Q, F, dy_, dt_); + evolveG(Q, F, dy_, dt_); __syncthreads(); //Gravity source term if (g_ > 0.0f) { - const int i = threadIdx.x + gc; - const int j = threadIdx.y + gc; + const int i = threadIdx.x + gc_x; + const int j = threadIdx.y + gc_y; const float rho_v = Q[2][j][i]; Q[2][j][i] -= g_*Q[0][j][i]*dt_; Q[3][j][i] -= g_*rho_v*dt_; @@ -198,29 +196,25 @@ __global__ void KP07DimsplitKernel( //Step 1 => evolve y first, then x else { //Compute fluxes along the y axis and evolve - minmodSlopeY(Q, Qx, theta_); + minmodSlopeY(Q, Qx, theta_); __syncthreads(); - computeFluxG(Q, Qx, F, gamma_, dy_, dt_); __syncthreads(); - - evolveG(Q, F, dy_, dt_); + evolveG(Q, F, dy_, dt_); __syncthreads(); //Compute fluxes along the x axis and evolve - minmodSlopeX(Q, Qx, theta_); + minmodSlopeX(Q, Qx, theta_); __syncthreads(); - computeFluxF(Q, Qx, F, gamma_, dx_, dt_); __syncthreads(); - - evolveF(Q, F, dx_, dt_); + evolveF(Q, F, dx_, dt_); __syncthreads(); //Gravity source term if (g_ > 0.0f) { - const int i = threadIdx.x + gc; - const int j = threadIdx.y + gc; + const int i = threadIdx.x + gc_x; + const int j = threadIdx.y + gc_y; const float rho_v = Q[2][j][i]; Q[2][j][i] -= g_*Q[0][j][i]*dt_; Q[3][j][i] -= g_*rho_v*dt_; @@ -230,12 +224,16 @@ __global__ void KP07DimsplitKernel( // Write to main memory for all internal cells - const int step = getStep(step_order_); - const int order = getOrder(step_order_); - writeBlock( rho1_ptr_, rho1_pitch_, Q[0], nx_, ny_, step, order); - writeBlock(rho_u1_ptr_, rho_u1_pitch_, Q[1], nx_, ny_, step, order); - writeBlock(rho_v1_ptr_, rho_v1_pitch_, Q[2], nx_, ny_, step, order); - writeBlock( E1_ptr_, E1_pitch_, Q[3], nx_, ny_, step, order); + writeBlock( rho1_ptr_, rho1_pitch_, Q[0], nx_, ny_, 0, 1); + writeBlock(rho_u1_ptr_, rho_u1_pitch_, Q[1], nx_, ny_, 0, 1); + writeBlock(rho_v1_ptr_, rho_v1_pitch_, Q[2], nx_, ny_, 0, 1); + writeBlock( E1_ptr_, E1_pitch_, Q[3], nx_, ny_, 0, 1); + + //Compute the CFL for this block + if (cfl_ != NULL) { + writeCfl(Q, F[0], nx_, ny_, dx_, dy_, gamma_, cfl_); + } } + } // extern "C" \ No newline at end of file diff --git a/GPUSimulators/cuda/EulerCommon.h b/GPUSimulators/cuda/EulerCommon.h index 02af22b..cb22a53 100644 --- a/GPUSimulators/cuda/EulerCommon.h +++ b/GPUSimulators/cuda/EulerCommon.h @@ -23,6 +23,60 @@ along with this program. If not, see . +template +__device__ void writeCfl(float Q[vars][h+2*gc_y][w+2*gc_x], + float shmem[h+2*gc_y][w+2*gc_x], + const int nx_, const int ny_, + const float dx_, const float dy_, const float gamma_, + float* output_) { + //Index of thread within block + const int tx = threadIdx.x + gc_x; + const int ty = threadIdx.y + gc_y; + + //Index of cell within domain + const int ti = blockDim.x*blockIdx.x + tx; + const int tj = blockDim.y*blockIdx.y + ty; + + //Only internal cells + if (ti < nx_+gc_x && tj < ny_+gc_y) { + const float rho = Q[0][ty][tx]; + const float u = Q[1][ty][tx] / rho; + const float v = Q[2][ty][tx] / rho; + + const float max_u = dx_ / (fabsf(u) + sqrtf(gamma_*rho)); + const float max_v = dy_ / (fabsf(v) + sqrtf(gamma_*rho)); + + shmem[ty][tx] = fminf(max_u, max_v); + } + __syncthreads(); + + //One row of threads loop over all rows + if (ti < nx_+gc_x && tj < ny_+gc_y) { + if (ty == gc_y) { + float min_val = shmem[ty][tx]; + const int max_y = min(h, ny_+gc_y - tj); + for (int j=gc_y; j( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); __syncthreads(); //Compute flux along x, and evolve computeFluxF(Q, F, g_, dx_, dt_); __syncthreads(); - evolveF(Q, F, dx_, dt_); + evolveF(Q, F, dx_, dt_); __syncthreads(); //Compute flux along y, and evolve computeFluxG(Q, F, g_, dy_, dt_); __syncthreads(); - evolveG(Q, F, dy_, dt_); + evolveG(Q, F, dy_, dt_); __syncthreads(); //Write to main memory - writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1); - writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1); - writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1); + writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1); + writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1); + writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1); } } // extern "C" \ No newline at end of file diff --git a/GPUSimulators/cuda/SWE2D_HLL.cu b/GPUSimulators/cuda/SWE2D_HLL.cu index 54704ed..6797891 100644 --- a/GPUSimulators/cuda/SWE2D_HLL.cu +++ b/GPUSimulators/cuda/SWE2D_HLL.cu @@ -117,36 +117,37 @@ __global__ void HLLKernel( const unsigned int w = BLOCK_WIDTH; const unsigned int h = BLOCK_HEIGHT; - const unsigned int gc = 1; + const unsigned int gc_x = 1; + const unsigned int gc_y = 1; const unsigned int vars = 3; //Shared memory variables - __shared__ float Q[3][h+2][w+2]; - __shared__ float F[3][h+2][w+2]; + __shared__ float Q[vars][h+2*gc_y][w+2*gc_x]; + __shared__ float F[vars][h+2*gc_y][w+2*gc_x]; //Read into shared memory - readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); //Compute F flux computeFluxF(Q, F, g_); __syncthreads(); - evolveF(Q, F, dx_, dt_); + evolveF(Q, F, dx_, dt_); __syncthreads(); //Compute G flux computeFluxG(Q, F, g_); __syncthreads(); - evolveG(Q, F, dy_, dt_); + evolveG(Q, F, dy_, dt_); __syncthreads(); // Write to main memory for all internal cells - writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1); - writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1); - writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1); + writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1); + writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1); + writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1); } } // extern "C" \ No newline at end of file diff --git a/GPUSimulators/cuda/SWE2D_HLL2.cu b/GPUSimulators/cuda/SWE2D_HLL2.cu index b2b4a43..ed02b0e 100644 --- a/GPUSimulators/cuda/SWE2D_HLL2.cu +++ b/GPUSimulators/cuda/SWE2D_HLL2.cu @@ -130,7 +130,7 @@ __global__ void HLL2Kernel( float theta_, - int step_order_, + int step_, int boundary_conditions_, //Input h^n @@ -145,7 +145,8 @@ __global__ void HLL2Kernel( const unsigned int w = BLOCK_WIDTH; const unsigned int h = BLOCK_HEIGHT; - const unsigned int gc = 2; + const unsigned int gc_x = 2; + const unsigned int gc_y = 2; const unsigned int vars = 3; //Shared memory variables @@ -154,44 +155,44 @@ __global__ void HLL2Kernel( __shared__ float F[3][h+4][w+4]; //Read into shared memory - readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); //Step 0 => evolve x first, then y - if (getStep(step_order_) == 0) { + if (step_ == 0) { //Compute fluxes along the x axis and evolve - minmodSlopeX(Q, Qx, theta_); + minmodSlopeX(Q, Qx, theta_); __syncthreads(); computeFluxF(Q, Qx, F, g_, dx_, dt_); __syncthreads(); - evolveF(Q, F, dx_, dt_); + evolveF(Q, F, dx_, dt_); __syncthreads(); //Compute fluxes along the y axis and evolve - minmodSlopeY(Q, Qx, theta_); + minmodSlopeY(Q, Qx, theta_); __syncthreads(); computeFluxG(Q, Qx, F, g_, dy_, dt_); __syncthreads(); - evolveG(Q, F, dy_, dt_); + evolveG(Q, F, dy_, dt_); __syncthreads(); } //Step 1 => evolve y first, then x else { //Compute fluxes along the y axis and evolve - minmodSlopeY(Q, Qx, theta_); + minmodSlopeY(Q, Qx, theta_); __syncthreads(); computeFluxG(Q, Qx, F, g_, dy_, dt_); __syncthreads(); - evolveG(Q, F, dy_, dt_); + evolveG(Q, F, dy_, dt_); __syncthreads(); //Compute fluxes along the x axis and evolve - minmodSlopeX(Q, Qx, theta_); + minmodSlopeX(Q, Qx, theta_); __syncthreads(); computeFluxF(Q, Qx, F, g_, dx_, dt_); __syncthreads(); - evolveF(Q, F, dx_, dt_); + evolveF(Q, F, dx_, dt_); __syncthreads(); } @@ -199,11 +200,9 @@ __global__ void HLL2Kernel( // Write to main memory for all internal cells - const int step = getStep(step_order_); - const int order = getOrder(step_order_); - writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, step, order); - writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, step, order); - writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, step, order); + writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1); + writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1); + writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1); } } // extern "C" \ No newline at end of file diff --git a/GPUSimulators/cuda/SWE2D_KP07.cu b/GPUSimulators/cuda/SWE2D_KP07.cu index 218b6f0..207ee57 100644 --- a/GPUSimulators/cuda/SWE2D_KP07.cu +++ b/GPUSimulators/cuda/SWE2D_KP07.cu @@ -155,7 +155,8 @@ __global__ void KP07Kernel( const unsigned int w = BLOCK_WIDTH; const unsigned int h = BLOCK_HEIGHT; - const unsigned int gc = 2; + const unsigned int gc_x = 2; + const unsigned int gc_y = 2; const unsigned int vars = 3; //Index of thread within block @@ -176,9 +177,9 @@ __global__ void KP07Kernel( //Read into shared memory - readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); //Reconstruct slopes along x and axis diff --git a/GPUSimulators/cuda/SWE2D_KP07_dimsplit.cu b/GPUSimulators/cuda/SWE2D_KP07_dimsplit.cu index cacc9e4..939239c 100644 --- a/GPUSimulators/cuda/SWE2D_KP07_dimsplit.cu +++ b/GPUSimulators/cuda/SWE2D_KP07_dimsplit.cu @@ -29,13 +29,14 @@ along with this program. If not, see . #include "limiters.h" +template __device__ -void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], - float Qx[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], - float F[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], +void computeFluxF(float Q[3][h+2*gc_y][w+2*gc_x], + float Qx[3][h+2*gc_y][w+2*gc_x], + float F[3][h+2*gc_y][w+2*gc_x], const float g_, const float dx_, const float dt_) { - for (int j=threadIdx.y; j __device__ -void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], - float Qy[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], - float G[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], +void computeFluxG(float Q[3][h+2*gc_y][w+2*gc_x], + float Qy[3][h+2*gc_y][w+2*gc_x], + float G[3][h+2*gc_y][w+2*gc_x], const float g_, const float dy_, const float dt_) { - for (int j=threadIdx.y+1; j( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); - - - //Step 0 => evolve x first, then y - if (getStep(step_order_) == 0) { - //Compute fluxes along the x axis and evolve - minmodSlopeX(Q, Qx, theta_); + if (step_ == 0) { + //Along X + minmodSlopeX(Q, Qx, theta_); __syncthreads(); - computeFluxF(Q, Qx, F, g_, dx_, dt_); + computeFluxF(Q, Qx, F, g_, dx_, dt_); __syncthreads(); - evolveF(Q, F, dx_, dt_); + evolveF(Q, F, dx_, dt_); __syncthreads(); - //Compute fluxes along the y axis and evolve - minmodSlopeY(Q, Qx, theta_); + //Along Y + minmodSlopeY(Q, Qx, theta_); __syncthreads(); - computeFluxG(Q, Qx, F, g_, dy_, dt_); + computeFluxG(Q, Qx, F, g_, dy_, dt_); __syncthreads(); - evolveG(Q, F, dy_, dt_); + evolveG(Q, F, dy_, dt_); __syncthreads(); } - //Step 1 => evolve y first, then x else { - //Compute fluxes along the y axis and evolve - minmodSlopeY(Q, Qx, theta_); + //Along Y + minmodSlopeY(Q, Qx, theta_); __syncthreads(); - computeFluxG(Q, Qx, F, g_, dy_, dt_); + computeFluxG(Q, Qx, F, g_, dy_, dt_); __syncthreads(); - evolveG(Q, F, dy_, dt_); + evolveG(Q, F, dy_, dt_); __syncthreads(); - //Compute fluxes along the x axis and evolve - minmodSlopeX(Q, Qx, theta_); + //Along X + minmodSlopeX(Q, Qx, theta_); __syncthreads(); - computeFluxF(Q, Qx, F, g_, dx_, dt_); + computeFluxF(Q, Qx, F, g_, dx_, dt_); __syncthreads(); - evolveF(Q, F, dx_, dt_); + evolveF(Q, F, dx_, dt_); __syncthreads(); } - // Write to main memory for all internal cells - const int step = getStep(step_order_); - const int order = getOrder(step_order_); - writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, step, order); - writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, step, order); - writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, step, order); + writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1); + writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1); + writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1); } + + + + + + + + + } // extern "C" \ No newline at end of file diff --git a/GPUSimulators/cuda/SWE2D_LxF.cu b/GPUSimulators/cuda/SWE2D_LxF.cu index 1439a7a..115cc29 100644 --- a/GPUSimulators/cuda/SWE2D_LxF.cu +++ b/GPUSimulators/cuda/SWE2D_LxF.cu @@ -118,16 +118,18 @@ void LxFKernel( const unsigned int w = BLOCK_WIDTH; const unsigned int h = BLOCK_HEIGHT; - const unsigned int gc = 1; + const unsigned int gc_x = 1; + const unsigned int gc_y = 1; + const unsigned int vars = 3; - __shared__ float Q[3][h+2][w+2]; - __shared__ float F[3][h ][w+1]; - __shared__ float G[3][h+1][w ]; + __shared__ float Q[vars][h+2][w+2]; + __shared__ float F[vars][h ][w+1]; + __shared__ float G[vars][h+1][w ]; //Read from global memory - readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); //Compute fluxes along the x and y axis computeFluxF(Q, F, g_, dx_, dt_); @@ -149,9 +151,9 @@ void LxFKernel( __syncthreads(); //Write to main memory - writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1); - writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1); - writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1); + writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1); + writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1); + writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1); } } // extern "C" diff --git a/GPUSimulators/cuda/SWE2D_WAF.cu b/GPUSimulators/cuda/SWE2D_WAF.cu index 68890e3..2c38cdf 100644 --- a/GPUSimulators/cuda/SWE2D_WAF.cu +++ b/GPUSimulators/cuda/SWE2D_WAF.cu @@ -105,7 +105,7 @@ __global__ void WAFKernel( float dx_, float dy_, float dt_, float g_, - int step_order_, + int step_, int boundary_conditions_, //Input h^n @@ -120,7 +120,8 @@ __global__ void WAFKernel( const unsigned int w = BLOCK_WIDTH; const unsigned int h = BLOCK_HEIGHT; - const unsigned int gc = 2; + const unsigned int gc_x = 2; + const unsigned int gc_y = 2; const unsigned int vars = 3; //Shared memory variables @@ -130,25 +131,25 @@ __global__ void WAFKernel( //Read into shared memory Q from global memory - readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); __syncthreads(); //Step 0 => evolve x first, then y - if (getStep(step_order_) == 0) { + if (step_ == 0) { //Compute fluxes along the x axis and evolve computeFluxF(Q, F, g_, dx_, dt_); __syncthreads(); - evolveF(Q, F, dx_, dt_); + evolveF(Q, F, dx_, dt_); __syncthreads(); //Compute fluxes along the y axis and evolve computeFluxG(Q, F, g_, dy_, dt_); __syncthreads(); - evolveG(Q, F, dy_, dt_); + evolveG(Q, F, dy_, dt_); __syncthreads(); } //Step 1 => evolve y first, then x @@ -156,24 +157,22 @@ __global__ void WAFKernel( //Compute fluxes along the y axis and evolve computeFluxG(Q, F, g_, dy_, dt_); __syncthreads(); - evolveG(Q, F, dy_, dt_); + evolveG(Q, F, dy_, dt_); __syncthreads(); //Compute fluxes along the x axis and evolve computeFluxF(Q, F, g_, dx_, dt_); __syncthreads(); - evolveF(Q, F, dx_, dt_); + evolveF(Q, F, dx_, dt_); __syncthreads(); } // Write to main memory for all internal cells - const int step = getStep(step_order_); - const int order = getOrder(step_order_); - writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, step, order); - writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, step, order); - writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, step, order); + writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1); + writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1); + writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1); } } // extern "C" \ No newline at end of file diff --git a/GPUSimulators/cuda/common.h b/GPUSimulators/cuda/common.h index c8d1579..22678ea 100644 --- a/GPUSimulators/cuda/common.h +++ b/GPUSimulators/cuda/common.h @@ -137,19 +137,21 @@ inline __device__ BoundaryCondition getBCWest(int bc_) { /** * Alter the index l so that it gives periodic boundary conditions when reading */ -template +template inline __device__ int handlePeriodicBoundaryX(int k, int nx_, int boundary_conditions_) { - const int gc_pad = 2*ghost_cells; + const int gc_pad = gc_x; //West boundary: add an offset to read from east of domain - if ((k < gc_pad) - && getBCWest(boundary_conditions_) == Periodic) { - k += (nx_+2*ghost_cells - 2*gc_pad); - } - //East boundary: subtract an offset to read from west of domain - else if ((k >= nx_+2*ghost_cells-gc_pad) - && getBCEast(boundary_conditions_) == Periodic) { - k -= (nx_+2*ghost_cells - 2*gc_pad); + if (gc_x > 0) { + if ((k < gc_pad) + && getBCWest(boundary_conditions_) == Periodic) { + k += (nx_+2*gc_x - 2*gc_pad); + } + //East boundary: subtract an offset to read from west of domain + else if ((k >= nx_+2*gc_x-gc_pad) + && getBCEast(boundary_conditions_) == Periodic) { + k -= (nx_+2*gc_x - 2*gc_pad); + } } return k; @@ -158,45 +160,49 @@ inline __device__ int handlePeriodicBoundaryX(int k, int nx_, int boundary_condi /** * Alter the index l so that it gives periodic boundary conditions when reading */ -template +template inline __device__ int handlePeriodicBoundaryY(int l, int ny_, int boundary_conditions_) { - const int gc_pad = 2*ghost_cells; + const int gc_pad = gc_y; //South boundary: add an offset to read from north of domain - if ((l < gc_pad) - && getBCSouth(boundary_conditions_) == Periodic) { - l += (ny_+2*ghost_cells - 2*gc_pad); - } - //North boundary: subtract an offset to read from south of domain - else if ((l >= ny_+2*ghost_cells-gc_pad) - && getBCNorth(boundary_conditions_) == Periodic) { - l -= (ny_+2*ghost_cells - 2*gc_pad); + if (gc_y > 0) { + if ((l < gc_pad) + && getBCSouth(boundary_conditions_) == Periodic) { + l += (ny_+2*gc_y - 2*gc_pad); + } + //North boundary: subtract an offset to read from south of domain + else if ((l >= ny_+2*gc_y-gc_pad) + && getBCNorth(boundary_conditions_) == Periodic) { + l -= (ny_+2*gc_y - 2*gc_pad); + } } return l; } -template -inline __device__ int handleReflectiveBoundary( - float Q[block_height+2*ghost_cells][block_width+2*ghost_cells], +template +inline __device__ +void handleReflectiveBoundary( + float Q[h+2*gc_y][w+2*gc_x], const int nx_, const int ny_, const int boundary_conditions_) { + //Handle reflective boundary conditions if (getBCNorth(boundary_conditions_) == Reflective) { - bcNorthReflective(Q, nx_, ny_); + bcNorthReflective(Q, nx_, ny_); __syncthreads(); } if (getBCSouth(boundary_conditions_) == Reflective) { - bcSouthReflective(Q, nx_, ny_); + bcSouthReflective(Q, nx_, ny_); __syncthreads(); } if (getBCEast(boundary_conditions_) == Reflective) { - bcEastReflective(Q, nx_, ny_); + bcEastReflective(Q, nx_, ny_); __syncthreads(); } if (getBCWest(boundary_conditions_) == Reflective) { - bcWestReflective(Q, nx_, ny_); + bcWestReflective(Q, nx_, ny_); __syncthreads(); } } @@ -204,9 +210,9 @@ inline __device__ int handleReflectiveBoundary( /** * Reads a block of data with ghost cells */ -template +template inline __device__ void readBlock(float* ptr_, int pitch_, - float Q[block_height+2*ghost_cells][block_width+2*ghost_cells], + float Q[h+2*gc_y][w+2*gc_x], const int nx_, const int ny_, const int boundary_conditions_) { //Index of block within domain @@ -215,16 +221,16 @@ inline __device__ void readBlock(float* ptr_, int pitch_, //Read into shared memory //Loop over all variables - for (int j=threadIdx.y; j(by + j, ny_, boundary_conditions_); - l = min(l, ny_+2*ghost_cells-1); + int l = handlePeriodicBoundaryY(by + j, ny_, boundary_conditions_); + l = min(l, ny_+2*gc_y-1); float* row = (float*) ((char*) ptr_ + pitch_*l); - for (int i=threadIdx.x; i(bx + i, nx_, boundary_conditions_); - k = min(k, nx_+2*ghost_cells-1); + int k = handlePeriodicBoundaryX(bx + i, nx_, boundary_conditions_); + k = min(k, nx_+2*gc_x-1); //Read from global memory Q[j][i] = row[k]; @@ -232,7 +238,7 @@ inline __device__ void readBlock(float* ptr_, int pitch_, } __syncthreads(); - handleReflectiveBoundary(Q, nx_, ny_, boundary_conditions_); + handleReflectiveBoundary(Q, nx_, ny_, boundary_conditions_); } @@ -241,45 +247,68 @@ inline __device__ void readBlock(float* ptr_, int pitch_, /** * Writes a block of data to global memory for the shallow water equations. */ -template +template inline __device__ void writeBlock(float* ptr_, int pitch_, - float shmem[block_height+2*ghost_cells][block_width+2*ghost_cells], - const int width, const int height, + float shmem[h+2*gc_y][w+2*gc_x], + const int nx_, const int ny_, int rk_step_, int rk_order_) { //Index of cell within domain - const int ti = blockDim.x*blockIdx.x + threadIdx.x + ghost_cells; - const int tj = blockDim.y*blockIdx.y + threadIdx.y + ghost_cells; + const int ti = blockDim.x*blockIdx.x + threadIdx.x + gc_x; + const int tj = blockDim.y*blockIdx.y + threadIdx.y + gc_y; //Only write internal cells - if (ti < width+ghost_cells && tj < height+ghost_cells) { + if (ti < nx_+gc_x && tj < ny_+gc_y) { //Index of thread within block - const int tx = threadIdx.x + ghost_cells; - const int ty = threadIdx.y + ghost_cells; + const int tx = threadIdx.x + gc_x; + const int ty = threadIdx.y + gc_y; float* const row = (float*) ((char*) ptr_ + pitch_*tj); //Handle runge-kutta timestepping here row[ti] = shmem[ty][tx]; + + + /** + * SSPRK1 (forward Euler) + * u^1 = u^n + dt*f(u^n) + */ + if (rk_order_ == 1) { + row[ti] = shmem[ty][tx]; + } /** * SSPRK2 * u^1 = u^n + dt*f(u^n) * u^n+1 = 1/2*u^n + 1/2*(u^1 + dt*f(u^1)) - * + */ + else if (rk_order_ == 2) { + if (rk_step_ == 0) { + row[ti] = shmem[ty][tx]; + } + else if (rk_step_ == 1) { + row[ti] = 0.5f*row[ti] + 0.5f*shmem[ty][tx]; + } + } + /** * SSPRK3 * u^1 = u^n + dt*f(u^n) * u^2 = 3/4 * u^n + 1/4 * (u^1 + dt*f(u^1)) * u^n+1 = 1/3 * u^n + 2/3 * (u^2 + dt*f(u^2)) + * FIXME: This is not correct now, need a temporary to hold intermediate step u^2 */ - - /* - if (rk_order_ == 2 && rk_step_ == 1) { - row[ti] = 0.5f*(row[ti] + shmem[ty][tx]); + else if (rk_order_ == 3) { + if (rk_step_ == 0) { + row[ti] = shmem[ty][tx]; + } + else if (rk_step_ == 1) { + row[ti] = 0.75f*row[ti] + 0.25f*shmem[ty][tx]; + } + else if (rk_step_ == 2) { + const float t = 1.0f / 3.0f; //Not representable in base 2 + row[ti] = t*row[ti] + (1.0f-t)*shmem[ty][tx]; + } } - else { - row[ti] = shmem[ty][tx]; - }*/ } } @@ -297,25 +326,26 @@ inline __device__ void writeBlock(float* ptr_, int pitch_, // West boundary -template -__device__ void bcWestReflective(float Q[block_height+2*ghost_cells][block_width+2*ghost_cells], const int nx_, const int ny_) { - for (int j=threadIdx.y; j +__device__ void bcWestReflective(float Q[h+2*gc_y][w+2*gc_x], + const int nx_, const int ny_) { + for (int j=threadIdx.y; j= 1 && ti == gc_x) { Q[j][i-1] = sign*Q[j][i]; } - if (ghost_cells >= 2 && ti == ghost_cells + 1) { + if (gc_x >= 2 && ti == gc_x + 1) { Q[j][i-3] = sign*Q[j][i]; } - if (ghost_cells >= 3 && ti == ghost_cells + 2) { + if (gc_x >= 3 && ti == gc_x + 2) { Q[j][i-5] = sign*Q[j][i]; } - if (ghost_cells >= 4 && ti == ghost_cells + 3) { + if (gc_x >= 4 && ti == gc_x + 3) { Q[j][i-7] = sign*Q[j][i]; } - if (ghost_cells >= 5 && ti == ghost_cells + 4) { + if (gc_x >= 5 && ti == gc_x + 4) { Q[j][i-9] = sign*Q[j][i]; } } @@ -323,25 +353,26 @@ __device__ void bcWestReflective(float Q[block_height+2*ghost_cells][block_width // East boundary -template -__device__ void bcEastReflective(float Q[block_height+2*ghost_cells][block_width+2*ghost_cells], const int nx_, const int ny_) { - for (int j=threadIdx.y; j +__device__ void bcEastReflective(float Q[h+2*gc_y][w+2*gc_x], + const int nx_, const int ny_) { + for (int j=threadIdx.y; j= 1 && ti == nx_ + gc_x - 1) { Q[j][i+1] = sign*Q[j][i]; } - if (ghost_cells >= 2 && ti == nx_ + ghost_cells - 2) { + if (gc_x >= 2 && ti == nx_ + gc_x - 2) { Q[j][i+3] = sign*Q[j][i]; } - if (ghost_cells >= 3 && ti == nx_ + ghost_cells - 3) { + if (gc_x >= 3 && ti == nx_ + gc_x - 3) { Q[j][i+5] = sign*Q[j][i]; } - if (ghost_cells >= 4 && ti == nx_ + ghost_cells - 4) { + if (gc_x >= 4 && ti == nx_ + gc_x - 4) { Q[j][i+7] = sign*Q[j][i]; } - if (ghost_cells >= 5 && ti == nx_ + ghost_cells - 5) { + if (gc_x >= 5 && ti == nx_ + gc_x - 5) { Q[j][i+9] = sign*Q[j][i]; } } @@ -349,25 +380,26 @@ __device__ void bcEastReflective(float Q[block_height+2*ghost_cells][block_width // South boundary -template -__device__ void bcSouthReflective(float Q[block_height+2*ghost_cells][block_width+2*ghost_cells], const int nx_, const int ny_) { - for (int i=threadIdx.x; i +__device__ void bcSouthReflective(float Q[h+2*gc_y][w+2*gc_x], + const int nx_, const int ny_) { + for (int i=threadIdx.x; i= 1 && tj == gc_y) { Q[j-1][i] = sign*Q[j][i]; } - if (ghost_cells >= 2 && tj == ghost_cells + 1) { + if (gc_y >= 2 && tj == gc_y + 1) { Q[j-3][i] = sign*Q[j][i]; } - if (ghost_cells >= 3 && tj == ghost_cells + 2) { + if (gc_y >= 3 && tj == gc_y + 2) { Q[j-5][i] = sign*Q[j][i]; } - if (ghost_cells >= 4 && tj == ghost_cells + 3) { + if (gc_y >= 4 && tj == gc_y + 3) { Q[j-7][i] = sign*Q[j][i]; } - if (ghost_cells >= 5 && tj == ghost_cells + 4) { + if (gc_y >= 5 && tj == gc_y + 4) { Q[j-9][i] = sign*Q[j][i]; } } @@ -377,25 +409,25 @@ __device__ void bcSouthReflective(float Q[block_height+2*ghost_cells][block_widt // North boundary -template -__device__ void bcNorthReflective(float Q[block_height+2*ghost_cells][block_width+2*ghost_cells], const int nx_, const int ny_) { - for (int i=threadIdx.x; i +__device__ void bcNorthReflective(float Q[h+2*gc_y][w+2*gc_x], const int nx_, const int ny_) { + for (int i=threadIdx.x; i= 1 && tj == ny_ + gc_y - 1) { Q[j+1][i] = sign*Q[j][i]; } - if (ghost_cells >= 2 && tj == ny_ + ghost_cells - 2) { + if (gc_y >= 2 && tj == ny_ + gc_y - 2) { Q[j+3][i] = sign*Q[j][i]; } - if (ghost_cells >= 3 && tj == ny_ + ghost_cells - 3) { + if (gc_y >= 3 && tj == ny_ + gc_y - 3) { Q[j+5][i] = sign*Q[j][i]; } - if (ghost_cells >= 4 && tj == ny_ + ghost_cells - 4) { + if (gc_y >= 4 && tj == ny_ + gc_y - 4) { Q[j+7][i] = sign*Q[j][i]; } - if (ghost_cells >= 5 && tj == ny_ + ghost_cells - 5) { + if (gc_y >= 5 && tj == ny_ + gc_y - 5) { Q[j+9][i] = sign*Q[j][i]; } } @@ -422,13 +454,13 @@ __device__ void bcNorthReflective(float Q[block_height+2*ghost_cells][block_widt -template -__device__ void evolveF(float Q[vars][block_height+2*ghost_cells][block_width+2*ghost_cells], - float F[vars][block_height+2*ghost_cells][block_width+2*ghost_cells], +template +__device__ void evolveF(float Q[vars][h+2*gc_y][w+2*gc_x], + float F[vars][h+2*gc_y][w+2*gc_x], const float dx_, const float dt_) { for (int var=0; var < vars; ++var) { - for (int j=threadIdx.y; j -__device__ void evolveG(float Q[vars][block_height+2*ghost_cells][block_width+2*ghost_cells], - float G[vars][block_height+2*ghost_cells][block_width+2*ghost_cells], +template +__device__ void evolveG(float Q[vars][h+2*gc_y][w+2*gc_x], + float G[vars][h+2*gc_y][w+2*gc_x], const float dy_, const float dt_) { for (int var=0; var < vars; ++var) { - for (int j=threadIdx.y+ghost_cells; j +__device__ void reduce_max(float* data, unsigned int n) { + __shared__ float sdata[threads]; + unsigned int tid = threadIdx.x; + + //Reduce to "threads" elements + sdata[tid] = FLT_MIN; + for (unsigned int i=tid; i= 512) { + if (tid < 256) { + sdata[tid] = max(sdata[tid], sdata[tid + 256]); + } + __syncthreads(); + } + if (threads >= 256) { + if (tid < 128) { + sdata[tid] = max(sdata[tid], sdata[tid + 128]); + } + __syncthreads(); + } + if (threads >= 128) { + if (tid < 64) { + sdata[tid] = max(sdata[tid], sdata[tid + 64]); + } + __syncthreads(); + } + if (tid < 32) { + volatile float* sdata_volatile = sdata; + if (threads >= 64) { + sdata_volatile[tid] = max(sdata_volatile[tid], sdata_volatile[tid + 32]); + } + if (tid < 16) { + if (threads >= 32) sdata_volatile[tid] = max(sdata_volatile[tid], sdata_volatile[tid + 16]); + if (threads >= 16) sdata_volatile[tid] = max(sdata_volatile[tid], sdata_volatile[tid + 8]); + if (threads >= 8) sdata_volatile[tid] = max(sdata_volatile[tid], sdata_volatile[tid + 4]); + if (threads >= 4) sdata_volatile[tid] = max(sdata_volatile[tid], sdata_volatile[tid + 2]); + if (threads >= 2) sdata_volatile[tid] = max(sdata_volatile[tid], sdata_volatile[tid + 1]); + } + + if (tid == 0) { + return sdata_volatile[0]; + } + } +} diff --git a/GPUSimulators/cuda/limiters.h b/GPUSimulators/cuda/limiters.h index 65ed892..c2effa7 100644 --- a/GPUSimulators/cuda/limiters.h +++ b/GPUSimulators/cuda/limiters.h @@ -46,14 +46,14 @@ __device__ __inline__ float minmodSlope(float left, float center, float right, f /** * Reconstructs a minmod slope for a whole block along the abscissa */ -template -__device__ void minmodSlopeX(float Q[vars][block_height+2*ghost_cells][block_width+2*ghost_cells], - float Qx[vars][block_height+2*ghost_cells][block_width+2*ghost_cells], +template +__device__ void minmodSlopeX(float Q[vars][h+2*gc_y][w+2*gc_x], + float Qx[vars][h+2*gc_y][w+2*gc_x], const float theta_) { //Reconstruct slopes along x axis for (int p=0; p -__device__ void minmodSlopeY(float Q[vars][block_height+2*ghost_cells][block_width+2*ghost_cells], - float Qy[vars][block_height+2*ghost_cells][block_width+2*ghost_cells], +template +__device__ void minmodSlopeY(float Q[vars][h+2*gc_y][w+2*gc_x], + float Qy[vars][h+2*gc_y][w+2*gc_x], const float theta_) { //Reconstruct slopes along y axis for (int p=0; p