diff --git a/GPUSimulators/FORCE.py b/GPUSimulators/FORCE.py index 989b842..21ff961 100644 --- a/GPUSimulators/FORCE.py +++ b/GPUSimulators/FORCE.py @@ -80,9 +80,6 @@ class FORCE (Simulator.BaseSimulator): nx, ny, \ 1, 1, \ [None, None, None]) - - def __str__(self): - return "First order centered" def simulate(self, t_end): return super().simulateEuler(t_end) diff --git a/GPUSimulators/HLL.py b/GPUSimulators/HLL.py index 89d13ea..a764b40 100644 --- a/GPUSimulators/HLL.py +++ b/GPUSimulators/HLL.py @@ -75,9 +75,6 @@ class HLL (Simulator.BaseSimulator): nx, ny, \ 1, 1, \ [None, None, None]) - - def __str__(self): - return "Harten-Lax-van Leer" def simulate(self, t_end): return super().simulateEuler(t_end) diff --git a/GPUSimulators/HLL2.py b/GPUSimulators/HLL2.py index db1b9e3..d0a146c 100644 --- a/GPUSimulators/HLL2.py +++ b/GPUSimulators/HLL2.py @@ -81,9 +81,6 @@ class HLL2 (Simulator.BaseSimulator): nx, ny, \ 2, 2, \ [None, None, None]) - - def __str__(self): - return "Harten-Lax-van Leer (2nd order)" def simulate(self, t_end): return super().simulateDimsplit(t_end) diff --git a/GPUSimulators/HLL2Euler.py b/GPUSimulators/HLL2Euler.py index 8a9f285..9346d9c 100644 --- a/GPUSimulators/HLL2Euler.py +++ b/GPUSimulators/HLL2Euler.py @@ -82,9 +82,6 @@ class HLL2Euler (Simulator.BaseSimulator): nx, ny, \ 2, 2, \ [None, None, None, None]) - - def __str__(self): - return "Harten-Lax-van Leer (2nd order)" def simulate(self, t_end): return super().simulateDimsplit(t_end) diff --git a/GPUSimulators/IPythonMagic.py b/GPUSimulators/IPythonMagic.py index 35d515f..cf0a8ef 100644 --- a/GPUSimulators/IPythonMagic.py +++ b/GPUSimulators/IPythonMagic.py @@ -90,6 +90,17 @@ class MyIPythonMagic(Magics): self.logger.debug("==================================================================") atexit.register(exitfunc) + + + + + + + + logger_initialized = False + + + @line_magic @magic_arguments.magic_arguments() @magic_arguments.argument( @@ -99,29 +110,43 @@ class MyIPythonMagic(Magics): @magic_arguments.argument( '--file_level', '-f', type=int, default=10, help='The level of logging to file [0, 50]') def setup_logging(self, line): - args = magic_arguments.parse_argstring(self.setup_logging, line) - import sys - - #Get root logger - logger = logging.getLogger('') - logger.setLevel(min(args.level, args.file_level)) + if (self.logger_initialized): + logging.getLogger('').info("Global logger already initialized!") + return; + else: + self.logger_initialized = True + + args = magic_arguments.parse_argstring(self.setup_logging, line) + import sys + + #Get root logger + logger = logging.getLogger('') + logger.setLevel(min(args.level, args.file_level)) - #Add log to screen - ch = logging.StreamHandler() - ch.setLevel(args.level) - logger.addHandler(ch) - logger.log(args.level, "Console logger using level %s", logging.getLevelName(args.level)) - - #Add log to file - logger.log(args.level, "File logger using level %s to %s", logging.getLevelName(args.file_level), args.out) - fh = logging.FileHandler(args.out) - formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') - fh.setFormatter(formatter) - fh.setLevel(args.file_level) - logger.addHandler(fh) + #Add log to screen + ch = logging.StreamHandler() + ch.setLevel(args.level) + logger.addHandler(ch) + logger.log(args.level, "Console logger using level %s", logging.getLevelName(args.level)) + + #Add log to file + logger.log(args.level, "File logger using level %s to %s", logging.getLevelName(args.file_level), args.out) + fh = logging.FileHandler(args.out) + formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') + fh.setFormatter(formatter) + fh.setLevel(args.file_level) + logger.addHandler(fh) logger.info("Python version %s", sys.version) + + + + + + + + # Register ip = get_ipython() ip.register_magics(MyIPythonMagic) diff --git a/GPUSimulators/KP07.py b/GPUSimulators/KP07.py index e26f58a..f9d2f02 100644 --- a/GPUSimulators/KP07.py +++ b/GPUSimulators/KP07.py @@ -82,9 +82,6 @@ class KP07 (Simulator.BaseSimulator): nx, ny, \ 2, 2, \ [None, None, None]) - - def __str__(self): - return "Kurganov-Petrova 2007" def simulate(self, t_end): return super().simulateRK(t_end, 2) diff --git a/GPUSimulators/KP07_dimsplit.py b/GPUSimulators/KP07_dimsplit.py index 20556fb..372c0fa 100644 --- a/GPUSimulators/KP07_dimsplit.py +++ b/GPUSimulators/KP07_dimsplit.py @@ -82,9 +82,6 @@ class KP07_dimsplit (Simulator.BaseSimulator): nx, ny, \ 2, 2, \ [None, None, None]) - - def __str__(self): - return "Kurganov-Petrova 2007 dimensionally split" def simulate(self, t_end): return super().simulateDimsplit(t_end) diff --git a/GPUSimulators/LxF.py b/GPUSimulators/LxF.py index 1f1019c..f48d6a9 100644 --- a/GPUSimulators/LxF.py +++ b/GPUSimulators/LxF.py @@ -77,9 +77,6 @@ class LxF (Simulator.BaseSimulator): 1, 1, \ [None, None, None]) - def __str__(self): - return "Lax Friedrichs" - def simulate(self, t_end): return super().simulateEuler(t_end) diff --git a/GPUSimulators/Simulator.py b/GPUSimulators/Simulator.py index 5234c92..0006e83 100644 --- a/GPUSimulators/Simulator.py +++ b/GPUSimulators/Simulator.py @@ -86,6 +86,9 @@ class BaseSimulator: int(np.ceil(self.ny / float(self.local_size[1]))) \ ) + def __str__(self): + return "{:s} [{:d}x{:d}]".format(self.__class__.__name__, self.nx, self.ny) + """ Function which simulates forward in time using the default simulation type """ @@ -112,7 +115,7 @@ class BaseSimulator: # Step with forward Euler self.stepEuler(local_dt) - self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds", self.__class__.__name__, t_end, self.t, n, t.secs) + self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds (Euler)", self, t_end, self.t, n, t.secs) return self.t, n """ @@ -135,7 +138,7 @@ class BaseSimulator: # Perform all the Runge-Kutta substeps self.stepRK(local_dt, order) - self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds", self.__class__.__name__, t_end, self.t, n, t.secs) + self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds (RK2)", self, t_end, self.t, n, t.secs) return self.t, n """ @@ -159,7 +162,7 @@ class BaseSimulator: self.stepDimsplitXY(local_dt) self.stepDimsplitYX(local_dt) - self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds", self.__class__.__name__, t_end, self.t, 2*n, t.secs) + self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds (dimsplit)", self, t_end, self.t, 2*n, t.secs) return self.t, 2*n diff --git a/GPUSimulators/WAF.py b/GPUSimulators/WAF.py index f6768b6..a7bdd51 100644 --- a/GPUSimulators/WAF.py +++ b/GPUSimulators/WAF.py @@ -75,9 +75,6 @@ class WAF (Simulator.BaseSimulator): nx, ny, \ 2, 2, \ [None, None, None]) - - def __str__(self): - return "Weighted average flux" def simulate(self, t_end): return super().simulateDimsplit(t_end) diff --git a/GPUSimulators/cuda/EulerCommon.h b/GPUSimulators/cuda/EulerCommon.h index a8442a0..47b1eb2 100644 --- a/GPUSimulators/cuda/EulerCommon.h +++ b/GPUSimulators/cuda/EulerCommon.h @@ -1,12 +1,8 @@ /* -This OpenCL kernel implements the Kurganov-Petrova numerical scheme -for the shallow water equations, described in -A. Kurganov & Guergana Petrova -A Second-Order Well-Balanced Positivity Preserving Central-Upwind -Scheme for the Saint-Venant System Communications in Mathematical -Sciences, 5 (2007), 133-160. +These CUDA functions implement different types of numerical flux +functions for the shallow water equations -Copyright (C) 2016 SINTEF ICT +Copyright (C) 2016, 2017, 2018 SINTEF Digital This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -23,13 +19,14 @@ along with this program. If not, see . */ #pragma once +#include "limiters.h" -__device__ float pressure(float4 Q, float gamma) { +inline __device__ float pressure(float4 Q, float gamma) { const float rho = Q.x; const float rho_u = Q.y; const float rho_v = Q.z; @@ -39,14 +36,13 @@ __device__ float pressure(float4 Q, float gamma) { } -__device__ float4 F_func(const float4 Q, float gamma) { +__device__ float4 F_func(const float4 Q, float P) { const float rho = Q.x; const float rho_u = Q.y; const float rho_v = Q.z; const float E = Q.w; const float u = rho_u/rho; - const float P = pressure(Q, gamma); float4 F; @@ -56,4 +52,29 @@ __device__ float4 F_func(const float4 Q, float gamma) { F.w = u*(E+P); return F; +} + + + + + +/** + * Central upwind flux function + */ +__device__ float4 CentralUpwindFlux(const float4 Qm, float4 Qp, const float gamma) { + + const float Pp = pressure(Qp, gamma); + const float4 Fp = F_func(Qp, Pp); + const float up = Qp.y / Qp.x; // rho*u / rho + const float cp = sqrt(gamma*P*Qp.x); // sqrt(gamma*P/rho) + + const float Pm = pressure(Qm, gamma); + const float3 Fm = F_func(Qm, Pm); + const float um = Qm.y / Qm.x; // rho*u / rho + const float cm = sqrt(gamma*P/Qm.x); // sqrt(g*h) + + const float am = min(min(um-cm, up-cp), 0.0f); // largest negative wave speed + const float ap = max(max(um+cm, up+cp), 0.0f); // largest positive wave speed + + return ((ap*Fm - am*Fp) + ap*am*(Qp-Qm))/(ap-am); } \ No newline at end of file diff --git a/GPUSimulators/cuda/SWECommon.h b/GPUSimulators/cuda/SWECommon.h index 233e9f0..50dd6c0 100644 --- a/GPUSimulators/cuda/SWECommon.h +++ b/GPUSimulators/cuda/SWECommon.h @@ -1,12 +1,8 @@ /* -This OpenCL kernel implements the Kurganov-Petrova numerical scheme -for the shallow water equations, described in -A. Kurganov & Guergana Petrova -A Second-Order Well-Balanced Positivity Preserving Central-Upwind -Scheme for the Saint-Venant System Communications in Mathematical -Sciences, 5 (2007), 133-160. +These CUDA functions implement different types of numerical flux +functions for the shallow water equations -Copyright (C) 2016 SINTEF ICT +Copyright (C) 2016, 2017, 2018 SINTEF Digital This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/GPUSimulators/cuda/SWE_FORCE.cu b/GPUSimulators/cuda/SWE_FORCE.cu index 002503d..adab937 100644 --- a/GPUSimulators/cuda/SWE_FORCE.cu +++ b/GPUSimulators/cuda/SWE_FORCE.cu @@ -114,18 +114,23 @@ __global__ void FORCEKernel( float* hu1_ptr_, int hu1_pitch_, float* hv1_ptr_, int hv1_pitch_) { - __shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2]; - __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1]; + const unsigned int w = BLOCK_WIDTH; + const unsigned int h = BLOCK_HEIGHT; + const unsigned int gc = 1; + __shared__ float Q[3][h+2][w+2]; + __shared__ float F[3][h+1][w+1]; //Read into shared memory - float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_}; - int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_}; - readBlock<3, BLOCK_WIDTH+2, BLOCK_HEIGHT+2, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+2, ny_+2); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_+2, ny_+2); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2); __syncthreads(); //Set boundary conditions - noFlowBoundary1(Q, nx_, ny_); + noFlowBoundary(Q[0], nx_, ny_); + noFlowBoundary(Q[1], nx_, ny_); + noFlowBoundary(Q[2], nx_, ny_); __syncthreads(); //Compute flux along x, and evolve @@ -135,7 +140,9 @@ __global__ void FORCEKernel( __syncthreads(); //Set boundary conditions - noFlowBoundary1(Q, nx_, ny_); + noFlowBoundary(Q[0], nx_, ny_); + noFlowBoundary(Q[1], nx_, ny_); + noFlowBoundary(Q[2], nx_, ny_); __syncthreads(); //Compute flux along y, and evolve @@ -145,9 +152,9 @@ __global__ void FORCEKernel( __syncthreads(); //Write to main memory - writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_); - writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_); - writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_); + writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_); + writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_); + writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_); } } // extern "C" \ No newline at end of file diff --git a/GPUSimulators/cuda/SWE_HLL.cu b/GPUSimulators/cuda/SWE_HLL.cu index def3d04..e34658c 100644 --- a/GPUSimulators/cuda/SWE_HLL.cu +++ b/GPUSimulators/cuda/SWE_HLL.cu @@ -121,19 +121,24 @@ __global__ void HLLKernel( float* hu1_ptr_, int hu1_pitch_, float* hv1_ptr_, int hv1_pitch_) { - //Shared memory variables - __shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2]; - __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1]; + const unsigned int w = BLOCK_WIDTH; + const unsigned int h = BLOCK_HEIGHT; + const unsigned int gc = 1; + //Shared memory variables + __shared__ float Q[3][h+2][w+2]; + __shared__ float F[3][h+1][w+1]; //Read into shared memory - float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_}; - int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_}; - readBlock<3, BLOCK_WIDTH+2, BLOCK_HEIGHT+2, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+2, ny_+2); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_+2, ny_+2); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2); __syncthreads(); //Set boundary conditions - noFlowBoundary1(Q, nx_, ny_); + noFlowBoundary(Q[0], nx_, ny_); + noFlowBoundary(Q[1], nx_, ny_); + noFlowBoundary(Q[2], nx_, ny_); __syncthreads(); //Compute F flux @@ -143,7 +148,9 @@ __global__ void HLLKernel( __syncthreads(); //Set boundary conditions - noFlowBoundary1(Q, nx_, ny_); + noFlowBoundary(Q[0], nx_, ny_); + noFlowBoundary(Q[1], nx_, ny_); + noFlowBoundary(Q[2], nx_, ny_); __syncthreads(); //Compute G flux @@ -153,9 +160,9 @@ __global__ void HLLKernel( __syncthreads(); // Write to main memory for all internal cells - writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_); - writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_); - writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_); + writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_); + writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_); + writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_); } } // extern "C" \ No newline at end of file diff --git a/GPUSimulators/cuda/SWE_HLL2.cu b/GPUSimulators/cuda/SWE_HLL2.cu index d51233f..dc63d66 100644 --- a/GPUSimulators/cuda/SWE_HLL2.cu +++ b/GPUSimulators/cuda/SWE_HLL2.cu @@ -155,23 +155,26 @@ __global__ void HLL2Kernel( float* h1_ptr_, int h1_pitch_, float* hu1_ptr_, int hu1_pitch_, float* hv1_ptr_, int hv1_pitch_) { + + const unsigned int w = BLOCK_WIDTH; + const unsigned int h = BLOCK_HEIGHT; + const unsigned int gc = 2; //Shared memory variables - __shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4]; - __shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2]; - __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1]; - - - + __shared__ float Q[3][h+4][w+4]; + __shared__ float Qx[3][h+2][w+2]; + __shared__ float F[3][h+1][w+1]; //Read into shared memory - float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_}; - int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_}; - readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_+2, ny_+2); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2); __syncthreads(); //Set boundary conditions - noFlowBoundary2(Q, nx_, ny_); + noFlowBoundary(Q[0], nx_, ny_); + noFlowBoundary(Q[1], nx_, ny_); + noFlowBoundary(Q[2], nx_, ny_); __syncthreads(); //Step 0 => evolve x first, then y @@ -185,7 +188,9 @@ __global__ void HLL2Kernel( __syncthreads(); //Set boundary conditions - noFlowBoundary2(Q, nx_, ny_); + noFlowBoundary(Q[0], nx_, ny_); + noFlowBoundary(Q[1], nx_, ny_); + noFlowBoundary(Q[2], nx_, ny_); __syncthreads(); //Compute fluxes along the y axis and evolve @@ -207,7 +212,9 @@ __global__ void HLL2Kernel( __syncthreads(); //Set boundary conditions - noFlowBoundary2(Q, nx_, ny_); + noFlowBoundary(Q[0], nx_, ny_); + noFlowBoundary(Q[1], nx_, ny_); + noFlowBoundary(Q[2], nx_, ny_); __syncthreads(); //Compute fluxes along the x axis and evolve @@ -223,10 +230,9 @@ __global__ void HLL2Kernel( // Write to main memory for all internal cells - writeBlock2(h1_ptr_, h1_pitch_, - hu1_ptr_, hu1_pitch_, - hv1_ptr_, hv1_pitch_, - Q, nx_, ny_); + writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_); + writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_); + writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_); } } // extern "C" \ No newline at end of file diff --git a/GPUSimulators/cuda/SWE_KP07.cu b/GPUSimulators/cuda/SWE_KP07.cu index 41ffc54..09b44a5 100644 --- a/GPUSimulators/cuda/SWE_KP07.cu +++ b/GPUSimulators/cuda/SWE_KP07.cu @@ -118,6 +118,10 @@ __global__ void KP07Kernel( float* h1_ptr_, int h1_pitch_, float* hu1_ptr_, int hu1_pitch_, float* hv1_ptr_, int hv1_pitch_) { + + const unsigned int w = BLOCK_WIDTH; + const unsigned int h = BLOCK_HEIGHT; + const unsigned int gc = 2; //Index of thread within block const int tx = threadIdx.x; @@ -128,26 +132,28 @@ __global__ void KP07Kernel( const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2; //Shared memory variables - __shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4]; + __shared__ float Q[3][h+4][w+4]; //The following slightly wastes memory, but enables us to reuse the //funcitons in common.opencl - __shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2]; - __shared__ float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2]; - __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1]; - __shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1]; + __shared__ float Qx[3][h+2][w+2]; + __shared__ float Qy[3][h+2][w+2]; + __shared__ float F[3][h+1][w+1]; + __shared__ float G[3][h+1][w+1]; //Read into shared memory - float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_}; - int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_}; - readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_+2, ny_+2); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2); __syncthreads(); //Fix boundary conditions - noFlowBoundary2(Q, nx_, ny_); + noFlowBoundary(Q[0], nx_, ny_); + noFlowBoundary(Q[1], nx_, ny_); + noFlowBoundary(Q[2], nx_, ny_); __syncthreads(); diff --git a/GPUSimulators/cuda/SWE_KP07_dimsplit.cu b/GPUSimulators/cuda/SWE_KP07_dimsplit.cu index a350bcb..74381d0 100644 --- a/GPUSimulators/cuda/SWE_KP07_dimsplit.cu +++ b/GPUSimulators/cuda/SWE_KP07_dimsplit.cu @@ -146,24 +146,30 @@ __global__ void KP07DimsplitKernel( float* h1_ptr_, int h1_pitch_, float* hu1_ptr_, int hu1_pitch_, float* hv1_ptr_, int hv1_pitch_) { + + const unsigned int w = BLOCK_WIDTH; + const unsigned int h = BLOCK_HEIGHT; + const unsigned int gc = 2; //Shared memory variables - __shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4]; - __shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2]; - __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1]; + __shared__ float Q[3][h+4][w+4]; + __shared__ float Qx[3][h+2][w+2]; + __shared__ float F[3][h+1][w+1]; //Read into shared memory - float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_}; - int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_}; - readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_+2, ny_+2); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2); __syncthreads(); //Fix boundary conditions - noFlowBoundary2(Q, nx_, ny_); + noFlowBoundary(Q[0], nx_, ny_); + noFlowBoundary(Q[1], nx_, ny_); + noFlowBoundary(Q[2], nx_, ny_); __syncthreads(); @@ -179,7 +185,9 @@ __global__ void KP07DimsplitKernel( __syncthreads(); //Set boundary conditions - noFlowBoundary2(Q, nx_, ny_); + noFlowBoundary(Q[0], nx_, ny_); + noFlowBoundary(Q[1], nx_, ny_); + noFlowBoundary(Q[2], nx_, ny_); __syncthreads(); //Compute fluxes along the y axis and evolve @@ -201,7 +209,9 @@ __global__ void KP07DimsplitKernel( __syncthreads(); //Set boundary conditions - noFlowBoundary2(Q, nx_, ny_); + noFlowBoundary(Q[0], nx_, ny_); + noFlowBoundary(Q[1], nx_, ny_); + noFlowBoundary(Q[2], nx_, ny_); __syncthreads(); //Compute fluxes along the x axis and evolve @@ -215,10 +225,9 @@ __global__ void KP07DimsplitKernel( // Write to main memory for all internal cells - writeBlock2(h1_ptr_, h1_pitch_, - hu1_ptr_, hu1_pitch_, - hv1_ptr_, hv1_pitch_, - Q, nx_, ny_); + writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_); + writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_); + writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_); } } // extern "C" \ No newline at end of file diff --git a/GPUSimulators/cuda/SWE_LxF.cu b/GPUSimulators/cuda/SWE_LxF.cu index f9cbcae..20d6294 100644 --- a/GPUSimulators/cuda/SWE_LxF.cu +++ b/GPUSimulators/cuda/SWE_LxF.cu @@ -97,7 +97,6 @@ void computeFluxG(float Q[3][block_height+2][block_width+2], } - extern "C" { __global__ void LxFKernel( @@ -114,44 +113,50 @@ void LxFKernel( float* h1_ptr_, int h1_pitch_, float* hu1_ptr_, int hu1_pitch_, float* hv1_ptr_, int hv1_pitch_) { - - const int tx = threadIdx.x; - const int ty = threadIdx.y; - __shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2]; - __shared__ float F[3][BLOCK_HEIGHT][BLOCK_WIDTH+1]; - __shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH]; + const unsigned int w = BLOCK_WIDTH; + const unsigned int h = BLOCK_HEIGHT; + const unsigned int gc = 1; - float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_}; - int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_}; + __shared__ float Q[3][h+2][w+2]; + __shared__ float F[3][h ][w+1]; + __shared__ float G[3][h+1][w ]; - readBlock<3, BLOCK_WIDTH+2, BLOCK_HEIGHT+2, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+2, ny_+2); + //Read from global memory + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_+2, ny_+2); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2); __syncthreads(); //Set boundary conditions - noFlowBoundary1(Q, nx_, ny_); + noFlowBoundary(Q[0], nx_, ny_); + noFlowBoundary(Q[1], nx_, ny_); + noFlowBoundary(Q[2], nx_, ny_); __syncthreads(); //Compute fluxes along the x and y axis - computeFluxF(Q, F, g_, dx_, dt_); - computeFluxG(Q, G, g_, dy_, dt_); + computeFluxF(Q, F, g_, dx_, dt_); + computeFluxG(Q, G, g_, dy_, dt_); __syncthreads(); - //Evolve for all cells + const int tx = threadIdx.x; + const int ty = threadIdx.y; const int i = tx + 1; //Skip local ghost cells, i.e., +1 const int j = ty + 1; + Q[0][j][i] += (F[0][ty][tx] - F[0][ty ][tx+1]) * dt_ / dx_ + (G[0][ty][tx] - G[0][ty+1][tx ]) * dt_ / dy_; Q[1][j][i] += (F[1][ty][tx] - F[1][ty ][tx+1]) * dt_ / dx_ + (G[1][ty][tx] - G[1][ty+1][tx ]) * dt_ / dy_; Q[2][j][i] += (F[2][ty][tx] - F[2][ty ][tx+1]) * dt_ / dx_ + (G[2][ty][tx] - G[2][ty+1][tx ]) * dt_ / dy_; + __syncthreads(); //Write to main memory - writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_); - writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_); - writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_); + writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_); + writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_); + writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_); } } // extern "C" diff --git a/GPUSimulators/cuda/SWE_WAF.cu b/GPUSimulators/cuda/SWE_WAF.cu index b56af1d..62a82e2 100644 --- a/GPUSimulators/cuda/SWE_WAF.cu +++ b/GPUSimulators/cuda/SWE_WAF.cu @@ -129,22 +129,29 @@ __global__ void WAFKernel( //Output h^{n+1} float* h1_ptr_, int h1_pitch_, float* hu1_ptr_, int hu1_pitch_, - float* hv1_ptr_, int hv1_pitch_) { + float* hv1_ptr_, int hv1_pitch_) { + + const unsigned int w = BLOCK_WIDTH; + const unsigned int h = BLOCK_HEIGHT; + const unsigned int gc = 2; + //Shared memory variables - __shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4]; - __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1]; + __shared__ float Q[3][h+4][w+4]; + __shared__ float F[3][h+1][w+1]; //Read into shared memory Q from global memory - float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_}; - int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_}; - readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_+2, ny_+2); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2); __syncthreads(); //Set boundary conditions - noFlowBoundary2(Q, nx_, ny_); + noFlowBoundary(Q[0], nx_, ny_); + noFlowBoundary(Q[1], nx_, ny_); + noFlowBoundary(Q[2], nx_, ny_); __syncthreads(); @@ -158,7 +165,9 @@ __global__ void WAFKernel( __syncthreads(); //Fix boundary conditions - noFlowBoundary2(Q, nx_, ny_); + noFlowBoundary(Q[0], nx_, ny_); + noFlowBoundary(Q[1], nx_, ny_); + noFlowBoundary(Q[2], nx_, ny_); __syncthreads(); //Compute fluxes along the y axis and evolve @@ -176,7 +185,9 @@ __global__ void WAFKernel( __syncthreads(); //Fix boundary conditions - noFlowBoundary2(Q, nx_, ny_); + noFlowBoundary(Q[0], nx_, ny_); + noFlowBoundary(Q[1], nx_, ny_); + noFlowBoundary(Q[2], nx_, ny_); __syncthreads(); //Compute fluxes along the x axis and evolve @@ -189,10 +200,9 @@ __global__ void WAFKernel( // Write to main memory for all internal cells - writeBlock2(h1_ptr_, h1_pitch_, - hu1_ptr_, hu1_pitch_, - hv1_ptr_, hv1_pitch_, - Q, nx_, ny_); + writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_); + writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_); + writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_); } } // extern "C" \ No newline at end of file diff --git a/GPUSimulators/cuda/common.h b/GPUSimulators/cuda/common.h index c0af6f5..faf961d 100644 --- a/GPUSimulators/cuda/common.h +++ b/GPUSimulators/cuda/common.h @@ -48,6 +48,10 @@ inline __device__ __host__ float clamp(const float f, const float a, const float return fmaxf(a, fminf(f, b)); } +inline __device__ __host__ int clamp(const int f, const int a, const int b) { + return (f < b) ? ( (f > a) ? f : a) : b; +} + @@ -60,37 +64,27 @@ __device__ float desingularize(float x_, float eps_) { - - /** * Reads a block of data with ghost cells */ -template -inline __device__ void readBlock(float* ptr_[vars], int pitch_[vars], - float shmem[vars][sm_height][sm_width], +template +inline __device__ void readBlock(float* ptr_, int pitch_, + float shmem[block_height+2*ghost_cells][block_width+2*ghost_cells], const int max_x_, const int max_y_) { - //Index of block within domain const int bx = blockDim.x * blockIdx.x; const int by = blockDim.y * blockIdx.y; - - float* rows[3]; - + //Read into shared memory - for (int j=threadIdx.y; j +template inline __device__ void writeBlock(float* ptr_, int pitch_, - float shmem[sm_height][sm_width], + float shmem[block_height+2*ghost_cells][block_width+2*ghost_cells], const int width, const int height) { //Index of cell within domain - const int ti = blockDim.x*blockIdx.x + threadIdx.x + offset_x; - const int tj = blockDim.y*blockIdx.y + threadIdx.y + offset_y; + const int ti = blockDim.x*blockIdx.x + threadIdx.x + ghost_cells; + const int tj = blockDim.y*blockIdx.y + threadIdx.y + ghost_cells; //Only write internal cells - if (ti < width+offset_x && tj < height+offset_y) { + if (ti < width+ghost_cells && tj < height+ghost_cells) { //Index of thread within block - const int tx = threadIdx.x + offset_x; - const int ty = threadIdx.y + offset_y; + const int tx = threadIdx.x + ghost_cells; + const int ty = threadIdx.y + ghost_cells; float* const row = (float*) ((char*) ptr_ + pitch_*tj); row[ti] = shmem[ty][tx]; @@ -129,60 +122,93 @@ inline __device__ void writeBlock(float* ptr_, int pitch_, -/** - * Writes a block of data to global memory for the shallow water equations. - */ -__device__ void writeBlock2(float* h_ptr_, int h_pitch_, - float* hu_ptr_, int hu_pitch_, - float* hv_ptr_, int hv_pitch_, - float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], - const int nx_, const int ny_) { - writeBlock( h_ptr_, h_pitch_, Q[0], nx_, ny_); - writeBlock(hu_ptr_, hu_pitch_, Q[1], nx_, ny_); - writeBlock(hv_ptr_, hv_pitch_, Q[2], nx_, ny_); -} - -/** - * No flow boundary conditions for the shallow water equations - * with one ghost cell in each direction - */ -__device__ void noFlowBoundary1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const int nx_, const int ny_) { - //Global index - const int ti = blockDim.x*blockIdx.x + threadIdx.x + 1; //Skip global ghost cells, i.e., +1 - const int tj = blockDim.y*blockIdx.y + threadIdx.y + 1; +template +__device__ void noFlowBoundary(float Q[block_height+2*ghost_cells][block_width+2*ghost_cells], const int nx_, const int ny_) { + const int ti = blockDim.x*blockIdx.x + threadIdx.x + ghost_cells; + const int tj = blockDim.y*blockIdx.y + threadIdx.y + ghost_cells; - //Block-local indices - const int tx = threadIdx.x; - const int ty = threadIdx.y; + const int i = threadIdx.x + ghost_cells; + const int j = threadIdx.y + ghost_cells; - const int i = tx + 1; //Skip local ghost cells, i.e., +1 - const int j = ty + 1; - //Fix boundary conditions - if (ti == 1) { - Q[0][j][i-1] = Q[0][j][i]; - Q[1][j][i-1] = -Q[1][j][i]; - Q[2][j][i-1] = Q[2][j][i]; + // West boundary + if (ti == ghost_cells) { + Q[j][i-1] = scale_east_west*Q[j][i]; } - if (ti == nx_) { - Q[0][j][i+1] = Q[0][j][i]; - Q[1][j][i+1] = -Q[1][j][i]; - Q[2][j][i+1] = Q[2][j][i]; + if (ghost_cells >= 2 && ti == ghost_cells + 1) { + Q[j][i-3] = scale_east_west*Q[j][i]; } - if (tj == 1) { - Q[0][j-1][i] = Q[0][j][i]; - Q[1][j-1][i] = Q[1][j][i]; - Q[2][j-1][i] = -Q[2][j][i]; + if (ghost_cells >= 3 && ti == ghost_cells + 2) { + Q[j][i-5] = scale_east_west*Q[j][i]; } - if (tj == ny_) { - Q[0][j+1][i] = Q[0][j][i]; - Q[1][j+1][i] = Q[1][j][i]; - Q[2][j+1][i] = -Q[2][j][i]; + if (ghost_cells >= 4 && ti == ghost_cells + 3) { + Q[j][i-7] = scale_east_west*Q[j][i]; + } + if (ghost_cells >= 5 && ti == ghost_cells + 4) { + Q[j][i-9] = scale_east_west*Q[j][i]; + } + + + + // East boundary + if (ti == nx_ + ghost_cells - 1) { + Q[j][i+1] = scale_east_west*Q[j][i]; + } + if (ghost_cells >= 2 && ti == nx_ + ghost_cells - 2) { + Q[j][i+3] = scale_east_west*Q[j][i]; + } + if (ghost_cells >= 3 && ti == nx_ + ghost_cells - 3) { + Q[j][i+5] = scale_east_west*Q[j][i]; + } + if (ghost_cells >= 3 && ti == nx_ + ghost_cells - 4) { + Q[j][i+7] = scale_east_west*Q[j][i]; + } + if (ghost_cells >= 3 && ti == nx_ + ghost_cells - 5) { + Q[j][i+9] = scale_east_west*Q[j][i]; + } + + + + + // South boundary + if (tj == ghost_cells) { + Q[j-1][i] = scale_north_south*Q[j][i]; + } + if (ghost_cells >= 2 && tj == ghost_cells + 1) { + Q[j-3][i] = scale_north_south*Q[j][i]; + } + if (ghost_cells >= 3 && tj == ghost_cells + 2) { + Q[j-5][i] = scale_north_south*Q[j][i]; + } + if (ghost_cells >= 4 && tj == ghost_cells + 3) { + Q[j-7][i] = scale_north_south*Q[j][i]; + } + if (ghost_cells >= 5 && tj == ghost_cells + 4) { + Q[j-9][i] = scale_north_south*Q[j][i]; + } + + + + // North boundary + if (tj == ny_ + ghost_cells - 1) { + Q[j+1][i] = scale_north_south*Q[j][i]; + } + if (ghost_cells >= 2 && tj == ny_ + ghost_cells - 2) { + Q[j+3][i] = scale_north_south*Q[j][i]; + } + if (ghost_cells >= 3 && tj == ny_ + ghost_cells - 3) { + Q[j+5][i] = scale_north_south*Q[j][i]; + } + if (ghost_cells >= 3 && tj == ny_ + ghost_cells - 4) { + Q[j+7][i] = scale_north_south*Q[j][i]; + } + if (ghost_cells >= 3 && tj == ny_ + ghost_cells - 5) { + Q[j+9][i] = scale_north_south*Q[j][i]; } } @@ -191,59 +217,9 @@ __device__ void noFlowBoundary1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const -/** - * No flow boundary conditions for the shallow water equations - * with two ghost cells in each direction - */ -__device__ void noFlowBoundary2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const int nx_, const int ny_) { - //Global index - const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2 - const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2; - - //Block-local indices - const int tx = threadIdx.x; - const int ty = threadIdx.y; - - const int i = tx + 2; //Skip local ghost cells, i.e., +2 - const int j = ty + 2; - - if (ti == 2) { - Q[0][j][i-1] = Q[0][j][i]; - Q[1][j][i-1] = -Q[1][j][i]; - Q[2][j][i-1] = Q[2][j][i]; - - Q[0][j][i-2] = Q[0][j][i+1]; - Q[1][j][i-2] = -Q[1][j][i+1]; - Q[2][j][i-2] = Q[2][j][i+1]; - } - if (ti == nx_+1) { - Q[0][j][i+1] = Q[0][j][i]; - Q[1][j][i+1] = -Q[1][j][i]; - Q[2][j][i+1] = Q[2][j][i]; - - Q[0][j][i+2] = Q[0][j][i-1]; - Q[1][j][i+2] = -Q[1][j][i-1]; - Q[2][j][i+2] = Q[2][j][i-1]; - } - if (tj == 2) { - Q[0][j-1][i] = Q[0][j][i]; - Q[1][j-1][i] = Q[1][j][i]; - Q[2][j-1][i] = -Q[2][j][i]; - - Q[0][j-2][i] = Q[0][j+1][i]; - Q[1][j-2][i] = Q[1][j+1][i]; - Q[2][j-2][i] = -Q[2][j+1][i]; - } - if (tj == ny_+1) { - Q[0][j+1][i] = Q[0][j][i]; - Q[1][j+1][i] = Q[1][j][i]; - Q[2][j+1][i] = -Q[2][j][i]; - - Q[0][j+2][i] = Q[0][j-1][i]; - Q[1][j+2][i] = Q[1][j-1][i]; - Q[2][j+2][i] = -Q[2][j-1][i]; - } -} + + +