diff --git a/GPUSimulators/FORCE.py b/GPUSimulators/FORCE.py index 616d965..8df04b1 100644 --- a/GPUSimulators/FORCE.py +++ b/GPUSimulators/FORCE.py @@ -41,7 +41,9 @@ class FORCE (Simulator.BaseSimulator): g, cfl_scale=0.9, boundary_conditions=BoundaryCondition(), - block_width=16, block_height=16): + block_width=16, block_height=16, + dt: float=None, + compile_opts: list[str]=[]): """ Initialization routine @@ -55,6 +57,7 @@ class FORCE (Simulator.BaseSimulator): dy: Grid cell spacing along y-axis (20 000 m) dt: Size of each timestep (90 s) g: Gravitational accelleration (9.81 m/s^2) + compile_opts: Pass a list of nvcc compiler options """ # Call super constructor @@ -75,11 +78,11 @@ class FORCE (Simulator.BaseSimulator): }, compile_args={ 'no_extern_c': True, - 'options': ["--use_fast_math"], + 'options': ["--use_fast_math"] + compile_opts, }, jit_compile_args={}) self.kernel = module.get_function("FORCEKernel") - self.kernel.prepare("iiffffiPiPiPiPiPiPiP") + self.kernel.prepare("iiffffiPiPiPiPiPiPiPiiii") #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, @@ -91,10 +94,15 @@ class FORCE (Simulator.BaseSimulator): 1, 1, [None, None, None]) self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) - dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) - dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) - dt = min(dt_x, dt_y) - self.cfl_data.fill(dt, stream=self.stream) + + if dt == None: + dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) + dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) + self.dt = min(dt_x, dt_y) + else: + self.dt = dt + + self.cfl_data.fill(self.dt, stream=self.stream) def substep(self, dt, step_number): self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, @@ -108,7 +116,10 @@ class FORCE (Simulator.BaseSimulator): self.u1[0].data.gpudata, self.u1[0].data.strides[0], self.u1[1].data.gpudata, self.u1[1].data.strides[0], self.u1[2].data.gpudata, self.u1[2].data.strides[0], - self.cfl_data.gpudata) + self.cfl_data.gpudata, + 0, 0, + self.nx, self.ny) + self.u0, self.u1 = self.u1, self.u0 def getOutput(self): diff --git a/GPUSimulators/HLL.py b/GPUSimulators/HLL.py index 52fc0ca..840cd83 100644 --- a/GPUSimulators/HLL.py +++ b/GPUSimulators/HLL.py @@ -40,7 +40,9 @@ class HLL (Simulator.BaseSimulator): g, cfl_scale=0.9, boundary_conditions=BoundaryCondition(), - block_width=16, block_height=16): + block_width=16, block_height=16, + dt: float=None, + compile_opts: list[str]=[]): """ Initialization routine @@ -54,6 +56,7 @@ class HLL (Simulator.BaseSimulator): dy: Grid cell spacing along y-axis (20 000 m) dt: Size of each timestep (90 s) g: Gravitational accelleration (9.81 m/s^2) + compile_opts: Pass a list of nvcc compiler options """ # Call super constructor @@ -74,11 +77,11 @@ class HLL (Simulator.BaseSimulator): }, compile_args={ 'no_extern_c': True, - 'options': ["--use_fast_math"], + 'options': ["--use_fast_math"] + compile_opts, }, jit_compile_args={}) self.kernel = module.get_function("HLLKernel") - self.kernel.prepare("iiffffiPiPiPiPiPiPiP") + self.kernel.prepare("iiffffiPiPiPiPiPiPiPiiii") #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, @@ -90,10 +93,14 @@ class HLL (Simulator.BaseSimulator): 1, 1, [None, None, None]) self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) - dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) - dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) - dt = min(dt_x, dt_y) - self.cfl_data.fill(dt, stream=self.stream) + if dt == None: + dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) + dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) + self.dt = min(dt_x, dt_y) + else: + self.dt = dt + + self.cfl_data.fill(self.dt, stream=self.stream) def substep(self, dt, step_number): self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, @@ -107,7 +114,9 @@ class HLL (Simulator.BaseSimulator): self.u1[0].data.gpudata, self.u1[0].data.strides[0], self.u1[1].data.gpudata, self.u1[1].data.strides[0], self.u1[2].data.gpudata, self.u1[2].data.strides[0], - self.cfl_data.gpudata) + self.cfl_data.gpudata, + 0, 0, + self.nx, self.ny) self.u0, self.u1 = self.u1, self.u0 def getOutput(self): diff --git a/GPUSimulators/HLL2.py b/GPUSimulators/HLL2.py index 9fdcdd8..12f8dcf 100644 --- a/GPUSimulators/HLL2.py +++ b/GPUSimulators/HLL2.py @@ -41,7 +41,9 @@ class HLL2 (Simulator.BaseSimulator): theta=1.8, cfl_scale=0.9, boundary_conditions=BoundaryCondition(), - block_width=16, block_height=16): + block_width=16, block_height=16, + dt: float=None, + compile_opts: list[str]=[]): """ Initialization routine @@ -55,6 +57,7 @@ class HLL2 (Simulator.BaseSimulator): dy: Grid cell spacing along y-axis (20 000 m) dt: Size of each timestep (90 s) g: Gravitational accelleration (9.81 m/s^2) + compile_opts: Pass a list of nvcc compiler options """ # Call super constructor @@ -76,11 +79,11 @@ class HLL2 (Simulator.BaseSimulator): }, compile_args={ 'no_extern_c': True, - 'options': ["--use_fast_math"], + 'options': ["--use_fast_math"] + compile_opts, }, jit_compile_args={}) self.kernel = module.get_function("HLL2Kernel") - self.kernel.prepare("iifffffiiPiPiPiPiPiPiP") + self.kernel.prepare("iifffffiiPiPiPiPiPiPiPiiii") #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, @@ -92,10 +95,15 @@ class HLL2 (Simulator.BaseSimulator): 2, 2, [None, None, None]) self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) - dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) - dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) - dt = min(dt_x, dt_y) - self.cfl_data.fill(dt, stream=self.stream) + + if dt == None: + dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) + dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) + self.dt = min(dt_x, dt_y) + else: + self.dt = dt + + self.cfl_data.fill(self.dt, stream=self.stream) def substep(self, dt, step_number): self.substepDimsplit(dt*0.5, step_number) @@ -114,7 +122,9 @@ class HLL2 (Simulator.BaseSimulator): self.u1[0].data.gpudata, self.u1[0].data.strides[0], self.u1[1].data.gpudata, self.u1[1].data.strides[0], self.u1[2].data.gpudata, self.u1[2].data.strides[0], - self.cfl_data.gpudata) + self.cfl_data.gpudata, + 0, 0, + self.nx, self.ny) self.u0, self.u1 = self.u1, self.u0 def getOutput(self): diff --git a/GPUSimulators/KP07.py b/GPUSimulators/KP07.py index 37ca7ec..ff255e7 100644 --- a/GPUSimulators/KP07.py +++ b/GPUSimulators/KP07.py @@ -47,7 +47,9 @@ class KP07 (Simulator.BaseSimulator): cfl_scale=0.9, order=2, boundary_conditions=BoundaryCondition(), - block_width=16, block_height=16): + block_width=16, block_height=16, + dt: float=None, + compile_opts: list[str]=[]): """ Initialization routine @@ -61,6 +63,7 @@ class KP07 (Simulator.BaseSimulator): dy: Grid cell spacing along y-axis (20 000 m) dt: Size of each timestep (90 s) g: Gravitational accelleration (9.81 m/s^2) + compile_opts: Pass a list of nvcc compiler options """ # Call super constructor @@ -83,11 +86,11 @@ class KP07 (Simulator.BaseSimulator): }, compile_args={ 'no_extern_c': True, - 'options': ["--use_fast_math"], + 'options': ["--use_fast_math"] + compile_opts, }, jit_compile_args={}) self.kernel = module.get_function("KP07Kernel") - self.kernel.prepare("iifffffiiPiPiPiPiPiPiP") + self.kernel.prepare("iifffffiiPiPiPiPiPiPiPiiii") #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, @@ -99,10 +102,15 @@ class KP07 (Simulator.BaseSimulator): 2, 2, [None, None, None]) self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) - dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) - dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) - dt = min(dt_x, dt_y) - self.cfl_data.fill(dt, stream=self.stream) + + if dt == None: + dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) + dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) + self.dt = min(dt_x, dt_y) + else: + self.dt = dt + + self.cfl_data.fill(self.dt, stream=self.stream) def substep(self, dt, step_number): @@ -123,7 +131,9 @@ class KP07 (Simulator.BaseSimulator): self.u1[0].data.gpudata, self.u1[0].data.strides[0], self.u1[1].data.gpudata, self.u1[1].data.strides[0], self.u1[2].data.gpudata, self.u1[2].data.strides[0], - self.cfl_data.gpudata) + self.cfl_data.gpudata, + 0, 0, + self.nx, self.ny) self.u0, self.u1 = self.u1, self.u0 def getOutput(self): diff --git a/GPUSimulators/KP07_dimsplit.py b/GPUSimulators/KP07_dimsplit.py index 680adb7..b5eb7aa 100644 --- a/GPUSimulators/KP07_dimsplit.py +++ b/GPUSimulators/KP07_dimsplit.py @@ -46,7 +46,9 @@ class KP07_dimsplit(Simulator.BaseSimulator): theta=1.3, cfl_scale=0.9, boundary_conditions=BoundaryCondition(), - block_width=16, block_height=16): + block_width=16, block_height=16, + dt: float=None, + compile_opts: list[str]=[]): """ Initialization routine @@ -60,6 +62,7 @@ class KP07_dimsplit(Simulator.BaseSimulator): dy: Grid cell spacing along y-axis (20 000 m) dt: Size of each timestep (90 s) g: Gravitational accelleration (9.81 m/s^2) + compile_opts: Pass a list of nvcc compiler options """ # Call super constructor @@ -83,11 +86,11 @@ class KP07_dimsplit(Simulator.BaseSimulator): }, compile_args={ 'no_extern_c': True, - 'options': ["--use_fast_math"], + 'options': ["--use_fast_math"] + compile_opts, }, jit_compile_args={}) self.kernel = module.get_function("KP07DimsplitKernel") - self.kernel.prepare("iifffffiiPiPiPiPiPiPiP") + self.kernel.prepare("iifffffiiPiPiPiPiPiPiPiiii") #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, @@ -99,10 +102,15 @@ class KP07_dimsplit(Simulator.BaseSimulator): self.gc_x, self.gc_y, [None, None, None]) self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) - dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) - dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) - dt = min(dt_x, dt_y) - self.cfl_data.fill(dt, stream=self.stream) + + if dt == None: + dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) + dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) + self.dt = min(dt_x, dt_y) + else: + self.dt = dt + + self.cfl_data.fill(self.dt, stream=self.stream) def substep(self, dt, step_number): self.substepDimsplit(dt*0.5, step_number) @@ -121,7 +129,9 @@ class KP07_dimsplit(Simulator.BaseSimulator): self.u1[0].data.gpudata, self.u1[0].data.strides[0], self.u1[1].data.gpudata, self.u1[1].data.strides[0], self.u1[2].data.gpudata, self.u1[2].data.strides[0], - self.cfl_data.gpudata) + self.cfl_data.gpudata, + 0, 0, + self.nx, self.ny) self.u0, self.u1 = self.u1, self.u0 def getOutput(self): diff --git a/GPUSimulators/LxF.py b/GPUSimulators/LxF.py index e5833a9..614b514 100644 --- a/GPUSimulators/LxF.py +++ b/GPUSimulators/LxF.py @@ -41,7 +41,9 @@ class LxF (Simulator.BaseSimulator): g: float, cfl_scale: float=0.9, boundary_conditions=BoundaryCondition(), - block_width: int=16, block_height: int=16): + block_width: int=16, block_height: int=16, + dt: float=None, + compile_opts: list[str]=[]): """ Initialization routine @@ -55,6 +57,7 @@ class LxF (Simulator.BaseSimulator): dy: Grid cell spacing along y-axis (20 000 m) dt: Size of each timestep (90 s) g: Gravitational accelleration (9.81 m/s^2) + compile_opts: Pass a list of nvcc compiler options """ # Call super constructor @@ -75,11 +78,11 @@ class LxF (Simulator.BaseSimulator): }, compile_args={ 'no_extern_c': True, - 'options': ["--use_fast_math"], + 'options': ["--use_fast_math"] + compile_opts, }, jit_compile_args={}) self.kernel = module.get_function("LxFKernel") - self.kernel.prepare("iiffffiPiPiPiPiPiPiP") + self.kernel.prepare("iiffffiPiPiPiPiPiPiPiiii") #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, @@ -91,10 +94,15 @@ class LxF (Simulator.BaseSimulator): 1, 1, [None, None, None]) self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) - dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) - dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) - dt = min(dt_x, dt_y) - self.cfl_data.fill(dt, stream=self.stream) + + if dt == None: + dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) + dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) + self.dt = min(dt_x, dt_y) + else: + self.dt = dt + + self.cfl_data.fill(self.dt, stream=self.stream) def substep(self, dt, step_number): """ @@ -113,7 +121,9 @@ class LxF (Simulator.BaseSimulator): self.u1[0].data.gpudata, self.u1[0].data.strides[0], self.u1[1].data.gpudata, self.u1[1].data.strides[0], self.u1[2].data.gpudata, self.u1[2].data.strides[0], - self.cfl_data.gpudata) + self.cfl_data.gpudata, + 0, 0, + self.nx, self.ny) self.u0, self.u1 = self.u1, self.u0 def getOutput(self): diff --git a/GPUSimulators/WAF.py b/GPUSimulators/WAF.py index 21fd484..25c0d6a 100644 --- a/GPUSimulators/WAF.py +++ b/GPUSimulators/WAF.py @@ -41,7 +41,9 @@ class WAF (Simulator.BaseSimulator): g, cfl_scale=0.9, boundary_conditions=BoundaryCondition(), - block_width=16, block_height=16): + block_width=16, block_height=16, + dt: float=None, + compile_opts: list[str]=[]): """ Initialization routine @@ -55,6 +57,7 @@ class WAF (Simulator.BaseSimulator): dy: Grid cell spacing along y-axis (20 000 m) dt: Size of each timestep (90 s) g: Gravitational accelleration (9.81 m/s^2) + compile_opts: Pass a list of nvcc compiler options """ # Call super constructor @@ -75,11 +78,11 @@ class WAF (Simulator.BaseSimulator): }, compile_args={ 'no_extern_c': True, - 'options': ["--use_fast_math"], + 'options': ["--use_fast_math"] + compile_opts, }, jit_compile_args={}) self.kernel = module.get_function("WAFKernel") - self.kernel.prepare("iiffffiiPiPiPiPiPiPiP") + self.kernel.prepare("iiffffiiPiPiPiPiPiPiPiiii") #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, @@ -91,10 +94,15 @@ class WAF (Simulator.BaseSimulator): 2, 2, [None, None, None]) self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) - dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) - dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) - dt = min(dt_x, dt_y) - self.cfl_data.fill(dt, stream=self.stream) + + if dt == None: + dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) + dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) + self.dt = min(dt_x, dt_y) + else: + self.dt = dt + + self.cfl_data.fill(self.dt, stream=self.stream) def substep(self, dt, step_number): self.substepDimsplit(dt*0.5, step_number) @@ -112,7 +120,9 @@ class WAF (Simulator.BaseSimulator): self.u1[0].data.gpudata, self.u1[0].data.strides[0], self.u1[1].data.gpudata, self.u1[1].data.strides[0], self.u1[2].data.gpudata, self.u1[2].data.strides[0], - self.cfl_data.gpudata) + self.cfl_data.gpudata, + 0, 0, + self.nx, self.ny) self.u0, self.u1 = self.u1, self.u0 def getOutput(self): diff --git a/GPUSimulators/cuda/SWE2D_FORCE.cu b/GPUSimulators/cuda/SWE2D_FORCE.cu index dac46be..787fbf4 100644 --- a/GPUSimulators/cuda/SWE2D_FORCE.cu +++ b/GPUSimulators/cuda/SWE2D_FORCE.cu @@ -100,8 +100,18 @@ __global__ void FORCEKernel( float* hv1_ptr_, int hv1_pitch_, //Output CFL - float* cfl_) { - + float* cfl_, + + //Subarea of internal domain to compute + int x0=0, int y0=0, + int x1=0, int y1=0) { + + if(x1 == 0) + x1 = nx_; + + if(y1 == 0) + y1 = ny_; + const unsigned int w = BLOCK_WIDTH; const unsigned int h = BLOCK_HEIGHT; const unsigned int gc_x = 1; @@ -112,9 +122,9 @@ __global__ void FORCEKernel( __shared__ float F[vars][h+2*gc_y][w+2*gc_x]; //Read into shared memory - readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); __syncthreads(); //Compute flux along x, and evolve @@ -130,9 +140,9 @@ __global__ void FORCEKernel( __syncthreads(); //Write to main memory - writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1); - writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1); - writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1); + writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1); + writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1); + writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1); //Compute the CFL for this block if (cfl_ != NULL) { diff --git a/GPUSimulators/cuda/SWE2D_HLL.cu b/GPUSimulators/cuda/SWE2D_HLL.cu index 3ed6b35..07c5cec 100644 --- a/GPUSimulators/cuda/SWE2D_HLL.cu +++ b/GPUSimulators/cuda/SWE2D_HLL.cu @@ -116,7 +116,17 @@ __global__ void HLLKernel( float* hv1_ptr_, int hv1_pitch_, //Output CFL - float* cfl_) { + float* cfl_, + + //Subarea of internal domain to compute + int x0=0, int y0=0, + int x1=0, int y1=0) { + + if(x1 == 0) + x1 = nx_; + + if(y1 == 0) + y1 = ny_; const unsigned int w = BLOCK_WIDTH; const unsigned int h = BLOCK_HEIGHT; @@ -129,9 +139,9 @@ __global__ void HLLKernel( __shared__ float F[vars][h+2*gc_y][w+2*gc_x]; //Read into shared memory - readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); //Compute F flux computeFluxF(Q, F, g_); @@ -148,9 +158,9 @@ __global__ void HLLKernel( __syncthreads(); // Write to main memory for all internal cells - writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1); - writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1); - writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1); + writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1); + writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1); + writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1); //Compute the CFL for this block if (cfl_ != NULL) { diff --git a/GPUSimulators/cuda/SWE2D_HLL2.cu b/GPUSimulators/cuda/SWE2D_HLL2.cu index 94f92b5..e920948 100644 --- a/GPUSimulators/cuda/SWE2D_HLL2.cu +++ b/GPUSimulators/cuda/SWE2D_HLL2.cu @@ -144,7 +144,17 @@ __global__ void HLL2Kernel( float* hv1_ptr_, int hv1_pitch_, //Output CFL - float* cfl_) { + float* cfl_, + + //Subarea of internal domain to compute + int x0=0, int y0=0, + int x1=0, int y1=0) { + + if(x1 == 0) + x1 = nx_; + + if(y1 == 0) + y1 = ny_; const unsigned int w = BLOCK_WIDTH; const unsigned int h = BLOCK_HEIGHT; @@ -158,9 +168,9 @@ __global__ void HLL2Kernel( __shared__ float F[3][h+4][w+4]; //Read into shared memory - readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); //Step 0 => evolve x first, then y if (step_ == 0) { @@ -203,9 +213,9 @@ __global__ void HLL2Kernel( // Write to main memory for all internal cells - writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1); - writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1); - writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1); + writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1); + writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1); + writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1); //Compute the CFL for this block if (cfl_ != NULL) { diff --git a/GPUSimulators/cuda/SWE2D_KP07.cu b/GPUSimulators/cuda/SWE2D_KP07.cu index 6fa6154..f39ead4 100644 --- a/GPUSimulators/cuda/SWE2D_KP07.cu +++ b/GPUSimulators/cuda/SWE2D_KP07.cu @@ -154,7 +154,18 @@ __global__ void KP07Kernel( float* hv1_ptr_, int hv1_pitch_, //Output CFL - float* cfl_) { + float* cfl_, + + //Subarea of internal domain to compute + int x0=0, int y0=0, + int x1=0, int y1=0) { + + if(x1 == 0) + x1 = nx_; + + if(y1 == 0) + y1 = ny_; + const unsigned int w = BLOCK_WIDTH; const unsigned int h = BLOCK_HEIGHT; const unsigned int gc_x = 2; @@ -179,9 +190,9 @@ __global__ void KP07Kernel( //Read into shared memory - readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); //Reconstruct slopes along x and axis diff --git a/GPUSimulators/cuda/SWE2D_KP07_dimsplit.cu b/GPUSimulators/cuda/SWE2D_KP07_dimsplit.cu index ac256e3..a8bfae8 100644 --- a/GPUSimulators/cuda/SWE2D_KP07_dimsplit.cu +++ b/GPUSimulators/cuda/SWE2D_KP07_dimsplit.cu @@ -141,7 +141,18 @@ __global__ void KP07DimsplitKernel( float* hv1_ptr_, int hv1_pitch_, //Output CFL - float* cfl_) { + float* cfl_, + + //Subarea of internal domain to compute + int x0=0, int y0=0, + int x1=0, int y1=0) { + + if(x1 == 0) + x1 = nx_; + + if(y1 == 0) + y1 = ny_; + const unsigned int w = BLOCK_WIDTH; const unsigned int h = BLOCK_HEIGHT; const unsigned int gc_x = 2; @@ -154,9 +165,9 @@ __global__ void KP07DimsplitKernel( __shared__ float F[vars][h+2*gc_y][w+2*gc_x]; //Read into shared memory - readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); if (step_ == 0) { //Along X @@ -194,9 +205,9 @@ __global__ void KP07DimsplitKernel( } // Write to main memory for all internal cells - writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1); - writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1); - writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1); + writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1); + writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1); + writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1); //Compute the CFL for this block if (cfl_ != NULL) { diff --git a/GPUSimulators/cuda/SWE2D_LxF.cu b/GPUSimulators/cuda/SWE2D_LxF.cu index 1f197fd..a68cd29 100644 --- a/GPUSimulators/cuda/SWE2D_LxF.cu +++ b/GPUSimulators/cuda/SWE2D_LxF.cu @@ -116,8 +116,18 @@ void LxFKernel( float* hu1_ptr_, int hu1_pitch_, float* hv1_ptr_, int hv1_pitch_, - //Output CFL - float* cfl_) { + //Output CFL + float* cfl_, + + //Subarea of internal domain to compute + int x0=0, int y0=0, + int x1=0, int y1=0) { + + if(x1 == 0) + x1 = nx_; + + if(y1 == 0) + y1 = ny_; const unsigned int w = BLOCK_WIDTH; const unsigned int h = BLOCK_HEIGHT; @@ -130,9 +140,9 @@ void LxFKernel( __shared__ float G[vars][h+1][w ]; //Read from global memory - readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); //Compute fluxes along the x and y axis computeFluxF(Q, F, g_, dx_, dt_); @@ -154,9 +164,9 @@ void LxFKernel( __syncthreads(); //Write to main memory - writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1); - writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1); - writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1); + writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1); + writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1); + writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1); //Compute the CFL for this block if (cfl_ != NULL) { diff --git a/GPUSimulators/cuda/SWE2D_WAF.cu b/GPUSimulators/cuda/SWE2D_WAF.cu index 2c38cdf..cfcbd79 100644 --- a/GPUSimulators/cuda/SWE2D_WAF.cu +++ b/GPUSimulators/cuda/SWE2D_WAF.cu @@ -116,7 +116,17 @@ __global__ void WAFKernel( //Output h^{n+1} float* h1_ptr_, int h1_pitch_, float* hu1_ptr_, int hu1_pitch_, - float* hv1_ptr_, int hv1_pitch_) { + float* hv1_ptr_, int hv1_pitch_, + + //Subarea of internal domain to compute + int x0=0, int y0=0, + int x1=0, int y1=0) { + + if(x1 == 0) + x1 = nx_; + + if(y1 == 0) + y1 = ny_; const unsigned int w = BLOCK_WIDTH; const unsigned int h = BLOCK_HEIGHT; @@ -131,9 +141,9 @@ __global__ void WAFKernel( //Read into shared memory Q from global memory - readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); - readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); - readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); + readBlock( h0_ptr_, h0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); + readBlock(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); + readBlock(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); __syncthreads(); @@ -170,9 +180,9 @@ __global__ void WAFKernel( // Write to main memory for all internal cells - writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1); - writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1); - writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1); + writeBlock( h1_ptr_, h1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1); + writeBlock(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1); + writeBlock(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1); } } // extern "C" \ No newline at end of file diff --git a/GPUSimulators/cuda/SWECommon.h b/GPUSimulators/cuda/SWECommon.h index 52f8b31..72eff5c 100644 --- a/GPUSimulators/cuda/SWECommon.h +++ b/GPUSimulators/cuda/SWECommon.h @@ -477,10 +477,11 @@ __device__ float3 FORCE_1D_flux(const float3 Q_l, const float3 Q_r, const float - -template -__device__ void writeCfl(float Q[vars][h+2*gc_y][w+2*gc_x], - float shmem[h+2*gc_y][w+2*gc_x], +// TODO give better names for `Q_w` and `Q_h` in the template +// as it probably does not reflect well on the name +template +__device__ void writeCfl(float Q[vars][Q_h+2*gc_y][Q_w+2*gc_x], + float shmem[Q_h+2*gc_y][Q_w+2*gc_x], const int nx_, const int ny_, const float dx_, const float dy_, const float g_, float* output_) { @@ -509,7 +510,7 @@ __device__ void writeCfl(float Q[vars][h+2*gc_y][w+2*gc_x], if (ti < nx_+gc_x && tj < ny_+gc_y) { if (ty == gc_y) { float min_val = shmem[ty][tx]; - const int max_y = min(h, ny_+gc_y - tj); + const int max_y = min(Q_h, ny_+gc_y - tj); for (int j=gc_y; j