mirror of
				https://github.com/smyalygames/FiniteVolumeGPU.git
				synced 2025-10-31 20:27:40 +01:00 
			
		
		
		
	feat(GPUSimulator): add the extra variables introduced in SWWECommon for all other algorithms
This commit is contained in:
		
							parent
							
								
									71cc5628c8
								
							
						
					
					
						commit
						aa54d21f0a
					
				| @ -41,7 +41,9 @@ class FORCE (Simulator.BaseSimulator): | ||||
|                  g,  | ||||
|                  cfl_scale=0.9, | ||||
|                  boundary_conditions=BoundaryCondition(),  | ||||
|                  block_width=16, block_height=16): | ||||
|                  block_width=16, block_height=16, | ||||
|                  dt: float=None, | ||||
|                  compile_opts: list[str]=[]): | ||||
|         """ | ||||
|         Initialization routine | ||||
|          | ||||
| @ -55,6 +57,7 @@ class FORCE (Simulator.BaseSimulator): | ||||
|             dy: Grid cell spacing along y-axis (20 000 m) | ||||
|             dt: Size of each timestep (90 s) | ||||
|             g: Gravitational acceleration (9.81 m/s^2) | ||||
|             compile_opts: Pass a list of nvcc compiler options | ||||
|         """ | ||||
|                   | ||||
|         # Call super constructor | ||||
| @ -75,11 +78,11 @@ class FORCE (Simulator.BaseSimulator): | ||||
|                                         },  | ||||
|                                         compile_args={ | ||||
|                                             'no_extern_c': True, | ||||
|                                             'options': ["--use_fast_math"],  | ||||
|                                             'options': ["--use_fast_math"] + compile_opts,  | ||||
|                                         },  | ||||
|                                         jit_compile_args={}) | ||||
|         self.kernel = module.get_function("FORCEKernel") | ||||
|         self.kernel.prepare("iiffffiPiPiPiPiPiPiP") | ||||
|         self.kernel.prepare("iiffffiPiPiPiPiPiPiPiiii") | ||||
|      | ||||
|         #Create data by uploading to device | ||||
|         self.u0 = Common.ArakawaA2D(self.stream,  | ||||
| @ -91,10 +94,15 @@ class FORCE (Simulator.BaseSimulator): | ||||
|                         1, 1,  | ||||
|                         [None, None, None]) | ||||
|         self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) | ||||
|         dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) | ||||
|         dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) | ||||
|         dt = min(dt_x, dt_y) | ||||
|         self.cfl_data.fill(dt, stream=self.stream) | ||||
| 
 | ||||
|         if dt == None: | ||||
|             dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) | ||||
|             dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) | ||||
|             self.dt = min(dt_x, dt_y) | ||||
|         else: | ||||
|             self.dt = dt | ||||
| 
 | ||||
|         self.cfl_data.fill(self.dt, stream=self.stream) | ||||
|          | ||||
|     def substep(self, dt, step_number): | ||||
|         self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,  | ||||
| @ -108,7 +116,10 @@ class FORCE (Simulator.BaseSimulator): | ||||
|                 self.u1[0].data.gpudata, self.u1[0].data.strides[0],  | ||||
|                 self.u1[1].data.gpudata, self.u1[1].data.strides[0],  | ||||
|                 self.u1[2].data.gpudata, self.u1[2].data.strides[0], | ||||
|                 self.cfl_data.gpudata) | ||||
|                 self.cfl_data.gpudata, | ||||
|                 0, 0, | ||||
|                 self.nx, self.ny) | ||||
| 
 | ||||
|         self.u0, self.u1 = self.u1, self.u0 | ||||
|          | ||||
|     def getOutput(self): | ||||
|  | ||||
| @ -40,7 +40,9 @@ class HLL (Simulator.BaseSimulator): | ||||
|                  g,  | ||||
|                  cfl_scale=0.9, | ||||
|                  boundary_conditions=BoundaryCondition(),  | ||||
|                  block_width=16, block_height=16): | ||||
|                  block_width=16, block_height=16, | ||||
|                  dt: float=None, | ||||
|                  compile_opts: list[str]=[]): | ||||
|         """ | ||||
|         Initialization routine | ||||
| 
 | ||||
| @ -54,6 +56,7 @@ class HLL (Simulator.BaseSimulator): | ||||
|             dy: Grid cell spacing along y-axis (20 000 m) | ||||
|             dt: Size of each timestep (90 s) | ||||
|             g: Gravitational acceleration (9.81 m/s^2) | ||||
|             compile_opts: Pass a list of nvcc compiler options | ||||
|         """ | ||||
|                   | ||||
|         # Call super constructor | ||||
| @ -74,11 +77,11 @@ class HLL (Simulator.BaseSimulator): | ||||
|                                         },  | ||||
|                                         compile_args={ | ||||
|                                             'no_extern_c': True, | ||||
|                                             'options': ["--use_fast_math"],  | ||||
|                                             'options': ["--use_fast_math"] + compile_opts,  | ||||
|                                         },  | ||||
|                                         jit_compile_args={}) | ||||
|         self.kernel = module.get_function("HLLKernel") | ||||
|         self.kernel.prepare("iiffffiPiPiPiPiPiPiP") | ||||
|         self.kernel.prepare("iiffffiPiPiPiPiPiPiPiiii") | ||||
|      | ||||
|         #Create data by uploading to device | ||||
|         self.u0 = Common.ArakawaA2D(self.stream,  | ||||
| @ -90,10 +93,14 @@ class HLL (Simulator.BaseSimulator): | ||||
|                         1, 1,  | ||||
|                         [None, None, None]) | ||||
|         self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) | ||||
|         dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) | ||||
|         dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) | ||||
|         dt = min(dt_x, dt_y) | ||||
|         self.cfl_data.fill(dt, stream=self.stream) | ||||
|         if dt == None: | ||||
|             dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) | ||||
|             dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) | ||||
|             self.dt = min(dt_x, dt_y) | ||||
|         else: | ||||
|             self.dt = dt | ||||
|          | ||||
|         self.cfl_data.fill(self.dt, stream=self.stream) | ||||
|          | ||||
|     def substep(self, dt, step_number): | ||||
|         self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,  | ||||
| @ -107,7 +114,9 @@ class HLL (Simulator.BaseSimulator): | ||||
|                 self.u1[0].data.gpudata, self.u1[0].data.strides[0],  | ||||
|                 self.u1[1].data.gpudata, self.u1[1].data.strides[0],  | ||||
|                 self.u1[2].data.gpudata, self.u1[2].data.strides[0], | ||||
|                 self.cfl_data.gpudata) | ||||
|                 self.cfl_data.gpudata, | ||||
|                 0, 0, | ||||
|                 self.nx, self.ny) | ||||
|         self.u0, self.u1 = self.u1, self.u0 | ||||
|          | ||||
|     def getOutput(self): | ||||
|  | ||||
| @ -41,7 +41,9 @@ class HLL2 (Simulator.BaseSimulator): | ||||
|                  theta=1.8,  | ||||
|                  cfl_scale=0.9, | ||||
|                  boundary_conditions=BoundaryCondition(),  | ||||
|                  block_width=16, block_height=16): | ||||
|                  block_width=16, block_height=16, | ||||
|                  dt: float=None, | ||||
|                  compile_opts: list[str]=[]): | ||||
|         """ | ||||
|         Initialization routine | ||||
| 
 | ||||
| @ -55,6 +57,7 @@ class HLL2 (Simulator.BaseSimulator): | ||||
|             dy: Grid cell spacing along y-axis (20 000 m) | ||||
|             dt: Size of each timestep (90 s) | ||||
|             g: Gravitational acceleration (9.81 m/s^2) | ||||
|             compile_opts: Pass a list of nvcc compiler options | ||||
|         """ | ||||
|                   | ||||
|         # Call super constructor | ||||
| @ -76,11 +79,11 @@ class HLL2 (Simulator.BaseSimulator): | ||||
|                                         },  | ||||
|                                         compile_args={ | ||||
|                                             'no_extern_c': True, | ||||
|                                             'options': ["--use_fast_math"],  | ||||
|                                             'options': ["--use_fast_math"] + compile_opts,  | ||||
|                                         },  | ||||
|                                         jit_compile_args={}) | ||||
|         self.kernel = module.get_function("HLL2Kernel") | ||||
|         self.kernel.prepare("iifffffiiPiPiPiPiPiPiP") | ||||
|         self.kernel.prepare("iifffffiiPiPiPiPiPiPiPiiii") | ||||
|          | ||||
|         #Create data by uploading to device | ||||
|         self.u0 = Common.ArakawaA2D(self.stream,  | ||||
| @ -92,10 +95,15 @@ class HLL2 (Simulator.BaseSimulator): | ||||
|                         2, 2,  | ||||
|                         [None, None, None]) | ||||
|         self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) | ||||
|         dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) | ||||
|         dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) | ||||
|         dt = min(dt_x, dt_y) | ||||
|         self.cfl_data.fill(dt, stream=self.stream) | ||||
|          | ||||
|         if dt == None: | ||||
|             dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) | ||||
|             dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) | ||||
|             self.dt = min(dt_x, dt_y) | ||||
|         else: | ||||
|             self.dt = dt | ||||
|          | ||||
|         self.cfl_data.fill(self.dt, stream=self.stream) | ||||
|          | ||||
|     def substep(self, dt, step_number): | ||||
|         self.substepDimsplit(dt*0.5, step_number) | ||||
| @ -114,7 +122,9 @@ class HLL2 (Simulator.BaseSimulator): | ||||
|                 self.u1[0].data.gpudata, self.u1[0].data.strides[0],  | ||||
|                 self.u1[1].data.gpudata, self.u1[1].data.strides[0],  | ||||
|                 self.u1[2].data.gpudata, self.u1[2].data.strides[0], | ||||
|                 self.cfl_data.gpudata) | ||||
|                 self.cfl_data.gpudata, | ||||
|                 0, 0, | ||||
|                 self.nx, self.ny) | ||||
|         self.u0, self.u1 = self.u1, self.u0 | ||||
|      | ||||
|     def getOutput(self): | ||||
|  | ||||
| @ -47,7 +47,9 @@ class KP07 (Simulator.BaseSimulator): | ||||
|                  cfl_scale=0.9, | ||||
|                  order=2, | ||||
|                  boundary_conditions=BoundaryCondition(),  | ||||
|                  block_width=16, block_height=16): | ||||
|                  block_width=16, block_height=16, | ||||
|                  dt: float=None, | ||||
|                  compile_opts: list[str]=[]): | ||||
|         """ | ||||
|         Initialization routine | ||||
|          | ||||
| @ -61,6 +63,7 @@ class KP07 (Simulator.BaseSimulator): | ||||
|             dy: Grid cell spacing along y-axis (20 000 m) | ||||
|             dt: Size of each timestep (90 s) | ||||
|             g: Gravitational acceleration (9.81 m/s^2) | ||||
|             compile_opts: Pass a list of nvcc compiler options | ||||
|         """ | ||||
|                   | ||||
|         # Call super constructor | ||||
| @ -83,11 +86,11 @@ class KP07 (Simulator.BaseSimulator): | ||||
|                                         },  | ||||
|                                         compile_args={ | ||||
|                                             'no_extern_c': True, | ||||
|                                             'options': ["--use_fast_math"],  | ||||
|                                             'options': ["--use_fast_math"] + compile_opts,  | ||||
|                                         },  | ||||
|                                         jit_compile_args={}) | ||||
|         self.kernel = module.get_function("KP07Kernel") | ||||
|         self.kernel.prepare("iifffffiiPiPiPiPiPiPiP") | ||||
|         self.kernel.prepare("iifffffiiPiPiPiPiPiPiPiiii") | ||||
|          | ||||
|         #Create data by uploading to device | ||||
|         self.u0 = Common.ArakawaA2D(self.stream,  | ||||
| @ -99,10 +102,15 @@ class KP07 (Simulator.BaseSimulator): | ||||
|                         2, 2,  | ||||
|                         [None, None, None]) | ||||
|         self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) | ||||
|         dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) | ||||
|         dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) | ||||
|         dt = min(dt_x, dt_y) | ||||
|         self.cfl_data.fill(dt, stream=self.stream) | ||||
|          | ||||
|         if dt == None: | ||||
|             dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) | ||||
|             dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) | ||||
|             self.dt = min(dt_x, dt_y) | ||||
|         else: | ||||
|             self.dt = dt | ||||
|          | ||||
|         self.cfl_data.fill(self.dt, stream=self.stream) | ||||
|                          | ||||
|          | ||||
|     def substep(self, dt, step_number): | ||||
| @ -123,7 +131,9 @@ class KP07 (Simulator.BaseSimulator): | ||||
|                 self.u1[0].data.gpudata, self.u1[0].data.strides[0],  | ||||
|                 self.u1[1].data.gpudata, self.u1[1].data.strides[0],  | ||||
|                 self.u1[2].data.gpudata, self.u1[2].data.strides[0], | ||||
|                 self.cfl_data.gpudata) | ||||
|                 self.cfl_data.gpudata, | ||||
|                 0, 0, | ||||
|                 self.nx, self.ny) | ||||
|         self.u0, self.u1 = self.u1, self.u0 | ||||
| 
 | ||||
|     def getOutput(self): | ||||
|  | ||||
| @ -46,7 +46,9 @@ class KP07_dimsplit(Simulator.BaseSimulator): | ||||
|                  theta=1.3,  | ||||
|                  cfl_scale=0.9, | ||||
|                  boundary_conditions=BoundaryCondition(),  | ||||
|                  block_width=16, block_height=16): | ||||
|                  block_width=16, block_height=16, | ||||
|                  dt: float=None, | ||||
|                  compile_opts: list[str]=[]): | ||||
|         """ | ||||
|         Initialization routine | ||||
|          | ||||
| @ -60,6 +62,7 @@ class KP07_dimsplit(Simulator.BaseSimulator): | ||||
|             dy: Grid cell spacing along y-axis (20 000 m) | ||||
|             dt: Size of each timestep (90 s) | ||||
|             g: Gravitational acceleration (9.81 m/s^2) | ||||
|             compile_opts: Pass a list of nvcc compiler options | ||||
|         """ | ||||
|                   | ||||
|         # Call super constructor | ||||
| @ -83,11 +86,11 @@ class KP07_dimsplit(Simulator.BaseSimulator): | ||||
|                                         },  | ||||
|                                         compile_args={ | ||||
|                                             'no_extern_c': True, | ||||
|                                             'options': ["--use_fast_math"],  | ||||
|                                             'options': ["--use_fast_math"] + compile_opts,  | ||||
|                                         },  | ||||
|                                         jit_compile_args={}) | ||||
|         self.kernel = module.get_function("KP07DimsplitKernel") | ||||
|         self.kernel.prepare("iifffffiiPiPiPiPiPiPiP") | ||||
|         self.kernel.prepare("iifffffiiPiPiPiPiPiPiPiiii") | ||||
|      | ||||
|         #Create data by uploading to device | ||||
|         self.u0 = Common.ArakawaA2D(self.stream,  | ||||
| @ -99,10 +102,15 @@ class KP07_dimsplit(Simulator.BaseSimulator): | ||||
|                         self.gc_x, self.gc_y,  | ||||
|                         [None, None, None]) | ||||
|         self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) | ||||
|         dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) | ||||
|         dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) | ||||
|         dt = min(dt_x, dt_y) | ||||
|         self.cfl_data.fill(dt, stream=self.stream) | ||||
| 
 | ||||
|         if dt == None: | ||||
|             dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) | ||||
|             dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) | ||||
|             self.dt = min(dt_x, dt_y) | ||||
|         else: | ||||
|             self.dt = dt | ||||
|          | ||||
|         self.cfl_data.fill(self.dt, stream=self.stream) | ||||
|      | ||||
|     def substep(self, dt, step_number): | ||||
|         self.substepDimsplit(dt*0.5, step_number) | ||||
| @ -121,7 +129,9 @@ class KP07_dimsplit(Simulator.BaseSimulator): | ||||
|                 self.u1[0].data.gpudata, self.u1[0].data.strides[0],  | ||||
|                 self.u1[1].data.gpudata, self.u1[1].data.strides[0],  | ||||
|                 self.u1[2].data.gpudata, self.u1[2].data.strides[0], | ||||
|                 self.cfl_data.gpudata) | ||||
|                 self.cfl_data.gpudata, | ||||
|                 0, 0, | ||||
|                 self.nx, self.ny) | ||||
|         self.u0, self.u1 = self.u1, self.u0 | ||||
| 
 | ||||
|     def getOutput(self): | ||||
|  | ||||
| @ -41,7 +41,9 @@ class LxF (Simulator.BaseSimulator): | ||||
|                  g: float,  | ||||
|                  cfl_scale: float=0.9, | ||||
|                  boundary_conditions=BoundaryCondition(), | ||||
|                  block_width: int=16, block_height: int=16): | ||||
|                  block_width: int=16, block_height: int=16, | ||||
|                  dt: float=None, | ||||
|                  compile_opts: list[str]=[]): | ||||
|         """ | ||||
|         Initialization routine | ||||
| 
 | ||||
| @ -55,6 +57,7 @@ class LxF (Simulator.BaseSimulator): | ||||
|             dy: Grid cell spacing along y-axis (20 000 m) | ||||
|             dt: Size of each timestep (90 s) | ||||
|             g: Gravitational acceleration (9.81 m/s^2) | ||||
|             compile_opts: Pass a list of nvcc compiler options | ||||
|         """ | ||||
|                   | ||||
|         # Call super constructor | ||||
| @ -75,11 +78,11 @@ class LxF (Simulator.BaseSimulator): | ||||
|                                         },  | ||||
|                                         compile_args={ | ||||
|                                             'no_extern_c': True, | ||||
|                                             'options': ["--use_fast_math"],  | ||||
|                                             'options': ["--use_fast_math"] + compile_opts,  | ||||
|                                         },  | ||||
|                                         jit_compile_args={}) | ||||
|         self.kernel = module.get_function("LxFKernel") | ||||
|         self.kernel.prepare("iiffffiPiPiPiPiPiPiP") | ||||
|         self.kernel.prepare("iiffffiPiPiPiPiPiPiPiiii") | ||||
| 
 | ||||
|         #Create data by uploading to device | ||||
|         self.u0 = Common.ArakawaA2D(self.stream,  | ||||
| @ -91,10 +94,15 @@ class LxF (Simulator.BaseSimulator): | ||||
|                         1, 1,  | ||||
|                         [None, None, None]) | ||||
|         self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) | ||||
|         dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) | ||||
|         dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) | ||||
|         dt = min(dt_x, dt_y) | ||||
|         self.cfl_data.fill(dt, stream=self.stream) | ||||
| 
 | ||||
|         if dt == None:   | ||||
|             dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) | ||||
|             dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))  | ||||
|             self.dt = min(dt_x, dt_y) | ||||
|         else: | ||||
|             self.dt = dt | ||||
|          | ||||
|         self.cfl_data.fill(self.dt, stream=self.stream) | ||||
|          | ||||
|     def substep(self, dt, step_number): | ||||
|         """ | ||||
| @ -113,7 +121,9 @@ class LxF (Simulator.BaseSimulator): | ||||
|                 self.u1[0].data.gpudata, self.u1[0].data.strides[0],  | ||||
|                 self.u1[1].data.gpudata, self.u1[1].data.strides[0],  | ||||
|                 self.u1[2].data.gpudata, self.u1[2].data.strides[0], | ||||
|                 self.cfl_data.gpudata) | ||||
|                 self.cfl_data.gpudata, | ||||
|                 0, 0, | ||||
|                 self.nx, self.ny) | ||||
|         self.u0, self.u1 = self.u1, self.u0 | ||||
|    | ||||
|     def getOutput(self): | ||||
|  | ||||
| @ -41,7 +41,9 @@ class WAF (Simulator.BaseSimulator): | ||||
|                  g,  | ||||
|                  cfl_scale=0.9, | ||||
|                  boundary_conditions=BoundaryCondition(),  | ||||
|                  block_width=16, block_height=16): | ||||
|                  block_width=16, block_height=16, | ||||
|                  dt: float=None, | ||||
|                  compile_opts: list[str]=[]): | ||||
|         """ | ||||
|         Initialization routine | ||||
| 
 | ||||
| @ -55,6 +57,7 @@ class WAF (Simulator.BaseSimulator): | ||||
|             dy: Grid cell spacing along y-axis (20 000 m) | ||||
|             dt: Size of each timestep (90 s) | ||||
|             g: Gravitational acceleration (9.81 m/s^2) | ||||
|             compile_opts: Pass a list of nvcc compiler options | ||||
|         """ | ||||
|                   | ||||
|         # Call super constructor | ||||
| @ -75,11 +78,11 @@ class WAF (Simulator.BaseSimulator): | ||||
|                                         },  | ||||
|                                         compile_args={ | ||||
|                                             'no_extern_c': True, | ||||
|                                             'options': ["--use_fast_math"],  | ||||
|                                             'options': ["--use_fast_math"] + compile_opts,  | ||||
|                                         },  | ||||
|                                         jit_compile_args={}) | ||||
|         self.kernel = module.get_function("WAFKernel") | ||||
|         self.kernel.prepare("iiffffiiPiPiPiPiPiPiP") | ||||
|         self.kernel.prepare("iiffffiiPiPiPiPiPiPiPiiii") | ||||
|      | ||||
|         #Create data by uploading to device | ||||
|         self.u0 = Common.ArakawaA2D(self.stream,  | ||||
| @ -91,10 +94,15 @@ class WAF (Simulator.BaseSimulator): | ||||
|                         2, 2,  | ||||
|                         [None, None, None]) | ||||
|         self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) | ||||
|         dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) | ||||
|         dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) | ||||
|         dt = min(dt_x, dt_y) | ||||
|         self.cfl_data.fill(dt, stream=self.stream) | ||||
|          | ||||
|         if dt == None: | ||||
|             dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) | ||||
|             dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) | ||||
|             self.dt = min(dt_x, dt_y) | ||||
|         else: | ||||
|             self.dt = dt | ||||
|          | ||||
|         self.cfl_data.fill(self.dt, stream=self.stream) | ||||
|      | ||||
|     def substep(self, dt, step_number): | ||||
|         self.substepDimsplit(dt*0.5, step_number) | ||||
| @ -112,7 +120,9 @@ class WAF (Simulator.BaseSimulator): | ||||
|                 self.u1[0].data.gpudata, self.u1[0].data.strides[0],  | ||||
|                 self.u1[1].data.gpudata, self.u1[1].data.strides[0],  | ||||
|                 self.u1[2].data.gpudata, self.u1[2].data.strides[0], | ||||
|                 self.cfl_data.gpudata) | ||||
|                 self.cfl_data.gpudata, | ||||
|                 0, 0, | ||||
|                 self.nx, self.ny) | ||||
|         self.u0, self.u1 = self.u1, self.u0 | ||||
| 
 | ||||
|     def getOutput(self): | ||||
|  | ||||
| @ -100,8 +100,18 @@ __global__ void FORCEKernel( | ||||
|         float* hv1_ptr_, int hv1_pitch_, | ||||
|          | ||||
|         //Output CFL | ||||
|         float* cfl_) { | ||||
|      | ||||
|         float* cfl_, | ||||
| 
 | ||||
|         //Subarea of internal domain to compute | ||||
|         int x0=0, int y0=0, | ||||
|         int x1=0, int y1=0) { | ||||
| 
 | ||||
|     if(x1 == 0) | ||||
|         x1 = nx_; | ||||
| 
 | ||||
|     if(y1 == 0) | ||||
|         y1 = ny_;  | ||||
| 
 | ||||
|     const unsigned int w = BLOCK_WIDTH; | ||||
|     const unsigned int h = BLOCK_HEIGHT; | ||||
|     const unsigned int gc_x = 1; | ||||
| @ -112,9 +122,9 @@ __global__ void FORCEKernel( | ||||
|     __shared__ float F[vars][h+2*gc_y][w+2*gc_x]; | ||||
|      | ||||
|     //Read into shared memory | ||||
|     readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|     readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|     readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|     __syncthreads(); | ||||
|      | ||||
|     //Compute flux along x, and evolve | ||||
| @ -130,9 +140,9 @@ __global__ void FORCEKernel( | ||||
|     __syncthreads(); | ||||
|      | ||||
|     //Write to main memory | ||||
|     writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1); | ||||
|     writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1); | ||||
|      | ||||
|     //Compute the CFL for this block | ||||
|     if (cfl_ != NULL) { | ||||
|  | ||||
| @ -116,7 +116,17 @@ __global__ void HLLKernel( | ||||
|         float* hv1_ptr_, int hv1_pitch_, | ||||
|          | ||||
|         //Output CFL | ||||
|         float* cfl_) { | ||||
|         float* cfl_, | ||||
| 
 | ||||
|         //Subarea of internal domain to compute | ||||
|         int x0=0, int y0=0, | ||||
|         int x1=0, int y1=0) { | ||||
| 
 | ||||
|     if(x1 == 0) | ||||
|         x1 = nx_; | ||||
| 
 | ||||
|     if(y1 == 0) | ||||
|         y1 = ny_; | ||||
|      | ||||
|     const unsigned int w = BLOCK_WIDTH; | ||||
|     const unsigned int h = BLOCK_HEIGHT; | ||||
| @ -129,9 +139,9 @@ __global__ void HLLKernel( | ||||
|     __shared__ float F[vars][h+2*gc_y][w+2*gc_x]; | ||||
|      | ||||
|     //Read into shared memory | ||||
|     readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|     readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|     readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|      | ||||
|     //Compute F flux | ||||
|     computeFluxF(Q, F, g_); | ||||
| @ -148,9 +158,9 @@ __global__ void HLLKernel( | ||||
|     __syncthreads(); | ||||
|      | ||||
|     // Write to main memory for all internal cells | ||||
|     writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1); | ||||
|     writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1); | ||||
|      | ||||
|     //Compute the CFL for this block | ||||
|     if (cfl_ != NULL) { | ||||
|  | ||||
| @ -144,7 +144,17 @@ __global__ void HLL2Kernel( | ||||
|         float* hv1_ptr_, int hv1_pitch_, | ||||
|          | ||||
|         //Output CFL | ||||
|         float* cfl_) { | ||||
|         float* cfl_, | ||||
| 
 | ||||
|         //Subarea of internal domain to compute | ||||
|         int x0=0, int y0=0, | ||||
|         int x1=0, int y1=0) { | ||||
| 
 | ||||
|     if(x1 == 0) | ||||
|         x1 = nx_; | ||||
| 
 | ||||
|     if(y1 == 0) | ||||
|         y1 = ny_; | ||||
|      | ||||
|     const unsigned int w = BLOCK_WIDTH; | ||||
|     const unsigned int h = BLOCK_HEIGHT; | ||||
| @ -158,9 +168,9 @@ __global__ void HLL2Kernel( | ||||
|     __shared__ float  F[3][h+4][w+4]; | ||||
|      | ||||
|     //Read into shared memory | ||||
|     readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|     readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|     readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|      | ||||
|     //Step 0 => evolve x first, then y | ||||
|     if (step_ == 0) { | ||||
| @ -203,9 +213,9 @@ __global__ void HLL2Kernel( | ||||
|      | ||||
|      | ||||
|     // Write to main memory for all internal cells | ||||
|     writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1); | ||||
|     writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1); | ||||
|      | ||||
|     //Compute the CFL for this block | ||||
|     if (cfl_ != NULL) { | ||||
|  | ||||
| @ -154,7 +154,18 @@ __global__ void KP07Kernel( | ||||
|         float* hv1_ptr_, int hv1_pitch_, | ||||
|          | ||||
|         //Output CFL | ||||
|         float* cfl_) { | ||||
|         float* cfl_, | ||||
| 
 | ||||
|         //Subarea of internal domain to compute | ||||
|         int x0=0, int y0=0, | ||||
|         int x1=0, int y1=0) { | ||||
| 
 | ||||
|     if(x1 == 0) | ||||
|         x1 = nx_; | ||||
| 
 | ||||
|     if(y1 == 0) | ||||
|         y1 = ny_; | ||||
| 
 | ||||
|     const unsigned int w = BLOCK_WIDTH; | ||||
|     const unsigned int h = BLOCK_HEIGHT; | ||||
|     const unsigned int gc_x = 2; | ||||
| @ -179,9 +190,9 @@ __global__ void KP07Kernel( | ||||
|      | ||||
|      | ||||
|     //Read into shared memory | ||||
|     readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|     readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|     readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|      | ||||
|      | ||||
|     //Reconstruct slopes along x and y axis | ||||
|  | ||||
| @ -141,7 +141,18 @@ __global__ void KP07DimsplitKernel( | ||||
|         float* hv1_ptr_, int hv1_pitch_,  | ||||
|          | ||||
|         //Output CFL | ||||
|         float* cfl_) { | ||||
|         float* cfl_, | ||||
| 
 | ||||
|         //Subarea of internal domain to compute | ||||
|         int x0=0, int y0=0, | ||||
|         int x1=0, int y1=0) { | ||||
| 
 | ||||
|     if(x1 == 0) | ||||
|         x1 = nx_; | ||||
| 
 | ||||
|     if(y1 == 0) | ||||
|         y1 = ny_; | ||||
| 
 | ||||
|     const unsigned int w = BLOCK_WIDTH; | ||||
|     const unsigned int h = BLOCK_HEIGHT; | ||||
|     const unsigned int gc_x = 2; | ||||
| @ -154,9 +165,9 @@ __global__ void KP07DimsplitKernel( | ||||
|     __shared__ float  F[vars][h+2*gc_y][w+2*gc_x]; | ||||
|      | ||||
|     //Read into shared memory | ||||
|     readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|     readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|     readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|      | ||||
|     if (step_ == 0) { | ||||
|         //Along X | ||||
| @ -194,9 +205,9 @@ __global__ void KP07DimsplitKernel( | ||||
|     } | ||||
|      | ||||
|     // Write to main memory for all internal cells | ||||
|     writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1); | ||||
|     writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1); | ||||
|      | ||||
|     //Compute the CFL for this block | ||||
|     if (cfl_ != NULL) { | ||||
|  | ||||
| @ -116,8 +116,18 @@ void LxFKernel( | ||||
|         float* hu1_ptr_, int hu1_pitch_, | ||||
|         float* hv1_ptr_, int hv1_pitch_, | ||||
|          | ||||
|         //Output CFL | ||||
|         float* cfl_) { | ||||
|        //Output CFL | ||||
|        float* cfl_, | ||||
| 
 | ||||
|        //Subarea of internal domain to compute | ||||
|        int x0=0, int y0=0, | ||||
|        int x1=0, int y1=0) { | ||||
| 
 | ||||
|     if(x1 == 0) | ||||
|         x1 = nx_; | ||||
| 
 | ||||
|     if(y1 == 0) | ||||
|         y1 = ny_; | ||||
|      | ||||
|     const unsigned int w = BLOCK_WIDTH; | ||||
|     const unsigned int h = BLOCK_HEIGHT; | ||||
| @ -130,9 +140,9 @@ void LxFKernel( | ||||
|     __shared__ float G[vars][h+1][w  ]; | ||||
|      | ||||
|     //Read from global memory | ||||
|     readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|     readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|     readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|      | ||||
|     //Compute fluxes along the x and y axis | ||||
|     computeFluxF<w, h>(Q, F, g_, dx_, dt_); | ||||
| @ -154,9 +164,9 @@ void LxFKernel( | ||||
|     __syncthreads(); | ||||
| 
 | ||||
|     //Write to main memory | ||||
|     writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1); | ||||
|     writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1); | ||||
|      | ||||
|     //Compute the CFL for this block | ||||
|     if (cfl_ != NULL) { | ||||
|  | ||||
| @ -116,7 +116,17 @@ __global__ void WAFKernel( | ||||
|         //Output h^{n+1} | ||||
|         float* h1_ptr_, int h1_pitch_, | ||||
|         float* hu1_ptr_, int hu1_pitch_, | ||||
|         float* hv1_ptr_, int hv1_pitch_) {    | ||||
|         float* hv1_ptr_, int hv1_pitch_, | ||||
|   | ||||
|         //Subarea of internal domain to compute | ||||
|         int x0=0, int y0=0, | ||||
|         int x1=0, int y1=0) { | ||||
|   | ||||
|     if(x1 == 0) | ||||
|         x1 = nx_; | ||||
| 
 | ||||
|     if(y1 == 0) | ||||
|         y1 = ny_;   | ||||
|              | ||||
|     const unsigned int w = BLOCK_WIDTH; | ||||
|     const unsigned int h = BLOCK_HEIGHT; | ||||
| @ -131,9 +141,9 @@ __global__ void WAFKernel( | ||||
|      | ||||
|      | ||||
|     //Read into shared memory Q from global memory | ||||
|     readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y,  1,  1>( h0_ptr_,  h0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|     readBlock<w, h, gc_x, gc_y, -1,  1>(hu0_ptr_, hu0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|     readBlock<w, h, gc_x, gc_y,  1, -1>(hv0_ptr_, hv0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|     __syncthreads(); | ||||
|      | ||||
|      | ||||
| @ -170,9 +180,9 @@ __global__ void WAFKernel( | ||||
| 
 | ||||
|      | ||||
|     // Write to main memory for all internal cells | ||||
|     writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1); | ||||
|     writeBlock<w, h, gc_x, gc_y>( h1_ptr_,  h1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1); | ||||
| } | ||||
| 
 | ||||
| } // extern "C" | ||||
| @ -477,10 +477,11 @@ __device__ float3 FORCE_1D_flux(const float3 Q_l, const float3 Q_r, const float | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| template<int w, int h, int gc_x, int gc_y, int vars> | ||||
| __device__ void writeCfl(float Q[vars][h+2*gc_y][w+2*gc_x], | ||||
|         float shmem[h+2*gc_y][w+2*gc_x], | ||||
| // TODO: give the `Q_w` and `Q_h` template parameters better names,
 | ||||
| // as the current names probably do not reflect what they represent.
 | ||||
| template<int Q_w, int Q_h, int gc_x, int gc_y, int vars> | ||||
| __device__ void writeCfl(float Q[vars][Q_h+2*gc_y][Q_w+2*gc_x], | ||||
|         float shmem[Q_h+2*gc_y][Q_w+2*gc_x], | ||||
|         const int nx_, const int ny_, | ||||
|         const float dx_, const float dy_, const float g_, | ||||
|         float* output_) { | ||||
| @ -509,7 +510,7 @@ __device__ void writeCfl(float Q[vars][h+2*gc_y][w+2*gc_x], | ||||
|     if (ti < nx_+gc_x && tj < ny_+gc_y) { | ||||
|         if (ty == gc_y) { | ||||
|             float min_val = shmem[ty][tx]; | ||||
|             const int max_y = min(h, ny_+gc_y - tj); | ||||
|             const int max_y = min(Q_h, ny_+gc_y - tj); | ||||
|             for (int j=gc_y; j<max_y+gc_y; j++) { | ||||
|                 min_val = fminf(min_val, shmem[j][tx]); | ||||
|             } | ||||
| @ -521,7 +522,7 @@ __device__ void writeCfl(float Q[vars][h+2*gc_y][w+2*gc_x], | ||||
|     //One thread loops over first row to find global min
 | ||||
|     if (tx == gc_x && ty == gc_y) { | ||||
|         float min_val = shmem[ty][tx]; | ||||
|         const int max_x = min(w, nx_+gc_x - ti); | ||||
|         const int max_x = min(Q_w, nx_+gc_x - ti); | ||||
|         for (int i=gc_x; i<max_x+gc_x; ++i) { | ||||
|             min_val = fminf(min_val, shmem[ty][i]); | ||||
|         } | ||||
|  | ||||
| @ -322,8 +322,8 @@ inline __device__ void readBlock(float* ptr_, int pitch_, | ||||
|                 float Q[h+2*gc_y][w+2*gc_x],  | ||||
|                 const int nx_, const int ny_, | ||||
|                 const int boundary_conditions_, | ||||
|                  int x0, int y0, | ||||
|                  int x1, int y1) { | ||||
|                 int x0, int y0, | ||||
|                 int x1, int y1) { | ||||
|     //Index of block within domain
 | ||||
|     const int bx = blockDim.x * blockIdx.x; | ||||
|     const int by = blockDim.y * blockIdx.y; | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Anthony Berg
						Anthony Berg