feat: add handlers to run models

Anthony Berg 2025-07-01 19:47:44 +02:00
parent dc40972878
commit 78564e1186
15 changed files with 870 additions and 223 deletions

View File

@ -24,7 +24,6 @@ import gc
from IPython.core import magic_arguments
from IPython.core.magic import line_magic, Magics, magics_class
import pycuda.driver as cuda
from GPUSimulators.common import IPEngine
from GPUSimulators.gpu import KernelContext
@ -43,14 +42,15 @@ class MagicCudaContext(Magics):
@magic_arguments.argument(
'--no_autotuning', '-na', action="store_true", help='Disable autotuning of kernels')
def cuda_context_handler(self, line):
# import pycuda.driver as cuda
args = magic_arguments.parse_argstring(self.cuda_context_handler, line)
self.logger = logging.getLogger(__name__)
self.logger.info(f"Registering {args.name} in user workspace")
context_flags = None
if args.blocking:
context_flags = cuda.ctx_flags.SCHED_BLOCKING_SYNC
# if args.blocking:
# context_flags = cuda.ctx_flags.SCHED_BLOCKING_SYNC
if args.name in self.shell.user_ns.keys():
self.logger.debug("Context already registered! Ignoring")
@ -65,19 +65,19 @@ class MagicCudaContext(Magics):
# this function will be called on exceptions in any cell
def custom_exc(shell, etype, evalue, tb, tb_offset=None):
self.logger.exception(f"Exception caught: Resetting to CUDA context {args.name}")
while cuda.Context.get_current() is not None:
context = cuda.Context.get_current()
self.logger.info(f"Popping <{str(context.handle)}>")
cuda.Context.pop()
# while cuda.Context.get_current() is not None:
# context = cuda.Context.get_current()
# self.logger.info(f"Popping <{str(context.handle)}>")
# cuda.Context.pop()
if args.name in self.shell.user_ns.keys():
self.logger.info(f"Pushing <{str(self.shell.user_ns[args.name].cuda_context.handle)}>")
self.shell.user_ns[args.name].cuda_context.push()
else:
self.logger.error(f"No CUDA context called {args.name} found (something is wrong)")
self.logger.error("CUDA will not work now")
self.logger.debug("==================================================================")
# if args.name in self.shell.user_ns.keys():
# self.logger.info(f"Pushing <{str(self.shell.user_ns[args.name].context.handle)}>")
# self.shell.user_ns[args.name].context.push()
# else:
# self.logger.error(f"No CUDA context called {args.name} found (something is wrong)")
# self.logger.error("CUDA will not work now")
#
# self.logger.debug("==================================================================")
# still show the error within the notebook, don't just swallow it
shell.showtraceback((etype, evalue, tb), tb_offset=tb_offset)
@ -89,10 +89,10 @@ class MagicCudaContext(Magics):
import atexit
def exitfunc():
self.logger.info("Exitfunc: Resetting CUDA context stack")
while cuda.Context.get_current() is not None:
context = cuda.Context.get_current()
self.logger.info(f"`-> Popping <{str(context.handle)}>")
cuda.Context.pop()
# while cuda.Context.get_current() is not None:
# context = cuda.Context.get_current()
# self.logger.info(f"`-> Popping <{str(context.handle)}>")
# cuda.Context.pop()
self.logger.debug("==================================================================")
atexit.register(exitfunc)

View File

@ -4,5 +4,7 @@ __env_name = 'GPU_LANG'
if __env_name in environ and environ.get(__env_name).lower() == "cuda":
from .cuda_context import CudaContext as KernelContext
from .cuda_handler import CudaHandler as GPUHandler
else:
from .hip_context import HIPContext as KernelContext
from .hip_context import HIPContext as KernelContext
from .hip_handler import HIPHandler as GPUHandler
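
The backend is fixed once at import time by the GPU_LANG environment variable. A minimal usage sketch (assuming the variable is set before GPUSimulators is first imported):

import os
os.environ["GPU_LANG"] = "cuda"  # any other value, or unset, selects the HIP backend

# KernelContext and GPUHandler now resolve to the CUDA implementations
from GPUSimulators.gpu import KernelContext, GPUHandler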

View File

@ -0,0 +1,30 @@
import numpy as np
from pycuda import gpuarray
from GPUSimulators.gpu.handler import BaseGPUHandler
from GPUSimulators.gpu import KernelContext
class CudaHandler(BaseGPUHandler):
def __init__(self, context: KernelContext, module, function, arguments,
grid_size):
super().__init__(context, module, function, arguments, grid_size)
self.arguments = arguments
self.kernel = module.get_function(function)
self.kernel.prepare(arguments)
self.cfl_data = gpuarray.GPUArray(grid_size, dtype=np.float32)
def prepared_call(self, grid_size, block_size, stream, args: list):
# if len(args) != len(self.arguments):
# raise ValueError("The parameters do not match the defined arguments.")
self.kernel.prepared_async_call(grid_size, block_size, stream, *args)
def array_fill(self, data, stream):
self.cfl_data.fill(data, stream=stream)
def array_min(self, stream):
return gpuarray.min(self.cfl_data, stream=stream).get()

View File

@ -0,0 +1,54 @@
from GPUSimulators.gpu import KernelContext
class BaseGPUHandler(object):
"""
A handler to make GPU calls.
"""
def __init__(self, context: KernelContext, module, function: str, arguments: str,
grid_size: tuple[int, int]):
"""
Create a new GPU handler.
Args:
context: The KernelContext that is used to make the calls to the kernel.
module: The module created from KernelContext for a function.
function: Name of the function to use in the kernel.
arguments: A string of the argument types to pass to the kernel.
grid_size: The shape of the array that holds the simulation's CFL data.
"""
def prepared_call(self, grid_size, block_size: tuple[int, int, int], stream, args: list):
"""
Makes a call to the kernel on the GPU with the function that was used to initialize this object.
Args:
grid_size: The size of the grid to run the computation over.
block_size: The block size, as a tuple.
stream: The GPU data stream.
args: Parameters to be passed into the GPU kernel.
"""
raise NotImplementedError("This function needs to be implemented in a subclass.")
def array_fill(self, data: float, stream):
"""
Fills the entire array with the value passed as the parameter.
Args:
data: The data to fill the array with.
stream: The GPU data stream.
"""
raise NotImplementedError("This function needs to be implemented in a subclass.")
def array_min(self, stream) -> float:
"""
Gets the minimum value in the array stored in the handler.
Args:
stream: The GPU data stream.
Returns:
The minimum value in the stored array as a float.
"""
raise NotImplementedError("This function needs to be implemented in a subclass.")
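
A minimal sketch of the intended lifecycle, mirroring how the simulators below use the handler (the names context, module, stream, initial_dt, args, grid_size and block_size are assumed to come from the usual KernelContext and simulator setup):

# Build a handler bound to one kernel function and its argument signature,
# then drive a simulation step with it.
handler = GPUHandler(context, module, "FORCEKernel", "iiffffiPiPiPiPiPiPiPiiii", grid_size)
handler.array_fill(initial_dt, stream)  # seed the CFL buffer with the starting timestep
handler.prepared_call(grid_size, block_size, stream, args)  # args is passed as a list
max_dt = handler.array_min(stream)  # reduce the CFL buffer to the smallest timestep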

View File

@ -0,0 +1,70 @@
import ctypes
import numpy as np
from hip import hip, hipblas
from GPUSimulators.common import hip_check
from GPUSimulators.gpu.handler import BaseGPUHandler
from GPUSimulators.gpu import KernelContext
class HIPHandler(BaseGPUHandler):
def __init__(self, context: KernelContext, module, function, arguments,
grid_size):
super().__init__(context, module, function, arguments, grid_size)
self.kernel = hip_check(hip.hipModuleGetFunction(module, bytes(function, "utf-8")))
self.context = context
self.dtype = np.float32
self.cfl_data_h = np.empty(grid_size, dtype=self.dtype)
self.num_bytes = self.cfl_data_h.size * self.cfl_data_h.itemsize
self.cfl_data = hip_check(hip.hipMalloc(self.num_bytes)).configure(
typestr=np.finfo(self.dtype).dtype.name, shape=grid_size
)
def __del__(self):
hip_check(hip.hipFree(self.cfl_data))
def prepared_call(self, grid_size, block_size, stream, args):
if len(grid_size) < 3:
grid_size = (*grid_size, 1)
for i in range(len(args)):
val = args[i]
if isinstance(val, int) or isinstance(val, np.int32):
args[i] = ctypes.c_int(val)
elif isinstance(val, float) or isinstance(val, np.float32):
args[i] = ctypes.c_float(val)
args = tuple(args)
hip_check(hip.hipModuleLaunchKernel(
self.kernel,
*grid_size,
*block_size,
0,
stream,
None,
args
))
def array_fill(self, data, stream):
self.cfl_data_h.fill(data)
hip_check(
hip.hipMemcpyAsync(self.cfl_data, self.cfl_data_h, self.num_bytes, hip.hipMemcpyKind.hipMemcpyHostToDevice,
stream))
def array_min(self, stream):
# hipblasIsamin returns the 1-based index of the element with the smallest
# absolute value (the CFL data is positive, so this is the minimum), not the
# value itself, so the element has to be fetched from the device afterwards.
handle = hip_check(hipblas.hipblasCreate())
index = np.empty(1, np.int32)
hip_check(hipblas.hipblasIsamin(handle, self.cfl_data.size, self.cfl_data, 1, index))
hip_check(hipblas.hipblasDestroy(handle))
hip_check(hip.hipMemcpy(self.cfl_data_h, self.cfl_data, self.num_bytes, hip.hipMemcpyKind.hipMemcpyDeviceToHost))
return self.cfl_data_h.flat[index[0] - 1]
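
Unlike PyCUDA's prepared calls, hip.hipModuleLaunchKernel takes its kernel parameters as raw ctypes values, which is why prepared_call wraps Python and NumPy scalars before launching. The same conversion in isolation (a sketch, no GPU required):

import ctypes

import numpy as np

def to_ctypes(args):
    # Mirrors the conversion loop in HIPHandler.prepared_call: plain and NumPy
    # scalars become c_int/c_float; everything else (e.g. device pointers) is
    # passed through unchanged.
    converted = []
    for val in args:
        if isinstance(val, (int, np.int32)):
            converted.append(ctypes.c_int(val))
        elif isinstance(val, (float, np.float32)):
            converted.append(ctypes.c_float(val))
        else:
            converted.append(val)
    return tuple(converted)

print(to_ctypes([16, 0.5]))  # -> (c_int(16), c_float(0.5))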

View File

@ -21,10 +21,10 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators.common import ArakawaA2D
from GPUSimulators.simulator import BaseSimulator, BoundaryCondition
from GPUSimulators.gpu import GPUHandler
class EE2DKP07Dimsplit(BaseSimulator):
@ -43,7 +43,7 @@ class EE2DKP07Dimsplit(BaseSimulator):
cfl_scale=0.9,
boundary_conditions=BoundaryCondition(),
block_width=16, block_height=8,
compile_opts: list[str] = []):
compile_opts: list[str] = []):
"""
Initialization routine
@ -89,8 +89,8 @@ class EE2DKP07Dimsplit(BaseSimulator):
'hip': compile_opts,
},
jit_compile_args={})
self.kernel = module.get_function("KP07DimsplitKernel")
self.kernel.prepare("iiffffffiiPiPiPiPiPiPiPiPiPiiii")
self.handler = GPUHandler(context, module, "KP07DimsplitKernel", "iiffffffiiPiPiPiPiPiPiPiPiPiiii",
self.grid_size)
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
@ -101,11 +101,11 @@ class EE2DKP07Dimsplit(BaseSimulator):
nx, ny,
2, 2,
[None, None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
dt_x = np.min(self.dx / (np.abs(rho_u / rho) + np.sqrt(gamma * rho)))
dt_y = np.min(self.dy / (np.abs(rho_v / rho) + np.sqrt(gamma * rho)))
self.dt = min(dt_x, dt_y)
self.cfl_data.fill(self.dt, stream=self.stream)
self.handler.array_fill(self.dt, self.stream)
def substep(self, dt, step_number, external=True, internal=True):
self.substep_dimsplit(0.5 * dt, step_number, external, internal)
@ -114,25 +114,25 @@ class EE2DKP07Dimsplit(BaseSimulator):
if external and internal:
# print("COMPLETE DOMAIN (dt=" + str(dt) + ")")
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.handler.prepared_call(self.grid_size, self.block_size, self.stream,
[self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data, self.u0[0].get_strides()[0],
self.u0[1].data, self.u0[1].get_strides()[0],
self.u0[2].data, self.u0[2].get_strides()[0],
self.u0[3].data, self.u0[3].get_strides()[0],
self.u1[0].data, self.u1[0].get_strides()[0],
self.u1[1].data, self.u1[1].get_strides()[0],
self.u1[2].data, self.u1[2].get_strides()[0],
self.u1[3].data, self.u1[3].get_strides()[0],
self.handler.cfl_data,
0, 0,
self.nx, self.ny])
return
if external and not internal:
@ -145,121 +145,121 @@ class EE2DKP07Dimsplit(BaseSimulator):
# NORTH
# (x0, y0) x (x1, y1)
# (0, ny-y_halo) x (nx, ny)
self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.handler.prepared_call(ns_grid_size, self.block_size, self.stream,
[self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
self.u0[0].data, self.u0[0].get_strides()[0],
self.u0[1].data, self.u0[1].get_strides()[0],
self.u0[2].data, self.u0[2].get_strides()[0],
self.u0[3].data, self.u0[3].get_strides()[0],
self.u1[0].data, self.u1[0].get_strides()[0],
self.u1[1].data, self.u1[1].get_strides()[0],
self.u1[2].data, self.u1[2].get_strides()[0],
self.u1[3].data, self.u1[3].get_strides()[0],
self.handler.cfl_data,
0, self.ny - int(self.u0[0].y_halo),
self.nx, self.ny)
self.nx, self.ny])
# SOUTH
# (x0, y0) x (x1, y1)
# (0, 0) x (nx, y_halo)
self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.handler.prepared_call(ns_grid_size, self.block_size, self.stream,
[self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
self.u0[0].data, self.u0[0].get_strides()[0],
self.u0[1].data, self.u0[1].get_strides()[0],
self.u0[2].data, self.u0[2].get_strides()[0],
self.u0[3].data, self.u0[3].get_strides()[0],
self.u1[0].data, self.u1[0].get_strides()[0],
self.u1[1].data, self.u1[1].get_strides()[0],
self.u1[2].data, self.u1[2].get_strides()[0],
self.u1[3].data, self.u1[3].get_strides()[0],
self.handler.cfl_data,
0, 0,
self.nx, int(self.u0[0].y_halo))
self.nx, int(self.u0[0].y_halo)])
we_grid_size = (1, self.grid_size[1])
# WEST
# (x0, y0) x (x1, y1)
# (0, 0) x (x_halo, ny)
self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.handler.prepared_call(we_grid_size, self.block_size, self.stream,
[self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
self.u0[0].data, self.u0[0].get_strides()[0],
self.u0[1].data, self.u0[1].get_strides()[0],
self.u0[2].data, self.u0[2].get_strides()[0],
self.u0[3].data, self.u0[3].get_strides()[0],
self.u1[0].data, self.u1[0].get_strides()[0],
self.u1[1].data, self.u1[1].get_strides()[0],
self.u1[2].data, self.u1[2].get_strides()[0],
self.u1[3].data, self.u1[3].get_strides()[0],
self.handler.cfl_data,
0, 0,
int(self.u0[0].x_halo), self.ny)
int(self.u0[0].x_halo), self.ny])
# EAST
# (x0, y0) x (x1, y1)
# (nx-x_halo, 0) x (nx, ny)
self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.handler.prepared_call(we_grid_size, self.block_size, self.stream,
[self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
self.u0[0].data, self.u0[0].get_strides()[0],
self.u0[1].data, self.u0[1].get_strides()[0],
self.u0[2].data, self.u0[2].get_strides()[0],
self.u0[3].data, self.u0[3].get_strides()[0],
self.u1[0].data, self.u1[0].get_strides()[0],
self.u1[1].data, self.u1[1].get_strides()[0],
self.u1[2].data, self.u1[2].get_strides()[0],
self.u1[3].data, self.u1[3].get_strides()[0],
self.handler.cfl_data,
self.nx - int(self.u0[0].x_halo), 0,
self.nx, self.ny)
self.nx, self.ny])
return
if internal and not external:
# INTERNAL DOMAIN
# (x0, y0) x (x1, y1)
# (x_halo, y_halo) x (nx - x_halo, ny - y_halo)
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.internal_stream,
self.nx, self.ny,
self.handler.prepared_call(self.grid_size, self.block_size, self.internal_stream,
[self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
self.u0[0].data, self.u0[0].get_strides()[0],
self.u0[1].data, self.u0[1].get_strides()[0],
self.u0[2].data, self.u0[2].get_strides()[0],
self.u0[3].data, self.u0[3].get_strides()[0],
self.u1[0].data, self.u1[0].get_strides()[0],
self.u1[1].data, self.u1[1].get_strides()[0],
self.u1[2].data, self.u1[2].get_strides()[0],
self.u1[3].data, self.u1[3].get_strides()[0],
self.handler.cfl_data,
int(self.u0[0].x_halo), int(self.u0[0].y_halo),
self.nx - int(self.u0[0].x_halo), self.ny - int(self.u0[0].y_halo))
self.nx - int(self.u0[0].x_halo), self.ny - int(self.u0[0].y_halo)])
return
def swap_buffers(self):
@ -275,5 +275,5 @@ class EE2DKP07Dimsplit(BaseSimulator):
return
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
max_dt = self.handler.array_min(stream=self.stream)
return max_dt * 0.5
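
Every simulator in this commit follows the same mechanical refactor; condensed, the pattern is (illustrative only, argument list abbreviated with ...):

# before: PyCUDA-only prepared call with positional arguments and raw device pointers
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
                                self.nx, self.ny, ..., self.cfl_data.gpudata, ...)

# after: backend-neutral handler call; the arguments travel as a list so each
# backend can convert them (prepared args for CUDA, ctypes values for HIP)
self.handler.prepared_call(self.grid_size, self.block_size, self.stream,
                           [self.nx, self.ny, ..., self.handler.cfl_data, ...])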

View File

@ -22,10 +22,10 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators.common import ArakawaA2D
from GPUSimulators.simulator import BaseSimulator, BoundaryCondition
from GPUSimulators.gpu import GPUHandler
class Force(BaseSimulator):
@ -85,8 +85,7 @@ class Force(BaseSimulator):
'hip': compile_opts,
},
jit_compile_args={})
self.kernel = module.get_function("FORCEKernel")
self.kernel.prepare("iiffffiPiPiPiPiPiPiPiiii")
self.handler = GPUHandler(context, module, "FORCEKernel", "iiffffiPiPiPiPiPiPiPiiii", self.grid_size)
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
@ -97,7 +96,6 @@ class Force(BaseSimulator):
nx, ny,
1, 1,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
if dt is None:
dt_x = np.min(self.dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
@ -106,23 +104,23 @@ class Force(BaseSimulator):
else:
self.dt = dt
self.cfl_data.fill(self.dt, stream=self.stream)
self.handler.array_fill(self.dt, self.stream)
def substep(self, dt, step_number):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.handler.prepared_call(self.grid_size, self.block_size, self.stream,
[self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
self.u0[0].data, self.u0[0].get_strides()[0],
self.u0[1].data, self.u0[1].get_strides()[0],
self.u0[2].data, self.u0[2].get_strides()[0],
self.u1[0].data, self.u1[0].get_strides()[0],
self.u1[1].data, self.u1[1].get_strides()[0],
self.u1[2].data, self.u1[2].get_strides()[0],
self.handler.cfl_data,
0, 0,
self.nx, self.ny)
self.nx, self.ny])
self.u0, self.u1 = self.u1, self.u0
@ -134,5 +132,5 @@ class Force(BaseSimulator):
self.u1.check()
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
max_dt = self.handler.array_min(self.stream)
return max_dt

View File

@ -18,13 +18,15 @@ GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import ctypes
import time
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators.common import ArakawaA2D
from GPUSimulators.common.arrays import ArakawaA2D
from GPUSimulators.simulator import BaseSimulator, BoundaryCondition
from GPUSimulators.gpu import GPUHandler
class HLL(BaseSimulator):
@ -84,8 +86,7 @@ class HLL(BaseSimulator):
'hip': compile_opts,
},
jit_compile_args={})
self.kernel = module.get_function("HLLKernel")
self.kernel.prepare("iiffffiPiPiPiPiPiPiPiiii")
self.handler = GPUHandler(context, module, "HLLKernel", "iiffffiPiPiPiPiPiPiPiiii", self.grid_size)
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
@ -96,7 +97,6 @@ class HLL(BaseSimulator):
nx, ny,
1, 1,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
if dt is None:
dt_x = np.min(self.dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
dt_y = np.min(self.dy / (np.abs(hv0 / h0) + np.sqrt(g * h0)))
@ -104,23 +104,23 @@ class HLL(BaseSimulator):
else:
self.dt = dt
self.cfl_data.fill(self.dt, stream=self.stream)
self.handler.array_fill(self.dt, self.stream)
def substep(self, dt, step_number):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.handler.prepared_call(self.grid_size, self.block_size, self.stream,
[self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.boundary_conditions,
self.u0[0].data, self.u0[0].get_strides()[0],
self.u0[1].data, self.u0[1].get_strides()[0],
self.u0[2].data, self.u0[2].get_strides()[0],
self.u1[0].data, self.u1[0].get_strides()[0],
self.u1[1].data, self.u1[1].get_strides()[0],
self.u1[2].data, self.u1[2].get_strides()[0],
self.handler.cfl_data,
0, 0,
self.nx, self.ny])
self.u0, self.u1 = self.u1, self.u0
def get_output(self):
@ -131,5 +131,5 @@ class HLL(BaseSimulator):
self.u1.check()
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
max_dt = self.handler.array_min(self.stream)
return max_dt * 0.5

View File

@ -21,10 +21,10 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators.common import ArakawaA2D
from GPUSimulators.simulator import BaseSimulator, BoundaryCondition
from GPUSimulators.gpu import GPUHandler
class HLL2(BaseSimulator):
@ -86,8 +86,7 @@ class HLL2(BaseSimulator):
'hip': compile_opts,
},
jit_compile_args={})
self.kernel = module.get_function("HLL2Kernel")
self.kernel.prepare("iifffffiiPiPiPiPiPiPiPiiii")
self.handler = GPUHandler(context, module, "HLL2Kernel", "iifffffiiPiPiPiPiPiPiPiiii", self.grid_size)
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
@ -98,7 +97,6 @@ class HLL2(BaseSimulator):
nx, ny,
2, 2,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
if dt is None:
dt_x = np.min(self.dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
@ -107,28 +105,28 @@ class HLL2(BaseSimulator):
else:
self.dt = dt
self.cfl_data.fill(self.dt, stream=self.stream)
self.handler.array_fill(self.dt, self.stream)
def substep(self, dt, step_number):
self.substep_dimsplit(dt * 0.5, step_number)
def substep_dimsplit(self, dt, substep):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.handler.prepared_call(self.grid_size, self.block_size, self.stream,
[self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
self.u0[0].data, self.u0[0].get_strides()[0],
self.u0[1].data, self.u0[1].get_strides()[0],
self.u0[2].data, self.u0[2].get_strides()[0],
self.u1[0].data, self.u1[0].get_strides()[0],
self.u1[1].data, self.u1[1].get_strides()[0],
self.u1[2].data, self.u1[2].get_strides()[0],
self.handler.cfl_data,
0, 0,
self.nx, self.ny)
self.nx, self.ny])
self.u0, self.u1 = self.u1, self.u0
def get_output(self):
@ -139,5 +137,5 @@ class HLL2(BaseSimulator):
self.u1.check()
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
max_dt = self.handler.array_min(self.stream)
return max_dt * 0.5

View File

@ -26,10 +26,10 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators.common import ArakawaA2D
from GPUSimulators.simulator import BaseSimulator, BoundaryCondition, conversion
from GPUSimulators.gpu import GPUHandler
class KP07(BaseSimulator):
@ -93,8 +93,7 @@ class KP07(BaseSimulator):
'hip': compile_opts,
},
jit_compile_args={})
self.kernel = module.get_function("KP07Kernel")
self.kernel.prepare("iifffffiiPiPiPiPiPiPiPiiii")
self.handler = GPUHandler(context, module, "KP07Kernel", "iifffffiiPiPiPiPiPiPiPiiii", self.grid_size)
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
@ -105,7 +104,6 @@ class KP07(BaseSimulator):
nx, ny,
2, 2,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
if dt is None:
dt_x = np.min(self.dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
@ -114,28 +112,28 @@ class KP07(BaseSimulator):
else:
self.dt = dt
self.cfl_data.fill(self.dt, stream=self.stream)
self.handler.array_fill(self.dt, self.stream)
def substep(self, dt, step_number):
self.substep_rk(dt, step_number)
def substep_rk(self, dt, substep):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.handler.prepared_call(self.grid_size, self.block_size, self.stream,
[self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.theta,
conversion.step_order_to_coded_int(step=substep, order=self.order),
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
self.u0[0].data, self.u0[0].get_strides()[0],
self.u0[1].data, self.u0[1].get_strides()[0],
self.u0[2].data, self.u0[2].get_strides()[0],
self.u1[0].data, self.u1[0].get_strides()[0],
self.u1[1].data, self.u1[1].get_strides()[0],
self.u1[2].data, self.u1[2].get_strides()[0],
self.handler.cfl_data,
0, 0,
self.nx, self.ny)
self.nx, self.ny])
self.u0, self.u1 = self.u1, self.u0
def get_output(self):
@ -146,5 +144,5 @@ class KP07(BaseSimulator):
self.u1.check()
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
max_dt = self.handler.array_min(self.stream)
return max_dt * 0.5 ** (self.order - 1)

View File

@ -26,10 +26,10 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators.common import ArakawaA2D
from GPUSimulators.simulator import BaseSimulator, BoundaryCondition
from GPUSimulators.gpu import GPUHandler
class KP07Dimsplit(BaseSimulator):
@ -93,8 +93,7 @@ class KP07Dimsplit(BaseSimulator):
'hip': compile_opts,
},
jit_compile_args={})
self.kernel = module.get_function("KP07DimsplitKernel")
self.kernel.prepare("iifffffiiPiPiPiPiPiPiPiiii")
self.handler = GPUHandler(context, module, "KP07DimsplitKernel", "iifffffiiPiPiPiPiPiPiPiiii", self.grid_size)
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
@ -105,7 +104,6 @@ class KP07Dimsplit(BaseSimulator):
nx, ny,
self.gc_x, self.gc_y,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
if dt is None:
dt_x = np.min(self.dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
@ -114,28 +112,28 @@ class KP07Dimsplit(BaseSimulator):
else:
self.dt = dt
self.cfl_data.fill(self.dt, stream=self.stream)
self.handler.array_fill(self.dt, self.stream)
def substep(self, dt, step_number):
self.substep_dimsplit(dt * 0.5, step_number)
def substep_dimsplit(self, dt, substep):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.handler.prepared_call(self.grid_size, self.block_size, self.stream,
[self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
self.u0[0].data, self.u0[0].get_strides()[0],
self.u0[1].data, self.u0[1].get_strides()[0],
self.u0[2].data, self.u0[2].get_strides()[0],
self.u1[0].data, self.u1[0].get_strides()[0],
self.u1[1].data, self.u1[1].get_strides()[0],
self.u1[2].data, self.u1[2].get_strides()[0],
self.handler.cfl_data,
0, 0,
self.nx, self.ny)
self.nx, self.ny])
self.u0, self.u1 = self.u1, self.u0
def get_output(self):
@ -146,5 +144,5 @@ class KP07Dimsplit(BaseSimulator):
self.u1.check()
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
max_dt = self.handler.array_min(self.stream)
return max_dt * 0.5

View File

@ -22,10 +22,10 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators.common import ArakawaA2D
from GPUSimulators.simulator import BaseSimulator, BoundaryCondition
from GPUSimulators.gpu import GPUHandler
class LxF(BaseSimulator):
@ -85,8 +85,7 @@ class LxF(BaseSimulator):
'hip': compile_opts,
},
jit_compile_args={})
self.kernel = module.get_function("LxFKernel")
self.kernel.prepare("iiffffiPiPiPiPiPiPiPiiii")
self.handler = GPUHandler(context, module, "LxFKernel", "iiffffiPiPiPiPiPiPiPiiii", self.grid_size)
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
@ -97,7 +96,6 @@ class LxF(BaseSimulator):
nx, ny,
1, 1,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
if dt is None:
dt_x = np.min(self.dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
@ -106,7 +104,7 @@ class LxF(BaseSimulator):
else:
self.dt = dt
self.cfl_data.fill(self.dt, stream=self.stream)
self.handler.array_fill(self.dt, self.stream)
def substep(self, dt, step_number):
"""
@ -114,20 +112,20 @@ class LxF(BaseSimulator):
dt: Size of each timestep (seconds)
"""
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.handler.prepared_call(self.grid_size, self.block_size, self.stream,
[self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
self.u0[0].data, self.u0[0].get_strides()[0],
self.u0[1].data, self.u0[1].get_strides()[0],
self.u0[2].data, self.u0[2].get_strides()[0],
self.u1[0].data, self.u1[0].get_strides()[0],
self.u1[1].data, self.u1[1].get_strides()[0],
self.u1[2].data, self.u1[2].get_strides()[0],
self.handler.cfl_data,
0, 0,
self.nx, self.ny)
self.nx, self.ny])
self.u0, self.u1 = self.u1, self.u0
def get_output(self):
@ -138,5 +136,5 @@ class LxF(BaseSimulator):
self.u1.check()
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
max_dt = self.handler.array_min(self.stream)
return max_dt * 0.5

View File

@ -22,10 +22,10 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators.common import ArakawaA2D
from GPUSimulators.simulator import BaseSimulator, BoundaryCondition
from GPUSimulators.gpu import GPUHandler
class WAF(BaseSimulator):
@ -85,8 +85,7 @@ class WAF(BaseSimulator):
'hip': compile_opts,
},
jit_compile_args={})
self.kernel = module.get_function("WAFKernel")
self.kernel.prepare("iiffffiiPiPiPiPiPiPiPiiii")
self.handler = GPUHandler(context, module, "WAFKernel", "iiffffiiPiPiPiPiPiPiPiiii", self.grid_size)
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
@ -97,7 +96,6 @@ class WAF(BaseSimulator):
nx, ny,
2, 2,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
if dt is None:
dt_x = np.min(self.dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
@ -106,27 +104,27 @@ class WAF(BaseSimulator):
else:
self.dt = dt
self.cfl_data.fill(self.dt, stream=self.stream)
self.handler.array_fill(self.dt, self.stream)
def substep(self, dt, step_number):
self.substep_dimsplit(dt * 0.5, step_number)
def substep_dimsplit(self, dt, substep):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.handler.prepared_call(self.grid_size, self.block_size, self.stream,
[self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
self.u0[0].data, self.u0[0].get_strides()[0],
self.u0[1].data, self.u0[1].get_strides()[0],
self.u0[2].data, self.u0[2].get_strides()[0],
self.u1[0].data, self.u1[0].get_strides()[0],
self.u1[1].data, self.u1[1].get_strides()[0],
self.u1[2].data, self.u1[2].get_strides()[0],
self.handler.cfl_data,
0, 0,
self.nx, self.ny)
self.nx, self.ny])
self.u0, self.u1 = self.u1, self.u0
def get_output(self):
@ -137,5 +135,5 @@ class WAF(BaseSimulator):
self.u1.check()
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
max_dt = self.handler.array_min(self.stream)
return max_dt * 0.5

HIPTestSchemes.ipynb (new file, 498 lines)

File diff suppressed because one or more lines are too long

View File

@ -13,6 +13,11 @@ dependencies:
- pytools
- netcdf4
- scipy
- matplotlib
- ipyparallel
- jupyter
- nb_conda_kernels
- line_profiler
- tqdm
- pip:
- hip-python==6.4.1.552.39