feat(simulator): add autotuning for HIP

This commit is contained in:
Anthony Berg 2025-07-03 11:37:17 +02:00
parent 716394f46b
commit ab6660d719
8 changed files with 2496 additions and 172 deletions

View File

@ -26,12 +26,11 @@ import os
from socket import gethostname
import numpy as np
import pycuda.driver as cuda
from tqdm.auto import tqdm
from GPUSimulators.simulator import BaseSimulator, BoundaryCondition
from GPUSimulators.common import Timer
from GPUSimulators.gpu import KernelContext
from GPUSimulators.gpu import KernelContext, Event
def run_benchmark(simulator, arguments, timesteps=10, warmup_timesteps=2):
@ -51,8 +50,8 @@ def run_benchmark(simulator, arguments, timesteps=10, warmup_timesteps=2):
return np.nan
# Create timer events
start = cuda.Event()
end = cuda.Event()
start = Event()
end = Event()
# Warmup
for i in range(warmup_timesteps):
@ -74,9 +73,9 @@ def run_benchmark(simulator, arguments, timesteps=10, warmup_timesteps=2):
# Sanity check solution
h, hu, hv = sim.download()
sane = True
sane = sane and sanity_check(0.3, 0.7)
sane = sane and sanity_check(-0.2, 0.2)
sane = sane and sanity_check(-0.2, 0.2)
sane = sane and sanity_check(h, 0.3, 0.7)
sane = sane and sanity_check(hu, -0.2, 0.2)
sane = sane and sanity_check(hv, -0.2, 0.2)
if sane:
logger.debug(f"{simulator.__name__} [{arguments["block_width"]} x {arguments["block_height"]}] succeeded: "
@ -170,7 +169,7 @@ def benchmark_single_simulator(simulator, arguments, block_widths, block_heights
sim_arguments.update({'block_height': block_height})
for i, block_width in enumerate(tqdm(block_widths, desc=f'Iteration {j} Progress', leave=False)):
sim_arguments.update({'block_width': block_width})
megacells[j, i] = run_benchmark(sim_arguments)
megacells[j, i] = run_benchmark(simulator, sim_arguments)
logger.debug("Completed %s in %f seconds", simulator.__name__, t.secs)
@ -207,14 +206,14 @@ class Autotuner:
# Set arguments to send to the simulators during construction
context = KernelContext(autotuning=False)
g = 9.81
h0, hu0, hv0, dx, dy, dt = gen_test_data(ny=self.ny, g=g)
h0, hu0, hv0, dx, dy, dt = gen_test_data(nx=self.nx, ny=self.ny, g=g)
arguments = {
'context': context,
'h0': h0, 'hu0': hu0, 'hv0': hv0,
'nx': self.nx, 'ny': self.ny,
'dx': dx, 'dy': dy, 'dt': 0.9 * dt,
'g': g,
'compile_opts': ['-Wno-deprecated-gpu-targets']
'compile_opts': []
}
# Load existing data into memory
@ -227,7 +226,7 @@ class Autotuner:
benchmark_data[k] = v
# Run benchmark
benchmark_data[key + "_megacells"] = benchmark_single_simulator(arguments, self.block_widths,
benchmark_data[key + "_megacells"] = benchmark_single_simulator(simulator, arguments, self.block_widths,
self.block_heights)
benchmark_data[key + "_block_widths"] = self.block_widths
benchmark_data[key + "_block_heights"] = self.block_heights
@ -268,9 +267,9 @@ class Autotuner:
self.benchmark(simulator)
data = np.load(self.filename)
def find_max_index(megacells):
max_index = np.nanargmax(megacells)
return np.unravel_index(max_index, megacells.shape)
def find_max_index(megacells_arg):
max_index = np.nanargmax(megacells_arg)
return np.unravel_index(max_index, megacells_arg.shape)
megacells = data[key + '_megacells']
block_widths = data[key + '_block_widths']
@ -282,7 +281,3 @@ class Autotuner:
"megacells": megacells[j, i]}
logger.debug(f"Returning {self.performance[key]} as peak performance parameters")
return self.performance[key]
# This should never happen
raise "Something wrong: Could not get autotuning data!"
return None

View File

@ -5,6 +5,8 @@ __env_name = 'GPU_LANG'
if __env_name in environ and environ.get(__env_name).lower() == "cuda":
from .cuda_context import CudaContext as KernelContext
from .cuda_handler import CudaHandler as GPUHandler
from .cuda_event import CudaEvent as Event
else:
from .hip_context import HIPContext as KernelContext
from .hip_handler import HIPHandler as GPUHandler
from .hip_handler import HIPHandler as GPUHandler
from .hip_event import HIPEvent as Event

View File

@ -28,7 +28,7 @@ import pycuda.compiler as cuda_compiler
import pycuda.gpuarray
import pycuda.driver as cuda
from GPUSimulators import Autotuner
from GPUSimulators.Autotuner import Autotuner
from GPUSimulators.common import Timer
from GPUSimulators.gpu.context import Context
@ -79,7 +79,7 @@ class CudaContext(Context):
if autotuning:
self.logger.info(
"Autotuning enabled. It may take several minutes to run the code the first time: have patience")
self.autotuner = Autotuner.Autotuner()
self.autotuner = Autotuner()
def __del__(self, *args):
self.logger.info(f"Cleaning up CUDA context handle <{str(self.cuda_context.handle)}>")

View File

@ -0,0 +1,43 @@
import pycuda.driver as cuda
from .event import BaseEvent
class CudaEvent(BaseEvent):
    """
    A GPU Event handler backed by a PyCUDA driver event.
    """

    def __init__(self):
        """
        Creates a GPU Event.
        """
        super().__init__()
        # Underlying pycuda.driver.Event that performs the actual timing.
        self.event = cuda.Event()

    def record(self, stream):
        """
        Insert a recording point into the ``stream``.

        Args:
            stream: The stream to insert the recording point into.
        """
        self.event.record(stream)

    def synchronize(self):
        """
        Wait for the event to complete.
        """
        self.event.synchronize()

    def time_since(self, start):
        """
        Return the elapsed time between the ``start`` event and this event.

        Args:
            start: The event to measure time from. Accepts either a raw
                ``pycuda.driver.Event`` or a ``CudaEvent`` wrapper, mirroring
                ``HIPEvent.time_since`` so callers can use both backends
                interchangeably.

        Returns:
            Elapsed time between ``start`` and this event (milliseconds,
            per the PyCUDA Event API).
        """
        # Unwrap the CudaEvent wrapper before handing it to PyCUDA; without
        # this, passing an Event wrapper (as the autotuner does) would fail,
        # while the HIP backend already supports it.
        if isinstance(start, CudaEvent):
            start = start.event
        return self.event.time_since(start)

View File

@ -0,0 +1,36 @@
class BaseEvent:
    """
    Abstract interface for a GPU timing event.

    Concrete backends (e.g. CUDA, HIP) subclass this and implement the
    recording, synchronization and timing primitives.
    """

    def __init__(self):
        """
        Creates a GPU Event.
        """

    def record(self, stream):
        """
        Insert a recording point into the ``stream``.

        Args:
            stream: The stream to insert the recording point into.

        Raises:
            NotImplementedError: Always; subclasses must override this.
        """
        raise NotImplementedError("This function needs to be implemented in a subclass.")

    def synchronize(self):
        """
        Wait for the event to complete.

        Raises:
            NotImplementedError: Always; subclasses must override this.
        """
        raise NotImplementedError("This function needs to be implemented in a subclass.")

    def time_since(self, start) -> float:
        """
        Return the elapsed time between the ``start`` event and this event.

        Args:
            start: The event to measure time from.

        Returns:
            Elapsed time between ``start`` and this event.

        Raises:
            NotImplementedError: Always; subclasses must override this.
        """
        raise NotImplementedError("This function needs to be implemented in a subclass.")

View File

@ -37,11 +37,10 @@ class HIPContext(Context):
self.logger.debug(f" => total available memory: {int(props.totalGlobalMem / pow(1024, 2))} MiB")
if autotuning:
from GPUSimulators.Autotuner import Autotuner
self.logger.info(
"Autotuning enabled. It may take several minutes to run the code the first time: have patience")
raise NotImplementedError("Autotuner is not yet implemented for HIP.")
# TODO Implement Autotuner for HIP
# self.autotuner = Autotuner.Autotuner()
self.autotuner = Autotuner()
def __del__(self):
for module in self.modules.values():

View File

@ -0,0 +1,51 @@
from hip import hip
from hip.hip import ihipStream_t, ihipEvent_t
from .event import BaseEvent
from GPUSimulators.common import hip_check
class HIPEvent(BaseEvent):
    """
    A GPU Event handler backed by a HIP runtime event.
    """

    def __init__(self):
        """
        Creates a GPU Event.
        """
        super().__init__()
        # hip_check unwraps the (status, value) pair returned by the HIP
        # Python bindings and raises on a non-success status.
        self.event = hip_check(hip.hipEventCreate())

    def __del__(self):
        # Release the underlying HIP event when the wrapper is collected.
        # NOTE(review): if __init__ raised before self.event was assigned,
        # this would AttributeError during teardown — confirm hip_check
        # cannot fail mid-construction here.
        hip_check(hip.hipEventDestroy(self.event))

    def record(self, stream: ihipStream_t | object):
        """
        Insert a recording point into the ``stream``.

        Args:
            stream: The stream to insert the recording point into.
        """
        hip_check(hip.hipEventRecord(self.event, stream))

    def synchronize(self):
        """
        Wait for the event to complete.
        """
        hip_check(hip.hipEventSynchronize(self.event))

    def time_since(self, start: ihipEvent_t | object):
        """
        Return the elapsed time between the ``start`` event and this event.

        Args:
            start: The event to measure time from. Accepts either a raw
                HIP event handle or a ``HIPEvent`` wrapper.

        Returns:
            Elapsed time between ``start`` and this event, as reported by
            ``hipEventElapsedTime``.
        """
        # Unwrap the wrapper so callers may pass a HIPEvent directly
        # instead of reaching into obj.event.
        if isinstance(start, HIPEvent):
            start = start.event
        return hip_check(hip.hipEventElapsedTime(start, self.event))

File diff suppressed because one or more lines are too long