feat(simulator): add autotuning for HIP

This commit is contained in:
Anthony Berg 2025-07-03 11:37:17 +02:00
parent 716394f46b
commit ab6660d719
8 changed files with 2496 additions and 172 deletions

View File

@ -26,12 +26,11 @@ import os
from socket import gethostname
import numpy as np
import pycuda.driver as cuda
from tqdm.auto import tqdm
from GPUSimulators.simulator import BaseSimulator, BoundaryCondition
from GPUSimulators.common import Timer
from GPUSimulators.gpu import KernelContext
from GPUSimulators.gpu import KernelContext, Event
def run_benchmark(simulator, arguments, timesteps=10, warmup_timesteps=2):
@ -51,8 +50,8 @@ def run_benchmark(simulator, arguments, timesteps=10, warmup_timesteps=2):
return np.nan
# Create timer events
start = cuda.Event()
end = cuda.Event()
start = Event()
end = Event()
# Warmup
for i in range(warmup_timesteps):
@ -74,9 +73,9 @@ def run_benchmark(simulator, arguments, timesteps=10, warmup_timesteps=2):
# Sanity check solution
h, hu, hv = sim.download()
sane = True
sane = sane and sanity_check(0.3, 0.7)
sane = sane and sanity_check(-0.2, 0.2)
sane = sane and sanity_check(-0.2, 0.2)
sane = sane and sanity_check(h, 0.3, 0.7)
sane = sane and sanity_check(hu, -0.2, 0.2)
sane = sane and sanity_check(hv, -0.2, 0.2)
if sane:
logger.debug(f"{simulator.__name__} [{arguments["block_width"]} x {arguments["block_height"]}] succeeded: "
@ -170,7 +169,7 @@ def benchmark_single_simulator(simulator, arguments, block_widths, block_heights
sim_arguments.update({'block_height': block_height})
for i, block_width in enumerate(tqdm(block_widths, desc=f'Iteration {j} Progress', leave=False)):
sim_arguments.update({'block_width': block_width})
megacells[j, i] = run_benchmark(sim_arguments)
megacells[j, i] = run_benchmark(simulator, sim_arguments)
logger.debug("Completed %s in %f seconds", simulator.__name__, t.secs)
@ -207,14 +206,14 @@ class Autotuner:
# Set arguments to send to the simulators during construction
context = KernelContext(autotuning=False)
g = 9.81
h0, hu0, hv0, dx, dy, dt = gen_test_data(ny=self.ny, g=g)
h0, hu0, hv0, dx, dy, dt = gen_test_data(nx=self.nx, ny=self.ny, g=g)
arguments = {
'context': context,
'h0': h0, 'hu0': hu0, 'hv0': hv0,
'nx': self.nx, 'ny': self.ny,
'dx': dx, 'dy': dy, 'dt': 0.9 * dt,
'g': g,
'compile_opts': ['-Wno-deprecated-gpu-targets']
'compile_opts': []
}
# Load existing data into memory
@ -227,7 +226,7 @@ class Autotuner:
benchmark_data[k] = v
# Run benchmark
benchmark_data[key + "_megacells"] = benchmark_single_simulator(arguments, self.block_widths,
benchmark_data[key + "_megacells"] = benchmark_single_simulator(simulator, arguments, self.block_widths,
self.block_heights)
benchmark_data[key + "_block_widths"] = self.block_widths
benchmark_data[key + "_block_heights"] = self.block_heights
@ -268,9 +267,9 @@ class Autotuner:
self.benchmark(simulator)
data = np.load(self.filename)
def find_max_index(megacells):
max_index = np.nanargmax(megacells)
return np.unravel_index(max_index, megacells.shape)
def find_max_index(megacells_arg):
max_index = np.nanargmax(megacells_arg)
return np.unravel_index(max_index, megacells_arg.shape)
megacells = data[key + '_megacells']
block_widths = data[key + '_block_widths']
@ -282,7 +281,3 @@ class Autotuner:
"megacells": megacells[j, i]}
logger.debug(f"Returning {self.performance[key]} as peak performance parameters")
return self.performance[key]
# This should never happen
raise "Something wrong: Could not get autotuning data!"
return None

View File

@ -5,6 +5,8 @@ __env_name = 'GPU_LANG'
if __env_name in environ and environ.get(__env_name).lower() == "cuda":
from .cuda_context import CudaContext as KernelContext
from .cuda_handler import CudaHandler as GPUHandler
from .cuda_event import CudaEvent as Event
else:
from .hip_context import HIPContext as KernelContext
from .hip_handler import HIPHandler as GPUHandler
from .hip_handler import HIPHandler as GPUHandler
from .hip_event import HIPEvent as Event

View File

@ -28,7 +28,7 @@ import pycuda.compiler as cuda_compiler
import pycuda.gpuarray
import pycuda.driver as cuda
from GPUSimulators import Autotuner
from GPUSimulators.Autotuner import Autotuner
from GPUSimulators.common import Timer
from GPUSimulators.gpu.context import Context
@ -79,7 +79,7 @@ class CudaContext(Context):
if autotuning:
self.logger.info(
"Autotuning enabled. It may take several minutes to run the code the first time: have patience")
self.autotuner = Autotuner.Autotuner()
self.autotuner = Autotuner()
def __del__(self, *args):
self.logger.info(f"Cleaning up CUDA context handle <{str(self.cuda_context.handle)}>")

View File

@ -0,0 +1,43 @@
import pycuda.driver as cuda
from .event import BaseEvent
class CudaEvent(BaseEvent):
    """
    A GPU Event handler backed by a PyCUDA driver event.
    """

    def __init__(self):
        """
        Creates a GPU Event.
        """
        super().__init__()
        # Underlying pycuda.driver.Event that performs the actual timing.
        self.event = cuda.Event()

    def record(self, stream):
        """
        Insert a recording point into the ``stream``.

        Args:
            stream: The stream to insert the recording point into.
        """
        self.event.record(stream)

    def synchronize(self):
        """
        Wait for the event to complete.
        """
        self.event.synchronize()

    def time_since(self, start):
        """
        Return the elapsed time between the ``start`` event and this event.

        Args:
            start: The event to measure time from. Accepts either a raw
                ``pycuda.driver.Event`` or a ``CudaEvent`` wrapper, mirroring
                ``HIPEvent.time_since`` so callers can use both backends
                interchangeably.

        Returns:
            Elapsed time between ``start`` and this event (milliseconds,
            per the PyCUDA Event API).
        """
        # Unwrap the CudaEvent wrapper before handing it to PyCUDA; without
        # this, passing an Event wrapper (as the autotuner does) would fail,
        # while the HIP backend already supports it.
        if isinstance(start, CudaEvent):
            start = start.event
        return self.event.time_since(start)

View File

@ -0,0 +1,36 @@
class BaseEvent:
    """
    Abstract interface for a GPU timing event.

    Concrete backends (e.g. CUDA, HIP) subclass this and implement the
    recording, synchronization and timing primitives.
    """

    def __init__(self):
        """
        Creates a GPU Event.
        """

    def record(self, stream):
        """
        Insert a recording point into the ``stream``.

        Args:
            stream: The stream to insert the recording point into.

        Raises:
            NotImplementedError: Always; subclasses must override this.
        """
        raise NotImplementedError("This function needs to be implemented in a subclass.")

    def synchronize(self):
        """
        Wait for the event to complete.

        Raises:
            NotImplementedError: Always; subclasses must override this.
        """
        raise NotImplementedError("This function needs to be implemented in a subclass.")

    def time_since(self, start) -> float:
        """
        Return the elapsed time between the ``start`` event and this event.

        Args:
            start: The event to measure time from.

        Returns:
            Elapsed time between ``start`` and this event.

        Raises:
            NotImplementedError: Always; subclasses must override this.
        """
        raise NotImplementedError("This function needs to be implemented in a subclass.")

View File

@ -37,11 +37,10 @@ class HIPContext(Context):
self.logger.debug(f" => total available memory: {int(props.totalGlobalMem / pow(1024, 2))} MiB")
if autotuning:
from GPUSimulators.Autotuner import Autotuner
self.logger.info(
"Autotuning enabled. It may take several minutes to run the code the first time: have patience")
raise NotImplementedError("Autotuner is not yet implemented for HIP.")
# TODO Implement Autotuner for HIP
# self.autotuner = Autotuner.Autotuner()
self.autotuner = Autotuner()
def __del__(self):
for module in self.modules.values():

View File

@ -0,0 +1,51 @@
from hip import hip
from hip.hip import ihipStream_t, ihipEvent_t
from .event import BaseEvent
from GPUSimulators.common import hip_check
class HIPEvent(BaseEvent):
    """
    A GPU Event handler backed by a HIP runtime event.
    """

    def __init__(self):
        """
        Creates a GPU Event.
        """
        super().__init__()
        # hip_check unwraps the (status, value) pair returned by the HIP
        # Python bindings and raises on a non-success status.
        self.event = hip_check(hip.hipEventCreate())

    def __del__(self):
        # Release the underlying HIP event when the wrapper is collected.
        # NOTE(review): if __init__ raised before self.event was assigned,
        # this would AttributeError during teardown — confirm hip_check
        # cannot fail mid-construction here.
        hip_check(hip.hipEventDestroy(self.event))

    def record(self, stream: ihipStream_t | object):
        """
        Insert a recording point into the ``stream``.

        Args:
            stream: The stream to insert the recording point into.
        """
        hip_check(hip.hipEventRecord(self.event, stream))

    def synchronize(self):
        """
        Wait for the event to complete.
        """
        hip_check(hip.hipEventSynchronize(self.event))

    def time_since(self, start: ihipEvent_t | object):
        """
        Return the elapsed time between the ``start`` event and this event.

        Args:
            start: The event to measure time from. Accepts either a raw
                HIP event handle or a ``HIPEvent`` wrapper.

        Returns:
            Elapsed time between ``start`` and this event, as reported by
            ``hipEventElapsedTime``.
        """
        # Unwrap the wrapper so callers may pass a HIPEvent directly
        # instead of reaching into obj.event.
        if isinstance(start, HIPEvent):
            start = start.event
        return hip_check(hip.hipEventElapsedTime(start, self.event))

File diff suppressed because one or more lines are too long