feat(kernel): add basic HIPContext

2025-10-31 20:17:41 +01:00 · 2025-06-24 16:04:48 +02:00 · 2025-06-24 16:04:48 +02:00 · 8f24cd45ea
commit 8f24cd45ea
parent 0fa04dbcec
13 changed files with 334 additions and 58 deletions
--- a/GPUSimulators/Autotuner.py
+++ b/GPUSimulators/Autotuner.py
@ -29,7 +29,8 @@ from tqdm.auto import tqdm

 import pycuda.driver as cuda

-from GPUSimulators import Common, Simulator, CudaContext
+from GPUSimulators import Common, Simulator
+from GPUSimulators.gpu import CudaContext


 class Autotuner:
--- a/GPUSimulators/IPythonMagic.py
+++ b/GPUSimulators/IPythonMagic.py
@ -26,7 +26,8 @@ from IPython.core import magic_arguments
 from IPython.core.magic import line_magic, Magics, magics_class
 import pycuda.driver as cuda

-from GPUSimulators import Common, CudaContext
+from GPUSimulators import Common
+from GPUSimulators.gpu import CudaContext


@magics_class
--- a/GPUSimulators/LxF.py
+++ b/GPUSimulators/LxF.py
@ -21,8 +21,9 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 """

 #Import packages we need
-from GPUSimulators import CudaContext, Simulator, Common
-from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
+from GPUSimulators import Simulator, Common
+from GPUSimulators.gpu import CudaContext
+from GPUSimulators.Simulator import BoundaryCondition
 import numpy as np

 from pycuda import gpuarray
@ -33,12 +34,12 @@ class LxF (Simulator.BaseSimulator):
    Class that solves the SW equations using the Lax Friedrichs scheme
    """

-    def __init__(self, 
-                 context: CudaContext, 
-                 h0: float, hu0: float, hv0: float, 
-                 nx: int, ny: int, 
-                 dx: int, dy: int, 
-                 g: float, 
+    def __init__(self,
+                 context: CudaContext,
+                 h0: float, hu0: float, hv0: float,
+                 nx: int, ny: int,
+                 dx: int, dy: int,
+                 g: float,
                 cfl_scale: float=0.9,
                 boundary_conditions=BoundaryCondition(),
                 block_width: int=16, block_height: int=16,
--- a/GPUSimulators/SHMEMSimulator.py
+++ b/GPUSimulators/SHMEMSimulator.py
@ -20,13 +20,9 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 """

 import logging
-from GPUSimulators import Simulator, CudaContext
+from GPUSimulators import Simulator
 import numpy as np

-import pycuda.driver as cuda
-
-import time
-

 class SHMEMSimulator(Simulator.BaseSimulator):
    """
--- a/GPUSimulators/SHMEMSimulatorGroup.py
+++ b/GPUSimulators/SHMEMSimulatorGroup.py
@ -20,13 +20,12 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 """

 import logging
-from GPUSimulators import Simulator, CudaContext
+from GPUSimulators import Simulator
+from GPUSimulators.gpu import CudaContext
 import numpy as np

 import pycuda.driver as cuda

-import time
-

 class SHMEMGrid(object):
    """
--- a/GPUSimulators/Simulator.py
+++ b/GPUSimulators/Simulator.py
@ -25,11 +25,10 @@ import numpy as np
 import logging
 from enum import IntEnum

-import pycuda.compiler as cuda_compiler
-import pycuda.gpuarray
 import pycuda.driver as cuda

-from GPUSimulators import Common, CudaContext
+from GPUSimulators import Common
+from GPUSimulators.gpu import CudaContext


 class BoundaryCondition(object):    
@ -100,10 +99,10 @@ class BoundaryCondition(object):

 class BaseSimulator(object):
   
-    def __init__(self, 
-                 context: CudaContext, 
-                 nx: int, ny: int, 
-                 dx: int, dy: int, 
+    def __init__(self,
+                 context: CudaContext,
+                 nx: int, ny: int,
+                 dx: int, dy: int,
                 boundary_conditions: BoundaryCondition,
                 cfl_scale: float,
                 num_substeps: int,
--- a/GPUSimulators/gpu/Context.py
+++ b/GPUSimulators/gpu/Context.py
@ -0,0 +1,115 @@
+import logging
+import os
+import io
+import re
+import logging
+from hashlib import md5
+
+
+class Context(object):
+    """
+    Base class that manages either a HIP or CUDA context.
+
+    It holds the state shared by the backends: a logger, the in-memory map of
+    already loaded modules, the directory of this package and the on-disk
+    kernel cache directory. Backends override ``get_module`` and
+    ``synchronize``.
+    """
+
+    def __init__(self, language: str, device=0, context_flags=None, use_cache=True, autotuning=True):
+        """
+        Create a new context.
+
+        Args:
+            language: Name of the backend; the cache directory is named ``<language>_cache``.
+            device: Not used by the base class; accepted so subclasses can forward it.
+            context_flags: Not used by the base class; accepted so subclasses can forward it.
+            use_cache: Whether the on-disk cache directory is created and used.
+            autotuning: Not used by the base class; ``self.autotuner`` always starts as ``None``.
+        """
+        self.use_cache = use_cache
+        self.logger = logging.getLogger(__name__)
+        # Maps kernel hash -> loaded module; filled in by the subclasses
+        self.modules = {}
+
+        self.module_path = os.path.dirname(os.path.realpath(__file__))
+
+        self.autotuner = None
+
+        # Creates cache directory if specified
+        self.cache_path = os.path.join(self.module_path, f"{language}_cache")
+        if self.use_cache:
+            # ``exist_ok`` avoids a failure when another process creates the directory first
+            os.makedirs(self.cache_path, exist_ok=True)
+            self.logger.info(f"Using cache dir {self.cache_path}")
+
+    def __del__(self):
+        """
+        Cleans up the context.
+
+        The base class owns no device resources, so there is nothing to release here.
+        """
+        pass
+
+    def __str__(self):
+        """
+        Gives a printable identifier for the context.
+
+        ``__str__`` must return a string; returning ``None`` makes ``str(context)``
+        raise ``TypeError``. The base class has no device handle, so the class name
+        and the object id are used. Subclasses may override this.
+        """
+        return f"{type(self).__name__}({id(self):#x})"
+
+    def hash_kernel(self, kernel_filename: str, include_dirs: list[str]) -> str:
+        """
+        Generate a kernel ID for the caches.
+
+        The kernel file and every ``#include`` that can be resolved to a file
+        on disk are hashed, both by content and by modification time.
+
+        Args:
+            kernel_filename: Path to the kernel file.
+            include_dirs: Directories to search for ``#include`` in the kernel file.
+
+        Returns:
+            MD5 hash (hex digest) for the kernel in the cache.
+
+        Raises:
+            RuntimeError: When more than 100 includes have been resolved,
+                which most likely means a circular include.
+        """
+
+        num_includes = 0
+        max_includes = 100
+        kernel_hasher = md5()
+
+        # Loop over files and includes, and check if something has changed
+        files = [kernel_filename]
+        while len(files):
+            if num_includes > max_includes:
+                raise RuntimeError(f"Maximum number of includes reached.\n"
+                                   + f"Potential circular include in {kernel_filename}?")
+
+            filename = files.pop()
+
+            modified = os.path.getmtime(filename)
+
+            # Open the file
+            with io.open(filename, "r") as file:
+                # Search for ``#include <reference>`` and also hash the file
+                file_str = file.read()
+                kernel_hasher.update(file_str.encode('utf-8'))
+                kernel_hasher.update(str(modified).encode('utf-8'))
+
+                # Find all the includes (raw string: ``\W`` is a regex class, not a string escape)
+                includes = re.findall(r'^\W*#include\W+(.+?)\W*$', file_str, re.M)
+
+            # Iterate through everything that looks like an ``include``
+            file_path = os.path.dirname(filename)
+            for include_file in includes:
+                # Search the directory of the current file first, then the ``include`` directories
+                for include_path in [file_path] + include_dirs:
+                    # If found, add it to the list of files to check
+                    temp_path = os.path.join(include_path, include_file)
+                    if os.path.isfile(temp_path):
+                        files.append(temp_path)
+                        # Counted to detect circular includes
+                        num_includes = num_includes + 1
+                        break
+
+        return kernel_hasher.hexdigest()
+
+    def get_module(self, kernel_filename: str,
+                   include_dirs: list[str] = None,
+                   defines: dict = None,
+                   compile_args: dict = None,
+                   jit_compile_args: dict = None):
+        """
+        Reads a text file and creates a kernel from that.
+
+        Args:
+            kernel_filename: The file to use for the kernel.
+            include_dirs: List of directories for the ``#include``s referenced.
+            defines: Mapping of ``#define`` names to values.
+            compile_args: Additional compiler options.
+            jit_compile_args: Additional just-in-time compilation options.
+
+        Raises:
+            NotImplementedError: Always; subclasses provide the implementation.
+        """
+        raise NotImplementedError("Needs to be implemented in subclass")
+
+    def synchronize(self):
+        """
+        Synchronizes all the streams, etc.
+
+        Raises:
+            NotImplementedError: Always; subclasses provide the implementation.
+        """
+        raise NotImplementedError("Needs to be implemented in subclass")
--- a/GPUSimulators/gpu/CudaContext.py
+++ b/GPUSimulators/gpu/CudaContext.py
@ -34,6 +34,7 @@ import pycuda.gpuarray
 import pycuda.driver as cuda

 from GPUSimulators import Autotuner, Common
+from GPUSimulators.gpu.Context import Context


 class CudaContext(object):
@ -85,13 +86,13 @@ class CudaContext(object):
        
        #Create cache dir for cubin files
        self.cache_path = os.path.join(self.module_path, "cuda_cache") 
-        if (self.use_cache):
+        if self.use_cache:
            if not os.path.isdir(self.cache_path):
                os.mkdir(self.cache_path)
            self.logger.info("Using CUDA cache dir %s", self.cache_path)
            
        self.autotuner = None
-        if (autotuning):
+        if autotuning:
            self.logger.info("Autotuning enabled. It may take several minutes to run the code the first time: have patience")
            self.autotuner = Autotuner.Autotuner()
    
@ -100,9 +101,9 @@ class CudaContext(object):
            
        # Loop over all contexts in stack, and remove "this"
        other_contexts = []
-        while (cuda.Context.get_current() != None):
+        while cuda.Context.get_current() is not None:
            context = cuda.Context.get_current()
-            if (context.handle != self.cuda_context.handle):
+            if context.handle != self.cuda_context.handle:
                self.logger.debug("<%s> Popping <%s> (*not* ours)", str(self.cuda_context.handle), str(context.handle))
                other_contexts = [context] + other_contexts
                cuda.Context.pop()
@ -145,7 +146,7 @@ class CudaContext(object):
        files = [kernel_filename]
        while len(files):
        
-            if (num_includes > max_includes):
+            if num_includes > max_includes:
                raise RuntimeError("Maximum number of includes reached - circular include in {:}?".format(kernel_filename))
        
            filename = files.pop()
@ -165,33 +166,33 @@ class CudaContext(object):
                #Find all includes
                includes = re.findall('^\W*#include\W+(.+?)\W*$', file_str, re.M)
                
-            # Loop over everything that looks like an include
+            # Loop over everything that looks like an 'include'
            for include_file in includes:
                
-                #Search through include directories for the file
+                # Search through 'include' directories for the file
                file_path = os.path.dirname(filename)
                for include_path in [file_path] + include_dirs:
                
-                    # If we find it, add it to list of files to check
+                    # If we find it, add it to a list of files to check
                    temp_path = os.path.join(include_path, include_file)
-                    if (os.path.isfile(temp_path)):
+                    if os.path.isfile(temp_path):
                        files = files + [temp_path]
                        num_includes = num_includes + 1 #For circular includes...
                        break
            
        return kernel_hasher.hexdigest()

-    def get_module(self, kernel_filename: str, 
-                    include_dirs: list[str]=[], \
-                    defines: dict={}, \
-                    compile_args: dict={'no_extern_c': True}, jit_compile_args: dict={}) -> cuda.Module:
+    def get_module(self, kernel_filename: str,
+                   include_dirs: dict=None,
+                   defines:list[str]=None,
+                   compile_args:dict=None, jit_compile_args:dict=None) -> cuda.Module:
        """
        Reads a text file and creates an OpenCL kernel from that.

        Args:
            kernel_filename: The file to use for the kernel.
            include_dirs: List of directories for the ``#include``s referenced.
-            defines: Adds ``#define`` tags to the kernel, such as: ``#define key value``.
+            defines: Adds ``#define`` tags to the kernel, such as ``#define key value``.
            compile_args: Adds other compiler options (parameters) for ``pycuda.compiler.compile()``.
            jit_compile_args: Adds other just-in-time compilation options (parameters)
                for ``pycuda.driver.module_from_buffer()``.
@ -200,6 +201,15 @@ class CudaContext(object):
            The kernel module (pycuda.driver.Module).
        """

+        if defines is None:
+            defines = {}
+        if include_dirs is None:
+            include_dirs = []
+        if compile_args is None:
+            compile_args = {'no_extern_c': True}
+        if jit_compile_args is None:
+            jit_compile_args = {}
+
        def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
            """
            Helper function to print compilation output
@ -217,13 +227,13 @@ class CudaContext(object):
            
        # Create a hash of the kernel options
        options_hasher = hashlib.md5()
-        options_hasher.update(str(defines).encode('utf-8') + str(compile_args).encode('utf-8'));
+        options_hasher.update(str(defines).encode('utf-8') + str(compile_args).encode('utf-8'))
        options_hash = options_hasher.hexdigest()
        
        # Create hash of kernel souce
-        source_hash = self.hash_kernel( \
-                    kernel_path, \
-                    include_dirs=[self.module_path] + include_dirs)
+        source_hash = self.hash_kernel(
+            kernel_path,
+            include_dirs=[self.module_path] + include_dirs)
                    
        # Create final hash
        root, ext = os.path.splitext(kernel_filename)
@ -234,12 +244,12 @@ class CudaContext(object):
        cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)
        
        # If we have the kernel in our hashmap, return it
-        if (kernel_hash in self.modules.keys()):
+        if kernel_hash in self.modules.keys():
            self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash)
            return self.modules[kernel_hash]
        
        # If we have it on disk, return it
-        elif (self.use_cache and os.path.isfile(cached_kernel_filename)):
+        elif self.use_cache and os.path.isfile(cached_kernel_filename):
            self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash)
                
            with io.open(cached_kernel_filename, "rb") as file:
@ -258,7 +268,7 @@ class CudaContext(object):
            for key, value in defines.items():
                kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
            kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename))
-            if (self.use_cache):
+            if self.use_cache:
                cached_kernel_dir = os.path.dirname(cached_kernel_filename)
                if not os.path.isdir(cached_kernel_dir):
                    os.mkdir(cached_kernel_dir)
@ -272,7 +282,7 @@ class CudaContext(object):
                    warnings.filterwarnings("ignore", message="The CUDA compiler succeeded, but said the following:\nkernel.cu", category=UserWarning)
                    cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, cache_dir=False, **compile_args)
                module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler, **jit_compile_args)
-                if (self.use_cache):
+                if self.use_cache:
                    with io.open(cached_kernel_filename, "wb") as file:
                        file.write(cubin)
                
--- a/GPUSimulators/gpu/HIPContext.py
+++ b/GPUSimulators/gpu/HIPContext.py
@ -0,0 +1,160 @@
+import hashlib
+import io
+import os.path
+
+import hip as hip_main
+from hip import hip, hiprtc
+
+from GPUSimulators import Common
+from GPUSimulators.gpu.Context import Context
+
+
+class HIPContext(Context):
+    """
+    Class that manages the HIP context.
+
+    Kernel compilation and module loading are not implemented yet; see the
+    ``TODO`` markers in ``get_module``.
+    """
+
+    def __init__(self, device=0, context_flags=None, use_cache=True, autotuning=False):
+        """
+        Creates a new HIP context.
+
+        Args:
+            device: Index of the device whose properties are queried and logged.
+            context_flags: Forwarded to the base class.
+            use_cache: Whether the on-disk kernel cache is used.
+            autotuning: Must be ``False``; autotuning is not available for HIP yet.
+
+        Raises:
+            RuntimeError: If a HIP call reports an error.
+            NotImplementedError: If ``autotuning`` is requested.
+        """
+        super().__init__("hip", device, context_flags, use_cache, autotuning)
+
+        # Log information about HIP version
+        self.logger.info(f"HIP Python version {hip_main.HIP_VERSION_NAME}")
+        self.logger.info(f"ROCm version {hip_main.ROCM_VERSION_NAME}")
+
+        # Device information
+        props = hip.hipDeviceProp_t()
+        self.__hip_check(hip.hipGetDeviceProperties(props, device))
+        device_count = self.__hip_check(hip.hipGetDeviceCount())
+        arch = props.gcnArchName
+        self.logger.info(
+            f"Using device {device}/{device_count} '{props.name.decode('ascii')} ({arch.decode('ascii')})'"
+            + f" ({props.pciBusID})"
+        )
+        self.logger.debug(f" => total available memory: {int(props.totalGlobalMem / pow(1024, 2))} MiB")
+
+        if autotuning:
+            self.logger.info(
+                "Autotuning enabled. It may take several minutes to run the code the first time: have patience")
+            # TODO Implement Autotuner for HIP
+            # self.autotuner = Autotuner.Autotuner()
+            raise NotImplementedError("Autotuner is not yet implemented for HIP.")
+
+    def __hip_check(self, call_request):
+        """
+        Function that checks if the HIP function executed successfully.
+
+        Args:
+            call_request: The tuple returned by a HIP call; the first element
+                is the error code, the remaining elements are the results.
+
+        Returns:
+            The single result if there is exactly one, otherwise the tuple of
+            results (empty when the call only returns an error code).
+
+        Raises:
+            RuntimeError: If the error code is a ``hipError_t`` other than ``hipSuccess``.
+        """
+
+        err = call_request[0]
+        result = call_request[1:]
+        if len(result) == 1:
+            result = result[0]
+        if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
+            self.logger.error(f"HIP Error: {str(err)}")
+            raise RuntimeError(str(err))
+        return result
+
+    def get_module(self, kernel_filename: str,
+                   include_dirs: list[str] = None,
+                   defines: dict = None,
+                   compile_args: dict = None,
+                   jit_compile_args: dict = None):
+        """
+        Reads a ``.hip`` file and creates a HIP kernel from that.
+
+        Compilation and module loading are still ``TODO``: the kernel hash and
+        the generated kernel source are computed, but the returned module is
+        currently ``None``.
+
+        Args:
+            kernel_filename: The file to use for the kernel, relative to this package directory.
+            include_dirs: List of directories for the ``#include``s referenced.
+            defines: Adds ``#define`` tags to the kernel, such as: ``#define key value``.
+            compile_args: Other compiler options; currently only part of the cache hash.
+            jit_compile_args: Other just-in-time compilation options; currently unused.
+
+        Returns:
+            The kernel module; ``None`` until compilation is implemented.
+        """
+        if defines is None:
+            defines = {}
+        if include_dirs is None:
+            include_dirs = []
+        if compile_args is None:
+            compile_args = {'no_extern_c': True}
+        if jit_compile_args is None:
+            jit_compile_args = {}
+
+        # Not called yet; kept for the compile step that is still to be implemented
+        def compile_message_handler(compile_success_bool, info_str, error_str):
+            self.logger.debug(f"Compilation success: {str(compile_success_bool)}")
+            if info_str:
+                self.logger.debug(f"Compilation info: {info_str}")
+            if error_str:
+                self.logger.debug(f"Compilation error: {error_str}")
+
+        kernel_filename = os.path.normpath(kernel_filename)
+        kernel_path = os.path.abspath(os.path.join(self.module_path, kernel_filename))
+
+        # Create a hash of the kernel options
+        options_hasher = hashlib.md5()
+        options_hasher.update(str(defines).encode('utf-8') + str(compile_args).encode('utf-8'))
+        options_hash = options_hasher.hexdigest()
+
+        # Create hash of the kernel source
+        source_hash = self.hash_kernel(kernel_path, include_dirs=[self.module_path] + include_dirs)
+
+        # Create the final hash
+        root, ext = os.path.splitext(kernel_filename)
+        kernel_hash = root + "_" + source_hash + "_" + options_hash + ext
+        cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)
+
+        # Checking if the kernel is already in the hashmap
+        if kernel_hash in self.modules.keys():
+            self.logger.debug(f"Found kernel {kernel_filename} cached in hashmap ({kernel_hash})")
+            return self.modules[kernel_hash]
+        elif self.use_cache and os.path.isfile(cached_kernel_filename):
+            self.logger.debug(f"Found kernel {kernel_filename} cached on disk ({kernel_hash})")
+
+            with io.open(cached_kernel_filename, "rb") as file:
+                file_str = file.read()
+                # TODO add ``module`` to HIP (load it from ``file_str``)
+                module = None
+
+            self.modules[kernel_hash] = module
+            return module
+        else:
+            self.logger.debug(f"Compiling {kernel_filename} ({kernel_hash})")
+
+            # Create kernel string
+            kernel_string = ""
+            for key, value in defines.items():
+                kernel_string += f"#define {str(key)} {str(value)}\n"
+            kernel_string += f"#include \"{os.path.join(self.module_path, kernel_filename)}\""
+
+            if self.use_cache:
+                # ``kernel_filename`` may contain sub-directories, so create all of them
+                cached_kernel_dir = os.path.dirname(cached_kernel_filename)
+                os.makedirs(cached_kernel_dir, exist_ok=True)
+                with io.open(cached_kernel_filename + ".txt", "w") as file:
+                    file.write(kernel_string)
+
+            with Common.Timer("compiler") as timer:
+                import warnings
+                with warnings.catch_warnings():
+                    warnings.filterwarnings("ignore", message="The CUDA compiler succeeded, but said the following:\nkernel.cu", category=UserWarning)
+                    # TODO compile the binary file
+                    binary = None
+
+                # TODO get binary from buffer
+                module = None
+                # Only persist a real binary: writing ``None`` raises ``TypeError`` and
+                # leaves an empty cache file that a later call would treat as a cached kernel
+                if self.use_cache and binary is not None:
+                    with io.open(cached_kernel_filename, "wb") as file:
+                        file.write(binary)
+
+            self.modules[kernel_hash] = module
+            return module
+
+    def synchronize(self):
+        """
+        Calls ``hipDeviceSynchronize`` and raises ``RuntimeError`` if it reports an error.
+        """
+        self.__hip_check(hip.hipDeviceSynchronize())
+
+
+if __name__ == "__main__":
+    # Manual smoke test. Guarded so that importing this module does not
+    # create a HIP context (and fail on machines without a HIP device).
+    test = HIPContext()
--- a/GPUSimulators/gpu/init.py
+++ b/GPUSimulators/gpu/init.py
--- a/mpiTesting.py
+++ b/mpiTesting.py
@ -34,10 +34,10 @@ from mpi4py import MPI
 import pycuda.driver as cuda

 # Simulator engine etc
-from GPUSimulators import MPISimulator, Common, CudaContext
+from GPUSimulators import MPISimulator, Common
+from GPUSimulators.gpu import CudaContext
 from GPUSimulators import EE2D_KP07_dimsplit
 from GPUSimulators.helpers import InitialConditions as IC
-from GPUSimulators.Simulator import BoundaryCondition as BC

 import argparse
 parser = argparse.ArgumentParser(description='Strong and weak scaling experiments.')
--- a/shmemTesting.py
+++ b/shmemTesting.py
@ -22,17 +22,12 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.

 import numpy as np
 import gc
-import time
-import json
 import logging

 #Simulator engine etc
-from GPUSimulators import SHMEMSimulatorGroup, Common, CudaContext
+from GPUSimulators import SHMEMSimulatorGroup, Common
 from GPUSimulators import EE2D_KP07_dimsplit
 from GPUSimulators.helpers import InitialConditions as IC
-from GPUSimulators.Simulator import BoundaryCondition as BC
-
-

 ####
 #Initialize logging 
--- a/singleGPUTesting.py
+++ b/singleGPUTesting.py
@ -23,16 +23,15 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 import numpy as np
 import gc
 import logging
-import os

 # CUDA
 import pycuda.driver as cuda

 # Simulator engine etc
-from GPUSimulators import Common, CudaContext
+from GPUSimulators import Common
+from GPUSimulators.gpu import CudaContext
 from GPUSimulators import EE2D_KP07_dimsplit
 from GPUSimulators.helpers import InitialConditions as IC
-from GPUSimulators.Simulator import BoundaryCondition as BC

 import argparse
 parser = argparse.ArgumentParser(description='Single GPU testing.')