feat(kernel): add basic HIPContext

This commit is contained in:
Anthony Berg 2025-06-24 16:04:48 +02:00
parent 0fa04dbcec
commit 8f24cd45ea
13 changed files with 334 additions and 58 deletions

View File

@ -29,7 +29,8 @@ from tqdm.auto import tqdm
import pycuda.driver as cuda
from GPUSimulators import Common, Simulator, CudaContext
from GPUSimulators import Common, Simulator
from GPUSimulators.gpu import CudaContext
class Autotuner:

View File

@ -26,7 +26,8 @@ from IPython.core import magic_arguments
from IPython.core.magic import line_magic, Magics, magics_class
import pycuda.driver as cuda
from GPUSimulators import Common, CudaContext
from GPUSimulators import Common
from GPUSimulators.gpu import CudaContext
@magics_class

View File

@ -21,8 +21,9 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
from GPUSimulators import CudaContext, Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
from GPUSimulators import Simulator, Common
from GPUSimulators.gpu import CudaContext
from GPUSimulators.Simulator import BoundaryCondition
import numpy as np
from pycuda import gpuarray
@ -33,12 +34,12 @@ class LxF (Simulator.BaseSimulator):
Class that solves the SW equations using the Lax Friedrichs scheme
"""
def __init__(self,
context: CudaContext,
h0: float, hu0: float, hv0: float,
nx: int, ny: int,
dx: int, dy: int,
g: float,
def __init__(self,
context: CudaContext,
h0: float, hu0: float, hv0: float,
nx: int, ny: int,
dx: int, dy: int,
g: float,
cfl_scale: float=0.9,
boundary_conditions=BoundaryCondition(),
block_width: int=16, block_height: int=16,

View File

@ -20,13 +20,9 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import logging
from GPUSimulators import Simulator, CudaContext
from GPUSimulators import Simulator
import numpy as np
import pycuda.driver as cuda
import time
class SHMEMSimulator(Simulator.BaseSimulator):
"""

View File

@ -20,13 +20,12 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import logging
from GPUSimulators import Simulator, CudaContext
from GPUSimulators import Simulator
from GPUSimulators.gpu import CudaContext
import numpy as np
import pycuda.driver as cuda
import time
class SHMEMGrid(object):
"""

View File

@ -25,11 +25,10 @@ import numpy as np
import logging
from enum import IntEnum
import pycuda.compiler as cuda_compiler
import pycuda.gpuarray
import pycuda.driver as cuda
from GPUSimulators import Common, CudaContext
from GPUSimulators import Common
from GPUSimulators.gpu import CudaContext
class BoundaryCondition(object):
@ -100,10 +99,10 @@ class BoundaryCondition(object):
class BaseSimulator(object):
def __init__(self,
context: CudaContext,
nx: int, ny: int,
dx: int, dy: int,
def __init__(self,
context: CudaContext,
nx: int, ny: int,
dx: int, dy: int,
boundary_conditions: BoundaryCondition,
cfl_scale: float,
num_substeps: int,

View File

@ -0,0 +1,115 @@
import logging
import os
import io
import re
import logging
from hashlib import md5
class Context(object):
    """
    Base class that manages either a HIP or CUDA context.

    It holds the state shared by the backends (logger, in-memory module
    cache, on-disk cache directory) and the kernel hashing logic.
    ``get_module`` and ``synchronize`` must be implemented by subclasses.
    """

    def __init__(self, language: str, device=0, context_flags=None, use_cache=True, autotuning=True):
        """
        Create a new context.

        Args:
            language: Backend name; used as prefix of the cache directory
                (``<module dir>/<language>_cache``).
            device: Not used by the base class; accepted for subclasses.
            context_flags: Not used by the base class; accepted for subclasses.
            use_cache: When true, the cache directory is created if missing.
            autotuning: Not used by the base class; ``self.autotuner`` is
                always initialised to ``None`` here.
        """
        self.use_cache = use_cache
        self.logger = logging.getLogger(__name__)
        self.modules = {}
        self.module_path = os.path.dirname(os.path.realpath(__file__))
        self.autotuner = None

        # Creates cache directory if specified
        self.cache_path = os.path.join(self.module_path, f"{language}_cache")
        if self.use_cache:
            # exist_ok avoids the check-then-create race when several
            # processes start up at the same time.
            os.makedirs(self.cache_path, exist_ok=True)
            self.logger.info(f"Using cache dir {self.cache_path}")

    def __del__(self):
        """
        Cleans up the context.

        The base class owns no resources, so there is nothing to release.
        """
        pass

    def __str__(self):
        """
        Gives a printable identifier for the context.

        The base class has no backend handle, so the class name is returned;
        ``__str__`` must return a string, otherwise ``str(context)`` raises
        ``TypeError``. Subclasses may override this to return a context id.
        """
        return type(self).__name__

    def hash_kernel(self, kernel_filename: str, include_dirs: list[str]) -> str:
        """
        Generate a kernel ID for the caches.

        The hash covers the content and the modification time of the kernel
        file and of every file found through its ``#include`` lines.

        Args:
            kernel_filename: Path to the kernel file.
            include_dirs: Directories to search for ``#include`` in the kernel file.

        Returns:
            MD5 hash (hex digest) for the kernel in the cache.

        Raises:
            RuntimeError: When more than 100 ``#include``s have been followed,
                which most likely means there is a circular include.
        """
        num_includes = 0
        max_includes = 100
        kernel_hasher = md5()

        # Loop over files and includes, and check if something has changed
        files = [kernel_filename]
        while len(files):
            if num_includes > max_includes:
                raise RuntimeError(f"Maximum number of includes reached.\n"
                                   + f"Potential circular include in {kernel_filename}?")

            filename = files.pop()
            modified = os.path.getmtime(filename)

            # Open the file
            with io.open(filename, "r") as file:
                # Search for ``#include <reference>`` and also hash the file
                file_str = file.read()
                kernel_hasher.update(file_str.encode('utf-8'))
                kernel_hasher.update(str(modified).encode('utf-8'))

                # Find all the includes (raw string: ``\W`` is a regex class,
                # not a Python string escape)
                includes = re.findall(r'^\W*#include\W+(.+?)\W*$', file_str, re.M)

                # Iterate through everything that looks like is an ``include``
                for include_file in includes:
                    # Search through ``include`` directories for the file,
                    # starting with the directory of the including file
                    file_path = os.path.dirname(filename)
                    for include_path in [file_path] + include_dirs:
                        # If found, add it to the list of files to check
                        temp_path = os.path.join(include_path, include_file)
                        if os.path.isfile(temp_path):
                            files.append(temp_path)
                            # To avoid circular includes
                            num_includes = num_includes + 1
                            break

        return kernel_hasher.hexdigest()

    def get_module(self, kernel_filename: str,
                   include_dirs: list[str] = None,
                   defines: dict = None,
                   compile_args: dict = None,
                   jit_compile_args: dict = None):
        """
        Reads a text file and creates a kernel from that.

        Args:
            kernel_filename: The file to use for the kernel.
            include_dirs: List of directories for the ``#include``s referenced.
            defines: Mapping of ``#define`` names to values.
            compile_args: Additional compiler options.
            jit_compile_args: Additional just-in-time compilation options.

        Raises:
            NotImplementedError: Always; subclasses provide the implementation.
        """
        raise NotImplementedError("Needs to be implemented in subclass")

    def synchronize(self):
        """
        Synchronizes all the streams, etc.

        Raises:
            NotImplementedError: Always; subclasses provide the implementation.
        """
        raise NotImplementedError("Needs to be implemented in subclass")

View File

@ -34,6 +34,7 @@ import pycuda.gpuarray
import pycuda.driver as cuda
from GPUSimulators import Autotuner, Common
from GPUSimulators.gpu.Context import Context
class CudaContext(object):
@ -85,13 +86,13 @@ class CudaContext(object):
#Create cache dir for cubin files
self.cache_path = os.path.join(self.module_path, "cuda_cache")
if (self.use_cache):
if self.use_cache:
if not os.path.isdir(self.cache_path):
os.mkdir(self.cache_path)
self.logger.info("Using CUDA cache dir %s", self.cache_path)
self.autotuner = None
if (autotuning):
if autotuning:
self.logger.info("Autotuning enabled. It may take several minutes to run the code the first time: have patience")
self.autotuner = Autotuner.Autotuner()
@ -100,9 +101,9 @@ class CudaContext(object):
# Loop over all contexts in stack, and remove "this"
other_contexts = []
while (cuda.Context.get_current() != None):
while cuda.Context.get_current() is not None:
context = cuda.Context.get_current()
if (context.handle != self.cuda_context.handle):
if context.handle != self.cuda_context.handle:
self.logger.debug("<%s> Popping <%s> (*not* ours)", str(self.cuda_context.handle), str(context.handle))
other_contexts = [context] + other_contexts
cuda.Context.pop()
@ -145,7 +146,7 @@ class CudaContext(object):
files = [kernel_filename]
while len(files):
if (num_includes > max_includes):
if num_includes > max_includes:
raise RuntimeError("Maximum number of includes reached - circular include in {:}?".format(kernel_filename))
filename = files.pop()
@ -165,33 +166,33 @@ class CudaContext(object):
#Find all includes
includes = re.findall('^\W*#include\W+(.+?)\W*$', file_str, re.M)
# Loop over everything that looks like an include
# Loop over everything that looks like an 'include'
for include_file in includes:
#Search through include directories for the file
# Search through 'include' directories for the file
file_path = os.path.dirname(filename)
for include_path in [file_path] + include_dirs:
# If we find it, add it to list of files to check
# If we find it, add it to a list of files to check
temp_path = os.path.join(include_path, include_file)
if (os.path.isfile(temp_path)):
if os.path.isfile(temp_path):
files = files + [temp_path]
num_includes = num_includes + 1 #For circular includes...
break
return kernel_hasher.hexdigest()
def get_module(self, kernel_filename: str,
include_dirs: list[str]=[], \
defines: dict={}, \
compile_args: dict={'no_extern_c': True}, jit_compile_args: dict={}) -> cuda.Module:
def get_module(self, kernel_filename: str,
include_dirs: dict=None,
defines:list[str]=None,
compile_args:dict=None, jit_compile_args:dict=None) -> cuda.Module:
"""
Reads a text file and creates an OpenCL kernel from that.
Args:
kernel_filename: The file to use for the kernel.
include_dirs: List of directories for the ``#include``s referenced.
defines: Adds ``#define`` tags to the kernel, such as: ``#define key value``.
defines: Adds ``#define`` tags to the kernel, such as ``#define key value``.
compile_args: Adds other compiler options (parameters) for ``pycuda.compiler.compile()``.
jit_compile_args: Adds other just-in-time compilation options (parameters)
for ``pycuda.driver.module_from_buffer()``.
@ -200,6 +201,15 @@ class CudaContext(object):
The kernel module (pycuda.driver.Module).
"""
if defines is None:
defines = {}
if include_dirs is None:
include_dirs = []
if compile_args is None:
compile_args = {'no_extern_c': True}
if jit_compile_args is None:
jit_compile_args = {}
def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
"""
Helper function to print compilation output
@ -217,13 +227,13 @@ class CudaContext(object):
# Create a hash of the kernel options
options_hasher = hashlib.md5()
options_hasher.update(str(defines).encode('utf-8') + str(compile_args).encode('utf-8'));
options_hasher.update(str(defines).encode('utf-8') + str(compile_args).encode('utf-8'))
options_hash = options_hasher.hexdigest()
# Create hash of kernel souce
source_hash = self.hash_kernel( \
kernel_path, \
include_dirs=[self.module_path] + include_dirs)
source_hash = self.hash_kernel(
kernel_path,
include_dirs=[self.module_path] + include_dirs)
# Create final hash
root, ext = os.path.splitext(kernel_filename)
@ -234,12 +244,12 @@ class CudaContext(object):
cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)
# If we have the kernel in our hashmap, return it
if (kernel_hash in self.modules.keys()):
if kernel_hash in self.modules.keys():
self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash)
return self.modules[kernel_hash]
# If we have it on disk, return it
elif (self.use_cache and os.path.isfile(cached_kernel_filename)):
elif self.use_cache and os.path.isfile(cached_kernel_filename):
self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash)
with io.open(cached_kernel_filename, "rb") as file:
@ -258,7 +268,7 @@ class CudaContext(object):
for key, value in defines.items():
kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename))
if (self.use_cache):
if self.use_cache:
cached_kernel_dir = os.path.dirname(cached_kernel_filename)
if not os.path.isdir(cached_kernel_dir):
os.mkdir(cached_kernel_dir)
@ -272,7 +282,7 @@ class CudaContext(object):
warnings.filterwarnings("ignore", message="The CUDA compiler succeeded, but said the following:\nkernel.cu", category=UserWarning)
cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, cache_dir=False, **compile_args)
module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler, **jit_compile_args)
if (self.use_cache):
if self.use_cache:
with io.open(cached_kernel_filename, "wb") as file:
file.write(cubin)

View File

@ -0,0 +1,160 @@
import hashlib
import io
import os.path
import hip as hip_main
from hip import hip, hiprtc
from GPUSimulators import Common
from GPUSimulators.gpu.Context import Context
class HIPContext(Context):
    """
    Class that manages the HIP context.

    Kernel compilation and module loading are not implemented yet (see the
    TODOs in ``get_module``); the hashing and caching scaffolding is in place.
    """

    def __init__(self, device=0, context_flags=None, use_cache=True, autotuning=False):
        """
        Creates a new HIP context.

        Args:
            device: Index of the device whose properties are queried and logged.
            context_flags: Forwarded to the base class.
            use_cache: Forwarded to the base class (controls the on-disk cache).
            autotuning: Must be false; the autotuner is not implemented for HIP.

        Raises:
            RuntimeError: If a HIP call reports an error.
            NotImplementedError: If ``autotuning`` is requested.
        """
        super().__init__("hip", device, context_flags, use_cache, autotuning)

        # Log information about HIP version
        self.logger.info(f"HIP Python version {hip_main.HIP_VERSION_NAME}")
        self.logger.info(f"ROCm version {hip_main.ROCM_VERSION_NAME}")

        # Device information
        props = hip.hipDeviceProp_t()
        self.__hip_check(hip.hipGetDeviceProperties(props, device))
        device_count = self.__hip_check(hip.hipGetDeviceCount())
        arch = props.gcnArchName

        self.logger.info(
            f"Using device {device}/{device_count} '{props.name.decode('ascii')} ({arch.decode('ascii')})'"
            + f" ({props.pciBusID})"
        )
        self.logger.debug(f" => total available memory: {int(props.totalGlobalMem / pow(1024, 2))} MiB")

        if autotuning:
            self.logger.info(
                "Autotuning enabled. It may take several minutes to run the code the first time: have patience")
            raise NotImplementedError("Autotuner is not yet implemented for HIP.")
            # TODO Implement Autotuner for HIP
            # self.autotuner = Autotuner.Autotuner()

    def __hip_check(self, call_request):
        """
        Function that checks if the HIP function executed successfully.

        Args:
            call_request: Tuple returned by a HIP Python call; the first
                element is the status code, the rest are the results.

        Returns:
            The single result if there is exactly one, otherwise the tuple of
            results (empty when the call only returns a status).

        Raises:
            RuntimeError: If the status is a ``hipError_t`` other than ``hipSuccess``.
        """
        err = call_request[0]
        result = call_request[1:]
        if len(result) == 1:
            result = result[0]
        if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
            self.logger.error(f"HIP Error: {str(err)}")
            raise RuntimeError(str(err))
        return result

    def get_module(self, kernel_filename: str,
                   include_dirs: list[str] = None,
                   defines: dict = None,
                   compile_args: dict = None,
                   jit_compile_args: dict = None):
        """
        Reads a ``.hip`` file and creates a HIP kernel from that.

        Args:
            kernel_filename: The file to use for the kernel.
            include_dirs: List of directories for the ``#include``s referenced.
            defines: Adds ``#define`` tags to the kernel, such as: ``#define key value``.
            compile_args: Compiler options; currently only used as part of the
                cache hash because compilation is not implemented yet.
            jit_compile_args: Just-in-time compilation options; currently unused.

        Returns:
            The kernel module. Until compilation and loading are implemented
            (see TODOs below) this is always ``None``.
        """
        if defines is None:
            defines = {}
        if include_dirs is None:
            include_dirs = []
        if compile_args is None:
            compile_args = {'no_extern_c': True}
        if jit_compile_args is None:
            jit_compile_args = {}

        # Not called yet; kept for when the compile step is implemented.
        def compile_message_handler(compile_success_bool, info_str, error_str):
            self.logger.debug(f"Compilation success: {str(compile_success_bool)}")
            if info_str:
                self.logger.debug(f"Compilation info: {info_str}")
            if error_str:
                self.logger.debug(f"Compilation error: {error_str}")

        kernel_filename = os.path.normpath(kernel_filename)
        kernel_path = os.path.abspath(os.path.join(self.module_path, kernel_filename))

        # Create a hash of the kernel options
        options_hasher = hashlib.md5()
        options_hasher.update(str(defines).encode('utf-8') + str(compile_args).encode('utf-8'))
        options_hash = options_hasher.hexdigest()

        # Create hash of the kernel source
        source_hash = self.hash_kernel(kernel_path, include_dirs=[self.module_path] + include_dirs)

        # Create the final hash
        root, ext = os.path.splitext(kernel_filename)
        kernel_hash = root + "_" + source_hash + "_" + options_hash + ext
        cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)

        # Checking if the kernel is already in the hashmap
        if kernel_hash in self.modules.keys():
            self.logger.debug(f"Found kernel {kernel_filename} cached in hashmap ({kernel_hash})")
            return self.modules[kernel_hash]
        elif self.use_cache and os.path.isfile(cached_kernel_filename):
            self.logger.debug(f"Found kernerl {kernel_filename} cached on disk ({kernel_hash})")

            with io.open(cached_kernel_filename, "rb") as file:
                file_str = file.read()
                # TODO add ``module`` to HIP
                module = None

            self.modules[kernel_hash] = module
            return module
        else:
            self.logger.debug(f"Compiling {kernel_filename} ({kernel_hash})")

            # Create kernel string
            kernel_string = ""
            for key, value in defines.items():
                kernel_string += f"#define {str(key)} {str(value)}\n"
            kernel_string += f"#include \"{os.path.join(self.module_path, kernel_filename)}\""

            if self.use_cache:
                # ``kernel_filename`` may contain sub-directories, so create
                # the whole chain and tolerate it already existing.
                cached_kernel_dir = os.path.dirname(cached_kernel_filename)
                os.makedirs(cached_kernel_dir, exist_ok=True)
                with io.open(cached_kernel_filename + ".txt", "w") as file:
                    file.write(kernel_string)

            with Common.Timer("compiler") as timer:
                import warnings
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", message="The CUDA compiler succeeded, but said the following:\nkernel.cu", category=UserWarning)

                    # TODO compile the binary file
                    binary = None
                    # TODO get binary from buffer
                    module = None

                    # Only persist a real binary: writing ``None`` raises
                    # TypeError and would leave an empty file behind that the
                    # next call mistakes for a valid cached kernel.
                    if self.use_cache and binary is not None:
                        with io.open(cached_kernel_filename, "wb") as file:
                            file.write(binary)

            self.modules[kernel_hash] = module
            return module

    def synchronize(self):
        """
        Blocks until the device has finished all outstanding work.

        Raises:
            RuntimeError: If ``hipDeviceSynchronize`` reports an error.
        """
        self.__hip_check(hip.hipDeviceSynchronize())
# NOTE(review): this appears to be a leftover smoke test; it constructs a
# HIPContext (querying the HIP device) whenever this module is imported.
# Consider moving it under ``if __name__ == "__main__":`` or removing it.
test = HIPContext()

View File

View File

@ -34,10 +34,10 @@ from mpi4py import MPI
import pycuda.driver as cuda
# Simulator engine etc
from GPUSimulators import MPISimulator, Common, CudaContext
from GPUSimulators import MPISimulator, Common
from GPUSimulators.gpu import CudaContext
from GPUSimulators import EE2D_KP07_dimsplit
from GPUSimulators.helpers import InitialConditions as IC
from GPUSimulators.Simulator import BoundaryCondition as BC
import argparse
parser = argparse.ArgumentParser(description='Strong and weak scaling experiments.')

View File

@ -22,17 +22,12 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
import numpy as np
import gc
import time
import json
import logging
#Simulator engine etc
from GPUSimulators import SHMEMSimulatorGroup, Common, CudaContext
from GPUSimulators import SHMEMSimulatorGroup, Common
from GPUSimulators import EE2D_KP07_dimsplit
from GPUSimulators.helpers import InitialConditions as IC
from GPUSimulators.Simulator import BoundaryCondition as BC
####
#Initialize logging

View File

@ -23,16 +23,15 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
import numpy as np
import gc
import logging
import os
# CUDA
import pycuda.driver as cuda
# Simulator engine etc
from GPUSimulators import Common, CudaContext
from GPUSimulators import Common
from GPUSimulators.gpu import CudaContext
from GPUSimulators import EE2D_KP07_dimsplit
from GPUSimulators.helpers import InitialConditions as IC
from GPUSimulators.Simulator import BoundaryCondition as BC
import argparse
parser = argparse.ArgumentParser(description='Single GPU testing.')