feat(common): add HIP array2d and arkawa2d

Anthony Berg 2025-06-25 13:11:52 +02:00
parent 985e774979
commit 01b39cc9b4
4 changed files with 146 additions and 8 deletions

View File

@@ -8,7 +8,7 @@ class BaseArray2D(object):
     A base class that holds 2D data. To be used depending on the GPGPU language.
     """
-    def __init__(self, nx, ny, x_halo, y_halo, cpu_data=None, ):
+    def __init__(self, nx, ny, x_halo, y_halo, cpu_data=None):
         """
         Uploads initial data to the CUDA device
         """
@@ -19,18 +19,20 @@ class BaseArray2D(object):
         self.x_halo = x_halo
         self.y_halo = y_halo
-        self.nx_halo = nx + 2 * x_halo
-        self.ny_halo = ny + 2 * y_halo
+        nx_halo = nx + 2 * x_halo
+        ny_halo = ny + 2 * y_halo
+        self.shape = (nx_halo, ny_halo)

         # If we don't have any data, just allocate and return
         if cpu_data is None:
             return

         # Make sure data is in proper format
-        if cpu_data.shape != (self.ny_halo, self.nx_halo) and cpu_data.shape != (self.ny, self.nx):
+        if cpu_data.shape != (ny_halo, nx_halo) and cpu_data.shape != (self.ny, self.nx):
             raise ValueError(
                 f"Wrong shape of data {str(cpu_data.shape)} vs {str((self.ny, self.nx))} / "
-                + f"{str((self.ny_halo, self.nx_halo))}")
+                + f"{str((ny_halo, nx_halo))}")

         if cpu_data.itemsize != 4:
             raise ValueError("Wrong size of data type")

View File

@@ -20,14 +20,14 @@ class CudaArray2D(BaseArray2D):
         super().__init__(nx, ny, x_halo, y_halo, cpu_data)

         # self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny)
         # Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
-        self.data = pycuda.gpuarray.zeros((self.ny_halo, self.nx_halo), dtype)
+        self.data = pycuda.gpuarray.zeros(self.shape, dtype)

         # For returning to download
         self.memorypool = PageLockedMemoryPool()

         # Create a copy object from host to device
-        x = (self.nx_halo - cpu_data.shape[1]) // 2
-        y = (self.ny_halo - cpu_data.shape[0]) // 2
+        x = (self.shape[0] - cpu_data.shape[1]) // 2
+        y = (self.shape[1] - cpu_data.shape[0]) // 2
         self.upload(stream, cpu_data, extent=[x, y, cpu_data.shape[1], cpu_data.shape[0]])

         # self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)

View File

@@ -0,0 +1,55 @@
import numpy as np

from hip import hip, hipblas

from ....common import hip_check
from ..arkawa2d import BaseArakawaA2D
from .array2d import HIPArray2D

def _sum_array(array: HIPArray2D):
    """
    Sums the absolute values of all elements in a HIPArray2D using hipBLAS.

    Note: hipblasSasum computes a sum of absolute values and operates on
    float32 data.

    Args:
        array: A HIPArray2D to compute the sum of.
    """
    num_elements = array.data_h.size
    num_bytes = array.dtype.itemsize
    result_d = hip_check(hip.hipMalloc(num_bytes))
    result_h = np.zeros(1, dtype=array.dtype)

    # Sum the device-side array using hipblas. The result is written to
    # device memory, so the handle must be put in device pointer mode.
    handle = hip_check(hipblas.hipblasCreate())
    hip_check(hipblas.hipblasSetPointerMode(handle, hipblas.hipblasPointerMode_t.HIPBLAS_POINTER_MODE_DEVICE))
    hip_check(hipblas.hipblasSasum(handle, num_elements, array.data_d, 1, result_d))
    hip_check(hipblas.hipblasDestroy(handle))

    # Copy the result back from the device
    hip_check(hip.hipMemcpy(result_h, result_d, num_bytes, hip.hipMemcpyKind.hipMemcpyDeviceToHost))
    hip_check(hip.hipFree(result_d))

    return result_h[0]

class ArakawaA2D(BaseArakawaA2D):
    """
    A class representing an Arakawa A type (unstaggered, logically Cartesian) grid
    """

    def __init__(self, stream, nx, ny, halo_x, halo_y, cpu_variables):
        """
        Uploads initial data to the GPU device
        """
        super().__init__(stream, nx, ny, halo_x, halo_y, cpu_variables)

    def check(self):
        """
        Checks that data is still sane
        """
        for i, gpu_variable in enumerate(self.gpu_variables):
            var_sum = _sum_array(gpu_variable)
            self.logger.debug(f"Data {i} with size [{gpu_variable.nx} x {gpu_variable.ny}] "
                              + f"has average {var_sum / (gpu_variable.nx * gpu_variable.ny)}")

            if np.isnan(var_sum):
                raise ValueError("Data contains NaN values!")

View File

@@ -0,0 +1,81 @@
import ctypes

import numpy as np

from hip import hip, hipblas

from GPUSimulators.common import hip_check
from GPUSimulators.common.arrays.array2d import BaseArray2D

class HIPArray2D(BaseArray2D):
    """
    Class that holds 2D HIP data
    """

    def __init__(self, stream, nx, ny, x_halo, y_halo, cpu_data=None, dtype: np.dtype = np.float32):
        """
        Uploads initial data to the HIP device
        """
        super().__init__(nx, ny, x_halo, y_halo, cpu_data)
        # self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny)

        self.dtype = dtype
        self.data_h = np.zeros(self.shape, self.dtype)
        self.num_bytes = self.data_h.size * self.data_h.itemsize
        self.data_d = hip_check(hip.hipMalloc(self.num_bytes)).configure(
            typestr=np.finfo(dtype).dtype.name, shape=self.shape
        )

        # Copy the initial data (if any) to the device, centred in the halo
        if cpu_data is not None:
            x = (self.shape[0] - cpu_data.shape[1]) // 2
            y = (self.shape[1] - cpu_data.shape[0]) // 2
            self.upload(stream, cpu_data, extent=[x, y, cpu_data.shape[1], cpu_data.shape[0]])
        # self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)

    def __del__(self, *args):
        # self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
        hip_check(hip.hipFree(self.data_d))

    def download(self, stream, cpu_data=None, asynch=False, extent=None):
        """
        Enables downloading data from GPU to Python
        """
        if extent is None:
            x = self.x_halo
            y = self.y_halo
            nx = self.nx
            ny = self.ny
        else:
            x, y, nx, ny = extent

        if cpu_data is None:
            # self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
            # Allocate host memory
            cpu_data = np.empty((ny, nx), dtype=self.dtype)

        self.check(x, y, nx, ny, cpu_data)

        # Copy the full halo-padded buffer from the device into the host
        # staging array (note: hipMemcpyAsync takes the destination first)
        hip_check(hip.hipMemcpyAsync(self.data_h, self.data_d, self.num_bytes,
                                     hip.hipMemcpyKind.hipMemcpyDeviceToHost, stream))

        if not asynch:
            hip_check(hip.hipStreamSynchronize(stream))

        # Extract the requested extent from the staging array
        # (only valid once the copy above has completed)
        cpu_data[...] = self.data_h[y:y + ny, x:x + nx]

        return cpu_data

    def upload(self, stream, cpu_data, extent=None):
        """
        Enables uploading data from Python to the GPU
        """
        if extent is None:
            x = self.x_halo
            y = self.y_halo
            nx = self.nx
            ny = self.ny
        else:
            x, y, nx, ny = extent

        self.check(x, y, nx, ny, cpu_data)

        # Stage the new data into the host buffer at the requested extent,
        # then copy the full buffer from host to device
        self.data_h[y:y + ny, x:x + nx] = cpu_data
        hip_check(hip.hipMemcpyAsync(self.data_d, self.data_h, self.num_bytes,
                                     hip.hipMemcpyKind.hipMemcpyHostToDevice, stream))
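
Finally, a hypothetical usage sketch of the new class. The import path, stream handling, and sizes are illustrative (and assume the constructor and transfer fixes above); a square 16 x 16 interior is used so the (nx, ny) vs (ny, nx) ordering is immaterial:

import numpy as np
from hip import hip
from GPUSimulators.common import hip_check
from GPUSimulators.common.arrays.hip.array2d import HIPArray2D  # hypothetical module path

stream = hip_check(hip.hipStreamCreate())

# 16 x 16 interior with a halo of 2 ghost cells on every side
cpu_data = np.random.rand(16, 16).astype(np.float32)
arr = HIPArray2D(stream, 16, 16, 2, 2, cpu_data=cpu_data)

# Round-trip: download the interior region and compare with the input
downloaded = arr.download(stream)   # asynch=False, so this blocks until done
assert np.allclose(downloaded, cpu_data)

hip_check(hip.hipStreamDestroy(stream))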