fix(common): handle copying arrays on HIP

2025-10-31 20:27:40 +01:00 · 2025-07-01 19:43:34 +02:00 · 2025-07-01 19:43:34 +02:00 · 5c60978614
commit 5c60978614
parent 30aaccba91
8 changed files with 75 additions and 22 deletions
--- a/GPUSimulators/common/arrays/init.py
+++ b/GPUSimulators/common/arrays/init.py
@ -3,6 +3,9 @@ from os import environ
 __env_name = 'GPU_LANG'

 if __env_name in environ and environ.get(__env_name).lower() == "cuda":
-    from .cuda import *
+    from .cuda.arkawa2d import CudaArakawaA2D as ArakawaA2D
+    from .cuda.array2d import CudaArray2D as Array2D
+    from .cuda.array3d import CudaArray3D as Array3D
 else:
-    from .hip import *
+    from .hip.arkawa2d import HIPArakawaA2D as ArakawaA2D
+    from .hip.array2d import HIPArray2D as Array2D
--- a/GPUSimulators/common/arrays/arkawa2d.py
+++ b/GPUSimulators/common/arrays/arkawa2d.py
@ -37,5 +37,4 @@ class BaseArakawaA2D(object):
                raise IndexError(f"Variable {i} is out of range")
            cpu_variables += [self.gpu_variables[i].download(stream, asynch=True)]

-        # stream.synchronize()
        return cpu_variables
--- a/GPUSimulators/common/arrays/array2d.py
+++ b/GPUSimulators/common/arrays/array2d.py
@ -40,6 +40,12 @@ class BaseArray2D(object):
        if np.isfortran(cpu_data):
            raise TypeError("Wrong datatype (Fortran, expected C)")

+    def get_strides(self) -> tuple[int, ...]:
+        """
+        Gets the byte strides of the array, one entry per dimension. Subclasses must override this.
+        """
+        raise NotImplementedError("This function needs to be implemented in a subclass.")
+
    def check(self, x, y, nx, ny, cpu_data):
        if nx != cpu_data.shape[1]:
            raise ValueError
--- a/GPUSimulators/common/arrays/cuda/init.py
+++ b/GPUSimulators/common/arrays/cuda/init.py
@ -1,3 +0,0 @@
-from .arkawa2d import CudaArakawaA2D as ArakawaA2D
-from .array2d import CudaArray2D as Array2D
-from .array3d import CudaArray3D as Array3D
--- a/GPUSimulators/common/arrays/cuda/array2d.py
+++ b/GPUSimulators/common/arrays/cuda/array2d.py
@ -108,3 +108,6 @@ class CudaArray2D(BaseArray2D):
        copy.height = int(ny)

        copy(stream)
+
+    def get_strides(self) -> tuple[int, ...]:
+        return tuple(self.data.strides)  # whole per-axis tuple, as annotated; callers index [0] for the pitch
--- a/GPUSimulators/common/arrays/hip/init.py
+++ b/GPUSimulators/common/arrays/hip/init.py
@ -1,3 +0,0 @@
-from .arkawa2d import HIPArakawaA2D as ArakawaA2D
-from .array2d import HIPArray2D as Array2D
-# from .array3d import HIPArray3D as Array3D
--- a/GPUSimulators/common/arrays/hip/arkawa2d.py
+++ b/GPUSimulators/common/arrays/hip/arkawa2d.py
@ -1,7 +1,7 @@
 import numpy as np
 from hip import hip, hipblas

-from ....common import hip_check
+from ...hip_check import hip_check
 from ..arkawa2d import BaseArakawaA2D
 from .array2d import HIPArray2D

@ -13,10 +13,10 @@ def _sum_array(array: HIPArray2D):
        array: A HIPArray2D to compute the sum of.
    """
    data_h = array.data_h
-    num_bytes = array.dtype.itemsize
+    num_bytes = array.num_bytes

    result_d = hip_check(hip.hipMalloc(num_bytes))
-    result_h = array.dtype.type(0)
+    result_h = np.zeros(1, dtype=array.dtype)

    # Sum the ``data_h`` array using hipblas
    handle = hip_check(hipblas.hipblasCreate())
--- a/GPUSimulators/common/arrays/hip/array2d.py
+++ b/GPUSimulators/common/arrays/hip/array2d.py
@ -55,16 +55,15 @@ class HIPArray2D(BaseArray2D):
        if cpu_data is None:
            # self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
            # Allocate host memory
-            cpu_data = np.empty((ny, nx), dtype=self.dtype)
+            cpu_data = np.zeros((ny, nx), dtype=self.dtype)

-        self.check(x, y, nx, ny, cpu_data)
+        copy_args = hip.hip_Memcpy2D(**self.__get_copy_info(x, y, nx, ny, cpu_data, True))
+
+        hip_check(hip.hipMemcpyParam2DAsync(copy_args, stream))

        if not asynch:
            hip_check(hip.hipStreamSynchronize(stream))

-        hip_check(
-            hip.hipMemcpyAsync(self.data, cpu_data, self.num_bytes, hip.hipMemcpyKind.hipMemcpyDeviceToHost, stream))
-
        return cpu_data

    def upload(self, stream, cpu_data, extent=None):
@ -76,9 +75,58 @@ class HIPArray2D(BaseArray2D):
        else:
            x, y, nx, ny = extent

-        self.check(x, y, nx, ny, cpu_data)
+        copy_param = hip.hip_Memcpy2D(**self.__get_copy_info(x, y, nx, ny, cpu_data))

-        # TODO implement non-async to test if it actually works - avoid errors
-        # Create a copy object from device to host
-        hip_check(hip.hipMemcpyAsync(self.data, self.data_h, self.num_bytes, hip.hipMemcpyKind.hipMemcpyHostToDevice,
-                                     stream))
+        hip_check(hip.hipMemcpyParam2DAsync(copy_param, stream))
+
+    def get_strides(self) -> tuple[int, ...]:
+        """Gets the byte strides of the C-ordered buffer, one per dimension; entry 0 is the row pitch."""
+        # A step along axis i skips every element of the trailing axes, not shape[i] elements.
+        shape, itemsize = self.data_h.shape, np.dtype(self.dtype).itemsize
+        trailing = (int(np.prod(shape[i + 1:], dtype=np.int64)) for i in range(len(shape)))
+        return tuple(itemsize * count for count in trailing)
+
+    def __get_copy_info(self, x, y, nx, ny, host, to_host=False):
+        """Runs ``self.check`` on the extent, then builds the ``src*``/``dst*`` keyword arguments of a 2D copy."""
+        self.check(x, y, nx, ny, host)
+
+        # Host side: [key suffix, x offset in bytes, y offset in rows, memory type, buffer, row pitch]
+        src_args = [
+            'Host',
+            0,
+            0,
+            hip.hipMemoryType.hipMemoryTypeHost,
+            host,
+            host.strides[0]
+        ]
+        # Device side, same layout. NOTE(review): item size is hard-coded to float32 - confirm dtype.
+        dst_args = [
+            'Device',
+            int(x) * np.float32().itemsize,
+            int(y),
+            hip.hipMemoryType.hipMemoryTypeDevice,
+            self.data,
+            self.get_strides()[0],
+        ]
+
+        # Host -> device by default; when copying to the host the two descriptions trade places.
+        if to_host:
+            src_args, dst_args = dst_args, src_args
+
+        args = {
+            'srcXInBytes': src_args[1],
+            'srcY': src_args[2],
+            'srcMemoryType': src_args[3],
+            f'src{src_args[0]}': src_args[4],
+            'srcPitch': src_args[5],
+
+            'dstXInBytes': dst_args[1],
+            'dstY': dst_args[2],
+            'dstMemoryType': dst_args[3],
+            f'dst{dst_args[0]}': dst_args[4],
+            'dstPitch': dst_args[5],
+
+            'WidthInBytes': int(nx) * np.float32().itemsize,
+            'Height': int(ny)
+        }
+        return args