fix(common): handling copying array on HIP

This commit is contained in:
Anthony Berg 2025-07-01 19:43:34 +02:00
parent 30aaccba91
commit 5c60978614
8 changed files with 75 additions and 22 deletions

View File

@ -3,6 +3,9 @@ from os import environ
# Name of the environment variable that selects the GPU backend.
__env_name = 'GPU_LANG'
# The CUDA backend is used only when GPU_LANG is set and equals "cuda"
# (compared case-insensitively); in every other case the HIP backend is used.
if __env_name in environ and environ.get(__env_name).lower() == "cuda":
from .cuda import *
# Re-export the CUDA implementations under backend-neutral names.
from .cuda.arkawa2d import CudaArakawaA2D as ArakawaA2D
from .cuda.array2d import CudaArray2D as Array2D
from .cuda.array3d import CudaArray3D as Array3D
else:
from .hip import *
# Re-export the HIP implementations under the same backend-neutral names.
from .hip.arkawa2d import HIPArakawaA2D as ArakawaA2D
from .hip.array2d import HIPArray2D as Array2D
# NOTE(review): no Array3D alias is bound in the HIP branch as far as this view
# shows, unlike the CUDA branch above — confirm nothing imports Array3D under HIP.

View File

@ -37,5 +37,4 @@ class BaseArakawaA2D(object):
raise IndexError(f"Variable {i} is out of range")
cpu_variables += [self.gpu_variables[i].download(stream, asynch=True)]
# stream.synchronize()
return cpu_variables

View File

@ -40,6 +40,12 @@ class BaseArray2D(object):
if np.isfortran(cpu_data):
raise TypeError("Wrong datatype (Fortran, expected C)")
def get_strides(self) -> tuple[int, ...]:
    """
    Return the number of bytes it takes to move to the next element.

    This base implementation is only a placeholder and never returns:
    concrete array classes are expected to override it.

    Raises:
        NotImplementedError: Always, because the base class has no storage
            to report strides for.
    """
    message = "This function needs to be implemented in a subclass."
    raise NotImplementedError(message)
def check(self, x, y, nx, ny, cpu_data):
if nx != cpu_data.shape[1]:
raise ValueError

View File

@ -1,3 +0,0 @@
from .arkawa2d import CudaArakawaA2D as ArakawaA2D
from .array2d import CudaArray2D as Array2D
from .array3d import CudaArray3D as Array3D

View File

@ -108,3 +108,6 @@ class CudaArray2D(BaseArray2D):
copy.height = int(ny)
copy(stream)
def get_strides(self) -> tuple[int, ...]:
    """
    Return the number of bytes it takes to move to the next element along
    each axis of ``self.data``.

    Returns:
        A tuple with one byte stride per axis, as reported by
        ``self.data.strides``. Index ``[0]`` gives the stride of the first
        axis.
    """
    # Return the whole strides tuple rather than only its first entry: the
    # declared return type is ``tuple[int, ...]``, and code that treats the
    # result as a tuple (e.g. ``get_strides()[0]``) would fail on a bare int.
    return tuple(self.data.strides)

View File

@ -1,3 +0,0 @@
from .arkawa2d import HIPArakawaA2D as ArakawaA2D
from .array2d import HIPArray2D as Array2D
# from .array3d import HIPArray3D as Array3D

View File

@ -1,7 +1,7 @@
import numpy as np
from hip import hip, hipblas
from ....common import hip_check
from ...hip_check import hip_check
from ..arkawa2d import BaseArakawaA2D
from .array2d import HIPArray2D
@ -13,10 +13,10 @@ def _sum_array(array: HIPArray2D):
array: A HIPArray2D to compute the sum of.
"""
data_h = array.data_h
num_bytes = array.dtype.itemsize
num_bytes = array.num_bytes
result_d = hip_check(hip.hipMalloc(num_bytes))
result_h = array.dtype.type(0)
result_h = np.zeros(1, dtype=array.dtype)
# Sum the ``data_h`` array using hipblas
handle = hip_check(hipblas.hipblasCreate())

View File

@ -55,16 +55,15 @@ class HIPArray2D(BaseArray2D):
if cpu_data is None:
# self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
# Allocate host memory
cpu_data = np.empty((ny, nx), dtype=self.dtype)
cpu_data = np.zeros((ny, nx), dtype=self.dtype)
self.check(x, y, nx, ny, cpu_data)
copy_args = hip.hip_Memcpy2D(**self.__get_copy_info(x, y, nx, ny, cpu_data, True))
hip_check(hip.hipMemcpyParam2DAsync(copy_args, stream))
if not asynch:
hip_check(hip.hipStreamSynchronize(stream))
hip_check(
hip.hipMemcpyAsync(self.data, cpu_data, self.num_bytes, hip.hipMemcpyKind.hipMemcpyDeviceToHost, stream))
return cpu_data
def upload(self, stream, cpu_data, extent=None):
@ -76,9 +75,58 @@ class HIPArray2D(BaseArray2D):
else:
x, y, nx, ny = extent
self.check(x, y, nx, ny, cpu_data)
copy_param = hip.hip_Memcpy2D(**self.__get_copy_info(x, y, nx, ny, cpu_data))
# TODO implement non-async to test if it actually works - avoid errors
# Create a copy object from device to host
hip_check(hip.hipMemcpyAsync(self.data, self.data_h, self.num_bytes, hip.hipMemcpyKind.hipMemcpyHostToDevice,
stream))
hip_check(hip.hipMemcpyParam2DAsync(copy_param, stream))
def get_strides(self) -> tuple[int, ...]:
    """
    Return the number of bytes it takes to move to the next element along
    each axis, assuming a densely packed row-major (C order) layout.

    The strides are derived from ``self.data_h.shape`` and the item size of
    ``self.dtype``: the last axis advances by one element, and every earlier
    axis advances by the total size of all the axes after it.

    Returns:
        A tuple with one byte stride per axis of ``self.data_h``. For a
        ``(ny, nx)`` array, entry ``[0]`` is the size of one row in bytes.
    """
    item_size = np.dtype(self.dtype).itemsize

    # Walk the axes from innermost to outermost, accumulating the byte span
    # covered by all inner axes. Multiplying each axis length by the item
    # size instead would give the extent of that axis, not its stride, and
    # would yield a wrong row pitch for non-square arrays.
    strides = []
    step = item_size
    for extent in reversed(tuple(self.data_h.shape)):
        strides.append(step)
        step *= int(extent)

    return tuple(reversed(strides))
def __get_copy_info(self, x, y, nx, ny, host, to_host=False):
    """
    Build the keyword arguments describing a 2D copy between ``host`` and
    the device buffer ``self.data``.

    Args:
        x: Column offset (in elements) into the device buffer.
        y: Row offset into the device buffer.
        nx: Number of elements per row to copy.
        ny: Number of rows to copy.
        host: Host array taking part in the copy; its first stride is used
            as the host pitch.
        to_host: When false (default) the host is the source and the device
            the destination; when true the roles are swapped.

    Returns:
        A dict with ``src*`` / ``dst*`` entries plus ``WidthInBytes`` and
        ``Height``, ready to be expanded as keyword arguments.
    """
    self.check(x, y, nx, ny, host)

    # Offsets and widths are expressed in bytes of 32-bit floats.
    element_bytes = np.float32().itemsize

    # The host side always starts at its own origin.
    host_end = {
        'kind': 'Host',
        'x_bytes': 0,
        'row': 0,
        'memory_type': hip.hipMemoryType.hipMemoryTypeHost,
        'pointer': host,
        'pitch': host.strides[0],
    }

    # The device side carries the (x, y) offset of the requested region.
    device_end = {
        'kind': 'Device',
        'x_bytes': int(x) * element_bytes,
        'row': int(y),
        'memory_type': hip.hipMemoryType.hipMemoryTypeDevice,
        'pointer': self.data,
        'pitch': self.get_strides()[0],
    }

    if to_host:
        source, target = device_end, host_end
    else:
        source, target = host_end, device_end

    # The pointer key depends on which side is the source/destination,
    # producing e.g. 'srcHost' + 'dstDevice' for an upload.
    source_pointer_key = f"src{source['kind']}"
    target_pointer_key = f"dst{target['kind']}"

    return {
        'srcXInBytes': source['x_bytes'],
        'srcY': source['row'],
        'srcMemoryType': source['memory_type'],
        source_pointer_key: source['pointer'],
        'srcPitch': source['pitch'],

        'dstXInBytes': target['x_bytes'],
        'dstY': target['row'],
        'dstMemoryType': target['memory_type'],
        target_pointer_key: target['pointer'],
        'dstPitch': target['pitch'],

        'WidthInBytes': int(nx) * element_bytes,
        'Height': int(ny),
    }