re-write WAF.py with hip-python

This commit is contained in:
Hicham Agueny 2024-02-20 15:51:57 +01:00
parent 3b2c571dbb
commit 41367e2ba6


@@ -24,9 +24,10 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
import numpy as np
import ctypes
import os
#from pycuda import gpuarray
from hip import hip, hiprtc, hipblas
@@ -47,6 +48,21 @@ class WAF (Simulator.BaseSimulator):
        dt: Size of each timestep (90 s)
        g: Gravitational acceleration (9.81 m/s^2)
    """
    def hip_check(call_result):
        err = call_result[0]
        result = call_result[1:]
        if len(result) == 1:
            result = result[0]
        if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
            raise RuntimeError(str(err))
        elif (
            isinstance(err, hiprtc.hiprtcResult)
            and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
        ):
            raise RuntimeError(str(err))
        return result
    def __init__(self,
                 context,
                 h0, hu0, hv0,
@@ -68,19 +84,45 @@ class WAF (Simulator.BaseSimulator):
        self.g = np.float32(g)
        #Get kernels
        # module = context.get_module("cuda/SWE2D_WAF.cu",
        #                             defines={
        #                                 'BLOCK_WIDTH': self.block_size[0],
        #                                 'BLOCK_HEIGHT': self.block_size[1]
        #                             },
        #                             compile_args={
        #                                 'no_extern_c': True,
        #                                 'options': ["--use_fast_math"],
        #                             },
        #                             jit_compile_args={})
        # self.kernel = module.get_function("WAFKernel")
        # self.kernel.prepare("iiffffiiPiPiPiPiPiPiP")
        #Read and compile the HIP kernel source with hiprtc for the current device
        kernel_file_path = os.path.abspath(os.path.join('cuda', 'SWE2D_WAF.cu.hip'))
        with open(kernel_file_path, 'r') as file:
            kernel_source = file.read()

        prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"WAFKernel", 0, [], []))
        props = hip.hipDeviceProp_t()
        hip_check(hip.hipGetDeviceProperties(props, 0))
        arch = props.gcnArchName
        print(f"Compiling kernel .WAFKernel. for {arch}")

        cflags = [b"--offload-arch=" + arch]
        err, = hiprtc.hiprtcCompileProgram(prog, len(cflags), cflags)
        if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS:
            log_size = hip_check(hiprtc.hiprtcGetProgramLogSize(prog))
            log = bytearray(log_size)
            hip_check(hiprtc.hiprtcGetProgramLog(prog, log))
            raise RuntimeError(log.decode())
        code_size = hip_check(hiprtc.hiprtcGetCodeSize(prog))
        code = bytearray(code_size)
        hip_check(hiprtc.hiprtcGetCode(prog, code))

        #Load the code object and keep module/kernel on self so substepDimsplit can use them
        self.module = hip_check(hip.hipModuleLoadData(code))
        self.kernel = hip_check(hip.hipModuleGetFunction(self.module, b"WAFKernel"))
        #Create data by uploading to device
        self.u0 = Common.ArakawaA2D(self.stream,
                                    nx, ny,
@@ -90,7 +132,12 @@ class WAF (Simulator.BaseSimulator):
                                    nx, ny,
                                    2, 2,
                                    [None, None, None])
        #self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
        data_h = np.empty(self.grid_size, dtype=np.float32)
        num_bytes = data_h.size * data_h.itemsize
        #Allocate the CFL buffer on the device and expose it as a float32 array
        self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure(
            typestr="float32", shape=self.grid_size)
        dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
        dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
        dt = min(dt_x, dt_y)
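The self.cfl_data allocation above relies on hip-python's hipMalloc returning a buffer object whose configure() call attaches a dtype and shape, so it can be treated as a 2D float32 array. A rough standalone sketch of that pattern, assuming the hip_check helper from this file (the shape and names are illustrative):

import numpy as np
from hip import hip

shape = (16, 16)                                   # illustrative grid size
nbytes = int(np.prod(shape)) * np.dtype(np.float32).itemsize
buf = hip_check(hip.hipMalloc(nbytes)).configure(typestr="float32", shape=shape)
# fill it from a host array and release it again
host = np.zeros(shape, dtype=np.float32)
hip_check(hip.hipMemcpy(buf, host, nbytes, hip.hipMemcpyKind.hipMemcpyHostToDevice))
hip_check(hip.hipFree(buf))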
@@ -100,28 +147,95 @@ class WAF (Simulator.BaseSimulator):
        self.substepDimsplit(dt*0.5, step_number)

    def substepDimsplit(self, dt, substep):
        # self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
        #                                 self.nx, self.ny,
        #                                 self.dx, self.dy, dt,
        #                                 self.g,
        #                                 substep,
        #                                 self.boundary_conditions,
        #                                 self.u0[0].data.gpudata, self.u0[0].data.strides[0],
        #                                 self.u0[1].data.gpudata, self.u0[1].data.strides[0],
        #                                 self.u0[2].data.gpudata, self.u0[2].data.strides[0],
        #                                 self.u1[0].data.gpudata, self.u1[0].data.strides[0],
        #                                 self.u1[1].data.gpudata, self.u1[1].data.strides[0],
        #                                 self.u1[2].data.gpudata, self.u1[2].data.strides[0],
        #                                 self.cfl_data.gpudata)
        #launch kernel
        hip_check(
            hip.hipModuleLaunchKernel(
                self.kernel,
                *self.grid_size,
                *self.block_size,
                sharedMemBytes=0,
                stream=self.stream,
                kernelParams=None,
                extra=(  # kernel arguments: device buffers as-is, scalars wrapped in ctypes
                    ctypes.c_int(self.nx), ctypes.c_int(self.ny),
                    ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt),
                    ctypes.c_float(self.g),
                    ctypes.c_int(substep),
                    ctypes.c_int(self.boundary_conditions),
                    self.u0[0].data, ctypes.c_int(self.u0[0].data.strides[0]),
                    self.u0[1].data, ctypes.c_int(self.u0[1].data.strides[0]),
                    self.u0[2].data, ctypes.c_int(self.u0[2].data.strides[0]),
                    self.u1[0].data, ctypes.c_int(self.u1[0].data.strides[0]),
                    self.u1[1].data, ctypes.c_int(self.u1[1].data.strides[0]),
                    self.u1[2].data, ctypes.c_int(self.u1[2].data.strides[0]),
                    self.cfl_data
                )
            )
        )
        hip_check(hip.hipDeviceSynchronize())

        self.u0, self.u1 = self.u1, self.u0

        #Freeing resources here would break later timesteps; leave it to a cleanup/destructor method
        #hip_check(hip.hipModuleUnload(self.module))
        #hip_check(hip.hipFree(self.cfl_data))

        print("--Launching Kernel .WAFKernel. is ok")
    def getOutput(self):
        return self.u0

    def check(self):
        self.u0.check()
        self.u1.check()
    # computing min with hipblas: the output of hipblasIsamin is a 1-based index
    def min_hipblas(self, num_elements, cfl_data, stream):
        num_bytes = num_elements * np.dtype(np.float32).itemsize
        num_bytes_i = np.dtype(np.int32).itemsize
        indx_d = hip_check(hip.hipMalloc(num_bytes_i))
        indx_h = np.zeros(1, dtype=np.int32)
        x_temp = np.zeros(num_elements, dtype=np.float32)
        #print("--size.data:", cfl_data.size)

        handle = hip_check(hipblas.hipblasCreate())
        #hip_check(hipblas.hipblasGetStream(handle, stream))
        #"incx" [int] specifies the increment for the elements of x. incx must be > 0.
        hip_check(hipblas.hipblasIsamin(handle, num_elements, cfl_data, 1, indx_d))
        # destruction of handle
        hip_check(hipblas.hipblasDestroy(handle))

        # copy result (stored in indx_d) back to the host (store in indx_h)
        hip_check(hip.hipMemcpyAsync(indx_h, indx_d, num_bytes_i, hip.hipMemcpyKind.hipMemcpyDeviceToHost, stream))
        hip_check(hip.hipMemcpyAsync(x_temp, cfl_data, num_bytes, hip.hipMemcpyKind.hipMemcpyDeviceToHost, stream))
        #hip_check(hip.hipMemsetAsync(cfl_data, 0, num_bytes, self.stream))
        hip_check(hip.hipStreamSynchronize(stream))

        # hipblasIsamin returns a Fortran-style 1-based index
        min_value = x_temp.flatten()[indx_h[0]-1]

        # clean up only the temporary index buffer; the stream and cfl_data are owned by the
        # simulator and must stay alive for later timesteps
        hip_check(hip.hipFree(indx_d))
        return min_value
    def computeDt(self):
        #max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
        max_dt = self.min_hipblas(self.cfl_data.size, self.cfl_data, self.stream)
        return max_dt*0.5
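The hipblas-based reduction can be cross-checked against a simple host-side fallback that reuses the same memcpy calls already present in this file. This is only an illustrative sketch, not part of the commit; min_host and its arguments are hypothetical names:

    # hypothetical debugging aid: copy the CFL buffer to the host and reduce with numpy
    def min_host(self, num_elements, cfl_data, stream):
        num_bytes = num_elements * np.dtype(np.float32).itemsize
        x_h = np.empty(num_elements, dtype=np.float32)
        hip_check(hip.hipMemcpyAsync(x_h, cfl_data, num_bytes,
                                     hip.hipMemcpyKind.hipMemcpyDeviceToHost, stream))
        hip_check(hip.hipStreamSynchronize(stream))
        return float(np.min(x_h))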