From 41367e2ba6f7a22a6dfbc5a458762359aaaa30aa Mon Sep 17 00:00:00 2001
From: Hicham Agueny
Date: Tue, 20 Feb 2024 15:51:57 +0100
Subject: [PATCH] re-write WAF.py with hip-python

---
 GPUSimulators/WAF.py | 179 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 148 insertions(+), 31 deletions(-)

diff --git a/GPUSimulators/WAF.py b/GPUSimulators/WAF.py
index 22860fc..7e2763c 100644
--- a/GPUSimulators/WAF.py
+++ b/GPUSimulators/WAF.py
@@ -24,9 +24,29 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 from GPUSimulators import Simulator, Common
 from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
 import numpy as np
+import ctypes
+import os
 
-from pycuda import gpuarray
-
+#from pycuda import gpuarray
+from hip import hip, hiprtc, hipblas
+
+
+#module-level helper so it is visible from every method below:
+#unwraps the (err, value, ...) tuples returned by hip-python and
+#raises on any HIP or HIPRTC error
+def hip_check(call_result):
+    err = call_result[0]
+    result = call_result[1:]
+    if len(result) == 1:
+        result = result[0]
+    if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
+        raise RuntimeError(str(err))
+    elif (
+        isinstance(err, hiprtc.hiprtcResult)
+        and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
+    ):
+        raise RuntimeError(str(err))
+    return result
 
 
 class WAF (Simulator.BaseSimulator):
@@ -68,19 +88,52 @@ class WAF (Simulator.BaseSimulator):
         self.g = np.float32(g)
 
         #Get kernels
-        module = context.get_module("cuda/SWE2D_WAF.cu",
-                                        defines={
-                                            'BLOCK_WIDTH': self.block_size[0],
-                                            'BLOCK_HEIGHT': self.block_size[1]
-                                        },
-                                        compile_args={
-                                            'no_extern_c': True,
-                                            'options': ["--use_fast_math"],
-                                        },
-                                        jit_compile_args={})
-        self.kernel = module.get_function("WAFKernel")
-        self.kernel.prepare("iiffffiiPiPiPiPiPiPiP")
+#        module = context.get_module("cuda/SWE2D_WAF.cu",
+#                                        defines={
+#                                            'BLOCK_WIDTH': self.block_size[0],
+#                                            'BLOCK_HEIGHT': self.block_size[1]
+#                                        },
+#                                        compile_args={
+#                                            'no_extern_c': True,
+#                                            'options': ["--use_fast_math"],
+#                                        },
+#                                        jit_compile_args={})
+#        self.kernel = module.get_function("WAFKernel")
+#        self.kernel.prepare("iiffffiiPiPiPiPiPiPiP")
+
+        kernel_file_path = os.path.abspath(os.path.join('cuda', 'SWE2D_WAF.cu.hip'))
+        with open(kernel_file_path, 'r') as file:
+            kernel_source = file.read()
+
+        #create a HIPRTC program from the kernel source
+        prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"WAFKernel", 0, [], []))
+
+        #query the architecture of the current device so the code object
+        #is built for the right target
+        props = hip.hipDeviceProp_t()
+        hip_check(hip.hipGetDeviceProperties(props, 0))
+        arch = props.gcnArchName
+
+        print(f"Compiling kernel 'WAFKernel' for {arch}")
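+
+        #compile for this device's architecture; on failure, fetch and
+        #raise the HIPRTC build log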
for {arch}") + + cflags = [b"--offload-arch="+arch] + err, = hiprtc.hiprtcCompileProgram(prog, len(cflags), cflags) + if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS: + log_size = hip_check(hiprtc.hiprtcGetProgramLogSize(prog)) + log = bytearray(log_size) + hip_check(hiprtc.hiprtcGetProgramLog(prog, log)) + raise RuntimeError(log.decode()) + code_size = hip_check(hiprtc.hiprtcGetCodeSize(prog)) + code = bytearray(code_size) + hip_check(hiprtc.hiprtcGetCode(prog, code)) + module = hip_check(hip.hipModuleLoadData(code)) + + kernel = hip_check(hip.hipModuleGetFunction(module, b"WAFKernel")) + #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, nx, ny, @@ -90,7 +132,12 @@ class WAF (Simulator.BaseSimulator): nx, ny, 2, 2, [None, None, None]) - self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) + #self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) + data_h = np.empty(self.grid_size, dtype=np.float32) + num_bytes = data_h.size * data_h.itemsize + self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure( + typestr="float32",shape=self.grid_size) + dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) dt = min(dt_x, dt_y) @@ -100,28 +147,95 @@ class WAF (Simulator.BaseSimulator): self.substepDimsplit(dt*0.5, step_number) def substepDimsplit(self, dt, substep): - self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, - self.nx, self.ny, - self.dx, self.dy, dt, - self.g, - substep, - self.boundary_conditions, - self.u0[0].data.gpudata, self.u0[0].data.strides[0], - self.u0[1].data.gpudata, self.u0[1].data.strides[0], - self.u0[2].data.gpudata, self.u0[2].data.strides[0], - self.u1[0].data.gpudata, self.u1[0].data.strides[0], - self.u1[1].data.gpudata, self.u1[1].data.strides[0], - self.u1[2].data.gpudata, self.u1[2].data.strides[0], - self.cfl_data.gpudata) +# self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, +# self.nx, self.ny, +# self.dx, self.dy, dt, +# self.g, +# substep, +# self.boundary_conditions, +# self.u0[0].data.gpudata, self.u0[0].data.strides[0], +# self.u0[1].data.gpudata, self.u0[1].data.strides[0], +# self.u0[2].data.gpudata, self.u0[2].data.strides[0], +# self.u1[0].data.gpudata, self.u1[0].data.strides[0], +# self.u1[1].data.gpudata, self.u1[1].data.strides[0], +# self.u1[2].data.gpudata, self.u1[2].data.strides[0], +# self.cfl_data.gpudata) + + #launch kernel + hip_check( + hip.hipModuleLaunchKernel( + kernel, + *self.grid_size, + *self.block_size, + sharedMemBytes=0, + stream=self.stream, + kernelParams=None, + extra=( # pass kernel's arguments + ctypes.c_int(self.nx), ctypes.c_int(self.ny), + ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(self.dt), + ctypes.c_float(self.g), + ctypes.c_int(substep), + ctypes.c_int(self.boundary_conditions), + ctypes.c_float(self.u0[0].data), ctypes.c_float(self.u0[0].data.strides[0]), + ctypes.c_float(self.u0[1].data), ctypes.c_float(self.u0[1].data.strides[0]), + ctypes.c_float(self.u0[2].data), ctypes.c_float(self.u0[2].data.strides[0]), + ctypes.c_float(self.u1[0].data), ctypes.c_float(self.u1[0].data.strides[0]), + ctypes.c_float(self.u1[1].data), ctypes.c_float(self.u1[1].data.strides[0]), + ctypes.c_float(self.u1[2].data), ctypes.c_float(self.u1[2].data.strides[0]), + self.cfl_data + ) + ) + ) + + hip_check(hip.hipDeviceSynchronize()) + self.u0, self.u1 = self.u1, self.u0 + hip_check(hip.hipModuleUnload(module)) + + hip_check(hip.hipFree(cfl_data)) + 
+ print("--Launching Kernel .WAFKernel. is ok") + def getOutput(self): return self.u0 def check(self): self.u0.check() self.u1.check() - + + # computing min with hipblas: the output is an index + def min_hipblas(self, num_elements, cfl_data, stream): + num_bytes = num_elements * np.dtype(np.float32).itemsize + num_bytes_i = np.dtype(np.int32).itemsize + indx_d = hip_check(hip.hipMalloc(num_bytes_i)) + indx_h = np.zeros(1, dtype=np.int32) + x_temp = np.zeros(num_elements, dtype=np.float32) + + #print("--size.data:", cfl_data.size) + handle = hip_check(hipblas.hipblasCreate()) + + #hip_check(hipblas.hipblasGetStream(handle, stream)) + #"incx" [int] specifies the increment for the elements of x. incx must be > 0. + hip_check(hipblas.hipblasIsamin(handle, num_elements, cfl_data, 1, indx_d)) + + # destruction of handle + hip_check(hipblas.hipblasDestroy(handle)) + + # copy result (stored in indx_d) back to the host (store in indx_h) + hip_check(hip.hipMemcpyAsync(indx_h,indx_d,num_bytes_i,hip.hipMemcpyKind.hipMemcpyDeviceToHost,stream)) + hip_check(hip.hipMemcpyAsync(x_temp,cfl_data,num_bytes,hip.hipMemcpyKind.hipMemcpyDeviceToHost,stream)) + #hip_check(hip.hipMemsetAsync(cfl_data,0,num_bytes,self.stream)) + hip_check(hip.hipStreamSynchronize(stream)) + + min_value = x_temp.flatten()[indx_h[0]-1] + + # clean up + hip_check(hip.hipStreamDestroy(stream)) + hip_check(hip.hipFree(cfl_data)) + return min_value + def computeDt(self): - max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get(); - return max_dt*0.5 \ No newline at end of file + #max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get(); + max_dt = self.min_hipblas(self.cfl_data.size, self.cfl_data, self.stream) + return max_dt*0.5