From 2a7a8c62583f857e4bd575a289d22906222c0767 Mon Sep 17 00:00:00 2001 From: Hicham Agueny Date: Sun, 9 Jun 2024 22:48:06 +0200 Subject: [PATCH] hip-python implementation --- GPUSimulators/Autotuner.py | 5 +- GPUSimulators/Common.py | 371 ++++++----- GPUSimulators/CudaContext.py | 82 ++- GPUSimulators/CudaContext_cu.py | 272 -------- GPUSimulators/EE2D_KP07_dimsplit.py | 614 ++++++++++-------- GPUSimulators/FORCE.py | 215 +++--- GPUSimulators/HLL.py | 224 ++++--- GPUSimulators/HLL2.py | 242 ++++--- GPUSimulators/IPythonMagic.py | 26 +- GPUSimulators/KP07.py | 247 ++++--- GPUSimulators/KP07_dimsplit.py | 242 ++++--- GPUSimulators/LxF.py | 223 ++++--- GPUSimulators/MPISimulator.py | 95 +-- GPUSimulators/SHMEMSimulator.py | 27 +- GPUSimulators/SHMEMSimulatorGroup.py | 27 +- GPUSimulators/Simulator.py | 33 +- GPUSimulators/WAF.py | 227 ++++--- .../__pycache__/MPISimulator.cpython-39.pyc | Bin 11058 -> 0 bytes .../__pycache__/Simulator.cpython-39.pyc | Bin 8433 -> 0 bytes .../__pycache__/__init__.cpython-39.pyc | Bin 182 -> 0 bytes GPUSimulators/cuda/EE2D_KP07_dimsplit.cu.hip | 3 +- GPUSimulators/cuda/common.h | 13 +- .../InitialConditions.cpython-39.pyc | Bin 0 -> 7097 bytes 23 files changed, 1769 insertions(+), 1419 deletions(-) delete mode 100644 GPUSimulators/CudaContext_cu.py delete mode 100644 GPUSimulators/__pycache__/MPISimulator.cpython-39.pyc delete mode 100644 GPUSimulators/__pycache__/Simulator.cpython-39.pyc delete mode 100644 GPUSimulators/__pycache__/__init__.cpython-39.pyc create mode 100644 GPUSimulators/helpers/__pycache__/InitialConditions.cpython-39.pyc diff --git a/GPUSimulators/Autotuner.py b/GPUSimulators/Autotuner.py index 84aedc2..89a071d 100644 --- a/GPUSimulators/Autotuner.py +++ b/GPUSimulators/Autotuner.py @@ -31,8 +31,7 @@ from hip import hip,hiprtc from GPUSimulators import Common, Simulator, CudaContext -class Autotuner: - def hip_check(call_result): +def hip_check(call_result): err = call_result[0] result = call_result[1:] if len(result) == 1: @@ -46,6 +45,8 @@ class Autotuner: raise RuntimeError(str(err)) return result + +class Autotuner: def __init__(self, nx=2048, ny=2048, block_widths=range(8, 32, 1), diff --git a/GPUSimulators/Common.py b/GPUSimulators/Common.py index 6681450..f23d9a7 100644 --- a/GPUSimulators/Common.py +++ b/GPUSimulators/Common.py @@ -56,7 +56,7 @@ def hip_check(call_result): ): raise RuntimeError(str(err)) return result - + def safeCall(cmd): logger = logging.getLogger(__name__) try: @@ -158,7 +158,7 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names extent = sim.getExtent() ncvars['x'][:] = np.linspace(extent[0], extent[1], simulator_args['nx']) ncvars['y'][:] = np.linspace(extent[2], extent[3], simulator_args['ny']) - + #Choose which variables to download (prune None from list, but keep the index) download_vars = [] for i, var_name in enumerate(save_var_names): @@ -203,7 +203,7 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names #Download save_vars = sim.download(download_vars) - + #Save to file for i, var_name in enumerate(save_var_names): ncvars[var_name][k, :] = save_vars[i] @@ -216,12 +216,9 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names logger.debug(print_string) logger.debug("Simulated to t={:f} in {:d} timesteps (average dt={:f})".format(t_end, sim.simSteps(), sim.simTime() / sim.simSteps())) - + return outdata.filename, profiling_data_sim_runner, sim.profiling_data_mpi - - - - + #return outdata.filename class Timer(object): @@ 
-246,9 +243,6 @@ class Timer(object): def elapsed(self): return time.time() - self.start - - - class PopenFileBuffer(object): """ @@ -366,10 +360,6 @@ class IPEngine(object): gc.collect() - - - - class DataDumper(object): """ Simple class for holding a netCDF4 object @@ -443,8 +433,6 @@ class DataDumper(object): - - class ProgressPrinter(object): """ Small helper class for @@ -499,11 +487,6 @@ class ProgressPrinter(object): return progressbar - - - - - """ Class that holds 2D data """ @@ -520,24 +503,28 @@ class CudaArray2D: nx_halo = nx + 2*x_halo ny_halo = ny + 2*y_halo - + #self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny) #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here #Initialize an array on GPU with zeros #self.data = pycuda.gpuarray.zeros((ny_halo, nx_halo), dtype) - self.data_h = np.zeros((ny_halo, nx_halo), dtype="float32") - num_bytes = self.data_h.size * self.data_h.itemsize - + #data.strides[0] == nx_halo*np.float32().itemsize + #data.strides[1] == np.float32().itemsize + num_bytes = ny_halo*nx_halo * np.float32().itemsize + + #data_h = np.zeros((ny_halo, nx_halo), dtype) # init device array and upload host data self.data = hip_check(hip.hipMalloc(num_bytes)).configure( typestr="float32",shape=(ny_halo, nx_halo)) + #num_bytes = ny*nx * np.float32().itemsize + #cpu_data = hip_check(hip.hipHostMalloc(num_bytes,hip.hipHostMallocPortable)) # copy data from host to device - hip_check(hip.hipMemcpy(self.data,self.data_h,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice)) + #hip_check(hip.hipMemcpy(self.data,data_h,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice)) - #For returning to download (No counterpart in hip-python) + #https://rocm.docs.amd.com/projects/hip-python/en/latest/python_api/hip.html#hip.hip.hipMemPoolCreate #self.memorypool = PageLockedMemoryPool() - + #If we don't have any data, just allocate and return if cpu_data is None: return @@ -547,16 +534,21 @@ class CudaArray2D: assert cpu_data.itemsize == 4, "Wrong size of data type" assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)" + #Create copy object from host to device x = (nx_halo - cpu_data.shape[1]) // 2 y = (ny_halo - cpu_data.shape[0]) // 2 self.upload(stream, cpu_data, extent=[x, y, cpu_data.shape[1], cpu_data.shape[0]]) + #self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny) - + def __del__(self, *args): #self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny) - self.data.gpudata.free() + #self.data.gpudata.free() + #self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data), self.nx, self.ny) + hip_check(hip.hipFree(self.data)) + #hip_check(hip.hipFreeAsync(self.data, self.stream)) self.data = None """ @@ -570,71 +562,84 @@ class CudaArray2D: ny = self.ny else: x, y, nx, ny = extent - + if (cpu_data is None): #self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny) #Allocate host memory #The following fails, don't know why (crashes python) + #allocate a pinned (page-locked) memory array #cpu_data = cuda.pagelocked_empty((int(ny), int(nx)), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #see here type of memory: https://rocm.docs.amd.com/projects/hip-python/en/latest/python_api/hip.html#hip.hip.hipMemoryType - cpu_data = np.empty((ny, nx), dtype=np.float32) - num_bytes = cpu_data.size * cpu_data.itemsize - #hipHostMalloc allocates pinned host memory which is mapped into the address space of all GPUs in the system, the memory can #be accessed 
directly by the GPU device
+            cpu_data = np.zeros((ny, nx), dtype=np.float32)
+            #num_bytes = cpu_data.size * cpu_data.itemsize
+            #hipHostMalloc allocates pinned host memory which is mapped into the address space of all GPUs in the system, the memory can
+            #be accessed directly by the GPU device
             #hipHostMallocDefault: memory is mapped and portable (default allocation)
             #hipHostMallocPortable: memory is explicitly portable across different devices
-            cpu_data = hip_check(hip.hipHostMalloc(num_bytes,hip.hipHostMallocPortable))
+            #cpu_data = hip_check(hip.hipHostMalloc(num_bytes,hip.hipHostMallocPortable))

             #Non-pagelocked: cpu_data = np.empty((ny, nx), dtype=np.float32)
             #cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32)

         assert nx == cpu_data.shape[1]
         assert ny == cpu_data.shape[0]
         assert x+nx <= self.nx + 2*self.x_halo
         assert y+ny <= self.ny + 2*self.y_halo

+        #Cuda
+        """
         #Create copy object from device to host
-        #copy = cuda.Memcpy2D()
-        #copy.set_src_device(self.data.gpudata)
-        #copy.set_dst_host(cpu_data)
+        copy = cuda.Memcpy2D()
+        copy.set_src_device(self.data.gpudata)
+        copy.set_dst_host(cpu_data)

         #Set offsets and pitch of source
-        #copy.src_x_in_bytes = int(x)*self.data.strides[1]
-        #copy.src_y = int(y)
-        #copy.src_pitch = self.data.strides[0]
+        copy.src_x_in_bytes = int(x)*self.data.strides[1]
+        copy.src_y = int(y)
+        copy.src_pitch = self.data.strides[0]

         #Set width in bytes to copy for each row and
         #number of rows to copy
-        #copy.width_in_bytes = int(nx)*cpu_data.itemsize
-        #copy.height = int(ny)
-
-        #The equivalent of cuda.Memcpy2D in hip-python would be: but it fails with an error pointing to cpu_data
-        #and a message: "RuntimeError: hipError_t.hipErrorInvalidValue"
-        #shape = (nx,ny)
-        #num_bytes = cpu_data.size * cpu_data.itemsize
-        #dst_pitch_bytes = cpu_data.strides[0]
-        #src_pitch_bytes = num_bytes // shape[0]
-        #src_pitch_bytes = data.strides[0]
-        #width_bytes = int(nx)*cpu_data.itemsize
-        #height_Nrows = int(ny)
-        #hipMemcpy2D(dst, unsigned long dpitch, src, unsigned long spitch, unsigned long width, unsigned long height, kind)
-        #copy = hip_check(hip.hipMemcpy2D(cpu_data,      #pointer to destination
-        #                 dst_pitch_bytes,               #pitch of destination array
-        #                 data,                          #pointer to source
-        #                 src_pitch_bytes,               #pitch of source array
-        #                 width_bytes,                   #number of bytes in each row
-        #                 height_Nrows,                  #number of rows to copy
-        #                 hip.hipMemcpyKind.hipMemcpyDeviceToHost)) #kind
+        copy.width_in_bytes = int(nx)*cpu_data.itemsize
+        copy.height = int(ny)
+        """

-        #this is an alternative:
-        #copy from device to host
-        cpu_data = np.empty((ny, nx), dtype=np.float32)
-        num_bytes = cpu_data.size * cpu_data.itemsize
-        #hip.hipMemcpy(dst, src, unsigned long sizeBytes, kind)
-        copy = hip_check(hip.hipMemcpy(cpu_data,self.data,num_bytes,hip.hipMemcpyKind.hipMemcpyDeviceToHost))
-        copy(stream)
-        if asynch==False:
-            stream.synchronize()

+        #host_array_pinned = hip_check(hip.hipHostMalloc(cpu_data.size * cpu_data.itemsize, hip.hipHostMallocDefault))
+        #device_pointer = hip_check(hip.hipHostGetDevicePointer(host_array_pinned,hip.hipHostMallocDefault))
+
+        copy_download = {
+            'srcXInBytes': int(x)*np.float32().itemsize,
+            'srcY': int(y),
+            'srcMemoryType': hip.hipMemoryType.hipMemoryTypeDevice, #hipMemoryTypeManaged
+            'srcDevice': self.data,
+            'srcPitch': self.data.shape[1]*np.float32().itemsize, #row pitch in bytes: shape is (ny_halo, nx_halo)
+
+            'dstXInBytes': 0,
+            'dstY': 0,
+            'dstMemoryType': hip.hipMemoryType.hipMemoryTypeHost,
+            'dstHost': cpu_data, #device_pointer,
+            'dstPitch': cpu_data.strides[0],
+
+            'WidthInBytes': int(nx)*cpu_data.itemsize,
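+            #WidthInBytes and Height describe the copied sub-rectangle:
+            #WidthInBytes is the number of bytes copied per row, Height the
+            #number of rows; both srcPitch and dstPitch must be >= WidthInBytes.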
+            'Height': int(ny)
+        }
+
+        # Perform the copy back to host
+        Copy = hip.hip_Memcpy2D(**copy_download)
+
+        #err = hip.hipMemcpyParam2D(Copy)
+        #hipMemcpyParam2DAsync returns a (hipError_t,) tuple; hip_check() raises on failure
+        hip_check(hip.hipMemcpyParam2DAsync(Copy, stream))
+
+        #copy(stream)
+        if asynch==False:
+            #stream.synchronize()
+            hip_check(hip.hipStreamSynchronize(stream))

         return cpu_data

@@ -646,37 +651,67 @@ class CudaArray2D:
             ny = self.ny
         else:
             x, y, nx, ny = extent
-
+
         assert(nx == cpu_data.shape[1])
         assert(ny == cpu_data.shape[0])
         assert(x+nx <= self.nx + 2*self.x_halo)
         assert(y+ny <= self.ny + 2*self.y_halo)
-
+
+        #Cuda
+        """
         #Create copy object from device to host
         #Well this copy from src:host to dst:device AND NOT from device to host
-        #copy = cuda.Memcpy2D()
-        #copy.set_dst_device(self.data.gpudata)
-        #copy.set_src_host(cpu_data)
+        copy = cuda.Memcpy2D()
+        copy.set_dst_device(self.data.gpudata)
+        copy.set_src_host(cpu_data)

         #Set offsets and pitch of source
-        #copy.dst_x_in_bytes = int(x)*self.data.strides[1]
-        #copy.dst_y = int(y)
-        #copy.dst_pitch = self.data.strides[0]
+        copy.dst_x_in_bytes = int(x)*self.data.strides[1]
+        copy.dst_y = int(y)
+        copy.dst_pitch = self.data.strides[0]

         #Set width in bytes to copy for each row and
         #number of rows to copy
-        #copy.width_in_bytes = int(nx)*cpu_data.itemsize
-        #copy.height = int(ny)
+        copy.width_in_bytes = int(nx)*cpu_data.itemsize
+        copy.height = int(ny)
+        """
+
+        #Copy from host to device
+
+        #host_array_pinned = hip_check(hip.hipHostMalloc(cpu_data.size * cpu_data.itemsize, hip.hipHostMallocDefault))
+        #device_pointer = hip_check(hip.hipHostGetDevicePointer(host_array_pinned,hip.hipHostMallocDefault))
+
+        copy_upload = {
+            'srcXInBytes': 0,
+            'srcY': 0,
+            'srcMemoryType': hip.hipMemoryType.hipMemoryTypeHost,
+            'srcHost': cpu_data, #device_pointer
+            'srcPitch': cpu_data.strides[0], # assuming float32 (4 bytes)
+
+            'dstXInBytes': int(x)*np.float32().itemsize,
+            'dstY': int(y),
+            'dstMemoryType': hip.hipMemoryType.hipMemoryTypeDevice, #hipMemoryTypeManaged
+            'dstDevice': self.data,
+            'dstPitch': self.data.shape[1]*np.float32().itemsize, #row pitch in bytes: shape is (ny_halo, nx_halo)
+
+            'WidthInBytes': int(nx)*cpu_data.itemsize,
+            'Height': int(ny)
+        }
+
+        # Perform the copy HtoD
+        Copy = hip.hip_Memcpy2D(**copy_upload)

-        #copy from host de device
-        num_bytes = cpu_data.size * cpu_data.itemsize
-        self.data = hip_check(hip.hipMalloc(num_bytes)).configure(
-            typestr="float32",shape=cpu_data.shape)
-        #hip.hipMemcpy(dst, src, unsigned long sizeBytes, kind)
-        copy = hip_check(hip.hipMemcpy(self.data,cpu_data,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice))
-
-        copy(stream)
+        #err = hip.hipMemcpyParam2D(Copy)
+        #hipMemcpyParam2DAsync returns a (hipError_t,) tuple; hip_check() raises on failure
+        hip_check(hip.hipMemcpyParam2DAsync(Copy, stream))
+        #copy(stream)

@@ -704,15 +739,12 @@ class CudaArray3D:
         #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
         #self.data = pycuda.gpuarray.zeros((nz_halo, ny_halo, nx_halo), dtype)
-        self.data_h = np.zeros((nz_halo, ny_halo, nx_halo), dtype="float32")
-        num_bytes = self.data_h.size * self.data_h.itemsize
-
+        """
+        num_bytes = nz_halo*ny_halo*nx_halo * np.float32().itemsize
         # init device array and upload host data
         self.data = hip_check(hip.hipMalloc(num_bytes)).configure(
             typestr="float32",shape=(nz_halo, ny_halo, nx_halo))
-
-        # copy data from host to device
-        hip_check(hip.hipMemcpy(self.data,self.data_h,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice))
+        """

         #For
returning to download #self.memorypool = PageLockedMemoryPool() @@ -725,48 +757,85 @@ class CudaArray3D: assert cpu_data.shape == (nz_halo, ny_halo, nx_halo) or cpu_data.shape == (self.nz, self.ny, self.nx), "Wrong shape of data %s vs %s / %s" % (str(cpu_data.shape), str((self.nz, self.ny, self.nx)), str((nz_halo, ny_halo, nx_halo))) assert cpu_data.itemsize == 4, "Wrong size of data type" assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)" - + + #Cuda + """ #Create copy object from host to device - #copy = cuda.Memcpy3D() - #copy.set_src_host(cpu_data) - #copy.set_dst_device(self.data.gpudata) + copy = cuda.Memcpy3D() + copy.set_src_host(cpu_data) + copy.set_dst_device(self.data.gpudata) #Set offsets of destination - #x_offset = (nx_halo - cpu_data.shape[2]) // 2 - #y_offset = (ny_halo - cpu_data.shape[1]) // 2 - #z_offset = (nz_halo - cpu_data.shape[0]) // 2 - #copy.dst_x_in_bytes = x_offset*self.data.strides[1] - #copy.dst_y = y_offset - #copy.dst_z = z_offset + x_offset = (nx_halo - cpu_data.shape[2]) // 2 + y_offset = (ny_halo - cpu_data.shape[1]) // 2 + z_offset = (nz_halo - cpu_data.shape[0]) // 2 + copy.dst_x_in_bytes = x_offset*self.data.strides[1] + copy.dst_y = y_offset + copy.dst_z = z_offset #Set pitch of destination - #copy.dst_pitch = self.data.strides[0] + copy.dst_pitch = self.data.strides[0] #Set width in bytes to copy for each row and #number of rows to copy - #width = max(self.nx, cpu_data.shape[2]) - #height = max(self.ny, cpu_data.shape[1]) - #depth = max(self.nz, cpu-data.shape[0]) - #copy.width_in_bytes = width*cpu_data.itemsize - #copy.height = height - #copy.depth = depth + width = max(self.nx, cpu_data.shape[2]) + height = max(self.ny, cpu_data.shape[1]) + depth = max(self.nz, cpu-data.shape[0]) + copy.width_in_bytes = width*cpu_data.itemsize + copy.height = height + copy.depth = depth - #copy from host to device - num_bytes = cpu_data.size * cpu_data.itemsize - self.data = hip_check(hip.hipMalloc(num_bytes)).configure( - typestr="float32",shape=cpu_data.shape) - #hip.hipMemcpy(dst, src, unsigned long sizeBytes, kind) - copy = hip_check(hip.hipMemcpy(self.data,cpu_data,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice)) - #Perform the copy copy(stream) - #self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny) - + """ + + #copy from host to device + #src + host_array_pinned = hip_check(hip.hipHostMalloc(cpu_data.size * cpu_data.itemsize, hip.hipHostMallocDefault)) + src_ptr = hip_check(hip.hipHostGetDevicePointer(host_array_pinned,hip.hipHostMallocDefault)) + #src_ptr = hip.hipPitchedPtr() + + #dst + # Allocate 3D pitched memory on the device + self.data = hip.hipPitchedPtr() + c_extent = hip.hipExtent(nx_halo*np.float32().itemsize, ny_halo, nz_halo) + #hip.hipMalloc3D(pitchedDevPtr-OUT, extent-IN) + err, = hip.hipMalloc3D(self.data, c_extent) + dst_pitch = nx_halo * np.float32().itemsize + + #include offset: do we need make_hipPitchedPtr + x_offset = (nx_halo - cpu_data.shape[2]) // 2 + y_offset = (ny_halo - cpu_data.shape[1]) // 2 + z_offset = (nz_halo - cpu_data.shape[0]) // 2 + + if err != hip.hipError_t.hipSuccess: + raise RuntimeError(f"Error from hipMalloc3D: {hip.hipGetErrorString(err)}") + + copy_upload = { + 'srcPos': hip.hipPos(0, 0, 0), + 'srcPtr': src_ptr, + 'dstPos': hip.hipPos(0, 0, 0), + 'dstPtr': self.data, + 'extent': c_extent, + 'kind': hip.hipMemcpyKind.hipMemcpyHostToDevice + } + + # Perform the copy + copy = hip.hipMemcpy3DParms(**copy_upload) + err = hip.hipMemcpy3DAsync(copy, 
stream) + #copy = hip_check(hip.hipMemcpyAsync(self.data,cpu_data,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice,stream)) + + + #self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data), self.nx, self.ny) + def __del__(self, *args): #self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny) - self.data.gpudata.free() + #self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data), self.nx, self.ny) + #self.data.gpudata.free() + hip_check(hip.hipFree(self.data)) + #hip_check(hip.hipFreeAsync(self.data, self.stream)) self.data = None """ @@ -778,33 +847,37 @@ class CudaArray3D: #cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32) cpu_data = np.empty((self.nz, self.ny, self.nx), dtype=np.float32) #cpu_data = self.memorypool.allocate((self.nz, self.ny, self.nx), dtype=np.float32) - + + #Cuda + """ #Create copy object from device to host - #copy = cuda.Memcpy2D() - #copy.set_src_device(self.data.gpudata) - #copy.set_dst_host(cpu_data) + copy = cuda.Memcpy2D() + copy.set_src_device(self.data.gpudata) + copy.set_dst_host(cpu_data) #Set offsets and pitch of source - #copy.src_x_in_bytes = self.x_halo*self.data.strides[1] - #copy.src_y = self.y_halo - #copy.src_z = self.z_halo - #copy.src_pitch = self.data.strides[0] + copy.src_x_in_bytes = self.x_halo*self.data.strides[1] + copy.src_y = self.y_halo + copy.src_z = self.z_halo + copy.src_pitch = self.data.strides[0] #Set width in bytes to copy for each row and #number of rows to copy - #copy.width_in_bytes = self.nx*cpu_data.itemsize - #copy.height = self.ny - #copy.depth = self.nz + copy.width_in_bytes = self.nx*cpu_data.itemsize + copy.height = self.ny + copy.depth = self.nz + copy(stream) + """ #copy from device to host num_bytes = cpu_data.size * cpu_data.itemsize #hip.hipMemcpy(dst, src, unsigned long sizeBytes, kind) - copy = hip_check(hip.hipMemcpy(cpu_data,self.data,num_bytes,hip.hipMemcpyKind.hipMemcpyDeviceToHost)) + copy = hip_check(hip.hipMemcpyAsync(cpu_data,self.data,num_bytes,hip.hipMemcpyKind.hipMemcpyDeviceToHost,stream)) - copy(stream) if asynch==False: - stream.synchronize() - + #stream.synchronize() + hip_check(hip.hipStreamSynchronize(stream)) + return cpu_data @@ -818,9 +891,11 @@ class ArakawaA2D: """ self.logger = logging.getLogger(__name__) self.gpu_variables = [] + for cpu_variable in cpu_variables: self.gpu_variables += [CudaArray2D(stream, nx, ny, halo_x, halo_y, cpu_variable)] - + + def __getitem__(self, key): assert type(key) == int, "Indexing is int based" if (key > len(self.gpu_variables) or key < 0): @@ -833,21 +908,23 @@ class ArakawaA2D: """ if variables is None: variables=range(len(self.gpu_variables)) - + cpu_variables = [] for i in variables: assert i < len(self.gpu_variables), "Variable {:d} is out of range".format(i) cpu_variables += [self.gpu_variables[i].download(stream, asynch=True)] + #print("--FIN: sum:", np.array(cpu_variables).sum()) + #stream.synchronize() + hip_check(hip.hipStreamSynchronize(stream)) return cpu_variables #hipblas def sum_hipblas(self, num_elements, data): num_bytes_r = np.dtype(np.float32).itemsize result_d = hip_check(hip.hipMalloc(num_bytes_r)) - result_h = np.zeros(1, dtype=np.float32) - print("--bytes:", num_bytes_r) + result_h0 = np.zeros(1, dtype=np.float32) # call hipblasSaxpy + initialization handle = hip_check(hipblas.hipblasCreate()) @@ -859,10 +936,12 @@ class ArakawaA2D: hip_check(hipblas.hipblasDestroy(handle)) # copy result (stored in result_d) back to host (store in result_h) - 
hip_check(hip.hipMemcpy(result_h,result_d,num_bytes_r,hip.hipMemcpyKind.hipMemcpyDeviceToHost)) + hip_check(hip.hipMemcpy(result_h0,result_d,num_bytes_r,hip.hipMemcpyKind.hipMemcpyDeviceToHost)) + + result_h = result_h0[0] # clean up - hip_check(hip.hipFree(data)) + #hip_check(hip.hipFree(data)) return result_h def check(self): @@ -872,8 +951,8 @@ class ArakawaA2D: for i, gpu_variable in enumerate(self.gpu_variables): #compute sum with hipblas #var_sum = pycuda.gpuarray.sum(gpu_variable.data).get() - var_sum = self.sum_hipblas(gpu_variable.ny,gpu_variable.data) + var_sum = self.sum_hipblas(gpu_variable.data.size,gpu_variable.data) + #print(f"GPU: Sum for column {i}: {var_sum}") self.logger.debug("Data %d with size [%d x %d] has average %f", i, gpu_variable.nx, gpu_variable.ny, var_sum / (gpu_variable.nx * gpu_variable.ny)) assert np.isnan(var_sum) == False, "Data contains NaN values!" - diff --git a/GPUSimulators/CudaContext.py b/GPUSimulators/CudaContext.py index 4f2e0aa..45b3b36 100644 --- a/GPUSimulators/CudaContext.py +++ b/GPUSimulators/CudaContext.py @@ -85,8 +85,9 @@ class CudaContext(object): if device is None: device = 0 - - hip_check(hip.hipSetDevice(device)) + + num_gpus = hip_check(hip.hipGetDeviceCount()) + hip.hipSetDevice(device) props = hip.hipDeviceProp_t() hip_check(hip.hipGetDeviceProperties(props,device)) arch = props.gcnArchName @@ -97,9 +98,12 @@ class CudaContext(object): # Allocate memory to store the PCI BusID pciBusId = ctypes.create_string_buffer(64) # PCI Bus Id - hip_check(hip.hipDeviceGetPCIBusId(pciBusId, 64, device)) + #hip_check(hip.hipDeviceGetPCIBusId(pciBusId, 64, device)) + pciBusId = hip_check(hip.hipDeviceGetPCIBusId(64, device)) - self.logger.info("Using device %d/%d with --arch: '%s', --BusID: %s ", device, hip_check(hip.hipGetDeviceCount()),arch,pciBusId.value.decode('utf-8')[5:7]) + + #self.logger.info("Using device %d/%d with --arch: '%s', --BusID: %s ", device, num_gpus,arch,pciBusId.value.decode('utf-8')[5:7]) + self.logger.info("Using device %d/%d with --arch: '%s', --BusID: %s ", device, num_gpus,arch,pciBusId[5:7]) #self.logger.debug(" => compute capability: %s", str(self.cuda_device.compute_capability())) self.logger.debug(" => compute capability: %s", hip_check(hip.hipDeviceComputeCapability(device))) @@ -116,7 +120,8 @@ class CudaContext(object): self.logger.debug(" => Total memory: %d MB available", int(total/(1024*1024))) ##self.logger.info("Created context handle <%s>", str(self.cuda_context.handle)) - + self.logger.info("Created context handle <%s>", str(self.cuda_context)) + #Create cache dir for cubin files self.cache_path = os.path.join(self.module_path, "cuda_cache") if (self.use_cache): @@ -125,42 +130,51 @@ class CudaContext(object): self.logger.info("Using CUDA cache dir %s", self.cache_path) self.autotuner = None + """ if (autotuning): self.logger.info("Autotuning enabled. 
It may take several minutes to run the code the first time: have patience") self.autotuner = Autotuner.Autotuner() - + """ def __del__(self, *args): - self.logger.info("Cleaning up CUDA context handle <%s>", str(self.cuda_context.handle)) - + #self.logger.info("Cleaning up CUDA context handle <%s>", str(self.cuda_context.handle)) + #self.logger.info("Cleaning up CUDA context handle <%s>", str(self.cuda_context)) + """ # Loop over all contexts in stack, and remove "this" other_contexts = [] #while (cuda.Context.get_current() != None): while (hip.hipCtxGetCurrent() != None): #context = cuda.Context.get_current() context = hip_check(hip.hipCtxGetCurrent()) - if (context.handle != self.cuda_context.handle): - self.logger.debug("<%s> Popping <%s> (*not* ours)", str(self.cuda_context.handle), str(context.handle)) + #if (context.handle != self.cuda_context.handle): + if (context != self.cuda_context): + #self.logger.debug("<%s> Popping <%s> (*not* ours)", str(self.cuda_context.handle), str(context.handle)) + #self.logger.debug("<%s> Popping <%s> (*not* ours)", str(self.cuda_context), str(context)) other_contexts = [context] + other_contexts #cuda.Context.pop() hip.hipCtxPopCurrent() else: - self.logger.debug("<%s> Popping <%s> (ours)", str(self.cuda_context.handle), str(context.handle)) + #self.logger.debug("<%s> Popping <%s> (ours)", str(self.cuda_context.handle), str(context.handle)) + self.logger.debug("<%s> Popping <%s> (ours)", str(self.cuda_context), str(context)) #cuda.Context.pop() hip.hipCtxPopCurrent() # Add all the contexts we popped that were not our own for context in other_contexts: - self.logger.debug("<%s> Pushing <%s>", str(self.cuda_context.handle), str(context.handle)) + #self.logger.debug("<%s> Pushing <%s>", str(self.cuda_context.handle), str(context.handle)) + self.logger.debug("<%s> Pushing <%s>", str(self.cuda_context), str(context)) #cuda.Context.push(context) hip_check(hip.hipCtxPushCurrent(context)) - self.logger.debug("<%s> Detaching", str(self.cuda_context.handle)) - self.cuda_context.detach() - + #self.logger.debug("<%s> Detaching", str(self.cuda_context.handle)) + self.logger.debug("<%s> Detaching", str(self.cuda_context)) + #self.cuda_context.detach() + hip_check(hip.hipCtxDestroy(self.cuda_context)) + """ def __str__(self): - return "CudaContext id " + str(self.cuda_context.handle) + #return "CudaContext id " + str(self.cuda_context.handle) + return "CudaContext id " + str(self.cuda_context) def hash_kernel(kernel_filename, include_dirs): @@ -227,7 +241,7 @@ class CudaContext(object): self.logger.debug("Info: %s", info_str) if error_str: self.logger.debug("Error: %s", error_str) - + kernel_filename = os.path.normpath(kernel_filename) kernel_path = os.path.abspath(os.path.join(self.module_path, kernel_filename)) #self.logger.debug("Getting %s", kernel_filename) @@ -236,12 +250,12 @@ class CudaContext(object): options_hasher = hashlib.md5() options_hasher.update(str(defines).encode('utf-8') + str(compile_args).encode('utf-8')); options_hash = options_hasher.hexdigest() - + # Create hash of kernel souce source_hash = CudaContext.hash_kernel( \ kernel_path, \ include_dirs=[self.module_path] + include_dirs) - + # Create final hash root, ext = os.path.splitext(kernel_filename) kernel_hash = root \ @@ -282,34 +296,16 @@ class CudaContext(object): os.mkdir(cached_kernel_dir) with io.open(cached_kernel_filename + ".txt", "w") as file: file.write(kernel_string) - + """cuda with Common.Timer("compiler") as timer: + import warnings with warnings.catch_warnings(): 
warnings.filterwarnings("ignore", message="The CUDA compiler succeeded, but said the following:\nkernel.cu", category=UserWarning) cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, cache_dir=False, **compile_args) - #module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler, **jit_compile_args) - - #cubin = hip_check(hiprtc.hiprtcCreateProgram(kernel_string.encode(), b"Kernel-Name", 0, [], [])) - props = hip.hipDeviceProp_t() - hip_check(hip.hipGetDeviceProperties(props,0)) - arch = props.gcnArchName - - print(f"Compiling kernel for {arch}") - - cflags = [b"--offload-arch="+arch] - err, = hiprtc.hiprtcCompileProgram(cubin, len(cflags), cflags) - if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS: - log_size = hip_check(hiprtc.hiprtcGetProgramLogSize(cubin)) - log = bytearray(log_size) - hip_check(hiprtc.hiprtcGetProgramLog(cubin, log)) - raise RuntimeError(log.decode()) - code_size = hip_check(hiprtc.hiprtcGetCodeSize(cubin)) - code = bytearray(code_size) - hip_check(hiprtc.hiprtcGetCode(cubin, code)) - module = hip_check(hip.hipModuleLoadData(code)) + module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler, **jit_compile_args) if (self.use_cache): with io.open(cached_kernel_filename, "wb") as file: @@ -317,7 +313,8 @@ class CudaContext(object): self.modules[kernel_hash] = module return module - + """ + """ Clears the kernel cache (useful for debugging & development) """ @@ -330,4 +327,5 @@ class CudaContext(object): Synchronizes all streams etc """ def synchronize(self): - self.cuda_context.synchronize() + #self.cuda_context.synchronize() + hip_check(hip.hipCtxSynchronize()) diff --git a/GPUSimulators/CudaContext_cu.py b/GPUSimulators/CudaContext_cu.py deleted file mode 100644 index 6c90636..0000000 --- a/GPUSimulators/CudaContext_cu.py +++ /dev/null @@ -1,272 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -This python module implements Cuda context handling - -Copyright (C) 2018 SINTEF ICT - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . 
-""" - - - -import os - -import numpy as np -import time -import re -import io -import hashlib -import logging -import gc - -import pycuda.compiler as cuda_compiler -import pycuda.gpuarray -import pycuda.driver as cuda - -from GPUSimulators import Autotuner, Common - - - -""" -Class which keeps track of the CUDA context and some helper functions -""" -class CudaContext(object): - - def __init__(self, device=None, context_flags=None, use_cache=True, autotuning=True): - """ - Create a new CUDA context - Set device to an id or pci_bus_id to select a specific GPU - Set context_flags to cuda.ctx_flags.SCHED_BLOCKING_SYNC for a blocking context - """ - self.use_cache = use_cache - self.logger = logging.getLogger(__name__) - self.modules = {} - - self.module_path = os.path.dirname(os.path.realpath(__file__)) - - #Initialize cuda (must be first call to PyCUDA) - cuda.init(flags=0) - - self.logger.info("PyCUDA version %s", str(pycuda.VERSION_TEXT)) - - #Print some info about CUDA - self.logger.info("CUDA version %s", str(cuda.get_version())) - self.logger.info("Driver version %s", str(cuda.get_driver_version())) - - if device is None: - device = 0 - - self.cuda_device = cuda.Device(device) - self.logger.info("Using device %d/%d '%s' (%s) GPU", device, cuda.Device.count(), self.cuda_device.name(), self.cuda_device.pci_bus_id()) - self.logger.debug(" => compute capability: %s", str(self.cuda_device.compute_capability())) - - # Create the CUDA context - if context_flags is None: - context_flags=cuda.ctx_flags.SCHED_AUTO - - self.cuda_context = self.cuda_device.make_context(flags=context_flags) - - free, total = cuda.mem_get_info() - self.logger.debug(" => memory: %d / %d MB available", int(free/(1024*1024)), int(total/(1024*1024))) - - self.logger.info("Created context handle <%s>", str(self.cuda_context.handle)) - - #Create cache dir for cubin files - self.cache_path = os.path.join(self.module_path, "cuda_cache") - if (self.use_cache): - if not os.path.isdir(self.cache_path): - os.mkdir(self.cache_path) - self.logger.info("Using CUDA cache dir %s", self.cache_path) - - self.autotuner = None - if (autotuning): - self.logger.info("Autotuning enabled. 
It may take several minutes to run the code the first time: have patience") - self.autotuner = Autotuner.Autotuner() - - - def __del__(self, *args): - self.logger.info("Cleaning up CUDA context handle <%s>", str(self.cuda_context.handle)) - - # Loop over all contexts in stack, and remove "this" - other_contexts = [] - while (cuda.Context.get_current() != None): - context = cuda.Context.get_current() - if (context.handle != self.cuda_context.handle): - self.logger.debug("<%s> Popping <%s> (*not* ours)", str(self.cuda_context.handle), str(context.handle)) - other_contexts = [context] + other_contexts - cuda.Context.pop() - else: - self.logger.debug("<%s> Popping <%s> (ours)", str(self.cuda_context.handle), str(context.handle)) - cuda.Context.pop() - - # Add all the contexts we popped that were not our own - for context in other_contexts: - self.logger.debug("<%s> Pushing <%s>", str(self.cuda_context.handle), str(context.handle)) - cuda.Context.push(context) - - self.logger.debug("<%s> Detaching", str(self.cuda_context.handle)) - self.cuda_context.detach() - - - def __str__(self): - return "CudaContext id " + str(self.cuda_context.handle) - - - def hash_kernel(kernel_filename, include_dirs): - # Generate a kernel ID for our caches - num_includes = 0 - max_includes = 100 - kernel_hasher = hashlib.md5() - logger = logging.getLogger(__name__) - - # Loop over file and includes, and check if something has changed - files = [kernel_filename] - while len(files): - - if (num_includes > max_includes): - raise("Maximum number of includes reached - circular include in {:}?".format(kernel_filename)) - - filename = files.pop() - - #logger.debug("Hashing %s", filename) - - modified = os.path.getmtime(filename) - - # Open the file - with io.open(filename, "r") as file: - - # Search for #inclue and also hash the file - file_str = file.read() - kernel_hasher.update(file_str.encode('utf-8')) - kernel_hasher.update(str(modified).encode('utf-8')) - - #Find all includes - includes = re.findall('^\W*#include\W+(.+?)\W*$', file_str, re.M) - - # Loop over everything that looks like an include - for include_file in includes: - - #Search through include directories for the file - file_path = os.path.dirname(filename) - for include_path in [file_path] + include_dirs: - - # If we find it, add it to list of files to check - temp_path = os.path.join(include_path, include_file) - if (os.path.isfile(temp_path)): - files = files + [temp_path] - num_includes = num_includes + 1 #For circular includes... 
- break - - return kernel_hasher.hexdigest() - - - """ - Reads a text file and creates an OpenCL kernel from that - """ - def get_module(self, kernel_filename, - include_dirs=[], \ - defines={}, \ - compile_args={'no_extern_c', True}, jit_compile_args={}): - """ - Helper function to print compilation output - """ - def cuda_compile_message_handler(compile_success_bool, info_str, error_str): - self.logger.debug("Compilation returned %s", str(compile_success_bool)) - if info_str: - self.logger.debug("Info: %s", info_str) - if error_str: - self.logger.debug("Error: %s", error_str) - - kernel_filename = os.path.normpath(kernel_filename) - kernel_path = os.path.abspath(os.path.join(self.module_path, kernel_filename)) - #self.logger.debug("Getting %s", kernel_filename) - - # Create a hash of the kernel options - options_hasher = hashlib.md5() - options_hasher.update(str(defines).encode('utf-8') + str(compile_args).encode('utf-8')); - options_hash = options_hasher.hexdigest() - - # Create hash of kernel souce - source_hash = CudaContext.hash_kernel( \ - kernel_path, \ - include_dirs=[self.module_path] + include_dirs) - - # Create final hash - root, ext = os.path.splitext(kernel_filename) - kernel_hash = root \ - + "_" + source_hash \ - + "_" + options_hash \ - + ext - cached_kernel_filename = os.path.join(self.cache_path, kernel_hash) - - # If we have the kernel in our hashmap, return it - if (kernel_hash in self.modules.keys()): - self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash) - return self.modules[kernel_hash] - - # If we have it on disk, return it - elif (self.use_cache and os.path.isfile(cached_kernel_filename)): - self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash) - - with io.open(cached_kernel_filename, "rb") as file: - file_str = file.read() - module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler, **jit_compile_args) - - self.modules[kernel_hash] = module - return module - - # Otherwise, compile it from source - else: - self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash) - - #Create kernel string - kernel_string = "" - for key, value in defines.items(): - kernel_string += "#define {:s} {:s}\n".format(str(key), str(value)) - kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename)) - if (self.use_cache): - cached_kernel_dir = os.path.dirname(cached_kernel_filename) - if not os.path.isdir(cached_kernel_dir): - os.mkdir(cached_kernel_dir) - with io.open(cached_kernel_filename + ".txt", "w") as file: - file.write(kernel_string) - - - with Common.Timer("compiler") as timer: - import warnings - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", message="The CUDA compiler succeeded, but said the following:\nkernel.cu", category=UserWarning) - cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, cache_dir=False, **compile_args) - module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler, **jit_compile_args) - if (self.use_cache): - with io.open(cached_kernel_filename, "wb") as file: - file.write(cubin) - - self.modules[kernel_hash] = module - return module - - """ - Clears the kernel cache (useful for debugging & development) - """ - def clear_kernel_cache(self): - self.logger.debug("Clearing cache") - self.modules = {} - gc.collect() - - """ - Synchronizes all streams etc - """ - def synchronize(self): - self.cuda_context.synchronize() \ No newline at end of file diff --git 
a/GPUSimulators/EE2D_KP07_dimsplit.py b/GPUSimulators/EE2D_KP07_dimsplit.py
index 935eb90..4d46c3e 100644
--- a/GPUSimulators/EE2D_KP07_dimsplit.py
+++ b/GPUSimulators/EE2D_KP07_dimsplit.py
@@ -19,6 +19,9 @@ You should have received a copy of the GNU General Public License
 along with this program.  If not, see <https://www.gnu.org/licenses/>.
 """

+import os
+import sys
+
 #Import packages we need
 from GPUSimulators import Simulator, Common
 from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
@@ -27,13 +30,21 @@ import ctypes

 #from pycuda import gpuarray
 from hip import hip,hiprtc
-
-
-
-
-
-
-
+from hip import hipblas
+
+def hip_check(call_result):
+    err = call_result[0]
+    result = call_result[1:]
+    if len(result) == 1:
+        result = result[0]
+    if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
+        raise RuntimeError(str(err))
+    elif (
+        isinstance(err, hiprtc.hiprtcResult)
+        and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
+    ):
+        raise RuntimeError(str(err))
+    return result

 """
 Class that solves the SW equations using the Forward-Backward linear scheme
@@ -56,20 +67,6 @@ class EE2D_KP07_dimsplit (BaseSimulator):
         p: pressure
     """

-    def hip_check(call_result):
-        err = call_result[0]
-        result = call_result[1:]
-        if len(result) == 1:
-            result = result[0]
-        if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
-            raise RuntimeError(str(err))
-        elif (
-            isinstance(err, hiprtc.hiprtcResult)
-            and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
-        ):
-            raise RuntimeError(str(err))
-        return result
-
     def __init__(self,
                  context,
                  rho, rho_u, rho_v, E,
@@ -94,133 +91,210 @@ class EE2D_KP07_dimsplit (BaseSimulator):
         self.gamma = np.float32(gamma)
         self.theta = np.float32(theta)

-        #Get kernels
-        #module = context.get_module("cuda/EE2D_KP07_dimsplit.cu",
-        #                        defines={
-        #                            'BLOCK_WIDTH': self.block_size[0],
-        #                            'BLOCK_HEIGHT': self.block_size[1]
-        #                        },
-        #                        compile_args={
-        #                            'no_extern_c': True,
-        #                            'options': ["--use_fast_math"],
-        #                        },
-        #                        jit_compile_args={})
-        #self.kernel = module.get_function("KP07DimsplitKernel")
-        #self.kernel.prepare("iiffffffiiPiPiPiPiPiPiPiPiPiiii")
-        #
-        kernel_file_path = os.path.abspath(os.path.join('cuda', 'EE2D_KP07_dimsplit.cu.hip'))
+
+        #Get cuda kernels
+        """ Cuda
+        module = context.get_module("cuda/EE2D_KP07_dimsplit.cu.hip",
+                                defines={
+                                    'BLOCK_WIDTH': self.block_size[0],
+                                    'BLOCK_HEIGHT': self.block_size[1]
+                                },
+                                compile_args={
+                                    'no_extern_c': True,
+                                    'options': ["--use_fast_math"],
+                                },
+                                jit_compile_args={})
+        #compile and load to the device
+        self.kernel = module.get_function("KP07DimsplitKernel")
+        self.kernel.prepare("iiffffffiiPiPiPiPiPiPiPiPiPiiii")
+        """
+
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        # Specify the relative path to the "cuda" directory
+        cuda_dir = os.path.join(current_dir, 'cuda')
+
+        #source code
+        kernel_file_path = os.path.abspath(os.path.join(cuda_dir, 'EE2D_KP07_dimsplit.cu.hip'))
         with open(kernel_file_path, 'r') as file:
             kernel_source = file.read()

+        #headers
+        #common.h
+        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'common.h'))
+        with open(header_file_path, 'r') as file:
+            header_common = file.read()

-        prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"KP07DimsplitKernel", 0, [], []))
+        #EulerCommon.h
+        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'EulerCommon.h'))
+        with open(header_file_path, 'r') as file:
+            header_EulerCommon = file.read()

+        #limiters.h
+        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'limiters.h'))
+        with open(header_file_path, 'r') as file:
+            header_limiters = file.read()
+
+        #hip.hiprtc.hiprtcCreateProgram(const char *src, const char *name, int numHeaders, headers, includeNames)
+        prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"KP07DimsplitKernel", 3, [header_common.encode(),header_EulerCommon.encode(),header_limiters.encode()], [b"common.h",b"EulerCommon.h",b"limiters.h"]))
+
+        # hip_check() above raises on any HIPRTC error, so a None program here is a genuine failure
+        if prog is None:
+            raise RuntimeError("Failed to create HIPRTC program")
+
+        #extract the arch of the device
         props = hip.hipDeviceProp_t()
-        hip_check(hip.hipGetDeviceProperties(props,0))
+        hip_check(hip.hipGetDeviceProperties(props,0)) #only one device 0
         arch = props.gcnArchName

         print(f"Compiling kernel for {arch}")

-        cflags = [b"--offload-arch="+arch]
+        cflags = [b"--offload-arch="+arch, b"-O2", b"-D BLOCK_WIDTH="+ str(self.block_size[0]).encode(), b"-D BLOCK_HEIGHT=" + str(self.block_size[1]).encode()]
+
         err, = hiprtc.hiprtcCompileProgram(prog, len(cflags), cflags)
+        # Check whether compilation succeeded; on failure dump the HIPRTC log
         if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS:
             log_size = hip_check(hiprtc.hiprtcGetProgramLogSize(prog))
             log = bytearray(log_size)
             hip_check(hiprtc.hiprtcGetProgramLog(prog, log))
             raise RuntimeError(log.decode())
+
         code_size = hip_check(hiprtc.hiprtcGetCodeSize(prog))
         code = bytearray(code_size)
         hip_check(hiprtc.hiprtcGetCode(prog, code))
-        module = hip_check(hip.hipModuleLoadData(code))
-        kernel = hip_check(hip.hipModuleGetFunction(module, b"KP07DimsplitKernel"))

+        #Load the code as a module
+        self.module = hip_check(hip.hipModuleLoadData(code))
+
+        #Get the device kernel named "KP07DimsplitKernel"
+        self.kernel = hip_check(hip.hipModuleGetFunction(self.module, b"KP07DimsplitKernel"))
+
+        print("--Loaded device kernel KP07DimsplitKernel:", self.kernel)

         #Create data by uploading to device
         self.u0 = Common.ArakawaA2D(self.stream,
                         nx, ny,
                         2, 2,
                         [rho, rho_u, rho_v, E])
+
         self.u1 = Common.ArakawaA2D(self.stream,
                         nx, ny,
                         2, 2,
                         [None, None, None, None])
+
         #self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)

         # init device array cfl_data
-        data_h = np.empty(self.grid_size, dtype=np.float32)
-        num_bytes = data_h.size * data_h.itemsize
-        self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure(
-                    typestr="float32",shape=self.grid_size)
-
         dt_x = np.min(self.dx / (np.abs(rho_u/rho) + np.sqrt(gamma*rho)))
         dt_y = np.min(self.dy / (np.abs(rho_v/rho) + np.sqrt(gamma*rho)))
         self.dt = min(dt_x, dt_y)
-        self.cfl_data.fill(self.dt, stream=self.stream)
-
-
+        #in HIP, the "DeviceArray" object doesn't have a 'fill' attribute
+        #self.cfl_data.fill(self.dt, stream=self.stream)
+        grid_dim_x, grid_dim_y, grid_dim_z = self.grid_size
+
+        data_h = np.zeros((grid_dim_x, grid_dim_y), dtype=np.float32)
+        num_bytes = data_h.size * data_h.itemsize
+        data_h.fill(self.dt)
+
+        self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure(
+                    typestr="float32",shape=(grid_dim_x, grid_dim_y))
+
+        hip_check(hip.hipMemcpyAsync(self.cfl_data,data_h,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice,self.stream))
+        #sets the memory region pointed to by x_d to zero asynchronously
+        #initiates the memset operation asynchronously
+        #hip_check(hip.hipMemsetAsync(self.cfl_data,0,num_bytes,self.stream))
+
     def substep(self, dt, step_number, external=True, internal=True):
         self.substepDimsplit(0.5*dt, step_number, external, internal)

     def substepDimsplit(self, dt, substep, external, internal):
+
+        #row pitch in bytes of each (ny_halo, nx_halo) device array: a row has shape[1] elements
+        u00_strides0 = self.u0[0].data.shape[1]*np.float32().itemsize
+        u01_strides0 = self.u0[1].data.shape[1]*np.float32().itemsize
+        u02_strides0 = self.u0[2].data.shape[1]*np.float32().itemsize
+        u03_strides0 = self.u0[3].data.shape[1]*np.float32().itemsize
+
+        u10_strides0 = self.u1[0].data.shape[1]*np.float32().itemsize
+        u11_strides0 = self.u1[1].data.shape[1]*np.float32().itemsize
+        u12_strides0 = self.u1[2].data.shape[1]*np.float32().itemsize
+        u13_strides0 = self.u1[3].data.shape[1]*np.float32().itemsize
+
         if external and internal:
+            #print("COMPLETE DOMAIN (dt=" + str(dt) + ")")

-#            self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
-#                    self.nx, self.ny,
-#                    self.dx, self.dy, dt,
-#                    self.g,
-#                    self.gamma,
-#                    self.theta,
-#                    substep,
-#                    self.boundary_conditions,
-#                    self.u0[0].data.gpudata, self.u0[0].data.strides[0],
-#                    self.u0[1].data.gpudata, self.u0[1].data.strides[0],
-#                    self.u0[2].data.gpudata, self.u0[2].data.strides[0],
-#                    self.u0[3].data.gpudata, self.u0[3].data.strides[0],
-#                    self.u1[0].data.gpudata, self.u1[0].data.strides[0],
-#                    self.u1[1].data.gpudata, self.u1[1].data.strides[0],
-#                    self.u1[2].data.gpudata, self.u1[2].data.strides[0],
-#                    self.u1[3].data.gpudata, self.u1[3].data.strides[0],
-#                    self.cfl_data.gpudata,
-#                    0, 0,
-#                    self.nx, self.ny)
-
-            #launch kernel
+            """ Cuda
+            self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
+                    self.nx, self.ny,
+                    self.dx, self.dy, dt,
+                    self.g,
+                    self.gamma,
+                    self.theta,
+                    substep,
+                    self.boundary_conditions,
+                    self.u0[0].data.gpudata, self.u0[0].data.strides[0],
+                    self.u0[1].data.gpudata, self.u0[1].data.strides[0],
+                    self.u0[2].data.gpudata, self.u0[2].data.strides[0],
+                    self.u0[3].data.gpudata, self.u0[3].data.strides[0],
+                    self.u1[0].data.gpudata, self.u1[0].data.strides[0],
+                    self.u1[1].data.gpudata, self.u1[1].data.strides[0],
+                    self.u1[2].data.gpudata, self.u1[2].data.strides[0],
+                    self.u1[3].data.gpudata, self.u1[3].data.strides[0],
+                    self.cfl_data.gpudata,
+                    0, 0,
+                    self.nx, self.ny)
+            """
+
+            #hip.hipModuleLaunchKernel(f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, stream, kernelParams, extra)
+
+            #The grid/block arguments require 3 components x, y and z; in 2D, z=1.
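+            # kernelParams and extra are mutually exclusive ways of passing
+            # arguments to hipModuleLaunchKernel; with `extra`, every scalar must
+            # be wrapped in a ctypes type matching the kernel signature, while
+            # device allocations (self.u0[i].data, self.cfl_data) are passed
+            # as-is and hip-python resolves them to device pointers.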
hip_check( hip.hipModuleLaunchKernel( - kernel, - *self.grid_size, - *self.block_size, - sharedMemBytes=0, + self.kernel, + *self.grid_size, #grid + *self.block_size, #block + sharedMemBytes=0, #65536, stream=self.stream, kernelParams=None, extra=( # pass kernel's arguments ctypes.c_int(self.nx), ctypes.c_int(self.ny), - ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(self.dt), + ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt), ctypes.c_float(self.g), ctypes.c_float(self.gamma), ctypes.c_float(self.theta), ctypes.c_int(substep), ctypes.c_int(self.boundary_conditions), - ctypes.c_float(self.u0[0].data), ctypes.c_float(self.u0[0].data.strides[0]), - ctypes.c_float(self.u0[1].data), ctypes.c_float(self.u0[1].data.strides[0]), - ctypes.c_float(self.u0[2].data), ctypes.c_float(self.u0[2].data.strides[0]), - ctypes.c_float(self.u0[3].data), ctypes.c_float(self.u0[3].data.strides[0]), - ctypes.c_float(self.u1[0].data), ctypes.c_float(self.u1[0].data.strides[0]), - ctypes.c_float(self.u1[1].data), ctypes.c_float(self.u1[1].data.strides[0]), - ctypes.c_float(self.u1[2].data), ctypes.c_float(self.u1[2].data.strides[0]), - ctypes.c_float(self.u1[3].data), ctypes.c_float(self.u1[3].data.strides[0]), + self.u0[0].data, ctypes.c_int(u00_strides0), + self.u0[1].data, ctypes.c_int(u01_strides0), + self.u0[2].data, ctypes.c_int(u02_strides0), + self.u0[3].data, ctypes.c_int(u03_strides0), + self.u1[0].data, ctypes.c_int(u10_strides0), + self.u1[1].data, ctypes.c_int(u11_strides0), + self.u1[2].data, ctypes.c_int(u12_strides0), + self.u1[3].data, ctypes.c_int(u13_strides0), self.cfl_data, - 0, 0, - ctypes.c_int(self.nx), ctypes.c_int(self.ny) - ) + ctypes.c_int(0), ctypes.c_int(0), + ctypes.c_int(self.nx), ctypes.c_int(self.ny), ) ) + ) - hip_check(hip.hipDeviceSynchronize()) - hip_check(hip.hipModuleUnload(module)) - - hip_check(hip.hipFree(cfl_data)) - - print("--External & Internal: Launching Kernel is ok") + #print("--External & Internal: Launching Kernel is ok") return @@ -229,243 +303,250 @@ class EE2D_KP07_dimsplit (BaseSimulator): # XXX: Corners are treated twice! 
# ################################### - ns_grid_size = (self.grid_size[0], 1) - + ns_grid_size = (self.grid_size[0], 1, 1) # NORTH # (x0, y0) x (x1, y1) # (0, ny-y_halo) x (nx, ny) -# self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream, -# self.nx, self.ny, -# self.dx, self.dy, dt, -# self.g, -# self.gamma, -# self.theta, -# substep, -# self.boundary_conditions, -# self.u0[0].data.gpudata, self.u0[0].data.strides[0], -# self.u0[1].data.gpudata, self.u0[1].data.strides[0], -# self.u0[2].data.gpudata, self.u0[2].data.strides[0], -# self.u0[3].data.gpudata, self.u0[3].data.strides[0], -# self.u1[0].data.gpudata, self.u1[0].data.strides[0], -# self.u1[1].data.gpudata, self.u1[1].data.strides[0], -# self.u1[2].data.gpudata, self.u1[2].data.strides[0], -# self.u1[3].data.gpudata, self.u1[3].data.strides[0], -# self.cfl_data.gpudata, -# 0, self.ny - int(self.u0[0].y_halo), -# self.nx, self.ny) - + """ Cuda + self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream, + self.nx, self.ny, + self.dx, self.dy, dt, + self.g, + self.gamma, + self.theta, + substep, + self.boundary_conditions, + self.u0[0].data.gpudata, self.u0[0].data.strides[0], + self.u0[1].data.gpudata, self.u0[1].data.strides[0], + self.u0[2].data.gpudata, self.u0[2].data.strides[0], + self.u0[3].data.gpudata, self.u0[3].data.strides[0], + self.u1[0].data.gpudata, self.u1[0].data.strides[0], + self.u1[1].data.gpudata, self.u1[1].data.strides[0], + self.u1[2].data.gpudata, self.u1[2].data.strides[0], + self.u1[3].data.gpudata, self.u1[3].data.strides[0], + self.cfl_data.gpudata, + 0, self.ny - int(self.u0[0].y_halo), + self.nx, self.ny) + """ + hip_check( hip.hipModuleLaunchKernel( - kernel, - *ns_grid_size, - *self.block_size, - sharedMemBytes=0, + self.kernel, + *ns_grid_size, #grid + *self.block_size, #block + sharedMemBytes=0, #65536, stream=self.stream, kernelParams=None, extra=( # pass kernel's arguments ctypes.c_int(self.nx), ctypes.c_int(self.ny), - ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(self.dt), + ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt), ctypes.c_float(self.g), ctypes.c_float(self.gamma), ctypes.c_float(self.theta), ctypes.c_int(substep), ctypes.c_int(self.boundary_conditions), - ctypes.c_float(self.u0[0].data), ctypes.c_float(self.u0[0].data.strides[0]), - ctypes.c_float(self.u0[1].data), ctypes.c_float(self.u0[1].data.strides[0]), - ctypes.c_float(self.u0[2].data), ctypes.c_float(self.u0[2].data.strides[0]), - ctypes.c_float(self.u0[3].data), ctypes.c_float(self.u0[3].data.strides[0]), - ctypes.c_float(self.u1[0].data), ctypes.c_float(self.u1[0].data.strides[0]), - ctypes.c_float(self.u1[1].data), ctypes.c_float(self.u1[1].data.strides[0]), - ctypes.c_float(self.u1[2].data), ctypes.c_float(self.u1[2].data.strides[0]), - ctypes.c_float(self.u1[3].data), ctypes.c_float(self.u1[3].data.strides[0]), + self.u0[0].data, ctypes.c_int(u00_strides0), + self.u0[1].data, ctypes.c_int(u01_strides0), + self.u0[2].data, ctypes.c_int(u02_strides0), + self.u0[3].data, ctypes.c_int(u03_strides0), + self.u1[0].data, ctypes.c_int(u10_strides0), + self.u1[1].data, ctypes.c_int(u11_strides0), + self.u1[2].data, ctypes.c_int(u12_strides0), + self.u1[3].data, ctypes.c_int(u13_strides0), self.cfl_data, - 0, ctypes.c_int(self.ny) - ctypes.c_int(self.u0[0].y_halo), - ctypes.c_int(self.nx), ctypes.c_int(self.ny) - ) + ctypes.c_int(0), ctypes.c_int(self.ny - self.u0[0].y_halo), + ctypes.c_int(self.nx), ctypes.c_int(self.ny), ) ) + ) + 
hip_check(hip.hipStreamSynchronize(self.stream)) + #print() + #print("--I m at the NORTH:") + #print() # SOUTH # (x0, y0) x (x1, y1) # (0, 0) x (nx, y_halo) -# self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream, -# self.nx, self.ny, -# self.dx, self.dy, dt, -# self.g, -# self.gamma, -# self.theta, -# substep, -# self.boundary_conditions, -# self.u0[0].data.gpudata, self.u0[0].data.strides[0], -# self.u0[1].data.gpudata, self.u0[1].data.strides[0], -# self.u0[2].data.gpudata, self.u0[2].data.strides[0], -# self.u0[3].data.gpudata, self.u0[3].data.strides[0], -# self.u1[0].data.gpudata, self.u1[0].data.strides[0], -# self.u1[1].data.gpudata, self.u1[1].data.strides[0], -# self.u1[2].data.gpudata, self.u1[2].data.strides[0], -# self.u1[3].data.gpudata, self.u1[3].data.strides[0], -# self.cfl_data.gpudata, -# 0, 0, -# self.nx, int(self.u0[0].y_halo)) - + """ Cuda + self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream, + self.nx, self.ny, + self.dx, self.dy, dt, + self.g, + self.gamma, + self.theta, + substep, + self.boundary_conditions, + self.u0[0].data.gpudata, self.u0[0].data.strides[0], + self.u0[1].data.gpudata, self.u0[1].data.strides[0], + self.u0[2].data.gpudata, self.u0[2].data.strides[0], + self.u0[3].data.gpudata, self.u0[3].data.strides[0], + self.u1[0].data.gpudata, self.u1[0].data.strides[0], + self.u1[1].data.gpudata, self.u1[1].data.strides[0], + self.u1[2].data.gpudata, self.u1[2].data.strides[0], + self.u1[3].data.gpudata, self.u1[3].data.strides[0], + self.cfl_data.gpudata, + 0, 0, + self.nx, int(self.u0[0].y_halo)) + """ + hip_check( hip.hipModuleLaunchKernel( - kernel, - *ns_grid_size, - *self.block_size, - sharedMemBytes=0, + self.kernel, + *ns_grid_size, #grid + *self.block_size, #block + sharedMemBytes=0, #65536, stream=self.stream, kernelParams=None, extra=( # pass kernel's arguments ctypes.c_int(self.nx), ctypes.c_int(self.ny), - ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(self.dt), + ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt), ctypes.c_float(self.g), ctypes.c_float(self.gamma), ctypes.c_float(self.theta), ctypes.c_int(substep), ctypes.c_int(self.boundary_conditions), - ctypes.c_float(self.u0[0].data), ctypes.c_float(self.u0[0].data.strides[0]), - ctypes.c_float(self.u0[1].data), ctypes.c_float(self.u0[1].data.strides[0]), - ctypes.c_float(self.u0[2].data), ctypes.c_float(self.u0[2].data.strides[0]), - ctypes.c_float(self.u0[3].data), ctypes.c_float(self.u0[3].data.strides[0]), - ctypes.c_float(self.u1[0].data), ctypes.c_float(self.u1[0].data.strides[0]), - ctypes.c_float(self.u1[1].data), ctypes.c_float(self.u1[1].data.strides[0]), - ctypes.c_float(self.u1[2].data), ctypes.c_float(self.u1[2].data.strides[0]), - ctypes.c_float(self.u1[3].data), ctypes.c_float(self.u1[3].data.strides[0]), + self.u0[0].data, ctypes.c_int(u00_strides0), + self.u0[1].data, ctypes.c_int(u01_strides0), + self.u0[2].data, ctypes.c_int(u02_strides0), + self.u0[3].data, ctypes.c_int(u03_strides0), + self.u1[0].data, ctypes.c_int(u10_strides0), + self.u1[1].data, ctypes.c_int(u11_strides0), + self.u1[2].data, ctypes.c_int(u12_strides0), + self.u1[3].data, ctypes.c_int(u13_strides0), self.cfl_data, - 0, 0, - ctypes.c_int(self.nx), ctypes.c_int(self.u0[0].y_halo) - ) + ctypes.c_int(0), ctypes.c_int(0), + ctypes.c_int(self.nx), ctypes.c_int(self.u0[0].y_halo), ) ) - - - we_grid_size = (1, self.grid_size[1]) + ) + hip_check(hip.hipStreamSynchronize(self.stream)) + + we_grid_size = (1, self.grid_size[1], 1) 
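+        # The same kernel sweeps every halo strip: north/south used an
+        # (nx-blocks, 1, 1) grid above, west/east below use (1, ny-blocks, 1),
+        # and the last four kernel arguments select the sub-domain to update.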
# WEST # (x0, y0) x (x1, y1) # (0, 0) x (x_halo, ny) -# self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream, -# self.nx, self.ny, -# self.dx, self.dy, dt, -# self.g, -# self.gamma, -# self.theta, -# substep, -# self.boundary_conditions, -# self.u0[0].data.gpudata, self.u0[0].data.strides[0], -# self.u0[1].data.gpudata, self.u0[1].data.strides[0], -# self.u0[2].data.gpudata, self.u0[2].data.strides[0], -# self.u0[3].data.gpudata, self.u0[3].data.strides[0], -# self.u1[0].data.gpudata, self.u1[0].data.strides[0], -# self.u1[1].data.gpudata, self.u1[1].data.strides[0], -# self.u1[2].data.gpudata, self.u1[2].data.strides[0], -# self.u1[3].data.gpudata, self.u1[3].data.strides[0], -# self.cfl_data.gpudata, -# 0, 0, -# int(self.u0[0].x_halo), self.ny) + """ Cuda + self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream, + self.nx, self.ny, + self.dx, self.dy, dt, + self.g, + self.gamma, + self.theta, + substep, + self.boundary_conditions, + self.u0[0].data.gpudata, self.u0[0].data.strides[0], + self.u0[1].data.gpudata, self.u0[1].data.strides[0], + self.u0[2].data.gpudata, self.u0[2].data.strides[0], + self.u0[3].data.gpudata, self.u0[3].data.strides[0], + self.u1[0].data.gpudata, self.u1[0].data.strides[0], + self.u1[1].data.gpudata, self.u1[1].data.strides[0], + self.u1[2].data.gpudata, self.u1[2].data.strides[0], + self.u1[3].data.gpudata, self.u1[3].data.strides[0], + self.cfl_data.gpudata, + 0, 0, + int(self.u0[0].x_halo), self.ny) + """ hip_check( hip.hipModuleLaunchKernel( - kernel, - *we_grid_size, - *self.block_size, - sharedMemBytes=0, + self.kernel, + *we_grid_size, #grid + *self.block_size, #block + sharedMemBytes=0, #65536, stream=self.stream, kernelParams=None, extra=( # pass kernel's arguments ctypes.c_int(self.nx), ctypes.c_int(self.ny), - ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(self.dt), + ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt), ctypes.c_float(self.g), ctypes.c_float(self.gamma), ctypes.c_float(self.theta), ctypes.c_int(substep), ctypes.c_int(self.boundary_conditions), - ctypes.c_float(self.u0[0].data), ctypes.c_float(self.u0[0].data.strides[0]), - ctypes.c_float(self.u0[1].data), ctypes.c_float(self.u0[1].data.strides[0]), - ctypes.c_float(self.u0[2].data), ctypes.c_float(self.u0[2].data.strides[0]), - ctypes.c_float(self.u0[3].data), ctypes.c_float(self.u0[3].data.strides[0]), - ctypes.c_float(self.u1[0].data), ctypes.c_float(self.u1[0].data.strides[0]), - ctypes.c_float(self.u1[1].data), ctypes.c_float(self.u1[1].data.strides[0]), - ctypes.c_float(self.u1[2].data), ctypes.c_float(self.u1[2].data.strides[0]), - ctypes.c_float(self.u1[3].data), ctypes.c_float(self.u1[3].data.strides[0]), + self.u0[0].data, ctypes.c_int(u00_strides0), + self.u0[1].data, ctypes.c_int(u01_strides0), + self.u0[2].data, ctypes.c_int(u02_strides0), + self.u0[3].data, ctypes.c_int(u03_strides0), + self.u1[0].data, ctypes.c_int(u10_strides0), + self.u1[1].data, ctypes.c_int(u11_strides0), + self.u1[2].data, ctypes.c_int(u12_strides0), + self.u1[3].data, ctypes.c_int(u13_strides0), self.cfl_data, - 0, 0, - ctypes.c_int(self.u0[0].x_halo), ctypes.c_int(self.ny) - ) + ctypes.c_int(0), ctypes.c_int(0), + ctypes.c_int(self.u0[0].x_halo), ctypes.c_int(self.ny), ) ) + ) + hip_check(hip.hipStreamSynchronize(self.stream)) # EAST # (x0, y0) x (x1, y1) # (nx-x_halo, 0) x (nx, ny) -# self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream, -# self.nx, self.ny, -# self.dx, self.dy, dt, -# self.g, -# 
self.gamma, -# self.theta, -# substep, -# self.boundary_conditions, -# self.u0[0].data.gpudata, self.u0[0].data.strides[0], -# self.u0[1].data.gpudata, self.u0[1].data.strides[0], -# self.u0[2].data.gpudata, self.u0[2].data.strides[0], -# self.u0[3].data.gpudata, self.u0[3].data.strides[0], -# self.u1[0].data.gpudata, self.u1[0].data.strides[0], -# self.u1[1].data.gpudata, self.u1[1].data.strides[0], -# self.u1[2].data.gpudata, self.u1[2].data.strides[0], -# self.u1[3].data.gpudata, self.u1[3].data.strides[0], -# self.cfl_data.gpudata, -# self.nx - int(self.u0[0].x_halo), 0, -# self.nx, self.ny) + """ Cuda + self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream, + self.nx, self.ny, + self.dx, self.dy, dt, + self.g, + self.gamma, + self.theta, + substep, + self.boundary_conditions, + self.u0[0].data.gpudata, self.u0[0].data.strides[0], + self.u0[1].data.gpudata, self.u0[1].data.strides[0], + self.u0[2].data.gpudata, self.u0[2].data.strides[0], + self.u0[3].data.gpudata, self.u0[3].data.strides[0], + self.u1[0].data.gpudata, self.u1[0].data.strides[0], + self.u1[1].data.gpudata, self.u1[1].data.strides[0], + self.u1[2].data.gpudata, self.u1[2].data.strides[0], + self.u1[3].data.gpudata, self.u1[3].data.strides[0], + self.cfl_data.gpudata, + self.nx - int(self.u0[0].x_halo), 0, + self.nx, self.ny) + """ hip_check( hip.hipModuleLaunchKernel( - kernel, - *we_grid_size, - *self.block_size, - sharedMemBytes=0, + self.kernel, + *we_grid_size, #grid + *self.block_size, #block + sharedMemBytes=0, #65536, stream=self.stream, kernelParams=None, extra=( # pass kernel's arguments ctypes.c_int(self.nx), ctypes.c_int(self.ny), - ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(self.dt), + ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt), ctypes.c_float(self.g), ctypes.c_float(self.gamma), ctypes.c_float(self.theta), ctypes.c_int(substep), ctypes.c_int(self.boundary_conditions), - ctypes.c_float(self.u0[0].data), ctypes.c_float(self.u0[0].data.strides[0]), - ctypes.c_float(self.u0[1].data), ctypes.c_float(self.u0[1].data.strides[0]), - ctypes.c_float(self.u0[2].data), ctypes.c_float(self.u0[2].data.strides[0]), - ctypes.c_float(self.u0[3].data), ctypes.c_float(self.u0[3].data.strides[0]), - ctypes.c_float(self.u1[0].data), ctypes.c_float(self.u1[0].data.strides[0]), - ctypes.c_float(self.u1[1].data), ctypes.c_float(self.u1[1].data.strides[0]), - ctypes.c_float(self.u1[2].data), ctypes.c_float(self.u1[2].data.strides[0]), - ctypes.c_float(self.u1[3].data), ctypes.c_float(self.u1[3].data.strides[0]), + self.u0[0].data, ctypes.c_int(u00_strides0), + self.u0[1].data, ctypes.c_int(u01_strides0), + self.u0[2].data, ctypes.c_int(u02_strides0), + self.u0[3].data, ctypes.c_int(u03_strides0), + self.u1[0].data, ctypes.c_int(u10_strides0), + self.u1[1].data, ctypes.c_int(u11_strides0), + self.u1[2].data, ctypes.c_int(u12_strides0), + self.u1[3].data, ctypes.c_int(u13_strides0), self.cfl_data, - ctypes.c_int(self.nx) - ctypes.c_int(self.u0[0].x_halo), 0, - ctypes.c_int(self.nx), ctypes.c_int(self.ny) - ) + ctypes.c_int(self.nx - self.u0[0].x_halo), ctypes.c_int(0), + ctypes.c_int(self.nx), ctypes.c_int(self.ny), ) ) + ) - hip_check(hip.hipDeviceSynchronize()) - hip_check(hip.hipModuleUnload(module)) - - hip_check(hip.hipFree(cfl_data)) - - print("--External and not Internal: Launching Kernel is ok") - +# print("--External and not Internal: Launching Kernel is ok") return if internal and not external: - + # INTERNAL DOMAIN # (x0, y0) x (x1, y1) # (x_halo, y_halo) 
x (nx - x_halo, ny - y_halo) + """ self.kernel.prepared_async_call(self.grid_size, self.block_size, self.internal_stream, self.nx, self.ny, self.dx, self.dy, dt, @@ -485,45 +566,40 @@ class EE2D_KP07_dimsplit (BaseSimulator): self.cfl_data.gpudata, int(self.u0[0].x_halo), int(self.u0[0].y_halo), self.nx - int(self.u0[0].x_halo), self.ny - int(self.u0[0].y_halo)) - + """ hip_check( hip.hipModuleLaunchKernel( - kernel, - *self.grid_size, - *self.block_size, - sharedMemBytes=0, + self.kernel, + *self.grid_size, #grid + *self.block_size, #block + sharedMemBytes=0, #65536, stream=self.internal_stream, kernelParams=None, extra=( # pass kernel's arguments ctypes.c_int(self.nx), ctypes.c_int(self.ny), - ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(self.dt), + ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt), ctypes.c_float(self.g), ctypes.c_float(self.gamma), ctypes.c_float(self.theta), ctypes.c_int(substep), ctypes.c_int(self.boundary_conditions), - ctypes.c_float(self.u0[0].data), ctypes.c_float(self.u0[0].data.strides[0]), - ctypes.c_float(self.u0[1].data), ctypes.c_float(self.u0[1].data.strides[0]), - ctypes.c_float(self.u0[2].data), ctypes.c_float(self.u0[2].data.strides[0]), - ctypes.c_float(self.u0[3].data), ctypes.c_float(self.u0[3].data.strides[0]), - ctypes.c_float(self.u1[0].data), ctypes.c_float(self.u1[0].data.strides[0]), - ctypes.c_float(self.u1[1].data), ctypes.c_float(self.u1[1].data.strides[0]), - ctypes.c_float(self.u1[2].data), ctypes.c_float(self.u1[2].data.strides[0]), - ctypes.c_float(self.u1[3].data), ctypes.c_float(self.u1[3].data.strides[0]), + self.u0[0].data, ctypes.c_int(u00_strides0), + self.u0[1].data, ctypes.c_int(u01_strides0), + self.u0[2].data, ctypes.c_int(u02_strides0), + self.u0[3].data, ctypes.c_int(u03_strides0), + self.u1[0].data, ctypes.c_int(u10_strides0), + self.u1[1].data, ctypes.c_int(u11_strides0), + self.u1[2].data, ctypes.c_int(u12_strides0), + self.u1[3].data, ctypes.c_int(u13_strides0), self.cfl_data, ctypes.c_int(self.u0[0].x_halo), ctypes.c_int(self.u0[0].y_halo), - ctypes.c_int(self.nx) - ctypes.c_int(self.u0[0].x_halo), ctypes.c_int(self.ny) - ctypes.c_int(self.u0[0].y_halo) - ) + ctypes.c_int(self.nx - self.u0[0].x_halo), ctypes.c_int(self.ny - self.u0[0].y_halo), ) ) + ) - hip_check(hip.hipDeviceSynchronize()) - hip_check(hip.hipModuleUnload(module)) - - hip_check(hip.hipFree(cfl_data)) - - print("--Internal and not External: Launching Kernel is ok") + # print("--Internal and not External: Launching Kernel is ok") return def swapBuffers(self): diff --git a/GPUSimulators/FORCE.py b/GPUSimulators/FORCE.py index 092711a..fbdf73e 100644 --- a/GPUSimulators/FORCE.py +++ b/GPUSimulators/FORCE.py @@ -25,16 +25,24 @@ from GPUSimulators import Simulator, Common from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition import numpy as np import ctypes + #from pycuda import gpuarray from hip import hip,hiprtc +from hip import hipblas - - - - - - - +def hip_check(call_result): + err = call_result[0] + result = call_result[1:] + if len(result) == 1: + result = result[0] + if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess: + raise RuntimeError(str(err)) + elif ( + isinstance(err, hiprtc.hiprtcResult) + and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS + ): + raise RuntimeError(str(err)) + return result """ Class that solves the SW equations @@ -53,19 +61,6 @@ class FORCE (Simulator.BaseSimulator): dt: Size of each timestep (90 s) g: Gravitational accelleration (9.81 m/s^2) """ - def 
hip_check(call_result):
-        err = call_result[0]
-        result = call_result[1:]
-        if len(result) == 1:
-            result = result[0]
-        if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
-            raise RuntimeError(str(err))
-        elif (
-            isinstance(err, hiprtc.hiprtcResult)
-            and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
-        ):
-            raise RuntimeError(str(err))
-        return result
 
     def __init__(self, 
                  context, 
@@ -87,25 +82,55 @@ class FORCE (Simulator.BaseSimulator):
                  block_width, block_height)
         self.g = np.float32(g) 
 
-        #Get kernels
-#        module = context.get_module("cuda/SWE2D_FORCE.cu.hip",
-#                        defines={
-#                            'BLOCK_WIDTH': self.block_size[0], 
-#                            'BLOCK_HEIGHT': self.block_size[1]
-#                        }, 
-#                        compile_args={
-#                            'no_extern_c': True,
-#                            'options': ["--use_fast_math"], 
-#                        }, 
-#                        jit_compile_args={})
-#        self.kernel = module.get_function("FORCEKernel")
-#        self.kernel.prepare("iiffffiPiPiPiPiPiPiP")
+        #Get cuda kernels
+        """
+        module = context.get_module("cuda/SWE2D_FORCE.cu",
+                        defines={
+                            'BLOCK_WIDTH': self.block_size[0], 
+                            'BLOCK_HEIGHT': self.block_size[1]
+                        }, 
+                        compile_args={
+                            'no_extern_c': True,
+                            'options': ["--use_fast_math"], 
+                        }, 
+                        jit_compile_args={})
+        self.kernel = module.get_function("FORCEKernel")
+        self.kernel.prepare("iiffffiPiPiPiPiPiPiP")
+        """
 
-        kernel_file_path = os.path.abspath(os.path.join('cuda', 'SWE2D_FORCE.cu'))
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        # Specify the relative path to the "cuda" directory
+        cuda_dir = os.path.join(current_dir, 'cuda')
+
+        #kernel source
+        kernel_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWE2D_FORCE.cu.hip'))
         with open(kernel_file_path, 'r') as file:
             kernel_source = file.read()
 
-        prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"FORCEKernel", 0, [], []))
+        #headers
+        #common.h
+        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'common.h'))
+        with open(header_file_path, 'r') as file:
+            header_common = file.read()
+
+        #SWECommon.h
+        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWECommon.h'))
+        with open(header_file_path, 'r') as file:
+            header_SWECommon = file.read()
+
+        #hip.hiprtc.hiprtcCreateProgram(const char *src, const char *name, int numHeaders, headers, includeNames)
+        prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"FORCEKernel", 2, [header_common.encode(), header_SWECommon.encode()], [b"common.h", b"SWECommon.h"]))
+        # hip_check() raises on failure, so the program handle is valid past this point.
 
         props = hip.hipDeviceProp_t()
         hip_check(hip.hipGetDeviceProperties(props,0))
@@ -113,20 +138,38 @@ class FORCE (Simulator.BaseSimulator):
         print(f"Compiling kernel .FORCEKernel. for {arch}")
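hiprtc has no counterpart to PyCUDA's `defines=` mechanism, so the block dimensions reach the kernel source as preprocessor options when the program is compiled below. A minimal sketch of how such a flag list is assembled, with illustrative sizes and a placeholder architecture (the real values come from `self.block_size` and from `hipGetDeviceProperties(...).gcnArchName`):

```python
block_width, block_height = 16, 16   # illustrative; real values: self.block_size
arch = b"gfx90a"                     # placeholder; query the device at runtime

# One bytes object per option; `-D NAME=value` stands in for defines={...}.
cflags = [
    b"--offload-arch=" + arch,
    b"-O2",
    b"-D BLOCK_WIDTH=" + str(block_width).encode(),
    b"-D BLOCK_HEIGHT=" + str(block_height).encode(),
]
# err, = hiprtc.hiprtcCompileProgram(prog, len(cflags), cflags)
```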
for {arch}") - cflags = [b"--offload-arch="+arch] + cflags = [b"--offload-arch="+arch, b"-O2", b"-D BLOCK_WIDTH="+ str(self.block_size[0]).encode(), b"-D BLOCK_HEIGHT=" + str(self.block_size[1]).encode()] + err, = hiprtc.hiprtcCompileProgram(prog, len(cflags), cflags) + # Check if the program is compiled successfully + if err is not None: + print("--Compilation:", err) + print("--The program is compiled successfully") + else: + print("--Compilation:", err) + print("--Failed to compile the program") + print("--I stop:", err) + if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS: log_size = hip_check(hiprtc.hiprtcGetProgramLogSize(prog)) log = bytearray(log_size) hip_check(hiprtc.hiprtcGetProgramLog(prog, log)) raise RuntimeError(log.decode()) + code_size = hip_check(hiprtc.hiprtcGetCodeSize(prog)) code = bytearray(code_size) hip_check(hiprtc.hiprtcGetCode(prog, code)) - module = hip_check(hip.hipModuleLoadData(code)) - kernel = hip_check(hip.hipModuleGetFunction(module, b"FORCEKernel")) + #Load the code as a module + self.module = hip_check(hip.hipModuleLoadData(code)) + #Get the device kernel named named "FORCEKernel" + self.kernel = hip_check(hip.hipModuleGetFunction(self.module, b"FORCEKernel")) + + print() + print("--Get the device kernel *FORCEKernel* is created successfully--") + print("--kernel", self.kernel) + print() #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, @@ -138,65 +181,79 @@ class FORCE (Simulator.BaseSimulator): 1, 1, [None, None, None]) #self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) - data_h = np.empty(self.grid_size, dtype=np.float32) - num_bytes = data_h.size * data_h.itemsize - self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure( - typestr="float32",shape=self.grid_size) dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) dt = min(dt_x, dt_y) - self.cfl_data.fill(dt, stream=self.stream) - + #in HIP, the "DeviceArray" object doesn't have a 'fill' attribute + #self.cfl_data.fill(self.dt, stream=self.stream) + grid_dim_x, grid_dim_y, grid_dim_z = self.grid_size + + data_h = np.zeros((grid_dim_x, grid_dim_y), dtype=np.float32) + num_bytes = data_h.size * data_h.itemsize + data_h.fill(self.dt) + + self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure( + typestr="float32",shape=(grid_dim_x, grid_dim_y)) + + hip_check(hip.hipMemcpyAsync(self.cfl_data,data_h,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice,self.stream)) + #sets the memory region pointed to by x_d to zero asynchronously + #initiates the memset operation asynchronously + #hip_check(hip.hipMemsetAsync(self.cfl_data,0,num_bytes,self.stream)) + def substep(self, dt, step_number): -# self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, -# self.nx, self.ny, -# self.dx, self.dy, dt, -# self.g, -# self.boundary_conditions, -# self.u0[0].data.gpudata, self.u0[0].data.strides[0], -# self.u0[1].data.gpudata, self.u0[1].data.strides[0], -# self.u0[2].data.gpudata, self.u0[2].data.strides[0], -# self.u1[0].data.gpudata, self.u1[0].data.strides[0], -# self.u1[1].data.gpudata, self.u1[1].data.strides[0], -# self.u1[2].data.gpudata, self.u1[2].data.strides[0], -# self.cfl_data.gpudata) -# self.u0, self.u1 = self.u1, self.u0 - + #Cuda + """ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, + self.nx, self.ny, + self.dx, self.dy, dt, + self.g, + self.boundary_conditions, + self.u0[0].data.gpudata, self.u0[0].data.strides[0], + 
self.u0[1].data.gpudata, self.u0[1].data.strides[0], + self.u0[2].data.gpudata, self.u0[2].data.strides[0], + self.u1[0].data.gpudata, self.u1[0].data.strides[0], + self.u1[1].data.gpudata, self.u1[1].data.strides[0], + self.u1[2].data.gpudata, self.u1[2].data.strides[0], + self.cfl_data.gpudata) + self.u0, self.u1 = self.u1, self.u0 + """ + u00_strides0 = self.u0[0].data.shape[0]*np.float32().itemsize + u01_strides0 = self.u0[1].data.shape[0]*np.float32().itemsize + u02_strides0 = self.u0[2].data.shape[0]*np.float32().itemsize + + u10_strides0 = self.u1[0].data.shape[0]*np.float32().itemsize + u11_strides0 = self.u1[1].data.shape[0]*np.float32().itemsize + u12_strides0 = self.u1[2].data.shape[0]*np.float32().itemsize + #launch kernel hip_check( hip.hipModuleLaunchKernel( - kernel, - *self.grid_size, - *self.block_size, - sharedMemBytes=0, + self.kernel, + *self.grid_size, #grid + *self.block_size, #block + sharedMemBytes=0, #65536, stream=self.stream, kernelParams=None, extra=( # pass kernel's arguments ctypes.c_int(self.nx), ctypes.c_int(self.ny), - ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(self.dt), + ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt), ctypes.c_float(self.g), ctypes.c_int(self.boundary_conditions), - ctypes.c_float(self.u0[0].data), ctypes.c_float(self.u0[0].data.strides[0]), - ctypes.c_float(self.u0[1].data), ctypes.c_float(self.u0[1].data.strides[0]), - ctypes.c_float(self.u0[2].data), ctypes.c_float(self.u0[2].data.strides[0]), - ctypes.c_float(self.u1[0].data), ctypes.c_float(self.u1[0].data.strides[0]), - ctypes.c_float(self.u1[1].data), ctypes.c_float(self.u1[1].data.strides[0]), - ctypes.c_float(self.u1[2].data), ctypes.c_float(self.u1[2].data.strides[0]), - self.cfl_data - ) + self.u0[0].data, ctypes.c_int(u00_strides0), + self.u0[1].data, ctypes.c_int(u01_strides0), + self.u0[2].data, ctypes.c_int(u02_strides0), + self.u1[0].data, ctypes.c_int(u10_strides0), + self.u1[1].data, ctypes.c_int(u11_strides0), + self.u1[2].data, ctypes.c_int(u12_strides0), + self.cfl_data, ) ) - - hip_check(hip.hipDeviceSynchronize()) + ) + self.u0, self.u1 = self.u1, self.u0 - - hip_check(hip.hipModuleUnload(module)) - - hip_check(hip.hipFree(cfl_data)) - - print("--Launching Kernel .FORCEKernel. is ok") + #print("--Launching Kernel .FORCEKernel. 
is ok") def getOutput(self): return self.u0 diff --git a/GPUSimulators/HLL.py b/GPUSimulators/HLL.py index 792d3c6..3ef9737 100644 --- a/GPUSimulators/HLL.py +++ b/GPUSimulators/HLL.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- """ -This python module implements the HLL flux +This python module implements the FORCE flux +for the shallow water equations Copyright (C) 2016 SINTEF ICT @@ -27,10 +28,21 @@ import ctypes #from pycuda import gpuarray from hip import hip,hiprtc +from hip import hipblas - - - +def hip_check(call_result): + err = call_result[0] + result = call_result[1:] + if len(result) == 1: + result = result[0] + if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess: + raise RuntimeError(str(err)) + elif ( + isinstance(err, hiprtc.hiprtcResult) + and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS + ): + raise RuntimeError(str(err)) + return result """ Class that solves the SW equations using the Harten-Lax -van Leer approximate Riemann solver @@ -49,22 +61,9 @@ class HLL (Simulator.BaseSimulator): dt: Size of each timestep (90 s) g: Gravitational accelleration (9.81 m/s^2) """ - def hip_check(call_result): - err = call_result[0] - result = call_result[1:] - if len(result) == 1: - result = result[0] - if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess: - raise RuntimeError(str(err)) - elif ( - isinstance(err, hiprtc.hiprtcResult) - and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS - ): - raise RuntimeError(str(err)) - return result def __init__(self, - context, + context, h0, hu0, hv0, nx, ny, dx, dy, @@ -80,28 +79,58 @@ class HLL (Simulator.BaseSimulator): boundary_conditions, cfl_scale, 1, - block_width, block_height); + block_width, block_height) self.g = np.float32(g) - #Get kernels -# module = context.get_module("cuda/SWE2D_HLL.cu", -# defines={ -# 'BLOCK_WIDTH': self.block_size[0], -# 'BLOCK_HEIGHT': self.block_size[1] -# }, -# compile_args={ -# 'no_extern_c': True, -# 'options': ["--use_fast_math"], -# }, -# jit_compile_args={}) -# self.kernel = module.get_function("HLLKernel") -# self.kernel.prepare("iiffffiPiPiPiPiPiPiP") - - kernel_file_path = os.path.abspath(os.path.join('cuda', 'SWE2D_HLL.cu.hip')) + #Get cuda kernels + """ + module = context.get_module("cuda/SWE2D_HLL.cu", + defines={ + 'BLOCK_WIDTH': self.block_size[0], + 'BLOCK_HEIGHT': self.block_size[1] + }, + compile_args={ + 'no_extern_c': True, + 'options': ["--use_fast_math"], + }, + jit_compile_args={}) + self.kernel = module.get_function("HLLKernel") + self.kernel.prepare("iiffffiPiPiPiPiPiPiP") + """ + + current_dir = os.path.dirname(os.path.abspath(__file__)) + # Specify the relative path to the "cuda" directory + cuda_dir = os.path.join(current_dir, 'cuda') + + #kernel source + kernel_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWE2D_HLL.cu.hip')) with open(kernel_file_path, 'r') as file: kernel_source = file.read() - prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"HLLKernel", 0, [], [])) + #headers + #common.h + header_file_path = os.path.abspath(os.path.join(cuda_dir, 'common.h')) + with open(header_file_path, 'r') as file: + header_common = file.read() + + #SWECommon.h + header_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWECommon.h')) + with open(header_file_path, 'r') as file: + header_EulerCommon = file.read() + + #hip.hiprtc.hiprtcCreateProgram(const char *src, const char *name, int numHeaders, headers, includeNames) + + prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"HLLKernel", 2, 
 
         props = hip.hipDeviceProp_t()
         hip_check(hip.hipGetDeviceProperties(props,0))
@@ -109,19 +138,38 @@ class HLL (Simulator.BaseSimulator):
         print(f"Compiling kernel .HLLKernel. for {arch}")
 
-        cflags = [b"--offload-arch="+arch]
+        cflags = [b"--offload-arch="+arch, b"-O2", b"-D BLOCK_WIDTH="+ str(self.block_size[0]).encode(), b"-D BLOCK_HEIGHT=" + str(self.block_size[1]).encode()]
+
         err, = hiprtc.hiprtcCompileProgram(prog, len(cflags), cflags)
+        # On failure, fetch and raise the full compilation log.
         if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS:
             log_size = hip_check(hiprtc.hiprtcGetProgramLogSize(prog))
             log = bytearray(log_size)
             hip_check(hiprtc.hiprtcGetProgramLog(prog, log))
             raise RuntimeError(log.decode())
+
         code_size = hip_check(hiprtc.hiprtcGetCodeSize(prog))
         code = bytearray(code_size)
         hip_check(hiprtc.hiprtcGetCode(prog, code))
-        module = hip_check(hip.hipModuleLoadData(code))
-        kernel = hip_check(hip.hipModuleGetFunction(module, b"HLLKernel"))
+        #Load the code as a module
+        self.module = hip_check(hip.hipModuleLoadData(code))
+
+        #Get the device kernel named "HLLKernel"
+        self.kernel = hip_check(hip.hipModuleGetFunction(self.module, b"HLLKernel"))
 
         #Create data by uploading to device
         self.u0 = Common.ArakawaA2D(self.stream, 
@@ -133,71 +181,87 @@ class HLL (Simulator.BaseSimulator):
                         1, 1, 
                         [None, None, None])
         #self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
-        data_h = np.empty(self.grid_size, dtype=np.float32)
-        num_bytes = data_h.size * data_h.itemsize
-        self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure(
-                    typestr="float32",shape=self.grid_size)
 
         dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
         dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
         dt = min(dt_x, dt_y)
-        self.cfl_data.fill(dt, stream=self.stream)
-
+        #in HIP, the "DeviceArray" object doesn't have a 'fill' attribute,
+        #so the CFL buffer is seeded on the host and copied to the device instead.
+        #self.cfl_data.fill(dt, stream=self.stream)
+        grid_dim_x, grid_dim_y, grid_dim_z = self.grid_size
+
+        data_h = np.zeros((grid_dim_x, grid_dim_y), dtype=np.float32)
+        num_bytes = data_h.size * data_h.itemsize
+        data_h.fill(dt)
+
+        self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure(
+                    typestr="float32",shape=(grid_dim_x, grid_dim_y))
+
+        hip_check(hip.hipMemcpyAsync(self.cfl_data,data_h,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice,self.stream))
+        #hip.hipMemsetAsync(self.cfl_data,0,num_bytes,self.stream) would instead
+        #zero the buffer asynchronously.
 
     def substep(self, dt, step_number):
-#        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, 
-#                self.nx, self.ny, 
-#                self.dx, self.dy, dt, 
-#                self.g, 
-#                self.boundary_conditions, 
-#                self.u0[0].data.gpudata, self.u0[0].data.strides[0], 
-#                self.u0[1].data.gpudata, self.u0[1].data.strides[0], 
-#                self.u0[2].data.gpudata, 
self.u0[2].data.strides[0], -# self.u1[0].data.gpudata, self.u1[0].data.strides[0], -# self.u1[1].data.gpudata, self.u1[1].data.strides[0], -# self.u1[2].data.gpudata, self.u1[2].data.strides[0], -# self.cfl_data.gpudata) + #Cuda + """ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, + self.nx, self.ny, + self.dx, self.dy, dt, + self.g, + self.boundary_conditions, + self.u0[0].data.gpudata, self.u0[0].data.strides[0], + self.u0[1].data.gpudata, self.u0[1].data.strides[0], + self.u0[2].data.gpudata, self.u0[2].data.strides[0], + self.u1[0].data.gpudata, self.u1[0].data.strides[0], + self.u1[1].data.gpudata, self.u1[1].data.strides[0], + self.u1[2].data.gpudata, self.u1[2].data.strides[0], + self.cfl_data.gpudata) + self.u0, self.u1 = self.u1, self.u0 + """ + u00_strides0 = self.u0[0].data.shape[0]*np.float32().itemsize + u01_strides0 = self.u0[1].data.shape[0]*np.float32().itemsize + u02_strides0 = self.u0[2].data.shape[0]*np.float32().itemsize + + u10_strides0 = self.u1[0].data.shape[0]*np.float32().itemsize + u11_strides0 = self.u1[1].data.shape[0]*np.float32().itemsize + u12_strides0 = self.u1[2].data.shape[0]*np.float32().itemsize + #launch kernel hip_check( hip.hipModuleLaunchKernel( - kernel, - *self.grid_size, - *self.block_size, - sharedMemBytes=0, + self.kernel, + *self.grid_size, #grid + *self.block_size, #block + sharedMemBytes=0, #65536, stream=self.stream, kernelParams=None, extra=( # pass kernel's arguments ctypes.c_int(self.nx), ctypes.c_int(self.ny), - ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(self.dt), + ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt), ctypes.c_float(self.g), ctypes.c_int(self.boundary_conditions), - ctypes.c_float(self.u0[0].data), ctypes.c_float(self.u0[0].data.strides[0]), - ctypes.c_float(self.u0[1].data), ctypes.c_float(self.u0[1].data.strides[0]), - ctypes.c_float(self.u0[2].data), ctypes.c_float(self.u0[2].data.strides[0]), - ctypes.c_float(self.u1[0].data), ctypes.c_float(self.u1[0].data.strides[0]), - ctypes.c_float(self.u1[1].data), ctypes.c_float(self.u1[1].data.strides[0]), - ctypes.c_float(self.u1[2].data), ctypes.c_float(self.u1[2].data.strides[0]), - self.cfl_data - ) + self.u0[0].data, ctypes.c_int(u00_strides0), + self.u0[1].data, ctypes.c_int(u01_strides0), + self.u0[2].data, ctypes.c_int(u02_strides0), + self.u1[0].data, ctypes.c_int(u10_strides0), + self.u1[1].data, ctypes.c_int(u11_strides0), + self.u1[2].data, ctypes.c_int(u12_strides0), + self.cfl_data, ) ) + ) - hip_check(hip.hipDeviceSynchronize()) - self.u0, self.u1 = self.u1, self.u0 - - hip_check(hip.hipModuleUnload(module)) - - hip_check(hip.hipFree(cfl_data)) - - print("--Launching Kernel .HLLKernel. is ok") + + #print("--Launching Kernel .HLLKernel. 
is ok") def getOutput(self): return self.u0 - + def check(self): self.u0.check() self.u1.check() - + # computing min with hipblas: the output is an index def min_hipblas(self, num_elements, cfl_data, stream): num_bytes = num_elements * np.dtype(np.float32).itemsize @@ -232,4 +296,4 @@ class HLL (Simulator.BaseSimulator): def computeDt(self): #max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get(); max_dt = self.min_hipblas(self.cfl_data.size, self.cfl_data, self.stream) - return max_dt*0.5 + return max_dt*0.5 diff --git a/GPUSimulators/HLL2.py b/GPUSimulators/HLL2.py index b5c0dc0..9f83417 100644 --- a/GPUSimulators/HLL2.py +++ b/GPUSimulators/HLL2.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- """ -This python module implements the 2nd order HLL flux +This python module implements the FORCE flux +for the shallow water equations Copyright (C) 2016 SINTEF ICT @@ -27,15 +28,24 @@ import ctypes #from pycuda import gpuarray from hip import hip,hiprtc - - - - - +from hip import hipblas +def hip_check(call_result): + err = call_result[0] + result = call_result[1:] + if len(result) == 1: + result = result[0] + if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess: + raise RuntimeError(str(err)) + elif ( + isinstance(err, hiprtc.hiprtcResult) + and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS + ): + raise RuntimeError(str(err)) + return result """ -Class that solves the SW equations using the Forward-Backward linear scheme +Class that solves the SW equations """ class HLL2 (Simulator.BaseSimulator): @@ -51,27 +61,14 @@ class HLL2 (Simulator.BaseSimulator): dt: Size of each timestep (90 s) g: Gravitational accelleration (9.81 m/s^2) """ - def hip_check(call_result): - err = call_result[0] - result = call_result[1:] - if len(result) == 1: - result = result[0] - if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess: - raise RuntimeError(str(err)) - elif ( - isinstance(err, hiprtc.hiprtcResult) - and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS - ): - raise RuntimeError(str(err)) - return result - + def __init__(self, context, h0, hu0, hv0, nx, ny, dx, dy, g, - theta=1.8, + theta=1.8, cfl_scale=0.9, boundary_conditions=BoundaryCondition(), block_width=16, block_height=16): @@ -83,29 +80,63 @@ class HLL2 (Simulator.BaseSimulator): boundary_conditions, cfl_scale, 2, - block_width, block_height); + block_width, block_height) self.g = np.float32(g) self.theta = np.float32(theta) - - #Get kernels -# module = context.get_module("cuda/SWE2D_HLL2.cu", -# defines={ -# 'BLOCK_WIDTH': self.block_size[0], -# 'BLOCK_HEIGHT': self.block_size[1] -# }, -# compile_args={ -# 'no_extern_c': True, -# 'options': ["--use_fast_math"], -# }, -# jit_compile_args={}) -# self.kernel = module.get_function("HLL2Kernel") -# self.kernel.prepare("iifffffiiPiPiPiPiPiPiP") - - kernel_file_path = os.path.abspath(os.path.join('cuda', 'SWE2D_HLL2.cu.hip')) + + #Get cuda kernels + """ + module = context.get_module("cuda/SWE2D_HLL2.cu", + defines={ + 'BLOCK_WIDTH': self.block_size[0], + 'BLOCK_HEIGHT': self.block_size[1] + }, + compile_args={ + 'no_extern_c': True, + 'options': ["--use_fast_math"], + }, + jit_compile_args={}) + self.kernel = module.get_function("HLL2Kernel") + self.kernel.prepare("iiffffiPiPiPiPiPiPiP") + """ + + current_dir = os.path.dirname(os.path.abspath(__file__)) + # Specify the relative path to the "cuda" directory + cuda_dir = os.path.join(current_dir, 'cuda') + + #kernel source + kernel_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWE2D_HLL2.cu.hip')) with 
open(kernel_file_path, 'r') as file: kernel_source = file.read() - prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"HLL2Kernel", 0, [], [])) + #headers + #common.h + header_file_path = os.path.abspath(os.path.join(cuda_dir, 'common.h')) + with open(header_file_path, 'r') as file: + header_common = file.read() + + #SWECommon.h + header_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWECommon.h')) + with open(header_file_path, 'r') as file: + header_EulerCommon = file.read() + + #limiters.h + header_file_path = os.path.abspath(os.path.join(cuda_dir, 'limiters.h')) + with open(header_file_path, 'r') as file: + header_limiters = file.read() + + #hip.hiprtc.hiprtcCreateProgram(const char *src, const char *name, int numHeaders, headers, includeNames) + prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"HLL2Kernel", 3, [header_common.encode(),header_EulerCommon.encode(),header_limiters.encode()], [b"common.h",b"SWECommon.h",b"limiters.h"])) + + # Check if the program is created successfully + if prog is not None: + print("--This is ") + print("--HIPRTC program created successfully") + print() + else: + print("--Failed to create HIPRTC program") + print("--I stop:", err) + exit() props = hip.hipDeviceProp_t() hip_check(hip.hipGetDeviceProperties(props,0)) @@ -113,19 +144,38 @@ class HLL2 (Simulator.BaseSimulator): print(f"Compiling kernel .HLL2Kernel. for {arch}") - cflags = [b"--offload-arch="+arch] + cflags = [b"--offload-arch="+arch, b"-O2", b"-D BLOCK_WIDTH="+ str(self.block_size[0]).encode(), b"-D BLOCK_HEIGHT=" + str(self.block_size[1]).encode()] + err, = hiprtc.hiprtcCompileProgram(prog, len(cflags), cflags) + # Check if the program is compiled successfully + if err is not None: + print("--Compilation:", err) + print("--The program is compiled successfully") + else: + print("--Compilation:", err) + print("--Failed to compile the program") + print("--I stop:", err) + if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS: log_size = hip_check(hiprtc.hiprtcGetProgramLogSize(prog)) log = bytearray(log_size) hip_check(hiprtc.hiprtcGetProgramLog(prog, log)) raise RuntimeError(log.decode()) + code_size = hip_check(hiprtc.hiprtcGetCodeSize(prog)) code = bytearray(code_size) hip_check(hiprtc.hiprtcGetCode(prog, code)) - module = hip_check(hip.hipModuleLoadData(code)) - kernel = hip_check(hip.hipModuleGetFunction(module, b"HLL2Kernel")) + #Load the code as a module + self.module = hip_check(hip.hipModuleLoadData(code)) + + #Get the device kernel named named "FORCEKernel" + self.kernel = hip_check(hip.hipModuleGetFunction(self.module, b"HLL2Kernel")) + + print() + print("--Get the device kernel *HLL2Kernel* is created successfully--") + print("--kernel", self.kernel) + print() #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, @@ -137,70 +187,87 @@ class HLL2 (Simulator.BaseSimulator): 2, 2, [None, None, None]) #self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) - data_h = np.empty(self.grid_size, dtype=np.float32) - num_bytes = data_h.size * data_h.itemsize - self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure( - typestr="float32",shape=self.grid_size) dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) dt = min(dt_x, dt_y) - self.cfl_data.fill(dt, stream=self.stream) - + #in HIP, the "DeviceArray" object doesn't have a 'fill' attribute + #self.cfl_data.fill(self.dt, stream=self.stream) + grid_dim_x, grid_dim_y, grid_dim_z = self.grid_size + + data_h 
= np.zeros((grid_dim_x, grid_dim_y), dtype=np.float32) + num_bytes = data_h.size * data_h.itemsize + data_h.fill(self.dt) + + self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure( + typestr="float32",shape=(grid_dim_x, grid_dim_y)) + + hip_check(hip.hipMemcpyAsync(self.cfl_data,data_h,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice,self.stream)) + #sets the memory region pointed to by x_d to zero asynchronously + #initiates the memset operation asynchronously + #hip_check(hip.hipMemsetAsync(self.cfl_data,0,num_bytes,self.stream)) + def substep(self, dt, step_number): self.substepDimsplit(dt*0.5, step_number) - - def substepDimsplit(self, dt, substep): -# self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, -# self.nx, self.ny, -# self.dx, self.dy, dt, -# self.g, -# self.theta, -# substep, -# self.boundary_conditions, -# self.u0[0].data.gpudata, self.u0[0].data.strides[0], -# self.u0[1].data.gpudata, self.u0[1].data.strides[0], -# self.u0[2].data.gpudata, self.u0[2].data.strides[0], -# self.u1[0].data.gpudata, self.u1[0].data.strides[0], -# self.u1[1].data.gpudata, self.u1[1].data.strides[0], -# self.u1[2].data.gpudata, self.u1[2].data.strides[0], -# self.cfl_data.gpudata) + + def substepDimsplit(self, dt, substep): + #Cuda + """ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, + self.nx, self.ny, + self.dx, self.dy, dt, + self.g, + self.theta, + substep, + self.boundary_conditions, + self.u0[0].data.gpudata, self.u0[0].data.strides[0], + self.u0[1].data.gpudata, self.u0[1].data.strides[0], + self.u0[2].data.gpudata, self.u0[2].data.strides[0], + self.u1[0].data.gpudata, self.u1[0].data.strides[0], + self.u1[1].data.gpudata, self.u1[1].data.strides[0], + self.u1[2].data.gpudata, self.u1[2].data.strides[0], + self.cfl_data.gpudata) + self.u0, self.u1 = self.u1, self.u0 + """ + + u00_strides0 = self.u0[0].data.shape[0]*np.float32().itemsize + u01_strides0 = self.u0[1].data.shape[0]*np.float32().itemsize + u02_strides0 = self.u0[2].data.shape[0]*np.float32().itemsize + + u10_strides0 = self.u1[0].data.shape[0]*np.float32().itemsize + u11_strides0 = self.u1[1].data.shape[0]*np.float32().itemsize + u12_strides0 = self.u1[2].data.shape[0]*np.float32().itemsize #launch kernel hip_check( hip.hipModuleLaunchKernel( - kernel, - *self.grid_size, - *self.block_size, - sharedMemBytes=0, + self.kernel, + *self.grid_size, #grid + *self.block_size, #block + sharedMemBytes=0, #65536, stream=self.stream, kernelParams=None, extra=( # pass kernel's arguments ctypes.c_int(self.nx), ctypes.c_int(self.ny), - ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(self.dt), + ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt), ctypes.c_float(self.g), ctypes.c_float(self.theta), - ctypes.c_int(substep), + ctypes.c_int(substep), ctypes.c_int(self.boundary_conditions), - ctypes.c_float(self.u0[0].data), ctypes.c_float(self.u0[0].data.strides[0]), - ctypes.c_float(self.u0[1].data), ctypes.c_float(self.u0[1].data.strides[0]), - ctypes.c_float(self.u0[2].data), ctypes.c_float(self.u0[2].data.strides[0]), - ctypes.c_float(self.u1[0].data), ctypes.c_float(self.u1[0].data.strides[0]), - ctypes.c_float(self.u1[1].data), ctypes.c_float(self.u1[1].data.strides[0]), - ctypes.c_float(self.u1[2].data), ctypes.c_float(self.u1[2].data.strides[0]), - self.cfl_data - ) + self.u0[0].data, ctypes.c_int(u00_strides0), + self.u0[1].data, ctypes.c_int(u01_strides0), + self.u0[2].data, ctypes.c_int(u02_strides0), + self.u1[0].data, 
ctypes.c_int(u10_strides0), + self.u1[1].data, ctypes.c_int(u11_strides0), + self.u1[2].data, ctypes.c_int(u12_strides0), + self.cfl_data, ) ) + ) - hip_check(hip.hipDeviceSynchronize()) self.u0, self.u1 = self.u1, self.u0 - - hip_check(hip.hipModuleUnload(module)) - - hip_check(hip.hipFree(cfl_data)) - - print("--Launching Kernel .HLL2Kernel. is ok") + + #print("--Launching Kernel .HLL2Kernel. is ok") def getOutput(self): return self.u0 @@ -208,7 +275,7 @@ class HLL2 (Simulator.BaseSimulator): def check(self): self.u0.check() self.u1.check() - + # computing min with hipblas: the output is an index def min_hipblas(self, num_elements, cfl_data, stream): num_bytes = num_elements * np.dtype(np.float32).itemsize @@ -244,4 +311,3 @@ class HLL2 (Simulator.BaseSimulator): #max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get(); max_dt = self.min_hipblas(self.cfl_data.size, self.cfl_data, self.stream) return max_dt*0.5 - diff --git a/GPUSimulators/IPythonMagic.py b/GPUSimulators/IPythonMagic.py index 92baeb8..ae10092 100644 --- a/GPUSimulators/IPythonMagic.py +++ b/GPUSimulators/IPythonMagic.py @@ -29,6 +29,19 @@ from hip import hip, hiprtc from GPUSimulators import Common, CudaContext +def hip_check(call_result): + err = call_result[0] + result = call_result[1:] + if len(result) == 1: + result = result[0] + if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess: + raise RuntimeError(str(err)) + elif ( + isinstance(err, hiprtc.hiprtcResult) + and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS + ): + raise RuntimeError(str(err)) + return result @magics_class class MagicCudaContext(Magics): @@ -42,19 +55,6 @@ class MagicCudaContext(Magics): '--no_cache', '-nc', action="store_true", help='Disable caching of kernels') @magic_arguments.argument( '--no_autotuning', '-na', action="store_true", help='Disable autotuning of kernels') - def hip_check(call_result): - err = call_result[0] - result = call_result[1:] - if len(result) == 1: - result = result[0] - if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess: - raise RuntimeError(str(err)) - elif ( - isinstance(err, hiprtc.hiprtcResult) - and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS - ): - raise RuntimeError(str(err)) - return result def cuda_context_handler(self, line): args = magic_arguments.parse_argstring(self.cuda_context_handler, line) diff --git a/GPUSimulators/KP07.py b/GPUSimulators/KP07.py index 93ce5e9..2ce9d4e 100644 --- a/GPUSimulators/KP07.py +++ b/GPUSimulators/KP07.py @@ -1,12 +1,8 @@ # -*- coding: utf-8 -*- """ -This python module implements the Kurganov-Petrova numerical scheme -for the shallow water equations, described in -A. Kurganov & Guergana Petrova -A Second-Order Well-Balanced Positivity Preserving Central-Upwind -Scheme for the Saint-Venant System Communications in Mathematical -Sciences, 5 (2007), 133-160. 
+This python module implements the Kurganov-Petrova numerical scheme
+for the shallow water equations
 
 Copyright (C) 2016 SINTEF ICT
@@ -32,8 +28,21 @@ import ctypes
 #from pycuda import gpuarray
 from hip import hip,hiprtc
+from hip import hipblas
 
-
+def hip_check(call_result):
+    err = call_result[0]
+    result = call_result[1:]
+    if len(result) == 1:
+        result = result[0]
+    if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
+        raise RuntimeError(str(err))
+    elif (
+        isinstance(err, hiprtc.hiprtcResult)
+        and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
+    ):
+        raise RuntimeError(str(err))
+    return result
 
 """
 Class that solves the SW equations using the Forward-Backward linear scheme
@@ -52,19 +61,6 @@ class KP07 (Simulator.BaseSimulator):
     dt: Size of each timestep (90 s)
     g: Gravitational accelleration (9.81 m/s^2)
     """
-    def hip_check(call_result):
-        err = call_result[0]
-        result = call_result[1:]
-        if len(result) == 1:
-            result = result[0]
-        if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
-            raise RuntimeError(str(err))
-        elif (
-            isinstance(err, hiprtc.hiprtcResult)
-            and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
-        ):
-            raise RuntimeError(str(err))
-        return result
 
     def __init__(self, 
                  context, 
@@ -72,7 +68,7 @@ class KP07 (Simulator.BaseSimulator):
                  g, 
-                 theta=1.3,
+                 theta=1.3, 
                  cfl_scale=0.9, 
                  order=2, 
                  boundary_conditions=BoundaryCondition(), 
@@ -84,31 +80,65 @@ class KP07 (Simulator.BaseSimulator):
                  dx, dy, 
                  boundary_conditions,
                  cfl_scale, 
-                 order,
-                 block_width, block_height);
-        self.g = np.float32(g)
-        self.theta = np.float32(theta)
+                 order, 
+                 block_width, block_height)
+        self.g = np.float32(g) 
+        self.theta = np.float32(theta) 
         self.order = np.int32(order)
 
-        #Get kernels
-#        module = context.get_module("cuda/SWE2D_KP07.cu",
-#                        defines={
-#                            'BLOCK_WIDTH': self.block_size[0], 
-#                            'BLOCK_HEIGHT': self.block_size[1]
-#                        }, 
-#                        compile_args={
-#                            'no_extern_c': True,
-#                            'options': ["--use_fast_math"], 
-#                        }, 
-#                        jit_compile_args={})
-#        self.kernel = module.get_function("KP07Kernel")
-#        self.kernel.prepare("iifffffiiPiPiPiPiPiPiP")
-
-        kernel_file_path = os.path.abspath(os.path.join('cuda', 'SWE2D_KP07.cu.hip'))
+        #Get cuda kernels
+        """
+        module = context.get_module("cuda/SWE2D_KP07.cu",
+                        defines={
+                            'BLOCK_WIDTH': self.block_size[0], 
+                            'BLOCK_HEIGHT': self.block_size[1]
+                        }, 
+                        compile_args={
+                            'no_extern_c': True,
+                            'options': ["--use_fast_math"], 
+                        }, 
+                        jit_compile_args={})
+        self.kernel = module.get_function("KP07Kernel")
+        self.kernel.prepare("iifffffiiPiPiPiPiPiPiP")
+        """
+
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        # Specify the relative path to the "cuda" directory
+        cuda_dir = os.path.join(current_dir, 'cuda')
+
+        #kernel source
+        kernel_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWE2D_KP07.cu.hip'))
         with open(kernel_file_path, 'r') as file:
             kernel_source = file.read()
 
-        prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"KP07Kernel", 0, [], []))
+        #headers
+        #common.h
+        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'common.h'))
+        with open(header_file_path, 'r') as file:
+            header_common = file.read()
+
+        #SWECommon.h
+        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWECommon.h'))
+        with open(header_file_path, 'r') as file:
+            header_EulerCommon = file.read()
+
+        #limiters.h
+        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'limiters.h'))
+        with open(header_file_path, 'r') as file:
+            header_limiters = file.read()
+
+        #hip.hiprtc.hiprtcCreateProgram(const char *src, const char *name, int 
numHeaders, headers, includeNames) + prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"KP07Kernel", 3, [header_common.encode(),header_EulerCommon.encode(),header_limiters.encode()], [b"common.h",b"SWECommon.h",b"limiters.h"])) + + # Check if the program is created successfully + if prog is not None: + print("--This is ") + print("--HIPRTC program created successfully") + print() + else: + print("--Failed to create HIPRTC program") + print("--I stop:", err) + exit() props = hip.hipDeviceProp_t() hip_check(hip.hipGetDeviceProperties(props,0)) @@ -116,19 +146,38 @@ class KP07 (Simulator.BaseSimulator): print(f"Compiling kernel .KP07Kernel. for {arch}") - cflags = [b"--offload-arch="+arch] + cflags = [b"--offload-arch="+arch, b"-O2", b"-D BLOCK_WIDTH="+ str(self.block_size[0]).encode(), b"-D BLOCK_HEIGHT=" + str(self.block_size[1]).encode()] + err, = hiprtc.hiprtcCompileProgram(prog, len(cflags), cflags) + # Check if the program is compiled successfully + if err is not None: + print("--Compilation:", err) + print("--The program is compiled successfully") + else: + print("--Compilation:", err) + print("--Failed to compile the program") + print("--I stop:", err) + if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS: log_size = hip_check(hiprtc.hiprtcGetProgramLogSize(prog)) log = bytearray(log_size) hip_check(hiprtc.hiprtcGetProgramLog(prog, log)) raise RuntimeError(log.decode()) + code_size = hip_check(hiprtc.hiprtcGetCodeSize(prog)) code = bytearray(code_size) hip_check(hiprtc.hiprtcGetCode(prog, code)) - module = hip_check(hip.hipModuleLoadData(code)) - kernel = hip_check(hip.hipModuleGetFunction(module, b"KP07Kernel")) + #Load the code as a module + self.module = hip_check(hip.hipModuleLoadData(code)) + + #Get the device kernel named named "FORCEKernel" + self.kernel = hip_check(hip.hipModuleGetFunction(self.module, b"KP07Kernel")) + + print() + print("--Get the device kernel *KP07Kernel* is created successfully--") + print("--kernel", self.kernel) + print() #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, @@ -140,73 +189,87 @@ class KP07 (Simulator.BaseSimulator): 2, 2, [None, None, None]) #self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) - data_h = np.empty(self.grid_size, dtype=np.float32) - num_bytes = data_h.size * data_h.itemsize - self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure( - typestr="float32",shape=self.grid_size) dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) dt = min(dt_x, dt_y) - self.cfl_data.fill(dt, stream=self.stream) - - - def substep(self, dt, step_number): - self.substepRK(dt, step_number) + #in HIP, the "DeviceArray" object doesn't have a 'fill' attribute + #self.cfl_data.fill(self.dt, stream=self.stream) + grid_dim_x, grid_dim_y, grid_dim_z = self.grid_size + + data_h = np.zeros((grid_dim_x, grid_dim_y), dtype=np.float32) + num_bytes = data_h.size * data_h.itemsize + data_h.fill(self.dt) + + self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure( + typestr="float32",shape=(grid_dim_x, grid_dim_y)) + + hip_check(hip.hipMemcpyAsync(self.cfl_data,data_h,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice,self.stream)) + #sets the memory region pointed to by x_d to zero asynchronously + #initiates the memset operation asynchronously + #hip_check(hip.hipMemsetAsync(self.cfl_data,0,num_bytes,self.stream)) + + def substep(self, dt, step_number): + self.substepRK(dt, step_number) - def substepRK(self, dt, substep): -# 
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, -# self.nx, self.ny, -# self.dx, self.dy, dt, -# self.g, -# self.theta, -# Simulator.stepOrderToCodedInt(step=substep, order=self.order), -# self.boundary_conditions, -# self.u0[0].data.gpudata, self.u0[0].data.strides[0], -# self.u0[1].data.gpudata, self.u0[1].data.strides[0], -# self.u0[2].data.gpudata, self.u0[2].data.strides[0], -# self.u1[0].data.gpudata, self.u1[0].data.strides[0], -# self.u1[1].data.gpudata, self.u1[1].data.strides[0], -# self.u1[2].data.gpudata, self.u1[2].data.strides[0], -# self.cfl_data.gpudata) + #Cuda + """ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, + self.nx, self.ny, + self.dx, self.dy, dt, + self.g, + self.theta, + Simulator.stepOrderToCodedInt(step=substep, order=self.order), + self.boundary_conditions, + self.u0[0].data.gpudata, self.u0[0].data.strides[0], + self.u0[1].data.gpudata, self.u0[1].data.strides[0], + self.u0[2].data.gpudata, self.u0[2].data.strides[0], + self.u1[0].data.gpudata, self.u1[0].data.strides[0], + self.u1[1].data.gpudata, self.u1[1].data.strides[0], + self.u1[2].data.gpudata, self.u1[2].data.strides[0], + self.cfl_data.gpudata) + self.u0, self.u1 = self.u1, self.u0 + """ + + u00_strides0 = self.u0[0].data.shape[0]*np.float32().itemsize + u01_strides0 = self.u0[1].data.shape[0]*np.float32().itemsize + u02_strides0 = self.u0[2].data.shape[0]*np.float32().itemsize + + u10_strides0 = self.u1[0].data.shape[0]*np.float32().itemsize + u11_strides0 = self.u1[1].data.shape[0]*np.float32().itemsize + u12_strides0 = self.u1[2].data.shape[0]*np.float32().itemsize #launch kernel hip_check( hip.hipModuleLaunchKernel( - kernel, - *self.grid_size, - *self.block_size, - sharedMemBytes=0, + self.kernel, + *self.grid_size, #grid + *self.block_size, #block + sharedMemBytes=0, #65536, stream=self.stream, kernelParams=None, extra=( # pass kernel's arguments ctypes.c_int(self.nx), ctypes.c_int(self.ny), - ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(self.dt), + ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt), ctypes.c_float(self.g), ctypes.c_float(self.theta), - Simulator.stepOrderToCodedInt(step=substep, order=self.order), + Simulator.stepOrderToCodedInt(step=substep, order=self.order), ctypes.c_int(self.boundary_conditions), - ctypes.c_float(self.u0[0].data), ctypes.c_float(self.u0[0].data.strides[0]), - ctypes.c_float(self.u0[1].data), ctypes.c_float(self.u0[1].data.strides[0]), - ctypes.c_float(self.u0[2].data), ctypes.c_float(self.u0[2].data.strides[0]), - ctypes.c_float(self.u1[0].data), ctypes.c_float(self.u1[0].data.strides[0]), - ctypes.c_float(self.u1[1].data), ctypes.c_float(self.u1[1].data.strides[0]), - ctypes.c_float(self.u1[2].data), ctypes.c_float(self.u1[2].data.strides[0]), - self.cfl_data - ) + self.u0[0].data, ctypes.c_int(u00_strides0), + self.u0[1].data, ctypes.c_int(u01_strides0), + self.u0[2].data, ctypes.c_int(u02_strides0), + self.u1[0].data, ctypes.c_int(u10_strides0), + self.u1[1].data, ctypes.c_int(u11_strides0), + self.u1[2].data, ctypes.c_int(u12_strides0), + self.cfl_data, ) ) - - hip_check(hip.hipDeviceSynchronize()) + ) self.u0, self.u1 = self.u1, self.u0 - - hip_check(hip.hipModuleUnload(module)) - - hip_check(hip.hipFree(cfl_data)) - - print("--Launching Kernel .KP07Kernel. is ok") + + #print("--Launching Kernel .KP07Kernel. 
is ok") def getOutput(self): return self.u0 @@ -214,7 +277,7 @@ class KP07 (Simulator.BaseSimulator): def check(self): self.u0.check() self.u1.check() - + # computing min with hipblas: the output is an index def min_hipblas(self, num_elements, cfl_data, stream): num_bytes = num_elements * np.dtype(np.float32).itemsize @@ -247,6 +310,6 @@ class KP07 (Simulator.BaseSimulator): return min_value def computeDt(self): - max_dt = self.min_hipblas(self.cfl_data.size, self.cfl_data, self.stream) #max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get(); + max_dt = self.min_hipblas(self.cfl_data.size, self.cfl_data, self.stream) return max_dt*0.5**(self.order-1) diff --git a/GPUSimulators/KP07_dimsplit.py b/GPUSimulators/KP07_dimsplit.py index 0a5cfc7..79dc0aa 100644 --- a/GPUSimulators/KP07_dimsplit.py +++ b/GPUSimulators/KP07_dimsplit.py @@ -1,12 +1,8 @@ # -*- coding: utf-8 -*- """ -This python module implements the Kurganov-Petrova numerical scheme -for the shallow water equations, described in -A. Kurganov & Guergana Petrova -A Second-Order Well-Balanced Positivity Preserving Central-Upwind -Scheme for the Saint-Venant System Communications in Mathematical -Sciences, 5 (2007), 133-160. +This python module implements the FORCE flux +for the shallow water equations Copyright (C) 2016 SINTEF ICT @@ -32,14 +28,26 @@ import ctypes #from pycuda import gpuarray from hip import hip,hiprtc +from hip import hipblas - - +def hip_check(call_result): + err = call_result[0] + result = call_result[1:] + if len(result) == 1: + result = result[0] + if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess: + raise RuntimeError(str(err)) + elif ( + isinstance(err, hiprtc.hiprtcResult) + and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS + ): + raise RuntimeError(str(err)) + return result """ Class that solves the SW equations using the dimentionally split KP07 scheme """ -class KP07_dimsplit(Simulator.BaseSimulator): +class KP07_dimsplit (Simulator.BaseSimulator): """ Initialization routine @@ -54,27 +62,13 @@ class KP07_dimsplit(Simulator.BaseSimulator): g: Gravitational accelleration (9.81 m/s^2) """ - def hip_check(call_result): - err = call_result[0] - result = call_result[1:] - if len(result) == 1: - result = result[0] - if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess: - raise RuntimeError(str(err)) - elif ( - isinstance(err, hiprtc.hiprtcResult) - and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS - ): - raise RuntimeError(str(err)) - return result - def __init__(self, context, h0, hu0, hv0, nx, ny, dx, dy, g, - theta=1.3, + theta=1.3, cfl_scale=0.9, boundary_conditions=BoundaryCondition(), block_width=16, block_height=16): @@ -85,32 +79,66 @@ class KP07_dimsplit(Simulator.BaseSimulator): dx, dy, boundary_conditions, cfl_scale, - 2, + 2, block_width, block_height) self.gc_x = 2 self.gc_y = 2 - self.g = np.float32(g) + self.g = np.float32(g) self.theta = np.float32(theta) - #Get kernels -# module = context.get_module("cuda/SWE2D_KP07_dimsplit.cu", -# defines={ -# 'BLOCK_WIDTH': self.block_size[0], -# 'BLOCK_HEIGHT': self.block_size[1] -# }, -# compile_args={ -# 'no_extern_c': True, -# 'options': ["--use_fast_math"], -# }, -# jit_compile_args={}) -# self.kernel = module.get_function("KP07DimsplitKernel") -# self.kernel.prepare("iifffffiiPiPiPiPiPiPiP") - - kernel_file_path = os.path.abspath(os.path.join('cuda', 'SWE2D_KP07_dimsplit.cu.hip')) + #Get cuda kernels + """ + module = context.get_module("cuda/SWE2D_KP07_dimsplit.cu", + defines={ + 'BLOCK_WIDTH': self.block_size[0], 
+ 'BLOCK_HEIGHT': self.block_size[1] + }, + compile_args={ + 'no_extern_c': True, + 'options': ["--use_fast_math"], + }, + jit_compile_args={}) + self.kernel = module.get_function("KP07DimsplitKernel") + self.kernel.prepare("iiffffiPiPiPiPiPiPiP") + """ + + current_dir = os.path.dirname(os.path.abspath(__file__)) + # Specify the relative path to the "cuda" directory + cuda_dir = os.path.join(current_dir, 'cuda') + + #kernel source + kernel_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWE2D_KP07_dimsplit.cu.hip')) with open(kernel_file_path, 'r') as file: kernel_source = file.read() - prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"KP07DimsplitKernel", 0, [], [])) + #headers + #common.h + header_file_path = os.path.abspath(os.path.join(cuda_dir, 'common.h')) + with open(header_file_path, 'r') as file: + header_common = file.read() + + #SWECommon.h + header_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWECommon.h')) + with open(header_file_path, 'r') as file: + header_EulerCommon = file.read() + + #limiters.h + header_file_path = os.path.abspath(os.path.join(cuda_dir, 'limiters.h')) + with open(header_file_path, 'r') as file: + header_limiters = file.read() + + #hip.hiprtc.hiprtcCreateProgram(const char *src, const char *name, int numHeaders, headers, includeNames) + prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"KP07DimsplitKernel", 3, [header_common.encode(),header_EulerCommon.encode(),header_limiters.encode()], [b"common.h",b"SWECommon.h",b"limiters.h"])) + + # Check if the program is created successfully + if prog is not None: + print("--This is ") + print("--HIPRTC program created successfully") + print() + else: + print("--Failed to create HIPRTC program") + print("--I stop:", err) + exit() props = hip.hipDeviceProp_t() hip_check(hip.hipGetDeviceProperties(props,0)) @@ -118,19 +146,38 @@ class KP07_dimsplit(Simulator.BaseSimulator): print(f"Compiling kernel .KP07DimsplitKernel. 
for {arch}") - cflags = [b"--offload-arch="+arch] + cflags = [b"--offload-arch="+arch, b"-O2", b"-D BLOCK_WIDTH="+ str(self.block_size[0]).encode(), b"-D BLOCK_HEIGHT=" + str(self.block_size[1]).encode()] + err, = hiprtc.hiprtcCompileProgram(prog, len(cflags), cflags) + # Check if the program is compiled successfully + if err is not None: + print("--Compilation:", err) + print("--The program is compiled successfully") + else: + print("--Compilation:", err) + print("--Failed to compile the program") + print("--I stop:", err) + if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS: log_size = hip_check(hiprtc.hiprtcGetProgramLogSize(prog)) log = bytearray(log_size) hip_check(hiprtc.hiprtcGetProgramLog(prog, log)) raise RuntimeError(log.decode()) + code_size = hip_check(hiprtc.hiprtcGetCodeSize(prog)) code = bytearray(code_size) hip_check(hiprtc.hiprtcGetCode(prog, code)) - module = hip_check(hip.hipModuleLoadData(code)) - kernel = hip_check(hip.hipModuleGetFunction(module, b"KP07DimsplitKernel")) + #Load the code as a module + self.module = hip_check(hip.hipModuleLoadData(code)) + + #Get the device kernel named named "FORCEKernel" + self.kernel = hip_check(hip.hipModuleGetFunction(self.module, b"KP07DimsplitKernel")) + + print() + print("--Get the device kernel *KP07DimsplitKernel* is created successfully--") + print("--kernel", self.kernel) + print() #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, @@ -139,77 +186,94 @@ class KP07_dimsplit(Simulator.BaseSimulator): [h0, hu0, hv0]) self.u1 = Common.ArakawaA2D(self.stream, nx, ny, - self.gc_x, self.gc_y, + self.gc_x, self.gc_y, [None, None, None]) #self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) - data_h = np.empty(self.grid_size, dtype=np.float32) - num_bytes = data_h.size * data_h.itemsize - self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure( - typestr="float32",shape=self.grid_size) dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) dt = min(dt_x, dt_y) - self.cfl_data.fill(dt, stream=self.stream) - + #in HIP, the "DeviceArray" object doesn't have a 'fill' attribute + #self.cfl_data.fill(self.dt, stream=self.stream) + grid_dim_x, grid_dim_y, grid_dim_z = self.grid_size + + data_h = np.zeros((grid_dim_x, grid_dim_y), dtype=np.float32) + num_bytes = data_h.size * data_h.itemsize + data_h.fill(self.dt) + + self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure( + typestr="float32",shape=(grid_dim_x, grid_dim_y)) + + hip_check(hip.hipMemcpyAsync(self.cfl_data,data_h,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice,self.stream)) + #sets the memory region pointed to by x_d to zero asynchronously + #initiates the memset operation asynchronously + #hip_check(hip.hipMemsetAsync(self.cfl_data,0,num_bytes,self.stream)) + def substep(self, dt, step_number): self.substepDimsplit(dt*0.5, step_number) - + def substepDimsplit(self, dt, substep): -# self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, -# self.nx, self.ny, -# self.dx, self.dy, dt, -# self.g, -# self.theta, -# substep, -# self.boundary_conditions, -# self.u0[0].data.gpudata, self.u0[0].data.strides[0], -# self.u0[1].data.gpudata, self.u0[1].data.strides[0], -# self.u0[2].data.gpudata, self.u0[2].data.strides[0], -# self.u1[0].data.gpudata, self.u1[0].data.strides[0], -# self.u1[1].data.gpudata, self.u1[1].data.strides[0], -# self.u1[2].data.gpudata, self.u1[2].data.strides[0], -# self.cfl_data.gpudata) + #Cuda + """ + 
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, + self.nx, self.ny, + self.dx, self.dy, dt, + self.g, + self.theta, + substep, + self.boundary_conditions, + self.u0[0].data.gpudata, self.u0[0].data.strides[0], + self.u0[1].data.gpudata, self.u0[1].data.strides[0], + self.u0[2].data.gpudata, self.u0[2].data.strides[0], + self.u1[0].data.gpudata, self.u1[0].data.strides[0], + self.u1[1].data.gpudata, self.u1[1].data.strides[0], + self.u1[2].data.gpudata, self.u1[2].data.strides[0], + self.cfl_data.gpudata) + self.u0, self.u1 = self.u1, self.u0 + """ + + u00_strides0 = self.u0[0].data.shape[0]*np.float32().itemsize + u01_strides0 = self.u0[1].data.shape[0]*np.float32().itemsize + u02_strides0 = self.u0[2].data.shape[0]*np.float32().itemsize + + u10_strides0 = self.u1[0].data.shape[0]*np.float32().itemsize + u11_strides0 = self.u1[1].data.shape[0]*np.float32().itemsize + u12_strides0 = self.u1[2].data.shape[0]*np.float32().itemsize #launch kernel hip_check( hip.hipModuleLaunchKernel( - kernel, - *self.grid_size, - *self.block_size, - sharedMemBytes=0, + self.kernel, + *self.grid_size, #grid + *self.block_size, #block + sharedMemBytes=0, #65536, stream=self.stream, kernelParams=None, extra=( # pass kernel's arguments ctypes.c_int(self.nx), ctypes.c_int(self.ny), - ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(self.dt), + ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt), ctypes.c_float(self.g), ctypes.c_float(self.theta), - ctypes.c_int(substep) + ctypes.c_int(substep), ctypes.c_int(self.boundary_conditions), - ctypes.c_float(self.u0[0].data), ctypes.c_float(self.u0[0].data.strides[0]), - ctypes.c_float(self.u0[1].data), ctypes.c_float(self.u0[1].data.strides[0]), - ctypes.c_float(self.u0[2].data), ctypes.c_float(self.u0[2].data.strides[0]), - ctypes.c_float(self.u1[0].data), ctypes.c_float(self.u1[0].data.strides[0]), - ctypes.c_float(self.u1[1].data), ctypes.c_float(self.u1[1].data.strides[0]), - ctypes.c_float(self.u1[2].data), ctypes.c_float(self.u1[2].data.strides[0]), - self.cfl_data - ) + self.u0[0].data, ctypes.c_int(u00_strides0), + self.u0[1].data, ctypes.c_int(u01_strides0), + self.u0[2].data, ctypes.c_int(u02_strides0), + self.u1[0].data, ctypes.c_int(u10_strides0), + self.u1[1].data, ctypes.c_int(u11_strides0), + self.u1[2].data, ctypes.c_int(u12_strides0), + self.cfl_data, ) ) - - hip_check(hip.hipDeviceSynchronize()) + ) self.u0, self.u1 = self.u1, self.u0 - hip_check(hip.hipModuleUnload(module)) - - hip_check(hip.hipFree(cfl_data)) - - print("--Launching Kernel .KP07DimsplitKernel. is ok") + + #print("--Launching Kernel .KP07DimsplitKernel. 
is ok") def getOutput(self): return self.u0 - + def check(self): self.u0.check() self.u1.check() diff --git a/GPUSimulators/LxF.py b/GPUSimulators/LxF.py index 98e54c6..6edc6b6 100644 --- a/GPUSimulators/LxF.py +++ b/GPUSimulators/LxF.py @@ -1,8 +1,8 @@ # -*- coding: utf-8 -*- """ -This python module implements the classical Lax-Friedrichs numerical -scheme for the shallow water equations +This python module implements the FORCE flux +for the shallow water equations Copyright (C) 2016 SINTEF ICT @@ -28,10 +28,21 @@ import ctypes #from pycuda import gpuarray from hip import hip,hiprtc +from hip import hipblas - - - +def hip_check(call_result): + err = call_result[0] + result = call_result[1:] + if len(result) == 1: + result = result[0] + if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess: + raise RuntimeError(str(err)) + elif ( + isinstance(err, hiprtc.hiprtcResult) + and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS + ): + raise RuntimeError(str(err)) + return result """ Class that solves the SW equations using the Lax Friedrichs scheme @@ -51,20 +62,6 @@ class LxF (Simulator.BaseSimulator): g: Gravitational accelleration (9.81 m/s^2) """ - def hip_check(call_result): - err = call_result[0] - result = call_result[1:] - if len(result) == 1: - result = result[0] - if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess: - raise RuntimeError(str(err)) - elif ( - isinstance(err, hiprtc.hiprtcResult) - and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS - ): - raise RuntimeError(str(err)) - return result - def __init__(self, context, h0, hu0, hv0, @@ -72,7 +69,7 @@ class LxF (Simulator.BaseSimulator): dx, dy, g, cfl_scale=0.9, - boundary_conditions=BoundaryCondition(), + boundary_conditions=BoundaryCondition(), block_width=16, block_height=16): # Call super constructor @@ -82,28 +79,58 @@ class LxF (Simulator.BaseSimulator): boundary_conditions, cfl_scale, 1, - block_width, block_height); + block_width, block_height) self.g = np.float32(g) - # Get kernels -# module = context.get_module("cuda/SWE2D_LxF.cu", -# defines={ -# 'BLOCK_WIDTH': self.block_size[0], -# 'BLOCK_HEIGHT': self.block_size[1] -# }, -# compile_args={ -# 'no_extern_c': True, -# 'options': ["--use_fast_math"], -# }, -# jit_compile_args={}) -# self.kernel = module.get_function("LxFKernel") -# self.kernel.prepare("iiffffiPiPiPiPiPiPiP") + #Get cuda kernels + """ + module = context.get_module("cuda/SWE2D_LxF.cu", + defines={ + 'BLOCK_WIDTH': self.block_size[0], + 'BLOCK_HEIGHT': self.block_size[1] + }, + compile_args={ + 'no_extern_c': True, + 'options': ["--use_fast_math"], + }, + jit_compile_args={}) + self.kernel = module.get_function("LxFKernel") + self.kernel.prepare("iiffffiPiPiPiPiPiPiP") + """ - kernel_file_path = os.path.abspath(os.path.join('cuda', 'SWE2D_LxF.cu.hip')) + current_dir = os.path.dirname(os.path.abspath(__file__)) + # Specify the relative path to the "cuda" directory + cuda_dir = os.path.join(current_dir, 'cuda') + + #kernel source + kernel_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWE2D_LxF.cu.hip')) with open(kernel_file_path, 'r') as file: kernel_source = file.read() - prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"LxFKernel", 0, [], [])) + #headers + #common.h + header_file_path = os.path.abspath(os.path.join(cuda_dir, 'common.h')) + with open(header_file_path, 'r') as file: + header_common = file.read() + + #SWECommon.h + header_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWECommon.h')) + with open(header_file_path, 'r') as file: + 
 
 """
 Class that solves the SW equations using the Lax Friedrichs scheme
@@ -51,20 +62,6 @@ class LxF (Simulator.BaseSimulator):
     g: Gravitational acceleration (9.81 m/s^2)
     """
-    def hip_check(call_result):
-        err = call_result[0]
-        result = call_result[1:]
-        if len(result) == 1:
-            result = result[0]
-        if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
-            raise RuntimeError(str(err))
-        elif (
-            isinstance(err, hiprtc.hiprtcResult)
-            and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
-        ):
-            raise RuntimeError(str(err))
-        return result
-
     def __init__(self,
                  context,
                  h0, hu0, hv0,
@@ -72,7 +69,7 @@ class LxF (Simulator.BaseSimulator):
                  dx, dy,
                  g,
                  cfl_scale=0.9,
-                 boundary_conditions=BoundaryCondition(),
+                 boundary_conditions=BoundaryCondition(),
                  block_width=16, block_height=16):
 
         # Call super constructor
@@ -82,28 +79,58 @@ class LxF (Simulator.BaseSimulator):
                          boundary_conditions,
                          cfl_scale,
                          1,
-                         block_width, block_height);
+                         block_width, block_height)
         self.g = np.float32(g)
 
-        # Get kernels
-#        module = context.get_module("cuda/SWE2D_LxF.cu",
-#                                    defines={
-#                                        'BLOCK_WIDTH': self.block_size[0],
-#                                        'BLOCK_HEIGHT': self.block_size[1]
-#                                    },
-#                                    compile_args={
-#                                        'no_extern_c': True,
-#                                        'options': ["--use_fast_math"],
-#                                    },
-#                                    jit_compile_args={})
-#        self.kernel = module.get_function("LxFKernel")
-#        self.kernel.prepare("iiffffiPiPiPiPiPiPiP")
+        #Get cuda kernels
+        """
+        module = context.get_module("cuda/SWE2D_LxF.cu",
+                                    defines={
+                                        'BLOCK_WIDTH': self.block_size[0],
+                                        'BLOCK_HEIGHT': self.block_size[1]
+                                    },
+                                    compile_args={
+                                        'no_extern_c': True,
+                                        'options': ["--use_fast_math"],
+                                    },
+                                    jit_compile_args={})
+        self.kernel = module.get_function("LxFKernel")
+        self.kernel.prepare("iiffffiPiPiPiPiPiPiP")
+        """
 
-        kernel_file_path = os.path.abspath(os.path.join('cuda', 'SWE2D_LxF.cu.hip'))
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        # Specify the relative path to the "cuda" directory
+        cuda_dir = os.path.join(current_dir, 'cuda')
+
+        #kernel source
+        kernel_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWE2D_LxF.cu.hip'))
         with open(kernel_file_path, 'r') as file:
             kernel_source = file.read()
 
-        prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"LxFKernel", 0, [], []))
+        #headers
+        #common.h
+        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'common.h'))
+        with open(header_file_path, 'r') as file:
+            header_common = file.read()
+
+        #SWECommon.h
+        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWECommon.h'))
+        with open(header_file_path, 'r') as file:
+            header_SWECommon = file.read()
+
+        #hip.hiprtc.hiprtcCreateProgram(const char *src, const char *name, int numHeaders, headers, includeNames)
+        prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"LxFKernel", 2, [header_common.encode(),header_SWECommon.encode()], [b"common.h", b"SWECommon.h"]))
+
+        #hip_check raises on failure, so the program handle is valid past this point
 
         props = hip.hipDeviceProp_t()
         hip_check(hip.hipGetDeviceProperties(props,0))
@@ -111,19 +138,38 @@ class LxF (Simulator.BaseSimulator):
 
         print(f"Compiling kernel .LxFKernel. for {arch}")
 
-        cflags = [b"--offload-arch="+arch]
+        cflags = [b"--offload-arch="+arch, b"-O2", b"-D BLOCK_WIDTH="+ str(self.block_size[0]).encode(), b"-D BLOCK_HEIGHT=" + str(self.block_size[1]).encode()]
+
+        err, = hiprtc.hiprtcCompileProgram(prog, len(cflags), cflags)
+        #On failure, dump the build log and abort
         if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS:
             log_size = hip_check(hiprtc.hiprtcGetProgramLogSize(prog))
             log = bytearray(log_size)
             hip_check(hiprtc.hiprtcGetProgramLog(prog, log))
             raise RuntimeError(log.decode())
+
         code_size = hip_check(hiprtc.hiprtcGetCodeSize(prog))
         code = bytearray(code_size)
         hip_check(hiprtc.hiprtcGetCode(prog, code))
-        module = hip_check(hip.hipModuleLoadData(code))
-        kernel = hip_check(hip.hipModuleGetFunction(module, b"LxFKernel"))
+
+        #Load the code as a module
+        self.module = hip_check(hip.hipModuleLoadData(code))
+
+        #Get the device kernel named "LxFKernel"
+        self.kernel = hip_check(hip.hipModuleGetFunction(self.module, b"LxFKernel"))
+
+        print()
+        print("--Device kernel *LxFKernel* retrieved successfully--")
+        print("--kernel", self.kernel)
+        print()
 
         #Create data by uploading to device
         self.u0 = Common.ArakawaA2D(self.stream,
@@ -135,72 +181,87 @@ class LxF (Simulator.BaseSimulator):
                         1, 1,
                         [None, None, None])
         #self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
-        data_h = np.empty(self.grid_size, dtype=np.float32)
-        num_bytes = data_h.size * data_h.itemsize
-        self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure(
-            typestr="float32",shape=self.grid_size)
 
         dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
         dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
         dt = min(dt_x, dt_y)
-        self.cfl_data.fill(dt, stream=self.stream)
-
+        #in HIP, the "DeviceArray" object doesn't have a 'fill' attribute
+        #self.cfl_data.fill(self.dt, stream=self.stream)
+        grid_dim_x, grid_dim_y, grid_dim_z = self.grid_size
+
+        data_h = np.zeros((grid_dim_x, grid_dim_y), dtype=np.float32)
+        num_bytes = data_h.size * data_h.itemsize
+        data_h.fill(dt)
+
+        self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure(
+            typestr="float32",shape=(grid_dim_x, grid_dim_y))
+
+        hip_check(hip.hipMemcpyAsync(self.cfl_data,data_h,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice,self.stream))
+        #alternatively, zero the device buffer asynchronously:
+        #hip_check(hip.hipMemsetAsync(self.cfl_data,0,num_bytes,self.stream))
+
     def substep(self, dt, step_number):
-#        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
-#                self.nx, 
self.ny, -# self.dx, self.dy, dt, -# self.g, -# self.boundary_conditions, -# self.u0[0].data.gpudata, self.u0[0].data.strides[0], -# self.u0[1].data.gpudata, self.u0[1].data.strides[0], -# self.u0[2].data.gpudata, self.u0[2].data.strides[0], -# self.u1[0].data.gpudata, self.u1[0].data.strides[0], -# self.u1[1].data.gpudata, self.u1[1].data.strides[0], -# self.u1[2].data.gpudata, self.u1[2].data.strides[0], -# self.cfl_data.gpudata) + #Cuda + """ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, + self.nx, self.ny, + self.dx, self.dy, dt, + self.g, + self.boundary_conditions, + self.u0[0].data.gpudata, self.u0[0].data.strides[0], + self.u0[1].data.gpudata, self.u0[1].data.strides[0], + self.u0[2].data.gpudata, self.u0[2].data.strides[0], + self.u1[0].data.gpudata, self.u1[0].data.strides[0], + self.u1[1].data.gpudata, self.u1[1].data.strides[0], + self.u1[2].data.gpudata, self.u1[2].data.strides[0], + self.cfl_data.gpudata) + self.u0, self.u1 = self.u1, self.u0 + """ + u00_strides0 = self.u0[0].data.shape[0]*np.float32().itemsize + u01_strides0 = self.u0[1].data.shape[0]*np.float32().itemsize + u02_strides0 = self.u0[2].data.shape[0]*np.float32().itemsize + + u10_strides0 = self.u1[0].data.shape[0]*np.float32().itemsize + u11_strides0 = self.u1[1].data.shape[0]*np.float32().itemsize + u12_strides0 = self.u1[2].data.shape[0]*np.float32().itemsize #launch kernel hip_check( hip.hipModuleLaunchKernel( - kernel, - *self.grid_size, - *self.block_size, - sharedMemBytes=0, + self.kernel, + *self.grid_size, #grid + *self.block_size, #block + sharedMemBytes=0, #65536, stream=self.stream, kernelParams=None, extra=( # pass kernel's arguments ctypes.c_int(self.nx), ctypes.c_int(self.ny), - ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(self.dt), + ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt), ctypes.c_float(self.g), ctypes.c_int(self.boundary_conditions), - ctypes.c_float(self.u0[0].data), ctypes.c_float(self.u0[0].data.strides[0]), - ctypes.c_float(self.u0[1].data), ctypes.c_float(self.u0[1].data.strides[0]), - ctypes.c_float(self.u0[2].data), ctypes.c_float(self.u0[2].data.strides[0]), - ctypes.c_float(self.u1[0].data), ctypes.c_float(self.u1[0].data.strides[0]), - ctypes.c_float(self.u1[1].data), ctypes.c_float(self.u1[1].data.strides[0]), - ctypes.c_float(self.u1[2].data), ctypes.c_float(self.u1[2].data.strides[0]), - self.cfl_data - ) + self.u0[0].data, ctypes.c_int(u00_strides0), + self.u0[1].data, ctypes.c_int(u01_strides0), + self.u0[2].data, ctypes.c_int(u02_strides0), + self.u1[0].data, ctypes.c_int(u10_strides0), + self.u1[1].data, ctypes.c_int(u11_strides0), + self.u1[2].data, ctypes.c_int(u12_strides0), + self.cfl_data, ) ) - - hip_check(hip.hipDeviceSynchronize()) + ) self.u0, self.u1 = self.u1, self.u0 - - hip_check(hip.hipModuleUnload(module)) - - hip_check(hip.hipFree(cfl_data)) - - print("--Launching Kernel .LxFKernel. is ok") + + #print("--Launching Kernel .LxFKernel. 
is ok") def getOutput(self): return self.u0 - + def check(self): self.u0.check() self.u1.check() - + # computing min with hipblas: the output is an index def min_hipblas(self, num_elements, cfl_data, stream): num_bytes = num_elements * np.dtype(np.float32).itemsize @@ -231,7 +292,7 @@ class LxF (Simulator.BaseSimulator): hip_check(hip.hipStreamDestroy(stream)) hip_check(hip.hipFree(cfl_data)) return min_value - + def computeDt(self): #max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get(); max_dt = self.min_hipblas(self.cfl_data.size, self.cfl_data, self.stream) diff --git a/GPUSimulators/MPISimulator.py b/GPUSimulators/MPISimulator.py index f13de52..c792fdd 100644 --- a/GPUSimulators/MPISimulator.py +++ b/GPUSimulators/MPISimulator.py @@ -30,6 +30,19 @@ import time #import nvtx from hip import hip, hiprtc +def hip_check(call_result): + err = call_result[0] + result = call_result[1:] + if len(result) == 1: + result = result[0] + if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess: + raise RuntimeError(str(err)) + elif ( + isinstance(err, hiprtc.hiprtcResult) + and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS + ): + raise RuntimeError(str(err)) + return result class MPIGrid(object): """ @@ -206,19 +219,6 @@ class MPISimulator(Simulator.BaseSimulator): """ Class which handles communication between simulators on different MPI nodes """ - def hip_check(call_result): - err = call_result[0] - result = call_result[1:] - if len(result) == 1: - result = result[0] - if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess: - raise RuntimeError(str(err)) - elif ( - isinstance(err, hiprtc.hiprtcResult) - and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS - ): - raise RuntimeError(str(err)) - return result def __init__(self, sim, grid): self.profiling_data_mpi = { 'start': {}, 'end': {} } @@ -306,58 +306,73 @@ class MPISimulator(Simulator.BaseSimulator): #Note that east and west also transfer ghost cells #whilst north/south only transfer internal cells #Reuses the width/height defined in the read-extets above - ##self.in_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32) #np.empty((self.nvars, self.read_e[3], self.read_e[2]), dtype=np.float32) + """ + self.in_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32) #np.empty((self.nvars, self.read_e[3], self.read_e[2]), dtype=np.float32) + self.in_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32) #np.empty((self.nvars, self.read_w[3], self.read_w[2]), dtype=np.float32) + self.in_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32) #np.empty((self.nvars, self.read_n[3], self.read_n[2]), dtype=np.float32) + self.in_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32) #np.empty((self.nvars, self.read_s[3], self.read_s[2]), dtype=np.float32) + """ - ##self.in_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32) #np.empty((self.nvars, self.read_w[3], self.read_w[2]), dtype=np.float32) - ##self.in_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32) #np.empty((self.nvars, self.read_n[3], self.read_n[2]), dtype=np.float32) - ##self.in_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32) #np.empty((self.nvars, 
self.read_s[3], self.read_s[2]), dtype=np.float32) - - self.in_e = np.empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32) + #HIP + self.in_e = np.zeros((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32) num_bytes_e = self.in_e.size * self.in_e.itemsize #hipHostMalloc allocates pinned host memory which is mapped into the address space of all GPUs in the system, the memory can be accessed directly by the GPU device #hipHostMallocDefault:Memory is mapped and portable (default allocation) #hipHostMallocPortable: memory is explicitely portable across different devices - self.in_e = hip_check(hip.hipHostMalloc(num_bytes_e,hip.hipHostMallocPortable)) + #self.in_e = hip_check(hip.hipHostMalloc(num_bytes_e,hip.hipHostMallocPortable)) + #hip_check(hip.hipHostGetDevicePointer(self.in_e, hip.hipHostMallocPortable)) - self.in_w = np.empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32) + #print("--hip.hipGetDeviceFlags():", hip.hipGetDeviceFlags()) + + self.in_w = np.zeros((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32) num_bytes_w = self.in_w.size * self.in_w.itemsize - self.in_w = hip_check(hip.hipHostMalloc(num_bytes_w,hip.hipHostMallocPortable)) + #self.in_w = hip_check(hip.hipHostMalloc(num_bytes_w,hip.hipHostMallocPortable)) + #hip_check(hip.hipHostGetDevicePointer(self.in_w, hip.hipHostMallocPortable)) - self.in_n = np.empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32) + self.in_n = np.zeros((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32) num_bytes_n = self.in_n.size * self.in_n.itemsize - self.in_n = hip_check(hip.hipHostMalloc(num_bytes_n,hip.hipHostMallocPortable)) + #self.in_n = hip_check(hip.hipHostMalloc(num_bytes_n,hip.hipHostMallocPortable)) + #hip_check(hip.hipHostGetDevicePointer(self.in_n, hip.hipHostMallocPortable)) - self.in_s = np.empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32) + self.in_s = np.zeros((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32) num_bytes_s = self.in_s.size * self.in_s.itemsize - self.in_s = hip_check(hip.hipHostMalloc(num_bytes_s,hip.hipHostMallocPortable)) + #self.in_s = hip_check(hip.hipHostMalloc(num_bytes_s,hip.hipHostMallocPortable)) + #hip_check(hip.hipHostGetDevicePointer(self.in_s, hip.hipHostMallocPortable)) #Allocate data for sending - #self.out_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32) #np.empty_like(self.in_e) - #self.out_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32) #np.empty_like(self.in_w) - #self.out_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32) #np.empty_like(self.in_n) - #self.out_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32) #np.empty_like(self.in_s) - - self.out_e = np.empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32) + """ + self.out_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32) #np.empty_like(self.in_e) + self.out_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32) #np.empty_like(self.in_w) + self.out_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32) 
#np.empty_like(self.in_n)
+        self.out_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32) #np.empty_like(self.in_s)
+        """
+
+        self.out_e = np.zeros((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32)
         num_bytes_e = self.out_e.size * self.out_e.itemsize
-        self.out_e = hip_check(hip.hipHostMalloc(num_bytes_e,hip.hipHostMallocPortable))
+        #self.out_e = hip_check(hip.hipHostMalloc(num_bytes_e,hip.hipHostMallocPortable))
+        #hip_check(hip.hipHostGetDevicePointer(self.out_e, hip.hipHostMallocPortable))
 
-        self.out_w = np.empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32)
+        self.out_w = np.zeros((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32)
         num_bytes_w = self.out_w.size * self.out_w.itemsize
-        self.out_w = hip_check(hip.hipHostMalloc(num_bytes_w,hip.hipHostMallocPortable))
+        #self.out_w = hip_check(hip.hipHostMalloc(num_bytes_w,hip.hipHostMallocPortable))
+        #hip_check(hip.hipHostGetDevicePointer(self.out_w, hip.hipHostMallocPortable))
 
-        self.out_n = np.empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32)
+        self.out_n = np.zeros((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32)
         num_bytes_n = self.out_n.size * self.out_n.itemsize
-        self.out_n = hip_check(hip.hipHostMalloc(num_bytes_n,hip.hipHostMallocPortable))
+        #self.out_n = hip_check(hip.hipHostMalloc(num_bytes_n,hip.hipHostMallocPortable))
+        #hip_check(hip.hipHostGetDevicePointer(self.out_n, hip.hipHostMallocPortable))
 
-        self.out_s = np.empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32)
+        self.out_s = np.zeros((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32)
         num_bytes_s = self.out_s.size * self.out_s.itemsize
-        self.out_s = hip_check(hip.hipHostMalloc(num_bytes_s,hip.hipHostMallocPortable))
+        #self.out_s = hip_check(hip.hipHostMalloc(num_bytes_s,hip.hipHostMallocPortable))
+        #hip_check(hip.hipHostGetDevicePointer(self.out_s, hip.hipHostMallocPortable))
 
         self.logger.debug("Simulator rank {:d} initialized on {:s}".format(self.grid.comm.rank, MPI.Get_processor_name()))
 
         self.full_exchange()
-        sim.context.synchronize()
+        #hip_check(hip.hipDeviceSynchronize())
+        #sim.context.synchronize()
 
     def substep(self, dt, step_number):
diff --git a/GPUSimulators/SHMEMSimulator.py b/GPUSimulators/SHMEMSimulator.py
index 8417d7e..73f28f0 100644
--- a/GPUSimulators/SHMEMSimulator.py
+++ b/GPUSimulators/SHMEMSimulator.py
@@ -29,24 +29,25 @@ from hip import hip, hiprtc
 
 import time
 
+def hip_check(call_result):
+    err = call_result[0]
+    result = call_result[1:]
+    if len(result) == 1:
+        result = result[0]
+    if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
+        raise RuntimeError(str(err))
+    elif (
+        isinstance(err, hiprtc.hiprtcResult)
+        and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
+    ):
+        raise RuntimeError(str(err))
+    return result
+
 class SHMEMSimulator(Simulator.BaseSimulator):
     """
     Class which handles communication and synchronization between simulators in different
    contexts (presumably on different GPUs)
     """
-    def hip_check(call_result):
-        err = call_result[0]
-        result = call_result[1:]
-        if len(result) == 1:
-            result = result[0]
-        if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
-            raise RuntimeError(str(err))
-        elif (
-            isinstance(err, hiprtc.hiprtcResult)
-            and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
-        ):
-            raise RuntimeError(str(err))
-        return result
 
     def __init__(self, sims, grid):
         self.logger = logging.getLogger(__name__)
diff --git a/GPUSimulators/SHMEMSimulatorGroup.py b/GPUSimulators/SHMEMSimulatorGroup.py
index c9dc30f..b7cb68a 100644
--- a/GPUSimulators/SHMEMSimulatorGroup.py
+++ b/GPUSimulators/SHMEMSimulatorGroup.py
@@ -29,24 +29,25 @@ from hip import hip, hiprtc
 
 import time
 
+def hip_check(call_result):
+    err = call_result[0]
+    result = call_result[1:]
+    if len(result) == 1:
+        result = result[0]
+    if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
+        raise RuntimeError(str(err))
+    elif (
+        isinstance(err, hiprtc.hiprtcResult)
+        and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
+    ):
+        raise RuntimeError(str(err))
+    return result
+
 class SHMEMGrid(object):
     """
     Class which represents an SHMEM grid of GPUs. Facilitates easy communication between
     neighboring subdomains in the grid. Contains one CUDA context per subdomain.
     """
-    def hip_check(call_result):
-        err = call_result[0]
-        result = call_result[1:]
-        if len(result) == 1:
-            result = result[0]
-        if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
-            raise RuntimeError(str(err))
-        elif (
-            isinstance(err, hiprtc.hiprtcResult)
-            and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
-        ):
-            raise RuntimeError(str(err))
-        return result
 
     def __init__(self, ngpus=None, ndims=2):
         self.logger = logging.getLogger(__name__)
diff --git a/GPUSimulators/Simulator.py b/GPUSimulators/Simulator.py
index b804d79..a6a41c2 100644
--- a/GPUSimulators/Simulator.py
+++ b/GPUSimulators/Simulator.py
@@ -22,6 +22,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 #Import packages we need
 import numpy as np
+import math
 import logging
 from enum import IntEnum
 
@@ -34,6 +35,20 @@ from hip import hip, hiprtc
 
 from GPUSimulators import Common
 
+def hip_check(call_result):
+    err = call_result[0]
+    result = call_result[1:]
+    if len(result) == 1:
+        result = result[0]
+    if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
+        raise RuntimeError(str(err))
+    elif (
+        isinstance(err, hiprtc.hiprtcResult)
+        and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
+    ):
+        raise RuntimeError(str(err))
+    return result
+
 class BoundaryCondition(object):
     """
     Class for holding boundary conditions for global boundaries
@@ -102,15 +117,6 @@ class BoundaryCondition(object):
 
 class BaseSimulator(object):
 
-    def hip_check(call_result):
-        err = call_result[0]
-        result = call_result[1:]
-        if len(result) == 1:
-            result = result[0]
-        if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
-            raise RuntimeError(str(err))
-        return result
-
     def __init__(self,
                  context,
                  nx, ny,
@@ -155,14 +161,19 @@ class BaseSimulator(object):
             block_width = int(peak_configuration["block_width"])
             block_height = int(peak_configuration["block_height"])
             self.logger.debug("Used autotuning to get block size [%d x %d]", block_width, block_height)
-
+
+        #Compute kernel launch parameters
+        """
         self.block_size = (block_width, block_height, 1)
         self.grid_size = (
            int(np.ceil(self.nx / float(self.block_size[0]))),
            int(np.ceil(self.ny / float(self.block_size[1])))
        )
-
+        """
+        self.block_size = hip.dim3(block_width, block_height)
+        #ceiling division: just enough blocks to cover the nx x ny domain
+        self.grid_size = hip.dim3(math.ceil(self.nx/block_width), math.ceil(self.ny/block_height))
+
         #Create a CUDA stream
         #self.stream = cuda.Stream()
         #self.internal_stream = cuda.Stream()
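
The launch geometry above is a plain integer ceiling division. A minimal, self-contained sketch of the same computation (the domain and block sizes here are assumed purely for illustration):

    import math
    from hip import hip

    nx, ny = 100, 100                    # assumed domain size
    block_width, block_height = 16, 16   # assumed block size

    block_size = hip.dim3(block_width, block_height)
    # ceiling division: just enough blocks to cover all nx x ny cells
    grid_size = hip.dim3(math.ceil(nx / block_width), math.ceil(ny / block_height))
    assert (grid_size.x, grid_size.y, grid_size.z) == (7, 7, 1)

Computing the grid with a single ceiling division avoids the over-allocation that combining math.ceil with an (nx+block_width-1) offset would introduce whenever nx is an exact multiple of block_width.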
diff --git a/GPUSimulators/WAF.py b/GPUSimulators/WAF.py
index 7e2763c..b7fd664 100644
--- a/GPUSimulators/WAF.py
+++ b/GPUSimulators/WAF.py
@@ -1,8 +1,8 @@
 # -*- coding: utf-8 -*-
 """
-This python module implements the Weighted average flux (WAF) described in
-E. Toro, Shock-Capturing methods for free-surface shallow flows, 2001
+This python module implements the weighted average flux (WAF) scheme described
+in E. Toro, Shock-Capturing methods for free-surface shallow flows, 2001
 
 Copyright (C) 2016 SINTEF ICT
 
@@ -28,8 +28,21 @@ import ctypes
 
 #from pycuda import gpuarray
 from hip import hip,hiprtc
+from hip import hipblas
 
-
+def hip_check(call_result):
+    err = call_result[0]
+    result = call_result[1:]
+    if len(result) == 1:
+        result = result[0]
+    if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
+        raise RuntimeError(str(err))
+    elif (
+        isinstance(err, hiprtc.hiprtcResult)
+        and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
+    ):
+        raise RuntimeError(str(err))
+    return result
 
 """
 Class that solves the SW equations using the weighted average flux (WAF) scheme
 """
@@ -49,22 +62,8 @@ class WAF (Simulator.BaseSimulator):
     g: Gravitational acceleration (9.81 m/s^2)
     """
-    def hip_check(call_result):
-        err = call_result[0]
-        result = call_result[1:]
-        if len(result) == 1:
-            result = result[0]
-        if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
-            raise RuntimeError(str(err))
-        elif (
-            isinstance(err, hiprtc.hiprtcResult)
-            and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
-        ):
-            raise RuntimeError(str(err))
-        return result
-
     def __init__(self,
-                 context,
+                 context,
                  h0, hu0, hv0,
                  nx, ny,
                  dx, dy,
@@ -80,28 +79,58 @@ class WAF (Simulator.BaseSimulator):
                          boundary_conditions,
                          cfl_scale,
                          2,
-                         block_width, block_height);
+                         block_width, block_height)
         self.g = np.float32(g)
 
-        #Get kernels
-#        module = context.get_module("cuda/SWE2D_WAF.cu",
-#                                    defines={
-#                                        'BLOCK_WIDTH': self.block_size[0],
-#                                        'BLOCK_HEIGHT': self.block_size[1]
-#                                    },
-#                                    compile_args={
-#                                        'no_extern_c': True,
-#                                        'options': ["--use_fast_math"],
-#                                    },
-#                                    jit_compile_args={})
-#        self.kernel = module.get_function("WAFKernel")
-#        self.kernel.prepare("iiffffiiPiPiPiPiPiPiP")
-
-        kernel_file_path = os.path.abspath(os.path.join('cuda', 'SWE2D_WAF.cu.hip'))
+        #Get cuda kernels
+        """
+        module = context.get_module("cuda/SWE2D_WAF.cu",
+                                    defines={
+                                        'BLOCK_WIDTH': self.block_size[0],
+                                        'BLOCK_HEIGHT': self.block_size[1]
+                                    },
+                                    compile_args={
+                                        'no_extern_c': True,
+                                        'options': ["--use_fast_math"],
+                                    },
+                                    jit_compile_args={})
+        self.kernel = module.get_function("WAFKernel")
+        self.kernel.prepare("iiffffiPiPiPiPiPiPiP")
+        """
+
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        # Specify the relative path to the "cuda" directory
+        cuda_dir = os.path.join(current_dir, 'cuda')
+
+        #kernel source
+        kernel_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWE2D_WAF.cu.hip'))
         with open(kernel_file_path, 'r') as file:
             kernel_source = file.read()
 
-        prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"WAFKernel", 0, [], []))
+        #headers
+        #common.h
+        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'common.h'))
+        with open(header_file_path, 'r') as file:
+            header_common = file.read()
+
+        #SWECommon.h
+        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWECommon.h'))
+        with open(header_file_path, 'r') as file:
+            header_SWECommon = file.read()
+
+        #hip.hiprtc.hiprtcCreateProgram(const char *src, const char *name, int numHeaders, headers, includeNames)
+        prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"WAFKernel", 2, [header_common.encode(),header_SWECommon.encode()], [b"common.h", b"SWECommon.h"]))
+
+        #hip_check raises on failure, so the program handle is valid past this point
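+        # Note (illustrative): BLOCK_WIDTH/BLOCK_HEIGHT are injected below as -D
+        # compile flags, taking over the role of the `defines` dict used by the
+        # retired PyCUDA path quoted above.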
        props = hip.hipDeviceProp_t()
        hip_check(hip.hipGetDeviceProperties(props,0))
@@ -109,19 +138,38 @@ class WAF (Simulator.BaseSimulator):
 
         print(f"Compiling kernel .WAFKernel. for {arch}")
 
-        cflags = [b"--offload-arch="+arch]
+        cflags = [b"--offload-arch="+arch, b"-O2", b"-D BLOCK_WIDTH="+ str(self.block_size[0]).encode(), b"-D BLOCK_HEIGHT=" + str(self.block_size[1]).encode()]
+
+        err, = hiprtc.hiprtcCompileProgram(prog, len(cflags), cflags)
+        #On failure, dump the build log and abort
         if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS:
             log_size = hip_check(hiprtc.hiprtcGetProgramLogSize(prog))
             log = bytearray(log_size)
             hip_check(hiprtc.hiprtcGetProgramLog(prog, log))
             raise RuntimeError(log.decode())
+
         code_size = hip_check(hiprtc.hiprtcGetCodeSize(prog))
         code = bytearray(code_size)
         hip_check(hiprtc.hiprtcGetCode(prog, code))
-        module = hip_check(hip.hipModuleLoadData(code))
-        kernel = hip_check(hip.hipModuleGetFunction(module, b"WAFKernel"))
+
+        #Load the code as a module
+        self.module = hip_check(hip.hipModuleLoadData(code))
+
+        #Get the device kernel named "WAFKernel"
+        self.kernel = hip_check(hip.hipModuleGetFunction(self.module, b"WAFKernel"))
+
+        print()
+        print("--Device kernel *WAFKernel* retrieved successfully--")
+        print("--kernel", self.kernel)
+        print()
 
         #Create data by uploading to device
         self.u0 = Common.ArakawaA2D(self.stream,
@@ -133,69 +181,84 @@ class WAF (Simulator.BaseSimulator):
                         2, 2,
                         [None, None, None])
         #self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
-        data_h = np.empty(self.grid_size, dtype=np.float32)
-        num_bytes = data_h.size * data_h.itemsize
-        self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure(
-            typestr="float32",shape=self.grid_size)
 
         dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
         dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
         dt = min(dt_x, dt_y)
-        self.cfl_data.fill(dt, stream=self.stream)
-
+        #in HIP, the "DeviceArray" object doesn't have a 'fill' attribute
+        #self.cfl_data.fill(self.dt, stream=self.stream)
+        grid_dim_x, grid_dim_y, grid_dim_z = self.grid_size
+
+        data_h = np.zeros((grid_dim_x, grid_dim_y), dtype=np.float32)
+        num_bytes = data_h.size * data_h.itemsize
+        data_h.fill(dt)
+
+        self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure(
+            typestr="float32",shape=(grid_dim_x, grid_dim_y))
+
+        hip_check(hip.hipMemcpyAsync(self.cfl_data,data_h,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice,self.stream))
+        #alternatively, zero the device buffer asynchronously:
+        #hip_check(hip.hipMemsetAsync(self.cfl_data,0,num_bytes,self.stream))
+
     def substep(self, dt, step_number):
         self.substepDimsplit(dt*0.5, step_number)
-
+
     def substepDimsplit(self, dt, substep):
-#        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
-#                self.nx, self.ny,
-#                self.dx, self.dy, dt,
-#                self.g,
-#                substep,
-#                self.boundary_conditions,
-#                self.u0[0].data.gpudata, self.u0[0].data.strides[0],
-#                self.u0[1].data.gpudata, self.u0[1].data.strides[0],
-#                self.u0[2].data.gpudata, self.u0[2].data.strides[0],
-#                
self.u1[0].data.gpudata, self.u1[0].data.strides[0], -# self.u1[1].data.gpudata, self.u1[1].data.strides[0], -# self.u1[2].data.gpudata, self.u1[2].data.strides[0], -# self.cfl_data.gpudata) + + def substepDimsplit(self, dt, substep): + #Cuda + """ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, + self.nx, self.ny, + self.dx, self.dy, dt, + self.g, + substep, + self.boundary_conditions, + self.u0[0].data.gpudata, self.u0[0].data.strides[0], + self.u0[1].data.gpudata, self.u0[1].data.strides[0], + self.u0[2].data.gpudata, self.u0[2].data.strides[0], + self.u1[0].data.gpudata, self.u1[0].data.strides[0], + self.u1[1].data.gpudata, self.u1[1].data.strides[0], + self.u1[2].data.gpudata, self.u1[2].data.strides[0], + self.cfl_data.gpudata) + self.u0, self.u1 = self.u1, self.u0 + """ + u00_strides0 = self.u0[0].data.shape[0]*np.float32().itemsize + u01_strides0 = self.u0[1].data.shape[0]*np.float32().itemsize + u02_strides0 = self.u0[2].data.shape[0]*np.float32().itemsize + + u10_strides0 = self.u1[0].data.shape[0]*np.float32().itemsize + u11_strides0 = self.u1[1].data.shape[0]*np.float32().itemsize + u12_strides0 = self.u1[2].data.shape[0]*np.float32().itemsize #launch kernel hip_check( hip.hipModuleLaunchKernel( - kernel, - *self.grid_size, - *self.block_size, - sharedMemBytes=0, + self.kernel, + *self.grid_size, #grid + *self.block_size, #block + sharedMemBytes=0, #65536, stream=self.stream, kernelParams=None, extra=( # pass kernel's arguments ctypes.c_int(self.nx), ctypes.c_int(self.ny), - ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(self.dt), + ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt), ctypes.c_float(self.g), - ctypes.c_int(substep), + ctypes.c_int(substep), ctypes.c_int(self.boundary_conditions), - ctypes.c_float(self.u0[0].data), ctypes.c_float(self.u0[0].data.strides[0]), - ctypes.c_float(self.u0[1].data), ctypes.c_float(self.u0[1].data.strides[0]), - ctypes.c_float(self.u0[2].data), ctypes.c_float(self.u0[2].data.strides[0]), - ctypes.c_float(self.u1[0].data), ctypes.c_float(self.u1[0].data.strides[0]), - ctypes.c_float(self.u1[1].data), ctypes.c_float(self.u1[1].data.strides[0]), - ctypes.c_float(self.u1[2].data), ctypes.c_float(self.u1[2].data.strides[0]), - self.cfl_data - ) + self.u0[0].data, ctypes.c_int(u00_strides0), + self.u0[1].data, ctypes.c_int(u01_strides0), + self.u0[2].data, ctypes.c_int(u02_strides0), + self.u1[0].data, ctypes.c_int(u10_strides0), + self.u1[1].data, ctypes.c_int(u11_strides0), + self.u1[2].data, ctypes.c_int(u12_strides0), + self.cfl_data, ) ) - - hip_check(hip.hipDeviceSynchronize()) + ) self.u0, self.u1 = self.u1, self.u0 - - hip_check(hip.hipModuleUnload(module)) - - hip_check(hip.hipFree(cfl_data)) - - print("--Launching Kernel .WAFKernel. is ok") + + #print("--Launching Kernel .WAFKernel. 
is ok") def getOutput(self): return self.u0 @@ -203,7 +266,7 @@ class WAF (Simulator.BaseSimulator): def check(self): self.u0.check() self.u1.check() - + # computing min with hipblas: the output is an index def min_hipblas(self, num_elements, cfl_data, stream): num_bytes = num_elements * np.dtype(np.float32).itemsize diff --git a/GPUSimulators/__pycache__/MPISimulator.cpython-39.pyc b/GPUSimulators/__pycache__/MPISimulator.cpython-39.pyc deleted file mode 100644 index da4daacc64c69c69b772807cf18375b7c8cf127b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 11058 zcmb_iOK=-UdY%`6!4RYS*uv09;PGHib$-8;*6=r=nJH)=_{+;GFt zvX)xyE$LTlk$rZlXuop)!W;JL^2*wcoA!0T>PK$FN@ta=s&tz+7N|(iv%^*;+H|FN z$=+&p?6Mo!(i46dNq@Z)VcE!b1F?v80Eb`MvQW_pgeUE&=Gl=ao1xvRaN*X$jf zMA^RBUm%_BoZXVvSvMkprQK@Npdz5$vVoAjaSI1Gytgl5`vC)8YqhbB8fL@BHvLA! zUia)y=v6w6a~3dQzrVcp_Ivl%?5isu*zaGxd-v+f+6QlTqmMeD9iq6>~AiG?PqV}c5#m&vlg=)}QXvyj#PMS_RT)eVi zeFa`oR*GtzU+q&U3OY!(OfADd9fh~>GJkmr_(~~hLM`dyw9tj|SS=Z1PGm(6xr{JH z{;^WZ3QLUPn-c{wj;|>u#4&vHVp1H(*Ai3W1ioY9j5sM~FtQ*{iCNT)iwyW{KF$G@ zThbTXp=IN@MBHO<*8Fk}IBQEUWEKM(5SLZakx&=3gcmN@H{G(|053&eXnStB1=Kd1 zoxm?M;ev9aP0tH>-oOJduD7HQobqU{{R+&jlxgGrALUKFOL)T?l1S;Pk+!RL6-Vy^ z9rv|x{vAbVkJOLUu39&Qem@f~pvIt@DAUVQZCBgXFe2Bbab{Pk>%F|l^ep6Zc+L7) zSLqdYm0f*Tg@96kitLKaV7=`(?*(C}-EPSU)Dm=>>tG>lS@;k(#DCcLyPYs1zKAwk zHaLa2x7i|D^p#38oZ)sRJG3@{naFMh9`Aem;*#`$FfqXACHohbb{6crZg3wtu>*Py zfykB=et?eDS{#pajaIb^1$`PT?rQ( z*gZ(&!Xh}L?zYdG)m)pU3YfuPqY&EZ+13Kk|OoYIK_3tvmOen%`c;cfbhY zB4}z*T4-<4j%bYI{1D59lSmY`piQb%c(a)KObfGZ@fz6%)EP~&_L;8x$VjLsyX@-%p4})@^+RspO2)H-EDnCVANV6#F z6rK@~sr_h>X`_dM%-{lxryw?F6X28MKL9*oBiX?-lsgK{_q{N>h7JQT=SRV$_M^c( z0<)mW6tjKW9KtM7BxCkSxJ(e%3WQee)-Yhe-+*EI9gc?V2m}%t>O_V>pgMv;wCtU} zvU`6{> z1u(Opf+VSQ8q7|-c3+b(p+>2qZWZK<1xZe!yaVvN8U%xa&HM^|v>64uP?MkYmbEz^ zKdU^_b~4=zZNIC5zI62de6LJ058H#ePNz$A3iWktaN~`x-ZcoS1fI)EKSz)ijqPRD z-()e+@2*$wz&L&2HMU?Cxw2e?MG)0wt5dDnE{w>ETZV1!Z?j>LSQ^Pl+fL}3j+n&; zi-lq~R)g4Rc$G+!e2a~Buv95e=~@Sdf2>Qd`BaNF(Thu0oC92qK*uR$UU10#wJ?|W?tef&@tQOmayvp z_{e=IdvHwqWIqBWY3G4VARS)?d>>AdweqQg6EcMI7utpa7}Oyq0r@YSj}5W`pQ<1) ztjPx99xue!WkNn|zxI`X#P`^(bNmZ z3)=SdVZZ_nbf`rYo>jt-6EcsA*m9gCJ^-V*;5Zu{w~?C2ImZ#Lvg628=$BX~Y>GTf z#pft_o{|?RAqJFhQbOD^M2T#&Fptt{QQweCrfL{g!OEGsnMt1|E5~+t8OXQkfW*e9T(QUPQ#6&HZF zfo;8gkLhmMxvL?J(SVdb)MeWXry!aU!1t+CJPH4XofZ~hJl}|vA*7LwOdAZRaXdaA zp?(RC4p!iKG9>e(G#klf2S@FIJ;U1-F}e5Ca9REO8XJBGJ_cf3nuCh#7^J4yC1f5%QM zyB5hvlz1M@ew<8sX9@sqZFXo3YIw{n z)nm*o@cn$3jj!khSe4(~RqEe|sirtTh+YKk9H$zBqW(k3V{>2KS=8l|y1IqDmFCBg zAB&2;m%0{c$Lx-Y@ot_>xGHdmxqrlM=AmuplD6(x*Xrie{x7F>`JuW6KsT62XhNgD zWXU^=Xh~jy@>JT&6vLq@!DV8p`Eh=oH&{N%uhZ`;SE z10%j|A8#HQ@ooFKd|<@4?c=97DWi8qO!j^cxvBbF^{dgf-V$O;a12zOa_{=Cf-j^% z?*`>K%5;3CsKFM3eFKXT7Gk2#mmxDtCnBfW_MI96kB;}S3;~Uh&O2y{=MQOtHrosu zEmy>Ihqb~;rnct}YePs+r18+$N;-s**NzK;6CujwP$6X7w`d>&L{23@Cf6-CD3}!+ zo3O(ZlbEEvyohA`MQEEO0?(EZhq5S`;;&3BZVI$-kptoi2M!2-H`6aWCa4eN z+@?fK4jt0WWqbq zyO(@6lTVO*LV1jEHc1H?j!8(3iRUVc@i^`m8Uw12$HJ|k4EHg>(G&|woa>Ki7#SGy zeI%t5>nRmDppujfN~Z#E(>bg`tNJCaS}z&Zvh$GBE%^b>RYC$kChs_WZv^{r&&DH5 z@n7+Vze0jQBuDZnnm7Dda49LuN1hp_8r)1no&4I!nOauO!cXJJFt`;x{>#i3w0XFq zrkbQLaz07nlIqjyG}bB@^Jqt2pHunCzM}G@r(f+^tjlr5ftfdmiH;a%gk=gkEa45m zL;`WeQAMF;l+$Dc!6AAbcH3oYAy*J?A>BBL%oBqI*+^ZsU)M8OywO}aDEdR@>SfX% zT0Kj~R63VA$ir|W2t+bIy@M}9@vA~JE9br5%&*+6%r?W zLL~a=gy>WdiiV4$@jj%BhNo)645P7AZdxM{Xck4?o*6d&DYB6Wm7^CWp6oB0Xe#+j zRP~o8{K!9}WZ%Nn@cz)kvm*-+aq-bhkM4JP**JqZQu+R&3x9-#OG67!jV#RLMwTQd zARWUfUPp%Sm6RUrB-{;wly}vt%CP{Yj%Zi}q8A!sAJm?$=js&w(4u_Z68aOJ33;qn z5Jqnt@dEfGU>KyS=I1<|Fx)6{%$C$6F8#Dtd;2-|bZrsQtxsj=oLz0S)^R`Fujo6o 
diff --git a/GPUSimulators/__pycache__/MPISimulator.cpython-39.pyc b/GPUSimulators/__pycache__/MPISimulator.cpython-39.pyc
deleted file mode 100644
index da4daacc64c69c69b772807cf18375b7c8cf127b..0000000000000000000000000000000000000000
GIT binary patch
[binary delta omitted]
diff --git a/GPUSimulators/__pycache__/Simulator.cpython-39.pyc b/GPUSimulators/__pycache__/Simulator.cpython-39.pyc
deleted file mode 100644
index dc16706a34d26a4641268277e3f0d257f351d864..0000000000000000000000000000000000000000
GIT binary patch
[binary delta omitted]
diff --git a/GPUSimulators/__pycache__/__init__.cpython-39.pyc b/GPUSimulators/__pycache__/__init__.cpython-39.pyc
deleted file mode 100644
index a966589566cf0036cb97a40facf080e9dc3106f1..0000000000000000000000000000000000000000
GIT binary patch
[binary delta omitted]
diff --git a/GPUSimulators/cuda/EE2D_KP07_dimsplit.cu.hip b/GPUSimulators/cuda/EE2D_KP07_dimsplit.cu.hip
index 67b701b..590ae86 100644
--- a/GPUSimulators/cuda/EE2D_KP07_dimsplit.cu.hip
+++ b/GPUSimulators/cuda/EE2D_KP07_dimsplit.cu.hip
@@ -25,7 +25,6 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 #include "EulerCommon.h"
 #include "limiters.h"
 
-
 __device__
 void computeFluxF(float Q[4][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
                   float Qx[4][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
@@ -248,4 +247,4 @@ __global__ void KP07DimsplitKernel(
     }
 }
 
-} // extern "C"
\ No newline at end of file
+} // extern "C"
diff --git a/GPUSimulators/cuda/common.h b/GPUSimulators/cuda/common.h
index 5463294..9be378e 100644
--- a/GPUSimulators/cuda/common.h
+++ b/GPUSimulators/cuda/common.h
@@ -24,6 +24,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 #pragma once
 
+#include <hip/hip_runtime.h>
+#include <float.h>
 
 /**
  * Float3 operators
  */
@@ -86,9 +88,6 @@ __device__ float desingularize(float x_, float eps_) {
 
 
-
-
-
 /**
  * Returns the step stored in the leftmost 16 bits
  * of the 32 bit step-order integer
  */
@@ -497,14 +496,18 @@ __device__ void memset(float Q[vars][shmem_height][shmem_width], float value) {
 
 template<unsigned int threads>
-__device__ void reduce_max(float* data, unsigned int n) {
+//__device__ void reduce_max(float* data, unsigned int n) {
+__device__ float reduce_max(float* data, unsigned int n) {
     __shared__ float sdata[threads];
     unsigned int tid = threadIdx.x;
 
     //Reduce to "threads" elements
     sdata[tid] = FLT_MIN;
     for (unsigned int i=tid; i<n; i += threads) {
[...]
diff --git a/GPUSimulators/helpers/__pycache__/InitialConditions.cpython-39.pyc b/GPUSimulators/helpers/__pycache__/InitialConditions.cpython-39.pyc
new file mode 100644
GIT binary patch
[binary literal omitted]
z8`6Bnq;jSx2vj+8`7z2b6QeDdjUMi zfLBz!36kUboHi~yuO(%Y9q}`H=m38*xP>>4zox#F=}$_T&IB4WPNpT}(wnj*%W^_a zN=FvkIoQ0pDi~(qGa{ej#`T+}nx2|%;q!E&?ZwMsD7YVBd)F$dwdeD=ljb@e*?Ou6 z`?!a~!fUDJ!#4NpdYkTH%KVgVr#goOAEx^L^;93?-oJt8jpv%E;+@aq#eOSV><(b_ zUjN1-)_uFt?{`(fYsQ;zy!p-RxW4t~V#jZG7VrCkAA5fmb_YH0!Q&^J9lyW${-d>b zzq_)wi2AzU8^8`j5iNGSZXY(hxPlk_M)zbVL<{|4TEMXG?Z?2g3+O}y`SM=aX`G@xu7P^p$nt*F!Jd#Ta&f;2C@tY$|3 z0eF58r)JM<1U0K(>0le6eKW$6kfGJ>{moXRi6yAp!%;~TNIf|^wdNT)gKFyVa4(1& zSZiPnU!^M2`$0|`Sd4$OV>Xwjdm;6W0GDQT_kt19{+<~ zP4=tSLrW}aNo~i9=GA=X6#oj=DJOH1fqz}`fd!DigLPzTHuV@f4yimT4U}`LT!9T) zWXDy)h>Fai3|miZIT^EUo$J!V)8NJ@1~-*u=V5Wgh8=K*Ysn1mYMgtPKhl^c3)0~xc2EKGHZ&FF8T83K4{7!N zGt%4v!T^J2V@J-?TmYvq^7GoEk<3BY?1`?TEPX{?^UyW_vMxD?c{mGw^C$WyWce%V zI|qH|UZL+i^qo7=2g|~GT#yF!KDj98e7xv1X|k(b&Ewh+X8hT!KI1OI!eR!pr{H*65d_0wDfp0(|zdBwWxIpMW7gZq1Uq6f^SZVj^9Agk?WwOv(H+W1HtkeBVJjPQ3CxVxJ&Q7?3W zo62lc)~Oc;CpEn@s26s}s=K00{U!T4ZMr6P{}CFZ?}BJK7%lMenA!?K2{A9ROAN-0 zR+HK0HDD^9#x)@8*pu*87vF)NCauu%brFRUZiTHsgDrEGm|RbQUr8?3wi*1R)`ES) z-po(*B##ny#7jK;6}-YLcn)}{^%V(4X)i*jacF%%mz0u;q?}A9m1GL?7Q87uElTu# z6ZT8IGLV6t@MciT$!XZ{3!AROdhKME7hrV-ScFROBHUgHjI{EVE>N@VA9c84Wr zAz{n2yeRWW2GFL;i#ucu#68DL8Fy5hkTzs6YSmVjMbwJCKsbdMgY*}ODnuv(>P<{`YhH$+P2Pzbgl!}odSMG?b z*a*TU@`oVz#n-5K1>{rQxc+TPS-1EG%Hj_|vRD7Qe5ZZ+!@v3a$-gu2jFeru^W03W zAQT8>tO${GCpEoB1h2N|Me*}uYKTslS|FQ)>;bd6H)g~qMv=01YPB1^UL&2^8pWKO z$c?g~GpbF9KSD=RGYq^)T%)3O;E6C&VMJ;pa{>l6J@Fd#c|o=BA^Y$IAyH~>0J2hN z&1(^;AfQg|$DZ(!3N>pN$GYxx7ADw4betKhV<|>OqAPRpjQ8F zwfG<7{fPRaSp*2CK|z-K85h)Z)=+$QnUxWE!SfqscE!+HF8lq?E>!3}rql29oGqS0 z*QcaV(|*15Qj|Xj7NG})G*Jua@v-(XYw46he`F^)U`U>uiOsE~z;g+q$hiIIfhVUi zfp|s8jbjY)N}N!}977&(;R7dN8NiO%f=@;%&ZA~jwL^BO<7XhgFLP%{PbQVMPI6N^ zuvUB(wmS)HSGEh80%R(RTcN1_M5Y4U&+NNM*@Z0f3D|dusIu<~*ta>d@2MCVAZ_Z; zkF+1L!yM3Tdi0|h!---0+odm3URKtB1{ni#27JYlCD18l{WHL=0<8ZGFrt{u%2`-{ z<)EufcP$*WJJ|Th=HNVg9;L@fe6L===H6Ji{+Eh(83FC<=$9a;R+VA346bwIBgDxuA~VeS z_>5RdF3D4Vr&fWmAL~X4irF{S{4vx@s96#*(vgk`t$>`4v|vqX6_N?G=CBHuoS4&g zOlj~O-Z5S&lUFj>6(bosg|QEAtAULpXrur#8}}%ij`u>h)gAa;oq@djS_ZlFk^;yk zLCEn%kU#*jjyw7}NDNns1Ou*;wql}62zsx^`nK^5>BJF?WFF!iQ`9wkR)XWESbvry zCnjkMjFBW#2$Dq7ktHf0@O)wgE^0RP1#oXxFpV0vxsbewn#oHD#C3#jm|i~d3AyrP z%{0|{OnLqgi3!K|Yvolz_M1UFY?#t}1Z>zj{Tp1k)N~aj%?BRNk&s1xgK9Y-E9!zE zpk9DO;Q~ITS+|;lEr2IW%^d`NwR~#y8v7uAkO4KYF5qFWJ{2|n{%|4tPN+_H{(?TW z3R1Hb^rkEj#6w`!uj|9U_yJW|E5lIAZO8JQNkYM17{;A7^?`p!bL4_(NN-fe37Df& z4#|(Yb)-Jv9ef9GzUy^&{otY3?RCO#e6a9VH*7Y#(d`Aa)yWiI;*5VMbDFpD9W{$> z`l6=&*HSI7#w)&u8l~;=&DWJnQUlDq5ruZO@|c&|B3YBRpzu(9l_yErW)>@+_>q(~ zp6i)!qNB{QDY1_`|F7U@iJw_D?2DHO^jXC$BLSrQ;L2B-fsqH$#=?YlOI`oFq|9L? zVBX2L6YV6mr$$rqn_p#d!xHf+pgnfome=dYL*={GY>D?sVw1=RMCklL&|GEnM@AX6AcMb@YM555pb#VC9MREBd7`_rw?(bTkQN zj<0hlkONlX`)@Jj2;VJ)@5=R_0%i)|$%&4;VB%4zewugMEt6a^HZrgVrkSq@)y-LajYR z8csnx-1%`}o^gPK87J6+#tEgdo5}O?X?bu?#zj_!L|}r2se*09qGulth8NN(FFiFrE+;M*+j!?+FQ}RY))oowiB^Y!!aY zsgQsYf;{kjp3fj8I4jQr#j6KzD3|s?ozTZ{-fN9vmyQr$+du*FKO*6{vE_GtoL*B> zpv1>5^tVeYs6QS2tNgzGSJi)b@E->sEoDYb4ncH@DA(bl^ckg<2a_woZlmk-<5POk zBcanapV|2}YNGR~F-*Kc)z?8%qp=lbX8*$C{}c_ibEhmm&5!<*@TCFzDJy>!u8x`Z z39^nw=*8h?Oowpr4m`tyQvje)KmouM0N{A)vzfmRyUZeTQ(6DW_N&GPM!#9ZK`cIv z*xUh~W?$FKdWBulzceB*h=}Hat