From e2b1281f5bbc47572d35b934a040193eca6e1eb2 Mon Sep 17 00:00:00 2001 From: Hicham Agueny Date: Mon, 26 Feb 2024 12:55:34 +0100 Subject: [PATCH] update streams with hip-python --- GPUSimulators/MPISimulator.py | 14 +++++++----- GPUSimulators/SHMEMSimulatorGroup.py | 33 ++++++++++++++++++++++------ 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/GPUSimulators/MPISimulator.py b/GPUSimulators/MPISimulator.py index 29706d0..f13de52 100644 --- a/GPUSimulators/MPISimulator.py +++ b/GPUSimulators/MPISimulator.py @@ -219,7 +219,7 @@ class MPISimulator(Simulator.BaseSimulator): ): raise RuntimeError(str(err)) return result - + def __init__(self, sim, grid): self.profiling_data_mpi = { 'start': {}, 'end': {} } self.profiling_data_mpi["start"]["t_mpi_halo_exchange"] = 0 @@ -382,8 +382,10 @@ class MPISimulator(Simulator.BaseSimulator): self.full_exchange() #nvtx.mark("sync start", color="blue") - self.sim.stream.synchronize() - self.sim.internal_stream.synchronize() + #self.sim.stream.synchronize() + #self.sim.internal_stream.synchronize() + hip_check(hip.hipStreamSynchronize(self.sim.stream)) + hip_check(hip.hipStreamSynchronize(self.sim.internal_stream)) #nvtx.mark("sync end", color="blue") self.profiling_data_mpi["n_time_steps"] += 1 @@ -433,7 +435,8 @@ class MPISimulator(Simulator.BaseSimulator): if self.south is not None: for k in range(self.nvars): self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_s[k,:,:], asynch=True, extent=self.read_s) - self.sim.stream.synchronize() + #self.sim.stream.synchronize() + hip_check(hip.hipStreamSynchronize(self.sim.stream)) self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] += time.time() @@ -488,7 +491,8 @@ class MPISimulator(Simulator.BaseSimulator): if self.west is not None: for k in range(self.nvars): self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_w[k,:,:], asynch=True, extent=self.read_w) - self.sim.stream.synchronize() + #self.sim.stream.synchronize() + hip_check(hip.hipStreamSynchronize(self.sim.stream)) self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] += time.time() diff --git a/GPUSimulators/SHMEMSimulatorGroup.py b/GPUSimulators/SHMEMSimulatorGroup.py index fc11d50..c9dc30f 100644 --- a/GPUSimulators/SHMEMSimulatorGroup.py +++ b/GPUSimulators/SHMEMSimulatorGroup.py @@ -24,7 +24,8 @@ import logging from GPUSimulators import Simulator, CudaContext import numpy as np -import pycuda.driver as cuda +#import pycuda.driver as cuda +from hip import hip, hiprtc import time @@ -33,13 +34,28 @@ class SHMEMGrid(object): Class which represents an SHMEM grid of GPUs. Facilitates easy communication between neighboring subdomains in the grid. Contains one CUDA context per subdomain. """ + def hip_check(call_result): + err = call_result[0] + result = call_result[1:] + if len(result) == 1: + result = result[0] + if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess: + raise RuntimeError(str(err)) + elif ( + isinstance(err, hiprtc.hiprtcResult) + and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS + ): + raise RuntimeError(str(err)) + return result + def __init__(self, ngpus=None, ndims=2): self.logger = logging.getLogger(__name__) - cuda.init(flags=0) - self.logger.info("Initializing CUDA") - num_cuda_devices = cuda.Device.count() - + #cuda.init(flags=0) + self.logger.info("Initializing HIP") + #num_cuda_devices = cuda.Device.count() + num_cuda_devices = hip_check(hip.hipGetDeviceCount()) + if ngpus is None: ngpus = num_cuda_devices @@ -357,7 +373,9 @@ class SHMEMSimulatorGroup(object): for k in range(self.nvars[i]): # XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the south) self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.s[i][k,:,:], extent=self.read_s[i]) - self.sims[i].stream.synchronize() + #self.sims[i].stream.synchronize() + hip_check(hip.hipStreamSynchronize(self.sims[i].stream)) + def ns_upload(self, i): #Upload to the GPU @@ -378,7 +396,8 @@ class SHMEMSimulatorGroup(object): for k in range(self.nvars[i]): # XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the west) self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.w[i][k,:,:], extent=self.read_w[i]) - self.sims[i].stream.synchronize() + #self.sims[i].stream.synchronize() + hip_check(hip.hipStreamSynchronize(self.sims[i].stream)) def ew_upload(self, i): #Upload to the GPU