hip-python implementation

Hicham Agueny 2024-06-09 22:48:06 +02:00
parent d5601ec808
commit 2a7a8c6258
23 changed files with 1769 additions and 1419 deletions

View File

@@ -31,8 +31,7 @@ from hip import hip,hiprtc
 from GPUSimulators import Common, Simulator, CudaContext
-class Autotuner:
-    def hip_check(call_result):
+def hip_check(call_result):
     err = call_result[0]
     result = call_result[1:]
     if len(result) == 1:
@@ -46,6 +45,8 @@ class Autotuner:
         raise RuntimeError(str(err))
     return result

+class Autotuner:
     def __init__(self,
                  nx=2048, ny=2048,
                  block_widths=range(8, 32, 1),
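
The hip_check helper moved to module level above is the error-handling idiom this commit threads through every file: hip-python calls return a tuple whose first element is a hipError_t or hiprtcResult status and whose remaining elements are the actual results. A minimal self-contained sketch of the pattern (assuming only that the hip-python package is installed):

    from hip import hip, hiprtc

    def hip_check(call_result):
        # hip-python returns (status, result...); split status from payload
        err = call_result[0]
        result = call_result[1:]
        if len(result) == 1:
            result = result[0]
        if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
            raise RuntimeError(str(err))
        elif isinstance(err, hiprtc.hiprtcResult) and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS:
            raise RuntimeError(str(err))
        return result

    # e.g. hipMalloc returns (hipError_t, ptr); hip_check raises on error and returns ptr
    ptr = hip_check(hip.hipMalloc(1024))
    hip_check(hip.hipFree(ptr))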

View File

@@ -218,10 +218,7 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
    logger.debug("Simulated to t={:f} in {:d} timesteps (average dt={:f})".format(t_end, sim.simSteps(), sim.simTime() / sim.simSteps()))
    return outdata.filename, profiling_data_sim_runner, sim.profiling_data_mpi
-    #return outdata.filename

class Timer(object):
@@ -247,9 +244,6 @@ class Timer(object):
        return time.time() - self.start

class PopenFileBuffer(object):
    """
    Simple class for holding a set of tempfiles
@@ -366,10 +360,6 @@ class IPEngine(object):
        gc.collect()

class DataDumper(object):
    """
    Simple class for holding a netCDF4 object
@@ -443,8 +433,6 @@ class DataDumper(object):

class ProgressPrinter(object):
    """
    Small helper class for
@@ -499,11 +487,6 @@ class ProgressPrinter(object):
        return progressbar

"""
Class that holds 2D data
"""
@@ -525,17 +508,21 @@ class CudaArray2D:
        #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
        #Initialize an array on GPU with zeros
        #self.data = pycuda.gpuarray.zeros((ny_halo, nx_halo), dtype)
-        self.data_h = np.zeros((ny_halo, nx_halo), dtype="float32")
-        num_bytes = self.data_h.size * self.data_h.itemsize
+        #data.strides[0] == nx_halo*np.float32().itemsize
+        #data.strides[1] == np.float32().itemsize
+        num_bytes = ny_halo*nx_halo * np.float32().itemsize
+        #data_h = np.zeros((ny_halo, nx_halo), dtype)
        # init device array and upload host data
        self.data = hip_check(hip.hipMalloc(num_bytes)).configure(
            typestr="float32",shape=(ny_halo, nx_halo))
+        #num_bytes = ny*nx * np.float32().itemsize
+        #cpu_data = hip_check(hip.hipHostMalloc(num_bytes,hip.hipHostMallocPortable))
        # copy data from host to device
-        hip_check(hip.hipMemcpy(self.data,self.data_h,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice))
+        #hip_check(hip.hipMemcpy(self.data,data_h,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice))
-        #For returning to download (No counterpart in hip-python)
+        #https://rocm.docs.amd.com/projects/hip-python/en/latest/python_api/hip.html#hip.hip.hipMemPoolCreate
        #self.memorypool = PageLockedMemoryPool()

        #If we don't have any data, just allocate and return
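
The allocation idiom above, hip_check(hip.hipMalloc(num_bytes)).configure(typestr=..., shape=...), attaches dtype and shape metadata to the raw device pointer so later copies can use it like an array handle. A short sketch of allocating a 2D float32 device buffer and seeding it from a host array (sizes are placeholders; hip_check as defined earlier):

    import numpy as np
    from hip import hip

    ny_halo, nx_halo = 66, 66  # example dimensions including halo
    num_bytes = ny_halo * nx_halo * np.float32().itemsize

    # allocate device memory and attach shape/dtype metadata
    data = hip_check(hip.hipMalloc(num_bytes)).configure(
        typestr="float32", shape=(ny_halo, nx_halo))

    # seed the device buffer from a zeroed host array
    data_h = np.zeros((ny_halo, nx_halo), dtype=np.float32)
    hip_check(hip.hipMemcpy(data, data_h, num_bytes,
                            hip.hipMemcpyKind.hipMemcpyHostToDevice))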
@@ -547,16 +534,21 @@ class CudaArray2D:
        assert cpu_data.itemsize == 4, "Wrong size of data type"
        assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"

        #Create copy object from host to device
        x = (nx_halo - cpu_data.shape[1]) // 2
        y = (ny_halo - cpu_data.shape[0]) // 2
        self.upload(stream, cpu_data, extent=[x, y, cpu_data.shape[1], cpu_data.shape[0]])
        #self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)

    def __del__(self, *args):
        #self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
-        self.data.gpudata.free()
+        #self.data.gpudata.free()
+        #self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data), self.nx, self.ny)
+        hip_check(hip.hipFree(self.data))
+        #hip_check(hip.hipFreeAsync(self.data, self.stream))
        self.data = None

    """
@@ -575,14 +567,15 @@ class CudaArray2D:
        #self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
        #Allocate host memory
        #The following fails, don't know why (crashes python)
+        #allocate a pinned (page-locked) memory array
        #cpu_data = cuda.pagelocked_empty((int(ny), int(nx)), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE)
        #see here type of memory: https://rocm.docs.amd.com/projects/hip-python/en/latest/python_api/hip.html#hip.hip.hipMemoryType
-        cpu_data = np.empty((ny, nx), dtype=np.float32)
-        num_bytes = cpu_data.size * cpu_data.itemsize
+        cpu_data = np.zeros((ny, nx), dtype=np.float32)
+        #num_bytes = cpu_data.size * cpu_data.itemsize
        #hipHostMalloc allocates pinned host memory which is mapped into the address space of all GPUs in the system, the memory can #be accessed directly by the GPU device
        #hipHostMallocDefault:Memory is mapped and portable (default allocation)
        #hipHostMallocPortable: memory is explicitely portable across different devices
-        cpu_data = hip_check(hip.hipHostMalloc(num_bytes,hip.hipHostMallocPortable))
+        #cpu_data = hip_check(hip.hipHostMalloc(num_bytes,hip.hipHostMallocPortable))
        #Non-pagelocked: cpu_data = np.empty((ny, nx), dtype=np.float32)
        #cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32)
@@ -591,50 +584,62 @@ class CudaArray2D:
        assert x+nx <= self.nx + 2*self.x_halo
        assert y+ny <= self.ny + 2*self.y_halo

+        #Cuda
+        """
        #Create copy object from device to host
-        #copy = cuda.Memcpy2D()
-        #copy.set_src_device(self.data.gpudata)
-        #copy.set_dst_host(cpu_data)
+        copy = cuda.Memcpy2D()
+        copy.set_src_device(self.data.gpudata)
+        copy.set_dst_host(cpu_data)

        #Set offsets and pitch of source
-        #copy.src_x_in_bytes = int(x)*self.data.strides[1]
-        #copy.src_y = int(y)
-        #copy.src_pitch = self.data.strides[0]
+        copy.src_x_in_bytes = int(x)*self.data.strides[1]
+        copy.src_y = int(y)
+        copy.src_pitch = self.data.strides[0]

        #Set width in bytes to copy for each row and
        #number of rows to copy
-        #copy.width_in_bytes = int(nx)*cpu_data.itemsize
-        #copy.height = int(ny)
+        copy.width_in_bytes = int(nx)*cpu_data.itemsize
+        copy.height = int(ny)
+        """

-        #The equivalent of cuda.Memcpy2D in hip-python would be: but it fails with an error pointing to cpu_data
-        #and a message: "RuntimeError: hipError_t.hipErrorInvalidValue"
-        #shape = (nx,ny)
-        #num_bytes = cpu_data.size * cpu_data.itemsize
-        #dst_pitch_bytes = cpu_data.strides[0]
-        #src_pitch_bytes = num_bytes // shape[0]
-        #src_pitch_bytes = data.strides[0]
-        #width_bytes = int(nx)*cpu_data.itemsize
-        #height_Nrows = int(ny)
-        #hipMemcpy2D(dst, unsigned long dpitch, src, unsigned long spitch, unsigned long width, unsigned long height, kind)
-        #copy = hip_check(hip.hipMemcpy2D(cpu_data,        #pointer to destination
-        #                                 dst_pitch_bytes, #pitch of destination array
-        #                                 data,            #pointer to source
-        #                                 src_pitch_bytes, #pitch of source array
-        #                                 width_bytes,     #number of bytes in each row
-        #                                 height_Nrows,    #number of rows to copy
-        #                                 hip.hipMemcpyKind.hipMemcpyDeviceToHost)) #kind
-        #this is an alternative:
        #copy from device to host
-        cpu_data = np.empty((ny, nx), dtype=np.float32)
-        num_bytes = cpu_data.size * cpu_data.itemsize
-        #hip.hipMemcpy(dst, src, unsigned long sizeBytes, kind)
-        copy = hip_check(hip.hipMemcpy(cpu_data,self.data,num_bytes,hip.hipMemcpyKind.hipMemcpyDeviceToHost))
-        copy(stream)
+        #host_array_pinned = hip_check(hip.hipHostMalloc(cpu_data.size * cpu_data.itemsize, hip.hipHostMallocDefault))
+        #device_pointer = hip_check(hip.hipHostGetDevicePointer(host_array_pinned,hip.hipHostMallocDefault))
+        copy_download = {
+            'srcXInBytes': int(x)*np.float32().itemsize,
+            'srcY': int(y),
+            'srcMemoryType': hip.hipMemoryType.hipMemoryTypeDevice, #hipMemoryTypeManaged
+            'srcDevice': self.data,
+            'srcPitch': self.data.shape[0]*np.float32().itemsize,
+            'dstXInBytes': 0,
+            'dstY': 0,
+            'dstMemoryType': hip.hipMemoryType.hipMemoryTypeHost,
+            'dstHost': cpu_data, #device_pointer,
+            'dstPitch': cpu_data.strides[0],
+            'WidthInBytes': int(nx)*cpu_data.itemsize,
+            'Height': int(ny)
+        }
+        # Perform the copy back to host
+        Copy = hip.hip_Memcpy2D(**copy_download)
+        #err = hip.hipMemcpyParam2D(Copy)
+        err = hip.hipMemcpyParam2DAsync(Copy, stream)
+        if err is None:
+            print("--download - DtoH: Failed to copy 2D data to Host")
+            print("--I stop:", err)
+            exit()
+        #copy(stream)

        if asynch==False:
-            stream.synchronize()
+            #stream.synchronize()
+            hip_check(hip.hipStreamSynchronize(stream))

        return cpu_data
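
The new download path replaces the plain hipMemcpy with a hip.hip_Memcpy2D descriptor enqueued via hipMemcpyParam2DAsync, which is what allows copying an offset interior region out of a pitched device array. A condensed sketch of the device-to-host case (data, stream, x, y, nx, ny as in the method above; note that the pitch of a C-contiguous 2D array is its row length in bytes, i.e. shape[1]*itemsize):

    import numpy as np
    from hip import hip

    cpu_data = np.zeros((ny, nx), dtype=np.float32)
    copy_download = {
        'srcXInBytes': int(x) * np.float32().itemsize,   # column offset, in bytes
        'srcY': int(y),                                  # row offset
        'srcMemoryType': hip.hipMemoryType.hipMemoryTypeDevice,
        'srcDevice': data,
        'srcPitch': data.shape[1] * np.float32().itemsize,
        'dstXInBytes': 0,
        'dstY': 0,
        'dstMemoryType': hip.hipMemoryType.hipMemoryTypeHost,
        'dstHost': cpu_data,
        'dstPitch': cpu_data.strides[0],
        'WidthInBytes': int(nx) * cpu_data.itemsize,     # bytes per copied row
        'Height': int(ny),                               # number of rows
    }
    hip_check(hip.hipMemcpyParam2DAsync(hip.hip_Memcpy2D(**copy_download), stream))
    hip_check(hip.hipStreamSynchronize(stream))          # unless downloading asynchronously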
@@ -652,31 +657,61 @@ class CudaArray2D:
        assert(x+nx <= self.nx + 2*self.x_halo)
        assert(y+ny <= self.ny + 2*self.y_halo)

+        #Cuda
+        """
        #Create copy object from device to host
        #Well this copy from src:host to dst:device AND NOT from device to host
-        #copy = cuda.Memcpy2D()
-        #copy.set_dst_device(self.data.gpudata)
-        #copy.set_src_host(cpu_data)
+        copy = cuda.Memcpy2D()
+        copy.set_dst_device(self.data.gpudata)
+        copy.set_src_host(cpu_data)

        #Set offsets and pitch of source
-        #copy.dst_x_in_bytes = int(x)*self.data.strides[1]
-        #copy.dst_y = int(y)
-        #copy.dst_pitch = self.data.strides[0]
+        copy.dst_x_in_bytes = int(x)*self.data.strides[1]
+        copy.dst_y = int(y)
+        copy.dst_pitch = self.data.strides[0]

        #Set width in bytes to copy for each row and
        #number of rows to copy
-        #copy.width_in_bytes = int(nx)*cpu_data.itemsize
-        #copy.height = int(ny)
+        copy.width_in_bytes = int(nx)*cpu_data.itemsize
+        copy.height = int(ny)
+        """

-        #copy from host de device
-        num_bytes = cpu_data.size * cpu_data.itemsize
-        self.data = hip_check(hip.hipMalloc(num_bytes)).configure(
-            typestr="float32",shape=cpu_data.shape)
-        #hip.hipMemcpy(dst, src, unsigned long sizeBytes, kind)
-        copy = hip_check(hip.hipMemcpy(self.data,cpu_data,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice))
-        copy(stream)
+        #Copy from host to device
+        #host_array_pinned = hip_check(hip.hipHostMalloc(cpu_data.size * cpu_data.itemsize, hip.hipHostMallocDefault))
+        #device_pointer = hip_check(hip.hipHostGetDevicePointer(host_array_pinned,hip.hipHostMallocDefault))
+        copy_upload = {
+            'srcXInBytes': 0,
+            'srcY': 0,
+            'srcMemoryType': hip.hipMemoryType.hipMemoryTypeHost,
+            'srcHost': cpu_data, #device_pointer
+            'srcPitch': cpu_data.strides[0], # assuming float32 (4 bytes)
+            'dstXInBytes': int(x)*np.float32().itemsize,
+            'dstY': int(y),
+            'dstMemoryType': hip.hipMemoryType.hipMemoryTypeDevice, #hipMemoryTypeManaged
+            'dstDevice': self.data,
+            'dstPitch': self.data.shape[0]*np.float32().itemsize,
+            'WidthInBytes': int(nx)*cpu_data.itemsize,
+            'Height': int(ny)
+        }
+        # Perform the copy HtoD
+        Copy = hip.hip_Memcpy2D(**copy_upload)
+        #err = hip.hipMemcpyParam2D(Copy)
+        err = hip.hipMemcpyParam2DAsync(Copy, stream)
+        if err is None:
+            print("--Upload - HtoD: Failed to copy 2D data to Device")
+            print("--I stop:", err)
+            exit()
+        #copy(stream)
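
The upload is the mirror image of the download: the host array becomes the source, the device array the destination, and the x/y offsets move to the dst side. A sketch under the same assumptions as the download example:

    copy_upload = {
        'srcXInBytes': 0,
        'srcY': 0,
        'srcMemoryType': hip.hipMemoryType.hipMemoryTypeHost,
        'srcHost': cpu_data,
        'srcPitch': cpu_data.strides[0],
        'dstXInBytes': int(x) * np.float32().itemsize,
        'dstY': int(y),
        'dstMemoryType': hip.hipMemoryType.hipMemoryTypeDevice,
        'dstDevice': data,
        'dstPitch': data.shape[1] * np.float32().itemsize,  # row pitch of the device array
        'WidthInBytes': int(nx) * cpu_data.itemsize,
        'Height': int(ny),
    }
    hip_check(hip.hipMemcpyParam2DAsync(hip.hip_Memcpy2D(**copy_upload), stream))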
@@ -704,15 +739,12 @@ class CudaArray3D:
        #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
        #self.data = pycuda.gpuarray.zeros((nz_halo, ny_halo, nx_halo), dtype)
-        self.data_h = np.zeros((nz_halo, ny_halo, nx_halo), dtype="float32")
-        num_bytes = self.data_h.size * self.data_h.itemsize
+        """
+        num_bytes = nz_halo*ny_halo*nx_halo * np.float32().itemsize
        # init device array and upload host data
        self.data = hip_check(hip.hipMalloc(num_bytes)).configure(
            typestr="float32",shape=(nz_halo, ny_halo, nx_halo))
+        """
-        # copy data from host to device
-        hip_check(hip.hipMemcpy(self.data,self.data_h,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice))

        #For returning to download
        #self.memorypool = PageLockedMemoryPool()
@@ -726,47 +758,84 @@ class CudaArray3D:
        assert cpu_data.itemsize == 4, "Wrong size of data type"
        assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"

+        #Cuda
+        """
        #Create copy object from host to device
-        #copy = cuda.Memcpy3D()
-        #copy.set_src_host(cpu_data)
-        #copy.set_dst_device(self.data.gpudata)
+        copy = cuda.Memcpy3D()
+        copy.set_src_host(cpu_data)
+        copy.set_dst_device(self.data.gpudata)

        #Set offsets of destination
-        #x_offset = (nx_halo - cpu_data.shape[2]) // 2
-        #y_offset = (ny_halo - cpu_data.shape[1]) // 2
-        #z_offset = (nz_halo - cpu_data.shape[0]) // 2
-        #copy.dst_x_in_bytes = x_offset*self.data.strides[1]
-        #copy.dst_y = y_offset
-        #copy.dst_z = z_offset
+        x_offset = (nx_halo - cpu_data.shape[2]) // 2
+        y_offset = (ny_halo - cpu_data.shape[1]) // 2
+        z_offset = (nz_halo - cpu_data.shape[0]) // 2
+        copy.dst_x_in_bytes = x_offset*self.data.strides[1]
+        copy.dst_y = y_offset
+        copy.dst_z = z_offset

        #Set pitch of destination
-        #copy.dst_pitch = self.data.strides[0]
+        copy.dst_pitch = self.data.strides[0]

        #Set width in bytes to copy for each row and
        #number of rows to copy
-        #width = max(self.nx, cpu_data.shape[2])
-        #height = max(self.ny, cpu_data.shape[1])
-        #depth = max(self.nz, cpu-data.shape[0])
-        #copy.width_in_bytes = width*cpu_data.itemsize
-        #copy.height = height
-        #copy.depth = depth
+        width = max(self.nx, cpu_data.shape[2])
+        height = max(self.ny, cpu_data.shape[1])
+        depth = max(self.nz, cpu-data.shape[0])
+        copy.width_in_bytes = width*cpu_data.itemsize
+        copy.height = height
+        copy.depth = depth

-        #copy from host to device
-        num_bytes = cpu_data.size * cpu_data.itemsize
-        self.data = hip_check(hip.hipMalloc(num_bytes)).configure(
-            typestr="float32",shape=cpu_data.shape)
-        #hip.hipMemcpy(dst, src, unsigned long sizeBytes, kind)
-        copy = hip_check(hip.hipMemcpy(self.data,cpu_data,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice))

        #Perform the copy
        copy(stream)

        #self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
+        """

+        #copy from host to device
+        #src
+        host_array_pinned = hip_check(hip.hipHostMalloc(cpu_data.size * cpu_data.itemsize, hip.hipHostMallocDefault))
+        src_ptr = hip_check(hip.hipHostGetDevicePointer(host_array_pinned,hip.hipHostMallocDefault))
+        #src_ptr = hip.hipPitchedPtr()

+        #dst
+        # Allocate 3D pitched memory on the device
+        self.data = hip.hipPitchedPtr()
+        c_extent = hip.hipExtent(nx_halo*np.float32().itemsize, ny_halo, nz_halo)
+        #hip.hipMalloc3D(pitchedDevPtr-OUT, extent-IN)
+        err, = hip.hipMalloc3D(self.data, c_extent)
+        dst_pitch = nx_halo * np.float32().itemsize
+        #include offset: do we need make_hipPitchedPtr
+        x_offset = (nx_halo - cpu_data.shape[2]) // 2
+        y_offset = (ny_halo - cpu_data.shape[1]) // 2
+        z_offset = (nz_halo - cpu_data.shape[0]) // 2

+        if err != hip.hipError_t.hipSuccess:
+            raise RuntimeError(f"Error from hipMalloc3D: {hip.hipGetErrorString(err)}")

+        copy_upload = {
+            'srcPos': hip.hipPos(0, 0, 0),
+            'srcPtr': src_ptr,
+            'dstPos': hip.hipPos(0, 0, 0),
+            'dstPtr': self.data,
+            'extent': c_extent,
+            'kind': hip.hipMemcpyKind.hipMemcpyHostToDevice
+        }

+        # Perform the copy
+        copy = hip.hipMemcpy3DParms(**copy_upload)
+        err = hip.hipMemcpy3DAsync(copy, stream)
+        #copy = hip_check(hip.hipMemcpyAsync(self.data,cpu_data,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice,stream))

+        #self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data), self.nx, self.ny)
    def __del__(self, *args):
        #self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
-        self.data.gpudata.free()
+        #self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data), self.nx, self.ny)
+        #self.data.gpudata.free()
+        hip_check(hip.hipFree(self.data))
+        #hip_check(hip.hipFreeAsync(self.data, self.stream))
        self.data = None
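
For 3D transfers the commit moves to pitched allocations: hipMalloc3D fills a hipPitchedPtr given a hipExtent (whose width component is in bytes), and the transfer is described by hipMemcpy3DParms and enqueued with hipMemcpy3DAsync. A sketch of the sequence, following the commit's approach of feeding a pinned host buffer's mapped pointer in as the source (hip_check and stream as before; whether srcPtr may be a raw mapped pointer rather than a hipPitchedPtr is left as in the original code):

    import numpy as np
    from hip import hip

    nz_halo, ny_halo, nx_halo = 4, 66, 66   # example sizes
    c_extent = hip.hipExtent(nx_halo * np.float32().itemsize, ny_halo, nz_halo)

    dev = hip.hipPitchedPtr()
    err, = hip.hipMalloc3D(dev, c_extent)   # out-parameter style: fills dev
    if err != hip.hipError_t.hipSuccess:
        raise RuntimeError(f"Error from hipMalloc3D: {hip.hipGetErrorString(err)}")

    nbytes = nz_halo * ny_halo * nx_halo * np.float32().itemsize
    host_pinned = hip_check(hip.hipHostMalloc(nbytes, hip.hipHostMallocDefault))
    src_ptr = hip_check(hip.hipHostGetDevicePointer(host_pinned, 0))

    parms = hip.hipMemcpy3DParms(srcPos=hip.hipPos(0, 0, 0), srcPtr=src_ptr,
                                 dstPos=hip.hipPos(0, 0, 0), dstPtr=dev,
                                 extent=c_extent,
                                 kind=hip.hipMemcpyKind.hipMemcpyHostToDevice)
    hip_check(hip.hipMemcpy3DAsync(parms, stream))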
""" """
@@ -779,31 +848,35 @@ class CudaArray3D:
        cpu_data = np.empty((self.nz, self.ny, self.nx), dtype=np.float32)
        #cpu_data = self.memorypool.allocate((self.nz, self.ny, self.nx), dtype=np.float32)

+        #Cuda
+        """
        #Create copy object from device to host
-        #copy = cuda.Memcpy2D()
-        #copy.set_src_device(self.data.gpudata)
-        #copy.set_dst_host(cpu_data)
+        copy = cuda.Memcpy2D()
+        copy.set_src_device(self.data.gpudata)
+        copy.set_dst_host(cpu_data)

        #Set offsets and pitch of source
-        #copy.src_x_in_bytes = self.x_halo*self.data.strides[1]
-        #copy.src_y = self.y_halo
-        #copy.src_z = self.z_halo
-        #copy.src_pitch = self.data.strides[0]
+        copy.src_x_in_bytes = self.x_halo*self.data.strides[1]
+        copy.src_y = self.y_halo
+        copy.src_z = self.z_halo
+        copy.src_pitch = self.data.strides[0]

        #Set width in bytes to copy for each row and
        #number of rows to copy
-        #copy.width_in_bytes = self.nx*cpu_data.itemsize
-        #copy.height = self.ny
-        #copy.depth = self.nz
+        copy.width_in_bytes = self.nx*cpu_data.itemsize
+        copy.height = self.ny
+        copy.depth = self.nz
+        copy(stream)
+        """

        #copy from device to host
        num_bytes = cpu_data.size * cpu_data.itemsize
        #hip.hipMemcpy(dst, src, unsigned long sizeBytes, kind)
-        copy = hip_check(hip.hipMemcpy(cpu_data,self.data,num_bytes,hip.hipMemcpyKind.hipMemcpyDeviceToHost))
-        copy(stream)
+        copy = hip_check(hip.hipMemcpyAsync(cpu_data,self.data,num_bytes,hip.hipMemcpyKind.hipMemcpyDeviceToHost,stream))

        if asynch==False:
-            stream.synchronize()
+            #stream.synchronize()
+            hip_check(hip.hipStreamSynchronize(stream))

        return cpu_data
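
Note the synchronization pattern that replaces pycuda's stream.synchronize() throughout this file: copies are enqueued with hipMemcpyAsync on the stream and, unless the caller requested an asynchronous download, completion is awaited with hipStreamSynchronize. Schematically:

    cpu_data = np.empty((nz, ny, nx), dtype=np.float32)
    num_bytes = cpu_data.size * cpu_data.itemsize

    # enqueue the device-to-host copy on the stream...
    hip_check(hip.hipMemcpyAsync(cpu_data, data, num_bytes,
                                 hip.hipMemcpyKind.hipMemcpyDeviceToHost, stream))
    # ...and block only when a synchronous result is needed
    if not asynch:
        hip_check(hip.hipStreamSynchronize(stream))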
@@ -818,9 +891,11 @@ class ArakawaA2D:
        """
        self.logger = logging.getLogger(__name__)
        self.gpu_variables = []
        for cpu_variable in cpu_variables:
            self.gpu_variables += [CudaArray2D(stream, nx, ny, halo_x, halo_y, cpu_variable)]

    def __getitem__(self, key):
        assert type(key) == int, "Indexing is int based"
        if (key > len(self.gpu_variables) or key < 0):
@@ -839,15 +914,17 @@ class ArakawaA2D:
        assert i < len(self.gpu_variables), "Variable {:d} is out of range".format(i)
        cpu_variables += [self.gpu_variables[i].download(stream, asynch=True)]
+        #print("--FIN: sum:", np.array(cpu_variables).sum())
        #stream.synchronize()
+        hip_check(hip.hipStreamSynchronize(stream))
        return cpu_variables

    #hipblas
    def sum_hipblas(self, num_elements, data):
        num_bytes_r = np.dtype(np.float32).itemsize
        result_d = hip_check(hip.hipMalloc(num_bytes_r))
-        result_h = np.zeros(1, dtype=np.float32)
+        result_h0 = np.zeros(1, dtype=np.float32)
+        print("--bytes:", num_bytes_r)

        # call hipblasSaxpy + initialization
        handle = hip_check(hipblas.hipblasCreate())
@@ -859,10 +936,12 @@ class ArakawaA2D:
        hip_check(hipblas.hipblasDestroy(handle))

        # copy result (stored in result_d) back to host (store in result_h)
-        hip_check(hip.hipMemcpy(result_h,result_d,num_bytes_r,hip.hipMemcpyKind.hipMemcpyDeviceToHost))
+        hip_check(hip.hipMemcpy(result_h0,result_d,num_bytes_r,hip.hipMemcpyKind.hipMemcpyDeviceToHost))
+        result_h = result_h0[0]

        # clean up
-        hip_check(hip.hipFree(data))
+        #hip_check(hip.hipFree(data))
        return result_h
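
sum_hipblas follows the standard hipBLAS shape: create a handle, run a reduction that writes a single float to a device buffer, copy the scalar back, destroy the handle. The reduction call itself falls between the hunks shown here (the comment says "hipblasSaxpy", but summing an array would ordinarily be the absolute-sum routine); a sketch assuming hipblasSasum:

    import numpy as np
    from hip import hip, hipblas

    def sum_hipblas(num_elements, data):
        num_bytes_r = np.dtype(np.float32).itemsize
        result_d = hip_check(hip.hipMalloc(num_bytes_r))  # device scalar
        result_h0 = np.zeros(1, dtype=np.float32)

        handle = hip_check(hipblas.hipblasCreate())
        # assumed reduction: sum of |x_i| over num_elements floats, stride 1
        hip_check(hipblas.hipblasSasum(handle, num_elements, data, 1, result_d))
        hip_check(hipblas.hipblasDestroy(handle))

        hip_check(hip.hipMemcpy(result_h0, result_d, num_bytes_r,
                                hip.hipMemcpyKind.hipMemcpyDeviceToHost))
        hip_check(hip.hipFree(result_d))
        return result_h0[0]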
    def check(self):
@@ -872,8 +951,8 @@ class ArakawaA2D:
        for i, gpu_variable in enumerate(self.gpu_variables):
            #compute sum with hipblas
            #var_sum = pycuda.gpuarray.sum(gpu_variable.data).get()
-            var_sum = self.sum_hipblas(gpu_variable.ny,gpu_variable.data)
+            var_sum = self.sum_hipblas(gpu_variable.data.size,gpu_variable.data)
+            #print(f"GPU: Sum for column {i}: {var_sum}")
            self.logger.debug("Data %d with size [%d x %d] has average %f", i, gpu_variable.nx, gpu_variable.ny, var_sum / (gpu_variable.nx * gpu_variable.ny))
            assert np.isnan(var_sum) == False, "Data contains NaN values!"

View File

@@ -86,7 +86,8 @@ class CudaContext(object):
        if device is None:
            device = 0
-        hip_check(hip.hipSetDevice(device))
+        num_gpus = hip_check(hip.hipGetDeviceCount())
+        hip.hipSetDevice(device)
        props = hip.hipDeviceProp_t()
        hip_check(hip.hipGetDeviceProperties(props,device))
        arch = props.gcnArchName
@@ -97,9 +98,12 @@
        # Allocate memory to store the PCI BusID
        pciBusId = ctypes.create_string_buffer(64)
        # PCI Bus Id
-        hip_check(hip.hipDeviceGetPCIBusId(pciBusId, 64, device))
+        #hip_check(hip.hipDeviceGetPCIBusId(pciBusId, 64, device))
+        pciBusId = hip_check(hip.hipDeviceGetPCIBusId(64, device))

-        self.logger.info("Using device %d/%d with --arch: '%s', --BusID: %s ", device, hip_check(hip.hipGetDeviceCount()),arch,pciBusId.value.decode('utf-8')[5:7])
+        #self.logger.info("Using device %d/%d with --arch: '%s', --BusID: %s ", device, num_gpus,arch,pciBusId.value.decode('utf-8')[5:7])
+        self.logger.info("Using device %d/%d with --arch: '%s', --BusID: %s ", device, num_gpus,arch,pciBusId[5:7])

        #self.logger.debug(" => compute capability: %s", str(self.cuda_device.compute_capability()))
        self.logger.debug(" => compute capability: %s", hip_check(hip.hipDeviceComputeCapability(device)))
@@ -116,6 +120,7 @@
        self.logger.debug(" => Total memory: %d MB available", int(total/(1024*1024)))

        ##self.logger.info("Created context handle <%s>", str(self.cuda_context.handle))
+        self.logger.info("Created context handle <%s>", str(self.cuda_context))

        #Create cache dir for cubin files
        self.cache_path = os.path.join(self.module_path, "cuda_cache")
@@ -125,42 +130,51 @@
            self.logger.info("Using CUDA cache dir %s", self.cache_path)

        self.autotuner = None
+        """
        if (autotuning):
            self.logger.info("Autotuning enabled. It may take several minutes to run the code the first time: have patience")
            self.autotuner = Autotuner.Autotuner()
+        """

    def __del__(self, *args):
-        self.logger.info("Cleaning up CUDA context handle <%s>", str(self.cuda_context.handle))
+        #self.logger.info("Cleaning up CUDA context handle <%s>", str(self.cuda_context.handle))
+        #self.logger.info("Cleaning up CUDA context handle <%s>", str(self.cuda_context))

+        """
        # Loop over all contexts in stack, and remove "this"
        other_contexts = []
        #while (cuda.Context.get_current() != None):
        while (hip.hipCtxGetCurrent() != None):
            #context = cuda.Context.get_current()
            context = hip_check(hip.hipCtxGetCurrent())
-            if (context.handle != self.cuda_context.handle):
-                self.logger.debug("<%s> Popping <%s> (*not* ours)", str(self.cuda_context.handle), str(context.handle))
+            #if (context.handle != self.cuda_context.handle):
+            if (context != self.cuda_context):
+                #self.logger.debug("<%s> Popping <%s> (*not* ours)", str(self.cuda_context.handle), str(context.handle))
+                #self.logger.debug("<%s> Popping <%s> (*not* ours)", str(self.cuda_context), str(context))
                other_contexts = [context] + other_contexts
                #cuda.Context.pop()
                hip.hipCtxPopCurrent()
            else:
-                self.logger.debug("<%s> Popping <%s> (ours)", str(self.cuda_context.handle), str(context.handle))
+                #self.logger.debug("<%s> Popping <%s> (ours)", str(self.cuda_context.handle), str(context.handle))
+                self.logger.debug("<%s> Popping <%s> (ours)", str(self.cuda_context), str(context))
                #cuda.Context.pop()
                hip.hipCtxPopCurrent()

        # Add all the contexts we popped that were not our own
        for context in other_contexts:
-            self.logger.debug("<%s> Pushing <%s>", str(self.cuda_context.handle), str(context.handle))
+            #self.logger.debug("<%s> Pushing <%s>", str(self.cuda_context.handle), str(context.handle))
+            self.logger.debug("<%s> Pushing <%s>", str(self.cuda_context), str(context))
            #cuda.Context.push(context)
            hip_check(hip.hipCtxPushCurrent(context))

-        self.logger.debug("<%s> Detaching", str(self.cuda_context.handle))
-        self.cuda_context.detach()
+        #self.logger.debug("<%s> Detaching", str(self.cuda_context.handle))
+        self.logger.debug("<%s> Detaching", str(self.cuda_context))
+        #self.cuda_context.detach()
+        hip_check(hip.hipCtxDestroy(self.cuda_context))
+        """

    def __str__(self):
-        return "CudaContext id " + str(self.cuda_context.handle)
+        #return "CudaContext id " + str(self.cuda_context.handle)
+        return "CudaContext id " + str(self.cuda_context)
def hash_kernel(kernel_filename, include_dirs):
@@ -283,33 +297,15 @@ class CudaContext(object):
                with io.open(cached_kernel_filename + ".txt", "w") as file:
                    file.write(kernel_string)

+            """cuda
            with Common.Timer("compiler") as timer:
                import warnings
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", message="The CUDA compiler succeeded, but said the following:\nkernel.cu", category=UserWarning)
                    cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, cache_dir=False, **compile_args)
-                #module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler, **jit_compile_args)
+                module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler, **jit_compile_args)

-                #cubin = hip_check(hiprtc.hiprtcCreateProgram(kernel_string.encode(), b"Kernel-Name", 0, [], []))
-                props = hip.hipDeviceProp_t()
-                hip_check(hip.hipGetDeviceProperties(props,0))
-                arch = props.gcnArchName
-                print(f"Compiling kernel for {arch}")
-                cflags = [b"--offload-arch="+arch]
-                err, = hiprtc.hiprtcCompileProgram(cubin, len(cflags), cflags)
-                if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS:
-                    log_size = hip_check(hiprtc.hiprtcGetProgramLogSize(cubin))
-                    log = bytearray(log_size)
-                    hip_check(hiprtc.hiprtcGetProgramLog(cubin, log))
-                    raise RuntimeError(log.decode())
-                code_size = hip_check(hiprtc.hiprtcGetCodeSize(cubin))
-                code = bytearray(code_size)
-                hip_check(hiprtc.hiprtcGetCode(cubin, code))
-                module = hip_check(hip.hipModuleLoadData(code))

                if (self.use_cache):
                    with io.open(cached_kernel_filename, "wb") as file:
@@ -317,6 +313,7 @@ class CudaContext(object):
            self.modules[kernel_hash] = module
            return module
+            """
""" """
Clears the kernel cache (useful for debugging & development) Clears the kernel cache (useful for debugging & development)
@@ -330,4 +327,5 @@
    Synchronizes all streams etc
    """
    def synchronize(self):
-        self.cuda_context.synchronize()
+        #self.cuda_context.synchronize()
+        hip_check(hip.hipCtxSynchronize())
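
The device-setup changes in this file consolidate into a short probe sequence: count devices, select one, read its properties (gcnArchName drives kernel compilation later), and fetch the PCI bus ID, which hip-python returns directly instead of filling a caller-allocated buffer. A sketch, assuming the hip_check helper from earlier:

    from hip import hip

    device = 0
    num_gpus = hip_check(hip.hipGetDeviceCount())
    hip_check(hip.hipSetDevice(device))

    props = hip.hipDeviceProp_t()
    hip_check(hip.hipGetDeviceProperties(props, device))
    arch = props.gcnArchName            # e.g. b'gfx90a:...' on MI200-class hardware

    pciBusId = hip_check(hip.hipDeviceGetPCIBusId(64, device))
    print(f"device {device}/{num_gpus}, arch {arch}, bus id {pciBusId}")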

View File

@@ -1,272 +0,0 @@
# -*- coding: utf-8 -*-
"""
This python module implements Cuda context handling
Copyright (C) 2018 SINTEF ICT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import os
import numpy as np
import time
import re
import io
import hashlib
import logging
import gc
import pycuda.compiler as cuda_compiler
import pycuda.gpuarray
import pycuda.driver as cuda
from GPUSimulators import Autotuner, Common
"""
Class which keeps track of the CUDA context and some helper functions
"""
class CudaContext(object):
def __init__(self, device=None, context_flags=None, use_cache=True, autotuning=True):
"""
Create a new CUDA context
Set device to an id or pci_bus_id to select a specific GPU
Set context_flags to cuda.ctx_flags.SCHED_BLOCKING_SYNC for a blocking context
"""
self.use_cache = use_cache
self.logger = logging.getLogger(__name__)
self.modules = {}
self.module_path = os.path.dirname(os.path.realpath(__file__))
#Initialize cuda (must be first call to PyCUDA)
cuda.init(flags=0)
self.logger.info("PyCUDA version %s", str(pycuda.VERSION_TEXT))
#Print some info about CUDA
self.logger.info("CUDA version %s", str(cuda.get_version()))
self.logger.info("Driver version %s", str(cuda.get_driver_version()))
if device is None:
device = 0
self.cuda_device = cuda.Device(device)
self.logger.info("Using device %d/%d '%s' (%s) GPU", device, cuda.Device.count(), self.cuda_device.name(), self.cuda_device.pci_bus_id())
self.logger.debug(" => compute capability: %s", str(self.cuda_device.compute_capability()))
# Create the CUDA context
if context_flags is None:
context_flags=cuda.ctx_flags.SCHED_AUTO
self.cuda_context = self.cuda_device.make_context(flags=context_flags)
free, total = cuda.mem_get_info()
self.logger.debug(" => memory: %d / %d MB available", int(free/(1024*1024)), int(total/(1024*1024)))
self.logger.info("Created context handle <%s>", str(self.cuda_context.handle))
#Create cache dir for cubin files
self.cache_path = os.path.join(self.module_path, "cuda_cache")
if (self.use_cache):
if not os.path.isdir(self.cache_path):
os.mkdir(self.cache_path)
self.logger.info("Using CUDA cache dir %s", self.cache_path)
self.autotuner = None
if (autotuning):
self.logger.info("Autotuning enabled. It may take several minutes to run the code the first time: have patience")
self.autotuner = Autotuner.Autotuner()
def __del__(self, *args):
self.logger.info("Cleaning up CUDA context handle <%s>", str(self.cuda_context.handle))
# Loop over all contexts in stack, and remove "this"
other_contexts = []
while (cuda.Context.get_current() != None):
context = cuda.Context.get_current()
if (context.handle != self.cuda_context.handle):
self.logger.debug("<%s> Popping <%s> (*not* ours)", str(self.cuda_context.handle), str(context.handle))
other_contexts = [context] + other_contexts
cuda.Context.pop()
else:
self.logger.debug("<%s> Popping <%s> (ours)", str(self.cuda_context.handle), str(context.handle))
cuda.Context.pop()
# Add all the contexts we popped that were not our own
for context in other_contexts:
self.logger.debug("<%s> Pushing <%s>", str(self.cuda_context.handle), str(context.handle))
cuda.Context.push(context)
self.logger.debug("<%s> Detaching", str(self.cuda_context.handle))
self.cuda_context.detach()
def __str__(self):
return "CudaContext id " + str(self.cuda_context.handle)
def hash_kernel(kernel_filename, include_dirs):
# Generate a kernel ID for our caches
num_includes = 0
max_includes = 100
kernel_hasher = hashlib.md5()
logger = logging.getLogger(__name__)
# Loop over file and includes, and check if something has changed
files = [kernel_filename]
while len(files):
if (num_includes > max_includes):
raise("Maximum number of includes reached - circular include in {:}?".format(kernel_filename))
filename = files.pop()
#logger.debug("Hashing %s", filename)
modified = os.path.getmtime(filename)
# Open the file
with io.open(filename, "r") as file:
# Search for #inclue <something> and also hash the file
file_str = file.read()
kernel_hasher.update(file_str.encode('utf-8'))
kernel_hasher.update(str(modified).encode('utf-8'))
#Find all includes
includes = re.findall('^\W*#include\W+(.+?)\W*$', file_str, re.M)
# Loop over everything that looks like an include
for include_file in includes:
#Search through include directories for the file
file_path = os.path.dirname(filename)
for include_path in [file_path] + include_dirs:
# If we find it, add it to list of files to check
temp_path = os.path.join(include_path, include_file)
if (os.path.isfile(temp_path)):
files = files + [temp_path]
num_includes = num_includes + 1 #For circular includes...
break
return kernel_hasher.hexdigest()
"""
Reads a text file and creates an OpenCL kernel from that
"""
def get_module(self, kernel_filename,
include_dirs=[], \
defines={}, \
compile_args={'no_extern_c', True}, jit_compile_args={}):
"""
Helper function to print compilation output
"""
def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
self.logger.debug("Compilation returned %s", str(compile_success_bool))
if info_str:
self.logger.debug("Info: %s", info_str)
if error_str:
self.logger.debug("Error: %s", error_str)
kernel_filename = os.path.normpath(kernel_filename)
kernel_path = os.path.abspath(os.path.join(self.module_path, kernel_filename))
#self.logger.debug("Getting %s", kernel_filename)
# Create a hash of the kernel options
options_hasher = hashlib.md5()
options_hasher.update(str(defines).encode('utf-8') + str(compile_args).encode('utf-8'));
options_hash = options_hasher.hexdigest()
# Create hash of kernel souce
source_hash = CudaContext.hash_kernel( \
kernel_path, \
include_dirs=[self.module_path] + include_dirs)
# Create final hash
root, ext = os.path.splitext(kernel_filename)
kernel_hash = root \
+ "_" + source_hash \
+ "_" + options_hash \
+ ext
cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)
# If we have the kernel in our hashmap, return it
if (kernel_hash in self.modules.keys()):
self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash)
return self.modules[kernel_hash]
# If we have it on disk, return it
elif (self.use_cache and os.path.isfile(cached_kernel_filename)):
self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash)
with io.open(cached_kernel_filename, "rb") as file:
file_str = file.read()
module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler, **jit_compile_args)
self.modules[kernel_hash] = module
return module
# Otherwise, compile it from source
else:
self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash)
#Create kernel string
kernel_string = ""
for key, value in defines.items():
kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename))
if (self.use_cache):
cached_kernel_dir = os.path.dirname(cached_kernel_filename)
if not os.path.isdir(cached_kernel_dir):
os.mkdir(cached_kernel_dir)
with io.open(cached_kernel_filename + ".txt", "w") as file:
file.write(kernel_string)
with Common.Timer("compiler") as timer:
import warnings
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="The CUDA compiler succeeded, but said the following:\nkernel.cu", category=UserWarning)
cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, cache_dir=False, **compile_args)
module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler, **jit_compile_args)
if (self.use_cache):
with io.open(cached_kernel_filename, "wb") as file:
file.write(cubin)
self.modules[kernel_hash] = module
return module
"""
Clears the kernel cache (useful for debugging & development)
"""
def clear_kernel_cache(self):
self.logger.debug("Clearing cache")
self.modules = {}
gc.collect()
"""
Synchronizes all streams etc
"""
def synchronize(self):
self.cuda_context.synchronize()

View File

@@ -19,6 +19,9 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""

+import os
+import sys

#Import packages we need
from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
@@ -27,13 +30,21 @@ import ctypes
#from pycuda import gpuarray
from hip import hip,hiprtc
+from hip import hipblas

+def hip_check(call_result):
+    err = call_result[0]
+    result = call_result[1:]
+    if len(result) == 1:
+        result = result[0]
+    if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
+        raise RuntimeError(str(err))
+    elif (
+        isinstance(err, hiprtc.hiprtcResult)
+        and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
+    ):
+        raise RuntimeError(str(err))
+    return result

"""
Class that solves the SW equations using the Forward-Backward linear scheme
@@ -56,20 +67,6 @@ class EE2D_KP07_dimsplit (BaseSimulator):
    p: pressure
    """

-    def hip_check(call_result):
-        err = call_result[0]
-        result = call_result[1:]
-        if len(result) == 1:
-            result = result[0]
-        if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
-            raise RuntimeError(str(err))
-        elif (
-            isinstance(err, hiprtc.hiprtcResult)
-            and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
-        ):
-            raise RuntimeError(str(err))
-        return result

    def __init__(self,
                 context,
                 rho, rho_u, rho_v, E,
@@ -94,133 +91,210 @@ class EE2D_KP07_dimsplit (BaseSimulator):
        self.gamma = np.float32(gamma)
        self.theta = np.float32(theta)

-        #Get kernels
-        #module = context.get_module("cuda/EE2D_KP07_dimsplit.cu",
-        #                            defines={
-        #                                'BLOCK_WIDTH': self.block_size[0],
-        #                                'BLOCK_HEIGHT': self.block_size[1]
-        #                            },
-        #                            compile_args={
-        #                                'no_extern_c': True,
-        #                                'options': ["--use_fast_math"],
-        #                            },
-        #                            jit_compile_args={})
-        #self.kernel = module.get_function("KP07DimsplitKernel")
-        #self.kernel.prepare("iiffffffiiPiPiPiPiPiPiPiPiPiiii")
-        #
-        kernel_file_path = os.path.abspath(os.path.join('cuda', 'EE2D_KP07_dimsplit.cu.hip'))
+        #Get cuda kernels
+        """ Cuda
+        module = context.get_module("cuda/EE2D_KP07_dimsplit.cu.hip",
+                                    defines={
+                                        'BLOCK_WIDTH': self.block_size[0],
+                                        'BLOCK_HEIGHT': self.block_size[1]
+                                    },
+                                    compile_args={
+                                        'no_extern_c': True,
+                                        'options': ["--use_fast_math"],
+                                    },
+                                    jit_compile_args={})
+        self.kernel = module.get_function("KP07DimsplitKernel")
+        self.kernel.prepare("iiffffffiiPiPiPiPiPiPiPiPiPiiii")
+        """

+        #compile and load to the device
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        # Specify the relative path to the "cuda" directory
+        cuda_dir = os.path.join(current_dir, 'cuda')

+        #source code
+        kernel_file_path = os.path.abspath(os.path.join(cuda_dir, 'EE2D_KP07_dimsplit.cu.hip'))
        with open(kernel_file_path, 'r') as file:
            kernel_source = file.read()

+        #headers
+        #common.h
+        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'common.h'))
+        with open(header_file_path, 'r') as file:
+            header_common = file.read()

+        #EulerCommon.h
+        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'EulerCommon.h'))
+        with open(header_file_path, 'r') as file:
+            header_EulerCommon = file.read()

+        #limiters.h
+        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'limiters.h'))
+        with open(header_file_path, 'r') as file:
+            header_limiters = file.read()

-        prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"KP07DimsplitKernel", 0, [], []))
+        #hip.hiprtc.hiprtcCreateProgram(const char *src, const char *name, int numHeaders, headers, includeNames)
+        prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"KP07DimsplitKernel", 3, [header_common.encode(),header_EulerCommon.encode(),header_limiters.encode()], [b"common.h",b"EulerCommon.h",b"limiters.h"]))

+        # Check if the program is created successfully
+        if prog is not None:
+            print("--This is <EE2D_KP07_dimsplit.cu.hip>")
+            print("--HIPRTC program created successfully")
+            print()
+        else:
+            print("--Failed to create HIPRTC program")
+            print("--I stop:", err)
+            exit()

+        #extract the arch of the device
        props = hip.hipDeviceProp_t()
-        hip_check(hip.hipGetDeviceProperties(props,0))
+        hip_check(hip.hipGetDeviceProperties(props,0)) #only one device 0
        arch = props.gcnArchName
        print(f"Compiling kernel for {arch}")

-        cflags = [b"--offload-arch="+arch]
+        cflags = [b"--offload-arch="+arch, b"-O2", b"-D BLOCK_WIDTH="+ str(self.block_size[0]).encode(), b"-D BLOCK_HEIGHT=" + str(self.block_size[1]).encode()]
        err, = hiprtc.hiprtcCompileProgram(prog, len(cflags), cflags)

+        # Check if the program is compiled successfully
+        if err is not None:
+            print("--Compilation:", err)
+            print("--The program is compiled successfully")
+        else:
+            print("--Compilation:", err)
+            print("--Failed to compile the program")
+            print("--I stop:", err)

        if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS:
            log_size = hip_check(hiprtc.hiprtcGetProgramLogSize(prog))
            log = bytearray(log_size)
            hip_check(hiprtc.hiprtcGetProgramLog(prog, log))
            raise RuntimeError(log.decode())

        code_size = hip_check(hiprtc.hiprtcGetCodeSize(prog))
        code = bytearray(code_size)
        hip_check(hiprtc.hiprtcGetCode(prog, code))

-        module = hip_check(hip.hipModuleLoadData(code))
-        kernel = hip_check(hip.hipModuleGetFunction(module, b"KP07DimsplitKernel"))
+        #Load the code as a module
+        self.module = hip_check(hip.hipModuleLoadData(code))

+        #Get the device kernel named "KP07DimsplitKernel"
+        self.kernel = hip_check(hip.hipModuleGetFunction(self.module, b"KP07DimsplitKernel"))

+        print()
+        print("--Get the device kernel *KP07DimsplitKernel* is created successfully--")
+        print("--kernel", self.kernel)
+        print()
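
Stripped of the progress prints, the constructor's hiprtc pipeline is: create a program from the kernel source plus its three headers, compile for the device's gcnArchName with -D options standing in for the old pycuda defines, then load the code object and resolve the kernel by name. A condensed sketch, using the same names as above:

    prog = hip_check(hiprtc.hiprtcCreateProgram(
        kernel_source.encode(), b"KP07DimsplitKernel", 3,
        [header_common.encode(), header_EulerCommon.encode(), header_limiters.encode()],
        [b"common.h", b"EulerCommon.h", b"limiters.h"]))

    props = hip.hipDeviceProp_t()
    hip_check(hip.hipGetDeviceProperties(props, 0))
    cflags = [b"--offload-arch=" + props.gcnArchName, b"-O2",
              b"-D BLOCK_WIDTH=" + str(block_size[0]).encode(),
              b"-D BLOCK_HEIGHT=" + str(block_size[1]).encode()]

    err, = hiprtc.hiprtcCompileProgram(prog, len(cflags), cflags)
    if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS:
        log = bytearray(hip_check(hiprtc.hiprtcGetProgramLogSize(prog)))
        hip_check(hiprtc.hiprtcGetProgramLog(prog, log))
        raise RuntimeError(log.decode())

    code = bytearray(hip_check(hiprtc.hiprtcGetCodeSize(prog)))
    hip_check(hiprtc.hiprtcGetCode(prog, code))
    module = hip_check(hip.hipModuleLoadData(code))
    kernel = hip_check(hip.hipModuleGetFunction(module, b"KP07DimsplitKernel"))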
        #Create data by uploading to device
        self.u0 = Common.ArakawaA2D(self.stream,
                        nx, ny,
                        2, 2,
                        [rho, rho_u, rho_v, E])
        self.u1 = Common.ArakawaA2D(self.stream,
                        nx, ny,
                        2, 2,
                        [None, None, None, None])
        #self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)

        # init device array cfl_data
-        data_h = np.empty(self.grid_size, dtype=np.float32)
-        num_bytes = data_h.size * data_h.itemsize
-        self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure(
-            typestr="float32",shape=self.grid_size)

        dt_x = np.min(self.dx / (np.abs(rho_u/rho) + np.sqrt(gamma*rho)))
        dt_y = np.min(self.dy / (np.abs(rho_v/rho) + np.sqrt(gamma*rho)))
        self.dt = min(dt_x, dt_y)
-        self.cfl_data.fill(self.dt, stream=self.stream)
+        #in HIP, the "DeviceArray" object doesn't have a 'fill' attribute
+        #self.cfl_data.fill(self.dt, stream=self.stream)
+        grid_dim_x, grid_dim_y, grid_dim_z = self.grid_size
+        data_h = np.zeros((grid_dim_x, grid_dim_y), dtype=np.float32)
+        num_bytes = data_h.size * data_h.itemsize
+        data_h.fill(self.dt)
+        self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure(
+            typestr="float32",shape=(grid_dim_x, grid_dim_y))
+        hip_check(hip.hipMemcpyAsync(self.cfl_data,data_h,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice,self.stream))
+        #sets the memory region pointed to by x_d to zero asynchronously
+        #initiates the memset operation asynchronously
+        #hip_check(hip.hipMemsetAsync(self.cfl_data,0,num_bytes,self.stream))
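
Because the configured DeviceArray has no fill() like pycuda's GPUArray, the commit broadcasts dt into a host array and uploads it; plain zeroing could skip the round-trip via hipMemsetAsync, as the trailing comment notes. The workaround in isolation (names as above):

    # fill a device buffer with a scalar by staging through the host
    data_h = np.zeros((grid_dim_x, grid_dim_y), dtype=np.float32)
    data_h.fill(dt)                                   # scalar to broadcast

    num_bytes = data_h.size * data_h.itemsize
    cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure(
        typestr="float32", shape=(grid_dim_x, grid_dim_y))
    hip_check(hip.hipMemcpyAsync(cfl_data, data_h, num_bytes,
                                 hip.hipMemcpyKind.hipMemcpyHostToDevice, stream))

    # zero-filling alternative, no host round-trip:
    # hip_check(hip.hipMemsetAsync(cfl_data, 0, num_bytes, stream))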
    def substep(self, dt, step_number, external=True, internal=True):
        self.substepDimsplit(0.5*dt, step_number, external, internal)

    def substepDimsplit(self, dt, substep, external, internal):
+        u00_strides0 = self.u0[0].data.shape[0]*np.float32().itemsize
+        u01_strides0 = self.u0[1].data.shape[0]*np.float32().itemsize
+        u02_strides0 = self.u0[2].data.shape[0]*np.float32().itemsize
+        u03_strides0 = self.u0[3].data.shape[0]*np.float32().itemsize
+        u10_strides0 = self.u1[0].data.shape[0]*np.float32().itemsize
+        u11_strides0 = self.u1[1].data.shape[0]*np.float32().itemsize
+        u12_strides0 = self.u1[2].data.shape[0]*np.float32().itemsize
+        u13_strides0 = self.u1[3].data.shape[0]*np.float32().itemsize
        if external and internal:
            #print("COMPLETE DOMAIN (dt=" + str(dt) + ")")

-            # self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
-            #                                 self.nx, self.ny,
-            #                                 self.dx, self.dy, dt,
-            #                                 self.g,
-            #                                 self.gamma,
-            #                                 self.theta,
-            #                                 substep,
-            #                                 self.boundary_conditions,
-            #                                 self.u0[0].data.gpudata, self.u0[0].data.strides[0],
-            #                                 self.u0[1].data.gpudata, self.u0[1].data.strides[0],
-            #                                 self.u0[2].data.gpudata, self.u0[2].data.strides[0],
-            #                                 self.u0[3].data.gpudata, self.u0[3].data.strides[0],
-            #                                 self.u1[0].data.gpudata, self.u1[0].data.strides[0],
-            #                                 self.u1[1].data.gpudata, self.u1[1].data.strides[0],
-            #                                 self.u1[2].data.gpudata, self.u1[2].data.strides[0],
-            #                                 self.u1[3].data.gpudata, self.u1[3].data.strides[0],
-            #                                 self.cfl_data.gpudata,
-            #                                 0, 0,
-            #                                 self.nx, self.ny)
+            """ Cuda
+            self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
+                                            self.nx, self.ny,
+                                            self.dx, self.dy, dt,
+                                            self.g,
+                                            self.gamma,
+                                            self.theta,
+                                            substep,
+                                            self.boundary_conditions,
+                                            self.u0[0].data.gpudata, self.u0[0].data.strides[0],
+                                            self.u0[1].data.gpudata, self.u0[1].data.strides[0],
+                                            self.u0[2].data.gpudata, self.u0[2].data.strides[0],
+                                            self.u0[3].data.gpudata, self.u0[3].data.strides[0],
+                                            self.u1[0].data.gpudata, self.u1[0].data.strides[0],
+                                            self.u1[1].data.gpudata, self.u1[1].data.strides[0],
+                                            self.u1[2].data.gpudata, self.u1[2].data.strides[0],
+                                            self.u1[3].data.gpudata, self.u1[3].data.strides[0],
+                                            self.cfl_data.gpudata,
+                                            0, 0,
+                                            self.nx, self.ny)
+            """
-            #launch kernel
+            #hip.hipModuleLaunchKernel(f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, stream, kernelParams, extra)
+            #The argument grid/block requires 3 components x,y and z. in 2D z=1.
            hip_check(
                hip.hipModuleLaunchKernel(
-                    kernel,
-                    *self.grid_size,
-                    *self.block_size,
-                    sharedMemBytes=0,
+                    self.kernel,
+                    *self.grid_size,  #grid
+                    *self.block_size, #block
+                    sharedMemBytes=0, #65536,
                    stream=self.stream,
                    kernelParams=None,
                    extra=( # pass kernel's arguments
                        ctypes.c_int(self.nx), ctypes.c_int(self.ny),
-                        ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(self.dt),
+                        ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt),
                        ctypes.c_float(self.g),
                        ctypes.c_float(self.gamma),
                        ctypes.c_float(self.theta),
                        ctypes.c_int(substep),
                        ctypes.c_int(self.boundary_conditions),
-                        ctypes.c_float(self.u0[0].data), ctypes.c_float(self.u0[0].data.strides[0]),
-                        ctypes.c_float(self.u0[1].data), ctypes.c_float(self.u0[1].data.strides[0]),
-                        ctypes.c_float(self.u0[2].data), ctypes.c_float(self.u0[2].data.strides[0]),
-                        ctypes.c_float(self.u0[3].data), ctypes.c_float(self.u0[3].data.strides[0]),
-                        ctypes.c_float(self.u1[0].data), ctypes.c_float(self.u1[0].data.strides[0]),
-                        ctypes.c_float(self.u1[1].data), ctypes.c_float(self.u1[1].data.strides[0]),
-                        ctypes.c_float(self.u1[2].data), ctypes.c_float(self.u1[2].data.strides[0]),
-                        ctypes.c_float(self.u1[3].data), ctypes.c_float(self.u1[3].data.strides[0]),
+                        self.u0[0].data, ctypes.c_int(u00_strides0),
+                        self.u0[1].data, ctypes.c_int(u01_strides0),
+                        self.u0[2].data, ctypes.c_int(u02_strides0),
+                        self.u0[3].data, ctypes.c_int(u03_strides0),
+                        self.u1[0].data, ctypes.c_int(u10_strides0),
+                        self.u1[1].data, ctypes.c_int(u11_strides0),
+                        self.u1[2].data, ctypes.c_int(u12_strides0),
+                        self.u1[3].data, ctypes.c_int(u13_strides0),
                        self.cfl_data,
-                        0, 0,
-                        ctypes.c_int(self.nx), ctypes.c_int(self.ny)
+                        ctypes.c_int(0), ctypes.c_int(0),
+                        ctypes.c_int(self.nx), ctypes.c_int(self.ny),
                    )
                )
            )
-            hip_check(hip.hipDeviceSynchronize())
-            hip_check(hip.hipModuleUnload(module))
-            hip_check(hip.hipFree(cfl_data))
-            print("--External & Internal: Launching Kernel is ok")
+            #print("--External & Internal: Launching Kernel is ok")

            return
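
hipModuleLaunchKernel takes the grid and block as three integers each (which is why ns_grid_size grows a third component of 1 below), and the kernel arguments ride in extra as a flat tuple: ctypes scalars for value parameters, and the configured device arrays passed as-is for pointer parameters, each followed by its row pitch as a c_int. A reduced sketch of the call shape (device_array and pitch_bytes stand in for the u0/u1 buffers and their precomputed strides):

    import ctypes

    hip_check(
        hip.hipModuleLaunchKernel(
            kernel,
            grid_x, grid_y, grid_z,         # grid dimensions (z = 1 in 2D)
            block_x, block_y, block_z,      # block dimensions
            sharedMemBytes=0,
            stream=stream,
            kernelParams=None,
            extra=(                         # one entry per kernel parameter, in order
                ctypes.c_int(nx), ctypes.c_int(ny),
                ctypes.c_float(dx), ctypes.c_float(dy), ctypes.c_float(dt),
                device_array,               # device pointer passed directly
                ctypes.c_int(pitch_bytes),  # its row pitch in bytes
            )
        )
    )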
@@ -229,236 +303,242 @@ class EE2D_KP07_dimsplit (BaseSimulator):
            # XXX: Corners are treated twice! #
            ###################################

-            ns_grid_size = (self.grid_size[0], 1)
+            ns_grid_size = (self.grid_size[0], 1, 1)

            # NORTH
            # (x0, y0) x (x1, y1)
            #  (0, ny-y_halo) x (nx, ny)
-            # self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,
-            #                                 self.nx, self.ny,
-            #                                 self.dx, self.dy, dt,
-            #                                 self.g,
-            #                                 self.gamma,
-            #                                 self.theta,
-            #                                 substep,
-            #                                 self.boundary_conditions,
-            #                                 self.u0[0].data.gpudata, self.u0[0].data.strides[0],
-            #                                 self.u0[1].data.gpudata, self.u0[1].data.strides[0],
-            #                                 self.u0[2].data.gpudata, self.u0[2].data.strides[0],
-            #                                 self.u0[3].data.gpudata, self.u0[3].data.strides[0],
-            #                                 self.u1[0].data.gpudata, self.u1[0].data.strides[0],
-            #                                 self.u1[1].data.gpudata, self.u1[1].data.strides[0],
-            #                                 self.u1[2].data.gpudata, self.u1[2].data.strides[0],
-            #                                 self.u1[3].data.gpudata, self.u1[3].data.strides[0],
-            #                                 self.cfl_data.gpudata,
-            #                                 0, self.ny - int(self.u0[0].y_halo),
-            #                                 self.nx, self.ny)
+            """ Cuda
+            self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,
+                                            self.nx, self.ny,
+                                            self.dx, self.dy, dt,
+                                            self.g,
+                                            self.gamma,
+                                            self.theta,
+                                            substep,
+                                            self.boundary_conditions,
+                                            self.u0[0].data.gpudata, self.u0[0].data.strides[0],
+                                            self.u0[1].data.gpudata, self.u0[1].data.strides[0],
+                                            self.u0[2].data.gpudata, self.u0[2].data.strides[0],
+                                            self.u0[3].data.gpudata, self.u0[3].data.strides[0],
+                                            self.u1[0].data.gpudata, self.u1[0].data.strides[0],
+                                            self.u1[1].data.gpudata, self.u1[1].data.strides[0],
+                                            self.u1[2].data.gpudata, self.u1[2].data.strides[0],
+                                            self.u1[3].data.gpudata, self.u1[3].data.strides[0],
+                                            self.cfl_data.gpudata,
+                                            0, self.ny - int(self.u0[0].y_halo),
+                                            self.nx, self.ny)
+            """
hip_check( hip_check(
hip.hipModuleLaunchKernel( hip.hipModuleLaunchKernel(
kernel, self.kernel,
*ns_grid_size, *ns_grid_size, #grid
*self.block_size, *self.block_size, #block
sharedMemBytes=0, sharedMemBytes=0, #65536,
stream=self.stream, stream=self.stream,
kernelParams=None, kernelParams=None,
extra=( # pass kernel's arguments extra=( # pass kernel's arguments
ctypes.c_int(self.nx), ctypes.c_int(self.ny), ctypes.c_int(self.nx), ctypes.c_int(self.ny),
ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(self.dt), ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt),
ctypes.c_float(self.g), ctypes.c_float(self.g),
ctypes.c_float(self.gamma), ctypes.c_float(self.gamma),
ctypes.c_float(self.theta), ctypes.c_float(self.theta),
ctypes.c_int(substep), ctypes.c_int(substep),
ctypes.c_int(self.boundary_conditions), ctypes.c_int(self.boundary_conditions),
ctypes.c_float(self.u0[0].data), ctypes.c_float(self.u0[0].data.strides[0]), self.u0[0].data, ctypes.c_int(u00_strides0),
ctypes.c_float(self.u0[1].data), ctypes.c_float(self.u0[1].data.strides[0]), self.u0[1].data, ctypes.c_int(u01_strides0),
ctypes.c_float(self.u0[2].data), ctypes.c_float(self.u0[2].data.strides[0]), self.u0[2].data, ctypes.c_int(u02_strides0),
ctypes.c_float(self.u0[3].data), ctypes.c_float(self.u0[3].data.strides[0]), self.u0[3].data, ctypes.c_int(u03_strides0),
ctypes.c_float(self.u1[0].data), ctypes.c_float(self.u1[0].data.strides[0]), self.u1[0].data, ctypes.c_int(u10_strides0),
ctypes.c_float(self.u1[1].data), ctypes.c_float(self.u1[1].data.strides[0]), self.u1[1].data, ctypes.c_int(u11_strides0),
ctypes.c_float(self.u1[2].data), ctypes.c_float(self.u1[2].data.strides[0]), self.u1[2].data, ctypes.c_int(u12_strides0),
ctypes.c_float(self.u1[3].data), ctypes.c_float(self.u1[3].data.strides[0]), self.u1[3].data, ctypes.c_int(u13_strides0),
self.cfl_data, self.cfl_data,
0, ctypes.c_int(self.ny) - ctypes.c_int(self.u0[0].y_halo), ctypes.c_int(0), ctypes.c_int(self.ny - self.u0[0].y_halo),
ctypes.c_int(self.nx), ctypes.c_int(self.ny) ctypes.c_int(self.nx), ctypes.c_int(self.ny),
) )
) )
) )
hip_check(hip.hipStreamSynchronize(self.stream))
#print()
#print("--I m at the NORTH:")
#print()
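
# Illustrative sketch, not part of this commit: why the boundary launches use
# strip-shaped grids. The N/S strips cover the full x-range but only one block
# row; the W/E strips (we_grid_size below) are the transpose. For a 16x16
# block over a 512x256 domain:
nx, ny, block = 512, 256, (16, 16, 1)
grid_size    = ((nx + block[0] - 1) // block[0], (ny + block[1] - 1) // block[1], 1)
ns_grid_size = (grid_size[0], 1, 1)    # full width, one block tall  -> (32, 1, 1)
we_grid_size = (1, grid_size[1], 1)    # one block wide, full height -> (1, 16, 1)
print(ns_grid_size, we_grid_size)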
            # SOUTH
            # (x0, y0) x (x1, y1)
            # (0, 0) x (nx, y_halo)
            """ Cuda
            self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,
                    self.nx, self.ny,
                    self.dx, self.dy, dt,
                    self.g,
                    self.gamma,
                    self.theta,
                    substep,
                    self.boundary_conditions,
                    self.u0[0].data.gpudata, self.u0[0].data.strides[0],
                    self.u0[1].data.gpudata, self.u0[1].data.strides[0],
                    self.u0[2].data.gpudata, self.u0[2].data.strides[0],
                    self.u0[3].data.gpudata, self.u0[3].data.strides[0],
                    self.u1[0].data.gpudata, self.u1[0].data.strides[0],
                    self.u1[1].data.gpudata, self.u1[1].data.strides[0],
                    self.u1[2].data.gpudata, self.u1[2].data.strides[0],
                    self.u1[3].data.gpudata, self.u1[3].data.strides[0],
                    self.cfl_data.gpudata,
                    0, 0,
                    self.nx, int(self.u0[0].y_halo))
            """
            hip_check(
                hip.hipModuleLaunchKernel(
                    self.kernel,
                    *ns_grid_size,    #grid
                    *self.block_size, #block
                    sharedMemBytes=0, #65536,
                    stream=self.stream,
                    kernelParams=None,
                    extra=(     # pass kernel's arguments
                        ctypes.c_int(self.nx), ctypes.c_int(self.ny),
                        ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt),
                        ctypes.c_float(self.g),
                        ctypes.c_float(self.gamma),
                        ctypes.c_float(self.theta),
                        ctypes.c_int(substep),
                        ctypes.c_int(self.boundary_conditions),
                        self.u0[0].data, ctypes.c_int(u00_strides0),
                        self.u0[1].data, ctypes.c_int(u01_strides0),
                        self.u0[2].data, ctypes.c_int(u02_strides0),
                        self.u0[3].data, ctypes.c_int(u03_strides0),
                        self.u1[0].data, ctypes.c_int(u10_strides0),
                        self.u1[1].data, ctypes.c_int(u11_strides0),
                        self.u1[2].data, ctypes.c_int(u12_strides0),
                        self.u1[3].data, ctypes.c_int(u13_strides0),
                        self.cfl_data,
                        ctypes.c_int(0), ctypes.c_int(0),
                        ctypes.c_int(self.nx), ctypes.c_int(self.u0[0].y_halo),
                    )
                )
            )
            hip_check(hip.hipStreamSynchronize(self.stream))
            we_grid_size = (1, self.grid_size[1], 1)

            # WEST
            # (x0, y0) x (x1, y1)
            # (0, 0) x (x_halo, ny)
            """ Cuda
            self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
                    self.nx, self.ny,
                    self.dx, self.dy, dt,
                    self.g,
                    self.gamma,
                    self.theta,
                    substep,
                    self.boundary_conditions,
                    self.u0[0].data.gpudata, self.u0[0].data.strides[0],
                    self.u0[1].data.gpudata, self.u0[1].data.strides[0],
                    self.u0[2].data.gpudata, self.u0[2].data.strides[0],
                    self.u0[3].data.gpudata, self.u0[3].data.strides[0],
                    self.u1[0].data.gpudata, self.u1[0].data.strides[0],
                    self.u1[1].data.gpudata, self.u1[1].data.strides[0],
                    self.u1[2].data.gpudata, self.u1[2].data.strides[0],
                    self.u1[3].data.gpudata, self.u1[3].data.strides[0],
                    self.cfl_data.gpudata,
                    0, 0,
                    int(self.u0[0].x_halo), self.ny)
            """
            hip_check(
                hip.hipModuleLaunchKernel(
                    self.kernel,
                    *we_grid_size,    #grid
                    *self.block_size, #block
                    sharedMemBytes=0, #65536,
                    stream=self.stream,
                    kernelParams=None,
                    extra=(     # pass kernel's arguments
                        ctypes.c_int(self.nx), ctypes.c_int(self.ny),
                        ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt),
                        ctypes.c_float(self.g),
                        ctypes.c_float(self.gamma),
                        ctypes.c_float(self.theta),
                        ctypes.c_int(substep),
                        ctypes.c_int(self.boundary_conditions),
                        self.u0[0].data, ctypes.c_int(u00_strides0),
                        self.u0[1].data, ctypes.c_int(u01_strides0),
                        self.u0[2].data, ctypes.c_int(u02_strides0),
                        self.u0[3].data, ctypes.c_int(u03_strides0),
                        self.u1[0].data, ctypes.c_int(u10_strides0),
                        self.u1[1].data, ctypes.c_int(u11_strides0),
                        self.u1[2].data, ctypes.c_int(u12_strides0),
                        self.u1[3].data, ctypes.c_int(u13_strides0),
                        self.cfl_data,
                        ctypes.c_int(0), ctypes.c_int(0),
                        ctypes.c_int(self.u0[0].x_halo), ctypes.c_int(self.ny),
                    )
                )
            )
            hip_check(hip.hipStreamSynchronize(self.stream))
            # EAST
            # (x0, y0) x (x1, y1)
            # (nx-x_halo, 0) x (nx, ny)
            """ Cuda
            self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
                    self.nx, self.ny,
                    self.dx, self.dy, dt,
                    self.g,
                    self.gamma,
                    self.theta,
                    substep,
                    self.boundary_conditions,
                    self.u0[0].data.gpudata, self.u0[0].data.strides[0],
                    self.u0[1].data.gpudata, self.u0[1].data.strides[0],
                    self.u0[2].data.gpudata, self.u0[2].data.strides[0],
                    self.u0[3].data.gpudata, self.u0[3].data.strides[0],
                    self.u1[0].data.gpudata, self.u1[0].data.strides[0],
                    self.u1[1].data.gpudata, self.u1[1].data.strides[0],
                    self.u1[2].data.gpudata, self.u1[2].data.strides[0],
                    self.u1[3].data.gpudata, self.u1[3].data.strides[0],
                    self.cfl_data.gpudata,
                    self.nx - int(self.u0[0].x_halo), 0,
                    self.nx, self.ny)
            """
            hip_check(
                hip.hipModuleLaunchKernel(
                    self.kernel,
                    *we_grid_size,    #grid
                    *self.block_size, #block
                    sharedMemBytes=0, #65536,
                    stream=self.stream,
                    kernelParams=None,
                    extra=(     # pass kernel's arguments
                        ctypes.c_int(self.nx), ctypes.c_int(self.ny),
                        ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt),
                        ctypes.c_float(self.g),
                        ctypes.c_float(self.gamma),
                        ctypes.c_float(self.theta),
                        ctypes.c_int(substep),
                        ctypes.c_int(self.boundary_conditions),
                        self.u0[0].data, ctypes.c_int(u00_strides0),
                        self.u0[1].data, ctypes.c_int(u01_strides0),
                        self.u0[2].data, ctypes.c_int(u02_strides0),
                        self.u0[3].data, ctypes.c_int(u03_strides0),
                        self.u1[0].data, ctypes.c_int(u10_strides0),
                        self.u1[1].data, ctypes.c_int(u11_strides0),
                        self.u1[2].data, ctypes.c_int(u12_strides0),
                        self.u1[3].data, ctypes.c_int(u13_strides0),
                        self.cfl_data,
                        ctypes.c_int(self.nx - self.u0[0].x_halo), ctypes.c_int(0),
                        ctypes.c_int(self.nx), ctypes.c_int(self.ny),
                    )
                )
            )
            # print("--External and not Internal: Launching Kernel is ok")

            return
        if internal and not external:
@@ -466,6 +546,7 @@ class EE2D_KP07_dimsplit (BaseSimulator):
            # INTERNAL DOMAIN
            # (x0, y0) x (x1, y1)
            # (x_halo, y_halo) x (nx - x_halo, ny - y_halo)
            """
            self.kernel.prepared_async_call(self.grid_size, self.block_size, self.internal_stream,
                    self.nx, self.ny,
                    self.dx, self.dy, dt,
@@ -485,45 +566,40 @@ class EE2D_KP07_dimsplit (BaseSimulator):
                    self.cfl_data.gpudata,
                    int(self.u0[0].x_halo), int(self.u0[0].y_halo),
                    self.nx - int(self.u0[0].x_halo), self.ny - int(self.u0[0].y_halo))
            """
            hip_check(
                hip.hipModuleLaunchKernel(
                    self.kernel,
                    *self.grid_size,  #grid
                    *self.block_size, #block
                    sharedMemBytes=0, #65536,
                    stream=self.internal_stream,
                    kernelParams=None,
                    extra=(     # pass kernel's arguments
                        ctypes.c_int(self.nx), ctypes.c_int(self.ny),
                        ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt),
                        ctypes.c_float(self.g),
                        ctypes.c_float(self.gamma),
                        ctypes.c_float(self.theta),
                        ctypes.c_int(substep),
                        ctypes.c_int(self.boundary_conditions),
                        self.u0[0].data, ctypes.c_int(u00_strides0),
                        self.u0[1].data, ctypes.c_int(u01_strides0),
                        self.u0[2].data, ctypes.c_int(u02_strides0),
                        self.u0[3].data, ctypes.c_int(u03_strides0),
                        self.u1[0].data, ctypes.c_int(u10_strides0),
                        self.u1[1].data, ctypes.c_int(u11_strides0),
                        self.u1[2].data, ctypes.c_int(u12_strides0),
                        self.u1[3].data, ctypes.c_int(u13_strides0),
                        self.cfl_data,
                        ctypes.c_int(self.u0[0].x_halo), ctypes.c_int(self.u0[0].y_halo),
                        ctypes.c_int(self.nx - self.u0[0].x_halo), ctypes.c_int(self.ny - self.u0[0].y_halo),
                    )
                )
            )
            # print("--Internal and not External: Launching Kernel is ok")

            return

    def swapBuffers(self):

@@ -25,16 +25,24 @@ from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
import numpy as np
import ctypes

#from pycuda import gpuarray
from hip import hip,hiprtc
from hip import hipblas
def hip_check(call_result):
    err = call_result[0]
    result = call_result[1:]
    if len(result) == 1:
        result = result[0]
    if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
        raise RuntimeError(str(err))
    elif (
        isinstance(err, hiprtc.hiprtcResult)
        and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
    ):
        raise RuntimeError(str(err))
    return result
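
# Illustrative sketch, not part of this commit: every hip-python call returns a
# tuple whose first element is an error code, so hip_check both validates the
# call and unpacks the payload. A minimal host<->device round trip (assumes a
# working HIP device):
import numpy as np

x_h = np.arange(8, dtype=np.float32)
x_d = hip_check(hip.hipMalloc(x_h.nbytes))     # payload is the device pointer
hip_check(hip.hipMemcpy(x_d, x_h, x_h.nbytes, hip.hipMemcpyKind.hipMemcpyHostToDevice))
y_h = np.empty_like(x_h)
hip_check(hip.hipMemcpy(y_h, x_d, x_h.nbytes, hip.hipMemcpyKind.hipMemcpyDeviceToHost))
hip_check(hip.hipFree(x_d))
assert (x_h == y_h).all()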
""" """
Class that solves the SW equations Class that solves the SW equations
@ -53,19 +61,6 @@ class FORCE (Simulator.BaseSimulator):
dt: Size of each timestep (90 s) dt: Size of each timestep (90 s)
g: Gravitational accelleration (9.81 m/s^2) g: Gravitational accelleration (9.81 m/s^2)
""" """
    def __init__(self,
                 context,
@@ -87,25 +82,55 @@ class FORCE (Simulator.BaseSimulator):
                         block_width, block_height)
        self.g = np.float32(g)
        #Get cuda kernels
        """
        module = context.get_module("cuda/SWE2D_FORCE.cu",
                                    defines={
                                        'BLOCK_WIDTH': self.block_size[0],
                                        'BLOCK_HEIGHT': self.block_size[1]
                                    },
                                    compile_args={
                                        'no_extern_c': True,
                                        'options': ["--use_fast_math"],
                                    },
                                    jit_compile_args={})
        self.kernel = module.get_function("FORCEKernel")
        self.kernel.prepare("iiffffiPiPiPiPiPiPiP")
        """
        current_dir = os.path.dirname(os.path.abspath(__file__))
        # Specify the relative path to the "cuda" directory
        cuda_dir = os.path.join(current_dir, 'cuda')

        #kernel source
        kernel_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWE2D_FORCE.cu.hip'))
        with open(kernel_file_path, 'r') as file:
            kernel_source = file.read()

        #headers
        #common.h
        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'common.h'))
        with open(header_file_path, 'r') as file:
            header_common = file.read()

        #SWECommon.h
        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWECommon.h'))
        with open(header_file_path, 'r') as file:
            header_SWECommon = file.read()

        #hip.hiprtc.hiprtcCreateProgram(const char *src, const char *name, int numHeaders, headers, includeNames)
        prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"FORCEKernel", 2, [header_common.encode(), header_SWECommon.encode()], [b"common.h", b"SWECommon.h"]))
        # Check if the program is created successfully
        if prog is not None:
            print("--This is <SWE2D_FORCE.cu.hip>")
            print("--HIPRTC program created successfully")
            print()
        else:
            print("--Failed to create HIPRTC program")
            exit()
        props = hip.hipDeviceProp_t()
        hip_check(hip.hipGetDeviceProperties(props,0))
@@ -113,20 +138,38 @@ class FORCE (Simulator.BaseSimulator):
        print(f"Compiling kernel .FORCEKernel. for {arch}")

        cflags = [b"--offload-arch="+arch, b"-O2", b"-D BLOCK_WIDTH="+ str(self.block_size[0]).encode(), b"-D BLOCK_HEIGHT=" + str(self.block_size[1]).encode()]
        err, = hiprtc.hiprtcCompileProgram(prog, len(cflags), cflags)
        # Check if the program compiled successfully
        if err == hiprtc.hiprtcResult.HIPRTC_SUCCESS:
            print("--Compilation:", err)
            print("--The program is compiled successfully")
        else:
            print("--Compilation:", err)
            print("--Failed to compile the program")
        if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS:
            log_size = hip_check(hiprtc.hiprtcGetProgramLogSize(prog))
            log = bytearray(log_size)
            hip_check(hiprtc.hiprtcGetProgramLog(prog, log))
            raise RuntimeError(log.decode())

        code_size = hip_check(hiprtc.hiprtcGetCodeSize(prog))
        code = bytearray(code_size)
        hip_check(hiprtc.hiprtcGetCode(prog, code))

        #Load the code as a module
        self.module = hip_check(hip.hipModuleLoadData(code))

        #Get the device kernel named "FORCEKernel"
        self.kernel = hip_check(hip.hipModuleGetFunction(self.module, b"FORCEKernel"))

        print()
        print("--Get the device kernel *FORCEKernel* is created successfully--")
        print("--kernel", self.kernel)
        print()
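
# Illustrative sketch, not part of this commit: the hiprtc pipeline used above
# (create -> compile for the device arch -> fetch the code object -> load ->
# get function), condensed around a trivial kernel. Assumes hip_check from
# this module and a working HIP device.
src = b"extern \"C\" __global__ void scale(float a, float* x) { x[threadIdx.x] *= a; }"
prog = hip_check(hiprtc.hiprtcCreateProgram(src, b"scale", 0, [], []))
props = hip.hipDeviceProp_t()
hip_check(hip.hipGetDeviceProperties(props, 0))
err, = hiprtc.hiprtcCompileProgram(prog, 1, [b"--offload-arch=" + props.gcnArchName])
if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS:
    log = bytearray(hip_check(hiprtc.hiprtcGetProgramLogSize(prog)))
    hip_check(hiprtc.hiprtcGetProgramLog(prog, log))
    raise RuntimeError(log.decode())
code = bytearray(hip_check(hiprtc.hiprtcGetCodeSize(prog)))
hip_check(hiprtc.hiprtcGetCode(prog, code))
module = hip_check(hip.hipModuleLoadData(code))
kernel = hip_check(hip.hipModuleGetFunction(module, b"scale"))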
        #Create data by uploading to device
        self.u0 = Common.ArakawaA2D(self.stream,
@@ -138,65 +181,79 @@ class FORCE (Simulator.BaseSimulator):
                        1, 1,
                        [None, None, None])
        #self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
        dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
        dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
        dt = min(dt_x, dt_y)

        #in HIP, the "DeviceArray" object doesn't have a 'fill' attribute
        #self.cfl_data.fill(self.dt, stream=self.stream)
        grid_dim_x, grid_dim_y, grid_dim_z = self.grid_size

        data_h = np.zeros((grid_dim_x, grid_dim_y), dtype=np.float32)
        num_bytes = data_h.size * data_h.itemsize
        data_h.fill(self.dt)

        self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure(
                    typestr="float32",shape=(grid_dim_x, grid_dim_y))

        hip_check(hip.hipMemcpyAsync(self.cfl_data,data_h,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice,self.stream))
        #sets the memory region pointed to by x_d to zero asynchronously
        #initiates the memset operation asynchronously
        #hip_check(hip.hipMemsetAsync(self.cfl_data,0,num_bytes,self.stream))
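
# Illustrative sketch, not part of this commit: since hip-python's DeviceArray
# has no fill(), a constant fill is staged in a host array and copied, exactly
# as done above; a zero fill could use hipMemsetAsync instead. Hypothetical
# helper, assuming hip_check from this module:
def fill_device(dev_buf, shape, value, stream):
    host = np.full(shape, value, dtype=np.float32)
    hip_check(hip.hipMemcpyAsync(dev_buf, host, host.nbytes,
                                 hip.hipMemcpyKind.hipMemcpyHostToDevice, stream))
    return host   # caller must keep this alive until the stream is synchronized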

    def substep(self, dt, step_number):
        #Cuda
        """
        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
                self.nx, self.ny,
                self.dx, self.dy, dt,
                self.g,
                self.boundary_conditions,
                self.u0[0].data.gpudata, self.u0[0].data.strides[0],
                self.u0[1].data.gpudata, self.u0[1].data.strides[0],
                self.u0[2].data.gpudata, self.u0[2].data.strides[0],
                self.u1[0].data.gpudata, self.u1[0].data.strides[0],
                self.u1[1].data.gpudata, self.u1[1].data.strides[0],
                self.u1[2].data.gpudata, self.u1[2].data.strides[0],
                self.cfl_data.gpudata)
        self.u0, self.u1 = self.u1, self.u0
        """
        # row pitch in bytes: number of columns (shape[1]) times itemsize
        u00_strides0 = self.u0[0].data.shape[1]*np.float32().itemsize
        u01_strides0 = self.u0[1].data.shape[1]*np.float32().itemsize
        u02_strides0 = self.u0[2].data.shape[1]*np.float32().itemsize
        u10_strides0 = self.u1[0].data.shape[1]*np.float32().itemsize
        u11_strides0 = self.u1[1].data.shape[1]*np.float32().itemsize
        u12_strides0 = self.u1[2].data.shape[1]*np.float32().itemsize
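
# Illustrative check, not part of this commit: for a C-contiguous float32 array
# of shape (ny, nx), the byte stride of dimension 0 -- the row pitch the kernels
# expect -- is nx * itemsize, i.e. shape[1] (not shape[0]) times itemsize:
a = np.zeros((7, 5), dtype=np.float32)            # (ny, nx) = (7, 5)
assert a.strides[0] == a.shape[1] * np.float32().itemsize == 20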
        #launch kernel
        hip_check(
            hip.hipModuleLaunchKernel(
                self.kernel,
                *self.grid_size,  #grid
                *self.block_size, #block
                sharedMemBytes=0, #65536,
                stream=self.stream,
                kernelParams=None,
                extra=(     # pass kernel's arguments
                    ctypes.c_int(self.nx), ctypes.c_int(self.ny),
                    ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt),
                    ctypes.c_float(self.g),
                    ctypes.c_int(self.boundary_conditions),
                    self.u0[0].data, ctypes.c_int(u00_strides0),
                    self.u0[1].data, ctypes.c_int(u01_strides0),
                    self.u0[2].data, ctypes.c_int(u02_strides0),
                    self.u1[0].data, ctypes.c_int(u10_strides0),
                    self.u1[1].data, ctypes.c_int(u11_strides0),
                    self.u1[2].data, ctypes.c_int(u12_strides0),
                    self.cfl_data,
                )
            )
        )

        self.u0, self.u1 = self.u1, self.u0
        #print("--Launching Kernel .FORCEKernel. is ok")

    def getOutput(self):
        return self.u0
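
# Illustrative sketch, not part of this commit: the per-step
# hipModuleUnload/hipFree calls were removed from substep above, so module and
# buffer teardown belongs at object end-of-life. A hypothetical helper:
def close_simulator(sim):
    hip_check(hip.hipDeviceSynchronize())     # ensure no kernel is still running
    hip_check(hip.hipModuleUnload(sim.module))
    hip_check(hip.hipFree(sim.cfl_data))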

@@ -1,7 +1,8 @@
# -*- coding: utf-8 -*-
"""
This python module implements the HLL flux
for the shallow water equations

Copyright (C) 2016 SINTEF ICT
@@ -27,10 +28,21 @@ import ctypes

#from pycuda import gpuarray
from hip import hip,hiprtc
from hip import hipblas
def hip_check(call_result):
    err = call_result[0]
    result = call_result[1:]
    if len(result) == 1:
        result = result[0]
    if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
        raise RuntimeError(str(err))
    elif (
        isinstance(err, hiprtc.hiprtcResult)
        and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
    ):
        raise RuntimeError(str(err))
    return result
""" """
Class that solves the SW equations using the Harten-Lax -van Leer approximate Riemann solver Class that solves the SW equations using the Harten-Lax -van Leer approximate Riemann solver
@ -49,19 +61,6 @@ class HLL (Simulator.BaseSimulator):
dt: Size of each timestep (90 s) dt: Size of each timestep (90 s)
g: Gravitational accelleration (9.81 m/s^2) g: Gravitational accelleration (9.81 m/s^2)
""" """
    def __init__(self,
                 context,
@@ -80,28 +79,58 @@ class HLL (Simulator.BaseSimulator):
                         boundary_conditions,
                         cfl_scale,
                         1,
                         block_width, block_height)
        self.g = np.float32(g)
        #Get cuda kernels
        """
        module = context.get_module("cuda/SWE2D_HLL.cu",
                                    defines={
                                        'BLOCK_WIDTH': self.block_size[0],
                                        'BLOCK_HEIGHT': self.block_size[1]
                                    },
                                    compile_args={
                                        'no_extern_c': True,
                                        'options': ["--use_fast_math"],
                                    },
                                    jit_compile_args={})
        self.kernel = module.get_function("HLLKernel")
        self.kernel.prepare("iiffffiPiPiPiPiPiPiP")
        """
        current_dir = os.path.dirname(os.path.abspath(__file__))
        # Specify the relative path to the "cuda" directory
        cuda_dir = os.path.join(current_dir, 'cuda')

        #kernel source
        kernel_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWE2D_HLL.cu.hip'))
        with open(kernel_file_path, 'r') as file:
            kernel_source = file.read()

        #headers
        #common.h
        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'common.h'))
        with open(header_file_path, 'r') as file:
            header_common = file.read()

        #SWECommon.h
        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWECommon.h'))
        with open(header_file_path, 'r') as file:
            header_SWECommon = file.read()

        #hip.hiprtc.hiprtcCreateProgram(const char *src, const char *name, int numHeaders, headers, includeNames)
        prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"HLLKernel", 2, [header_common.encode(), header_SWECommon.encode()], [b"common.h", b"SWECommon.h"]))
        # Check if the program is created successfully
        if prog is not None:
            print("--This is <SWE2D_HLL.cu.hip>")
            print("--HIPRTC program created successfully")
            print()
        else:
            print("--Failed to create HIPRTC program")
            exit()
        props = hip.hipDeviceProp_t()
        hip_check(hip.hipGetDeviceProperties(props,0))
@@ -109,19 +138,38 @@ class HLL (Simulator.BaseSimulator):
        print(f"Compiling kernel .HLLKernel. for {arch}")

        cflags = [b"--offload-arch="+arch, b"-O2", b"-D BLOCK_WIDTH="+ str(self.block_size[0]).encode(), b"-D BLOCK_HEIGHT=" + str(self.block_size[1]).encode()]
        err, = hiprtc.hiprtcCompileProgram(prog, len(cflags), cflags)
        # Check if the program compiled successfully
        if err == hiprtc.hiprtcResult.HIPRTC_SUCCESS:
            print("--Compilation:", err)
            print("--The program is compiled successfully")
        else:
            print("--Compilation:", err)
            print("--Failed to compile the program")
        if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS:
            log_size = hip_check(hiprtc.hiprtcGetProgramLogSize(prog))
            log = bytearray(log_size)
            hip_check(hiprtc.hiprtcGetProgramLog(prog, log))
            raise RuntimeError(log.decode())

        code_size = hip_check(hiprtc.hiprtcGetCodeSize(prog))
        code = bytearray(code_size)
        hip_check(hiprtc.hiprtcGetCode(prog, code))

        #Load the code as a module
        self.module = hip_check(hip.hipModuleLoadData(code))

        #Get the device kernel named "HLLKernel"
        self.kernel = hip_check(hip.hipModuleGetFunction(self.module, b"HLLKernel"))

        print()
        print("--Get the device kernel *HLLKernel* is created successfully--")
        print("--kernel", self.kernel)
        print()
        #Create data by uploading to device
        self.u0 = Common.ArakawaA2D(self.stream,
@@ -133,63 +181,79 @@ class HLL (Simulator.BaseSimulator):
                        1, 1,
                        [None, None, None])
        #self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
        dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
        dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
        dt = min(dt_x, dt_y)

        #in HIP, the "DeviceArray" object doesn't have a 'fill' attribute
        #self.cfl_data.fill(self.dt, stream=self.stream)
        grid_dim_x, grid_dim_y, grid_dim_z = self.grid_size

        data_h = np.zeros((grid_dim_x, grid_dim_y), dtype=np.float32)
        num_bytes = data_h.size * data_h.itemsize
        data_h.fill(self.dt)

        self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure(
                    typestr="float32",shape=(grid_dim_x, grid_dim_y))

        hip_check(hip.hipMemcpyAsync(self.cfl_data,data_h,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice,self.stream))
        #sets the memory region pointed to by x_d to zero asynchronously
        #initiates the memset operation asynchronously
        #hip_check(hip.hipMemsetAsync(self.cfl_data,0,num_bytes,self.stream))
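
# Illustrative sketch, not part of this commit: .configure() attaches array
# metadata (typestr, shape) to the raw pointer returned by hipMalloc, and a
# zero fill can skip the host staging entirely via hipMemsetAsync. Assumes
# hip_check from this module:
num_bytes = 16 * 8 * np.float32().itemsize
buf = hip_check(hip.hipMalloc(num_bytes)).configure(typestr="float32", shape=(8, 16))
stream = hip_check(hip.hipStreamCreate())
hip_check(hip.hipMemsetAsync(buf, 0, num_bytes, stream))
hip_check(hip.hipStreamSynchronize(stream))
hip_check(hip.hipStreamDestroy(stream))
hip_check(hip.hipFree(buf))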

    def substep(self, dt, step_number):
        #Cuda
        """
        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
                self.nx, self.ny,
                self.dx, self.dy, dt,
                self.g,
                self.boundary_conditions,
                self.u0[0].data.gpudata, self.u0[0].data.strides[0],
                self.u0[1].data.gpudata, self.u0[1].data.strides[0],
                self.u0[2].data.gpudata, self.u0[2].data.strides[0],
                self.u1[0].data.gpudata, self.u1[0].data.strides[0],
                self.u1[1].data.gpudata, self.u1[1].data.strides[0],
                self.u1[2].data.gpudata, self.u1[2].data.strides[0],
                self.cfl_data.gpudata)
        self.u0, self.u1 = self.u1, self.u0
        """
        # row pitch in bytes: number of columns (shape[1]) times itemsize
        u00_strides0 = self.u0[0].data.shape[1]*np.float32().itemsize
        u01_strides0 = self.u0[1].data.shape[1]*np.float32().itemsize
        u02_strides0 = self.u0[2].data.shape[1]*np.float32().itemsize
        u10_strides0 = self.u1[0].data.shape[1]*np.float32().itemsize
        u11_strides0 = self.u1[1].data.shape[1]*np.float32().itemsize
        u12_strides0 = self.u1[2].data.shape[1]*np.float32().itemsize
        #launch kernel
        hip_check(
            hip.hipModuleLaunchKernel(
                self.kernel,
                *self.grid_size,  #grid
                *self.block_size, #block
                sharedMemBytes=0, #65536,
                stream=self.stream,
                kernelParams=None,
                extra=(     # pass kernel's arguments
                    ctypes.c_int(self.nx), ctypes.c_int(self.ny),
                    ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt),
                    ctypes.c_float(self.g),
                    ctypes.c_int(self.boundary_conditions),
                    self.u0[0].data, ctypes.c_int(u00_strides0),
                    self.u0[1].data, ctypes.c_int(u01_strides0),
                    self.u0[2].data, ctypes.c_int(u02_strides0),
                    self.u1[0].data, ctypes.c_int(u10_strides0),
                    self.u1[1].data, ctypes.c_int(u11_strides0),
                    self.u1[2].data, ctypes.c_int(u12_strides0),
                    self.cfl_data,
                )
            )
        )

        self.u0, self.u1 = self.u1, self.u0
        #print("--Launching Kernel .HLLKernel. is ok")

    def getOutput(self):
        return self.u0

@@ -1,7 +1,8 @@
# -*- coding: utf-8 -*-
"""
This python module implements the 2nd order HLL flux
for the shallow water equations

Copyright (C) 2016 SINTEF ICT
@@ -27,15 +28,24 @@ import ctypes

#from pycuda import gpuarray
from hip import hip,hiprtc
from hip import hipblas
def hip_check(call_result):
    err = call_result[0]
    result = call_result[1:]
    if len(result) == 1:
        result = result[0]
    if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
        raise RuntimeError(str(err))
    elif (
        isinstance(err, hiprtc.hiprtcResult)
        and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
    ):
        raise RuntimeError(str(err))
    return result
""" """
Class that solves the SW equations using the Forward-Backward linear scheme Class that solves the SW equations
""" """
class HLL2 (Simulator.BaseSimulator): class HLL2 (Simulator.BaseSimulator):
@ -51,19 +61,6 @@ class HLL2 (Simulator.BaseSimulator):
dt: Size of each timestep (90 s) dt: Size of each timestep (90 s)
g: Gravitational accelleration (9.81 m/s^2) g: Gravitational accelleration (9.81 m/s^2)
""" """
    def __init__(self,
                 context,
@@ -83,29 +80,63 @@ class HLL2 (Simulator.BaseSimulator):
                         boundary_conditions,
                         cfl_scale,
                         2,
                         block_width, block_height)
        self.g = np.float32(g)
        self.theta = np.float32(theta)
        #Get cuda kernels
        """
        module = context.get_module("cuda/SWE2D_HLL2.cu",
                                    defines={
                                        'BLOCK_WIDTH': self.block_size[0],
                                        'BLOCK_HEIGHT': self.block_size[1]
                                    },
                                    compile_args={
                                        'no_extern_c': True,
                                        'options': ["--use_fast_math"],
                                    },
                                    jit_compile_args={})
        self.kernel = module.get_function("HLL2Kernel")
        self.kernel.prepare("iiffffiPiPiPiPiPiPiP")
        """
        current_dir = os.path.dirname(os.path.abspath(__file__))
        # Specify the relative path to the "cuda" directory
        cuda_dir = os.path.join(current_dir, 'cuda')

        #kernel source
        kernel_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWE2D_HLL2.cu.hip'))
        with open(kernel_file_path, 'r') as file:
            kernel_source = file.read()

        #headers
        #common.h
        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'common.h'))
        with open(header_file_path, 'r') as file:
            header_common = file.read()

        #SWECommon.h
        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWECommon.h'))
        with open(header_file_path, 'r') as file:
            header_SWECommon = file.read()

        #limiters.h
        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'limiters.h'))
        with open(header_file_path, 'r') as file:
            header_limiters = file.read()

        #hip.hiprtc.hiprtcCreateProgram(const char *src, const char *name, int numHeaders, headers, includeNames)
        prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"HLL2Kernel", 3, [header_common.encode(), header_SWECommon.encode(), header_limiters.encode()], [b"common.h", b"SWECommon.h", b"limiters.h"]))
        # Check if the program is created successfully
        if prog is not None:
            print("--This is <SWE2D_HLL2.cu.hip>")
            print("--HIPRTC program created successfully")
            print()
        else:
            print("--Failed to create HIPRTC program")
            exit()
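
# Illustrative sketch, not part of this commit: hiprtcCreateProgram pairs the
# i-th header source with the i-th include name, so keeping both in one dict
# avoids the two lists drifting apart. Hypothetical helper, assuming hip_check:
def create_program(kernel_source, kernel_name, headers):
    # headers: dict mapping include name (bytes) -> header source (str)
    return hip_check(hiprtc.hiprtcCreateProgram(
        kernel_source.encode(), kernel_name, len(headers),
        [src.encode() for src in headers.values()],
        list(headers.keys())))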
        props = hip.hipDeviceProp_t()
        hip_check(hip.hipGetDeviceProperties(props,0))
@@ -113,19 +144,38 @@ class HLL2 (Simulator.BaseSimulator):
        print(f"Compiling kernel .HLL2Kernel. for {arch}")

        cflags = [b"--offload-arch="+arch, b"-O2", b"-D BLOCK_WIDTH="+ str(self.block_size[0]).encode(), b"-D BLOCK_HEIGHT=" + str(self.block_size[1]).encode()]
        err, = hiprtc.hiprtcCompileProgram(prog, len(cflags), cflags)
        # Check if the program compiled successfully
        if err == hiprtc.hiprtcResult.HIPRTC_SUCCESS:
            print("--Compilation:", err)
            print("--The program is compiled successfully")
        else:
            print("--Compilation:", err)
            print("--Failed to compile the program")
        if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS:
            log_size = hip_check(hiprtc.hiprtcGetProgramLogSize(prog))
            log = bytearray(log_size)
            hip_check(hiprtc.hiprtcGetProgramLog(prog, log))
            raise RuntimeError(log.decode())

        code_size = hip_check(hiprtc.hiprtcGetCodeSize(prog))
        code = bytearray(code_size)
        hip_check(hiprtc.hiprtcGetCode(prog, code))

        #Load the code as a module
        self.module = hip_check(hip.hipModuleLoadData(code))

        #Get the device kernel named "HLL2Kernel"
        self.kernel = hip_check(hip.hipModuleGetFunction(self.module, b"HLL2Kernel"))

        print()
        print("--Get the device kernel *HLL2Kernel* is created successfully--")
        print("--kernel", self.kernel)
        print()
        #Create data by uploading to device
        self.u0 = Common.ArakawaA2D(self.stream,
@@ -137,70 +187,87 @@ class HLL2 (Simulator.BaseSimulator):
                        2, 2,
                        [None, None, None])
        #self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
        dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
        dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
        dt = min(dt_x, dt_y)

        #in HIP, the "DeviceArray" object doesn't have a 'fill' attribute
        #self.cfl_data.fill(self.dt, stream=self.stream)
        grid_dim_x, grid_dim_y, grid_dim_z = self.grid_size

        data_h = np.zeros((grid_dim_x, grid_dim_y), dtype=np.float32)
        num_bytes = data_h.size * data_h.itemsize
        data_h.fill(self.dt)

        self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure(
                    typestr="float32",shape=(grid_dim_x, grid_dim_y))

        hip_check(hip.hipMemcpyAsync(self.cfl_data,data_h,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice,self.stream))
        #sets the memory region pointed to by x_d to zero asynchronously
        #initiates the memset operation asynchronously
        #hip_check(hip.hipMemsetAsync(self.cfl_data,0,num_bytes,self.stream))

    def substep(self, dt, step_number):
        self.substepDimsplit(dt*0.5, step_number)

    def substepDimsplit(self, dt, substep):
        #Cuda
        """
        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
                self.nx, self.ny,
                self.dx, self.dy, dt,
                self.g,
                self.theta,
                substep,
                self.boundary_conditions,
                self.u0[0].data.gpudata, self.u0[0].data.strides[0],
                self.u0[1].data.gpudata, self.u0[1].data.strides[0],
                self.u0[2].data.gpudata, self.u0[2].data.strides[0],
                self.u1[0].data.gpudata, self.u1[0].data.strides[0],
                self.u1[1].data.gpudata, self.u1[1].data.strides[0],
                self.u1[2].data.gpudata, self.u1[2].data.strides[0],
                self.cfl_data.gpudata)
        self.u0, self.u1 = self.u1, self.u0
        """
        # row pitch in bytes: number of columns (shape[1]) times itemsize
        u00_strides0 = self.u0[0].data.shape[1]*np.float32().itemsize
        u01_strides0 = self.u0[1].data.shape[1]*np.float32().itemsize
        u02_strides0 = self.u0[2].data.shape[1]*np.float32().itemsize
        u10_strides0 = self.u1[0].data.shape[1]*np.float32().itemsize
        u11_strides0 = self.u1[1].data.shape[1]*np.float32().itemsize
        u12_strides0 = self.u1[2].data.shape[1]*np.float32().itemsize
        #launch kernel
        hip_check(
            hip.hipModuleLaunchKernel(
                self.kernel,
                *self.grid_size,  #grid
                *self.block_size, #block
                sharedMemBytes=0, #65536,
                stream=self.stream,
                kernelParams=None,
                extra=(     # pass kernel's arguments
                    ctypes.c_int(self.nx), ctypes.c_int(self.ny),
                    ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt),
                    ctypes.c_float(self.g),
                    ctypes.c_float(self.theta),
                    ctypes.c_int(substep),
                    ctypes.c_int(self.boundary_conditions),
                    self.u0[0].data, ctypes.c_int(u00_strides0),
                    self.u0[1].data, ctypes.c_int(u01_strides0),
                    self.u0[2].data, ctypes.c_int(u02_strides0),
                    self.u1[0].data, ctypes.c_int(u10_strides0),
                    self.u1[1].data, ctypes.c_int(u11_strides0),
                    self.u1[2].data, ctypes.c_int(u12_strides0),
                    self.cfl_data,
                )
            )
        )

        self.u0, self.u1 = self.u1, self.u0
        #print("--Launching Kernel .HLL2Kernel. is ok")

    def getOutput(self):
        return self.u0
@@ -244,4 +311,3 @@ class HLL2 (Simulator.BaseSimulator):
        #max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
        max_dt = self.min_hipblas(self.cfl_data.size, self.cfl_data, self.stream)
        return max_dt*0.5
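
# Illustrative sketch, not part of this commit: a dependency-free (if slow)
# alternative to min_hipblas for debugging -- copy the CFL buffer back and
# reduce on the host. Assumes hip_check from this module:
def min_on_host(cfl_data, shape, stream):
    host = np.empty(shape, dtype=np.float32)
    hip_check(hip.hipMemcpyAsync(host, cfl_data, host.nbytes,
                                 hip.hipMemcpyKind.hipMemcpyDeviceToHost, stream))
    hip_check(hip.hipStreamSynchronize(stream))
    return float(host.min())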

@@ -29,6 +29,19 @@ from hip import hip, hiprtc
from GPUSimulators import Common, CudaContext
def hip_check(call_result):
    err = call_result[0]
    result = call_result[1:]
    if len(result) == 1:
        result = result[0]
    if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
        raise RuntimeError(str(err))
    elif (
        isinstance(err, hiprtc.hiprtcResult)
        and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
    ):
        raise RuntimeError(str(err))
    return result

@magics_class
class MagicCudaContext(Magics):
@@ -42,19 +55,6 @@ class MagicCudaContext(Magics):
        '--no_cache', '-nc', action="store_true", help='Disable caching of kernels')
    @magic_arguments.argument(
        '--no_autotuning', '-na', action="store_true", help='Disable autotuning of kernels')
    def cuda_context_handler(self, line):
        args = magic_arguments.parse_argstring(self.cuda_context_handler, line)

@@ -1,12 +1,8 @@
# -*- coding: utf-8 -*-
"""
This python module implements the Kurganov-Petrova numerical scheme
for the shallow water equations, described in
A. Kurganov & Guergana Petrova
A Second-Order Well-Balanced Positivity Preserving Central-Upwind
Scheme for the Saint-Venant System Communications in Mathematical
Sciences, 5 (2007), 133-160.

Copyright (C) 2016 SINTEF ICT
@@ -32,8 +28,21 @@ import ctypes

#from pycuda import gpuarray
from hip import hip,hiprtc
from hip import hipblas
def hip_check(call_result):
    err = call_result[0]
    result = call_result[1:]
    if len(result) == 1:
        result = result[0]
    if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
        raise RuntimeError(str(err))
    elif (
        isinstance(err, hiprtc.hiprtcResult)
        and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
    ):
        raise RuntimeError(str(err))
    return result
""" """
Class that solves the SW equations using the Forward-Backward linear scheme Class that solves the SW equations using the Forward-Backward linear scheme
@ -52,19 +61,6 @@ class KP07 (Simulator.BaseSimulator):
dt: Size of each timestep (90 s) dt: Size of each timestep (90 s)
g: Gravitational accelleration (9.81 m/s^2) g: Gravitational accelleration (9.81 m/s^2)
""" """
    def __init__(self,
                 context,
@@ -85,30 +81,64 @@ class KP07 (Simulator.BaseSimulator):
                         boundary_conditions,
                         cfl_scale,
                         order,
                         block_width, block_height)
        self.g = np.float32(g)
        self.theta = np.float32(theta)
        self.order = np.int32(order)
        #Get cuda kernels
        """
        module = context.get_module("cuda/SWE2D_KP07.cu",
                                    defines={
                                        'BLOCK_WIDTH': self.block_size[0],
                                        'BLOCK_HEIGHT': self.block_size[1]
                                    },
                                    compile_args={
                                        'no_extern_c': True,
                                        'options': ["--use_fast_math"],
                                    },
                                    jit_compile_args={})
        self.kernel = module.get_function("KP07Kernel")
        self.kernel.prepare("iiffffiPiPiPiPiPiPiP")
        """
        current_dir = os.path.dirname(os.path.abspath(__file__))
        # Specify the relative path to the "cuda" directory
        cuda_dir = os.path.join(current_dir, 'cuda')

        #kernel source
        kernel_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWE2D_KP07.cu.hip'))
        with open(kernel_file_path, 'r') as file:
            kernel_source = file.read()

        #headers
        #common.h
        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'common.h'))
        with open(header_file_path, 'r') as file:
            header_common = file.read()

        #SWECommon.h
        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWECommon.h'))
        with open(header_file_path, 'r') as file:
            header_SWECommon = file.read()

        #limiters.h
        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'limiters.h'))
        with open(header_file_path, 'r') as file:
            header_limiters = file.read()

        #hip.hiprtc.hiprtcCreateProgram(const char *src, const char *name, int numHeaders, headers, includeNames)
        prog = hip_check(hiprtc.hiprtcCreateProgram(kernel_source.encode(), b"KP07Kernel", 3, [header_common.encode(), header_SWECommon.encode(), header_limiters.encode()], [b"common.h", b"SWECommon.h", b"limiters.h"]))
        # Check if the program is created successfully
        if prog is not None:
            print("--This is <SWE2D_KP07.cu.hip>")
            print("--HIPRTC program created successfully")
            print()
        else:
            print("--Failed to create HIPRTC program")
            exit()
        props = hip.hipDeviceProp_t()
        hip_check(hip.hipGetDeviceProperties(props,0))
@@ -116,19 +146,38 @@ class KP07 (Simulator.BaseSimulator):
        print(f"Compiling kernel .KP07Kernel. for {arch}")

        cflags = [b"--offload-arch="+arch, b"-O2", b"-D BLOCK_WIDTH="+ str(self.block_size[0]).encode(), b"-D BLOCK_HEIGHT=" + str(self.block_size[1]).encode()]
        err, = hiprtc.hiprtcCompileProgram(prog, len(cflags), cflags)
        # Check if the program compiled successfully
        if err == hiprtc.hiprtcResult.HIPRTC_SUCCESS:
            print("--Compilation:", err)
            print("--The program is compiled successfully")
        else:
            print("--Compilation:", err)
            print("--Failed to compile the program")
        if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS:
            log_size = hip_check(hiprtc.hiprtcGetProgramLogSize(prog))
            log = bytearray(log_size)
            hip_check(hiprtc.hiprtcGetProgramLog(prog, log))
            raise RuntimeError(log.decode())

        code_size = hip_check(hiprtc.hiprtcGetCodeSize(prog))
        code = bytearray(code_size)
        hip_check(hiprtc.hiprtcGetCode(prog, code))

        #Load the code as a module
        self.module = hip_check(hip.hipModuleLoadData(code))

        #Get the device kernel named "KP07Kernel"
        self.kernel = hip_check(hip.hipModuleGetFunction(self.module, b"KP07Kernel"))

        print()
        print("--Get the device kernel *KP07Kernel* is created successfully--")
        print("--kernel", self.kernel)
        print()
#Create data by uploading to device #Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream, self.u0 = Common.ArakawaA2D(self.stream,
@ -140,73 +189,87 @@ class KP07 (Simulator.BaseSimulator):
2, 2, 2, 2,
[None, None, None]) [None, None, None])
#self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32) #self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
data_h = np.empty(self.grid_size, dtype=np.float32)
num_bytes = data_h.size * data_h.itemsize
self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure(
typestr="float32",shape=self.grid_size)
dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0))) dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0))) dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
dt = min(dt_x, dt_y) dt = min(dt_x, dt_y)
self.cfl_data.fill(dt, stream=self.stream) #in HIP, the "DeviceArray" object doesn't have a 'fill' attribute
#self.cfl_data.fill(self.dt, stream=self.stream)
grid_dim_x, grid_dim_y, grid_dim_z = self.grid_size
data_h = np.zeros((grid_dim_x, grid_dim_y), dtype=np.float32)
num_bytes = data_h.size * data_h.itemsize
data_h.fill(self.dt)
self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure(
typestr="float32",shape=(grid_dim_x, grid_dim_y))
hip_check(hip.hipMemcpyAsync(self.cfl_data,data_h,num_bytes,hip.hipMemcpyKind.hipMemcpyHostToDevice,self.stream))
#sets the memory region pointed to by x_d to zero asynchronously
#initiates the memset operation asynchronously
#hip_check(hip.hipMemsetAsync(self.cfl_data,0,num_bytes,self.stream))
def substep(self, dt, step_number): def substep(self, dt, step_number):
self.substepRK(dt, step_number) self.substepRK(dt, step_number)
def substepRK(self, dt, substep): def substepRK(self, dt, substep):
# self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, #Cuda
# self.nx, self.ny, """
# self.dx, self.dy, dt, self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
# self.g, self.nx, self.ny,
# self.theta, self.dx, self.dy, dt,
# Simulator.stepOrderToCodedInt(step=substep, order=self.order), self.g,
# self.boundary_conditions, self.theta,
# self.u0[0].data.gpudata, self.u0[0].data.strides[0], Simulator.stepOrderToCodedInt(step=substep, order=self.order),
# self.u0[1].data.gpudata, self.u0[1].data.strides[0], self.boundary_conditions,
# self.u0[2].data.gpudata, self.u0[2].data.strides[0], self.u0[0].data.gpudata, self.u0[0].data.strides[0],
# self.u1[0].data.gpudata, self.u1[0].data.strides[0], self.u0[1].data.gpudata, self.u0[1].data.strides[0],
# self.u1[1].data.gpudata, self.u1[1].data.strides[0], self.u0[2].data.gpudata, self.u0[2].data.strides[0],
# self.u1[2].data.gpudata, self.u1[2].data.strides[0], self.u1[0].data.gpudata, self.u1[0].data.strides[0],
# self.cfl_data.gpudata) self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata)
self.u0, self.u1 = self.u1, self.u0
"""
u00_strides0 = self.u0[0].data.shape[0]*np.float32().itemsize
u01_strides0 = self.u0[1].data.shape[0]*np.float32().itemsize
u02_strides0 = self.u0[2].data.shape[0]*np.float32().itemsize
u10_strides0 = self.u1[0].data.shape[0]*np.float32().itemsize
u11_strides0 = self.u1[1].data.shape[0]*np.float32().itemsize
u12_strides0 = self.u1[2].data.shape[0]*np.float32().itemsize
#launch kernel #launch kernel
hip_check( hip_check(
hip.hipModuleLaunchKernel( hip.hipModuleLaunchKernel(
kernel, self.kernel,
*self.grid_size, *self.grid_size, #grid
*self.block_size, *self.block_size, #block
sharedMemBytes=0, sharedMemBytes=0, #65536,
stream=self.stream, stream=self.stream,
kernelParams=None, kernelParams=None,
extra=( # pass kernel's arguments extra=( # pass kernel's arguments
ctypes.c_int(self.nx), ctypes.c_int(self.ny), ctypes.c_int(self.nx), ctypes.c_int(self.ny),
ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(self.dt), ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt),
ctypes.c_float(self.g), ctypes.c_float(self.g),
ctypes.c_float(self.theta), ctypes.c_float(self.theta),
Simulator.stepOrderToCodedInt(step=substep, order=self.order), Simulator.stepOrderToCodedInt(step=substep, order=self.order),
ctypes.c_int(self.boundary_conditions), ctypes.c_int(self.boundary_conditions),
ctypes.c_float(self.u0[0].data), ctypes.c_float(self.u0[0].data.strides[0]), self.u0[0].data, ctypes.c_int(u00_strides0),
ctypes.c_float(self.u0[1].data), ctypes.c_float(self.u0[1].data.strides[0]), self.u0[1].data, ctypes.c_int(u01_strides0),
ctypes.c_float(self.u0[2].data), ctypes.c_float(self.u0[2].data.strides[0]), self.u0[2].data, ctypes.c_int(u02_strides0),
ctypes.c_float(self.u1[0].data), ctypes.c_float(self.u1[0].data.strides[0]), self.u1[0].data, ctypes.c_int(u10_strides0),
ctypes.c_float(self.u1[1].data), ctypes.c_float(self.u1[1].data.strides[0]), self.u1[1].data, ctypes.c_int(u11_strides0),
ctypes.c_float(self.u1[2].data), ctypes.c_float(self.u1[2].data.strides[0]), self.u1[2].data, ctypes.c_int(u12_strides0),
self.cfl_data self.cfl_data,
) )
) )
) )
hip_check(hip.hipDeviceSynchronize())
self.u0, self.u1 = self.u1, self.u0 self.u0, self.u1 = self.u1, self.u0
hip_check(hip.hipModuleUnload(module)) #print("--Launching Kernel .KP07Kernel. is ok")
hip_check(hip.hipFree(cfl_data))
print("--Launching Kernel .KP07Kernel. is ok")
def getOutput(self): def getOutput(self):
return self.u0 return self.u0
@ -247,6 +310,6 @@ class KP07 (Simulator.BaseSimulator):
return min_value return min_value
def computeDt(self): def computeDt(self):
max_dt = self.min_hipblas(self.cfl_data.size, self.cfl_data, self.stream)
#max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get(); #max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
max_dt = self.min_hipblas(self.cfl_data.size, self.cfl_data, self.stream)
return max_dt*0.5**(self.order-1) return max_dt*0.5**(self.order-1)
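computeDt() above delegates the CFL reduction to min_hipblas(), whose body this hunk elides apart from its final `return min_value`. For reference, a plausible sketch of such a helper built on hipBLAS's 1-based Isamin reduction is given below; it assumes the module-level hip_check(), and the temporary buffer names are illustrative rather than the author's exact code.

import numpy as np
from hip import hip, hipblas

def min_hipblas(num_elements, cfl_data, stream):
    #device scratch for the 1-based index that hipblasIsamin returns
    indx_d = hip_check(hip.hipMalloc(np.dtype(np.int32).itemsize))
    indx_h = np.zeros(1, dtype=np.int32)
    x_temp = np.zeros(num_elements, dtype=np.float32)

    handle = hip_check(hipblas.hipblasCreate())
    #index of the element with the smallest magnitude (1-based, BLAS convention)
    hip_check(hipblas.hipblasIsamin(handle, num_elements, cfl_data, 1, indx_d))
    hip_check(hipblas.hipblasDestroy(handle))

    #bring the index home, then fetch the winning element itself
    hip_check(hip.hipMemcpyAsync(indx_h, indx_d, indx_h.itemsize,
              hip.hipMemcpyKind.hipMemcpyDeviceToHost, stream))
    hip_check(hip.hipStreamSynchronize(stream))
    num_bytes = num_elements * np.dtype(np.float32).itemsize
    hip_check(hip.hipMemcpy(x_temp, cfl_data, num_bytes,
              hip.hipMemcpyKind.hipMemcpyDeviceToHost))
    hip_check(hip.hipFree(indx_d))

    return x_temp.flatten()[indx_h[0] - 1]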
View File
@@ -1,12 +1,8 @@
# -*- coding: utf-8 -*-
"""
This python module implements the dimensionally split
Kurganov-Petrova (KP07) scheme for the shallow water equations

Copyright (C) 2016 SINTEF ICT
@@ -32,14 +28,26 @@ import ctypes
#from pycuda import gpuarray
from hip import hip,hiprtc
from hip import hipblas

def hip_check(call_result):
    err = call_result[0]
    result = call_result[1:]
    if len(result) == 1:
        result = result[0]
    if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
        raise RuntimeError(str(err))
    elif (
        isinstance(err, hiprtc.hiprtcResult)
        and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
    ):
        raise RuntimeError(str(err))
    return result

"""
Class that solves the SW equations using the dimensionally split KP07 scheme
"""
class KP07_dimsplit(Simulator.BaseSimulator):
    """
    Initialization routine
@@ -54,20 +62,6 @@ class KP07_dimsplit(Simulator.BaseSimulator):
    g: Gravitational acceleration (9.81 m/s^2)
    """

    def __init__(self,
                 context,
                 h0, hu0, hv0,
@@ -92,25 +86,59 @@ class KP07_dimsplit(Simulator.BaseSimulator):
        self.g = np.float32(g)
        self.theta = np.float32(theta)

        #Get cuda kernels
        """
        module = context.get_module("cuda/SWE2D_KP07_dimsplit.cu",
                        defines={
                            'BLOCK_WIDTH': self.block_size[0],
                            'BLOCK_HEIGHT': self.block_size[1]
                        },
                        compile_args={
                            'no_extern_c': True,
                            'options': ["--use_fast_math"],
                        },
                        jit_compile_args={})
        self.kernel = module.get_function("KP07DimsplitKernel")
        self.kernel.prepare("iiffffiPiPiPiPiPiPiP")
        """

        current_dir = os.path.dirname(os.path.abspath(__file__))
        # Specify the relative path to the "cuda" directory
        cuda_dir = os.path.join(current_dir, 'cuda')

        #kernel source
        kernel_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWE2D_KP07_dimsplit.cu.hip'))
        with open(kernel_file_path, 'r') as file:
            kernel_source = file.read()

        #headers
        #common.h
        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'common.h'))
        with open(header_file_path, 'r') as file:
            header_common = file.read()

        #SWECommon.h
        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWECommon.h'))
        with open(header_file_path, 'r') as file:
            header_SWECommon = file.read()

        #limiters.h
        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'limiters.h'))
        with open(header_file_path, 'r') as file:
            header_limiters = file.read()

        #hiprtc.hiprtcCreateProgram(const char *src, const char *name, int numHeaders, headers, includeNames)
        prog = hip_check(hiprtc.hiprtcCreateProgram(
            kernel_source.encode(), b"KP07DimsplitKernel", 3,
            [header_common.encode(), header_SWECommon.encode(), header_limiters.encode()],
            [b"common.h", b"SWECommon.h", b"limiters.h"]))
        #hip_check() raises on failure, so reaching this point means the program exists
        print("--This is <SWE2D_KP07_dimsplit.cu.hip>")
        print("--HIPRTC program created successfully")
        print()

        props = hip.hipDeviceProp_t()
        hip_check(hip.hipGetDeviceProperties(props, 0))
@@ -118,19 +146,38 @@ class KP07_dimsplit(Simulator.BaseSimulator):
        print(f"Compiling kernel .KP07DimsplitKernel. for {arch}")

        cflags = [b"--offload-arch="+arch, b"-O2",
                  b"-D BLOCK_WIDTH="+str(self.block_size[0]).encode(),
                  b"-D BLOCK_HEIGHT="+str(self.block_size[1]).encode()]
        err, = hiprtc.hiprtcCompileProgram(prog, len(cflags), cflags)
        # Check compilation status; on failure, dump the HIPRTC log
        if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS:
            log_size = hip_check(hiprtc.hiprtcGetProgramLogSize(prog))
            log = bytearray(log_size)
            hip_check(hiprtc.hiprtcGetProgramLog(prog, log))
            raise RuntimeError(log.decode())

        code_size = hip_check(hiprtc.hiprtcGetCodeSize(prog))
        code = bytearray(code_size)
        hip_check(hiprtc.hiprtcGetCode(prog, code))

        #Load the code as a module
        self.module = hip_check(hip.hipModuleLoadData(code))

        #Get the device kernel named "KP07DimsplitKernel"
        self.kernel = hip_check(hip.hipModuleGetFunction(self.module, b"KP07DimsplitKernel"))

        print()
        print("--Device kernel *KP07DimsplitKernel* retrieved successfully--")
        print("--kernel", self.kernel)
        print()

        #Create data by uploading to device
        self.u0 = Common.ArakawaA2D(self.stream,
@@ -142,70 +189,87 @@ class KP07_dimsplit(Simulator.BaseSimulator):
                        self.gc_x, self.gc_y,
                        [None, None, None])
        #self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
        dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
        dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
        dt = min(dt_x, dt_y)

        #in HIP, the "DeviceArray" object doesn't have a 'fill' attribute,
        #so fill a host array with the initial dt and upload it instead
        #self.cfl_data.fill(dt, stream=self.stream)
        grid_dim_x, grid_dim_y, grid_dim_z = self.grid_size

        data_h = np.zeros((grid_dim_x, grid_dim_y), dtype=np.float32)
        num_bytes = data_h.size * data_h.itemsize
        data_h.fill(dt)   #fill with the locally computed dt, as the CUDA version did

        self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure(
            typestr="float32", shape=(grid_dim_x, grid_dim_y))
        hip_check(hip.hipMemcpyAsync(self.cfl_data, data_h, num_bytes,
                  hip.hipMemcpyKind.hipMemcpyHostToDevice, self.stream))
        #alternatively, zero the device buffer asynchronously:
        #hip_check(hip.hipMemsetAsync(self.cfl_data, 0, num_bytes, self.stream))

    def substep(self, dt, step_number):
        self.substepDimsplit(dt*0.5, step_number)

    def substepDimsplit(self, dt, substep):
        #Cuda
        """
        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
                self.nx, self.ny,
                self.dx, self.dy, dt,
                self.g,
                self.theta,
                substep,
                self.boundary_conditions,
                self.u0[0].data.gpudata, self.u0[0].data.strides[0],
                self.u0[1].data.gpudata, self.u0[1].data.strides[0],
                self.u0[2].data.gpudata, self.u0[2].data.strides[0],
                self.u1[0].data.gpudata, self.u1[0].data.strides[0],
                self.u1[1].data.gpudata, self.u1[1].data.strides[0],
                self.u1[2].data.gpudata, self.u1[2].data.strides[0],
                self.cfl_data.gpudata)
        self.u0, self.u1 = self.u1, self.u0
        """
        #row pitch in bytes (shape=(ny, nx) -> shape[1] elements per row)
        u00_strides0 = self.u0[0].data.shape[1]*np.float32().itemsize
        u01_strides0 = self.u0[1].data.shape[1]*np.float32().itemsize
        u02_strides0 = self.u0[2].data.shape[1]*np.float32().itemsize
        u10_strides0 = self.u1[0].data.shape[1]*np.float32().itemsize
        u11_strides0 = self.u1[1].data.shape[1]*np.float32().itemsize
        u12_strides0 = self.u1[2].data.shape[1]*np.float32().itemsize

        #launch kernel
        hip_check(
            hip.hipModuleLaunchKernel(
                self.kernel,
                *self.grid_size,   #grid
                *self.block_size,  #block
                sharedMemBytes=0,
                stream=self.stream,
                kernelParams=None,
                extra=(     # pass kernel's arguments
                    ctypes.c_int(self.nx), ctypes.c_int(self.ny),
                    ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt),
                    ctypes.c_float(self.g),
                    ctypes.c_float(self.theta),
                    ctypes.c_int(substep),
                    ctypes.c_int(self.boundary_conditions),
                    self.u0[0].data, ctypes.c_int(u00_strides0),
                    self.u0[1].data, ctypes.c_int(u01_strides0),
                    self.u0[2].data, ctypes.c_int(u02_strides0),
                    self.u1[0].data, ctypes.c_int(u10_strides0),
                    self.u1[1].data, ctypes.c_int(u11_strides0),
                    self.u1[2].data, ctypes.c_int(u12_strides0),
                    self.cfl_data,
                )
            )
        )

        hip_check(hip.hipDeviceSynchronize())
        self.u0, self.u1 = self.u1, self.u0
        #print("--Launching Kernel .KP07DimsplitKernel. is ok")

    def getOutput(self):
        return self.u0
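The read-source/read-headers/create/compile/load sequence above is now duplicated verbatim in every scheme (and twice more in LxF and WAF below). A consolidated helper could look like the sketch below; it assumes the module-level hip_check(), and deriving the target from props.gcnArchName is an assumption, since these hunks elide how `arch` is actually obtained.

import os
from hip import hip, hiprtc

def compile_hip_kernel(kernel_file, kernel_name, header_names, block_size):
    """Compile kernel_file with HIPRTC and return (module, kernel).
    A sketch of the pattern repeated in each scheme; assumes hip_check()."""
    cuda_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cuda')

    def read(name):
        with open(os.path.join(cuda_dir, name), 'r') as f:
            return f.read().encode()

    headers = [read(h) for h in header_names]
    prog = hip_check(hiprtc.hiprtcCreateProgram(
        read(kernel_file), kernel_name.encode(), len(headers),
        headers, [h.encode() for h in header_names]))

    props = hip.hipDeviceProp_t()
    hip_check(hip.hipGetDeviceProperties(props, 0))
    arch = props.gcnArchName                 #assumed source of the arch string

    cflags = [b"--offload-arch=" + arch, b"-O2",
              b"-D BLOCK_WIDTH=" + str(block_size[0]).encode(),
              b"-D BLOCK_HEIGHT=" + str(block_size[1]).encode()]
    err, = hiprtc.hiprtcCompileProgram(prog, len(cflags), cflags)
    if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS:
        log = bytearray(hip_check(hiprtc.hiprtcGetProgramLogSize(prog)))
        hip_check(hiprtc.hiprtcGetProgramLog(prog, log))
        raise RuntimeError(log.decode())

    code = bytearray(hip_check(hiprtc.hiprtcGetCodeSize(prog)))
    hip_check(hiprtc.hiprtcGetCode(prog, code))
    module = hip_check(hip.hipModuleLoadData(code))
    kernel = hip_check(hip.hipModuleGetFunction(module, kernel_name.encode()))
    return module, kernel

The constructor above would then reduce to a single call such as self.module, self.kernel = compile_hip_kernel('SWE2D_KP07_dimsplit.cu.hip', 'KP07DimsplitKernel', ['common.h', 'SWECommon.h', 'limiters.h'], self.block_size).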
View File
@@ -1,8 +1,8 @@
# -*- coding: utf-8 -*-
"""
This python module implements the classical Lax-Friedrichs numerical
scheme for the shallow water equations

Copyright (C) 2016 SINTEF ICT
@@ -28,10 +28,21 @@ import ctypes
#from pycuda import gpuarray
from hip import hip,hiprtc
from hip import hipblas

def hip_check(call_result):
    err = call_result[0]
    result = call_result[1:]
    if len(result) == 1:
        result = result[0]
    if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
        raise RuntimeError(str(err))
    elif (
        isinstance(err, hiprtc.hiprtcResult)
        and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
    ):
        raise RuntimeError(str(err))
    return result

"""
Class that solves the SW equations using the Lax Friedrichs scheme
@@ -51,20 +62,6 @@ class LxF (Simulator.BaseSimulator):
    g: Gravitational acceleration (9.81 m/s^2)
    """

    def __init__(self,
                 context,
                 h0, hu0, hv0,
@@ -82,28 +79,58 @@ class LxF (Simulator.BaseSimulator):
                         boundary_conditions,
                         cfl_scale,
                         1,
                         block_width, block_height)
        self.g = np.float32(g)

        #Get cuda kernels
        """
        module = context.get_module("cuda/SWE2D_LxF.cu",
                        defines={
                            'BLOCK_WIDTH': self.block_size[0],
                            'BLOCK_HEIGHT': self.block_size[1]
                        },
                        compile_args={
                            'no_extern_c': True,
                            'options': ["--use_fast_math"],
                        },
                        jit_compile_args={})
        self.kernel = module.get_function("LxFKernel")
        self.kernel.prepare("iiffffiPiPiPiPiPiPiP")
        """

        current_dir = os.path.dirname(os.path.abspath(__file__))
        # Specify the relative path to the "cuda" directory
        cuda_dir = os.path.join(current_dir, 'cuda')

        #kernel source
        kernel_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWE2D_LxF.cu.hip'))
        with open(kernel_file_path, 'r') as file:
            kernel_source = file.read()

        #headers
        #common.h
        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'common.h'))
        with open(header_file_path, 'r') as file:
            header_common = file.read()

        #SWECommon.h
        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWECommon.h'))
        with open(header_file_path, 'r') as file:
            header_SWECommon = file.read()

        #hiprtc.hiprtcCreateProgram(const char *src, const char *name, int numHeaders, headers, includeNames)
        prog = hip_check(hiprtc.hiprtcCreateProgram(
            kernel_source.encode(), b"LxFKernel", 2,
            [header_common.encode(), header_SWECommon.encode()],
            [b"common.h", b"SWECommon.h"]))
        #hip_check() raises on failure, so reaching this point means the program exists
        print("--This is <SWE2D_LxF.cu.hip>")
        print("--HIPRTC program created successfully")
        print()

        props = hip.hipDeviceProp_t()
        hip_check(hip.hipGetDeviceProperties(props, 0))
@@ -111,19 +138,38 @@ class LxF (Simulator.BaseSimulator):
        print(f"Compiling kernel .LxFKernel. for {arch}")

        cflags = [b"--offload-arch="+arch, b"-O2",
                  b"-D BLOCK_WIDTH="+str(self.block_size[0]).encode(),
                  b"-D BLOCK_HEIGHT="+str(self.block_size[1]).encode()]
        err, = hiprtc.hiprtcCompileProgram(prog, len(cflags), cflags)
        # Check compilation status; on failure, dump the HIPRTC log
        if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS:
            log_size = hip_check(hiprtc.hiprtcGetProgramLogSize(prog))
            log = bytearray(log_size)
            hip_check(hiprtc.hiprtcGetProgramLog(prog, log))
            raise RuntimeError(log.decode())

        code_size = hip_check(hiprtc.hiprtcGetCodeSize(prog))
        code = bytearray(code_size)
        hip_check(hiprtc.hiprtcGetCode(prog, code))

        #Load the code as a module
        self.module = hip_check(hip.hipModuleLoadData(code))

        #Get the device kernel named "LxFKernel"
        self.kernel = hip_check(hip.hipModuleGetFunction(self.module, b"LxFKernel"))

        print()
        print("--Device kernel *LxFKernel* retrieved successfully--")
        print("--kernel", self.kernel)
        print()

        #Create data by uploading to device
        self.u0 = Common.ArakawaA2D(self.stream,
@@ -135,64 +181,79 @@ class LxF (Simulator.BaseSimulator):
                        1, 1,
                        [None, None, None])
        #self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
        dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
        dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
        dt = min(dt_x, dt_y)

        #in HIP, the "DeviceArray" object doesn't have a 'fill' attribute,
        #so fill a host array with the initial dt and upload it instead
        #self.cfl_data.fill(dt, stream=self.stream)
        grid_dim_x, grid_dim_y, grid_dim_z = self.grid_size

        data_h = np.zeros((grid_dim_x, grid_dim_y), dtype=np.float32)
        num_bytes = data_h.size * data_h.itemsize
        data_h.fill(dt)   #fill with the locally computed dt, as the CUDA version did

        self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure(
            typestr="float32", shape=(grid_dim_x, grid_dim_y))
        hip_check(hip.hipMemcpyAsync(self.cfl_data, data_h, num_bytes,
                  hip.hipMemcpyKind.hipMemcpyHostToDevice, self.stream))
        #alternatively, zero the device buffer asynchronously:
        #hip_check(hip.hipMemsetAsync(self.cfl_data, 0, num_bytes, self.stream))

    def substep(self, dt, step_number):
        #Cuda
        """
        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
                self.nx, self.ny,
                self.dx, self.dy, dt,
                self.g,
                self.boundary_conditions,
                self.u0[0].data.gpudata, self.u0[0].data.strides[0],
                self.u0[1].data.gpudata, self.u0[1].data.strides[0],
                self.u0[2].data.gpudata, self.u0[2].data.strides[0],
                self.u1[0].data.gpudata, self.u1[0].data.strides[0],
                self.u1[1].data.gpudata, self.u1[1].data.strides[0],
                self.u1[2].data.gpudata, self.u1[2].data.strides[0],
                self.cfl_data.gpudata)
        self.u0, self.u1 = self.u1, self.u0
        """
        #row pitch in bytes (shape=(ny, nx) -> shape[1] elements per row)
        u00_strides0 = self.u0[0].data.shape[1]*np.float32().itemsize
        u01_strides0 = self.u0[1].data.shape[1]*np.float32().itemsize
        u02_strides0 = self.u0[2].data.shape[1]*np.float32().itemsize
        u10_strides0 = self.u1[0].data.shape[1]*np.float32().itemsize
        u11_strides0 = self.u1[1].data.shape[1]*np.float32().itemsize
        u12_strides0 = self.u1[2].data.shape[1]*np.float32().itemsize

        #launch kernel
        hip_check(
            hip.hipModuleLaunchKernel(
                self.kernel,
                *self.grid_size,   #grid
                *self.block_size,  #block
                sharedMemBytes=0,
                stream=self.stream,
                kernelParams=None,
                extra=(     # pass kernel's arguments
                    ctypes.c_int(self.nx), ctypes.c_int(self.ny),
                    ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt),
                    ctypes.c_float(self.g),
                    ctypes.c_int(self.boundary_conditions),
                    self.u0[0].data, ctypes.c_int(u00_strides0),
                    self.u0[1].data, ctypes.c_int(u01_strides0),
                    self.u0[2].data, ctypes.c_int(u02_strides0),
                    self.u1[0].data, ctypes.c_int(u10_strides0),
                    self.u1[1].data, ctypes.c_int(u11_strides0),
                    self.u1[2].data, ctypes.c_int(u12_strides0),
                    self.cfl_data,
                )
            )
        )

        hip_check(hip.hipDeviceSynchronize())
        self.u0, self.u1 = self.u1, self.u0
        #print("--Launching Kernel .LxFKernel. is ok")

    def getOutput(self):
        return self.u0
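All four schemes compute the row stride passed to the kernels by hand. For the (ny, nx)-shaped, C-contiguous arrays used here, the byte pitch of one row is the number of columns times the element size, which is exactly what numpy reports as strides[0]. A quick self-contained check, with a hypothetical shape:

import numpy as np

a = np.zeros((64, 128), dtype=np.float32)   #(ny, nx) ordering, C-contiguous
pitch = a.shape[1] * a.itemsize             #bytes from one row to the next
assert pitch == a.strides[0] == 128 * 4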
View File
@@ -30,6 +30,19 @@ import time
#import nvtx
from hip import hip, hiprtc

def hip_check(call_result):
    err = call_result[0]
    result = call_result[1:]
    if len(result) == 1:
        result = result[0]
    if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
        raise RuntimeError(str(err))
    elif (
        isinstance(err, hiprtc.hiprtcResult)
        and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
    ):
        raise RuntimeError(str(err))
    return result
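hip-python calls return a tuple whose first element is the status code; hip_check() strips it and raises on failure. Typical usage looks like the following sketch (the specific calls are illustrative, not taken from this hunk):

n_gpus = hip_check(hip.hipGetDeviceCount())   #single out-value is unwrapped
stream = hip_check(hip.hipStreamCreate())     #returns the new stream handle
hip_check(hip.hipStreamSynchronize(stream))   #status-only call
hip_check(hip.hipStreamDestroy(stream))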
class MPIGrid(object):
    """
@@ -206,19 +219,6 @@ class MPISimulator(Simulator.BaseSimulator):
    """
    Class which handles communication between simulators on different MPI nodes
    """

    def __init__(self, sim, grid):
        self.profiling_data_mpi = { 'start': {}, 'end': {} }
@@ -306,58 +306,73 @@ class MPISimulator(Simulator.BaseSimulator):
        #Note that east and west also transfer ghost cells
        #whilst north/south only transfer internal cells
        #Reuses the width/height defined in the read-extents above
        """
        self.in_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32)
        self.in_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32)
        self.in_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32)
        self.in_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32)
        """

        #HIP
        #hipHostMalloc allocates pinned host memory which is mapped into the
        #address space of all GPUs in the system; it can be accessed directly
        #by the GPU device
        #hipHostMallocDefault: memory is mapped and portable (default allocation)
        #hipHostMallocPortable: memory is explicitly portable across different devices
        self.in_e = np.zeros((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32)
        num_bytes_e = self.in_e.size * self.in_e.itemsize
        #self.in_e = hip_check(hip.hipHostMalloc(num_bytes_e,hip.hipHostMallocPortable))

        self.in_w = np.zeros((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32)
        num_bytes_w = self.in_w.size * self.in_w.itemsize
        #self.in_w = hip_check(hip.hipHostMalloc(num_bytes_w,hip.hipHostMallocPortable))

        self.in_n = np.zeros((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32)
        num_bytes_n = self.in_n.size * self.in_n.itemsize
        #self.in_n = hip_check(hip.hipHostMalloc(num_bytes_n,hip.hipHostMallocPortable))

        self.in_s = np.zeros((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32)
        num_bytes_s = self.in_s.size * self.in_s.itemsize
        #self.in_s = hip_check(hip.hipHostMalloc(num_bytes_s,hip.hipHostMallocPortable))

        #Allocate data for sending
        """
        self.out_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32)
        self.out_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32)
        self.out_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32)
        self.out_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32)
        """
        self.out_e = np.zeros((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32)
        num_bytes_e = self.out_e.size * self.out_e.itemsize
        #self.out_e = hip_check(hip.hipHostMalloc(num_bytes_e,hip.hipHostMallocPortable))

        self.out_w = np.zeros((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32)
        num_bytes_w = self.out_w.size * self.out_w.itemsize
        #self.out_w = hip_check(hip.hipHostMalloc(num_bytes_w,hip.hipHostMallocPortable))

        self.out_n = np.zeros((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32)
        num_bytes_n = self.out_n.size * self.out_n.itemsize
        #self.out_n = hip_check(hip.hipHostMalloc(num_bytes_n,hip.hipHostMallocPortable))

        self.out_s = np.zeros((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32)
        num_bytes_s = self.out_s.size * self.out_s.itemsize
        #self.out_s = hip_check(hip.hipHostMalloc(num_bytes_s,hip.hipHostMallocPortable))

        self.logger.debug("Simulator rank {:d} initialized on {:s}".format(self.grid.comm.rank, MPI.Get_processor_name()))

        self.full_exchange()
        #hip_check(hip.hipDeviceSynchronize())
        #sim.context.synchronize()

    def substep(self, dt, step_number):
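The commit keeps the halo staging buffers as plain np.zeros arrays and leaves the pinned hipHostMalloc path commented out, so each exchange stages through pageable host memory. A minimal sketch of one exchange leg under that design is given below; every name (strip_d, east_rank, the buffer pairing) is illustrative, and the real full_exchange() above is more involved:

from hip import hip

def exchange_east(comm, east_rank, strip_d, out_e, in_e, stream):
    #download the strip we own into the outgoing staging buffer
    nbytes = out_e.size * out_e.itemsize
    hip_check(hip.hipMemcpyAsync(out_e, strip_d, nbytes,
              hip.hipMemcpyKind.hipMemcpyDeviceToHost, stream))
    hip_check(hip.hipStreamSynchronize(stream))  #MPI must not race the copy
    comm.Sendrecv(out_e, dest=east_rank, recvbuf=in_e, source=east_rank)
    #upload what the neighbour sent into our ghost cells
    hip_check(hip.hipMemcpyAsync(strip_d, in_e, nbytes,
              hip.hipMemcpyKind.hipMemcpyHostToDevice, stream))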
View File
@@ -29,12 +29,7 @@ from hip import hip, hiprtc
import time

def hip_check(call_result):
    err = call_result[0]
    result = call_result[1:]
    if len(result) == 1:
@@ -48,6 +43,12 @@ class SHMEMSimulator(Simulator.BaseSimulator):
        raise RuntimeError(str(err))
    return result

class SHMEMSimulator(Simulator.BaseSimulator):
    """
    Class which handles communication and synchronization between simulators in different
    contexts (presumably on different GPUs)
    """

    def __init__(self, sims, grid):
        self.logger = logging.getLogger(__name__)
View File
@@ -29,12 +29,7 @@ from hip import hip, hiprtc
import time

def hip_check(call_result):
    err = call_result[0]
    result = call_result[1:]
    if len(result) == 1:
@@ -48,6 +43,12 @@ class SHMEMGrid(object):
        raise RuntimeError(str(err))
    return result

class SHMEMGrid(object):
    """
    Class which represents an SHMEM grid of GPUs. Facilitates easy communication between
    neighboring subdomains in the grid. Contains one CUDA context per subdomain.
    """

    def __init__(self, ngpus=None, ndims=2):
        self.logger = logging.getLogger(__name__)
View File
@@ -22,6 +22,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#Import packages we need
import numpy as np
import math
import logging
from enum import IntEnum
@@ -34,6 +35,20 @@ from hip import hip, hiprtc
from GPUSimulators import Common

def hip_check(call_result):
    err = call_result[0]
    result = call_result[1:]
    if len(result) == 1:
        result = result[0]
    if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
        raise RuntimeError(str(err))
    elif (
        isinstance(err, hiprtc.hiprtcResult)
        and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
    ):
        raise RuntimeError(str(err))
    return result

class BoundaryCondition(object):
    """
    Class for holding boundary conditions for global boundaries
@@ -102,15 +117,6 @@ class BoundaryCondition(object):

class BaseSimulator(object):

    def __init__(self,
                 context,
                 nx, ny,
@@ -157,11 +163,16 @@ class BaseSimulator(object):
            self.logger.debug("Used autotuning to get block size [%d x %d]", block_width, block_height)

        #Compute kernel launch parameters
        """
        self.block_size = (block_width, block_height, 1)
        self.grid_size = (
            int(np.ceil(self.nx / float(self.block_size[0]))),
            int(np.ceil(self.ny / float(self.block_size[1])))
        )
        """
        self.block_size = hip.dim3(block_width, block_height)
        #note: use a single rounding step; math.ceil((nx+bw-1)/bw) over-counts
        #when nx is a multiple of the block width (e.g. nx=16, bw=8 gives 3, not 2)
        self.grid_size = hip.dim3(math.ceil(self.nx / block_width),
                                  math.ceil(self.ny / block_height))

        #Create a CUDA stream
        #self.stream = cuda.Stream()
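The hunk ends where the CUDA stream used to be created. With hip-python the equivalent is a one-liner, and hip.dim3 both defaults unset components to 1 and unpacks with *, which is what the hipModuleLaunchKernel call sites rely on. A small sketch, assuming the module-level hip_check():

import math
from hip import hip

#HIP replacement for the commented-out "self.stream = cuda.Stream()"
stream = hip_check(hip.hipStreamCreate())

#one rounding step per dimension is enough for the launch grid
grid = hip.dim3(math.ceil(1024 / 16), math.ceil(1024 / 16))
assert tuple(grid) == (64, 64, 1)   #z defaults to 1

hip_check(hip.hipStreamDestroy(stream))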
View File
@@ -1,8 +1,8 @@
# -*- coding: utf-8 -*-
"""
This python module implements the Weighted Average Flux (WAF) scheme described in
E. Toro, Shock-Capturing methods for free-surface shallow flows, 2001

Copyright (C) 2016 SINTEF ICT
@@ -28,8 +28,21 @@ import ctypes
#from pycuda import gpuarray
from hip import hip,hiprtc
from hip import hipblas

def hip_check(call_result):
    err = call_result[0]
    result = call_result[1:]
    if len(result) == 1:
        result = result[0]
    if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
        raise RuntimeError(str(err))
    elif (
        isinstance(err, hiprtc.hiprtcResult)
        and err != hiprtc.hiprtcResult.HIPRTC_SUCCESS
    ):
        raise RuntimeError(str(err))
    return result

"""
Class that solves the SW equations using the Weighted Average Flux (WAF) scheme
@@ -49,20 +62,6 @@ class WAF (Simulator.BaseSimulator):
    g: Gravitational acceleration (9.81 m/s^2)
    """

    def __init__(self,
                 context,
                 h0, hu0, hv0,
@@ -80,28 +79,58 @@ class WAF (Simulator.BaseSimulator):
                         boundary_conditions,
                         cfl_scale,
                         2,
                         block_width, block_height)
        self.g = np.float32(g)

        #Get cuda kernels
        """
        module = context.get_module("cuda/SWE2D_WAF.cu",
                        defines={
                            'BLOCK_WIDTH': self.block_size[0],
                            'BLOCK_HEIGHT': self.block_size[1]
                        },
                        compile_args={
                            'no_extern_c': True,
                            'options': ["--use_fast_math"],
                        },
                        jit_compile_args={})
        self.kernel = module.get_function("WAFKernel")
        self.kernel.prepare("iiffffiPiPiPiPiPiPiP")
        """

        current_dir = os.path.dirname(os.path.abspath(__file__))
        # Specify the relative path to the "cuda" directory
        cuda_dir = os.path.join(current_dir, 'cuda')

        #kernel source
        kernel_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWE2D_WAF.cu.hip'))
        with open(kernel_file_path, 'r') as file:
            kernel_source = file.read()

        #headers
        #common.h
        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'common.h'))
        with open(header_file_path, 'r') as file:
            header_common = file.read()

        #SWECommon.h
        header_file_path = os.path.abspath(os.path.join(cuda_dir, 'SWECommon.h'))
        with open(header_file_path, 'r') as file:
            header_SWECommon = file.read()

        #hiprtc.hiprtcCreateProgram(const char *src, const char *name, int numHeaders, headers, includeNames)
        prog = hip_check(hiprtc.hiprtcCreateProgram(
            kernel_source.encode(), b"WAFKernel", 2,
            [header_common.encode(), header_SWECommon.encode()],
            [b"common.h", b"SWECommon.h"]))
        #hip_check() raises on failure, so reaching this point means the program exists
        print("--This is <SWE2D_WAF.cu.hip>")
        print("--HIPRTC program created successfully")
        print()

        props = hip.hipDeviceProp_t()
        hip_check(hip.hipGetDeviceProperties(props, 0))
@@ -109,19 +138,38 @@ class WAF (Simulator.BaseSimulator):
        print(f"Compiling kernel .WAFKernel. for {arch}")

        cflags = [b"--offload-arch="+arch, b"-O2",
                  b"-D BLOCK_WIDTH="+str(self.block_size[0]).encode(),
                  b"-D BLOCK_HEIGHT="+str(self.block_size[1]).encode()]
        err, = hiprtc.hiprtcCompileProgram(prog, len(cflags), cflags)
        # Check compilation status; on failure, dump the HIPRTC log
        if err != hiprtc.hiprtcResult.HIPRTC_SUCCESS:
            log_size = hip_check(hiprtc.hiprtcGetProgramLogSize(prog))
            log = bytearray(log_size)
            hip_check(hiprtc.hiprtcGetProgramLog(prog, log))
            raise RuntimeError(log.decode())

        code_size = hip_check(hiprtc.hiprtcGetCodeSize(prog))
        code = bytearray(code_size)
        hip_check(hiprtc.hiprtcGetCode(prog, code))

        #Load the code as a module
        self.module = hip_check(hip.hipModuleLoadData(code))

        #Get the device kernel named "WAFKernel"
        self.kernel = hip_check(hip.hipModuleGetFunction(self.module, b"WAFKernel"))

        print()
        print("--Device kernel *WAFKernel* retrieved successfully--")
        print("--kernel", self.kernel)
        print()

        #Create data by uploading to device
        self.u0 = Common.ArakawaA2D(self.stream,
@@ -133,69 +181,84 @@ class WAF (Simulator.BaseSimulator):
                        2, 2,
                        [None, None, None])
        #self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
        dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
        dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
        dt = min(dt_x, dt_y)

        #in HIP, the "DeviceArray" object doesn't have a 'fill' attribute,
        #so fill a host array with the initial dt and upload it instead
        #self.cfl_data.fill(dt, stream=self.stream)
        grid_dim_x, grid_dim_y, grid_dim_z = self.grid_size

        data_h = np.zeros((grid_dim_x, grid_dim_y), dtype=np.float32)
        num_bytes = data_h.size * data_h.itemsize
        data_h.fill(dt)   #fill with the locally computed dt, as the CUDA version did

        self.cfl_data = hip_check(hip.hipMalloc(num_bytes)).configure(
            typestr="float32", shape=(grid_dim_x, grid_dim_y))
        hip_check(hip.hipMemcpyAsync(self.cfl_data, data_h, num_bytes,
                  hip.hipMemcpyKind.hipMemcpyHostToDevice, self.stream))
        #alternatively, zero the device buffer asynchronously:
        #hip_check(hip.hipMemsetAsync(self.cfl_data, 0, num_bytes, self.stream))

    def substep(self, dt, step_number):
        self.substepDimsplit(dt*0.5, step_number)

    def substepDimsplit(self, dt, substep):
        #Cuda
        """
        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
                self.nx, self.ny,
                self.dx, self.dy, dt,
                self.g,
                substep,
                self.boundary_conditions,
                self.u0[0].data.gpudata, self.u0[0].data.strides[0],
                self.u0[1].data.gpudata, self.u0[1].data.strides[0],
                self.u0[2].data.gpudata, self.u0[2].data.strides[0],
                self.u1[0].data.gpudata, self.u1[0].data.strides[0],
                self.u1[1].data.gpudata, self.u1[1].data.strides[0],
                self.u1[2].data.gpudata, self.u1[2].data.strides[0],
                self.cfl_data.gpudata)
        self.u0, self.u1 = self.u1, self.u0
        """
        #row pitch in bytes (shape=(ny, nx) -> shape[1] elements per row)
        u00_strides0 = self.u0[0].data.shape[1]*np.float32().itemsize
        u01_strides0 = self.u0[1].data.shape[1]*np.float32().itemsize
        u02_strides0 = self.u0[2].data.shape[1]*np.float32().itemsize
        u10_strides0 = self.u1[0].data.shape[1]*np.float32().itemsize
        u11_strides0 = self.u1[1].data.shape[1]*np.float32().itemsize
        u12_strides0 = self.u1[2].data.shape[1]*np.float32().itemsize

        #launch kernel
        hip_check(
            hip.hipModuleLaunchKernel(
                self.kernel,
                *self.grid_size,   #grid
                *self.block_size,  #block
                sharedMemBytes=0,
                stream=self.stream,
                kernelParams=None,
                extra=(     # pass kernel's arguments
                    ctypes.c_int(self.nx), ctypes.c_int(self.ny),
                    ctypes.c_float(self.dx), ctypes.c_float(self.dy), ctypes.c_float(dt),
                    ctypes.c_float(self.g),
                    ctypes.c_int(substep),
                    ctypes.c_int(self.boundary_conditions),
                    self.u0[0].data, ctypes.c_int(u00_strides0),
                    self.u0[1].data, ctypes.c_int(u01_strides0),
                    self.u0[2].data, ctypes.c_int(u02_strides0),
                    self.u1[0].data, ctypes.c_int(u10_strides0),
                    self.u1[1].data, ctypes.c_int(u11_strides0),
                    self.u1[2].data, ctypes.c_int(u12_strides0),
                    self.cfl_data,
                )
            )
        )

        hip_check(hip.hipDeviceSynchronize())
        self.u0, self.u1 = self.u1, self.u0
        #print("--Launching Kernel .WAFKernel. is ok")

    def getOutput(self):
        return self.u0
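The hipModuleLaunchKernel boilerplate is repeated verbatim in KP07, KP07_dimsplit, LxF and WAF, differing only in the ctypes argument tuple. A thin wrapper could keep the call sites readable; a sketch, assuming the module-level hip_check():

from hip import hip

def launch(kernel, grid, block, stream, extra, sync=True):
    """Launch a device kernel; 'extra' is the ctypes-packed argument
    tuple built at the call site, as in the substep methods above."""
    hip_check(hip.hipModuleLaunchKernel(
        kernel,
        *grid, *block,
        sharedMemBytes=0,
        stream=stream,
        kernelParams=None,
        extra=extra))
    if sync:
        #waiting on the stream is cheaper than the device-wide
        #hipDeviceSynchronize() used in the substep methods above
        hip_check(hip.hipStreamSynchronize(stream))

Whether the per-substep synchronization is needed at all is a separate question; with every copy and launch issued on the same stream, they already execute in order.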
View File
@@ -25,7 +25,6 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "EulerCommon.h"
#include "limiters.h"

__device__
void computeFluxF(float Q[4][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
                  float Qx[4][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
View File
@@ -24,6 +24,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#pragma once

#include <stddef.h>
#include <float.h>

/**
 * Float3 operators
@@ -86,9 +88,6 @@ __device__ float desingularize(float x_, float eps_) {
/**
 * Returns the step stored in the leftmost 16 bits
 * of the 32 bit step-order integer
@@ -497,14 +496,18 @@ __device__ void memset(float Q[vars][shmem_height][shmem_width], float value) {

template <unsigned int threads>
//__device__ void reduce_max(float* data, unsigned int n) {
__device__ float reduce_max(float* data, unsigned int n) {
    __shared__ float sdata[threads];
    unsigned int tid = threadIdx.x;

    //Reduce to "threads" elements
    sdata[tid] = FLT_MIN;
    for (unsigned int i=tid; i<n; i += threads) {
        //sdata[tid] = max(sdata[tid], dt_ctx.L[i]);
        sdata[tid] = max(sdata[tid], data[i]);
    }
    __syncthreads();