mirror of
https://github.com/smyalygames/FiniteVolumeGPU.git
synced 2025-11-29 17:28:03 +01:00
Refactoring
This commit is contained in:
@@ -170,8 +170,9 @@ class CudaContext(object):
|
||||
"""
|
||||
def get_prepared_kernel(self, kernel_filename, kernel_function_name, \
|
||||
prepared_call_args, \
|
||||
include_dirs=[], no_extern_c=True,
|
||||
**kwargs):
|
||||
include_dirs=[], \
|
||||
defines={}, \
|
||||
compile_args={'no_extern_c': True}, jit_compile_args={}):
|
||||
"""
|
||||
Helper function to print compilation output
|
||||
"""
|
||||
@@ -183,19 +184,20 @@ class CudaContext(object):
|
||||
self.logger.debug("Error: %s", error_str)
|
||||
|
||||
kernel_filename = os.path.normpath(kernel_filename)
|
||||
kernel_path = os.path.abspath(os.path.join(self.module_path, kernel_filename))
|
||||
#self.logger.debug("Getting %s", kernel_filename)
|
||||
|
||||
# Create a hash of the kernel (and its includes)
|
||||
kwargs_hasher = hashlib.md5()
|
||||
kwargs_hasher.update(str(kwargs).encode('utf-8'));
|
||||
kwargs_hash = kwargs_hasher.hexdigest()
|
||||
kwargs_hasher = None
|
||||
options_hasher = hashlib.md5()
|
||||
options_hasher.update(str(defines).encode('utf-8') + str(compile_args).encode('utf-8'));
|
||||
options_hash = options_hasher.hexdigest()
|
||||
options_hasher = None
|
||||
root, ext = os.path.splitext(kernel_filename)
|
||||
kernel_hash = root \
|
||||
+ "_" + CudaContext.hash_kernel( \
|
||||
os.path.join(self.module_path, kernel_filename), \
|
||||
kernel_path, \
|
||||
include_dirs=[self.module_path] + include_dirs) \
|
||||
+ "_" + kwargs_hash \
|
||||
+ "_" + options_hash \
|
||||
+ ext
|
||||
cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)
|
||||
|
||||
@@ -210,7 +212,7 @@ class CudaContext(object):
|
||||
|
||||
with io.open(cached_kernel_filename, "rb") as file:
|
||||
file_str = file.read()
|
||||
module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler)
|
||||
module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler, **jit_compile_args)
|
||||
|
||||
kernel = module.get_function(kernel_function_name)
|
||||
kernel.prepare(prepared_call_args)
|
||||
@@ -223,7 +225,7 @@ class CudaContext(object):
|
||||
|
||||
#Create kernel string
|
||||
kernel_string = ""
|
||||
for key, value in kwargs.items():
|
||||
for key, value in defines.items():
|
||||
kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
|
||||
kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename))
|
||||
if (self.use_cache):
|
||||
@@ -235,8 +237,11 @@ class CudaContext(object):
|
||||
|
||||
|
||||
with Common.Timer("compiler") as timer:
|
||||
cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, no_extern_c=no_extern_c, cache_dir=False)
|
||||
module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler)
|
||||
import warnings
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings("ignore", message="The CUDA compiler succeeded, but said the following:\nkernel.cu", category=UserWarning)
|
||||
cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, cache_dir=False, **compile_args)
|
||||
module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler, **jit_compile_args)
|
||||
if (self.use_cache):
|
||||
with io.open(cached_kernel_filename, "wb") as file:
|
||||
file.write(cubin)
|
||||
|
||||
@@ -68,8 +68,15 @@ class FORCE (Simulator.BaseSimulator):
|
||||
#Get kernels
|
||||
self.kernel = context.get_prepared_kernel("cuda/SWE_FORCE.cu", "FORCEKernel", \
|
||||
"iiffffPiPiPiPiPiPi", \
|
||||
BLOCK_WIDTH=self.local_size[0], \
|
||||
BLOCK_HEIGHT=self.local_size[1])
|
||||
defines={
|
||||
'BLOCK_WIDTH': self.block_size[0],
|
||||
'BLOCK_HEIGHT': self.block_size[1]
|
||||
}, \
|
||||
compile_args={
|
||||
'no_extern_c': True,
|
||||
'options': ["--use_fast_math"],
|
||||
}, \
|
||||
jit_compile_args={})
|
||||
|
||||
#Create data by uploading to device
|
||||
self.u0 = Common.ArakawaA2D(self.stream, \
|
||||
@@ -85,7 +92,7 @@ class FORCE (Simulator.BaseSimulator):
|
||||
return super().simulateEuler(t_end)
|
||||
|
||||
def stepEuler(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
|
||||
@@ -63,8 +63,15 @@ class HLL (Simulator.BaseSimulator):
|
||||
#Get kernels
|
||||
self.kernel = context.get_prepared_kernel("cuda/SWE_HLL.cu", "HLLKernel", \
|
||||
"iiffffPiPiPiPiPiPi", \
|
||||
BLOCK_WIDTH=self.local_size[0], \
|
||||
BLOCK_HEIGHT=self.local_size[1])
|
||||
defines={
|
||||
'BLOCK_WIDTH': self.block_size[0],
|
||||
'BLOCK_HEIGHT': self.block_size[1]
|
||||
}, \
|
||||
compile_args={
|
||||
'no_extern_c': True,
|
||||
'options': ["--use_fast_math"],
|
||||
}, \
|
||||
jit_compile_args={})
|
||||
|
||||
#Create data by uploading to device
|
||||
self.u0 = Common.ArakawaA2D(self.stream, \
|
||||
@@ -80,7 +87,7 @@ class HLL (Simulator.BaseSimulator):
|
||||
return super().simulateEuler(t_end)
|
||||
|
||||
def stepEuler(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
|
||||
@@ -69,8 +69,15 @@ class HLL2 (Simulator.BaseSimulator):
|
||||
#Get kernels
|
||||
self.kernel = context.get_prepared_kernel("cuda/SWE_HLL2.cu", "HLL2Kernel", \
|
||||
"iifffffiPiPiPiPiPiPi", \
|
||||
BLOCK_WIDTH=self.local_size[0], \
|
||||
BLOCK_HEIGHT=self.local_size[1])
|
||||
defines={
|
||||
'BLOCK_WIDTH': self.block_size[0],
|
||||
'BLOCK_HEIGHT': self.block_size[1]
|
||||
}, \
|
||||
compile_args={
|
||||
'no_extern_c': True,
|
||||
'options': ["--use_fast_math"],
|
||||
}, \
|
||||
jit_compile_args={})
|
||||
|
||||
#Create data by uploading to device
|
||||
self.u0 = Common.ArakawaA2D(self.stream, \
|
||||
@@ -89,7 +96,7 @@ class HLL2 (Simulator.BaseSimulator):
|
||||
return self.stepDimsplitXY(dt)
|
||||
|
||||
def stepDimsplitXY(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
@@ -105,7 +112,7 @@ class HLL2 (Simulator.BaseSimulator):
|
||||
self.t += dt
|
||||
|
||||
def stepDimsplitYX(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
|
||||
@@ -70,8 +70,15 @@ class KP07 (Simulator.BaseSimulator):
|
||||
#Get kernels
|
||||
self.kernel = context.get_prepared_kernel("cuda/SWE_KP07.cu", "KP07Kernel", \
|
||||
"iifffffiPiPiPiPiPiPi", \
|
||||
BLOCK_WIDTH=self.local_size[0], \
|
||||
BLOCK_HEIGHT=self.local_size[1])
|
||||
defines={
|
||||
'BLOCK_WIDTH': self.block_size[0],
|
||||
'BLOCK_HEIGHT': self.block_size[1]
|
||||
}, \
|
||||
compile_args={
|
||||
'no_extern_c': True,
|
||||
'options': ["--use_fast_math"],
|
||||
}, \
|
||||
jit_compile_args={})
|
||||
|
||||
#Create data by uploading to device
|
||||
self.u0 = Common.ArakawaA2D(self.stream, \
|
||||
@@ -87,7 +94,7 @@ class KP07 (Simulator.BaseSimulator):
|
||||
return super().simulateRK(t_end, 2)
|
||||
|
||||
def substepRK(self, dt, substep):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
|
||||
@@ -70,8 +70,15 @@ class KP07_dimsplit (Simulator.BaseSimulator):
|
||||
#Get kernels
|
||||
self.kernel = context.get_prepared_kernel("cuda/SWE_KP07_dimsplit.cu", "KP07DimsplitKernel", \
|
||||
"iifffffiPiPiPiPiPiPi", \
|
||||
BLOCK_WIDTH=self.local_size[0], \
|
||||
BLOCK_HEIGHT=self.local_size[1])
|
||||
defines={
|
||||
'BLOCK_WIDTH': self.block_size[0],
|
||||
'BLOCK_HEIGHT': self.block_size[1]
|
||||
}, \
|
||||
compile_args={
|
||||
'no_extern_c': True,
|
||||
'options': ["--use_fast_math"],
|
||||
}, \
|
||||
jit_compile_args={})
|
||||
|
||||
#Create data by uploading to device
|
||||
self.u0 = Common.ArakawaA2D(self.stream, \
|
||||
@@ -90,7 +97,7 @@ class KP07_dimsplit (Simulator.BaseSimulator):
|
||||
return self.stepDimsplitXY(dt)
|
||||
|
||||
def stepDimsplitXY(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
@@ -106,7 +113,7 @@ class KP07_dimsplit (Simulator.BaseSimulator):
|
||||
self.t += dt
|
||||
|
||||
def stepDimsplitYX(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
|
||||
@@ -64,8 +64,15 @@ class LxF (Simulator.BaseSimulator):
|
||||
# Get kernels
|
||||
self.kernel = context.get_prepared_kernel("cuda/SWE_LxF.cu", "LxFKernel", \
|
||||
"iiffffPiPiPiPiPiPi", \
|
||||
BLOCK_WIDTH=self.local_size[0], \
|
||||
BLOCK_HEIGHT=self.local_size[1])
|
||||
defines={
|
||||
'BLOCK_WIDTH': self.block_size[0],
|
||||
'BLOCK_HEIGHT': self.block_size[1]
|
||||
}, \
|
||||
compile_args={
|
||||
'no_extern_c': True,
|
||||
'options': ["--use_fast_math"],
|
||||
}, \
|
||||
jit_compile_args={})
|
||||
|
||||
#Create data by uploading to device
|
||||
self.u0 = Common.ArakawaA2D(self.stream, \
|
||||
@@ -81,7 +88,7 @@ class LxF (Simulator.BaseSimulator):
|
||||
return super().simulateEuler(t_end)
|
||||
|
||||
def stepEuler(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
|
||||
@@ -55,20 +55,10 @@ class BaseSimulator:
|
||||
#Get logger
|
||||
self.logger = logging.getLogger(__name__ + "." + self.__class__.__name__)
|
||||
|
||||
self.context = context
|
||||
|
||||
if (self.context.autotuner):
|
||||
peak_configuration = self.context.autotuner.get_peak_performance(self.__class__)
|
||||
block_width = int(peak_configuration["block_width"])
|
||||
block_height = int(peak_configuration["block_height"])
|
||||
self.logger.debug("Used autotuning to get block size [%d x %d]", block_width, block_height)
|
||||
|
||||
#Create a CUDA stream
|
||||
self.stream = cuda.Stream()
|
||||
|
||||
#Save input parameters
|
||||
#Notice that we need to specify them in the correct dataformat for the
|
||||
#GPU kernel
|
||||
self.context = context
|
||||
self.nx = np.int32(nx)
|
||||
self.ny = np.int32(ny)
|
||||
self.dx = np.float32(dx)
|
||||
@@ -76,16 +66,26 @@ class BaseSimulator:
|
||||
self.dt = np.float32(dt)
|
||||
self.g = np.float32(g)
|
||||
|
||||
#Keep track of simulation time
|
||||
self.t = 0.0;
|
||||
#Handle autotuning block size
|
||||
if (self.context.autotuner):
|
||||
peak_configuration = self.context.autotuner.get_peak_performance(self.__class__)
|
||||
block_width = int(peak_configuration["block_width"])
|
||||
block_height = int(peak_configuration["block_height"])
|
||||
self.logger.debug("Used autotuning to get block size [%d x %d]", block_width, block_height)
|
||||
|
||||
#Compute kernel launch parameters
|
||||
self.local_size = (block_width, block_height, 1)
|
||||
self.global_size = ( \
|
||||
int(np.ceil(self.nx / float(self.local_size[0]))), \
|
||||
int(np.ceil(self.ny / float(self.local_size[1]))) \
|
||||
self.block_size = (block_width, block_height, 1)
|
||||
self.grid_size = ( \
|
||||
int(np.ceil(self.nx / float(self.block_size[0]))), \
|
||||
int(np.ceil(self.ny / float(self.block_size[1]))) \
|
||||
)
|
||||
|
||||
#Create a CUDA stream
|
||||
self.stream = cuda.Stream()
|
||||
|
||||
#Keep track of simulation time
|
||||
self.t = 0.0;
|
||||
|
||||
def __str__(self):
|
||||
return "{:s} [{:d}x{:d}]".format(self.__class__.__name__, self.nx, self.ny)
|
||||
|
||||
@@ -115,7 +115,7 @@ class BaseSimulator:
|
||||
# Step with forward Euler
|
||||
self.stepEuler(local_dt)
|
||||
|
||||
self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds (Euler)", self, t_end, self.t, n, t.secs)
|
||||
self.logger.info("%s simulated %f seconds to %f with %d steps (Euler)", self, t_end, self.t, n)
|
||||
return self.t, n
|
||||
|
||||
"""
|
||||
@@ -123,22 +123,21 @@ class BaseSimulator:
|
||||
Requires that the stepRK functionality is implemented in the subclasses
|
||||
"""
|
||||
def simulateRK(self, t_end, order):
|
||||
with Common.Timer(self.__class__.__name__ + ".simulateRK") as t:
|
||||
# Compute number of timesteps to perform
|
||||
n = int(t_end / self.dt + 1)
|
||||
# Compute number of timesteps to perform
|
||||
n = int(t_end / self.dt + 1)
|
||||
|
||||
for i in range(0, n):
|
||||
# Compute timestep for "this" iteration
|
||||
local_dt = np.float32(min(self.dt, t_end-i*self.dt))
|
||||
for i in range(0, n):
|
||||
# Compute timestep for "this" iteration
|
||||
local_dt = np.float32(min(self.dt, t_end-i*self.dt))
|
||||
|
||||
# Stop if end reached (should not happen)
|
||||
if (local_dt <= 0.0):
|
||||
break
|
||||
# Stop if end reached (should not happen)
|
||||
if (local_dt <= 0.0):
|
||||
break
|
||||
|
||||
# Perform all the Runge-Kutta substeps
|
||||
self.stepRK(local_dt, order)
|
||||
# Perform all the Runge-Kutta substeps
|
||||
self.stepRK(local_dt, order)
|
||||
|
||||
self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds (RK2)", self, t_end, self.t, n, t.secs)
|
||||
self.logger.info("%s simulated %f seconds to %f with %d steps (RK2)", self, t_end, self.t, n)
|
||||
return self.t, n
|
||||
|
||||
"""
|
||||
@@ -146,23 +145,22 @@ class BaseSimulator:
|
||||
Requires that the stepDimsplitX and stepDimsplitY functionality is implemented in the subclasses
|
||||
"""
|
||||
def simulateDimsplit(self, t_end):
|
||||
with Common.Timer(self.__class__.__name__ + ".simulateDimsplit") as t:
|
||||
# Compute number of timesteps to perform
|
||||
n = int(t_end / (2.0*self.dt) + 1)
|
||||
# Compute number of timesteps to perform
|
||||
n = int(t_end / (2.0*self.dt) + 1)
|
||||
|
||||
for i in range(0, n):
|
||||
# Compute timestep for "this" iteration
|
||||
local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt))
|
||||
for i in range(0, n):
|
||||
# Compute timestep for "this" iteration
|
||||
local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt))
|
||||
|
||||
# Stop if end reached (should not happen)
|
||||
if (local_dt <= 0.0):
|
||||
break
|
||||
# Stop if end reached (should not happen)
|
||||
if (local_dt <= 0.0):
|
||||
break
|
||||
|
||||
# Perform the dimensional split substeps
|
||||
self.stepDimsplitXY(local_dt)
|
||||
self.stepDimsplitYX(local_dt)
|
||||
# Perform the dimensional split substeps
|
||||
self.stepDimsplitXY(local_dt)
|
||||
self.stepDimsplitYX(local_dt)
|
||||
|
||||
self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds (dimsplit)", self, t_end, self.t, 2*n, t.secs)
|
||||
self.logger.info("%s simulated %f seconds to %f with %d steps (dimsplit)", self, t_end, self.t, 2*n)
|
||||
return self.t, 2*n
|
||||
|
||||
|
||||
|
||||
@@ -63,8 +63,15 @@ class WAF (Simulator.BaseSimulator):
|
||||
#Get kernels
|
||||
self.kernel = context.get_prepared_kernel("cuda/SWE_WAF.cu", "WAFKernel", \
|
||||
"iiffffiPiPiPiPiPiPi", \
|
||||
BLOCK_WIDTH=self.local_size[0], \
|
||||
BLOCK_HEIGHT=self.local_size[1])
|
||||
defines={
|
||||
'BLOCK_WIDTH': self.block_size[0],
|
||||
'BLOCK_HEIGHT': self.block_size[1]
|
||||
}, \
|
||||
compile_args={
|
||||
'no_extern_c': True,
|
||||
'options': ["--use_fast_math"],
|
||||
}, \
|
||||
jit_compile_args={})
|
||||
|
||||
#Create data by uploading to device
|
||||
self.u0 = Common.ArakawaA2D(self.stream, \
|
||||
@@ -83,7 +90,7 @@ class WAF (Simulator.BaseSimulator):
|
||||
return self.stepDimsplitXY(dt)
|
||||
|
||||
def stepDimsplitXY(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
@@ -98,7 +105,7 @@ class WAF (Simulator.BaseSimulator):
|
||||
self.t += dt
|
||||
|
||||
def stepDimsplitYX(self, dt):
|
||||
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
|
||||
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
|
||||
self.nx, self.ny, \
|
||||
self.dx, self.dy, dt, \
|
||||
self.g, \
|
||||
|
||||
Reference in New Issue
Block a user