Refactoring

This commit is contained in:
André R. Brodtkorb
2018-10-31 10:45:48 +01:00
parent e434b4e02a
commit 71777dad4e
9 changed files with 136 additions and 84 deletions

View File

@@ -170,8 +170,9 @@ class CudaContext(object):
""" """
def get_prepared_kernel(self, kernel_filename, kernel_function_name, \ def get_prepared_kernel(self, kernel_filename, kernel_function_name, \
prepared_call_args, \ prepared_call_args, \
include_dirs=[], no_extern_c=True, include_dirs=[], \
**kwargs): defines={}, \
compile_args={'no_extern_c', True}, jit_compile_args={}):
""" """
Helper function to print compilation output Helper function to print compilation output
""" """
@@ -183,19 +184,20 @@ class CudaContext(object):
self.logger.debug("Error: %s", error_str) self.logger.debug("Error: %s", error_str)
kernel_filename = os.path.normpath(kernel_filename) kernel_filename = os.path.normpath(kernel_filename)
kernel_path = os.path.abspath(os.path.join(self.module_path, kernel_filename))
#self.logger.debug("Getting %s", kernel_filename) #self.logger.debug("Getting %s", kernel_filename)
# Create a hash of the kernel (and its includes) # Create a hash of the kernel (and its includes)
kwargs_hasher = hashlib.md5() options_hasher = hashlib.md5()
kwargs_hasher.update(str(kwargs).encode('utf-8')); options_hasher.update(str(defines).encode('utf-8') + str(compile_args).encode('utf-8'));
kwargs_hash = kwargs_hasher.hexdigest() options_hash = options_hasher.hexdigest()
kwargs_hasher = None options_hasher = None
root, ext = os.path.splitext(kernel_filename) root, ext = os.path.splitext(kernel_filename)
kernel_hash = root \ kernel_hash = root \
+ "_" + CudaContext.hash_kernel( \ + "_" + CudaContext.hash_kernel( \
os.path.join(self.module_path, kernel_filename), \ kernel_path, \
include_dirs=[self.module_path] + include_dirs) \ include_dirs=[self.module_path] + include_dirs) \
+ "_" + kwargs_hash \ + "_" + options_hash \
+ ext + ext
cached_kernel_filename = os.path.join(self.cache_path, kernel_hash) cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)
@@ -210,7 +212,7 @@ class CudaContext(object):
with io.open(cached_kernel_filename, "rb") as file: with io.open(cached_kernel_filename, "rb") as file:
file_str = file.read() file_str = file.read()
module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler) module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler, **jit_compile_args)
kernel = module.get_function(kernel_function_name) kernel = module.get_function(kernel_function_name)
kernel.prepare(prepared_call_args) kernel.prepare(prepared_call_args)
@@ -223,7 +225,7 @@ class CudaContext(object):
#Create kernel string #Create kernel string
kernel_string = "" kernel_string = ""
for key, value in kwargs.items(): for key, value in defines.items():
kernel_string += "#define {:s} {:s}\n".format(str(key), str(value)) kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename)) kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename))
if (self.use_cache): if (self.use_cache):
@@ -235,8 +237,11 @@ class CudaContext(object):
with Common.Timer("compiler") as timer: with Common.Timer("compiler") as timer:
cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, no_extern_c=no_extern_c, cache_dir=False) import warnings
module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler) with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="The CUDA compiler succeeded, but said the following:\nkernel.cu", category=UserWarning)
cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, cache_dir=False, **compile_args)
module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler, **jit_compile_args)
if (self.use_cache): if (self.use_cache):
with io.open(cached_kernel_filename, "wb") as file: with io.open(cached_kernel_filename, "wb") as file:
file.write(cubin) file.write(cubin)

View File

@@ -68,8 +68,15 @@ class FORCE (Simulator.BaseSimulator):
#Get kernels #Get kernels
self.kernel = context.get_prepared_kernel("cuda/SWE_FORCE.cu", "FORCEKernel", \ self.kernel = context.get_prepared_kernel("cuda/SWE_FORCE.cu", "FORCEKernel", \
"iiffffPiPiPiPiPiPi", \ "iiffffPiPiPiPiPiPi", \
BLOCK_WIDTH=self.local_size[0], \ defines={
BLOCK_HEIGHT=self.local_size[1]) 'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
}, \
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"],
}, \
jit_compile_args={})
#Create data by uploading to device #Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream, \ self.u0 = Common.ArakawaA2D(self.stream, \
@@ -85,7 +92,7 @@ class FORCE (Simulator.BaseSimulator):
return super().simulateEuler(t_end) return super().simulateEuler(t_end)
def stepEuler(self, dt): def stepEuler(self, dt):
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
self.nx, self.ny, \ self.nx, self.ny, \
self.dx, self.dy, dt, \ self.dx, self.dy, dt, \
self.g, \ self.g, \

View File

@@ -63,8 +63,15 @@ class HLL (Simulator.BaseSimulator):
#Get kernels #Get kernels
self.kernel = context.get_prepared_kernel("cuda/SWE_HLL.cu", "HLLKernel", \ self.kernel = context.get_prepared_kernel("cuda/SWE_HLL.cu", "HLLKernel", \
"iiffffPiPiPiPiPiPi", \ "iiffffPiPiPiPiPiPi", \
BLOCK_WIDTH=self.local_size[0], \ defines={
BLOCK_HEIGHT=self.local_size[1]) 'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
}, \
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"],
}, \
jit_compile_args={})
#Create data by uploading to device #Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream, \ self.u0 = Common.ArakawaA2D(self.stream, \
@@ -80,7 +87,7 @@ class HLL (Simulator.BaseSimulator):
return super().simulateEuler(t_end) return super().simulateEuler(t_end)
def stepEuler(self, dt): def stepEuler(self, dt):
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
self.nx, self.ny, \ self.nx, self.ny, \
self.dx, self.dy, dt, \ self.dx, self.dy, dt, \
self.g, \ self.g, \

View File

@@ -69,8 +69,15 @@ class HLL2 (Simulator.BaseSimulator):
#Get kernels #Get kernels
self.kernel = context.get_prepared_kernel("cuda/SWE_HLL2.cu", "HLL2Kernel", \ self.kernel = context.get_prepared_kernel("cuda/SWE_HLL2.cu", "HLL2Kernel", \
"iifffffiPiPiPiPiPiPi", \ "iifffffiPiPiPiPiPiPi", \
BLOCK_WIDTH=self.local_size[0], \ defines={
BLOCK_HEIGHT=self.local_size[1]) 'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
}, \
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"],
}, \
jit_compile_args={})
#Create data by uploading to device #Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream, \ self.u0 = Common.ArakawaA2D(self.stream, \
@@ -89,7 +96,7 @@ class HLL2 (Simulator.BaseSimulator):
return self.stepDimsplitXY(dt) return self.stepDimsplitXY(dt)
def stepDimsplitXY(self, dt): def stepDimsplitXY(self, dt):
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
self.nx, self.ny, \ self.nx, self.ny, \
self.dx, self.dy, dt, \ self.dx, self.dy, dt, \
self.g, \ self.g, \
@@ -105,7 +112,7 @@ class HLL2 (Simulator.BaseSimulator):
self.t += dt self.t += dt
def stepDimsplitYX(self, dt): def stepDimsplitYX(self, dt):
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
self.nx, self.ny, \ self.nx, self.ny, \
self.dx, self.dy, dt, \ self.dx, self.dy, dt, \
self.g, \ self.g, \

View File

@@ -70,8 +70,15 @@ class KP07 (Simulator.BaseSimulator):
#Get kernels #Get kernels
self.kernel = context.get_prepared_kernel("cuda/SWE_KP07.cu", "KP07Kernel", \ self.kernel = context.get_prepared_kernel("cuda/SWE_KP07.cu", "KP07Kernel", \
"iifffffiPiPiPiPiPiPi", \ "iifffffiPiPiPiPiPiPi", \
BLOCK_WIDTH=self.local_size[0], \ defines={
BLOCK_HEIGHT=self.local_size[1]) 'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
}, \
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"],
}, \
jit_compile_args={})
#Create data by uploading to device #Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream, \ self.u0 = Common.ArakawaA2D(self.stream, \
@@ -87,7 +94,7 @@ class KP07 (Simulator.BaseSimulator):
return super().simulateRK(t_end, 2) return super().simulateRK(t_end, 2)
def substepRK(self, dt, substep): def substepRK(self, dt, substep):
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
self.nx, self.ny, \ self.nx, self.ny, \
self.dx, self.dy, dt, \ self.dx, self.dy, dt, \
self.g, \ self.g, \

View File

@@ -70,8 +70,15 @@ class KP07_dimsplit (Simulator.BaseSimulator):
#Get kernels #Get kernels
self.kernel = context.get_prepared_kernel("cuda/SWE_KP07_dimsplit.cu", "KP07DimsplitKernel", \ self.kernel = context.get_prepared_kernel("cuda/SWE_KP07_dimsplit.cu", "KP07DimsplitKernel", \
"iifffffiPiPiPiPiPiPi", \ "iifffffiPiPiPiPiPiPi", \
BLOCK_WIDTH=self.local_size[0], \ defines={
BLOCK_HEIGHT=self.local_size[1]) 'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
}, \
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"],
}, \
jit_compile_args={})
#Create data by uploading to device #Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream, \ self.u0 = Common.ArakawaA2D(self.stream, \
@@ -90,7 +97,7 @@ class KP07_dimsplit (Simulator.BaseSimulator):
return self.stepDimsplitXY(dt) return self.stepDimsplitXY(dt)
def stepDimsplitXY(self, dt): def stepDimsplitXY(self, dt):
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
self.nx, self.ny, \ self.nx, self.ny, \
self.dx, self.dy, dt, \ self.dx, self.dy, dt, \
self.g, \ self.g, \
@@ -106,7 +113,7 @@ class KP07_dimsplit (Simulator.BaseSimulator):
self.t += dt self.t += dt
def stepDimsplitYX(self, dt): def stepDimsplitYX(self, dt):
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
self.nx, self.ny, \ self.nx, self.ny, \
self.dx, self.dy, dt, \ self.dx, self.dy, dt, \
self.g, \ self.g, \

View File

@@ -64,8 +64,15 @@ class LxF (Simulator.BaseSimulator):
# Get kernels # Get kernels
self.kernel = context.get_prepared_kernel("cuda/SWE_LxF.cu", "LxFKernel", \ self.kernel = context.get_prepared_kernel("cuda/SWE_LxF.cu", "LxFKernel", \
"iiffffPiPiPiPiPiPi", \ "iiffffPiPiPiPiPiPi", \
BLOCK_WIDTH=self.local_size[0], \ defines={
BLOCK_HEIGHT=self.local_size[1]) 'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
}, \
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"],
}, \
jit_compile_args={})
#Create data by uploading to device #Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream, \ self.u0 = Common.ArakawaA2D(self.stream, \
@@ -81,7 +88,7 @@ class LxF (Simulator.BaseSimulator):
return super().simulateEuler(t_end) return super().simulateEuler(t_end)
def stepEuler(self, dt): def stepEuler(self, dt):
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
self.nx, self.ny, \ self.nx, self.ny, \
self.dx, self.dy, dt, \ self.dx, self.dy, dt, \
self.g, \ self.g, \

View File

@@ -55,20 +55,10 @@ class BaseSimulator:
#Get logger #Get logger
self.logger = logging.getLogger(__name__ + "." + self.__class__.__name__) self.logger = logging.getLogger(__name__ + "." + self.__class__.__name__)
self.context = context
if (self.context.autotuner):
peak_configuration = self.context.autotuner.get_peak_performance(self.__class__)
block_width = int(peak_configuration["block_width"])
block_height = int(peak_configuration["block_height"])
self.logger.debug("Used autotuning to get block size [%d x %d]", block_width, block_height)
#Create a CUDA stream
self.stream = cuda.Stream()
#Save input parameters #Save input parameters
#Notice that we need to specify them in the correct dataformat for the #Notice that we need to specify them in the correct dataformat for the
#GPU kernel #GPU kernel
self.context = context
self.nx = np.int32(nx) self.nx = np.int32(nx)
self.ny = np.int32(ny) self.ny = np.int32(ny)
self.dx = np.float32(dx) self.dx = np.float32(dx)
@@ -76,15 +66,25 @@ class BaseSimulator:
self.dt = np.float32(dt) self.dt = np.float32(dt)
self.g = np.float32(g) self.g = np.float32(g)
#Handle autotuning block size
if (self.context.autotuner):
peak_configuration = self.context.autotuner.get_peak_performance(self.__class__)
block_width = int(peak_configuration["block_width"])
block_height = int(peak_configuration["block_height"])
self.logger.debug("Used autotuning to get block size [%d x %d]", block_width, block_height)
#Compute kernel launch parameters
self.block_size = (block_width, block_height, 1)
self.grid_size = ( \
int(np.ceil(self.nx / float(self.block_size[0]))), \
int(np.ceil(self.ny / float(self.block_size[1]))) \
)
#Create a CUDA stream
self.stream = cuda.Stream()
#Keep track of simulation time #Keep track of simulation time
self.t = 0.0; self.t = 0.0;
#Compute kernel launch parameters
self.local_size = (block_width, block_height, 1)
self.global_size = ( \
int(np.ceil(self.nx / float(self.local_size[0]))), \
int(np.ceil(self.ny / float(self.local_size[1]))) \
)
def __str__(self): def __str__(self):
return "{:s} [{:d}x{:d}]".format(self.__class__.__name__, self.nx, self.ny) return "{:s} [{:d}x{:d}]".format(self.__class__.__name__, self.nx, self.ny)
@@ -115,7 +115,7 @@ class BaseSimulator:
# Step with forward Euler # Step with forward Euler
self.stepEuler(local_dt) self.stepEuler(local_dt)
self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds (Euler)", self, t_end, self.t, n, t.secs) self.logger.info("%s simulated %f seconds to %f with %d steps (Euler)", self, t_end, self.t, n)
return self.t, n return self.t, n
""" """
@@ -123,22 +123,21 @@ class BaseSimulator:
Requires that the stepRK functionality is implemented in the subclasses Requires that the stepRK functionality is implemented in the subclasses
""" """
def simulateRK(self, t_end, order): def simulateRK(self, t_end, order):
with Common.Timer(self.__class__.__name__ + ".simulateRK") as t: # Compute number of timesteps to perform
# Compute number of timesteps to perform n = int(t_end / self.dt + 1)
n = int(t_end / self.dt + 1)
for i in range(0, n):
# Compute timestep for "this" iteration
local_dt = np.float32(min(self.dt, t_end-i*self.dt))
for i in range(0, n): # Stop if end reached (should not happen)
# Compute timestep for "this" iteration if (local_dt <= 0.0):
local_dt = np.float32(min(self.dt, t_end-i*self.dt)) break
# Stop if end reached (should not happen) # Perform all the Runge-Kutta substeps
if (local_dt <= 0.0): self.stepRK(local_dt, order)
break
# Perform all the Runge-Kutta substeps self.logger.info("%s simulated %f seconds to %f with %d steps (RK2)", self, t_end, self.t, n)
self.stepRK(local_dt, order)
self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds (RK2)", self, t_end, self.t, n, t.secs)
return self.t, n return self.t, n
""" """
@@ -146,23 +145,22 @@ class BaseSimulator:
Requires that the stepDimsplitXY and stepDimsplitYX functionality is implemented in the subclasses Requires that the stepDimsplitXY and stepDimsplitYX functionality is implemented in the subclasses
""" """
def simulateDimsplit(self, t_end): def simulateDimsplit(self, t_end):
with Common.Timer(self.__class__.__name__ + ".simulateDimsplit") as t: # Compute number of timesteps to perform
# Compute number of timesteps to perform n = int(t_end / (2.0*self.dt) + 1)
n = int(t_end / (2.0*self.dt) + 1)
for i in range(0, n):
# Compute timestep for "this" iteration
local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt))
for i in range(0, n): # Stop if end reached (should not happen)
# Compute timestep for "this" iteration if (local_dt <= 0.0):
local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt)) break
# Stop if end reached (should not happen)
if (local_dt <= 0.0):
break
# Perform the dimensional split substeps
self.stepDimsplitXY(local_dt)
self.stepDimsplitYX(local_dt)
self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds (dimsplit)", self, t_end, self.t, 2*n, t.secs) # Perform the dimensional split substeps
self.stepDimsplitXY(local_dt)
self.stepDimsplitYX(local_dt)
self.logger.info("%s simulated %f seconds to %f with %d steps (dimsplit)", self, t_end, self.t, 2*n)
return self.t, 2*n return self.t, 2*n

View File

@@ -63,8 +63,15 @@ class WAF (Simulator.BaseSimulator):
#Get kernels #Get kernels
self.kernel = context.get_prepared_kernel("cuda/SWE_WAF.cu", "WAFKernel", \ self.kernel = context.get_prepared_kernel("cuda/SWE_WAF.cu", "WAFKernel", \
"iiffffiPiPiPiPiPiPi", \ "iiffffiPiPiPiPiPiPi", \
BLOCK_WIDTH=self.local_size[0], \ defines={
BLOCK_HEIGHT=self.local_size[1]) 'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
}, \
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"],
}, \
jit_compile_args={})
#Create data by uploading to device #Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream, \ self.u0 = Common.ArakawaA2D(self.stream, \
@@ -83,7 +90,7 @@ class WAF (Simulator.BaseSimulator):
return self.stepDimsplitXY(dt) return self.stepDimsplitXY(dt)
def stepDimsplitXY(self, dt): def stepDimsplitXY(self, dt):
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
self.nx, self.ny, \ self.nx, self.ny, \
self.dx, self.dy, dt, \ self.dx, self.dy, dt, \
self.g, \ self.g, \
@@ -98,7 +105,7 @@ class WAF (Simulator.BaseSimulator):
self.t += dt self.t += dt
def stepDimsplitYX(self, dt): def stepDimsplitYX(self, dt):
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
self.nx, self.ny, \ self.nx, self.ny, \
self.dx, self.dy, dt, \ self.dx, self.dy, dt, \
self.g, \ self.g, \