From 71777dad4e2fdda9b282b0eb9ffff7cd146c23b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20R=2E=20Brodtkorb?= Date: Wed, 31 Oct 2018 10:45:48 +0100 Subject: [PATCH] Refactoring --- GPUSimulators/CudaContext.py | 29 ++++++----- GPUSimulators/FORCE.py | 13 +++-- GPUSimulators/HLL.py | 13 +++-- GPUSimulators/HLL2.py | 15 ++++-- GPUSimulators/KP07.py | 13 +++-- GPUSimulators/KP07_dimsplit.py | 15 ++++-- GPUSimulators/LxF.py | 13 +++-- GPUSimulators/Simulator.py | 94 +++++++++++++++++----------------- GPUSimulators/WAF.py | 15 ++++-- 9 files changed, 136 insertions(+), 84 deletions(-) diff --git a/GPUSimulators/CudaContext.py b/GPUSimulators/CudaContext.py index fd50864..113fdd8 100644 --- a/GPUSimulators/CudaContext.py +++ b/GPUSimulators/CudaContext.py @@ -170,8 +170,9 @@ class CudaContext(object): """ def get_prepared_kernel(self, kernel_filename, kernel_function_name, \ prepared_call_args, \ - include_dirs=[], no_extern_c=True, - **kwargs): + include_dirs=[], \ + defines={}, \ + compile_args={'no_extern_c': True}, jit_compile_args={}): """ Helper function to print compilation output """ self.logger.debug("Error: %s", error_str) kernel_filename = os.path.normpath(kernel_filename) + kernel_path = os.path.abspath(os.path.join(self.module_path, kernel_filename)) #self.logger.debug("Getting %s", kernel_filename) # Create a hash of the kernel (and its includes) - kwargs_hasher = hashlib.md5() - kwargs_hasher.update(str(kwargs).encode('utf-8')); - kwargs_hash = kwargs_hasher.hexdigest() - kwargs_hasher = None + options_hasher = hashlib.md5() + options_hasher.update(str(defines).encode('utf-8') + str(compile_args).encode('utf-8')); + options_hash = options_hasher.hexdigest() + options_hasher = None root, ext = os.path.splitext(kernel_filename) kernel_hash = root \ + "_" + CudaContext.hash_kernel( \ - os.path.join(self.module_path, kernel_filename), \ + kernel_path, \ include_dirs=[self.module_path] + include_dirs) \ 
- + "_" + kwargs_hash \ + + "_" + options_hash \ + ext cached_kernel_filename = os.path.join(self.cache_path, kernel_hash) @@ -210,7 +212,7 @@ class CudaContext(object): with io.open(cached_kernel_filename, "rb") as file: file_str = file.read() - module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler) + module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler, **jit_compile_args) kernel = module.get_function(kernel_function_name) kernel.prepare(prepared_call_args) @@ -223,7 +225,7 @@ class CudaContext(object): #Create kernel string kernel_string = "" - for key, value in kwargs.items(): + for key, value in defines.items(): kernel_string += "#define {:s} {:s}\n".format(str(key), str(value)) kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename)) if (self.use_cache): @@ -235,8 +237,11 @@ class CudaContext(object): with Common.Timer("compiler") as timer: - cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, no_extern_c=no_extern_c, cache_dir=False) - module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler) + import warnings + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="The CUDA compiler succeeded, but said the following:\nkernel.cu", category=UserWarning) + cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, cache_dir=False, **compile_args) + module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler, **jit_compile_args) if (self.use_cache): with io.open(cached_kernel_filename, "wb") as file: file.write(cubin) diff --git a/GPUSimulators/FORCE.py b/GPUSimulators/FORCE.py index 21ff961..e88224c 100644 --- a/GPUSimulators/FORCE.py +++ b/GPUSimulators/FORCE.py @@ -68,8 +68,15 @@ class FORCE (Simulator.BaseSimulator): #Get kernels self.kernel = context.get_prepared_kernel("cuda/SWE_FORCE.cu", "FORCEKernel", \ "iiffffPiPiPiPiPiPi", \ - 
BLOCK_WIDTH=self.local_size[0], \ - BLOCK_HEIGHT=self.local_size[1]) + defines={ + 'BLOCK_WIDTH': self.block_size[0], + 'BLOCK_HEIGHT': self.block_size[1] + }, \ + compile_args={ + 'no_extern_c': True, + 'options': ["--use_fast_math"], + }, \ + jit_compile_args={}) #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, \ @@ -85,7 +92,7 @@ class FORCE (Simulator.BaseSimulator): return super().simulateEuler(t_end) def stepEuler(self, dt): - self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ self.nx, self.ny, \ self.dx, self.dy, dt, \ self.g, \ diff --git a/GPUSimulators/HLL.py b/GPUSimulators/HLL.py index a764b40..bc77ff8 100644 --- a/GPUSimulators/HLL.py +++ b/GPUSimulators/HLL.py @@ -63,8 +63,15 @@ class HLL (Simulator.BaseSimulator): #Get kernels self.kernel = context.get_prepared_kernel("cuda/SWE_HLL.cu", "HLLKernel", \ "iiffffPiPiPiPiPiPi", \ - BLOCK_WIDTH=self.local_size[0], \ - BLOCK_HEIGHT=self.local_size[1]) + defines={ + 'BLOCK_WIDTH': self.block_size[0], + 'BLOCK_HEIGHT': self.block_size[1] + }, \ + compile_args={ + 'no_extern_c': True, + 'options': ["--use_fast_math"], + }, \ + jit_compile_args={}) #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, \ @@ -80,7 +87,7 @@ class HLL (Simulator.BaseSimulator): return super().simulateEuler(t_end) def stepEuler(self, dt): - self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ self.nx, self.ny, \ self.dx, self.dy, dt, \ self.g, \ diff --git a/GPUSimulators/HLL2.py b/GPUSimulators/HLL2.py index d0a146c..f4e141b 100644 --- a/GPUSimulators/HLL2.py +++ b/GPUSimulators/HLL2.py @@ -69,8 +69,15 @@ class HLL2 (Simulator.BaseSimulator): #Get kernels self.kernel = context.get_prepared_kernel("cuda/SWE_HLL2.cu", "HLL2Kernel", \ "iifffffiPiPiPiPiPiPi", \ - 
BLOCK_WIDTH=self.local_size[0], \ - BLOCK_HEIGHT=self.local_size[1]) + defines={ + 'BLOCK_WIDTH': self.block_size[0], + 'BLOCK_HEIGHT': self.block_size[1] + }, \ + compile_args={ + 'no_extern_c': True, + 'options': ["--use_fast_math"], + }, \ + jit_compile_args={}) #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, \ @@ -89,7 +96,7 @@ class HLL2 (Simulator.BaseSimulator): return self.stepDimsplitXY(dt) def stepDimsplitXY(self, dt): - self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ self.nx, self.ny, \ self.dx, self.dy, dt, \ self.g, \ @@ -105,7 +112,7 @@ class HLL2 (Simulator.BaseSimulator): self.t += dt def stepDimsplitYX(self, dt): - self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ self.nx, self.ny, \ self.dx, self.dy, dt, \ self.g, \ diff --git a/GPUSimulators/KP07.py b/GPUSimulators/KP07.py index f9d2f02..46ad8f4 100644 --- a/GPUSimulators/KP07.py +++ b/GPUSimulators/KP07.py @@ -70,8 +70,15 @@ class KP07 (Simulator.BaseSimulator): #Get kernels self.kernel = context.get_prepared_kernel("cuda/SWE_KP07.cu", "KP07Kernel", \ "iifffffiPiPiPiPiPiPi", \ - BLOCK_WIDTH=self.local_size[0], \ - BLOCK_HEIGHT=self.local_size[1]) + defines={ + 'BLOCK_WIDTH': self.block_size[0], + 'BLOCK_HEIGHT': self.block_size[1] + }, \ + compile_args={ + 'no_extern_c': True, + 'options': ["--use_fast_math"], + }, \ + jit_compile_args={}) #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, \ @@ -87,7 +94,7 @@ class KP07 (Simulator.BaseSimulator): return super().simulateRK(t_end, 2) def substepRK(self, dt, substep): - self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ self.nx, self.ny, \ self.dx, self.dy, 
dt, \ self.g, \ diff --git a/GPUSimulators/KP07_dimsplit.py b/GPUSimulators/KP07_dimsplit.py index 372c0fa..92e7fa5 100644 --- a/GPUSimulators/KP07_dimsplit.py +++ b/GPUSimulators/KP07_dimsplit.py @@ -70,8 +70,15 @@ class KP07_dimsplit (Simulator.BaseSimulator): #Get kernels self.kernel = context.get_prepared_kernel("cuda/SWE_KP07_dimsplit.cu", "KP07DimsplitKernel", \ "iifffffiPiPiPiPiPiPi", \ - BLOCK_WIDTH=self.local_size[0], \ - BLOCK_HEIGHT=self.local_size[1]) + defines={ + 'BLOCK_WIDTH': self.block_size[0], + 'BLOCK_HEIGHT': self.block_size[1] + }, \ + compile_args={ + 'no_extern_c': True, + 'options': ["--use_fast_math"], + }, \ + jit_compile_args={}) #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, \ @@ -90,7 +97,7 @@ class KP07_dimsplit (Simulator.BaseSimulator): return self.stepDimsplitXY(dt) def stepDimsplitXY(self, dt): - self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ self.nx, self.ny, \ self.dx, self.dy, dt, \ self.g, \ @@ -106,7 +113,7 @@ class KP07_dimsplit (Simulator.BaseSimulator): self.t += dt def stepDimsplitYX(self, dt): - self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ self.nx, self.ny, \ self.dx, self.dy, dt, \ self.g, \ diff --git a/GPUSimulators/LxF.py b/GPUSimulators/LxF.py index f48d6a9..33ab080 100644 --- a/GPUSimulators/LxF.py +++ b/GPUSimulators/LxF.py @@ -64,8 +64,15 @@ class LxF (Simulator.BaseSimulator): # Get kernels self.kernel = context.get_prepared_kernel("cuda/SWE_LxF.cu", "LxFKernel", \ "iiffffPiPiPiPiPiPi", \ - BLOCK_WIDTH=self.local_size[0], \ - BLOCK_HEIGHT=self.local_size[1]) + defines={ + 'BLOCK_WIDTH': self.block_size[0], + 'BLOCK_HEIGHT': self.block_size[1] + }, \ + compile_args={ + 'no_extern_c': True, + 'options': ["--use_fast_math"], + }, \ + jit_compile_args={}) 
#Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, \ @@ -81,7 +88,7 @@ class LxF (Simulator.BaseSimulator): return super().simulateEuler(t_end) def stepEuler(self, dt): - self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ self.nx, self.ny, \ self.dx, self.dy, dt, \ self.g, \ diff --git a/GPUSimulators/Simulator.py b/GPUSimulators/Simulator.py index 0006e83..7282375 100644 --- a/GPUSimulators/Simulator.py +++ b/GPUSimulators/Simulator.py @@ -55,20 +55,10 @@ class BaseSimulator: #Get logger self.logger = logging.getLogger(__name__ + "." + self.__class__.__name__) - self.context = context - - if (self.context.autotuner): - peak_configuration = self.context.autotuner.get_peak_performance(self.__class__) - block_width = int(peak_configuration["block_width"]) - block_height = int(peak_configuration["block_height"]) - self.logger.debug("Used autotuning to get block size [%d x %d]", block_width, block_height) - - #Create a CUDA stream - self.stream = cuda.Stream() - #Save input parameters #Notice that we need to specify them in the correct dataformat for the #GPU kernel + self.context = context self.nx = np.int32(nx) self.ny = np.int32(ny) self.dx = np.float32(dx) @@ -76,15 +66,25 @@ class BaseSimulator: self.dt = np.float32(dt) self.g = np.float32(g) + #Handle autotuning block size + if (self.context.autotuner): + peak_configuration = self.context.autotuner.get_peak_performance(self.__class__) + block_width = int(peak_configuration["block_width"]) + block_height = int(peak_configuration["block_height"]) + self.logger.debug("Used autotuning to get block size [%d x %d]", block_width, block_height) + + #Compute kernel launch parameters + self.block_size = (block_width, block_height, 1) + self.grid_size = ( \ + int(np.ceil(self.nx / float(self.block_size[0]))), \ + int(np.ceil(self.ny / float(self.block_size[1]))) \ + ) + + #Create a CUDA 
stream + self.stream = cuda.Stream() + #Keep track of simulation time self.t = 0.0; - - #Compute kernel launch parameters - self.local_size = (block_width, block_height, 1) - self.global_size = ( \ - int(np.ceil(self.nx / float(self.local_size[0]))), \ - int(np.ceil(self.ny / float(self.local_size[1]))) \ - ) def __str__(self): return "{:s} [{:d}x{:d}]".format(self.__class__.__name__, self.nx, self.ny) @@ -115,7 +115,7 @@ class BaseSimulator: # Step with forward Euler self.stepEuler(local_dt) - self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds (Euler)", self, t_end, self.t, n, t.secs) + self.logger.info("%s simulated %f seconds to %f with %d steps (Euler)", self, t_end, self.t, n) return self.t, n """ @@ -123,22 +123,21 @@ class BaseSimulator: Requires that the stepRK functionality is implemented in the subclasses """ def simulateRK(self, t_end, order): - with Common.Timer(self.__class__.__name__ + ".simulateRK") as t: - # Compute number of timesteps to perform - n = int(t_end / self.dt + 1) + # Compute number of timesteps to perform + n = int(t_end / self.dt + 1) + + for i in range(0, n): + # Compute timestep for "this" iteration + local_dt = np.float32(min(self.dt, t_end-i*self.dt)) - for i in range(0, n): - # Compute timestep for "this" iteration - local_dt = np.float32(min(self.dt, t_end-i*self.dt)) - - # Stop if end reached (should not happen) - if (local_dt <= 0.0): - break + # Stop if end reached (should not happen) + if (local_dt <= 0.0): + break + + # Perform all the Runge-Kutta substeps + self.stepRK(local_dt, order) - # Perform all the Runge-Kutta substeps - self.stepRK(local_dt, order) - - self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds (RK2)", self, t_end, self.t, n, t.secs) + self.logger.info("%s simulated %f seconds to %f with %d steps (RK2)", self, t_end, self.t, n) return self.t, n """ @@ -146,23 +145,22 @@ class BaseSimulator: Requires that the stepDimsplitX and stepDimsplitY functionality is 
implemented in the subclasses """ def simulateDimsplit(self, t_end): - with Common.Timer(self.__class__.__name__ + ".simulateDimsplit") as t: - # Compute number of timesteps to perform - n = int(t_end / (2.0*self.dt) + 1) + # Compute number of timesteps to perform + n = int(t_end / (2.0*self.dt) + 1) + + for i in range(0, n): + # Compute timestep for "this" iteration + local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt)) - for i in range(0, n): - # Compute timestep for "this" iteration - local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt)) - - # Stop if end reached (should not happen) - if (local_dt <= 0.0): - break - - # Perform the dimensional split substeps - self.stepDimsplitXY(local_dt) - self.stepDimsplitYX(local_dt) + # Stop if end reached (should not happen) + if (local_dt <= 0.0): + break - self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds (dimsplit)", self, t_end, self.t, 2*n, t.secs) + # Perform the dimensional split substeps + self.stepDimsplitXY(local_dt) + self.stepDimsplitYX(local_dt) + + self.logger.info("%s simulated %f seconds to %f with %d steps (dimsplit)", self, t_end, self.t, 2*n) return self.t, 2*n diff --git a/GPUSimulators/WAF.py b/GPUSimulators/WAF.py index a7bdd51..c22c7f6 100644 --- a/GPUSimulators/WAF.py +++ b/GPUSimulators/WAF.py @@ -63,8 +63,15 @@ class WAF (Simulator.BaseSimulator): #Get kernels self.kernel = context.get_prepared_kernel("cuda/SWE_WAF.cu", "WAFKernel", \ "iiffffiPiPiPiPiPiPi", \ - BLOCK_WIDTH=self.local_size[0], \ - BLOCK_HEIGHT=self.local_size[1]) + defines={ + 'BLOCK_WIDTH': self.block_size[0], + 'BLOCK_HEIGHT': self.block_size[1] + }, \ + compile_args={ + 'no_extern_c': True, + 'options': ["--use_fast_math"], + }, \ + jit_compile_args={}) #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, \ @@ -83,7 +90,7 @@ class WAF (Simulator.BaseSimulator): return self.stepDimsplitXY(dt) def stepDimsplitXY(self, dt): - 
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ self.nx, self.ny, \ self.dx, self.dy, dt, \ self.g, \ @@ -98,7 +105,7 @@ class WAF (Simulator.BaseSimulator): self.t += dt def stepDimsplitYX(self, dt): - self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ self.nx, self.ny, \ self.dx, self.dy, dt, \ self.g, \