Refactoring

2026-01-14 15:48:43 +01:00 · 2018-10-31 10:45:48 +01:00
parent e434b4e02a
commit 71777dad4e
9 changed files with 136 additions and 84 deletions
--- a/GPUSimulators/CudaContext.py
+++ b/GPUSimulators/CudaContext.py
@@ -170,8 +170,9 @@ class CudaContext(object):
    """
    def get_prepared_kernel(self, kernel_filename, kernel_function_name, \
                    prepared_call_args, \
-                    include_dirs=[], no_extern_c=True, 
-                    **kwargs):
+                    include_dirs=[], \
+                    defines={}, \
+                    compile_args={'no_extern_c': True}, jit_compile_args={}):
        """
        Helper function to print compilation output
        """
@@ -183,19 +184,20 @@ class CudaContext(object):
                self.logger.debug("Error: %s", error_str)
        
        kernel_filename = os.path.normpath(kernel_filename)
+        kernel_path = os.path.abspath(os.path.join(self.module_path, kernel_filename))
        #self.logger.debug("Getting %s", kernel_filename)
            
        # Create a hash of the kernel (and its includes)
-        kwargs_hasher = hashlib.md5()
-        kwargs_hasher.update(str(kwargs).encode('utf-8'));
-        kwargs_hash = kwargs_hasher.hexdigest()
-        kwargs_hasher = None
+        options_hasher = hashlib.md5()
+        options_hasher.update(str(defines).encode('utf-8') + str(compile_args).encode('utf-8'));
+        options_hash = options_hasher.hexdigest()
+        options_hasher = None
        root, ext = os.path.splitext(kernel_filename)
        kernel_hash = root \
                + "_" + CudaContext.hash_kernel( \
-                    os.path.join(self.module_path, kernel_filename), \
+                    kernel_path, \
                    include_dirs=[self.module_path] + include_dirs) \
-                + "_" + kwargs_hash \
+                + "_" + options_hash \
                + ext
        cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)
        
@@ -210,7 +212,7 @@ class CudaContext(object):
                
            with io.open(cached_kernel_filename, "rb") as file:
                file_str = file.read()
-                module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler)
+                module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler, **jit_compile_args)
                
            kernel = module.get_function(kernel_function_name)
            kernel.prepare(prepared_call_args)
@@ -223,7 +225,7 @@ class CudaContext(object):
                
            #Create kernel string
            kernel_string = ""
-            for key, value in kwargs.items():
+            for key, value in defines.items():
                kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
            kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename))
            if (self.use_cache):
@@ -235,8 +237,11 @@ class CudaContext(object):
                
            
            with Common.Timer("compiler") as timer:
-                cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, no_extern_c=no_extern_c, cache_dir=False)
-                module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler)
+                import warnings
+                with warnings.catch_warnings():
+                    warnings.filterwarnings("ignore", message="The CUDA compiler succeeded, but said the following:\nkernel.cu", category=UserWarning)
+                    cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, cache_dir=False, **compile_args)
+                module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler, **jit_compile_args)
                if (self.use_cache):
                    with io.open(cached_kernel_filename, "wb") as file:
                        file.write(cubin)
--- a/GPUSimulators/FORCE.py
+++ b/GPUSimulators/FORCE.py
@@ -68,8 +68,15 @@ class FORCE (Simulator.BaseSimulator):
        #Get kernels
        self.kernel = context.get_prepared_kernel("cuda/SWE_FORCE.cu", "FORCEKernel", \
                                        "iiffffPiPiPiPiPiPi", \
-                                        BLOCK_WIDTH=self.local_size[0], \
-                                        BLOCK_HEIGHT=self.local_size[1])
+                                        defines={
+                                            'BLOCK_WIDTH': self.block_size[0], 
+                                            'BLOCK_HEIGHT': self.block_size[1]
+                                        }, \
+                                        compile_args={
+                                            'no_extern_c': True,
+                                            'options': ["--use_fast_math"], 
+                                        }, \
+                                        jit_compile_args={})
    
        #Create data by uploading to device
        self.u0 = Common.ArakawaA2D(self.stream, \
@@ -85,7 +92,7 @@ class FORCE (Simulator.BaseSimulator):
        return super().simulateEuler(t_end)
        
    def stepEuler(self, dt):
-        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
                self.nx, self.ny, \
                self.dx, self.dy, dt, \
                self.g, \
--- a/GPUSimulators/HLL.py
+++ b/GPUSimulators/HLL.py
@@ -63,8 +63,15 @@ class HLL (Simulator.BaseSimulator):
        #Get kernels
        self.kernel = context.get_prepared_kernel("cuda/SWE_HLL.cu", "HLLKernel", \
                                        "iiffffPiPiPiPiPiPi", \
-                                        BLOCK_WIDTH=self.local_size[0], \
-                                        BLOCK_HEIGHT=self.local_size[1])
+                                        defines={
+                                            'BLOCK_WIDTH': self.block_size[0], 
+                                            'BLOCK_HEIGHT': self.block_size[1]
+                                        }, \
+                                        compile_args={
+                                            'no_extern_c': True,
+                                            'options': ["--use_fast_math"], 
+                                        }, \
+                                        jit_compile_args={})
    
        #Create data by uploading to device
        self.u0 = Common.ArakawaA2D(self.stream, \
@@ -80,7 +87,7 @@ class HLL (Simulator.BaseSimulator):
        return super().simulateEuler(t_end)
        
    def stepEuler(self, dt):
-        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
                self.nx, self.ny, \
                self.dx, self.dy, dt, \
                self.g, \
--- a/GPUSimulators/HLL2.py
+++ b/GPUSimulators/HLL2.py
@@ -69,8 +69,15 @@ class HLL2 (Simulator.BaseSimulator):
        #Get kernels
        self.kernel = context.get_prepared_kernel("cuda/SWE_HLL2.cu", "HLL2Kernel", \
                                        "iifffffiPiPiPiPiPiPi", \
-                                        BLOCK_WIDTH=self.local_size[0], \
-                                        BLOCK_HEIGHT=self.local_size[1])
+                                        defines={
+                                            'BLOCK_WIDTH': self.block_size[0], 
+                                            'BLOCK_HEIGHT': self.block_size[1]
+                                        }, \
+                                        compile_args={
+                                            'no_extern_c': True,
+                                            'options': ["--use_fast_math"], 
+                                        }, \
+                                        jit_compile_args={})
        
        #Create data by uploading to device
        self.u0 = Common.ArakawaA2D(self.stream, \
@@ -89,7 +96,7 @@ class HLL2 (Simulator.BaseSimulator):
        return self.stepDimsplitXY(dt)
                
    def stepDimsplitXY(self, dt):
-        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
                self.nx, self.ny, \
                self.dx, self.dy, dt, \
                self.g, \
@@ -105,7 +112,7 @@ class HLL2 (Simulator.BaseSimulator):
        self.t += dt
            
    def stepDimsplitYX(self, dt):
-        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
                self.nx, self.ny, \
                self.dx, self.dy, dt, \
                self.g, \
--- a/GPUSimulators/KP07.py
+++ b/GPUSimulators/KP07.py
@@ -70,8 +70,15 @@ class KP07 (Simulator.BaseSimulator):
        #Get kernels
        self.kernel = context.get_prepared_kernel("cuda/SWE_KP07.cu", "KP07Kernel", \
                                        "iifffffiPiPiPiPiPiPi", \
-                                        BLOCK_WIDTH=self.local_size[0], \
-                                        BLOCK_HEIGHT=self.local_size[1])
+                                        defines={
+                                            'BLOCK_WIDTH': self.block_size[0], 
+                                            'BLOCK_HEIGHT': self.block_size[1]
+                                        }, \
+                                        compile_args={
+                                            'no_extern_c': True,
+                                            'options': ["--use_fast_math"], 
+                                        }, \
+                                        jit_compile_args={})
        
        #Create data by uploading to device
        self.u0 = Common.ArakawaA2D(self.stream, \
@@ -87,7 +94,7 @@ class KP07 (Simulator.BaseSimulator):
        return super().simulateRK(t_end, 2)
        
    def substepRK(self, dt, substep):
-        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
                self.nx, self.ny, \
                self.dx, self.dy, dt, \
                self.g, \
--- a/GPUSimulators/KP07_dimsplit.py
+++ b/GPUSimulators/KP07_dimsplit.py
@@ -70,8 +70,15 @@ class KP07_dimsplit (Simulator.BaseSimulator):
        #Get kernels
        self.kernel = context.get_prepared_kernel("cuda/SWE_KP07_dimsplit.cu", "KP07DimsplitKernel", \
                                        "iifffffiPiPiPiPiPiPi", \
-                                        BLOCK_WIDTH=self.local_size[0], \
-                                        BLOCK_HEIGHT=self.local_size[1])
+                                        defines={
+                                            'BLOCK_WIDTH': self.block_size[0], 
+                                            'BLOCK_HEIGHT': self.block_size[1]
+                                        }, \
+                                        compile_args={
+                                            'no_extern_c': True,
+                                            'options': ["--use_fast_math"], 
+                                        }, \
+                                        jit_compile_args={})
    
        #Create data by uploading to device
        self.u0 = Common.ArakawaA2D(self.stream, \
@@ -90,7 +97,7 @@ class KP07_dimsplit (Simulator.BaseSimulator):
        return self.stepDimsplitXY(dt)
    
    def stepDimsplitXY(self, dt):
-        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
                self.nx, self.ny, \
                self.dx, self.dy, dt, \
                self.g, \
@@ -106,7 +113,7 @@ class KP07_dimsplit (Simulator.BaseSimulator):
        self.t += dt
    
    def stepDimsplitYX(self, dt):
-        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
                self.nx, self.ny, \
                self.dx, self.dy, dt, \
                self.g, \
--- a/GPUSimulators/LxF.py
+++ b/GPUSimulators/LxF.py
@@ -64,8 +64,15 @@ class LxF (Simulator.BaseSimulator):
        # Get kernels
        self.kernel = context.get_prepared_kernel("cuda/SWE_LxF.cu", "LxFKernel", \
                                        "iiffffPiPiPiPiPiPi", \
-                                        BLOCK_WIDTH=self.local_size[0], \
-                                        BLOCK_HEIGHT=self.local_size[1])
+                                        defines={
+                                            'BLOCK_WIDTH': self.block_size[0], 
+                                            'BLOCK_HEIGHT': self.block_size[1]
+                                        }, \
+                                        compile_args={
+                                            'no_extern_c': True,
+                                            'options': ["--use_fast_math"], 
+                                        }, \
+                                        jit_compile_args={})

        #Create data by uploading to device
        self.u0 = Common.ArakawaA2D(self.stream, \
@@ -81,7 +88,7 @@ class LxF (Simulator.BaseSimulator):
        return super().simulateEuler(t_end)
        
    def stepEuler(self, dt):
-        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
                self.nx, self.ny, \
                self.dx, self.dy, dt, \
                self.g, \
--- a/GPUSimulators/Simulator.py
+++ b/GPUSimulators/Simulator.py
@@ -55,20 +55,10 @@ class BaseSimulator:
        #Get logger
        self.logger = logging.getLogger(__name__ + "." + self.__class__.__name__)
        
-        self.context = context
-        
-        if (self.context.autotuner):
-            peak_configuration = self.context.autotuner.get_peak_performance(self.__class__)
-            block_width = int(peak_configuration["block_width"])
-            block_height = int(peak_configuration["block_height"])
-            self.logger.debug("Used autotuning to get block size [%d x %d]", block_width, block_height)
-        
-        #Create a CUDA stream
-        self.stream = cuda.Stream()
-                           
        #Save input parameters
        #Notice that we need to specify them in the correct dataformat for the
        #GPU kernel
+        self.context = context
        self.nx = np.int32(nx)
        self.ny = np.int32(ny)
        self.dx = np.float32(dx)
@@ -76,16 +66,26 @@ class BaseSimulator:
        self.dt = np.float32(dt)
        self.g = np.float32(g) 
        
-        #Keep track of simulation time
-        self.t = 0.0;
+        #Handle autotuning block size
+        if (self.context.autotuner):
+            peak_configuration = self.context.autotuner.get_peak_performance(self.__class__)
+            block_width = int(peak_configuration["block_width"])
+            block_height = int(peak_configuration["block_height"])
+            self.logger.debug("Used autotuning to get block size [%d x %d]", block_width, block_height)
        
        #Compute kernel launch parameters
-        self.local_size = (block_width, block_height, 1) 
-        self.global_size = ( \
-                       int(np.ceil(self.nx / float(self.local_size[0]))), \
-                       int(np.ceil(self.ny / float(self.local_size[1]))) \
+        self.block_size = (block_width, block_height, 1) 
+        self.grid_size = ( \
+                       int(np.ceil(self.nx / float(self.block_size[0]))), \
+                       int(np.ceil(self.ny / float(self.block_size[1]))) \
                      )
        
+        #Create a CUDA stream
+        self.stream = cuda.Stream()
+        
+        #Keep track of simulation time
+        self.t = 0.0;
+                      
    def __str__(self):
        return "{:s} [{:d}x{:d}]".format(self.__class__.__name__, self.nx, self.ny)
                      
@@ -115,7 +115,7 @@ class BaseSimulator:
                # Step with forward Euler 
                self.stepEuler(local_dt)
            
-        self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds (Euler)", self, t_end, self.t, n, t.secs)
+        self.logger.info("%s simulated %f seconds to %f with %d steps (Euler)", self, t_end, self.t, n)
        return self.t, n
        
    """
@@ -123,22 +123,21 @@ class BaseSimulator:
    Requires that the stepRK functionality is implemented in the subclasses
    """
    def simulateRK(self, t_end, order):
-        with Common.Timer(self.__class__.__name__ + ".simulateRK") as t:
-            # Compute number of timesteps to perform
-            n = int(t_end / self.dt + 1)
+        # Compute number of timesteps to perform
+        n = int(t_end / self.dt + 1)
        
-            for i in range(0, n):
-                # Compute timestep for "this" iteration
-                local_dt = np.float32(min(self.dt, t_end-i*self.dt))
+        for i in range(0, n):
+            # Compute timestep for "this" iteration
+            local_dt = np.float32(min(self.dt, t_end-i*self.dt))
            
-                # Stop if end reached (should not happen)
-                if (local_dt <= 0.0):
-                    break
+            # Stop if end reached (should not happen)
+            if (local_dt <= 0.0):
+                break
        
-                # Perform all the Runge-Kutta substeps
-                self.stepRK(local_dt, order)
+            # Perform all the Runge-Kutta substeps
+            self.stepRK(local_dt, order)
            
-        self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds (RK2)", self, t_end, self.t, n, t.secs)
+        self.logger.info("%s simulated %f seconds to %f with %d steps (RK2)", self, t_end, self.t, n)
        return self.t, n
        
    """
@@ -146,23 +145,22 @@ class BaseSimulator:
    Requires that the stepDimsplitX and stepDimsplitY functionality is implemented in the subclasses
    """
    def simulateDimsplit(self, t_end):
-        with Common.Timer(self.__class__.__name__ + ".simulateDimsplit") as t:
-            # Compute number of timesteps to perform
-            n = int(t_end / (2.0*self.dt) + 1)
+        # Compute number of timesteps to perform
+        n = int(t_end / (2.0*self.dt) + 1)
        
-            for i in range(0, n):
-                # Compute timestep for "this" iteration
-                local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt))
+        for i in range(0, n):
+            # Compute timestep for "this" iteration
+            local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt))
            
-                # Stop if end reached (should not happen)
-                if (local_dt <= 0.0):
-                    break
+            # Stop if end reached (should not happen)
+            if (local_dt <= 0.0):
+                break
            
-                # Perform the dimensional split substeps
-                self.stepDimsplitXY(local_dt)
-                self.stepDimsplitYX(local_dt)
+            # Perform the dimensional split substeps
+            self.stepDimsplitXY(local_dt)
+            self.stepDimsplitYX(local_dt)
            
-        self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds (dimsplit)", self, t_end, self.t, 2*n, t.secs)
+        self.logger.info("%s simulated %f seconds to %f with %d steps (dimsplit)", self, t_end, self.t, 2*n)
        return self.t, 2*n
        
    
--- a/GPUSimulators/WAF.py
+++ b/GPUSimulators/WAF.py
@@ -63,8 +63,15 @@ class WAF (Simulator.BaseSimulator):
        #Get kernels
        self.kernel = context.get_prepared_kernel("cuda/SWE_WAF.cu", "WAFKernel", \
                                        "iiffffiPiPiPiPiPiPi", \
-                                        BLOCK_WIDTH=self.local_size[0], \
-                                        BLOCK_HEIGHT=self.local_size[1])
+                                        defines={
+                                            'BLOCK_WIDTH': self.block_size[0], 
+                                            'BLOCK_HEIGHT': self.block_size[1]
+                                        }, \
+                                        compile_args={
+                                            'no_extern_c': True,
+                                            'options': ["--use_fast_math"], 
+                                        }, \
+                                        jit_compile_args={})
    
        #Create data by uploading to device
        self.u0 = Common.ArakawaA2D(self.stream, \
@@ -83,7 +90,7 @@ class WAF (Simulator.BaseSimulator):
        return self.stepDimsplitXY(dt)
        
    def stepDimsplitXY(self, dt):
-        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
                self.nx, self.ny, \
                self.dx, self.dy, dt, \
                self.g, \
@@ -98,7 +105,7 @@ class WAF (Simulator.BaseSimulator):
        self.t += dt
        
    def stepDimsplitYX(self, dt):
-        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
                self.nx, self.ny, \
                self.dx, self.dy, dt, \
                self.g, \