From 71777dad4e2fdda9b282b0eb9ffff7cd146c23b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20R=2E=20Brodtkorb?= Date: Wed, 31 Oct 2018 10:45:48 +0100 Subject: [PATCH] Refactoring --- GPUSimulators/CudaContext.py | 29 ++++++----- GPUSimulators/FORCE.py | 13 +++-- GPUSimulators/HLL.py | 13 +++-- GPUSimulators/HLL2.py | 15 ++++-- GPUSimulators/KP07.py | 13 +++-- GPUSimulators/KP07_dimsplit.py | 15 ++++-- GPUSimulators/LxF.py | 13 +++-- GPUSimulators/Simulator.py | 94 +++++++++++++++++----------------- GPUSimulators/WAF.py | 15 ++++-- 9 files changed, 136 insertions(+), 84 deletions(-) diff --git a/GPUSimulators/CudaContext.py b/GPUSimulators/CudaContext.py index fd50864..113fdd8 100644 --- a/GPUSimulators/CudaContext.py +++ b/GPUSimulators/CudaContext.py @@ -170,8 +170,9 @@ class CudaContext(object): """ def get_prepared_kernel(self, kernel_filename, kernel_function_name, \ prepared_call_args, \ - include_dirs=[], no_extern_c=True, - **kwargs): + include_dirs=[], \ + defines={}, \ + compile_args={'no_extern_c': True}, jit_compile_args={}): """ Helper function to print compilation output """ self.logger.debug("Error: %s", error_str) kernel_filename = os.path.normpath(kernel_filename) + kernel_path = os.path.abspath(os.path.join(self.module_path, kernel_filename)) #self.logger.debug("Getting %s", kernel_filename) # Create a hash of the kernel (and its includes) - kwargs_hasher = hashlib.md5() - kwargs_hasher.update(str(kwargs).encode('utf-8')); - kwargs_hash = kwargs_hasher.hexdigest() - kwargs_hasher = None + options_hasher = hashlib.md5() + options_hasher.update(str(defines).encode('utf-8') + str(compile_args).encode('utf-8')); + options_hash = options_hasher.hexdigest() + options_hasher = None root, ext = os.path.splitext(kernel_filename) kernel_hash = root \ + "_" + CudaContext.hash_kernel( \ - os.path.join(self.module_path, kernel_filename), \ + kernel_path, \ include_dirs=[self.module_path] + include_dirs) \ 
- + "_" + kwargs_hash \ + + "_" + options_hash \ + ext cached_kernel_filename = os.path.join(self.cache_path, kernel_hash) @@ -210,7 +212,7 @@ class CudaContext(object): with io.open(cached_kernel_filename, "rb") as file: file_str = file.read() - module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler) + module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler, **jit_compile_args) kernel = module.get_function(kernel_function_name) kernel.prepare(prepared_call_args) @@ -223,7 +225,7 @@ class CudaContext(object): #Create kernel string kernel_string = "" - for key, value in kwargs.items(): + for key, value in defines.items(): kernel_string += "#define {:s} {:s}\n".format(str(key), str(value)) kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename)) if (self.use_cache): @@ -235,8 +237,11 @@ class CudaContext(object): with Common.Timer("compiler") as timer: - cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, no_extern_c=no_extern_c, cache_dir=False) - module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler) + import warnings + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="The CUDA compiler succeeded, but said the following:\nkernel.cu", category=UserWarning) + cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, cache_dir=False, **compile_args) + module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler, **jit_compile_args) if (self.use_cache): with io.open(cached_kernel_filename, "wb") as file: file.write(cubin) diff --git a/GPUSimulators/FORCE.py b/GPUSimulators/FORCE.py index 21ff961..e88224c 100644 --- a/GPUSimulators/FORCE.py +++ b/GPUSimulators/FORCE.py @@ -68,8 +68,15 @@ class FORCE (Simulator.BaseSimulator): #Get kernels self.kernel = context.get_prepared_kernel("cuda/SWE_FORCE.cu", "FORCEKernel", \ "iiffffPiPiPiPiPiPi", \ - 
BLOCK_WIDTH=self.local_size[0], \ - BLOCK_HEIGHT=self.local_size[1]) + defines={ + 'BLOCK_WIDTH': self.block_size[0], + 'BLOCK_HEIGHT': self.block_size[1] + }, \ + compile_args={ + 'no_extern_c': True, + 'options': ["--use_fast_math"], + }, \ + jit_compile_args={}) #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, \ @@ -85,7 +92,7 @@ class FORCE (Simulator.BaseSimulator): return super().simulateEuler(t_end) def stepEuler(self, dt): - self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ self.nx, self.ny, \ self.dx, self.dy, dt, \ self.g, \ diff --git a/GPUSimulators/HLL.py b/GPUSimulators/HLL.py index a764b40..bc77ff8 100644 --- a/GPUSimulators/HLL.py +++ b/GPUSimulators/HLL.py @@ -63,8 +63,15 @@ class HLL (Simulator.BaseSimulator): #Get kernels self.kernel = context.get_prepared_kernel("cuda/SWE_HLL.cu", "HLLKernel", \ "iiffffPiPiPiPiPiPi", \ - BLOCK_WIDTH=self.local_size[0], \ - BLOCK_HEIGHT=self.local_size[1]) + defines={ + 'BLOCK_WIDTH': self.block_size[0], + 'BLOCK_HEIGHT': self.block_size[1] + }, \ + compile_args={ + 'no_extern_c': True, + 'options': ["--use_fast_math"], + }, \ + jit_compile_args={}) #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, \ @@ -80,7 +87,7 @@ class HLL (Simulator.BaseSimulator): return super().simulateEuler(t_end) def stepEuler(self, dt): - self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ self.nx, self.ny, \ self.dx, self.dy, dt, \ self.g, \ diff --git a/GPUSimulators/HLL2.py b/GPUSimulators/HLL2.py index d0a146c..f4e141b 100644 --- a/GPUSimulators/HLL2.py +++ b/GPUSimulators/HLL2.py @@ -69,8 +69,15 @@ class HLL2 (Simulator.BaseSimulator): #Get kernels self.kernel = context.get_prepared_kernel("cuda/SWE_HLL2.cu", "HLL2Kernel", \ "iifffffiPiPiPiPiPiPi", \ - 
BLOCK_WIDTH=self.local_size[0], \ - BLOCK_HEIGHT=self.local_size[1]) + defines={ + 'BLOCK_WIDTH': self.block_size[0], + 'BLOCK_HEIGHT': self.block_size[1] + }, \ + compile_args={ + 'no_extern_c': True, + 'options': ["--use_fast_math"], + }, \ + jit_compile_args={}) #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, \ @@ -89,7 +96,7 @@ class HLL2 (Simulator.BaseSimulator): return self.stepDimsplitXY(dt) def stepDimsplitXY(self, dt): - self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ self.nx, self.ny, \ self.dx, self.dy, dt, \ self.g, \ @@ -105,7 +112,7 @@ class HLL2 (Simulator.BaseSimulator): self.t += dt def stepDimsplitYX(self, dt): - self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ self.nx, self.ny, \ self.dx, self.dy, dt, \ self.g, \ diff --git a/GPUSimulators/KP07.py b/GPUSimulators/KP07.py index f9d2f02..46ad8f4 100644 --- a/GPUSimulators/KP07.py +++ b/GPUSimulators/KP07.py @@ -70,8 +70,15 @@ class KP07 (Simulator.BaseSimulator): #Get kernels self.kernel = context.get_prepared_kernel("cuda/SWE_KP07.cu", "KP07Kernel", \ "iifffffiPiPiPiPiPiPi", \ - BLOCK_WIDTH=self.local_size[0], \ - BLOCK_HEIGHT=self.local_size[1]) + defines={ + 'BLOCK_WIDTH': self.block_size[0], + 'BLOCK_HEIGHT': self.block_size[1] + }, \ + compile_args={ + 'no_extern_c': True, + 'options': ["--use_fast_math"], + }, \ + jit_compile_args={}) #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, \ @@ -87,7 +94,7 @@ class KP07 (Simulator.BaseSimulator): return super().simulateRK(t_end, 2) def substepRK(self, dt, substep): - self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ self.nx, self.ny, \ self.dx, self.dy, 
dt, \ self.g, \ diff --git a/GPUSimulators/KP07_dimsplit.py b/GPUSimulators/KP07_dimsplit.py index 372c0fa..92e7fa5 100644 --- a/GPUSimulators/KP07_dimsplit.py +++ b/GPUSimulators/KP07_dimsplit.py @@ -70,8 +70,15 @@ class KP07_dimsplit (Simulator.BaseSimulator): #Get kernels self.kernel = context.get_prepared_kernel("cuda/SWE_KP07_dimsplit.cu", "KP07DimsplitKernel", \ "iifffffiPiPiPiPiPiPi", \ - BLOCK_WIDTH=self.local_size[0], \ - BLOCK_HEIGHT=self.local_size[1]) + defines={ + 'BLOCK_WIDTH': self.block_size[0], + 'BLOCK_HEIGHT': self.block_size[1] + }, \ + compile_args={ + 'no_extern_c': True, + 'options': ["--use_fast_math"], + }, \ + jit_compile_args={}) #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, \ @@ -90,7 +97,7 @@ class KP07_dimsplit (Simulator.BaseSimulator): return self.stepDimsplitXY(dt) def stepDimsplitXY(self, dt): - self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ self.nx, self.ny, \ self.dx, self.dy, dt, \ self.g, \ @@ -106,7 +113,7 @@ class KP07_dimsplit (Simulator.BaseSimulator): self.t += dt def stepDimsplitYX(self, dt): - self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ self.nx, self.ny, \ self.dx, self.dy, dt, \ self.g, \ diff --git a/GPUSimulators/LxF.py b/GPUSimulators/LxF.py index f48d6a9..33ab080 100644 --- a/GPUSimulators/LxF.py +++ b/GPUSimulators/LxF.py @@ -64,8 +64,15 @@ class LxF (Simulator.BaseSimulator): # Get kernels self.kernel = context.get_prepared_kernel("cuda/SWE_LxF.cu", "LxFKernel", \ "iiffffPiPiPiPiPiPi", \ - BLOCK_WIDTH=self.local_size[0], \ - BLOCK_HEIGHT=self.local_size[1]) + defines={ + 'BLOCK_WIDTH': self.block_size[0], + 'BLOCK_HEIGHT': self.block_size[1] + }, \ + compile_args={ + 'no_extern_c': True, + 'options': ["--use_fast_math"], + }, \ + jit_compile_args={}) 
#Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, \ @@ -81,7 +88,7 @@ class LxF (Simulator.BaseSimulator): return super().simulateEuler(t_end) def stepEuler(self, dt): - self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ self.nx, self.ny, \ self.dx, self.dy, dt, \ self.g, \ diff --git a/GPUSimulators/Simulator.py b/GPUSimulators/Simulator.py index 0006e83..7282375 100644 --- a/GPUSimulators/Simulator.py +++ b/GPUSimulators/Simulator.py @@ -55,20 +55,10 @@ class BaseSimulator: #Get logger self.logger = logging.getLogger(__name__ + "." + self.__class__.__name__) - self.context = context - - if (self.context.autotuner): - peak_configuration = self.context.autotuner.get_peak_performance(self.__class__) - block_width = int(peak_configuration["block_width"]) - block_height = int(peak_configuration["block_height"]) - self.logger.debug("Used autotuning to get block size [%d x %d]", block_width, block_height) - - #Create a CUDA stream - self.stream = cuda.Stream() - #Save input parameters #Notice that we need to specify them in the correct dataformat for the #GPU kernel + self.context = context self.nx = np.int32(nx) self.ny = np.int32(ny) self.dx = np.float32(dx) @@ -76,15 +66,25 @@ class BaseSimulator: self.dt = np.float32(dt) self.g = np.float32(g) + #Handle autotuning block size + if (self.context.autotuner): + peak_configuration = self.context.autotuner.get_peak_performance(self.__class__) + block_width = int(peak_configuration["block_width"]) + block_height = int(peak_configuration["block_height"]) + self.logger.debug("Used autotuning to get block size [%d x %d]", block_width, block_height) + + #Compute kernel launch parameters + self.block_size = (block_width, block_height, 1) + self.grid_size = ( \ + int(np.ceil(self.nx / float(self.block_size[0]))), \ + int(np.ceil(self.ny / float(self.block_size[1]))) \ + ) + + #Create a CUDA 
stream + self.stream = cuda.Stream() + #Keep track of simulation time self.t = 0.0; - - #Compute kernel launch parameters - self.local_size = (block_width, block_height, 1) - self.global_size = ( \ - int(np.ceil(self.nx / float(self.local_size[0]))), \ - int(np.ceil(self.ny / float(self.local_size[1]))) \ - ) def __str__(self): return "{:s} [{:d}x{:d}]".format(self.__class__.__name__, self.nx, self.ny) @@ -115,7 +115,7 @@ class BaseSimulator: # Step with forward Euler self.stepEuler(local_dt) - self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds (Euler)", self, t_end, self.t, n, t.secs) + self.logger.info("%s simulated %f seconds to %f with %d steps (Euler)", self, t_end, self.t, n) return self.t, n """ @@ -123,22 +123,21 @@ class BaseSimulator: Requires that the stepRK functionality is implemented in the subclasses """ def simulateRK(self, t_end, order): - with Common.Timer(self.__class__.__name__ + ".simulateRK") as t: - # Compute number of timesteps to perform - n = int(t_end / self.dt + 1) + # Compute number of timesteps to perform + n = int(t_end / self.dt + 1) + + for i in range(0, n): + # Compute timestep for "this" iteration + local_dt = np.float32(min(self.dt, t_end-i*self.dt)) - for i in range(0, n): - # Compute timestep for "this" iteration - local_dt = np.float32(min(self.dt, t_end-i*self.dt)) - - # Stop if end reached (should not happen) - if (local_dt <= 0.0): - break + # Stop if end reached (should not happen) + if (local_dt <= 0.0): + break + + # Perform all the Runge-Kutta substeps + self.stepRK(local_dt, order) - # Perform all the Runge-Kutta substeps - self.stepRK(local_dt, order) - - self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds (RK2)", self, t_end, self.t, n, t.secs) + self.logger.info("%s simulated %f seconds to %f with %d steps (RK2)", self, t_end, self.t, n) return self.t, n """ @@ -146,23 +145,22 @@ class BaseSimulator: Requires that the stepDimsplitX and stepDimsplitY functionality is 
implemented in the subclasses """ def simulateDimsplit(self, t_end): - with Common.Timer(self.__class__.__name__ + ".simulateDimsplit") as t: - # Compute number of timesteps to perform - n = int(t_end / (2.0*self.dt) + 1) + # Compute number of timesteps to perform + n = int(t_end / (2.0*self.dt) + 1) + + for i in range(0, n): + # Compute timestep for "this" iteration + local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt)) - for i in range(0, n): - # Compute timestep for "this" iteration - local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt)) - - # Stop if end reached (should not happen) - if (local_dt <= 0.0): - break - - # Perform the dimensional split substeps - self.stepDimsplitXY(local_dt) - self.stepDimsplitYX(local_dt) + # Stop if end reached (should not happen) + if (local_dt <= 0.0): + break - self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds (dimsplit)", self, t_end, self.t, 2*n, t.secs) + # Perform the dimensional split substeps + self.stepDimsplitXY(local_dt) + self.stepDimsplitYX(local_dt) + + self.logger.info("%s simulated %f seconds to %f with %d steps (dimsplit)", self, t_end, self.t, 2*n) return self.t, 2*n diff --git a/GPUSimulators/WAF.py b/GPUSimulators/WAF.py index a7bdd51..c22c7f6 100644 --- a/GPUSimulators/WAF.py +++ b/GPUSimulators/WAF.py @@ -63,8 +63,15 @@ class WAF (Simulator.BaseSimulator): #Get kernels self.kernel = context.get_prepared_kernel("cuda/SWE_WAF.cu", "WAFKernel", \ "iiffffiPiPiPiPiPiPi", \ - BLOCK_WIDTH=self.local_size[0], \ - BLOCK_HEIGHT=self.local_size[1]) + defines={ + 'BLOCK_WIDTH': self.block_size[0], + 'BLOCK_HEIGHT': self.block_size[1] + }, \ + compile_args={ + 'no_extern_c': True, + 'options': ["--use_fast_math"], + }, \ + jit_compile_args={}) #Create data by uploading to device self.u0 = Common.ArakawaA2D(self.stream, \ @@ -83,7 +90,7 @@ class WAF (Simulator.BaseSimulator): return self.stepDimsplitXY(dt) def stepDimsplitXY(self, dt): - 
self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ self.nx, self.ny, \ self.dx, self.dy, dt, \ self.g, \ @@ -98,7 +105,7 @@ class WAF (Simulator.BaseSimulator): self.t += dt def stepDimsplitYX(self, dt): - self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \ + self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ self.nx, self.ny, \ self.dx, self.dy, dt, \ self.g, \