Working prototype of autotuning

This commit is contained in:
André R. Brodtkorb 2018-08-22 16:20:18 +02:00
parent f60ceaa316
commit 803ce8ab70
33 changed files with 992 additions and 976 deletions

File diff suppressed because one or more lines are too long (4 files)

GPUSimulators/Autotuner.py (new file, 277 lines)

@@ -0,0 +1,277 @@
# -*- coding: utf-8 -*-

"""
This python module implements the different helper functions and
classes

Copyright (C) 2018  SINTEF ICT

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import os
import gc
import numpy as np
import logging
from socket import gethostname

import pycuda.driver as cuda

#Simulator is needed for the issubclass check in get_peak_performance
from GPUSimulators import Common, Simulator, LxF, FORCE, HLL, HLL2, KP07, KP07_dimsplit, WAF


class Autotuner:
    def __init__(self,
                 nx=2048, ny=2048,
                 block_widths=range(8, 32, 2),
                 block_heights=range(8, 32, 2)):
        logger = logging.getLogger(__name__)
        self.filename = "autotuning_data_" + gethostname() + ".npz"
        self.nx = nx
        self.ny = ny
        self.block_widths = block_widths
        self.block_heights = block_heights
        self.performance = {}


    def benchmark(self, simulator, force=False):
        logger = logging.getLogger(__name__)

        #Run through simulators and benchmark
        key = str(simulator.__name__)
        logger.info("Benchmarking %s to %s", key, self.filename)

        #If this simulator has been benchmarked already, skip it
        if (force==False and os.path.isfile(self.filename)):
            with np.load(self.filename) as data:
                if key in data["simulators"]:
                    logger.info("%s already benchmarked - skipping", key)
                    return

        # Set arguments to send to the simulators during construction
        context = Common.CudaContext(autotuning=False)
        g = 9.81
        h0, hu0, hv0, dx, dy, dt = Autotuner.gen_test_data(nx=self.nx, ny=self.ny, g=g)
        arguments = {
            'context': context,
            'h0': h0, 'hu0': hu0, 'hv0': hv0,
            'nx': self.nx, 'ny': self.ny,
            'dx': dx, 'dy': dy, 'dt': 0.9*dt,
            'g': g
        }

        # Load existing data into memory
        benchmark_data = {
            "simulators": [],
        }
        if (os.path.isfile(self.filename)):
            with np.load(self.filename) as data:
                for k, v in data.items():
                    benchmark_data[k] = v

        # Run benchmark
        benchmark_data[key + "_megacells"] = Autotuner.benchmark_single_simulator(simulator, arguments, self.block_widths, self.block_heights)
        benchmark_data[key + "_block_widths"] = self.block_widths
        benchmark_data[key + "_block_heights"] = self.block_heights
        benchmark_data[key + "_arguments"] = str(arguments)

        existing_sims = benchmark_data["simulators"]
        if (isinstance(existing_sims, np.ndarray)):
            existing_sims = existing_sims.tolist()
        if (key not in existing_sims):
            benchmark_data["simulators"] = existing_sims + [key]

        # Save to file
        np.savez_compressed(self.filename, **benchmark_data)


    """
    Function which reads a numpy file with autotuning data
    and reports the maximum performance and block size
    """
    def get_peak_performance(self, simulator):
        logger = logging.getLogger(__name__)

        assert issubclass(simulator, Simulator.BaseSimulator)
        key = simulator.__name__

        if (key in self.performance):
            return self.performance[key]
        else:
            #Run simulation if required
            if (not os.path.isfile(self.filename)):
                logger.debug("Could not get autotuned peak performance for %s: benchmarking", key)
                self.benchmark(simulator)

            with np.load(self.filename) as data:
                if key not in data['simulators']:
                    logger.debug("Could not get autotuned peak performance for %s: benchmarking", key)
                    data.close()
                    self.benchmark(simulator)
                    data = np.load(self.filename)

                def find_max_index(megacells):
                    max_index = np.nanargmax(megacells)
                    return np.unravel_index(max_index, megacells.shape)

                megacells = data[key + '_megacells']
                block_widths = data[key + '_block_widths']
                block_heights = data[key + '_block_heights']
                j, i = find_max_index(megacells)

                self.performance[key] = { "block_width": block_widths[i],
                                          "block_height": block_heights[j],
                                          "megacells": megacells[j, i] }
                logger.debug("Returning %s as peak performance parameters", self.performance[key])
                return self.performance[key]

            #This should never happen
            raise RuntimeError("Something wrong: Could not get autotuning data!")
            return None


    """
    Runs a set of benchmarks for a single simulator
    """
    def benchmark_single_simulator(simulator, arguments, block_widths, block_heights):
        logger = logging.getLogger(__name__)

        megacells = np.empty((len(block_heights), len(block_widths)))
        megacells.fill(np.nan)

        logger.debug("Running %d benchmarks with %s", len(block_heights)*len(block_widths), simulator.__name__)

        sim_arguments = arguments.copy()

        with Common.Timer(simulator.__name__) as t:
            for j, block_height in enumerate(block_heights):
                sim_arguments.update({'block_height': block_height})
                for i, block_width in enumerate(block_widths):
                    sim_arguments.update({'block_width': block_width})
                    megacells[j, i] = Autotuner.run_benchmark(simulator, sim_arguments)

        logger.debug("Completed %s in %f seconds", simulator.__name__, t.secs)

        return megacells


    """
    Runs a benchmark, and returns the number of megacells achieved
    """
    def run_benchmark(simulator, arguments, timesteps=10, warmup_timesteps=2):
        logger = logging.getLogger(__name__)

        #Initialize simulator
        try:
            sim = simulator(**arguments)
        except:
            #An exception was raised - not possible to continue
            logger.debug("Failed creating %s with arguments %s", simulator.__name__, str(arguments))
            return np.nan

        #Create timer events
        start = cuda.Event()
        end = cuda.Event()

        #Warmup
        for i in range(warmup_timesteps):
            sim.stepEuler(sim.dt)

        #Run simulation with timer
        start.record(sim.stream)
        for i in range(timesteps):
            sim.stepEuler(sim.dt)
        end.record(sim.stream)

        #Synchronize end event
        end.synchronize()

        #Compute megacells
        gpu_elapsed = end.time_since(start)*1.0e-3
        megacells = (sim.nx*sim.ny*timesteps / (1000*1000)) / gpu_elapsed

        #Sanity check solution
        h, hu, hv = sim.download()
        sane = True
        sane = sane and Autotuner.sanity_check(h, 0.3, 0.7)
        sane = sane and Autotuner.sanity_check(hu, -0.2, 0.2)
        sane = sane and Autotuner.sanity_check(hv, -0.2, 0.2)

        if (sane):
            logger.debug("%s [%d x %d] succeeded: %f megacells, gpu elapsed %f", simulator.__name__, arguments["block_width"], arguments["block_height"], megacells, gpu_elapsed)
            return megacells
        else:
            logger.debug("%s [%d x %d] failed: gpu elapsed %f", simulator.__name__, arguments["block_width"], arguments["block_height"], gpu_elapsed)
            return np.nan


    """
    Generates test dataset
    """
    def gen_test_data(nx, ny, g):
        width = 100.0
        height = 100.0
        dx = width / float(nx)
        dy = height / float(ny)

        x_center = dx*nx/2.0
        y_center = dy*ny/2.0

        #Create a gaussian "dam break" that will not form shocks
        size = width / 5.0
        dt = 10**10

        h  = np.zeros((ny, nx), dtype=np.float32)
        hu = np.zeros((ny, nx), dtype=np.float32)
        hv = np.zeros((ny, nx), dtype=np.float32)

        x = dx*(np.arange(0, nx, dtype=np.float32)+0.5) - x_center
        y = dy*(np.arange(0, ny, dtype=np.float32)+0.5) - y_center
        xv, yv = np.meshgrid(x, y, sparse=False, indexing='xy')
        r = np.sqrt(xv**2 + yv**2)
        xv = None
        yv = None
        gc.collect()

        #Generate highres
        h = 0.5 + 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size)
        hu = 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size)
        hv = 0.1*0.5*(1.0 + np.cos(np.pi*r/size)) * (r < size)

        scale = 0.7
        max_h_estimate = 0.6
        max_u_estimate = 0.1*np.sqrt(2.0)
        dx = width/nx
        dy = height/ny
        dt = scale * min(dx, dy) / (max_u_estimate + np.sqrt(g*max_h_estimate))

        return h, hu, hv, dx, dy, dt


    """
    Checks that a variable is "sane"
    """
    def sanity_check(variable, bound_min, bound_max):
        maxval = np.amax(variable)
        minval = np.amin(variable)
        if (np.isnan(maxval)
                or np.isnan(minval)
                or maxval > bound_max
                or minval < bound_min):
            return False
        else:
            return True
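
Example (not part of the commit): a minimal sketch of how the Autotuner class above could be driven by hand, assuming a CUDA-capable GPU and that the GPUSimulators package is importable. The LxF scheme, the small domain and the coarse block-size ranges are hypothetical choices made only to keep the sweep short.

import logging
from GPUSimulators import Autotuner, LxF

logging.basicConfig(level=logging.DEBUG)

# Small domain and coarse block-size sweep so the benchmark finishes quickly
tuner = Autotuner.Autotuner(nx=512, ny=512,
                            block_widths=range(8, 32, 8),
                            block_heights=range(8, 32, 8))

# Benchmarks LxF for every block size and caches the result in
# autotuning_data_<hostname>.npz; skipped if already benchmarked
tuner.benchmark(LxF.LxF)

# Reports the best block size found for this GPU
peak = tuner.get_peak_performance(LxF.LxF)
print("Best block size: %d x %d (%.1f megacells/s)" %
      (peak["block_width"], peak["block_height"], peak["megacells"]))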

View File

@@ -34,7 +34,7 @@ import pycuda.compiler as cuda_compiler
 import pycuda.gpuarray
 import pycuda.driver as cuda
+from GPUSimulators import Autotuner
 
 """
 Class which keeps track of time spent for a section of code
@@ -64,8 +64,7 @@ Class which keeps track of the CUDA context and some helper functions
 """
 class CudaContext(object):
-    def __init__(self, verbose=True, blocking=False, use_cache=True):
-        self.verbose = verbose
+    def __init__(self, blocking=False, use_cache=True, autotuning=True):
         self.blocking = blocking
         self.use_cache = use_cache
         self.logger = logging.getLogger(__name__)
@@ -76,15 +75,16 @@ class CudaContext(object):
         #Initialize cuda (must be first call to PyCUDA)
         cuda.init(flags=0)
+        self.logger.info("PyCUDA version %s", str(pycuda.VERSION_TEXT))
 
         #Print some info about CUDA
         self.logger.info("CUDA version %s", str(cuda.get_version()))
         self.logger.info("Driver version %s", str(cuda.get_driver_version()))
 
         self.cuda_device = cuda.Device(0)
-        if (self.verbose):
-            self.logger.info("Using '%s' GPU", self.cuda_device.name())
-            self.logger.debug(" => compute capability: %s", str(self.cuda_device.compute_capability()))
-            self.logger.debug(" => memory: %d MB", self.cuda_device.total_memory() / (1024*1024))
+        self.logger.info("Using '%s' GPU", self.cuda_device.name())
+        self.logger.debug(" => compute capability: %s", str(self.cuda_device.compute_capability()))
+        self.logger.debug(" => memory: %d MB", self.cuda_device.total_memory() / (1024*1024))
 
         # Create the CUDA context
         if (self.blocking):
@@ -102,6 +102,11 @@ class CudaContext(object):
             os.mkdir(self.cache_path)
         self.logger.debug("Using CUDA cache dir %s", self.cache_path)
+
+        self.autotuner = None
+        if (autotuning):
+            self.logger.info("Autotuning enabled. It may take several minutes to run the code the first time: have patience")
+            self.autotuner = Autotuner.Autotuner()
 
     def __del__(self, *args):
         self.logger.info("Cleaning up CUDA context handle <%s>", str(self.cuda_context.handle))
@@ -131,7 +136,7 @@ class CudaContext(object):
         return "CudaContext id " + str(self.cuda_context.handle)
 
-    def hash_kernel(kernel_filename, include_dirs, verbose=False):
+    def hash_kernel(kernel_filename, include_dirs):
         # Generate a kernel ID for our caches
         num_includes = 0
         max_includes = 100
@@ -147,7 +152,7 @@ class CudaContext(object):
             filename = files.pop()
-            logger.debug("Hashing %s", filename)
+            #logger.debug("Hashing %s", filename)
             modified = os.path.getmtime(filename)
@@ -183,7 +188,7 @@ class CudaContext(object):
     """
     def get_prepared_kernel(self, kernel_filename, kernel_function_name, \
                     prepared_call_args, \
-                    include_dirs=[], verbose=False, no_extern_c=True,
+                    include_dirs=[], no_extern_c=True,
                     **kwargs):
         """
         Helper function to print compilation output
@@ -195,7 +200,7 @@ class CudaContext(object):
             if error_str:
                 self.logger.debug("Error: %s", error_str)
 
-        self.logger.debug("Getting %s", kernel_filename)
+        #self.logger.debug("Getting %s", kernel_filename)
 
         # Create a hash of the kernel (and its includes)
         kwargs_hasher = hashlib.md5()
@@ -206,8 +211,7 @@ class CudaContext(object):
         kernel_hash = root \
                 + "_" + CudaContext.hash_kernel( \
                     os.path.join(self.module_path, kernel_filename), \
-                    include_dirs=[self.module_path] + include_dirs, \
-                    verbose=verbose) \
+                    include_dirs=[self.module_path] + include_dirs) \
                 + "_" + kwargs_hash \
                 + ext
         cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)
@@ -272,61 +276,108 @@ class CudaContext(object):
     def synchronize(self):
         self.cuda_context.synchronize()
 
 """
 Class that holds data
 """
-class CUDAArray2D:
+class CudaArray2D:
     """
     Uploads initial data to the CL device
     """
-    def __init__(self, stream, nx, ny, halo_x, halo_y, data):
+    def __init__(self, stream, nx, ny, x_halo, y_halo, cpu_data):
         self.logger = logging.getLogger(__name__)
         self.nx = nx
         self.ny = ny
-        self.nx_halo = nx + 2*halo_x
-        self.ny_halo = ny + 2*halo_y
-        self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny)
+        self.x_halo = x_halo
+        self.y_halo = y_halo
+        nx_halo = nx + 2*x_halo
+        ny_halo = ny + 2*y_halo
+        #self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny)
 
         #Make sure data is in proper format
-        assert np.issubdtype(data.dtype, np.float32), "Wrong datatype: %s" % str(data.dtype)
-        assert not np.isfortran(data), "Wrong datatype (Fortran, expected C)"
-        assert data.shape == (self.ny_halo, self.nx_halo), "Wrong data shape: %s vs %s" % (str(data.shape), str((self.ny_halo, self.nx_halo)))
+        assert np.issubdtype(cpu_data.dtype, np.float32), "Wrong datatype: %s" % str(cpu_data.dtype)
+        assert cpu_data.itemsize == 4, "Wrong size of data type"
+        assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"
 
         #Upload data to the device
-        self.data = pycuda.gpuarray.to_gpu_async(data, stream=stream)
-        self.bytes_per_float = data.itemsize
-        assert(self.bytes_per_float == 4)
-        self.pitch = np.int32((self.nx_halo)*self.bytes_per_float)
-        self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
+        if (cpu_data.shape == (ny_halo, nx_halo)):
+            self.data = pycuda.gpuarray.to_gpu_async(cpu_data, stream=stream)
+        elif (cpu_data.shape == (self.ny, self.nx)):
+            #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
+            self.data = pycuda.gpuarray.empty((ny_halo, nx_halo), cpu_data.dtype)
+            #self.data.fill(0.0)
+
+            #Create copy object from host to device
+            copy = cuda.Memcpy2D()
+            copy.set_src_host(cpu_data)
+            copy.set_dst_device(self.data.gpudata)
+
+            #Set offsets and pitch of destination
+            copy.dst_x_in_bytes = self.x_halo*self.data.strides[1]
+            copy.dst_y = self.y_halo
+            copy.dst_pitch = self.data.strides[0]
+
+            #Set width in bytes to copy for each row and
+            #number of rows to copy
+            copy.width_in_bytes = self.nx*cpu_data.itemsize
+            copy.height = self.ny
+
+            #Perform the copy
+            copy(stream)
+            stream.synchronize()
+        else:
+            assert False, "Wrong data shape: %s vs %s / %s" % (str(cpu_data.shape), str((self.ny, self.nx)), str((ny_halo, nx_halo)))
+        #self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
 
     def __del__(self, *args):
-        self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
+        #self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
         self.data.gpudata.free()
         self.data = None
 
     """
-    Enables downloading data from CL device to Python
+    Enables downloading data from GPU to Python
     """
     def download(self, stream, async=False):
-        #Copy data from device to host
-        if (async):
-            self.logger.debug("Buffer <%s> [%dx%d]: Downloading async ", int(self.data.gpudata), self.nx, self.ny)
-            host_data = self.data.get_async(stream=stream)
-            return host_data
-        else:
-            self.logger.debug("Buffer <%s> [%dx%d]: Downloading synchronously", int(self.data.gpudata), self.nx, self.ny)
-            host_data = self.data.get(stream=stream)#, pagelocked=True) # pagelocked causes crash on windows at least
-            return host_data
+        #self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
+        #Allocate host memory
+        #cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)
+        cpu_data = np.empty((self.ny, self.nx), dtype=np.float32)
+
+        #Create copy object from device to host
+        copy = cuda.Memcpy2D()
+        copy.set_src_device(self.data.gpudata)
+        copy.set_dst_host(cpu_data)
+
+        #Set offsets and pitch of source
+        copy.src_x_in_bytes = self.x_halo*self.data.strides[1]
+        copy.src_y = self.y_halo
+        copy.src_pitch = self.data.strides[0]
+
+        #Set width in bytes to copy for each row and
+        #number of rows to copy
+        copy.width_in_bytes = self.nx*cpu_data.itemsize
+        copy.height = self.ny
+
+        copy(stream)
+        if async==False:
+            stream.synchronize()
+
+        return cpu_data
@@ -344,13 +395,13 @@ class SWEDataArakawaA:
     Uploads initial data to the CL device
     """
    def __init__(self, stream, nx, ny, halo_x, halo_y, h0, hu0, hv0):
-        self.h0 = CUDAArray2D(stream, nx, ny, halo_x, halo_y, h0)
-        self.hu0 = CUDAArray2D(stream, nx, ny, halo_x, halo_y, hu0)
-        self.hv0 = CUDAArray2D(stream, nx, ny, halo_x, halo_y, hv0)
-        self.h1 = CUDAArray2D(stream, nx, ny, halo_x, halo_y, h0)
-        self.hu1 = CUDAArray2D(stream, nx, ny, halo_x, halo_y, hu0)
-        self.hv1 = CUDAArray2D(stream, nx, ny, halo_x, halo_y, hv0)
+        self.h0 = CudaArray2D(stream, nx, ny, halo_x, halo_y, h0)
+        self.hu0 = CudaArray2D(stream, nx, ny, halo_x, halo_y, hu0)
+        self.hv0 = CudaArray2D(stream, nx, ny, halo_x, halo_y, hv0)
+        self.h1 = CudaArray2D(stream, nx, ny, halo_x, halo_y, h0)
+        self.hu1 = CudaArray2D(stream, nx, ny, halo_x, halo_y, hu0)
+        self.hv1 = CudaArray2D(stream, nx, ny, halo_x, halo_y, hv0)
 
     """
     Swaps the variables after a timestep has been completed
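
Aside (illustration only, not part of the diff): the halo bookkeeping done by the new Memcpy2D-based upload in CudaArray2D can be pictured with plain NumPy slicing on the host. The names nx, ny, x_halo and y_halo mirror the constructor arguments above; the concrete sizes are arbitrary.

import numpy as np

nx, ny, x_halo, y_halo = 4, 3, 2, 2
cpu_data = np.arange(nx*ny, dtype=np.float32).reshape(ny, nx)

# The device buffer has room for the halo on every side
full = np.zeros((ny + 2*y_halo, nx + 2*x_halo), dtype=np.float32)

# Memcpy2D copies nx*itemsize bytes per row for ny rows, starting at
# byte offset x_halo*itemsize within row y_halo; in NumPy terms:
full[y_halo:y_halo+ny, x_halo:x_halo+nx] = cpu_data

# The row pitch handed to the kernels is the byte stride of one row,
# which for a C-contiguous array is simply strides[0]
print(full.strides[0] == (nx + 2*x_halo) * cpu_data.itemsize)   # True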

View File

@@ -21,7 +21,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 """
 #Import packages we need
-from SWESimulators import Simulator
+from GPUSimulators import Simulator
@@ -70,8 +70,8 @@ class FORCE (Simulator.BaseSimulator):
         #Get kernels
         self.kernel = context.get_prepared_kernel("FORCE_kernel.cu", "FORCEKernel", \
                                         "iiffffPiPiPiPiPiPi", \
-                                        BLOCK_WIDTH=block_width, \
-                                        BLOCK_HEIGHT=block_height)
+                                        BLOCK_WIDTH=self.local_size[0], \
+                                        BLOCK_HEIGHT=self.local_size[1])
 
     def __str__(self):
         return "First order centered"
@@ -84,12 +84,12 @@ class FORCE (Simulator.BaseSimulator):
                 self.nx, self.ny, \
                 self.dx, self.dy, dt, \
                 self.g, \
-                self.data.h0.data.gpudata, self.data.h0.pitch, \
-                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                self.data.h1.data.gpudata, self.data.h1.pitch, \
-                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
         self.data.swap()
         self.t += dt
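
Note on the pitch argument (sketch, not part of the commit): the kernels now receive data.strides[0] instead of the hand-computed self.pitch attribute. For the row-major float32 gpuarray allocated by CudaArray2D the two are the same number of bytes, which a quick check like the following would confirm on a machine with PyCUDA and a GPU available; the sizes are made up.

import numpy as np
import pycuda.autoinit          # creates a throwaway CUDA context for this sketch
import pycuda.gpuarray as gpuarray

nx, ny, halo = 16, 8, 2
buf = gpuarray.zeros((ny + 2*halo, nx + 2*halo), dtype=np.float32)

old_style_pitch = np.int32((nx + 2*halo) * np.float32().itemsize)
print(buf.strides[0], old_style_pitch)   # both 80 bytes for this shape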

View File

@@ -20,7 +20,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 """
 #Import packages we need
-from SWESimulators import Simulator
+from GPUSimulators import Simulator
@@ -65,8 +65,8 @@ class HLL (Simulator.BaseSimulator):
         #Get kernels
         self.kernel = context.get_prepared_kernel("HLL_kernel.cu", "HLLKernel", \
                                         "iiffffPiPiPiPiPiPi", \
-                                        BLOCK_WIDTH=block_width, \
-                                        BLOCK_HEIGHT=block_height)
+                                        BLOCK_WIDTH=self.local_size[0], \
+                                        BLOCK_HEIGHT=self.local_size[1])
 
     def __str__(self):
         return "Harten-Lax-van Leer"
@@ -79,12 +79,12 @@ class HLL (Simulator.BaseSimulator):
                 self.nx, self.ny, \
                 self.dx, self.dy, dt, \
                 self.g, \
-                self.data.h0.data.gpudata, self.data.h0.pitch, \
-                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                self.data.h1.data.gpudata, self.data.h1.pitch, \
-                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
         self.data.swap()
         self.t += dt

View File

@@ -21,7 +21,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 #Import packages we need
 import numpy as np
-from SWESimulators import Simulator
+from GPUSimulators import Simulator
@@ -71,8 +71,8 @@ class HLL2 (Simulator.BaseSimulator):
         #Get kernels
         self.kernel = context.get_prepared_kernel("HLL2_kernel.cu", "HLL2Kernel", \
                                         "iifffffiPiPiPiPiPiPi", \
-                                        BLOCK_WIDTH=block_width, \
-                                        BLOCK_HEIGHT=block_height)
+                                        BLOCK_WIDTH=self.local_size[0], \
+                                        BLOCK_HEIGHT=self.local_size[1])
 
     def __str__(self):
         return "Harten-Lax-van Leer (2nd order)"
@@ -90,12 +90,12 @@ class HLL2 (Simulator.BaseSimulator):
                 self.g, \
                 self.theta, \
                 np.int32(0), \
-                self.data.h0.data.gpudata, self.data.h0.pitch, \
-                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                self.data.h1.data.gpudata, self.data.h1.pitch, \
-                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
         self.data.swap()
         self.t += dt
@@ -106,12 +106,12 @@ class HLL2 (Simulator.BaseSimulator):
                 self.g, \
                 self.theta, \
                 np.int32(1), \
-                self.data.h0.data.gpudata, self.data.h0.pitch, \
-                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                self.data.h1.data.gpudata, self.data.h1.pitch, \
-                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
         self.data.swap()
         self.t += dt

View File

@@ -21,13 +21,15 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 import logging
+from IPython.core import magic_arguments
 from IPython.core.magic import line_magic, Magics, magics_class
 import pycuda.driver as cuda
+from GPUSimulators import Common
 
 @magics_class
-class CudaContextHandler(Magics):
+class MyIPythonMagic(Magics):
     @line_magic
     def cuda_context_handler(self, context_name):
         self.logger = logging.getLogger(__name__)
@@ -39,7 +41,8 @@ class CudaContextHandler(Magics):
             return
         else:
             self.logger.debug("Creating context")
-            self.shell.ex(context_name + " = Common.CudaContext(verbose=True, blocking=False)")
+            #self.shell.ex(context_name + " = Common.CudaContext(blocking=False)")
+            self.shell.user_ns[context_name] = Common.CudaContext(blocking=False)
 
         # this function will be called on exceptions in any cell
         def custom_exc(shell, etype, evalue, tb, tb_offset=None):
@@ -51,11 +54,14 @@ class CudaContextHandler(Magics):
             if context_name in self.shell.user_ns.keys():
                 self.logger.info("Pushing <%s>", str(self.shell.user_ns[context_name].cuda_context.handle))
-                self.shell.ex(context_name + ".cuda_context.push()")
+                #self.shell.ex(context_name + ".cuda_context.push()")
+                self.shell.user_ns[context_name].cuda_context.push()
             else:
                 self.logger.error("No CUDA context called %s found (something is wrong)", context_name)
                 self.logger.error("CUDA will not work now")
+            self.logger.debug("==================================================================")
 
             # still show the error within the notebook, don't just swallow it
             shell.showtraceback((etype, evalue, tb), tb_offset=tb_offset)
@@ -71,10 +77,42 @@ class CudaContextHandler(Magics):
             context = cuda.Context.get_current()
             self.logger.info("`-> Popping <%s>", str(context.handle))
             cuda.Context.pop()
+            self.logger.debug("==================================================================")
 
         atexit.register(exitfunc)
+
+    @line_magic
+    @magic_arguments.magic_arguments()
+    @magic_arguments.argument(
+        '--out', '-o', type=str, default='output.log', help='The filename to store the log to')
+    @magic_arguments.argument(
+        '--level', '-l', type=int, default=20, help='The level of logging to screen [0, 50]')
+    @magic_arguments.argument(
+        '--file_level', '-f', type=int, default=10, help='The level of logging to file [0, 50]')
+    def setup_logging(self, line):
+        args = magic_arguments.parse_argstring(self.setup_logging, line)
+        import sys
+
+        #Get root logger
+        logger = logging.getLogger('')
+        logger.setLevel(min(args.level, args.file_level))
+
+        #Add log to screen
+        ch = logging.StreamHandler()
+        ch.setLevel(args.level)
+        logger.addHandler(ch)
+        logger.log(args.level, "Console logger using level %s", logging.getLevelName(args.level))
+
+        #Add log to file
+        logger.log(args.level, "File logger using level %s to %s", logging.getLevelName(args.file_level), args.out)
+        fh = logging.FileHandler(args.out)
+        formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s')
+        fh.setFormatter(formatter)
+        fh.setLevel(args.file_level)
+        logger.addHandler(fh)
+
+        logger.info("Python version %s", sys.version)
 
-logger = logging.getLogger(__name__)
-logger.info("Registering automatic CUDA context handling")
-logger.debug("(use %cuda_context_handler my_context to create a context called my_context")
 
 # Register
 ip = get_ipython()
-ip.register_magics(CudaContextHandler)
+ip.register_magics(MyIPythonMagic)
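
Example (not part of the commit): how the two line magics above would typically be used from a notebook, assuming IPythonMagic.py has been run or imported so that MyIPythonMagic is registered. The log filename, the levels and the context name are arbitrary; note that creating the context with default settings enables autotuning, so the first simulator created afterwards may spend several minutes benchmarking.

# Console logging at level 20 (INFO), file logging at level 10 (DEBUG)
%setup_logging --level 20 --file_level 10 --out autotuning_test.log

# Create (or reuse) a CudaContext bound to the name my_context; the magic also
# hooks context push/pop into the notebook's exception handler and exit handler
%cuda_context_handler my_context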

View File

@@ -26,7 +26,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 #Import packages we need
 import numpy as np
-from SWESimulators import Simulator
+from GPUSimulators import Simulator
@@ -72,8 +72,8 @@ class KP07 (Simulator.BaseSimulator):
         #Get kernels
         self.kernel = context.get_prepared_kernel("KP07_kernel.cu", "KP07Kernel", \
                                         "iifffffiPiPiPiPiPiPi", \
-                                        BLOCK_WIDTH=block_width, \
-                                        BLOCK_HEIGHT=block_height)
+                                        BLOCK_WIDTH=self.local_size[0], \
+                                        BLOCK_HEIGHT=self.local_size[1])
 
     def __str__(self):
         return "Kurganov-Petrova 2007"
@@ -88,12 +88,12 @@ class KP07 (Simulator.BaseSimulator):
                 self.g, \
                 self.theta, \
                 np.int32(substep), \
-                self.data.h0.data.gpudata, self.data.h0.pitch, \
-                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                self.data.h1.data.gpudata, self.data.h1.pitch, \
-                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
         self.data.swap()
 
     def stepEuler(self, dt):

View File

@@ -26,7 +26,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 #Import packages we need
 import numpy as np
-from SWESimulators import Simulator
+from GPUSimulators import Simulator
@@ -71,9 +71,9 @@ class KP07_dimsplit (Simulator.BaseSimulator):
         #Get kernels
         self.kernel = context.get_prepared_kernel("KP07_dimsplit_kernel.cu", "KP07DimsplitKernel", \
                                         "iifffffiPiPiPiPiPiPi", \
-                                        BLOCK_WIDTH=block_width, \
-                                        BLOCK_HEIGHT=block_height)
+                                        BLOCK_WIDTH=self.local_size[0], \
+                                        BLOCK_HEIGHT=self.local_size[1])
 
     def __str__(self):
         return "Kurganov-Petrova 2007 dimensionally split"
@@ -91,12 +91,12 @@ class KP07_dimsplit (Simulator.BaseSimulator):
                 self.g, \
                 self.theta, \
                 np.int32(0), \
-                self.data.h0.data.gpudata, self.data.h0.pitch, \
-                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                self.data.h1.data.gpudata, self.data.h1.pitch, \
-                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
         self.data.swap()
         self.t += dt
@@ -107,12 +107,12 @@ class KP07_dimsplit (Simulator.BaseSimulator):
                 self.g, \
                 self.theta, \
                 np.int32(1), \
-                self.data.h0.data.gpudata, self.data.h0.pitch, \
-                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                self.data.h1.data.gpudata, self.data.h1.pitch, \
-                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
         self.data.swap()
         self.t += dt

View File

@@ -21,7 +21,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 """
 #Import packages we need
-from SWESimulators import Simulator
+from GPUSimulators import Simulator
@@ -66,8 +66,8 @@ class LxF (Simulator.BaseSimulator):
         # Get kernels
         self.kernel = context.get_prepared_kernel("LxF_kernel.cu", "LxFKernel", \
                                         "iiffffPiPiPiPiPiPi", \
-                                        BLOCK_WIDTH=block_width, \
-                                        BLOCK_HEIGHT=block_height)
+                                        BLOCK_WIDTH=self.local_size[0], \
+                                        BLOCK_HEIGHT=self.local_size[1])
 
     def __str__(self):
         return "Lax Friedrichs"
@@ -80,12 +80,12 @@ class LxF (Simulator.BaseSimulator):
                 self.nx, self.ny, \
                 self.dx, self.dy, dt, \
                 self.g, \
-                self.data.h0.data.gpudata, self.data.h0.pitch, \
-                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                self.data.h1.data.gpudata, self.data.h1.pitch, \
-                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
         self.data.swap()
         self.t += dt

View File

@@ -28,7 +28,7 @@ import pycuda.compiler as cuda_compiler
 import pycuda.gpuarray
 import pycuda.driver as cuda
-from SWESimulators import Common
+from GPUSimulators import Common
 
 class BaseSimulator:
@@ -57,6 +57,14 @@ class BaseSimulator:
         #Get logger
         self.logger = logging.getLogger(__name__ + "." + self.__class__.__name__)
+
+        self.context = context
+        if (self.context.autotuner):
+            peak_configuration = self.context.autotuner.get_peak_performance(self.__class__)
+            block_width = int(peak_configuration["block_width"])
+            block_height = int(peak_configuration["block_height"])
+            self.logger.debug("Used autotuning to get block size [%d x %d]", block_width, block_height)
 
         #Create a CUDA stream
         self.stream = cuda.Stream()
@@ -85,7 +93,7 @@ class BaseSimulator:
                        int(np.ceil(self.nx / float(self.local_size[0]))), \
                        int(np.ceil(self.ny / float(self.local_size[1]))) \
                       )
 
     """
     Function which simulates forward in time using the default simulation type
     """

View File

@@ -22,7 +22,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 #Import packages we need
 import numpy as np
-from SWESimulators import Simulator
+from GPUSimulators import Simulator
@@ -65,8 +65,8 @@ class WAF (Simulator.BaseSimulator):
         #Get kernels
         self.kernel = context.get_prepared_kernel("WAF_kernel.cu", "WAFKernel", \
                                         "iiffffiPiPiPiPiPiPi", \
-                                        BLOCK_WIDTH=block_width, \
-                                        BLOCK_HEIGHT=block_height)
+                                        BLOCK_WIDTH=self.local_size[0], \
+                                        BLOCK_HEIGHT=self.local_size[1])
 
     def __str__(self):
         return "Weighted average flux"
@@ -83,12 +83,12 @@ class WAF (Simulator.BaseSimulator):
                 self.dx, self.dy, dt, \
                 self.g, \
                 np.int32(0), \
-                self.data.h0.data.gpudata, self.data.h0.pitch, \
-                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                self.data.h1.data.gpudata, self.data.h1.pitch, \
-                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
         self.data.swap()
         self.t += dt
@@ -98,11 +98,11 @@ class WAF (Simulator.BaseSimulator):
                 self.dx, self.dy, dt, \
                 self.g, \
                 np.int32(1), \
-                self.data.h0.data.gpudata, self.data.h0.pitch, \
-                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                self.data.h1.data.gpudata, self.data.h1.pitch, \
-                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+                self.data.h0.data.gpudata, self.data.h0.data.strides[0], \
+                self.data.hu0.data.gpudata, self.data.hu0.data.strides[0], \
+                self.data.hv0.data.gpudata, self.data.hv0.data.strides[0], \
+                self.data.h1.data.gpudata, self.data.h1.data.strides[0], \
+                self.data.hu1.data.gpudata, self.data.hu1.data.strides[0], \
+                self.data.hv1.data.gpudata, self.data.hv1.data.strides[0])
         self.data.swap()
         self.t += dt