mirror of
https://github.com/smyalygames/FiniteVolumeGPU.git
synced 2025-05-18 14:34:13 +02:00
Merge pull request #10 from setmar/master
WiP: Simulating the Euler equations on multiple GPUs using Python
This commit is contained in:
commit
9613269c45
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
.vscode/settings.json
|
555162
EulerTesting.ipynb
555162
EulerTesting.ipynb
File diff suppressed because one or more lines are too long
673
Figures_Saga.ipynb
Normal file
673
Figures_Saga.ipynb
Normal file
File diff suppressed because one or more lines are too long
721
Figures_Simula.ipynb
Normal file
721
Figures_Simula.ipynb
Normal file
File diff suppressed because one or more lines are too long
@ -89,12 +89,22 @@ def toJson(in_dict, compressed=True):
|
|||||||
out_dict[key] = value
|
out_dict[key] = value
|
||||||
return json.dumps(out_dict)
|
return json.dumps(out_dict)
|
||||||
|
|
||||||
def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names=[]):
|
def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names=[], dt=None):
|
||||||
"""
|
"""
|
||||||
Runs a simulation, and stores output in netcdf file. Stores the times given in
|
Runs a simulation, and stores output in netcdf file. Stores the times given in
|
||||||
save_times, and saves all of the variables in list save_var_names. Elements in
|
save_times, and saves all of the variables in list save_var_names. Elements in
|
||||||
save_var_names can be set to None if you do not want to save them
|
save_var_names can be set to None if you do not want to save them
|
||||||
"""
|
"""
|
||||||
|
profiling_data_sim_runner = { 'start': {}, 'end': {} }
|
||||||
|
profiling_data_sim_runner["start"]["t_sim_init"] = 0
|
||||||
|
profiling_data_sim_runner["end"]["t_sim_init"] = 0
|
||||||
|
profiling_data_sim_runner["start"]["t_nc_write"] = 0
|
||||||
|
profiling_data_sim_runner["end"]["t_nc_write"] = 0
|
||||||
|
profiling_data_sim_runner["start"]["t_full_step"] = 0
|
||||||
|
profiling_data_sim_runner["end"]["t_full_step"] = 0
|
||||||
|
|
||||||
|
profiling_data_sim_runner["start"]["t_sim_init"] = time.time()
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
assert len(save_times) > 0, "Need to specify which times to save"
|
assert len(save_times) > 0, "Need to specify which times to save"
|
||||||
@ -111,7 +121,14 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
|
|||||||
outdata.ncfile.git_hash = getGitHash()
|
outdata.ncfile.git_hash = getGitHash()
|
||||||
outdata.ncfile.git_status = getGitStatus()
|
outdata.ncfile.git_status = getGitStatus()
|
||||||
outdata.ncfile.simulator = str(simulator)
|
outdata.ncfile.simulator = str(simulator)
|
||||||
outdata.ncfile.sim_args = toJson(simulator_args)
|
|
||||||
|
# do not write fields to attributes (they are to large)
|
||||||
|
simulator_args_for_ncfile = simulator_args.copy()
|
||||||
|
del simulator_args_for_ncfile["rho"]
|
||||||
|
del simulator_args_for_ncfile["rho_u"]
|
||||||
|
del simulator_args_for_ncfile["rho_v"]
|
||||||
|
del simulator_args_for_ncfile["E"]
|
||||||
|
outdata.ncfile.sim_args = toJson(simulator_args_for_ncfile)
|
||||||
|
|
||||||
#Create dimensions
|
#Create dimensions
|
||||||
outdata.ncfile.createDimension('time', len(save_times))
|
outdata.ncfile.createDimension('time', len(save_times))
|
||||||
@ -146,6 +163,8 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
|
|||||||
t_steps[0] = save_times[0]
|
t_steps[0] = save_times[0]
|
||||||
t_steps[1:] = save_times[1:] - save_times[0:-1]
|
t_steps[1:] = save_times[1:] - save_times[0:-1]
|
||||||
|
|
||||||
|
profiling_data_sim_runner["end"]["t_sim_init"] = time.time()
|
||||||
|
|
||||||
#Start simulation loop
|
#Start simulation loop
|
||||||
progress_printer = ProgressPrinter(save_times[-1], print_every=10)
|
progress_printer = ProgressPrinter(save_times[-1], print_every=10)
|
||||||
for k in range(len(save_times)):
|
for k in range(len(save_times)):
|
||||||
@ -160,9 +179,15 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
|
|||||||
logger.error("Error after {:d} steps (t={:f}: {:s}".format(sim.simSteps(), sim.simTime(), str(e)))
|
logger.error("Error after {:d} steps (t={:f}: {:s}".format(sim.simSteps(), sim.simTime(), str(e)))
|
||||||
return outdata.filename
|
return outdata.filename
|
||||||
|
|
||||||
|
profiling_data_sim_runner["start"]["t_full_step"] += time.time()
|
||||||
|
|
||||||
#Simulate
|
#Simulate
|
||||||
if (t_step > 0.0):
|
if (t_step > 0.0):
|
||||||
sim.simulate(t_step)
|
sim.simulate(t_step, dt)
|
||||||
|
|
||||||
|
profiling_data_sim_runner["end"]["t_full_step"] += time.time()
|
||||||
|
|
||||||
|
profiling_data_sim_runner["start"]["t_nc_write"] += time.time()
|
||||||
|
|
||||||
#Download
|
#Download
|
||||||
save_vars = sim.download(download_vars)
|
save_vars = sim.download(download_vars)
|
||||||
@ -171,6 +196,8 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
|
|||||||
for i, var_name in enumerate(save_var_names):
|
for i, var_name in enumerate(save_var_names):
|
||||||
ncvars[var_name][k, :] = save_vars[i]
|
ncvars[var_name][k, :] = save_vars[i]
|
||||||
|
|
||||||
|
profiling_data_sim_runner["end"]["t_nc_write"] += time.time()
|
||||||
|
|
||||||
#Write progress to screen
|
#Write progress to screen
|
||||||
print_string = progress_printer.getPrintString(t_end)
|
print_string = progress_printer.getPrintString(t_end)
|
||||||
if (print_string):
|
if (print_string):
|
||||||
@ -178,7 +205,7 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
|
|||||||
|
|
||||||
logger.debug("Simulated to t={:f} in {:d} timesteps (average dt={:f})".format(t_end, sim.simSteps(), sim.simTime() / sim.simSteps()))
|
logger.debug("Simulated to t={:f} in {:d} timesteps (average dt={:f})".format(t_end, sim.simSteps(), sim.simTime() / sim.simSteps()))
|
||||||
|
|
||||||
return outdata.filename
|
return outdata.filename, profiling_data_sim_runner, sim.profiling_data_mpi
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -526,9 +553,9 @@ class CudaArray2D:
|
|||||||
#self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
|
#self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
|
||||||
#Allocate host memory
|
#Allocate host memory
|
||||||
#The following fails, don't know why (crashes python)
|
#The following fails, don't know why (crashes python)
|
||||||
#cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)32)
|
cpu_data = cuda.pagelocked_empty((int(ny), int(nx)), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE)
|
||||||
#Non-pagelocked: cpu_data = np.empty((ny, nx), dtype=np.float32)
|
#Non-pagelocked: cpu_data = np.empty((ny, nx), dtype=np.float32)
|
||||||
cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32)
|
#cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32)
|
||||||
|
|
||||||
assert nx == cpu_data.shape[1]
|
assert nx == cpu_data.shape[1]
|
||||||
assert ny == cpu_data.shape[0]
|
assert ny == cpu_data.shape[0]
|
||||||
@ -739,7 +766,7 @@ class ArakawaA2D:
|
|||||||
assert i < len(self.gpu_variables), "Variable {:d} is out of range".format(i)
|
assert i < len(self.gpu_variables), "Variable {:d} is out of range".format(i)
|
||||||
cpu_variables += [self.gpu_variables[i].download(stream, asynch=True)]
|
cpu_variables += [self.gpu_variables[i].download(stream, asynch=True)]
|
||||||
|
|
||||||
stream.synchronize()
|
#stream.synchronize()
|
||||||
return cpu_variables
|
return cpu_variables
|
||||||
|
|
||||||
def check(self):
|
def check(self):
|
||||||
|
@ -90,7 +90,7 @@ class EE2D_KP07_dimsplit (BaseSimulator):
|
|||||||
},
|
},
|
||||||
jit_compile_args={})
|
jit_compile_args={})
|
||||||
self.kernel = module.get_function("KP07DimsplitKernel")
|
self.kernel = module.get_function("KP07DimsplitKernel")
|
||||||
self.kernel.prepare("iiffffffiiPiPiPiPiPiPiPiPiP")
|
self.kernel.prepare("iiffffffiiPiPiPiPiPiPiPiPiPiiii")
|
||||||
|
|
||||||
|
|
||||||
#Create data by uploading to device
|
#Create data by uploading to device
|
||||||
@ -109,10 +109,13 @@ class EE2D_KP07_dimsplit (BaseSimulator):
|
|||||||
self.cfl_data.fill(self.dt, stream=self.stream)
|
self.cfl_data.fill(self.dt, stream=self.stream)
|
||||||
|
|
||||||
|
|
||||||
def substep(self, dt, step_number):
|
def substep(self, dt, step_number, external=True, internal=True):
|
||||||
self.substepDimsplit(0.5*dt, step_number)
|
self.substepDimsplit(0.5*dt, step_number, external, internal)
|
||||||
|
|
||||||
|
def substepDimsplit(self, dt, substep, external, internal):
|
||||||
|
if external and internal:
|
||||||
|
#print("COMPLETE DOMAIN (dt=" + str(dt) + ")")
|
||||||
|
|
||||||
def substepDimsplit(self, dt, substep):
|
|
||||||
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
|
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
|
||||||
self.nx, self.ny,
|
self.nx, self.ny,
|
||||||
self.dx, self.dy, dt,
|
self.dx, self.dy, dt,
|
||||||
@ -129,8 +132,142 @@ class EE2D_KP07_dimsplit (BaseSimulator):
|
|||||||
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
|
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
|
||||||
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
|
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
|
||||||
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
|
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
|
||||||
self.cfl_data.gpudata)
|
self.cfl_data.gpudata,
|
||||||
|
0, 0,
|
||||||
|
self.nx, self.ny)
|
||||||
|
return
|
||||||
|
|
||||||
|
if external and not internal:
|
||||||
|
###################################
|
||||||
|
# XXX: Corners are treated twice! #
|
||||||
|
###################################
|
||||||
|
|
||||||
|
ns_grid_size = (self.grid_size[0], 1)
|
||||||
|
|
||||||
|
# NORTH
|
||||||
|
# (x0, y0) x (x1, y1)
|
||||||
|
# (0, ny-y_halo) x (nx, ny)
|
||||||
|
self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,
|
||||||
|
self.nx, self.ny,
|
||||||
|
self.dx, self.dy, dt,
|
||||||
|
self.g,
|
||||||
|
self.gamma,
|
||||||
|
self.theta,
|
||||||
|
substep,
|
||||||
|
self.boundary_conditions,
|
||||||
|
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
|
||||||
|
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
|
||||||
|
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
|
||||||
|
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
|
||||||
|
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
|
||||||
|
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
|
||||||
|
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
|
||||||
|
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
|
||||||
|
self.cfl_data.gpudata,
|
||||||
|
0, self.ny - int(self.u0[0].y_halo),
|
||||||
|
self.nx, self.ny)
|
||||||
|
|
||||||
|
# SOUTH
|
||||||
|
# (x0, y0) x (x1, y1)
|
||||||
|
# (0, 0) x (nx, y_halo)
|
||||||
|
self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,
|
||||||
|
self.nx, self.ny,
|
||||||
|
self.dx, self.dy, dt,
|
||||||
|
self.g,
|
||||||
|
self.gamma,
|
||||||
|
self.theta,
|
||||||
|
substep,
|
||||||
|
self.boundary_conditions,
|
||||||
|
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
|
||||||
|
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
|
||||||
|
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
|
||||||
|
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
|
||||||
|
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
|
||||||
|
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
|
||||||
|
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
|
||||||
|
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
|
||||||
|
self.cfl_data.gpudata,
|
||||||
|
0, 0,
|
||||||
|
self.nx, int(self.u0[0].y_halo))
|
||||||
|
|
||||||
|
we_grid_size = (1, self.grid_size[1])
|
||||||
|
|
||||||
|
# WEST
|
||||||
|
# (x0, y0) x (x1, y1)
|
||||||
|
# (0, 0) x (x_halo, ny)
|
||||||
|
self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
|
||||||
|
self.nx, self.ny,
|
||||||
|
self.dx, self.dy, dt,
|
||||||
|
self.g,
|
||||||
|
self.gamma,
|
||||||
|
self.theta,
|
||||||
|
substep,
|
||||||
|
self.boundary_conditions,
|
||||||
|
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
|
||||||
|
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
|
||||||
|
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
|
||||||
|
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
|
||||||
|
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
|
||||||
|
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
|
||||||
|
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
|
||||||
|
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
|
||||||
|
self.cfl_data.gpudata,
|
||||||
|
0, 0,
|
||||||
|
int(self.u0[0].x_halo), self.ny)
|
||||||
|
|
||||||
|
# EAST
|
||||||
|
# (x0, y0) x (x1, y1)
|
||||||
|
# (nx-x_halo, 0) x (nx, ny)
|
||||||
|
self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
|
||||||
|
self.nx, self.ny,
|
||||||
|
self.dx, self.dy, dt,
|
||||||
|
self.g,
|
||||||
|
self.gamma,
|
||||||
|
self.theta,
|
||||||
|
substep,
|
||||||
|
self.boundary_conditions,
|
||||||
|
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
|
||||||
|
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
|
||||||
|
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
|
||||||
|
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
|
||||||
|
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
|
||||||
|
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
|
||||||
|
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
|
||||||
|
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
|
||||||
|
self.cfl_data.gpudata,
|
||||||
|
self.nx - int(self.u0[0].x_halo), 0,
|
||||||
|
self.nx, self.ny)
|
||||||
|
return
|
||||||
|
|
||||||
|
if internal and not external:
|
||||||
|
|
||||||
|
# INTERNAL DOMAIN
|
||||||
|
# (x0, y0) x (x1, y1)
|
||||||
|
# (x_halo, y_halo) x (nx - x_halo, ny - y_halo)
|
||||||
|
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.internal_stream,
|
||||||
|
self.nx, self.ny,
|
||||||
|
self.dx, self.dy, dt,
|
||||||
|
self.g,
|
||||||
|
self.gamma,
|
||||||
|
self.theta,
|
||||||
|
substep,
|
||||||
|
self.boundary_conditions,
|
||||||
|
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
|
||||||
|
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
|
||||||
|
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
|
||||||
|
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
|
||||||
|
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
|
||||||
|
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
|
||||||
|
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
|
||||||
|
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
|
||||||
|
self.cfl_data.gpudata,
|
||||||
|
int(self.u0[0].x_halo), int(self.u0[0].y_halo),
|
||||||
|
self.nx - int(self.u0[0].x_halo), self.ny - int(self.u0[0].y_halo))
|
||||||
|
return
|
||||||
|
|
||||||
|
def swapBuffers(self):
|
||||||
self.u0, self.u1 = self.u1, self.u0
|
self.u0, self.u1 = self.u1, self.u0
|
||||||
|
return
|
||||||
|
|
||||||
def getOutput(self):
|
def getOutput(self):
|
||||||
return self.u0
|
return self.u0
|
||||||
@ -138,6 +275,7 @@ class EE2D_KP07_dimsplit (BaseSimulator):
|
|||||||
def check(self):
|
def check(self):
|
||||||
self.u0.check()
|
self.u0.check()
|
||||||
self.u1.check()
|
self.u1.check()
|
||||||
|
return
|
||||||
|
|
||||||
def computeDt(self):
|
def computeDt(self):
|
||||||
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
|
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
|
||||||
|
@ -24,7 +24,10 @@ import logging
|
|||||||
from GPUSimulators import Simulator
|
from GPUSimulators import Simulator
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from mpi4py import MPI
|
from mpi4py import MPI
|
||||||
|
import time
|
||||||
|
|
||||||
|
import pycuda.driver as cuda
|
||||||
|
#import nvtx
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -135,6 +138,10 @@ class MPIGrid(object):
|
|||||||
grid = np.sort(grid)
|
grid = np.sort(grid)
|
||||||
grid = grid[::-1]
|
grid = grid[::-1]
|
||||||
|
|
||||||
|
# XXX: We only use vertical (north-south) partitioning for now
|
||||||
|
grid[0] = 1
|
||||||
|
grid[1] = num_nodes
|
||||||
|
|
||||||
return grid
|
return grid
|
||||||
|
|
||||||
|
|
||||||
@ -200,6 +207,18 @@ class MPISimulator(Simulator.BaseSimulator):
|
|||||||
Class which handles communication between simulators on different MPI nodes
|
Class which handles communication between simulators on different MPI nodes
|
||||||
"""
|
"""
|
||||||
def __init__(self, sim, grid):
|
def __init__(self, sim, grid):
|
||||||
|
self.profiling_data_mpi = { 'start': {}, 'end': {} }
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange"] = 0
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange"] = 0
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] = 0
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] = 0
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] = 0
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] = 0
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] = 0
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] = 0
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_step"] = 0
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_step"] = 0
|
||||||
|
self.profiling_data_mpi["n_time_steps"] = 0
|
||||||
self.logger = logging.getLogger(__name__)
|
self.logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
autotuner = sim.context.autotuner
|
autotuner = sim.context.autotuner
|
||||||
@ -232,6 +251,7 @@ class MPISimulator(Simulator.BaseSimulator):
|
|||||||
'west': Simulator.BoundaryCondition.Type.Dirichlet
|
'west': Simulator.BoundaryCondition.Type.Dirichlet
|
||||||
})
|
})
|
||||||
gi, gj = grid.getCoordinate()
|
gi, gj = grid.getCoordinate()
|
||||||
|
#print("gi: " + str(gi) + ", gj: " + str(gj))
|
||||||
if (gi == 0 and boundary_conditions.west != Simulator.BoundaryCondition.Type.Periodic):
|
if (gi == 0 and boundary_conditions.west != Simulator.BoundaryCondition.Type.Periodic):
|
||||||
self.west = None
|
self.west = None
|
||||||
new_boundary_conditions.west = boundary_conditions.west;
|
new_boundary_conditions.west = boundary_conditions.west;
|
||||||
@ -272,23 +292,50 @@ class MPISimulator(Simulator.BaseSimulator):
|
|||||||
#Note that east and west also transfer ghost cells
|
#Note that east and west also transfer ghost cells
|
||||||
#whilst north/south only transfer internal cells
|
#whilst north/south only transfer internal cells
|
||||||
#Reuses the width/height defined in the read-extets above
|
#Reuses the width/height defined in the read-extets above
|
||||||
self.in_e = np.empty((self.nvars, self.read_e[3], self.read_e[2]), dtype=np.float32)
|
self.in_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32) #np.empty((self.nvars, self.read_e[3], self.read_e[2]), dtype=np.float32)
|
||||||
self.in_w = np.empty((self.nvars, self.read_w[3], self.read_w[2]), dtype=np.float32)
|
self.in_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32) #np.empty((self.nvars, self.read_w[3], self.read_w[2]), dtype=np.float32)
|
||||||
self.in_n = np.empty((self.nvars, self.read_n[3], self.read_n[2]), dtype=np.float32)
|
self.in_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32) #np.empty((self.nvars, self.read_n[3], self.read_n[2]), dtype=np.float32)
|
||||||
self.in_s = np.empty((self.nvars, self.read_s[3], self.read_s[2]), dtype=np.float32)
|
self.in_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32) #np.empty((self.nvars, self.read_s[3], self.read_s[2]), dtype=np.float32)
|
||||||
|
|
||||||
#Allocate data for sending
|
#Allocate data for sending
|
||||||
self.out_e = np.empty_like(self.in_e)
|
self.out_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32) #np.empty_like(self.in_e)
|
||||||
self.out_w = np.empty_like(self.in_w)
|
self.out_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32) #np.empty_like(self.in_w)
|
||||||
self.out_n = np.empty_like(self.in_n)
|
self.out_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32) #np.empty_like(self.in_n)
|
||||||
self.out_s = np.empty_like(self.in_s)
|
self.out_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32) #np.empty_like(self.in_s)
|
||||||
|
|
||||||
self.logger.debug("Simlator rank {:d} initialized on {:s}".format(self.grid.comm.rank, MPI.Get_processor_name()))
|
self.logger.debug("Simlator rank {:d} initialized on {:s}".format(self.grid.comm.rank, MPI.Get_processor_name()))
|
||||||
|
|
||||||
|
self.full_exchange()
|
||||||
|
sim.context.synchronize()
|
||||||
|
|
||||||
def substep(self, dt, step_number):
|
def substep(self, dt, step_number):
|
||||||
self.exchange()
|
|
||||||
self.sim.substep(dt, step_number)
|
#nvtx.mark("substep start", color="yellow")
|
||||||
|
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_step"] += time.time()
|
||||||
|
|
||||||
|
#nvtx.mark("substep external", color="blue")
|
||||||
|
self.sim.substep(dt, step_number, external=True, internal=False) # only "internal ghost cells"
|
||||||
|
|
||||||
|
#nvtx.mark("substep internal", color="red")
|
||||||
|
self.sim.substep(dt, step_number, internal=True, external=False) # "internal ghost cells" excluded
|
||||||
|
|
||||||
|
#nvtx.mark("substep full", color="blue")
|
||||||
|
#self.sim.substep(dt, step_number, external=True, internal=True)
|
||||||
|
|
||||||
|
self.sim.swapBuffers()
|
||||||
|
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_step"] += time.time()
|
||||||
|
|
||||||
|
#nvtx.mark("exchange", color="blue")
|
||||||
|
self.full_exchange()
|
||||||
|
|
||||||
|
#nvtx.mark("sync start", color="blue")
|
||||||
|
self.sim.stream.synchronize()
|
||||||
|
self.sim.internal_stream.synchronize()
|
||||||
|
#nvtx.mark("sync end", color="blue")
|
||||||
|
|
||||||
|
self.profiling_data_mpi["n_time_steps"] += 1
|
||||||
|
|
||||||
def getOutput(self):
|
def getOutput(self):
|
||||||
return self.sim.getOutput()
|
return self.sim.getOutput()
|
||||||
@ -321,18 +368,14 @@ class MPISimulator(Simulator.BaseSimulator):
|
|||||||
y1 = y0 + height
|
y1 = y0 + height
|
||||||
return [x0, x1, y0, y1]
|
return [x0, x1, y0, y1]
|
||||||
|
|
||||||
def exchange(self):
|
def full_exchange(self):
|
||||||
####
|
|
||||||
# FIXME: This function can be optimized using persitent communications.
|
|
||||||
# Also by overlapping some of the communications north/south and east/west of GPU and intra-node
|
|
||||||
# communications
|
|
||||||
####
|
|
||||||
|
|
||||||
####
|
####
|
||||||
# First transfer internal cells north-south
|
# First transfer internal cells north-south
|
||||||
####
|
####
|
||||||
|
|
||||||
#Download from the GPU
|
#Download from the GPU
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] += time.time()
|
||||||
|
|
||||||
if self.north is not None:
|
if self.north is not None:
|
||||||
for k in range(self.nvars):
|
for k in range(self.nvars):
|
||||||
self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_n[k,:,:], asynch=True, extent=self.read_n)
|
self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_n[k,:,:], asynch=True, extent=self.read_n)
|
||||||
@ -341,7 +384,11 @@ class MPISimulator(Simulator.BaseSimulator):
|
|||||||
self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_s[k,:,:], asynch=True, extent=self.read_s)
|
self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_s[k,:,:], asynch=True, extent=self.read_s)
|
||||||
self.sim.stream.synchronize()
|
self.sim.stream.synchronize()
|
||||||
|
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] += time.time()
|
||||||
|
|
||||||
#Send/receive to north/south neighbours
|
#Send/receive to north/south neighbours
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()
|
||||||
|
|
||||||
comm_send = []
|
comm_send = []
|
||||||
comm_recv = []
|
comm_recv = []
|
||||||
if self.north is not None:
|
if self.north is not None:
|
||||||
@ -355,7 +402,11 @@ class MPISimulator(Simulator.BaseSimulator):
|
|||||||
for comm in comm_recv:
|
for comm in comm_recv:
|
||||||
comm.wait()
|
comm.wait()
|
||||||
|
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()
|
||||||
|
|
||||||
#Upload to the GPU
|
#Upload to the GPU
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] += time.time()
|
||||||
|
|
||||||
if self.north is not None:
|
if self.north is not None:
|
||||||
for k in range(self.nvars):
|
for k in range(self.nvars):
|
||||||
self.sim.u0[k].upload(self.sim.stream, self.in_n[k,:,:], extent=self.write_n)
|
self.sim.u0[k].upload(self.sim.stream, self.in_n[k,:,:], extent=self.write_n)
|
||||||
@ -363,17 +414,23 @@ class MPISimulator(Simulator.BaseSimulator):
|
|||||||
for k in range(self.nvars):
|
for k in range(self.nvars):
|
||||||
self.sim.u0[k].upload(self.sim.stream, self.in_s[k,:,:], extent=self.write_s)
|
self.sim.u0[k].upload(self.sim.stream, self.in_s[k,:,:], extent=self.write_s)
|
||||||
|
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] += time.time()
|
||||||
|
|
||||||
#Wait for sending to complete
|
#Wait for sending to complete
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()
|
||||||
|
|
||||||
for comm in comm_send:
|
for comm in comm_send:
|
||||||
comm.wait()
|
comm.wait()
|
||||||
|
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()
|
||||||
|
|
||||||
####
|
####
|
||||||
# Then transfer east-west including ghost cells that have been filled in by north-south transfer above
|
# Then transfer east-west including ghost cells that have been filled in by north-south transfer above
|
||||||
####
|
####
|
||||||
|
|
||||||
#Download from the GPU
|
#Download from the GPU
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] += time.time()
|
||||||
|
|
||||||
if self.east is not None:
|
if self.east is not None:
|
||||||
for k in range(self.nvars):
|
for k in range(self.nvars):
|
||||||
self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_e[k,:,:], asynch=True, extent=self.read_e)
|
self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_e[k,:,:], asynch=True, extent=self.read_e)
|
||||||
@ -382,7 +439,11 @@ class MPISimulator(Simulator.BaseSimulator):
|
|||||||
self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_w[k,:,:], asynch=True, extent=self.read_w)
|
self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_w[k,:,:], asynch=True, extent=self.read_w)
|
||||||
self.sim.stream.synchronize()
|
self.sim.stream.synchronize()
|
||||||
|
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] += time.time()
|
||||||
|
|
||||||
#Send/receive to east/west neighbours
|
#Send/receive to east/west neighbours
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()
|
||||||
|
|
||||||
comm_send = []
|
comm_send = []
|
||||||
comm_recv = []
|
comm_recv = []
|
||||||
if self.east is not None:
|
if self.east is not None:
|
||||||
@ -392,12 +453,15 @@ class MPISimulator(Simulator.BaseSimulator):
|
|||||||
comm_send += [self.grid.comm.Isend(self.out_w, dest=self.west, tag=4*self.nt + 3)]
|
comm_send += [self.grid.comm.Isend(self.out_w, dest=self.west, tag=4*self.nt + 3)]
|
||||||
comm_recv += [self.grid.comm.Irecv(self.in_w, source=self.west, tag=4*self.nt + 2)]
|
comm_recv += [self.grid.comm.Irecv(self.in_w, source=self.west, tag=4*self.nt + 2)]
|
||||||
|
|
||||||
|
|
||||||
#Wait for incoming transfers to complete
|
#Wait for incoming transfers to complete
|
||||||
for comm in comm_recv:
|
for comm in comm_recv:
|
||||||
comm.wait()
|
comm.wait()
|
||||||
|
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()
|
||||||
|
|
||||||
#Upload to the GPU
|
#Upload to the GPU
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] += time.time()
|
||||||
|
|
||||||
if self.east is not None:
|
if self.east is not None:
|
||||||
for k in range(self.nvars):
|
for k in range(self.nvars):
|
||||||
self.sim.u0[k].upload(self.sim.stream, self.in_e[k,:,:], extent=self.write_e)
|
self.sim.u0[k].upload(self.sim.stream, self.in_e[k,:,:], extent=self.write_e)
|
||||||
@ -405,9 +469,12 @@ class MPISimulator(Simulator.BaseSimulator):
|
|||||||
for k in range(self.nvars):
|
for k in range(self.nvars):
|
||||||
self.sim.u0[k].upload(self.sim.stream, self.in_w[k,:,:], extent=self.write_w)
|
self.sim.u0[k].upload(self.sim.stream, self.in_w[k,:,:], extent=self.write_w)
|
||||||
|
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] += time.time()
|
||||||
|
|
||||||
#Wait for sending to complete
|
#Wait for sending to complete
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()
|
||||||
|
|
||||||
for comm in comm_send:
|
for comm in comm_send:
|
||||||
comm.wait()
|
comm.wait()
|
||||||
|
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()
|
||||||
|
|
||||||
|
263
GPUSimulators/SHMEMSimulator.py
Normal file
263
GPUSimulators/SHMEMSimulator.py
Normal file
@ -0,0 +1,263 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
"""
|
||||||
|
This python module implements SHMEM simulator group class
|
||||||
|
|
||||||
|
Copyright (C) 2020 Norwegian Meteorological Institute
|
||||||
|
|
||||||
|
This program is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from GPUSimulators import Simulator, CudaContext
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
import pycuda.driver as cuda
|
||||||
|
|
||||||
|
import time
|
||||||
|
|
||||||
|
class SHMEMSimulator(Simulator.BaseSimulator):
|
||||||
|
"""
|
||||||
|
Class which handles communication and synchronization between simulators in different
|
||||||
|
contexts (presumably on different GPUs)
|
||||||
|
"""
|
||||||
|
def __init__(self, sims, grid):
|
||||||
|
self.logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
assert(len(sims) > 1)
|
||||||
|
|
||||||
|
self.sims = sims
|
||||||
|
|
||||||
|
# XXX: This is not what was intended. Do we need extra wrapper class SHMEMSimulator?
|
||||||
|
# See also getOutput() and check().
|
||||||
|
#
|
||||||
|
# SHMEMSimulatorGroup would then not have any superclass, but manage a collection of
|
||||||
|
# SHMEMSimulators that have BaseSimulator as a superclass.
|
||||||
|
#
|
||||||
|
# This would also eliminate the need for all the array bookkeeping in this class.
|
||||||
|
autotuner = sims[0].context.autotuner
|
||||||
|
sims[0].context.autotuner = None
|
||||||
|
boundary_conditions = sims[0].getBoundaryConditions()
|
||||||
|
super().__init__(sims[0].context,
|
||||||
|
sims[0].nx, sims[0].ny,
|
||||||
|
sims[0].dx, sims[0].dy,
|
||||||
|
boundary_conditions,
|
||||||
|
sims[0].cfl_scale,
|
||||||
|
sims[0].num_substeps,
|
||||||
|
sims[0].block_size[0], sims[0].block_size[1])
|
||||||
|
sims[0].context.autotuner = autotuner
|
||||||
|
|
||||||
|
self.sims = sims
|
||||||
|
self.grid = grid
|
||||||
|
|
||||||
|
self.east = [None] * len(self.sims)
|
||||||
|
self.west = [None] * len(self.sims)
|
||||||
|
self.north = [None] * len(self.sims)
|
||||||
|
self.south = [None] * len(self.sims)
|
||||||
|
|
||||||
|
self.nvars = [None] * len(self.sims)
|
||||||
|
|
||||||
|
self.read_e = [None] * len(self.sims)
|
||||||
|
self.read_w = [None] * len(self.sims)
|
||||||
|
self.read_n = [None] * len(self.sims)
|
||||||
|
self.read_s = [None] * len(self.sims)
|
||||||
|
|
||||||
|
self.write_e = [None] * len(self.sims)
|
||||||
|
self.write_w = [None] * len(self.sims)
|
||||||
|
self.write_n = [None] * len(self.sims)
|
||||||
|
self.write_s = [None] * len(self.sims)
|
||||||
|
|
||||||
|
self.e = [None] * len(self.sims)
|
||||||
|
self.w = [None] * len(self.sims)
|
||||||
|
self.n = [None] * len(self.sims)
|
||||||
|
self.s = [None] * len(self.sims)
|
||||||
|
|
||||||
|
for i, sim in enumerate(self.sims):
|
||||||
|
#Get neighbor subdomain ids
|
||||||
|
self.east[i] = grid.getEast(i)
|
||||||
|
self.west[i] = grid.getWest(i)
|
||||||
|
self.north[i] = grid.getNorth(i)
|
||||||
|
self.south[i] = grid.getSouth(i)
|
||||||
|
|
||||||
|
#Get coordinate of this subdomain
|
||||||
|
#and handle global boundary conditions
|
||||||
|
new_boundary_conditions = Simulator.BoundaryCondition({
|
||||||
|
'north': Simulator.BoundaryCondition.Type.Dirichlet,
|
||||||
|
'south': Simulator.BoundaryCondition.Type.Dirichlet,
|
||||||
|
'east': Simulator.BoundaryCondition.Type.Dirichlet,
|
||||||
|
'west': Simulator.BoundaryCondition.Type.Dirichlet
|
||||||
|
})
|
||||||
|
gi, gj = grid.getCoordinate(i)
|
||||||
|
if (gi == 0 and boundary_conditions.west != Simulator.BoundaryCondition.Type.Periodic):
|
||||||
|
self.west = None
|
||||||
|
new_boundary_conditions.west = boundary_conditions.west;
|
||||||
|
if (gj == 0 and boundary_conditions.south != Simulator.BoundaryCondition.Type.Periodic):
|
||||||
|
self.south = None
|
||||||
|
new_boundary_conditions.south = boundary_conditions.south;
|
||||||
|
if (gi == grid.grid[0]-1 and boundary_conditions.east != Simulator.BoundaryCondition.Type.Periodic):
|
||||||
|
self.east = None
|
||||||
|
new_boundary_conditions.east = boundary_conditions.east;
|
||||||
|
if (gj == grid.grid[1]-1 and boundary_conditions.north != Simulator.BoundaryCondition.Type.Periodic):
|
||||||
|
self.north = None
|
||||||
|
new_boundary_conditions.north = boundary_conditions.north;
|
||||||
|
sim.setBoundaryConditions(new_boundary_conditions)
|
||||||
|
|
||||||
|
#Get number of variables
|
||||||
|
self.nvars[i] = len(sim.getOutput().gpu_variables)
|
||||||
|
|
||||||
|
#Shorthands for computing extents and sizes
|
||||||
|
gc_x = int(sim.getOutput()[0].x_halo)
|
||||||
|
gc_y = int(sim.getOutput()[0].y_halo)
|
||||||
|
nx = int(sim.nx)
|
||||||
|
ny = int(sim.ny)
|
||||||
|
|
||||||
|
#Set regions for ghost cells to read from
|
||||||
|
#These have the format [x0, y0, width, height]
|
||||||
|
self.read_e[i] = np.array([ nx, 0, gc_x, ny + 2*gc_y])
|
||||||
|
self.read_w[i] = np.array([gc_x, 0, gc_x, ny + 2*gc_y])
|
||||||
|
self.read_n[i] = np.array([gc_x, ny, nx, gc_y])
|
||||||
|
self.read_s[i] = np.array([gc_x, gc_y, nx, gc_y])
|
||||||
|
|
||||||
|
#Set regions for ghost cells to write to
|
||||||
|
self.write_e[i] = self.read_e[i] + np.array([gc_x, 0, 0, 0])
|
||||||
|
self.write_w[i] = self.read_w[i] - np.array([gc_x, 0, 0, 0])
|
||||||
|
self.write_n[i] = self.read_n[i] + np.array([0, gc_y, 0, 0])
|
||||||
|
self.write_s[i] = self.read_s[i] - np.array([0, gc_y, 0, 0])
|
||||||
|
|
||||||
|
#Allocate host data
|
||||||
|
#Note that east and west also transfer ghost cells
|
||||||
|
#whilst north/south only transfer internal cells
|
||||||
|
#Reuses the width/height defined in the read-extets above
|
||||||
|
self.e[i] = np.empty((self.nvars[i], self.read_e[i][3], self.read_e[i][2]), dtype=np.float32)
|
||||||
|
self.w[i] = np.empty((self.nvars[i], self.read_w[i][3], self.read_w[i][2]), dtype=np.float32)
|
||||||
|
self.n[i] = np.empty((self.nvars[i], self.read_n[i][3], self.read_n[i][2]), dtype=np.float32)
|
||||||
|
self.s[i] = np.empty((self.nvars[i], self.read_s[i][3], self.read_s[i][2]), dtype=np.float32)
|
||||||
|
|
||||||
|
self.logger.debug("Initialized {:d} subdomains".format(len(self.sims)))
|
||||||
|
|
||||||
|
|
||||||
|
def substep(self, dt, step_number):
|
||||||
|
self.exchange()
|
||||||
|
|
||||||
|
for i, sim in enumerate(self.sims):
|
||||||
|
sim.substep(dt, step_number)
|
||||||
|
|
||||||
|
def getOutput(self):
|
||||||
|
# XXX: Does not return what we would expect.
|
||||||
|
# Returns first subdomain, but we want the whole domain.
|
||||||
|
return self.sims[0].getOutput()
|
||||||
|
|
||||||
|
def synchronize(self):
|
||||||
|
for sim in self.sims:
|
||||||
|
sim.synchronize()
|
||||||
|
|
||||||
|
def check(self):
|
||||||
|
# XXX: Does not return what we would expect.
|
||||||
|
# Checks only first subdomain, but we want to check the whole domain.
|
||||||
|
return self.sims[0].check()
|
||||||
|
|
||||||
|
def computeDt(self):
|
||||||
|
global_dt = float("inf")
|
||||||
|
|
||||||
|
for sim in self.sims:
|
||||||
|
sim.context.synchronize()
|
||||||
|
|
||||||
|
for sim in self.sims:
|
||||||
|
local_dt = sim.computeDt()
|
||||||
|
if local_dt < global_dt:
|
||||||
|
global_dt = local_dt
|
||||||
|
self.logger.debug("Local dt: {:f}".format(local_dt))
|
||||||
|
|
||||||
|
self.logger.debug("Global dt: {:f}".format(global_dt))
|
||||||
|
return global_dt
|
||||||
|
|
||||||
|
def getExtent(self, index=0):
|
||||||
|
"""
|
||||||
|
Function which returns the extent of the subdomain with index
|
||||||
|
index in the grid
|
||||||
|
"""
|
||||||
|
width = self.sims[index].nx*self.sims[index].dx
|
||||||
|
height = self.sims[index].ny*self.sims[index].dy
|
||||||
|
i, j = self.grid.getCoordinate(index)
|
||||||
|
x0 = i * width
|
||||||
|
y0 = j * height
|
||||||
|
x1 = x0 + width
|
||||||
|
y1 = y0 + height
|
||||||
|
return [x0, x1, y0, y1]
|
||||||
|
|
||||||
|
def exchange(self):
|
||||||
|
####
|
||||||
|
# First transfer internal cells north-south
|
||||||
|
####
|
||||||
|
for i in range(len(self.sims)):
|
||||||
|
self.ns_download(i)
|
||||||
|
|
||||||
|
for i in range(len(self.sims)):
|
||||||
|
self.ns_upload(i)
|
||||||
|
|
||||||
|
####
|
||||||
|
# Then transfer east-west including ghost cells that have been filled in by north-south transfer above
|
||||||
|
####
|
||||||
|
for i in range(len(self.sims)):
|
||||||
|
self.ew_download(i)
|
||||||
|
|
||||||
|
for i in range(len(self.sims)):
|
||||||
|
self.ew_upload(i)
|
||||||
|
|
||||||
|
def ns_download(self, i):
|
||||||
|
#Download from the GPU
|
||||||
|
if self.north[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the north)
|
||||||
|
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.n[i][k,:,:], extent=self.read_n[i])
|
||||||
|
if self.south[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the south)
|
||||||
|
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.s[i][k,:,:], extent=self.read_s[i])
|
||||||
|
self.sims[i].stream.synchronize()
|
||||||
|
|
||||||
|
def ns_upload(self, i):
|
||||||
|
#Upload to the GPU
|
||||||
|
if self.north[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
self.sims[i].u0[k].upload(self.sims[i].stream, self.s[self.north[i]][k,:,:], extent=self.write_n[i])
|
||||||
|
if self.south[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
self.sims[i].u0[k].upload(self.sims[i].stream, self.n[self.south[i]][k,:,:], extent=self.write_s[i])
|
||||||
|
|
||||||
|
def ew_download(self, i):
|
||||||
|
#Download from the GPU
|
||||||
|
if self.east[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the east)
|
||||||
|
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.e[i][k,:,:], extent=self.read_e[i])
|
||||||
|
if self.west[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the west)
|
||||||
|
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.w[i][k,:,:], extent=self.read_w[i])
|
||||||
|
self.sims[i].stream.synchronize()
|
||||||
|
|
||||||
|
def ew_upload(self, i):
|
||||||
|
#Upload to the GPU
|
||||||
|
if self.east[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
self.sims[i].u0[k].upload(self.sims[i].stream, self.w[self.east[i]][k,:,:], extent=self.write_e[i])
|
||||||
|
#test_east = np.ones_like(self.e[self.east[i]][k,:,:])
|
||||||
|
#self.sims[i].u0[k].upload(self.sims[i].stream, test_east, extent=self.write_e[i])
|
||||||
|
if self.west[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
self.sims[i].u0[k].upload(self.sims[i].stream, self.e[self.west[i]][k,:,:], extent=self.write_w[i])
|
||||||
|
#test_west = np.ones_like(self.e[self.west[i]][k,:,:])
|
||||||
|
#self.sims[i].u0[k].upload(self.sims[i].stream, test_west, extent=self.write_w[i])
|
394
GPUSimulators/SHMEMSimulatorGroup.py
Normal file
394
GPUSimulators/SHMEMSimulatorGroup.py
Normal file
@ -0,0 +1,394 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
"""
|
||||||
|
This python module implements SHMEM simulator group class
|
||||||
|
|
||||||
|
Copyright (C) 2020 Norwegian Meteorological Institute
|
||||||
|
|
||||||
|
This program is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from GPUSimulators import Simulator, CudaContext
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
import pycuda.driver as cuda
|
||||||
|
|
||||||
|
import time
|
||||||
|
|
||||||
|
class SHMEMGrid(object):
|
||||||
|
"""
|
||||||
|
Class which represents an SHMEM grid of GPUs. Facilitates easy communication between
|
||||||
|
neighboring subdomains in the grid. Contains one CUDA context per subdomain.
|
||||||
|
"""
|
||||||
|
def __init__(self, ngpus=None, ndims=2):
|
||||||
|
self.logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
cuda.init(flags=0)
|
||||||
|
self.logger.info("Initializing CUDA")
|
||||||
|
num_cuda_devices = cuda.Device.count()
|
||||||
|
|
||||||
|
if ngpus is None:
|
||||||
|
ngpus = num_cuda_devices
|
||||||
|
|
||||||
|
# XXX: disabled for testing on single-GPU system
|
||||||
|
#assert ngpus <= num_cuda_devices, "Trying to allocate more GPUs than are available in the system."
|
||||||
|
#assert ngpus >= 2, "Must have at least two GPUs available to run multi-GPU simulations."
|
||||||
|
|
||||||
|
assert ndims == 2, "Unsupported number of dimensions. Must be two at the moment"
|
||||||
|
|
||||||
|
self.ngpus = ngpus
|
||||||
|
self.ndims = ndims
|
||||||
|
|
||||||
|
self.grid = SHMEMGrid.getGrid(self.ngpus, self.ndims)
|
||||||
|
|
||||||
|
self.logger.debug("Created {:}-dimensional SHMEM grid, using {:} GPUs".format(
|
||||||
|
self.ndims, self.ngpus))
|
||||||
|
|
||||||
|
# XXX: Is this a natural place to store the contexts? Consider moving contexts out of this
|
||||||
|
# class, into notebook / calling script (shmemTesting.py)
|
||||||
|
self.cuda_contexts = []
|
||||||
|
|
||||||
|
for i in range(self.ngpus):
|
||||||
|
# XXX: disabled for testing on single-GPU system
|
||||||
|
#self.cuda_contexts.append(CudaContext.CudaContext(device=i, autotuning=False))
|
||||||
|
self.cuda_contexts.append(CudaContext.CudaContext(device=0, autotuning=False))
|
||||||
|
|
||||||
|
def getCoordinate(self, index):
|
||||||
|
i = (index % self.grid[0])
|
||||||
|
j = (index // self.grid[0])
|
||||||
|
return i, j
|
||||||
|
|
||||||
|
def getIndex(self, i, j):
|
||||||
|
return j*self.grid[0] + i
|
||||||
|
|
||||||
|
def getEast(self, index):
|
||||||
|
i, j = self.getCoordinate(index)
|
||||||
|
i = (i+1) % self.grid[0]
|
||||||
|
return self.getIndex(i, j)
|
||||||
|
|
||||||
|
def getWest(self, index):
|
||||||
|
i, j = self.getCoordinate(index)
|
||||||
|
i = (i+self.grid[0]-1) % self.grid[0]
|
||||||
|
return self.getIndex(i, j)
|
||||||
|
|
||||||
|
def getNorth(self, index):
|
||||||
|
i, j = self.getCoordinate(index)
|
||||||
|
j = (j+1) % self.grid[1]
|
||||||
|
return self.getIndex(i, j)
|
||||||
|
|
||||||
|
def getSouth(self, index):
|
||||||
|
i, j = self.getCoordinate(index)
|
||||||
|
j = (j+self.grid[1]-1) % self.grid[1]
|
||||||
|
return self.getIndex(i, j)
|
||||||
|
|
||||||
|
def getGrid(num_gpus, num_dims):
|
||||||
|
assert(isinstance(num_gpus, int))
|
||||||
|
assert(isinstance(num_dims, int))
|
||||||
|
|
||||||
|
# Adapted from https://stackoverflow.com/questions/28057307/factoring-a-number-into-roughly-equal-factors
|
||||||
|
# Original code by https://stackoverflow.com/users/3928385/ishamael
|
||||||
|
# Factorizes a number into n roughly equal factors
|
||||||
|
|
||||||
|
#Dictionary to remember already computed permutations
|
||||||
|
memo = {}
|
||||||
|
def dp(n, left): # returns tuple (cost, [factors])
|
||||||
|
"""
|
||||||
|
Recursively searches through all factorizations
|
||||||
|
"""
|
||||||
|
|
||||||
|
#Already tried: return existing result
|
||||||
|
if (n, left) in memo:
|
||||||
|
return memo[(n, left)]
|
||||||
|
|
||||||
|
#Spent all factors: return number itself
|
||||||
|
if left == 1:
|
||||||
|
return (n, [n])
|
||||||
|
|
||||||
|
#Find new factor
|
||||||
|
i = 2
|
||||||
|
best = n
|
||||||
|
bestTuple = [n]
|
||||||
|
while i * i < n:
|
||||||
|
#If factor found
|
||||||
|
if n % i == 0:
|
||||||
|
#Factorize remainder
|
||||||
|
rem = dp(n // i, left - 1)
|
||||||
|
|
||||||
|
#If new permutation better, save it
|
||||||
|
if rem[0] + i < best:
|
||||||
|
best = rem[0] + i
|
||||||
|
bestTuple = [i] + rem[1]
|
||||||
|
i += 1
|
||||||
|
|
||||||
|
#Store calculation
|
||||||
|
memo[(n, left)] = (best, bestTuple)
|
||||||
|
return memo[(n, left)]
|
||||||
|
|
||||||
|
|
||||||
|
grid = dp(num_gpus, num_dims)[1]
|
||||||
|
|
||||||
|
if (len(grid) < num_dims):
|
||||||
|
#Split problematic 4
|
||||||
|
if (4 in grid):
|
||||||
|
grid.remove(4)
|
||||||
|
grid.append(2)
|
||||||
|
grid.append(2)
|
||||||
|
|
||||||
|
#Pad with ones to guarantee num_dims
|
||||||
|
grid = grid + [1]*(num_dims - len(grid))
|
||||||
|
|
||||||
|
#Sort in descending order
|
||||||
|
grid = np.sort(grid)
|
||||||
|
grid = grid[::-1]
|
||||||
|
|
||||||
|
return grid
|
||||||
|
|
||||||
|
class SHMEMSimulatorGroup(object):
|
||||||
|
"""
|
||||||
|
Class which handles communication and synchronization between simulators in different
|
||||||
|
contexts (typically on different GPUs)
|
||||||
|
"""
|
||||||
|
def __init__(self, sims, grid):
|
||||||
|
self.logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
assert(len(sims) > 1)
|
||||||
|
|
||||||
|
self.sims = sims
|
||||||
|
|
||||||
|
# XXX: This is not what was intended. Do we need extra wrapper class SHMEMSimulator?
|
||||||
|
# See also getOutput() and check().
|
||||||
|
#
|
||||||
|
# SHMEMSimulatorGroup would then not have any superclass, but manage a collection of
|
||||||
|
# SHMEMSimulators that have BaseSimulator as a superclass.
|
||||||
|
#
|
||||||
|
# This would also eliminate the need for all the array bookkeeping in this class.
|
||||||
|
#
|
||||||
|
CONT HERE! Model shmemTesting after mpiTesting and divide existing functionality between SHMEMSimulatorGroup and SHMEMSimulator
|
||||||
|
|
||||||
|
autotuner = sims[0].context.autotuner
|
||||||
|
sims[0].context.autotuner = None
|
||||||
|
boundary_conditions = sims[0].getBoundaryConditions()
|
||||||
|
super().__init__(sims[0].context,
|
||||||
|
sims[0].nx, sims[0].ny,
|
||||||
|
sims[0].dx, sims[0].dy,
|
||||||
|
boundary_conditions,
|
||||||
|
sims[0].cfl_scale,
|
||||||
|
sims[0].num_substeps,
|
||||||
|
sims[0].block_size[0], sims[0].block_size[1])
|
||||||
|
sims[0].context.autotuner = autotuner
|
||||||
|
|
||||||
|
self.sims = sims
|
||||||
|
self.grid = grid
|
||||||
|
|
||||||
|
self.east = [None] * len(self.sims)
|
||||||
|
self.west = [None] * len(self.sims)
|
||||||
|
self.north = [None] * len(self.sims)
|
||||||
|
self.south = [None] * len(self.sims)
|
||||||
|
|
||||||
|
self.nvars = [None] * len(self.sims)
|
||||||
|
|
||||||
|
self.read_e = [None] * len(self.sims)
|
||||||
|
self.read_w = [None] * len(self.sims)
|
||||||
|
self.read_n = [None] * len(self.sims)
|
||||||
|
self.read_s = [None] * len(self.sims)
|
||||||
|
|
||||||
|
self.write_e = [None] * len(self.sims)
|
||||||
|
self.write_w = [None] * len(self.sims)
|
||||||
|
self.write_n = [None] * len(self.sims)
|
||||||
|
self.write_s = [None] * len(self.sims)
|
||||||
|
|
||||||
|
self.e = [None] * len(self.sims)
|
||||||
|
self.w = [None] * len(self.sims)
|
||||||
|
self.n = [None] * len(self.sims)
|
||||||
|
self.s = [None] * len(self.sims)
|
||||||
|
|
||||||
|
for i, sim in enumerate(self.sims):
|
||||||
|
#Get neighbor subdomain ids
|
||||||
|
self.east[i] = grid.getEast(i)
|
||||||
|
self.west[i] = grid.getWest(i)
|
||||||
|
self.north[i] = grid.getNorth(i)
|
||||||
|
self.south[i] = grid.getSouth(i)
|
||||||
|
|
||||||
|
#Get coordinate of this subdomain
|
||||||
|
#and handle global boundary conditions
|
||||||
|
new_boundary_conditions = Simulator.BoundaryCondition({
|
||||||
|
'north': Simulator.BoundaryCondition.Type.Dirichlet,
|
||||||
|
'south': Simulator.BoundaryCondition.Type.Dirichlet,
|
||||||
|
'east': Simulator.BoundaryCondition.Type.Dirichlet,
|
||||||
|
'west': Simulator.BoundaryCondition.Type.Dirichlet
|
||||||
|
})
|
||||||
|
gi, gj = grid.getCoordinate(i)
|
||||||
|
if (gi == 0 and boundary_conditions.west != Simulator.BoundaryCondition.Type.Periodic):
|
||||||
|
self.west = None
|
||||||
|
new_boundary_conditions.west = boundary_conditions.west;
|
||||||
|
if (gj == 0 and boundary_conditions.south != Simulator.BoundaryCondition.Type.Periodic):
|
||||||
|
self.south = None
|
||||||
|
new_boundary_conditions.south = boundary_conditions.south;
|
||||||
|
if (gi == grid.grid[0]-1 and boundary_conditions.east != Simulator.BoundaryCondition.Type.Periodic):
|
||||||
|
self.east = None
|
||||||
|
new_boundary_conditions.east = boundary_conditions.east;
|
||||||
|
if (gj == grid.grid[1]-1 and boundary_conditions.north != Simulator.BoundaryCondition.Type.Periodic):
|
||||||
|
self.north = None
|
||||||
|
new_boundary_conditions.north = boundary_conditions.north;
|
||||||
|
sim.setBoundaryConditions(new_boundary_conditions)
|
||||||
|
|
||||||
|
#Get number of variables
|
||||||
|
self.nvars[i] = len(sim.getOutput().gpu_variables)
|
||||||
|
|
||||||
|
#Shorthands for computing extents and sizes
|
||||||
|
gc_x = int(sim.getOutput()[0].x_halo)
|
||||||
|
gc_y = int(sim.getOutput()[0].y_halo)
|
||||||
|
nx = int(sim.nx)
|
||||||
|
ny = int(sim.ny)
|
||||||
|
|
||||||
|
#Set regions for ghost cells to read from
|
||||||
|
#These have the format [x0, y0, width, height]
|
||||||
|
self.read_e[i] = np.array([ nx, 0, gc_x, ny + 2*gc_y])
|
||||||
|
self.read_w[i] = np.array([gc_x, 0, gc_x, ny + 2*gc_y])
|
||||||
|
self.read_n[i] = np.array([gc_x, ny, nx, gc_y])
|
||||||
|
self.read_s[i] = np.array([gc_x, gc_y, nx, gc_y])
|
||||||
|
|
||||||
|
#Set regions for ghost cells to write to
|
||||||
|
self.write_e[i] = self.read_e[i] + np.array([gc_x, 0, 0, 0])
|
||||||
|
self.write_w[i] = self.read_w[i] - np.array([gc_x, 0, 0, 0])
|
||||||
|
self.write_n[i] = self.read_n[i] + np.array([0, gc_y, 0, 0])
|
||||||
|
self.write_s[i] = self.read_s[i] - np.array([0, gc_y, 0, 0])
|
||||||
|
|
||||||
|
#Allocate host data
|
||||||
|
#Note that east and west also transfer ghost cells
|
||||||
|
#whilst north/south only transfer internal cells
|
||||||
|
#Reuses the width/height defined in the read-extets above
|
||||||
|
self.e[i] = np.empty((self.nvars[i], self.read_e[i][3], self.read_e[i][2]), dtype=np.float32)
|
||||||
|
self.w[i] = np.empty((self.nvars[i], self.read_w[i][3], self.read_w[i][2]), dtype=np.float32)
|
||||||
|
self.n[i] = np.empty((self.nvars[i], self.read_n[i][3], self.read_n[i][2]), dtype=np.float32)
|
||||||
|
self.s[i] = np.empty((self.nvars[i], self.read_s[i][3], self.read_s[i][2]), dtype=np.float32)
|
||||||
|
|
||||||
|
self.logger.debug("Initialized {:d} subdomains".format(len(self.sims)))
|
||||||
|
|
||||||
|
|
||||||
|
def substep(self, dt, step_number):
|
||||||
|
self.exchange()
|
||||||
|
|
||||||
|
for i, sim in enumerate(self.sims):
|
||||||
|
sim.substep(dt, step_number)
|
||||||
|
|
||||||
|
def getOutput(self):
|
||||||
|
# XXX: Does not return what we would expect.
|
||||||
|
# Returns first subdomain, but we want the whole domain.
|
||||||
|
return self.sims[0].getOutput()
|
||||||
|
|
||||||
|
def synchronize(self):
|
||||||
|
for sim in self.sims:
|
||||||
|
sim.synchronize()
|
||||||
|
|
||||||
|
def check(self):
|
||||||
|
# XXX: Does not return what we would expect.
|
||||||
|
# Checks only first subdomain, but we want to check the whole domain.
|
||||||
|
return self.sims[0].check()
|
||||||
|
|
||||||
|
def computeDt(self):
|
||||||
|
global_dt = float("inf")
|
||||||
|
|
||||||
|
for sim in self.sims:
|
||||||
|
sim.context.synchronize()
|
||||||
|
|
||||||
|
for sim in self.sims:
|
||||||
|
local_dt = sim.computeDt()
|
||||||
|
if local_dt < global_dt:
|
||||||
|
global_dt = local_dt
|
||||||
|
self.logger.debug("Local dt: {:f}".format(local_dt))
|
||||||
|
|
||||||
|
self.logger.debug("Global dt: {:f}".format(global_dt))
|
||||||
|
return global_dt
|
||||||
|
|
||||||
|
def getExtent(self, index=0):
|
||||||
|
"""
|
||||||
|
Function which returns the extent of the subdomain with index
|
||||||
|
index in the grid
|
||||||
|
"""
|
||||||
|
width = self.sims[index].nx*self.sims[index].dx
|
||||||
|
height = self.sims[index].ny*self.sims[index].dy
|
||||||
|
i, j = self.grid.getCoordinate(index)
|
||||||
|
x0 = i * width
|
||||||
|
y0 = j * height
|
||||||
|
x1 = x0 + width
|
||||||
|
y1 = y0 + height
|
||||||
|
return [x0, x1, y0, y1]
|
||||||
|
|
||||||
|
def exchange(self):
|
||||||
|
####
|
||||||
|
# First transfer internal cells north-south
|
||||||
|
####
|
||||||
|
for i in range(len(self.sims)):
|
||||||
|
self.ns_download(i)
|
||||||
|
|
||||||
|
for i in range(len(self.sims)):
|
||||||
|
self.ns_upload(i)
|
||||||
|
|
||||||
|
####
|
||||||
|
# Then transfer east-west including ghost cells that have been filled in by north-south transfer above
|
||||||
|
####
|
||||||
|
for i in range(len(self.sims)):
|
||||||
|
self.ew_download(i)
|
||||||
|
|
||||||
|
for i in range(len(self.sims)):
|
||||||
|
self.ew_upload(i)
|
||||||
|
|
||||||
|
def ns_download(self, i):
|
||||||
|
#Download from the GPU
|
||||||
|
if self.north[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the north)
|
||||||
|
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.n[i][k,:,:], extent=self.read_n[i])
|
||||||
|
if self.south[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the south)
|
||||||
|
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.s[i][k,:,:], extent=self.read_s[i])
|
||||||
|
self.sims[i].stream.synchronize()
|
||||||
|
|
||||||
|
def ns_upload(self, i):
|
||||||
|
#Upload to the GPU
|
||||||
|
if self.north[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
self.sims[i].u0[k].upload(self.sims[i].stream, self.s[self.north[i]][k,:,:], extent=self.write_n[i])
|
||||||
|
if self.south[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
self.sims[i].u0[k].upload(self.sims[i].stream, self.n[self.south[i]][k,:,:], extent=self.write_s[i])
|
||||||
|
|
||||||
|
def ew_download(self, i):
|
||||||
|
#Download from the GPU
|
||||||
|
if self.east[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the east)
|
||||||
|
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.e[i][k,:,:], extent=self.read_e[i])
|
||||||
|
if self.west[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the west)
|
||||||
|
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.w[i][k,:,:], extent=self.read_w[i])
|
||||||
|
self.sims[i].stream.synchronize()
|
||||||
|
|
||||||
|
def ew_upload(self, i):
|
||||||
|
#Upload to the GPU
|
||||||
|
if self.east[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
self.sims[i].u0[k].upload(self.sims[i].stream, self.w[self.east[i]][k,:,:], extent=self.write_e[i])
|
||||||
|
#test_east = np.ones_like(self.e[self.east[i]][k,:,:])
|
||||||
|
#self.sims[i].u0[k].upload(self.sims[i].stream, test_east, extent=self.write_e[i])
|
||||||
|
if self.west[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
self.sims[i].u0[k].upload(self.sims[i].stream, self.e[self.west[i]][k,:,:], extent=self.write_w[i])
|
||||||
|
#test_west = np.ones_like(self.e[self.west[i]][k,:,:])
|
||||||
|
#self.sims[i].u0[k].upload(self.sims[i].stream, test_west, extent=self.write_w[i])
|
@ -162,6 +162,7 @@ class BaseSimulator(object):
|
|||||||
|
|
||||||
#Create a CUDA stream
|
#Create a CUDA stream
|
||||||
self.stream = cuda.Stream()
|
self.stream = cuda.Stream()
|
||||||
|
self.internal_stream = cuda.Stream()
|
||||||
|
|
||||||
#Keep track of simulation time and number of timesteps
|
#Keep track of simulation time and number of timesteps
|
||||||
self.t = 0.0
|
self.t = 0.0
|
||||||
|
@ -147,7 +147,18 @@ __global__ void KP07DimsplitKernel(
|
|||||||
float* E1_ptr_, int E1_pitch_,
|
float* E1_ptr_, int E1_pitch_,
|
||||||
|
|
||||||
//Output CFL
|
//Output CFL
|
||||||
float* cfl_) {
|
float* cfl_,
|
||||||
|
|
||||||
|
//Subarea of internal domain to compute
|
||||||
|
int x0=0, int y0=0,
|
||||||
|
int x1=0, int y1=0) {
|
||||||
|
|
||||||
|
if(x1 == 0)
|
||||||
|
x1 = nx_;
|
||||||
|
|
||||||
|
if(y1 == 0)
|
||||||
|
y1 = ny_;
|
||||||
|
|
||||||
const unsigned int w = BLOCK_WIDTH;
|
const unsigned int w = BLOCK_WIDTH;
|
||||||
const unsigned int h = BLOCK_HEIGHT;
|
const unsigned int h = BLOCK_HEIGHT;
|
||||||
const unsigned int gc_x = 2;
|
const unsigned int gc_x = 2;
|
||||||
@ -160,10 +171,10 @@ __global__ void KP07DimsplitKernel(
|
|||||||
__shared__ float F[4][h+2*gc_y][w+2*gc_x];
|
__shared__ float F[4][h+2*gc_y][w+2*gc_x];
|
||||||
|
|
||||||
//Read into shared memory
|
//Read into shared memory
|
||||||
readBlock<w, h, gc_x, gc_y, 1, 1>( rho0_ptr_, rho0_pitch_, Q[0], nx_, ny_, boundary_conditions_);
|
readBlock<w, h, gc_x, gc_y, 1, 1>( rho0_ptr_, rho0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
|
||||||
readBlock<w, h, gc_x, gc_y, -1, 1>(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_, ny_, boundary_conditions_);
|
readBlock<w, h, gc_x, gc_y, -1, 1>(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
|
||||||
readBlock<w, h, gc_x, gc_y, 1, -1>(rho_v0_ptr_, rho_v0_pitch_, Q[2], nx_, ny_, boundary_conditions_);
|
readBlock<w, h, gc_x, gc_y, 1, -1>(rho_v0_ptr_, rho_v0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
|
||||||
readBlock<w, h, gc_x, gc_y, 1, 1>( E0_ptr_, E0_pitch_, Q[3], nx_, ny_, boundary_conditions_);
|
readBlock<w, h, gc_x, gc_y, 1, 1>( E0_ptr_, E0_pitch_, Q[3], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
|
||||||
|
|
||||||
//Step 0 => evolve x first, then y
|
//Step 0 => evolve x first, then y
|
||||||
if (step_ == 0) {
|
if (step_ == 0) {
|
||||||
@ -224,10 +235,10 @@ __global__ void KP07DimsplitKernel(
|
|||||||
|
|
||||||
|
|
||||||
// Write to main memory for all internal cells
|
// Write to main memory for all internal cells
|
||||||
writeBlock<w, h, gc_x, gc_y>( rho1_ptr_, rho1_pitch_, Q[0], nx_, ny_, 0, 1);
|
writeBlock<w, h, gc_x, gc_y>( rho1_ptr_, rho1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1);
|
||||||
writeBlock<w, h, gc_x, gc_y>(rho_u1_ptr_, rho_u1_pitch_, Q[1], nx_, ny_, 0, 1);
|
writeBlock<w, h, gc_x, gc_y>(rho_u1_ptr_, rho_u1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1);
|
||||||
writeBlock<w, h, gc_x, gc_y>(rho_v1_ptr_, rho_v1_pitch_, Q[2], nx_, ny_, 0, 1);
|
writeBlock<w, h, gc_x, gc_y>(rho_v1_ptr_, rho_v1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1);
|
||||||
writeBlock<w, h, gc_x, gc_y>( E1_ptr_, E1_pitch_, Q[3], nx_, ny_, 0, 1);
|
writeBlock<w, h, gc_x, gc_y>( E1_ptr_, E1_pitch_, Q[3], nx_, ny_, 0, 1, x0, y0, x1, y1);
|
||||||
|
|
||||||
//Compute the CFL for this block
|
//Compute the CFL for this block
|
||||||
if (cfl_ != NULL) {
|
if (cfl_ != NULL) {
|
||||||
|
@ -321,7 +321,9 @@ template<int w, int h, int gc_x, int gc_y, int sign_x, int sign_y>
|
|||||||
inline __device__ void readBlock(float* ptr_, int pitch_,
|
inline __device__ void readBlock(float* ptr_, int pitch_,
|
||||||
float Q[h+2*gc_y][w+2*gc_x],
|
float Q[h+2*gc_y][w+2*gc_x],
|
||||||
const int nx_, const int ny_,
|
const int nx_, const int ny_,
|
||||||
const int boundary_conditions_) {
|
const int boundary_conditions_,
|
||||||
|
int x0, int y0,
|
||||||
|
int x1, int y1) {
|
||||||
//Index of block within domain
|
//Index of block within domain
|
||||||
const int bx = blockDim.x * blockIdx.x;
|
const int bx = blockDim.x * blockIdx.x;
|
||||||
const int by = blockDim.y * blockIdx.y;
|
const int by = blockDim.y * blockIdx.y;
|
||||||
@ -330,14 +332,14 @@ inline __device__ void readBlock(float* ptr_, int pitch_,
|
|||||||
//Loop over all variables
|
//Loop over all variables
|
||||||
for (int j=threadIdx.y; j<h+2*gc_y; j+=h) {
|
for (int j=threadIdx.y; j<h+2*gc_y; j+=h) {
|
||||||
//Handle periodic boundary conditions here
|
//Handle periodic boundary conditions here
|
||||||
int l = handlePeriodicBoundaryY<gc_y>(by + j, ny_, boundary_conditions_);
|
int l = handlePeriodicBoundaryY<gc_y>(by + j + y0, ny_, boundary_conditions_);
|
||||||
l = min(l, ny_+2*gc_y-1);
|
l = min(l, min(ny_+2*gc_y-1, y1+2*gc_y-1));
|
||||||
float* row = (float*) ((char*) ptr_ + pitch_*l);
|
float* row = (float*) ((char*) ptr_ + pitch_*l);
|
||||||
|
|
||||||
for (int i=threadIdx.x; i<w+2*gc_x; i+=w) {
|
for (int i=threadIdx.x; i<w+2*gc_x; i+=w) {
|
||||||
//Handle periodic boundary conditions here
|
//Handle periodic boundary conditions here
|
||||||
int k = handlePeriodicBoundaryX<gc_x>(bx + i, nx_, boundary_conditions_);
|
int k = handlePeriodicBoundaryX<gc_x>(bx + i + x0, nx_, boundary_conditions_);
|
||||||
k = min(k, nx_+2*gc_x-1);
|
k = min(k, min(nx_+2*gc_x-1, x1+2*gc_x-1));
|
||||||
|
|
||||||
//Read from global memory
|
//Read from global memory
|
||||||
Q[j][i] = row[k];
|
Q[j][i] = row[k];
|
||||||
@ -358,14 +360,20 @@ template<int w, int h, int gc_x, int gc_y>
|
|||||||
inline __device__ void writeBlock(float* ptr_, int pitch_,
|
inline __device__ void writeBlock(float* ptr_, int pitch_,
|
||||||
float shmem[h+2*gc_y][w+2*gc_x],
|
float shmem[h+2*gc_y][w+2*gc_x],
|
||||||
const int nx_, const int ny_,
|
const int nx_, const int ny_,
|
||||||
int rk_step_, int rk_order_) {
|
int rk_step_, int rk_order_,
|
||||||
|
int x0, int y0,
|
||||||
|
int x1, int y1) {
|
||||||
|
|
||||||
//Index of cell within domain
|
//Index of cell within domain
|
||||||
const int ti = blockDim.x*blockIdx.x + threadIdx.x + gc_x;
|
const int ti = blockDim.x*blockIdx.x + threadIdx.x + gc_x + x0;
|
||||||
const int tj = blockDim.y*blockIdx.y + threadIdx.y + gc_y;
|
const int tj = blockDim.y*blockIdx.y + threadIdx.y + gc_y + y0;
|
||||||
|
|
||||||
|
//In case we are writing only to a subarea given by (x0, y0) x (x1, y1)
|
||||||
|
const int max_ti = min(nx_+gc_x, x1+gc_x);
|
||||||
|
const int max_tj = min(ny_+gc_y, y1+gc_y);
|
||||||
|
|
||||||
//Only write internal cells
|
//Only write internal cells
|
||||||
if (ti < nx_+gc_x && tj < ny_+gc_y) {
|
if ((x0+gc_x <= ti) && (ti < max_ti) && (y0+gc_y <= tj) && (tj < max_tj)) {
|
||||||
//Index of thread within block
|
//Index of thread within block
|
||||||
const int tx = threadIdx.x + gc_x;
|
const int tx = threadIdx.x + gc_x;
|
||||||
const int ty = threadIdx.y + gc_y;
|
const int ty = threadIdx.y + gc_y;
|
||||||
@ -416,6 +424,9 @@ inline __device__ void writeBlock(float* ptr_, int pitch_,
|
|||||||
row[ti] = t*row[ti] + (1.0f-t)*shmem[ty][tx];
|
row[ti] = t*row[ti] + (1.0f-t)*shmem[ty][tx];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// DEBUG
|
||||||
|
//row[ti] = 99.0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -25,10 +25,13 @@ import numpy as np
|
|||||||
import gc
|
import gc
|
||||||
|
|
||||||
|
|
||||||
def getExtent(width, height, nx, ny, grid):
|
def getExtent(width, height, nx, ny, grid, index=None):
|
||||||
if grid is not None:
|
if grid is not None:
|
||||||
gx = grid.grid[0]
|
gx = grid.grid[0]
|
||||||
gy = grid.grid[1]
|
gy = grid.grid[1]
|
||||||
|
if index is not None:
|
||||||
|
i, j = grid.getCoordinate(index)
|
||||||
|
else:
|
||||||
i, j = grid.getCoordinate()
|
i, j = grid.getCoordinate()
|
||||||
|
|
||||||
dx = (width / gx) / nx
|
dx = (width / gx) / nx
|
||||||
@ -192,7 +195,7 @@ def genShockBubble(nx, ny, gamma, grid=None):
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
def genKelvinHelmholtz(nx, ny, gamma, roughness=0.125, grid=None):
|
def genKelvinHelmholtz(nx, ny, gamma, roughness=0.125, grid=None, index=None):
|
||||||
"""
|
"""
|
||||||
Roughness parameter in (0, 1.0] determines how "squiggly"
|
Roughness parameter in (0, 1.0] determines how "squiggly"
|
||||||
the interface betweeen the zones is
|
the interface betweeen the zones is
|
||||||
@ -234,7 +237,7 @@ def genKelvinHelmholtz(nx, ny, gamma, roughness=0.125, grid=None):
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
x0, x1, y0, y1, _, dy = getExtent(1.0, 1.0, nx, ny, grid)
|
x0, x1, y0, y1, _, dy = getExtent(1.0, 1.0, nx, ny, grid, index)
|
||||||
x = np.linspace(x0, x1, nx)
|
x = np.linspace(x0, x1, nx)
|
||||||
y = np.linspace(y0, y1, ny)
|
y = np.linspace(y0, y1, ny)
|
||||||
_, y = np.meshgrid(x, y)
|
_, y = np.meshgrid(x, y)
|
||||||
@ -274,7 +277,7 @@ def genKelvinHelmholtz(nx, ny, gamma, roughness=0.125, grid=None):
|
|||||||
|
|
||||||
E = 0.5*rho*(u**2+v**2) + p/(gamma-1.0)
|
E = 0.5*rho*(u**2+v**2) + p/(gamma-1.0)
|
||||||
|
|
||||||
_, _, _, _, dx, dy = getExtent(width, height, nx, ny, grid)
|
_, _, _, _, dx, dy = getExtent(width, height, nx, ny, grid, index)
|
||||||
|
|
||||||
|
|
||||||
bc = BoundaryCondition({
|
bc = BoundaryCondition({
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
"""
|
"""
|
||||||
This python module implements Cuda context handling
|
This python module implements visualization techniques/modes
|
||||||
|
|
||||||
Copyright (C) 2018 SINTEF ICT
|
Copyright (C) 2018 SINTEF ICT
|
||||||
|
|
||||||
|
18
README.md
Normal file
18
README.md
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
# ShallowWaterGPU
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
A good place to start exploring this codebase is the notebooks. Complete the following steps to run the notebooks:
|
||||||
|
|
||||||
|
1. Install conda (see e.g. Miniconda or Anaconda)
|
||||||
|
2. Change directory to the repository root and run the following commands
|
||||||
|
3. conda env create -f conda_environment.yml
|
||||||
|
4. conda activate ShallowWaterGPU
|
||||||
|
5. jupyter notebook
|
||||||
|
|
||||||
|
Make sure you are running the correct kernel ("conda:ShallowWaterGPU"). If not, change kernel using the "Kernel"-menu in the notebook.
|
||||||
|
|
||||||
|
If you do not need to run notebooks you may use the conda environment found in conda_environment_hpc.yml
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
Have a look at the conda documentation and https://towardsdatascience.com/how-to-set-up-anaconda-and-jupyter-notebook-the-right-way-de3b7623ea4a
|
||||||
|
|
62
SYSTEMS.md
Normal file
62
SYSTEMS.md
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
$ANSIBLE_VAULT;1.1;AES256
|
||||||
|
61316265663939333638336466323036663861343233316466646432313138653633623662353937
|
||||||
|
3232313165656633346432376237383566363537366534310a303231343936663438653835373161
|
||||||
|
35616161323432653062323164623861353065333761663136313137333732313230626665386336
|
||||||
|
6166656538396463370a356166316363326133313864386536323236346634323537393639653038
|
||||||
|
66336337336132613061353964613638326233356336323962366531333932366539366339623563
|
||||||
|
36333365326463616634323939333062363263636663373635653064626138363464666233316561
|
||||||
|
63393735393233616437386537393739393433663631313864646535636262616336333631396166
|
||||||
|
38643636323530386565396338623366393232313838356536303537393338393634666632656234
|
||||||
|
65353930303762333639376638336364303439306132626531326132376264623063376464636430
|
||||||
|
32333536386134333136313139313861306364333037323363393463333664633764653937623866
|
||||||
|
34313064346261313330373132353563343761323435393930303136353865303163373937623831
|
||||||
|
64343038373162333039653161643233353764633337366434396638376530636261323362373434
|
||||||
|
38393630613065356632663533333331633039663935663732353234643131306665343339373265
|
||||||
|
64356563653838613337663132663234356462343333623139626662363764656239326637653832
|
||||||
|
35396636643937336431623531306133643137623831333936313839333738333730373136666336
|
||||||
|
35623965643664343164373630313362656663386638376237616134343631386366313336626138
|
||||||
|
62376436383837376539663438346431633138383363633862356366376537393932626262383637
|
||||||
|
31323365333139653736623233636233323162343039663035346135326638633430303134396337
|
||||||
|
36353264313835346130643736663665386364343835643166383361316631373338663731373335
|
||||||
|
30313530326662663937666330643565363565616566633333363535656539656531666266613638
|
||||||
|
30306264613438363265646332386535383238373433396337633636616532626161343236336533
|
||||||
|
36366362653137333739353737386563613136653164383437316237643533633133313735633363
|
||||||
|
64326433356266363133343339626333633063326533383632353639613163663966376465396231
|
||||||
|
36663034363534396430316463386564663465323036613636343136643262666566303533346439
|
||||||
|
63396466656639623836613130363835346435633437666463363333356231343038356434343861
|
||||||
|
66643636633739336666316566653136363862346336353862653130346335363334616430366435
|
||||||
|
30376365383262326438306266366265363030353764633630333034663037643037343132303631
|
||||||
|
39316364366234363339363130333765616432306331373566393530653963356539636437383062
|
||||||
|
34633938643563656363633864656361643539663833356638356365373061663964363530393535
|
||||||
|
37646533386235613763396638393539303062326239633238373763326561313634313265613135
|
||||||
|
64646138313562313732393732303133343234323438616165326530333234626363393735636530
|
||||||
|
62353735313231353662353533636134306530623339383730306332613636663366653566313935
|
||||||
|
32343935353566656130393533323639353863666436333839386463396337336635356663373136
|
||||||
|
61323734613239396236393266363631313465363630306565636663396235626132336339623938
|
||||||
|
62383435643661623938393662363262376566613365613465323432343534356433323330666133
|
||||||
|
30303963656635303734316539333038663962626331313366666337663165323230646564623935
|
||||||
|
61316630353739386365323339626166323562616630383538393733353864396565353039656333
|
||||||
|
30343038636231363531383061613836653038373937616163643963393231356235626531366239
|
||||||
|
62343333326434636665363931376235313535343135626261336439636663323233383565633964
|
||||||
|
65333830613131396630336337646230393038386536336365313738316335386261393838383961
|
||||||
|
64656331363738616539346663613261386639353437316231636533353031336464383432623939
|
||||||
|
65386164396231393735643563663337643563633233373338643630313739373861356166616463
|
||||||
|
35306263333963663434376263396464323135346663376334356134393066653439376263376231
|
||||||
|
30333730383163366636323533393334336331633234306536376634313735613263366537346536
|
||||||
|
62366564383861656662353738366665396639313833323038356661306135393338333466333563
|
||||||
|
32653861346166663163383036386432343833333137663462343030363762663139366534326466
|
||||||
|
66313864623438336164333430613766373430656536323964633863333931643036656563353639
|
||||||
|
30313835666366383035343031643265386263316165323537613636656533376239633964393866
|
||||||
|
61646163343032313036303738643763383364663134356634373262633361383035306231636364
|
||||||
|
39333232636538643033313438396332383962656131363365666566633239366532326336363133
|
||||||
|
38393064643030333538333562643435663434343863383834663266373337336433313663646164
|
||||||
|
36343334343965623830613736393231666361643239663062393239613233376335383362666161
|
||||||
|
66383035653330373736613234303631386163656561383138613363613539396332376162316131
|
||||||
|
61313532653531653836343731636535623066383231613635316432323331623761383833623333
|
||||||
|
39343632623961613561373261653939636363366531303839336237383166363733303538363237
|
||||||
|
36373362636263666334316163633766303334373033636539353464393536356466636664333665
|
||||||
|
32643135626366666137626464393961366165383334343063356334373534633764326162363837
|
||||||
|
38643662326266313464343464646166643235663663303761313639376537306337353863336264
|
||||||
|
66376335333738366265343636376363366365306137336665623466626261653937656461303332
|
||||||
|
32613561616662383032393562613831626666373134303032626134313262363830326530643632
|
||||||
|
61366133663564313933366430396430353762386133396436633839303766653765
|
33
dgx-2-shmem-test.job
Normal file
33
dgx-2-shmem-test.job
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#SBATCH -p dgx2q # partition (GPU queue)
|
||||||
|
#SBATCH -N 1 # number of nodes
|
||||||
|
#SBATCH -n 1 # number of cores
|
||||||
|
#SBATCH -w g001 # DGX-2 node
|
||||||
|
#SBATCH --gres=gpu:1 # number of V100's
|
||||||
|
#SBATCH -t 0-00:10 # time (D-HH:MM)
|
||||||
|
#SBATCH -o slurm.%N.%j.out # STDOUT
|
||||||
|
#SBATCH -e slurm.%N.%j.err # STDERR
|
||||||
|
|
||||||
|
ulimit -s 10240
|
||||||
|
module load slurm
|
||||||
|
module load cuda10.1/toolkit/10.1.243
|
||||||
|
|
||||||
|
# Check how many gpu's your job got
|
||||||
|
#nvidia-smi
|
||||||
|
|
||||||
|
## Copy input files to the work directory:
|
||||||
|
rm -rf /work/$USER/ShallowWaterGPU
|
||||||
|
mkdir -p /work/$USER/ShallowWaterGPU
|
||||||
|
cp -r . /work/$USER/ShallowWaterGPU
|
||||||
|
|
||||||
|
# Run job
|
||||||
|
# (Assumes Miniconda is installed in user root dir.)
|
||||||
|
cd /work/$USER/ShallowWaterGPU
|
||||||
|
nvprof -o profiler_output $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 shmemTesting.py
|
||||||
|
cd $HOME/src/ShallowWaterGPU
|
||||||
|
|
||||||
|
## Copy files from work directory:
|
||||||
|
# (NOTE: Copying is not performed if job fails!)
|
||||||
|
cp /work/$USER/ShallowWaterGPU/*.log .
|
||||||
|
cp /work/$USER/ShallowWaterGPU/*.nc .
|
||||||
|
cp /work/$USER/ShallowWaterGPU/profiler_output .
|
@ -1,10 +1,10 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
# See http://wiki.ex3.simula.no before changing the values below
|
||||||
#SBATCH -p dgx2q # partition (GPU queue)
|
#SBATCH -p dgx2q # partition (GPU queue)
|
||||||
#SBATCH -N 1 # number of nodes
|
#SBATCH -N 1 # number of nodes
|
||||||
#SBATCH -n 4 # number of cores
|
#SBATCH -n 4 # number of cores
|
||||||
#SBATCH -w g001 # DGX-2 node
|
#SBATCH -w g001 # DGX-2 node
|
||||||
#SBATCH --gres=gpu:4 # number of V100's
|
#SBATCH --gres=gpu:4 # number of V100's
|
||||||
#SBATCH --mem 10G # memory pool for all cores
|
|
||||||
#SBATCH -t 0-00:10 # time (D-HH:MM)
|
#SBATCH -t 0-00:10 # time (D-HH:MM)
|
||||||
#SBATCH -o slurm.%N.%j.out # STDOUT
|
#SBATCH -o slurm.%N.%j.out # STDOUT
|
||||||
#SBATCH -e slurm.%N.%j.err # STDERR
|
#SBATCH -e slurm.%N.%j.err # STDERR
|
||||||
@ -18,6 +18,7 @@ module load cuda10.1/toolkit/10.1.243
|
|||||||
#nvidia-smi
|
#nvidia-smi
|
||||||
|
|
||||||
## Copy input files to the work directory:
|
## Copy input files to the work directory:
|
||||||
|
rm -rf /work/$USER/ShallowWaterGPU
|
||||||
mkdir -p /work/$USER/ShallowWaterGPU
|
mkdir -p /work/$USER/ShallowWaterGPU
|
||||||
cp -r . /work/$USER/ShallowWaterGPU
|
cp -r . /work/$USER/ShallowWaterGPU
|
||||||
|
|
||||||
|
59
dgx-2_scaling_benchmark.job
Normal file
59
dgx-2_scaling_benchmark.job
Normal file
@ -0,0 +1,59 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# See http://wiki.ex3.simula.no before changing the values below
|
||||||
|
#SBATCH -p dgx2q # partition (GPU queue)
|
||||||
|
#SBATCH -w g001 # DGX-2 node
|
||||||
|
##SBATCH --gres=gpu:4 # number of V100's
|
||||||
|
#SBATCH -t 0-00:10 # time (D-HH:MM)
|
||||||
|
#SBATCH -o slurm.%N.%j.out # STDOUT
|
||||||
|
#SBATCH -e slurm.%N.%j.err # STDERR
|
||||||
|
#SBATCH --reservation=martinls_17
|
||||||
|
|
||||||
|
|
||||||
|
# For Linux 64, Open MPI is built with CUDA awareness but this support is disabled by default.
|
||||||
|
# To enable it, please set the environment variable OMPI_MCA_opal_cuda_support=true before
|
||||||
|
# launching your MPI processes. Equivalently, you can set the MCA parameter in the command line:
|
||||||
|
# mpiexec --mca opal_cuda_support 1 ...
|
||||||
|
#
|
||||||
|
# In addition, the UCX support is also built but disabled by default.
|
||||||
|
# To enable it, first install UCX (conda install -c conda-forge ucx). Then, set the environment
|
||||||
|
# variables OMPI_MCA_pml="ucx" OMPI_MCA_osc="ucx" before launching your MPI processes.
|
||||||
|
# Equivalently, you can set the MCA parameters in the command line:
|
||||||
|
# mpiexec --mca pml ucx --mca osc ucx ...
|
||||||
|
# Note that you might also need to set UCX_MEMTYPE_CACHE=n for CUDA awareness via UCX.
|
||||||
|
# Please consult UCX's documentation for detail.
|
||||||
|
|
||||||
|
ulimit -s 10240
|
||||||
|
module load slurm/20.02.7
|
||||||
|
module load cuda11.2/toolkit/11.2.2
|
||||||
|
module load openmpi4-cuda11.2-ofed50-gcc8/4.1.0
|
||||||
|
|
||||||
|
# Check how many gpu's your job got
|
||||||
|
#nvidia-smi
|
||||||
|
|
||||||
|
mkdir -p output_dgx-2/$NOW
|
||||||
|
|
||||||
|
## Copy input files to the work directory:
|
||||||
|
mkdir -p /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
|
||||||
|
cp -r . /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
|
||||||
|
|
||||||
|
# Run job
|
||||||
|
# (Assumes Miniconda is installed in user root dir.)
|
||||||
|
cd /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
|
||||||
|
#mpirun --mca btl_openib_if_include mlx5_0 --mca btl_openib_warn_no_device_params_found 0 $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
|
||||||
|
#nsys profile -t nvtx,cuda mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
|
||||||
|
#mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
|
||||||
|
|
||||||
|
export OMPI_MCA_opal_cuda_support=true
|
||||||
|
mpirun -np $SLURM_NTASKS $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
|
||||||
|
|
||||||
|
cd $HOME/src/ShallowWaterGPU
|
||||||
|
|
||||||
|
## Copy files from work directory:
|
||||||
|
# (NOTE: Copying is not performed if job fails!)
|
||||||
|
mkdir -p output_dgx-2/$NOW/$SLURM_JOB_ID
|
||||||
|
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.log ./output_dgx-2/$NOW/$SLURM_JOB_ID
|
||||||
|
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.nc ./output_dgx-2/$NOW/$SLURM_JOB_ID
|
||||||
|
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.json ./output_dgx-2/$NOW
|
||||||
|
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.qdrep ./output_dgx-2/$NOW
|
||||||
|
|
||||||
|
rm -rf /work/$USER/$SLURM_JOB_ID
|
73
dgx-2_strong_scaling_benchmark.sh
Normal file
73
dgx-2_strong_scaling_benchmark.sh
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")
|
||||||
|
|
||||||
|
# one node: 1-16 GPUs
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=4096,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=2731,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=2048,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=1638,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=1365,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=1170,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=1024,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=8192,NY=910,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=8192,NY=819,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=8192,NY=745,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=8192,NY=683,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=8192,NY=630,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=8192,NY=585,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=8192,NY=546,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=8192,NY=512,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
|
||||||
|
# one node: 4-16 GPUs
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=41984,NY=10496,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=41984,NY=8396,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=41984,NY=6997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=41984,NY=5997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=41984,NY=5248,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=41984,NY=4664,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=41984,NY=4198,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=41984,NY=3816,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=41984,NY=3498,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=41984,NY=3229,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=41984,NY=2998,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=41984,NY=2798,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=41984,NY=2624,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
|
||||||
|
# one node: 1-16 GPUs
|
||||||
|
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=22528,NY=11264,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=22528,NY=7509,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=22528,NY=5632,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=22528,NY=4505,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=22528,NY=3754,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=22528,NY=3218,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=22528,NY=2816,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
|
||||||
|
sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=22528,NY=2503,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=22528,NY=2252,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=22528,NY=2048,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=22528,NY=1877,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=22528,NY=1732,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=22528,NY=1609,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=22528,NY=1501,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=22528,NY=1408,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
|
||||||
|
# one node: 4-16 GPUs
|
||||||
|
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=45056,NY=11264,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=45056,NY=8396,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=45056,NY=6997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=45056,NY=5997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=45056,NY=5248,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
|
||||||
|
sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=45056,NY=4664,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=45056,NY=4198,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=45056,NY=3816,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=45056,NY=3498,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=45056,NY=3229,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=45056,NY=2998,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=45056,NY=2798,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=45056,NY=2624,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
41
dgx-2_weak_scaling_benchmark.sh
Normal file
41
dgx-2_weak_scaling_benchmark.sh
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")
|
||||||
|
|
||||||
|
# one node: 1-16 GPUs
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
|
||||||
|
# one node: 1-16 GPUs
|
||||||
|
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
|
||||||
|
sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
58
hgx_scaling_benchmark.job
Normal file
58
hgx_scaling_benchmark.job
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# See http://wiki.ex3.simula.no before changing the values below
|
||||||
|
#SBATCH -p hgx2q # partition (GPU queue)
|
||||||
|
#SBATCH -w g002 # HGX node
|
||||||
|
#SBATCH -t 0-00:10 # time (D-HH:MM)
|
||||||
|
#SBATCH -o slurm.%N.%j.out # STDOUT
|
||||||
|
#SBATCH -e slurm.%N.%j.err # STDERR
|
||||||
|
#SBATCH --reservation=martinls_11
|
||||||
|
|
||||||
|
|
||||||
|
# For Linux 64, Open MPI is built with CUDA awareness but this support is disabled by default.
|
||||||
|
# To enable it, please set the environment variable OMPI_MCA_opal_cuda_support=true before
|
||||||
|
# launching your MPI processes. Equivalently, you can set the MCA parameter in the command line:
|
||||||
|
# mpiexec --mca opal_cuda_support 1 ...
|
||||||
|
#
|
||||||
|
# In addition, the UCX support is also built but disabled by default.
|
||||||
|
# To enable it, first install UCX (conda install -c conda-forge ucx). Then, set the environment
|
||||||
|
# variables OMPI_MCA_pml="ucx" OMPI_MCA_osc="ucx" before launching your MPI processes.
|
||||||
|
# Equivalently, you can set the MCA parameters in the command line:
|
||||||
|
# mpiexec --mca pml ucx --mca osc ucx ...
|
||||||
|
# Note that you might also need to set UCX_MEMTYPE_CACHE=n for CUDA awareness via UCX.
|
||||||
|
# Please consult UCX's documentation for detail.
|
||||||
|
|
||||||
|
ulimit -s 10240
|
||||||
|
module load slurm/20.02.7
|
||||||
|
module load cuda11.2/toolkit/11.2.2
|
||||||
|
module load openmpi4-cuda11.2-ofed50-gcc8/4.1.0
|
||||||
|
|
||||||
|
# Check how many gpu's your job got
|
||||||
|
#nvidia-smi
|
||||||
|
|
||||||
|
mkdir -p output_hgx/$NOW
|
||||||
|
|
||||||
|
## Copy input files to the work directory:
|
||||||
|
mkdir -p /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
|
||||||
|
cp -r . /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
|
||||||
|
|
||||||
|
# Run job
|
||||||
|
# (Assumes Miniconda is installed in user root dir.)
|
||||||
|
cd /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
|
||||||
|
#mpirun --mca btl_openib_if_include mlx5_0 --mca btl_openib_warn_no_device_params_found 0 $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
|
||||||
|
#nsys profile -t nvtx,cuda mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
|
||||||
|
#mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
|
||||||
|
|
||||||
|
export OMPI_MCA_opal_cuda_support=true
|
||||||
|
mpirun -np $SLURM_NTASKS $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
|
||||||
|
|
||||||
|
cd $HOME/src/ShallowWaterGPU
|
||||||
|
|
||||||
|
## Copy files from work directory:
|
||||||
|
# (NOTE: Copying is not performed if job fails!)
|
||||||
|
mkdir -p output_hgx/$NOW/$SLURM_JOB_ID
|
||||||
|
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.log ./output_hgx/$NOW/$SLURM_JOB_ID
|
||||||
|
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.nc ./output_hgx/$NOW/$SLURM_JOB_ID
|
||||||
|
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.json ./output_hgx/$NOW
|
||||||
|
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.qdrep ./output_hgx/$NOW
|
||||||
|
|
||||||
|
rm -rf /work/$USER/$SLURM_JOB_ID
|
20
hgx_strong_scaling_benchmark.sh
Normal file
20
hgx_strong_scaling_benchmark.sh
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")
|
||||||
|
|
||||||
|
# one node: 1-8 GPUs
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=4096,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=2731,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=2048,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=1638,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=1365,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=1170,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=1024,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
|
||||||
|
# one node: 4-8 GPUs
|
||||||
|
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=41984,NY=10496,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=41984,NY=8396,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=41984,NY=6997,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=41984,NY=5997,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=41984,NY=5248,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
23
hgx_weak_scaling_benchmark.sh
Normal file
23
hgx_weak_scaling_benchmark.sh
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")
|
||||||
|
|
||||||
|
# one node: 1-16 GPUs
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
|
||||||
|
# one node: 1-8 GPUs
|
||||||
|
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
107
mpiTesting.py
107
mpiTesting.py
@ -25,26 +25,42 @@ import gc
|
|||||||
import time
|
import time
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
#MPI
|
# MPI
|
||||||
from mpi4py import MPI
|
from mpi4py import MPI
|
||||||
|
|
||||||
#CUDA
|
# CUDA
|
||||||
import pycuda.driver as cuda
|
import pycuda.driver as cuda
|
||||||
|
|
||||||
#Simulator engine etc
|
# Simulator engine etc
|
||||||
from GPUSimulators import MPISimulator, Common, CudaContext
|
from GPUSimulators import MPISimulator, Common, CudaContext
|
||||||
from GPUSimulators import EE2D_KP07_dimsplit
|
from GPUSimulators import EE2D_KP07_dimsplit
|
||||||
from GPUSimulators.helpers import InitialConditions as IC
|
from GPUSimulators.helpers import InitialConditions as IC
|
||||||
from GPUSimulators.Simulator import BoundaryCondition as BC
|
from GPUSimulators.Simulator import BoundaryCondition as BC
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
parser = argparse.ArgumentParser(description='Strong and weak scaling experiments.')
|
||||||
|
parser.add_argument('-nx', type=int, default=128)
|
||||||
|
parser.add_argument('-ny', type=int, default=128)
|
||||||
|
parser.add_argument('--profile', action='store_true') # default: False
|
||||||
|
|
||||||
#Get MPI COMM to use
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if(args.profile):
|
||||||
|
profiling_data = {}
|
||||||
|
# profiling: total run time
|
||||||
|
t_total_start = time.time()
|
||||||
|
t_init_start = time.time()
|
||||||
|
|
||||||
|
|
||||||
|
# Get MPI COMM to use
|
||||||
comm = MPI.COMM_WORLD
|
comm = MPI.COMM_WORLD
|
||||||
|
|
||||||
|
|
||||||
####
|
####
|
||||||
#Initialize logging
|
# Initialize logging
|
||||||
####
|
####
|
||||||
log_level_console = 20
|
log_level_console = 20
|
||||||
log_level_file = 10
|
log_level_file = 10
|
||||||
@ -55,15 +71,17 @@ logger.setLevel(min(log_level_console, log_level_file))
|
|||||||
ch = logging.StreamHandler()
|
ch = logging.StreamHandler()
|
||||||
ch.setLevel(log_level_console)
|
ch.setLevel(log_level_console)
|
||||||
logger.addHandler(ch)
|
logger.addHandler(ch)
|
||||||
logger.info("Console logger using level %s", logging.getLevelName(log_level_console))
|
logger.info("Console logger using level %s",
|
||||||
|
logging.getLevelName(log_level_console))
|
||||||
|
|
||||||
fh = logging.FileHandler(log_filename)
|
fh = logging.FileHandler(log_filename)
|
||||||
formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s')
|
formatter = logging.Formatter(
|
||||||
|
'%(asctime)s:%(name)s:%(levelname)s: %(message)s')
|
||||||
fh.setFormatter(formatter)
|
fh.setFormatter(formatter)
|
||||||
fh.setLevel(log_level_file)
|
fh.setLevel(log_level_file)
|
||||||
logger.addHandler(fh)
|
logger.addHandler(fh)
|
||||||
logger.info("File logger using level %s to %s", logging.getLevelName(log_level_file), log_filename)
|
logger.info("File logger using level %s to %s",
|
||||||
|
logging.getLevelName(log_level_file), log_filename)
|
||||||
|
|
||||||
|
|
||||||
####
|
####
|
||||||
@ -73,7 +91,6 @@ logger.info("Creating MPI grid")
|
|||||||
grid = MPISimulator.MPIGrid(MPI.COMM_WORLD)
|
grid = MPISimulator.MPIGrid(MPI.COMM_WORLD)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
####
|
####
|
||||||
# Initialize CUDA
|
# Initialize CUDA
|
||||||
####
|
####
|
||||||
@ -82,18 +99,28 @@ logger.info("Initializing CUDA")
|
|||||||
local_rank = grid.getLocalRank()
|
local_rank = grid.getLocalRank()
|
||||||
num_cuda_devices = cuda.Device.count()
|
num_cuda_devices = cuda.Device.count()
|
||||||
cuda_device = local_rank % num_cuda_devices
|
cuda_device = local_rank % num_cuda_devices
|
||||||
|
logger.info("Process %s using CUDA device %s", str(local_rank), str(cuda_device))
|
||||||
cuda_context = CudaContext.CudaContext(device=cuda_device, autotuning=False)
|
cuda_context = CudaContext.CudaContext(device=cuda_device, autotuning=False)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
####
|
####
|
||||||
# Set initial conditions
|
# Set initial conditions
|
||||||
####
|
####
|
||||||
|
|
||||||
|
# DEBUGGING - setting random seed
|
||||||
|
np.random.seed(42)
|
||||||
|
|
||||||
logger.info("Generating initial conditions")
|
logger.info("Generating initial conditions")
|
||||||
nx = 128
|
nx = args.nx
|
||||||
ny = 128
|
ny = args.ny
|
||||||
|
|
||||||
|
dt = 0.000001
|
||||||
|
|
||||||
gamma = 1.4
|
gamma = 1.4
|
||||||
save_times = np.linspace(0, 5.0, 10)
|
#save_times = np.linspace(0, 0.000009, 2)
|
||||||
|
#save_times = np.linspace(0, 0.000099, 11)
|
||||||
|
#save_times = np.linspace(0, 0.000099, 2)
|
||||||
|
save_times = np.linspace(0, 0.0000999, 2)
|
||||||
outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc"
|
outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc"
|
||||||
save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
|
save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
|
||||||
|
|
||||||
@ -102,21 +129,65 @@ arguments['context'] = cuda_context
|
|||||||
arguments['theta'] = 1.2
|
arguments['theta'] = 1.2
|
||||||
arguments['grid'] = grid
|
arguments['grid'] = grid
|
||||||
|
|
||||||
|
if(args.profile):
|
||||||
|
t_init_end = time.time()
|
||||||
|
t_init = t_init_end - t_init_start
|
||||||
|
profiling_data["t_init"] = t_init
|
||||||
|
|
||||||
####
|
####
|
||||||
# Run simulation
|
# Run simulation
|
||||||
####
|
####
|
||||||
logger.info("Running simulation")
|
logger.info("Running simulation")
|
||||||
#Helper function to create MPI simulator
|
# Helper function to create MPI simulator
|
||||||
|
|
||||||
|
|
||||||
def genSim(grid, **kwargs):
|
def genSim(grid, **kwargs):
|
||||||
local_sim = EE2D_KP07_dimsplit.EE2D_KP07_dimsplit(**kwargs)
|
local_sim = EE2D_KP07_dimsplit.EE2D_KP07_dimsplit(**kwargs)
|
||||||
sim = MPISimulator.MPISimulator(local_sim, grid)
|
sim = MPISimulator.MPISimulator(local_sim, grid)
|
||||||
return sim
|
return sim
|
||||||
outfile = Common.runSimulation(genSim, arguments, outfile, save_times, save_var_names)
|
|
||||||
|
|
||||||
|
|
||||||
|
outfile, sim_runner_profiling_data, sim_profiling_data = Common.runSimulation(
|
||||||
|
genSim, arguments, outfile, save_times, save_var_names, dt)
|
||||||
|
|
||||||
|
if(args.profile):
|
||||||
|
t_total_end = time.time()
|
||||||
|
t_total = t_total_end - t_total_start
|
||||||
|
profiling_data["t_total"] = t_total
|
||||||
|
print("Total run time on rank " + str(MPI.COMM_WORLD.rank) + " is " + str(t_total) + " s")
|
||||||
|
|
||||||
|
# write profiling to json file
|
||||||
|
if(args.profile and MPI.COMM_WORLD.rank == 0):
|
||||||
|
job_id = ""
|
||||||
|
if "SLURM_JOB_ID" in os.environ:
|
||||||
|
job_id = int(os.environ["SLURM_JOB_ID"])
|
||||||
|
allocated_nodes = int(os.environ["SLURM_JOB_NUM_NODES"])
|
||||||
|
allocated_gpus = int(os.environ["CUDA_VISIBLE_DEVICES"].count(",") + 1)
|
||||||
|
profiling_file = "MPI_jobid_" + \
|
||||||
|
str(job_id) + "_" + str(allocated_nodes) + "_nodes_and_" + str(allocated_gpus) + "_GPUs_profiling.json"
|
||||||
|
profiling_data["outfile"] = outfile
|
||||||
|
else:
|
||||||
|
profiling_file = "MPI_" + str(MPI.COMM_WORLD.size) + "_procs_and_" + str(num_cuda_devices) + "_GPUs_profiling.json"
|
||||||
|
|
||||||
|
for stage in sim_runner_profiling_data["start"].keys():
|
||||||
|
profiling_data[stage] = sim_runner_profiling_data["end"][stage] - sim_runner_profiling_data["start"][stage]
|
||||||
|
|
||||||
|
for stage in sim_profiling_data["start"].keys():
|
||||||
|
profiling_data[stage] = sim_profiling_data["end"][stage] - sim_profiling_data["start"][stage]
|
||||||
|
|
||||||
|
profiling_data["nx"] = nx
|
||||||
|
profiling_data["ny"] = ny
|
||||||
|
profiling_data["dt"] = dt
|
||||||
|
profiling_data["n_time_steps"] = sim_profiling_data["n_time_steps"]
|
||||||
|
|
||||||
|
profiling_data["slurm_job_id"] = job_id
|
||||||
|
profiling_data["n_cuda_devices"] = str(num_cuda_devices)
|
||||||
|
profiling_data["n_processes"] = str(MPI.COMM_WORLD.size)
|
||||||
|
profiling_data["git_hash"] = Common.getGitHash()
|
||||||
|
profiling_data["git_status"] = Common.getGitStatus()
|
||||||
|
|
||||||
|
with open(profiling_file, "w") as write_file:
|
||||||
|
json.dump(profiling_data, write_file)
|
||||||
|
|
||||||
####
|
####
|
||||||
# Clean shutdown
|
# Clean shutdown
|
||||||
|
8
run_script_ppi.sh
Executable file
8
run_script_ppi.sh
Executable file
@ -0,0 +1,8 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
module purge
|
||||||
|
module load git/2.21.0 hdf5/1.10.5-gcc cuda/10.1
|
||||||
|
|
||||||
|
conda activate ShallowWaterGPU_HPC
|
||||||
|
|
||||||
|
python mpiTesting.py
|
||||||
|
|
54
saga-dev.job
Normal file
54
saga-dev.job
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Job name:
|
||||||
|
#SBATCH --job-name=ShallowWaterGPUScalingDev
|
||||||
|
#
|
||||||
|
# Project:
|
||||||
|
#SBATCH --account=nn9882k
|
||||||
|
#
|
||||||
|
# Wall clock limit:
|
||||||
|
#SBATCH --time=00:02:00
|
||||||
|
#
|
||||||
|
# NOTE: See https://documentation.sigma2.no/jobs/projects_accounting.html when adjusting the values below
|
||||||
|
#
|
||||||
|
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
|
||||||
|
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
|
||||||
|
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
|
||||||
|
#SBATCH --partition=accel
|
||||||
|
#
|
||||||
|
# Max memory usage per task (core) - increasing this will cost more core hours:
|
||||||
|
#SBATCH --mem-per-cpu=3800M
|
||||||
|
#
|
||||||
|
# Number of tasks:
|
||||||
|
#SBATCH --nodes=1 --gpus-per-node=1 --ntasks-per-node=1
|
||||||
|
#
|
||||||
|
#SBATCH --qos=devel
|
||||||
|
|
||||||
|
## Set up job environment: (this is done automatically behind the scenes)
|
||||||
|
## (make sure to comment '#' or remove the following line 'source ...')
|
||||||
|
# source /cluster/bin/jobsetup
|
||||||
|
|
||||||
|
module restore system # instead of 'module purge' rather set module environment to the system default
|
||||||
|
module load CUDA/11.4.1
|
||||||
|
|
||||||
|
# It is also recommended to to list loaded modules, for easier debugging:
|
||||||
|
module list
|
||||||
|
|
||||||
|
set -o errexit # exit on errors
|
||||||
|
set -o nounset # Treat unset variables as errors (added for more easily discovering issues in your batch script)
|
||||||
|
|
||||||
|
## Copy input files to the work directory:
|
||||||
|
mkdir $SCRATCH/ShallowWaterGPU
|
||||||
|
cp -r . $SCRATCH/ShallowWaterGPU
|
||||||
|
|
||||||
|
## Make sure the results are copied back to the submit directory (see Work Directory below):
|
||||||
|
# chkfile MyResultFileq
|
||||||
|
# chkfile is replaced by 'savefile' on Saga
|
||||||
|
savefile "$SCRATCH/ShallowWaterGPU/*.log"
|
||||||
|
savefile "$SCRATCH/ShallowWaterGPU/*.nc"
|
||||||
|
savefile "$SCRATCH/ShallowWaterGPU/*.json"
|
||||||
|
|
||||||
|
## Do some work:
|
||||||
|
cd $SCRATCH/ShallowWaterGPU
|
||||||
|
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
|
||||||
|
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 1024 -ny 1024 --profile
|
||||||
|
|
@ -1,24 +1,25 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# Job name:
|
# Job name:
|
||||||
#SBATCH --job-name=saga-test
|
#SBATCH --job-name=ShallowWaterGPUStrongScaling
|
||||||
#
|
#
|
||||||
# Project:
|
# Project:
|
||||||
#SBATCH --account=nn9550k
|
#SBATCH --account=nn9882k
|
||||||
#
|
#
|
||||||
# Wall clock limit:
|
# Wall clock limit:
|
||||||
#SBATCH --time=00:10:00
|
#SBATCH --time=24:00:00
|
||||||
|
#
|
||||||
|
# NOTE: See https://documentation.sigma2.no/jobs/projects_accounting.html when adjusting the values below
|
||||||
#
|
#
|
||||||
# Ask for 1 GPU (max is 2)
|
|
||||||
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
|
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
|
||||||
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
|
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
|
||||||
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
|
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
|
||||||
#SBATCH --partition=accel --gres=gpu:1
|
#SBATCH --partition=accel
|
||||||
#
|
#
|
||||||
# Max memory usage per task (core) - increasing this will cost more core hours:
|
# Max memory usage per task (core) - increasing this will cost more core hours:
|
||||||
#SBATCH --mem-per-cpu=4G
|
#SBATCH --mem-per-cpu=3800M
|
||||||
#
|
#
|
||||||
# Number of tasks:
|
# Number of tasks:
|
||||||
#SBATCH --nodes=2 --ntasks-per-node=1
|
#SBATCH --nodes=1 --gpus-per-node=1 --ntasks-per-node=1
|
||||||
|
|
||||||
## Set up job environment: (this is done automatically behind the scenes)
|
## Set up job environment: (this is done automatically behind the scenes)
|
||||||
## (make sure to comment '#' or remove the following line 'source ...')
|
## (make sure to comment '#' or remove the following line 'source ...')
|
||||||
@ -42,9 +43,10 @@ cp -r . $SCRATCH/ShallowWaterGPU
|
|||||||
# chkfile is replaced by 'savefile' on Saga
|
# chkfile is replaced by 'savefile' on Saga
|
||||||
savefile "$SCRATCH/ShallowWaterGPU/*.log"
|
savefile "$SCRATCH/ShallowWaterGPU/*.log"
|
||||||
savefile "$SCRATCH/ShallowWaterGPU/*.nc"
|
savefile "$SCRATCH/ShallowWaterGPU/*.nc"
|
||||||
|
savefile "$SCRATCH/ShallowWaterGPU/*.json"
|
||||||
|
|
||||||
## Do some work:
|
## Do some work:
|
||||||
cd $SCRATCH/ShallowWaterGPU
|
cd $SCRATCH/ShallowWaterGPU
|
||||||
srun /cluster/home/$HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
|
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
|
||||||
srun /cluster/home/$HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py
|
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 8192 -ny 8192 --profile
|
||||||
|
|
||||||
|
65
saga_scaling_benchmark.job
Normal file
65
saga_scaling_benchmark.job
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Job name:
|
||||||
|
#SBATCH --job-name=ShallowWaterGPUScaling
|
||||||
|
#
|
||||||
|
# Project:
|
||||||
|
#SBATCH --account=nn9882k
|
||||||
|
#
|
||||||
|
# Wall clock limit:
|
||||||
|
#SBATCH --time=00:10:00
|
||||||
|
#
|
||||||
|
# NOTE: See https://documentation.sigma2.no/jobs/projects_accounting.html when adjusting the values below
|
||||||
|
#
|
||||||
|
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
|
||||||
|
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
|
||||||
|
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
|
||||||
|
#SBATCH --partition=accel
|
||||||
|
#
|
||||||
|
# Max memory usage per task (core) - increasing this will cost more core hours:
|
||||||
|
##SBATCH --mem-per-cpu=3800M
|
||||||
|
#SBATCH --mem-per-cpu=24G
|
||||||
|
#
|
||||||
|
#SBATCH --qos=devel
|
||||||
|
|
||||||
|
## Set up job environment: (this is done automatically behind the scenes)
|
||||||
|
## (make sure to comment '#' or remove the following line 'source ...')
|
||||||
|
# source /cluster/bin/jobsetup
|
||||||
|
|
||||||
|
module restore system # instead of 'module purge' rather set module environment to the system default
|
||||||
|
module load CUDA/11.4.1
|
||||||
|
#module load CUDA/11.1.1-GCC-10.2.0
|
||||||
|
#module load OpenMPI/4.0.5-gcccuda-2020b
|
||||||
|
|
||||||
|
# It is also recommended to to list loaded modules, for easier debugging:
|
||||||
|
module list
|
||||||
|
|
||||||
|
set -o errexit # exit on errors
|
||||||
|
set -o nounset # Treat unset variables as errors (added for more easily discovering issues in your batch script)
|
||||||
|
|
||||||
|
## Copy input files to the work directory:
|
||||||
|
mkdir $SCRATCH/ShallowWaterGPU
|
||||||
|
cp -r . $SCRATCH/ShallowWaterGPU
|
||||||
|
|
||||||
|
## Make sure the results are copied back to the submit directory (see Work Directory below):
|
||||||
|
# chkfile MyResultFile
|
||||||
|
# chkfile is replaced by 'savefile' on Saga
|
||||||
|
#savefile "$SCRATCH/ShallowWaterGPU/*.log"
|
||||||
|
#savefile "$SCRATCH/ShallowWaterGPU/*.nc"
|
||||||
|
#savefile "$SCRATCH/ShallowWaterGPU/*.json"
|
||||||
|
#savefile "$SCRATCH/ShallowWaterGPU/*.qdrep"
|
||||||
|
|
||||||
|
cleanup "rm -rf $SCRATCH/ShallowWaterGPU"
|
||||||
|
|
||||||
|
export OMPI_MCA_opal_cuda_support=true
|
||||||
|
|
||||||
|
## Do some work:
|
||||||
|
cd $SCRATCH/ShallowWaterGPU
|
||||||
|
srun /cluster/projects/nn9882k/martinls/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
|
||||||
|
srun /cluster/projects/nn9882k/martinls/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
|
||||||
|
|
||||||
|
cd $HOME/src/ShallowWaterGPU
|
||||||
|
mkdir -p output_saga/$NOW/$SLURM_JOB_ID
|
||||||
|
mv $SCRATCH/ShallowWaterGPU/*.log ./output_saga/$NOW/$SLURM_JOB_ID
|
||||||
|
mv $SCRATCH/ShallowWaterGPU/*.nc ./output_saga/$NOW/$SLURM_JOB_ID
|
||||||
|
mv $SCRATCH/ShallowWaterGPU/*.json ./output_saga/$NOW
|
||||||
|
mv $SCRATCH/ShallowWaterGPU/*.qdrep ./output_saga/$NOW
|
30
saga_strong_scaling_benchmark.sh
Normal file
30
saga_strong_scaling_benchmark.sh
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")
|
||||||
|
|
||||||
|
# one node: 1–4 GPUs
|
||||||
|
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 1 ranks
|
||||||
|
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=10240,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
|
||||||
|
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=6826,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
|
||||||
|
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=5120,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
|
||||||
|
|
||||||
|
# 4 nodes: 1–4 GPUs per node
|
||||||
|
sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=5120,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
|
||||||
|
sbatch --nodes=4 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=2560,NOW=$TIMESTAMP saga_scaling_benchmark.job # 8 ranks
|
||||||
|
sbatch --nodes=4 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=1706,NOW=$TIMESTAMP saga_scaling_benchmark.job # 12 ranks
|
||||||
|
sbatch --nodes=4 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=1280,NOW=$TIMESTAMP saga_scaling_benchmark.job # 16 ranks
|
||||||
|
|
||||||
|
# 4 nodes: 1–4 GPUs per node
|
||||||
|
sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=40960,NY=10240,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
|
||||||
|
sbatch --nodes=4 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=40960,NY=5120,NOW=$TIMESTAMP saga_scaling_benchmark.job # 8 ranks
|
||||||
|
sbatch --nodes=4 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=40960,NY=3413,NOW=$TIMESTAMP saga_scaling_benchmark.job # 12 ranks
|
||||||
|
sbatch --nodes=4 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=40960,NY=2560,NOW=$TIMESTAMP saga_scaling_benchmark.job # 16 ranks
|
||||||
|
|
||||||
|
## one node: 1–4 GPUs
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=24576,NY=6144,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
|
||||||
|
#
|
||||||
|
## 4 nodes: 1–4 GPUs per node
|
||||||
|
#sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=24576,NY=6144,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
|
||||||
|
#sbatch --nodes=4 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=24576,NY=3072,NOW=$TIMESTAMP saga_scaling_benchmark.job # 8 ranks
|
||||||
|
#sbatch --nodes=4 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=24576,NY=2048,NOW=$TIMESTAMP saga_scaling_benchmark.job # 12 ranks
|
||||||
|
#sbatch --nodes=4 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=24576,NY=1536,NOW=$TIMESTAMP saga_scaling_benchmark.job # 16 ranks
|
25
saga_weak_scaling_benchmark.sh
Normal file
25
saga_weak_scaling_benchmark.sh
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")
|
||||||
|
|
||||||
|
# one node: 1-4 GPUs
|
||||||
|
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 1 ranks
|
||||||
|
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
|
||||||
|
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
|
||||||
|
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
|
||||||
|
|
||||||
|
# 2-4 nodes: 1 GPUs per node
|
||||||
|
sbatch --nodes=2 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
|
||||||
|
sbatch --nodes=3 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
|
||||||
|
sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
|
||||||
|
|
||||||
|
## one node: 1-4 GPUs
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 1 ranks
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
|
||||||
|
|
||||||
|
## 2-4 nodes: 1 GPUs per node
|
||||||
|
#sbatch --nodes=2 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
|
||||||
|
#sbatch --nodes=3 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
|
||||||
|
#sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
|
39
seymour_strong_scaling_benchmark.sh
Normal file
39
seymour_strong_scaling_benchmark.sh
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
NOW=$(date "+%Y-%m-%dT%H%M%S")
|
||||||
|
mkdir -p output_seymour/$NOW
|
||||||
|
|
||||||
|
# one node: 1-8 GPUs
|
||||||
|
mpiexec -n 1 python mpiTesting.py -nx 8192 -ny 8192 --profile &&
|
||||||
|
mkdir -p output_seymour/$NOW/1_proc &&
|
||||||
|
mv *.log output_seymour/$NOW/1_proc/ && mv *.nc output_seymour/$NOW/1_proc/ &&
|
||||||
|
|
||||||
|
mpiexec -n 2 python mpiTesting.py -nx 8192 -ny 4096 --profile &&
|
||||||
|
mkdir -p output_seymour/$NOW/2_proc &&
|
||||||
|
mv *.log output_seymour/$NOW/2_proc/ && mv *.nc output_seymour/$NOW/2_proc/ &&
|
||||||
|
|
||||||
|
mpiexec -n 3 python mpiTesting.py -nx 8192 -ny 2731 --profile &&
|
||||||
|
mkdir -p output_seymour/$NOW/3_proc &&
|
||||||
|
mv *.log output_seymour/$NOW/3_proc/ && mv *.nc output_seymour/$NOW/3_proc/ &&
|
||||||
|
|
||||||
|
mpiexec -n 4 python mpiTesting.py -nx 8192 -ny 2048 --profile &&
|
||||||
|
mkdir -p output_seymour/$NOW/4_proc &&
|
||||||
|
mv *.log output_seymour/$NOW/4_proc/ && mv *.nc output_seymour/$NOW/4_proc/ &&
|
||||||
|
|
||||||
|
mpiexec -n 5 python mpiTesting.py -nx 8192 -ny 1638 --profile &&
|
||||||
|
mkdir -p output_seymour/$NOW/5_proc &&
|
||||||
|
mv *.log output_seymour/$NOW/5_proc/ && mv *.nc output_seymour/$NOW/5_proc/ &&
|
||||||
|
|
||||||
|
mpiexec -n 6 python mpiTesting.py -nx 8192 -ny 1365 --profile &&
|
||||||
|
mkdir -p output_seymour/$NOW/6_proc &&
|
||||||
|
mv *.log output_seymour/$NOW/6_proc/ && mv *.nc output_seymour/$NOW/6_proc/ &&
|
||||||
|
|
||||||
|
mpiexec -n 7 python mpiTesting.py -nx 8192 -ny 1170 --profile &&
|
||||||
|
mkdir -p output_seymour/$NOW/7_proc &&
|
||||||
|
mv *.log output_seymour/$NOW/7_proc/ && mv *.nc output_seymour/$NOW/7_proc/ &&
|
||||||
|
|
||||||
|
mpiexec -n 8 python mpiTesting.py -nx 8192 -ny 1024 --profile &&
|
||||||
|
mkdir -p output_seymour/$NOW/8_proc &&
|
||||||
|
mv *.log output_seymour/$NOW/8_proc/ && mv *.nc output_seymour/$NOW/8_proc/ &&
|
||||||
|
|
||||||
|
for filename in *.json; do mv "$filename" "output_seymour/$NOW/MPI_${NOW}_${filename#????}"; done;
|
127
shmemTesting.py
Normal file
127
shmemTesting.py
Normal file
@ -0,0 +1,127 @@
|
|||||||
|
# -*- coding: utf-8 -*-

"""
This python module implements SHMEM (shared memory) simulations for benchmarking

Copyright (C) 2020 Norwegian Meteorological Institute

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import sys
import numpy as np
import gc
import time
import json
import logging

#Simulator engine etc
from GPUSimulators import SHMEMSimulatorGroup, Common, CudaContext
from GPUSimulators import EE2D_KP07_dimsplit
from GPUSimulators.helpers import InitialConditions as IC
from GPUSimulators.Simulator import BoundaryCondition as BC


####
#Initialize logging
####
# Numeric logging levels: 20 == INFO (console), 10 == DEBUG (file).
log_level_console = 20
log_level_file = 10
log_filename = 'shmem.log'
logger = logging.getLogger('GPUSimulators')
# Logger level must be the most verbose of the two handlers, or the
# handlers never receive the records they are supposed to emit.
logger.setLevel(min(log_level_console, log_level_file))

ch = logging.StreamHandler()
ch.setLevel(log_level_console)
logger.addHandler(ch)
logger.info("Console logger using level %s", logging.getLevelName(log_level_console))

fh = logging.FileHandler(log_filename)
formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s')
fh.setFormatter(formatter)
fh.setLevel(log_level_file)
logger.addHandler(fh)
logger.info("File logger using level %s to %s", logging.getLevelName(log_level_file), log_filename)


####
# Initialize SHMEM grid etc
####
logger.info("Creating SHMEM grid")

# XXX: need to explicitly set ngpus when testing on single-GPU system
grid = SHMEMSimulatorGroup.SHMEMGrid(ngpus=4)


####
# Set initial conditions
####
logger.info("Generating initial conditions")
nx = 128
ny = 128
gamma = 1.4
#save_times = np.linspace(0, 0.01, 10)
save_times = np.linspace(0, 10, 10)
save_var_names = ['rho', 'rho_u', 'rho_v', 'E']

outfile = "shmem_out.nc"

####
# Run simulation
####
logger.info("Running simulation")

# One local simulator per GPU in the SHMEM grid, each initialized with its
# own subdomain (index=i) of the global Kelvin-Helmholtz initial state and
# bound to that GPU's CUDA context.
sims = []
for i in range(grid.ngpus):
    arguments = IC.genKelvinHelmholtz(nx, ny, gamma, grid=grid, index=i)
    arguments['context'] = grid.cuda_contexts[i]
    arguments['theta'] = 1.2

    sims.append(EE2D_KP07_dimsplit.EE2D_KP07_dimsplit(**arguments))
    #sims[i] = SHMEMSimulator(i, local_sim, grid) # 1st attempt: no wrapper (per sim)

# Re-use the last subdomain's argument dict as the argument container for the
# simulator-group factory below; genSim only consumes 'sims' and 'grid'.
arguments['sims'] = sims
arguments['grid'] = grid

#Helper function to create SHMEM simulator
def genSim(sims, grid, **kwargs):
    # XXX: kwargs not used, since the simulators are already instantiated in the for-loop above
    sim = SHMEMSimulatorGroup.SHMEMSimulatorGroup(sims, grid)
    return sim

outfile = Common.runSimulation(genSim, arguments, outfile, save_times, save_var_names)


####
# Clean shutdown
####
# Drop references so CUDA resources can be released before interpreter exit.
sim = None
local_sim = None
cuda_context = None
arguments = None
logging.shutdown()
gc.collect()


####
# Print completion and exit
####
print("Completed!")
# Use sys.exit() instead of the site-injected exit() builtin, which is not
# guaranteed to exist in non-interactive interpreters (e.g. python -S).
sys.exit(0)
|
126
singleGPUTesting.py
Normal file
126
singleGPUTesting.py
Normal file
@ -0,0 +1,126 @@
|
|||||||
|
# -*- coding: utf-8 -*-

"""
This python module implements simulations for benchmarking

Copyright (C) 2018 SINTEF ICT

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import sys
import numpy as np
import gc
import logging
import os

# CUDA
import pycuda.driver as cuda

# Simulator engine etc
from GPUSimulators import Common, CudaContext
from GPUSimulators import EE2D_KP07_dimsplit
from GPUSimulators.helpers import InitialConditions as IC
from GPUSimulators.Simulator import BoundaryCondition as BC

import argparse
parser = argparse.ArgumentParser(description='Single GPU testing.')
parser.add_argument('-nx', type=int, default=128)
parser.add_argument('-ny', type=int, default=128)


args = parser.parse_args()

####
# Initialize logging
####
# Numeric logging levels: 20 == INFO (console), 10 == DEBUG (file).
log_level_console = 20
log_level_file = 10
log_filename = 'single_gpu.log'
logger = logging.getLogger('GPUSimulators')
# Logger level must be the most verbose of the two handlers, or the
# handlers never receive the records they are supposed to emit.
logger.setLevel(min(log_level_console, log_level_file))

ch = logging.StreamHandler()
ch.setLevel(log_level_console)
logger.addHandler(ch)
logger.info("Console logger using level %s",
            logging.getLevelName(log_level_console))

fh = logging.FileHandler(log_filename)
formatter = logging.Formatter(
    '%(asctime)s:%(name)s:%(levelname)s: %(message)s')
fh.setFormatter(formatter)
fh.setLevel(log_level_file)
logger.addHandler(fh)
logger.info("File logger using level %s to %s",
            logging.getLevelName(log_level_file), log_filename)


####
# Initialize CUDA
####
cuda.init(flags=0)
logger.info("Initializing CUDA")
# Autotuning disabled for benchmarking so kernel launch configs stay fixed.
cuda_context = CudaContext.CudaContext(autotuning=False)


####
# Set initial conditions
####
logger.info("Generating initial conditions")
nx = args.nx
ny = args.ny

gamma = 1.4
save_times = np.linspace(0, 0.5, 10)
outfile = "single_gpu_out.nc"
save_var_names = ['rho', 'rho_u', 'rho_v', 'E']

arguments = IC.genKelvinHelmholtz(nx, ny, gamma)
arguments['context'] = cuda_context
arguments['theta'] = 1.2


####
# Run simulation
####
logger.info("Running simulation")


# Helper function to create the simulator: runSimulation calls this with the
# 'arguments' dict expanded as keyword arguments.
def genSim(**kwargs):
    local_sim = EE2D_KP07_dimsplit.EE2D_KP07_dimsplit(**kwargs)
    return local_sim


outfile = Common.runSimulation(
    genSim, arguments, outfile, save_times, save_var_names)

####
# Clean shutdown
####
# Drop references so CUDA resources can be released before interpreter exit.
local_sim = None
cuda_context = None
arguments = None
logging.shutdown()
gc.collect()


####
# Print completion and exit
####
print("Completed!")
# Use sys.exit() instead of the site-injected exit() builtin, which is not
# guaranteed to exist in non-interactive interpreters (e.g. python -S).
sys.exit(0)
|
Loading…
x
Reference in New Issue
Block a user