Merge pull request #10 from setmar/master

WIP: Simulating the Euler equations on multiple GPUs using Python
André R. Brodtkorb 2023-02-23 08:55:50 +01:00 committed by GitHub
commit 9613269c45
34 changed files with 300874 additions and 257743 deletions

1 .gitignore (vendored, Normal file)

@@ -0,0 +1 @@
.vscode/settings.json

File diff suppressed because one or more lines are too long

673 Figures_Saga.ipynb (Normal file)

File diff suppressed because one or more lines are too long

721 Figures_Simula.ipynb (Normal file)

File diff suppressed because one or more lines are too long


@@ -89,12 +89,22 @@ def toJson(in_dict, compressed=True):
        out_dict[key] = value
    return json.dumps(out_dict)

-def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names=[]):
+def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names=[], dt=None):
    """
    Runs a simulation, and stores output in a netCDF file. Stores the times given in
    save_times, and saves all of the variables in list save_var_names. Elements in
    save_var_names can be set to None if you do not want to save them.
    """
profiling_data_sim_runner = { 'start': {}, 'end': {} }
profiling_data_sim_runner["start"]["t_sim_init"] = 0
profiling_data_sim_runner["end"]["t_sim_init"] = 0
profiling_data_sim_runner["start"]["t_nc_write"] = 0
profiling_data_sim_runner["end"]["t_nc_write"] = 0
profiling_data_sim_runner["start"]["t_full_step"] = 0
profiling_data_sim_runner["end"]["t_full_step"] = 0
profiling_data_sim_runner["start"]["t_sim_init"] = time.time()
    logger = logging.getLogger(__name__)

    assert len(save_times) > 0, "Need to specify which times to save"
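The profiling dictionaries above accumulate raw `time.time()` stamps rather than elapsed intervals: each measured region adds its start timestamp to the `start` counter and its end timestamp to the `end` counter, so the total time spent in a region is the difference of the two sums. A minimal, self-contained sketch of the idiom (the `do_work` stand-in is ours, not from the code):

```python
import time

# Accumulate sums of start/end timestamps over many calls, then difference
# the sums once at the end to get the total elapsed time.
profiling = {'start': {'t_work': 0.0}, 'end': {'t_work': 0.0}}

def do_work():
    time.sleep(0.01)  # stand-in for one simulation step

for _ in range(5):
    profiling['start']['t_work'] += time.time()
    do_work()
    profiling['end']['t_work'] += time.time()

elapsed = profiling['end']['t_work'] - profiling['start']['t_work']
print("t_work: {:.3f} s".format(elapsed))
```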
@@ -111,7 +121,14 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
        outdata.ncfile.git_hash = getGitHash()
        outdata.ncfile.git_status = getGitStatus()
        outdata.ncfile.simulator = str(simulator)
-       outdata.ncfile.sim_args = toJson(simulator_args)

        # do not write fields to attributes (they are too large)
        simulator_args_for_ncfile = simulator_args.copy()
        del simulator_args_for_ncfile["rho"]
        del simulator_args_for_ncfile["rho_u"]
        del simulator_args_for_ncfile["rho_v"]
        del simulator_args_for_ncfile["E"]
+       outdata.ncfile.sim_args = toJson(simulator_args_for_ncfile)

        #Create dimensions
        outdata.ncfile.createDimension('time', len(save_times))
@@ -146,6 +163,8 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
    t_steps[0] = save_times[0]
    t_steps[1:] = save_times[1:] - save_times[0:-1]

    profiling_data_sim_runner["end"]["t_sim_init"] = time.time()

    #Start simulation loop
    progress_printer = ProgressPrinter(save_times[-1], print_every=10)
    for k in range(len(save_times)):
@@ -160,9 +179,15 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
            logger.error("Error after {:d} steps (t={:f}): {:s}".format(sim.simSteps(), sim.simTime(), str(e)))
            return outdata.filename

        profiling_data_sim_runner["start"]["t_full_step"] += time.time()

        #Simulate
        if (t_step > 0.0):
-           sim.simulate(t_step)
+           sim.simulate(t_step, dt)

        profiling_data_sim_runner["end"]["t_full_step"] += time.time()
        profiling_data_sim_runner["start"]["t_nc_write"] += time.time()

        #Download
        save_vars = sim.download(download_vars)
@@ -171,6 +196,8 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
        for i, var_name in enumerate(save_var_names):
            ncvars[var_name][k, :] = save_vars[i]

        profiling_data_sim_runner["end"]["t_nc_write"] += time.time()

        #Write progress to screen
        print_string = progress_printer.getPrintString(t_end)
        if (print_string):
@@ -178,7 +205,7 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
    logger.debug("Simulated to t={:f} in {:d} timesteps (average dt={:f})".format(t_end, sim.simSteps(), sim.simTime() / sim.simSteps()))

-   return outdata.filename
+   return outdata.filename, profiling_data_sim_runner, sim.profiling_data_mpi
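Callers now receive the runner's profiling data and the simulator's MPI profiling data alongside the output filename. A hedged usage sketch (simulator setup elided; `sim.profiling_data_mpi` implies the simulator is the `MPISimulator` wrapper changed further down):

```python
# Sketch (assumed caller, not part of the diff): unpack the new return value
# and report the total time spent in each instrumented phase.
outfile, prof_sim, prof_mpi = runSimulation(
    MPISimulator, simulator_args, "out.nc", save_times, save_var_names)

for phases in (prof_sim, prof_mpi):
    for name in phases["start"]:
        elapsed = phases["end"][name] - phases["start"][name]
        print("{:s}: {:.3f} s".format(name, elapsed))
```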
@@ -526,9 +553,9 @@ class CudaArray2D:
        #self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
        #Allocate host memory
        #The following fails, don't know why (crashes python)
-       #cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)
+       cpu_data = cuda.pagelocked_empty((int(ny), int(nx)), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE)
        #Non-pagelocked: cpu_data = np.empty((ny, nx), dtype=np.float32)
-       cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32)
+       #cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32)

        assert nx == cpu_data.shape[1]
        assert ny == cpu_data.shape[0]
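Page-locked (pinned) host memory is what lets device-to-host copies run truly asynchronously; with ordinary pageable numpy arrays the driver falls back to a blocking staged copy. A minimal sketch of the pattern (array size illustrative):

```python
import numpy as np
import pycuda.autoinit  # creates a CUDA context on the default device
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray

stream = cuda.Stream()
d_data = gpuarray.to_gpu(np.random.rand(512, 512).astype(np.float32))

# Pinned host buffer: required for asynchronous device-to-host transfers.
h_data = cuda.pagelocked_empty((512, 512), dtype=np.float32,
                               mem_flags=cuda.host_alloc_flags.PORTABLE)

cuda.memcpy_dtoh_async(h_data, d_data.gpudata, stream)
# ...other work can be queued here while the copy is in flight...
stream.synchronize()  # h_data is only valid after the stream synchronizes
```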
@@ -739,7 +766,7 @@ class ArakawaA2D:
            assert i < len(self.gpu_variables), "Variable {:d} is out of range".format(i)
            cpu_variables += [self.gpu_variables[i].download(stream, asynch=True)]

-       stream.synchronize()
+       #stream.synchronize()
        return cpu_variables

    def check(self):


@@ -90,7 +90,7 @@ class EE2D_KP07_dimsplit (BaseSimulator):
                },
                jit_compile_args={})
        self.kernel = module.get_function("KP07DimsplitKernel")
-       self.kernel.prepare("iiffffffiiPiPiPiPiPiPiPiPiP")
+       self.kernel.prepare("iiffffffiiPiPiPiPiPiPiPiPiPiiii")

        #Create data by uploading to device
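The string given to `prepare()` is a `struct`-style signature of the kernel's arguments ('i' for int, 'f' for float, 'P' for a pointer), so the four appended 'i's correspond to the new `x0, y0, x1, y1` subdomain arguments added to the CUDA kernel further down. A toy illustration of the mechanism (not the project's kernel):

```python
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule

module = SourceModule("""
__global__ void scale(int n, float a, float* x) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) x[i] *= a;
}
""")
kernel = module.get_function("scale")
kernel.prepare("ifP")  # int, float, pointer: mirrors the argument list above

x = gpuarray.to_gpu(np.ones(256, dtype=np.float32))
stream = cuda.Stream()
kernel.prepared_async_call((1, 1), (256, 1, 1), stream,
                           np.int32(256), np.float32(2.0), x.gpudata)
stream.synchronize()
assert np.allclose(x.get(), 2.0)
```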
@@ -109,11 +109,14 @@ class EE2D_KP07_dimsplit (BaseSimulator):
        self.cfl_data.fill(self.dt, stream=self.stream)

-   def substep(self, dt, step_number):
+   def substep(self, dt, step_number, external=True, internal=True):
-       self.substepDimsplit(0.5*dt, step_number)
+       self.substepDimsplit(0.5*dt, step_number, external, internal)

-   def substepDimsplit(self, dt, substep):
+   def substepDimsplit(self, dt, substep, external, internal):
-       self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
+       if external and internal:
            #print("COMPLETE DOMAIN (dt=" + str(dt) + ")")
            self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
                self.nx, self.ny,
                self.dx, self.dy, dt,
                self.g,
@@ -129,8 +132,142 @@ class EE2D_KP07_dimsplit (BaseSimulator):
                self.u1[1].data.gpudata, self.u1[1].data.strides[0],
                self.u1[2].data.gpudata, self.u1[2].data.strides[0],
                self.u1[3].data.gpudata, self.u1[3].data.strides[0],
-               self.cfl_data.gpudata)
+               self.cfl_data.gpudata,
                0, 0,
                self.nx, self.ny)
            return
if external and not internal:
###################################
# XXX: Corners are treated twice! #
###################################
ns_grid_size = (self.grid_size[0], 1)
# NORTH
# (x0, y0) x (x1, y1)
# (0, ny-y_halo) x (nx, ny)
self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
0, self.ny - int(self.u0[0].y_halo),
self.nx, self.ny)
# SOUTH
# (x0, y0) x (x1, y1)
# (0, 0) x (nx, y_halo)
self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, int(self.u0[0].y_halo))
we_grid_size = (1, self.grid_size[1])
# WEST
# (x0, y0) x (x1, y1)
# (0, 0) x (x_halo, ny)
self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
0, 0,
int(self.u0[0].x_halo), self.ny)
# EAST
# (x0, y0) x (x1, y1)
# (nx-x_halo, 0) x (nx, ny)
self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
self.nx - int(self.u0[0].x_halo), 0,
self.nx, self.ny)
return
if internal and not external:
# INTERNAL DOMAIN
# (x0, y0) x (x1, y1)
# (x_halo, y_halo) x (nx - x_halo, ny - y_halo)
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.internal_stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
int(self.u0[0].x_halo), int(self.u0[0].y_halo),
self.nx - int(self.u0[0].x_halo), self.ny - int(self.u0[0].y_halo))
return
def swapBuffers(self):
        self.u0, self.u1 = self.u1, self.u0
return
    def getOutput(self):
        return self.u0
@@ -138,6 +275,7 @@ class EE2D_KP07_dimsplit (BaseSimulator):
    def check(self):
        self.u0.check()
        self.u1.check()
return
    def computeDt(self):
        max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();

View File

@@ -24,7 +24,10 @@ import logging
from GPUSimulators import Simulator
import numpy as np
from mpi4py import MPI
import time
import pycuda.driver as cuda
#import nvtx
@@ -135,6 +138,10 @@ class MPIGrid(object):
        grid = np.sort(grid)
        grid = grid[::-1]
# XXX: We only use vertical (north-south) partitioning for now
grid[0] = 1
grid[1] = num_nodes
        return grid
@@ -200,6 +207,18 @@ class MPISimulator(Simulator.BaseSimulator):
    Class which handles communication between simulators on different MPI nodes
    """
    def __init__(self, sim, grid):
self.profiling_data_mpi = { 'start': {}, 'end': {} }
self.profiling_data_mpi["start"]["t_mpi_halo_exchange"] = 0
self.profiling_data_mpi["end"]["t_mpi_halo_exchange"] = 0
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] = 0
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] = 0
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] = 0
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] = 0
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] = 0
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] = 0
self.profiling_data_mpi["start"]["t_mpi_step"] = 0
self.profiling_data_mpi["end"]["t_mpi_step"] = 0
self.profiling_data_mpi["n_time_steps"] = 0
        self.logger = logging.getLogger(__name__)
        autotuner = sim.context.autotuner
@@ -232,6 +251,7 @@ class MPISimulator(Simulator.BaseSimulator):
            'west': Simulator.BoundaryCondition.Type.Dirichlet
        })
        gi, gj = grid.getCoordinate()
#print("gi: " + str(gi) + ", gj: " + str(gj))
        if (gi == 0 and boundary_conditions.west != Simulator.BoundaryCondition.Type.Periodic):
            self.west = None
            new_boundary_conditions.west = boundary_conditions.west;
@@ -272,23 +292,50 @@ class MPISimulator(Simulator.BaseSimulator):
        #Note that east and west also transfer ghost cells
        #whilst north/south only transfer internal cells
        #Reuses the width/height defined in the read-extents above
-       self.in_e = np.empty((self.nvars, self.read_e[3], self.read_e[2]), dtype=np.float32)
+       self.in_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32) #np.empty((self.nvars, self.read_e[3], self.read_e[2]), dtype=np.float32)
-       self.in_w = np.empty((self.nvars, self.read_w[3], self.read_w[2]), dtype=np.float32)
+       self.in_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32) #np.empty((self.nvars, self.read_w[3], self.read_w[2]), dtype=np.float32)
-       self.in_n = np.empty((self.nvars, self.read_n[3], self.read_n[2]), dtype=np.float32)
+       self.in_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32) #np.empty((self.nvars, self.read_n[3], self.read_n[2]), dtype=np.float32)
-       self.in_s = np.empty((self.nvars, self.read_s[3], self.read_s[2]), dtype=np.float32)
+       self.in_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32) #np.empty((self.nvars, self.read_s[3], self.read_s[2]), dtype=np.float32)

        #Allocate data for sending
-       self.out_e = np.empty_like(self.in_e)
+       self.out_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32) #np.empty_like(self.in_e)
-       self.out_w = np.empty_like(self.in_w)
+       self.out_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32) #np.empty_like(self.in_w)
-       self.out_n = np.empty_like(self.in_n)
+       self.out_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32) #np.empty_like(self.in_n)
-       self.out_s = np.empty_like(self.in_s)
+       self.out_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32) #np.empty_like(self.in_s)

        self.logger.debug("Simulator rank {:d} initialized on {:s}".format(self.grid.comm.rank, MPI.Get_processor_name()))
self.full_exchange()
sim.context.synchronize()
    def substep(self, dt, step_number):
-       self.exchange()
-       self.sim.substep(dt, step_number)
+       #nvtx.mark("substep start", color="yellow")
self.profiling_data_mpi["start"]["t_mpi_step"] += time.time()
#nvtx.mark("substep external", color="blue")
self.sim.substep(dt, step_number, external=True, internal=False) # only "internal ghost cells"
#nvtx.mark("substep internal", color="red")
self.sim.substep(dt, step_number, internal=True, external=False) # "internal ghost cells" excluded
#nvtx.mark("substep full", color="blue")
#self.sim.substep(dt, step_number, external=True, internal=True)
self.sim.swapBuffers()
self.profiling_data_mpi["end"]["t_mpi_step"] += time.time()
#nvtx.mark("exchange", color="blue")
self.full_exchange()
#nvtx.mark("sync start", color="blue")
self.sim.stream.synchronize()
self.sim.internal_stream.synchronize()
#nvtx.mark("sync end", color="blue")
self.profiling_data_mpi["n_time_steps"] += 1
    def getOutput(self):
        return self.sim.getOutput()
@@ -321,18 +368,14 @@ class MPISimulator(Simulator.BaseSimulator):
        y1 = y0 + height
        return [x0, x1, y0, y1]

-   def exchange(self):
+   def full_exchange(self):
-       ####
-       # FIXME: This function can be optimized using persistent communications.
-       # Also by overlapping some of the communications north/south and east/west of GPU and intra-node
-       # communications
-       ####

        ####
        # First transfer internal cells north-south
        ####

        #Download from the GPU
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] += time.time()
        if self.north is not None:
            for k in range(self.nvars):
                self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_n[k,:,:], asynch=True, extent=self.read_n)
@@ -341,7 +384,11 @@ class MPISimulator(Simulator.BaseSimulator):
                self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_s[k,:,:], asynch=True, extent=self.read_s)
        self.sim.stream.synchronize()
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] += time.time()
        #Send/receive to north/south neighbours
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()
        comm_send = []
        comm_recv = []
        if self.north is not None:
@@ -355,7 +402,11 @@ class MPISimulator(Simulator.BaseSimulator):
        for comm in comm_recv:
            comm.wait()
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()
        #Upload to the GPU
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] += time.time()
        if self.north is not None:
            for k in range(self.nvars):
                self.sim.u0[k].upload(self.sim.stream, self.in_n[k,:,:], extent=self.write_n)
@@ -363,17 +414,23 @@ class MPISimulator(Simulator.BaseSimulator):
            for k in range(self.nvars):
                self.sim.u0[k].upload(self.sim.stream, self.in_s[k,:,:], extent=self.write_s)
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] += time.time()
        #Wait for sending to complete
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()
        for comm in comm_send:
            comm.wait()
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()
        ####
        # Then transfer east-west including ghost cells that have been filled in by north-south transfer above
        ####

        #Download from the GPU
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] += time.time()
        if self.east is not None:
            for k in range(self.nvars):
                self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_e[k,:,:], asynch=True, extent=self.read_e)
@@ -382,7 +439,11 @@ class MPISimulator(Simulator.BaseSimulator):
                self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_w[k,:,:], asynch=True, extent=self.read_w)
        self.sim.stream.synchronize()
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] += time.time()
        #Send/receive to east/west neighbours
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()
        comm_send = []
        comm_recv = []
        if self.east is not None:
@ -392,12 +453,15 @@ class MPISimulator(Simulator.BaseSimulator):
comm_send += [self.grid.comm.Isend(self.out_w, dest=self.west, tag=4*self.nt + 3)] comm_send += [self.grid.comm.Isend(self.out_w, dest=self.west, tag=4*self.nt + 3)]
comm_recv += [self.grid.comm.Irecv(self.in_w, source=self.west, tag=4*self.nt + 2)] comm_recv += [self.grid.comm.Irecv(self.in_w, source=self.west, tag=4*self.nt + 2)]
#Wait for incoming transfers to complete #Wait for incoming transfers to complete
for comm in comm_recv: for comm in comm_recv:
comm.wait() comm.wait()
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()
        #Upload to the GPU
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] += time.time()
        if self.east is not None:
            for k in range(self.nvars):
                self.sim.u0[k].upload(self.sim.stream, self.in_e[k,:,:], extent=self.write_e)
@@ -405,9 +469,12 @@ class MPISimulator(Simulator.BaseSimulator):
            for k in range(self.nvars):
                self.sim.u0[k].upload(self.sim.stream, self.in_w[k,:,:], extent=self.write_w)
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] += time.time()
        #Wait for sending to complete
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()
        for comm in comm_send:
            comm.wait()
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()


@@ -0,0 +1,263 @@
# -*- coding: utf-8 -*-
"""
This python module implements the SHMEM simulator class
Copyright (C) 2020 Norwegian Meteorological Institute
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import logging
from GPUSimulators import Simulator, CudaContext
import numpy as np
import pycuda.driver as cuda
import time
class SHMEMSimulator(Simulator.BaseSimulator):
"""
Class which handles communication and synchronization between simulators in different
contexts (presumably on different GPUs)
"""
def __init__(self, sims, grid):
self.logger = logging.getLogger(__name__)
assert(len(sims) > 1)
self.sims = sims
# XXX: This is not what was intended. Do we need extra wrapper class SHMEMSimulator?
# See also getOutput() and check().
#
# SHMEMSimulatorGroup would then not have any superclass, but manage a collection of
# SHMEMSimulators that have BaseSimulator as a superclass.
#
# This would also eliminate the need for all the array bookkeeping in this class.
autotuner = sims[0].context.autotuner
sims[0].context.autotuner = None
boundary_conditions = sims[0].getBoundaryConditions()
super().__init__(sims[0].context,
sims[0].nx, sims[0].ny,
sims[0].dx, sims[0].dy,
boundary_conditions,
sims[0].cfl_scale,
sims[0].num_substeps,
sims[0].block_size[0], sims[0].block_size[1])
sims[0].context.autotuner = autotuner
self.sims = sims
self.grid = grid
self.east = [None] * len(self.sims)
self.west = [None] * len(self.sims)
self.north = [None] * len(self.sims)
self.south = [None] * len(self.sims)
self.nvars = [None] * len(self.sims)
self.read_e = [None] * len(self.sims)
self.read_w = [None] * len(self.sims)
self.read_n = [None] * len(self.sims)
self.read_s = [None] * len(self.sims)
self.write_e = [None] * len(self.sims)
self.write_w = [None] * len(self.sims)
self.write_n = [None] * len(self.sims)
self.write_s = [None] * len(self.sims)
self.e = [None] * len(self.sims)
self.w = [None] * len(self.sims)
self.n = [None] * len(self.sims)
self.s = [None] * len(self.sims)
for i, sim in enumerate(self.sims):
#Get neighbor subdomain ids
self.east[i] = grid.getEast(i)
self.west[i] = grid.getWest(i)
self.north[i] = grid.getNorth(i)
self.south[i] = grid.getSouth(i)
#Get coordinate of this subdomain
#and handle global boundary conditions
new_boundary_conditions = Simulator.BoundaryCondition({
'north': Simulator.BoundaryCondition.Type.Dirichlet,
'south': Simulator.BoundaryCondition.Type.Dirichlet,
'east': Simulator.BoundaryCondition.Type.Dirichlet,
'west': Simulator.BoundaryCondition.Type.Dirichlet
})
gi, gj = grid.getCoordinate(i)
            if (gi == 0 and boundary_conditions.west != Simulator.BoundaryCondition.Type.Periodic):
                self.west[i] = None
                new_boundary_conditions.west = boundary_conditions.west;
            if (gj == 0 and boundary_conditions.south != Simulator.BoundaryCondition.Type.Periodic):
                self.south[i] = None
                new_boundary_conditions.south = boundary_conditions.south;
            if (gi == grid.grid[0]-1 and boundary_conditions.east != Simulator.BoundaryCondition.Type.Periodic):
                self.east[i] = None
                new_boundary_conditions.east = boundary_conditions.east;
            if (gj == grid.grid[1]-1 and boundary_conditions.north != Simulator.BoundaryCondition.Type.Periodic):
                self.north[i] = None
                new_boundary_conditions.north = boundary_conditions.north;
sim.setBoundaryConditions(new_boundary_conditions)
#Get number of variables
self.nvars[i] = len(sim.getOutput().gpu_variables)
#Shorthands for computing extents and sizes
gc_x = int(sim.getOutput()[0].x_halo)
gc_y = int(sim.getOutput()[0].y_halo)
nx = int(sim.nx)
ny = int(sim.ny)
#Set regions for ghost cells to read from
#These have the format [x0, y0, width, height]
self.read_e[i] = np.array([ nx, 0, gc_x, ny + 2*gc_y])
self.read_w[i] = np.array([gc_x, 0, gc_x, ny + 2*gc_y])
self.read_n[i] = np.array([gc_x, ny, nx, gc_y])
self.read_s[i] = np.array([gc_x, gc_y, nx, gc_y])
#Set regions for ghost cells to write to
self.write_e[i] = self.read_e[i] + np.array([gc_x, 0, 0, 0])
self.write_w[i] = self.read_w[i] - np.array([gc_x, 0, 0, 0])
self.write_n[i] = self.read_n[i] + np.array([0, gc_y, 0, 0])
self.write_s[i] = self.read_s[i] - np.array([0, gc_y, 0, 0])
#Allocate host data
#Note that east and west also transfer ghost cells
#whilst north/south only transfer internal cells
#Reuses the width/height defined in the read-extents above
self.e[i] = np.empty((self.nvars[i], self.read_e[i][3], self.read_e[i][2]), dtype=np.float32)
self.w[i] = np.empty((self.nvars[i], self.read_w[i][3], self.read_w[i][2]), dtype=np.float32)
self.n[i] = np.empty((self.nvars[i], self.read_n[i][3], self.read_n[i][2]), dtype=np.float32)
self.s[i] = np.empty((self.nvars[i], self.read_s[i][3], self.read_s[i][2]), dtype=np.float32)
self.logger.debug("Initialized {:d} subdomains".format(len(self.sims)))
def substep(self, dt, step_number):
self.exchange()
for i, sim in enumerate(self.sims):
sim.substep(dt, step_number)
def getOutput(self):
# XXX: Does not return what we would expect.
# Returns first subdomain, but we want the whole domain.
return self.sims[0].getOutput()
def synchronize(self):
for sim in self.sims:
sim.synchronize()
def check(self):
# XXX: Does not return what we would expect.
# Checks only first subdomain, but we want to check the whole domain.
return self.sims[0].check()
def computeDt(self):
global_dt = float("inf")
for sim in self.sims:
sim.context.synchronize()
for sim in self.sims:
local_dt = sim.computeDt()
if local_dt < global_dt:
global_dt = local_dt
self.logger.debug("Local dt: {:f}".format(local_dt))
self.logger.debug("Global dt: {:f}".format(global_dt))
return global_dt
def getExtent(self, index=0):
"""
Function which returns the extent of the subdomain with index
index in the grid
"""
width = self.sims[index].nx*self.sims[index].dx
height = self.sims[index].ny*self.sims[index].dy
i, j = self.grid.getCoordinate(index)
x0 = i * width
y0 = j * height
x1 = x0 + width
y1 = y0 + height
return [x0, x1, y0, y1]
def exchange(self):
####
# First transfer internal cells north-south
####
for i in range(len(self.sims)):
self.ns_download(i)
for i in range(len(self.sims)):
self.ns_upload(i)
####
# Then transfer east-west including ghost cells that have been filled in by north-south transfer above
####
for i in range(len(self.sims)):
self.ew_download(i)
for i in range(len(self.sims)):
self.ew_upload(i)
def ns_download(self, i):
#Download from the GPU
if self.north[i] is not None:
for k in range(self.nvars[i]):
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the north)
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.n[i][k,:,:], extent=self.read_n[i])
if self.south[i] is not None:
for k in range(self.nvars[i]):
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the south)
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.s[i][k,:,:], extent=self.read_s[i])
self.sims[i].stream.synchronize()
def ns_upload(self, i):
#Upload to the GPU
if self.north[i] is not None:
for k in range(self.nvars[i]):
self.sims[i].u0[k].upload(self.sims[i].stream, self.s[self.north[i]][k,:,:], extent=self.write_n[i])
if self.south[i] is not None:
for k in range(self.nvars[i]):
self.sims[i].u0[k].upload(self.sims[i].stream, self.n[self.south[i]][k,:,:], extent=self.write_s[i])
def ew_download(self, i):
#Download from the GPU
if self.east[i] is not None:
for k in range(self.nvars[i]):
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the east)
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.e[i][k,:,:], extent=self.read_e[i])
if self.west[i] is not None:
for k in range(self.nvars[i]):
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the west)
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.w[i][k,:,:], extent=self.read_w[i])
self.sims[i].stream.synchronize()
def ew_upload(self, i):
#Upload to the GPU
if self.east[i] is not None:
for k in range(self.nvars[i]):
self.sims[i].u0[k].upload(self.sims[i].stream, self.w[self.east[i]][k,:,:], extent=self.write_e[i])
#test_east = np.ones_like(self.e[self.east[i]][k,:,:])
#self.sims[i].u0[k].upload(self.sims[i].stream, test_east, extent=self.write_e[i])
if self.west[i] is not None:
for k in range(self.nvars[i]):
self.sims[i].u0[k].upload(self.sims[i].stream, self.e[self.west[i]][k,:,:], extent=self.write_w[i])
#test_west = np.ones_like(self.e[self.west[i]][k,:,:])
#self.sims[i].u0[k].upload(self.sims[i].stream, test_west, extent=self.write_w[i])


@@ -0,0 +1,394 @@
# -*- coding: utf-8 -*-
"""
This python module implements the SHMEM grid and simulator group classes
Copyright (C) 2020 Norwegian Meteorological Institute
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import logging
from GPUSimulators import Simulator, CudaContext
import numpy as np
import pycuda.driver as cuda
import time
class SHMEMGrid(object):
"""
Class which represents an SHMEM grid of GPUs. Facilitates easy communication between
neighboring subdomains in the grid. Contains one CUDA context per subdomain.
"""
def __init__(self, ngpus=None, ndims=2):
self.logger = logging.getLogger(__name__)
cuda.init(flags=0)
self.logger.info("Initializing CUDA")
num_cuda_devices = cuda.Device.count()
if ngpus is None:
ngpus = num_cuda_devices
# XXX: disabled for testing on single-GPU system
#assert ngpus <= num_cuda_devices, "Trying to allocate more GPUs than are available in the system."
#assert ngpus >= 2, "Must have at least two GPUs available to run multi-GPU simulations."
assert ndims == 2, "Unsupported number of dimensions. Must be two at the moment"
self.ngpus = ngpus
self.ndims = ndims
self.grid = SHMEMGrid.getGrid(self.ngpus, self.ndims)
self.logger.debug("Created {:}-dimensional SHMEM grid, using {:} GPUs".format(
self.ndims, self.ngpus))
# XXX: Is this a natural place to store the contexts? Consider moving contexts out of this
# class, into notebook / calling script (shmemTesting.py)
self.cuda_contexts = []
for i in range(self.ngpus):
# XXX: disabled for testing on single-GPU system
#self.cuda_contexts.append(CudaContext.CudaContext(device=i, autotuning=False))
self.cuda_contexts.append(CudaContext.CudaContext(device=0, autotuning=False))
def getCoordinate(self, index):
i = (index % self.grid[0])
j = (index // self.grid[0])
return i, j
def getIndex(self, i, j):
return j*self.grid[0] + i
def getEast(self, index):
i, j = self.getCoordinate(index)
i = (i+1) % self.grid[0]
return self.getIndex(i, j)
def getWest(self, index):
i, j = self.getCoordinate(index)
i = (i+self.grid[0]-1) % self.grid[0]
return self.getIndex(i, j)
def getNorth(self, index):
i, j = self.getCoordinate(index)
j = (j+1) % self.grid[1]
return self.getIndex(i, j)
def getSouth(self, index):
i, j = self.getCoordinate(index)
j = (j+self.grid[1]-1) % self.grid[1]
return self.getIndex(i, j)
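Subdomain indices map to grid coordinates in row-major order (i varies fastest), and the neighbour helpers wrap periodically. A standalone sketch of the same arithmetic for a 2x2 grid:

```python
# Row-major index <-> coordinate mapping, mirroring getCoordinate/getIndex/
# getEast above (standalone sketch, not the class itself).
grid = [2, 2]  # grid[0] columns, grid[1] rows

def coordinate(index):
    return index % grid[0], index // grid[0]

def east(index):
    i, j = coordinate(index)
    return j * grid[0] + (i + 1) % grid[0]

assert coordinate(0) == (0, 0)
assert coordinate(3) == (1, 1)
assert east(1) == 0   # east of subdomain 1 wraps around to subdomain 0
```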
def getGrid(num_gpus, num_dims):
assert(isinstance(num_gpus, int))
assert(isinstance(num_dims, int))
# Adapted from https://stackoverflow.com/questions/28057307/factoring-a-number-into-roughly-equal-factors
# Original code by https://stackoverflow.com/users/3928385/ishamael
# Factorizes a number into n roughly equal factors
#Dictionary to remember already computed permutations
memo = {}
def dp(n, left): # returns tuple (cost, [factors])
"""
Recursively searches through all factorizations
"""
#Already tried: return existing result
if (n, left) in memo:
return memo[(n, left)]
#Spent all factors: return number itself
if left == 1:
return (n, [n])
#Find new factor
i = 2
best = n
bestTuple = [n]
while i * i < n:
#If factor found
if n % i == 0:
#Factorize remainder
rem = dp(n // i, left - 1)
#If new permutation better, save it
if rem[0] + i < best:
best = rem[0] + i
bestTuple = [i] + rem[1]
i += 1
#Store calculation
memo[(n, left)] = (best, bestTuple)
return memo[(n, left)]
grid = dp(num_gpus, num_dims)[1]
if (len(grid) < num_dims):
#Split problematic 4
if (4 in grid):
grid.remove(4)
grid.append(2)
grid.append(2)
#Pad with ones to guarantee num_dims
grid = grid + [1]*(num_dims - len(grid))
#Sort in descending order
grid = np.sort(grid)
grid = grid[::-1]
return grid
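`getGrid` thus factorizes the GPU count into `num_dims` roughly equal factors (with a special case that splits a leftover factor of 4) and returns them in descending order. Expected results under the logic above, worked out by hand from the `dp()` recursion:

```python
print(SHMEMGrid.getGrid(8, 2))   # [4 2] -> 4x2 grid of subdomains
print(SHMEMGrid.getGrid(6, 2))   # [3 2]
print(SHMEMGrid.getGrid(4, 2))   # [2 2], via the "split problematic 4" branch
print(SHMEMGrid.getGrid(7, 2))   # [7 1] -> primes degenerate to a 1D strip
```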
class SHMEMSimulatorGroup(object):
"""
Class which handles communication and synchronization between simulators in different
contexts (typically on different GPUs)
"""
def __init__(self, sims, grid):
self.logger = logging.getLogger(__name__)
assert(len(sims) > 1)
self.sims = sims
# XXX: This is not what was intended. Do we need extra wrapper class SHMEMSimulator?
# See also getOutput() and check().
#
# SHMEMSimulatorGroup would then not have any superclass, but manage a collection of
# SHMEMSimulators that have BaseSimulator as a superclass.
#
# This would also eliminate the need for all the array bookkeeping in this class.
#
    # CONT HERE! Model shmemTesting after mpiTesting and divide existing functionality between SHMEMSimulatorGroup and SHMEMSimulator
autotuner = sims[0].context.autotuner
sims[0].context.autotuner = None
boundary_conditions = sims[0].getBoundaryConditions()
super().__init__(sims[0].context,
sims[0].nx, sims[0].ny,
sims[0].dx, sims[0].dy,
boundary_conditions,
sims[0].cfl_scale,
sims[0].num_substeps,
sims[0].block_size[0], sims[0].block_size[1])
sims[0].context.autotuner = autotuner
self.sims = sims
self.grid = grid
self.east = [None] * len(self.sims)
self.west = [None] * len(self.sims)
self.north = [None] * len(self.sims)
self.south = [None] * len(self.sims)
self.nvars = [None] * len(self.sims)
self.read_e = [None] * len(self.sims)
self.read_w = [None] * len(self.sims)
self.read_n = [None] * len(self.sims)
self.read_s = [None] * len(self.sims)
self.write_e = [None] * len(self.sims)
self.write_w = [None] * len(self.sims)
self.write_n = [None] * len(self.sims)
self.write_s = [None] * len(self.sims)
self.e = [None] * len(self.sims)
self.w = [None] * len(self.sims)
self.n = [None] * len(self.sims)
self.s = [None] * len(self.sims)
for i, sim in enumerate(self.sims):
#Get neighbor subdomain ids
self.east[i] = grid.getEast(i)
self.west[i] = grid.getWest(i)
self.north[i] = grid.getNorth(i)
self.south[i] = grid.getSouth(i)
#Get coordinate of this subdomain
#and handle global boundary conditions
new_boundary_conditions = Simulator.BoundaryCondition({
'north': Simulator.BoundaryCondition.Type.Dirichlet,
'south': Simulator.BoundaryCondition.Type.Dirichlet,
'east': Simulator.BoundaryCondition.Type.Dirichlet,
'west': Simulator.BoundaryCondition.Type.Dirichlet
})
gi, gj = grid.getCoordinate(i)
            if (gi == 0 and boundary_conditions.west != Simulator.BoundaryCondition.Type.Periodic):
                self.west[i] = None
                new_boundary_conditions.west = boundary_conditions.west;
            if (gj == 0 and boundary_conditions.south != Simulator.BoundaryCondition.Type.Periodic):
                self.south[i] = None
                new_boundary_conditions.south = boundary_conditions.south;
            if (gi == grid.grid[0]-1 and boundary_conditions.east != Simulator.BoundaryCondition.Type.Periodic):
                self.east[i] = None
                new_boundary_conditions.east = boundary_conditions.east;
            if (gj == grid.grid[1]-1 and boundary_conditions.north != Simulator.BoundaryCondition.Type.Periodic):
                self.north[i] = None
                new_boundary_conditions.north = boundary_conditions.north;
sim.setBoundaryConditions(new_boundary_conditions)
#Get number of variables
self.nvars[i] = len(sim.getOutput().gpu_variables)
#Shorthands for computing extents and sizes
gc_x = int(sim.getOutput()[0].x_halo)
gc_y = int(sim.getOutput()[0].y_halo)
nx = int(sim.nx)
ny = int(sim.ny)
#Set regions for ghost cells to read from
#These have the format [x0, y0, width, height]
self.read_e[i] = np.array([ nx, 0, gc_x, ny + 2*gc_y])
self.read_w[i] = np.array([gc_x, 0, gc_x, ny + 2*gc_y])
self.read_n[i] = np.array([gc_x, ny, nx, gc_y])
self.read_s[i] = np.array([gc_x, gc_y, nx, gc_y])
#Set regions for ghost cells to write to
self.write_e[i] = self.read_e[i] + np.array([gc_x, 0, 0, 0])
self.write_w[i] = self.read_w[i] - np.array([gc_x, 0, 0, 0])
self.write_n[i] = self.read_n[i] + np.array([0, gc_y, 0, 0])
self.write_s[i] = self.read_s[i] - np.array([0, gc_y, 0, 0])
#Allocate host data
#Note that east and west also transfer ghost cells
#whilst north/south only transfer internal cells
#Reuses the width/height defined in the read-extents above
self.e[i] = np.empty((self.nvars[i], self.read_e[i][3], self.read_e[i][2]), dtype=np.float32)
self.w[i] = np.empty((self.nvars[i], self.read_w[i][3], self.read_w[i][2]), dtype=np.float32)
self.n[i] = np.empty((self.nvars[i], self.read_n[i][3], self.read_n[i][2]), dtype=np.float32)
self.s[i] = np.empty((self.nvars[i], self.read_s[i][3], self.read_s[i][2]), dtype=np.float32)
self.logger.debug("Initialized {:d} subdomains".format(len(self.sims)))
def substep(self, dt, step_number):
self.exchange()
for i, sim in enumerate(self.sims):
sim.substep(dt, step_number)
def getOutput(self):
# XXX: Does not return what we would expect.
# Returns first subdomain, but we want the whole domain.
return self.sims[0].getOutput()
def synchronize(self):
for sim in self.sims:
sim.synchronize()
def check(self):
# XXX: Does not return what we would expect.
# Checks only first subdomain, but we want to check the whole domain.
return self.sims[0].check()
def computeDt(self):
global_dt = float("inf")
for sim in self.sims:
sim.context.synchronize()
for sim in self.sims:
local_dt = sim.computeDt()
if local_dt < global_dt:
global_dt = local_dt
self.logger.debug("Local dt: {:f}".format(local_dt))
self.logger.debug("Global dt: {:f}".format(global_dt))
return global_dt
def getExtent(self, index=0):
"""
Function which returns the extent of the subdomain with index
index in the grid
"""
width = self.sims[index].nx*self.sims[index].dx
height = self.sims[index].ny*self.sims[index].dy
i, j = self.grid.getCoordinate(index)
x0 = i * width
y0 = j * height
x1 = x0 + width
y1 = y0 + height
return [x0, x1, y0, y1]
def exchange(self):
####
# First transfer internal cells north-south
####
for i in range(len(self.sims)):
self.ns_download(i)
for i in range(len(self.sims)):
self.ns_upload(i)
####
# Then transfer east-west including ghost cells that have been filled in by north-south transfer above
####
for i in range(len(self.sims)):
self.ew_download(i)
for i in range(len(self.sims)):
self.ew_upload(i)
def ns_download(self, i):
#Download from the GPU
if self.north[i] is not None:
for k in range(self.nvars[i]):
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the north)
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.n[i][k,:,:], extent=self.read_n[i])
if self.south[i] is not None:
for k in range(self.nvars[i]):
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the south)
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.s[i][k,:,:], extent=self.read_s[i])
self.sims[i].stream.synchronize()
def ns_upload(self, i):
#Upload to the GPU
if self.north[i] is not None:
for k in range(self.nvars[i]):
self.sims[i].u0[k].upload(self.sims[i].stream, self.s[self.north[i]][k,:,:], extent=self.write_n[i])
if self.south[i] is not None:
for k in range(self.nvars[i]):
self.sims[i].u0[k].upload(self.sims[i].stream, self.n[self.south[i]][k,:,:], extent=self.write_s[i])
def ew_download(self, i):
#Download from the GPU
if self.east[i] is not None:
for k in range(self.nvars[i]):
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the east)
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.e[i][k,:,:], extent=self.read_e[i])
if self.west[i] is not None:
for k in range(self.nvars[i]):
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the west)
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.w[i][k,:,:], extent=self.read_w[i])
self.sims[i].stream.synchronize()
def ew_upload(self, i):
#Upload to the GPU
if self.east[i] is not None:
for k in range(self.nvars[i]):
self.sims[i].u0[k].upload(self.sims[i].stream, self.w[self.east[i]][k,:,:], extent=self.write_e[i])
#test_east = np.ones_like(self.e[self.east[i]][k,:,:])
#self.sims[i].u0[k].upload(self.sims[i].stream, test_east, extent=self.write_e[i])
if self.west[i] is not None:
for k in range(self.nvars[i]):
self.sims[i].u0[k].upload(self.sims[i].stream, self.e[self.west[i]][k,:,:], extent=self.write_w[i])
#test_west = np.ones_like(self.e[self.west[i]][k,:,:])
#self.sims[i].u0[k].upload(self.sims[i].stream, test_west, extent=self.write_w[i])


@@ -162,6 +162,7 @@ class BaseSimulator(object):
        #Create a CUDA stream
        self.stream = cuda.Stream()
        self.internal_stream = cuda.Stream()

        #Keep track of simulation time and number of timesteps
        self.t = 0.0


@@ -147,7 +147,18 @@ __global__ void KP07DimsplitKernel(
        float* E1_ptr_, int E1_pitch_,

        //Output CFL
-       float* cfl_) {
+       float* cfl_,
//Subarea of internal domain to compute
int x0=0, int y0=0,
int x1=0, int y1=0) {
if(x1 == 0)
x1 = nx_;
if(y1 == 0)
y1 = ny_;
    const unsigned int w = BLOCK_WIDTH;
    const unsigned int h = BLOCK_HEIGHT;
    const unsigned int gc_x = 2;
@@ -160,10 +171,10 @@ __global__ void KP07DimsplitKernel(
    __shared__ float F[4][h+2*gc_y][w+2*gc_x];

    //Read into shared memory
-   readBlock<w, h, gc_x, gc_y,  1,  1>( rho0_ptr_,   rho0_pitch_, Q[0], nx_, ny_, boundary_conditions_);
-   readBlock<w, h, gc_x, gc_y, -1,  1>(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_, ny_, boundary_conditions_);
-   readBlock<w, h, gc_x, gc_y,  1, -1>(rho_v0_ptr_, rho_v0_pitch_, Q[2], nx_, ny_, boundary_conditions_);
-   readBlock<w, h, gc_x, gc_y,  1,  1>( E0_ptr_,     E0_pitch_,   Q[3], nx_, ny_, boundary_conditions_);
+   readBlock<w, h, gc_x, gc_y,  1,  1>( rho0_ptr_,   rho0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
+   readBlock<w, h, gc_x, gc_y, -1,  1>(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
+   readBlock<w, h, gc_x, gc_y,  1, -1>(rho_v0_ptr_, rho_v0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
+   readBlock<w, h, gc_x, gc_y,  1,  1>( E0_ptr_,     E0_pitch_,   Q[3], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);

    //Step 0 => evolve x first, then y
    if (step_ == 0) {
@@ -224,10 +235,10 @@ __global__ void KP07DimsplitKernel(
    // Write to main memory for all internal cells
-   writeBlock<w, h, gc_x, gc_y>( rho1_ptr_,   rho1_pitch_, Q[0], nx_, ny_, 0, 1);
-   writeBlock<w, h, gc_x, gc_y>(rho_u1_ptr_, rho_u1_pitch_, Q[1], nx_, ny_, 0, 1);
-   writeBlock<w, h, gc_x, gc_y>(rho_v1_ptr_, rho_v1_pitch_, Q[2], nx_, ny_, 0, 1);
-   writeBlock<w, h, gc_x, gc_y>( E1_ptr_,     E1_pitch_,   Q[3], nx_, ny_, 0, 1);
+   writeBlock<w, h, gc_x, gc_y>( rho1_ptr_,   rho1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1);
+   writeBlock<w, h, gc_x, gc_y>(rho_u1_ptr_, rho_u1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1);
+   writeBlock<w, h, gc_x, gc_y>(rho_v1_ptr_, rho_v1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1);
+   writeBlock<w, h, gc_x, gc_y>( E1_ptr_,     E1_pitch_,   Q[3], nx_, ny_, 0, 1, x0, y0, x1, y1);

    //Compute the CFL for this block
    if (cfl_ != NULL) {


@@ -321,7 +321,9 @@ template<int w, int h, int gc_x, int gc_y, int sign_x, int sign_y>
inline __device__ void readBlock(float* ptr_, int pitch_,
                float Q[h+2*gc_y][w+2*gc_x],
                const int nx_, const int ny_,
-               const int boundary_conditions_) {
+               const int boundary_conditions_,
                int x0, int y0,
                int x1, int y1) {

    //Index of block within domain
    const int bx = blockDim.x * blockIdx.x;
    const int by = blockDim.y * blockIdx.y;
@@ -330,14 +332,14 @@ inline __device__ void readBlock(float* ptr_, int pitch_,
    //Loop over all variables
    for (int j=threadIdx.y; j<h+2*gc_y; j+=h) {
        //Handle periodic boundary conditions here
-       int l = handlePeriodicBoundaryY<gc_y>(by + j, ny_, boundary_conditions_);
+       int l = handlePeriodicBoundaryY<gc_y>(by + j + y0, ny_, boundary_conditions_);
-       l = min(l, ny_+2*gc_y-1);
+       l = min(l, min(ny_+2*gc_y-1, y1+2*gc_y-1));
        float* row = (float*) ((char*) ptr_ + pitch_*l);

        for (int i=threadIdx.x; i<w+2*gc_x; i+=w) {
            //Handle periodic boundary conditions here
-           int k = handlePeriodicBoundaryX<gc_x>(bx + i, nx_, boundary_conditions_);
+           int k = handlePeriodicBoundaryX<gc_x>(bx + i + x0, nx_, boundary_conditions_);
-           k = min(k, nx_+2*gc_x-1);
+           k = min(k, min(nx_+2*gc_x-1, x1+2*gc_x-1));

            //Read from global memory
            Q[j][i] = row[k];
@@ -358,14 +360,20 @@ template<int w, int h, int gc_x, int gc_y>
inline __device__ void writeBlock(float* ptr_, int pitch_,
                float shmem[h+2*gc_y][w+2*gc_x],
                const int nx_, const int ny_,
-               int rk_step_, int rk_order_) {
+               int rk_step_, int rk_order_,
                int x0, int y0,
                int x1, int y1) {

    //Index of cell within domain
-   const int ti = blockDim.x*blockIdx.x + threadIdx.x + gc_x;
+   const int ti = blockDim.x*blockIdx.x + threadIdx.x + gc_x + x0;
-   const int tj = blockDim.y*blockIdx.y + threadIdx.y + gc_y;
+   const int tj = blockDim.y*blockIdx.y + threadIdx.y + gc_y + y0;

    //In case we are writing only to a subarea given by (x0, y0) x (x1, y1)
    const int max_ti = min(nx_+gc_x, x1+gc_x);
    const int max_tj = min(ny_+gc_y, y1+gc_y);

    //Only write internal cells
-   if (ti < nx_+gc_x && tj < ny_+gc_y) {
+   if ((x0+gc_x <= ti) && (ti < max_ti) && (y0+gc_y <= tj) && (tj < max_tj)) {
        //Index of thread within block
        const int tx = threadIdx.x + gc_x;
        const int ty = threadIdx.y + gc_y;
@@ -416,6 +424,9 @@ inline __device__ void writeBlock(float* ptr_, int pitch_,
                row[ti] = t*row[ti] + (1.0f-t)*shmem[ty][tx];
            }
        }

        // DEBUG
        //row[ti] = 99.0;
    }
}
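In effect, each launch now writes only the cells in the half-open box given by `(x0, y0)` and `(x1, y1)`, with `x1 == 0`/`y1 == 0` meaning "the full domain" (the default handled at the top of the kernel). A small host-side mirror of that clamped predicate, useful for reasoning about which cells the boundary and interior launches touch (function name is ours):

```python
def write_region(nx, ny, gc_x, gc_y, x0=0, y0=0, x1=0, y1=0):
    """Host-side mirror of the kernel's clamped write test (names are ours).

    Returns the half-open ranges of global indices (including the gc_x/gc_y
    halo offset) written by a launch over the subarea (x0, y0) x (x1, y1).
    """
    x1 = x1 or nx  # the kernel treats 0 as "whole domain"
    y1 = y1 or ny
    ti_range = (x0 + gc_x, min(nx + gc_x, x1 + gc_x))
    tj_range = (y0 + gc_y, min(ny + gc_y, y1 + gc_y))
    return ti_range, tj_range

# Interior launch for a 16x16 domain with 2-cell halos:
print(write_region(16, 16, 2, 2, x0=2, y0=2, x1=14, y1=14))  # ((4, 16), (4, 16))
```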


@@ -25,11 +25,14 @@ import numpy as np
import gc

-def getExtent(width, height, nx, ny, grid):
+def getExtent(width, height, nx, ny, grid, index=None):
    if grid is not None:
        gx = grid.grid[0]
        gy = grid.grid[1]
-       i, j = grid.getCoordinate()
+       if index is not None:
            i, j = grid.getCoordinate(index)
        else:
            i, j = grid.getCoordinate()

        dx = (width / gx) / nx
        dy = (height / gy) / ny
@@ -192,7 +195,7 @@ def genShockBubble(nx, ny, gamma, grid=None):

-def genKelvinHelmholtz(nx, ny, gamma, roughness=0.125, grid=None):
+def genKelvinHelmholtz(nx, ny, gamma, roughness=0.125, grid=None, index=None):
    """
    Roughness parameter in (0, 1.0] determines how "squiggly"
    the interface between the zones is
@@ -234,7 +237,7 @@ def genKelvinHelmholtz(nx, ny, gamma, roughness=0.125, grid=None):
-   x0, x1, y0, y1, _, dy = getExtent(1.0, 1.0, nx, ny, grid)
+   x0, x1, y0, y1, _, dy = getExtent(1.0, 1.0, nx, ny, grid, index)
    x = np.linspace(x0, x1, nx)
    y = np.linspace(y0, y1, ny)
    _, y = np.meshgrid(x, y)
@@ -274,7 +277,7 @@ def genKelvinHelmholtz(nx, ny, gamma, roughness=0.125, grid=None):
    E = 0.5*rho*(u**2+v**2) + p/(gamma-1.0)

-   _, _, _, _, dx, dy = getExtent(width, height, nx, ny, grid)
+   _, _, _, _, dx, dy = getExtent(width, height, nx, ny, grid, index)

    bc = BoundaryCondition({
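The new `index` argument lets a single process generate initial conditions for any subdomain of a grid, which is what the SHMEM path needs, since one process owns all subdomains. A hedged driver sketch (class and helper names are from this diff, but the returned-arguments convention and the wiring are assumptions):

```python
# Sketch (assumed driver, not from the diff): generate per-subdomain initial
# conditions by passing each subdomain's index explicitly, assuming the gen*
# helpers return the simulator's keyword-argument dict.
nx, ny, gamma = 128, 128, 1.4

grid = SHMEMGrid(ngpus=4)  # SHMEMGrid from the new module above
sims = []
for i in range(grid.ngpus):
    arguments = genKelvinHelmholtz(nx, ny, gamma, grid=grid, index=i)
    arguments['context'] = grid.cuda_contexts[i]  # one CUDA context per subdomain
    sims.append(EE2D_KP07_dimsplit(**arguments))
```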


@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
"""
-This python module implements Cuda context handling
+This python module implements visualization techniques/modes

Copyright (C) 2018 SINTEF ICT

18 README.md (Normal file)

@@ -0,0 +1,18 @@
# ShallowWaterGPU

## Setup
A good place to start exploring this codebase is the notebooks. Complete the following steps to run the notebooks:

1. Install conda (see e.g. Miniconda or Anaconda)
2. Change directory to the repository root and run the following commands
3. `conda env create -f conda_environment.yml`
4. `conda activate ShallowWaterGPU`
5. `jupyter notebook`

Make sure you are running the correct kernel ("conda:ShallowWaterGPU"). If not, change the kernel using the "Kernel" menu in the notebook.

If you do not need to run notebooks, you may use the conda environment found in conda_environment_hpc.yml instead.

## Troubleshooting
Have a look at the conda documentation and https://towardsdatascience.com/how-to-set-up-anaconda-and-jupyter-notebook-the-right-way-de3b7623ea4a

62 SYSTEMS.md (Normal file)

@@ -0,0 +1,62 @@
$ANSIBLE_VAULT;1.1;AES256
[61 lines of AES256-encrypted Ansible Vault payload]

33
dgx-2-shmem-test.job Normal file
View File

@ -0,0 +1,33 @@
#!/bin/bash
#SBATCH -p dgx2q # partition (GPU queue)
#SBATCH -N 1 # number of nodes
#SBATCH -n 1 # number of cores
#SBATCH -w g001 # DGX-2 node
#SBATCH --gres=gpu:1 # number of V100s
#SBATCH -t 0-00:10 # time (D-HH:MM)
#SBATCH -o slurm.%N.%j.out # STDOUT
#SBATCH -e slurm.%N.%j.err # STDERR
ulimit -s 10240
module load slurm
module load cuda10.1/toolkit/10.1.243
# Check how many GPUs your job got
#nvidia-smi
## Copy input files to the work directory:
rm -rf /work/$USER/ShallowWaterGPU
mkdir -p /work/$USER/ShallowWaterGPU
cp -r . /work/$USER/ShallowWaterGPU
# Run job
# (Assumes Miniconda is installed in the user's home directory.)
cd /work/$USER/ShallowWaterGPU
nvprof -o profiler_output $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 shmemTesting.py
cd $HOME/src/ShallowWaterGPU
## Copy files from work directory:
# (NOTE: Copying is not performed if job fails!)
cp /work/$USER/ShallowWaterGPU/*.log .
cp /work/$USER/ShallowWaterGPU/*.nc .
cp /work/$USER/ShallowWaterGPU/profiler_output .

View File

@ -1,10 +1,10 @@
#!/bin/bash #!/bin/bash
# See http://wiki.ex3.simula.no before changing the values below
#SBATCH -p dgx2q # partition (GPU queue) #SBATCH -p dgx2q # partition (GPU queue)
#SBATCH -N 1 # number of nodes #SBATCH -N 1 # number of nodes
#SBATCH -n 4 # number of cores #SBATCH -n 4 # number of cores
#SBATCH -w g001 # DGX-2 node #SBATCH -w g001 # DGX-2 node
#SBATCH --gres=gpu:4 # number of V100s #SBATCH --gres=gpu:4 # number of V100s
#SBATCH --mem 10G # memory pool for all cores
#SBATCH -t 0-00:10 # time (D-HH:MM) #SBATCH -t 0-00:10 # time (D-HH:MM)
#SBATCH -o slurm.%N.%j.out # STDOUT #SBATCH -o slurm.%N.%j.out # STDOUT
#SBATCH -e slurm.%N.%j.err # STDERR #SBATCH -e slurm.%N.%j.err # STDERR
@ -18,6 +18,7 @@ module load cuda10.1/toolkit/10.1.243
#nvidia-smi #nvidia-smi
## Copy input files to the work directory: ## Copy input files to the work directory:
rm -rf /work/$USER/ShallowWaterGPU
mkdir -p /work/$USER/ShallowWaterGPU mkdir -p /work/$USER/ShallowWaterGPU
cp -r . /work/$USER/ShallowWaterGPU cp -r . /work/$USER/ShallowWaterGPU

View File

@ -0,0 +1,59 @@
#!/bin/bash
# See http://wiki.ex3.simula.no before changing the values below
#SBATCH -p dgx2q # partition (GPU queue)
#SBATCH -w g001 # DGX-2 node
##SBATCH --gres=gpu:4 # number of V100s
#SBATCH -t 0-00:10 # time (D-HH:MM)
#SBATCH -o slurm.%N.%j.out # STDOUT
#SBATCH -e slurm.%N.%j.err # STDERR
#SBATCH --reservation=martinls_17
# For Linux 64, Open MPI is built with CUDA awareness but this support is disabled by default.
# To enable it, please set the environment variable OMPI_MCA_opal_cuda_support=true before
# launching your MPI processes. Equivalently, you can set the MCA parameter in the command line:
# mpiexec --mca opal_cuda_support 1 ...
#
# In addition, the UCX support is also built but disabled by default.
# To enable it, first install UCX (conda install -c conda-forge ucx). Then, set the environment
# variables OMPI_MCA_pml="ucx" OMPI_MCA_osc="ucx" before launching your MPI processes.
# Equivalently, you can set the MCA parameters in the command line:
# mpiexec --mca pml ucx --mca osc ucx ...
# Note that you might also need to set UCX_MEMTYPE_CACHE=n for CUDA awareness via UCX.
# Please consult UCX's documentation for detail.
ulimit -s 10240
module load slurm/20.02.7
module load cuda11.2/toolkit/11.2.2
module load openmpi4-cuda11.2-ofed50-gcc8/4.1.0
# Check how many GPUs your job got
#nvidia-smi
mkdir -p output_dgx-2/$NOW
## Copy input files to the work directory:
mkdir -p /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
cp -r . /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
# Run job
# (Assumes Miniconda is installed in the user's home directory.)
cd /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
#mpirun --mca btl_openib_if_include mlx5_0 --mca btl_openib_warn_no_device_params_found 0 $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
#nsys profile -t nvtx,cuda mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
#mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
export OMPI_MCA_opal_cuda_support=true
mpirun -np $SLURM_NTASKS $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
cd $HOME/src/ShallowWaterGPU
## Copy files from work directory:
# (NOTE: Copying is not performed if job fails!)
mkdir -p output_dgx-2/$NOW/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.log ./output_dgx-2/$NOW/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.nc ./output_dgx-2/$NOW/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.json ./output_dgx-2/$NOW
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.qdrep ./output_dgx-2/$NOW
rm -rf /work/$USER/$SLURM_JOB_ID
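Note that the OMPI_MCA_opal_cuda_support export above must take effect before MPI is initialized. When launching outside the job script, the same can be achieved from Python, as long as the variable is set before mpi4py's MPI module is imported (a hedged sketch, not part of this repository):

import os

# Must happen before 'from mpi4py import MPI', which initializes MPI by default.
os.environ["OMPI_MCA_opal_cuda_support"] = "true"

from mpi4py import MPI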

View File

@ -0,0 +1,73 @@
#!/bin/bash
TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")
# one node: 1-16 GPUs
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=4096,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=2731,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=2048,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=1638,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=1365,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=1170,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=1024,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#
#sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=8192,NY=910,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=8192,NY=819,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=8192,NY=745,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=8192,NY=683,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=8192,NY=630,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=8192,NY=585,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=8192,NY=546,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=8192,NY=512,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
# one node: 4-16 GPUs
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=41984,NY=10496,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=41984,NY=8396,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=41984,NY=6997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=41984,NY=5997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=41984,NY=5248,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#
#sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=41984,NY=4664,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=41984,NY=4198,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=41984,NY=3816,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=41984,NY=3498,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=41984,NY=3229,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=41984,NY=2998,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=41984,NY=2798,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=41984,NY=2624,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
# one node: 1-16 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=22528,NY=11264,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=22528,NY=7509,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=22528,NY=5632,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=22528,NY=4505,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=22528,NY=3754,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=22528,NY=3218,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=22528,NY=2816,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=22528,NY=2503,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=22528,NY=2252,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=22528,NY=2048,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=22528,NY=1877,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=22528,NY=1732,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=22528,NY=1609,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=22528,NY=1501,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=22528,NY=1408,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
# one node: 4-16 GPUs
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=45056,NY=11264,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=45056,NY=8396,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=45056,NY=6997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=45056,NY=5997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=45056,NY=5248,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=45056,NY=4664,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=45056,NY=4198,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=45056,NY=3816,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=45056,NY=3498,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=45056,NY=3229,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=45056,NY=2998,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=45056,NY=2798,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=45056,NY=2624,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
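In the uncommented runs above, NX is held fixed while NY shrinks with the GPU count. Since nx and ny appear to be per-rank subdomain sizes (compare genKelvinHelmholtz(nx, ny, gamma, grid=grid, index=i) in shmemTesting.py), this keeps the global problem size roughly constant as GPUs are added, i.e. a strong-scaling series; the companion script that keeps NY fixed grows the global domain with the rank count (weak scaling). A small helper (hypothetical, only to document the arithmetic; the script's values differ by at most one row due to rounding) reproduces the NY series:

def strip_height(total_ny, n_gpus):
    # Per-rank NY so that n_gpus * NY adds up to (about) the full domain height.
    return round(total_ny / n_gpus)

print([strip_height(22528, n) for n in range(1, 17)])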

View File

@ -0,0 +1,41 @@
#!/bin/bash
TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")
# one node: 1-16 GPUs
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#
#sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
# one node: 1-16 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job

58
hgx_scaling_benchmark.job Normal file
View File

@ -0,0 +1,58 @@
#!/bin/bash
# See http://wiki.ex3.simula.no before changing the values below
#SBATCH -p hgx2q # partition (GPU queue)
#SBATCH -w g002 # HGX node
#SBATCH -t 0-00:10 # time (D-HH:MM)
#SBATCH -o slurm.%N.%j.out # STDOUT
#SBATCH -e slurm.%N.%j.err # STDERR
#SBATCH --reservation=martinls_11
# For Linux 64, Open MPI is built with CUDA awareness but this support is disabled by default.
# To enable it, please set the environment variable OMPI_MCA_opal_cuda_support=true before
# launching your MPI processes. Equivalently, you can set the MCA parameter in the command line:
# mpiexec --mca opal_cuda_support 1 ...
#
# In addition, the UCX support is also built but disabled by default.
# To enable it, first install UCX (conda install -c conda-forge ucx). Then, set the environment
# variables OMPI_MCA_pml="ucx" OMPI_MCA_osc="ucx" before launching your MPI processes.
# Equivalently, you can set the MCA parameters in the command line:
# mpiexec --mca pml ucx --mca osc ucx ...
# Note that you might also need to set UCX_MEMTYPE_CACHE=n for CUDA awareness via UCX.
# Please consult UCX's documentation for detail.
ulimit -s 10240
module load slurm/20.02.7
module load cuda11.2/toolkit/11.2.2
module load openmpi4-cuda11.2-ofed50-gcc8/4.1.0
# Check how many GPUs your job got
#nvidia-smi
mkdir -p output_hgx/$NOW
## Copy input files to the work directory:
mkdir -p /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
cp -r . /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
# Run job
# (Assumes Miniconda is installed in the user's home directory.)
cd /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
#mpirun --mca btl_openib_if_include mlx5_0 --mca btl_openib_warn_no_device_params_found 0 $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
#nsys profile -t nvtx,cuda mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
#mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
export OMPI_MCA_opal_cuda_support=true
mpirun -np $SLURM_NTASKS $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
cd $HOME/src/ShallowWaterGPU
## Copy files from work directory:
# (NOTE: Copying is not performed if job fails!)
mkdir -p output_hgx/$NOW/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.log ./output_hgx/$NOW/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.nc ./output_hgx/$NOW/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.json ./output_hgx/$NOW
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.qdrep ./output_hgx/$NOW
rm -rf /work/$USER/$SLURM_JOB_ID

View File

@ -0,0 +1,20 @@
#!/bin/bash
TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")
# one node: 1-8 GPUs
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=4096,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=2731,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=2048,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=1638,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=1365,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=1170,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=1024,NOW=$TIMESTAMP hgx_scaling_benchmark.job
# one node: 4-8 GPUs
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=41984,NY=10496,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=41984,NY=8396,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=41984,NY=6997,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=41984,NY=5997,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=41984,NY=5248,NOW=$TIMESTAMP hgx_scaling_benchmark.job

View File

@ -0,0 +1,23 @@
#!/bin/bash
TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")
# one node: 1-8 GPUs
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
# one node: 1-8 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job

View File

@ -25,29 +25,45 @@ import gc
import time import time
import json import json
import logging import logging
import os
#MPI # MPI
from mpi4py import MPI from mpi4py import MPI
#CUDA # CUDA
import pycuda.driver as cuda import pycuda.driver as cuda
#Simulator engine etc # Simulator engine etc
from GPUSimulators import MPISimulator, Common, CudaContext from GPUSimulators import MPISimulator, Common, CudaContext
from GPUSimulators import EE2D_KP07_dimsplit from GPUSimulators import EE2D_KP07_dimsplit
from GPUSimulators.helpers import InitialConditions as IC from GPUSimulators.helpers import InitialConditions as IC
from GPUSimulators.Simulator import BoundaryCondition as BC from GPUSimulators.Simulator import BoundaryCondition as BC
import argparse
parser = argparse.ArgumentParser(description='Strong and weak scaling experiments.')
parser.add_argument('-nx', type=int, default=128)
parser.add_argument('-ny', type=int, default=128)
parser.add_argument('--profile', action='store_true') # default: False
#Get MPI COMM to use
args = parser.parse_args()
if(args.profile):
profiling_data = {}
# profiling: total run time
t_total_start = time.time()
t_init_start = time.time()
# Get MPI COMM to use
comm = MPI.COMM_WORLD comm = MPI.COMM_WORLD
#### ####
#Initialize logging # Initialize logging
#### ####
log_level_console = 20 log_level_console = 20
log_level_file = 10 log_level_file = 10
log_filename = 'mpi_' + str(comm.rank) + '.log' log_filename = 'mpi_' + str(comm.rank) + '.log'
logger = logging.getLogger('GPUSimulators') logger = logging.getLogger('GPUSimulators')
logger.setLevel(min(log_level_console, log_level_file)) logger.setLevel(min(log_level_console, log_level_file))
@ -55,15 +71,17 @@ logger.setLevel(min(log_level_console, log_level_file))
ch = logging.StreamHandler() ch = logging.StreamHandler()
ch.setLevel(log_level_console) ch.setLevel(log_level_console)
logger.addHandler(ch) logger.addHandler(ch)
logger.info("Console logger using level %s", logging.getLevelName(log_level_console)) logger.info("Console logger using level %s",
logging.getLevelName(log_level_console))
fh = logging.FileHandler(log_filename) fh = logging.FileHandler(log_filename)
formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') formatter = logging.Formatter(
'%(asctime)s:%(name)s:%(levelname)s: %(message)s')
fh.setFormatter(formatter) fh.setFormatter(formatter)
fh.setLevel(log_level_file) fh.setLevel(log_level_file)
logger.addHandler(fh) logger.addHandler(fh)
logger.info("File logger using level %s to %s", logging.getLevelName(log_level_file), log_filename) logger.info("File logger using level %s to %s",
logging.getLevelName(log_level_file), log_filename)
#### ####
@ -73,7 +91,6 @@ logger.info("Creating MPI grid")
grid = MPISimulator.MPIGrid(MPI.COMM_WORLD) grid = MPISimulator.MPIGrid(MPI.COMM_WORLD)
#### ####
# Initialize CUDA # Initialize CUDA
#### ####
@ -82,18 +99,28 @@ logger.info("Initializing CUDA")
local_rank = grid.getLocalRank() local_rank = grid.getLocalRank()
num_cuda_devices = cuda.Device.count() num_cuda_devices = cuda.Device.count()
cuda_device = local_rank % num_cuda_devices cuda_device = local_rank % num_cuda_devices
logger.info("Process %s using CUDA device %s", str(local_rank), str(cuda_device))
cuda_context = CudaContext.CudaContext(device=cuda_device, autotuning=False) cuda_context = CudaContext.CudaContext(device=cuda_device, autotuning=False)
#### ####
# Set initial conditions # Set initial conditions
#### ####
# DEBUGGING - setting random seed
np.random.seed(42)
logger.info("Generating initial conditions") logger.info("Generating initial conditions")
nx = 128 nx = args.nx
ny = 128 ny = args.ny
dt = 0.000001
gamma = 1.4 gamma = 1.4
save_times = np.linspace(0, 5.0, 10) #save_times = np.linspace(0, 0.000009, 2)
#save_times = np.linspace(0, 0.000099, 11)
#save_times = np.linspace(0, 0.000099, 2)
save_times = np.linspace(0, 0.0000999, 2)
outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc" outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc"
save_var_names = ['rho', 'rho_u', 'rho_v', 'E'] save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
@ -102,21 +129,65 @@ arguments['context'] = cuda_context
arguments['theta'] = 1.2 arguments['theta'] = 1.2
arguments['grid'] = grid arguments['grid'] = grid
if(args.profile):
t_init_end = time.time()
t_init = t_init_end - t_init_start
profiling_data["t_init"] = t_init
#### ####
# Run simulation # Run simulation
#### ####
logger.info("Running simulation") logger.info("Running simulation")
#Helper function to create MPI simulator # Helper function to create MPI simulator
def genSim(grid, **kwargs): def genSim(grid, **kwargs):
local_sim = EE2D_KP07_dimsplit.EE2D_KP07_dimsplit(**kwargs) local_sim = EE2D_KP07_dimsplit.EE2D_KP07_dimsplit(**kwargs)
sim = MPISimulator.MPISimulator(local_sim, grid) sim = MPISimulator.MPISimulator(local_sim, grid)
return sim return sim
outfile = Common.runSimulation(genSim, arguments, outfile, save_times, save_var_names)
outfile, sim_runner_profiling_data, sim_profiling_data = Common.runSimulation(
genSim, arguments, outfile, save_times, save_var_names, dt)
if(args.profile):
t_total_end = time.time()
t_total = t_total_end - t_total_start
profiling_data["t_total"] = t_total
print("Total run time on rank " + str(MPI.COMM_WORLD.rank) + " is " + str(t_total) + " s")
# write profiling to json file
if(args.profile and MPI.COMM_WORLD.rank == 0):
job_id = ""
if "SLURM_JOB_ID" in os.environ:
job_id = int(os.environ["SLURM_JOB_ID"])
allocated_nodes = int(os.environ["SLURM_JOB_NUM_NODES"])
allocated_gpus = int(os.environ["CUDA_VISIBLE_DEVICES"].count(",") + 1)
profiling_file = "MPI_jobid_" + \
str(job_id) + "_" + str(allocated_nodes) + "_nodes_and_" + str(allocated_gpus) + "_GPUs_profiling.json"
profiling_data["outfile"] = outfile
else:
profiling_file = "MPI_" + str(MPI.COMM_WORLD.size) + "_procs_and_" + str(num_cuda_devices) + "_GPUs_profiling.json"
for stage in sim_runner_profiling_data["start"].keys():
profiling_data[stage] = sim_runner_profiling_data["end"][stage] - sim_runner_profiling_data["start"][stage]
for stage in sim_profiling_data["start"].keys():
profiling_data[stage] = sim_profiling_data["end"][stage] - sim_profiling_data["start"][stage]
profiling_data["nx"] = nx
profiling_data["ny"] = ny
profiling_data["dt"] = dt
profiling_data["n_time_steps"] = sim_profiling_data["n_time_steps"]
profiling_data["slurm_job_id"] = job_id
profiling_data["n_cuda_devices"] = str(num_cuda_devices)
profiling_data["n_processes"] = str(MPI.COMM_WORLD.size)
profiling_data["git_hash"] = Common.getGitHash()
profiling_data["git_status"] = Common.getGitStatus()
with open(profiling_file, "w") as write_file:
json.dump(profiling_data, write_file)
#### ####
# Clean shutdown # Clean shutdown
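With --profile, every job thus leaves one MPI_*_profiling.json file per run, containing t_init, t_total, the per-stage timings copied from the simulation runner, and the run metadata. A short sketch (hypothetical, not part of the repository) for tabulating the results afterwards:

import glob
import json

for path in sorted(glob.glob("MPI_*_profiling.json")):
    with open(path) as f:
        data = json.load(f)
    # n_processes, t_total and t_init are all written by the block above.
    print(path, data["n_processes"], "procs,",
          data["t_total"], "s total,", data["t_init"], "s init")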

8
run_script_ppi.sh Executable file
View File

@ -0,0 +1,8 @@
#!/bin/bash
module purge
module load git/2.21.0 hdf5/1.10.5-gcc cuda/10.1
conda activate ShallowWaterGPU_HPC
python mpiTesting.py

54
saga-dev.job Normal file
View File

@ -0,0 +1,54 @@
#!/bin/bash
# Job name:
#SBATCH --job-name=ShallowWaterGPUScalingDev
#
# Project:
#SBATCH --account=nn9882k
#
# Wall clock limit:
#SBATCH --time=00:02:00
#
# NOTE: See https://documentation.sigma2.no/jobs/projects_accounting.html when adjusting the values below
#
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
#SBATCH --partition=accel
#
# Max memory usage per task (core) - increasing this will cost more core hours:
#SBATCH --mem-per-cpu=3800M
#
# Number of tasks:
#SBATCH --nodes=1 --gpus-per-node=1 --ntasks-per-node=1
#
#SBATCH --qos=devel
## Set up job environment: (this is done automatically behind the scenes)
## (make sure to comment '#' or remove the following line 'source ...')
# source /cluster/bin/jobsetup
module restore system # instead of 'module purge', reset the module environment to the system default
module load CUDA/11.4.1
# It is also recommended to list loaded modules for easier debugging:
module list
set -o errexit # exit on errors
set -o nounset # Treat unset variables as errors (added to more easily discover issues in your batch script)
## Copy input files to the work directory:
mkdir $SCRATCH/ShallowWaterGPU
cp -r . $SCRATCH/ShallowWaterGPU
## Make sure the results are copied back to the submit directory (see Work Directory below):
# chkfile MyResultFile
# chkfile is replaced by 'savefile' on Saga
savefile "$SCRATCH/ShallowWaterGPU/*.log"
savefile "$SCRATCH/ShallowWaterGPU/*.nc"
savefile "$SCRATCH/ShallowWaterGPU/*.json"
## Do some work:
cd $SCRATCH/ShallowWaterGPU
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 1024 -ny 1024 --profile

View File

@ -1,24 +1,25 @@
#!/bin/bash #!/bin/bash
# Job name: # Job name:
#SBATCH --job-name=saga-test #SBATCH --job-name=ShallowWaterGPUStrongScaling
# #
# Project: # Project:
#SBATCH --account=nn9550k #SBATCH --account=nn9882k
# #
# Wall clock limit: # Wall clock limit:
#SBATCH --time=00:10:00 #SBATCH --time=24:00:00
#
# NOTE: See https://documentation.sigma2.no/jobs/projects_accounting.html when adjusting the values below
# #
# Ask for 1 GPU (max is 2)
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU # Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to # device(s) to use. It will have values '0', '1' or '0,1' corresponding to
# /dev/nvidia0, /dev/nvidia1 or both, respectively. # /dev/nvidia0, /dev/nvidia1 or both, respectively.
#SBATCH --partition=accel --gres=gpu:1 #SBATCH --partition=accel
# #
# Max memory usage per task (core) - increasing this will cost more core hours: # Max memory usage per task (core) - increasing this will cost more core hours:
#SBATCH --mem-per-cpu=4G #SBATCH --mem-per-cpu=3800M
# #
# Number of tasks: # Number of tasks:
#SBATCH --nodes=2 --ntasks-per-node=1 #SBATCH --nodes=1 --gpus-per-node=1 --ntasks-per-node=1
## Set up job environment: (this is done automatically behind the scenes) ## Set up job environment: (this is done automatically behind the scenes)
## (make sure to comment '#' or remove the following line 'source ...') ## (make sure to comment '#' or remove the following line 'source ...')
@ -42,9 +43,10 @@ cp -r . $SCRATCH/ShallowWaterGPU
# chkfile is replaced by 'savefile' on Saga # chkfile is replaced by 'savefile' on Saga
savefile "$SCRATCH/ShallowWaterGPU/*.log" savefile "$SCRATCH/ShallowWaterGPU/*.log"
savefile "$SCRATCH/ShallowWaterGPU/*.nc" savefile "$SCRATCH/ShallowWaterGPU/*.nc"
savefile "$SCRATCH/ShallowWaterGPU/*.json"
## Do some work: ## Do some work:
cd $SCRATCH/ShallowWaterGPU cd $SCRATCH/ShallowWaterGPU
srun /cluster/home/$HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
srun /cluster/home/$HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 8192 -ny 8192 --profile

View File

@ -0,0 +1,65 @@
#!/bin/bash
# Job name:
#SBATCH --job-name=ShallowWaterGPUScaling
#
# Project:
#SBATCH --account=nn9882k
#
# Wall clock limit:
#SBATCH --time=00:10:00
#
# NOTE: See https://documentation.sigma2.no/jobs/projects_accounting.html when adjusting the values below
#
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
#SBATCH --partition=accel
#
# Max memory usage per task (core) - increasing this will cost more core hours:
##SBATCH --mem-per-cpu=3800M
#SBATCH --mem-per-cpu=24G
#
#SBATCH --qos=devel
## Set up job environment: (this is done automatically behind the scenes)
## (make sure to comment '#' or remove the following line 'source ...')
# source /cluster/bin/jobsetup
module restore system # instead of 'module purge', reset the module environment to the system default
module load CUDA/11.4.1
#module load CUDA/11.1.1-GCC-10.2.0
#module load OpenMPI/4.0.5-gcccuda-2020b
# It is also recommended to list loaded modules for easier debugging:
module list
set -o errexit # exit on errors
set -o nounset # Treat unset variables as errors (added to more easily discover issues in your batch script)
## Copy input files to the work directory:
mkdir $SCRATCH/ShallowWaterGPU
cp -r . $SCRATCH/ShallowWaterGPU
## Make sure the results are copied back to the submit directory (see Work Directory below):
# chkfile MyResultFile
# chkfile is replaced by 'savefile' on Saga
#savefile "$SCRATCH/ShallowWaterGPU/*.log"
#savefile "$SCRATCH/ShallowWaterGPU/*.nc"
#savefile "$SCRATCH/ShallowWaterGPU/*.json"
#savefile "$SCRATCH/ShallowWaterGPU/*.qdrep"
cleanup "rm -rf $SCRATCH/ShallowWaterGPU"
export OMPI_MCA_opal_cuda_support=true
## Do some work:
cd $SCRATCH/ShallowWaterGPU
srun /cluster/projects/nn9882k/martinls/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
srun /cluster/projects/nn9882k/martinls/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
cd $HOME/src/ShallowWaterGPU
mkdir -p output_saga/$NOW/$SLURM_JOB_ID
mv $SCRATCH/ShallowWaterGPU/*.log ./output_saga/$NOW/$SLURM_JOB_ID
mv $SCRATCH/ShallowWaterGPU/*.nc ./output_saga/$NOW/$SLURM_JOB_ID
mv $SCRATCH/ShallowWaterGPU/*.json ./output_saga/$NOW
mv $SCRATCH/ShallowWaterGPU/*.qdrep ./output_saga/$NOW

View File

@ -0,0 +1,30 @@
#!/bin/bash
TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")
# one node: 1-4 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 1 ranks
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=10240,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=6826,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=5120,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
# 4 nodes: 1-4 GPUs per node
sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=5120,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
sbatch --nodes=4 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=2560,NOW=$TIMESTAMP saga_scaling_benchmark.job # 8 ranks
sbatch --nodes=4 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=1706,NOW=$TIMESTAMP saga_scaling_benchmark.job # 12 ranks
sbatch --nodes=4 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=1280,NOW=$TIMESTAMP saga_scaling_benchmark.job # 16 ranks
# 4 nodes: 1-4 GPUs per node
sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=40960,NY=10240,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
sbatch --nodes=4 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=40960,NY=5120,NOW=$TIMESTAMP saga_scaling_benchmark.job # 8 ranks
sbatch --nodes=4 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=40960,NY=3413,NOW=$TIMESTAMP saga_scaling_benchmark.job # 12 ranks
sbatch --nodes=4 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=40960,NY=2560,NOW=$TIMESTAMP saga_scaling_benchmark.job # 16 ranks
## one node: 4 GPUs
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=24576,NY=6144,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
#
## 4 nodes: 1-4 GPUs per node
#sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=24576,NY=6144,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
#sbatch --nodes=4 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=24576,NY=3072,NOW=$TIMESTAMP saga_scaling_benchmark.job # 8 ranks
#sbatch --nodes=4 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=24576,NY=2048,NOW=$TIMESTAMP saga_scaling_benchmark.job # 12 ranks
#sbatch --nodes=4 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=24576,NY=1536,NOW=$TIMESTAMP saga_scaling_benchmark.job # 16 ranks

View File

@ -0,0 +1,25 @@
#!/bin/bash
TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")
# one node: 1-4 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 1 ranks
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
# 2-4 nodes: 1 GPU per node
sbatch --nodes=2 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
sbatch --nodes=3 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
## one node: 1-4 GPUs
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 1 ranks
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
## 2-4 nodes: 1 GPU per node
#sbatch --nodes=2 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
#sbatch --nodes=3 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
#sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks

View File

@ -0,0 +1,39 @@
#!/bin/bash
NOW=$(date "+%Y-%m-%dT%H%M%S")
mkdir -p output_seymour/$NOW
# one node: 1-8 GPUs
mpiexec -n 1 python mpiTesting.py -nx 8192 -ny 8192 --profile &&
mkdir -p output_seymour/$NOW/1_proc &&
mv *.log output_seymour/$NOW/1_proc/ && mv *.nc output_seymour/$NOW/1_proc/ &&
mpiexec -n 2 python mpiTesting.py -nx 8192 -ny 4096 --profile &&
mkdir -p output_seymour/$NOW/2_proc &&
mv *.log output_seymour/$NOW/2_proc/ && mv *.nc output_seymour/$NOW/2_proc/ &&
mpiexec -n 3 python mpiTesting.py -nx 8192 -ny 2731 --profile &&
mkdir -p output_seymour/$NOW/3_proc &&
mv *.log output_seymour/$NOW/3_proc/ && mv *.nc output_seymour/$NOW/3_proc/ &&
mpiexec -n 4 python mpiTesting.py -nx 8192 -ny 2048 --profile &&
mkdir -p output_seymour/$NOW/4_proc &&
mv *.log output_seymour/$NOW/4_proc/ && mv *.nc output_seymour/$NOW/4_proc/ &&
mpiexec -n 5 python mpiTesting.py -nx 8192 -ny 1638 --profile &&
mkdir -p output_seymour/$NOW/5_proc &&
mv *.log output_seymour/$NOW/5_proc/ && mv *.nc output_seymour/$NOW/5_proc/ &&
mpiexec -n 6 python mpiTesting.py -nx 8192 -ny 1365 --profile &&
mkdir -p output_seymour/$NOW/6_proc &&
mv *.log output_seymour/$NOW/6_proc/ && mv *.nc output_seymour/$NOW/6_proc/ &&
mpiexec -n 7 python mpiTesting.py -nx 8192 -ny 1170 --profile &&
mkdir -p output_seymour/$NOW/7_proc &&
mv *.log output_seymour/$NOW/7_proc/ && mv *.nc output_seymour/$NOW/7_proc/ &&
mpiexec -n 8 python mpiTesting.py -nx 8192 -ny 1024 --profile &&
mkdir -p output_seymour/$NOW/8_proc &&
mv *.log output_seymour/$NOW/8_proc/ && mv *.nc output_seymour/$NOW/8_proc/ &&
for filename in *.json; do mv "$filename" "output_seymour/$NOW/MPI_${NOW}_${filename#????}"; done;

127
shmemTesting.py Normal file
View File

@ -0,0 +1,127 @@
# -*- coding: utf-8 -*-
"""
This python module implements SHMEM (shared memory) simulations for benchmarking
Copyright (C) 2020 Norwegian Meteorological Institute
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import numpy as np
import gc
import time
import json
import logging
#Simulator engine etc
from GPUSimulators import SHMEMSimulatorGroup, Common, CudaContext
from GPUSimulators import EE2D_KP07_dimsplit
from GPUSimulators.helpers import InitialConditions as IC
from GPUSimulators.Simulator import BoundaryCondition as BC
####
#Initialize logging
####
log_level_console = 20
log_level_file = 10
log_filename = 'shmem.log'
logger = logging.getLogger('GPUSimulators')
logger.setLevel(min(log_level_console, log_level_file))
ch = logging.StreamHandler()
ch.setLevel(log_level_console)
logger.addHandler(ch)
logger.info("Console logger using level %s", logging.getLevelName(log_level_console))
fh = logging.FileHandler(log_filename)
formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s')
fh.setFormatter(formatter)
fh.setLevel(log_level_file)
logger.addHandler(fh)
logger.info("File logger using level %s to %s", logging.getLevelName(log_level_file), log_filename)
####
# Initialize SHMEM grid etc
####
logger.info("Creating SHMEM grid")
# XXX: need to explicitly set ngpus when testing on single-GPU system
grid = SHMEMSimulatorGroup.SHMEMGrid(ngpus=4)
####
# Set initial conditions
####
logger.info("Generating initial conditions")
nx = 128
ny = 128
gamma = 1.4
#save_times = np.linspace(0, 0.01, 10)
save_times = np.linspace(0, 10, 10)
save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
outfile = "shmem_out.nc"
####
# Run simulation
####
logger.info("Running simulation")
sims = []
for i in range(grid.ngpus):
arguments = IC.genKelvinHelmholtz(nx, ny, gamma, grid=grid, index=i)
arguments['context'] = grid.cuda_contexts[i]
arguments['theta'] = 1.2
sims.append(EE2D_KP07_dimsplit.EE2D_KP07_dimsplit(**arguments))
#sims[i] = SHMEMSimulator(i, local_sim, grid) # 1st attempt: no wrapper (per sim)
arguments['sims'] = sims
arguments['grid'] = grid
#Helper function to create SHMEM simulator
def genSim(sims, grid, **kwargs):
# XXX: kwargs not used, since the simulators are already instantiated in the for-loop above
sim = SHMEMSimulatorGroup.SHMEMSimulatorGroup(sims, grid)
return sim
# runSimulation also returns profiling data (cf. the unpacking in mpiTesting.py)
outfile, sim_runner_profiling_data, sim_profiling_data = Common.runSimulation(genSim, arguments, outfile, save_times, save_var_names)
####
# Clean shutdown
####
sim = None
local_sim = None
cuda_context = None
arguments = None
logging.shutdown()
gc.collect()
####
# Print completion and exit
####
print("Completed!")
exit(0)
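SHMEMSimulatorGroup evidently keeps one CUDA context per GPU inside a single process (grid.cuda_contexts[i] above), in contrast to mpiTesting.py, which uses one process per GPU. A minimal sketch of that idea, assuming only the CudaContext constructor already used in this repository:

import pycuda.driver as cuda
from GPUSimulators import CudaContext

cuda.init()
# One context per visible device, all owned by this single process:
contexts = [CudaContext.CudaContext(device=i, autotuning=False)
            for i in range(cuda.Device.count())]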

126
singleGPUTesting.py Normal file
View File

@ -0,0 +1,126 @@
# -*- coding: utf-8 -*-
"""
This python module implements simulations for benchmarking
Copyright (C) 2018 SINTEF ICT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import numpy as np
import gc
import logging
import os
# CUDA
import pycuda.driver as cuda
# Simulator engine etc
from GPUSimulators import Common, CudaContext
from GPUSimulators import EE2D_KP07_dimsplit
from GPUSimulators.helpers import InitialConditions as IC
from GPUSimulators.Simulator import BoundaryCondition as BC
import argparse
parser = argparse.ArgumentParser(description='Single GPU testing.')
parser.add_argument('-nx', type=int, default=128)
parser.add_argument('-ny', type=int, default=128)
args = parser.parse_args()
####
# Initialize logging
####
log_level_console = 20
log_level_file = 10
log_filename = 'single_gpu.log'
logger = logging.getLogger('GPUSimulators')
logger.setLevel(min(log_level_console, log_level_file))
ch = logging.StreamHandler()
ch.setLevel(log_level_console)
logger.addHandler(ch)
logger.info("Console logger using level %s",
logging.getLevelName(log_level_console))
fh = logging.FileHandler(log_filename)
formatter = logging.Formatter(
'%(asctime)s:%(name)s:%(levelname)s: %(message)s')
fh.setFormatter(formatter)
fh.setLevel(log_level_file)
logger.addHandler(fh)
logger.info("File logger using level %s to %s",
logging.getLevelName(log_level_file), log_filename)
####
# Initialize CUDA
####
cuda.init(flags=0)
logger.info("Initializing CUDA")
cuda_context = CudaContext.CudaContext(autotuning=False)
####
# Set initial conditions
####
logger.info("Generating initial conditions")
nx = args.nx
ny = args.ny
gamma = 1.4
roughness = 0.125
save_times = np.linspace(0, 0.5, 10)
outfile = "single_gpu_out.nc"
save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
arguments = IC.genKelvinHelmholtz(nx, ny, gamma)
arguments['context'] = cuda_context
arguments['theta'] = 1.2
####
# Run simulation
####
logger.info("Running simulation")
# Helper function to create the simulator
def genSim(**kwargs):
local_sim = EE2D_KP07_dimsplit.EE2D_KP07_dimsplit(**kwargs)
return local_sim
# runSimulation also returns profiling data (cf. the unpacking in mpiTesting.py)
outfile, sim_runner_profiling_data, sim_profiling_data = Common.runSimulation(
    genSim, arguments, outfile, save_times, save_var_names)
####
# Clean shutdown
####
local_sim = None
cuda_context = None
arguments = None
logging.shutdown()
gc.collect()
####
# Print completion and exit
####
print("Completed!")
exit(0)