Merge branch 'master' of github.com:setmar/ShallowWaterGPU

This commit is contained in: commit 234c8cb727

Figures.ipynb (619 lines changed): file diff suppressed because one or more lines are too long.
@@ -100,8 +100,8 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
    profiling_data_sim_runner["end"]["t_sim_init"] = 0
    profiling_data_sim_runner["start"]["t_nc_write"] = 0
    profiling_data_sim_runner["end"]["t_nc_write"] = 0
    profiling_data_sim_runner["start"]["t_step"] = 0
    profiling_data_sim_runner["end"]["t_step"] = 0
    profiling_data_sim_runner["start"]["t_full_step"] = 0
    profiling_data_sim_runner["end"]["t_full_step"] = 0

    profiling_data_sim_runner["start"]["t_sim_init"] = time.time()

@@ -121,7 +121,14 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
    outdata.ncfile.git_hash = getGitHash()
    outdata.ncfile.git_status = getGitStatus()
    outdata.ncfile.simulator = str(simulator)
    outdata.ncfile.sim_args = toJson(simulator_args)

    # do not write fields to attributes (they are too large)
    simulator_args_for_ncfile = simulator_args.copy()
    del simulator_args_for_ncfile["rho"]
    del simulator_args_for_ncfile["rho_u"]
    del simulator_args_for_ncfile["rho_v"]
    del simulator_args_for_ncfile["E"]
    outdata.ncfile.sim_args = toJson(simulator_args_for_ncfile)

    #Create dimensions
    outdata.ncfile.createDimension('time', len(save_times))
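The hunk above records the repository state in the NetCDF attributes via getGitHash() and getGitStatus(). A minimal sketch (not part of the commit) of what such helpers typically wrap; the real ones live in the repository's Common module, and the names below are illustrative:

import subprocess

def get_git_hash():
    # current commit id, e.g. "234c8cb727..."
    return subprocess.check_output(["git", "rev-parse", "HEAD"]).decode().strip()

def get_git_status():
    # compact listing of modified/untracked files
    return subprocess.check_output(["git", "status", "--porcelain"]).decode()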
@@ -172,13 +179,13 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
            logger.error("Error after {:d} steps (t={:f}): {:s}".format(sim.simSteps(), sim.simTime(), str(e)))
            return outdata.filename

        profiling_data_sim_runner["start"]["t_step"] += time.time()
        profiling_data_sim_runner["start"]["t_full_step"] += time.time()

        #Simulate
        if (t_step > 0.0):
            sim.simulate(t_step, dt)

        profiling_data_sim_runner["end"]["t_step"] += time.time()
        profiling_data_sim_runner["end"]["t_full_step"] += time.time()

        profiling_data_sim_runner["start"]["t_nc_write"] += time.time()
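A minimal sketch (not part of the commit) of how the accumulated start/end sums above turn into elapsed time: both keys collect one timestamp per step, so the difference of the two sums is the total time spent in that phase.

import time

profiling = {"start": {"t_step": 0.0}, "end": {"t_step": 0.0}}
for _ in range(10):
    profiling["start"]["t_step"] += time.time()   # timestamp before the step
    time.sleep(0.01)                              # stand-in for sim.simulate(t_step, dt)
    profiling["end"]["t_step"] += time.time()     # timestamp after the step

total_t_step = profiling["end"]["t_step"] - profiling["start"]["t_step"]
print("time spent in t_step: {:.3f} s".format(total_t_step))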
@@ -138,9 +138,9 @@ class EE2D_KP07_dimsplit (BaseSimulator):
            return

        if external and not internal:
            #############################################################
            # XXX: Only treating north and south external cells for now #
            #############################################################
            ###################################
            # XXX: Corners are treated twice! #
            ###################################

            ns_grid_size = (self.grid_size[0], 1)
@@ -189,14 +189,58 @@ class EE2D_KP07_dimsplit (BaseSimulator):
                self.cfl_data.gpudata,
                0, 0,
                self.nx, int(self.u0[0].y_halo))

            we_grid_size = (1, self.grid_size[1])

            # WEST
            # (x0, y0) x (x1, y1)
            # (0, 0) x (x_halo, ny)
            self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
                self.nx, self.ny,
                self.dx, self.dy, dt,
                self.g,
                self.gamma,
                self.theta,
                substep,
                self.boundary_conditions,
                self.u0[0].data.gpudata, self.u0[0].data.strides[0],
                self.u0[1].data.gpudata, self.u0[1].data.strides[0],
                self.u0[2].data.gpudata, self.u0[2].data.strides[0],
                self.u0[3].data.gpudata, self.u0[3].data.strides[0],
                self.u1[0].data.gpudata, self.u1[0].data.strides[0],
                self.u1[1].data.gpudata, self.u1[1].data.strides[0],
                self.u1[2].data.gpudata, self.u1[2].data.strides[0],
                self.u1[3].data.gpudata, self.u1[3].data.strides[0],
                self.cfl_data.gpudata,
                0, 0,
                int(self.u0[0].x_halo), self.ny)

            # EAST
            # (x0, y0) x (x1, y1)
            # (nx-x_halo, 0) x (nx, ny)
            self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
                self.nx, self.ny,
                self.dx, self.dy, dt,
                self.g,
                self.gamma,
                self.theta,
                substep,
                self.boundary_conditions,
                self.u0[0].data.gpudata, self.u0[0].data.strides[0],
                self.u0[1].data.gpudata, self.u0[1].data.strides[0],
                self.u0[2].data.gpudata, self.u0[2].data.strides[0],
                self.u0[3].data.gpudata, self.u0[3].data.strides[0],
                self.u1[0].data.gpudata, self.u1[0].data.strides[0],
                self.u1[1].data.gpudata, self.u1[1].data.strides[0],
                self.u1[2].data.gpudata, self.u1[2].data.strides[0],
                self.u1[3].data.gpudata, self.u1[3].data.strides[0],
                self.cfl_data.gpudata,
                self.nx - int(self.u0[0].x_halo), 0,
                self.nx, self.ny)
            return

        if internal and not external:
            #############################################################
            # XXX: Only treating north and south external cells for now #
            # So we need to include west and east boundary here!        #
            #############################################################

            # INTERNAL DOMAIN
            # (x0, y0) x (x1, y1)
            # (x_halo, y_halo) x (nx - x_halo, ny - y_halo)

@@ -217,8 +261,8 @@ class EE2D_KP07_dimsplit (BaseSimulator):
                self.u1[2].data.gpudata, self.u1[2].data.strides[0],
                self.u1[3].data.gpudata, self.u1[3].data.strides[0],
                self.cfl_data.gpudata,
                0, int(self.u0[0].y_halo),
                self.nx, self.ny - int(self.u0[0].y_halo))
                int(self.u0[0].x_halo), int(self.u0[0].y_halo),
                self.nx - int(self.u0[0].x_halo), self.ny - int(self.u0[0].y_halo))
            return

    def swapBuffers(self):
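A sketch (not part of the commit) of the (x0, y0) x (x1, y1) extents used by the external/internal launches above, assuming nx, ny is the interior size and x_halo, y_halo are the ghost-cell widths; the strip labels are illustrative. Because the west/east strips span the full height, the corner cells end up being treated twice, exactly as the XXX comment warns.

def subdomain_extents(nx, ny, x_halo, y_halo):
    # extents as (x0, y0, x1, y1), mirroring the kernel arguments above
    return {
        "ns_low":   (0, 0, nx, y_halo),
        "ns_high":  (0, ny - y_halo, nx, ny),
        "west":     (0, 0, x_halo, ny),
        "east":     (nx - x_halo, 0, nx, ny),
        "internal": (x_halo, y_halo, nx - x_halo, ny - y_halo),
    }

print(subdomain_extents(nx=16, ny=8, x_halo=2, y_halo=2))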
@@ -27,7 +27,7 @@ from mpi4py import MPI
import time

import pycuda.driver as cuda
import nvtx
#import nvtx


@@ -208,18 +208,17 @@ class MPISimulator(Simulator.BaseSimulator):
    """
    def __init__(self, sim, grid):
        self.profiling_data_mpi = { 'start': {}, 'end': {} }
        self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange"] = 0
        self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange"] = 0
        self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_download"] = 0
        self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_download"] = 0
        self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_upload"] = 0
        self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_upload"] = 0
        self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] = 0
        self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] = 0
        self.profiling_data_mpi["start"]["t_step_mpi"] = 0
        self.profiling_data_mpi["end"]["t_step_mpi"] = 0
        self.profiling_data_mpi["start"]["t_mpi_halo_exchange"] = 0
        self.profiling_data_mpi["end"]["t_mpi_halo_exchange"] = 0
        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] = 0
        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] = 0
        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] = 0
        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] = 0
        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] = 0
        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] = 0
        self.profiling_data_mpi["start"]["t_mpi_step"] = 0
        self.profiling_data_mpi["end"]["t_mpi_step"] = 0
        self.profiling_data_mpi["n_time_steps"] = 0
        self.profiling_data_mpi["start"]["t_sim_mpi_init"] = time.time()
        self.logger = logging.getLogger(__name__)

        autotuner = sim.context.autotuner
@@ -297,43 +296,43 @@ class MPISimulator(Simulator.BaseSimulator):
        #Note that east and west also transfer ghost cells
        #whilst north/south only transfer internal cells
        #Reuses the width/height defined in the read-extents above
        self.in_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_e[3], self.read_e[2]), dtype=np.float32)
        self.in_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_w[3], self.read_w[2]), dtype=np.float32)
        self.in_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_n[3], self.read_n[2]), dtype=np.float32)
        self.in_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_s[3], self.read_s[2]), dtype=np.float32)
        self.in_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32) #np.empty((self.nvars, self.read_e[3], self.read_e[2]), dtype=np.float32)
        self.in_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32) #np.empty((self.nvars, self.read_w[3], self.read_w[2]), dtype=np.float32)
        self.in_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32) #np.empty((self.nvars, self.read_n[3], self.read_n[2]), dtype=np.float32)
        self.in_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32) #np.empty((self.nvars, self.read_s[3], self.read_s[2]), dtype=np.float32)

        #Allocate data for sending
        self.out_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_e)
        self.out_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_w)
        self.out_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_n)
        self.out_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_s)
        self.out_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32) #np.empty_like(self.in_e)
        self.out_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32) #np.empty_like(self.in_w)
        self.out_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32) #np.empty_like(self.in_n)
        self.out_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32) #np.empty_like(self.in_s)

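A minimal sketch (not part of the commit) of why the halo buffers above are allocated with cuda.pagelocked_empty: asynchronous device/host copies on a stream require pinned host memory. Shapes are illustrative and a CUDA device is assumed.

import numpy as np
import pycuda.autoinit            # creates a context on the default device
import pycuda.driver as cuda

stream = cuda.Stream()
halo = cuda.pagelocked_empty((4, 16), dtype=np.float32)   # pinned host buffer, like self.in_e
d_halo = cuda.mem_alloc(halo.nbytes)                      # raw device buffer

cuda.memcpy_htod_async(d_halo, halo, stream)   # upload without blocking the host
cuda.memcpy_dtoh_async(halo, d_halo, stream)   # download without blocking the host
stream.synchronize()                           # wait before reading or reusing halo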
        self.logger.debug("Simulator rank {:d} initialized on {:s}".format(self.grid.comm.rank, MPI.Get_processor_name()))
        self.profiling_data_mpi["end"]["t_sim_mpi_init"] = time.time()

        self.old_exchange()
        self.full_exchange()
        sim.context.synchronize()

    def substep(self, dt, step_number):

        nvtx.mark("substep start", color="yellow")
        #nvtx.mark("substep start", color="yellow")

        self.profiling_data_mpi["start"]["t_step_mpi"] += time.time()
        self.profiling_data_mpi["start"]["t_mpi_step"] += time.time()

        nvtx.mark("substep internal", color="red")
        self.sim.substep(dt, step_number, internal=True, external=False) # "internal ghost cells" excluded

        nvtx.mark("substep external", color="blue")
        #nvtx.mark("substep external", color="blue")
        self.sim.substep(dt, step_number, external=True, internal=False) # only "internal ghost cells"

        #nvtx.mark("substep internal", color="red")
        self.sim.substep(dt, step_number, internal=True, external=False) # "internal ghost cells" excluded

        #nvtx.mark("substep full", color="blue")
        #self.sim.substep(dt, step_number, external=True, internal=True)

        self.sim.swapBuffers()

        self.profiling_data_mpi["end"]["t_step_mpi"] += time.time()
        self.profiling_data_mpi["end"]["t_mpi_step"] += time.time()

        nvtx.mark("exchange", color="blue")
        self.old_exchange()
        #nvtx.mark("exchange", color="blue")
        self.full_exchange()

        #nvtx.mark("download", color="blue")
        #self.download_for_exchange(self.sim.u0)

@@ -344,10 +343,10 @@ class MPISimulator(Simulator.BaseSimulator):
        #nvtx.mark("upload", color="blue")
        #self.upload_for_exchange(self.sim.u0)

        nvtx.mark("sync start", color="blue")
        #nvtx.mark("sync start", color="blue")
        self.sim.stream.synchronize()
        self.sim.internal_stream.synchronize()
        nvtx.mark("sync end", color="blue")
        #nvtx.mark("sync end", color="blue")

        self.profiling_data_mpi["n_time_steps"] += 1

@@ -383,8 +382,7 @@ class MPISimulator(Simulator.BaseSimulator):
        return [x0, x1, y0, y1]

    def download_for_exchange(self, u):
        if self.profiling_data_mpi["n_time_steps"] > 0:
            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_download"] += time.time()
        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] += time.time()

        # North-south
        if self.north is not None:

@@ -406,12 +404,10 @@ class MPISimulator(Simulator.BaseSimulator):
                u[k].download(self.sim.stream, cpu_data=self.out_w[k,:,:], asynch=True, extent=self.read_w)
                #self.out_w[k,:,:] = u[k].download(self.sim.stream, asynch=True, extent=self.read_w)

        if self.profiling_data_mpi["n_time_steps"] > 0:
            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_download"] += time.time()
        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] += time.time()

    def exchange(self):
        if self.profiling_data_mpi["n_time_steps"] > 0:
            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()

        #Send/receive to north/south neighbours
        comm_send = []

@@ -441,12 +437,10 @@ class MPISimulator(Simulator.BaseSimulator):
        for comm in comm_send:
            comm.wait()

        if self.profiling_data_mpi["n_time_steps"] > 0:
            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()

    def upload_for_exchange(self, u):
        if self.profiling_data_mpi["n_time_steps"] > 0:
            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_upload"] += time.time()
        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] += time.time()

        # North-south
        if self.north is not None:

@@ -464,15 +458,11 @@ class MPISimulator(Simulator.BaseSimulator):
            for k in range(self.nvars):
                u[k].upload(self.sim.stream, self.in_w[k,:,:], extent=self.write_w)

        if self.profiling_data_mpi["n_time_steps"] > 0:
            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_upload"] += time.time()

        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] += time.time()



    def old_exchange(self):
    def full_exchange(self):
        ####
        # FIXME: This function can be optimized using persistent communications.
        # Also by overlapping some of the communications north/south and east/west of GPU and intra-node
@@ -484,8 +474,7 @@ class MPISimulator(Simulator.BaseSimulator):
        ####

        #Download from the GPU
        if self.profiling_data_mpi["n_time_steps"] > 0:
            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_download"] += time.time()
        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] += time.time()

        if self.north is not None:
            for k in range(self.nvars):

@@ -495,10 +484,10 @@ class MPISimulator(Simulator.BaseSimulator):
                self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_s[k,:,:], asynch=True, extent=self.read_s)
        self.sim.stream.synchronize()

        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] += time.time()

        #Send/receive to north/south neighbours
        if self.profiling_data_mpi["n_time_steps"] > 0:
            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_download"] += time.time()
            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()

        comm_send = []
        comm_recv = []

@@ -513,10 +502,10 @@ class MPISimulator(Simulator.BaseSimulator):
        for comm in comm_recv:
            comm.wait()

        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()

        #Upload to the GPU
        if self.profiling_data_mpi["n_time_steps"] > 0:
            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_upload"] += time.time()
        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] += time.time()

        if self.north is not None:
            for k in range(self.nvars):

@@ -524,25 +513,23 @@ class MPISimulator(Simulator.BaseSimulator):
        if self.south is not None:
            for k in range(self.nvars):
                self.sim.u0[k].upload(self.sim.stream, self.in_s[k,:,:], extent=self.write_s)

        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] += time.time()

        #Wait for sending to complete
        if self.profiling_data_mpi["n_time_steps"] > 0:
            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_upload"] += time.time()
            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()

        for comm in comm_send:
            comm.wait()

        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()

        ####
        # Then transfer east-west including ghost cells that have been filled in by north-south transfer above
        ####

        #Download from the GPU
        if self.profiling_data_mpi["n_time_steps"] > 0:
            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_download"] += time.time()
        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] += time.time()

        if self.east is not None:
            for k in range(self.nvars):

@@ -552,10 +539,10 @@ class MPISimulator(Simulator.BaseSimulator):
                self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_w[k,:,:], asynch=True, extent=self.read_w)
        self.sim.stream.synchronize()

        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] += time.time()

        #Send/receive to east/west neighbours
        if self.profiling_data_mpi["n_time_steps"] > 0:
            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_download"] += time.time()
            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()

        comm_send = []
        comm_recv = []

@@ -566,15 +553,14 @@ class MPISimulator(Simulator.BaseSimulator):
            comm_send += [self.grid.comm.Isend(self.out_w, dest=self.west, tag=4*self.nt + 3)]
            comm_recv += [self.grid.comm.Irecv(self.in_w, source=self.west, tag=4*self.nt + 2)]

        #Wait for incoming transfers to complete
        for comm in comm_recv:
            comm.wait()

        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()

        #Upload to the GPU
        if self.profiling_data_mpi["n_time_steps"] > 0:
            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_upload"] += time.time()
        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] += time.time()

        if self.east is not None:
            for k in range(self.nvars):

@@ -583,13 +569,12 @@ class MPISimulator(Simulator.BaseSimulator):
            for k in range(self.nvars):
                self.sim.u0[k].upload(self.sim.stream, self.in_w[k,:,:], extent=self.write_w)

        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] += time.time()

        #Wait for sending to complete
        if self.profiling_data_mpi["n_time_steps"] > 0:
            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_upload"] += time.time()
            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()

        for comm in comm_send:
            comm.wait()

        if self.profiling_data_mpi["n_time_steps"] > 0:
            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()

        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()

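A minimal mpi4py sketch (not part of the commit) of the persistent-communication idea mentioned in the FIXME in full_exchange above: the send/receive is set up once and only (re)started every time step. Buffer sizes, neighbours and tags are illustrative, not the ones MPISimulator uses.

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
north = (comm.rank + 1) % comm.size
south = (comm.rank - 1) % comm.size

out_n = np.zeros(16, dtype=np.float32)   # stand-in for self.out_n
in_s = np.empty(16, dtype=np.float32)    # stand-in for self.in_s

# created once, outside the time loop
reqs = [comm.Send_init(out_n, dest=north, tag=1),
        comm.Recv_init(in_s, source=south, tag=1)]

for step in range(5):
    out_n[:] = step                  # refresh the outgoing halo in place
    MPI.Prequest.Startall(reqs)      # restart the persistent send/receive
    MPI.Request.Waitall(reqs)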
@@ -6,6 +6,21 @@
#SBATCH -t 0-00:10 # time (D-HH:MM)
#SBATCH -o slurm.%N.%j.out # STDOUT
#SBATCH -e slurm.%N.%j.err # STDERR
#SBATCH --reservation=martinls_17


# For Linux 64, Open MPI is built with CUDA awareness but this support is disabled by default.
# To enable it, please set the environment variable OMPI_MCA_opal_cuda_support=true before
# launching your MPI processes. Equivalently, you can set the MCA parameter in the command line:
# mpiexec --mca opal_cuda_support 1 ...
#
# In addition, the UCX support is also built but disabled by default.
# To enable it, first install UCX (conda install -c conda-forge ucx). Then, set the environment
# variables OMPI_MCA_pml="ucx" OMPI_MCA_osc="ucx" before launching your MPI processes.
# Equivalently, you can set the MCA parameters in the command line:
# mpiexec --mca pml ucx --mca osc ucx ...
# Note that you might also need to set UCX_MEMTYPE_CACHE=n for CUDA awareness via UCX.
# Please consult UCX's documentation for detail.

ulimit -s 10240
module load slurm/20.02.7

@@ -26,7 +41,11 @@ cp -r . /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
cd /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
#mpirun --mca btl_openib_if_include mlx5_0 --mca btl_openib_warn_no_device_params_found 0 $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
#nsys profile -t nvtx,cuda mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
#mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile

export OMPI_MCA_opal_cuda_support=true
mpirun -np $SLURM_NTASKS $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile

cd $HOME/src/ShallowWaterGPU

## Copy files from work directory:
@@ -2,21 +2,72 @@

TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")

# one node: 1-8 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=4096,NY=2048,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=4096,NY=1365,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=4096,NY=1024,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=4096,NY=819,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=4096,NY=683,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=4096,NY=585,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
# one node: 1-16 GPUs
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=4096,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=2731,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=2048,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=1638,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=1365,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=1170,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=1024,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#
#sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=8192,NY=910,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=8192,NY=819,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=8192,NY=745,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=8192,NY=683,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=8192,NY=630,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=8192,NY=585,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=8192,NY=546,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=8192,NY=512,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job

sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
# one node: 4-16 GPUs
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=41984,NY=10496,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=41984,NY=8396,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=41984,NY=6997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=41984,NY=5997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=41984,NY=5248,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#
#sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=41984,NY=4664,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=41984,NY=4198,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=41984,NY=3816,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=41984,NY=3498,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=41984,NY=3229,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=41984,NY=2998,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=41984,NY=2798,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=41984,NY=2624,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job

# one node: 1-16 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=22528,NY=11264,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=22528,NY=7509,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=22528,NY=5632,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=22528,NY=4505,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=22528,NY=3754,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=22528,NY=3218,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=22528,NY=2816,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job

sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=22528,NY=2503,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=22528,NY=2252,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=22528,NY=2048,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=22528,NY=1877,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=22528,NY=1732,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=22528,NY=1609,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=22528,NY=1501,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=22528,NY=1408,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job

# one node: 4-16 GPUs
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=45056,NY=11264,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=45056,NY=8396,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=45056,NY=6997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=45056,NY=5997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=45056,NY=5248,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job

sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=45056,NY=4664,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=45056,NY=4198,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=45056,NY=3816,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=45056,NY=3498,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=45056,NY=3229,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=45056,NY=2998,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=45056,NY=2798,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=45056,NY=2624,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
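A short sketch (not part of the commit) of where the NY values in the strong-scaling runs above come from: a fixed 4096 x 4096 domain is split into n row blocks, so each rank gets roughly NY = 4096 / n rows, assuming NX and NY are the per-rank sizes.

nx_total = ny_total = 4096
for n in range(1, 9):
    ny_per_rank = round(ny_total / n)   # 4096, 2048, 1365, 1024, 819, 683, 585, 512
    print(n, nx_total, ny_per_rank)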
@@ -2,21 +2,40 @@

TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")

# one node: 1-8 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
# one node: 1-16 GPUs
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#
#sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job

sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
# one node: 1-16 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job

sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
hgx_scaling_benchmark.job (new file, 58 lines)
@@ -0,0 +1,58 @@
#!/bin/bash
# See http://wiki.ex3.simula.no before changing the values below
#SBATCH -p hgx2q # partition (GPU queue)
#SBATCH -w g002 # HGX node
#SBATCH -t 0-00:10 # time (D-HH:MM)
#SBATCH -o slurm.%N.%j.out # STDOUT
#SBATCH -e slurm.%N.%j.err # STDERR
#SBATCH --reservation=martinls_11


# For Linux 64, Open MPI is built with CUDA awareness but this support is disabled by default.
# To enable it, please set the environment variable OMPI_MCA_opal_cuda_support=true before
# launching your MPI processes. Equivalently, you can set the MCA parameter in the command line:
# mpiexec --mca opal_cuda_support 1 ...
#
# In addition, the UCX support is also built but disabled by default.
# To enable it, first install UCX (conda install -c conda-forge ucx). Then, set the environment
# variables OMPI_MCA_pml="ucx" OMPI_MCA_osc="ucx" before launching your MPI processes.
# Equivalently, you can set the MCA parameters in the command line:
# mpiexec --mca pml ucx --mca osc ucx ...
# Note that you might also need to set UCX_MEMTYPE_CACHE=n for CUDA awareness via UCX.
# Please consult UCX's documentation for detail.

ulimit -s 10240
module load slurm/20.02.7
module load cuda11.2/toolkit/11.2.2
module load openmpi4-cuda11.2-ofed50-gcc8/4.1.0

# Check how many GPUs your job got
#nvidia-smi

mkdir -p output_hgx/$NOW

## Copy input files to the work directory:
mkdir -p /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
cp -r . /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU

# Run job
# (Assumes Miniconda is installed in user root dir.)
cd /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
#mpirun --mca btl_openib_if_include mlx5_0 --mca btl_openib_warn_no_device_params_found 0 $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
#nsys profile -t nvtx,cuda mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
#mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile

export OMPI_MCA_opal_cuda_support=true
mpirun -np $SLURM_NTASKS $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile

cd $HOME/src/ShallowWaterGPU

## Copy files from work directory:
# (NOTE: Copying is not performed if job fails!)
mkdir -p output_hgx/$NOW/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.log ./output_hgx/$NOW/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.nc ./output_hgx/$NOW/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.json ./output_hgx/$NOW
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.qdrep ./output_hgx/$NOW

rm -rf /work/$USER/$SLURM_JOB_ID
hgx_strong_scaling_benchmark.sh (new file, 20 lines)
@@ -0,0 +1,20 @@
#!/bin/bash

TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")

# one node: 1-8 GPUs
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=4096,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=2731,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=2048,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=1638,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=1365,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=1170,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=1024,NOW=$TIMESTAMP hgx_scaling_benchmark.job

# one node: 4-8 GPUs
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=41984,NY=10496,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=41984,NY=8396,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=41984,NY=6997,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=41984,NY=5997,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=41984,NY=5248,NOW=$TIMESTAMP hgx_scaling_benchmark.job
hgx_weak_scaling_benchmark.sh (new file, 23 lines)
@@ -0,0 +1,23 @@
#!/bin/bash

TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")

# one node: 1-16 GPUs
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job

# one node: 1-8 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
@@ -114,13 +114,13 @@ logger.info("Generating initial conditions")
nx = args.nx
ny = args.ny

dt = 0.00001
dt = 0.000001

gamma = 1.4
#save_times = np.linspace(0, 0.000009, 2)
#save_times = np.linspace(0, 0.000099, 11)
#save_times = np.linspace(0, 0.000099, 2)
save_times = np.linspace(0, 0.000999, 2)
save_times = np.linspace(0, 0.0000999, 2)
outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc"
save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
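A quick sketch (not part of the commit) of what the new parameters above imply: np.linspace(0, 0.0000999, 2) yields only the initial and final output times, and with the smaller fixed dt the run covers on the order of a hundred steps between them, assuming dt is used as given.

import numpy as np

dt = 0.000001
save_times = np.linspace(0, 0.0000999, 2)
print(save_times)                   # [0, 9.99e-05]: only first and last state are saved
print(round(save_times[-1] / dt))   # roughly 100 time steps to reach the final output time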
@@ -183,6 +183,8 @@ if(args.profile and MPI.COMM_WORLD.rank == 0):
    profiling_data["slurm_job_id"] = job_id
    profiling_data["n_cuda_devices"] = str(num_cuda_devices)
    profiling_data["n_processes"] = str(MPI.COMM_WORLD.size)
    profiling_data["git_hash"] = Common.getGitHash()
    profiling_data["git_status"] = Common.getGitStatus()

    with open(profiling_file, "w") as write_file:
        json.dump(profiling_data, write_file)
@@ -1,6 +1,6 @@
#!/bin/bash
# Job name:
#SBATCH --job-name=ShallowWaterGPUStrongScaling
#SBATCH --job-name=ShallowWaterGPUScaling
#
# Project:
#SBATCH --account=nn9882k

@@ -16,7 +16,8 @@
#SBATCH --partition=accel
#
# Max memory usage per task (core) - increasing this will cost more core hours:
#SBATCH --mem-per-cpu=3800M
##SBATCH --mem-per-cpu=3800M
#SBATCH --mem-per-cpu=24G
#
#SBATCH --qos=devel

@@ -26,6 +27,8 @@

module restore system # instead of 'module purge' rather set module environment to the system default
module load CUDA/11.4.1
#module load CUDA/11.1.1-GCC-10.2.0
#module load OpenMPI/4.0.5-gcccuda-2020b

# It is also recommended to list loaded modules, for easier debugging:
module list

@@ -40,12 +43,23 @@ cp -r . $SCRATCH/ShallowWaterGPU
## Make sure the results are copied back to the submit directory (see Work Directory below):
# chkfile MyResultFile
# chkfile is replaced by 'savefile' on Saga
savefile "$SCRATCH/ShallowWaterGPU/*.log"
savefile "$SCRATCH/ShallowWaterGPU/*.nc"
savefile "$SCRATCH/ShallowWaterGPU/*.json"
#savefile "$SCRATCH/ShallowWaterGPU/*.log"
#savefile "$SCRATCH/ShallowWaterGPU/*.nc"
#savefile "$SCRATCH/ShallowWaterGPU/*.json"
#savefile "$SCRATCH/ShallowWaterGPU/*.qdrep"

cleanup "rm -rf $SCRATCH/ShallowWaterGPU"

export OMPI_MCA_opal_cuda_support=true

## Do some work:
cd $SCRATCH/ShallowWaterGPU
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
srun /cluster/projects/nn9882k/martinls/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
srun /cluster/projects/nn9882k/martinls/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile

cd $HOME/src/ShallowWaterGPU
mkdir -p output_saga/$NOW/$SLURM_JOB_ID
mv $SCRATCH/ShallowWaterGPU/*.log ./output_saga/$NOW/$SLURM_JOB_ID
mv $SCRATCH/ShallowWaterGPU/*.nc ./output_saga/$NOW/$SLURM_JOB_ID
mv $SCRATCH/ShallowWaterGPU/*.json ./output_saga/$NOW
mv $SCRATCH/ShallowWaterGPU/*.qdrep ./output_saga/$NOW
@@ -1,13 +1,30 @@
#!/bin/bash

# one node: 1-4 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=1024,NY=1024 saga_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=1024,NY=512 saga_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=1024,NY=341 saga_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=512,NY=512 saga_strong_scaling_benchmark.job
TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")

# 2-4 nodes: 1 GPUs per node
sbatch --nodes=2 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=1024,NY=512 saga_strong_scaling_benchmark.job
sbatch --nodes=3 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=1024,NY=341 saga_strong_scaling_benchmark.job
sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=512,NY=512 saga_strong_scaling_benchmark.job
# one node: 1–4 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 1 ranks
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=10240,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=6826,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=5120,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks

# 4 nodes: 1–4 GPUs per node
sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=5120,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
sbatch --nodes=4 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=2560,NOW=$TIMESTAMP saga_scaling_benchmark.job # 8 ranks
sbatch --nodes=4 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=1706,NOW=$TIMESTAMP saga_scaling_benchmark.job # 12 ranks
sbatch --nodes=4 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=1280,NOW=$TIMESTAMP saga_scaling_benchmark.job # 16 ranks

# 4 nodes: 1–4 GPUs per node
sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=40960,NY=10240,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
sbatch --nodes=4 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=40960,NY=5120,NOW=$TIMESTAMP saga_scaling_benchmark.job # 8 ranks
sbatch --nodes=4 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=40960,NY=3413,NOW=$TIMESTAMP saga_scaling_benchmark.job # 12 ranks
sbatch --nodes=4 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=40960,NY=2560,NOW=$TIMESTAMP saga_scaling_benchmark.job # 16 ranks

## one node: 1–4 GPUs
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=24576,NY=6144,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
#
## 4 nodes: 1–4 GPUs per node
#sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=24576,NY=6144,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
#sbatch --nodes=4 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=24576,NY=3072,NOW=$TIMESTAMP saga_scaling_benchmark.job # 8 ranks
#sbatch --nodes=4 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=24576,NY=2048,NOW=$TIMESTAMP saga_scaling_benchmark.job # 12 ranks
#sbatch --nodes=4 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=24576,NY=1536,NOW=$TIMESTAMP saga_scaling_benchmark.job # 16 ranks
saga_weak_scaling_benchmark.sh (new file, 25 lines)
@@ -0,0 +1,25 @@
#!/bin/bash

TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")

# one node: 1-4 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 1 ranks
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks

# 2-4 nodes: 1 GPUs per node
sbatch --nodes=2 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
sbatch --nodes=3 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks

## one node: 1-4 GPUs
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 1 ranks
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks

## 2-4 nodes: 1 GPUs per node
#sbatch --nodes=2 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
#sbatch --nodes=3 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
#sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
@@ -4,35 +4,35 @@ NOW=$(date "+%Y-%m-%dT%H%M%S")
mkdir -p output_seymour/$NOW

# one node: 1-8 GPUs
mpiexec -n 1 python mpiTesting.py -nx 4096 -ny 4096 --profile &&
mpiexec -n 1 python mpiTesting.py -nx 8192 -ny 8192 --profile &&
mkdir -p output_seymour/$NOW/1_proc &&
mv *.log output_seymour/$NOW/1_proc/ && mv *.nc output_seymour/$NOW/1_proc/ &&

mpiexec -n 2 python mpiTesting.py -nx 4096 -ny 2048 --profile &&
mpiexec -n 2 python mpiTesting.py -nx 8192 -ny 4096 --profile &&
mkdir -p output_seymour/$NOW/2_proc &&
mv *.log output_seymour/$NOW/2_proc/ && mv *.nc output_seymour/$NOW/2_proc/ &&

mpiexec -n 3 python mpiTesting.py -nx 4096 -ny 1365 --profile &&
mpiexec -n 3 python mpiTesting.py -nx 8192 -ny 2731 --profile &&
mkdir -p output_seymour/$NOW/3_proc &&
mv *.log output_seymour/$NOW/3_proc/ && mv *.nc output_seymour/$NOW/3_proc/ &&

mpiexec -n 4 python mpiTesting.py -nx 4096 -ny 1024 --profile &&
mpiexec -n 4 python mpiTesting.py -nx 8192 -ny 2048 --profile &&
mkdir -p output_seymour/$NOW/4_proc &&
mv *.log output_seymour/$NOW/4_proc/ && mv *.nc output_seymour/$NOW/4_proc/ &&

mpiexec -n 5 python mpiTesting.py -nx 4096 -ny 819 --profile &&
mpiexec -n 5 python mpiTesting.py -nx 8192 -ny 1638 --profile &&
mkdir -p output_seymour/$NOW/5_proc &&
mv *.log output_seymour/$NOW/5_proc/ && mv *.nc output_seymour/$NOW/5_proc/ &&

mpiexec -n 6 python mpiTesting.py -nx 4096 -ny 683 --profile &&
mpiexec -n 6 python mpiTesting.py -nx 8192 -ny 1365 --profile &&
mkdir -p output_seymour/$NOW/6_proc &&
mv *.log output_seymour/$NOW/6_proc/ && mv *.nc output_seymour/$NOW/6_proc/ &&

mpiexec -n 7 python mpiTesting.py -nx 4096 -ny 585 --profile &&
mpiexec -n 7 python mpiTesting.py -nx 8192 -ny 1170 --profile &&
mkdir -p output_seymour/$NOW/7_proc &&
mv *.log output_seymour/$NOW/7_proc/ && mv *.nc output_seymour/$NOW/7_proc/ &&

mpiexec -n 8 python mpiTesting.py -nx 4096 -ny 512 --profile &&
mpiexec -n 8 python mpiTesting.py -nx 8192 -ny 1024 --profile &&
mkdir -p output_seymour/$NOW/8_proc &&
mv *.log output_seymour/$NOW/8_proc/ && mv *.nc output_seymour/$NOW/8_proc/ &&