Setup for scaling experiments on Saga.

Martin Lilleeng Sætra 2021-05-23 20:32:22 +02:00
parent 11223f0890
commit 9c4de881a8
5 changed files with 182 additions and 26 deletions

View File

@@ -25,29 +25,43 @@ import gc
import time
import json
import logging
import os
#MPI
# MPI
from mpi4py import MPI
#CUDA
# CUDA
import pycuda.driver as cuda
#Simulator engine etc
# Simulator engine etc
from GPUSimulators import MPISimulator, Common, CudaContext
from GPUSimulators import EE2D_KP07_dimsplit
from GPUSimulators.helpers import InitialConditions as IC
from GPUSimulators.Simulator import BoundaryCondition as BC
import argparse
parser = argparse.ArgumentParser(description='Strong and weak scaling experiments.')
parser.add_argument('-nx', type=int, default=128)
parser.add_argument('-ny', type=int, default=128)
parser.add_argument('--profile', action='store_true') # default: False
#Get MPI COMM to use
args = parser.parse_args()
if(args.profile):
# profiling: total run time
t_total_start = time.time()
# Get MPI COMM to use
comm = MPI.COMM_WORLD
####
#Initialize logging
# Initialize logging
####
log_level_console = 20
log_level_file = 10
log_level_file = 10
log_filename = 'mpi_' + str(comm.rank) + '.log'
logger = logging.getLogger('GPUSimulators')
logger.setLevel(min(log_level_console, log_level_file))
@@ -55,15 +69,17 @@ logger.setLevel(min(log_level_console, log_level_file))
ch = logging.StreamHandler()
ch.setLevel(log_level_console)
logger.addHandler(ch)
logger.info("Console logger using level %s", logging.getLevelName(log_level_console))
logger.info("Console logger using level %s",
logging.getLevelName(log_level_console))
fh = logging.FileHandler(log_filename)
formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s')
formatter = logging.Formatter(
'%(asctime)s:%(name)s:%(levelname)s: %(message)s')
fh.setFormatter(formatter)
fh.setLevel(log_level_file)
logger.addHandler(fh)
logger.info("File logger using level %s to %s", logging.getLevelName(log_level_file), log_filename)
logger.info("File logger using level %s to %s",
logging.getLevelName(log_level_file), log_filename)
####
@@ -73,7 +89,6 @@ logger.info("Creating MPI grid")
grid = MPISimulator.MPIGrid(MPI.COMM_WORLD)
####
# Initialize CUDA
####
@@ -85,15 +100,15 @@ cuda_device = local_rank % num_cuda_devices
cuda_context = CudaContext.CudaContext(device=cuda_device, autotuning=False)
####
# Set initial conditions
####
logger.info("Generating initial conditions")
nx = 128
ny = 128
nx = args.nx
ny = args.ny
gamma = 1.4
save_times = np.linspace(0, 5.0, 10)
save_times = np.linspace(0, 10.0, 2)
outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc"
save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
@@ -103,20 +118,43 @@ arguments['theta'] = 1.2
arguments['grid'] = grid
####
# Run simulation
####
logger.info("Running simulation")
#Helper function to create MPI simulator
# Helper function to create MPI simulator
def genSim(grid, **kwargs):
local_sim = EE2D_KP07_dimsplit.EE2D_KP07_dimsplit(**kwargs)
sim = MPISimulator.MPISimulator(local_sim, grid)
return sim
outfile = Common.runSimulation(genSim, arguments, outfile, save_times, save_var_names)
outfile = Common.runSimulation(
genSim, arguments, outfile, save_times, save_var_names)
if(args.profile):
t_total_end = time.time()
t_total = t_total_end - t_total_start
print("Total run time on rank " + str(MPI.COMM_WORLD.rank) + " is " + str(t_total) + " s")
# write profiling to json file
if(args.profile and MPI.COMM_WORLD.rank == 0):
if "SLURM_JOB_ID" in os.environ:
job_id = int(os.environ["SLURM_JOB_ID"])
allocated_nodes = int(os.environ["SLURM_JOB_NUM_NODES"])
allocated_gpus = int(os.environ["CUDA_VISIBLE_DEVICES"].count(",") + 1)
profiling_file = "MPI_jobid_" + \
str(job_id) + "_" + str(allocated_nodes) + "_nodes_and_" + str(allocated_gpus) + "_GPUs_profiling.json"
else:
profiling_file = "MPI_test_profiling.json"
write_profiling_data = {}
write_profiling_data["total"] = t_total
with open(profiling_file, "w") as write_file:
json.dump(write_profiling_data, write_file)
####
# Clean shutdown
@@ -134,4 +172,4 @@ gc.collect()
# Print completion and exit
####
print("Completed!")
exit(0)
exit(0)
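The device selection referenced in one of the hunk headers above ("cuda_device = local_rank % num_cuda_devices") maps each MPI rank on a node to one of the GPUs Slurm exposes. Below is a minimal, hypothetical sketch of that mapping, not the actual MPISimulator/CudaContext code; the CUDA_VISIBLE_DEVICES and SLURM_NTASKS_PER_NODE lookups are assumptions based on the job-script comments further down.

# Hypothetical sketch of per-rank GPU selection, mirroring
# "cuda_device = local_rank % num_cuda_devices" from the diff above.
import os
from mpi4py import MPI

comm = MPI.COMM_WORLD

# Assumption: Slurm lists the allocated GPUs in CUDA_VISIBLE_DEVICES
# ('0', '1' or '0,1', as described in the job scripts below).
num_cuda_devices = len(os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(","))

# Assumption: ranks are packed node by node, so the node-local rank follows
# from --ntasks-per-node (exported by Slurm as SLURM_NTASKS_PER_NODE).
tasks_per_node = int(os.environ.get("SLURM_NTASKS_PER_NODE", "1"))
local_rank = comm.rank % tasks_per_node

cuda_device = local_rank % num_cuda_devices
print("Rank %d -> CUDA device %d" % (comm.rank, cuda_device))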

saga-dev.job (new file, 53 lines added)
View File

@@ -0,0 +1,53 @@
#!/bin/bash
# Job name:
#SBATCH --job-name=ShallowWaterGPUScalingDev
#
# Project:
#SBATCH --account=nn9550k
#
# Wall clock limit:
#SBATCH --time=01:00:00
#
# Ask for 1 GPU (max is 2)
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
#SBATCH --partition=accel --gres=gpu:1
#
# Max memory usage per task (core) - increasing this will cost more core hours:
#SBATCH --mem-per-cpu=16G
#
# Number of tasks:
#SBATCH --nodes=1 --ntasks-per-node=1
#
#SBATCH --qos=devel
## Set up job environment: (this is done automatically behind the scenes)
## (make sure to comment out or remove the following 'source ...' line)
# source /cluster/bin/jobsetup
module restore system # instead of 'module purge', restore the system default module environment
module load CUDA/10.2.89
# It is also recommended to list loaded modules, for easier debugging:
module list
set -o errexit # exit on errors
set -o nounset # Treat unset variables as errors (added for more easily discovering issues in your batch script)
## Copy input files to the work directory:
mkdir $SCRATCH/ShallowWaterGPU
cp -r . $SCRATCH/ShallowWaterGPU
## Make sure the results are copied back to the submit directory (see Work Directory below):
# chkfile MyResultFile
# chkfile is replaced by 'savefile' on Saga
savefile "$SCRATCH/ShallowWaterGPU/*.log"
savefile "$SCRATCH/ShallowWaterGPU/*.nc"
savefile "$SCRATCH/ShallowWaterGPU/*.json"
## Do some work:
cd $SCRATCH/ShallowWaterGPU
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 8192 -ny 8192 --profile
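After a run of mpiTesting.py, the profiling file written by rank 0 (see the hunk above) can be checked directly. A small sketch, assuming a local run without Slurm so the fallback file name is used; batch runs write the jobid/nodes/GPUs-based name instead.

# Sketch: read back the profiling JSON written by rank 0 in mpiTesting.py.
# Assumes the fallback name "MPI_test_profiling.json" and the "total" key.
import json

with open("MPI_test_profiling.json") as f:
    profiling = json.load(f)

print("Total run time: %.2f s" % profiling["total"])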

View File

@@ -1,12 +1,12 @@
#!/bin/bash
# Job name:
#SBATCH --job-name=saga-test
#SBATCH --job-name=ShallowWaterGPUStrongScaling
#
# Project:
#SBATCH --account=nn9550k
#
# Wall clock limit:
#SBATCH --time=00:10:00
#SBATCH --time=24:00:00
#
# Ask for 1 GPU (max is 2)
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
@@ -15,10 +15,10 @@
#SBATCH --partition=accel --gres=gpu:1
#
# Max memory usage per task (core) - increasing this will cost more core hours:
#SBATCH --mem-per-cpu=4G
#SBATCH --mem-per-cpu=16G
#
# Number of tasks:
#SBATCH --nodes=2 --ntasks-per-node=1
#SBATCH --nodes=1 --ntasks-per-node=1
## Set up job environment: (this is done automatically behind the scenes)
## (make sure to comment out or remove the following 'source ...' line)
@@ -42,9 +42,10 @@ cp -r . $SCRATCH/ShallowWaterGPU
# chkfile is replaced by 'savefile' on Saga
savefile "$SCRATCH/ShallowWaterGPU/*.log"
savefile "$SCRATCH/ShallowWaterGPU/*.nc"
savefile "$SCRATCH/ShallowWaterGPU/*.json"
## Do some work:
cd $SCRATCH/ShallowWaterGPU
srun /cluster/home/$HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
srun /cluster/home/$HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 8192 -ny 8192 --profile
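As a rough sanity check of the 8192 x 8192 problem size against the --mem-per-cpu=16G request, the conserved-variable fields can be sized up with a quick estimate. This is only a back-of-the-envelope sketch: it assumes double precision and counts just the four saved variables (rho, rho_u, rho_v, E), ignoring ghost cells, temporaries and GPU-side copies.

# Rough size of one 8192 x 8192 set of conserved variables.
# Assumption: double precision (8 bytes per value) and the four variables in
# save_var_names; halos, work buffers and device copies are not counted.
nx, ny = 8192, 8192
num_variables = 4
bytes_per_value = 8

total_bytes = nx * ny * num_variables * bytes_per_value
print("~%.1f GiB per copy of the state" % (total_bytes / 2**30))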

View File

@@ -0,0 +1,51 @@
#!/bin/bash
# Job name:
#SBATCH --job-name=ShallowWaterGPUStrongScaling
#
# Project:
#SBATCH --account=nn9550k
#
# Wall clock limit:
#SBATCH --time=10:00:00
#
# Ask for 1 GPU (max is 2)
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
#SBATCH --partition=accel --gres=gpu:1
#
# Max memory usage per task (core) - increasing this will cost more core hours:
#SBATCH --mem-per-cpu=16G
#
# Number of tasks:
#SBATCH --nodes=1 --ntasks-per-node=1
## Set up job environment: (this is done automatically behind the scenes)
## (make sure to comment out or remove the following 'source ...' line)
# source /cluster/bin/jobsetup
module restore system # instead of 'module purge', restore the system default module environment
module load CUDA/10.2.89
# It is also recommended to list loaded modules, for easier debugging:
module list
set -o errexit # exit on errors
set -o nounset # Treat unset variables as errors (added for more easily discovering issues in your batch script)
## Copy input files to the work directory:
mkdir $SCRATCH/ShallowWaterGPU
cp -r . $SCRATCH/ShallowWaterGPU
## Make sure the results are copied back to the submit directory (see Work Directory below):
# chkfile MyResultFile
# chkfile is replaced by 'savefile' on Saga
savefile "$SCRATCH/ShallowWaterGPU/*.log"
savefile "$SCRATCH/ShallowWaterGPU/*.nc"
savefile "$SCRATCH/ShallowWaterGPU/*.json"
## Do some work:
cd $SCRATCH/ShallowWaterGPU
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 1024 -ny 1024 --profile

View File

@@ -0,0 +1,13 @@
#!/bin/bash
# one node: 1-4 tasks/GPUs
sbatch --partition=accel --gres=gpu:1 --nodes=1 --ntasks-per-node=1 saga_strong_scaling_benchmark.job
sbatch --partition=accel --gres=gpu:2 --nodes=1 --ntasks-per-node=2 saga_strong_scaling_benchmark.job
sbatch --partition=accel --gres=gpu:3 --nodes=1 --ntasks-per-node=3 saga_strong_scaling_benchmark.job
sbatch --partition=accel --gres=gpu:4 --nodes=1 --ntasks-per-node=4 saga_strong_scaling_benchmark.job
# 2-4 nodes: 4 tasks/GPUs per node
sbatch --partition=accel --gres=gpu:4 --nodes=2 --ntasks-per-node=4 saga_strong_scaling_benchmark.job
sbatch --partition=accel --gres=gpu:4 --nodes=3 --ntasks-per-node=4 saga_strong_scaling_benchmark.job
sbatch --partition=accel --gres=gpu:4 --nodes=4 --ntasks-per-node=4 saga_strong_scaling_benchmark.job
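Once these jobs have finished, the per-job profiling files written by mpiTesting.py (MPI_jobid_<id>_<nodes>_nodes_and_<gpus>_GPUs_profiling.json) can be combined into a strong-scaling overview. Below is a sketch under the assumption that all files sit in the working directory and each holds the "total" key written above.

# Sketch: gather total run times from the per-job profiling files and
# report speedup relative to the smallest GPU count found.
# Assumes the file-name pattern and "total" key from mpiTesting.py above.
import glob
import json
import re

results = {}  # total number of GPUs -> run time in seconds
for path in glob.glob("MPI_jobid_*_profiling.json"):
    match = re.search(r"_(\d+)_nodes_and_(\d+)_GPUs_", path)
    if match is None:
        continue
    nodes, gpus_per_node = int(match.group(1)), int(match.group(2))
    with open(path) as f:
        results[nodes * gpus_per_node] = json.load(f)["total"]

if results:
    baseline = min(results)
    for gpus in sorted(results):
        speedup = results[baseline] / results[gpus]
        print("%2d GPUs: %8.2f s, speedup %.2f" % (gpus, results[gpus], speedup))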