diff --git a/mpiTesting.py b/mpiTesting.py
index 52deeaa..d7192d9 100644
--- a/mpiTesting.py
+++ b/mpiTesting.py
@@ -25,29 +25,43 @@ import gc
 import time
 import json
 import logging
+import os

-#MPI
+# MPI
 from mpi4py import MPI

-#CUDA
+# CUDA
 import pycuda.driver as cuda

-#Simulator engine etc
+# Simulator engine etc
 from GPUSimulators import MPISimulator, Common, CudaContext
 from GPUSimulators import EE2D_KP07_dimsplit
 from GPUSimulators.helpers import InitialConditions as IC
 from GPUSimulators.Simulator import BoundaryCondition as BC

+import argparse
+parser = argparse.ArgumentParser(description='Strong and weak scaling experiments.')
+parser.add_argument('-nx', type=int, default=128)
+parser.add_argument('-ny', type=int, default=128)
+parser.add_argument('--profile', action='store_true') # default: False

-#Get MPI COMM to use
+
+args = parser.parse_args()
+
+if(args.profile):
+    # profiling: total run time
+    t_total_start = time.time()
+
+
+# Get MPI COMM to use
 comm = MPI.COMM_WORLD


 ####
-#Initialize logging
+# Initialize logging
 ####
 log_level_console = 20
-log_level_file = 10
+log_level_file = 10
 log_filename = 'mpi_' + str(comm.rank) + '.log'
 logger = logging.getLogger('GPUSimulators')
 logger.setLevel(min(log_level_console, log_level_file))
@@ -55,15 +69,17 @@ logger.setLevel(min(log_level_console, log_level_file))
 ch = logging.StreamHandler()
 ch.setLevel(log_level_console)
 logger.addHandler(ch)
-logger.info("Console logger using level %s", logging.getLevelName(log_level_console))
+logger.info("Console logger using level %s",
+            logging.getLevelName(log_level_console))

 fh = logging.FileHandler(log_filename)
-formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s')
+formatter = logging.Formatter(
+    '%(asctime)s:%(name)s:%(levelname)s: %(message)s')
 fh.setFormatter(formatter)
 fh.setLevel(log_level_file)
 logger.addHandler(fh)
-logger.info("File logger using level %s to %s", logging.getLevelName(log_level_file), log_filename)
-
+logger.info("File logger using level %s to %s",
+            logging.getLevelName(log_level_file), log_filename)


 ####
@@ -73,7 +89,6 @@ logger.info("Creating MPI grid")
 grid = MPISimulator.MPIGrid(MPI.COMM_WORLD)


-
 ####
 # Initialize CUDA
 ####
@@ -85,15 +100,15 @@ cuda_device = local_rank % num_cuda_devices
 cuda_context = CudaContext.CudaContext(device=cuda_device, autotuning=False)


-
 ####
 # Set initial conditions
 ####
 logger.info("Generating initial conditions")
-nx = 128
-ny = 128
+nx = args.nx
+ny = args.ny
+
 gamma = 1.4
-save_times = np.linspace(0, 5.0, 10)
+save_times = np.linspace(0, 10.0, 2)
 outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc"
 save_var_names = ['rho', 'rho_u', 'rho_v', 'E']

@@ -103,20 +118,43 @@ arguments['theta'] = 1.2
 arguments['grid'] = grid


-
-
 ####
 # Run simulation
 ####
 logger.info("Running simulation")

-#Helper function to create MPI simulator
+# Helper function to create MPI simulator
+
+
 def genSim(grid, **kwargs):
     local_sim = EE2D_KP07_dimsplit.EE2D_KP07_dimsplit(**kwargs)
     sim = MPISimulator.MPISimulator(local_sim, grid)
     return sim

-outfile = Common.runSimulation(genSim, arguments, outfile, save_times, save_var_names)
+outfile = Common.runSimulation(
+    genSim, arguments, outfile, save_times, save_var_names)
+
+if(args.profile):
+    t_total_end = time.time()
+    t_total = t_total_end - t_total_start
+    print("Total run time on rank " + str(MPI.COMM_WORLD.rank) + " is " + str(t_total) + " s")
+
+# write profiling to json file
+if(args.profile and MPI.COMM_WORLD.rank == 0):
+    if "SLURM_JOB_ID" in os.environ:
+        job_id = int(os.environ["SLURM_JOB_ID"])
+        allocated_nodes = int(os.environ["SLURM_JOB_NUM_NODES"])
+        allocated_gpus = int(os.environ["CUDA_VISIBLE_DEVICES"].count(",") + 1)
+        profiling_file = "MPI_jobid_" + \
+            str(job_id) + "_" + str(allocated_nodes) + "_nodes_and_" + str(allocated_gpus) + "_GPUs_profiling.json"
+    else:
+        profiling_file = "MPI_test_profiling.json"
+
+    write_profiling_data = {}
+    write_profiling_data["total"] = t_total
+
+    with open(profiling_file, "w") as write_file:
+        json.dump(write_profiling_data, write_file)

 ####
 # Clean shutdown
@@ -134,4 +172,4 @@ gc.collect()
 # Print completion and exit
 ####
 print("Completed!")
-exit(0)
\ No newline at end of file
+exit(0)
diff --git a/saga-dev.job b/saga-dev.job
new file mode 100644
index 0000000..1244048
--- /dev/null
+++ b/saga-dev.job
@@ -0,0 +1,53 @@
+#!/bin/bash
+# Job name:
+#SBATCH --job-name=ShallowWaterGPUScalingDev
+#
+# Project:
+#SBATCH --account=nn9550k
+#
+# Wall clock limit:
+#SBATCH --time=01:00:00
+#
+# Ask for 1 GPU (max is 2)
+# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
+# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
+# /dev/nvidia0, /dev/nvidia1 or both, respectively.
+#SBATCH --partition=accel --gres=gpu:1
+#
+# Max memory usage per task (core) - increasing this will cost more core hours:
+#SBATCH --mem-per-cpu=16G
+#
+# Number of tasks:
+#SBATCH --nodes=1 --ntasks-per-node=1
+#
+#SBATCH --qos=devel
+
+## Set up job environment: (this is done automatically behind the scenes)
+## (make sure to comment '#' or remove the following line 'source ...')
+# source /cluster/bin/jobsetup
+
+module restore system   # instead of 'module purge' rather set module environment to the system default
+module load CUDA/10.2.89
+
+# It is also recommended to list loaded modules, for easier debugging:
+module list
+
+set -o errexit # exit on errors
+set -o nounset # Treat unset variables as errors (added for more easily discovering issues in your batch script)
+
+## Copy input files to the work directory:
+mkdir $SCRATCH/ShallowWaterGPU
+cp -r . $SCRATCH/ShallowWaterGPU
+
+## Make sure the results are copied back to the submit directory (see Work Directory below):
+# chkfile MyResultFile
+# chkfile is replaced by 'savefile' on Saga
+savefile "$SCRATCH/ShallowWaterGPU/*.log"
+savefile "$SCRATCH/ShallowWaterGPU/*.nc"
+savefile "$SCRATCH/ShallowWaterGPU/*.json"
+
+## Do some work:
+cd $SCRATCH/ShallowWaterGPU
+srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
+srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 8192 -ny 8192 --profile
+
diff --git a/saga-test.job b/saga-test.job
index 7d2b475..0b5e9f5 100644
--- a/saga-test.job
+++ b/saga-test.job
@@ -1,12 +1,12 @@
 #!/bin/bash
 # Job name:
-#SBATCH --job-name=saga-test
+#SBATCH --job-name=ShallowWaterGPUStrongScaling
 #
 # Project:
 #SBATCH --account=nn9550k
 #
 # Wall clock limit:
-#SBATCH --time=00:10:00
+#SBATCH --time=24:00:00
 #
 # Ask for 1 GPU (max is 2)
 # Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
@@ -15,10 +15,10 @@
 #SBATCH --partition=accel --gres=gpu:1
 #
 # Max memory usage per task (core) - increasing this will cost more core hours:
-#SBATCH --mem-per-cpu=4G
+#SBATCH --mem-per-cpu=16G
 #
 # Number of tasks:
-#SBATCH --nodes=2 --ntasks-per-node=1
+#SBATCH --nodes=1 --ntasks-per-node=1
 ## Set up job environment: (this is done automatically behind the scenes)
 ## (make sure to comment '#' or remove the following line 'source ...')
 # source /cluster/bin/jobsetup
@@ -42,9 +42,10 @@ cp -r . $SCRATCH/ShallowWaterGPU
 # chkfile is replaced by 'savefile' on Saga
 savefile "$SCRATCH/ShallowWaterGPU/*.log"
 savefile "$SCRATCH/ShallowWaterGPU/*.nc"
+savefile "$SCRATCH/ShallowWaterGPU/*.json"

 ## Do some work:
 cd $SCRATCH/ShallowWaterGPU
-srun /cluster/home/$HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
-srun /cluster/home/$HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py
+srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
+srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 8192 -ny 8192 --profile

diff --git a/saga_strong_scaling_benchmark.job b/saga_strong_scaling_benchmark.job
new file mode 100644
index 0000000..05320d7
--- /dev/null
+++ b/saga_strong_scaling_benchmark.job
@@ -0,0 +1,51 @@
+#!/bin/bash
+# Job name:
+#SBATCH --job-name=ShallowWaterGPUStrongScaling
+#
+# Project:
+#SBATCH --account=nn9550k
+#
+# Wall clock limit:
+#SBATCH --time=10:00:00
+#
+# Ask for 1 GPU (max is 2)
+# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
+# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
+# /dev/nvidia0, /dev/nvidia1 or both, respectively.
+#SBATCH --partition=accel --gres=gpu:1
+#
+# Max memory usage per task (core) - increasing this will cost more core hours:
+#SBATCH --mem-per-cpu=16G
+#
+# Number of tasks:
+#SBATCH --nodes=1 --ntasks-per-node=1
+
+## Set up job environment: (this is done automatically behind the scenes)
+## (make sure to comment '#' or remove the following line 'source ...')
+# source /cluster/bin/jobsetup
+
+module restore system   # instead of 'module purge' rather set module environment to the system default
+module load CUDA/10.2.89
+
+# It is also recommended to list loaded modules, for easier debugging:
+module list
+
+set -o errexit # exit on errors
+set -o nounset # Treat unset variables as errors (added for more easily discovering issues in your batch script)
+
+## Copy input files to the work directory:
+mkdir $SCRATCH/ShallowWaterGPU
+cp -r . $SCRATCH/ShallowWaterGPU
+
+## Make sure the results are copied back to the submit directory (see Work Directory below):
+# chkfile MyResultFile
+# chkfile is replaced by 'savefile' on Saga
+savefile "$SCRATCH/ShallowWaterGPU/*.log"
+savefile "$SCRATCH/ShallowWaterGPU/*.nc"
+savefile "$SCRATCH/ShallowWaterGPU/*.json"
+
+## Do some work:
+cd $SCRATCH/ShallowWaterGPU
+srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
+srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 1024 -ny 1024 --profile
+
diff --git a/saga_strong_scaling_benchmark.sh b/saga_strong_scaling_benchmark.sh
new file mode 100644
index 0000000..1d2284d
--- /dev/null
+++ b/saga_strong_scaling_benchmark.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# one node: 1-4 tasks/GPUs
+sbatch --partition=accel --gres=gpu:1 --nodes=1 --ntasks-per-node=1 saga_strong_scaling_benchmark.job
+sbatch --partition=accel --gres=gpu:2 --nodes=1 --ntasks-per-node=2 saga_strong_scaling_benchmark.job
+sbatch --partition=accel --gres=gpu:3 --nodes=1 --ntasks-per-node=3 saga_strong_scaling_benchmark.job
+sbatch --partition=accel --gres=gpu:4 --nodes=1 --ntasks-per-node=4 saga_strong_scaling_benchmark.job
+
+# 2-4 nodes: 4 tasks/GPUs per node
+sbatch --partition=accel --gres=gpu:4 --nodes=2 --ntasks-per-node=4 saga_strong_scaling_benchmark.job
+sbatch --partition=accel --gres=gpu:4 --nodes=3 --ntasks-per-node=4 saga_strong_scaling_benchmark.job
+sbatch --partition=accel --gres=gpu:4 --nodes=4 --ntasks-per-node=4 saga_strong_scaling_benchmark.job
+