mirror of
https://github.com/smyalygames/FiniteVolumeGPU.git
synced 2025-05-18 06:24:13 +02:00
Setup for scaling experiments on Saga.
This commit is contained in:
parent
11223f0890
commit
9c4de881a8
@ -25,29 +25,43 @@ import gc
|
||||
import time
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
|
||||
#MPI
|
||||
# MPI
|
||||
from mpi4py import MPI
|
||||
|
||||
#CUDA
|
||||
# CUDA
|
||||
import pycuda.driver as cuda
|
||||
|
||||
#Simulator engine etc
|
||||
# Simulator engine etc
|
||||
from GPUSimulators import MPISimulator, Common, CudaContext
|
||||
from GPUSimulators import EE2D_KP07_dimsplit
|
||||
from GPUSimulators.helpers import InitialConditions as IC
|
||||
from GPUSimulators.Simulator import BoundaryCondition as BC
|
||||
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser(description='Strong and weak scaling experiments.')
|
||||
parser.add_argument('-nx', type=int, default=128)
|
||||
parser.add_argument('-ny', type=int, default=128)
|
||||
parser.add_argument('--profile', action='store_true') # default: False
|
||||
|
||||
#Get MPI COMM to use
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if(args.profile):
|
||||
# profiling: total run time
|
||||
t_total_start = time.time()
|
||||
|
||||
|
||||
# Get MPI COMM to use
|
||||
comm = MPI.COMM_WORLD
|
||||
|
||||
|
||||
####
|
||||
#Initialize logging
|
||||
# Initialize logging
|
||||
####
|
||||
log_level_console = 20
|
||||
log_level_file = 10
|
||||
log_level_file = 10
|
||||
log_filename = 'mpi_' + str(comm.rank) + '.log'
|
||||
logger = logging.getLogger('GPUSimulators')
|
||||
logger.setLevel(min(log_level_console, log_level_file))
|
||||
@ -55,15 +69,17 @@ logger.setLevel(min(log_level_console, log_level_file))
|
||||
ch = logging.StreamHandler()
|
||||
ch.setLevel(log_level_console)
|
||||
logger.addHandler(ch)
|
||||
logger.info("Console logger using level %s", logging.getLevelName(log_level_console))
|
||||
logger.info("Console logger using level %s",
|
||||
logging.getLevelName(log_level_console))
|
||||
|
||||
fh = logging.FileHandler(log_filename)
|
||||
formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s')
|
||||
formatter = logging.Formatter(
|
||||
'%(asctime)s:%(name)s:%(levelname)s: %(message)s')
|
||||
fh.setFormatter(formatter)
|
||||
fh.setLevel(log_level_file)
|
||||
logger.addHandler(fh)
|
||||
logger.info("File logger using level %s to %s", logging.getLevelName(log_level_file), log_filename)
|
||||
|
||||
logger.info("File logger using level %s to %s",
|
||||
logging.getLevelName(log_level_file), log_filename)
|
||||
|
||||
|
||||
####
|
||||
@ -73,7 +89,6 @@ logger.info("Creating MPI grid")
|
||||
grid = MPISimulator.MPIGrid(MPI.COMM_WORLD)
|
||||
|
||||
|
||||
|
||||
####
|
||||
# Initialize CUDA
|
||||
####
|
||||
@ -85,15 +100,15 @@ cuda_device = local_rank % num_cuda_devices
|
||||
cuda_context = CudaContext.CudaContext(device=cuda_device, autotuning=False)
|
||||
|
||||
|
||||
|
||||
####
|
||||
# Set initial conditions
|
||||
####
|
||||
logger.info("Generating initial conditions")
|
||||
nx = 128
|
||||
ny = 128
|
||||
nx = args.nx
|
||||
ny = args.ny
|
||||
|
||||
gamma = 1.4
|
||||
save_times = np.linspace(0, 5.0, 10)
|
||||
save_times = np.linspace(0, 10.0, 2)
|
||||
outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc"
|
||||
save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
|
||||
|
||||
@ -103,20 +118,43 @@ arguments['theta'] = 1.2
|
||||
arguments['grid'] = grid
|
||||
|
||||
|
||||
|
||||
|
||||
####
|
||||
# Run simulation
|
||||
####
|
||||
logger.info("Running simulation")
|
||||
#Helper function to create MPI simulator
|
||||
# Helper function to create MPI simulator
|
||||
|
||||
|
||||
def genSim(grid, **kwargs):
|
||||
local_sim = EE2D_KP07_dimsplit.EE2D_KP07_dimsplit(**kwargs)
|
||||
sim = MPISimulator.MPISimulator(local_sim, grid)
|
||||
return sim
|
||||
outfile = Common.runSimulation(genSim, arguments, outfile, save_times, save_var_names)
|
||||
|
||||
|
||||
outfile = Common.runSimulation(
|
||||
genSim, arguments, outfile, save_times, save_var_names)
|
||||
|
||||
if(args.profile):
|
||||
t_total_end = time.time()
|
||||
t_total = t_total_end - t_total_start
|
||||
print("Total run time on rank " + str(MPI.COMM_WORLD.rank) + " is " + str(t_total) + " s")
|
||||
|
||||
# write profiling to json file
|
||||
if(args.profile and MPI.COMM_WORLD.rank == 0):
|
||||
if "SLURM_JOB_ID" in os.environ:
|
||||
job_id = int(os.environ["SLURM_JOB_ID"])
|
||||
allocated_nodes = int(os.environ["SLURM_JOB_NUM_NODES"])
|
||||
allocated_gpus = int(os.environ["CUDA_VISIBLE_DEVICES"].count(",") + 1)
|
||||
profiling_file = "MPI_jobid_" + \
|
||||
str(job_id) + "_" + str(allocated_nodes) + "_nodes_and_" + str(allocated_gpus) + "_GPUs_profiling.json"
|
||||
else:
|
||||
profiling_file = "MPI_test_profiling.json"
|
||||
|
||||
write_profiling_data = {}
|
||||
write_profiling_data["total"] = t_total
|
||||
|
||||
with open(profiling_file, "w") as write_file:
|
||||
json.dump(write_profiling_data, write_file)
|
||||
|
||||
####
|
||||
# Clean shutdown
|
||||
@ -134,4 +172,4 @@ gc.collect()
|
||||
# Print completion and exit
|
||||
####
|
||||
print("Completed!")
|
||||
exit(0)
|
||||
exit(0)
|
||||
|
53
saga-dev.job
Normal file
53
saga-dev.job
Normal file
@ -0,0 +1,53 @@
|
||||
#!/bin/bash
|
||||
# Job name:
|
||||
#SBATCH --job-name=ShallowWaterGPUScalingDev
|
||||
#
|
||||
# Project:
|
||||
#SBATCH --account=nn9550k
|
||||
#
|
||||
# Wall clock limit:
|
||||
#SBATCH --time=01:00:00
|
||||
#
|
||||
# Ask for 1 GPU (max is 2)
|
||||
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
|
||||
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
|
||||
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
|
||||
#SBATCH --partition=accel --gres=gpu:1
|
||||
#
|
||||
# Max memory usage per task (core) - increasing this will cost more core hours:
|
||||
#SBATCH --mem-per-cpu=16G
|
||||
#
|
||||
# Number of tasks:
|
||||
#SBATCH --nodes=1 --ntasks-per-node=1
|
||||
#
|
||||
#SBATCH --qos=devel
|
||||
|
||||
## Set up job environment: (this is done automatically behind the scenes)
|
||||
## (make sure to comment '#' or remove the following line 'source ...')
|
||||
# source /cluster/bin/jobsetup
|
||||
|
||||
module restore system # instead of 'module purge' rather set module environment to the system default
|
||||
module load CUDA/10.2.89
|
||||
|
||||
# It is also recommended to list loaded modules, for easier debugging:
|
||||
module list
|
||||
|
||||
set -o errexit # exit on errors
|
||||
set -o nounset # Treat unset variables as errors (added for more easily discovering issues in your batch script)
|
||||
|
||||
## Copy input files to the work directory:
|
||||
mkdir $SCRATCH/ShallowWaterGPU
|
||||
cp -r . $SCRATCH/ShallowWaterGPU
|
||||
|
||||
## Make sure the results are copied back to the submit directory (see Work Directory below):
|
||||
# chkfile MyResultFile
|
||||
# chkfile is replaced by 'savefile' on Saga
|
||||
savefile "$SCRATCH/ShallowWaterGPU/*.log"
|
||||
savefile "$SCRATCH/ShallowWaterGPU/*.nc"
|
||||
savefile "$SCRATCH/ShallowWaterGPU/*.json"
|
||||
|
||||
## Do some work:
|
||||
cd $SCRATCH/ShallowWaterGPU
|
||||
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
|
||||
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 8192 -ny 8192 --profile
|
||||
|
@ -1,12 +1,12 @@
|
||||
#!/bin/bash
|
||||
# Job name:
|
||||
#SBATCH --job-name=saga-test
|
||||
#SBATCH --job-name=ShallowWaterGPUStrongScaling
|
||||
#
|
||||
# Project:
|
||||
#SBATCH --account=nn9550k
|
||||
#
|
||||
# Wall clock limit:
|
||||
#SBATCH --time=00:10:00
|
||||
#SBATCH --time=24:00:00
|
||||
#
|
||||
# Ask for 1 GPU (max is 2)
|
||||
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
|
||||
@ -15,10 +15,10 @@
|
||||
#SBATCH --partition=accel --gres=gpu:1
|
||||
#
|
||||
# Max memory usage per task (core) - increasing this will cost more core hours:
|
||||
#SBATCH --mem-per-cpu=4G
|
||||
#SBATCH --mem-per-cpu=16G
|
||||
#
|
||||
# Number of tasks:
|
||||
#SBATCH --nodes=2 --ntasks-per-node=1
|
||||
#SBATCH --nodes=1 --ntasks-per-node=1
|
||||
|
||||
## Set up job environment: (this is done automatically behind the scenes)
|
||||
## (make sure to comment '#' or remove the following line 'source ...')
|
||||
@ -42,9 +42,10 @@ cp -r . $SCRATCH/ShallowWaterGPU
|
||||
# chkfile is replaced by 'savefile' on Saga
|
||||
savefile "$SCRATCH/ShallowWaterGPU/*.log"
|
||||
savefile "$SCRATCH/ShallowWaterGPU/*.nc"
|
||||
savefile "$SCRATCH/ShallowWaterGPU/*.json"
|
||||
|
||||
## Do some work:
|
||||
cd $SCRATCH/ShallowWaterGPU
|
||||
srun /cluster/home/$HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
|
||||
srun /cluster/home/$HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py
|
||||
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
|
||||
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 8192 -ny 8192 --profile
|
||||
|
||||
|
51
saga_strong_scaling_benchmark.job
Normal file
51
saga_strong_scaling_benchmark.job
Normal file
@ -0,0 +1,51 @@
|
||||
#!/bin/bash
|
||||
# Job name:
|
||||
#SBATCH --job-name=ShallowWaterGPUStrongScaling
|
||||
#
|
||||
# Project:
|
||||
#SBATCH --account=nn9550k
|
||||
#
|
||||
# Wall clock limit:
|
||||
#SBATCH --time=10:00:00
|
||||
#
|
||||
# Ask for 1 GPU (max is 2)
|
||||
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
|
||||
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
|
||||
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
|
||||
#SBATCH --partition=accel --gres=gpu:1
|
||||
#
|
||||
# Max memory usage per task (core) - increasing this will cost more core hours:
|
||||
#SBATCH --mem-per-cpu=16G
|
||||
#
|
||||
# Number of tasks:
|
||||
#SBATCH --nodes=1 --ntasks-per-node=1
|
||||
|
||||
## Set up job environment: (this is done automatically behind the scenes)
|
||||
## (make sure to comment '#' or remove the following line 'source ...')
|
||||
# source /cluster/bin/jobsetup
|
||||
|
||||
module restore system # instead of 'module purge' rather set module environment to the system default
|
||||
module load CUDA/10.2.89
|
||||
|
||||
# It is also recommended to list loaded modules, for easier debugging:
|
||||
module list
|
||||
|
||||
set -o errexit # exit on errors
|
||||
set -o nounset # Treat unset variables as errors (added for more easily discovering issues in your batch script)
|
||||
|
||||
## Copy input files to the work directory:
|
||||
mkdir $SCRATCH/ShallowWaterGPU
|
||||
cp -r . $SCRATCH/ShallowWaterGPU
|
||||
|
||||
## Make sure the results are copied back to the submit directory (see Work Directory below):
|
||||
# chkfile MyResultFile
|
||||
# chkfile is replaced by 'savefile' on Saga
|
||||
savefile "$SCRATCH/ShallowWaterGPU/*.log"
|
||||
savefile "$SCRATCH/ShallowWaterGPU/*.nc"
|
||||
savefile "$SCRATCH/ShallowWaterGPU/*.json"
|
||||
|
||||
## Do some work:
|
||||
cd $SCRATCH/ShallowWaterGPU
|
||||
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
|
||||
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 1024 -ny 1024 --profile
|
||||
|
13
saga_strong_scaling_benchmark.sh
Normal file
13
saga_strong_scaling_benchmark.sh
Normal file
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
|
||||
|
||||
# one node: 1-4 tasks/GPUs
|
||||
sbatch --partition=accel --gres=gpu:1 --nodes=1 --ntasks-per-node=1 saga_strong_scaling_benchmark.job
|
||||
sbatch --partition=accel --gres=gpu:2 --nodes=1 --ntasks-per-node=2 saga_strong_scaling_benchmark.job
|
||||
sbatch --partition=accel --gres=gpu:3 --nodes=1 --ntasks-per-node=3 saga_strong_scaling_benchmark.job
|
||||
sbatch --partition=accel --gres=gpu:4 --nodes=1 --ntasks-per-node=4 saga_strong_scaling_benchmark.job
|
||||
|
||||
# 2-4 nodes: 4 tasks/GPUs per node
|
||||
sbatch --partition=accel --gres=gpu:4 --nodes=2 --ntasks-per-node=4 saga_strong_scaling_benchmark.job
|
||||
sbatch --partition=accel --gres=gpu:4 --nodes=3 --ntasks-per-node=4 saga_strong_scaling_benchmark.job
|
||||
sbatch --partition=accel --gres=gpu:4 --nodes=4 --ntasks-per-node=4 saga_strong_scaling_benchmark.job
|
||||
|
Loading…
x
Reference in New Issue
Block a user