Mirror of https://github.com/smyalygames/FiniteVolumeGPU.git

commit 9c4de881a8 (parent 11223f0890)

    Setup for scaling experiments on Saga.

    The mpiTesting.py driver gains -nx/-ny/--profile command-line options and, with --profile,
    writes the total run time to a JSON file on rank 0. A devel-QOS job script (saga-dev.job),
    a strong-scaling job script (saga_strong_scaling_benchmark.job) and an sbatch wrapper
    (saga_strong_scaling_benchmark.sh) are added, and the existing Saga job script is updated
    for the 8192x8192 profiling run.
mpiTesting.py (modified):

@@ -25,29 +25,43 @@ import gc
 import time
 import json
 import logging
+import os
 
-#MPI
+# MPI
 from mpi4py import MPI
 
-#CUDA
+# CUDA
 import pycuda.driver as cuda
 
-#Simulator engine etc
+# Simulator engine etc
 from GPUSimulators import MPISimulator, Common, CudaContext
 from GPUSimulators import EE2D_KP07_dimsplit
 from GPUSimulators.helpers import InitialConditions as IC
 from GPUSimulators.Simulator import BoundaryCondition as BC
 
+import argparse
+parser = argparse.ArgumentParser(description='Strong and weak scaling experiments.')
+parser.add_argument('-nx', type=int, default=128)
+parser.add_argument('-ny', type=int, default=128)
+parser.add_argument('--profile', action='store_true')  # default: False
 
-#Get MPI COMM to use
+args = parser.parse_args()
+
+if(args.profile):
+    # profiling: total run time
+    t_total_start = time.time()
+
+
+# Get MPI COMM to use
 comm = MPI.COMM_WORLD
 
 ####
-#Initialize logging
+# Initialize logging
 ####
 log_level_console = 20
 log_level_file = 10
 log_filename = 'mpi_' + str(comm.rank) + '.log'
 logger = logging.getLogger('GPUSimulators')
 logger.setLevel(min(log_level_console, log_level_file))
@@ -55,15 +69,17 @@ logger.setLevel(min(log_level_console, log_level_file))
 ch = logging.StreamHandler()
 ch.setLevel(log_level_console)
 logger.addHandler(ch)
-logger.info("Console logger using level %s", logging.getLevelName(log_level_console))
+logger.info("Console logger using level %s",
+            logging.getLevelName(log_level_console))
 
 fh = logging.FileHandler(log_filename)
-formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s')
+formatter = logging.Formatter(
+    '%(asctime)s:%(name)s:%(levelname)s: %(message)s')
 fh.setFormatter(formatter)
 fh.setLevel(log_level_file)
 logger.addHandler(fh)
-logger.info("File logger using level %s to %s", logging.getLevelName(log_level_file), log_filename)
+logger.info("File logger using level %s to %s",
+            logging.getLevelName(log_level_file), log_filename)
 
 ####
@@ -73,7 +89,6 @@ logger.info("Creating MPI grid")
 grid = MPISimulator.MPIGrid(MPI.COMM_WORLD)
 
 
-
 ####
 # Initialize CUDA
 ####
@@ -85,15 +100,15 @@ cuda_device = local_rank % num_cuda_devices
 cuda_context = CudaContext.CudaContext(device=cuda_device, autotuning=False)
 
 
 ####
 # Set initial conditions
 ####
 logger.info("Generating initial conditions")
-nx = 128
-ny = 128
+nx = args.nx
+ny = args.ny
 
 gamma = 1.4
-save_times = np.linspace(0, 5.0, 10)
+save_times = np.linspace(0, 10.0, 2)
 outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc"
 save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
 
@@ -103,20 +118,43 @@ arguments['theta'] = 1.2
 arguments['grid'] = grid
 
 
 ####
 # Run simulation
 ####
 logger.info("Running simulation")
-#Helper function to create MPI simulator
+# Helper function to create MPI simulator
+
+
 def genSim(grid, **kwargs):
     local_sim = EE2D_KP07_dimsplit.EE2D_KP07_dimsplit(**kwargs)
     sim = MPISimulator.MPISimulator(local_sim, grid)
     return sim
-outfile = Common.runSimulation(genSim, arguments, outfile, save_times, save_var_names)
+
+
+outfile = Common.runSimulation(
+    genSim, arguments, outfile, save_times, save_var_names)
+
+if(args.profile):
+    t_total_end = time.time()
+    t_total = t_total_end - t_total_start
+    print("Total run time on rank " + str(MPI.COMM_WORLD.rank) + " is " + str(t_total) + " s")
+
+# write profiling to json file
+if(args.profile and MPI.COMM_WORLD.rank == 0):
+    if "SLURM_JOB_ID" in os.environ:
+        job_id = int(os.environ["SLURM_JOB_ID"])
+        allocated_nodes = int(os.environ["SLURM_JOB_NUM_NODES"])
+        allocated_gpus = int(os.environ["CUDA_VISIBLE_DEVICES"].count(",") + 1)
+        profiling_file = "MPI_jobid_" + \
+            str(job_id) + "_" + str(allocated_nodes) + "_nodes_and_" + str(allocated_gpus) + "_GPUs_profiling.json"
+    else:
+        profiling_file = "MPI_test_profiling.json"
+
+    write_profiling_data = {}
+    write_profiling_data["total"] = t_total
+
+    with open(profiling_file, "w") as write_file:
+        json.dump(write_profiling_data, write_file)
 
 ####
 # Clean shutdown
@@ -134,4 +172,4 @@ gc.collect()
 # Print completion and exit
 ####
 print("Completed!")
-exit(0)
+exit(0)
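With --profile, rank 0 writes a small JSON file whose name encodes the SLURM job id, the node count and the number of GPUs visible on its node, and whose only key is "total" (the wall-clock run time in seconds). A minimal sketch of reading those files back for a scaling comparison; the glob pattern and regular expression follow the naming scheme in this diff, but the snippet itself is illustrative and not part of the commit:

import glob
import json
import re

# Collect the "total" run time from each profiling file, keyed by
# (nodes, GPUs per node) parsed out of the file name.
totals = {}
for path in sorted(glob.glob("MPI_jobid_*_profiling.json")):
    with open(path) as f:
        data = json.load(f)
    m = re.search(r"_(\d+)_nodes_and_(\d+)_GPUs_", path)
    if m:
        totals[(int(m.group(1)), int(m.group(2)))] = data["total"]

for (nodes, gpus), seconds in sorted(totals.items()):
    print(f"{nodes} node(s) x {gpus} GPU(s): {seconds:.1f} s")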
saga-dev.job (new file, 53 lines):

#!/bin/bash
# Job name:
#SBATCH --job-name=ShallowWaterGPUScalingDev
#
# Project:
#SBATCH --account=nn9550k
#
# Wall clock limit:
#SBATCH --time=01:00:00
#
# Ask for 1 GPU (max is 2)
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
#SBATCH --partition=accel --gres=gpu:1
#
# Max memory usage per task (core) - increasing this will cost more core hours:
#SBATCH --mem-per-cpu=16G
#
# Number of tasks:
#SBATCH --nodes=1 --ntasks-per-node=1
#
#SBATCH --qos=devel

## Set up job environment: (this is done automatically behind the scenes)
## (make sure to comment '#' or remove the following line 'source ...')
# source /cluster/bin/jobsetup

module restore system   # instead of 'module purge' rather set module environment to the system default
module load CUDA/10.2.89

# It is also recommended to list loaded modules, for easier debugging:
module list

set -o errexit  # exit on errors
set -o nounset  # Treat unset variables as errors (added for more easily discovering issues in your batch script)

## Copy input files to the work directory:
mkdir $SCRATCH/ShallowWaterGPU
cp -r . $SCRATCH/ShallowWaterGPU

## Make sure the results are copied back to the submit directory (see Work Directory below):
# chkfile MyResultFile
# chkfile is replaced by 'savefile' on Saga
savefile "$SCRATCH/ShallowWaterGPU/*.log"
savefile "$SCRATCH/ShallowWaterGPU/*.nc"
savefile "$SCRATCH/ShallowWaterGPU/*.json"

## Do some work:
cd $SCRATCH/ShallowWaterGPU
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 8192 -ny 8192 --profile
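The CUDA_VISIBLE_DEVICES note in the job script ties in with the driver's device selection (cuda_device = local_rank % num_cuda_devices) and with the allocated_gpus = CUDA_VISIBLE_DEVICES.count(",") + 1 bookkeeping in the profiling code. A small illustrative sketch of that mapping; how the driver actually obtains num_cuda_devices is not shown in this diff, so parsing the environment variable here is an assumption:

import os

def pick_device(local_rank, visible=None):
    # '0' -> 1 GPU, '0,1' -> 2 GPUs, ... (same counting as the profiling code)
    if visible is None:
        visible = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
    num_cuda_devices = visible.count(",") + 1
    return local_rank % num_cuda_devices

print(pick_device(0, "0"))        # --gres=gpu:1, 1 task/node: every rank uses device 0
print(pick_device(3, "0,1,2,3"))  # --gres=gpu:4, 4 tasks/node: local rank 3 uses device 3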
Existing Saga job script:

@@ -1,12 +1,12 @@
 #!/bin/bash
 # Job name:
-#SBATCH --job-name=saga-test
+#SBATCH --job-name=ShallowWaterGPUStrongScaling
 #
 # Project:
 #SBATCH --account=nn9550k
 #
 # Wall clock limit:
-#SBATCH --time=00:10:00
+#SBATCH --time=24:00:00
 #
 # Ask for 1 GPU (max is 2)
 # Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
@@ -15,10 +15,10 @@
 #SBATCH --partition=accel --gres=gpu:1
 #
 # Max memory usage per task (core) - increasing this will cost more core hours:
-#SBATCH --mem-per-cpu=4G
+#SBATCH --mem-per-cpu=16G
 #
 # Number of tasks:
-#SBATCH --nodes=2 --ntasks-per-node=1
+#SBATCH --nodes=1 --ntasks-per-node=1
 
 ## Set up job environment: (this is done automatically behind the scenes)
 ## (make sure to comment '#' or remove the following line 'source ...')
@@ -42,9 +42,10 @@ cp -r . $SCRATCH/ShallowWaterGPU
 # chkfile is replaced by 'savefile' on Saga
 savefile "$SCRATCH/ShallowWaterGPU/*.log"
 savefile "$SCRATCH/ShallowWaterGPU/*.nc"
+savefile "$SCRATCH/ShallowWaterGPU/*.json"
 
 ## Do some work:
 cd $SCRATCH/ShallowWaterGPU
-srun /cluster/home/$HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
-srun /cluster/home/$HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py
+srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
+srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 8192 -ny 8192 --profile
saga_strong_scaling_benchmark.job (new file, 51 lines):

#!/bin/bash
# Job name:
#SBATCH --job-name=ShallowWaterGPUStrongScaling
#
# Project:
#SBATCH --account=nn9550k
#
# Wall clock limit:
#SBATCH --time=10:00:00
#
# Ask for 1 GPU (max is 2)
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
#SBATCH --partition=accel --gres=gpu:1
#
# Max memory usage per task (core) - increasing this will cost more core hours:
#SBATCH --mem-per-cpu=16G
#
# Number of tasks:
#SBATCH --nodes=1 --ntasks-per-node=1

## Set up job environment: (this is done automatically behind the scenes)
## (make sure to comment '#' or remove the following line 'source ...')
# source /cluster/bin/jobsetup

module restore system   # instead of 'module purge' rather set module environment to the system default
module load CUDA/10.2.89

# It is also recommended to list loaded modules, for easier debugging:
module list

set -o errexit  # exit on errors
set -o nounset  # Treat unset variables as errors (added for more easily discovering issues in your batch script)

## Copy input files to the work directory:
mkdir $SCRATCH/ShallowWaterGPU
cp -r . $SCRATCH/ShallowWaterGPU

## Make sure the results are copied back to the submit directory (see Work Directory below):
# chkfile MyResultFile
# chkfile is replaced by 'savefile' on Saga
savefile "$SCRATCH/ShallowWaterGPU/*.log"
savefile "$SCRATCH/ShallowWaterGPU/*.nc"
savefile "$SCRATCH/ShallowWaterGPU/*.json"

## Do some work:
cd $SCRATCH/ShallowWaterGPU
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 1024 -ny 1024 --profile
saga_strong_scaling_benchmark.sh (new file, 13 lines):

#!/bin/bash

# one node: 1-4 tasks/GPUs
sbatch --partition=accel --gres=gpu:1 --nodes=1 --ntasks-per-node=1 saga_strong_scaling_benchmark.job
sbatch --partition=accel --gres=gpu:2 --nodes=1 --ntasks-per-node=2 saga_strong_scaling_benchmark.job
sbatch --partition=accel --gres=gpu:3 --nodes=1 --ntasks-per-node=3 saga_strong_scaling_benchmark.job
sbatch --partition=accel --gres=gpu:4 --nodes=1 --ntasks-per-node=4 saga_strong_scaling_benchmark.job

# 2-4 nodes: 4 tasks/GPUs per node
sbatch --partition=accel --gres=gpu:4 --nodes=2 --ntasks-per-node=4 saga_strong_scaling_benchmark.job
sbatch --partition=accel --gres=gpu:4 --nodes=3 --ntasks-per-node=4 saga_strong_scaling_benchmark.job
sbatch --partition=accel --gres=gpu:4 --nodes=4 --ntasks-per-node=4 saga_strong_scaling_benchmark.job
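Together with saga_strong_scaling_benchmark.job, this wrapper sweeps the same -nx 1024 -ny 1024 problem over 1-4 GPUs on one node and 8-16 GPUs on 2-4 nodes, i.e. a strong-scaling experiment with a fixed problem size. A minimal sketch of the usual strong-scaling metrics, speedup S(p) = T(1)/T(p) and efficiency E(p) = S(p)/p, computed from the collected totals; the timing values below are placeholders, not measured results:

# Hypothetical per-run totals in seconds, keyed by the total number of GPUs
# (e.g. collected from the profiling JSON files written by mpiTesting.py).
totals = {1: 800.0, 2: 420.0, 4: 230.0, 8: 130.0, 16: 80.0}

t1 = totals[1]
for p in sorted(totals):
    speedup = t1 / totals[p]
    efficiency = speedup / p
    print(f"{p:2d} GPUs: speedup {speedup:5.2f}x, efficiency {efficiency:6.1%}")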