FiniteVolumeGPU/mpi_testing_hip.py
2025-08-08 00:23:08 +02:00

218 lines
6.4 KiB
Python

# -*- coding: utf-8 -*-
"""
This Python module implements MPI simulations for benchmarking.
Copyright (C) 2018 SINTEF ICT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import argparse
import numpy as np
import gc
import time
import json
import logging
import os
from mpi4py import MPI
from hip import hip
from GPUSimulators.mpi import MPISimulator, MPIGrid
from GPUSimulators.common import run_simulation, get_git_hash, get_git_status, hip_check, utils
from GPUSimulators.gpu import KernelContext
from GPUSimulators.model import EE2DKP07Dimsplit
from GPUSimulators.helpers import initial_conditions as IC
# Purely for local debugging
# import pydevd_pycharm
# pydevd_pycharm.settrace('localhost', port=24785, stdoutToServer=True, stderrToServer=True)
# Command-line interface: the nx/ny sizes, an optional profiling switch,
# extra HIP compiler options and an optional progress bar.
parser = argparse.ArgumentParser(description='Strong and weak scaling experiments.')

# -nx and -ny share the same type and default value.
for size_flag in ('-nx', '-ny'):
    parser.add_argument(size_flag, type=int, default=128)

parser.add_argument('--profile', action='store_true')  # False unless given
parser.add_argument(
    '--compile_opts',
    type=str,
    help="Compiler options for HIP code.",
)
parser.add_argument(
    '--progress',
    action='store_true',
    help="Displays a progress bar for the progress of the simulation.",
)

args = parser.parse_args()
# The timers below are only started when --profile was given.
if args.profile:
    profiling_data = dict()
    # Wall-clock start of the whole run and of the initialisation phase
    t_total_start = time.time()
    t_init_start = time.time()

nx, ny = args.nx, args.ny

# MPI communicator used by this script and this process' rank within it
comm = MPI.COMM_WORLD
rank = comm.rank
####
# Initialize logging
####
# logging.INFO == 20 (console) and logging.DEBUG == 10 (file)
log_level_console = logging.INFO
log_level_file = logging.DEBUG
log_filename = f"mpi_{rank!s}.log"

# The logger itself must let through the more verbose of the two handler levels.
logger = logging.getLogger('GPUSimulators')
logger.setLevel(min(log_level_console, log_level_file))

# Console handler. It is attached and announced before the file handler
# exists, so this first message is not written to the log file.
ch = logging.StreamHandler()
ch.setLevel(log_level_console)
logger.addHandler(ch)
logger.info(f"Console logger using level {logging.getLevelName(log_level_console)}")

# File handler: one log file per rank, with timestamped records.
fh = logging.FileHandler(log_filename)
fh.setFormatter(logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s'))
fh.setLevel(log_level_file)
logger.addHandler(fh)
logger.info(f"File logger using level {logging.getLevelName(log_level_file)} to {log_filename}")
####
# Initialize MPI grid etc
####
logger.info("Creating MPI grid")
grid = MPIGrid(comm, nx, ny)

####
# Initialize HIP
####
logger.info("Initializing HIP")
num_hip_devices = hip_check(hip.hipGetDeviceCount())
local_rank = grid.get_local_rank()

# The modulo wraps the local rank onto the available device indices, so
# local ranks beyond the device count reuse a device.
hip_device = local_rank % num_hip_devices
logger.info(f"Process {local_rank!s} using HIP device {hip_device!s}")

context = KernelContext(device=hip_device, autotuning=False)
####
# Set initial conditions
####
# DEBUGGING - setting random seed
np.random.seed(42)

logger.info("Generating initial conditions")
gamma = 1.4
dt = 0.001

# Alternative save schedules that were used while debugging:
# save_times = np.linspace(0, 0.000009, 2)
# save_times = np.linspace(0, 0.000099, 11)
# save_times = np.linspace(0, 0.000099, 2)
save_times = np.linspace(0, 0.1, 5)
save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
outfile = "mpi_out.nc4"

# Start from the generated Kelvin-Helmholtz arguments and add the entries
# this script supplies itself.
# NOTE(review): assumes gen_kelvin_helmholtz returns a plain dict - confirm.
arguments = IC.gen_kelvin_helmholtz(nx, ny, gamma, grid=grid)
arguments.update(context=context, theta=1.2, grid=grid)
# Default flags: debug information and no optimisation.
arguments['compile_opts'] = ['-g', '-g3', '-ggdb', '-gdwarf-4', '-O0']

# --compile_opts arrives as ONE string (or None when not given). Extending a
# list with a raw string appends it character by character, so it is split
# into whitespace-separated flags before being appended.
compile_opts = args.compile_opts
if compile_opts is not None:
    arguments['compile_opts'] += compile_opts.split()
# Profiling: record the time elapsed since t_init_start was taken.
if args.profile:
    t_init_end = time.time()
    profiling_data["t_init"] = t_init = t_init_end - t_init_start
####
# Run simulation
####
logger.info("Running simulation")


def gen_sim(grid, **kwargs):
    """Create the MPI simulator object.

    Args:
        grid: Passed as the second argument to ``MPISimulator``.
        **kwargs: Forwarded unchanged to ``EE2DKP07Dimsplit``.

    Returns:
        The ``MPISimulator`` wrapping the newly created ``EE2DKP07Dimsplit``.
    """
    return MPISimulator(EE2DKP07Dimsplit(**kwargs), grid)
# run_simulation returns the output file name plus two profiling dictionaries
# (one from the runner, one from the simulator) that are used further down.
outfile, sim_runner_profiling_data, sim_profiling_data = run_simulation(
    gen_sim, arguments, outfile, save_times, save_var_names, dt, progress_bar=args.progress)

# Move NetCDF4 file to a unique file, for the next run.
if rank == 0:
    new_filename = utils.unique_file(outfile)
    os.rename(outfile, new_filename)
    # After the rename nothing exists under the old name any more. Rebind
    # `outfile` so later uses (e.g. the profiling record written by this
    # rank) refer to the file's actual location. os.fspath keeps the value a
    # plain path string even if unique_file returns a path-like object.
    outfile = os.fspath(new_filename)
##### Profiling ######
# Every rank reports its own total wall-clock time when profiling is enabled.
if args.profile:
    t_total_end = time.time()
    profiling_data["t_total"] = t_total = t_total_end - t_total_start
    print(f"Total run time on rank {rank!s} is {t_total!s} s")
# write profiling to JSON file (only rank 0 writes, and only when profiling)
if args.profile and rank == 0:
    job_id = ""
    if "SLURM_JOB_ID" in os.environ:
        # Running under SLURM: name the file after the job and its allocation.
        job_id = int(os.environ["SLURM_JOB_ID"])
        allocated_nodes = int(os.environ["SLURM_JOB_NUM_NODES"])

        # ROCR_VISIBLE_DEVICES is a comma-separated device list. It is not
        # guaranteed to be set, and a KeyError here would discard the profile
        # of an already finished run, so fall back to the device count that
        # HIP reported to this process.
        visible_devices = os.environ.get("ROCR_VISIBLE_DEVICES")
        if visible_devices is None:
            allocated_gpus = num_hip_devices
        else:
            allocated_gpus = visible_devices.count(",") + 1

        profiling_file = (
            f"MPI_jobid_{job_id}_{allocated_nodes}_nodes_and_"
            f"{allocated_gpus}_GPUs_profiling.json"
        )
        profiling_data["outfile"] = outfile
    else:
        # No SLURM job: name the file after the process and device counts.
        profiling_file = (
            f"MPI_{MPI.COMM_WORLD.size}_procs_and_"
            f"{num_hip_devices}_GPUs_profiling.json"
        )

    # Convert every start/end pair into a duration. The simulator's stages
    # are processed last, so they win if a stage name occurs in both.
    for timings in (sim_runner_profiling_data, sim_profiling_data):
        for stage, started in timings["start"].items():
            profiling_data[stage] = timings["end"][stage] - started

    # Run metadata stored alongside the timings
    profiling_data["nx"] = nx
    profiling_data["ny"] = ny
    profiling_data["dt"] = dt
    profiling_data["n_time_steps"] = sim_profiling_data["n_time_steps"]

    profiling_data["slurm_job_id"] = job_id
    profiling_data["n_hip_devices"] = str(num_hip_devices)
    profiling_data["n_processes"] = str(MPI.COMM_WORLD.size)
    profiling_data["git_hash"] = get_git_hash()
    profiling_data["git_status"] = get_git_status()

    with open(profiling_file, "w") as write_file:
        json.dump(profiling_data, write_file)
####
# Clean shutdown
####
# Drop the module-level references to the simulation objects before the
# explicit garbage collection below.
# NOTE(review): `sim` and `local_sim` are only bound inside gen_sim in the
# visible code, so these two assignments merely create new names - confirm.
sim = None
local_sim = None
context = None
arguments = None
logging.shutdown()
gc.collect()

####
# Print completion and exit
####
print("Completed!")
# The builtin `exit` is injected by the `site` module for interactive use and
# is not available in every start-up mode; raising SystemExit is what
# sys.exit() does and needs no extra import.
raise SystemExit(0)