feat: add mpi testing for HIP

Anthony Berg 2025-07-03 14:56:03 +02:00
parent 74398718c2
commit dff97a1fdf

mpi_testing_hip.py Normal file

@@ -0,0 +1,197 @@
# -*- coding: utf-8 -*-
"""
This Python module implements MPI simulations for benchmarking.

Copyright (C) 2018 SINTEF ICT

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import argparse
import numpy as np
import gc
import time
import json
import logging
import os
from mpi4py import MPI
from hip import hip
from GPUSimulators.mpi import MPISimulator, MPIGrid
from GPUSimulators.common import run_simulation, get_git_hash, get_git_status, hip_check
from GPUSimulators.gpu import KernelContext
from GPUSimulators.model import EE2DKP07Dimsplit
from GPUSimulators.helpers import initial_conditions as IC
parser = argparse.ArgumentParser(description='Strong and weak scaling experiments.')
parser.add_argument('-nx', type=int, default=128)
parser.add_argument('-ny', type=int, default=128)
parser.add_argument('--profile', action='store_true') # default: False
args = parser.parse_args()
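# Example launch (a sketch, not part of this script; launcher name, process count
# and grid sizes depend on the local MPI/HIP setup):
#   mpirun -np 4 python mpi_testing_hip.py -nx 256 -ny 256 --profile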
if args.profile:
    profiling_data = {}
    # profiling: total run time
    t_total_start = time.time()
    t_init_start = time.time()
# Get MPI COMM to use
comm = MPI.COMM_WORLD
####
# Initialize logging
####
log_level_console = 20
log_level_file = 10
log_filename = 'mpi_' + str(comm.rank) + '.log'
logger = logging.getLogger('GPUSimulators')
logger.setLevel(min(log_level_console, log_level_file))
ch = logging.StreamHandler()
ch.setLevel(log_level_console)
logger.addHandler(ch)
logger.info(f"Console logger using level {logging.getLevelName(log_level_console)}")
fh = logging.FileHandler(log_filename)
formatter = logging.Formatter(
'%(asctime)s:%(name)s:%(levelname)s: %(message)s')
fh.setFormatter(formatter)
fh.setLevel(log_level_file)
logger.addHandler(fh)
logger.info(f"File logger using level {logging.getLevelName(log_level_file)} to {log_filename}")
####
# Initialize MPI grid etc
####
logger.info("Creating MPI grid")
grid = MPIGrid(MPI.COMM_WORLD)
####
# Initialize HIP
####
logger.info("Initializing HIP")
local_rank = grid.get_local_rank()
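# Map each local rank to a HIP device round-robin, so ranks sharing a node
# end up on different GPUs when enough devices are available.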
num_hip_devices = hip_check(hip.hipGetDeviceCount())
hip_device = local_rank % num_hip_devices
logger.info(f"Process {str(local_rank)} using HIP device {str(hip_device)}")
context = KernelContext(device=hip_device, autotuning=False)
####
# Set initial conditions
####
# DEBUGGING - setting random seed
np.random.seed(42)
logger.info("Generating initial conditions")
nx = args.nx
ny = args.ny
dt = 0.001
gamma = 1.4
# save_times = np.linspace(0, 0.000009, 2)
# save_times = np.linspace(0, 0.000099, 11)
# save_times = np.linspace(0, 0.000099, 2)
save_times = np.linspace(0, 20, 21)
outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc"
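# Conserved variables of the 2D Euler equations: density, x/y momentum and total energy.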
save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
arguments = IC.gen_kelvin_helmholtz(nx, ny, gamma, grid=grid)
arguments['context'] = context
arguments['theta'] = 1.2
arguments['grid'] = grid
if args.profile:
    t_init_end = time.time()
    t_init = t_init_end - t_init_start
    profiling_data["t_init"] = t_init
####
# Run simulation
####
logger.info("Running simulation")
# Helper function to create MPI simulator
def gen_sim(grid, **kwargs):
    local_sim = EE2DKP07Dimsplit(**kwargs)
    sim = MPISimulator(local_sim, grid)
    return sim


outfile, sim_runner_profiling_data, sim_profiling_data = run_simulation(
    gen_sim, arguments, outfile, save_times, save_var_names, dt)
if args.profile:
    t_total_end = time.time()
    t_total = t_total_end - t_total_start
    profiling_data["t_total"] = t_total
    print(f"Total run time on rank {str(MPI.COMM_WORLD.rank)} is {str(t_total)} s")
# write profiling to JSON file
if args.profile and MPI.COMM_WORLD.rank == 0:
    job_id = ""
    if "SLURM_JOB_ID" in os.environ:
        job_id = int(os.environ["SLURM_JOB_ID"])
        allocated_nodes = int(os.environ["SLURM_JOB_NUM_NODES"])
        allocated_gpus = int(os.environ["ROCR_VISIBLE_DEVICES"].count(",") + 1)
        profiling_file = "MPI_jobid_" + \
            str(job_id) + "_" + str(allocated_nodes) + "_nodes_and_" + str(
                allocated_gpus) + "_GPUs_profiling.json"
        profiling_data["outfile"] = outfile
    else:
        profiling_file = "MPI_" + str(MPI.COMM_WORLD.size) + "_procs_and_" + str(
            num_hip_devices) + "_GPUs_profiling.json"

    for stage in sim_runner_profiling_data["start"].keys():
        profiling_data[stage] = sim_runner_profiling_data["end"][stage] - sim_runner_profiling_data["start"][stage]

    for stage in sim_profiling_data["start"].keys():
        profiling_data[stage] = sim_profiling_data["end"][stage] - sim_profiling_data["start"][stage]

    profiling_data["nx"] = nx
    profiling_data["ny"] = ny
    profiling_data["dt"] = dt
    profiling_data["n_time_steps"] = sim_profiling_data["n_time_steps"]
    profiling_data["slurm_job_id"] = job_id
    profiling_data["n_hip_devices"] = str(num_hip_devices)
    profiling_data["n_processes"] = str(MPI.COMM_WORLD.size)
    profiling_data["git_hash"] = get_git_hash()
    profiling_data["git_status"] = get_git_status()

    with open(profiling_file, "w") as write_file:
        json.dump(profiling_data, write_file)
####
# Clean shutdown
####
sim = None
local_sim = None
context = None
arguments = None
logging.shutdown()
gc.collect()
####
# Print completion and exit
####
print("Completed!")
exit(0)