# -*- coding: utf-8 -*-

"""
This python module implements MPI simulations for benchmarking

Copyright (C) 2018  SINTEF ICT

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import argparse
import gc
import json
import logging
import os
import shlex
import sys
import time

import numpy as np
from mpi4py import MPI
from hip import hip

from GPUSimulators.mpi import MPISimulator, MPIGrid
from GPUSimulators.common import run_simulation, get_git_hash, get_git_status, hip_check, utils
from GPUSimulators.gpu import KernelContext
from GPUSimulators.model import EE2DKP07Dimsplit
from GPUSimulators.helpers import initial_conditions as IC

# Purely for local debugging
# import pydevd_pycharm
# pydevd_pycharm.settrace('localhost', port=24785, stdoutToServer=True, stderrToServer=True)


def _str_to_bool(value):
    """Convert a command-line token to a bool.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is
    ``True`` because every non-empty string is truthy.  This converter
    accepts the usual spellings instead and rejects anything else.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in ('true', 't', 'yes', 'y', '1', 'on'):
        return True
    # The empty string is kept as False, matching the old bool('') behaviour.
    if normalized in ('false', 'f', 'no', 'n', '0', 'off', ''):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


parser = argparse.ArgumentParser(description='Strong and weak scaling experiments.')
parser.add_argument('-nx', type=int, default=128)
parser.add_argument('-ny', type=int, default=128)
parser.add_argument('--profile', action='store_true')  # default: False
parser.add_argument('--compile_opts', type=str, help="Compiler options for HIP code.")
# nargs='?' + const=True: a bare `--progress` enables the bar, while the
# previous `--progress True` / `--progress False` spellings keep working.
parser.add_argument('--progress', type=_str_to_bool, nargs='?', const=True, default=False,
                    help="Displays a progress bar for the progress of the simulation.")

args = parser.parse_args()

if args.profile:
    profiling_data = {}
    # profiling: total run time
    t_total_start = time.time()
    t_init_start = time.time()

nx = args.nx
ny = args.ny

# Get MPI COMM to use
comm = MPI.COMM_WORLD
rank = comm.rank

####
# Initialize logging
####
log_level_console = 20
log_level_file = 10
log_filename = 'mpi_' + str(rank) + '.log'
logger = logging.getLogger('GPUSimulators')
# The logger itself must let through everything either handler wants.
logger.setLevel(min(log_level_console, log_level_file))

ch = logging.StreamHandler()
ch.setLevel(log_level_console)
logger.addHandler(ch)
logger.info(f"Console logger using level {logging.getLevelName(log_level_console)}")

fh = logging.FileHandler(log_filename)
formatter = logging.Formatter(
    '%(asctime)s:%(name)s:%(levelname)s: %(message)s')
fh.setFormatter(formatter)
fh.setLevel(log_level_file)
logger.addHandler(fh)
logger.info(f"File logger using level {logging.getLevelName(log_level_file)} to {log_filename}")

####
# Initialize MPI grid etc
####
logger.info("Creating MPI grid")
grid = MPIGrid(comm, nx, ny)

####
# Initialize HIP
####
logger.info("Initializing HIP")
local_rank = grid.get_local_rank()
num_hip_devices = hip_check(hip.hipGetDeviceCount())
# Spread the local ranks round-robin over the visible devices.
hip_device = local_rank % num_hip_devices
logger.info(f"Process {str(local_rank)} using HIP device {str(hip_device)}")
context = KernelContext(device=hip_device, autotuning=False)

####
# Set initial conditions
####

# DEBUGGING - setting random seed
np.random.seed(42)

logger.info("Generating initial conditions")

dt = 0.001
gamma = 1.4
# save_times = np.linspace(0, 0.000009, 2)
# save_times = np.linspace(0, 0.000099, 11)
# save_times = np.linspace(0, 0.000099, 2)
save_times = np.linspace(0, 5, 21)
outfile = "mpi_out.nc4"
save_var_names = ['rho', 'rho_u', 'rho_v', 'E']

arguments = IC.gen_kelvin_helmholtz(nx, ny, gamma, grid=grid)
arguments['context'] = context
arguments['theta'] = 1.2
arguments['grid'] = grid
arguments['compile_opts'] = ['-g', '-g3', '-ggdb', '-gdwarf-4', '-O0']

compile_opts = args.compile_opts
if compile_opts is not None:
    # `list += str` would append every single character as its own option;
    # split the string into shell-style tokens instead.
    arguments['compile_opts'] += shlex.split(compile_opts)

if args.profile:
    t_init_end = time.time()
    t_init = t_init_end - t_init_start
    profiling_data["t_init"] = t_init

####
# Run simulation
####
logger.info("Running simulation")


# Helper function to create MPI simulator
def gen_sim(grid, **kwargs):
    """Build the local EE2DKP07Dimsplit simulator and wrap it in an MPISimulator.

    Args:
        grid: MPI grid handed to ``MPISimulator``.
        **kwargs: Forwarded unchanged to ``EE2DKP07Dimsplit``.

    Returns:
        The ``MPISimulator`` wrapping the local simulator.
    """
    local_sim = EE2DKP07Dimsplit(**kwargs)
    sim = MPISimulator(local_sim, grid)
    return sim


outfile, sim_runner_profiling_data, sim_profiling_data = run_simulation(
    gen_sim, arguments, outfile, save_times, save_var_names, dt, progress_bar=args.progress)

# Move NetCDF4 file to a unique file, for the next run.
if rank == 0:
    new_filename = utils.unique_file(outfile)
    os.rename(outfile, new_filename)

##### Profiling ######
if args.profile:
    t_total_end = time.time()
    t_total = t_total_end - t_total_start
    profiling_data["t_total"] = t_total
    print(f"Total run time on rank {str(rank)} is {str(t_total)} s")

# write profiling to JSON file
if args.profile and rank == 0:
    job_id = ""
    if "SLURM_JOB_ID" in os.environ:
        job_id = int(os.environ["SLURM_JOB_ID"])
        allocated_nodes = int(os.environ["SLURM_JOB_NUM_NODES"])
        # ROCR_VISIBLE_DEVICES is a comma-separated device list.  If it is
        # not set, fall back to the HIP device count rather than raising a
        # KeyError after the whole simulation has already run.
        visible_devices = os.environ.get("ROCR_VISIBLE_DEVICES")
        if visible_devices is None:
            allocated_gpus = num_hip_devices
        else:
            allocated_gpus = visible_devices.count(",") + 1
        profiling_file = (f"MPI_jobid_{job_id}_{allocated_nodes}_nodes_and_"
                          f"{allocated_gpus}_GPUs_profiling.json")
        # NOTE(review): `outfile` is the name returned by run_simulation; rank 0
        # has already renamed that file above, so this may not be the final
        # path on disk - confirm which name should be recorded.
        profiling_data["outfile"] = outfile
    else:
        profiling_file = (f"MPI_{MPI.COMM_WORLD.size}_procs_and_"
                          f"{num_hip_devices}_GPUs_profiling.json")

    # Each profiling dict stores per-stage "start"/"end" values; record the difference.
    for stage in sim_runner_profiling_data["start"]:
        profiling_data[stage] = sim_runner_profiling_data["end"][stage] - sim_runner_profiling_data["start"][stage]

    for stage in sim_profiling_data["start"]:
        profiling_data[stage] = sim_profiling_data["end"][stage] - sim_profiling_data["start"][stage]

    profiling_data["nx"] = nx
    profiling_data["ny"] = ny
    profiling_data["dt"] = dt
    profiling_data["n_time_steps"] = sim_profiling_data["n_time_steps"]

    profiling_data["slurm_job_id"] = job_id
    profiling_data["n_hip_devices"] = str(num_hip_devices)
    profiling_data["n_processes"] = str(MPI.COMM_WORLD.size)
    profiling_data["git_hash"] = get_git_hash()
    profiling_data["git_status"] = get_git_status()

    with open(profiling_file, "w") as write_file:
        json.dump(profiling_data, write_file)

####
# Clean shutdown
####
# Drop the module-level references to the kernel context and the argument
# dict (which also holds the context and grid) before collecting garbage.
# `sim` and `local_sim` only ever existed inside gen_sim, so there is
# nothing to clear for them here.
context = None
arguments = None
logging.shutdown()
gc.collect()

####
# Print completion and exit
####
print("Completed!")
# sys.exit instead of the `exit` builtin, which is only injected by `site`.
sys.exit(0)