Compare commits


6 Commits

Author SHA1 Message Date
Anthony Berg
6d9f36968d Merge remote-tracking branch 'origin/build/rocm-upgrade' into build/rocm-upgrade 2025-03-30 18:40:46 +02:00
Anthony Berg
5b925cdb42 refactor: change MPI functions into variables 2025-03-30 18:40:38 +02:00
Anthony Berg
b054a4dbcd Delete GPUSimulators/helpers/__pycache__ directory 2025-03-30 18:22:38 +02:00
Anthony Berg
2e5cf88eef Merge remote-tracking branch 'origin/build/rocm-upgrade' into build/rocm-upgrade (conflicts: GPUSimulators/Simulator.py) 2025-03-30 17:45:16 +02:00
Anthony Berg
80afd31286 refactor: change how variables are called in for loop 2025-03-30 17:44:33 +02:00
Anthony Berg
e2306406a7 fix: floating point number practically causing an infinite loop 2025-03-30 17:43:52 +02:00
4 changed files with 20 additions and 18 deletions

View File

@@ -35,7 +35,7 @@ import gc
 import netCDF4
 import json
-from tqdm import trange
+from tqdm import tqdm
 #import pycuda.compiler as cuda_compiler
 #import pycuda.gpuarray
@@ -181,10 +181,10 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
     #Start simulation loop
     # progress_printer = ProgressPrinter(save_times[-1], print_every=10)
-    for k in trange(len(save_times)):
+    for k, t_step in tqdm(enumerate(t_steps), desc="Simulation Loop"):
         #Get target time and step size there
-        t_step = t_steps[k]
-        t_end = save_times[k]
+        # t_step = t_steps[k]
+        # t_end = save_times[k]
         #Sanity check simulator
         try:
@@ -196,7 +196,7 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
         profiling_data_sim_runner["start"]["t_full_step"] += time.time()
         #Simulate
-        if (t_step > 0.0):
+        if t_step > 0.0:
             sim.simulate(t_step, dt)
         profiling_data_sim_runner["end"]["t_full_step"] += time.time()
@@ -217,7 +217,7 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
         # if (print_string):
         #     logger.debug(print_string)
-    logger.debug("Simulated to t={:f} in {:d} timesteps (average dt={:f})".format(t_end, sim.simSteps(), sim.simTime() / sim.simSteps()))
+    logger.debug("Simulated to t={:f} in {:d} timesteps (average dt={:f})".format(save_times[-1], sim.simSteps(), sim.simTime() / sim.simSteps()))
     return outdata.filename, profiling_data_sim_runner, sim.profiling_data_mpi
     #return outdata.filename
@@ -308,7 +308,7 @@ class IPEngine(object):
         import ipyparallel
         self.cluster = ipyparallel.Client()#profile='mpi')
         time.sleep(3)
-        while(len(self.cluster.ids) != n_engines):
+        while len(self.cluster.ids) != n_engines:
             time.sleep(0.5)
             self.logger.info("Waiting for cluster...")
             self.cluster = ipyparallel.Client()#profile='mpi')
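For context, the hunks above switch the save loop from index-based iteration (trange(len(save_times)) plus t_steps[k] lookups) to iterating directly over the precomputed step sizes with enumerate wrapped in tqdm, and the closing log line now reports save_times[-1] because the loop-local t_end no longer exists. Below is a minimal standalone sketch of that loop pattern; save_times, the derived t_steps, and the dummy simulate function are illustrative stand-ins, not the repository's actual code.

import numpy as np
from tqdm import tqdm

# Illustrative stand-ins for the real save_times / sim.simulate
save_times = np.linspace(0.0, 1.0, 11)
t_steps = np.diff(np.concatenate(([0.0], save_times)))  # step size for each save interval

def simulate(t_step):
    pass  # placeholder for sim.simulate(t_step, dt)

# Iterate over the step sizes directly; total= is passed here only so tqdm can
# still render a progress bar (enumerate() has no length by itself).
for k, t_step in tqdm(enumerate(t_steps), total=len(t_steps), desc="Simulation Loop"):
    if t_step > 0.0:
        simulate(t_step)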

View File

@@ -206,7 +206,7 @@ class BaseSimulator(object):
             update_dt = False
             self.dt = dt
-        for _ in tqdm(range(math.ceil(t_end / self.dt))):
+        for _ in tqdm(range(math.ceil((t_end - t_start) / self.dt)), desc="Simulation"):
             # Update dt every 100 timesteps and cross your fingers it works
             # for the next 100
             # TODO this is probably broken now after fixing the "infinite" loop
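Reading this hunk together with commit e2306406a7 ("fix: floating point number practically causing an infinite loop"), the step count now appears to be computed from the remaining interval (t_end - t_start) rather than from the absolute end time t_end, so the loop is no longer sized as if it had to simulate from t=0 on every call. A small arithmetic illustration of the difference, using made-up values in the same order of magnitude as the save_times elsewhere in this compare:

import math

# Illustrative values only: absolute target time, time already simulated, small dt
t_start, t_end, dt = 0.00009, 0.0000999, 1e-6

steps_old = math.ceil(t_end / dt)              # ~100 iterations: sized for the whole 0..t_end span
steps_new = math.ceil((t_end - t_start) / dt)  # ~10 iterations: sized for the remaining interval only

print(steps_old, steps_new)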

View File

@@ -70,7 +70,7 @@ def hip_check(call_result):
 args = parser.parse_args()
-if(args.profile):
+if args.profile:
     profiling_data = {}
     # profiling: total run time
     t_total_start = time.time()
@@ -79,6 +79,8 @@ if(args.profile):
 # Get MPI COMM to use
 comm = MPI.COMM_WORLD
+size = comm.Get_size()
+rank = comm.Get_rank()
 ####
@@ -86,7 +88,7 @@ comm = MPI.COMM_WORLD
 ####
 log_level_console = 20
 log_level_file = 10
-log_filename = 'mpi_' + str(comm.rank) + '.log'
+log_filename = 'mpi_' + str(rank) + '.log'
 logger = logging.getLogger('GPUSimulators')
 logger.setLevel(min(log_level_console, log_level_file))
@@ -110,7 +112,7 @@ logger.info("File logger using level %s to %s",
 # Initialize MPI grid etc
 ####
 logger.info("Creating MPI grid")
-grid = MPISimulator.MPIGrid(MPI.COMM_WORLD)
+grid = MPISimulator.MPIGrid(comm)
 """
 job_id = int(os.environ["SLURM_JOB_ID"])
@@ -152,7 +154,7 @@ gamma = 1.4
 #save_times = np.linspace(0, 0.000099, 11)
 #save_times = np.linspace(0, 0.000099, 2)
 save_times = np.linspace(0, 0.0000999, 2)
-outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc"
+outfile = "mpi_out_" + str(rank) + ".nc"
 save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
 arguments = IC.genKelvinHelmholtz(nx, ny, gamma, grid=grid)
@@ -160,7 +162,7 @@ arguments['context'] = cuda_context
 arguments['theta'] = 1.2
 arguments['grid'] = grid
-if(args.profile):
+if args.profile:
     t_init_end = time.time()
     t_init = t_init_end - t_init_start
     profiling_data["t_init"] = t_init
@@ -181,14 +183,14 @@ def genSim(grid, **kwargs):
 (outfile, sim_runner_profiling_data, sim_profiling_data) = Common.runSimulation(
     genSim, arguments, outfile, save_times, save_var_names, dt)
-if(args.profile):
+if args.profile:
     t_total_end = time.time()
     t_total = t_total_end - t_total_start
     profiling_data["t_total"] = t_total
-    print("Total run time on rank " + str(MPI.COMM_WORLD.rank) + " is " + str(t_total) + " s")
+    print("Total run time on rank " + str(rank) + " is " + str(t_total) + " s")
 # write profiling to json file
-if(args.profile and MPI.COMM_WORLD.rank == 0):
+if args.profile and rank == 0:
     job_id = ""
     if "SLURM_JOB_ID" in os.environ:
         job_id = int(os.environ["SLURM_JOB_ID"])
@@ -199,7 +201,7 @@ if(args.profile and MPI.COMM_WORLD.rank == 0):
             str(job_id) + "_" + str(allocated_nodes) + "_nodes_and_" + str(allocated_gpus) + "_GPUs_profiling.json"
         profiling_data["outfile"] = outfile
     else:
-        profiling_file = "MPI_" + str(MPI.COMM_WORLD.size) + "_procs_and_" + str(num_cuda_devices) + "_GPUs_profiling.json"
+        profiling_file = "MPI_" + str(size) + "_procs_and_" + str(num_cuda_devices) + "_GPUs_profiling.json"
     for stage in sim_runner_profiling_data["start"].keys():
         profiling_data[stage] = sim_runner_profiling_data["end"][stage] - sim_runner_profiling_data["start"][stage]
@@ -214,7 +216,7 @@ if(args.profile and MPI.COMM_WORLD.rank == 0):
     profiling_data["slurm_job_id"] = job_id
     profiling_data["n_cuda_devices"] = str(num_cuda_devices)
-    profiling_data["n_processes"] = str(MPI.COMM_WORLD.size)
+    profiling_data["n_processes"] = str(size)
     profiling_data["git_hash"] = Common.getGitHash()
     profiling_data["git_status"] = Common.getGitStatus()