mirror of
https://github.com/smyalygames/FiniteVolumeGPU_HIP.git
synced 2025-12-24 05:18:51 +01:00
Compare commits
12 Commits
aa21733806
...
build/rocm
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1056ecea67 | ||
|
|
22563df94f | ||
|
|
db9d735a82 | ||
|
|
99520d1503 | ||
|
|
cf102131df | ||
|
|
28a96382ff | ||
|
|
6d9f36968d | ||
|
|
5b925cdb42 | ||
|
|
b054a4dbcd | ||
|
|
2e5cf88eef | ||
|
|
80afd31286 | ||
|
|
e2306406a7 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -186,7 +186,6 @@ cython_debug/
|
||||
.pypirc
|
||||
|
||||
# CUDA
|
||||
cuda_cache/
|
||||
|
||||
# Taken from: https://github.com/github/gitignore/blob/main/CUDA.gitignore
|
||||
*.i
|
||||
|
||||
@@ -35,7 +35,7 @@ import gc
|
||||
import netCDF4
|
||||
import json
|
||||
|
||||
from tqdm import trange
|
||||
from tqdm import tqdm
|
||||
|
||||
#import pycuda.compiler as cuda_compiler
|
||||
#import pycuda.gpuarray
|
||||
@@ -181,10 +181,10 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
|
||||
|
||||
#Start simulation loop
|
||||
# progress_printer = ProgressPrinter(save_times[-1], print_every=10)
|
||||
for k in trange(len(save_times)):
|
||||
for k, t_step in tqdm(enumerate(t_steps), desc="Simulation Loop"):
|
||||
#Get target time and step size there
|
||||
t_step = t_steps[k]
|
||||
t_end = save_times[k]
|
||||
# t_step = t_steps[k]
|
||||
# t_end = save_times[k]
|
||||
|
||||
#Sanity check simulator
|
||||
try:
|
||||
@@ -196,7 +196,7 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
|
||||
profiling_data_sim_runner["start"]["t_full_step"] += time.time()
|
||||
|
||||
#Simulate
|
||||
if (t_step > 0.0):
|
||||
if t_step > 0.0:
|
||||
sim.simulate(t_step, dt)
|
||||
|
||||
profiling_data_sim_runner["end"]["t_full_step"] += time.time()
|
||||
@@ -217,7 +217,7 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
|
||||
# if (print_string):
|
||||
# logger.debug(print_string)
|
||||
|
||||
logger.debug("Simulated to t={:f} in {:d} timesteps (average dt={:f})".format(t_end, sim.simSteps(), sim.simTime() / sim.simSteps()))
|
||||
logger.debug("Simulated to t={:f} in {:d} timesteps (average dt={:f})".format(save_times[-1], sim.simSteps(), sim.simTime() / sim.simSteps()))
|
||||
|
||||
return outdata.filename, profiling_data_sim_runner, sim.profiling_data_mpi
|
||||
#return outdata.filename
|
||||
@@ -308,7 +308,7 @@ class IPEngine(object):
|
||||
import ipyparallel
|
||||
self.cluster = ipyparallel.Client()#profile='mpi')
|
||||
time.sleep(3)
|
||||
while(len(self.cluster.ids) != n_engines):
|
||||
while len(self.cluster.ids) != n_engines:
|
||||
time.sleep(0.5)
|
||||
self.logger.info("Waiting for cluster...")
|
||||
self.cluster = ipyparallel.Client()#profile='mpi')
|
||||
|
||||
@@ -206,7 +206,7 @@ class BaseSimulator(object):
|
||||
update_dt = False
|
||||
self.dt = dt
|
||||
|
||||
for _ in tqdm(range(math.ceil(t_end / self.dt))):
|
||||
for _ in tqdm(range(math.ceil((t_end - t_start) / self.dt)), desc="Simulation"):
|
||||
# Update dt every 100 timesteps and cross your fingers it works
|
||||
# for the next 100
|
||||
# TODO this is probably broken now after fixing the "infinite" loop
|
||||
|
||||
0
GPUSimulators/cuda_cache/.gitkeep
Normal file
0
GPUSimulators/cuda_cache/.gitkeep
Normal file
Binary file not shown.
26
Jobs/job_apptainer_lumi.slurm
Normal file
26
Jobs/job_apptainer_lumi.slurm
Normal file
@@ -0,0 +1,26 @@
|
||||
#!/bin/bash -l
|
||||
#SBATCH --job-name=lumi
|
||||
#SBATCH --account=project_4650000xx
|
||||
#SBATCH --time=00:10:00
|
||||
#SBATCH --partition=dev-g
|
||||
#SBATCH --nodes=1
|
||||
#SBATCH --ntasks-per-node=8
|
||||
#SBATCH --gpus-per-node=8
|
||||
#SBATCH --output=%x-%j.out
|
||||
#SBATCH --exclusive
|
||||
|
||||
N=$SLURM_JOB_NUM_NODES
|
||||
echo "--nbr of nodes:", $N
|
||||
echo "--total nbr of gpus:", $SLURM_NTASKS
|
||||
|
||||
MyDir=/project/project_4650000xx
|
||||
MyApplication=${MyDir}/FiniteVolumeGPU_HIP/mpiTesting.py
|
||||
Container=${MyDir}/FiniteVolumeGPU_HIP/my_container.sif
|
||||
|
||||
CPU_BIND="map_cpu:49,57,17,25,1,9,33,41"
|
||||
|
||||
export MPICH_GPU_SUPPORT_ENABLED=1
|
||||
|
||||
srun --cpu-bind=${CPU_BIND} --mpi=pmi2 \
|
||||
apptainer exec "${Container}" \
|
||||
python ${MyApplication} -nx 1024 -ny 1024 --profile
|
||||
@@ -1,39 +0,0 @@
|
||||
#!/bin/bash -e
|
||||
#SBATCH --job-name=lumi
|
||||
#SBATCH --account=project_4650000xx
|
||||
#SBATCH --time=00:10:00
|
||||
#SBATCH --partition=dev-g
|
||||
#SBATCH --nodes=1
|
||||
#SBATCH --ntasks-per-node=8
|
||||
#SBATCH --gpus=8
|
||||
#SBATCH --gpus-per-node=8
|
||||
#SBATCH -o %x-%j.out
|
||||
#SBATCH --exclusive
|
||||
#
|
||||
|
||||
N=$SLURM_JOB_NUM_NODES
|
||||
echo "--nbr of nodes:", $N
|
||||
echo "--total nbr of gpus:", $SLURM_NTASKS
|
||||
|
||||
Mydir=/project/project_4650000xx
|
||||
Myapplication=${Mydir}/FiniteVolumeGPU_hip/mpiTesting.py
|
||||
|
||||
#modules
|
||||
ml LUMI/24.03 partition/G
|
||||
ml lumi-container-wrapper
|
||||
ml cray-python/3.11.7
|
||||
ml rocm/6.2.2
|
||||
|
||||
ml craype-accel-amd-gfx90a
|
||||
ml cray-mpich/8.1.29
|
||||
|
||||
export PATH="/project/project_4650000xx/FiniteVolumeGPU_hip/MyCondaEnv/bin:$PATH"
|
||||
|
||||
#missing library
|
||||
export LD_LIBRARY_PATH=/opt/cray/pe/mpich/8.1.29/ofi/cray/17.0/lib-abi-mpich:$LD_LIBRARY_PATH
|
||||
|
||||
#Binding mask
|
||||
bind_mask="0x${fe}000000000000,0x${fe}00000000000000,0x${fe}0000,0x${fe}000000,0x${fe},0x${fe}00,0x${fe}00000000,0x${fe}0000000000"
|
||||
|
||||
srun --cpu-bind=mask_cpu:$bind_mask \
|
||||
python ${Myapplication} -nx 1024 -ny 1024 --profile
|
||||
27
Jobs/job_lumi.slurm
Normal file
27
Jobs/job_lumi.slurm
Normal file
@@ -0,0 +1,27 @@
|
||||
#!/bin/bash -l
|
||||
#SBATCH --job-name=lumi
|
||||
#SBATCH --account=project_4650000xx
|
||||
#SBATCH --time=00:10:00
|
||||
#SBATCH --partition=dev-g
|
||||
#SBATCH --nodes=1
|
||||
#SBATCH --ntasks-per-node=8
|
||||
#SBATCH --gpus-per-node=8
|
||||
#SBATCH --output=%x-%j.out
|
||||
#SBATCH --exclusive
|
||||
|
||||
N=$SLURM_JOB_NUM_NODES
|
||||
echo "--nbr of nodes:", $N
|
||||
echo "--total nbr of gpus:", $SLURM_NTASKS
|
||||
|
||||
MyDir=/project/project_4650000xx
|
||||
MyApplication=${MyDir}/FiniteVolumeGPU_HIP/mpiTesting.py
|
||||
CondaEnv=${MyDir}/FiniteVolumeGPU_HIP/MyCondaEnv/bin
|
||||
|
||||
export PATH="${CondaEnv}:$PATH"
|
||||
|
||||
CPU_BIND="map_cpu:49,57,17,25,1,9,33,41"
|
||||
|
||||
export MPICH_GPU_SUPPORT_ENABLED=1
|
||||
|
||||
srun --cpu-bind=${CPU_BIND} --mpi=pmi2 \
|
||||
python ${MyApplication} -nx 1024 -ny 1024 --profile
|
||||
63
README.md
63
README.md
@@ -5,48 +5,53 @@ This is a HIP version of the [FiniteVolume code](https://github.com/babrodtk/Fin
|
||||
## Setup on LUMI-G
|
||||
Here is a step-by-step guide to installing the required packages on LUMI-G.
|
||||
|
||||
### Step 1: Install rocm-5.4.6 with Easybuild
|
||||
```
|
||||
export EBU_USER_PREFIX=/project/project_xxxxxx/EasyBuild
|
||||
ml LUMI/24.03 partition/G
|
||||
ml EasyBuild-user
|
||||
export PYTHONIOENCODING=utf-8
|
||||
eb rocm-5.4.6.eb -r
|
||||
```
|
||||
|
||||
### Step 2: run conda-container
|
||||
### Step 1: run conda-container
|
||||
Installation via conda can be done as:
|
||||
```
|
||||
```shell
|
||||
ml LUMI/24.03 partition/G
|
||||
ml lumi-container-wrapper/0.3.3-cray-python-3.11.7
|
||||
```
|
||||
ml lumi-container-wrapper
|
||||
```
|
||||
```shell
|
||||
conda-containerize new --prefix MyCondaEnv conda_environment_lumi.yml
|
||||
```
|
||||
where the file `conda_environment_lumi.yml` contains packages to be installed.
|
||||
|
||||
### Step 3: Set the env. variable to search for binaries
|
||||
```
|
||||
export the bin path: export PATH="$PWD/MyCondaEnv/bin:$PATH"
|
||||
```
|
||||
### An alternative: Convert to a singularity container with cotainr
|
||||
### Step 1 alternative: Convert to a singularity container with cotainr
|
||||
Load the required modules first
|
||||
```shell
|
||||
ml CrayEnv
|
||||
ml cotainr
|
||||
```
|
||||
|
||||
Then build the Singularity/Apptainer container
|
||||
```shell
|
||||
cotainr build my_container.sif --system=lumi-g --conda-env=conda_environment_lumi.yml
|
||||
```
|
||||
|
||||
### Error when running MPI.
|
||||
### Step 2: Modify Slurm Job file
|
||||
Depending on your build method, update [`Jobs/job_lumi.slurm`](Jobs/job_lumi.slurm) if `conda-containerize` was used, or [`Jobs/job_apptainer_lumi.slurm`](Jobs/job_apptainer_lumi.slurm) if `cotainr` was used.
|
||||
|
||||
In the job file, the required changes are to match your project allocation,
|
||||
and the directories where the simulator and container are stored.
|
||||
|
||||
### Step 3: Run the Slurm Job
|
||||
If `conda-containerize` was used for building:
|
||||
```shell
|
||||
sbatch Jobs/job_lumi.slurm
|
||||
```
|
||||
|
||||
Otherwise, if `cotainr` was used for building:
|
||||
```shell
|
||||
sbatch Jobs/job_apptainer_lumi.slurm
|
||||
```
|
||||
|
||||
### Troubleshooting
|
||||
|
||||
#### Error when running MPI.
|
||||
```
|
||||
MPI startup(): PMI server not found. Please set I_MPI_PMI_LIBRARY variable if it is not a singleton case.
|
||||
```
|
||||
This can be resolved by exporting this:
|
||||
```
|
||||
export I_MPI_PMI_LIBRARY=/opt/cray/pe/mpich/8.1.27/ofi/cray/14.0/lib/libmpi.so
|
||||
```
|
||||
### Install hip-python
|
||||
```
|
||||
python -m pip install -i https://test.pypi.org/simple/ hip-python==5.4.3.470.16
|
||||
```
|
||||
|
||||
The testing was done with this specific version `hip-python==5.4.3.470.16`
|
||||
|
||||
|
||||
export I_MPI_PMI_LIBRARY=/opt/cray/pe/mpich/8.1.29/ofi/cray/17.0/lib/libmpi.so
|
||||
```
|
||||
@@ -70,7 +70,7 @@ def hip_check(call_result):
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if(args.profile):
|
||||
if args.profile:
|
||||
profiling_data = {}
|
||||
# profiling: total run time
|
||||
t_total_start = time.time()
|
||||
@@ -79,6 +79,8 @@ if(args.profile):
|
||||
|
||||
# Get MPI COMM to use
|
||||
comm = MPI.COMM_WORLD
|
||||
size = comm.Get_size()
|
||||
rank = comm.Get_rank()
|
||||
|
||||
|
||||
####
|
||||
@@ -86,7 +88,7 @@ comm = MPI.COMM_WORLD
|
||||
####
|
||||
log_level_console = 20
|
||||
log_level_file = 10
|
||||
log_filename = 'mpi_' + str(comm.rank) + '.log'
|
||||
log_filename = 'mpi_' + str(rank) + '.log'
|
||||
logger = logging.getLogger('GPUSimulators')
|
||||
logger.setLevel(min(log_level_console, log_level_file))
|
||||
|
||||
@@ -110,7 +112,7 @@ logger.info("File logger using level %s to %s",
|
||||
# Initialize MPI grid etc
|
||||
####
|
||||
logger.info("Creating MPI grid")
|
||||
grid = MPISimulator.MPIGrid(MPI.COMM_WORLD)
|
||||
grid = MPISimulator.MPIGrid(comm)
|
||||
|
||||
"""
|
||||
job_id = int(os.environ["SLURM_JOB_ID"])
|
||||
@@ -152,7 +154,7 @@ gamma = 1.4
|
||||
#save_times = np.linspace(0, 0.000099, 11)
|
||||
#save_times = np.linspace(0, 0.000099, 2)
|
||||
save_times = np.linspace(0, 0.0000999, 2)
|
||||
outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc"
|
||||
outfile = "mpi_out_" + str(rank) + ".nc"
|
||||
save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
|
||||
|
||||
arguments = IC.genKelvinHelmholtz(nx, ny, gamma, grid=grid)
|
||||
@@ -160,7 +162,7 @@ arguments['context'] = cuda_context
|
||||
arguments['theta'] = 1.2
|
||||
arguments['grid'] = grid
|
||||
|
||||
if(args.profile):
|
||||
if args.profile:
|
||||
t_init_end = time.time()
|
||||
t_init = t_init_end - t_init_start
|
||||
profiling_data["t_init"] = t_init
|
||||
@@ -181,14 +183,14 @@ def genSim(grid, **kwargs):
|
||||
(outfile, sim_runner_profiling_data, sim_profiling_data) = Common.runSimulation(
|
||||
genSim, arguments, outfile, save_times, save_var_names, dt)
|
||||
|
||||
if(args.profile):
|
||||
if args.profile:
|
||||
t_total_end = time.time()
|
||||
t_total = t_total_end - t_total_start
|
||||
profiling_data["t_total"] = t_total
|
||||
print("Total run time on rank " + str(MPI.COMM_WORLD.rank) + " is " + str(t_total) + " s")
|
||||
print("Total run time on rank " + str(rank) + " is " + str(t_total) + " s")
|
||||
|
||||
# write profiling to json file
|
||||
if(args.profile and MPI.COMM_WORLD.rank == 0):
|
||||
if args.profile and rank == 0:
|
||||
job_id = ""
|
||||
if "SLURM_JOB_ID" in os.environ:
|
||||
job_id = int(os.environ["SLURM_JOB_ID"])
|
||||
@@ -199,7 +201,7 @@ if(args.profile and MPI.COMM_WORLD.rank == 0):
|
||||
str(job_id) + "_" + str(allocated_nodes) + "_nodes_and_" + str(allocated_gpus) + "_GPUs_profiling.json"
|
||||
profiling_data["outfile"] = outfile
|
||||
else:
|
||||
profiling_file = "MPI_" + str(MPI.COMM_WORLD.size) + "_procs_and_" + str(num_cuda_devices) + "_GPUs_profiling.json"
|
||||
profiling_file = "MPI_" + str(size) + "_procs_and_" + str(num_cuda_devices) + "_GPUs_profiling.json"
|
||||
|
||||
for stage in sim_runner_profiling_data["start"].keys():
|
||||
profiling_data[stage] = sim_runner_profiling_data["end"][stage] - sim_runner_profiling_data["start"][stage]
|
||||
@@ -214,7 +216,7 @@ if(args.profile and MPI.COMM_WORLD.rank == 0):
|
||||
|
||||
profiling_data["slurm_job_id"] = job_id
|
||||
profiling_data["n_cuda_devices"] = str(num_cuda_devices)
|
||||
profiling_data["n_processes"] = str(MPI.COMM_WORLD.size)
|
||||
profiling_data["n_processes"] = str(size)
|
||||
profiling_data["git_hash"] = Common.getGitHash()
|
||||
profiling_data["git_status"] = Common.getGitStatus()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user