feat: add containr Slurm job and docs

fix: removed unused variable in directory
docs: remove redundant separate link to file
2025-12-24 05:18:51 +01:00 · 2025-04-01 14:52:30 +02:00 · 2025-04-01 14:52:13 +02:00 · 2025-03-30 21:14:49 +02:00 · 2025-03-30 21:13:02 +02:00 · 2025-03-30 20:44:40 +02:00
10 changed files with 107 additions and 87 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -186,7 +186,6 @@ cython_debug/
 .pypirc

 # CUDA
-cuda_cache/

 # Taken from: https://github.com/github/gitignore/blob/main/CUDA.gitignore
 *.i
--- a/GPUSimulators/Common.py
+++ b/GPUSimulators/Common.py
@@ -35,7 +35,7 @@ import gc
 import netCDF4
 import json

-from tqdm import trange
+from tqdm import tqdm

 #import pycuda.compiler as cuda_compiler
 #import pycuda.gpuarray
@@ -181,10 +181,10 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names

        #Start simulation loop
        # progress_printer = ProgressPrinter(save_times[-1], print_every=10)
-        for k in trange(len(save_times)):
+        for k, t_step in tqdm(enumerate(t_steps), desc="Simulation Loop"):
            #Get target time and step size there
-            t_step = t_steps[k]
-            t_end = save_times[k]
+            # t_step = t_steps[k]
+            # t_end = save_times[k]
            
            #Sanity check simulator
            try:
@@ -196,7 +196,7 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
            profiling_data_sim_runner["start"]["t_full_step"] += time.time()

            #Simulate
-            if (t_step > 0.0):
+            if t_step > 0.0:
                sim.simulate(t_step, dt)

            profiling_data_sim_runner["end"]["t_full_step"] += time.time()
@@ -217,7 +217,7 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
            # if (print_string):
            #     logger.debug(print_string)
                
-        logger.debug("Simulated to t={:f} in {:d} timesteps (average dt={:f})".format(t_end, sim.simSteps(), sim.simTime() / sim.simSteps()))
+        logger.debug("Simulated to t={:f} in {:d} timesteps (average dt={:f})".format(save_times[-1], sim.simSteps(), sim.simTime() / sim.simSteps()))
        
    return outdata.filename, profiling_data_sim_runner, sim.profiling_data_mpi
    #return outdata.filename
@@ -308,7 +308,7 @@ class IPEngine(object):
        import ipyparallel
        self.cluster = ipyparallel.Client()#profile='mpi')
        time.sleep(3)
-        while(len(self.cluster.ids) != n_engines):
+        while len(self.cluster.ids) != n_engines:
            time.sleep(0.5)
            self.logger.info("Waiting for cluster...")
            self.cluster = ipyparallel.Client()#profile='mpi')
--- a/GPUSimulators/Simulator.py
+++ b/GPUSimulators/Simulator.py
@@ -206,7 +206,7 @@ class BaseSimulator(object):
            update_dt = False
            self.dt = dt
        
-        for _ in tqdm(range(math.ceil(t_end / self.dt))):
+        for _ in tqdm(range(math.ceil((t_end - t_start) / self.dt)), desc="Simulation"):
            # Update dt every 100 timesteps and cross your fingers it works
            # for the next 100
            # TODO this is probably broken now after fixing the "infinite" loop
--- a/GPUSimulators/cuda_cache/.gitkeep
+++ b/GPUSimulators/cuda_cache/.gitkeep
--- a/GPUSimulators/helpers/pycache/InitialConditions.cpython-39.pyc
+++ b/GPUSimulators/helpers/pycache/InitialConditions.cpython-39.pyc
--- a/Jobs/job_apptainer_lumi.slurm
+++ b/Jobs/job_apptainer_lumi.slurm
@@ -0,0 +1,26 @@
+#!/bin/bash -l
+#SBATCH --job-name=lumi
+#SBATCH --account=project_4650000xx
+#SBATCH --time=00:10:00
+#SBATCH --partition=dev-g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=8
+#SBATCH --gpus-per-node=8
+#SBATCH --output=%x-%j.out
+#SBATCH --exclusive
+
+N=$SLURM_JOB_NUM_NODES
+echo "--nbr of nodes:", $N
+echo "--total nbr of gpus:", $SLURM_NTASKS
+
+MyDir=/project/project_4650000xx
+MyApplication=${MyDir}/FiniteVolumeGPU_HIP/mpiTesting.py
+Container=${MyDir}/FiniteVolumeGPU_HIP/my_container.sif
+
+CPU_BIND="map_cpu:49,57,17,25,1,9,33,41"
+
+export MPICH_GPU_SUPPORT_ENABLED=1
+
+srun --cpu-bind=${CPU_BIND} --mpi=pmi2 \
+     apptainer exec "${Container}" \
+     python ${MyApplication} -nx 1024 -ny 1024 --profile
--- a/Jobs/job_lumi.slrum
+++ b/Jobs/job_lumi.slrum
@@ -1,39 +0,0 @@
-#!/bin/bash -e
-#SBATCH --job-name=lumi
-#SBATCH --account=project_4650000xx
-#SBATCH --time=00:10:00
-#SBATCH --partition=dev-g
-#SBATCH --nodes=1
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus=8
-#SBATCH --gpus-per-node=8
-#SBATCH -o %x-%j.out
-#SBATCH --exclusive
-#
-
-N=$SLURM_JOB_NUM_NODES
-echo "--nbr of nodes:", $N
-echo "--total nbr of gpus:", $SLURM_NTASKS
-
-Mydir=/project/project_4650000xx
-Myapplication=${Mydir}/FiniteVolumeGPU_hip/mpiTesting.py
-
-#modules
-ml LUMI/24.03 partition/G
-ml lumi-container-wrapper
-ml cray-python/3.11.7
-ml rocm/6.2.2
-
-ml craype-accel-amd-gfx90a
-ml cray-mpich/8.1.29
-
-export PATH="/project/project_4650000xx/FiniteVolumeGPU_hip/MyCondaEnv/bin:$PATH"
-
-#missing library
-export LD_LIBRARY_PATH=/opt/cray/pe/mpich/8.1.29/ofi/cray/17.0/lib-abi-mpich:$LD_LIBRARY_PATH
-
-#Binding mask
-bind_mask="0x${fe}000000000000,0x${fe}00000000000000,0x${fe}0000,0x${fe}000000,0x${fe},0x${fe}00,0x${fe}00000000,0x${fe}0000000000"
-
-srun --cpu-bind=mask_cpu:$bind_mask \
-     python ${Myapplication} -nx 1024 -ny 1024 --profile 	
--- a/Jobs/job_lumi.slurm
+++ b/Jobs/job_lumi.slurm
@@ -0,0 +1,27 @@
+#!/bin/bash -l
+#SBATCH --job-name=lumi
+#SBATCH --account=project_4650000xx
+#SBATCH --time=00:10:00
+#SBATCH --partition=dev-g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=8
+#SBATCH --gpus-per-node=8
+#SBATCH --output=%x-%j.out
+#SBATCH --exclusive
+
+N=$SLURM_JOB_NUM_NODES
+echo "--nbr of nodes:", $N
+echo "--total nbr of gpus:", $SLURM_NTASKS
+
+MyDir=/project/project_4650000xx
+MyApplication=${MyDir}/FiniteVolumeGPU_HIP/mpiTesting.py
+CondaEnv=${MyDir}/FiniteVolumeGPU_HIP/MyCondaEnv/bin
+
+export PATH="${CondaEnv}:$PATH"
+
+CPU_BIND="map_cpu:49,57,17,25,1,9,33,41"
+
+export MPICH_GPU_SUPPORT_ENABLED=1
+
+srun --cpu-bind=${CPU_BIND} --mpi=pmi2 \
+     python ${MyApplication} -nx 1024 -ny 1024 --profile
--- a/README.md
+++ b/README.md
@@ -5,48 +5,53 @@ This is a HIP version of the [FiniteVolume code](https://github.com/babrodtk/Fin
 ## Setup on LUMI-G
 Here is a step-by-step guide on installing packages on LUMI-G

-### Step 1: Install rocm-5.4.6 with Easybuild
-```
-export EBU_USER_PREFIX=/project/project_xxxxxx/EasyBuild
-ml LUMI/24.03 partition/G
-ml EasyBuild-user
-export PYTHONIOENCODING=utf-8
-eb rocm-5.4.6.eb -r
-```
-
-### Step 2: run conda-container
+### Step 1: run conda-container
 Installation via conda can be done as:
-```
+```shell
 ml LUMI/24.03 partition/G
-ml lumi-container-wrapper/0.3.3-cray-python-3.11.7
-```
+ml lumi-container-wrapper
 ```
+```shell
 conda-containerize new --prefix MyCondaEnv conda_environment_lumi.yml
 ```
 where the file `conda_environment_lumi.yml` contains packages to be installed.

-### Step 3: Set the env. variable to search for binaries
-```
-export the bin path: export PATH="$PWD/MyCondaEnv/bin:$PATH"
-```
-### An alternative: Convert to a singularity container with cotainr
+### Step 1 alternative: Convert to a singularity container with cotainr
+Load the required modules first
+```shell
+ml CrayEnv
+ml cotainr
 ```
+
+Then build the Singularity/Apptainer container 
+```shell
 cotainr build my_container.sif --system=lumi-g --conda-env=conda_environment_lumi.yml
 ```

-### Error when running MPI.
+### Step 2: Modify Slurm Job file
+Depending on your build method, update [`Jobs/job_lumi.slurm`](Jobs/job_lumi.slurm) if `conda-containerize` was used, or [`Jobs/job_apptainer_lumi.slurm`](Jobs/job_apptainer_lumi.slurm) if `cotainr` was used.
+
+In the job file, the required changes are to match your project allocation
+and the directories where the simulator and container are stored.
+
+### Step 3: Run the Slurm Job
+If `conda-containerize` was used for building:
+```shell
+sbatch Jobs/job_lumi.slurm
+```
+
+Otherwise, if `cotainr` was used for building:
+```shell
+sbatch Jobs/job_apptainer_lumi.slurm
+```
+
+### Troubleshooting
+
+#### Error when running MPI.
 ```
 `MPI startup(): PMI server not found. Please set I_MPI_PMI_LIBRARY variable if it is not a singleton case.
 ```
 This can be resolved by exporting this:
 ```
-export I_MPI_PMI_LIBRARY=/opt/cray/pe/mpich/8.1.27/ofi/cray/14.0/lib/libmpi.so
-```
-### Install hip-python
-```
-python -m pip install -i https://test.pypi.org/simple/ hip-python==5.4.3.470.16
-```
-
-The testing was done with this specific version `hip-python==5.4.3.470.16`
-
- 
+export I_MPI_PMI_LIBRARY=/opt/cray/pe/mpich/8.1.29/ofi/cray/17.0/lib/libmpi.so
+```
--- a/mpiTesting.py
+++ b/mpiTesting.py
@@ -70,7 +70,7 @@ def hip_check(call_result):

 args = parser.parse_args()

-if(args.profile):
+if args.profile:
    profiling_data = {}
    # profiling: total run time
    t_total_start = time.time()
@@ -79,6 +79,8 @@ if(args.profile):

 # Get MPI COMM to use
 comm = MPI.COMM_WORLD
+size = comm.Get_size()
+rank = comm.Get_rank()


 ####
@@ -86,7 +88,7 @@ comm = MPI.COMM_WORLD
 ####
 log_level_console = 20
 log_level_file = 10
-log_filename = 'mpi_' + str(comm.rank) + '.log'
+log_filename = 'mpi_' + str(rank) + '.log'
 logger = logging.getLogger('GPUSimulators')
 logger.setLevel(min(log_level_console, log_level_file))

@@ -110,7 +112,7 @@ logger.info("File logger using level %s to %s",
 # Initialize MPI grid etc
 ####
 logger.info("Creating MPI grid")
-grid = MPISimulator.MPIGrid(MPI.COMM_WORLD)
+grid = MPISimulator.MPIGrid(comm)

 """
 job_id = int(os.environ["SLURM_JOB_ID"])
@@ -152,7 +154,7 @@ gamma = 1.4
 #save_times = np.linspace(0, 0.000099, 11)
 #save_times = np.linspace(0, 0.000099, 2)
 save_times = np.linspace(0, 0.0000999, 2)
-outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc"
+outfile = "mpi_out_" + str(rank) + ".nc"
 save_var_names = ['rho', 'rho_u', 'rho_v', 'E']

 arguments = IC.genKelvinHelmholtz(nx, ny, gamma, grid=grid)
@@ -160,7 +162,7 @@ arguments['context'] = cuda_context
 arguments['theta'] = 1.2
 arguments['grid'] = grid

-if(args.profile):
+if args.profile:
    t_init_end = time.time()
    t_init = t_init_end - t_init_start
    profiling_data["t_init"] = t_init
@@ -181,14 +183,14 @@ def genSim(grid, **kwargs):
 (outfile, sim_runner_profiling_data, sim_profiling_data) = Common.runSimulation(
    genSim, arguments, outfile, save_times, save_var_names, dt)

-if(args.profile):
+if args.profile:
    t_total_end = time.time()
    t_total = t_total_end - t_total_start
    profiling_data["t_total"] = t_total
-    print("Total run time on rank " + str(MPI.COMM_WORLD.rank) + " is " + str(t_total) + " s")
+    print("Total run time on rank " + str(rank) + " is " + str(t_total) + " s")

 # write profiling to json file
-if(args.profile and MPI.COMM_WORLD.rank == 0):
+if args.profile and rank == 0:
    job_id = ""
    if "SLURM_JOB_ID" in os.environ:
        job_id = int(os.environ["SLURM_JOB_ID"])
@@ -199,7 +201,7 @@ if(args.profile and MPI.COMM_WORLD.rank == 0):
            str(job_id) + "_" + str(allocated_nodes) + "_nodes_and_" + str(allocated_gpus) + "_GPUs_profiling.json"
        profiling_data["outfile"] = outfile
    else:
-        profiling_file = "MPI_" + str(MPI.COMM_WORLD.size) + "_procs_and_" + str(num_cuda_devices) + "_GPUs_profiling.json"
+        profiling_file = "MPI_" + str(size) + "_procs_and_" + str(num_cuda_devices) + "_GPUs_profiling.json"

    for stage in sim_runner_profiling_data["start"].keys():
        profiling_data[stage] = sim_runner_profiling_data["end"][stage] - sim_runner_profiling_data["start"][stage]
@@ -214,7 +216,7 @@ if(args.profile and MPI.COMM_WORLD.rank == 0):

    profiling_data["slurm_job_id"] = job_id
    profiling_data["n_cuda_devices"] = str(num_cuda_devices)
-    profiling_data["n_processes"] = str(MPI.COMM_WORLD.size)
+    profiling_data["n_processes"] = str(size)
    profiling_data["git_hash"] = Common.getGitHash()
    profiling_data["git_status"] = Common.getGitStatus()
Author	SHA1	Message	Date
Anthony Berg	1056ecea67	feat: add containr Slurm job and docs	2025-04-01 14:52:30 +02:00
Anthony Berg	22563df94f	fix: removed unused variable in directory	2025-04-01 14:52:13 +02:00
Anthony Berg	db9d735a82	docs: remove redundant separate link to file	2025-03-30 21:14:49 +02:00
Anthony Berg	99520d1503	docs: update instructions to reflect new Slurm job file	2025-03-30 21:13:02 +02:00
Anthony Berg	cf102131df	fix: use MPI in slurm job	2025-03-30 20:44:40 +02:00
Anthony Berg	28a96382ff	fix: add cuda_cache dir to prevent parallel nodes from hanging up	2025-03-30 20:28:56 +02:00
Anthony Berg	6d9f36968d	Merge remote-tracking branch 'origin/build/rocm-upgrade' into build/rocm-upgrade	2025-03-30 18:40:46 +02:00
Anthony Berg	5b925cdb42	refactor: change MPI functions into variables	2025-03-30 18:40:38 +02:00
Anthony Berg	b054a4dbcd	Delete GPUSimulators/helpers/__pycache__ directory	2025-03-30 18:22:38 +02:00
Anthony Berg	2e5cf88eef	Merge remote-tracking branch 'origin/build/rocm-upgrade' into build/rocm-upgrade # Conflicts: # GPUSimulators/Simulator.py	2025-03-30 17:45:16 +02:00
Anthony Berg	80afd31286	refactor: change how variables are called in for loop	2025-03-30 17:44:33 +02:00
Anthony Berg	e2306406a7	fix: floating point number practically causing an infinite loop	2025-03-30 17:43:52 +02:00