mirror of https://github.com/smyalygames/FiniteVolumeGPU_HIP.git
synced 2025-12-24 13:29:17 +01:00

Compare commits: implement-... → build/rocm
20 Commits
| SHA1 |
|---|
| 1056ecea67 |
| 22563df94f |
| db9d735a82 |
| 99520d1503 |
| cf102131df |
| 28a96382ff |
| 6d9f36968d |
| 5b925cdb42 |
| b054a4dbcd |
| 2e5cf88eef |
| 80afd31286 |
| e2306406a7 |
| aa21733806 |
| 5a27445de8 |
| cd69f69080 |
| 9761ff4924 |
| 5931cee93f |
| 208d82ab0b |
| 31bf80c6f0 |
| 4df5e5853f |
.gitignore (vendored, Normal file, 275 lines)
@@ -0,0 +1,275 @@
+.vscode/settings.json
+
+/data
+
+# Numpy Zipped
+*.npz
+
+# NetCDF
+*.nc
+
+# Python Related files
+# Taken from: https://github.com/github/gitignore/blob/main/Python.gitignore
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# UV
+# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+#uv.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
+
+# CUDA
+
+# Taken from: https://github.com/github/gitignore/blob/main/CUDA.gitignore
+*.i
+*.ii
+*.gpu
+*.ptx
+*.cubin
+*.fatbin
+
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+# Taken from: https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# AWS User-specific
+.idea/**/aws.xml
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn. Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# SonarLint plugin
+.idea/sonarlint/
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
@@ -35,6 +35,8 @@ import gc
 import netCDF4
 import json
 
+from tqdm import tqdm
+
 #import pycuda.compiler as cuda_compiler
 #import pycuda.gpuarray
 #import pycuda.driver as cuda
@@ -178,11 +180,11 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
     profiling_data_sim_runner["end"]["t_sim_init"] = time.time()
 
     #Start simulation loop
-    progress_printer = ProgressPrinter(save_times[-1], print_every=10)
-    for k in range(len(save_times)):
+    # progress_printer = ProgressPrinter(save_times[-1], print_every=10)
+    for k, t_step in tqdm(enumerate(t_steps), desc="Simulation Loop"):
         #Get target time and step size there
-        t_step = t_steps[k]
-        t_end = save_times[k]
+        # t_step = t_steps[k]
+        # t_end = save_times[k]
 
         #Sanity check simulator
         try:
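The hunk above swaps the hand-rolled ProgressPrinter for tqdm. As a standalone sketch (with made-up step sizes, not repo data), the pattern looks like this; note that `enumerate` exposes no length, so without an explicit `total=` tqdm can only show a running counter, not a bar or ETA:

```python
# Standalone sketch of the new loop pattern; t_steps values are invented.
from tqdm import tqdm

t_steps = [0.1, 0.1, 0.05]  # hypothetical step sizes between save times
for k, t_step in tqdm(enumerate(t_steps), total=len(t_steps), desc="Simulation Loop"):
    pass  # stand-in for the simulate/NetCDF-write body of runSimulation
```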
@@ -194,7 +196,7 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
         profiling_data_sim_runner["start"]["t_full_step"] += time.time()
 
         #Simulate
-        if (t_step > 0.0):
+        if t_step > 0.0:
             sim.simulate(t_step, dt)
 
         profiling_data_sim_runner["end"]["t_full_step"] += time.time()
@@ -211,11 +213,11 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
         profiling_data_sim_runner["end"]["t_nc_write"] += time.time()
 
         #Write progress to screen
-        print_string = progress_printer.getPrintString(t_end)
-        if (print_string):
-            logger.debug(print_string)
+        # print_string = progress_printer.getPrintString(t_end)
+        # if (print_string):
+        #     logger.debug(print_string)
 
-    logger.debug("Simulated to t={:f} in {:d} timesteps (average dt={:f})".format(t_end, sim.simSteps(), sim.simTime() / sim.simSteps()))
+    logger.debug("Simulated to t={:f} in {:d} timesteps (average dt={:f})".format(save_times[-1], sim.simSteps(), sim.simTime() / sim.simSteps()))
 
     return outdata.filename, profiling_data_sim_runner, sim.profiling_data_mpi
     #return outdata.filename
@@ -306,7 +308,7 @@ class IPEngine(object):
         import ipyparallel
         self.cluster = ipyparallel.Client()#profile='mpi')
         time.sleep(3)
-        while(len(self.cluster.ids) != n_engines):
+        while len(self.cluster.ids) != n_engines:
             time.sleep(0.5)
             self.logger.info("Waiting for cluster...")
             self.cluster = ipyparallel.Client()#profile='mpi')
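For context, the polling idiom in this hunk (unchanged apart from dropping the parenthesized condition) reduces to the following self-contained sketch; `n_engines` is an assumed value:

```python
# Minimal sketch: block until the expected number of ipyparallel engines
# have registered with the client, reconnecting between polls.
import time
import ipyparallel

n_engines = 4  # assumed for illustration
cluster = ipyparallel.Client()
while len(cluster.ids) != n_engines:
    time.sleep(0.5)
    cluster = ipyparallel.Client()
```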
@@ -433,58 +435,58 @@ class DataDumper(object):
 
 
 
-class ProgressPrinter(object):
-    """
-    Small helper class for
-    """
-    def __init__(self, total_steps, print_every=5):
-        self.logger = logging.getLogger(__name__)
-        self.start = time.time()
-        self.total_steps = total_steps
-        self.print_every = print_every
-        self.next_print_time = self.print_every
-        self.last_step = 0
-        self.secs_per_iter = None
+# class ProgressPrinter(object):
+#     """
+#     Small helper class for
+#     """
+#     def __init__(self, total_steps, print_every=5):
+#         self.logger = logging.getLogger(__name__)
+#         self.start = time.time()
+#         self.total_steps = total_steps
+#         self.print_every = print_every
+#         self.next_print_time = self.print_every
+#         self.last_step = 0
+#         self.secs_per_iter = None
 
-    def getPrintString(self, step):
-        elapsed = time.time() - self.start
-        if (elapsed > self.next_print_time):
-            dt = elapsed - (self.next_print_time - self.print_every)
-            dsteps = step - self.last_step
-            steps_remaining = self.total_steps - step
+#     def getPrintString(self, step):
+#         elapsed = time.time() - self.start
+#         if (elapsed > self.next_print_time):
+#             dt = elapsed - (self.next_print_time - self.print_every)
+#             dsteps = step - self.last_step
+#             steps_remaining = self.total_steps - step
 
-            if (dsteps == 0):
-                return
+#             if (dsteps == 0):
+#                 return
 
-            self.last_step = step
-            self.next_print_time = elapsed + self.print_every
+#             self.last_step = step
+#             self.next_print_time = elapsed + self.print_every
 
-            if not self.secs_per_iter:
-                self.secs_per_iter = dt / dsteps
-            self.secs_per_iter = 0.2*self.secs_per_iter + 0.8*(dt / dsteps)
+#             if not self.secs_per_iter:
+#                 self.secs_per_iter = dt / dsteps
+#             self.secs_per_iter = 0.2*self.secs_per_iter + 0.8*(dt / dsteps)
 
-            remaining_time = steps_remaining * self.secs_per_iter
+#             remaining_time = steps_remaining * self.secs_per_iter
 
-            return "{:s}. Total: {:s}, elapsed: {:s}, remaining: {:s}".format(
-                ProgressPrinter.progressBar(step, self.total_steps),
-                ProgressPrinter.timeString(elapsed + remaining_time),
-                ProgressPrinter.timeString(elapsed),
-                ProgressPrinter.timeString(remaining_time))
+#             return "{:s}. Total: {:s}, elapsed: {:s}, remaining: {:s}".format(
+#                 ProgressPrinter.progressBar(step, self.total_steps),
+#                 ProgressPrinter.timeString(elapsed + remaining_time),
+#                 ProgressPrinter.timeString(elapsed),
+#                 ProgressPrinter.timeString(remaining_time))
 
-    def timeString(seconds):
-        seconds = int(max(seconds, 1))
-        minutes, seconds = divmod(seconds, 60)
-        hours, minutes = divmod(minutes, 60)
-        periods = [('h', hours), ('m', minutes), ('s', seconds)]
-        time_string = ' '.join('{}{}'.format(value, name)
-                               for name, value in periods
-                               if value)
-        return time_string
+#     def timeString(seconds):
+#         seconds = int(max(seconds, 1))
+#         minutes, seconds = divmod(seconds, 60)
+#         hours, minutes = divmod(minutes, 60)
+#         periods = [('h', hours), ('m', minutes), ('s', seconds)]
+#         time_string = ' '.join('{}{}'.format(value, name)
+#                                for name, value in periods
+#                                if value)
+#         return time_string
 
-    def progressBar(step, total_steps, width=30):
-        progress = np.round(width * step / total_steps).astype(np.int32)
-        progressbar = "0% [" + "#"*(progress) + "="*(width-progress) + "] 100%"
-        return progressbar
+#     def progressBar(step, total_steps, width=30):
+#         progress = np.round(width * step / total_steps).astype(np.int32)
+#         progressbar = "0% [" + "#"*(progress) + "="*(width-progress) + "] 100%"
+#         return progressbar
 
 
 """
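Commenting this class out is reasonable because tqdm already maintains the same bookkeeping, elapsed time, a smoothed rate (the 0.2/0.8 exponential moving average above), and a remaining-time estimate, that getPrintString computed by hand. A minimal demo, independent of the repo:

```python
# tqdm reproduces ProgressPrinter's elapsed/remaining/bar output for free.
import time
from tqdm import tqdm

for _ in tqdm(range(100), desc="Simulation"):
    time.sleep(0.01)  # stand-in for one simulator step
```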
@@ -25,6 +25,7 @@ import numpy as np
 import math
 import logging
 from enum import IntEnum
+from tqdm import tqdm
 
 #import pycuda.compiler as cuda_compiler
 #import pycuda.gpuarray
@@ -156,7 +157,7 @@ class BaseSimulator(object):
         self.num_substeps = num_substeps
 
         #Handle autotuning block size
-        if (self.context.autotuner):
+        if self.context.autotuner:
             peak_configuration = self.context.autotuner.get_peak_performance(self.__class__)
             block_width = int(peak_configuration["block_width"])
             block_height = int(peak_configuration["block_height"])
@@ -195,42 +196,45 @@ class BaseSimulator(object):
         Requires that the step() function is implemented in the subclasses
         """
 
-        printer = Common.ProgressPrinter(t)
+        # printer = Common.ProgressPrinter(t)
 
         t_start = self.simTime()
         t_end = t_start + t
 
         update_dt = True
-        if (dt is not None):
+        if dt is not None:
             update_dt = False
             self.dt = dt
 
-        while(self.simTime() < t_end):
+        for _ in tqdm(range(math.ceil((t_end - t_start) / self.dt)), desc="Simulation"):
             # Update dt every 100 timesteps and cross your fingers it works
             # for the next 100
-            if (update_dt and (self.simSteps() % 100 == 0)):
+            # TODO this is probably broken now after fixing the "infinite" loop
+            if update_dt and (self.simSteps() % 100 == 0):
                 self.dt = self.computeDt()*self.cfl_scale
 
             # Compute timestep for "this" iteration (i.e., shorten last timestep)
             current_dt = np.float32(min(self.dt, t_end-self.simTime()))
 
             # Stop if end reached (should not happen)
-            if (current_dt <= 0.0):
+            if current_dt <= 0.0:
                 self.logger.warning("Timestep size {:d} is less than or equal to zero!".format(self.simSteps()))
                 break
 
             # Step forward in time
             self.step(current_dt)
 
             #Print info
-            print_string = printer.getPrintString(self.simTime() - t_start)
-            if (print_string):
-                self.logger.info("%s: %s", self, print_string)
-                try:
-                    self.check()
-                except AssertionError as e:
-                    e.args += ("Step={:d}, time={:f}".format(self.simSteps(), self.simTime()),)
-                    raise
+            # print_string = printer.getPrintString(self.simTime() - t_start)
+            # if (print_string):
+            #     self.logger.info("%s: %s", self, print_string)
+            #     try:
+            #         self.check()
+            #     except AssertionError as e:
+            #         e.args += ("Step={:d}, time={:f}".format(self.simSteps(), self.simTime()),)
+            #         raise
 
+        print("Done")
 
 
     def step(self, dt):
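The TODO above is worth spelling out: the new loop freezes its iteration count from the initial `self.dt`, while the body may still change `self.dt` every 100 steps. A sketch with assumed numbers (not repo code) shows how the planned count and the simulated interval can drift apart:

```python
# Assumed values: if dt shrinks after the iteration count is fixed,
# the loop covers less than [t_start, t_end].
import math

t_start, t_end = 0.0, 1.0
dt = 0.01
n_iters = math.ceil((t_end - t_start) / dt)  # 100 iterations planned
dt = 0.005                                   # CFL update halves dt later on
print(n_iters * dt)                          # 0.5 -- only half the interval is reached
```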
GPUSimulators/cuda_cache/.gitkeep (Normal file, 0 lines)
Binary file not shown.
Jobs/job_apptainer_lumi.slurm (Normal file, 26 lines)
@@ -0,0 +1,26 @@
+#!/bin/bash -l
+#SBATCH --job-name=lumi
+#SBATCH --account=project_4650000xx
+#SBATCH --time=00:10:00
+#SBATCH --partition=dev-g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=8
+#SBATCH --gpus-per-node=8
+#SBATCH --output=%x-%j.out
+#SBATCH --exclusive
+
+N=$SLURM_JOB_NUM_NODES
+echo "--nbr of nodes:", $N
+echo "--total nbr of gpus:", $SLURM_NTASKS
+
+MyDir=/project/project_4650000xx
+MyApplication=${MyDir}/FiniteVolumeGPU_HIP/mpiTesting.py
+Container=${MyDir}/FiniteVolumeGPU_HIP/my_container.sif
+
+CPU_BIND="map_cpu:49,57,17,25,1,9,33,41"
+
+export MPICH_GPU_SUPPORT_ENABLED=1
+
+srun --cpu-bind=${CPU_BIND} --mpi=pmi2 \
+    apptainer exec "${Container}" \
+    python ${MyApplication} -nx 1024 -ny 1024 --profile
@@ -1,39 +0,0 @@
-#!/bin/bash -e
-#SBATCH --job-name=lumi
-#SBATCH --account=project_4650000xx
-#SBATCH --time=00:10:00
-#SBATCH --partition=dev-g
-#SBATCH --nodes=1
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus=8
-#SBATCH --gpus-per-node=8
-#SBATCH -o %x-%j.out
-#SBATCH --exclusive
-#
-
-N=$SLURM_JOB_NUM_NODES
-echo "--nbr of nodes:", $N
-echo "--total nbr of gpus:", $SLURM_NTASKS
-
-Mydir=/project/project_4650000xx
-Myapplication=${Mydir}/FiniteVolumeGPU_hip/mpiTesting.py
-
-#modules
-ml LUMI/23.03 partition/G
-ml lumi-container-wrapper
-ml cray-python/3.9.13.1
-ml rocm/5.2.3
-
-ml craype-accel-amd-gfx90a
-ml cray-mpich/8.1.27
-
-export PATH="/project/project_4650000xx/FiniteVolumeGPU_hip/MyCondaEnv/bin:$PATH"
-
-#missing library
-export LD_LIBRARY_PATH=/opt/cray/pe/mpich/8.1.27/ofi/cray/14.0/lib-abi-mpich:$LD_LIBRARY_PATH
-
-#Binding mask
-bind_mask="0x${fe}000000000000,0x${fe}00000000000000,0x${fe}0000,0x${fe}000000,0x${fe},0x${fe}00,0x${fe}00000000,0x${fe}0000000000"
-
-srun --cpu-bind=mask_cpu:$bind_mask \
-    python ${Myapplication} -nx 1024 -ny 1024 --profile
Jobs/job_lumi.slurm (Normal file, 27 lines)
@@ -0,0 +1,27 @@
+#!/bin/bash -l
+#SBATCH --job-name=lumi
+#SBATCH --account=project_4650000xx
+#SBATCH --time=00:10:00
+#SBATCH --partition=dev-g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=8
+#SBATCH --gpus-per-node=8
+#SBATCH --output=%x-%j.out
+#SBATCH --exclusive
+
+N=$SLURM_JOB_NUM_NODES
+echo "--nbr of nodes:", $N
+echo "--total nbr of gpus:", $SLURM_NTASKS
+
+MyDir=/project/project_4650000xx
+MyApplication=${MyDir}/FiniteVolumeGPU_HIP/mpiTesting.py
+CondaEnv=${MyDir}/FiniteVolumeGPU_HIP/MyCondaEnv/bin
+
+export PATH="${CondaEnv}:$PATH"
+
+CPU_BIND="map_cpu:49,57,17,25,1,9,33,41"
+
+export MPICH_GPU_SUPPORT_ENABLED=1
+
+srun --cpu-bind=${CPU_BIND} --mpi=pmi2 \
+    python ${MyApplication} -nx 1024 -ny 1024 --profile
README.md (59 lines)
@@ -1,46 +1,57 @@
 # FiniteVolumeGPU
 
-This is a HIP version of the [FiniteVolume code](https://github.com/babrodtk/FiniteVolumeGPU) (work in progress). It is a Python software package that implements several finite volume discretizations on Cartesian grids for the shallow water equations and the Euler equations.
+This is a HIP version of the [FiniteVolume code](https://github.com/babrodtk/FiniteVolumeGPU). It is a Python software package that implements several finite volume discretizations on Cartesian grids for the shallow water equations and the Euler equations.
 
 ## Setup on LUMI-G
 Here is a step-by-step guide on installing packages on LUMI-G
 
-### Step 0: load modules
-```
-ml LUMI/23.03 partition/G
-ml lumi-container-wrapper
-ml cray-python/3.9.13.1
-```
-
 ### Step 1: run conda-container
 Installation via conda can be done as:
+```shell
+ml LUMI/24.03 partition/G
+ml lumi-container-wrapper
 ```
+```shell
 conda-containerize new --prefix MyCondaEnv conda_environment_lumi.yml
 ```
 where the file `conda_environment_lumi.yml` contains packages to be installed.
 
-### Step 2: Set the env. variable to search for binaries
-```
-export the bin path: export PATH="$PWD/MyCondaEnv/bin:$PATH"
-```
-### An alternative: Convert to a singularity container with cotainr
-```
+### Step 1 alternative: Convert to a singularity container with cotainr
+Load the required modules first
+```shell
+ml CrayEnv
+ml cotainr
+```
+
+Then build the Singularity/Apptainer container
+```shell
 cotainr build my_container.sif --system=lumi-g --conda-env=conda_environment_lumi.yml
 ```
 
-### Error when running MPI.
+### Step 2: Modify Slurm Job file
+Depending on your build method, update [`Jobs/job_lumi.slurm`](Jobs/job_lumi.slurm) if `conda-containerize` was used, or [`Jobs/job_apptainer_lumi.slurm`](Jobs/job_apptainer_lumi.slurm) if `cotainr` was used.
+
+In the job file, the required changes are to match your project allocation
+and the directories where the simulator and container are stored.
+
+### Step 3: Run the Slurm Job
+If `conda-containerize` was used for building:
+```shell
+sbatch Jobs/job_lumi.slurm
+```
+
+Otherwise, if `cotainr` was used for building:
+```shell
+sbatch Jobs/job_apptainer_lumi.slurm
+```
+
+### Troubleshooting
+
+#### Error when running MPI.
 ```
 `MPI startup(): PMI server not found. Please set I_MPI_PMI_LIBRARY variable if it is not a singleton case.
 ```
 This can be resolved by exporting this:
 ```
-export I_MPI_PMI_LIBRARY=/opt/cray/pe/mpich/8.1.27/ofi/cray/14.0/lib/libmpi.so
+export I_MPI_PMI_LIBRARY=/opt/cray/pe/mpich/8.1.29/ofi/cray/17.0/lib/libmpi.so
 ```
-### Install hip-python
-```
-python -m pip install -i https://test.pypi.org/simple/ hip-python==5.4.3.470.16
-```
-
-The testing was done with this specific version `hip-python==5.4.3.470.16`
@@ -5,15 +5,17 @@ channels:
   - conda-forge
 
 dependencies:
-  - python=3.9.13
+  - python=3.11.7
+  - pip
   - numpy
   - mpi4py
   - six
   - pytools
   - netcdf4
  - scipy
+  - tqdm
  - pip:
-    - hip-python==5.4.3.470.16
+    - hip-python==6.2.0.499.16
     - -i https://test.pypi.org/simple/
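With hip-python now pinned to 6.2.0.499.16 (and the README's separate pip-install note removed), a quick smoke test of the environment might look like this. This is a hedged sketch: it assumes the usual hip-python convention, also relied on by the `hip_check` helper referenced below, that each runtime call returns a tuple whose first element is a `hipError_t`:

```python
# Hedged smoke test for the pinned hip-python package (assumed API shape).
from hip import hip

err, count = hip.hipGetDeviceCount()
if err != hip.hipError_t.hipSuccess:
    raise RuntimeError(f"hipGetDeviceCount failed: {err}")
print(f"Found {count} HIP device(s)")
```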
@@ -70,7 +70,7 @@ def hip_check(call_result):
 
 args = parser.parse_args()
 
-if(args.profile):
+if args.profile:
     profiling_data = {}
     # profiling: total run time
     t_total_start = time.time()
@@ -79,6 +79,8 @@ if(args.profile):
 
 # Get MPI COMM to use
 comm = MPI.COMM_WORLD
+size = comm.Get_size()
+rank = comm.Get_rank()
 
 
 ####
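Caching `size` and `rank` once, as added above, avoids re-querying `MPI.COMM_WORLD` at every use site; the later hunks then substitute the cached names. The underlying mpi4py pattern, as a self-contained example:

```python
# Minimal mpi4py pattern: query communicator size/rank once, reuse everywhere.
# Run with e.g.: mpirun -n 4 python hello_mpi.py
from mpi4py import MPI

comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()
print(f"Hello from rank {rank} of {size}")
```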
@@ -86,7 +88,7 @@ comm = MPI.COMM_WORLD
 ####
 log_level_console = 20
 log_level_file = 10
-log_filename = 'mpi_' + str(comm.rank) + '.log'
+log_filename = 'mpi_' + str(rank) + '.log'
 logger = logging.getLogger('GPUSimulators')
 logger.setLevel(min(log_level_console, log_level_file))
 
@@ -110,7 +112,7 @@ logger.info("File logger using level %s to %s",
 # Initialize MPI grid etc
 ####
 logger.info("Creating MPI grid")
-grid = MPISimulator.MPIGrid(MPI.COMM_WORLD)
+grid = MPISimulator.MPIGrid(comm)
 
 """
 job_id = int(os.environ["SLURM_JOB_ID"])
@@ -152,7 +154,7 @@ gamma = 1.4
 #save_times = np.linspace(0, 0.000099, 11)
 #save_times = np.linspace(0, 0.000099, 2)
 save_times = np.linspace(0, 0.0000999, 2)
-outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc"
+outfile = "mpi_out_" + str(rank) + ".nc"
 save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
 
 arguments = IC.genKelvinHelmholtz(nx, ny, gamma, grid=grid)
@@ -160,7 +162,7 @@ arguments['context'] = cuda_context
 arguments['theta'] = 1.2
 arguments['grid'] = grid
 
-if(args.profile):
+if args.profile:
     t_init_end = time.time()
     t_init = t_init_end - t_init_start
     profiling_data["t_init"] = t_init
@@ -178,17 +180,17 @@ def genSim(grid, **kwargs):
     return sim
 
 
-outfile, sim_runner_profiling_data, sim_profiling_data = Common.runSimulation(
+(outfile, sim_runner_profiling_data, sim_profiling_data) = Common.runSimulation(
     genSim, arguments, outfile, save_times, save_var_names, dt)
 
-if(args.profile):
+if args.profile:
     t_total_end = time.time()
     t_total = t_total_end - t_total_start
     profiling_data["t_total"] = t_total
-    print("Total run time on rank " + str(MPI.COMM_WORLD.rank) + " is " + str(t_total) + " s")
+    print("Total run time on rank " + str(rank) + " is " + str(t_total) + " s")
 
 # write profiling to json file
-if(args.profile and MPI.COMM_WORLD.rank == 0):
+if args.profile and rank == 0:
     job_id = ""
     if "SLURM_JOB_ID" in os.environ:
         job_id = int(os.environ["SLURM_JOB_ID"])
@@ -199,7 +201,7 @@ if(args.profile and MPI.COMM_WORLD.rank == 0):
         str(job_id) + "_" + str(allocated_nodes) + "_nodes_and_" + str(allocated_gpus) + "_GPUs_profiling.json"
     profiling_data["outfile"] = outfile
 else:
-    profiling_file = "MPI_" + str(MPI.COMM_WORLD.size) + "_procs_and_" + str(num_cuda_devices) + "_GPUs_profiling.json"
+    profiling_file = "MPI_" + str(size) + "_procs_and_" + str(num_cuda_devices) + "_GPUs_profiling.json"
 
 for stage in sim_runner_profiling_data["start"].keys():
     profiling_data[stage] = sim_runner_profiling_data["end"][stage] - sim_runner_profiling_data["start"][stage]
@@ -214,7 +216,7 @@ if(args.profile and MPI.COMM_WORLD.rank == 0):
 
 profiling_data["slurm_job_id"] = job_id
 profiling_data["n_cuda_devices"] = str(num_cuda_devices)
-profiling_data["n_processes"] = str(MPI.COMM_WORLD.size)
+profiling_data["n_processes"] = str(size)
 profiling_data["git_hash"] = Common.getGitHash()
 profiling_data["git_status"] = Common.getGitStatus()
 
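The profiling hunks above assemble a `profiling_data` dict and, per the "# write profiling to json file" comment, serialize it on rank 0 using the `json` import added earlier. A minimal sketch of that final step, with illustrative values and an assumed file name:

```python
# Sketch of the profiling dump (values and file name are illustrative).
import json

profiling_data = {"t_total": 12.3, "n_processes": "8"}
with open("MPI_8_procs_profiling.json", "w") as f:
    json.dump(profiling_data, f)
```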