Compare commits

...

4 Commits

| Author | SHA1 | Message | Date |
|---|---|---|---|
| Anthony Berg | db9d735a82 | docs: remove redundant separate link to file | 2025-03-30 21:14:49 +02:00 |
| Anthony Berg | 99520d1503 | docs: update instructions to reflect new Slurm job file | 2025-03-30 21:13:02 +02:00 |
| Anthony Berg | cf102131df | fix: use MPI in slurm job | 2025-03-30 20:44:40 +02:00 |
| Anthony Berg | 28a96382ff | fix: add cuda_cache dir to prevent parallel nodes from hanging up | 2025-03-30 20:28:56 +02:00 |
5 changed files with 46 additions and 69 deletions

.gitignore

@@ -186,7 +186,6 @@ cython_debug/
.pypirc
# CUDA
cuda_cache/
# Taken from: https://github.com/github/gitignore/blob/main/CUDA.gitignore
*.i

(deleted file: original Slurm job script)

@@ -1,39 +0,0 @@
#!/bin/bash -e
#SBATCH --job-name=lumi
#SBATCH --account=project_4650000xx
#SBATCH --time=00:10:00
#SBATCH --partition=dev-g
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --gpus=8
#SBATCH --gpus-per-node=8
#SBATCH -o %x-%j.out
#SBATCH --exclusive
#
N=$SLURM_JOB_NUM_NODES
echo "--nbr of nodes:", $N
echo "--total nbr of gpus:", $SLURM_NTASKS
Mydir=/project/project_4650000xx
Myapplication=${Mydir}/FiniteVolumeGPU_hip/mpiTesting.py
#modules
ml LUMI/24.03 partition/G
ml lumi-container-wrapper
ml cray-python/3.11.7
ml rocm/6.2.2
ml craype-accel-amd-gfx90a
ml cray-mpich/8.1.29
export PATH="/project/project_4650000xx/FiniteVolumeGPU_hip/MyCondaEnv/bin:$PATH"
#missing library
export LD_LIBRARY_PATH=/opt/cray/pe/mpich/8.1.29/ofi/cray/17.0/lib-abi-mpich:$LD_LIBRARY_PATH
#Binding mask
bind_mask="0x${fe}000000000000,0x${fe}00000000000000,0x${fe}0000,0x${fe}000000,0x${fe},0x${fe}00,0x${fe}00000000,0x${fe}0000000000"
srun --cpu-bind=mask_cpu:$bind_mask \
python ${Myapplication} -nx 1024 -ny 1024 --profile

Jobs/job_lumi.slurm (new file)

@@ -0,0 +1,27 @@
#!/bin/bash -l
#SBATCH --job-name=lumi
#SBATCH --account=project_4650000xx
#SBATCH --time=00:10:00
#SBATCH --partition=dev-g
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --gpus-per-node=8
#SBATCH --output=%x-%j.out
#SBATCH --exclusive
N=$SLURM_JOB_NUM_NODES
echo "--nbr of nodes:", $N
echo "--total nbr of gpus:", $SLURM_NTASKS
Mydir=/project/${project}
Myapplication=${Mydir}/FiniteVolumeGPU_HIP/mpiTesting.py
CondaEnv=${Mydir}/FiniteVolumeGPU_HIP/MyCondaEnv/bin
export PATH="${CondaEnv}:$PATH"
CPU_BIND="map_cpu:49,57,17,25,1,9,33,41"
export MPICH_GPU_SUPPORT_ENABLED=1
srun --cpu-bind=${CPU_BIND} --mpi=pmi2 \
python ${Myapplication} -nx 1024 -ny 1024 --profile

README.md

@@ -5,48 +5,38 @@ This is a HIP version of the [FiniteVolume code](https://github.com/babrodtk/Fin
## Setup on LUMI-G
Here is a step-by-step guide to installing the required packages on LUMI-G.
### Step 1: Install rocm-5.4.6 with Easybuild
```
export EBU_USER_PREFIX=/project/project_xxxxxx/EasyBuild
ml LUMI/24.03 partition/G
ml EasyBuild-user
export PYTHONIOENCODING=utf-8
eb rocm-5.4.6.eb -r
```
### Step 2: run conda-container
### Step 1: run conda-container
Installation via conda can be done as:
```
```shell
ml LUMI/24.03 partition/G
ml lumi-container-wrapper/0.3.3-cray-python-3.11.7
```
ml lumi-container-wrapper
```
```shell
conda-containerize new --prefix MyCondaEnv conda_environment_lumi.yml
```
where the file `conda_environment_lumi.yml` contains packages to be installed.
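For illustration, a minimal environment file of this kind might look as follows; the package list here is an assumption for illustration only, and the repository ships its own `conda_environment_lumi.yml`:
```shell
# Create a hypothetical minimal environment file (illustration only;
# the real conda_environment_lumi.yml defines the actual packages)
cat > conda_environment_example.yml << 'EOF'
name: MyCondaEnv
channels:
  - conda-forge
dependencies:
  - python=3.11
  - numpy
  - mpi4py
EOF
```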
### Step 3: Set the env. variable to search for binaries
```
# Export the bin path:
export PATH="$PWD/MyCondaEnv/bin:$PATH"
```
### An alternative: Convert to a singularity container with cotainr
### Step 1 alternative: Convert to a singularity container with cotainr
```
cotainr build my_container.sif --system=lumi-g --conda-env=conda_environment_lumi.yml
```
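A rough sketch of how such a container could then be used in place of the Conda wrapper environment; the `--rocm` flag and the `srun` invocation are assumptions and may need adjusting for LUMI:
```shell
# Run the application from inside the cotainr-built image (illustrative only)
srun singularity exec --rocm my_container.sif \
    python mpiTesting.py -nx 1024 -ny 1024 --profile
```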
### Error when running MPI.
### Step 2: Modify Slurm Job file
Update the contents of [`Jobs/job_lumi.slurm`](Jobs/job_lumi.slurm) to match your project allocation
and the directories where the simulator and the Conda container are stored.
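The lines that usually need editing are the account and the directory variables; a sketch based on the job file above, where the project ID and paths are placeholders for your own allocation:
```shell
#SBATCH --account=project_4650000xx              # replace with your LUMI project allocation
Mydir=/project/project_4650000xx                 # directory where the simulator is cloned
Myapplication=${Mydir}/FiniteVolumeGPU_HIP/mpiTesting.py
CondaEnv=${Mydir}/FiniteVolumeGPU_HIP/MyCondaEnv/bin   # Conda container created in Step 1
```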
### Step 3: Run the Slurm Job
```shell
sbatch Jobs/job_lumi.slurm
```
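After submission, the job can be monitored with standard Slurm tools; given `--job-name=lumi` and `--output=%x-%j.out` in the job file, the output lands in `lumi-<jobid>.out`:
```shell
# List your queued and running jobs
squeue -u $USER
# Inspect the job output once it has started or finished
cat lumi-*.out
```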
### Troubleshooting
#### Error when running MPI.
```
MPI startup(): PMI server not found. Please set I_MPI_PMI_LIBRARY variable if it is not a singleton case.
```
This can be resolved by exporting the path to the MPI library:
```
export I_MPI_PMI_LIBRARY=/opt/cray/pe/mpich/8.1.27/ofi/cray/14.0/lib/libmpi.so
export I_MPI_PMI_LIBRARY=/opt/cray/pe/mpich/8.1.29/ofi/cray/17.0/lib/libmpi.so
```
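If the error persists, it is worth confirming that the library actually exists for the cray-mpich version installed on the system; the path below is the one used above and may differ between LUMI software stacks:
```shell
# Verify the MPICH library path before exporting it
ls -l /opt/cray/pe/mpich/8.1.29/ofi/cray/17.0/lib/libmpi.so
```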
### Install hip-python
```
python -m pip install -i https://test.pypi.org/simple/ hip-python==5.4.3.470.16
```
Testing was done with this specific version: `hip-python==5.4.3.470.16`.
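A quick way to confirm the package is usable from the environment is a bare import; the module layout (`from hip import hip`) is taken from the HIP Python documentation and is assumed here:
```shell
# Sanity check that hip-python can be imported (run inside the Conda container environment)
python -c "from hip import hip; print('hip-python import OK')"
```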