Merge pull request #9 from setmar/master

Conda env and job scripts for DGX-2 (Simula) and Saga (Sigma2)
Commit 402e2c6f9f by André R. Brodtkorb, 2020-10-30 09:45:00 +01:00 (committed by GitHub)
3 changed files with 106 additions and 0 deletions

conda_environment_hpc.yml (new file, +22 lines)

@@ -0,0 +1,22 @@
# Assumes that conda, pip, build-essential and cuda are installed
---
name: ShallowWaterGPU_HPC
channels:
- conda-forge
dependencies:
- python=3.7
- numpy
- mpi4py
- six
- pytools
- netcdf4
- scipy
# Install conda environment (one-time operation):
# $ conda env create -f conda_environment_hpc.yml
# Activate environment and install the following packages using pip:
#   $ conda activate ShallowWaterGPU_HPC
# - pycuda: $ pip3 install --no-deps -U pycuda
#   on Windows: make sure the Visual Studio C++ compiler is available in PATH,
#   i.e. PATH should contain something like C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\
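
As a one-shot sanity check after the setup above, a session along these lines (hypothetical; assumes conda is on PATH and a CUDA-capable GPU is visible) confirms that the environment resolves and that pycuda can initialize CUDA:

$ conda env create -f conda_environment_hpc.yml
$ conda activate ShallowWaterGPU_HPC
$ pip3 install --no-deps -U pycuda
$ python3 -c "import pycuda.driver as cuda; cuda.init(); print(cuda.Device.count(), 'CUDA device(s) found')"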

dgx-2-test.job (new file, +34 lines)

@@ -0,0 +1,34 @@
#!/bin/bash
#SBATCH -p dgx2q # partition (GPU queue)
#SBATCH -N 1 # number of nodes
#SBATCH -n 4 # number of tasks (MPI ranks)
#SBATCH -w g001 # DGX-2 node
#SBATCH --gres=gpu:4 # number of V100s
#SBATCH --mem 10G # memory pool for all cores
#SBATCH -t 0-00:10 # time (D-HH:MM)
#SBATCH -o slurm.%N.%j.out # STDOUT
#SBATCH -e slurm.%N.%j.err # STDERR
ulimit -s 10240
module load slurm
module load openmpi/4.0.1
module load cuda10.1/toolkit/10.1.243
# Check how many GPUs your job got (uncomment to run):
#nvidia-smi
## Copy input files to the work directory:
mkdir -p /work/$USER/ShallowWaterGPU
cp -r . /work/$USER/ShallowWaterGPU
# Run job
# (Assumes Miniconda is installed in the user's home directory.)
cd /work/$USER/ShallowWaterGPU
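# The --mca flags below restrict the openib transport to the mlx5_0
# InfiniBand interface and silence the 'no device params found' warning.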
mpirun --mca btl_openib_if_include mlx5_0 --mca btl_openib_warn_no_device_params_found 0 $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py
cd $HOME/src/ShallowWaterGPU
## Copy result files back from the work directory:
# (NOTE: copying is not performed if the job fails!)
cp /work/$USER/ShallowWaterGPU/*.log .
cp /work/$USER/ShallowWaterGPU/*.nc .
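
For reference, a typical submit-and-watch session for this script looks something like the following (standard SLURM commands; the output file names follow the #SBATCH -o/-e patterns above, with <jobid> filled in by SLURM):

$ sbatch dgx-2-test.job            # submit to the dgx2q partition
$ squeue -u $USER                  # check queue status
$ tail -f slurm.g001.<jobid>.out   # follow STDOUT ('%N' is the node, '%j' the job id)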

saga-test.job (new file, +50 lines)

@@ -0,0 +1,50 @@
#!/bin/bash
# Job name:
#SBATCH --job-name=saga-test
#
# Project:
#SBATCH --account=nn9550k
#
# Wall clock limit:
#SBATCH --time=00:10:00
#
# Ask for 1 GPU (max is 2)
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
#SBATCH --partition=accel --gres=gpu:1
#
# Max memory usage per task (core) - increasing this will cost more core hours:
#SBATCH --mem-per-cpu=4G
#
# Number of tasks:
#SBATCH --nodes=2 --ntasks-per-node=1
## Set up the job environment (this is done automatically behind the scenes on Saga,
## so keep the following 'source' line commented out or remove it):
# source /cluster/bin/jobsetup
module restore system # instead of 'module purge', restore the module environment to the system default
module load CUDA/10.2.89
# It is also recommended to list loaded modules, for easier debugging:
module list
set -o errexit # exit on errors
set -o nounset # treat unset variables as errors (makes issues in the batch script easier to discover)
## Copy input files to the work directory:
mkdir $SCRATCH/ShallowWaterGPU
cp -r . $SCRATCH/ShallowWaterGPU
## Make sure the results are copied back to the submit directory:
# (On older clusters this was 'chkfile MyResultFile'; chkfile is replaced by 'savefile' on Saga.)
savefile "$SCRATCH/ShallowWaterGPU/*.log"
savefile "$SCRATCH/ShallowWaterGPU/*.nc"
## Do some work:
cd $SCRATCH/ShallowWaterGPU
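# First print the interpreter version as a sanity check, then run the actual test.
# With --nodes=2 --ntasks-per-node=1 above, each srun starts one rank per node (2 ranks total).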
srun /cluster/home/$USER/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
srun /cluster/home/$USER/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py
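
To see which GPU(s) each rank is handed (per the CUDA_VISIBLE_DEVICES note above), a throwaway diagnostic line like the following could be added before the Python runs; this is a debugging sketch, not part of the original script:

srun bash -c 'echo "rank $SLURM_PROCID on $(hostname): CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-unset}"'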