Merge pull request #9 from setmar/master

Conda env and job scripts for DGX-2 (Simula) and Saga (Sigma2)
Commit 402e2c6f9f by André R. Brodtkorb, 2020-10-30 09:45:00 +01:00 (committed by GitHub)
3 changed files with 106 additions and 0 deletions

conda_environment_hpc.yml (new file, +22 lines)

@@ -0,0 +1,22 @@
# Assumes that conda, pip, build-essential and cuda are installed
---
name: ShallowWaterGPU_HPC
channels:
- conda-forge
dependencies:
- python=3.7
- numpy
- mpi4py
- six
- pytools
- netcdf4
- scipy
# Install conda environment (one-time operation):
# $ conda env create -f conda_environment_hpc.yml
# Activate environment and install the following packages using pip:
#   $ conda activate ShallowWaterGPU_HPC
# - pycuda: $ pip3 install --no-deps -U pycuda
#   on Windows: make sure the Visual Studio C++ compiler is available in PATH,
#   i.e. PATH should contain something like C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\
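
As a one-shot sanity check after the setup above, a session along these lines (hypothetical; assumes conda is on PATH and a CUDA-capable GPU is visible) confirms that the environment resolves and that pycuda can initialize CUDA:

$ conda env create -f conda_environment_hpc.yml
$ conda activate ShallowWaterGPU_HPC
$ pip3 install --no-deps -U pycuda
$ python3 -c "import pycuda.driver as cuda; cuda.init(); print(cuda.Device.count(), 'CUDA device(s) found')"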

dgx-2-test.job (new file, +34 lines)

@@ -0,0 +1,34 @@
#!/bin/bash
#SBATCH -p dgx2q # partition (GPU queue)
#SBATCH -N 1 # number of nodes
#SBATCH -n 4 # number of tasks (MPI ranks)
#SBATCH -w g001 # DGX-2 node
#SBATCH --gres=gpu:4 # number of V100s
#SBATCH --mem 10G # memory pool for all cores
#SBATCH -t 0-00:10 # time (D-HH:MM)
#SBATCH -o slurm.%N.%j.out # STDOUT
#SBATCH -e slurm.%N.%j.err # STDERR
ulimit -s 10240
module load slurm
module load openmpi/4.0.1
module load cuda10.1/toolkit/10.1.243
# Check how many GPUs your job got (uncomment to run):
#nvidia-smi
## Copy input files to the work directory:
mkdir -p /work/$USER/ShallowWaterGPU
cp -r . /work/$USER/ShallowWaterGPU
# Run job
# (Assumes Miniconda is installed in the user's home directory.)
cd /work/$USER/ShallowWaterGPU
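# The --mca flags below restrict the openib transport to the mlx5_0
# InfiniBand interface and silence the 'no device params found' warning.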
mpirun --mca btl_openib_if_include mlx5_0 --mca btl_openib_warn_no_device_params_found 0 $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py
cd $HOME/src/ShallowWaterGPU
## Copy result files back from the work directory:
# (NOTE: copying is not performed if the job fails!)
cp /work/$USER/ShallowWaterGPU/*.log .
cp /work/$USER/ShallowWaterGPU/*.nc .
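
For reference, a typical submit-and-watch session for this script looks something like the following (standard SLURM commands; the output file names follow the #SBATCH -o/-e patterns above, with <jobid> filled in by SLURM):

$ sbatch dgx-2-test.job            # submit to the dgx2q partition
$ squeue -u $USER                  # check queue status
$ tail -f slurm.g001.<jobid>.out   # follow STDOUT ('%N' is the node, '%j' the job id)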

saga-test.job (new file, +50 lines)

@@ -0,0 +1,50 @@
#!/bin/bash
# Job name:
#SBATCH --job-name=saga-test
#
# Project:
#SBATCH --account=nn9550k
#
# Wall clock limit:
#SBATCH --time=00:10:00
#
# Ask for 1 GPU (max is 2)
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
#SBATCH --partition=accel --gres=gpu:1
#
# Max memory usage per task (core) - increasing this will cost more core hours:
#SBATCH --mem-per-cpu=4G
#
# Number of tasks:
#SBATCH --nodes=2 --ntasks-per-node=1
## Set up the job environment (this is done automatically behind the scenes on Saga,
## so keep the following 'source' line commented out or remove it):
# source /cluster/bin/jobsetup
module restore system # instead of 'module purge', restore the module environment to the system default
module load CUDA/10.2.89
# It is also recommended to list loaded modules, for easier debugging:
module list
set -o errexit # exit on errors
set -o nounset # treat unset variables as errors (makes issues in the batch script easier to discover)
## Copy input files to the work directory:
mkdir $SCRATCH/ShallowWaterGPU
cp -r . $SCRATCH/ShallowWaterGPU
## Make sure the results are copied back to the submit directory:
# (On older clusters this was 'chkfile MyResultFile'; chkfile is replaced by 'savefile' on Saga.)
savefile "$SCRATCH/ShallowWaterGPU/*.log"
savefile "$SCRATCH/ShallowWaterGPU/*.nc"
## Do some work:
cd $SCRATCH/ShallowWaterGPU
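# First print the interpreter version as a sanity check, then run the actual test.
# With --nodes=2 --ntasks-per-node=1 above, each srun starts one rank per node (2 ranks total).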
srun /cluster/home/$USER/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
srun /cluster/home/$USER/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py
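
To see which GPU(s) each rank is handed (per the CUDA_VISIBLE_DEVICES note above), a throwaway diagnostic line like the following could be added before the Python runs; this is a debugging sketch, not part of the original script:

srun bash -c 'echo "rank $SLURM_PROCID on $(hostname): CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-unset}"'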