Mirror of https://github.com/smyalygames/FiniteVolumeGPU.git (synced 2025-10-31 20:27:40 +01:00)

Start of HIPIFYcation

commit 19ea8319e8
parent e7cd6ae34a
@@ -1,29 +1,28 @@
# Assumes that conda, pip, build-essentials and cuda are installed
---
name: ShallowWaterGPU
name: FiniteVolumeGPU_HPC
channels:
- conda-forge

dependencies:
- python=3.9
- python=3.13.5
- pip
- numpy
- matplotlib
- jupyter
- mpi4py
- six
- pytools
- netcdf4
- scipy
- nb_conda_kernels
- nbdime
- mpi4py
- ffmpeg
- pycuda
- ipyparallel
- line_profiler
- tqdm
- pip:
  - hip-python==6.4.1.552.39
  - -i https://test.pypi.org/simple/


# Install conda environment (one-time operation):
# $ conda env create -f conda_environment.yml
# Activate environment
# $ conda activate ShallowWaterGPU
# $ conda activate FiniteVolumeGPU_HPC

# OPTIONAL: If you want to compile pycuda yourself, uncomment pycuda under
# "dependencies" above and do the following (one-time operation):
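For reference, the two pip entries above together mean "install hip-python 6.4.1.552.39 from the Test PyPI index". Outside of conda, a roughly equivalent one-liner (an illustration, not part of the commit) would be:

$ pip install -i https://test.pypi.org/simple/ hip-python==6.4.1.552.39

In practice --extra-index-url may be preferable to -i, so that any dependencies of hip-python still resolve from the default PyPI index.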
@@ -1,23 +0,0 @@
# Assumes that conda, pip, build-essentials and cuda are installed
---
name: ShallowWaterGPU_HPC
channels:
- conda-forge

dependencies:
- python=3.7
- numpy
- mpi4py
- six
- pytools
- netcdf4
- scipy
- tqdm

# Install conda environment (one-time operation):
# $ conda env create -f conda_environment_hpc.yml
# Activate environment and install the following packages using pip:
# $ conda activate ShallowWaterGPU_HPC
#  - pycuda: $ pip3 install --no-deps -U pycuda
# on Windows: make sure your visual studio c++ compiler is available in PATH
# PATH should have something like C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\
@@ -1,33 +0,0 @@
#!/bin/bash
#SBATCH -p dgx2q                   # partition (GPU queue)
#SBATCH -N 1                       # number of nodes
#SBATCH -n 1                       # number of cores
#SBATCH -w g001                    # DGX-2 node
#SBATCH --gres=gpu:1               # number of V100's
#SBATCH -t 0-00:10                 # time (D-HH:MM)
#SBATCH -o slurm.%N.%j.out  # STDOUT
#SBATCH -e slurm.%N.%j.err  # STDERR

ulimit -s 10240
module load slurm
module load cuda10.1/toolkit/10.1.243

# Check how many gpu's your job got
#nvidia-smi

## Copy input files to the work directory:
rm -rf /work/$USER/ShallowWaterGPU
mkdir -p /work/$USER/ShallowWaterGPU
cp -r . /work/$USER/ShallowWaterGPU

# Run job
# (Assumes Miniconda is installed in user root dir.)
cd /work/$USER/ShallowWaterGPU
nvprof -o profiler_output $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 shmemTesting.py
cd $HOME/src/ShallowWaterGPU

## Copy files from work directory:
# (NOTE: Copying is not performed if job fails!)
cp /work/$USER/ShallowWaterGPU/*.log .
cp /work/$USER/ShallowWaterGPU/*.nc .
cp /work/$USER/ShallowWaterGPU/profiler_output .
@@ -1,35 +0,0 @@
#!/bin/bash
# See http://wiki.ex3.simula.no before changing the values below
#SBATCH -p dgx2q                   # partition (GPU queue)
#SBATCH -N 1                       # number of nodes
#SBATCH -n 4                       # number of cores
#SBATCH -w g001                    # DGX-2 node
#SBATCH --gres=gpu:4               # number of V100's
#SBATCH -t 0-00:10                 # time (D-HH:MM)
#SBATCH -o slurm.%N.%j.out  # STDOUT
#SBATCH -e slurm.%N.%j.err  # STDERR

ulimit -s 10240
module load slurm
module load openmpi/4.0.1
module load cuda10.1/toolkit/10.1.243

# Check how many gpu's your job got
#nvidia-smi

## Copy input files to the work directory:
rm -rf /work/$USER/ShallowWaterGPU
mkdir -p /work/$USER/ShallowWaterGPU
cp -r . /work/$USER/ShallowWaterGPU

# Run job
# (Assumes Miniconda is installed in user root dir.)
cd /work/$USER/ShallowWaterGPU
mpirun --mca btl_openib_if_include mlx5_0 --mca btl_openib_warn_no_device_params_found 0 $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py
cd $HOME/src/ShallowWaterGPU

## Copy files from work directory:
# (NOTE: Copying is not performed if job fails!)
cp /work/$USER/ShallowWaterGPU/*.log .
cp /work/$USER/ShallowWaterGPU/*.nc .
@@ -1,59 +0,0 @@
#!/bin/bash
# See http://wiki.ex3.simula.no before changing the values below
#SBATCH -p dgx2q                   # partition (GPU queue)
#SBATCH -w g001                    # DGX-2 node
##SBATCH --gres=gpu:4               # number of V100's
#SBATCH -t 0-00:10                 # time (D-HH:MM)
#SBATCH -o slurm.%N.%j.out  # STDOUT
#SBATCH -e slurm.%N.%j.err  # STDERR
#SBATCH --reservation=martinls_17


# For Linux 64, Open MPI is built with CUDA awareness but this support is disabled by default.
# To enable it, please set the environment variable OMPI_MCA_opal_cuda_support=true before
# launching your MPI processes. Equivalently, you can set the MCA parameter in the command line:
# mpiexec --mca opal_cuda_support 1 ...
#
# In addition, the UCX support is also built but disabled by default.
# To enable it, first install UCX (conda install -c conda-forge ucx). Then, set the environment
# variables OMPI_MCA_pml="ucx" OMPI_MCA_osc="ucx" before launching your MPI processes.
# Equivalently, you can set the MCA parameters in the command line:
# mpiexec --mca pml ucx --mca osc ucx ...
# Note that you might also need to set UCX_MEMTYPE_CACHE=n for CUDA awareness via UCX.
# Please consult UCX's documentation for detail.

ulimit -s 10240
module load slurm/20.02.7
module load cuda11.2/toolkit/11.2.2
module load openmpi4-cuda11.2-ofed50-gcc8/4.1.0

# Check how many gpu's your job got
#nvidia-smi

mkdir -p output_dgx-2/$NOW

## Copy input files to the work directory:
mkdir -p /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
cp -r . /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU

# Run job
# (Assumes Miniconda is installed in user root dir.)
cd /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
#mpirun --mca btl_openib_if_include mlx5_0 --mca btl_openib_warn_no_device_params_found 0 $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
#nsys profile -t nvtx,cuda mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
#mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile

export OMPI_MCA_opal_cuda_support=true
mpirun -np $SLURM_NTASKS $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile

cd $HOME/src/ShallowWaterGPU

## Copy files from work directory:
# (NOTE: Copying is not performed if job fails!)
mkdir -p output_dgx-2/$NOW/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.log ./output_dgx-2/$NOW/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.nc ./output_dgx-2/$NOW/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.json ./output_dgx-2/$NOW
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.qdrep ./output_dgx-2/$NOW

rm -rf /work/$USER/$SLURM_JOB_ID
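The job above enables CUDA-aware Open MPI only through OMPI_MCA_opal_cuda_support. The UCX route described in the script's own comment block would look roughly like the following sketch (assuming UCX is actually installed in the ShallowWaterGPU_HPC environment):

export OMPI_MCA_pml=ucx OMPI_MCA_osc=ucx
export UCX_MEMTYPE_CACHE=n   # recommended in the comments for CUDA awareness via UCX
mpirun -np $SLURM_NTASKS $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile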
@@ -1,73 +0,0 @@
#!/bin/bash

TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")

# one node: 1-16 GPUs
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=4096,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=2731,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=2048,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=1638,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=1365,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=1170,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=1024,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#
#sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=8192,NY=910,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=8192,NY=819,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=8192,NY=745,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=8192,NY=683,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=8192,NY=630,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=8192,NY=585,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=8192,NY=546,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=8192,NY=512,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job

# one node: 4-16 GPUs
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=41984,NY=10496,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=41984,NY=8396,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=41984,NY=6997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=41984,NY=5997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=41984,NY=5248,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#
#sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=41984,NY=4664,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=41984,NY=4198,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=41984,NY=3816,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=41984,NY=3498,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=41984,NY=3229,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=41984,NY=2998,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=41984,NY=2798,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=41984,NY=2624,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job

# one node: 1-16 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=22528,NY=11264,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=22528,NY=7509,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=22528,NY=5632,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=22528,NY=4505,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=22528,NY=3754,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=22528,NY=3218,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=22528,NY=2816,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job

sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=22528,NY=2503,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=22528,NY=2252,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=22528,NY=2048,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=22528,NY=1877,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=22528,NY=1732,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=22528,NY=1609,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=22528,NY=1501,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=22528,NY=1408,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job

# one node: 4-16 GPUs
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=45056,NY=11264,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=45056,NY=8396,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=45056,NY=6997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=45056,NY=5997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=45056,NY=5248,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job

sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=45056,NY=4664,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=45056,NY=4198,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=45056,NY=3816,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=45056,NY=3498,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=45056,NY=3229,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=45056,NY=2998,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=45056,NY=2798,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=45056,NY=2624,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
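Every launch above follows the same pattern: NX stays fixed while NY is NX divided by the GPU count (integer division), so the NY values of the active 22528 series are exactly 22528/n. A hypothetical loop that would generate that series (a sketch, not part of the commit) is:

NX=22528
for n in $(seq 1 16); do
  NY=$((NX / n))   # integer division reproduces the NY values listed above
  sbatch --nodes=1 --gpus-per-node=$n --ntasks-per-node=$n \
         --export=ALL,NX=$NX,NY=$NY,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
done

Note that the final 45056-wide series reuses the NY values of the commented-out 41984-wide series for 5-16 GPUs (for example 45056/5 is 9011, not 8396), which looks like a copy-paste slip in the original script.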
@@ -1,41 +0,0 @@
#!/bin/bash

TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")

# one node: 1-16 GPUs
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#
#sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job

# one node: 1-16 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job

sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
@@ -1,58 +0,0 @@
#!/bin/bash
# See http://wiki.ex3.simula.no before changing the values below
#SBATCH -p hgx2q                   # partition (GPU queue)
#SBATCH -w g002                    # HGX node
#SBATCH -t 0-00:10                 # time (D-HH:MM)
#SBATCH -o slurm.%N.%j.out  # STDOUT
#SBATCH -e slurm.%N.%j.err  # STDERR
#SBATCH --reservation=martinls_11


# For Linux 64, Open MPI is built with CUDA awareness but this support is disabled by default.
# To enable it, please set the environment variable OMPI_MCA_opal_cuda_support=true before
# launching your MPI processes. Equivalently, you can set the MCA parameter in the command line:
# mpiexec --mca opal_cuda_support 1 ...
#
# In addition, the UCX support is also built but disabled by default.
# To enable it, first install UCX (conda install -c conda-forge ucx). Then, set the environment
# variables OMPI_MCA_pml="ucx" OMPI_MCA_osc="ucx" before launching your MPI processes.
# Equivalently, you can set the MCA parameters in the command line:
# mpiexec --mca pml ucx --mca osc ucx ...
# Note that you might also need to set UCX_MEMTYPE_CACHE=n for CUDA awareness via UCX.
# Please consult UCX's documentation for detail.

ulimit -s 10240
module load slurm/20.02.7
module load cuda11.2/toolkit/11.2.2
module load openmpi4-cuda11.2-ofed50-gcc8/4.1.0

# Check how many gpu's your job got
#nvidia-smi

mkdir -p output_hgx/$NOW

## Copy input files to the work directory:
mkdir -p /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
cp -r . /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU

# Run job
# (Assumes Miniconda is installed in user root dir.)
cd /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
#mpirun --mca btl_openib_if_include mlx5_0 --mca btl_openib_warn_no_device_params_found 0 $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
#nsys profile -t nvtx,cuda mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
#mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile

export OMPI_MCA_opal_cuda_support=true
mpirun -np $SLURM_NTASKS $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile

cd $HOME/src/ShallowWaterGPU

## Copy files from work directory:
# (NOTE: Copying is not performed if job fails!)
mkdir -p output_hgx/$NOW/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.log ./output_hgx/$NOW/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.nc ./output_hgx/$NOW/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.json ./output_hgx/$NOW
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.qdrep ./output_hgx/$NOW

rm -rf /work/$USER/$SLURM_JOB_ID
@@ -1,20 +0,0 @@
#!/bin/bash

TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")

# one node: 1-8 GPUs
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=4096,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=2731,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=2048,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=1638,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=1365,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=1170,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=1024,NOW=$TIMESTAMP hgx_scaling_benchmark.job

# one node: 4-8 GPUs
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=41984,NY=10496,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=41984,NY=8396,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=41984,NY=6997,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=41984,NY=5997,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=41984,NY=5248,NOW=$TIMESTAMP hgx_scaling_benchmark.job
@@ -1,23 +0,0 @@
#!/bin/bash

TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")

# one node: 1-16 GPUs
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job

# one node: 1-8 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
@@ -1,8 +0,0 @@
#!/bin/bash
module purge
module load git/2.21.0 hdf5/1.10.5-gcc cuda/10.1

conda activate ShallowWaterGPU_HPC

python mpiTesting.py
saga-dev.job (54 lines)
@@ -1,54 +0,0 @@
#!/bin/bash
# Job name:
#SBATCH --job-name=ShallowWaterGPUScalingDev
#
# Project:
#SBATCH --account=nn9882k
#
# Wall clock limit:
#SBATCH --time=00:02:00
#
# NOTE: See https://documentation.sigma2.no/jobs/projects_accounting.html when adjusting the values below
#
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
#SBATCH --partition=accel
#
# Max memory usage per task (core) - increasing this will cost more core hours:
#SBATCH --mem-per-cpu=3800M
#
# Number of tasks:
#SBATCH --nodes=1 --gpus-per-node=1 --ntasks-per-node=1
#
#SBATCH --qos=devel

## Set up job environment: (this is done automatically behind the scenes)
## (make sure to comment '#' or remove the following line 'source ...')
# source /cluster/bin/jobsetup

module restore system   # instead of 'module purge' rather set module environment to the system default
module load CUDA/11.4.1

# It is also recommended to to list loaded modules, for easier debugging:
module list

set -o errexit # exit on errors
set -o nounset # Treat unset variables as errors (added for more easily discovering issues in your batch script)

## Copy input files to the work directory:
mkdir $SCRATCH/ShallowWaterGPU
cp -r . $SCRATCH/ShallowWaterGPU

## Make sure the results are copied back to the submit directory (see Work Directory below):
# chkfile MyResultFileq
# chkfile is replaced by 'savefile' on Saga
savefile "$SCRATCH/ShallowWaterGPU/*.log"
savefile "$SCRATCH/ShallowWaterGPU/*.nc"
savefile "$SCRATCH/ShallowWaterGPU/*.json"

## Do some work:
cd $SCRATCH/ShallowWaterGPU
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 1024 -ny 1024 --profile
@@ -1,52 +0,0 @@
#!/bin/bash
# Job name:
#SBATCH --job-name=ShallowWaterGPUStrongScaling
#
# Project:
#SBATCH --account=nn9882k
#
# Wall clock limit:
#SBATCH --time=24:00:00
#
# NOTE: See https://documentation.sigma2.no/jobs/projects_accounting.html when adjusting the values below
#
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
#SBATCH --partition=accel
#
# Max memory usage per task (core) - increasing this will cost more core hours:
#SBATCH --mem-per-cpu=3800M
#
# Number of tasks:
#SBATCH --nodes=1 --gpus-per-node=1 --ntasks-per-node=1

## Set up job environment: (this is done automatically behind the scenes)
## (make sure to comment '#' or remove the following line 'source ...')
# source /cluster/bin/jobsetup

module restore system   # instead of 'module purge' rather set module environment to the system default
module load CUDA/10.2.89

# It is also recommended to to list loaded modules, for easier debugging:
module list

set -o errexit # exit on errors
set -o nounset # Treat unset variables as errors (added for more easily discovering issues in your batch script)

## Copy input files to the work directory:
mkdir $SCRATCH/ShallowWaterGPU
cp -r . $SCRATCH/ShallowWaterGPU

## Make sure the results are copied back to the submit directory (see Work Directory below):
# chkfile MyResultFile
# chkfile is replaced by 'savefile' on Saga
savefile "$SCRATCH/ShallowWaterGPU/*.log"
savefile "$SCRATCH/ShallowWaterGPU/*.nc"
savefile "$SCRATCH/ShallowWaterGPU/*.json"

## Do some work:
cd $SCRATCH/ShallowWaterGPU
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 8192 -ny 8192 --profile
@@ -1,65 +0,0 @@
#!/bin/bash
# Job name:
#SBATCH --job-name=ShallowWaterGPUScaling
#
# Project:
#SBATCH --account=nn9882k
#
# Wall clock limit:
#SBATCH --time=00:10:00
#
# NOTE: See https://documentation.sigma2.no/jobs/projects_accounting.html when adjusting the values below
#
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
#SBATCH --partition=accel
#
# Max memory usage per task (core) - increasing this will cost more core hours:
##SBATCH --mem-per-cpu=3800M
#SBATCH --mem-per-cpu=24G
#
#SBATCH --qos=devel

## Set up job environment: (this is done automatically behind the scenes)
## (make sure to comment '#' or remove the following line 'source ...')
# source /cluster/bin/jobsetup

module restore system   # instead of 'module purge' rather set module environment to the system default
module load CUDA/11.4.1
#module load CUDA/11.1.1-GCC-10.2.0
#module load OpenMPI/4.0.5-gcccuda-2020b

# It is also recommended to to list loaded modules, for easier debugging:
module list

set -o errexit # exit on errors
set -o nounset # Treat unset variables as errors (added for more easily discovering issues in your batch script)

## Copy input files to the work directory:
mkdir $SCRATCH/ShallowWaterGPU
cp -r . $SCRATCH/ShallowWaterGPU

## Make sure the results are copied back to the submit directory (see Work Directory below):
# chkfile MyResultFile
# chkfile is replaced by 'savefile' on Saga
#savefile "$SCRATCH/ShallowWaterGPU/*.log"
#savefile "$SCRATCH/ShallowWaterGPU/*.nc"
#savefile "$SCRATCH/ShallowWaterGPU/*.json"
#savefile "$SCRATCH/ShallowWaterGPU/*.qdrep"

cleanup "rm -rf $SCRATCH/ShallowWaterGPU"

export OMPI_MCA_opal_cuda_support=true

## Do some work:
cd $SCRATCH/ShallowWaterGPU
srun /cluster/projects/nn9882k/martinls/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
srun /cluster/projects/nn9882k/martinls/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile

cd $HOME/src/ShallowWaterGPU
mkdir -p output_saga/$NOW/$SLURM_JOB_ID
mv $SCRATCH/ShallowWaterGPU/*.log ./output_saga/$NOW/$SLURM_JOB_ID
mv $SCRATCH/ShallowWaterGPU/*.nc ./output_saga/$NOW/$SLURM_JOB_ID
mv $SCRATCH/ShallowWaterGPU/*.json ./output_saga/$NOW
mv $SCRATCH/ShallowWaterGPU/*.qdrep ./output_saga/$NOW
@@ -1,30 +0,0 @@
#!/bin/bash

TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")

# one node: 1–4 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 1 ranks
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=10240,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=6826,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=5120,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks

# 4 nodes: 1–4 GPUs per node
sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=5120,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
sbatch --nodes=4 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=2560,NOW=$TIMESTAMP saga_scaling_benchmark.job # 8 ranks
sbatch --nodes=4 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=1706,NOW=$TIMESTAMP saga_scaling_benchmark.job # 12 ranks
sbatch --nodes=4 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=1280,NOW=$TIMESTAMP saga_scaling_benchmark.job # 16 ranks

# 4 nodes: 1–4 GPUs per node
sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=40960,NY=10240,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
sbatch --nodes=4 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=40960,NY=5120,NOW=$TIMESTAMP saga_scaling_benchmark.job # 8 ranks
sbatch --nodes=4 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=40960,NY=3413,NOW=$TIMESTAMP saga_scaling_benchmark.job # 12 ranks
sbatch --nodes=4 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=40960,NY=2560,NOW=$TIMESTAMP saga_scaling_benchmark.job # 16 ranks

## one node: 1–4 GPUs
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=24576,NY=6144,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
#
## 4 nodes: 1–4 GPUs per node
#sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=24576,NY=6144,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
#sbatch --nodes=4 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=24576,NY=3072,NOW=$TIMESTAMP saga_scaling_benchmark.job # 8 ranks
#sbatch --nodes=4 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=24576,NY=2048,NOW=$TIMESTAMP saga_scaling_benchmark.job # 12 ranks
#sbatch --nodes=4 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=24576,NY=1536,NOW=$TIMESTAMP saga_scaling_benchmark.job # 16 ranks
@@ -1,25 +0,0 @@
#!/bin/bash

TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")

# one node: 1-4 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 1 ranks
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks

# 2-4 nodes: 1 GPUs per node
sbatch --nodes=2 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
sbatch --nodes=3 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks

## one node: 1-4 GPUs
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 1 ranks
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks

## 2-4 nodes: 1 GPUs per node
#sbatch --nodes=2 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
#sbatch --nodes=3 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
#sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
@@ -1,39 +0,0 @@
#!/bin/bash

NOW=$(date "+%Y-%m-%dT%H%M%S")
mkdir -p output_seymour/$NOW

# one node: 1-8 GPUs
mpiexec -n 1 python mpiTesting.py -nx 8192 -ny 8192 --profile &&
mkdir -p output_seymour/$NOW/1_proc &&
mv *.log output_seymour/$NOW/1_proc/ && mv *.nc output_seymour/$NOW/1_proc/ &&

mpiexec -n 2 python mpiTesting.py -nx 8192 -ny 4096 --profile &&
mkdir -p output_seymour/$NOW/2_proc &&
mv *.log output_seymour/$NOW/2_proc/ && mv *.nc output_seymour/$NOW/2_proc/ &&

mpiexec -n 3 python mpiTesting.py -nx 8192 -ny 2731 --profile &&
mkdir -p output_seymour/$NOW/3_proc &&
mv *.log output_seymour/$NOW/3_proc/ && mv *.nc output_seymour/$NOW/3_proc/ &&

mpiexec -n 4 python mpiTesting.py -nx 8192 -ny 2048 --profile &&
mkdir -p output_seymour/$NOW/4_proc &&
mv *.log output_seymour/$NOW/4_proc/ && mv *.nc output_seymour/$NOW/4_proc/ &&

mpiexec -n 5 python mpiTesting.py -nx 8192 -ny 1638 --profile &&
mkdir -p output_seymour/$NOW/5_proc &&
mv *.log output_seymour/$NOW/5_proc/ && mv *.nc output_seymour/$NOW/5_proc/ &&

mpiexec -n 6 python mpiTesting.py -nx 8192 -ny 1365 --profile &&
mkdir -p output_seymour/$NOW/6_proc &&
mv *.log output_seymour/$NOW/6_proc/ && mv *.nc output_seymour/$NOW/6_proc/ &&

mpiexec -n 7 python mpiTesting.py -nx 8192 -ny 1170 --profile &&
mkdir -p output_seymour/$NOW/7_proc &&
mv *.log output_seymour/$NOW/7_proc/ && mv *.nc output_seymour/$NOW/7_proc/ &&

mpiexec -n 8 python mpiTesting.py -nx 8192 -ny 1024 --profile &&
mkdir -p output_seymour/$NOW/8_proc &&
mv *.log output_seymour/$NOW/8_proc/ && mv *.nc output_seymour/$NOW/8_proc/ &&

for filename in *.json; do mv "$filename" "output_seymour/$NOW/MPI_${NOW}_${filename#????}"; done;
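The final loop renames each profiling output with bash parameter expansion: ${filename#????} strips the first four characters of the filename before it is re-prefixed with MPI_<timestamp>_. A tiny standalone illustration (the filename here is hypothetical, assuming the profiles carry a four-character prefix such as MPI_):

filename="MPI_rank0.json"; NOW="2025-01-01T120000"
echo "output_seymour/$NOW/MPI_${NOW}_${filename#????}"
# -> output_seymour/2025-01-01T120000/MPI_2025-01-01T120000_rank0.json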
Author: Anthony Berg