diff --git a/mpiTesting.py b/mpiTesting.py
index 52deeaa..d7192d9 100644
--- a/mpiTesting.py
+++ b/mpiTesting.py
@@ -25,29 +25,43 @@ import gc
 import time
 import json
 import logging
+import os

-#MPI
+# MPI
 from mpi4py import MPI

-#CUDA
+# CUDA
 import pycuda.driver as cuda

-#Simulator engine etc
+# Simulator engine etc
 from GPUSimulators import MPISimulator, Common, CudaContext
 from GPUSimulators import EE2D_KP07_dimsplit
 from GPUSimulators.helpers import InitialConditions as IC
 from GPUSimulators.Simulator import BoundaryCondition as BC

+import argparse
+parser = argparse.ArgumentParser(description='Strong and weak scaling experiments.')
+parser.add_argument('-nx', type=int, default=128)
+parser.add_argument('-ny', type=int, default=128)
+parser.add_argument('--profile', action='store_true') # default: False

-#Get MPI COMM to use
+
+args = parser.parse_args()
+
+if(args.profile):
+    # profiling: total run time
+    t_total_start = time.time()
+
+
+# Get MPI COMM to use
 comm = MPI.COMM_WORLD


 ####
-#Initialize logging
+# Initialize logging
 ####
 log_level_console = 20
-log_level_file = 10
+log_level_file = 10
 log_filename = 'mpi_' + str(comm.rank) + '.log'
 logger = logging.getLogger('GPUSimulators')
 logger.setLevel(min(log_level_console, log_level_file))
@@ -55,15 +69,17 @@ logger.setLevel(min(log_level_console, log_level_file))
 ch = logging.StreamHandler()
 ch.setLevel(log_level_console)
 logger.addHandler(ch)
-logger.info("Console logger using level %s", logging.getLevelName(log_level_console))
+logger.info("Console logger using level %s",
+            logging.getLevelName(log_level_console))

 fh = logging.FileHandler(log_filename)
-formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s')
+formatter = logging.Formatter(
+    '%(asctime)s:%(name)s:%(levelname)s: %(message)s')
 fh.setFormatter(formatter)
 fh.setLevel(log_level_file)
 logger.addHandler(fh)
-logger.info("File logger using level %s to %s", logging.getLevelName(log_level_file), log_filename)
-
+logger.info("File logger using level %s to %s",
+            logging.getLevelName(log_level_file), log_filename)


 ####
@@ -73,7 +89,6 @@ logger.info("Creating MPI grid")
 grid = MPISimulator.MPIGrid(MPI.COMM_WORLD)


-
 ####
 # Initialize CUDA
 ####
@@ -85,15 +100,15 @@ cuda_device = local_rank % num_cuda_devices
 cuda_context = CudaContext.CudaContext(device=cuda_device, autotuning=False)


-
 ####
 # Set initial conditions
 ####
 logger.info("Generating initial conditions")
-nx = 128
-ny = 128
+nx = args.nx
+ny = args.ny
+
 gamma = 1.4
-save_times = np.linspace(0, 5.0, 10)
+save_times = np.linspace(0, 10.0, 2)
 outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc"
 save_var_names = ['rho', 'rho_u', 'rho_v', 'E']

@@ -103,20 +118,43 @@ arguments['theta'] = 1.2
 arguments['grid'] = grid


-
-
 ####
 # Run simulation
 ####
 logger.info("Running simulation")

-#Helper function to create MPI simulator
+# Helper function to create MPI simulator
+
+
 def genSim(grid, **kwargs):
     local_sim = EE2D_KP07_dimsplit.EE2D_KP07_dimsplit(**kwargs)
     sim = MPISimulator.MPISimulator(local_sim, grid)
     return sim

-outfile = Common.runSimulation(genSim, arguments, outfile, save_times, save_var_names)
+outfile = Common.runSimulation(
+    genSim, arguments, outfile, save_times, save_var_names)
+
+if(args.profile):
+    t_total_end = time.time()
+    t_total = t_total_end - t_total_start
+    print("Total run time on rank " + str(MPI.COMM_WORLD.rank) + " is " + str(t_total) + " s")
+
+# write profiling to json file
+if(args.profile and MPI.COMM_WORLD.rank == 0):
+    if "SLURM_JOB_ID" in os.environ:
+        job_id = int(os.environ["SLURM_JOB_ID"])
+        allocated_nodes = int(os.environ["SLURM_JOB_NUM_NODES"])
+        allocated_gpus = int(os.environ["CUDA_VISIBLE_DEVICES"].count(",") + 1)
+        profiling_file = "MPI_jobid_" + \
+            str(job_id) + "_" + str(allocated_nodes) + "_nodes_and_" + str(allocated_gpus) + "_GPUs_profiling.json"
+    else:
+        profiling_file = "MPI_test_profiling.json"
+
+    write_profiling_data = {}
+    write_profiling_data["total"] = t_total
+
+    with open(profiling_file, "w") as write_file:
+        json.dump(write_profiling_data, write_file)

 ####
 # Clean shutdown
@@ -134,4 +172,4 @@ gc.collect()
 # Print completion and exit
 ####
 print("Completed!")
-exit(0)
\ No newline at end of file
+exit(0)
diff --git a/saga-dev.job b/saga-dev.job
new file mode 100644
index 0000000..1244048
--- /dev/null
+++ b/saga-dev.job
@@ -0,0 +1,53 @@
+#!/bin/bash
+# Job name:
+#SBATCH --job-name=ShallowWaterGPUScalingDev
+#
+# Project:
+#SBATCH --account=nn9550k
+#
+# Wall clock limit:
+#SBATCH --time=01:00:00
+#
+# Ask for 1 GPU (max is 2)
+# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
+# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
+# /dev/nvidia0, /dev/nvidia1 or both, respectively.
+#SBATCH --partition=accel --gres=gpu:1
+#
+# Max memory usage per task (core) - increasing this will cost more core hours:
+#SBATCH --mem-per-cpu=16G
+#
+# Number of tasks:
+#SBATCH --nodes=1 --ntasks-per-node=1
+#
+#SBATCH --qos=devel
+
+## Set up job environment: (this is done automatically behind the scenes)
+## (make sure to comment '#' or remove the following line 'source ...')
+# source /cluster/bin/jobsetup
+
+module restore system   # instead of 'module purge' rather set module environment to the system default
+module load CUDA/10.2.89
+
+# It is also recommended to list loaded modules, for easier debugging:
+module list
+
+set -o errexit # exit on errors
+set -o nounset # Treat unset variables as errors (added for more easily discovering issues in your batch script)
+
+## Copy input files to the work directory:
+mkdir $SCRATCH/ShallowWaterGPU
+cp -r . $SCRATCH/ShallowWaterGPU
+
+## Make sure the results are copied back to the submit directory (see Work Directory below):
+# chkfile MyResultFile
+# chkfile is replaced by 'savefile' on Saga
+savefile "$SCRATCH/ShallowWaterGPU/*.log"
+savefile "$SCRATCH/ShallowWaterGPU/*.nc"
+savefile "$SCRATCH/ShallowWaterGPU/*.json"
+
+## Do some work:
+cd $SCRATCH/ShallowWaterGPU
+srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
+srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 8192 -ny 8192 --profile
+
diff --git a/saga-test.job b/saga-test.job
index 7d2b475..0b5e9f5 100644
--- a/saga-test.job
+++ b/saga-test.job
@@ -1,12 +1,12 @@
 #!/bin/bash
 # Job name:
-#SBATCH --job-name=saga-test
+#SBATCH --job-name=ShallowWaterGPUStrongScaling
 #
 # Project:
 #SBATCH --account=nn9550k
 #
 # Wall clock limit:
-#SBATCH --time=00:10:00
+#SBATCH --time=24:00:00
 #
 # Ask for 1 GPU (max is 2)
 # Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
@@ -15,10 +15,10 @@
 #SBATCH --partition=accel --gres=gpu:1
 #
 # Max memory usage per task (core) - increasing this will cost more core hours:
-#SBATCH --mem-per-cpu=4G
+#SBATCH --mem-per-cpu=16G
 #
 # Number of tasks:
-#SBATCH --nodes=2 --ntasks-per-node=1
+#SBATCH --nodes=1 --ntasks-per-node=1
 ## Set up job environment: (this is done automatically behind the scenes)
 ## (make sure to comment '#' or remove the following line 'source ...')
 # source /cluster/bin/jobsetup
@@ -42,9 +42,10 @@ cp -r . $SCRATCH/ShallowWaterGPU
 # chkfile is replaced by 'savefile' on Saga
 savefile "$SCRATCH/ShallowWaterGPU/*.log"
 savefile "$SCRATCH/ShallowWaterGPU/*.nc"
+savefile "$SCRATCH/ShallowWaterGPU/*.json"

 ## Do some work:
 cd $SCRATCH/ShallowWaterGPU
-srun /cluster/home/$HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
-srun /cluster/home/$HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py
+srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
+srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 8192 -ny 8192 --profile

diff --git a/saga_strong_scaling_benchmark.job b/saga_strong_scaling_benchmark.job
new file mode 100644
index 0000000..05320d7
--- /dev/null
+++ b/saga_strong_scaling_benchmark.job
@@ -0,0 +1,51 @@
+#!/bin/bash
+# Job name:
+#SBATCH --job-name=ShallowWaterGPUStrongScaling
+#
+# Project:
+#SBATCH --account=nn9550k
+#
+# Wall clock limit:
+#SBATCH --time=10:00:00
+#
+# Ask for 1 GPU (max is 2)
+# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
+# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
+# /dev/nvidia0, /dev/nvidia1 or both, respectively.
+#SBATCH --partition=accel --gres=gpu:1
+#
+# Max memory usage per task (core) - increasing this will cost more core hours:
+#SBATCH --mem-per-cpu=16G
+#
+# Number of tasks:
+#SBATCH --nodes=1 --ntasks-per-node=1
+
+## Set up job environment: (this is done automatically behind the scenes)
+## (make sure to comment '#' or remove the following line 'source ...')
+# source /cluster/bin/jobsetup
+
+module restore system   # instead of 'module purge' rather set module environment to the system default
+module load CUDA/10.2.89
+
+# It is also recommended to list loaded modules, for easier debugging:
+module list
+
+set -o errexit # exit on errors
+set -o nounset # Treat unset variables as errors (added for more easily discovering issues in your batch script)
+
+## Copy input files to the work directory:
+mkdir $SCRATCH/ShallowWaterGPU
+cp -r . $SCRATCH/ShallowWaterGPU
+
+## Make sure the results are copied back to the submit directory (see Work Directory below):
+# chkfile MyResultFile
+# chkfile is replaced by 'savefile' on Saga
+savefile "$SCRATCH/ShallowWaterGPU/*.log"
+savefile "$SCRATCH/ShallowWaterGPU/*.nc"
+savefile "$SCRATCH/ShallowWaterGPU/*.json"
+
+## Do some work:
+cd $SCRATCH/ShallowWaterGPU
+srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
+srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 1024 -ny 1024 --profile
+
diff --git a/saga_strong_scaling_benchmark.sh b/saga_strong_scaling_benchmark.sh
new file mode 100644
index 0000000..1d2284d
--- /dev/null
+++ b/saga_strong_scaling_benchmark.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# one node: 1-4 tasks/GPUs
+sbatch --partition=accel --gres=gpu:1 --nodes=1 --ntasks-per-node=1 saga_strong_scaling_benchmark.job
+sbatch --partition=accel --gres=gpu:2 --nodes=1 --ntasks-per-node=2 saga_strong_scaling_benchmark.job
+sbatch --partition=accel --gres=gpu:3 --nodes=1 --ntasks-per-node=3 saga_strong_scaling_benchmark.job
+sbatch --partition=accel --gres=gpu:4 --nodes=1 --ntasks-per-node=4 saga_strong_scaling_benchmark.job
+
+# 2-4 nodes: 4 tasks/GPUs per node
+sbatch --partition=accel --gres=gpu:4 --nodes=2 --ntasks-per-node=4 saga_strong_scaling_benchmark.job
+sbatch --partition=accel --gres=gpu:4 --nodes=3 --ntasks-per-node=4 saga_strong_scaling_benchmark.job
+sbatch --partition=accel --gres=gpu:4 --nodes=4 --ntasks-per-node=4 saga_strong_scaling_benchmark.job
+