mirror of
https://github.com/smyalygames/FiniteVolumeGPU.git
synced 2025-05-18 14:34:13 +02:00
Merge pull request #10 from setmar/master
WiP: Simulating the Euler equations on multiple GPUs using Python
This commit is contained in:
commit
9613269c45
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
.vscode/settings.json
|
555162
EulerTesting.ipynb
555162
EulerTesting.ipynb
File diff suppressed because one or more lines are too long
673
Figures_Saga.ipynb
Normal file
673
Figures_Saga.ipynb
Normal file
File diff suppressed because one or more lines are too long
721
Figures_Simula.ipynb
Normal file
721
Figures_Simula.ipynb
Normal file
File diff suppressed because one or more lines are too long
@ -89,12 +89,22 @@ def toJson(in_dict, compressed=True):
|
|||||||
out_dict[key] = value
|
out_dict[key] = value
|
||||||
return json.dumps(out_dict)
|
return json.dumps(out_dict)
|
||||||
|
|
||||||
def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names=[]):
|
def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names=[], dt=None):
|
||||||
"""
|
"""
|
||||||
Runs a simulation, and stores output in netcdf file. Stores the times given in
|
Runs a simulation, and stores output in netcdf file. Stores the times given in
|
||||||
save_times, and saves all of the variables in list save_var_names. Elements in
|
save_times, and saves all of the variables in list save_var_names. Elements in
|
||||||
save_var_names can be set to None if you do not want to save them
|
save_var_names can be set to None if you do not want to save them
|
||||||
"""
|
"""
|
||||||
|
profiling_data_sim_runner = { 'start': {}, 'end': {} }
|
||||||
|
profiling_data_sim_runner["start"]["t_sim_init"] = 0
|
||||||
|
profiling_data_sim_runner["end"]["t_sim_init"] = 0
|
||||||
|
profiling_data_sim_runner["start"]["t_nc_write"] = 0
|
||||||
|
profiling_data_sim_runner["end"]["t_nc_write"] = 0
|
||||||
|
profiling_data_sim_runner["start"]["t_full_step"] = 0
|
||||||
|
profiling_data_sim_runner["end"]["t_full_step"] = 0
|
||||||
|
|
||||||
|
profiling_data_sim_runner["start"]["t_sim_init"] = time.time()
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
assert len(save_times) > 0, "Need to specify which times to save"
|
assert len(save_times) > 0, "Need to specify which times to save"
|
||||||
@ -111,7 +121,14 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
|
|||||||
outdata.ncfile.git_hash = getGitHash()
|
outdata.ncfile.git_hash = getGitHash()
|
||||||
outdata.ncfile.git_status = getGitStatus()
|
outdata.ncfile.git_status = getGitStatus()
|
||||||
outdata.ncfile.simulator = str(simulator)
|
outdata.ncfile.simulator = str(simulator)
|
||||||
outdata.ncfile.sim_args = toJson(simulator_args)
|
|
||||||
|
# do not write fields to attributes (they are to large)
|
||||||
|
simulator_args_for_ncfile = simulator_args.copy()
|
||||||
|
del simulator_args_for_ncfile["rho"]
|
||||||
|
del simulator_args_for_ncfile["rho_u"]
|
||||||
|
del simulator_args_for_ncfile["rho_v"]
|
||||||
|
del simulator_args_for_ncfile["E"]
|
||||||
|
outdata.ncfile.sim_args = toJson(simulator_args_for_ncfile)
|
||||||
|
|
||||||
#Create dimensions
|
#Create dimensions
|
||||||
outdata.ncfile.createDimension('time', len(save_times))
|
outdata.ncfile.createDimension('time', len(save_times))
|
||||||
@ -146,6 +163,8 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
|
|||||||
t_steps[0] = save_times[0]
|
t_steps[0] = save_times[0]
|
||||||
t_steps[1:] = save_times[1:] - save_times[0:-1]
|
t_steps[1:] = save_times[1:] - save_times[0:-1]
|
||||||
|
|
||||||
|
profiling_data_sim_runner["end"]["t_sim_init"] = time.time()
|
||||||
|
|
||||||
#Start simulation loop
|
#Start simulation loop
|
||||||
progress_printer = ProgressPrinter(save_times[-1], print_every=10)
|
progress_printer = ProgressPrinter(save_times[-1], print_every=10)
|
||||||
for k in range(len(save_times)):
|
for k in range(len(save_times)):
|
||||||
@ -160,9 +179,15 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
|
|||||||
logger.error("Error after {:d} steps (t={:f}: {:s}".format(sim.simSteps(), sim.simTime(), str(e)))
|
logger.error("Error after {:d} steps (t={:f}: {:s}".format(sim.simSteps(), sim.simTime(), str(e)))
|
||||||
return outdata.filename
|
return outdata.filename
|
||||||
|
|
||||||
|
profiling_data_sim_runner["start"]["t_full_step"] += time.time()
|
||||||
|
|
||||||
#Simulate
|
#Simulate
|
||||||
if (t_step > 0.0):
|
if (t_step > 0.0):
|
||||||
sim.simulate(t_step)
|
sim.simulate(t_step, dt)
|
||||||
|
|
||||||
|
profiling_data_sim_runner["end"]["t_full_step"] += time.time()
|
||||||
|
|
||||||
|
profiling_data_sim_runner["start"]["t_nc_write"] += time.time()
|
||||||
|
|
||||||
#Download
|
#Download
|
||||||
save_vars = sim.download(download_vars)
|
save_vars = sim.download(download_vars)
|
||||||
@ -171,6 +196,8 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
|
|||||||
for i, var_name in enumerate(save_var_names):
|
for i, var_name in enumerate(save_var_names):
|
||||||
ncvars[var_name][k, :] = save_vars[i]
|
ncvars[var_name][k, :] = save_vars[i]
|
||||||
|
|
||||||
|
profiling_data_sim_runner["end"]["t_nc_write"] += time.time()
|
||||||
|
|
||||||
#Write progress to screen
|
#Write progress to screen
|
||||||
print_string = progress_printer.getPrintString(t_end)
|
print_string = progress_printer.getPrintString(t_end)
|
||||||
if (print_string):
|
if (print_string):
|
||||||
@ -178,7 +205,7 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
|
|||||||
|
|
||||||
logger.debug("Simulated to t={:f} in {:d} timesteps (average dt={:f})".format(t_end, sim.simSteps(), sim.simTime() / sim.simSteps()))
|
logger.debug("Simulated to t={:f} in {:d} timesteps (average dt={:f})".format(t_end, sim.simSteps(), sim.simTime() / sim.simSteps()))
|
||||||
|
|
||||||
return outdata.filename
|
return outdata.filename, profiling_data_sim_runner, sim.profiling_data_mpi
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -526,9 +553,9 @@ class CudaArray2D:
|
|||||||
#self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
|
#self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
|
||||||
#Allocate host memory
|
#Allocate host memory
|
||||||
#The following fails, don't know why (crashes python)
|
#The following fails, don't know why (crashes python)
|
||||||
#cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)32)
|
cpu_data = cuda.pagelocked_empty((int(ny), int(nx)), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE)
|
||||||
#Non-pagelocked: cpu_data = np.empty((ny, nx), dtype=np.float32)
|
#Non-pagelocked: cpu_data = np.empty((ny, nx), dtype=np.float32)
|
||||||
cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32)
|
#cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32)
|
||||||
|
|
||||||
assert nx == cpu_data.shape[1]
|
assert nx == cpu_data.shape[1]
|
||||||
assert ny == cpu_data.shape[0]
|
assert ny == cpu_data.shape[0]
|
||||||
@ -739,7 +766,7 @@ class ArakawaA2D:
|
|||||||
assert i < len(self.gpu_variables), "Variable {:d} is out of range".format(i)
|
assert i < len(self.gpu_variables), "Variable {:d} is out of range".format(i)
|
||||||
cpu_variables += [self.gpu_variables[i].download(stream, asynch=True)]
|
cpu_variables += [self.gpu_variables[i].download(stream, asynch=True)]
|
||||||
|
|
||||||
stream.synchronize()
|
#stream.synchronize()
|
||||||
return cpu_variables
|
return cpu_variables
|
||||||
|
|
||||||
def check(self):
|
def check(self):
|
||||||
|
@ -90,7 +90,7 @@ class EE2D_KP07_dimsplit (BaseSimulator):
|
|||||||
},
|
},
|
||||||
jit_compile_args={})
|
jit_compile_args={})
|
||||||
self.kernel = module.get_function("KP07DimsplitKernel")
|
self.kernel = module.get_function("KP07DimsplitKernel")
|
||||||
self.kernel.prepare("iiffffffiiPiPiPiPiPiPiPiPiP")
|
self.kernel.prepare("iiffffffiiPiPiPiPiPiPiPiPiPiiii")
|
||||||
|
|
||||||
|
|
||||||
#Create data by uploading to device
|
#Create data by uploading to device
|
||||||
@ -109,10 +109,13 @@ class EE2D_KP07_dimsplit (BaseSimulator):
|
|||||||
self.cfl_data.fill(self.dt, stream=self.stream)
|
self.cfl_data.fill(self.dt, stream=self.stream)
|
||||||
|
|
||||||
|
|
||||||
def substep(self, dt, step_number):
|
def substep(self, dt, step_number, external=True, internal=True):
|
||||||
self.substepDimsplit(0.5*dt, step_number)
|
self.substepDimsplit(0.5*dt, step_number, external, internal)
|
||||||
|
|
||||||
|
def substepDimsplit(self, dt, substep, external, internal):
|
||||||
|
if external and internal:
|
||||||
|
#print("COMPLETE DOMAIN (dt=" + str(dt) + ")")
|
||||||
|
|
||||||
def substepDimsplit(self, dt, substep):
|
|
||||||
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
|
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
|
||||||
self.nx, self.ny,
|
self.nx, self.ny,
|
||||||
self.dx, self.dy, dt,
|
self.dx, self.dy, dt,
|
||||||
@ -129,8 +132,142 @@ class EE2D_KP07_dimsplit (BaseSimulator):
|
|||||||
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
|
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
|
||||||
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
|
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
|
||||||
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
|
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
|
||||||
self.cfl_data.gpudata)
|
self.cfl_data.gpudata,
|
||||||
|
0, 0,
|
||||||
|
self.nx, self.ny)
|
||||||
|
return
|
||||||
|
|
||||||
|
if external and not internal:
|
||||||
|
###################################
|
||||||
|
# XXX: Corners are treated twice! #
|
||||||
|
###################################
|
||||||
|
|
||||||
|
ns_grid_size = (self.grid_size[0], 1)
|
||||||
|
|
||||||
|
# NORTH
|
||||||
|
# (x0, y0) x (x1, y1)
|
||||||
|
# (0, ny-y_halo) x (nx, ny)
|
||||||
|
self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,
|
||||||
|
self.nx, self.ny,
|
||||||
|
self.dx, self.dy, dt,
|
||||||
|
self.g,
|
||||||
|
self.gamma,
|
||||||
|
self.theta,
|
||||||
|
substep,
|
||||||
|
self.boundary_conditions,
|
||||||
|
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
|
||||||
|
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
|
||||||
|
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
|
||||||
|
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
|
||||||
|
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
|
||||||
|
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
|
||||||
|
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
|
||||||
|
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
|
||||||
|
self.cfl_data.gpudata,
|
||||||
|
0, self.ny - int(self.u0[0].y_halo),
|
||||||
|
self.nx, self.ny)
|
||||||
|
|
||||||
|
# SOUTH
|
||||||
|
# (x0, y0) x (x1, y1)
|
||||||
|
# (0, 0) x (nx, y_halo)
|
||||||
|
self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,
|
||||||
|
self.nx, self.ny,
|
||||||
|
self.dx, self.dy, dt,
|
||||||
|
self.g,
|
||||||
|
self.gamma,
|
||||||
|
self.theta,
|
||||||
|
substep,
|
||||||
|
self.boundary_conditions,
|
||||||
|
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
|
||||||
|
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
|
||||||
|
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
|
||||||
|
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
|
||||||
|
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
|
||||||
|
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
|
||||||
|
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
|
||||||
|
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
|
||||||
|
self.cfl_data.gpudata,
|
||||||
|
0, 0,
|
||||||
|
self.nx, int(self.u0[0].y_halo))
|
||||||
|
|
||||||
|
we_grid_size = (1, self.grid_size[1])
|
||||||
|
|
||||||
|
# WEST
|
||||||
|
# (x0, y0) x (x1, y1)
|
||||||
|
# (0, 0) x (x_halo, ny)
|
||||||
|
self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
|
||||||
|
self.nx, self.ny,
|
||||||
|
self.dx, self.dy, dt,
|
||||||
|
self.g,
|
||||||
|
self.gamma,
|
||||||
|
self.theta,
|
||||||
|
substep,
|
||||||
|
self.boundary_conditions,
|
||||||
|
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
|
||||||
|
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
|
||||||
|
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
|
||||||
|
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
|
||||||
|
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
|
||||||
|
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
|
||||||
|
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
|
||||||
|
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
|
||||||
|
self.cfl_data.gpudata,
|
||||||
|
0, 0,
|
||||||
|
int(self.u0[0].x_halo), self.ny)
|
||||||
|
|
||||||
|
# EAST
|
||||||
|
# (x0, y0) x (x1, y1)
|
||||||
|
# (nx-x_halo, 0) x (nx, ny)
|
||||||
|
self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
|
||||||
|
self.nx, self.ny,
|
||||||
|
self.dx, self.dy, dt,
|
||||||
|
self.g,
|
||||||
|
self.gamma,
|
||||||
|
self.theta,
|
||||||
|
substep,
|
||||||
|
self.boundary_conditions,
|
||||||
|
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
|
||||||
|
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
|
||||||
|
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
|
||||||
|
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
|
||||||
|
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
|
||||||
|
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
|
||||||
|
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
|
||||||
|
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
|
||||||
|
self.cfl_data.gpudata,
|
||||||
|
self.nx - int(self.u0[0].x_halo), 0,
|
||||||
|
self.nx, self.ny)
|
||||||
|
return
|
||||||
|
|
||||||
|
if internal and not external:
|
||||||
|
|
||||||
|
# INTERNAL DOMAIN
|
||||||
|
# (x0, y0) x (x1, y1)
|
||||||
|
# (x_halo, y_halo) x (nx - x_halo, ny - y_halo)
|
||||||
|
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.internal_stream,
|
||||||
|
self.nx, self.ny,
|
||||||
|
self.dx, self.dy, dt,
|
||||||
|
self.g,
|
||||||
|
self.gamma,
|
||||||
|
self.theta,
|
||||||
|
substep,
|
||||||
|
self.boundary_conditions,
|
||||||
|
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
|
||||||
|
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
|
||||||
|
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
|
||||||
|
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
|
||||||
|
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
|
||||||
|
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
|
||||||
|
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
|
||||||
|
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
|
||||||
|
self.cfl_data.gpudata,
|
||||||
|
int(self.u0[0].x_halo), int(self.u0[0].y_halo),
|
||||||
|
self.nx - int(self.u0[0].x_halo), self.ny - int(self.u0[0].y_halo))
|
||||||
|
return
|
||||||
|
|
||||||
|
def swapBuffers(self):
|
||||||
self.u0, self.u1 = self.u1, self.u0
|
self.u0, self.u1 = self.u1, self.u0
|
||||||
|
return
|
||||||
|
|
||||||
def getOutput(self):
|
def getOutput(self):
|
||||||
return self.u0
|
return self.u0
|
||||||
@ -138,6 +275,7 @@ class EE2D_KP07_dimsplit (BaseSimulator):
|
|||||||
def check(self):
|
def check(self):
|
||||||
self.u0.check()
|
self.u0.check()
|
||||||
self.u1.check()
|
self.u1.check()
|
||||||
|
return
|
||||||
|
|
||||||
def computeDt(self):
|
def computeDt(self):
|
||||||
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
|
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
|
||||||
|
@ -24,7 +24,10 @@ import logging
|
|||||||
from GPUSimulators import Simulator
|
from GPUSimulators import Simulator
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from mpi4py import MPI
|
from mpi4py import MPI
|
||||||
|
import time
|
||||||
|
|
||||||
|
import pycuda.driver as cuda
|
||||||
|
#import nvtx
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -135,6 +138,10 @@ class MPIGrid(object):
|
|||||||
grid = np.sort(grid)
|
grid = np.sort(grid)
|
||||||
grid = grid[::-1]
|
grid = grid[::-1]
|
||||||
|
|
||||||
|
# XXX: We only use vertical (north-south) partitioning for now
|
||||||
|
grid[0] = 1
|
||||||
|
grid[1] = num_nodes
|
||||||
|
|
||||||
return grid
|
return grid
|
||||||
|
|
||||||
|
|
||||||
@ -200,6 +207,18 @@ class MPISimulator(Simulator.BaseSimulator):
|
|||||||
Class which handles communication between simulators on different MPI nodes
|
Class which handles communication between simulators on different MPI nodes
|
||||||
"""
|
"""
|
||||||
def __init__(self, sim, grid):
|
def __init__(self, sim, grid):
|
||||||
|
self.profiling_data_mpi = { 'start': {}, 'end': {} }
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange"] = 0
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange"] = 0
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] = 0
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] = 0
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] = 0
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] = 0
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] = 0
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] = 0
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_step"] = 0
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_step"] = 0
|
||||||
|
self.profiling_data_mpi["n_time_steps"] = 0
|
||||||
self.logger = logging.getLogger(__name__)
|
self.logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
autotuner = sim.context.autotuner
|
autotuner = sim.context.autotuner
|
||||||
@ -232,6 +251,7 @@ class MPISimulator(Simulator.BaseSimulator):
|
|||||||
'west': Simulator.BoundaryCondition.Type.Dirichlet
|
'west': Simulator.BoundaryCondition.Type.Dirichlet
|
||||||
})
|
})
|
||||||
gi, gj = grid.getCoordinate()
|
gi, gj = grid.getCoordinate()
|
||||||
|
#print("gi: " + str(gi) + ", gj: " + str(gj))
|
||||||
if (gi == 0 and boundary_conditions.west != Simulator.BoundaryCondition.Type.Periodic):
|
if (gi == 0 and boundary_conditions.west != Simulator.BoundaryCondition.Type.Periodic):
|
||||||
self.west = None
|
self.west = None
|
||||||
new_boundary_conditions.west = boundary_conditions.west;
|
new_boundary_conditions.west = boundary_conditions.west;
|
||||||
@ -272,23 +292,50 @@ class MPISimulator(Simulator.BaseSimulator):
|
|||||||
#Note that east and west also transfer ghost cells
|
#Note that east and west also transfer ghost cells
|
||||||
#whilst north/south only transfer internal cells
|
#whilst north/south only transfer internal cells
|
||||||
#Reuses the width/height defined in the read-extets above
|
#Reuses the width/height defined in the read-extets above
|
||||||
self.in_e = np.empty((self.nvars, self.read_e[3], self.read_e[2]), dtype=np.float32)
|
self.in_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32) #np.empty((self.nvars, self.read_e[3], self.read_e[2]), dtype=np.float32)
|
||||||
self.in_w = np.empty((self.nvars, self.read_w[3], self.read_w[2]), dtype=np.float32)
|
self.in_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32) #np.empty((self.nvars, self.read_w[3], self.read_w[2]), dtype=np.float32)
|
||||||
self.in_n = np.empty((self.nvars, self.read_n[3], self.read_n[2]), dtype=np.float32)
|
self.in_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32) #np.empty((self.nvars, self.read_n[3], self.read_n[2]), dtype=np.float32)
|
||||||
self.in_s = np.empty((self.nvars, self.read_s[3], self.read_s[2]), dtype=np.float32)
|
self.in_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32) #np.empty((self.nvars, self.read_s[3], self.read_s[2]), dtype=np.float32)
|
||||||
|
|
||||||
#Allocate data for sending
|
#Allocate data for sending
|
||||||
self.out_e = np.empty_like(self.in_e)
|
self.out_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32) #np.empty_like(self.in_e)
|
||||||
self.out_w = np.empty_like(self.in_w)
|
self.out_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32) #np.empty_like(self.in_w)
|
||||||
self.out_n = np.empty_like(self.in_n)
|
self.out_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32) #np.empty_like(self.in_n)
|
||||||
self.out_s = np.empty_like(self.in_s)
|
self.out_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32) #np.empty_like(self.in_s)
|
||||||
|
|
||||||
self.logger.debug("Simlator rank {:d} initialized on {:s}".format(self.grid.comm.rank, MPI.Get_processor_name()))
|
self.logger.debug("Simlator rank {:d} initialized on {:s}".format(self.grid.comm.rank, MPI.Get_processor_name()))
|
||||||
|
|
||||||
|
self.full_exchange()
|
||||||
|
sim.context.synchronize()
|
||||||
|
|
||||||
def substep(self, dt, step_number):
|
def substep(self, dt, step_number):
|
||||||
self.exchange()
|
|
||||||
self.sim.substep(dt, step_number)
|
#nvtx.mark("substep start", color="yellow")
|
||||||
|
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_step"] += time.time()
|
||||||
|
|
||||||
|
#nvtx.mark("substep external", color="blue")
|
||||||
|
self.sim.substep(dt, step_number, external=True, internal=False) # only "internal ghost cells"
|
||||||
|
|
||||||
|
#nvtx.mark("substep internal", color="red")
|
||||||
|
self.sim.substep(dt, step_number, internal=True, external=False) # "internal ghost cells" excluded
|
||||||
|
|
||||||
|
#nvtx.mark("substep full", color="blue")
|
||||||
|
#self.sim.substep(dt, step_number, external=True, internal=True)
|
||||||
|
|
||||||
|
self.sim.swapBuffers()
|
||||||
|
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_step"] += time.time()
|
||||||
|
|
||||||
|
#nvtx.mark("exchange", color="blue")
|
||||||
|
self.full_exchange()
|
||||||
|
|
||||||
|
#nvtx.mark("sync start", color="blue")
|
||||||
|
self.sim.stream.synchronize()
|
||||||
|
self.sim.internal_stream.synchronize()
|
||||||
|
#nvtx.mark("sync end", color="blue")
|
||||||
|
|
||||||
|
self.profiling_data_mpi["n_time_steps"] += 1
|
||||||
|
|
||||||
def getOutput(self):
|
def getOutput(self):
|
||||||
return self.sim.getOutput()
|
return self.sim.getOutput()
|
||||||
@ -321,18 +368,14 @@ class MPISimulator(Simulator.BaseSimulator):
|
|||||||
y1 = y0 + height
|
y1 = y0 + height
|
||||||
return [x0, x1, y0, y1]
|
return [x0, x1, y0, y1]
|
||||||
|
|
||||||
def exchange(self):
|
def full_exchange(self):
|
||||||
####
|
|
||||||
# FIXME: This function can be optimized using persitent communications.
|
|
||||||
# Also by overlapping some of the communications north/south and east/west of GPU and intra-node
|
|
||||||
# communications
|
|
||||||
####
|
|
||||||
|
|
||||||
####
|
####
|
||||||
# First transfer internal cells north-south
|
# First transfer internal cells north-south
|
||||||
####
|
####
|
||||||
|
|
||||||
#Download from the GPU
|
#Download from the GPU
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] += time.time()
|
||||||
|
|
||||||
if self.north is not None:
|
if self.north is not None:
|
||||||
for k in range(self.nvars):
|
for k in range(self.nvars):
|
||||||
self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_n[k,:,:], asynch=True, extent=self.read_n)
|
self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_n[k,:,:], asynch=True, extent=self.read_n)
|
||||||
@ -341,7 +384,11 @@ class MPISimulator(Simulator.BaseSimulator):
|
|||||||
self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_s[k,:,:], asynch=True, extent=self.read_s)
|
self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_s[k,:,:], asynch=True, extent=self.read_s)
|
||||||
self.sim.stream.synchronize()
|
self.sim.stream.synchronize()
|
||||||
|
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] += time.time()
|
||||||
|
|
||||||
#Send/receive to north/south neighbours
|
#Send/receive to north/south neighbours
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()
|
||||||
|
|
||||||
comm_send = []
|
comm_send = []
|
||||||
comm_recv = []
|
comm_recv = []
|
||||||
if self.north is not None:
|
if self.north is not None:
|
||||||
@ -355,7 +402,11 @@ class MPISimulator(Simulator.BaseSimulator):
|
|||||||
for comm in comm_recv:
|
for comm in comm_recv:
|
||||||
comm.wait()
|
comm.wait()
|
||||||
|
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()
|
||||||
|
|
||||||
#Upload to the GPU
|
#Upload to the GPU
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] += time.time()
|
||||||
|
|
||||||
if self.north is not None:
|
if self.north is not None:
|
||||||
for k in range(self.nvars):
|
for k in range(self.nvars):
|
||||||
self.sim.u0[k].upload(self.sim.stream, self.in_n[k,:,:], extent=self.write_n)
|
self.sim.u0[k].upload(self.sim.stream, self.in_n[k,:,:], extent=self.write_n)
|
||||||
@ -363,17 +414,23 @@ class MPISimulator(Simulator.BaseSimulator):
|
|||||||
for k in range(self.nvars):
|
for k in range(self.nvars):
|
||||||
self.sim.u0[k].upload(self.sim.stream, self.in_s[k,:,:], extent=self.write_s)
|
self.sim.u0[k].upload(self.sim.stream, self.in_s[k,:,:], extent=self.write_s)
|
||||||
|
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] += time.time()
|
||||||
|
|
||||||
#Wait for sending to complete
|
#Wait for sending to complete
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()
|
||||||
|
|
||||||
for comm in comm_send:
|
for comm in comm_send:
|
||||||
comm.wait()
|
comm.wait()
|
||||||
|
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()
|
||||||
|
|
||||||
####
|
####
|
||||||
# Then transfer east-west including ghost cells that have been filled in by north-south transfer above
|
# Then transfer east-west including ghost cells that have been filled in by north-south transfer above
|
||||||
####
|
####
|
||||||
|
|
||||||
#Download from the GPU
|
#Download from the GPU
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] += time.time()
|
||||||
|
|
||||||
if self.east is not None:
|
if self.east is not None:
|
||||||
for k in range(self.nvars):
|
for k in range(self.nvars):
|
||||||
self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_e[k,:,:], asynch=True, extent=self.read_e)
|
self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_e[k,:,:], asynch=True, extent=self.read_e)
|
||||||
@ -382,7 +439,11 @@ class MPISimulator(Simulator.BaseSimulator):
|
|||||||
self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_w[k,:,:], asynch=True, extent=self.read_w)
|
self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_w[k,:,:], asynch=True, extent=self.read_w)
|
||||||
self.sim.stream.synchronize()
|
self.sim.stream.synchronize()
|
||||||
|
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] += time.time()
|
||||||
|
|
||||||
#Send/receive to east/west neighbours
|
#Send/receive to east/west neighbours
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()
|
||||||
|
|
||||||
comm_send = []
|
comm_send = []
|
||||||
comm_recv = []
|
comm_recv = []
|
||||||
if self.east is not None:
|
if self.east is not None:
|
||||||
@ -392,12 +453,15 @@ class MPISimulator(Simulator.BaseSimulator):
|
|||||||
comm_send += [self.grid.comm.Isend(self.out_w, dest=self.west, tag=4*self.nt + 3)]
|
comm_send += [self.grid.comm.Isend(self.out_w, dest=self.west, tag=4*self.nt + 3)]
|
||||||
comm_recv += [self.grid.comm.Irecv(self.in_w, source=self.west, tag=4*self.nt + 2)]
|
comm_recv += [self.grid.comm.Irecv(self.in_w, source=self.west, tag=4*self.nt + 2)]
|
||||||
|
|
||||||
|
|
||||||
#Wait for incoming transfers to complete
|
#Wait for incoming transfers to complete
|
||||||
for comm in comm_recv:
|
for comm in comm_recv:
|
||||||
comm.wait()
|
comm.wait()
|
||||||
|
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()
|
||||||
|
|
||||||
#Upload to the GPU
|
#Upload to the GPU
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] += time.time()
|
||||||
|
|
||||||
if self.east is not None:
|
if self.east is not None:
|
||||||
for k in range(self.nvars):
|
for k in range(self.nvars):
|
||||||
self.sim.u0[k].upload(self.sim.stream, self.in_e[k,:,:], extent=self.write_e)
|
self.sim.u0[k].upload(self.sim.stream, self.in_e[k,:,:], extent=self.write_e)
|
||||||
@ -405,9 +469,12 @@ class MPISimulator(Simulator.BaseSimulator):
|
|||||||
for k in range(self.nvars):
|
for k in range(self.nvars):
|
||||||
self.sim.u0[k].upload(self.sim.stream, self.in_w[k,:,:], extent=self.write_w)
|
self.sim.u0[k].upload(self.sim.stream, self.in_w[k,:,:], extent=self.write_w)
|
||||||
|
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] += time.time()
|
||||||
|
|
||||||
#Wait for sending to complete
|
#Wait for sending to complete
|
||||||
|
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()
|
||||||
|
|
||||||
for comm in comm_send:
|
for comm in comm_send:
|
||||||
comm.wait()
|
comm.wait()
|
||||||
|
|
||||||
|
self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()
|
||||||
|
|
||||||
|
263
GPUSimulators/SHMEMSimulator.py
Normal file
263
GPUSimulators/SHMEMSimulator.py
Normal file
@ -0,0 +1,263 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
"""
|
||||||
|
This python module implements SHMEM simulator group class
|
||||||
|
|
||||||
|
Copyright (C) 2020 Norwegian Meteorological Institute
|
||||||
|
|
||||||
|
This program is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from GPUSimulators import Simulator, CudaContext
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
import pycuda.driver as cuda
|
||||||
|
|
||||||
|
import time
|
||||||
|
|
||||||
|
class SHMEMSimulator(Simulator.BaseSimulator):
|
||||||
|
"""
|
||||||
|
Class which handles communication and synchronization between simulators in different
|
||||||
|
contexts (presumably on different GPUs)
|
||||||
|
"""
|
||||||
|
def __init__(self, sims, grid):
|
||||||
|
self.logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
assert(len(sims) > 1)
|
||||||
|
|
||||||
|
self.sims = sims
|
||||||
|
|
||||||
|
# XXX: This is not what was intended. Do we need extra wrapper class SHMEMSimulator?
|
||||||
|
# See also getOutput() and check().
|
||||||
|
#
|
||||||
|
# SHMEMSimulatorGroup would then not have any superclass, but manage a collection of
|
||||||
|
# SHMEMSimulators that have BaseSimulator as a superclass.
|
||||||
|
#
|
||||||
|
# This would also eliminate the need for all the array bookkeeping in this class.
|
||||||
|
autotuner = sims[0].context.autotuner
|
||||||
|
sims[0].context.autotuner = None
|
||||||
|
boundary_conditions = sims[0].getBoundaryConditions()
|
||||||
|
super().__init__(sims[0].context,
|
||||||
|
sims[0].nx, sims[0].ny,
|
||||||
|
sims[0].dx, sims[0].dy,
|
||||||
|
boundary_conditions,
|
||||||
|
sims[0].cfl_scale,
|
||||||
|
sims[0].num_substeps,
|
||||||
|
sims[0].block_size[0], sims[0].block_size[1])
|
||||||
|
sims[0].context.autotuner = autotuner
|
||||||
|
|
||||||
|
self.sims = sims
|
||||||
|
self.grid = grid
|
||||||
|
|
||||||
|
self.east = [None] * len(self.sims)
|
||||||
|
self.west = [None] * len(self.sims)
|
||||||
|
self.north = [None] * len(self.sims)
|
||||||
|
self.south = [None] * len(self.sims)
|
||||||
|
|
||||||
|
self.nvars = [None] * len(self.sims)
|
||||||
|
|
||||||
|
self.read_e = [None] * len(self.sims)
|
||||||
|
self.read_w = [None] * len(self.sims)
|
||||||
|
self.read_n = [None] * len(self.sims)
|
||||||
|
self.read_s = [None] * len(self.sims)
|
||||||
|
|
||||||
|
self.write_e = [None] * len(self.sims)
|
||||||
|
self.write_w = [None] * len(self.sims)
|
||||||
|
self.write_n = [None] * len(self.sims)
|
||||||
|
self.write_s = [None] * len(self.sims)
|
||||||
|
|
||||||
|
self.e = [None] * len(self.sims)
|
||||||
|
self.w = [None] * len(self.sims)
|
||||||
|
self.n = [None] * len(self.sims)
|
||||||
|
self.s = [None] * len(self.sims)
|
||||||
|
|
||||||
|
for i, sim in enumerate(self.sims):
|
||||||
|
#Get neighbor subdomain ids
|
||||||
|
self.east[i] = grid.getEast(i)
|
||||||
|
self.west[i] = grid.getWest(i)
|
||||||
|
self.north[i] = grid.getNorth(i)
|
||||||
|
self.south[i] = grid.getSouth(i)
|
||||||
|
|
||||||
|
#Get coordinate of this subdomain
|
||||||
|
#and handle global boundary conditions
|
||||||
|
new_boundary_conditions = Simulator.BoundaryCondition({
|
||||||
|
'north': Simulator.BoundaryCondition.Type.Dirichlet,
|
||||||
|
'south': Simulator.BoundaryCondition.Type.Dirichlet,
|
||||||
|
'east': Simulator.BoundaryCondition.Type.Dirichlet,
|
||||||
|
'west': Simulator.BoundaryCondition.Type.Dirichlet
|
||||||
|
})
|
||||||
|
gi, gj = grid.getCoordinate(i)
|
||||||
|
if (gi == 0 and boundary_conditions.west != Simulator.BoundaryCondition.Type.Periodic):
|
||||||
|
self.west = None
|
||||||
|
new_boundary_conditions.west = boundary_conditions.west;
|
||||||
|
if (gj == 0 and boundary_conditions.south != Simulator.BoundaryCondition.Type.Periodic):
|
||||||
|
self.south = None
|
||||||
|
new_boundary_conditions.south = boundary_conditions.south;
|
||||||
|
if (gi == grid.grid[0]-1 and boundary_conditions.east != Simulator.BoundaryCondition.Type.Periodic):
|
||||||
|
self.east = None
|
||||||
|
new_boundary_conditions.east = boundary_conditions.east;
|
||||||
|
if (gj == grid.grid[1]-1 and boundary_conditions.north != Simulator.BoundaryCondition.Type.Periodic):
|
||||||
|
self.north = None
|
||||||
|
new_boundary_conditions.north = boundary_conditions.north;
|
||||||
|
sim.setBoundaryConditions(new_boundary_conditions)
|
||||||
|
|
||||||
|
#Get number of variables
|
||||||
|
self.nvars[i] = len(sim.getOutput().gpu_variables)
|
||||||
|
|
||||||
|
#Shorthands for computing extents and sizes
|
||||||
|
gc_x = int(sim.getOutput()[0].x_halo)
|
||||||
|
gc_y = int(sim.getOutput()[0].y_halo)
|
||||||
|
nx = int(sim.nx)
|
||||||
|
ny = int(sim.ny)
|
||||||
|
|
||||||
|
#Set regions for ghost cells to read from
|
||||||
|
#These have the format [x0, y0, width, height]
|
||||||
|
self.read_e[i] = np.array([ nx, 0, gc_x, ny + 2*gc_y])
|
||||||
|
self.read_w[i] = np.array([gc_x, 0, gc_x, ny + 2*gc_y])
|
||||||
|
self.read_n[i] = np.array([gc_x, ny, nx, gc_y])
|
||||||
|
self.read_s[i] = np.array([gc_x, gc_y, nx, gc_y])
|
||||||
|
|
||||||
|
#Set regions for ghost cells to write to
|
||||||
|
self.write_e[i] = self.read_e[i] + np.array([gc_x, 0, 0, 0])
|
||||||
|
self.write_w[i] = self.read_w[i] - np.array([gc_x, 0, 0, 0])
|
||||||
|
self.write_n[i] = self.read_n[i] + np.array([0, gc_y, 0, 0])
|
||||||
|
self.write_s[i] = self.read_s[i] - np.array([0, gc_y, 0, 0])
|
||||||
|
|
||||||
|
#Allocate host data
|
||||||
|
#Note that east and west also transfer ghost cells
|
||||||
|
#whilst north/south only transfer internal cells
|
||||||
|
#Reuses the width/height defined in the read-extets above
|
||||||
|
self.e[i] = np.empty((self.nvars[i], self.read_e[i][3], self.read_e[i][2]), dtype=np.float32)
|
||||||
|
self.w[i] = np.empty((self.nvars[i], self.read_w[i][3], self.read_w[i][2]), dtype=np.float32)
|
||||||
|
self.n[i] = np.empty((self.nvars[i], self.read_n[i][3], self.read_n[i][2]), dtype=np.float32)
|
||||||
|
self.s[i] = np.empty((self.nvars[i], self.read_s[i][3], self.read_s[i][2]), dtype=np.float32)
|
||||||
|
|
||||||
|
self.logger.debug("Initialized {:d} subdomains".format(len(self.sims)))
|
||||||
|
|
||||||
|
|
||||||
|
def substep(self, dt, step_number):
|
||||||
|
self.exchange()
|
||||||
|
|
||||||
|
for i, sim in enumerate(self.sims):
|
||||||
|
sim.substep(dt, step_number)
|
||||||
|
|
||||||
|
def getOutput(self):
|
||||||
|
# XXX: Does not return what we would expect.
|
||||||
|
# Returns first subdomain, but we want the whole domain.
|
||||||
|
return self.sims[0].getOutput()
|
||||||
|
|
||||||
|
def synchronize(self):
|
||||||
|
for sim in self.sims:
|
||||||
|
sim.synchronize()
|
||||||
|
|
||||||
|
def check(self):
|
||||||
|
# XXX: Does not return what we would expect.
|
||||||
|
# Checks only first subdomain, but we want to check the whole domain.
|
||||||
|
return self.sims[0].check()
|
||||||
|
|
||||||
|
def computeDt(self):
|
||||||
|
global_dt = float("inf")
|
||||||
|
|
||||||
|
for sim in self.sims:
|
||||||
|
sim.context.synchronize()
|
||||||
|
|
||||||
|
for sim in self.sims:
|
||||||
|
local_dt = sim.computeDt()
|
||||||
|
if local_dt < global_dt:
|
||||||
|
global_dt = local_dt
|
||||||
|
self.logger.debug("Local dt: {:f}".format(local_dt))
|
||||||
|
|
||||||
|
self.logger.debug("Global dt: {:f}".format(global_dt))
|
||||||
|
return global_dt
|
||||||
|
|
||||||
|
def getExtent(self, index=0):
|
||||||
|
"""
|
||||||
|
Function which returns the extent of the subdomain with index
|
||||||
|
index in the grid
|
||||||
|
"""
|
||||||
|
width = self.sims[index].nx*self.sims[index].dx
|
||||||
|
height = self.sims[index].ny*self.sims[index].dy
|
||||||
|
i, j = self.grid.getCoordinate(index)
|
||||||
|
x0 = i * width
|
||||||
|
y0 = j * height
|
||||||
|
x1 = x0 + width
|
||||||
|
y1 = y0 + height
|
||||||
|
return [x0, x1, y0, y1]
|
||||||
|
|
||||||
|
def exchange(self):
|
||||||
|
####
|
||||||
|
# First transfer internal cells north-south
|
||||||
|
####
|
||||||
|
for i in range(len(self.sims)):
|
||||||
|
self.ns_download(i)
|
||||||
|
|
||||||
|
for i in range(len(self.sims)):
|
||||||
|
self.ns_upload(i)
|
||||||
|
|
||||||
|
####
|
||||||
|
# Then transfer east-west including ghost cells that have been filled in by north-south transfer above
|
||||||
|
####
|
||||||
|
for i in range(len(self.sims)):
|
||||||
|
self.ew_download(i)
|
||||||
|
|
||||||
|
for i in range(len(self.sims)):
|
||||||
|
self.ew_upload(i)
|
||||||
|
|
||||||
|
def ns_download(self, i):
|
||||||
|
#Download from the GPU
|
||||||
|
if self.north[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the north)
|
||||||
|
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.n[i][k,:,:], extent=self.read_n[i])
|
||||||
|
if self.south[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the south)
|
||||||
|
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.s[i][k,:,:], extent=self.read_s[i])
|
||||||
|
self.sims[i].stream.synchronize()
|
||||||
|
|
||||||
|
def ns_upload(self, i):
|
||||||
|
#Upload to the GPU
|
||||||
|
if self.north[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
self.sims[i].u0[k].upload(self.sims[i].stream, self.s[self.north[i]][k,:,:], extent=self.write_n[i])
|
||||||
|
if self.south[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
self.sims[i].u0[k].upload(self.sims[i].stream, self.n[self.south[i]][k,:,:], extent=self.write_s[i])
|
||||||
|
|
||||||
|
def ew_download(self, i):
|
||||||
|
#Download from the GPU
|
||||||
|
if self.east[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the east)
|
||||||
|
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.e[i][k,:,:], extent=self.read_e[i])
|
||||||
|
if self.west[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the west)
|
||||||
|
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.w[i][k,:,:], extent=self.read_w[i])
|
||||||
|
self.sims[i].stream.synchronize()
|
||||||
|
|
||||||
|
def ew_upload(self, i):
|
||||||
|
#Upload to the GPU
|
||||||
|
if self.east[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
self.sims[i].u0[k].upload(self.sims[i].stream, self.w[self.east[i]][k,:,:], extent=self.write_e[i])
|
||||||
|
#test_east = np.ones_like(self.e[self.east[i]][k,:,:])
|
||||||
|
#self.sims[i].u0[k].upload(self.sims[i].stream, test_east, extent=self.write_e[i])
|
||||||
|
if self.west[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
self.sims[i].u0[k].upload(self.sims[i].stream, self.e[self.west[i]][k,:,:], extent=self.write_w[i])
|
||||||
|
#test_west = np.ones_like(self.e[self.west[i]][k,:,:])
|
||||||
|
#self.sims[i].u0[k].upload(self.sims[i].stream, test_west, extent=self.write_w[i])
|
394
GPUSimulators/SHMEMSimulatorGroup.py
Normal file
394
GPUSimulators/SHMEMSimulatorGroup.py
Normal file
@ -0,0 +1,394 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
"""
|
||||||
|
This python module implements SHMEM simulator group class
|
||||||
|
|
||||||
|
Copyright (C) 2020 Norwegian Meteorological Institute
|
||||||
|
|
||||||
|
This program is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from GPUSimulators import Simulator, CudaContext
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
import pycuda.driver as cuda
|
||||||
|
|
||||||
|
import time
|
||||||
|
|
||||||
|
class SHMEMGrid(object):
|
||||||
|
"""
|
||||||
|
Class which represents an SHMEM grid of GPUs. Facilitates easy communication between
|
||||||
|
neighboring subdomains in the grid. Contains one CUDA context per subdomain.
|
||||||
|
"""
|
||||||
|
def __init__(self, ngpus=None, ndims=2):
|
||||||
|
self.logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
cuda.init(flags=0)
|
||||||
|
self.logger.info("Initializing CUDA")
|
||||||
|
num_cuda_devices = cuda.Device.count()
|
||||||
|
|
||||||
|
if ngpus is None:
|
||||||
|
ngpus = num_cuda_devices
|
||||||
|
|
||||||
|
# XXX: disabled for testing on single-GPU system
|
||||||
|
#assert ngpus <= num_cuda_devices, "Trying to allocate more GPUs than are available in the system."
|
||||||
|
#assert ngpus >= 2, "Must have at least two GPUs available to run multi-GPU simulations."
|
||||||
|
|
||||||
|
assert ndims == 2, "Unsupported number of dimensions. Must be two at the moment"
|
||||||
|
|
||||||
|
self.ngpus = ngpus
|
||||||
|
self.ndims = ndims
|
||||||
|
|
||||||
|
self.grid = SHMEMGrid.getGrid(self.ngpus, self.ndims)
|
||||||
|
|
||||||
|
self.logger.debug("Created {:}-dimensional SHMEM grid, using {:} GPUs".format(
|
||||||
|
self.ndims, self.ngpus))
|
||||||
|
|
||||||
|
# XXX: Is this a natural place to store the contexts? Consider moving contexts out of this
|
||||||
|
# class, into notebook / calling script (shmemTesting.py)
|
||||||
|
self.cuda_contexts = []
|
||||||
|
|
||||||
|
for i in range(self.ngpus):
|
||||||
|
# XXX: disabled for testing on single-GPU system
|
||||||
|
#self.cuda_contexts.append(CudaContext.CudaContext(device=i, autotuning=False))
|
||||||
|
self.cuda_contexts.append(CudaContext.CudaContext(device=0, autotuning=False))
|
||||||
|
|
||||||
|
def getCoordinate(self, index):
|
||||||
|
i = (index % self.grid[0])
|
||||||
|
j = (index // self.grid[0])
|
||||||
|
return i, j
|
||||||
|
|
||||||
|
def getIndex(self, i, j):
|
||||||
|
return j*self.grid[0] + i
|
||||||
|
|
||||||
|
def getEast(self, index):
|
||||||
|
i, j = self.getCoordinate(index)
|
||||||
|
i = (i+1) % self.grid[0]
|
||||||
|
return self.getIndex(i, j)
|
||||||
|
|
||||||
|
def getWest(self, index):
|
||||||
|
i, j = self.getCoordinate(index)
|
||||||
|
i = (i+self.grid[0]-1) % self.grid[0]
|
||||||
|
return self.getIndex(i, j)
|
||||||
|
|
||||||
|
def getNorth(self, index):
|
||||||
|
i, j = self.getCoordinate(index)
|
||||||
|
j = (j+1) % self.grid[1]
|
||||||
|
return self.getIndex(i, j)
|
||||||
|
|
||||||
|
def getSouth(self, index):
|
||||||
|
i, j = self.getCoordinate(index)
|
||||||
|
j = (j+self.grid[1]-1) % self.grid[1]
|
||||||
|
return self.getIndex(i, j)
|
||||||
|
|
||||||
|
def getGrid(num_gpus, num_dims):
|
||||||
|
assert(isinstance(num_gpus, int))
|
||||||
|
assert(isinstance(num_dims, int))
|
||||||
|
|
||||||
|
# Adapted from https://stackoverflow.com/questions/28057307/factoring-a-number-into-roughly-equal-factors
|
||||||
|
# Original code by https://stackoverflow.com/users/3928385/ishamael
|
||||||
|
# Factorizes a number into n roughly equal factors
|
||||||
|
|
||||||
|
#Dictionary to remember already computed permutations
|
||||||
|
memo = {}
|
||||||
|
def dp(n, left): # returns tuple (cost, [factors])
|
||||||
|
"""
|
||||||
|
Recursively searches through all factorizations
|
||||||
|
"""
|
||||||
|
|
||||||
|
#Already tried: return existing result
|
||||||
|
if (n, left) in memo:
|
||||||
|
return memo[(n, left)]
|
||||||
|
|
||||||
|
#Spent all factors: return number itself
|
||||||
|
if left == 1:
|
||||||
|
return (n, [n])
|
||||||
|
|
||||||
|
#Find new factor
|
||||||
|
i = 2
|
||||||
|
best = n
|
||||||
|
bestTuple = [n]
|
||||||
|
while i * i < n:
|
||||||
|
#If factor found
|
||||||
|
if n % i == 0:
|
||||||
|
#Factorize remainder
|
||||||
|
rem = dp(n // i, left - 1)
|
||||||
|
|
||||||
|
#If new permutation better, save it
|
||||||
|
if rem[0] + i < best:
|
||||||
|
best = rem[0] + i
|
||||||
|
bestTuple = [i] + rem[1]
|
||||||
|
i += 1
|
||||||
|
|
||||||
|
#Store calculation
|
||||||
|
memo[(n, left)] = (best, bestTuple)
|
||||||
|
return memo[(n, left)]
|
||||||
|
|
||||||
|
|
||||||
|
grid = dp(num_gpus, num_dims)[1]
|
||||||
|
|
||||||
|
if (len(grid) < num_dims):
|
||||||
|
#Split problematic 4
|
||||||
|
if (4 in grid):
|
||||||
|
grid.remove(4)
|
||||||
|
grid.append(2)
|
||||||
|
grid.append(2)
|
||||||
|
|
||||||
|
#Pad with ones to guarantee num_dims
|
||||||
|
grid = grid + [1]*(num_dims - len(grid))
|
||||||
|
|
||||||
|
#Sort in descending order
|
||||||
|
grid = np.sort(grid)
|
||||||
|
grid = grid[::-1]
|
||||||
|
|
||||||
|
return grid
|
||||||
|
|
||||||
|
class SHMEMSimulatorGroup(object):
|
||||||
|
"""
|
||||||
|
Class which handles communication and synchronization between simulators in different
|
||||||
|
contexts (typically on different GPUs)
|
||||||
|
"""
|
||||||
|
def __init__(self, sims, grid):
|
||||||
|
self.logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
assert(len(sims) > 1)
|
||||||
|
|
||||||
|
self.sims = sims
|
||||||
|
|
||||||
|
# XXX: This is not what was intended. Do we need extra wrapper class SHMEMSimulator?
|
||||||
|
# See also getOutput() and check().
|
||||||
|
#
|
||||||
|
# SHMEMSimulatorGroup would then not have any superclass, but manage a collection of
|
||||||
|
# SHMEMSimulators that have BaseSimulator as a superclass.
|
||||||
|
#
|
||||||
|
# This would also eliminate the need for all the array bookkeeping in this class.
|
||||||
|
#
|
||||||
|
CONT HERE! Model shmemTesting after mpiTesting and divide existing functionality between SHMEMSimulatorGroup and SHMEMSimulator
|
||||||
|
|
||||||
|
autotuner = sims[0].context.autotuner
|
||||||
|
sims[0].context.autotuner = None
|
||||||
|
boundary_conditions = sims[0].getBoundaryConditions()
|
||||||
|
super().__init__(sims[0].context,
|
||||||
|
sims[0].nx, sims[0].ny,
|
||||||
|
sims[0].dx, sims[0].dy,
|
||||||
|
boundary_conditions,
|
||||||
|
sims[0].cfl_scale,
|
||||||
|
sims[0].num_substeps,
|
||||||
|
sims[0].block_size[0], sims[0].block_size[1])
|
||||||
|
sims[0].context.autotuner = autotuner
|
||||||
|
|
||||||
|
self.sims = sims
|
||||||
|
self.grid = grid
|
||||||
|
|
||||||
|
self.east = [None] * len(self.sims)
|
||||||
|
self.west = [None] * len(self.sims)
|
||||||
|
self.north = [None] * len(self.sims)
|
||||||
|
self.south = [None] * len(self.sims)
|
||||||
|
|
||||||
|
self.nvars = [None] * len(self.sims)
|
||||||
|
|
||||||
|
self.read_e = [None] * len(self.sims)
|
||||||
|
self.read_w = [None] * len(self.sims)
|
||||||
|
self.read_n = [None] * len(self.sims)
|
||||||
|
self.read_s = [None] * len(self.sims)
|
||||||
|
|
||||||
|
self.write_e = [None] * len(self.sims)
|
||||||
|
self.write_w = [None] * len(self.sims)
|
||||||
|
self.write_n = [None] * len(self.sims)
|
||||||
|
self.write_s = [None] * len(self.sims)
|
||||||
|
|
||||||
|
self.e = [None] * len(self.sims)
|
||||||
|
self.w = [None] * len(self.sims)
|
||||||
|
self.n = [None] * len(self.sims)
|
||||||
|
self.s = [None] * len(self.sims)
|
||||||
|
|
||||||
|
for i, sim in enumerate(self.sims):
|
||||||
|
#Get neighbor subdomain ids
|
||||||
|
self.east[i] = grid.getEast(i)
|
||||||
|
self.west[i] = grid.getWest(i)
|
||||||
|
self.north[i] = grid.getNorth(i)
|
||||||
|
self.south[i] = grid.getSouth(i)
|
||||||
|
|
||||||
|
#Get coordinate of this subdomain
|
||||||
|
#and handle global boundary conditions
|
||||||
|
new_boundary_conditions = Simulator.BoundaryCondition({
|
||||||
|
'north': Simulator.BoundaryCondition.Type.Dirichlet,
|
||||||
|
'south': Simulator.BoundaryCondition.Type.Dirichlet,
|
||||||
|
'east': Simulator.BoundaryCondition.Type.Dirichlet,
|
||||||
|
'west': Simulator.BoundaryCondition.Type.Dirichlet
|
||||||
|
})
|
||||||
|
gi, gj = grid.getCoordinate(i)
|
||||||
|
if (gi == 0 and boundary_conditions.west != Simulator.BoundaryCondition.Type.Periodic):
|
||||||
|
self.west = None
|
||||||
|
new_boundary_conditions.west = boundary_conditions.west;
|
||||||
|
if (gj == 0 and boundary_conditions.south != Simulator.BoundaryCondition.Type.Periodic):
|
||||||
|
self.south = None
|
||||||
|
new_boundary_conditions.south = boundary_conditions.south;
|
||||||
|
if (gi == grid.grid[0]-1 and boundary_conditions.east != Simulator.BoundaryCondition.Type.Periodic):
|
||||||
|
self.east = None
|
||||||
|
new_boundary_conditions.east = boundary_conditions.east;
|
||||||
|
if (gj == grid.grid[1]-1 and boundary_conditions.north != Simulator.BoundaryCondition.Type.Periodic):
|
||||||
|
self.north = None
|
||||||
|
new_boundary_conditions.north = boundary_conditions.north;
|
||||||
|
sim.setBoundaryConditions(new_boundary_conditions)
|
||||||
|
|
||||||
|
#Get number of variables
|
||||||
|
self.nvars[i] = len(sim.getOutput().gpu_variables)
|
||||||
|
|
||||||
|
#Shorthands for computing extents and sizes
|
||||||
|
gc_x = int(sim.getOutput()[0].x_halo)
|
||||||
|
gc_y = int(sim.getOutput()[0].y_halo)
|
||||||
|
nx = int(sim.nx)
|
||||||
|
ny = int(sim.ny)
|
||||||
|
|
||||||
|
#Set regions for ghost cells to read from
|
||||||
|
#These have the format [x0, y0, width, height]
|
||||||
|
self.read_e[i] = np.array([ nx, 0, gc_x, ny + 2*gc_y])
|
||||||
|
self.read_w[i] = np.array([gc_x, 0, gc_x, ny + 2*gc_y])
|
||||||
|
self.read_n[i] = np.array([gc_x, ny, nx, gc_y])
|
||||||
|
self.read_s[i] = np.array([gc_x, gc_y, nx, gc_y])
|
||||||
|
|
||||||
|
#Set regions for ghost cells to write to
|
||||||
|
self.write_e[i] = self.read_e[i] + np.array([gc_x, 0, 0, 0])
|
||||||
|
self.write_w[i] = self.read_w[i] - np.array([gc_x, 0, 0, 0])
|
||||||
|
self.write_n[i] = self.read_n[i] + np.array([0, gc_y, 0, 0])
|
||||||
|
self.write_s[i] = self.read_s[i] - np.array([0, gc_y, 0, 0])
|
||||||
|
|
||||||
|
#Allocate host data
|
||||||
|
#Note that east and west also transfer ghost cells
|
||||||
|
#whilst north/south only transfer internal cells
|
||||||
|
#Reuses the width/height defined in the read-extets above
|
||||||
|
self.e[i] = np.empty((self.nvars[i], self.read_e[i][3], self.read_e[i][2]), dtype=np.float32)
|
||||||
|
self.w[i] = np.empty((self.nvars[i], self.read_w[i][3], self.read_w[i][2]), dtype=np.float32)
|
||||||
|
self.n[i] = np.empty((self.nvars[i], self.read_n[i][3], self.read_n[i][2]), dtype=np.float32)
|
||||||
|
self.s[i] = np.empty((self.nvars[i], self.read_s[i][3], self.read_s[i][2]), dtype=np.float32)
|
||||||
|
|
||||||
|
self.logger.debug("Initialized {:d} subdomains".format(len(self.sims)))
|
||||||
|
|
||||||
|
|
||||||
|
def substep(self, dt, step_number):
|
||||||
|
self.exchange()
|
||||||
|
|
||||||
|
for i, sim in enumerate(self.sims):
|
||||||
|
sim.substep(dt, step_number)
|
||||||
|
|
||||||
|
def getOutput(self):
|
||||||
|
# XXX: Does not return what we would expect.
|
||||||
|
# Returns first subdomain, but we want the whole domain.
|
||||||
|
return self.sims[0].getOutput()
|
||||||
|
|
||||||
|
def synchronize(self):
|
||||||
|
for sim in self.sims:
|
||||||
|
sim.synchronize()
|
||||||
|
|
||||||
|
def check(self):
|
||||||
|
# XXX: Does not return what we would expect.
|
||||||
|
# Checks only first subdomain, but we want to check the whole domain.
|
||||||
|
return self.sims[0].check()
|
||||||
|
|
||||||
|
def computeDt(self):
|
||||||
|
global_dt = float("inf")
|
||||||
|
|
||||||
|
for sim in self.sims:
|
||||||
|
sim.context.synchronize()
|
||||||
|
|
||||||
|
for sim in self.sims:
|
||||||
|
local_dt = sim.computeDt()
|
||||||
|
if local_dt < global_dt:
|
||||||
|
global_dt = local_dt
|
||||||
|
self.logger.debug("Local dt: {:f}".format(local_dt))
|
||||||
|
|
||||||
|
self.logger.debug("Global dt: {:f}".format(global_dt))
|
||||||
|
return global_dt
|
||||||
|
|
||||||
|
def getExtent(self, index=0):
|
||||||
|
"""
|
||||||
|
Function which returns the extent of the subdomain with index
|
||||||
|
index in the grid
|
||||||
|
"""
|
||||||
|
width = self.sims[index].nx*self.sims[index].dx
|
||||||
|
height = self.sims[index].ny*self.sims[index].dy
|
||||||
|
i, j = self.grid.getCoordinate(index)
|
||||||
|
x0 = i * width
|
||||||
|
y0 = j * height
|
||||||
|
x1 = x0 + width
|
||||||
|
y1 = y0 + height
|
||||||
|
return [x0, x1, y0, y1]
|
||||||
|
|
||||||
|
def exchange(self):
|
||||||
|
####
|
||||||
|
# First transfer internal cells north-south
|
||||||
|
####
|
||||||
|
for i in range(len(self.sims)):
|
||||||
|
self.ns_download(i)
|
||||||
|
|
||||||
|
for i in range(len(self.sims)):
|
||||||
|
self.ns_upload(i)
|
||||||
|
|
||||||
|
####
|
||||||
|
# Then transfer east-west including ghost cells that have been filled in by north-south transfer above
|
||||||
|
####
|
||||||
|
for i in range(len(self.sims)):
|
||||||
|
self.ew_download(i)
|
||||||
|
|
||||||
|
for i in range(len(self.sims)):
|
||||||
|
self.ew_upload(i)
|
||||||
|
|
||||||
|
def ns_download(self, i):
|
||||||
|
#Download from the GPU
|
||||||
|
if self.north[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the north)
|
||||||
|
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.n[i][k,:,:], extent=self.read_n[i])
|
||||||
|
if self.south[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the south)
|
||||||
|
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.s[i][k,:,:], extent=self.read_s[i])
|
||||||
|
self.sims[i].stream.synchronize()
|
||||||
|
|
||||||
|
def ns_upload(self, i):
|
||||||
|
#Upload to the GPU
|
||||||
|
if self.north[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
self.sims[i].u0[k].upload(self.sims[i].stream, self.s[self.north[i]][k,:,:], extent=self.write_n[i])
|
||||||
|
if self.south[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
self.sims[i].u0[k].upload(self.sims[i].stream, self.n[self.south[i]][k,:,:], extent=self.write_s[i])
|
||||||
|
|
||||||
|
def ew_download(self, i):
|
||||||
|
#Download from the GPU
|
||||||
|
if self.east[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the east)
|
||||||
|
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.e[i][k,:,:], extent=self.read_e[i])
|
||||||
|
if self.west[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
# XXX: Unnecessary global sync (only need to sync with neighboring subdomain to the west)
|
||||||
|
self.sims[i].u0[k].download(self.sims[i].stream, cpu_data=self.w[i][k,:,:], extent=self.read_w[i])
|
||||||
|
self.sims[i].stream.synchronize()
|
||||||
|
|
||||||
|
def ew_upload(self, i):
|
||||||
|
#Upload to the GPU
|
||||||
|
if self.east[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
self.sims[i].u0[k].upload(self.sims[i].stream, self.w[self.east[i]][k,:,:], extent=self.write_e[i])
|
||||||
|
#test_east = np.ones_like(self.e[self.east[i]][k,:,:])
|
||||||
|
#self.sims[i].u0[k].upload(self.sims[i].stream, test_east, extent=self.write_e[i])
|
||||||
|
if self.west[i] is not None:
|
||||||
|
for k in range(self.nvars[i]):
|
||||||
|
self.sims[i].u0[k].upload(self.sims[i].stream, self.e[self.west[i]][k,:,:], extent=self.write_w[i])
|
||||||
|
#test_west = np.ones_like(self.e[self.west[i]][k,:,:])
|
||||||
|
#self.sims[i].u0[k].upload(self.sims[i].stream, test_west, extent=self.write_w[i])
|
@ -162,6 +162,7 @@ class BaseSimulator(object):
|
|||||||
|
|
||||||
#Create a CUDA stream
|
#Create a CUDA stream
|
||||||
self.stream = cuda.Stream()
|
self.stream = cuda.Stream()
|
||||||
|
self.internal_stream = cuda.Stream()
|
||||||
|
|
||||||
#Keep track of simulation time and number of timesteps
|
#Keep track of simulation time and number of timesteps
|
||||||
self.t = 0.0
|
self.t = 0.0
|
||||||
|
@ -147,7 +147,18 @@ __global__ void KP07DimsplitKernel(
|
|||||||
float* E1_ptr_, int E1_pitch_,
|
float* E1_ptr_, int E1_pitch_,
|
||||||
|
|
||||||
//Output CFL
|
//Output CFL
|
||||||
float* cfl_) {
|
float* cfl_,
|
||||||
|
|
||||||
|
//Subarea of internal domain to compute
|
||||||
|
int x0=0, int y0=0,
|
||||||
|
int x1=0, int y1=0) {
|
||||||
|
|
||||||
|
if(x1 == 0)
|
||||||
|
x1 = nx_;
|
||||||
|
|
||||||
|
if(y1 == 0)
|
||||||
|
y1 = ny_;
|
||||||
|
|
||||||
const unsigned int w = BLOCK_WIDTH;
|
const unsigned int w = BLOCK_WIDTH;
|
||||||
const unsigned int h = BLOCK_HEIGHT;
|
const unsigned int h = BLOCK_HEIGHT;
|
||||||
const unsigned int gc_x = 2;
|
const unsigned int gc_x = 2;
|
||||||
@ -160,10 +171,10 @@ __global__ void KP07DimsplitKernel(
|
|||||||
__shared__ float F[4][h+2*gc_y][w+2*gc_x];
|
__shared__ float F[4][h+2*gc_y][w+2*gc_x];
|
||||||
|
|
||||||
//Read into shared memory
|
//Read into shared memory
|
||||||
readBlock<w, h, gc_x, gc_y, 1, 1>( rho0_ptr_, rho0_pitch_, Q[0], nx_, ny_, boundary_conditions_);
|
readBlock<w, h, gc_x, gc_y, 1, 1>( rho0_ptr_, rho0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
|
||||||
readBlock<w, h, gc_x, gc_y, -1, 1>(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_, ny_, boundary_conditions_);
|
readBlock<w, h, gc_x, gc_y, -1, 1>(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
|
||||||
readBlock<w, h, gc_x, gc_y, 1, -1>(rho_v0_ptr_, rho_v0_pitch_, Q[2], nx_, ny_, boundary_conditions_);
|
readBlock<w, h, gc_x, gc_y, 1, -1>(rho_v0_ptr_, rho_v0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
|
||||||
readBlock<w, h, gc_x, gc_y, 1, 1>( E0_ptr_, E0_pitch_, Q[3], nx_, ny_, boundary_conditions_);
|
readBlock<w, h, gc_x, gc_y, 1, 1>( E0_ptr_, E0_pitch_, Q[3], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
|
||||||
|
|
||||||
//Step 0 => evolve x first, then y
|
//Step 0 => evolve x first, then y
|
||||||
if (step_ == 0) {
|
if (step_ == 0) {
|
||||||
@ -224,10 +235,10 @@ __global__ void KP07DimsplitKernel(
|
|||||||
|
|
||||||
|
|
||||||
// Write to main memory for all internal cells
|
// Write to main memory for all internal cells
|
||||||
writeBlock<w, h, gc_x, gc_y>( rho1_ptr_, rho1_pitch_, Q[0], nx_, ny_, 0, 1);
|
writeBlock<w, h, gc_x, gc_y>( rho1_ptr_, rho1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1);
|
||||||
writeBlock<w, h, gc_x, gc_y>(rho_u1_ptr_, rho_u1_pitch_, Q[1], nx_, ny_, 0, 1);
|
writeBlock<w, h, gc_x, gc_y>(rho_u1_ptr_, rho_u1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1);
|
||||||
writeBlock<w, h, gc_x, gc_y>(rho_v1_ptr_, rho_v1_pitch_, Q[2], nx_, ny_, 0, 1);
|
writeBlock<w, h, gc_x, gc_y>(rho_v1_ptr_, rho_v1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1);
|
||||||
writeBlock<w, h, gc_x, gc_y>( E1_ptr_, E1_pitch_, Q[3], nx_, ny_, 0, 1);
|
writeBlock<w, h, gc_x, gc_y>( E1_ptr_, E1_pitch_, Q[3], nx_, ny_, 0, 1, x0, y0, x1, y1);
|
||||||
|
|
||||||
//Compute the CFL for this block
|
//Compute the CFL for this block
|
||||||
if (cfl_ != NULL) {
|
if (cfl_ != NULL) {
|
||||||
|
@ -321,7 +321,9 @@ template<int w, int h, int gc_x, int gc_y, int sign_x, int sign_y>
|
|||||||
inline __device__ void readBlock(float* ptr_, int pitch_,
|
inline __device__ void readBlock(float* ptr_, int pitch_,
|
||||||
float Q[h+2*gc_y][w+2*gc_x],
|
float Q[h+2*gc_y][w+2*gc_x],
|
||||||
const int nx_, const int ny_,
|
const int nx_, const int ny_,
|
||||||
const int boundary_conditions_) {
|
const int boundary_conditions_,
|
||||||
|
int x0, int y0,
|
||||||
|
int x1, int y1) {
|
||||||
//Index of block within domain
|
//Index of block within domain
|
||||||
const int bx = blockDim.x * blockIdx.x;
|
const int bx = blockDim.x * blockIdx.x;
|
||||||
const int by = blockDim.y * blockIdx.y;
|
const int by = blockDim.y * blockIdx.y;
|
||||||
@ -330,14 +332,14 @@ inline __device__ void readBlock(float* ptr_, int pitch_,
|
|||||||
//Loop over all variables
|
//Loop over all variables
|
||||||
for (int j=threadIdx.y; j<h+2*gc_y; j+=h) {
|
for (int j=threadIdx.y; j<h+2*gc_y; j+=h) {
|
||||||
//Handle periodic boundary conditions here
|
//Handle periodic boundary conditions here
|
||||||
int l = handlePeriodicBoundaryY<gc_y>(by + j, ny_, boundary_conditions_);
|
int l = handlePeriodicBoundaryY<gc_y>(by + j + y0, ny_, boundary_conditions_);
|
||||||
l = min(l, ny_+2*gc_y-1);
|
l = min(l, min(ny_+2*gc_y-1, y1+2*gc_y-1));
|
||||||
float* row = (float*) ((char*) ptr_ + pitch_*l);
|
float* row = (float*) ((char*) ptr_ + pitch_*l);
|
||||||
|
|
||||||
for (int i=threadIdx.x; i<w+2*gc_x; i+=w) {
|
for (int i=threadIdx.x; i<w+2*gc_x; i+=w) {
|
||||||
//Handle periodic boundary conditions here
|
//Handle periodic boundary conditions here
|
||||||
int k = handlePeriodicBoundaryX<gc_x>(bx + i, nx_, boundary_conditions_);
|
int k = handlePeriodicBoundaryX<gc_x>(bx + i + x0, nx_, boundary_conditions_);
|
||||||
k = min(k, nx_+2*gc_x-1);
|
k = min(k, min(nx_+2*gc_x-1, x1+2*gc_x-1));
|
||||||
|
|
||||||
//Read from global memory
|
//Read from global memory
|
||||||
Q[j][i] = row[k];
|
Q[j][i] = row[k];
|
||||||
@ -358,14 +360,20 @@ template<int w, int h, int gc_x, int gc_y>
|
|||||||
inline __device__ void writeBlock(float* ptr_, int pitch_,
|
inline __device__ void writeBlock(float* ptr_, int pitch_,
|
||||||
float shmem[h+2*gc_y][w+2*gc_x],
|
float shmem[h+2*gc_y][w+2*gc_x],
|
||||||
const int nx_, const int ny_,
|
const int nx_, const int ny_,
|
||||||
int rk_step_, int rk_order_) {
|
int rk_step_, int rk_order_,
|
||||||
|
int x0, int y0,
|
||||||
|
int x1, int y1) {
|
||||||
|
|
||||||
//Index of cell within domain
|
//Index of cell within domain
|
||||||
const int ti = blockDim.x*blockIdx.x + threadIdx.x + gc_x;
|
const int ti = blockDim.x*blockIdx.x + threadIdx.x + gc_x + x0;
|
||||||
const int tj = blockDim.y*blockIdx.y + threadIdx.y + gc_y;
|
const int tj = blockDim.y*blockIdx.y + threadIdx.y + gc_y + y0;
|
||||||
|
|
||||||
|
//In case we are writing only to a subarea given by (x0, y0) x (x1, y1)
|
||||||
|
const int max_ti = min(nx_+gc_x, x1+gc_x);
|
||||||
|
const int max_tj = min(ny_+gc_y, y1+gc_y);
|
||||||
|
|
||||||
//Only write internal cells
|
//Only write internal cells
|
||||||
if (ti < nx_+gc_x && tj < ny_+gc_y) {
|
if ((x0+gc_x <= ti) && (ti < max_ti) && (y0+gc_y <= tj) && (tj < max_tj)) {
|
||||||
//Index of thread within block
|
//Index of thread within block
|
||||||
const int tx = threadIdx.x + gc_x;
|
const int tx = threadIdx.x + gc_x;
|
||||||
const int ty = threadIdx.y + gc_y;
|
const int ty = threadIdx.y + gc_y;
|
||||||
@ -416,6 +424,9 @@ inline __device__ void writeBlock(float* ptr_, int pitch_,
|
|||||||
row[ti] = t*row[ti] + (1.0f-t)*shmem[ty][tx];
|
row[ti] = t*row[ti] + (1.0f-t)*shmem[ty][tx];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// DEBUG
|
||||||
|
//row[ti] = 99.0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -25,10 +25,13 @@ import numpy as np
|
|||||||
import gc
|
import gc
|
||||||
|
|
||||||
|
|
||||||
def getExtent(width, height, nx, ny, grid):
|
def getExtent(width, height, nx, ny, grid, index=None):
|
||||||
if grid is not None:
|
if grid is not None:
|
||||||
gx = grid.grid[0]
|
gx = grid.grid[0]
|
||||||
gy = grid.grid[1]
|
gy = grid.grid[1]
|
||||||
|
if index is not None:
|
||||||
|
i, j = grid.getCoordinate(index)
|
||||||
|
else:
|
||||||
i, j = grid.getCoordinate()
|
i, j = grid.getCoordinate()
|
||||||
|
|
||||||
dx = (width / gx) / nx
|
dx = (width / gx) / nx
|
||||||
@ -192,7 +195,7 @@ def genShockBubble(nx, ny, gamma, grid=None):
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
def genKelvinHelmholtz(nx, ny, gamma, roughness=0.125, grid=None):
|
def genKelvinHelmholtz(nx, ny, gamma, roughness=0.125, grid=None, index=None):
|
||||||
"""
|
"""
|
||||||
Roughness parameter in (0, 1.0] determines how "squiggly"
|
Roughness parameter in (0, 1.0] determines how "squiggly"
|
||||||
the interface betweeen the zones is
|
the interface betweeen the zones is
|
||||||
@ -234,7 +237,7 @@ def genKelvinHelmholtz(nx, ny, gamma, roughness=0.125, grid=None):
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
x0, x1, y0, y1, _, dy = getExtent(1.0, 1.0, nx, ny, grid)
|
x0, x1, y0, y1, _, dy = getExtent(1.0, 1.0, nx, ny, grid, index)
|
||||||
x = np.linspace(x0, x1, nx)
|
x = np.linspace(x0, x1, nx)
|
||||||
y = np.linspace(y0, y1, ny)
|
y = np.linspace(y0, y1, ny)
|
||||||
_, y = np.meshgrid(x, y)
|
_, y = np.meshgrid(x, y)
|
||||||
@ -274,7 +277,7 @@ def genKelvinHelmholtz(nx, ny, gamma, roughness=0.125, grid=None):
|
|||||||
|
|
||||||
E = 0.5*rho*(u**2+v**2) + p/(gamma-1.0)
|
E = 0.5*rho*(u**2+v**2) + p/(gamma-1.0)
|
||||||
|
|
||||||
_, _, _, _, dx, dy = getExtent(width, height, nx, ny, grid)
|
_, _, _, _, dx, dy = getExtent(width, height, nx, ny, grid, index)
|
||||||
|
|
||||||
|
|
||||||
bc = BoundaryCondition({
|
bc = BoundaryCondition({
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
"""
|
"""
|
||||||
This python module implements Cuda context handling
|
This python module implements visualization techniques/modes
|
||||||
|
|
||||||
Copyright (C) 2018 SINTEF ICT
|
Copyright (C) 2018 SINTEF ICT
|
||||||
|
|
||||||
|
18
README.md
Normal file
18
README.md
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
# ShallowWaterGPU
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
A good place to start exploring this codebase is the notebooks. Complete the following steps to run the notebooks:
|
||||||
|
|
||||||
|
1. Install conda (see e.g. Miniconda or Anaconda)
|
||||||
|
2. Change directory to the repository root and run the following commands
|
||||||
|
3. conda env create -f conda_environment.yml
|
||||||
|
4. conda activate ShallowWaterGPU
|
||||||
|
5. jupyter notebook
|
||||||
|
|
||||||
|
Make sure you are running the correct kernel ("conda:ShallowWaterGPU"). If not, change kernel using the "Kernel"-menu in the notebook.
|
||||||
|
|
||||||
|
If you do not need to run notebooks you may use the conda environment found in conda_environment_hpc.yml
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
Have a look at the conda documentation and https://towardsdatascience.com/how-to-set-up-anaconda-and-jupyter-notebook-the-right-way-de3b7623ea4a
|
||||||
|
|
62
SYSTEMS.md
Normal file
62
SYSTEMS.md
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
$ANSIBLE_VAULT;1.1;AES256
|
||||||
|
61316265663939333638336466323036663861343233316466646432313138653633623662353937
|
||||||
|
3232313165656633346432376237383566363537366534310a303231343936663438653835373161
|
||||||
|
35616161323432653062323164623861353065333761663136313137333732313230626665386336
|
||||||
|
6166656538396463370a356166316363326133313864386536323236346634323537393639653038
|
||||||
|
66336337336132613061353964613638326233356336323962366531333932366539366339623563
|
||||||
|
36333365326463616634323939333062363263636663373635653064626138363464666233316561
|
||||||
|
63393735393233616437386537393739393433663631313864646535636262616336333631396166
|
||||||
|
38643636323530386565396338623366393232313838356536303537393338393634666632656234
|
||||||
|
65353930303762333639376638336364303439306132626531326132376264623063376464636430
|
||||||
|
32333536386134333136313139313861306364333037323363393463333664633764653937623866
|
||||||
|
34313064346261313330373132353563343761323435393930303136353865303163373937623831
|
||||||
|
64343038373162333039653161643233353764633337366434396638376530636261323362373434
|
||||||
|
38393630613065356632663533333331633039663935663732353234643131306665343339373265
|
||||||
|
64356563653838613337663132663234356462343333623139626662363764656239326637653832
|
||||||
|
35396636643937336431623531306133643137623831333936313839333738333730373136666336
|
||||||
|
35623965643664343164373630313362656663386638376237616134343631386366313336626138
|
||||||
|
62376436383837376539663438346431633138383363633862356366376537393932626262383637
|
||||||
|
31323365333139653736623233636233323162343039663035346135326638633430303134396337
|
||||||
|
36353264313835346130643736663665386364343835643166383361316631373338663731373335
|
||||||
|
30313530326662663937666330643565363565616566633333363535656539656531666266613638
|
||||||
|
30306264613438363265646332386535383238373433396337633636616532626161343236336533
|
||||||
|
36366362653137333739353737386563613136653164383437316237643533633133313735633363
|
||||||
|
64326433356266363133343339626333633063326533383632353639613163663966376465396231
|
||||||
|
36663034363534396430316463386564663465323036613636343136643262666566303533346439
|
||||||
|
63396466656639623836613130363835346435633437666463363333356231343038356434343861
|
||||||
|
66643636633739336666316566653136363862346336353862653130346335363334616430366435
|
||||||
|
30376365383262326438306266366265363030353764633630333034663037643037343132303631
|
||||||
|
39316364366234363339363130333765616432306331373566393530653963356539636437383062
|
||||||
|
34633938643563656363633864656361643539663833356638356365373061663964363530393535
|
||||||
|
37646533386235613763396638393539303062326239633238373763326561313634313265613135
|
||||||
|
64646138313562313732393732303133343234323438616165326530333234626363393735636530
|
||||||
|
62353735313231353662353533636134306530623339383730306332613636663366653566313935
|
||||||
|
32343935353566656130393533323639353863666436333839386463396337336635356663373136
|
||||||
|
61323734613239396236393266363631313465363630306565636663396235626132336339623938
|
||||||
|
62383435643661623938393662363262376566613365613465323432343534356433323330666133
|
||||||
|
30303963656635303734316539333038663962626331313366666337663165323230646564623935
|
||||||
|
61316630353739386365323339626166323562616630383538393733353864396565353039656333
|
||||||
|
30343038636231363531383061613836653038373937616163643963393231356235626531366239
|
||||||
|
62343333326434636665363931376235313535343135626261336439636663323233383565633964
|
||||||
|
65333830613131396630336337646230393038386536336365313738316335386261393838383961
|
||||||
|
64656331363738616539346663613261386639353437316231636533353031336464383432623939
|
||||||
|
65386164396231393735643563663337643563633233373338643630313739373861356166616463
|
||||||
|
35306263333963663434376263396464323135346663376334356134393066653439376263376231
|
||||||
|
30333730383163366636323533393334336331633234306536376634313735613263366537346536
|
||||||
|
62366564383861656662353738366665396639313833323038356661306135393338333466333563
|
||||||
|
32653861346166663163383036386432343833333137663462343030363762663139366534326466
|
||||||
|
66313864623438336164333430613766373430656536323964633863333931643036656563353639
|
||||||
|
30313835666366383035343031643265386263316165323537613636656533376239633964393866
|
||||||
|
61646163343032313036303738643763383364663134356634373262633361383035306231636364
|
||||||
|
39333232636538643033313438396332383962656131363365666566633239366532326336363133
|
||||||
|
38393064643030333538333562643435663434343863383834663266373337336433313663646164
|
||||||
|
36343334343965623830613736393231666361643239663062393239613233376335383362666161
|
||||||
|
66383035653330373736613234303631386163656561383138613363613539396332376162316131
|
||||||
|
61313532653531653836343731636535623066383231613635316432323331623761383833623333
|
||||||
|
39343632623961613561373261653939636363366531303839336237383166363733303538363237
|
||||||
|
36373362636263666334316163633766303334373033636539353464393536356466636664333665
|
||||||
|
32643135626366666137626464393961366165383334343063356334373534633764326162363837
|
||||||
|
38643662326266313464343464646166643235663663303761313639376537306337353863336264
|
||||||
|
66376335333738366265343636376363366365306137336665623466626261653937656461303332
|
||||||
|
32613561616662383032393562613831626666373134303032626134313262363830326530643632
|
||||||
|
61366133663564313933366430396430353762386133396436633839303766653765
|
33
dgx-2-shmem-test.job
Normal file
33
dgx-2-shmem-test.job
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#SBATCH -p dgx2q # partition (GPU queue)
|
||||||
|
#SBATCH -N 1 # number of nodes
|
||||||
|
#SBATCH -n 1 # number of cores
|
||||||
|
#SBATCH -w g001 # DGX-2 node
|
||||||
|
#SBATCH --gres=gpu:1 # number of V100's
|
||||||
|
#SBATCH -t 0-00:10 # time (D-HH:MM)
|
||||||
|
#SBATCH -o slurm.%N.%j.out # STDOUT
|
||||||
|
#SBATCH -e slurm.%N.%j.err # STDERR
|
||||||
|
|
||||||
|
ulimit -s 10240
|
||||||
|
module load slurm
|
||||||
|
module load cuda10.1/toolkit/10.1.243
|
||||||
|
|
||||||
|
# Check how many gpu's your job got
|
||||||
|
#nvidia-smi
|
||||||
|
|
||||||
|
## Copy input files to the work directory:
|
||||||
|
rm -rf /work/$USER/ShallowWaterGPU
|
||||||
|
mkdir -p /work/$USER/ShallowWaterGPU
|
||||||
|
cp -r . /work/$USER/ShallowWaterGPU
|
||||||
|
|
||||||
|
# Run job
|
||||||
|
# (Assumes Miniconda is installed in user root dir.)
|
||||||
|
cd /work/$USER/ShallowWaterGPU
|
||||||
|
nvprof -o profiler_output $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 shmemTesting.py
|
||||||
|
cd $HOME/src/ShallowWaterGPU
|
||||||
|
|
||||||
|
## Copy files from work directory:
|
||||||
|
# (NOTE: Copying is not performed if job fails!)
|
||||||
|
cp /work/$USER/ShallowWaterGPU/*.log .
|
||||||
|
cp /work/$USER/ShallowWaterGPU/*.nc .
|
||||||
|
cp /work/$USER/ShallowWaterGPU/profiler_output .
|
@ -1,10 +1,10 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
# See http://wiki.ex3.simula.no before changing the values below
|
||||||
#SBATCH -p dgx2q # partition (GPU queue)
|
#SBATCH -p dgx2q # partition (GPU queue)
|
||||||
#SBATCH -N 1 # number of nodes
|
#SBATCH -N 1 # number of nodes
|
||||||
#SBATCH -n 4 # number of cores
|
#SBATCH -n 4 # number of cores
|
||||||
#SBATCH -w g001 # DGX-2 node
|
#SBATCH -w g001 # DGX-2 node
|
||||||
#SBATCH --gres=gpu:4 # number of V100's
|
#SBATCH --gres=gpu:4 # number of V100's
|
||||||
#SBATCH --mem 10G # memory pool for all cores
|
|
||||||
#SBATCH -t 0-00:10 # time (D-HH:MM)
|
#SBATCH -t 0-00:10 # time (D-HH:MM)
|
||||||
#SBATCH -o slurm.%N.%j.out # STDOUT
|
#SBATCH -o slurm.%N.%j.out # STDOUT
|
||||||
#SBATCH -e slurm.%N.%j.err # STDERR
|
#SBATCH -e slurm.%N.%j.err # STDERR
|
||||||
@ -18,6 +18,7 @@ module load cuda10.1/toolkit/10.1.243
|
|||||||
#nvidia-smi
|
#nvidia-smi
|
||||||
|
|
||||||
## Copy input files to the work directory:
|
## Copy input files to the work directory:
|
||||||
|
rm -rf /work/$USER/ShallowWaterGPU
|
||||||
mkdir -p /work/$USER/ShallowWaterGPU
|
mkdir -p /work/$USER/ShallowWaterGPU
|
||||||
cp -r . /work/$USER/ShallowWaterGPU
|
cp -r . /work/$USER/ShallowWaterGPU
|
||||||
|
|
||||||
|
59
dgx-2_scaling_benchmark.job
Normal file
59
dgx-2_scaling_benchmark.job
Normal file
@ -0,0 +1,59 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# See http://wiki.ex3.simula.no before changing the values below
|
||||||
|
#SBATCH -p dgx2q # partition (GPU queue)
|
||||||
|
#SBATCH -w g001 # DGX-2 node
|
||||||
|
##SBATCH --gres=gpu:4 # number of V100's
|
||||||
|
#SBATCH -t 0-00:10 # time (D-HH:MM)
|
||||||
|
#SBATCH -o slurm.%N.%j.out # STDOUT
|
||||||
|
#SBATCH -e slurm.%N.%j.err # STDERR
|
||||||
|
#SBATCH --reservation=martinls_17
|
||||||
|
|
||||||
|
|
||||||
|
# For Linux 64, Open MPI is built with CUDA awareness but this support is disabled by default.
|
||||||
|
# To enable it, please set the environment variable OMPI_MCA_opal_cuda_support=true before
|
||||||
|
# launching your MPI processes. Equivalently, you can set the MCA parameter in the command line:
|
||||||
|
# mpiexec --mca opal_cuda_support 1 ...
|
||||||
|
#
|
||||||
|
# In addition, the UCX support is also built but disabled by default.
|
||||||
|
# To enable it, first install UCX (conda install -c conda-forge ucx). Then, set the environment
|
||||||
|
# variables OMPI_MCA_pml="ucx" OMPI_MCA_osc="ucx" before launching your MPI processes.
|
||||||
|
# Equivalently, you can set the MCA parameters in the command line:
|
||||||
|
# mpiexec --mca pml ucx --mca osc ucx ...
|
||||||
|
# Note that you might also need to set UCX_MEMTYPE_CACHE=n for CUDA awareness via UCX.
|
||||||
|
# Please consult UCX's documentation for detail.
|
||||||
|
|
||||||
|
ulimit -s 10240
|
||||||
|
module load slurm/20.02.7
|
||||||
|
module load cuda11.2/toolkit/11.2.2
|
||||||
|
module load openmpi4-cuda11.2-ofed50-gcc8/4.1.0
|
||||||
|
|
||||||
|
# Check how many gpu's your job got
|
||||||
|
#nvidia-smi
|
||||||
|
|
||||||
|
mkdir -p output_dgx-2/$NOW
|
||||||
|
|
||||||
|
## Copy input files to the work directory:
|
||||||
|
mkdir -p /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
|
||||||
|
cp -r . /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
|
||||||
|
|
||||||
|
# Run job
|
||||||
|
# (Assumes Miniconda is installed in user root dir.)
|
||||||
|
cd /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
|
||||||
|
#mpirun --mca btl_openib_if_include mlx5_0 --mca btl_openib_warn_no_device_params_found 0 $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
|
||||||
|
#nsys profile -t nvtx,cuda mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
|
||||||
|
#mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
|
||||||
|
|
||||||
|
export OMPI_MCA_opal_cuda_support=true
|
||||||
|
mpirun -np $SLURM_NTASKS $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
|
||||||
|
|
||||||
|
cd $HOME/src/ShallowWaterGPU
|
||||||
|
|
||||||
|
## Copy files from work directory:
|
||||||
|
# (NOTE: Copying is not performed if job fails!)
|
||||||
|
mkdir -p output_dgx-2/$NOW/$SLURM_JOB_ID
|
||||||
|
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.log ./output_dgx-2/$NOW/$SLURM_JOB_ID
|
||||||
|
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.nc ./output_dgx-2/$NOW/$SLURM_JOB_ID
|
||||||
|
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.json ./output_dgx-2/$NOW
|
||||||
|
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.qdrep ./output_dgx-2/$NOW
|
||||||
|
|
||||||
|
rm -rf /work/$USER/$SLURM_JOB_ID
|
73
dgx-2_strong_scaling_benchmark.sh
Normal file
73
dgx-2_strong_scaling_benchmark.sh
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")
|
||||||
|
|
||||||
|
# one node: 1-16 GPUs
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=4096,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=2731,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=2048,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=1638,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=1365,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=1170,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=1024,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=8192,NY=910,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=8192,NY=819,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=8192,NY=745,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=8192,NY=683,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=8192,NY=630,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=8192,NY=585,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=8192,NY=546,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=8192,NY=512,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
|
||||||
|
# one node: 4-16 GPUs
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=41984,NY=10496,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=41984,NY=8396,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=41984,NY=6997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=41984,NY=5997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=41984,NY=5248,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=41984,NY=4664,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=41984,NY=4198,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=41984,NY=3816,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=41984,NY=3498,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=41984,NY=3229,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=41984,NY=2998,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=41984,NY=2798,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=41984,NY=2624,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
|
||||||
|
# one node: 1-16 GPUs
|
||||||
|
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=22528,NY=11264,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=22528,NY=7509,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=22528,NY=5632,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=22528,NY=4505,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=22528,NY=3754,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=22528,NY=3218,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=22528,NY=2816,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
|
||||||
|
sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=22528,NY=2503,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=22528,NY=2252,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=22528,NY=2048,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=22528,NY=1877,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=22528,NY=1732,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=22528,NY=1609,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=22528,NY=1501,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=22528,NY=1408,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
|
||||||
|
# one node: 4-16 GPUs
|
||||||
|
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=45056,NY=11264,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=45056,NY=8396,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=45056,NY=6997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=45056,NY=5997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=45056,NY=5248,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
|
||||||
|
sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=45056,NY=4664,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=45056,NY=4198,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=45056,NY=3816,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=45056,NY=3498,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=45056,NY=3229,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=45056,NY=2998,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=45056,NY=2798,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=45056,NY=2624,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
41
dgx-2_weak_scaling_benchmark.sh
Normal file
41
dgx-2_weak_scaling_benchmark.sh
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")
|
||||||
|
|
||||||
|
# one node: 1-16 GPUs
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
|
||||||
|
# one node: 1-16 GPUs
|
||||||
|
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
|
||||||
|
sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
|
58
hgx_scaling_benchmark.job
Normal file
58
hgx_scaling_benchmark.job
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# See http://wiki.ex3.simula.no before changing the values below
|
||||||
|
#SBATCH -p hgx2q # partition (GPU queue)
|
||||||
|
#SBATCH -w g002 # HGX node
|
||||||
|
#SBATCH -t 0-00:10 # time (D-HH:MM)
|
||||||
|
#SBATCH -o slurm.%N.%j.out # STDOUT
|
||||||
|
#SBATCH -e slurm.%N.%j.err # STDERR
|
||||||
|
#SBATCH --reservation=martinls_11
|
||||||
|
|
||||||
|
|
||||||
|
# For Linux 64, Open MPI is built with CUDA awareness but this support is disabled by default.
|
||||||
|
# To enable it, please set the environment variable OMPI_MCA_opal_cuda_support=true before
|
||||||
|
# launching your MPI processes. Equivalently, you can set the MCA parameter in the command line:
|
||||||
|
# mpiexec --mca opal_cuda_support 1 ...
|
||||||
|
#
|
||||||
|
# In addition, the UCX support is also built but disabled by default.
|
||||||
|
# To enable it, first install UCX (conda install -c conda-forge ucx). Then, set the environment
|
||||||
|
# variables OMPI_MCA_pml="ucx" OMPI_MCA_osc="ucx" before launching your MPI processes.
|
||||||
|
# Equivalently, you can set the MCA parameters in the command line:
|
||||||
|
# mpiexec --mca pml ucx --mca osc ucx ...
|
||||||
|
# Note that you might also need to set UCX_MEMTYPE_CACHE=n for CUDA awareness via UCX.
|
||||||
|
# Please consult UCX's documentation for detail.
|
||||||
|
|
||||||
|
ulimit -s 10240
|
||||||
|
module load slurm/20.02.7
|
||||||
|
module load cuda11.2/toolkit/11.2.2
|
||||||
|
module load openmpi4-cuda11.2-ofed50-gcc8/4.1.0
|
||||||
|
|
||||||
|
# Check how many gpu's your job got
|
||||||
|
#nvidia-smi
|
||||||
|
|
||||||
|
mkdir -p output_hgx/$NOW
|
||||||
|
|
||||||
|
## Copy input files to the work directory:
|
||||||
|
mkdir -p /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
|
||||||
|
cp -r . /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
|
||||||
|
|
||||||
|
# Run job
|
||||||
|
# (Assumes Miniconda is installed in user root dir.)
|
||||||
|
cd /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
|
||||||
|
#mpirun --mca btl_openib_if_include mlx5_0 --mca btl_openib_warn_no_device_params_found 0 $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
|
||||||
|
#nsys profile -t nvtx,cuda mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
|
||||||
|
#mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
|
||||||
|
|
||||||
|
export OMPI_MCA_opal_cuda_support=true
|
||||||
|
mpirun -np $SLURM_NTASKS $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
|
||||||
|
|
||||||
|
cd $HOME/src/ShallowWaterGPU
|
||||||
|
|
||||||
|
## Copy files from work directory:
|
||||||
|
# (NOTE: Copying is not performed if job fails!)
|
||||||
|
mkdir -p output_hgx/$NOW/$SLURM_JOB_ID
|
||||||
|
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.log ./output_hgx/$NOW/$SLURM_JOB_ID
|
||||||
|
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.nc ./output_hgx/$NOW/$SLURM_JOB_ID
|
||||||
|
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.json ./output_hgx/$NOW
|
||||||
|
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.qdrep ./output_hgx/$NOW
|
||||||
|
|
||||||
|
rm -rf /work/$USER/$SLURM_JOB_ID
|
20
hgx_strong_scaling_benchmark.sh
Normal file
20
hgx_strong_scaling_benchmark.sh
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")
|
||||||
|
|
||||||
|
# one node: 1-8 GPUs
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=4096,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=2731,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=2048,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=1638,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=1365,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=1170,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=1024,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
|
||||||
|
# one node: 4-8 GPUs
|
||||||
|
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=41984,NY=10496,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=41984,NY=8396,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=41984,NY=6997,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=41984,NY=5997,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=41984,NY=5248,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
23
hgx_weak_scaling_benchmark.sh
Normal file
23
hgx_weak_scaling_benchmark.sh
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")
|
||||||
|
|
||||||
|
# one node: 1-16 GPUs
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
|
||||||
|
# one node: 1-8 GPUs
|
||||||
|
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
||||||
|
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
|
107
mpiTesting.py
107
mpiTesting.py
@ -25,26 +25,42 @@ import gc
|
|||||||
import time
|
import time
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
#MPI
|
# MPI
|
||||||
from mpi4py import MPI
|
from mpi4py import MPI
|
||||||
|
|
||||||
#CUDA
|
# CUDA
|
||||||
import pycuda.driver as cuda
|
import pycuda.driver as cuda
|
||||||
|
|
||||||
#Simulator engine etc
|
# Simulator engine etc
|
||||||
from GPUSimulators import MPISimulator, Common, CudaContext
|
from GPUSimulators import MPISimulator, Common, CudaContext
|
||||||
from GPUSimulators import EE2D_KP07_dimsplit
|
from GPUSimulators import EE2D_KP07_dimsplit
|
||||||
from GPUSimulators.helpers import InitialConditions as IC
|
from GPUSimulators.helpers import InitialConditions as IC
|
||||||
from GPUSimulators.Simulator import BoundaryCondition as BC
|
from GPUSimulators.Simulator import BoundaryCondition as BC
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
parser = argparse.ArgumentParser(description='Strong and weak scaling experiments.')
|
||||||
|
parser.add_argument('-nx', type=int, default=128)
|
||||||
|
parser.add_argument('-ny', type=int, default=128)
|
||||||
|
parser.add_argument('--profile', action='store_true') # default: False
|
||||||
|
|
||||||
#Get MPI COMM to use
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if(args.profile):
|
||||||
|
profiling_data = {}
|
||||||
|
# profiling: total run time
|
||||||
|
t_total_start = time.time()
|
||||||
|
t_init_start = time.time()
|
||||||
|
|
||||||
|
|
||||||
|
# Get MPI COMM to use
|
||||||
comm = MPI.COMM_WORLD
|
comm = MPI.COMM_WORLD
|
||||||
|
|
||||||
|
|
||||||
####
|
####
|
||||||
#Initialize logging
|
# Initialize logging
|
||||||
####
|
####
|
||||||
log_level_console = 20
|
log_level_console = 20
|
||||||
log_level_file = 10
|
log_level_file = 10
|
||||||
@ -55,15 +71,17 @@ logger.setLevel(min(log_level_console, log_level_file))
|
|||||||
ch = logging.StreamHandler()
|
ch = logging.StreamHandler()
|
||||||
ch.setLevel(log_level_console)
|
ch.setLevel(log_level_console)
|
||||||
logger.addHandler(ch)
|
logger.addHandler(ch)
|
||||||
logger.info("Console logger using level %s", logging.getLevelName(log_level_console))
|
logger.info("Console logger using level %s",
|
||||||
|
logging.getLevelName(log_level_console))
|
||||||
|
|
||||||
fh = logging.FileHandler(log_filename)
|
fh = logging.FileHandler(log_filename)
|
||||||
formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s')
|
formatter = logging.Formatter(
|
||||||
|
'%(asctime)s:%(name)s:%(levelname)s: %(message)s')
|
||||||
fh.setFormatter(formatter)
|
fh.setFormatter(formatter)
|
||||||
fh.setLevel(log_level_file)
|
fh.setLevel(log_level_file)
|
||||||
logger.addHandler(fh)
|
logger.addHandler(fh)
|
||||||
logger.info("File logger using level %s to %s", logging.getLevelName(log_level_file), log_filename)
|
logger.info("File logger using level %s to %s",
|
||||||
|
logging.getLevelName(log_level_file), log_filename)
|
||||||
|
|
||||||
|
|
||||||
####
|
####
|
||||||
@ -73,7 +91,6 @@ logger.info("Creating MPI grid")
|
|||||||
grid = MPISimulator.MPIGrid(MPI.COMM_WORLD)
|
grid = MPISimulator.MPIGrid(MPI.COMM_WORLD)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
####
|
####
|
||||||
# Initialize CUDA
|
# Initialize CUDA
|
||||||
####
|
####
|
||||||
@ -82,18 +99,28 @@ logger.info("Initializing CUDA")
|
|||||||
local_rank = grid.getLocalRank()
|
local_rank = grid.getLocalRank()
|
||||||
num_cuda_devices = cuda.Device.count()
|
num_cuda_devices = cuda.Device.count()
|
||||||
cuda_device = local_rank % num_cuda_devices
|
cuda_device = local_rank % num_cuda_devices
|
||||||
|
logger.info("Process %s using CUDA device %s", str(local_rank), str(cuda_device))
|
||||||
cuda_context = CudaContext.CudaContext(device=cuda_device, autotuning=False)
|
cuda_context = CudaContext.CudaContext(device=cuda_device, autotuning=False)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
####
|
####
|
||||||
# Set initial conditions
|
# Set initial conditions
|
||||||
####
|
####
|
||||||
|
|
||||||
|
# DEBUGGING - setting random seed
|
||||||
|
np.random.seed(42)
|
||||||
|
|
||||||
logger.info("Generating initial conditions")
|
logger.info("Generating initial conditions")
|
||||||
nx = 128
|
nx = args.nx
|
||||||
ny = 128
|
ny = args.ny
|
||||||
|
|
||||||
|
dt = 0.000001
|
||||||
|
|
||||||
gamma = 1.4
|
gamma = 1.4
|
||||||
save_times = np.linspace(0, 5.0, 10)
|
#save_times = np.linspace(0, 0.000009, 2)
|
||||||
|
#save_times = np.linspace(0, 0.000099, 11)
|
||||||
|
#save_times = np.linspace(0, 0.000099, 2)
|
||||||
|
save_times = np.linspace(0, 0.0000999, 2)
|
||||||
outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc"
|
outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc"
|
||||||
save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
|
save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
|
||||||
|
|
||||||
@ -102,21 +129,65 @@ arguments['context'] = cuda_context
|
|||||||
arguments['theta'] = 1.2
|
arguments['theta'] = 1.2
|
||||||
arguments['grid'] = grid
|
arguments['grid'] = grid
|
||||||
|
|
||||||
|
if(args.profile):
|
||||||
|
t_init_end = time.time()
|
||||||
|
t_init = t_init_end - t_init_start
|
||||||
|
profiling_data["t_init"] = t_init
|
||||||
|
|
||||||
####
|
####
|
||||||
# Run simulation
|
# Run simulation
|
||||||
####
|
####
|
||||||
logger.info("Running simulation")
|
logger.info("Running simulation")
|
||||||
#Helper function to create MPI simulator
|
# Helper function to create MPI simulator
|
||||||
|
|
||||||
|
|
||||||
def genSim(grid, **kwargs):
|
def genSim(grid, **kwargs):
|
||||||
local_sim = EE2D_KP07_dimsplit.EE2D_KP07_dimsplit(**kwargs)
|
local_sim = EE2D_KP07_dimsplit.EE2D_KP07_dimsplit(**kwargs)
|
||||||
sim = MPISimulator.MPISimulator(local_sim, grid)
|
sim = MPISimulator.MPISimulator(local_sim, grid)
|
||||||
return sim
|
return sim
|
||||||
outfile = Common.runSimulation(genSim, arguments, outfile, save_times, save_var_names)
|
|
||||||
|
|
||||||
|
|
||||||
|
outfile, sim_runner_profiling_data, sim_profiling_data = Common.runSimulation(
|
||||||
|
genSim, arguments, outfile, save_times, save_var_names, dt)
|
||||||
|
|
||||||
|
if(args.profile):
|
||||||
|
t_total_end = time.time()
|
||||||
|
t_total = t_total_end - t_total_start
|
||||||
|
profiling_data["t_total"] = t_total
|
||||||
|
print("Total run time on rank " + str(MPI.COMM_WORLD.rank) + " is " + str(t_total) + " s")
|
||||||
|
|
||||||
|
# write profiling to json file
|
||||||
|
if(args.profile and MPI.COMM_WORLD.rank == 0):
|
||||||
|
job_id = ""
|
||||||
|
if "SLURM_JOB_ID" in os.environ:
|
||||||
|
job_id = int(os.environ["SLURM_JOB_ID"])
|
||||||
|
allocated_nodes = int(os.environ["SLURM_JOB_NUM_NODES"])
|
||||||
|
allocated_gpus = int(os.environ["CUDA_VISIBLE_DEVICES"].count(",") + 1)
|
||||||
|
profiling_file = "MPI_jobid_" + \
|
||||||
|
str(job_id) + "_" + str(allocated_nodes) + "_nodes_and_" + str(allocated_gpus) + "_GPUs_profiling.json"
|
||||||
|
profiling_data["outfile"] = outfile
|
||||||
|
else:
|
||||||
|
profiling_file = "MPI_" + str(MPI.COMM_WORLD.size) + "_procs_and_" + str(num_cuda_devices) + "_GPUs_profiling.json"
|
||||||
|
|
||||||
|
for stage in sim_runner_profiling_data["start"].keys():
|
||||||
|
profiling_data[stage] = sim_runner_profiling_data["end"][stage] - sim_runner_profiling_data["start"][stage]
|
||||||
|
|
||||||
|
for stage in sim_profiling_data["start"].keys():
|
||||||
|
profiling_data[stage] = sim_profiling_data["end"][stage] - sim_profiling_data["start"][stage]
|
||||||
|
|
||||||
|
profiling_data["nx"] = nx
|
||||||
|
profiling_data["ny"] = ny
|
||||||
|
profiling_data["dt"] = dt
|
||||||
|
profiling_data["n_time_steps"] = sim_profiling_data["n_time_steps"]
|
||||||
|
|
||||||
|
profiling_data["slurm_job_id"] = job_id
|
||||||
|
profiling_data["n_cuda_devices"] = str(num_cuda_devices)
|
||||||
|
profiling_data["n_processes"] = str(MPI.COMM_WORLD.size)
|
||||||
|
profiling_data["git_hash"] = Common.getGitHash()
|
||||||
|
profiling_data["git_status"] = Common.getGitStatus()
|
||||||
|
|
||||||
|
with open(profiling_file, "w") as write_file:
|
||||||
|
json.dump(profiling_data, write_file)
|
||||||
|
|
||||||
####
|
####
|
||||||
# Clean shutdown
|
# Clean shutdown
|
||||||
|
8
run_script_ppi.sh
Executable file
8
run_script_ppi.sh
Executable file
@ -0,0 +1,8 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
module purge
|
||||||
|
module load git/2.21.0 hdf5/1.10.5-gcc cuda/10.1
|
||||||
|
|
||||||
|
conda activate ShallowWaterGPU_HPC
|
||||||
|
|
||||||
|
python mpiTesting.py
|
||||||
|
|
54
saga-dev.job
Normal file
54
saga-dev.job
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Job name:
|
||||||
|
#SBATCH --job-name=ShallowWaterGPUScalingDev
|
||||||
|
#
|
||||||
|
# Project:
|
||||||
|
#SBATCH --account=nn9882k
|
||||||
|
#
|
||||||
|
# Wall clock limit:
|
||||||
|
#SBATCH --time=00:02:00
|
||||||
|
#
|
||||||
|
# NOTE: See https://documentation.sigma2.no/jobs/projects_accounting.html when adjusting the values below
|
||||||
|
#
|
||||||
|
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
|
||||||
|
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
|
||||||
|
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
|
||||||
|
#SBATCH --partition=accel
|
||||||
|
#
|
||||||
|
# Max memory usage per task (core) - increasing this will cost more core hours:
|
||||||
|
#SBATCH --mem-per-cpu=3800M
|
||||||
|
#
|
||||||
|
# Number of tasks:
|
||||||
|
#SBATCH --nodes=1 --gpus-per-node=1 --ntasks-per-node=1
|
||||||
|
#
|
||||||
|
#SBATCH --qos=devel
|
||||||
|
|
||||||
|
## Set up job environment: (this is done automatically behind the scenes)
|
||||||
|
## (make sure to comment '#' or remove the following line 'source ...')
|
||||||
|
# source /cluster/bin/jobsetup
|
||||||
|
|
||||||
|
module restore system # instead of 'module purge' rather set module environment to the system default
|
||||||
|
module load CUDA/11.4.1
|
||||||
|
|
||||||
|
# It is also recommended to to list loaded modules, for easier debugging:
|
||||||
|
module list
|
||||||
|
|
||||||
|
set -o errexit # exit on errors
|
||||||
|
set -o nounset # Treat unset variables as errors (added for more easily discovering issues in your batch script)
|
||||||
|
|
||||||
|
## Copy input files to the work directory:
|
||||||
|
mkdir $SCRATCH/ShallowWaterGPU
|
||||||
|
cp -r . $SCRATCH/ShallowWaterGPU
|
||||||
|
|
||||||
|
## Make sure the results are copied back to the submit directory (see Work Directory below):
|
||||||
|
# chkfile MyResultFileq
|
||||||
|
# chkfile is replaced by 'savefile' on Saga
|
||||||
|
savefile "$SCRATCH/ShallowWaterGPU/*.log"
|
||||||
|
savefile "$SCRATCH/ShallowWaterGPU/*.nc"
|
||||||
|
savefile "$SCRATCH/ShallowWaterGPU/*.json"
|
||||||
|
|
||||||
|
## Do some work:
|
||||||
|
cd $SCRATCH/ShallowWaterGPU
|
||||||
|
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
|
||||||
|
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 1024 -ny 1024 --profile
|
||||||
|
|
@ -1,24 +1,25 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# Job name:
|
# Job name:
|
||||||
#SBATCH --job-name=saga-test
|
#SBATCH --job-name=ShallowWaterGPUStrongScaling
|
||||||
#
|
#
|
||||||
# Project:
|
# Project:
|
||||||
#SBATCH --account=nn9550k
|
#SBATCH --account=nn9882k
|
||||||
#
|
#
|
||||||
# Wall clock limit:
|
# Wall clock limit:
|
||||||
#SBATCH --time=00:10:00
|
#SBATCH --time=24:00:00
|
||||||
|
#
|
||||||
|
# NOTE: See https://documentation.sigma2.no/jobs/projects_accounting.html when adjusting the values below
|
||||||
#
|
#
|
||||||
# Ask for 1 GPU (max is 2)
|
|
||||||
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
|
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
|
||||||
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
|
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
|
||||||
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
|
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
|
||||||
#SBATCH --partition=accel --gres=gpu:1
|
#SBATCH --partition=accel
|
||||||
#
|
#
|
||||||
# Max memory usage per task (core) - increasing this will cost more core hours:
|
# Max memory usage per task (core) - increasing this will cost more core hours:
|
||||||
#SBATCH --mem-per-cpu=4G
|
#SBATCH --mem-per-cpu=3800M
|
||||||
#
|
#
|
||||||
# Number of tasks:
|
# Number of tasks:
|
||||||
#SBATCH --nodes=2 --ntasks-per-node=1
|
#SBATCH --nodes=1 --gpus-per-node=1 --ntasks-per-node=1
|
||||||
|
|
||||||
## Set up job environment: (this is done automatically behind the scenes)
|
## Set up job environment: (this is done automatically behind the scenes)
|
||||||
## (make sure to comment '#' or remove the following line 'source ...')
|
## (make sure to comment '#' or remove the following line 'source ...')
|
||||||
@ -42,9 +43,10 @@ cp -r . $SCRATCH/ShallowWaterGPU
|
|||||||
# chkfile is replaced by 'savefile' on Saga
|
# chkfile is replaced by 'savefile' on Saga
|
||||||
savefile "$SCRATCH/ShallowWaterGPU/*.log"
|
savefile "$SCRATCH/ShallowWaterGPU/*.log"
|
||||||
savefile "$SCRATCH/ShallowWaterGPU/*.nc"
|
savefile "$SCRATCH/ShallowWaterGPU/*.nc"
|
||||||
|
savefile "$SCRATCH/ShallowWaterGPU/*.json"
|
||||||
|
|
||||||
## Do some work:
|
## Do some work:
|
||||||
cd $SCRATCH/ShallowWaterGPU
|
cd $SCRATCH/ShallowWaterGPU
|
||||||
srun /cluster/home/$HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
|
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
|
||||||
srun /cluster/home/$HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py
|
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 8192 -ny 8192 --profile
|
||||||
|
|
||||||
|
65
saga_scaling_benchmark.job
Normal file
65
saga_scaling_benchmark.job
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Job name:
|
||||||
|
#SBATCH --job-name=ShallowWaterGPUScaling
|
||||||
|
#
|
||||||
|
# Project:
|
||||||
|
#SBATCH --account=nn9882k
|
||||||
|
#
|
||||||
|
# Wall clock limit:
|
||||||
|
#SBATCH --time=00:10:00
|
||||||
|
#
|
||||||
|
# NOTE: See https://documentation.sigma2.no/jobs/projects_accounting.html when adjusting the values below
|
||||||
|
#
|
||||||
|
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
|
||||||
|
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
|
||||||
|
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
|
||||||
|
#SBATCH --partition=accel
|
||||||
|
#
|
||||||
|
# Max memory usage per task (core) - increasing this will cost more core hours:
|
||||||
|
##SBATCH --mem-per-cpu=3800M
|
||||||
|
#SBATCH --mem-per-cpu=24G
|
||||||
|
#
|
||||||
|
#SBATCH --qos=devel
|
||||||
|
|
||||||
|
## Set up job environment: (this is done automatically behind the scenes)
|
||||||
|
## (make sure to comment '#' or remove the following line 'source ...')
|
||||||
|
# source /cluster/bin/jobsetup
|
||||||
|
|
||||||
|
module restore system # instead of 'module purge' rather set module environment to the system default
|
||||||
|
module load CUDA/11.4.1
|
||||||
|
#module load CUDA/11.1.1-GCC-10.2.0
|
||||||
|
#module load OpenMPI/4.0.5-gcccuda-2020b
|
||||||
|
|
||||||
|
# It is also recommended to to list loaded modules, for easier debugging:
|
||||||
|
module list
|
||||||
|
|
||||||
|
set -o errexit # exit on errors
|
||||||
|
set -o nounset # Treat unset variables as errors (added for more easily discovering issues in your batch script)
|
||||||
|
|
||||||
|
## Copy input files to the work directory:
|
||||||
|
mkdir $SCRATCH/ShallowWaterGPU
|
||||||
|
cp -r . $SCRATCH/ShallowWaterGPU
|
||||||
|
|
||||||
|
## Make sure the results are copied back to the submit directory (see Work Directory below):
|
||||||
|
# chkfile MyResultFile
|
||||||
|
# chkfile is replaced by 'savefile' on Saga
|
||||||
|
#savefile "$SCRATCH/ShallowWaterGPU/*.log"
|
||||||
|
#savefile "$SCRATCH/ShallowWaterGPU/*.nc"
|
||||||
|
#savefile "$SCRATCH/ShallowWaterGPU/*.json"
|
||||||
|
#savefile "$SCRATCH/ShallowWaterGPU/*.qdrep"
|
||||||
|
|
||||||
|
cleanup "rm -rf $SCRATCH/ShallowWaterGPU"
|
||||||
|
|
||||||
|
export OMPI_MCA_opal_cuda_support=true
|
||||||
|
|
||||||
|
## Do some work:
|
||||||
|
cd $SCRATCH/ShallowWaterGPU
|
||||||
|
srun /cluster/projects/nn9882k/martinls/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
|
||||||
|
srun /cluster/projects/nn9882k/martinls/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
|
||||||
|
|
||||||
|
cd $HOME/src/ShallowWaterGPU
|
||||||
|
mkdir -p output_saga/$NOW/$SLURM_JOB_ID
|
||||||
|
mv $SCRATCH/ShallowWaterGPU/*.log ./output_saga/$NOW/$SLURM_JOB_ID
|
||||||
|
mv $SCRATCH/ShallowWaterGPU/*.nc ./output_saga/$NOW/$SLURM_JOB_ID
|
||||||
|
mv $SCRATCH/ShallowWaterGPU/*.json ./output_saga/$NOW
|
||||||
|
mv $SCRATCH/ShallowWaterGPU/*.qdrep ./output_saga/$NOW
|
30
saga_strong_scaling_benchmark.sh
Normal file
30
saga_strong_scaling_benchmark.sh
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")
|
||||||
|
|
||||||
|
# one node: 1–4 GPUs
|
||||||
|
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 1 ranks
|
||||||
|
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=10240,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
|
||||||
|
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=6826,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
|
||||||
|
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=5120,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
|
||||||
|
|
||||||
|
# 4 nodes: 1–4 GPUs per node
|
||||||
|
sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=5120,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
|
||||||
|
sbatch --nodes=4 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=2560,NOW=$TIMESTAMP saga_scaling_benchmark.job # 8 ranks
|
||||||
|
sbatch --nodes=4 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=1706,NOW=$TIMESTAMP saga_scaling_benchmark.job # 12 ranks
|
||||||
|
sbatch --nodes=4 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=1280,NOW=$TIMESTAMP saga_scaling_benchmark.job # 16 ranks
|
||||||
|
|
||||||
|
# 4 nodes: 1–4 GPUs per node
|
||||||
|
sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=40960,NY=10240,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
|
||||||
|
sbatch --nodes=4 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=40960,NY=5120,NOW=$TIMESTAMP saga_scaling_benchmark.job # 8 ranks
|
||||||
|
sbatch --nodes=4 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=40960,NY=3413,NOW=$TIMESTAMP saga_scaling_benchmark.job # 12 ranks
|
||||||
|
sbatch --nodes=4 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=40960,NY=2560,NOW=$TIMESTAMP saga_scaling_benchmark.job # 16 ranks
|
||||||
|
|
||||||
|
## one node: 1–4 GPUs
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=24576,NY=6144,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
|
||||||
|
#
|
||||||
|
## 4 nodes: 1–4 GPUs per node
|
||||||
|
#sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=24576,NY=6144,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
|
||||||
|
#sbatch --nodes=4 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=24576,NY=3072,NOW=$TIMESTAMP saga_scaling_benchmark.job # 8 ranks
|
||||||
|
#sbatch --nodes=4 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=24576,NY=2048,NOW=$TIMESTAMP saga_scaling_benchmark.job # 12 ranks
|
||||||
|
#sbatch --nodes=4 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=24576,NY=1536,NOW=$TIMESTAMP saga_scaling_benchmark.job # 16 ranks
|
25
saga_weak_scaling_benchmark.sh
Normal file
25
saga_weak_scaling_benchmark.sh
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")
|
||||||
|
|
||||||
|
# one node: 1-4 GPUs
|
||||||
|
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 1 ranks
|
||||||
|
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
|
||||||
|
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
|
||||||
|
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
|
||||||
|
|
||||||
|
# 2-4 nodes: 1 GPUs per node
|
||||||
|
sbatch --nodes=2 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
|
||||||
|
sbatch --nodes=3 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
|
||||||
|
sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
|
||||||
|
|
||||||
|
## one node: 1-4 GPUs
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 1 ranks
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
|
||||||
|
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
|
||||||
|
|
||||||
|
## 2-4 nodes: 1 GPUs per node
|
||||||
|
#sbatch --nodes=2 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
|
||||||
|
#sbatch --nodes=3 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
|
||||||
|
#sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
|
39
seymour_strong_scaling_benchmark.sh
Normal file
39
seymour_strong_scaling_benchmark.sh
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
NOW=$(date "+%Y-%m-%dT%H%M%S")
|
||||||
|
mkdir -p output_seymour/$NOW
|
||||||
|
|
||||||
|
# one node: 1-8 GPUs
|
||||||
|
mpiexec -n 1 python mpiTesting.py -nx 8192 -ny 8192 --profile &&
|
||||||
|
mkdir -p output_seymour/$NOW/1_proc &&
|
||||||
|
mv *.log output_seymour/$NOW/1_proc/ && mv *.nc output_seymour/$NOW/1_proc/ &&
|
||||||
|
|
||||||
|
mpiexec -n 2 python mpiTesting.py -nx 8192 -ny 4096 --profile &&
|
||||||
|
mkdir -p output_seymour/$NOW/2_proc &&
|
||||||
|
mv *.log output_seymour/$NOW/2_proc/ && mv *.nc output_seymour/$NOW/2_proc/ &&
|
||||||
|
|
||||||
|
mpiexec -n 3 python mpiTesting.py -nx 8192 -ny 2731 --profile &&
|
||||||
|
mkdir -p output_seymour/$NOW/3_proc &&
|
||||||
|
mv *.log output_seymour/$NOW/3_proc/ && mv *.nc output_seymour/$NOW/3_proc/ &&
|
||||||
|
|
||||||
|
mpiexec -n 4 python mpiTesting.py -nx 8192 -ny 2048 --profile &&
|
||||||
|
mkdir -p output_seymour/$NOW/4_proc &&
|
||||||
|
mv *.log output_seymour/$NOW/4_proc/ && mv *.nc output_seymour/$NOW/4_proc/ &&
|
||||||
|
|
||||||
|
mpiexec -n 5 python mpiTesting.py -nx 8192 -ny 1638 --profile &&
|
||||||
|
mkdir -p output_seymour/$NOW/5_proc &&
|
||||||
|
mv *.log output_seymour/$NOW/5_proc/ && mv *.nc output_seymour/$NOW/5_proc/ &&
|
||||||
|
|
||||||
|
mpiexec -n 6 python mpiTesting.py -nx 8192 -ny 1365 --profile &&
|
||||||
|
mkdir -p output_seymour/$NOW/6_proc &&
|
||||||
|
mv *.log output_seymour/$NOW/6_proc/ && mv *.nc output_seymour/$NOW/6_proc/ &&
|
||||||
|
|
||||||
|
mpiexec -n 7 python mpiTesting.py -nx 8192 -ny 1170 --profile &&
|
||||||
|
mkdir -p output_seymour/$NOW/7_proc &&
|
||||||
|
mv *.log output_seymour/$NOW/7_proc/ && mv *.nc output_seymour/$NOW/7_proc/ &&
|
||||||
|
|
||||||
|
mpiexec -n 8 python mpiTesting.py -nx 8192 -ny 1024 --profile &&
|
||||||
|
mkdir -p output_seymour/$NOW/8_proc &&
|
||||||
|
mv *.log output_seymour/$NOW/8_proc/ && mv *.nc output_seymour/$NOW/8_proc/ &&
|
||||||
|
|
||||||
|
for filename in *.json; do mv "$filename" "output_seymour/$NOW/MPI_${NOW}_${filename#????}"; done;
|
127
shmemTesting.py
Normal file
127
shmemTesting.py
Normal file
@ -0,0 +1,127 @@
|
|||||||
|
# -*- coding: utf-8 -*-

"""
This python module implements SHMEM (shared memory) simulations for benchmarking

Copyright (C) 2020 Norwegian Meteorological Institute

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import sys
import numpy as np
import gc
import time
import json
import logging

#Simulator engine etc
from GPUSimulators import SHMEMSimulatorGroup, Common, CudaContext
from GPUSimulators import EE2D_KP07_dimsplit
from GPUSimulators.helpers import InitialConditions as IC
from GPUSimulators.Simulator import BoundaryCondition as BC


####
#Initialize logging
####
# Numeric logging levels: 20 == INFO (console), 10 == DEBUG (file).
log_level_console = 20
log_level_file = 10
log_filename = 'shmem.log'
logger = logging.getLogger('GPUSimulators')
# Logger level must be the most verbose of the two handlers, or the
# handlers never receive the records they are supposed to emit.
logger.setLevel(min(log_level_console, log_level_file))

ch = logging.StreamHandler()
ch.setLevel(log_level_console)
logger.addHandler(ch)
logger.info("Console logger using level %s", logging.getLevelName(log_level_console))

fh = logging.FileHandler(log_filename)
formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s')
fh.setFormatter(formatter)
fh.setLevel(log_level_file)
logger.addHandler(fh)
logger.info("File logger using level %s to %s", logging.getLevelName(log_level_file), log_filename)


####
# Initialize SHMEM grid etc
####
logger.info("Creating SHMEM grid")

# XXX: need to explicitly set ngpus when testing on single-GPU system
grid = SHMEMSimulatorGroup.SHMEMGrid(ngpus=4)


####
# Set initial conditions
####
logger.info("Generating initial conditions")
nx = 128
ny = 128
gamma = 1.4
#save_times = np.linspace(0, 0.01, 10)
save_times = np.linspace(0, 10, 10)
save_var_names = ['rho', 'rho_u', 'rho_v', 'E']

outfile = "shmem_out.nc"

####
# Run simulation
####
logger.info("Running simulation")

# One local simulator per GPU in the SHMEM grid, each initialized with its
# own subdomain (index=i) of the global Kelvin-Helmholtz initial state and
# bound to that GPU's CUDA context.
sims = []
for i in range(grid.ngpus):
    arguments = IC.genKelvinHelmholtz(nx, ny, gamma, grid=grid, index=i)
    arguments['context'] = grid.cuda_contexts[i]
    arguments['theta'] = 1.2

    sims.append(EE2D_KP07_dimsplit.EE2D_KP07_dimsplit(**arguments))
    #sims[i] = SHMEMSimulator(i, local_sim, grid) # 1st attempt: no wrapper (per sim)

# Re-use the last subdomain's argument dict as the argument container for the
# simulator-group factory below; genSim only consumes 'sims' and 'grid'.
arguments['sims'] = sims
arguments['grid'] = grid

#Helper function to create SHMEM simulator
def genSim(sims, grid, **kwargs):
    # XXX: kwargs not used, since the simulators are already instantiated in the for-loop above
    sim = SHMEMSimulatorGroup.SHMEMSimulatorGroup(sims, grid)
    return sim

outfile = Common.runSimulation(genSim, arguments, outfile, save_times, save_var_names)


####
# Clean shutdown
####
# Drop references so CUDA resources can be released before interpreter exit.
sim = None
local_sim = None
cuda_context = None
arguments = None
logging.shutdown()
gc.collect()


####
# Print completion and exit
####
print("Completed!")
# Use sys.exit() instead of the site-injected exit() builtin, which is not
# guaranteed to exist in non-interactive interpreters (e.g. python -S).
sys.exit(0)
|
126
singleGPUTesting.py
Normal file
126
singleGPUTesting.py
Normal file
@ -0,0 +1,126 @@
|
|||||||
|
# -*- coding: utf-8 -*-

"""
This python module implements simulations for benchmarking

Copyright (C) 2018 SINTEF ICT

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import sys
import numpy as np
import gc
import logging
import os

# CUDA
import pycuda.driver as cuda

# Simulator engine etc
from GPUSimulators import Common, CudaContext
from GPUSimulators import EE2D_KP07_dimsplit
from GPUSimulators.helpers import InitialConditions as IC
from GPUSimulators.Simulator import BoundaryCondition as BC

import argparse
parser = argparse.ArgumentParser(description='Single GPU testing.')
parser.add_argument('-nx', type=int, default=128)
parser.add_argument('-ny', type=int, default=128)


args = parser.parse_args()

####
# Initialize logging
####
# Numeric logging levels: 20 == INFO (console), 10 == DEBUG (file).
log_level_console = 20
log_level_file = 10
log_filename = 'single_gpu.log'
logger = logging.getLogger('GPUSimulators')
# Logger level must be the most verbose of the two handlers, or the
# handlers never receive the records they are supposed to emit.
logger.setLevel(min(log_level_console, log_level_file))

ch = logging.StreamHandler()
ch.setLevel(log_level_console)
logger.addHandler(ch)
logger.info("Console logger using level %s",
            logging.getLevelName(log_level_console))

fh = logging.FileHandler(log_filename)
formatter = logging.Formatter(
    '%(asctime)s:%(name)s:%(levelname)s: %(message)s')
fh.setFormatter(formatter)
fh.setLevel(log_level_file)
logger.addHandler(fh)
logger.info("File logger using level %s to %s",
            logging.getLevelName(log_level_file), log_filename)


####
# Initialize CUDA
####
cuda.init(flags=0)
logger.info("Initializing CUDA")
# Autotuning disabled for benchmarking so kernel launch configs stay fixed.
cuda_context = CudaContext.CudaContext(autotuning=False)


####
# Set initial conditions
####
logger.info("Generating initial conditions")
nx = args.nx
ny = args.ny

gamma = 1.4
save_times = np.linspace(0, 0.5, 10)
outfile = "single_gpu_out.nc"
save_var_names = ['rho', 'rho_u', 'rho_v', 'E']

arguments = IC.genKelvinHelmholtz(nx, ny, gamma)
arguments['context'] = cuda_context
arguments['theta'] = 1.2


####
# Run simulation
####
logger.info("Running simulation")


# Helper function to create the simulator: runSimulation calls this with the
# 'arguments' dict expanded as keyword arguments.
def genSim(**kwargs):
    local_sim = EE2D_KP07_dimsplit.EE2D_KP07_dimsplit(**kwargs)
    return local_sim


outfile = Common.runSimulation(
    genSim, arguments, outfile, save_times, save_var_names)

####
# Clean shutdown
####
# Drop references so CUDA resources can be released before interpreter exit.
local_sim = None
cuda_context = None
arguments = None
logging.shutdown()
gc.collect()


####
# Print completion and exit
####
print("Completed!")
# Use sys.exit() instead of the site-injected exit() builtin, which is not
# guaranteed to exist in non-interactive interpreters (e.g. python -S).
sys.exit(0)
|
Loading…
x
Reference in New Issue
Block a user