From a588948e77908b2985ee143b055aa4ba15b4fbc9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Lilleeng=20S=C3=A6tra?=
Date: Wed, 25 May 2022 11:25:10 +0000
Subject: [PATCH] Bugfix for ghost cell exchange and nc atts

---
 GPUSimulators/Common.py             |  17 ++--
 GPUSimulators/EE2D_KP07_dimsplit.py |  64 +++++++++++---
 GPUSimulators/MPISimulator.py       | 127 ++++++++++++----------------
 mpiTesting.py                       |   2 +
 4 files changed, 124 insertions(+), 86 deletions(-)

diff --git a/GPUSimulators/Common.py b/GPUSimulators/Common.py
index a965231..76902c5 100644
--- a/GPUSimulators/Common.py
+++ b/GPUSimulators/Common.py
@@ -100,8 +100,8 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
     profiling_data_sim_runner["end"]["t_sim_init"] = 0
     profiling_data_sim_runner["start"]["t_nc_write"] = 0
     profiling_data_sim_runner["end"]["t_nc_write"] = 0
-    profiling_data_sim_runner["start"]["t_step"] = 0
-    profiling_data_sim_runner["end"]["t_step"] = 0
+    profiling_data_sim_runner["start"]["t_full_step"] = 0
+    profiling_data_sim_runner["end"]["t_full_step"] = 0
 
     profiling_data_sim_runner["start"]["t_sim_init"] = time.time()
 
@@ -121,7 +121,14 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
     outdata.ncfile.git_hash = getGitHash()
     outdata.ncfile.git_status = getGitStatus()
     outdata.ncfile.simulator = str(simulator)
-    outdata.ncfile.sim_args = toJson(simulator_args)
+
+    # do not write fields to attributes (they are too large)
+    simulator_args_for_ncfile = simulator_args.copy()
+    del simulator_args_for_ncfile["rho"]
+    del simulator_args_for_ncfile["rho_u"]
+    del simulator_args_for_ncfile["rho_v"]
+    del simulator_args_for_ncfile["E"]
+    outdata.ncfile.sim_args = toJson(simulator_args_for_ncfile)
 
     #Create dimensions
     outdata.ncfile.createDimension('time', len(save_times))
@@ -172,13 +179,13 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
             logger.error("Error after {:d} steps (t={:f}: {:s}".format(sim.simSteps(), sim.simTime(), str(e)))
             return outdata.filename
 
-        profiling_data_sim_runner["start"]["t_step"] += time.time()
+        profiling_data_sim_runner["start"]["t_full_step"] += time.time()
 
         #Simulate
         if (t_step > 0.0):
             sim.simulate(t_step, dt)
 
-        profiling_data_sim_runner["end"]["t_step"] += time.time()
+        profiling_data_sim_runner["end"]["t_full_step"] += time.time()
 
         profiling_data_sim_runner["start"]["t_nc_write"] += time.time()
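The hunk above replaces the blanket toJson(simulator_args) with a copy that drops the four conserved-variable fields (rho, rho_u, rho_v, E), since multi-megabyte arrays do not belong in a netCDF attribute string. A minimal sketch of the same idea, filtering by type instead of by hard-coded name (the helper name is hypothetical, not part of the patch):

    import json
    import numpy as np

    def sim_args_for_ncfile(simulator_args):
        """Return a copy of simulator_args that is safe to store as a
        netCDF attribute: numpy arrays (e.g. rho, rho_u, rho_v, E) are
        far too large to serialize into a JSON attribute string."""
        return {k: v for k, v in simulator_args.items()
                if not isinstance(v, np.ndarray)}

    # usage, assuming simulator_args as in runSimulation:
    # outdata.ncfile.sim_args = json.dumps(sim_args_for_ncfile(simulator_args), default=str)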
diff --git a/GPUSimulators/EE2D_KP07_dimsplit.py b/GPUSimulators/EE2D_KP07_dimsplit.py
index cc15c9c..2c3f810 100644
--- a/GPUSimulators/EE2D_KP07_dimsplit.py
+++ b/GPUSimulators/EE2D_KP07_dimsplit.py
@@ -138,9 +138,9 @@ class EE2D_KP07_dimsplit (BaseSimulator):
             return
 
         if external and not internal:
-            #############################################################
-            # XXX: Only treating north and south external cells for now #
-            #############################################################
+            ###################################
+            # XXX: Corners are treated twice! #
+            ###################################
 
             ns_grid_size = (self.grid_size[0], 1)
 
@@ -189,14 +189,58 @@ class EE2D_KP07_dimsplit (BaseSimulator):
                                             self.cfl_data.gpudata,
                                             0, 0,
                                             self.nx, int(self.u0[0].y_halo))
+
+            we_grid_size = (1, self.grid_size[1])
+
+            # WEST
+            # (x0, y0) x (x1, y1)
+            # (0, 0) x (x_halo, ny)
+            self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
+                                            self.nx, self.ny,
+                                            self.dx, self.dy, dt,
+                                            self.g,
+                                            self.gamma,
+                                            self.theta,
+                                            substep,
+                                            self.boundary_conditions,
+                                            self.u0[0].data.gpudata, self.u0[0].data.strides[0],
+                                            self.u0[1].data.gpudata, self.u0[1].data.strides[0],
+                                            self.u0[2].data.gpudata, self.u0[2].data.strides[0],
+                                            self.u0[3].data.gpudata, self.u0[3].data.strides[0],
+                                            self.u1[0].data.gpudata, self.u1[0].data.strides[0],
+                                            self.u1[1].data.gpudata, self.u1[1].data.strides[0],
+                                            self.u1[2].data.gpudata, self.u1[2].data.strides[0],
+                                            self.u1[3].data.gpudata, self.u1[3].data.strides[0],
+                                            self.cfl_data.gpudata,
+                                            0, 0,
+                                            int(self.u0[0].x_halo), self.ny)
+
+            # EAST
+            # (x0, y0) x (x1, y1)
+            # (nx-x_halo, 0) x (nx, ny)
+            self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
+                                            self.nx, self.ny,
+                                            self.dx, self.dy, dt,
+                                            self.g,
+                                            self.gamma,
+                                            self.theta,
+                                            substep,
+                                            self.boundary_conditions,
+                                            self.u0[0].data.gpudata, self.u0[0].data.strides[0],
+                                            self.u0[1].data.gpudata, self.u0[1].data.strides[0],
+                                            self.u0[2].data.gpudata, self.u0[2].data.strides[0],
+                                            self.u0[3].data.gpudata, self.u0[3].data.strides[0],
+                                            self.u1[0].data.gpudata, self.u1[0].data.strides[0],
+                                            self.u1[1].data.gpudata, self.u1[1].data.strides[0],
+                                            self.u1[2].data.gpudata, self.u1[2].data.strides[0],
+                                            self.u1[3].data.gpudata, self.u1[3].data.strides[0],
+                                            self.cfl_data.gpudata,
+                                            self.nx - int(self.u0[0].x_halo), 0,
+                                            self.nx, self.ny)
             return
 
         if internal and not external:
-            #############################################################
-            # XXX: Only treating north and south external cells for now #
-            # So we need to include west and east boundary here!        #
-            #############################################################
-
+            # INTERNAL DOMAIN
             # (x0, y0) x (x1, y1)
             # (x_halo, y_halo) x (nx - x_halo, ny - y_halo)
 
@@ -217,8 +261,8 @@ class EE2D_KP07_dimsplit (BaseSimulator):
                                             self.u1[2].data.gpudata, self.u1[2].data.strides[0],
                                             self.u1[3].data.gpudata, self.u1[3].data.strides[0],
                                             self.cfl_data.gpudata,
-                                            0, int(self.u0[0].y_halo),
-                                            self.nx, self.ny - int(self.u0[0].y_halo))
+                                            int(self.u0[0].x_halo), int(self.u0[0].y_halo),
+                                            self.nx - int(self.u0[0].x_halo), self.ny - int(self.u0[0].y_halo))
             return
 
     def swapBuffers(self):
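With this change the external pass covers all four boundary strips: north/south span the full width and west/east span the full height, so the four halo-sized corner blocks are computed twice, exactly as the new comment warns; the internal pass is shrunk to match. A small sketch of the extent arithmetic with hypothetical sizes, checking that strips plus interior tile the whole domain:

    nx, ny = 512, 512        # interior size (hypothetical)
    x_halo, y_halo = 2, 2    # ghost-cell widths (hypothetical)

    # (x0, y0, x1, y1) extents mirroring the kernel launches above
    north    = (0, ny - y_halo, nx, ny)
    south    = (0, 0, nx, y_halo)
    west     = (0, 0, x_halo, ny)
    east     = (nx - x_halo, 0, nx, ny)
    internal = (x_halo, y_halo, nx - x_halo, ny - y_halo)

    def area(e):
        return (e[2] - e[0]) * (e[3] - e[1])

    strips = area(north) + area(south) + area(west) + area(east)
    corners = 4 * x_halo * y_halo  # each corner lies in both a NS and a WE strip
    assert strips - corners + area(internal) == nx * ny

The double-computed corners cost a few redundant cells per step but keep the launch logic to five rectangles; correctness is unaffected because both passes compute the same values.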
diff --git a/GPUSimulators/MPISimulator.py b/GPUSimulators/MPISimulator.py
index 3e2a7e0..4828a6c 100644
--- a/GPUSimulators/MPISimulator.py
+++ b/GPUSimulators/MPISimulator.py
@@ -208,18 +208,17 @@ class MPISimulator(Simulator.BaseSimulator):
     """
     def __init__(self, sim, grid):
         self.profiling_data_mpi = { 'start': {}, 'end': {} }
-        self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange"] = 0
-        self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange"] = 0
-        self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_download"] = 0
-        self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_download"] = 0
-        self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_upload"] = 0
-        self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_upload"] = 0
-        self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] = 0
-        self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] = 0
-        self.profiling_data_mpi["start"]["t_step_mpi"] = 0
-        self.profiling_data_mpi["end"]["t_step_mpi"] = 0
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange"] = 0
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange"] = 0
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] = 0
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] = 0
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] = 0
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] = 0
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] = 0
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] = 0
+        self.profiling_data_mpi["start"]["t_mpi_step"] = 0
+        self.profiling_data_mpi["end"]["t_mpi_step"] = 0
         self.profiling_data_mpi["n_time_steps"] = 0
-        self.profiling_data_mpi["start"]["t_sim_mpi_init"] = time.time()
         self.logger = logging.getLogger(__name__)
 
         autotuner = sim.context.autotuner
@@ -297,43 +296,43 @@ class MPISimulator(Simulator.BaseSimulator):
         #Note that east and west also transfer ghost cells
         #whilst north/south only transfer internal cells
        #Reuses the width/height defined in the read-extets above
-        self.in_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_e[3], self.read_e[2]), dtype=np.float32)
-        self.in_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_w[3], self.read_w[2]), dtype=np.float32)
-        self.in_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_n[3], self.read_n[2]), dtype=np.float32)
-        self.in_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_s[3], self.read_s[2]), dtype=np.float32)
+        self.in_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32) #np.empty((self.nvars, self.read_e[3], self.read_e[2]), dtype=np.float32)
+        self.in_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32) #np.empty((self.nvars, self.read_w[3], self.read_w[2]), dtype=np.float32)
+        self.in_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32) #np.empty((self.nvars, self.read_n[3], self.read_n[2]), dtype=np.float32)
+        self.in_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32) #np.empty((self.nvars, self.read_s[3], self.read_s[2]), dtype=np.float32)
 
         #Allocate data for sending
-        self.out_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_e)
-        self.out_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_w)
-        self.out_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_n)
-        self.out_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_s)
+        self.out_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32) #np.empty_like(self.in_e)
+        self.out_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32) #np.empty_like(self.in_w)
+        self.out_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32) #np.empty_like(self.in_n)
+        self.out_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32) #np.empty_like(self.in_s)
 
         self.logger.debug("Simlator rank {:d} initialized on {:s}".format(self.grid.comm.rank, MPI.Get_processor_name()))
 
-        self.profiling_data_mpi["end"]["t_sim_mpi_init"] = time.time()
-        self.old_exchange()
+        self.full_exchange()
+        sim.context.synchronize()
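The staging buffers above use pycuda's page-locked (pinned) host memory, which is what allows the asynchronous download/upload calls on the CUDA stream to DMA directly between host and device; the patch drops the PORTABLE host-alloc flag, which only matters when several CUDA contexts share the same buffer. A self-contained sketch of the pattern under a single-GPU assumption (sizes are hypothetical):

    import numpy as np
    import pycuda.autoinit              # creates a context (single-GPU assumption)
    import pycuda.driver as cuda

    nvars, height, width = 4, 2, 512    # hypothetical halo-buffer shape

    # Page-locked (pinned) host buffer: required for truly async memcpy
    host_buf = cuda.pagelocked_empty((nvars, height, width), dtype=np.float32)
    dev_buf = cuda.mem_alloc(host_buf.nbytes)

    stream = cuda.Stream()
    cuda.memcpy_dtoh_async(host_buf, dev_buf, stream)  # overlaps with queued GPU work
    stream.synchronize()                               # wait before touching host_buf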
 
     def substep(self, dt, step_number):
         nvtx.mark("substep start", color="yellow")
-        self.profiling_data_mpi["start"]["t_step_mpi"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_step"] += time.time()
+
+        nvtx.mark("substep external", color="blue")
+        self.sim.substep(dt, step_number, external=True, internal=False) # only "internal ghost cells"
 
         nvtx.mark("substep internal", color="red")
         self.sim.substep(dt, step_number, internal=True, external=False) # "internal ghost cells" excluded
 
-        nvtx.mark("substep external", color="blue")
-        self.sim.substep(dt, step_number, external=True, internal=False) # only "internal ghost cells"
-
         #nvtx.mark("substep full", color="blue")
         #self.sim.substep(dt, step_number, external=True, internal=True)
 
         self.sim.swapBuffers()
 
-        self.profiling_data_mpi["end"]["t_step_mpi"] += time.time()
+        self.profiling_data_mpi["end"]["t_mpi_step"] += time.time()
 
         nvtx.mark("exchange", color="blue")
-        self.old_exchange()
+        self.full_exchange()
 
         #nvtx.mark("download", color="blue")
         #self.download_for_exchange(self.sim.u0)
@@ -383,8 +382,7 @@ class MPISimulator(Simulator.BaseSimulator):
         return [x0, x1, y0, y1]
 
     def download_for_exchange(self, u):
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_download"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] += time.time()
 
         # North-south
         if self.north is not None:
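Note the reordering in substep(): the external boundary strips now run before the interior. Computing the cells that neighbours depend on first is what makes it possible, in a later optimization, to overlap the halo exchange with the much larger internal sweep. A sketch of that intended overlap (the phase-split helper start_halo_sendrecv is hypothetical; this patch itself still exchanges only after both passes complete):

    def overlapped_substep(sim, dt, step_number):
        # 1. Compute only the boundary cells that neighbours depend on.
        sim.substep(dt, step_number, external=True, internal=False)

        # 2. Stage halos to host and post non-blocking sends/receives
        #    (hypothetical split of full_exchange into phases).
        sim.download_for_exchange(sim.u1)
        requests = sim.start_halo_sendrecv()

        # 3. Meanwhile, compute the interior, which needs no remote data.
        sim.substep(dt, step_number, internal=True, external=False)

        # 4. Finish communication and write the halos back to the GPU.
        for r in requests:
            r.wait()
        sim.upload_for_exchange(sim.u1)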
@@ -406,12 +404,10 @@ class MPISimulator(Simulator.BaseSimulator):
                 u[k].download(self.sim.stream, cpu_data=self.out_w[k,:,:], asynch=True, extent=self.read_w)
                 #self.out_w[k,:,:] = u[k].download(self.sim.stream, asynch=True, extent=self.read_w)
 
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_download"] += time.time()
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] += time.time()
 
     def exchange(self):
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()
 
         #Send/receive to north/south neighbours
         comm_send = []
@@ -441,12 +437,10 @@ class MPISimulator(Simulator.BaseSimulator):
         for comm in comm_send:
             comm.wait()
 
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()
 
     def upload_for_exchange(self, u):
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_upload"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] += time.time()
 
         # North-south
         if self.north is not None:
@@ -464,15 +458,11 @@ class MPISimulator(Simulator.BaseSimulator):
             for k in range(self.nvars):
                 u[k].upload(self.sim.stream, self.in_w[k,:,:], extent=self.write_w)
 
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_upload"] += time.time()
-
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] += time.time()
 
-
-
-    def old_exchange(self):
+    def full_exchange(self):
         ####
         # FIXME: This function can be optimized using persitent communications.
         # Also by overlapping some of the communications north/south and east/west of GPU and intra-node
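full_exchange() below stages each side through the pinned buffers and pairs non-blocking Isend/Irecv calls by tag (tags 4*self.nt + 0..3 distinguish the four sides). A minimal two-rank sketch of that handshake with mpi4py (hypothetical buffer size; run with mpiexec -n 2):

    import numpy as np
    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    other = 1 - rank                               # assumes exactly two ranks

    out_buf = np.full(16, rank, dtype=np.float32)  # halo data for the neighbour
    in_buf = np.empty(16, dtype=np.float32)        # neighbour's halo lands here

    # Post the receive before the send so neither rank can block.
    recv = comm.Irecv(in_buf, source=other, tag=0)
    send = comm.Isend(out_buf, dest=other, tag=0)

    recv.Wait()   # halo now available in in_buf
    send.Wait()   # out_buf may now be reused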
@@ -484,8 +474,7 @@ class MPISimulator(Simulator.BaseSimulator):
         ####
 
         #Download from the GPU
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_download"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] += time.time()
 
         if self.north is not None:
             for k in range(self.nvars):
@@ -495,10 +484,10 @@ class MPISimulator(Simulator.BaseSimulator):
                 self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_s[k,:,:], asynch=True, extent=self.read_s)
 
         self.sim.stream.synchronize()
 
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] += time.time()
+
         #Send/receive to north/south neighbours
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_download"] += time.time()
-            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()
 
         comm_send = []
         comm_recv = []
@@ -513,10 +502,10 @@ class MPISimulator(Simulator.BaseSimulator):
         for comm in comm_recv:
             comm.wait()
 
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()
+
         #Upload to the GPU
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
-            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_upload"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] += time.time()
 
         if self.north is not None:
             for k in range(self.nvars):
@@ -524,25 +513,23 @@ class MPISimulator(Simulator.BaseSimulator):
         if self.south is not None:
             for k in range(self.nvars):
                 self.sim.u0[k].upload(self.sim.stream, self.in_s[k,:,:], extent=self.write_s)
+
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] += time.time()
 
         #Wait for sending to complete
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_upload"] += time.time()
-            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()
 
         for comm in comm_send:
             comm.wait()
-
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()
 
         ####
         # Then transfer east-west including ghost cells that have been filled in by north-south transfer above
         ####
 
         #Download from the GPU
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
-            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_download"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] += time.time()
 
         if self.east is not None:
             for k in range(self.nvars):
@@ -552,10 +539,10 @@ class MPISimulator(Simulator.BaseSimulator):
                 self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_w[k,:,:], asynch=True, extent=self.read_w)
 
         self.sim.stream.synchronize()
 
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] += time.time()
+
         #Send/receive to east/west neighbours
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_download"] += time.time()
-            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()
 
         comm_send = []
         comm_recv = []
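The comment above ("including ghost cells that have been filled in by north-south transfer") is what makes explicit corner messages unnecessary: the north-south phase fills the halo rows first, and the east-west phase then sends full columns, halo rows included, so corner data reaches diagonal neighbours in two hops. A toy numpy illustration of that ordering on a single array (1-cell halo; the in-place copies stand in for the actual MPI transfers):

    import numpy as np

    h = 1                                  # halo width (hypothetical)
    u = np.zeros((6, 6))
    u[h:-h, h:-h] = 1.0                    # interior owned by this rank

    # Phase 1: north/south exchange fills interior columns of the halo rows.
    u[0, h:-h] = u[h, h:-h]
    u[-1, h:-h] = u[-2, h:-h]

    # Phase 2: east/west exchange sends FULL columns, halo rows included,
    # so the corners inherit the values written in phase 1.
    u[:, 0] = u[:, h]
    u[:, -1] = u[:, -h - 1]

    assert u[0, 0] == 1.0                  # corner filled, no corner message needed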
@@ -566,15 +553,14 @@ class MPISimulator(Simulator.BaseSimulator):
             comm_send += [self.grid.comm.Isend(self.out_w, dest=self.west, tag=4*self.nt + 3)]
             comm_recv += [self.grid.comm.Irecv(self.in_w, source=self.west, tag=4*self.nt + 2)]
 
-
         #Wait for incoming transfers to complete
         for comm in comm_recv:
             comm.wait()
 
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()
+
         #Upload to the GPU
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
-            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_upload"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] += time.time()
 
         if self.east is not None:
             for k in range(self.nvars):
@@ -583,13 +569,12 @@ class MPISimulator(Simulator.BaseSimulator):
             for k in range(self.nvars):
                 self.sim.u0[k].upload(self.sim.stream, self.in_w[k,:,:], extent=self.write_w)
 
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] += time.time()
+
         #Wait for sending to complete
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_upload"] += time.time()
-            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()
         for comm in comm_send:
             comm.wait()
-
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
+
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()
diff --git a/mpiTesting.py b/mpiTesting.py
index 42e3bad..11b8189 100644
--- a/mpiTesting.py
+++ b/mpiTesting.py
@@ -183,6 +183,8 @@ if(args.profile and MPI.COMM_WORLD.rank == 0):
     profiling_data["slurm_job_id"] = job_id
     profiling_data["n_cuda_devices"] = str(num_cuda_devices)
     profiling_data["n_processes"] = str(MPI.COMM_WORLD.size)
+    profiling_data["git_hash"] = Common.getGitHash()
+    profiling_data["git_status"] = Common.getGitStatus()
 
     with open(profiling_file, "w") as write_file:
         json.dump(profiling_data, write_file)
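The two new profiling fields reuse the same Common.getGitHash / Common.getGitStatus helpers that runSimulation already writes into the netCDF attributes, so a profiling JSON can be traced back to the exact code revision. The actual implementations live in GPUSimulators/Common.py and are not shown in this patch; a typical sketch of such helpers would be:

    import subprocess

    def getGitHash():
        # Commit id of HEAD; assumes the script runs inside the git checkout.
        return subprocess.check_output(["git", "rev-parse", "HEAD"]).decode().strip()

    def getGitStatus():
        # Porcelain status: an empty string means the working tree is clean.
        return subprocess.check_output(["git", "status", "--porcelain"]).decode()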