diff --git a/GPUSimulators/EE2D_KP07_dimsplit.py b/GPUSimulators/EE2D_KP07_dimsplit.py
index 5059f9d..cc15c9c 100644
--- a/GPUSimulators/EE2D_KP07_dimsplit.py
+++ b/GPUSimulators/EE2D_KP07_dimsplit.py
@@ -135,7 +135,6 @@ class EE2D_KP07_dimsplit (BaseSimulator):
                     self.cfl_data.gpudata,
                     0, 0,
                     self.nx, self.ny)
-            self.u0, self.u1 = self.u1, self.u0
             return
 
         if external and not internal:
diff --git a/GPUSimulators/MPISimulator.py b/GPUSimulators/MPISimulator.py
index 80cca46..3e2a7e0 100644
--- a/GPUSimulators/MPISimulator.py
+++ b/GPUSimulators/MPISimulator.py
@@ -27,7 +27,7 @@ from mpi4py import MPI
 import time
 
 import pycuda.driver as cuda
-#import nvtx
+import nvtx
 
 
 
@@ -137,6 +137,10 @@ class MPIGrid(object):
         #Sort in descending order
         grid = np.sort(grid)
         grid = grid[::-1]
+
+        # XXX: We only use vertical (north-south) partitioning for now
+        grid[0] = 1
+        grid[1] = num_nodes
 
         return grid
 
@@ -241,17 +245,11 @@ class MPISimulator(Simulator.BaseSimulator):
 
         #Get coordinate of this node
         #and handle global boundary conditions
-        #new_boundary_conditions = Simulator.BoundaryCondition({
-        #    'north': Simulator.BoundaryCondition.Type.Dirichlet,
-        #    'south': Simulator.BoundaryCondition.Type.Dirichlet,
-        #    'east': Simulator.BoundaryCondition.Type.Dirichlet,
-        #    'west': Simulator.BoundaryCondition.Type.Dirichlet
-        #})
         new_boundary_conditions = Simulator.BoundaryCondition({
-            'north': Simulator.BoundaryCondition.Type.Reflective,
-            'south': Simulator.BoundaryCondition.Type.Reflective,
-            'east': Simulator.BoundaryCondition.Type.Reflective,
-            'west': Simulator.BoundaryCondition.Type.Reflective
+            'north': Simulator.BoundaryCondition.Type.Dirichlet,
+            'south': Simulator.BoundaryCondition.Type.Dirichlet,
+            'east': Simulator.BoundaryCondition.Type.Dirichlet,
+            'west': Simulator.BoundaryCondition.Type.Dirichlet
         })
         gi, gj = grid.getCoordinate()
         print("gi: " + str(gi) + ", gj: " + str(gj))
@@ -313,36 +311,43 @@ class MPISimulator(Simulator.BaseSimulator):
 
         self.logger.debug("Simlator rank {:d} initialized on {:s}".format(self.grid.comm.rank, MPI.Get_processor_name()))
         self.profiling_data_mpi["end"]["t_sim_mpi_init"] = time.time()
 
-        #Init ghost cells (with data from neighboring subdomains)
-        self.download_for_exchange(self.sim.u0)
-        self.exchange()
-        self.upload_for_exchange(self.sim.u0)
+        self.old_exchange()
 
 
     def substep(self, dt, step_number):
-        #nvtx.mark("substep start", color="red")
+
+        nvtx.mark("substep start", color="yellow")
         self.profiling_data_mpi["start"]["t_step_mpi"] += time.time()
 
-        #nvtx.mark("substep external", color="blue")
+
+        nvtx.mark("substep internal", color="red")
+        self.sim.substep(dt, step_number, internal=True, external=False) # "internal ghost cells" excluded
+
+        nvtx.mark("substep external", color="blue")
         self.sim.substep(dt, step_number, external=True, internal=False) # only "internal ghost cells"
-        #nvtx.mark("substep internal", color="red")
-        self.sim.substep(dt, step_number, internal=True, external=False) # "internal ghost cells" excluded
-
-        #nvtx.mark("download", color="red")
+        #nvtx.mark("substep full", color="blue")
+        #self.sim.substep(dt, step_number, external=True, internal=True)
+
         self.sim.swapBuffers()
-        self.download_for_exchange(self.sim.u0)
-        #nvtx.mark("sync", color="red")
-        self.sim.stream.synchronize()
+        self.profiling_data_mpi["end"]["t_step_mpi"] += time.time()
+
+        nvtx.mark("exchange", color="blue")
+        self.old_exchange()
+
+        #nvtx.mark("download", color="blue")
+        #self.download_for_exchange(self.sim.u0)
+        #nvtx.mark("sync", color="blue")
+        #self.sim.stream.synchronize()
         #nvtx.mark("MPI", color="green")
-        self.profiling_data_mpi["end"]["t_step_mpi"] += time.time()
-        self.exchange()
-        self.profiling_data_mpi["start"]["t_step_mpi"] += time.time()
-        #nvtx.mark("upload", color="red")
-        self.upload_for_exchange(self.sim.u0)
+        #self.exchange()
+        #nvtx.mark("upload", color="blue")
+        #self.upload_for_exchange(self.sim.u0)
+        nvtx.mark("sync start", color="blue")
+        self.sim.stream.synchronize()
         self.sim.internal_stream.synchronize()
-        self.profiling_data_mpi["end"]["t_step_mpi"] += time.time()
+        nvtx.mark("sync end", color="blue")
 
         self.profiling_data_mpi["n_time_steps"] += 1
 
 
@@ -408,10 +413,6 @@ class MPISimulator(Simulator.BaseSimulator):
         if self.profiling_data_mpi["n_time_steps"] > 0:
             self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
 
-        ####
-        # First transfer internal cells north-south
-        ####
-
         #Send/receive to north/south neighbours
         comm_send = []
         comm_recv = []
@@ -422,17 +423,6 @@ class MPISimulator(Simulator.BaseSimulator):
             comm_send += [self.grid.comm.Isend(self.out_s, dest=self.south, tag=4*self.nt + 1)]
             comm_recv += [self.grid.comm.Irecv(self.in_s, source=self.south, tag=4*self.nt + 0)]
 
-        #Wait for incoming transfers to complete
-        for comm in comm_recv:
-            comm.wait()
-
-        #Wait for sending to complete
-        for comm in comm_send:
-            comm.wait()
-
-        ####
-        # Then transfer east-west including ghost cells that have been filled in by north-south transfer above
-        ####
         #Send/receive to east/west neighbours
         comm_send = []
         comm_recv = []
@@ -443,7 +433,6 @@ class MPISimulator(Simulator.BaseSimulator):
             comm_send += [self.grid.comm.Isend(self.out_w, dest=self.west, tag=4*self.nt + 3)]
             comm_recv += [self.grid.comm.Irecv(self.in_w, source=self.west, tag=4*self.nt + 2)]
 
-        #Wait for incoming transfers to complete
         for comm in comm_recv:
             comm.wait()
 
diff --git a/GPUSimulators/SHMEMSimulatorGroup.py b/GPUSimulators/SHMEMSimulatorGroup.py
index 12ea456..fc11d50 100644
--- a/GPUSimulators/SHMEMSimulatorGroup.py
+++ b/GPUSimulators/SHMEMSimulatorGroup.py
@@ -156,10 +156,10 @@ class SHMEMGrid(object):
 
         return grid
 
-class SHMEMSimulatorGroup(Simulator.BaseSimulator):
+class SHMEMSimulatorGroup(object):
     """
     Class which handles communication and synchronization between simulators in different
-    contexts (presumably on different GPUs)
+    contexts (typically on different GPUs)
     """
     def __init__(self, sims, grid):
         self.logger = logging.getLogger(__name__)
@@ -175,6 +175,9 @@ class SHMEMSimulatorGroup(Simulator.BaseSimulator):
         # SHMEMSimulators that have BaseSimulator as a superclass.
         #
         # This would also eliminate the need for all the array bookkeeping in this class.
+        #
+        # CONT HERE! Model shmemTesting after mpiTesting and divide existing functionality between SHMEMSimulatorGroup and SHMEMSimulator
+
         autotuner = sims[0].context.autotuner
         sims[0].context.autotuner = None
         boundary_conditions = sims[0].getBoundaryConditions()