Fixed multigpu for euler

2026-01-14 15:48:43 +01:00 · 2018-11-29 14:33:13 +01:00
parent b03afc3d81
commit f9f0f20df8
7 changed files with 463 additions and 637 deletions
--- a/GPUSimulators/Common.py
+++ b/GPUSimulators/Common.py
@@ -24,6 +24,7 @@ import os

 import numpy as np
 import time
+import signal
 import subprocess
 import tempfile
 import re
@@ -102,7 +103,11 @@ class IPEngine(object):
        self.logger.info("Starting IPController")
        self.c_buff = PopenFileBuffer()
        c_cmd = ["ipcontroller",  "--ip='*'"]
-        self.c = subprocess.Popen(c_cmd, stderr=self.c_buff.stderr, stdout=self.c_buff.stdout, shell=False)
+        self.c = subprocess.Popen(c_cmd, 
+                stderr=self.c_buff.stderr, 
+                stdout=self.c_buff.stdout, 
+                shell=False,
+                creationflags=subprocess.CREATE_NEW_PROCESS_GROUP)
        
        #Wait until controller is running
        time.sleep(3)
@@ -111,11 +116,16 @@ class IPEngine(object):
        self.logger.info("Starting IPEngines")
        self.e_buff = PopenFileBuffer()
        e_cmd = ["mpiexec", "-n", str(n_engines), "ipengine", "--mpi"]
-        self.e = subprocess.Popen(e_cmd, stderr=self.e_buff.stderr, stdout=self.e_buff.stdout, shell=False)
+        self.e = subprocess.Popen(e_cmd, 
+                stderr=self.e_buff.stderr, 
+                stdout=self.e_buff.stdout, 
+                shell=False,
+                creationflags=subprocess.CREATE_NEW_PROCESS_GROUP)

        # attach to a running cluster
        import ipyparallel
        self.cluster = ipyparallel.Client()#profile='mpi')
+        time.sleep(3)
        while(len(self.cluster.ids) != n_engines):
            time.sleep(0.5)
            self.logger.info("Waiting for cluster...")
@@ -124,37 +134,51 @@ class IPEngine(object):
        self.logger.info("Done")
        
    def __del__(self):
-        self.cluster.shutdown(hub=True)
-        
-        self.e.terminate()
-        try:
-            self.e.communicate(timeout=3)
-        except TimeoutExpired:
-            self.logger.warn("Killing IPEngine")
-            self.e.kill()
-            self.e.communicate()
+        self.shutdown()
+    
+    def shutdown(self):
+        if (self.e is not None):
+            if (os.name == 'nt'):
+                self.logger.warn("Sending CTRL+C to IPEngine")
+                self.e.send_signal(signal.CTRL_C_EVENT)
+                
+            try:
+                self.e.communicate(timeout=3)
+                self.e.kill()
+            except subprocess.TimeoutExpired:
+                self.logger.warn("Killing IPEngine")
+                self.e.kill()
+                self.e.communicate()
+            self.e = None
+                
+            cout, cerr = self.e_buff.read()
+            self.logger.info("IPEngine cout: {:s}".format(cout))
+            self.logger.info("IPEngine cerr: {:s}".format(cerr))
+            self.e_buff = None
            
-        cout, cerr = self.e_buff.read()
-        self.logger.info("IPEngine cout: {:s}".format(cout))
-        self.logger.info("IPEngine cerr: {:s}".format(cerr))
-        
-        
-        while(len(self.cluster.ids) != 0):
-            time.sleep(0.5)
-            self.logger.info("Waiting for cluster to shutdown...")
+            gc.collect()
            
+        if (self.c is not None):
+            if (os.name == 'nt'):
+                self.logger.warn("Sending CTRL+C to IPController")
+                self.c.send_signal(signal.CTRL_C_EVENT)
+                
+            try:
+                self.c.communicate(timeout=3)
+                self.c.kill()
+            except subprocess.TimeoutExpired:
+                self.logger.warn("Killing IPController")
+                self.c.kill()
+                self.c.communicate()
+            self.c = None
+                
+            cout, cerr = self.c_buff.read()
+            self.logger.info("IPController cout: {:s}".format(cout))
+            self.logger.info("IPController cerr: {:s}".format(cerr))
+            self.c_buff = None
+        
+            gc.collect()
        
-        self.c.terminate()
-        try:
-            self.c.communicate(timeout=3)
-        except TimeoutExpired:
-            self.logger.warn("Killing IPController")
-            self.c.kill()
-            self.c.communicate()
-            
-        cout, cerr = self.c_buff.read()
-        self.logger.info("IPController cout: {:s}".format(cout))
-        self.logger.info("IPController cerr: {:s}".format(cerr))

            
        
@@ -348,14 +372,14 @@ class CudaArray2D:
        copy.set_dst_host(cpu_data)
        
        #Set offsets and pitch of source
-        copy.src_x_in_bytes = x*self.data.strides[1]
-        copy.src_y = y
+        copy.src_x_in_bytes = int(x)*self.data.strides[1]
+        copy.src_y = int(y)
        copy.src_pitch = self.data.strides[0]
        
        #Set width in bytes to copy for each row and
        #number of rows to copy
-        copy.width_in_bytes = nx*cpu_data.itemsize
-        copy.height = ny
+        copy.width_in_bytes = int(nx)*cpu_data.itemsize
+        copy.height = int(ny)
        
        copy(stream)
        if async==False:
@@ -384,14 +408,14 @@ class CudaArray2D:
        copy.set_src_host(cpu_data)
        
        #Set offsets and pitch of source
-        copy.dst_x_in_bytes = x*self.data.strides[1]
-        copy.dst_y = y
+        copy.dst_x_in_bytes = int(x)*self.data.strides[1]
+        copy.dst_y = int(y)
        copy.dst_pitch = self.data.strides[0]
        
        #Set width in bytes to copy for each row and
        #number of rows to copy
-        copy.width_in_bytes = nx*cpu_data.itemsize
-        copy.height = ny
+        copy.width_in_bytes = int(nx)*cpu_data.itemsize
+        copy.height = int(ny)
        
        copy(stream)