From 2c6ecc8d886ef8466fae226aa15fbed2f37295ec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20R=2E=20Brodtkorb?= <Andre.Brodtkorb@sintef.no>
Date: Mon, 10 Dec 2018 12:08:10 +0100
Subject: [PATCH] Pinned memory

---
 GPUSimulators/Common.py       | 16 +++++++++++++---
 GPUSimulators/IPythonMagic.py |  6 +++++-
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/GPUSimulators/Common.py b/GPUSimulators/Common.py
index ddc7701..5e8fd24 100644
--- a/GPUSimulators/Common.py
+++ b/GPUSimulators/Common.py
@@ -38,6 +38,7 @@ import json
 import pycuda.compiler as cuda_compiler
 import pycuda.gpuarray
 import pycuda.driver as cuda
+from pycuda.tools import PageLockedMemoryPool
 
 
 
@@ -482,6 +483,9 @@ class CudaArray2D:
         #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
         self.data = pycuda.gpuarray.zeros((ny_halo, nx_halo), dtype)
         
+        #Pool of page-locked (pinned) host memory used to allocate the buffers returned by download
+        self.memorypool = PageLockedMemoryPool()
+        
         #If we don't have any data, just allocate and return
         if cpu_data is None:
             return
@@ -518,8 +522,10 @@ class CudaArray2D:
         if (cpu_data is None):
             #self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
             #Allocate host memory
-            #cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)
-            cpu_data = np.empty((ny, nx), dtype=np.float32)
+            #NOTE: allocating directly with pagelocked_empty crashes Python here (cause unknown):
+            #cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)
+            #Non-pagelocked: cpu_data = np.empty((ny, nx), dtype=np.float32)
+            cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32)
             
         assert nx == cpu_data.shape[1]
         assert ny == cpu_data.shape[0]
@@ -610,6 +616,9 @@ class CudaArray3D:
         #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
         self.data = pycuda.gpuarray.zeros((nz_halo, ny_halo, nx_halo), dtype)
         
+        #Pool of page-locked (pinned) host memory used to allocate the buffers returned by download
+        self.memorypool = PageLockedMemoryPool()
+        
         #If we don't have any data, just allocate and return
         if cpu_data is None:
             return
@@ -662,7 +671,8 @@ class CudaArray3D:
         #self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
         #Allocate host memory
         #cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)
-        cpu_data = np.empty((self.nz, self.ny, self.nx), dtype=np.float32)
+        #cpu_data = np.empty((self.nz, self.ny, self.nx), dtype=np.float32)
+        cpu_data = self.memorypool.allocate((self.nz, self.ny, self.nx), dtype=np.float32)
         
         #Create copy object from device to host
         copy = cuda.Memcpy2D()
diff --git a/GPUSimulators/IPythonMagic.py b/GPUSimulators/IPythonMagic.py
index 2cca8c1..fa452df 100644
--- a/GPUSimulators/IPythonMagic.py
+++ b/GPUSimulators/IPythonMagic.py
@@ -47,6 +47,10 @@ class MagicCudaContext(Magics):
         
         self.logger.info("Registering %s in user workspace", args.name)
         
+        context_flags = None
+        if (args.blocking):
+            context_flags = cuda.ctx_flags.SCHED_BLOCKING_SYNC
+        
         if args.name in self.shell.user_ns.keys():
             self.logger.debug("Context already registered! Ignoring")
             return
@@ -54,7 +58,7 @@ class MagicCudaContext(Magics):
             self.logger.debug("Creating context")
             use_cache = False if args.no_cache else True
             use_autotuning = False if args.no_autotuning else True
-            self.shell.user_ns[args.name] = CudaContext.CudaContext(blocking=args.blocking, use_cache=use_cache, autotuning=use_autotuning)
+            self.shell.user_ns[args.name] = CudaContext.CudaContext(context_flags=context_flags, use_cache=use_cache, autotuning=use_autotuning)
         
         # this function will be called on exceptions in any cell
         def custom_exc(shell, etype, evalue, tb, tb_offset=None):