refactor(kernel): split Common.py to a separate package

Anthony Berg 2025-06-24 17:34:29 +02:00
parent 8f24cd45ea
commit c54f08c417
39 changed files with 1969 additions and 143694 deletions
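For downstream code, the practical effect of this refactor is that the monolithic `Common` module becomes the `GPUSimulators.common` package, and the CUDA context moves to `GPUSimulators.gpu`. A minimal migration sketch based on the imports that appear in the diffs below (not an exhaustive list of relocated names):

```python
# Before: helpers came from the monolithic Common module.
# from GPUSimulators import Common
# with Common.Timer("tag") as t: ...
# u0 = Common.ArakawaA2D(stream, nx, ny, 2, 2, [h, hu, hv])
# context = CudaContext.CudaContext(autotuning=False)

# After: helpers live in the GPUSimulators.common package,
# and the CUDA context in GPUSimulators.gpu (as used in the diffs below).
from GPUSimulators.common import common, Timer, ArakawaA2D
from GPUSimulators.gpu import CudaContext

context = CudaContext(autotuning=False)
with Timer("example") as t:
    pass  # timed section
```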

View File

@@ -27,16 +27,11 @@
"from matplotlib import pyplot as plt\n",
"from mpl_toolkits.axes_grid1 import make_axes_locatable\n",
"\n",
"import subprocess\n",
"import os\n",
"import gc\n",
"import datetime\n",
"import importlib\n",
"import logging\n",
"from socket import gethostname\n",
"\n",
"import pycuda.driver as cuda\n",
"import pycuda.compiler\n",
"\n",
"try:\n",
" from StringIO import StringIO\n",
@@ -55,7 +50,8 @@
"metadata": {},
"outputs": [],
"source": [
"from GPUSimulators import Common, IPythonMagic, LxF, FORCE, HLL, HLL2, KP07, KP07_dimsplit, WAF, Autotuner"
"from GPUSimulators import LxF, FORCE, HLL, HLL2, KP07, KP07_dimsplit, WAF, Autotuner\n",
"from GPUSimulators.common import common"
]
},
{
@@ -124,14 +120,14 @@
"evalue": "All-NaN slice encountered",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[9], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m simulators \u001b[38;5;241m=\u001b[39m [LxF\u001b[38;5;241m.\u001b[39mLxF, FORCE\u001b[38;5;241m.\u001b[39mFORCE, HLL\u001b[38;5;241m.\u001b[39mHLL, HLL2\u001b[38;5;241m.\u001b[39mHLL2, KP07\u001b[38;5;241m.\u001b[39mKP07, KP07_dimsplit\u001b[38;5;241m.\u001b[39mKP07_dimsplit, WAF\u001b[38;5;241m.\u001b[39mWAF]\n\u001b[0;32m----> 2\u001b[0m peak_performance \u001b[38;5;241m=\u001b[39m [autotuner\u001b[38;5;241m.\u001b[39mget_peak_performance(simulator) \u001b[38;5;28;01mfor\u001b[39;00m simulator \u001b[38;5;129;01min\u001b[39;00m simulators]\n\u001b[1;32m 3\u001b[0m megacells \u001b[38;5;241m=\u001b[39m [performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmegacells\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;28;01mfor\u001b[39;00m performance \u001b[38;5;129;01min\u001b[39;00m peak_performance]\n\u001b[1;32m 4\u001b[0m xlabels \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{:s}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m[\u001b[39m\u001b[38;5;132;01m{:d}\u001b[39;00m\u001b[38;5;124mx\u001b[39m\u001b[38;5;132;01m{:d}\u001b[39;00m\u001b[38;5;124m]\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(simulators[i]\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mblock_width\u001b[39m\u001b[38;5;124m'\u001b[39m], performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mblock_height\u001b[39m\u001b[38;5;124m'\u001b[39m]) \u001b[38;5;28;01mfor\u001b[39;00m i, performance \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(peak_performance)]\n",
"Cell \u001b[0;32mIn[9], line 2\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 1\u001b[0m simulators \u001b[38;5;241m=\u001b[39m [LxF\u001b[38;5;241m.\u001b[39mLxF, FORCE\u001b[38;5;241m.\u001b[39mFORCE, HLL\u001b[38;5;241m.\u001b[39mHLL, HLL2\u001b[38;5;241m.\u001b[39mHLL2, KP07\u001b[38;5;241m.\u001b[39mKP07, KP07_dimsplit\u001b[38;5;241m.\u001b[39mKP07_dimsplit, WAF\u001b[38;5;241m.\u001b[39mWAF]\n\u001b[0;32m----> 2\u001b[0m peak_performance \u001b[38;5;241m=\u001b[39m [\u001b[43mautotuner\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_peak_performance\u001b[49m\u001b[43m(\u001b[49m\u001b[43msimulator\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m simulator \u001b[38;5;129;01min\u001b[39;00m simulators]\n\u001b[1;32m 3\u001b[0m megacells \u001b[38;5;241m=\u001b[39m [performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmegacells\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;28;01mfor\u001b[39;00m performance \u001b[38;5;129;01min\u001b[39;00m peak_performance]\n\u001b[1;32m 4\u001b[0m xlabels \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{:s}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m[\u001b[39m\u001b[38;5;132;01m{:d}\u001b[39;00m\u001b[38;5;124mx\u001b[39m\u001b[38;5;132;01m{:d}\u001b[39;00m\u001b[38;5;124m]\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(simulators[i]\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mblock_width\u001b[39m\u001b[38;5;124m'\u001b[39m], performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mblock_height\u001b[39m\u001b[38;5;124m'\u001b[39m]) \u001b[38;5;28;01mfor\u001b[39;00m i, performance \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(peak_performance)]\n",
"File \u001b[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:132\u001b[0m, in \u001b[0;36mAutotuner.get_peak_performance\u001b[0;34m(self, simulator)\u001b[0m\n\u001b[1;32m 130\u001b[0m block_widths \u001b[38;5;241m=\u001b[39m data[key \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_block_widths\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 131\u001b[0m block_heights \u001b[38;5;241m=\u001b[39m data[key \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_block_heights\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[0;32m--> 132\u001b[0m j, i \u001b[38;5;241m=\u001b[39m \u001b[43mfind_max_index\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmegacells\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 134\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mperformance[key] \u001b[38;5;241m=\u001b[39m { \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mblock_width\u001b[39m\u001b[38;5;124m\"\u001b[39m: block_widths[i],\n\u001b[1;32m 135\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mblock_height\u001b[39m\u001b[38;5;124m\"\u001b[39m: block_heights[j],\n\u001b[1;32m 136\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmegacells\u001b[39m\u001b[38;5;124m\"\u001b[39m: megacells[j, i] }\n\u001b[1;32m 137\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mReturning \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m as peak performance parameters\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mperformance[key])\n",
"File \u001b[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:126\u001b[0m, in \u001b[0;36mAutotuner.get_peak_performance.<locals>.find_max_index\u001b[0;34m(megacells)\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfind_max_index\u001b[39m(megacells):\n\u001b[0;32m--> 126\u001b[0m max_index \u001b[38;5;241m=\u001b[39m \u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnanargmax\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmegacells\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 127\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m np\u001b[38;5;241m.\u001b[39munravel_index(max_index, megacells\u001b[38;5;241m.\u001b[39mshape)\n",
"File \u001b[0;32m~/.conda/envs/ShallowWaterGPU/lib/python3.9/site-packages/numpy/lib/nanfunctions.py:613\u001b[0m, in \u001b[0;36mnanargmax\u001b[0;34m(a, axis, out, keepdims)\u001b[0m\n\u001b[1;32m 611\u001b[0m mask \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mall(mask, axis\u001b[38;5;241m=\u001b[39maxis)\n\u001b[1;32m 612\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m np\u001b[38;5;241m.\u001b[39many(mask):\n\u001b[0;32m--> 613\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAll-NaN slice encountered\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 614\u001b[0m res \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39margmax(a, axis\u001b[38;5;241m=\u001b[39maxis, out\u001b[38;5;241m=\u001b[39mout, keepdims\u001b[38;5;241m=\u001b[39mkeepdims)\n\u001b[1;32m 615\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m res\n",
"\u001b[0;31mValueError\u001b[0m: All-NaN slice encountered"
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mValueError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[0;32mIn[9], line 2\u001B[0m\n\u001B[1;32m 1\u001B[0m simulators \u001B[38;5;241m=\u001B[39m [LxF\u001B[38;5;241m.\u001B[39mLxF, FORCE\u001B[38;5;241m.\u001B[39mFORCE, HLL\u001B[38;5;241m.\u001B[39mHLL, HLL2\u001B[38;5;241m.\u001B[39mHLL2, KP07\u001B[38;5;241m.\u001B[39mKP07, KP07_dimsplit\u001B[38;5;241m.\u001B[39mKP07_dimsplit, WAF\u001B[38;5;241m.\u001B[39mWAF]\n\u001B[0;32m----> 2\u001B[0m peak_performance \u001B[38;5;241m=\u001B[39m [autotuner\u001B[38;5;241m.\u001B[39mget_peak_performance(simulator) \u001B[38;5;28;01mfor\u001B[39;00m simulator \u001B[38;5;129;01min\u001B[39;00m simulators]\n\u001B[1;32m 3\u001B[0m megacells \u001B[38;5;241m=\u001B[39m [performance[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mmegacells\u001B[39m\u001B[38;5;124m'\u001B[39m] \u001B[38;5;28;01mfor\u001B[39;00m performance \u001B[38;5;129;01min\u001B[39;00m peak_performance]\n\u001B[1;32m 4\u001B[0m xlabels \u001B[38;5;241m=\u001B[39m [\u001B[38;5;124m\"\u001B[39m\u001B[38;5;132;01m{:s}\u001B[39;00m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[38;5;124m[\u001B[39m\u001B[38;5;132;01m{:d}\u001B[39;00m\u001B[38;5;124mx\u001B[39m\u001B[38;5;132;01m{:d}\u001B[39;00m\u001B[38;5;124m]\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;241m.\u001B[39mformat(simulators[i]\u001B[38;5;241m.\u001B[39m\u001B[38;5;18m__name__\u001B[39m, performance[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mblock_width\u001B[39m\u001B[38;5;124m'\u001B[39m], performance[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mblock_height\u001B[39m\u001B[38;5;124m'\u001B[39m]) \u001B[38;5;28;01mfor\u001B[39;00m i, performance \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28menumerate\u001B[39m(peak_performance)]\n",
"Cell \u001B[0;32mIn[9], line 2\u001B[0m, in \u001B[0;36m<listcomp>\u001B[0;34m(.0)\u001B[0m\n\u001B[1;32m 1\u001B[0m simulators \u001B[38;5;241m=\u001B[39m [LxF\u001B[38;5;241m.\u001B[39mLxF, FORCE\u001B[38;5;241m.\u001B[39mFORCE, HLL\u001B[38;5;241m.\u001B[39mHLL, HLL2\u001B[38;5;241m.\u001B[39mHLL2, KP07\u001B[38;5;241m.\u001B[39mKP07, KP07_dimsplit\u001B[38;5;241m.\u001B[39mKP07_dimsplit, WAF\u001B[38;5;241m.\u001B[39mWAF]\n\u001B[0;32m----> 2\u001B[0m peak_performance \u001B[38;5;241m=\u001B[39m [\u001B[43mautotuner\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_peak_performance\u001B[49m\u001B[43m(\u001B[49m\u001B[43msimulator\u001B[49m\u001B[43m)\u001B[49m \u001B[38;5;28;01mfor\u001B[39;00m simulator \u001B[38;5;129;01min\u001B[39;00m simulators]\n\u001B[1;32m 3\u001B[0m megacells \u001B[38;5;241m=\u001B[39m [performance[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mmegacells\u001B[39m\u001B[38;5;124m'\u001B[39m] \u001B[38;5;28;01mfor\u001B[39;00m performance \u001B[38;5;129;01min\u001B[39;00m peak_performance]\n\u001B[1;32m 4\u001B[0m xlabels \u001B[38;5;241m=\u001B[39m [\u001B[38;5;124m\"\u001B[39m\u001B[38;5;132;01m{:s}\u001B[39;00m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[38;5;124m[\u001B[39m\u001B[38;5;132;01m{:d}\u001B[39;00m\u001B[38;5;124mx\u001B[39m\u001B[38;5;132;01m{:d}\u001B[39;00m\u001B[38;5;124m]\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;241m.\u001B[39mformat(simulators[i]\u001B[38;5;241m.\u001B[39m\u001B[38;5;18m__name__\u001B[39m, performance[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mblock_width\u001B[39m\u001B[38;5;124m'\u001B[39m], performance[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mblock_height\u001B[39m\u001B[38;5;124m'\u001B[39m]) \u001B[38;5;28;01mfor\u001B[39;00m i, performance \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28menumerate\u001B[39m(peak_performance)]\n",
"File \u001B[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:132\u001B[0m, in \u001B[0;36mAutotuner.get_peak_performance\u001B[0;34m(self, simulator)\u001B[0m\n\u001B[1;32m 130\u001B[0m block_widths \u001B[38;5;241m=\u001B[39m data[key \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124m_block_widths\u001B[39m\u001B[38;5;124m'\u001B[39m]\n\u001B[1;32m 131\u001B[0m block_heights \u001B[38;5;241m=\u001B[39m data[key \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124m_block_heights\u001B[39m\u001B[38;5;124m'\u001B[39m]\n\u001B[0;32m--> 132\u001B[0m j, i \u001B[38;5;241m=\u001B[39m \u001B[43mfind_max_index\u001B[49m\u001B[43m(\u001B[49m\u001B[43mmegacells\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 134\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mperformance[key] \u001B[38;5;241m=\u001B[39m { \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mblock_width\u001B[39m\u001B[38;5;124m\"\u001B[39m: block_widths[i],\n\u001B[1;32m 135\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mblock_height\u001B[39m\u001B[38;5;124m\"\u001B[39m: block_heights[j],\n\u001B[1;32m 136\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mmegacells\u001B[39m\u001B[38;5;124m\"\u001B[39m: megacells[j, i] }\n\u001B[1;32m 137\u001B[0m logger\u001B[38;5;241m.\u001B[39mdebug(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mReturning \u001B[39m\u001B[38;5;132;01m%s\u001B[39;00m\u001B[38;5;124m as peak performance parameters\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mperformance[key])\n",
"File \u001B[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:126\u001B[0m, in \u001B[0;36mAutotuner.get_peak_performance.<locals>.find_max_index\u001B[0;34m(megacells)\u001B[0m\n\u001B[1;32m 125\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mfind_max_index\u001B[39m(megacells):\n\u001B[0;32m--> 126\u001B[0m max_index \u001B[38;5;241m=\u001B[39m \u001B[43mnp\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mnanargmax\u001B[49m\u001B[43m(\u001B[49m\u001B[43mmegacells\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 127\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m np\u001B[38;5;241m.\u001B[39munravel_index(max_index, megacells\u001B[38;5;241m.\u001B[39mshape)\n",
"File \u001B[0;32m~/.conda/envs/ShallowWaterGPU/lib/python3.9/site-packages/numpy/lib/nanfunctions.py:613\u001B[0m, in \u001B[0;36mnanargmax\u001B[0;34m(a, axis, out, keepdims)\u001B[0m\n\u001B[1;32m 611\u001B[0m mask \u001B[38;5;241m=\u001B[39m np\u001B[38;5;241m.\u001B[39mall(mask, axis\u001B[38;5;241m=\u001B[39maxis)\n\u001B[1;32m 612\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m np\u001B[38;5;241m.\u001B[39many(mask):\n\u001B[0;32m--> 613\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mAll-NaN slice encountered\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 614\u001B[0m res \u001B[38;5;241m=\u001B[39m np\u001B[38;5;241m.\u001B[39margmax(a, axis\u001B[38;5;241m=\u001B[39maxis, out\u001B[38;5;241m=\u001B[39mout, keepdims\u001B[38;5;241m=\u001B[39mkeepdims)\n\u001B[1;32m 615\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m res\n",
"\u001B[0;31mValueError\u001B[0m: All-NaN slice encountered"
]
}
],
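The `ValueError: All-NaN slice encountered` above comes from `np.nanargmax` in `Autotuner.get_peak_performance`: it is raised when every entry of the benchmark array is NaN, i.e. when every benchmark run failed (note that `run_benchmark` below returns `np.nan` on any failure). A minimal defensive sketch of the inner helper, with a hypothetical guard that is not part of this commit:

```python
import numpy as np

def find_max_index(megacells):
    """Return (row, col) of the fastest configuration, or fail with a clearer message."""
    if np.all(np.isnan(megacells)):
        # Every benchmark returned NaN; np.nanargmax would raise
        # "All-NaN slice encountered" here.
        raise RuntimeError("All benchmark runs failed; no autotuning data to pick from")
    max_index = np.nanargmax(megacells)
    return np.unravel_index(max_index, megacells.shape)
```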

View File

@@ -49,9 +49,6 @@
"import time\n",
"import os\n",
"import gc\n",
"import datetime\n",
"\n",
"import pycuda.driver as cuda\n",
"\n",
"try:\n",
" from StringIO import StringIO\n",
@@ -59,7 +56,8 @@
" from io import StringIO\n",
"\n",
"#Finally, import our simulator\n",
"from GPUSimulators import Common, LxF, FORCE, HLL, HLL2, KP07, KP07_dimsplit, WAF, IPythonMagic"
"from GPUSimulators import LxF, FORCE, HLL, HLL2, KP07, KP07_dimsplit, WAF\n",
"from GPUSimulators.common import common"
]
},
{

View File

@@ -42,15 +42,10 @@
"from mpl_toolkits.axes_grid1 import make_axes_locatable\n",
"#import mpld3\n",
"\n",
"import subprocess\n",
"import socket\n",
"import time\n",
"import os\n",
"import gc\n",
"import datetime\n",
"import logging\n",
"\n",
"import pycuda.driver as cuda\n",
"\n",
"try:\n",
" from StringIO import StringIO\n",
@@ -65,7 +60,8 @@
"outputs": [],
"source": [
"#Finally, import our simulator\n",
"from GPUSimulators import Common, LxF, FORCE, HLL, HLL2, KP07, KP07_dimsplit, WAF, IPythonMagic\n",
"from GPUSimulators import LxF, FORCE, HLL, HLL2, KP07, KP07_dimsplit, WAF\n",
"from GPUSimulators.common import common\n",
"from GPUSimulators.helpers import InitialConditions"
]
},
@@ -250,8 +246,8 @@
" sim.simulate(1.0, dt=dt)\n",
" sim.check()\n",
" \n",
" nt = sim.simSteps()\n",
" dt = sim.simTime() / nt\n",
" nt = sim.sim_steps()\n",
" dt = sim.sim_time() / nt\n",
" h, hu, hv = sim.download()\n",
" \n",
" if (transpose):\n",

View File

@@ -42,15 +42,10 @@
"from mpl_toolkits.axes_grid1 import make_axes_locatable\n",
"#import mpld3\n",
"\n",
"import subprocess\n",
"import socket\n",
"import time\n",
"import os\n",
"import gc\n",
"import datetime\n",
"import logging\n",
"\n",
"import pycuda.driver as cuda\n",
"\n",
"try:\n",
" from StringIO import StringIO\n",
@@ -65,7 +60,8 @@
"outputs": [],
"source": [
"#Finally, import our simulator\n",
"from GPUSimulators import Common, LxF, FORCE, HLL, HLL2, KP07, KP07_dimsplit, WAF, IPythonMagic\n",
"from GPUSimulators import LxF, FORCE, HLL, HLL2, KP07, KP07_dimsplit, WAF\n",
"from GPUSimulators.common import common\n",
"from GPUSimulators.helpers import InitialConditions"
]
},
@@ -250,8 +246,8 @@
" sim.simulate(1.0, dt=dt)\n",
" sim.check()\n",
" \n",
" nt = sim.simSteps()\n",
" dt = sim.simTime() / nt\n",
" nt = sim.sim_steps()\n",
" dt = sim.sim_time() / nt\n",
" h, hu, hv = sim.download()\n",
" \n",
" if (transpose):\n",

File diff suppressed because one or more lines are too long

View File

@@ -29,15 +29,159 @@ from tqdm.auto import tqdm
import pycuda.driver as cuda
from GPUSimulators import Common, Simulator
from GPUSimulators import Simulator
from GPUSimulators.common import common, Timer
from GPUSimulators.gpu import CudaContext
def run_benchmark(simulator, arguments, timesteps=10, warmup_timesteps=2):
"""
Runs a benchmark, and returns the number of megacells achieved
"""
logger = logging.getLogger(__name__)
# Initialize simulator
try:
sim = simulator(**arguments)
except:
# An exception raised - not possible to continue
logger.debug("Failed creating %s with arguments %s", simulator.__name__, str(arguments))
# raise RuntimeError("Failed creating %s with arguments %s", simulator.__name__, str(arguments))
return np.nan
# Create timer events
start = cuda.Event()
end = cuda.Event()
# Warmup
for i in range(warmup_timesteps):
sim.substep(sim.dt, i)
# Run simulation with timer
start.record(sim.stream)
for i in range(timesteps):
sim.substep(sim.dt, i)
end.record(sim.stream)
# Synchronize end event
end.synchronize()
# Compute megacells
gpu_elapsed = end.time_since(start) * 1.0e-3
megacells = (sim.nx * sim.ny * timesteps / (1000 * 1000)) / gpu_elapsed
# Sanity check solution
h, hu, hv = sim.download()
sane = True
sane = sane and sanity_check(h, 0.3, 0.7)
sane = sane and sanity_check(hu, -0.2, 0.2)
sane = sane and sanity_check(hv, -0.2, 0.2)
if sane:
logger.debug("%s [%d x %d] succeeded: %f megacells, gpu elapsed %f", simulator.__name__,
arguments["block_width"], arguments["block_height"], megacells, gpu_elapsed)
return megacells
else:
logger.debug("%s [%d x %d] failed: gpu elapsed %f", simulator.__name__, arguments["block_width"],
arguments["block_height"], gpu_elapsed)
# raise RuntimeError("%s [%d x %d] failed: gpu elapsed %f", simulator.__name__, arguments["block_width"], arguments["block_height"], gpu_elapsed)
return np.nan
def gen_test_data(nx, ny, g):
"""
Generates test dataset
"""
width = 100.0
height = 100.0
dx = width / float(nx)
dy = height / float(ny)
x_center = dx * nx / 2.0
y_center = dy * ny / 2.0
# Create a gaussian "dam break" that will not form shocks
size = width / 5.0
dt = 10 ** 10
h = np.zeros((ny, nx), dtype=np.float32)
hu = np.zeros((ny, nx), dtype=np.float32)
hv = np.zeros((ny, nx), dtype=np.float32)
extent = 1.0 / np.sqrt(2.0)
x = (dx * (np.arange(0, nx, dtype=np.float32) + 0.5) - x_center) / size
y = (dy * (np.arange(0, ny, dtype=np.float32) + 0.5) - y_center) / size
xv, yv = np.meshgrid(x, y, sparse=False, indexing='xy')
r = np.minimum(1.0, np.sqrt(xv ** 2 + yv ** 2))
xv = None
yv = None
gc.collect()
# Generate highres
cos = np.cos(np.pi * r)
h = 0.5 + 0.1 * 0.5 * (1.0 + cos)
hu = 0.1 * 0.5 * (1.0 + cos)
hv = hu.copy()
scale = 0.7
max_h_estimate = 0.6
max_u_estimate = 0.1 * np.sqrt(2.0)
dx = width / nx
dy = height / ny
dt = scale * min(dx, dy) / (max_u_estimate + np.sqrt(g * max_h_estimate))
return h, hu, hv, dx, dy, dt
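The time step chosen at the end of `gen_test_data` is a CFL-type bound for the shallow-water system, built from the rough estimates defined just above it (this is a reading of the code above, not new behaviour):

$$\Delta t \;=\; C\,\frac{\min(\Delta x,\,\Delta y)}{u_{\max} + \sqrt{g\,h_{\max}}},\qquad C = 0.7,\quad u_{\max} \approx 0.1\sqrt{2},\quad h_{\max} \approx 0.6$$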
def sanity_check(variable, bound_min, bound_max):
"""
Checks that a variable is "sane"
"""
maxval = np.amax(variable)
minval = np.amin(variable)
if (np.isnan(maxval)
or np.isnan(minval)
or maxval > bound_max
or minval < bound_min):
return False
else:
return True
def benchmark_single_simulator(simulator, arguments, block_widths, block_heights):
"""
Runs a set of benchmarks for a single simulator
"""
logger = logging.getLogger(__name__)
megacells = np.empty((len(block_heights), len(block_widths)))
megacells.fill(np.nan)
logger.debug("Running %d benchmarks with %s", len(block_heights) * len(block_widths), simulator.__name__)
sim_arguments = arguments.copy()
with Timer(simulator.__name__) as t:
for j, block_height in enumerate(tqdm(block_heights, desc='Autotuner Progress')):
sim_arguments.update({'block_height': block_height})
for i, block_width in enumerate(tqdm(block_widths, desc=f'Iteration {j} Progress', leave=False)):
sim_arguments.update({'block_width': block_width})
megacells[j, i] = run_benchmark(simulator, sim_arguments)
logger.debug("Completed %s in %f seconds", simulator.__name__, t.secs)
return megacells
class Autotuner:
def __init__(self,
nx=2048, ny=2048,
block_widths=range(8, 32, 1),
block_heights=range(8, 32, 1)):
def __init__(self,
nx=2048, ny=2048,
block_widths=range(8, 32, 1),
block_heights=range(8, 32, 1)):
logger = logging.getLogger(__name__)
self.filename = "autotuning_data_" + gethostname() + ".npz"
self.nx = nx
@@ -48,50 +192,51 @@ class Autotuner:
def benchmark(self, simulator, force=False):
logger = logging.getLogger(__name__)
#Run through simulators and benchmark
# Run through simulators and benchmark
key = str(simulator.__name__)
logger.info("Benchmarking %s to %s", key, self.filename)
#If this simulator has been benchmarked already, skip it
if (force==False and os.path.isfile(self.filename)):
# If this simulator has been benchmarked already, skip it
if force == False and os.path.isfile(self.filename):
with np.load(self.filename) as data:
if key in data["simulators"]:
logger.info("%s already benchmarked - skipping", key)
return
# Set arguments to send to the simulators during construction
context = CudaContext.CudaContext(autotuning=False)
context = CudaContext(autotuning=False)
g = 9.81
h0, hu0, hv0, dx, dy, dt = Autotuner.gen_test_data(nx=self.nx, ny=self.ny, g=g)
h0, hu0, hv0, dx, dy, dt = gen_test_data(nx=self.nx, ny=self.ny, g=g)
arguments = {
'context': context,
'h0': h0, 'hu0': hu0, 'hv0': hv0,
'nx': self.nx, 'ny': self.ny,
'dx': dx, 'dy': dy, 'dt': 0.9*dt,
'dx': dx, 'dy': dy, 'dt': 0.9 * dt,
'g': g,
'compile_opts': ['-Wno-deprecated-gpu-targets']
}
}
# Load existing data into memory
benchmark_data = {
"simulators": [],
"simulators": [],
}
if (os.path.isfile(self.filename)):
if os.path.isfile(self.filename):
with np.load(self.filename) as data:
for k, v in data.items():
benchmark_data[k] = v
# Run benchmark
benchmark_data[key + "_megacells"] = Autotuner.benchmark_single_simulator(simulator, arguments, self.block_widths, self.block_heights)
benchmark_data[key + "_megacells"] = benchmark_single_simulator(arguments, self.block_widths,
self.block_heights)
benchmark_data[key + "_block_widths"] = self.block_widths
benchmark_data[key + "_block_heights"] = self.block_heights
benchmark_data[key + "_arguments"] = str(arguments)
existing_sims = benchmark_data["simulators"]
if (isinstance(existing_sims, np.ndarray)):
if isinstance(existing_sims, np.ndarray):
existing_sims = existing_sims.tolist()
if (key not in existing_sims):
if key not in existing_sims:
benchmark_data["simulators"] = existing_sims + [key]
# Save to file
@@ -104,178 +249,40 @@ class Autotuner:
"""
logger = logging.getLogger(__name__)
assert issubclass(simulator, Simulator.BaseSimulator)
key = simulator.__name__
if (key in self.performance):
if key in self.performance:
return self.performance[key]
else:
#Run simulation if required
if (not os.path.isfile(self.filename)):
# Run simulation if required
if not os.path.isfile(self.filename):
logger.debug("Could not get autotuned peak performance for %s: benchmarking", key)
self.benchmark(simulator)
with np.load(self.filename) as data:
if key not in data['simulators']:
logger.debug("Could not get autotuned peak performance for %s: benchmarking", key)
data.close()
self.benchmark(simulator)
data = np.load(self.filename)
def find_max_index(megacells):
max_index = np.nanargmax(megacells)
return np.unravel_index(max_index, megacells.shape)
megacells = data[key + '_megacells']
block_widths = data[key + '_block_widths']
block_heights = data[key + '_block_heights']
j, i = find_max_index(megacells)
self.performance[key] = { "block_width": block_widths[i],
self.performance[key] = {"block_width": block_widths[i],
"block_height": block_heights[j],
"megacells": megacells[j, i] }
"megacells": megacells[j, i]}
logger.debug("Returning %s as peak performance parameters", self.performance[key])
return self.performance[key]
#This should never happen
# This should never happen
raise "Something wrong: Could not get autotuning data!"
return None
def benchmark_single_simulator(simulator, arguments, block_widths, block_heights):
"""
Runs a set of benchmarks for a single simulator
"""
logger = logging.getLogger(__name__)
megacells = np.empty((len(block_heights), len(block_widths)))
megacells.fill(np.nan)
logger.debug("Running %d benchmarks with %s", len(block_heights)*len(block_widths), simulator.__name__)
sim_arguments = arguments.copy()
with Common.Timer(simulator.__name__) as t:
for j, block_height in enumerate(tqdm(block_heights, desc='Autotuner Progress')):
sim_arguments.update({'block_height': block_height})
for i, block_width in enumerate(tqdm(block_widths, desc=f'Iteration {j} Progress', leave=False)):
sim_arguments.update({'block_width': block_width})
megacells[j, i] = Autotuner.run_benchmark(simulator, sim_arguments)
logger.debug("Completed %s in %f seconds", simulator.__name__, t.secs)
return megacells
def run_benchmark(simulator, arguments, timesteps=10, warmup_timesteps=2):
"""
Runs a benchmark, and returns the number of megacells achieved
"""
logger = logging.getLogger(__name__)
#Initialize simulator
try:
sim = simulator(**arguments)
except:
#An exception raised - not possible to continue
logger.debug("Failed creating %s with arguments %s", simulator.__name__, str(arguments))
# raise RuntimeError("Failed creating %s with arguments %s", simulator.__name__, str(arguments))
return np.nan
#Create timer events
start = cuda.Event()
end = cuda.Event()
#Warmup
for i in range(warmup_timesteps):
sim.substep(sim.dt, i)
#Run simulation with timer
start.record(sim.stream)
for i in range(timesteps):
sim.substep(sim.dt, i)
end.record(sim.stream)
#Synchronize end event
end.synchronize()
#Compute megacells
gpu_elapsed = end.time_since(start)*1.0e-3
megacells = (sim.nx*sim.ny*timesteps / (1000*1000)) / gpu_elapsed
#Sanity check solution
h, hu, hv = sim.download()
sane = True
sane = sane and Autotuner.sanity_check(h, 0.3, 0.7)
sane = sane and Autotuner.sanity_check(hu, -0.2, 0.2)
sane = sane and Autotuner.sanity_check(hv, -0.2, 0.2)
if (sane):
logger.debug("%s [%d x %d] succeeded: %f megacells, gpu elapsed %f", simulator.__name__, arguments["block_width"], arguments["block_height"], megacells, gpu_elapsed)
return megacells
else:
logger.debug("%s [%d x %d] failed: gpu elapsed %f", simulator.__name__, arguments["block_width"], arguments["block_height"], gpu_elapsed)
# raise RuntimeError("%s [%d x %d] failed: gpu elapsed %f", simulator.__name__, arguments["block_width"], arguments["block_height"], gpu_elapsed)
return np.nan
def gen_test_data(nx, ny, g):
"""
Generates test dataset
"""
width = 100.0
height = 100.0
dx = width / float(nx)
dy = height / float(ny)
x_center = dx*nx/2.0
y_center = dy*ny/2.0
#Create a gaussian "dam break" that will not form shocks
size = width / 5.0
dt = 10**10
h = np.zeros((ny, nx), dtype=np.float32);
hu = np.zeros((ny, nx), dtype=np.float32);
hv = np.zeros((ny, nx), dtype=np.float32);
extent = 1.0/np.sqrt(2.0)
x = (dx*(np.arange(0, nx, dtype=np.float32)+0.5) - x_center) / size
y = (dy*(np.arange(0, ny, dtype=np.float32)+0.5) - y_center) / size
xv, yv = np.meshgrid(x, y, sparse=False, indexing='xy')
r = np.minimum(1.0, np.sqrt(xv**2 + yv**2))
xv = None
yv = None
gc.collect()
#Generate highres
cos = np.cos(np.pi*r)
h = 0.5 + 0.1*0.5*(1.0 + cos)
hu = 0.1*0.5*(1.0 + cos)
hv = hu.copy()
scale = 0.7
max_h_estimate = 0.6
max_u_estimate = 0.1*np.sqrt(2.0)
dx = width/nx
dy = height/ny
dt = scale * min(dx, dy) / (max_u_estimate + np.sqrt(g*max_h_estimate))
return h, hu, hv, dx, dy, dt
def sanity_check(variable, bound_min, bound_max):
"""
Checks that a variable is "sane"
"""
maxval = np.amax(variable)
minval = np.amin(variable)
if (np.isnan(maxval)
or np.isnan(minval)
or maxval > bound_max
or minval < bound_min):
return False
else:
return True
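After the split, the former `Autotuner` static methods are module-level functions in `GPUSimulators/Autotuner.py`. A minimal usage sketch under that assumption (the grid size and simulator choice here are arbitrary illustrations):

```python
from GPUSimulators import LxF
from GPUSimulators.Autotuner import Autotuner, gen_test_data, run_benchmark
from GPUSimulators.gpu import CudaContext

g = 9.81
h0, hu0, hv0, dx, dy, dt = gen_test_data(nx=256, ny=256, g=g)

context = CudaContext(autotuning=False)
arguments = {
    'context': context,
    'h0': h0, 'hu0': hu0, 'hv0': hv0,
    'nx': 256, 'ny': 256,
    'dx': dx, 'dy': dy, 'dt': 0.9 * dt,
    'g': g,
    'block_width': 16, 'block_height': 16,
}

# One benchmark run; returns np.nan if construction or the sanity checks fail.
megacells = run_benchmark(LxF.LxF, arguments)

# The Autotuner class still drives the full block-size sweep and caches
# results to autotuning_data_<hostname>.npz.
tuner = Autotuner(nx=2048, ny=2048)
tuner.benchmark(LxF.LxF)
print(tuner.get_peak_performance(LxF.LxF))
```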

View File

@@ -1,758 +0,0 @@
# -*- coding: utf-8 -*-
"""
This python module implements the different helper functions and
classes
Copyright (C) 2018 SINTEF ICT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import os
import numpy as np
import time
import signal
import subprocess
import tempfile
import re
import io
import hashlib
import logging
import gc
import netCDF4
import json
import pycuda.compiler as cuda_compiler
import pycuda.gpuarray
import pycuda.driver as cuda
from pycuda.tools import PageLockedMemoryPool
def safeCall(cmd):
logger = logging.getLogger(__name__)
try:
#git rev-parse HEAD
current_dir = os.path.dirname(os.path.realpath(__file__))
params = dict()
params['stderr'] = subprocess.STDOUT
params['cwd'] = current_dir
params['universal_newlines'] = True #text=True in more recent python
params['shell'] = False
if os.name == 'nt':
params['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
stdout = subprocess.check_output(cmd, **params)
except subprocess.CalledProcessError as e:
output = e.output
logger.error("Git failed, \nReturn code: " + str(e.returncode) + "\nOutput: " + output)
raise e
return stdout
def getGitHash():
return safeCall(["git", "rev-parse", "HEAD"])
def getGitStatus():
return safeCall(["git", "status", "--porcelain", "-uno"])
def toJson(in_dict, compressed=True):
"""
Creates JSON string from a dictionary
"""
logger = logging.getLogger(__name__)
out_dict = in_dict.copy()
for key in out_dict:
if isinstance(out_dict[key], np.ndarray):
out_dict[key] = out_dict[key].tolist()
else:
try:
json.dumps(out_dict[key])
except:
value = str(out_dict[key])
logger.warning("JSON: Converting {:s} to string ({:s})".format(key, value))
out_dict[key] = value
return json.dumps(out_dict)
def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names=[], dt=None):
"""
Runs a simulation, and stores output in netcdf file. Stores the times given in
save_times, and saves all of the variables in list save_var_names. Elements in
save_var_names can be set to None if you do not want to save them
"""
profiling_data_sim_runner = { 'start': {}, 'end': {} }
profiling_data_sim_runner["start"]["t_sim_init"] = 0
profiling_data_sim_runner["end"]["t_sim_init"] = 0
profiling_data_sim_runner["start"]["t_nc_write"] = 0
profiling_data_sim_runner["end"]["t_nc_write"] = 0
profiling_data_sim_runner["start"]["t_full_step"] = 0
profiling_data_sim_runner["end"]["t_full_step"] = 0
profiling_data_sim_runner["start"]["t_sim_init"] = time.time()
logger = logging.getLogger(__name__)
assert len(save_times) > 0, "Need to specify which times to save"
with Timer("construct") as t:
sim = simulator(**simulator_args)
logger.info("Constructed in " + str(t.secs) + " seconds")
#Create netcdf file and simulate
with DataDumper(outfile, mode='w', clobber=False) as outdata:
#Create attributes (metadata)
outdata.ncfile.created = time.ctime(time.time())
outdata.ncfile.git_hash = getGitHash()
outdata.ncfile.git_status = getGitStatus()
outdata.ncfile.simulator = str(simulator)
# do not write fields to attributes (they are too large)
simulator_args_for_ncfile = simulator_args.copy()
del simulator_args_for_ncfile["rho"]
del simulator_args_for_ncfile["rho_u"]
del simulator_args_for_ncfile["rho_v"]
del simulator_args_for_ncfile["E"]
outdata.ncfile.sim_args = toJson(simulator_args_for_ncfile)
#Create dimensions
outdata.ncfile.createDimension('time', len(save_times))
outdata.ncfile.createDimension('x', simulator_args['nx'])
outdata.ncfile.createDimension('y', simulator_args['ny'])
#Create variables for dimensions
ncvars = {}
ncvars['time'] = outdata.ncfile.createVariable('time', np.dtype('float32').char, 'time')
ncvars['x'] = outdata.ncfile.createVariable( 'x', np.dtype('float32').char, 'x')
ncvars['y'] = outdata.ncfile.createVariable( 'y', np.dtype('float32').char, 'y')
#Fill variables with proper values
ncvars['time'][:] = save_times
extent = sim.getExtent()
ncvars['x'][:] = np.linspace(extent[0], extent[1], simulator_args['nx'])
ncvars['y'][:] = np.linspace(extent[2], extent[3], simulator_args['ny'])
#Choose which variables to download (prune None from list, but keep the index)
download_vars = []
for i, var_name in enumerate(save_var_names):
if var_name is not None:
download_vars += [i]
save_var_names = list(save_var_names[i] for i in download_vars)
#Create variables
for var_name in save_var_names:
ncvars[var_name] = outdata.ncfile.createVariable(var_name, np.dtype('float32').char, ('time', 'y', 'x'), zlib=True, least_significant_digit=3)
#Create step sizes between each save
t_steps = np.empty_like(save_times)
t_steps[0] = save_times[0]
t_steps[1:] = save_times[1:] - save_times[0:-1]
profiling_data_sim_runner["end"]["t_sim_init"] = time.time()
#Start simulation loop
progress_printer = ProgressPrinter(save_times[-1], print_every=10)
for k in range(len(save_times)):
#Get target time and step size there
t_step = t_steps[k]
t_end = save_times[k]
#Sanity check simulator
try:
sim.check()
except AssertionError as e:
logger.error("Error after {:d} steps (t={:f}: {:s}".format(sim.simSteps(), sim.simTime(), str(e)))
return outdata.filename
profiling_data_sim_runner["start"]["t_full_step"] += time.time()
#Simulate
if (t_step > 0.0):
sim.simulate(t_step, dt)
profiling_data_sim_runner["end"]["t_full_step"] += time.time()
profiling_data_sim_runner["start"]["t_nc_write"] += time.time()
#Download
save_vars = sim.download(download_vars)
#Save to file
for i, var_name in enumerate(save_var_names):
ncvars[var_name][k, :] = save_vars[i]
profiling_data_sim_runner["end"]["t_nc_write"] += time.time()
#Write progress to screen
print_string = progress_printer.getPrintString(t_end)
if (print_string):
logger.debug(print_string)
logger.debug("Simulated to t={:f} in {:d} timesteps (average dt={:f})".format(t_end, sim.simSteps(), sim.simTime() / sim.simSteps()))
return outdata.filename, profiling_data_sim_runner, sim.profiling_data_mpi
class Timer(object):
"""
Class which keeps track of time spent for a section of code
"""
def __init__(self, tag, log_level=logging.DEBUG):
self.tag = tag
self.log_level = log_level
self.logger = logging.getLogger(__name__)
def __enter__(self):
self.start = time.time()
return self
def __exit__(self, *args):
self.end = time.time()
self.secs = self.end - self.start
self.msecs = self.secs * 1000 # millisecs
self.logger.log(self.log_level, "%s: %f ms", self.tag, self.msecs)
def elapsed(self):
return time.time() - self.start
class PopenFileBuffer(object):
"""
Simple class for holding a set of tempfiles
for communicating with a subprocess
"""
def __init__(self):
self.stdout = tempfile.TemporaryFile(mode='w+t')
self.stderr = tempfile.TemporaryFile(mode='w+t')
def __del__(self):
self.stdout.close()
self.stderr.close()
def read(self):
self.stdout.seek(0)
cout = self.stdout.read()
self.stdout.seek(0, 2)
self.stderr.seek(0)
cerr = self.stderr.read()
self.stderr.seek(0, 2)
return cout, cerr
class IPEngine(object):
"""
Class for starting IPEngines for MPI processing in IPython
"""
def __init__(self, n_engines):
self.logger = logging.getLogger(__name__)
#Start ipcontroller
self.logger.info("Starting IPController")
self.c_buff = PopenFileBuffer()
c_cmd = ["ipcontroller", "--ip='*'"]
c_params = dict()
c_params['stderr'] = self.c_buff.stderr
c_params['stdout'] = self.c_buff.stdout
c_params['shell'] = False
if os.name == 'nt':
c_params['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
self.c = subprocess.Popen(c_cmd, **c_params)
#Wait until controller is running
time.sleep(3)
#Start engines
self.logger.info("Starting IPEngines")
self.e_buff = PopenFileBuffer()
e_cmd = ["mpiexec", "-n", str(n_engines), "ipengine", "--mpi"]
e_params = dict()
e_params['stderr'] = self.e_buff.stderr
e_params['stdout'] = self.e_buff.stdout
e_params['shell'] = False
if os.name == 'nt':
e_params['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
self.e = subprocess.Popen(e_cmd, **e_params)
# attach to a running cluster
import ipyparallel
self.cluster = ipyparallel.Client()#profile='mpi')
time.sleep(3)
while(len(self.cluster.ids) != n_engines):
time.sleep(0.5)
self.logger.info("Waiting for cluster...")
self.cluster = ipyparallel.Client()#profile='mpi')
self.logger.info("Done")
def __del__(self):
self.shutdown()
def shutdown(self):
if (self.e is not None):
if (os.name == 'nt'):
self.logger.warn("Sending CTRL+C to IPEngine")
self.e.send_signal(signal.CTRL_C_EVENT)
try:
self.e.communicate(timeout=3)
self.e.kill()
except subprocess.TimeoutExpired:
self.logger.warn("Killing IPEngine")
self.e.kill()
self.e.communicate()
self.e = None
cout, cerr = self.e_buff.read()
self.logger.info("IPEngine cout: {:s}".format(cout))
self.logger.info("IPEngine cerr: {:s}".format(cerr))
self.e_buff = None
gc.collect()
if (self.c is not None):
if (os.name == 'nt'):
self.logger.warn("Sending CTRL+C to IPController")
self.c.send_signal(signal.CTRL_C_EVENT)
try:
self.c.communicate(timeout=3)
self.c.kill()
except subprocess.TimeoutExpired:
self.logger.warn("Killing IPController")
self.c.kill()
self.c.communicate()
self.c = None
cout, cerr = self.c_buff.read()
self.logger.info("IPController cout: {:s}".format(cout))
self.logger.info("IPController cerr: {:s}".format(cerr))
self.c_buff = None
gc.collect()
class DataDumper(object):
"""
Simple class for holding a netCDF4 object
(handles opening and closing in a nice way)
Use as
with DataDumper("filename") as data:
...
"""
def __init__(self, filename, *args, **kwargs):
self.logger = logging.getLogger(__name__)
#Create directory if needed
filename = os.path.abspath(filename)
dirname = os.path.dirname(filename)
if dirname and not os.path.isdir(dirname):
self.logger.info("Creating directory " + dirname)
os.makedirs(dirname)
#Get mode of file if we have that
mode = None
if (args):
mode = args[0]
elif (kwargs and 'mode' in kwargs.keys()):
mode = kwargs['mode']
#Create new unique file if writing
if (mode):
if (("w" in mode) or ("+" in mode) or ("a" in mode)):
i = 0
stem, ext = os.path.splitext(filename)
while (os.path.isfile(filename)):
filename = "{:s}_{:04d}{:s}".format(stem, i, ext)
i = i+1
self.filename = os.path.abspath(filename)
#Save arguments
self.args = args
self.kwargs = kwargs
#Log output
self.logger.info("Initialized " + self.filename)
def __enter__(self):
self.logger.info("Opening " + self.filename)
if (self.args):
self.logger.info("Arguments: " + str(self.args))
if (self.kwargs):
self.logger.info("Keyword arguments: " + str(self.kwargs))
self.ncfile = netCDF4.Dataset(self.filename, *self.args, **self.kwargs)
return self
def __exit__(self, *args):
self.logger.info("Closing " + self.filename)
self.ncfile.close()
def toJson(in_dict):
out_dict = in_dict.copy()
for key in out_dict:
if isinstance(out_dict[key], np.ndarray):
out_dict[key] = out_dict[key].tolist()
else:
try:
json.dumps(out_dict[key])
except:
out_dict[key] = str(out_dict[key])
return json.dumps(out_dict)
class ProgressPrinter(object):
"""
Small helper class for printing simulation progress
"""
def __init__(self, total_steps, print_every=5):
self.logger = logging.getLogger(__name__)
self.start = time.time()
self.total_steps = total_steps
self.print_every = print_every
self.next_print_time = self.print_every
self.last_step = 0
self.secs_per_iter = None
def getPrintString(self, step):
elapsed = time.time() - self.start
if (elapsed > self.next_print_time):
dt = elapsed - (self.next_print_time - self.print_every)
dsteps = step - self.last_step
steps_remaining = self.total_steps - step
if (dsteps == 0):
return
self.last_step = step
self.next_print_time = elapsed + self.print_every
if not self.secs_per_iter:
self.secs_per_iter = dt / dsteps
self.secs_per_iter = 0.2*self.secs_per_iter + 0.8*(dt / dsteps)
remaining_time = steps_remaining * self.secs_per_iter
return "{:s}. Total: {:s}, elapsed: {:s}, remaining: {:s}".format(
ProgressPrinter.progressBar(step, self.total_steps),
ProgressPrinter.timeString(elapsed + remaining_time),
ProgressPrinter.timeString(elapsed),
ProgressPrinter.timeString(remaining_time))
def timeString(seconds):
seconds = int(max(seconds, 1))
minutes, seconds = divmod(seconds, 60)
hours, minutes = divmod(minutes, 60)
periods = [('h', hours), ('m', minutes), ('s', seconds)]
time_string = ' '.join('{}{}'.format(value, name)
for name, value in periods
if value)
return time_string
def progressBar(step, total_steps, width=30):
progress = np.round(width * step / total_steps).astype(np.int32)
progressbar = "0% [" + "#"*(progress) + "="*(width-progress) + "] 100%"
return progressbar
class CudaArray2D:
"""
Class that holds 2D data
"""
def __init__(self, stream, nx, ny, x_halo, y_halo, cpu_data=None, dtype=np.float32):
"""
Uploads initial data to the CUDA device
"""
self.logger = logging.getLogger(__name__)
self.nx = nx
self.ny = ny
self.x_halo = x_halo
self.y_halo = y_halo
nx_halo = nx + 2*x_halo
ny_halo = ny + 2*y_halo
#self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny)
#Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
self.data = pycuda.gpuarray.zeros((ny_halo, nx_halo), dtype)
#For returning to download
self.memorypool = PageLockedMemoryPool()
#If we don't have any data, just allocate and return
if cpu_data is None:
return
#Make sure data is in proper format
assert cpu_data.shape == (ny_halo, nx_halo) or cpu_data.shape == (self.ny, self.nx), "Wrong shape of data %s vs %s / %s" % (str(cpu_data.shape), str((self.ny, self.nx)), str((ny_halo, nx_halo)))
assert cpu_data.itemsize == 4, "Wrong size of data type"
assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"
#Create copy object from host to device
x = (nx_halo - cpu_data.shape[1]) // 2
y = (ny_halo - cpu_data.shape[0]) // 2
self.upload(stream, cpu_data, extent=[x, y, cpu_data.shape[1], cpu_data.shape[0]])
#self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
def __del__(self, *args):
#self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
self.data.gpudata.free()
self.data = None
def download(self, stream, cpu_data=None, asynch=False, extent=None):
"""
Enables downloading data from GPU to Python
"""
if (extent is None):
x = self.x_halo
y = self.y_halo
nx = self.nx
ny = self.ny
else:
x, y, nx, ny = extent
if (cpu_data is None):
#self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
#Allocate host memory
#The following fails, don't know why (crashes python)
cpu_data = cuda.pagelocked_empty((int(ny), int(nx)), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE)
#Non-pagelocked: cpu_data = np.empty((ny, nx), dtype=np.float32)
#cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32)
assert nx == cpu_data.shape[1]
assert ny == cpu_data.shape[0]
assert x+nx <= self.nx + 2*self.x_halo
assert y+ny <= self.ny + 2*self.y_halo
#Create copy object from device to host
copy = cuda.Memcpy2D()
copy.set_src_device(self.data.gpudata)
copy.set_dst_host(cpu_data)
#Set offsets and pitch of source
copy.src_x_in_bytes = int(x)*self.data.strides[1]
copy.src_y = int(y)
copy.src_pitch = self.data.strides[0]
#Set width in bytes to copy for each row and
#number of rows to copy
copy.width_in_bytes = int(nx)*cpu_data.itemsize
copy.height = int(ny)
copy(stream)
if asynch==False:
stream.synchronize()
return cpu_data
def upload(self, stream, cpu_data, extent=None):
if (extent is None):
x = self.x_halo
y = self.y_halo
nx = self.nx
ny = self.ny
else:
x, y, nx, ny = extent
assert(nx == cpu_data.shape[1])
assert(ny == cpu_data.shape[0])
assert(x+nx <= self.nx + 2*self.x_halo)
assert(y+ny <= self.ny + 2*self.y_halo)
#Create copy object from device to host
copy = cuda.Memcpy2D()
copy.set_dst_device(self.data.gpudata)
copy.set_src_host(cpu_data)
#Set offsets and pitch of source
copy.dst_x_in_bytes = int(x)*self.data.strides[1]
copy.dst_y = int(y)
copy.dst_pitch = self.data.strides[0]
#Set width in bytes to copy for each row and
#number of rows to copy
copy.width_in_bytes = int(nx)*cpu_data.itemsize
copy.height = int(ny)
copy(stream)
class CudaArray3D:
"""
Class that holds 3D data
"""
def __init__(self, stream, nx, ny, nz, x_halo, y_halo, z_halo, cpu_data=None, dtype=np.float32):
"""
Uploads initial data to the CUDA device
"""
self.logger = logging.getLogger(__name__)
self.nx = nx
self.ny = ny
self.nz = nz
self.x_halo = x_halo
self.y_halo = y_halo
self.z_halo = z_halo
nx_halo = nx + 2*x_halo
ny_halo = ny + 2*y_halo
nz_halo = nz + 2*z_halo
#self.logger.debug("Allocating [%dx%dx%d] buffer", self.nx, self.ny, self.nz)
#Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
self.data = pycuda.gpuarray.zeros((nz_halo, ny_halo, nx_halo), dtype)
#For returning to download
self.memorypool = PageLockedMemoryPool()
#If we don't have any data, just allocate and return
if cpu_data is None:
return
#Make sure data is in proper format
assert cpu_data.shape == (nz_halo, ny_halo, nx_halo) or cpu_data.shape == (self.nz, self.ny, self.nx), "Wrong shape of data %s vs %s / %s" % (str(cpu_data.shape), str((self.nz, self.ny, self.nx)), str((nz_halo, ny_halo, nx_halo)))
assert cpu_data.itemsize == 4, "Wrong size of data type"
assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"
#Create copy object from host to device
copy = cuda.Memcpy3D()
copy.set_src_host(cpu_data)
copy.set_dst_device(self.data.gpudata)
#Set offsets of destination
x_offset = (nx_halo - cpu_data.shape[2]) // 2
y_offset = (ny_halo - cpu_data.shape[1]) // 2
z_offset = (nz_halo - cpu_data.shape[0]) // 2
copy.dst_x_in_bytes = x_offset*self.data.strides[1]
copy.dst_y = y_offset
copy.dst_z = z_offset
#Set pitch of destination
copy.dst_pitch = self.data.strides[0]
#Set width in bytes to copy for each row and
#number of rows to copy
width = max(self.nx, cpu_data.shape[2])
height = max(self.ny, cpu_data.shape[1])
depth = max(self.nz, cpu_data.shape[0])
copy.width_in_bytes = width*cpu_data.itemsize
copy.height = height
copy.depth = depth
#Perform the copy
copy(stream)
#self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
def __del__(self, *args):
#self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
self.data.gpudata.free()
self.data = None
def download(self, stream, asynch=False):
"""
Enables downloading data from GPU to Python
"""
#self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
#Allocate host memory
#cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)
#cpu_data = np.empty((self.nz, self.ny, self.nx), dtype=np.float32)
cpu_data = self.memorypool.allocate((self.nz, self.ny, self.nx), dtype=np.float32)
#Create copy object from device to host
copy = cuda.Memcpy2D()
copy.set_src_device(self.data.gpudata)
copy.set_dst_host(cpu_data)
#Set offsets and pitch of source
copy.src_x_in_bytes = self.x_halo*self.data.strides[1]
copy.src_y = self.y_halo
copy.src_z = self.z_halo
copy.src_pitch = self.data.strides[0]
#Set width in bytes to copy for each row and
#number of rows to copy
copy.width_in_bytes = self.nx*cpu_data.itemsize
copy.height = self.ny
copy.depth = self.nz
copy(stream)
if asynch==False:
stream.synchronize()
return cpu_data
class ArakawaA2D:
"""
A class representing an Arakawa A type (unstaggered, logically Cartesian) grid
"""
def __init__(self, stream, nx, ny, halo_x, halo_y, cpu_variables):
"""
Uploads initial data to the GPU device
"""
self.logger = logging.getLogger(__name__)
self.gpu_variables = []
for cpu_variable in cpu_variables:
self.gpu_variables += [CudaArray2D(stream, nx, ny, halo_x, halo_y, cpu_variable)]
def __getitem__(self, key):
assert type(key) == int, "Indexing is int based"
if (key >= len(self.gpu_variables) or key < 0):
raise IndexError("Out of bounds")
return self.gpu_variables[key]
def download(self, stream, variables=None):
"""
Enables downloading data from the GPU device to Python
"""
if variables is None:
variables=range(len(self.gpu_variables))
cpu_variables = []
for i in variables:
assert i < len(self.gpu_variables), "Variable {:d} is out of range".format(i)
cpu_variables += [self.gpu_variables[i].download(stream, asynch=True)]
#stream.synchronize()
return cpu_variables
def check(self):
"""
Checks that data is still sane
"""
for i, gpu_variable in enumerate(self.gpu_variables):
var_sum = pycuda.gpuarray.sum(gpu_variable.data).get()
self.logger.debug("Data %d with size [%d x %d] has average %f", i, gpu_variable.nx, gpu_variable.ny, var_sum / (gpu_variable.nx * gpu_variable.ny))
assert np.isnan(var_sum) == False, "Data contains NaN values!"
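The classes removed above are relocated rather than dropped: elsewhere in this commit `Timer`, `ArakawaA2D`, and the `common` submodule are imported from the new `GPUSimulators.common` package. A short sketch of the relocated `Timer` context manager (behaviour taken from the class above; import path as used by the refactored `Autotuner.py`):

```python
import time
from GPUSimulators.common import Timer  # previously GPUSimulators.Common.Timer

# Timer logs the elapsed wall-clock time of a code block and exposes
# .secs / .msecs after the block exits.
with Timer("example-section") as t:
    time.sleep(0.1)  # stand-in for real work
print(f"{t.secs:.3f} s ({t.msecs:.1f} ms)")
```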

View File

@@ -19,29 +19,29 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators.common import ArakawaA2D
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
class EE2D_KP07_dimsplit (BaseSimulator):
class EE2D_KP07_dimsplit(BaseSimulator):
"""
Class that solves the Euler equations using the dimensionally split KP07 scheme
"""
def __init__(self,
context,
rho, rho_u, rho_v, E,
nx, ny,
dx, dy,
g,
gamma,
theta=1.3,
def __init__(self,
context,
rho, rho_u, rho_v, E,
nx, ny,
dx, dy,
g,
gamma,
theta=1.3,
cfl_scale=0.9,
boundary_conditions=BoundaryCondition(),
boundary_conditions=BoundaryCondition(),
block_width=16, block_height=8):
"""
Initialization routine
@@ -60,77 +60,76 @@ class EE2D_KP07_dimsplit (BaseSimulator):
gamma: Gas constant
p: pressure
"""
# Call super constructor
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
2,
block_width, block_height)
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
2,
block_width, block_height)
self.g = np.float32(g)
self.gamma = np.float32(gamma)
self.theta = np.float32(theta)
self.theta = np.float32(theta)
#Get kernels
module = context.get_module("cuda/EE2D_KP07_dimsplit.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"],
},
jit_compile_args={})
# Get kernels
module = context.get_module("cuda/EE2D_KP07_dimsplit.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"],
},
jit_compile_args={})
self.kernel = module.get_function("KP07DimsplitKernel")
self.kernel.prepare("iiffffffiiPiPiPiPiPiPiPiPiPiiii")
#Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream,
nx, ny,
2, 2,
[rho, rho_u, rho_v, E])
self.u1 = Common.ArakawaA2D(self.stream,
nx, ny,
2, 2,
[None, None, None, None])
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
nx, ny,
2, 2,
[rho, rho_u, rho_v, E])
self.u1 = ArakawaA2D(self.stream,
nx, ny,
2, 2,
[None, None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
dt_x = np.min(self.dx / (np.abs(rho_u/rho) + np.sqrt(gamma*rho)))
dt_y = np.min(self.dy / (np.abs(rho_v/rho) + np.sqrt(gamma*rho)))
dt_x = np.min(self.dx / (np.abs(rho_u / rho) + np.sqrt(gamma * rho)))
dt_y = np.min(self.dy / (np.abs(rho_v / rho) + np.sqrt(gamma * rho)))
self.dt = min(dt_x, dt_y)
self.cfl_data.fill(self.dt, stream=self.stream)
def substep(self, dt, step_number, external=True, internal=True):
self.substepDimsplit(0.5*dt, step_number, external, internal)
def substepDimsplit(self, dt, substep, external, internal):
if external and internal:
#print("COMPLETE DOMAIN (dt=" + str(dt) + ")")
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
def substep(self, dt, step_number, external=True, internal=True):
self.substep_dimsplit(0.5 * dt, step_number, external, internal)
def substep_dimsplit(self, dt, substep, external, internal):
if external and internal:
# print("COMPLETE DOMAIN (dt=" + str(dt) + ")")
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
return
if external and not internal:
###################################
# XXX: Corners are treated twice! #
@@ -141,136 +140,135 @@ class EE2D_KP07_dimsplit (BaseSimulator):
# NORTH
# (x0, y0) x (x1, y1)
# (0, ny-y_halo) x (nx, ny)
self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
0, self.ny - int(self.u0[0].y_halo),
self.nx, self.ny)
self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
0, self.ny - int(self.u0[0].y_halo),
self.nx, self.ny)
# SOUTH
# (x0, y0) x (x1, y1)
# (0, 0) x (nx, y_halo)
self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, int(self.u0[0].y_halo))
self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, int(self.u0[0].y_halo))
we_grid_size = (1, self.grid_size[1])
# WEST
# (x0, y0) x (x1, y1)
# (0, 0) x (x_halo, ny)
self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
0, 0,
int(self.u0[0].x_halo), self.ny)
self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
0, 0,
int(self.u0[0].x_halo), self.ny)
# EAST
# (x0, y0) x (x1, y1)
# (nx-x_halo, 0) x (nx, ny)
self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
self.nx - int(self.u0[0].x_halo), 0,
self.nx, self.ny)
self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
self.nx - int(self.u0[0].x_halo), 0,
self.nx, self.ny)
return
if internal and not external:
# INTERNAL DOMAIN
# (x0, y0) x (x1, y1)
# (x_halo, y_halo) x (nx - x_halo, ny - y_halo)
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.internal_stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
int(self.u0[0].x_halo), int(self.u0[0].y_halo),
self.nx - int(self.u0[0].x_halo), self.ny - int(self.u0[0].y_halo))
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.internal_stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
int(self.u0[0].x_halo), int(self.u0[0].y_halo),
self.nx - int(self.u0[0].x_halo), self.ny - int(self.u0[0].y_halo))
return
def swapBuffers(self):
def swap_buffers(self):
self.u0, self.u1 = self.u1, self.u0
return
def getOutput(self):
def get_output(self):
return self.u0
def check(self):
self.u0.check()
self.u1.check()
return
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt*0.5
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
return max_dt * 0.5
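
The external-only branch above launches one kernel per boundary strip, and the extents in the comments translate directly into (x0, y0, x1, y1) rectangles. A small pure-Python sketch of those rectangles (the helper name and the example sizes are illustrative only, not part of the module):

def boundary_strips(nx, ny, gc_x, gc_y):
    # Mirrors the NORTH/SOUTH/WEST/EAST extents used in substep_dimsplit above.
    return {
        "north": (0, ny - gc_y, nx, ny),
        "south": (0, 0, nx, gc_y),
        "west": (0, 0, gc_x, ny),
        "east": (nx - gc_x, 0, nx, ny),
    }

print(boundary_strips(nx=64, ny=64, gc_x=2, gc_y=2))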

View File

@ -20,30 +20,31 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators.common import ArakawaA2D
from GPUSimulators import Simulator
from GPUSimulators.Simulator import BoundaryCondition
class FORCE (Simulator.BaseSimulator):
class FORCE(Simulator.BaseSimulator):
"""
Class that solves the SW equations
"""
def __init__(self,
context,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
def __init__(self,
context,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
cfl_scale=0.9,
boundary_conditions=BoundaryCondition(),
boundary_conditions=BoundaryCondition(),
block_width=16, block_height=16,
dt: float=None,
compile_opts: list[str]=[]):
dt: float = None,
compile_opts: list[str] = []):
"""
Initialization routine
@ -59,76 +60,76 @@ class FORCE (Simulator.BaseSimulator):
g: Gravitational acceleration (9.81 m/s^2)
compile_opts: Pass a list of nvcc compiler options
"""
# Call super constructor
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
1,
block_width, block_height)
self.g = np.float32(g)
#Get kernels
# Call super constructor
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
1,
block_width, block_height)
self.g = np.float32(g)
# Get kernels
module = context.get_module("cuda/SWE2D_FORCE.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
self.kernel = module.get_function("FORCEKernel")
self.kernel.prepare("iiffffiPiPiPiPiPiPiPiiii")
#Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream,
nx, ny,
1, 1,
[h0, hu0, hv0])
self.u1 = Common.ArakawaA2D(self.stream,
nx, ny,
1, 1,
[None, None, None])
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
nx, ny,
1, 1,
[h0, hu0, hv0])
self.u1 = ArakawaA2D(self.stream,
nx, ny,
1, 1,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
if dt == None:
dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
if dt is None:
dt_x = np.min(self.dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
dt_y = np.min(self.dy / (np.abs(hv0 / h0) + np.sqrt(g * h0)))
self.dt = min(dt_x, dt_y)
else:
self.dt = dt
self.cfl_data.fill(self.dt, stream=self.stream)
def substep(self, dt, step_number):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.u0, self.u1 = self.u1, self.u0
def getOutput(self):
def get_output(self):
return self.u0
def check(self):
self.u0.check()
self.u1.check()
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
return max_dt
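
The dt_x/dt_y expressions in the constructor are the usual shallow-water CFL estimate: cell size divided by the maximum signal speed |u| + sqrt(g*h). A standalone sketch of the same arithmetic (values are illustrative):

import numpy as np

g, dx, dy = 9.81, 10.0, 10.0
h0 = np.full((4, 4), 2.0)    # water depth at rest
hu0 = np.zeros_like(h0)      # x-momentum
hv0 = np.zeros_like(h0)      # y-momentum

dt_x = np.min(dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
dt_y = np.min(dy / (np.abs(hv0 / h0) + np.sqrt(g * h0)))
print(min(dt_x, dt_y))       # ~2.26 s for this resting state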

View File

@ -19,30 +19,31 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators import Simulator
from GPUSimulators.common import ArakawaA2D
from GPUSimulators.Simulator import BoundaryCondition
class HLL (Simulator.BaseSimulator):
class HLL(Simulator.BaseSimulator):
"""
Class that solves the SW equations using the Harten-Lax-van Leer approximate Riemann solver
"""
def __init__(self,
def __init__(self,
context,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
cfl_scale=0.9,
boundary_conditions=BoundaryCondition(),
boundary_conditions=BoundaryCondition(),
block_width=16, block_height=16,
dt: float=None,
compile_opts: list[str]=[]):
dt: float = None,
compile_opts: list[str] = []):
"""
Initialization routine
@ -58,74 +59,74 @@ class HLL (Simulator.BaseSimulator):
g: Gravitational acceleration (9.81 m/s^2)
compile_opts: Pass a list of nvcc compiler options
"""
# Call super constructor
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
1,
block_width, block_height);
self.g = np.float32(g)
#Get kernels
module = context.get_module("cuda/SWE2D_HLL.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
# Call super constructor
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
1,
block_width, block_height)
self.g = np.float32(g)
# Get kernels
module = context.get_module("cuda/SWE2D_HLL.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
self.kernel = module.get_function("HLLKernel")
self.kernel.prepare("iiffffiPiPiPiPiPiPiPiiii")
#Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream,
nx, ny,
1, 1,
[h0, hu0, hv0])
self.u1 = Common.ArakawaA2D(self.stream,
nx, ny,
1, 1,
[None, None, None])
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
nx, ny,
1, 1,
[h0, hu0, hv0])
self.u1 = ArakawaA2D(self.stream,
nx, ny,
1, 1,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
if dt == None:
dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
if dt is None:
dt_x = np.min(self.dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
dt_y = np.min(self.dy / (np.abs(hv0 / h0) + np.sqrt(g * h0)))
self.dt = min(dt_x, dt_y)
else:
self.dt = dt
self.cfl_data.fill(self.dt, stream=self.stream)
def substep(self, dt, step_number):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.u0, self.u1 = self.u1, self.u0
def getOutput(self):
def get_output(self):
return self.u0
def check(self):
self.u0.check()
self.u1.check()
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt*0.5
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
return max_dt * 0.5

View File

@ -19,31 +19,32 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators import Simulator
from GPUSimulators.common import ArakawaA2D
from GPUSimulators.Simulator import BoundaryCondition
class HLL2 (Simulator.BaseSimulator):
class HLL2(Simulator.BaseSimulator):
"""
Class that solves the SW equations using the Forward-Backward linear scheme
"""
def __init__(self,
context,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
theta=1.8,
def __init__(self,
context,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
theta=1.8,
cfl_scale=0.9,
boundary_conditions=BoundaryCondition(),
boundary_conditions=BoundaryCondition(),
block_width=16, block_height=16,
dt: float=None,
compile_opts: list[str]=[]):
dt: float = None,
compile_opts: list[str] = []):
"""
Initialization routine
@ -59,81 +60,81 @@ class HLL2 (Simulator.BaseSimulator):
g: Gravitational acceleration (9.81 m/s^2)
compile_opts: Pass a list of nvcc compiler options
"""
# Call super constructor
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
2,
block_width, block_height);
self.g = np.float32(g)
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
2,
block_width, block_height)
self.g = np.float32(g)
self.theta = np.float32(theta)
#Get kernels
module = context.get_module("cuda/SWE2D_HLL2.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
# Get kernels
module = context.get_module("cuda/SWE2D_HLL2.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
self.kernel = module.get_function("HLL2Kernel")
self.kernel.prepare("iifffffiiPiPiPiPiPiPiPiiii")
#Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream,
nx, ny,
2, 2,
[h0, hu0, hv0])
self.u1 = Common.ArakawaA2D(self.stream,
nx, ny,
2, 2,
[None, None, None])
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
nx, ny,
2, 2,
[h0, hu0, hv0])
self.u1 = ArakawaA2D(self.stream,
nx, ny,
2, 2,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
if dt == None:
dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
if dt is None:
dt_x = np.min(self.dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
dt_y = np.min(self.dy / (np.abs(hv0 / h0) + np.sqrt(g * h0)))
self.dt = min(dt_x, dt_y)
else:
self.dt = dt
self.cfl_data.fill(self.dt, stream=self.stream)
def substep(self, dt, step_number):
self.substepDimsplit(dt*0.5, step_number)
def substepDimsplit(self, dt, substep):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.substep_dimsplit(dt * 0.5, step_number)
def substep_dimsplit(self, dt, substep):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.u0, self.u1 = self.u1, self.u0
def getOutput(self):
def get_output(self):
return self.u0
def check(self):
self.u0.check()
self.u1.check()
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt*0.5
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
return max_dt * 0.5

View File

@ -26,12 +26,12 @@ from IPython.core import magic_arguments
from IPython.core.magic import line_magic, Magics, magics_class
import pycuda.driver as cuda
from GPUSimulators import Common
from GPUSimulators.common import IPEngine
from GPUSimulators.gpu import CudaContext
@magics_class
class MagicCudaContext(Magics):
class MagicCudaContext(Magics):
@line_magic
@magic_arguments.magic_arguments()
@magic_arguments.argument(
@ -44,14 +44,14 @@ class MagicCudaContext(Magics):
'--no_autotuning', '-na', action="store_true", help='Disable autotuning of kernels')
def cuda_context_handler(self, line):
args = magic_arguments.parse_argstring(self.cuda_context_handler, line)
self.logger = logging.getLogger(__name__)
self.logger = logging.getLogger(__name__)
self.logger.info("Registering %s in user workspace", args.name)
context_flags = None
if (args.blocking):
if args.blocking:
context_flags = cuda.ctx_flags.SCHED_BLOCKING_SYNC
if args.name in self.shell.user_ns.keys():
self.logger.debug("Context already registered! Ignoring")
return
@ -59,12 +59,13 @@ class MagicCudaContext(Magics):
self.logger.debug("Creating context")
use_cache = False if args.no_cache else True
use_autotuning = False if args.no_autotuning else True
self.shell.user_ns[args.name] = CudaContext.CudaContext(context_flags=context_flags, use_cache=use_cache, autotuning=use_autotuning)
self.shell.user_ns[args.name] = CudaContext(context_flags=context_flags, use_cache=use_cache,
autotuning=use_autotuning)
# this function will be called on exceptions in any cell
def custom_exc(shell, etype, evalue, tb, tb_offset=None):
self.logger.exception("Exception caught: Resetting to CUDA context %s", args.name)
while (cuda.Context.get_current() != None):
while cuda.Context.get_current() is not None:
context = cuda.Context.get_current()
self.logger.info("Popping <%s>", str(context.handle))
cuda.Context.pop()
@ -77,36 +78,30 @@ class MagicCudaContext(Magics):
self.logger.error("CUDA will not work now")
self.logger.debug("==================================================================")
# still show the error within the notebook, don't just swallow it
shell.showtraceback((etype, evalue, tb), tb_offset=tb_offset)
# this registers a custom exception handler for the whole current notebook
get_ipython().set_custom_exc((Exception,), custom_exc)
# Handle CUDA context when exiting python
import atexit
def exitfunc():
self.logger.info("Exitfunc: Resetting CUDA context stack")
while (cuda.Context.get_current() != None):
while cuda.Context.get_current() is not None:
context = cuda.Context.get_current()
self.logger.info("`-> Popping <%s>", str(context.handle))
cuda.Context.pop()
self.logger.debug("==================================================================")
atexit.register(exitfunc)
@magics_class
class MagicLogger(Magics):
class MagicLogger(Magics):
logger_initialized = False
@line_magic
@magic_arguments.magic_arguments()
@magic_arguments.argument(
@ -118,51 +113,47 @@ class MagicLogger(Magics):
@magic_arguments.argument(
'--file_level', '-f', type=int, default=10, help='The level of logging to file [0, 50]')
def setup_logging(self, line):
if (self.logger_initialized):
if self.logger_initialized:
logging.getLogger('GPUSimulators').info("Global logger already initialized!")
return;
return
else:
self.logger_initialized = True
args = magic_arguments.parse_argstring(self.setup_logging, line)
import sys
#Get root logger
# Get root logger
logger = logging.getLogger('GPUSimulators')
logger.setLevel(min(args.level, args.file_level))
#Add log to screen
# Add log to screen
ch = logging.StreamHandler()
ch.setLevel(args.level)
logger.addHandler(ch)
logger.log(args.level, "Console logger using level %s", logging.getLevelName(args.level))
#Get the outfilename (try to evaluate if Python expression...)
# Get the outfilename (try to evaluate if Python expression...)
try:
outfile = eval(args.out, self.shell.user_global_ns, self.shell.user_ns)
except:
outfile = args.out
#Add log to file
# Add log to file
logger.log(args.level, "File logger using level %s to %s", logging.getLevelName(args.file_level), outfile)
fh = logging.FileHandler(outfile)
formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s')
fh.setFormatter(formatter)
fh.setLevel(args.file_level)
logger.addHandler(fh)
logger.info("Python version %s", sys.version)
self.shell.user_ns[args.name] = logger
@magics_class
class MagicMPI(Magics):
class MagicMPI(Magics):
@line_magic
@magic_arguments.magic_arguments()
@magic_arguments.argument(
@ -177,13 +168,7 @@ class MagicMPI(Magics):
self.shell.user_ns[args.name].shutdown()
self.shell.user_ns[args.name] = None
gc.collect()
self.shell.user_ns[args.name] = Common.IPEngine(args.num_engines)
self.shell.user_ns[args.name] = IPEngine(args.num_engines)
# Register
@ -191,4 +176,3 @@ ip = get_ipython()
ip.register_magics(MagicCudaContext)
ip.register_magics(MagicLogger)
ip.register_magics(MagicMPI)
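
For reference, these handlers register as line magics named after the decorated methods; the invocations below are illustrative only, and whether the name is positional is determined by the partly elided magic_arguments declarations above:

# In a notebook cell, after importing GPUSimulators.IPythonMagic:
#   %cuda_context_handler my_context        # creates a CudaContext in the user namespace
#   %setup_logging --file_level 20 my_logger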

View File

@ -24,32 +24,33 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators import Simulator
from GPUSimulators.common import ArakawaA2D
from GPUSimulators.Simulator import BoundaryCondition
class KP07 (Simulator.BaseSimulator):
class KP07(Simulator.BaseSimulator):
"""
Class that solves the SW equations using the Forward-Backward linear scheme
"""
def __init__(self,
context,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
theta=1.3,
def __init__(self,
context,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
theta=1.3,
cfl_scale=0.9,
order=2,
boundary_conditions=BoundaryCondition(),
boundary_conditions=BoundaryCondition(),
block_width=16, block_height=16,
dt: float=None,
compile_opts: list[str]=[]):
dt: float = None,
compile_opts: list[str] = []):
"""
Initialization routine
@ -65,84 +66,82 @@ class KP07 (Simulator.BaseSimulator):
g: Gravitational acceleration (9.81 m/s^2)
compile_opts: Pass a list of nvcc compiler options
"""
# Call super constructor
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
order,
block_width, block_height);
self.g = np.float32(g)
self.theta = np.float32(theta)
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
order,
block_width, block_height)
self.g = np.float32(g)
self.theta = np.float32(theta)
self.order = np.int32(order)
#Get kernels
module = context.get_module("cuda/SWE2D_KP07.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
# Get kernels
module = context.get_module("cuda/SWE2D_KP07.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
self.kernel = module.get_function("KP07Kernel")
self.kernel.prepare("iifffffiiPiPiPiPiPiPiPiiii")
#Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream,
nx, ny,
2, 2,
[h0, hu0, hv0])
self.u1 = Common.ArakawaA2D(self.stream,
nx, ny,
2, 2,
[None, None, None])
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
nx, ny,
2, 2,
[h0, hu0, hv0])
self.u1 = ArakawaA2D(self.stream,
nx, ny,
2, 2,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
if dt == None:
dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
if dt is None:
dt_x = np.min(self.dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
dt_y = np.min(self.dy / (np.abs(hv0 / h0) + np.sqrt(g * h0)))
self.dt = min(dt_x, dt_y)
else:
self.dt = dt
self.cfl_data.fill(self.dt, stream=self.stream)
def substep(self, dt, step_number):
self.substepRK(dt, step_number)
def substepRK(self, dt, substep):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.theta,
Simulator.stepOrderToCodedInt(step=substep, order=self.order),
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.cfl_data.fill(self.dt, stream=self.stream)
def substep(self, dt, step_number):
self.substep_rk(dt, step_number)
def substep_rk(self, dt, substep):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.theta,
Simulator.step_order_to_coded_int(step=substep, order=self.order),
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.u0, self.u1 = self.u1, self.u0
def getOutput(self):
def get_output(self):
return self.u0
def check(self):
self.u0.check()
self.u1.check()
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt*0.5**(self.order-1)
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
return max_dt * 0.5 ** (self.order - 1)
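
The 0.5 ** (order - 1) factor leaves a first-order step untouched and halves the CFL-limited timestep for the two-stage second-order scheme; a quick check of the scaling:

for order in (1, 2):
    print(order, 0.5 ** (order - 1))   # 1 -> 1.0, 2 -> 0.5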

View File

@ -24,31 +24,32 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators import Simulator
from GPUSimulators.common import ArakawaA2D
from GPUSimulators.Simulator import BoundaryCondition
class KP07_dimsplit(Simulator.BaseSimulator):
"""
Class that solves the SW equations using the dimensionally split KP07 scheme
"""
def __init__(self,
context,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
theta=1.3,
def __init__(self,
context,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
theta=1.3,
cfl_scale=0.9,
boundary_conditions=BoundaryCondition(),
boundary_conditions=BoundaryCondition(),
block_width=16, block_height=16,
dt: float=None,
compile_opts: list[str]=[]):
dt: float = None,
compile_opts: list[str] = []):
"""
Initialization routine
@ -64,83 +65,83 @@ class KP07_dimsplit(Simulator.BaseSimulator):
g: Gravitational acceleration (9.81 m/s^2)
compile_opts: Pass a list of nvcc compiler options
"""
# Call super constructor
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
2,
block_width, block_height)
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
2,
block_width, block_height)
self.gc_x = 2
self.gc_y = 2
self.g = np.float32(g)
self.theta = np.float32(theta)
#Get kernels
module = context.get_module("cuda/SWE2D_KP07_dimsplit.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
# Get kernels
module = context.get_module("cuda/SWE2D_KP07_dimsplit.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
self.kernel = module.get_function("KP07DimsplitKernel")
self.kernel.prepare("iifffffiiPiPiPiPiPiPiPiiii")
#Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream,
nx, ny,
self.gc_x, self.gc_y,
[h0, hu0, hv0])
self.u1 = Common.ArakawaA2D(self.stream,
nx, ny,
self.gc_x, self.gc_y,
[None, None, None])
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
nx, ny,
self.gc_x, self.gc_y,
[h0, hu0, hv0])
self.u1 = ArakawaA2D(self.stream,
nx, ny,
self.gc_x, self.gc_y,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
if dt == None:
dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
if dt is None:
dt_x = np.min(self.dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
dt_y = np.min(self.dy / (np.abs(hv0 / h0) + np.sqrt(g * h0)))
self.dt = min(dt_x, dt_y)
else:
self.dt = dt
self.cfl_data.fill(self.dt, stream=self.stream)
def substep(self, dt, step_number):
self.substepDimsplit(dt*0.5, step_number)
def substepDimsplit(self, dt, substep):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.substep_dimsplit(dt * 0.5, step_number)
def substep_dimsplit(self, dt, substep):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.u0, self.u1 = self.u1, self.u0
def getOutput(self):
def get_output(self):
return self.u0
def check(self):
self.u0.check()
self.u1.check()
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt*0.5
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
return max_dt * 0.5

View File

@ -20,16 +20,17 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
from GPUSimulators import Simulator, Common
from GPUSimulators.gpu import CudaContext
from GPUSimulators.Simulator import BoundaryCondition
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators import Simulator
from GPUSimulators.common import ArakawaA2D
from GPUSimulators.gpu import CudaContext
from GPUSimulators.Simulator import BoundaryCondition
class LxF (Simulator.BaseSimulator):
class LxF(Simulator.BaseSimulator):
"""
Class that solves the SW equations using the Lax-Friedrichs scheme
"""
@ -40,11 +41,11 @@ class LxF (Simulator.BaseSimulator):
nx: int, ny: int,
dx: float, dy: float,
g: float,
cfl_scale: float=0.9,
cfl_scale: float = 0.9,
boundary_conditions=BoundaryCondition(),
block_width: int=16, block_height: int=16,
dt: float=None,
compile_opts: list[str]=[]):
block_width: int = 16, block_height: int = 16,
dt: float = None,
compile_opts: list[str] = []):
"""
Initialization routine
@ -60,80 +61,80 @@ class LxF (Simulator.BaseSimulator):
g: Gravitational acceleration (9.81 m/s^2)
compile_opts: Pass a list of nvcc compiler options
"""
# Call super constructor
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
1,
block_width, block_height)
self.g = np.float32(g)
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
1,
block_width, block_height)
self.g = np.float32(g)
# Get kernels
module = context.get_module("cuda/SWE2D_LxF.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
module = context.get_module("cuda/SWE2D_LxF.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
self.kernel = module.get_function("LxFKernel")
self.kernel.prepare("iiffffiPiPiPiPiPiPiPiiii")
#Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream,
nx, ny,
1, 1,
[h0, hu0, hv0])
self.u1 = Common.ArakawaA2D(self.stream,
nx, ny,
1, 1,
[None, None, None])
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
nx, ny,
1, 1,
[h0, hu0, hv0])
self.u1 = ArakawaA2D(self.stream,
nx, ny,
1, 1,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
if dt == None:
dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
if dt is None:
dt_x = np.min(self.dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
dt_y = np.min(self.dy / (np.abs(hv0 / h0) + np.sqrt(g * h0)))
self.dt = min(dt_x, dt_y)
else:
self.dt = dt
self.cfl_data.fill(self.dt, stream=self.stream)
def substep(self, dt, step_number):
"""
Args:
dt: Size of each timestep (seconds)
"""
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.u0, self.u1 = self.u1, self.u0
def getOutput(self):
def get_output(self):
return self.u0
def check(self):
self.u0.check()
self.u1.check()
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt*0.5
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
return max_dt * 0.5
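
Taken together, a typical end-to-end use of one of these simulators looks roughly as follows. This is a hedged sketch, not part of the commit: it needs an NVIDIA GPU with PyCUDA, the array shapes and run length are illustrative, and the time-loop method is assumed to be the one defined in Simulator.py later in this diff.

# from GPUSimulators.gpu import CudaContext
# from GPUSimulators import LxF
# import numpy as np
#
# ctx = CudaContext(autotuning=False)
# nx = ny = 128
# h0 = np.ones((ny, nx), dtype=np.float32)    # consult CudaArray2D for halo handling
# hu0 = np.zeros_like(h0)
# hv0 = np.zeros_like(h0)
# sim = LxF.LxF(ctx, h0, hu0, hv0, nx, ny, dx=10.0, dy=10.0, g=9.81)
# sim.simulate(10.0)                          # advance 10 seconds of model time
# h, hu, hv = sim.download()                  # copy the conserved variables to host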

View File

@ -222,7 +222,7 @@ class MPISimulator(Simulator.BaseSimulator):
autotuner = sim.context.autotuner
sim.context.autotuner = None;
boundary_conditions = sim.getBoundaryConditions()
boundary_conditions = sim.get_boundary_conditions()
super().__init__(sim.context,
sim.nx, sim.ny,
sim.dx, sim.dy,
@ -263,14 +263,14 @@ class MPISimulator(Simulator.BaseSimulator):
if (gj == grid.grid[1]-1 and boundary_conditions.north != Simulator.BoundaryCondition.Type.Periodic):
self.north = None
new_boundary_conditions.north = boundary_conditions.north;
sim.setBoundaryConditions(new_boundary_conditions)
sim.set_boundary_conditions(new_boundary_conditions)
#Get number of variables
self.nvars = len(self.getOutput().gpu_variables)
self.nvars = len(self.get_output().gpu_variables)
#Shorthands for computing extents and sizes
gc_x = int(self.sim.getOutput()[0].x_halo)
gc_y = int(self.sim.getOutput()[0].y_halo)
gc_x = int(self.sim.get_output()[0].x_halo)
gc_y = int(self.sim.get_output()[0].y_halo)
nx = int(self.sim.nx)
ny = int(self.sim.ny)
@ -322,7 +322,7 @@ class MPISimulator(Simulator.BaseSimulator):
#nvtx.mark("substep full", color="blue")
#self.sim.substep(dt, step_number, external=True, internal=True)
self.sim.swapBuffers()
self.sim.swap_buffers()
self.profiling_data_mpi["end"]["t_mpi_step"] += time.time()
@ -336,8 +336,8 @@ class MPISimulator(Simulator.BaseSimulator):
self.profiling_data_mpi["n_time_steps"] += 1
def getOutput(self):
return self.sim.getOutput()
def get_output(self):
return self.sim.get_output()
def synchronize(self):
self.sim.synchronize()
@ -345,14 +345,14 @@ class MPISimulator(Simulator.BaseSimulator):
def check(self):
return self.sim.check()
def computeDt(self):
local_dt = np.array([np.float32(self.sim.computeDt())]);
def compute_dt(self):
local_dt = np.array([np.float32(self.sim.compute_dt())])
global_dt = np.empty(1, dtype=np.float32)
self.grid.comm.Allreduce(local_dt, global_dt, op=MPI.MIN)
self.logger.debug("Local dt: {:f}, global dt: {:f}".format(local_dt[0], global_dt[0]))
return global_dt[0]
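
The reduction above is the standard pattern for agreeing on one global timestep across ranks; a standalone mpi4py sketch of the same call (run under mpirun; values illustrative):

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
local_dt = np.array([0.01 * (comm.Get_rank() + 1)], dtype=np.float32)
global_dt = np.empty(1, dtype=np.float32)
comm.Allreduce(local_dt, global_dt, op=MPI.MIN)
print(comm.Get_rank(), global_dt[0])   # every rank prints 0.01, the global minimum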
def getExtent(self):
def get_extent(self):
"""
Function which returns the extent of node with rank
rank in the grid

View File

@ -45,7 +45,7 @@ class SHMEMSimulator(Simulator.BaseSimulator):
# This would also eliminate the need for all the array bookkeeping in this class.
autotuner = sims[0].context.autotuner
sims[0].context.autotuner = None
boundary_conditions = sims[0].getBoundaryConditions()
boundary_conditions = sims[0].get_boundary_conditions()
super().__init__(sims[0].context,
sims[0].nx, sims[0].ny,
sims[0].dx, sims[0].dy,
@ -108,14 +108,14 @@ class SHMEMSimulator(Simulator.BaseSimulator):
if (gj == grid.grid[1]-1 and boundary_conditions.north != Simulator.BoundaryCondition.Type.Periodic):
self.north = None
new_boundary_conditions.north = boundary_conditions.north;
sim.setBoundaryConditions(new_boundary_conditions)
sim.set_boundary_conditions(new_boundary_conditions)
#Get number of variables
self.nvars[i] = len(sim.getOutput().gpu_variables)
self.nvars[i] = len(sim.get_output().gpu_variables)
#Shorthands for computing extents and sizes
gc_x = int(sim.getOutput()[0].x_halo)
gc_y = int(sim.getOutput()[0].y_halo)
gc_x = int(sim.get_output()[0].x_halo)
gc_y = int(sim.get_output()[0].y_halo)
nx = int(sim.nx)
ny = int(sim.ny)
@ -150,10 +150,10 @@ class SHMEMSimulator(Simulator.BaseSimulator):
for i, sim in enumerate(self.sims):
sim.substep(dt, step_number)
def getOutput(self):
def get_output(self):
# XXX: Does not return what we would expect.
# Returns first subdomain, but we want the whole domain.
return self.sims[0].getOutput()
return self.sims[0].get_output()
def synchronize(self):
for sim in self.sims:
@ -164,14 +164,14 @@ class SHMEMSimulator(Simulator.BaseSimulator):
# Checks only first subdomain, but we want to check the whole domain.
return self.sims[0].check()
def computeDt(self):
def compute_dt(self):
global_dt = float("inf")
for sim in self.sims:
sim.context.synchronize()
for sim in self.sims:
local_dt = sim.computeDt()
local_dt = sim.compute_dt()
if local_dt < global_dt:
global_dt = local_dt
self.logger.debug("Local dt: {:f}".format(local_dt))
@ -179,7 +179,7 @@ class SHMEMSimulator(Simulator.BaseSimulator):
self.logger.debug("Global dt: {:f}".format(global_dt))
return global_dt
def getExtent(self, index=0):
def get_extent(self, index=0):
"""
Function which returns the extent of the subdomain with index
index in the grid

View File

@ -62,8 +62,8 @@ class SHMEMGrid(object):
for i in range(self.ngpus):
# XXX: disabled for testing on single-GPU system
#self.cuda_contexts.append(CudaContext.CudaContext(device=i, autotuning=False))
self.cuda_contexts.append(CudaContext.CudaContext(device=0, autotuning=False))
#self.cuda_contexts.append(CudaContext(device=i, autotuning=False))
self.cuda_contexts.append(CudaContext(device=0, autotuning=False))
def getCoordinate(self, index):
i = (index % self.grid[0])
@ -180,7 +180,7 @@ class SHMEMSimulatorGroup(object):
autotuner = sims[0].context.autotuner
sims[0].context.autotuner = None
boundary_conditions = sims[0].getBoundaryConditions()
boundary_conditions = sims[0].get_boundary_conditions()
super().__init__(sims[0].context,
sims[0].nx, sims[0].ny,
sims[0].dx, sims[0].dy,
@ -243,14 +243,14 @@ class SHMEMSimulatorGroup(object):
if (gj == grid.grid[1]-1 and boundary_conditions.north != Simulator.BoundaryCondition.Type.Periodic):
self.north = None
new_boundary_conditions.north = boundary_conditions.north;
sim.setBoundaryConditions(new_boundary_conditions)
sim.set_boundary_conditions(new_boundary_conditions)
#Get number of variables
self.nvars[i] = len(sim.getOutput().gpu_variables)
self.nvars[i] = len(sim.get_output().gpu_variables)
#Shorthands for computing extents and sizes
gc_x = int(sim.getOutput()[0].x_halo)
gc_y = int(sim.getOutput()[0].y_halo)
gc_x = int(sim.get_output()[0].x_halo)
gc_y = int(sim.get_output()[0].y_halo)
nx = int(sim.nx)
ny = int(sim.ny)
@ -287,7 +287,7 @@ class SHMEMSimulatorGroup(object):
def getOutput(self):
# XXX: Does not return what we would expect.
# Returns first subdomain, but we want the whole domain.
return self.sims[0].getOutput()
return self.sims[0].get_output()
def synchronize(self):
for sim in self.sims:
@ -305,7 +305,7 @@ class SHMEMSimulatorGroup(object):
sim.context.synchronize()
for sim in self.sims:
local_dt = sim.computeDt()
local_dt = sim.compute_dt()
if local_dt < global_dt:
global_dt = local_dt
self.logger.debug("Local dt: {:f}".format(local_dt))

View File

@ -20,18 +20,38 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
# Import packages we need
import numpy as np
import logging
from enum import IntEnum
import pycuda.driver as cuda
from GPUSimulators import Common
from GPUSimulators.common import ProgressPrinter
from GPUSimulators.gpu import CudaContext
class BoundaryCondition(object):
def get_types(bc):
types = {'north': BoundaryCondition.Type((bc >> 24) & 0x0000000F),
'south': BoundaryCondition.Type((bc >> 16) & 0x0000000F),
'east': BoundaryCondition.Type((bc >> 8) & 0x0000000F),
'west': BoundaryCondition.Type((bc >> 0) & 0x0000000F)}
return types
def step_order_to_coded_int(step, order):
"""
Helper function which packs the step and order into a single integer
"""
step_order = (step << 16) | (order & 0x0000ffff)
# print("Step: {0:032b}".format(step))
# print("Order: {0:032b}".format(order))
# print("Mix: {0:032b}".format(step_order))
return np.int32(step_order)
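
Unpacking mirrors the shifts used here; a minimal round trip (the decode helper below is illustrative and does not exist in the module):

import numpy as np

def decode_step_order(step_order):
    value = int(step_order)
    return value >> 16, value & 0x0000ffff

packed = (1 << 16) | (2 & 0x0000ffff)   # same packing as step_order_to_coded_int(1, 2)
assert decode_step_order(np.int32(packed)) == (1, 2)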
class BoundaryCondition(object):
"""
Class for holding boundary conditions for global boundaries
"""
@ -47,12 +67,7 @@ class BoundaryCondition(object):
Periodic = 2,
Reflective = 3
def __init__(self, types={
'north': Type.Reflective,
'south': Type.Reflective,
'east': Type.Reflective,
'west': Type.Reflective
}):
def __init__(self, types: dict[str, Type] = {'north': Type.Reflective, 'south': Type.Reflective,
                                             'east': Type.Reflective, 'west': Type.Reflective}):
"""
Constructor
"""
@ -61,17 +76,18 @@ class BoundaryCondition(object):
self.south = types['south']
self.east = types['east']
self.west = types['west']
if (self.north == BoundaryCondition.Type.Neumann \
or self.south == BoundaryCondition.Type.Neumann \
or self.east == BoundaryCondition.Type.Neumann \
or self.west == BoundaryCondition.Type.Neumann):
raise(NotImplementedError("Neumann boundary condition not supported"))
def __str__(self):
return '[north={:s}, south={:s}, east={:s}, west={:s}]'.format(str(self.north), str(self.south), str(self.east), str(self.west))
def asCodedInt(self):
if (self.north == BoundaryCondition.Type.Neumann
or self.south == BoundaryCondition.Type.Neumann
or self.east == BoundaryCondition.Type.Neumann
or self.west == BoundaryCondition.Type.Neumann):
raise (NotImplementedError("Neumann boundary condition not supported"))
def __str__(self):
return '[north={:s}, south={:s}, east={:s}, west={:s}]'.format(str(self.north), str(self.south), str(self.east),
str(self.west))
def as_coded_int(self):
"""
Helper function which packs four boundary conditions into one integer
"""
@ -79,26 +95,18 @@ class BoundaryCondition(object):
bc = 0
bc = bc | (self.north & 0x0000000F) << 24
bc = bc | (self.south & 0x0000000F) << 16
bc = bc | (self.east & 0x0000000F) << 8
bc = bc | (self.west & 0x0000000F) << 0
#for t in types:
bc = bc | (self.east & 0x0000000F) << 8
bc = bc | (self.west & 0x0000000F) << 0
# for t in types:
# print("{0:s}, {1:d}, {1:032b}, {1:08b}".format(t, types[t]))
#print("bc: {0:032b}".format(bc))
# print("bc: {0:032b}".format(bc))
return np.int32(bc)
def getTypes(bc):
types = {}
types['north'] = BoundaryCondition.Type((bc >> 24) & 0x0000000F)
types['south'] = BoundaryCondition.Type((bc >> 16) & 0x0000000F)
types['east'] = BoundaryCondition.Type((bc >> 8) & 0x0000000F)
types['west'] = BoundaryCondition.Type((bc >> 0) & 0x0000000F)
return types
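
The packing in as_coded_int and the module-level get_types are inverses of each other; a plain-integer round trip (the numeric values follow the Type enum above):

north, south, east, west = 3, 3, 2, 2   # Reflective, Reflective, Periodic, Periodic
bc = (north << 24) | (south << 16) | (east << 8) | (west << 0)
assert ((bc >> 24) & 0xF, (bc >> 16) & 0xF, (bc >> 8) & 0xF, bc & 0xF) == (3, 3, 2, 2)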
class BaseSimulator(object):
def __init__(self,
context: CudaContext,
nx: int, ny: int,
@ -125,40 +133,40 @@ class BaseSimulator(object):
num_substeps: Number of substeps to perform for a full step
"""
#Get logger
# Get logger
self.logger = logging.getLogger(__name__ + "." + self.__class__.__name__)
#Save input parameters
#Notice that we need to specify them in the correct dataformat for the
#GPU kernel
# Save input parameters
# Notice that we need to specify them in the correct data format for the
# GPU kernel
self.context = context
self.nx = np.int32(nx)
self.ny = np.int32(ny)
self.dx = np.float32(dx)
self.dy = np.float32(dy)
self.setBoundaryConditions(boundary_conditions)
self.set_boundary_conditions(boundary_conditions)
self.cfl_scale = cfl_scale
self.num_substeps = num_substeps
#Handle autotuning block size
# Handle autotuning block size
if self.context.autotuner:
peak_configuration = self.context.autotuner.get_peak_performance(self.__class__)
block_width = int(peak_configuration["block_width"])
block_height = int(peak_configuration["block_height"])
self.logger.debug("Used autotuning to get block size [%d x %d]", block_width, block_height)
#Compute kernel launch parameters
self.block_size = (block_width, block_height, 1)
self.grid_size = (
int(np.ceil(self.nx / float(self.block_size[0]))),
int(np.ceil(self.ny / float(self.block_size[1])))
)
#Create a CUDA stream
# Compute kernel launch parameters
self.block_size = (block_width, block_height, 1)
self.grid_size = (
int(np.ceil(self.nx / float(self.block_size[0]))),
int(np.ceil(self.ny / float(self.block_size[1])))
)
# Create a CUDA stream
self.stream = cuda.Stream()
self.internal_stream = cuda.Stream()
#Keep track of simulation time and number of timesteps
# Keep track of simulation time and number of timesteps
self.t = 0.0
self.nt = 0
@ -171,41 +179,41 @@ class BaseSimulator(object):
Requires that the step() function is implemented in the subclasses
"""
printer = Common.ProgressPrinter(t)
t_start = self.simTime()
printer = ProgressPrinter(t)
t_start = self.sim_time()
t_end = t_start + t
update_dt = True
if (dt is not None):
if dt is not None:
update_dt = False
self.dt = dt
while(self.simTime() < t_end):
while self.sim_time() < t_end:
# Update dt every 100 timesteps and cross your fingers it works
# for the next 100
if (update_dt and (self.simSteps() % 100 == 0)):
self.dt = self.computeDt()*self.cfl_scale
if update_dt and (self.sim_steps() % 100 == 0):
self.dt = self.compute_dt() * self.cfl_scale
# Compute timestep for "this" iteration (i.e., shorten last timestep)
current_dt = np.float32(min(self.dt, t_end-self.simTime()))
current_dt = np.float32(min(self.dt, t_end - self.sim_time()))
# Stop if end reached (should not happen)
if (current_dt <= 0.0):
self.logger.warning("Timestep size {:d} is less than or equal to zero!".format(self.simSteps()))
if current_dt <= 0.0:
self.logger.warning("Timestep size {:d} is less than or equal to zero!".format(self.sim_steps()))
break
# Step forward in time
self.step(current_dt)
#Print info
print_string = printer.getPrintString(self.simTime() - t_start)
if (print_string):
# Print info
print_string = printer.get_print_string(self.sim_time() - t_start)
if print_string:
self.logger.info("%s: %s", self, print_string)
try:
self.check()
except AssertionError as e:
e.args += ("Step={:d}, time={:f}".format(self.simSteps(), self.simTime()),)
e.args += ("Step={:d}, time={:f}".format(self.sim_steps(), self.sim_time()),)
raise
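
The loop above shortens only the final step so the requested end time is hit exactly; a pure-Python sketch of that clamping (a constant dt is assumed for brevity):

t_end, dt, t, steps = 1.0, 0.3, 0.0, 0
while t < t_end:
    current_dt = min(dt, t_end - t)   # shorten the last step
    t += current_dt
    steps += 1
print(t, steps)                       # 1.0 4  (0.3 + 0.3 + 0.3 + 0.1)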
def step(self, dt: int):
@ -218,57 +226,45 @@ class BaseSimulator(object):
for i in range(self.num_substeps):
self.substep(dt, i)
self.t += dt
self.nt += 1
def download(self, variables=None):
return self.getOutput().download(self.stream, variables)
return self.get_output().download(self.stream, variables)
def synchronize(self):
self.stream.synchronize()
def simTime(self):
def sim_time(self):
return self.t
def simSteps(self):
def sim_steps(self):
return self.nt
def getExtent(self):
return [0, 0, self.nx*self.dx, self.ny*self.dy]
def setBoundaryConditions(self, boundary_conditions):
def get_extent(self):
return [0, 0, self.nx * self.dx, self.ny * self.dy]
def set_boundary_conditions(self, boundary_conditions):
self.logger.debug("Boundary conditions set to {:s}".format(str(boundary_conditions)))
self.boundary_conditions = boundary_conditions.asCodedInt()
def getBoundaryConditions(self):
return BoundaryCondition(BoundaryCondition.getTypes(self.boundary_conditions))
self.boundary_conditions = boundary_conditions.as_coded_int()
def get_boundary_conditions(self):
return BoundaryCondition(get_types(self.boundary_conditions))
def substep(self, dt, step_number):
"""
Function which performs one single substep with stepsize dt
"""
raise(NotImplementedError("Needs to be implemented in subclass"))
def getOutput(self):
raise(NotImplementedError("Needs to be implemented in subclass"))
raise (NotImplementedError("Needs to be implemented in subclass"))
def get_output(self):
raise (NotImplementedError("Needs to be implemented in subclass"))
def check(self):
self.logger.warning("check() is not implemented - please implement")
#raise(NotImplementedError("Needs to be implemented in subclass"))
def computeDt(self):
raise(NotImplementedError("Needs to be implemented in subclass"))
# raise(NotImplementedError("Needs to be implemented in subclass"))
def stepOrderToCodedInt(step, order):
"""
Helper function which packs the step and order into a single integer
"""
step_order = (step << 16) | (order & 0x0000ffff)
#print("Step: {0:032b}".format(step))
#print("Order: {0:032b}".format(order))
#print("Mix: {0:032b}".format(step_order))
return np.int32(step_order)
def compute_dt(self):
raise (NotImplementedError("Needs to be implemented in subclass"))

View File

@ -20,30 +20,31 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators import Simulator
from GPUSimulators.common import ArakawaA2D
from GPUSimulators.Simulator import BoundaryCondition
class WAF (Simulator.BaseSimulator):
class WAF(Simulator.BaseSimulator):
"""
Class that solves the SW equations using the Forward-Backward linear scheme
"""
def __init__(self,
def __init__(self,
context,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
cfl_scale=0.9,
boundary_conditions=BoundaryCondition(),
boundary_conditions=BoundaryCondition(),
block_width=16, block_height=16,
dt: float=None,
compile_opts: list[str]=[]):
dt: float = None,
compile_opts: list[str] = []):
"""
Initialization routine
@ -59,79 +60,79 @@ class WAF (Simulator.BaseSimulator):
g: Gravitational acceleration (9.81 m/s^2)
compile_opts: Pass a list of nvcc compiler options
"""
# Call super constructor
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
2,
block_width, block_height);
self.g = np.float32(g)
#Get kernels
module = context.get_module("cuda/SWE2D_WAF.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
# Call super constructor
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
2,
block_width, block_height)
self.g = np.float32(g)
# Get kernels
module = context.get_module("cuda/SWE2D_WAF.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
self.kernel = module.get_function("WAFKernel")
self.kernel.prepare("iiffffiiPiPiPiPiPiPiPiiii")
#Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream,
nx, ny,
2, 2,
[h0, hu0, hv0])
self.u1 = Common.ArakawaA2D(self.stream,
nx, ny,
2, 2,
[None, None, None])
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
nx, ny,
2, 2,
[h0, hu0, hv0])
self.u1 = ArakawaA2D(self.stream,
nx, ny,
2, 2,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
if dt == None:
dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
if dt is None:
dt_x = np.min(self.dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
dt_y = np.min(self.dy / (np.abs(hv0 / h0) + np.sqrt(g * h0)))
self.dt = min(dt_x, dt_y)
else:
self.dt = dt
self.cfl_data.fill(self.dt, stream=self.stream)
def substep(self, dt, step_number):
self.substepDimsplit(dt*0.5, step_number)
def substepDimsplit(self, dt, substep):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.substep_dimsplit(dt * 0.5, step_number)
def substep_dimsplit(self, dt, substep):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.u0, self.u1 = self.u1, self.u0
def getOutput(self):
def get_output(self):
return self.u0
def check(self):
self.u0.check()
self.u1.check()
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt*0.5
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
return max_dt * 0.5
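The initial time step above follows the shallow-water CFL condition: the fastest signal in each cell travels at |u| + sqrt(g*h), so dt must not exceed the cell size divided by that speed. A minimal stand-alone sketch of the same estimate (the helper name estimate_sw_dt is illustrative, not part of the module):

import numpy as np

def estimate_sw_dt(h0, hu0, hv0, dx, dy, g=9.81):
    # Fastest shallow-water wave speed per cell: |velocity| + sqrt(g*h)
    dt_x = np.min(dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
    dt_y = np.min(dy / (np.abs(hv0 / h0) + np.sqrt(g * h0)))
    return min(dt_x, dt_y)

# Example: a 0.5 m deep lake at rest on 10 m cells gives dt of roughly 4.5 s
h = np.full((64, 64), 0.5, dtype=np.float32)
zero = np.zeros_like(h)
print(estimate_sw_dt(h, zero, zero, 10.0, 10.0))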

View File

@ -0,0 +1,9 @@
from .arkawa_2d import ArakawaA2D
from .common import *
from .cuda_array_2d import CudaArray2D
from .cuda_array_3d import CudaArray3D
from .data_dumper import DataDumper
from .ip_engine import IPEngine
from .popen_file_buffer import PopenFileBuffer
from .progress_printer import ProgressPrinter
from .timer import Timer
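With this __init__.py, callers can either pick up the classes re-exported at the package root or import the helper-function module itself. A minimal sketch, using only names defined in this package:

# Classes re-exported by the package __init__.py
from GPUSimulators.common import ArakawaA2D, CudaArray2D, Timer

# The helper-function module (git helpers, to_json, run_simulation, ...)
from GPUSimulators.common import common

print(common.get_git_hash().strip())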

View File

@ -0,0 +1,57 @@
import logging
import numpy as np
import pycuda.gpuarray
from GPUSimulators.common.cuda_array_2d import CudaArray2D
class ArakawaA2D:
"""
A class representing an Arakawa A type (unstaggered, logically Cartesian) grid
"""
def __init__(self, stream, nx, ny, halo_x, halo_y, cpu_variables):
"""
Uploads initial data to the GPU device
"""
self.logger = logging.getLogger(__name__)
self.gpu_variables = []
for cpu_variable in cpu_variables:
self.gpu_variables += [CudaArray2D(stream, nx, ny, halo_x, halo_y, cpu_variable)]
def __getitem__(self, key):
if type(key) != int:
raise TypeError("Indexing is int based")
if key >= len(self.gpu_variables) or key < 0:
raise IndexError("Out of bounds")
return self.gpu_variables[key]
def download(self, stream, variables=None):
"""
Enables downloading data from the GPU device to Python
"""
if variables is None:
variables = range(len(self.gpu_variables))
cpu_variables = []
for i in variables:
if i >= len(self.gpu_variables):
raise IndexError(f"Variable {i} is out of range")
cpu_variables += [self.gpu_variables[i].download(stream, asynch=True)]
# stream.synchronize()
return cpu_variables
def check(self):
"""
Checks that data is still sane
"""
for i, gpu_variable in enumerate(self.gpu_variables):
var_sum = pycuda.gpuarray.sum(gpu_variable.data).get()
self.logger.debug(f"Data {i} with size [{gpu_variable.nx} x {gpu_variable.ny}] "
+ f"has average {var_sum / (gpu_variable.nx * gpu_variable.ny)}")
if np.isnan(var_sum):
raise ValueError("Data contains NaN values!")

View File

@ -0,0 +1,205 @@
# -*- coding: utf-8 -*-
"""
This Python module implements various helper functions and classes.
Copyright (C) 2018 SINTEF ICT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import os
import numpy as np
import time
import subprocess
import logging
import json
from GPUSimulators.common.data_dumper import DataDumper
from GPUSimulators.common.progress_printer import ProgressPrinter
from GPUSimulators.common.timer import Timer
def safe_call(cmd):
logger = logging.getLogger(__name__)
try:
#git rev-parse HEAD
current_dir = os.path.dirname(os.path.realpath(__file__))
params = dict()
params['stderr'] = subprocess.STDOUT
params['cwd'] = current_dir
params['universal_newlines'] = True #text=True in more recent python
params['shell'] = False
if os.name == 'nt':
params['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
stdout = subprocess.check_output(cmd, **params)
except subprocess.CalledProcessError as e:
output = e.output
logger.error("Git failed, \nReturn code: " + str(e.returncode) + "\nOutput: " + output)
raise e
return stdout
def get_git_hash():
return safe_call(["git", "rev-parse", "HEAD"])
def get_git_status():
return safe_call(["git", "status", "--porcelain", "-uno"])
def to_json(in_dict, compressed=True):
"""
Creates JSON string from a dictionary
"""
logger = logging.getLogger(__name__)
out_dict = in_dict.copy()
for key in out_dict:
if isinstance(out_dict[key], np.ndarray):
out_dict[key] = out_dict[key].tolist()
else:
try:
json.dumps(out_dict[key])
except (TypeError, ValueError):
value = str(out_dict[key])
logger.warning("JSON: Converting {:s} to string ({:s})".format(key, value))
out_dict[key] = value
return json.dumps(out_dict)
def run_simulation(simulator, simulator_args, outfile, save_times, save_var_names=[], dt=None):
"""
Runs a simulation and stores the output in a NetCDF file. Output is written at the
times given in save_times, for all the variables listed in save_var_names. Elements in
save_var_names can be set to None if you do not want to save them.
"""
profiling_data_sim_runner = { 'start': {}, 'end': {} }
profiling_data_sim_runner["start"]["t_sim_init"] = 0
profiling_data_sim_runner["end"]["t_sim_init"] = 0
profiling_data_sim_runner["start"]["t_nc_write"] = 0
profiling_data_sim_runner["end"]["t_nc_write"] = 0
profiling_data_sim_runner["start"]["t_full_step"] = 0
profiling_data_sim_runner["end"]["t_full_step"] = 0
profiling_data_sim_runner["start"]["t_sim_init"] = time.time()
logger = logging.getLogger(__name__)
if len(save_times) <= 0:
raise ValueError("Need to specify which times to save")
with Timer("construct") as t:
sim = simulator(**simulator_args)
logger.info(f"Constructed in {str(t.secs)} seconds")
#Create a netcdf file and simulate
with DataDumper(outfile, mode='w', clobber=False) as outdata:
#Create attributes (metadata)
outdata.ncfile.created = time.ctime(time.time())
outdata.ncfile.git_hash = get_git_hash()
outdata.ncfile.git_status = get_git_status()
outdata.ncfile.simulator = str(simulator)
# do not write fields to attributes (they are too large)
simulator_args_for_ncfile = simulator_args.copy()
del simulator_args_for_ncfile["rho"]
del simulator_args_for_ncfile["rho_u"]
del simulator_args_for_ncfile["rho_v"]
del simulator_args_for_ncfile["E"]
outdata.ncfile.sim_args = to_json(simulator_args_for_ncfile)
#Create dimensions
outdata.ncfile.createDimension('time', len(save_times))
outdata.ncfile.createDimension('x', simulator_args['nx'])
outdata.ncfile.createDimension('y', simulator_args['ny'])
#Create variables for dimensions
ncvars = {'time': outdata.ncfile.createVariable('time', np.dtype('float32').char, 'time'),
'x': outdata.ncfile.createVariable('x', np.dtype('float32').char, 'x'),
'y': outdata.ncfile.createVariable('y', np.dtype('float32').char, 'y')}
#Fill variables with proper values
ncvars['time'][:] = save_times
extent = sim.get_extent()
ncvars['x'][:] = np.linspace(extent[0], extent[1], simulator_args['nx'])
ncvars['y'][:] = np.linspace(extent[2], extent[3], simulator_args['ny'])
#Choose which variables to download (prune None from the list, but keep the index)
download_vars = []
for i, var_name in enumerate(save_var_names):
if var_name is not None:
download_vars += [i]
save_var_names = list(save_var_names[i] for i in download_vars)
#Create variables
for var_name in save_var_names:
ncvars[var_name] = outdata.ncfile.createVariable(
var_name, np.dtype('float32').char, ('time', 'y', 'x'), zlib=True, least_significant_digit=3)
#Create step sizes between each save
t_steps = np.empty_like(save_times)
t_steps[0] = save_times[0]
t_steps[1:] = save_times[1:] - save_times[0:-1]
profiling_data_sim_runner["end"]["t_sim_init"] = time.time()
# Start simulation loop
progress_printer = ProgressPrinter(save_times[-1], print_every=10)
for k in range(len(save_times)):
# Get target time and step size there
t_step = t_steps[k]
t_end = save_times[k]
# Sanity check simulator
try:
sim.check()
except AssertionError as e:
logger.error(f"Error after {sim.sim_steps()} steps (t={sim.sim_time()}: {str(e)}")
return outdata.filename
profiling_data_sim_runner["start"]["t_full_step"] += time.time()
# Simulate
if t_step > 0.0:
sim.simulate(t_step, dt)
profiling_data_sim_runner["end"]["t_full_step"] += time.time()
profiling_data_sim_runner["start"]["t_nc_write"] += time.time()
#Download
save_vars = sim.download(download_vars)
#Save to file
for i, var_name in enumerate(save_var_names):
ncvars[var_name][k, :] = save_vars[i]
profiling_data_sim_runner["end"]["t_nc_write"] += time.time()
#Write progress to screen
print_string = progress_printer.get_print_string(t_end)
if print_string:
logger.debug(print_string)
logger.debug(f"Simulated to t={t_end} in "
+ f"{sim.sim_steps()} timesteps (average dt={sim.sim_time() / sim.sim_steps()})")
return outdata.filename, profiling_data_sim_runner, sim.profiling_data_mpi
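A minimal sketch of driving run_simulation for the Euler simulator used elsewhere in this commit; simulator_args stands in for the full constructor-argument dict (it must contain nx, ny and the rho/rho_u/rho_v/E fields that are stripped before the metadata is written):

import numpy as np
from GPUSimulators import EE2D_KP07_dimsplit
from GPUSimulators.common.common import run_simulation

save_times = np.linspace(0.0, 1.0, 11)              # write 11 snapshots up to t=1.0
save_var_names = ['rho', 'rho_u', 'rho_v', 'E']     # None entries would be skipped

outfile, runner_profiling, mpi_profiling = run_simulation(
    EE2D_KP07_dimsplit.EE2D_KP07_dimsplit,          # constructed as simulator(**simulator_args)
    simulator_args,                                 # assumed dict of constructor kwargs
    "output/euler_test.nc", save_times, save_var_names)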

View File

@ -0,0 +1,139 @@
import logging
import numpy as np
import pycuda.gpuarray
import pycuda.driver as cuda
from pycuda.tools import PageLockedMemoryPool
class CudaArray2D:
"""
Class that holds 2D CUDA data
"""
def __init__(self, stream, nx, ny, x_halo, y_halo, cpu_data=None, dtype=np.float32):
"""
Uploads initial data to the CUDA device
"""
self.logger = logging.getLogger(__name__)
self.nx = nx
self.ny = ny
self.x_halo = x_halo
self.y_halo = y_halo
nx_halo = nx + 2 * x_halo
ny_halo = ny + 2 * y_halo
# self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny)
# Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
self.data = pycuda.gpuarray.zeros((ny_halo, nx_halo), dtype)
# For returning to download
self.memorypool = PageLockedMemoryPool()
# If we don't have any data, just allocate and return
if cpu_data is None:
return
# Make sure data is in proper format
if cpu_data.shape != (ny_halo, nx_halo) and cpu_data.shape != (self.ny, self.nx):
raise ValueError(
f"Wrong shape of data {str(cpu_data.shape)} vs {str((self.ny, self.nx))} / {str((ny_halo, nx_halo))}")
if cpu_data.itemsize != 4:
raise ValueError("Wrong size of data type")
if np.isfortran(cpu_data):
raise TypeError("Wrong datatype (Fortran, expected C)")
# Create a copy object from host to device
x = (nx_halo - cpu_data.shape[1]) // 2
y = (ny_halo - cpu_data.shape[0]) // 2
self.upload(stream, cpu_data, extent=[x, y, cpu_data.shape[1], cpu_data.shape[0]])
# self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
def __del__(self, *args):
# self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
self.data.gpudata.free()
self.data = None
def download(self, stream, cpu_data=None, asynch=False, extent=None):
"""
Enables downloading data from GPU to Python
"""
if extent is None:
x = self.x_halo
y = self.y_halo
nx = self.nx
ny = self.ny
else:
x, y, nx, ny = extent
if cpu_data is None:
# self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
# Allocate host memory
# The following fails, don't know why (crashes python)
cpu_data = cuda.pagelocked_empty((int(ny), int(nx)), dtype=np.float32,
mem_flags=cuda.host_alloc_flags.PORTABLE)
# Non-pagelocked: cpu_data = np.empty((ny, nx), dtype=np.float32)
# cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32)
assert nx == cpu_data.shape[1]
assert ny == cpu_data.shape[0]
assert x + nx <= self.nx + 2 * self.x_halo
assert y + ny <= self.ny + 2 * self.y_halo
# Create a copy object from device to host
copy = cuda.Memcpy2D()
copy.set_src_device(self.data.gpudata)
copy.set_dst_host(cpu_data)
# Set offsets and pitch of a source
copy.src_x_in_bytes = int(x) * self.data.strides[1]
copy.src_y = int(y)
copy.src_pitch = self.data.strides[0]
# Set width in bytes to copy for each row and
# number of rows to copy
copy.width_in_bytes = int(nx) * cpu_data.itemsize
copy.height = int(ny)
copy(stream)
if not asynch:
stream.synchronize()
return cpu_data
def upload(self, stream, cpu_data, extent=None):
if extent is None:
x = self.x_halo
y = self.y_halo
nx = self.nx
ny = self.ny
else:
x, y, nx, ny = extent
assert (nx == cpu_data.shape[1])
assert (ny == cpu_data.shape[0])
assert (x + nx <= self.nx + 2 * self.x_halo)
assert (y + ny <= self.ny + 2 * self.y_halo)
# Create a copy object from host to device
copy = cuda.Memcpy2D()
copy.set_dst_device(self.data.gpudata)
copy.set_src_host(cpu_data)
# Set offsets and pitch of the destination
copy.dst_x_in_bytes = int(x) * self.data.strides[1]
copy.dst_y = int(y)
copy.dst_pitch = self.data.strides[0]
# Set width in bytes to copy for each row and
# number of rows to copy
copy.width_in_bytes = int(nx) * cpu_data.itemsize
copy.height = int(ny)
copy(stream)
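A minimal round-trip sketch for CudaArray2D, assuming an initialized CUDA context: host data is uploaded into the interior of the padded device buffer and read back into a preallocated array with the blocking download:

import numpy as np
import pycuda.autoinit  # creates a default CUDA context (assumption for this sketch)
import pycuda.driver as cuda

nx, ny, halo = 8, 8, 2
stream = cuda.Stream()

cpu_in = np.arange(nx * ny, dtype=np.float32).reshape(ny, nx)
arr = CudaArray2D(stream, nx, ny, halo, halo, cpu_in)   # interior of a 12x12 device buffer

cpu_out = np.empty((ny, nx), dtype=np.float32)
arr.download(stream, cpu_data=cpu_out)                  # asynch=False, so this synchronizes
assert np.array_equal(cpu_in, cpu_out)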

View File

@ -0,0 +1,120 @@
import logging
import numpy as np
import pycuda.gpuarray
import pycuda.driver as cuda
from pycuda.tools import PageLockedMemoryPool
class CudaArray3D:
"""
Class that holds 3D data
"""
def __init__(self, stream, nx, ny, nz, x_halo, y_halo, z_halo, cpu_data=None, dtype=np.float32):
"""
Uploads initial data to the CUDA device
"""
self.logger = logging.getLogger(__name__)
self.nx = nx
self.ny = ny
self.nz = nz
self.x_halo = x_halo
self.y_halo = y_halo
self.z_halo = z_halo
nx_halo = nx + 2 * x_halo
ny_halo = ny + 2 * y_halo
nz_halo = nz + 2 * z_halo
# self.logger.debug("Allocating [%dx%dx%d] buffer", self.nx, self.ny, self.nz)
# Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
self.data = pycuda.gpuarray.zeros((nz_halo, ny_halo, nx_halo), dtype)
# For returning to download
self.memorypool = PageLockedMemoryPool()
# If we don't have any data, just allocate and return
if cpu_data is None:
return
# Make sure data is in proper format
if (cpu_data.shape != (nz_halo, ny_halo, nx_halo)
and cpu_data.shape != (self.nz, self.ny, self.nx)):
raise ValueError(f"Wrong shape of data {str(cpu_data.shape)} vs {str((self.nz, self.ny, self.nx))} / {str((nz_halo, ny_halo, nx_halo))}")
if cpu_data.itemsize != 4:
raise ValueError("Wrong size of data type")
if np.isfortran(cpu_data):
raise TypeError("Wrong datatype (Fortran, expected C)")
# Create a copy object from host to device
copy = cuda.Memcpy3D()
copy.set_src_host(cpu_data)
copy.set_dst_device(self.data.gpudata)
# Set offsets of destination
x_offset = (nx_halo - cpu_data.shape[2]) // 2
y_offset = (ny_halo - cpu_data.shape[1]) // 2
z_offset = (nz_halo - cpu_data.shape[0]) // 2
copy.dst_x_in_bytes = x_offset * self.data.strides[1]
copy.dst_y = y_offset
copy.dst_z = z_offset
# Set pitch of destination
copy.dst_pitch = self.data.strides[0]
# Set width in bytes to copy for each row and
# number of rows to copy
width = max(self.nx, cpu_data.shape[2])
height = max(self.ny, cpu_data.shape[1])
depth = max(self.nz, cpu_data.shape[0])
copy.width_in_bytes = width * cpu_data.itemsize
copy.height = height
copy.depth = depth
# Perform the copy
copy(stream)
# self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
def __del__(self, *args):
# self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
self.data.gpudata.free()
self.data = None
def download(self, stream, asynch=False):
"""
Enables downloading data from GPU to Python
"""
# self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
# Allocate host memory
# cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)
# cpu_data = np.empty((self.nz, self.ny, self.nx), dtype=np.float32)
cpu_data = self.memorypool.allocate((self.nz, self.ny, self.nx), dtype=np.float32)
# Create a copy object from device to host
copy = cuda.Memcpy2D()
copy.set_src_device(self.data.gpudata)
copy.set_dst_host(cpu_data)
# Set offsets and pitch of a source
copy.src_x_in_bytes = self.x_halo * self.data.strides[1]
copy.src_y = self.y_halo
copy.src_z = self.z_halo
copy.src_pitch = self.data.strides[0]
# Set width in bytes to copy for each row and
# number of rows to copy
copy.width_in_bytes = self.nx * cpu_data.itemsize
copy.height = self.ny
copy.depth = self.nz
copy(stream)
if not asynch:
stream.synchronize()
return cpu_data

View File

@ -0,0 +1,79 @@
import json
import logging
import os
import netCDF4
import numpy as np
def to_json(in_dict):
out_dict = in_dict.copy()
for key in out_dict:
if isinstance(out_dict[key], np.ndarray):
out_dict[key] = out_dict[key].tolist()
else:
try:
json.dumps(out_dict[key])
except (TypeError, ValueError):
out_dict[key] = str(out_dict[key])
return json.dumps(out_dict)
class DataDumper(object):
"""
Simple class for holding a netCDF4 object
(handles opening and closing nicely)
Use as
with DataDumper("filename") as data:
...
"""
def __init__(self, filename, *args, **kwargs):
self.logger = logging.getLogger(__name__)
# Create directory if needed
filename = os.path.abspath(filename)
dirname = os.path.dirname(filename)
if dirname and not os.path.isdir(dirname):
self.logger.info("Creating directory " + dirname)
os.makedirs(dirname)
# Get mode of a file if we have that
mode = None
if args:
mode = args[0]
elif kwargs and 'mode' in kwargs.keys():
mode = kwargs['mode']
# Create a new unique file if writing
if mode:
if ("w" in mode) or ("+" in mode) or ("a" in mode):
i = 0
stem, ext = os.path.splitext(filename)
while os.path.isfile(filename):
filename = f"{stem}_{str(i).zfill(4)}{ext}"
i = i + 1
self.filename = os.path.abspath(filename)
# Save arguments
self.args = args
self.kwargs = kwargs
# Log output
self.logger.info("Initialized " + self.filename)
def __enter__(self):
self.logger.info("Opening " + self.filename)
if self.args:
self.logger.info("Arguments: " + str(self.args))
if self.kwargs:
self.logger.info("Keyword arguments: " + str(self.kwargs))
self.ncfile = netCDF4.Dataset(self.filename, *self.args, **self.kwargs)
return self
def __exit__(self, *args):
self.logger.info("Closing " + self.filename)
self.ncfile.close()
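A minimal sketch of writing a small dataset through DataDumper (path and variable names are illustrative); if the target file already exists, a new uniquely numbered filename is chosen before opening:

import numpy as np

with DataDumper("output/example.nc", mode='w') as outdata:
    outdata.ncfile.createDimension('x', 16)
    h = outdata.ncfile.createVariable('h', np.dtype('float32').char, ('x',))
    h[:] = np.linspace(0.0, 1.0, 16)
# The netCDF file is closed on exit; outdata.filename holds the actual path used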

View File

@ -0,0 +1,101 @@
import gc
import logging
import os
import signal
import subprocess
import time
from GPUSimulators.common.popen_file_buffer import PopenFileBuffer
class IPEngine(object):
"""
Class for starting IPEngines for MPI processing in IPython
"""
def __init__(self, n_engines):
self.logger = logging.getLogger(__name__)
# Start ipcontroller
self.logger.info("Starting IPController")
self.c_buff = PopenFileBuffer()
c_cmd = ["ipcontroller", "--ip='*'"]
c_params = dict()
c_params['stderr'] = self.c_buff.stderr
c_params['stdout'] = self.c_buff.stdout
c_params['shell'] = False
if os.name == 'nt':
c_params['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
self.c = subprocess.Popen(c_cmd, **c_params)
# Wait until the controller is running
time.sleep(3)
# Start engines
self.logger.info("Starting IPEngines")
self.e_buff = PopenFileBuffer()
e_cmd = ["mpiexec", "-n", str(n_engines), "ipengine", "--mpi"]
e_params = dict()
e_params['stderr'] = self.e_buff.stderr
e_params['stdout'] = self.e_buff.stdout
e_params['shell'] = False
if os.name == 'nt':
e_params['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
self.e = subprocess.Popen(e_cmd, **e_params)
# attach to a running cluster
import ipyparallel
self.cluster = ipyparallel.Client() # profile='mpi')
time.sleep(3)
while len(self.cluster.ids) != n_engines:
time.sleep(0.5)
self.logger.info("Waiting for cluster...")
self.cluster = ipyparallel.Client() # profile='mpi')
self.logger.info("Done")
def __del__(self):
self.shutdown()
def shutdown(self):
if self.e is not None:
if os.name == 'nt':
self.logger.warning("Sending CTRL+C to IPEngine")
self.e.send_signal(signal.CTRL_C_EVENT)
try:
self.e.communicate(timeout=3)
self.e.kill()
except subprocess.TimeoutExpired:
self.logger.warning("Killing IPEngine")
self.e.kill()
self.e.communicate()
self.e = None
cout, cerr = self.e_buff.read()
self.logger.info(f"IPEngine cout: {cout}")
self.logger.info(f"IPEngine cerr: {cerr}")
self.e_buff = None
gc.collect()
if self.c is not None:
if os.name == 'nt':
self.logger.warning("Sending CTRL+C to IPController")
self.c.send_signal(signal.CTRL_C_EVENT)
try:
self.c.communicate(timeout=3)
self.c.kill()
except subprocess.TimeoutExpired:
self.logger.warning("Killing IPController")
self.c.kill()
self.c.communicate()
self.c = None
cout, cerr = self.c_buff.read()
self.logger.info(f"IPController cout: {cout}")
self.logger.info(f"IPController cerr: {cerr}")
self.c_buff = None
gc.collect()
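A minimal usage sketch for IPEngine; it assumes ipcontroller, ipengine, and mpiexec are available on PATH and that the caller shuts the cluster down explicitly when done:

engines = IPEngine(4)               # starts one controller and four MPI-backed engines
view = engines.cluster[:]           # DirectView over all engines
print(view.apply_sync(lambda: "ready"))
engines.shutdown()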

View File

@ -0,0 +1,27 @@
import tempfile
class PopenFileBuffer(object):
"""
Simple class for holding a set of temp files
for communicating with a subprocess
"""
def __init__(self):
self.stdout = tempfile.TemporaryFile(mode='w+t')
self.stderr = tempfile.TemporaryFile(mode='w+t')
def __del__(self):
self.stdout.close()
self.stderr.close()
def read(self):
self.stdout.seek(0)
cout = self.stdout.read()
self.stdout.seek(0, 2)
self.stderr.seek(0)
cerr = self.stderr.read()
self.stderr.seek(0, 2)
return cout, cerr

View File

@ -0,0 +1,62 @@
import logging
import time
import numpy as np
def time_string(seconds):
seconds = int(max(seconds, 1))
minutes, seconds = divmod(seconds, 60)
hours, minutes = divmod(minutes, 60)
periods = [('h', hours), ('m', minutes), ('s', seconds)]
return_string = ' '.join('{}{}'.format(value, name)
for name, value in periods
if value)
return return_string
def progress_bar(step, total_steps, width=30):
progress = np.round(width * step / total_steps).astype(np.int32)
progressbar = "0% [" + "#" * progress + "=" * (width - progress) + "] 100%"
return progressbar
class ProgressPrinter(object):
"""
Small helper class for creating a progress bar
"""
def __init__(self, total_steps, print_every=5):
self.logger = logging.getLogger(__name__)
self.start = time.time()
self.total_steps = total_steps
self.print_every = print_every
self.next_print_time = self.print_every
self.last_step = 0
self.secs_per_iter = None
def get_print_string(self, step):
elapsed = time.time() - self.start
if elapsed > self.next_print_time:
dt = elapsed - (self.next_print_time - self.print_every)
dsteps = step - self.last_step
steps_remaining = self.total_steps - step
if dsteps == 0:
return None
self.last_step = step
self.next_print_time = elapsed + self.print_every
if not self.secs_per_iter:
self.secs_per_iter = dt / dsteps
self.secs_per_iter = 0.2 * self.secs_per_iter + 0.8 * (dt / dsteps)
remaining_time = steps_remaining * self.secs_per_iter
return (f"{progress_bar(step, self.total_steps)}. "
+ f"Total: {time_string(elapsed + remaining_time)}, "
+ f"elapsed: {time_string(elapsed)}, "
+ f"remaining: {time_string(remaining_time)}")
return None
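A minimal sketch of driving ProgressPrinter from a loop; get_print_string only returns a string once per print_every seconds, so most iterations produce nothing:

import time

printer = ProgressPrinter(total_steps=100, print_every=1)
for step in range(1, 101):
    time.sleep(0.05)                       # stand-in for real work
    line = printer.get_print_string(step)
    if line:
        print(line)                        # progress bar plus total/elapsed/remaining times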

View File

@ -0,0 +1,26 @@
import logging
import time
class Timer(object):
"""
Class which keeps track of time spent for a section of code
"""
def __init__(self, tag, log_level=logging.DEBUG):
self.tag = tag
self.log_level = log_level
self.logger = logging.getLogger(__name__)
def __enter__(self):
self.start = time.time()
return self
def __exit__(self, *args):
self.end = time.time()
self.secs = self.end - self.start
self.msecs = self.secs * 1000 # milliseconds
self.logger.log(self.log_level, f"{self.tag}: {self.msecs} ms")
def elapsed(self):
return time.time() - self.start
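A minimal usage sketch for Timer; the elapsed time is logged when the block exits and remains available on the object afterwards:

import logging
logging.basicConfig(level=logging.DEBUG)   # so the DEBUG-level timing message is visible

with Timer("example") as t:
    total = sum(i * i for i in range(100_000))   # stand-in for real work

print(f"{t.secs:.4f} s ({t.msecs:.1f} ms), result={total}")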

View File

@ -0,0 +1,2 @@
from .cuda_context import CudaContext
from .hip_context import HIPContext

View File

@ -21,8 +21,6 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
import numpy as np
import time
import re
import io
import hashlib
@ -33,8 +31,8 @@ import pycuda.compiler as cuda_compiler
import pycuda.gpuarray
import pycuda.driver as cuda
from GPUSimulators import Autotuner, Common
from GPUSimulators.gpu.Context import Context
from GPUSimulators import Autotuner
from GPUSimulators.common import common
class CudaContext(object):

View File

@ -3,10 +3,10 @@ import io
import os.path
import hip as hip_main
from hip import hip, hiprtc
from hip import hip
from GPUSimulators import Common
from GPUSimulators.gpu.Context import Context
from GPUSimulators.common import common
from GPUSimulators.gpu.context import Context
class HIPContext(Context):

View File

@ -52,9 +52,7 @@
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from GPUSimulators import IPythonMagic"
]
"source": ""
},
{
"cell_type": "code",
@ -115,10 +113,10 @@
"import numpy as np\n",
"from matplotlib import pyplot as plt\n",
"from mpi4py import MPI\n",
"import time\n",
"import json\n",
"\n",
"from GPUSimulators import IPythonMagic, MPISimulator, Common"
"from GPUSimulators import MPISimulator\n",
"from GPUSimulators.common import common"
]
},
{
@ -317,7 +315,6 @@
"%%px\n",
"\n",
"from GPUSimulators.helpers import InitialConditions\n",
"from GPUSimulators.Simulator import BoundaryCondition\n",
"\n",
"my_context.autotuner = None\n",
"\n",
@ -348,7 +345,7 @@
" return sim\n",
"\n",
"\n",
"outfile = Common.runSimulation(genSim, arguments, outfile, save_times, save_var_names)"
"outfile = Common.run_simulation(genSim, arguments, outfile, save_times, save_var_names)"
]
},
{
@ -657,7 +654,7 @@
" sim = MPISimulator.MPISimulator(local_sim, grid)\n",
" return sim\n",
"\n",
"outfile = Common.runSimulation(genSim, arguments, outfile, save_times, save_var_names)"
"outfile = Common.run_simulation(genSim, arguments, outfile, save_times, save_var_names)"
]
},
{

View File

@ -13,19 +13,10 @@
"%load_ext line_profiler\n",
"\n",
"#Import packages we need\n",
"import numpy as np\n",
"from matplotlib import animation, rc\n",
"from matplotlib import pyplot as plt\n",
"\n",
"import subprocess\n",
"import os\n",
"import gc\n",
"import datetime\n",
"import importlib\n",
"import logging\n",
"\n",
"import pycuda.driver as cuda\n",
"import pycuda.compiler\n",
"\n",
"try:\n",
" from StringIO import StringIO\n",
@ -37,7 +28,7 @@
"rc('figure', figsize=(16.0, 12.0))\n",
"rc('animation', html='html5')\n",
"\n",
"from GPUSimulators import Common, IPythonMagic\n",
"from GPUSimulators.common import common\n",
"from GPUSimulators.helpers import InitialConditions"
]
},
@ -129,7 +120,7 @@
" h = sim.u0[0].download(sim.stream)\n",
" \n",
" plt.figure()\n",
" plt.title(str(sim) + \", t=\" + str(sim.simTime()) + \", nt=\" + str(sim.simSteps()))\n",
" plt.title(str(sim) + \", t=\" + str(sim.sim_time()) + \", nt=\" + str(sim.sim_steps()))\n",
" extent = [0, sim.dx*sim.nx, 0, sim.dy*sim.ny]\n",
" plt.imshow(h, vmin=0.49, vmax=0.52, extent=extent)\n",
" plt.colorbar()"
@ -292,16 +283,16 @@
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[10], line 5\u001b[0m\n\u001b[1;32m 2\u001b[0m importlib\u001b[38;5;241m.\u001b[39mreload(KP07)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m Common\u001b[38;5;241m.\u001b[39mTimer(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mconstruct\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m t:\n\u001b[0;32m----> 5\u001b[0m sim \u001b[38;5;241m=\u001b[39m \u001b[43mKP07\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mKP07\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43marguments\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m Common\u001b[38;5;241m.\u001b[39mTimer(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstep\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m t:\n\u001b[1;32m 8\u001b[0m t \u001b[38;5;241m=\u001b[39m sim\u001b[38;5;241m.\u001b[39msimulate(t_end)\n",
"File \u001b[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/KP07.py:70\u001b[0m, in \u001b[0;36mKP07.__init__\u001b[0;34m(self, context, h0, hu0, hv0, nx, ny, dx, dy, g, theta, cfl_scale, order, boundary_conditions, block_width, block_height, dt, compile_opts)\u001b[0m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 54\u001b[0m \u001b[38;5;124;03mInitialization routine\u001b[39;00m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;124;03m\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[38;5;124;03m compile_opts: Pass a list of nvcc compiler options\u001b[39;00m\n\u001b[1;32m 67\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 69\u001b[0m \u001b[38;5;66;03m# Call super constructor\u001b[39;00m\n\u001b[0;32m---> 70\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcontext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 71\u001b[0m \u001b[43m \u001b[49m\u001b[43mnx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mny\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 72\u001b[0m \u001b[43m \u001b[49m\u001b[43mdx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 73\u001b[0m \u001b[43m \u001b[49m\u001b[43mboundary_conditions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 74\u001b[0m \u001b[43m \u001b[49m\u001b[43mcfl_scale\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 75\u001b[0m \u001b[43m \u001b[49m\u001b[43morder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 76\u001b[0m \u001b[43m \u001b[49m\u001b[43mblock_width\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mblock_height\u001b[49m\u001b[43m)\u001b[49m;\n\u001b[1;32m 77\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mg \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mfloat32(g) \n\u001b[1;32m 78\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtheta \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mfloat32(theta) \n",
"File \u001b[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Simulator.py:146\u001b[0m, in \u001b[0;36mBaseSimulator.__init__\u001b[0;34m(self, context, nx, ny, dx, dy, boundary_conditions, cfl_scale, num_substeps, block_width, block_height)\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[38;5;66;03m#Handle autotuning block size\u001b[39;00m\n\u001b[1;32m 145\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontext\u001b[38;5;241m.\u001b[39mautotuner:\n\u001b[0;32m--> 146\u001b[0m peak_configuration \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautotuner\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_peak_performance\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;18;43m__class__\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 147\u001b[0m block_width \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mint\u001b[39m(peak_configuration[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mblock_width\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 148\u001b[0m block_height \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mint\u001b[39m(peak_configuration[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mblock_height\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n",
"File \u001b[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:121\u001b[0m, in \u001b[0;36mAutotuner.get_peak_performance\u001b[0;34m(self, simulator)\u001b[0m\n\u001b[1;32m 119\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCould not get autotuned peak performance for \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m: benchmarking\u001b[39m\u001b[38;5;124m\"\u001b[39m, key)\n\u001b[1;32m 120\u001b[0m data\u001b[38;5;241m.\u001b[39mclose()\n\u001b[0;32m--> 121\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbenchmark\u001b[49m\u001b[43m(\u001b[49m\u001b[43msimulator\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 122\u001b[0m data \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mload(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfilename)\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfind_max_index\u001b[39m(megacells):\n",
"File \u001b[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:84\u001b[0m, in \u001b[0;36mAutotuner.benchmark\u001b[0;34m(self, simulator, force)\u001b[0m\n\u001b[1;32m 81\u001b[0m benchmark_data[k] \u001b[38;5;241m=\u001b[39m v\n\u001b[1;32m 83\u001b[0m \u001b[38;5;66;03m# Run benchmark\u001b[39;00m\n\u001b[0;32m---> 84\u001b[0m benchmark_data[key \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_megacells\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mAutotuner\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbenchmark_single_simulator\u001b[49m\u001b[43m(\u001b[49m\u001b[43msimulator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marguments\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mblock_widths\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mblock_heights\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 85\u001b[0m benchmark_data[key \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_block_widths\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mblock_widths\n\u001b[1;32m 86\u001b[0m benchmark_data[key \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_block_heights\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mblock_heights\n",
"File \u001b[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:162\u001b[0m, in \u001b[0;36mAutotuner.benchmark_single_simulator\u001b[0;34m(simulator, arguments, block_widths, block_heights)\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, block_width \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(block_widths):\n\u001b[1;32m 161\u001b[0m sim_arguments\u001b[38;5;241m.\u001b[39mupdate({\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mblock_width\u001b[39m\u001b[38;5;124m'\u001b[39m: block_width})\n\u001b[0;32m--> 162\u001b[0m megacells[j, i] \u001b[38;5;241m=\u001b[39m \u001b[43mAutotuner\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_benchmark\u001b[49m\u001b[43m(\u001b[49m\u001b[43msimulator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msim_arguments\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 165\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCompleted \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m in \u001b[39m\u001b[38;5;132;01m%f\u001b[39;00m\u001b[38;5;124m seconds\u001b[39m\u001b[38;5;124m\"\u001b[39m, simulator\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, t\u001b[38;5;241m.\u001b[39msecs)\n\u001b[1;32m 167\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m megacells\n",
"File \u001b[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:200\u001b[0m, in \u001b[0;36mAutotuner.run_benchmark\u001b[0;34m(simulator, arguments, timesteps, warmup_timesteps)\u001b[0m\n\u001b[1;32m 197\u001b[0m end\u001b[38;5;241m.\u001b[39mrecord(sim\u001b[38;5;241m.\u001b[39mstream)\n\u001b[1;32m 199\u001b[0m \u001b[38;5;66;03m#Synchronize end event\u001b[39;00m\n\u001b[0;32m--> 200\u001b[0m \u001b[43mend\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msynchronize\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 202\u001b[0m \u001b[38;5;66;03m#Compute megacells\u001b[39;00m\n\u001b[1;32m 203\u001b[0m gpu_elapsed \u001b[38;5;241m=\u001b[39m end\u001b[38;5;241m.\u001b[39mtime_since(start)\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m1.0e-3\u001b[39m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)",
"Cell \u001B[0;32mIn[10], line 5\u001B[0m\n\u001B[1;32m 2\u001B[0m importlib\u001B[38;5;241m.\u001B[39mreload(KP07)\n\u001B[1;32m 4\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m Common\u001B[38;5;241m.\u001B[39mTimer(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mconstruct\u001B[39m\u001B[38;5;124m\"\u001B[39m) \u001B[38;5;28;01mas\u001B[39;00m t:\n\u001B[0;32m----> 5\u001B[0m sim \u001B[38;5;241m=\u001B[39m \u001B[43mKP07\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mKP07\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43marguments\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 7\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m Common\u001B[38;5;241m.\u001B[39mTimer(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mstep\u001B[39m\u001B[38;5;124m\"\u001B[39m) \u001B[38;5;28;01mas\u001B[39;00m t:\n\u001B[1;32m 8\u001B[0m t \u001B[38;5;241m=\u001B[39m sim\u001B[38;5;241m.\u001B[39msimulate(t_end)\n",
"File \u001B[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/KP07.py:70\u001B[0m, in \u001B[0;36mKP07.__init__\u001B[0;34m(self, context, h0, hu0, hv0, nx, ny, dx, dy, g, theta, cfl_scale, order, boundary_conditions, block_width, block_height, dt, compile_opts)\u001B[0m\n\u001B[1;32m 53\u001B[0m \u001B[38;5;250m\u001B[39m\u001B[38;5;124;03m\"\"\"\u001B[39;00m\n\u001B[1;32m 54\u001B[0m \u001B[38;5;124;03mInitialization routine\u001B[39;00m\n\u001B[1;32m 55\u001B[0m \u001B[38;5;124;03m\u001B[39;00m\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 66\u001B[0m \u001B[38;5;124;03m compile_opts: Pass a list of nvcc compiler options\u001B[39;00m\n\u001B[1;32m 67\u001B[0m \u001B[38;5;124;03m\"\"\"\u001B[39;00m\n\u001B[1;32m 69\u001B[0m \u001B[38;5;66;03m# Call super constructor\u001B[39;00m\n\u001B[0;32m---> 70\u001B[0m \u001B[38;5;28;43msuper\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[38;5;21;43m__init__\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mcontext\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\n\u001B[1;32m 71\u001B[0m \u001B[43m \u001B[49m\u001B[43mnx\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mny\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\n\u001B[1;32m 72\u001B[0m \u001B[43m \u001B[49m\u001B[43mdx\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mdy\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\n\u001B[1;32m 73\u001B[0m \u001B[43m \u001B[49m\u001B[43mboundary_conditions\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 74\u001B[0m \u001B[43m \u001B[49m\u001B[43mcfl_scale\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 75\u001B[0m \u001B[43m \u001B[49m\u001B[43morder\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 76\u001B[0m \u001B[43m \u001B[49m\u001B[43mblock_width\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mblock_height\u001B[49m\u001B[43m)\u001B[49m;\n\u001B[1;32m 77\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mg \u001B[38;5;241m=\u001B[39m np\u001B[38;5;241m.\u001B[39mfloat32(g) \n\u001B[1;32m 78\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mtheta \u001B[38;5;241m=\u001B[39m np\u001B[38;5;241m.\u001B[39mfloat32(theta) \n",
"File \u001B[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Simulator.py:146\u001B[0m, in \u001B[0;36mBaseSimulator.__init__\u001B[0;34m(self, context, nx, ny, dx, dy, boundary_conditions, cfl_scale, num_substeps, block_width, block_height)\u001B[0m\n\u001B[1;32m 144\u001B[0m \u001B[38;5;66;03m#Handle autotuning block size\u001B[39;00m\n\u001B[1;32m 145\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcontext\u001B[38;5;241m.\u001B[39mautotuner:\n\u001B[0;32m--> 146\u001B[0m peak_configuration \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcontext\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mautotuner\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_peak_performance\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[38;5;18;43m__class__\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[1;32m 147\u001B[0m block_width \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mint\u001B[39m(peak_configuration[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mblock_width\u001B[39m\u001B[38;5;124m\"\u001B[39m])\n\u001B[1;32m 148\u001B[0m block_height \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mint\u001B[39m(peak_configuration[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mblock_height\u001B[39m\u001B[38;5;124m\"\u001B[39m])\n",
"File \u001B[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:121\u001B[0m, in \u001B[0;36mAutotuner.get_peak_performance\u001B[0;34m(self, simulator)\u001B[0m\n\u001B[1;32m 119\u001B[0m logger\u001B[38;5;241m.\u001B[39mdebug(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mCould not get autotuned peak performance for \u001B[39m\u001B[38;5;132;01m%s\u001B[39;00m\u001B[38;5;124m: benchmarking\u001B[39m\u001B[38;5;124m\"\u001B[39m, key)\n\u001B[1;32m 120\u001B[0m data\u001B[38;5;241m.\u001B[39mclose()\n\u001B[0;32m--> 121\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mbenchmark\u001B[49m\u001B[43m(\u001B[49m\u001B[43msimulator\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 122\u001B[0m data \u001B[38;5;241m=\u001B[39m np\u001B[38;5;241m.\u001B[39mload(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mfilename)\n\u001B[1;32m 124\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mfind_max_index\u001B[39m(megacells):\n",
"File \u001B[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:84\u001B[0m, in \u001B[0;36mAutotuner.benchmark\u001B[0;34m(self, simulator, force)\u001B[0m\n\u001B[1;32m 81\u001B[0m benchmark_data[k] \u001B[38;5;241m=\u001B[39m v\n\u001B[1;32m 83\u001B[0m \u001B[38;5;66;03m# Run benchmark\u001B[39;00m\n\u001B[0;32m---> 84\u001B[0m benchmark_data[key \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m_megacells\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;241m=\u001B[39m \u001B[43mAutotuner\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mbenchmark_single_simulator\u001B[49m\u001B[43m(\u001B[49m\u001B[43msimulator\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43marguments\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mblock_widths\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mblock_heights\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 85\u001B[0m benchmark_data[key \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m_block_widths\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mblock_widths\n\u001B[1;32m 86\u001B[0m benchmark_data[key \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m_block_heights\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mblock_heights\n",
"File \u001B[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:162\u001B[0m, in \u001B[0;36mAutotuner.benchmark_single_simulator\u001B[0;34m(simulator, arguments, block_widths, block_heights)\u001B[0m\n\u001B[1;32m 160\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m i, block_width \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28menumerate\u001B[39m(block_widths):\n\u001B[1;32m 161\u001B[0m sim_arguments\u001B[38;5;241m.\u001B[39mupdate({\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mblock_width\u001B[39m\u001B[38;5;124m'\u001B[39m: block_width})\n\u001B[0;32m--> 162\u001B[0m megacells[j, i] \u001B[38;5;241m=\u001B[39m \u001B[43mAutotuner\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mrun_benchmark\u001B[49m\u001B[43m(\u001B[49m\u001B[43msimulator\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43msim_arguments\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 165\u001B[0m logger\u001B[38;5;241m.\u001B[39mdebug(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mCompleted \u001B[39m\u001B[38;5;132;01m%s\u001B[39;00m\u001B[38;5;124m in \u001B[39m\u001B[38;5;132;01m%f\u001B[39;00m\u001B[38;5;124m seconds\u001B[39m\u001B[38;5;124m\"\u001B[39m, simulator\u001B[38;5;241m.\u001B[39m\u001B[38;5;18m__name__\u001B[39m, t\u001B[38;5;241m.\u001B[39msecs)\n\u001B[1;32m 167\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m megacells\n",
"File \u001B[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:200\u001B[0m, in \u001B[0;36mAutotuner.run_benchmark\u001B[0;34m(simulator, arguments, timesteps, warmup_timesteps)\u001B[0m\n\u001B[1;32m 197\u001B[0m end\u001B[38;5;241m.\u001B[39mrecord(sim\u001B[38;5;241m.\u001B[39mstream)\n\u001B[1;32m 199\u001B[0m \u001B[38;5;66;03m#Synchronize end event\u001B[39;00m\n\u001B[0;32m--> 200\u001B[0m \u001B[43mend\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43msynchronize\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 202\u001B[0m \u001B[38;5;66;03m#Compute megacells\u001B[39;00m\n\u001B[1;32m 203\u001B[0m gpu_elapsed \u001B[38;5;241m=\u001B[39m end\u001B[38;5;241m.\u001B[39mtime_since(start)\u001B[38;5;241m*\u001B[39m\u001B[38;5;241m1.0e-3\u001B[39m\n",
"\u001B[0;31mKeyboardInterrupt\u001B[0m: "
]
}
],

View File

@ -34,8 +34,9 @@ from mpi4py import MPI
import pycuda.driver as cuda
# Simulator engine etc
from GPUSimulators import MPISimulator, Common
from GPUSimulators.gpu import CudaContext
from GPUSimulators import MPISimulator
from GPUSimulators.common import common
from GPUSimulators.gpu import cuda_context
from GPUSimulators import EE2D_KP07_dimsplit
from GPUSimulators.helpers import InitialConditions as IC
@ -147,7 +148,7 @@ def genSim(grid, **kwargs):
return sim
outfile, sim_runner_profiling_data, sim_profiling_data = Common.runSimulation(
outfile, sim_runner_profiling_data, sim_profiling_data = common.run_simulation(
genSim, arguments, outfile, save_times, save_var_names, dt)
if(args.profile):
@ -183,8 +184,8 @@ if(args.profile and MPI.COMM_WORLD.rank == 0):
profiling_data["slurm_job_id"] = job_id
profiling_data["n_cuda_devices"] = str(num_cuda_devices)
profiling_data["n_processes"] = str(MPI.COMM_WORLD.size)
profiling_data["git_hash"] = Common.getGitHash()
profiling_data["git_status"] = Common.getGitStatus()
profiling_data["git_hash"] = Common.get_git_hash()
profiling_data["git_status"] = Common.get_git_status()
with open(profiling_file, "w") as write_file:
json.dump(profiling_data, write_file)

View File

@ -25,7 +25,8 @@ import gc
import logging
#Simulator engine etc
from GPUSimulators import SHMEMSimulatorGroup, Common
from GPUSimulators import SHMEMSimulatorGroup
from GPUSimulators.common import common
from GPUSimulators import EE2D_KP07_dimsplit
from GPUSimulators.helpers import InitialConditions as IC
@ -99,7 +100,7 @@ def genSim(sims, grid, **kwargs):
sim = SHMEMSimulatorGroup.SHMEMSimulatorGroup(sims, grid)
return sim
outfile = Common.runSimulation(genSim, arguments, outfile, save_times, save_var_names)
outfile = common.run_simulation(genSim, arguments, outfile, save_times, save_var_names)

View File

@ -28,8 +28,8 @@ import logging
import pycuda.driver as cuda
# Simulator engine etc
from GPUSimulators import Common
from GPUSimulators.gpu import CudaContext
from GPUSimulators.common import common
from GPUSimulators.gpu import cuda_context
from GPUSimulators import EE2D_KP07_dimsplit
from GPUSimulators.helpers import InitialConditions as IC
@ -104,7 +104,7 @@ def genSim(**kwargs):
return local_sim
outfile = Common.runSimulation(
outfile = common.run_simulation(
genSim, arguments, outfile, save_times, save_var_names)
####