refactor(kernel): split Common.py to a separate package

Anthony Berg 2025-06-24 17:34:29 +02:00
parent 8f24cd45ea
commit c54f08c417
39 changed files with 1969 additions and 143694 deletions
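For downstream code, the practical effect of this refactor is that the monolithic `Common` module becomes the `GPUSimulators.common` package, and the CUDA context moves to `GPUSimulators.gpu`. A minimal migration sketch based on the imports that appear in the diffs below (not an exhaustive list of relocated names):

```python
# Before: helpers came from the monolithic Common module.
# from GPUSimulators import Common
# with Common.Timer("tag") as t: ...
# u0 = Common.ArakawaA2D(stream, nx, ny, 2, 2, [h, hu, hv])
# context = CudaContext.CudaContext(autotuning=False)

# After: helpers live in the GPUSimulators.common package,
# and the CUDA context in GPUSimulators.gpu (as used in the diffs below).
from GPUSimulators.common import common, Timer, ArakawaA2D
from GPUSimulators.gpu import CudaContext

context = CudaContext(autotuning=False)
with Timer("example") as t:
    pass  # timed section
```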

View File

@@ -27,16 +27,11 @@
"from matplotlib import pyplot as plt\n",
"from mpl_toolkits.axes_grid1 import make_axes_locatable\n",
"\n",
"import subprocess\n",
"import os\n",
"import gc\n",
"import datetime\n",
"import importlib\n",
"import logging\n",
"from socket import gethostname\n",
"\n",
"import pycuda.driver as cuda\n",
"import pycuda.compiler\n",
"\n",
"try:\n",
" from StringIO import StringIO\n",
@@ -55,7 +50,8 @@
"metadata": {},
"outputs": [],
"source": [
"from GPUSimulators import Common, IPythonMagic, LxF, FORCE, HLL, HLL2, KP07, KP07_dimsplit, WAF, Autotuner"
"from GPUSimulators import LxF, FORCE, HLL, HLL2, KP07, KP07_dimsplit, WAF, Autotuner\n",
"from GPUSimulators.common import common"
]
},
{
@@ -124,14 +120,14 @@
"evalue": "All-NaN slice encountered",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[9], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m simulators \u001b[38;5;241m=\u001b[39m [LxF\u001b[38;5;241m.\u001b[39mLxF, FORCE\u001b[38;5;241m.\u001b[39mFORCE, HLL\u001b[38;5;241m.\u001b[39mHLL, HLL2\u001b[38;5;241m.\u001b[39mHLL2, KP07\u001b[38;5;241m.\u001b[39mKP07, KP07_dimsplit\u001b[38;5;241m.\u001b[39mKP07_dimsplit, WAF\u001b[38;5;241m.\u001b[39mWAF]\n\u001b[0;32m----> 2\u001b[0m peak_performance \u001b[38;5;241m=\u001b[39m [autotuner\u001b[38;5;241m.\u001b[39mget_peak_performance(simulator) \u001b[38;5;28;01mfor\u001b[39;00m simulator \u001b[38;5;129;01min\u001b[39;00m simulators]\n\u001b[1;32m 3\u001b[0m megacells \u001b[38;5;241m=\u001b[39m [performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmegacells\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;28;01mfor\u001b[39;00m performance \u001b[38;5;129;01min\u001b[39;00m peak_performance]\n\u001b[1;32m 4\u001b[0m xlabels \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{:s}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m[\u001b[39m\u001b[38;5;132;01m{:d}\u001b[39;00m\u001b[38;5;124mx\u001b[39m\u001b[38;5;132;01m{:d}\u001b[39;00m\u001b[38;5;124m]\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(simulators[i]\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mblock_width\u001b[39m\u001b[38;5;124m'\u001b[39m], performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mblock_height\u001b[39m\u001b[38;5;124m'\u001b[39m]) \u001b[38;5;28;01mfor\u001b[39;00m i, performance \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(peak_performance)]\n",
"Cell \u001b[0;32mIn[9], line 2\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 1\u001b[0m simulators \u001b[38;5;241m=\u001b[39m [LxF\u001b[38;5;241m.\u001b[39mLxF, FORCE\u001b[38;5;241m.\u001b[39mFORCE, HLL\u001b[38;5;241m.\u001b[39mHLL, HLL2\u001b[38;5;241m.\u001b[39mHLL2, KP07\u001b[38;5;241m.\u001b[39mKP07, KP07_dimsplit\u001b[38;5;241m.\u001b[39mKP07_dimsplit, WAF\u001b[38;5;241m.\u001b[39mWAF]\n\u001b[0;32m----> 2\u001b[0m peak_performance \u001b[38;5;241m=\u001b[39m [\u001b[43mautotuner\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_peak_performance\u001b[49m\u001b[43m(\u001b[49m\u001b[43msimulator\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m simulator \u001b[38;5;129;01min\u001b[39;00m simulators]\n\u001b[1;32m 3\u001b[0m megacells \u001b[38;5;241m=\u001b[39m [performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmegacells\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;28;01mfor\u001b[39;00m performance \u001b[38;5;129;01min\u001b[39;00m peak_performance]\n\u001b[1;32m 4\u001b[0m xlabels \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{:s}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m[\u001b[39m\u001b[38;5;132;01m{:d}\u001b[39;00m\u001b[38;5;124mx\u001b[39m\u001b[38;5;132;01m{:d}\u001b[39;00m\u001b[38;5;124m]\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(simulators[i]\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mblock_width\u001b[39m\u001b[38;5;124m'\u001b[39m], performance[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mblock_height\u001b[39m\u001b[38;5;124m'\u001b[39m]) \u001b[38;5;28;01mfor\u001b[39;00m i, performance \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(peak_performance)]\n",
"File \u001b[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:132\u001b[0m, in \u001b[0;36mAutotuner.get_peak_performance\u001b[0;34m(self, simulator)\u001b[0m\n\u001b[1;32m 130\u001b[0m block_widths \u001b[38;5;241m=\u001b[39m data[key \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_block_widths\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 131\u001b[0m block_heights \u001b[38;5;241m=\u001b[39m data[key \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_block_heights\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[0;32m--> 132\u001b[0m j, i \u001b[38;5;241m=\u001b[39m \u001b[43mfind_max_index\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmegacells\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 134\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mperformance[key] \u001b[38;5;241m=\u001b[39m { \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mblock_width\u001b[39m\u001b[38;5;124m\"\u001b[39m: block_widths[i],\n\u001b[1;32m 135\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mblock_height\u001b[39m\u001b[38;5;124m\"\u001b[39m: block_heights[j],\n\u001b[1;32m 136\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmegacells\u001b[39m\u001b[38;5;124m\"\u001b[39m: megacells[j, i] }\n\u001b[1;32m 137\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mReturning \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m as peak performance parameters\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mperformance[key])\n",
"File \u001b[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:126\u001b[0m, in \u001b[0;36mAutotuner.get_peak_performance.<locals>.find_max_index\u001b[0;34m(megacells)\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfind_max_index\u001b[39m(megacells):\n\u001b[0;32m--> 126\u001b[0m max_index \u001b[38;5;241m=\u001b[39m \u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnanargmax\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmegacells\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 127\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m np\u001b[38;5;241m.\u001b[39munravel_index(max_index, megacells\u001b[38;5;241m.\u001b[39mshape)\n",
"File \u001b[0;32m~/.conda/envs/ShallowWaterGPU/lib/python3.9/site-packages/numpy/lib/nanfunctions.py:613\u001b[0m, in \u001b[0;36mnanargmax\u001b[0;34m(a, axis, out, keepdims)\u001b[0m\n\u001b[1;32m 611\u001b[0m mask \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mall(mask, axis\u001b[38;5;241m=\u001b[39maxis)\n\u001b[1;32m 612\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m np\u001b[38;5;241m.\u001b[39many(mask):\n\u001b[0;32m--> 613\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAll-NaN slice encountered\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 614\u001b[0m res \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39margmax(a, axis\u001b[38;5;241m=\u001b[39maxis, out\u001b[38;5;241m=\u001b[39mout, keepdims\u001b[38;5;241m=\u001b[39mkeepdims)\n\u001b[1;32m 615\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m res\n",
"\u001b[0;31mValueError\u001b[0m: All-NaN slice encountered"
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mValueError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[0;32mIn[9], line 2\u001B[0m\n\u001B[1;32m 1\u001B[0m simulators \u001B[38;5;241m=\u001B[39m [LxF\u001B[38;5;241m.\u001B[39mLxF, FORCE\u001B[38;5;241m.\u001B[39mFORCE, HLL\u001B[38;5;241m.\u001B[39mHLL, HLL2\u001B[38;5;241m.\u001B[39mHLL2, KP07\u001B[38;5;241m.\u001B[39mKP07, KP07_dimsplit\u001B[38;5;241m.\u001B[39mKP07_dimsplit, WAF\u001B[38;5;241m.\u001B[39mWAF]\n\u001B[0;32m----> 2\u001B[0m peak_performance \u001B[38;5;241m=\u001B[39m [autotuner\u001B[38;5;241m.\u001B[39mget_peak_performance(simulator) \u001B[38;5;28;01mfor\u001B[39;00m simulator \u001B[38;5;129;01min\u001B[39;00m simulators]\n\u001B[1;32m 3\u001B[0m megacells \u001B[38;5;241m=\u001B[39m [performance[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mmegacells\u001B[39m\u001B[38;5;124m'\u001B[39m] \u001B[38;5;28;01mfor\u001B[39;00m performance \u001B[38;5;129;01min\u001B[39;00m peak_performance]\n\u001B[1;32m 4\u001B[0m xlabels \u001B[38;5;241m=\u001B[39m [\u001B[38;5;124m\"\u001B[39m\u001B[38;5;132;01m{:s}\u001B[39;00m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[38;5;124m[\u001B[39m\u001B[38;5;132;01m{:d}\u001B[39;00m\u001B[38;5;124mx\u001B[39m\u001B[38;5;132;01m{:d}\u001B[39;00m\u001B[38;5;124m]\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;241m.\u001B[39mformat(simulators[i]\u001B[38;5;241m.\u001B[39m\u001B[38;5;18m__name__\u001B[39m, performance[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mblock_width\u001B[39m\u001B[38;5;124m'\u001B[39m], performance[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mblock_height\u001B[39m\u001B[38;5;124m'\u001B[39m]) \u001B[38;5;28;01mfor\u001B[39;00m i, performance \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28menumerate\u001B[39m(peak_performance)]\n",
"Cell \u001B[0;32mIn[9], line 2\u001B[0m, in \u001B[0;36m<listcomp>\u001B[0;34m(.0)\u001B[0m\n\u001B[1;32m 1\u001B[0m simulators \u001B[38;5;241m=\u001B[39m [LxF\u001B[38;5;241m.\u001B[39mLxF, FORCE\u001B[38;5;241m.\u001B[39mFORCE, HLL\u001B[38;5;241m.\u001B[39mHLL, HLL2\u001B[38;5;241m.\u001B[39mHLL2, KP07\u001B[38;5;241m.\u001B[39mKP07, KP07_dimsplit\u001B[38;5;241m.\u001B[39mKP07_dimsplit, WAF\u001B[38;5;241m.\u001B[39mWAF]\n\u001B[0;32m----> 2\u001B[0m peak_performance \u001B[38;5;241m=\u001B[39m [\u001B[43mautotuner\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_peak_performance\u001B[49m\u001B[43m(\u001B[49m\u001B[43msimulator\u001B[49m\u001B[43m)\u001B[49m \u001B[38;5;28;01mfor\u001B[39;00m simulator \u001B[38;5;129;01min\u001B[39;00m simulators]\n\u001B[1;32m 3\u001B[0m megacells \u001B[38;5;241m=\u001B[39m [performance[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mmegacells\u001B[39m\u001B[38;5;124m'\u001B[39m] \u001B[38;5;28;01mfor\u001B[39;00m performance \u001B[38;5;129;01min\u001B[39;00m peak_performance]\n\u001B[1;32m 4\u001B[0m xlabels \u001B[38;5;241m=\u001B[39m [\u001B[38;5;124m\"\u001B[39m\u001B[38;5;132;01m{:s}\u001B[39;00m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[38;5;124m[\u001B[39m\u001B[38;5;132;01m{:d}\u001B[39;00m\u001B[38;5;124mx\u001B[39m\u001B[38;5;132;01m{:d}\u001B[39;00m\u001B[38;5;124m]\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;241m.\u001B[39mformat(simulators[i]\u001B[38;5;241m.\u001B[39m\u001B[38;5;18m__name__\u001B[39m, performance[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mblock_width\u001B[39m\u001B[38;5;124m'\u001B[39m], performance[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mblock_height\u001B[39m\u001B[38;5;124m'\u001B[39m]) \u001B[38;5;28;01mfor\u001B[39;00m i, performance \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28menumerate\u001B[39m(peak_performance)]\n",
"File \u001B[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:132\u001B[0m, in \u001B[0;36mAutotuner.get_peak_performance\u001B[0;34m(self, simulator)\u001B[0m\n\u001B[1;32m 130\u001B[0m block_widths \u001B[38;5;241m=\u001B[39m data[key \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124m_block_widths\u001B[39m\u001B[38;5;124m'\u001B[39m]\n\u001B[1;32m 131\u001B[0m block_heights \u001B[38;5;241m=\u001B[39m data[key \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124m_block_heights\u001B[39m\u001B[38;5;124m'\u001B[39m]\n\u001B[0;32m--> 132\u001B[0m j, i \u001B[38;5;241m=\u001B[39m \u001B[43mfind_max_index\u001B[49m\u001B[43m(\u001B[49m\u001B[43mmegacells\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 134\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mperformance[key] \u001B[38;5;241m=\u001B[39m { \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mblock_width\u001B[39m\u001B[38;5;124m\"\u001B[39m: block_widths[i],\n\u001B[1;32m 135\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mblock_height\u001B[39m\u001B[38;5;124m\"\u001B[39m: block_heights[j],\n\u001B[1;32m 136\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mmegacells\u001B[39m\u001B[38;5;124m\"\u001B[39m: megacells[j, i] }\n\u001B[1;32m 137\u001B[0m logger\u001B[38;5;241m.\u001B[39mdebug(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mReturning \u001B[39m\u001B[38;5;132;01m%s\u001B[39;00m\u001B[38;5;124m as peak performance parameters\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mperformance[key])\n",
"File \u001B[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:126\u001B[0m, in \u001B[0;36mAutotuner.get_peak_performance.<locals>.find_max_index\u001B[0;34m(megacells)\u001B[0m\n\u001B[1;32m 125\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mfind_max_index\u001B[39m(megacells):\n\u001B[0;32m--> 126\u001B[0m max_index \u001B[38;5;241m=\u001B[39m \u001B[43mnp\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mnanargmax\u001B[49m\u001B[43m(\u001B[49m\u001B[43mmegacells\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 127\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m np\u001B[38;5;241m.\u001B[39munravel_index(max_index, megacells\u001B[38;5;241m.\u001B[39mshape)\n",
"File \u001B[0;32m~/.conda/envs/ShallowWaterGPU/lib/python3.9/site-packages/numpy/lib/nanfunctions.py:613\u001B[0m, in \u001B[0;36mnanargmax\u001B[0;34m(a, axis, out, keepdims)\u001B[0m\n\u001B[1;32m 611\u001B[0m mask \u001B[38;5;241m=\u001B[39m np\u001B[38;5;241m.\u001B[39mall(mask, axis\u001B[38;5;241m=\u001B[39maxis)\n\u001B[1;32m 612\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m np\u001B[38;5;241m.\u001B[39many(mask):\n\u001B[0;32m--> 613\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mAll-NaN slice encountered\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 614\u001B[0m res \u001B[38;5;241m=\u001B[39m np\u001B[38;5;241m.\u001B[39margmax(a, axis\u001B[38;5;241m=\u001B[39maxis, out\u001B[38;5;241m=\u001B[39mout, keepdims\u001B[38;5;241m=\u001B[39mkeepdims)\n\u001B[1;32m 615\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m res\n",
"\u001B[0;31mValueError\u001B[0m: All-NaN slice encountered"
]
}
],
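The `ValueError: All-NaN slice encountered` above comes from `np.nanargmax` in `Autotuner.get_peak_performance`: it is raised when every entry of the benchmark array is NaN, i.e. when every benchmark run failed (note that `run_benchmark` below returns `np.nan` on any failure). A minimal defensive sketch of the inner helper, with a hypothetical guard that is not part of this commit:

```python
import numpy as np

def find_max_index(megacells):
    """Return (row, col) of the fastest configuration, or fail with a clearer message."""
    if np.all(np.isnan(megacells)):
        # Every benchmark returned NaN; np.nanargmax would raise
        # "All-NaN slice encountered" here.
        raise RuntimeError("All benchmark runs failed; no autotuning data to pick from")
    max_index = np.nanargmax(megacells)
    return np.unravel_index(max_index, megacells.shape)
```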

View File

@@ -49,9 +49,6 @@
"import time\n",
"import os\n",
"import gc\n",
"import datetime\n",
"\n",
"import pycuda.driver as cuda\n",
"\n",
"try:\n",
" from StringIO import StringIO\n",
@@ -59,7 +56,8 @@
" from io import StringIO\n",
"\n",
"#Finally, import our simulator\n",
"from GPUSimulators import Common, LxF, FORCE, HLL, HLL2, KP07, KP07_dimsplit, WAF, IPythonMagic"
"from GPUSimulators import LxF, FORCE, HLL, HLL2, KP07, KP07_dimsplit, WAF\n",
"from GPUSimulators.common import common"
]
},
{

View File

@@ -42,15 +42,10 @@
"from mpl_toolkits.axes_grid1 import make_axes_locatable\n",
"#import mpld3\n",
"\n",
"import subprocess\n",
"import socket\n",
"import time\n",
"import os\n",
"import gc\n",
"import datetime\n",
"import logging\n",
"\n",
"import pycuda.driver as cuda\n",
"\n",
"try:\n",
" from StringIO import StringIO\n",
@@ -65,7 +60,8 @@
"outputs": [],
"source": [
"#Finally, import our simulator\n",
"from GPUSimulators import Common, LxF, FORCE, HLL, HLL2, KP07, KP07_dimsplit, WAF, IPythonMagic\n",
"from GPUSimulators import LxF, FORCE, HLL, HLL2, KP07, KP07_dimsplit, WAF\n",
"from GPUSimulators.common import common\n",
"from GPUSimulators.helpers import InitialConditions"
]
},
@@ -250,8 +246,8 @@
" sim.simulate(1.0, dt=dt)\n",
" sim.check()\n",
" \n",
" nt = sim.simSteps()\n",
" dt = sim.simTime() / nt\n",
" nt = sim.sim_steps()\n",
" dt = sim.sim_time() / nt\n",
" h, hu, hv = sim.download()\n",
" \n",
" if (transpose):\n",

View File

@@ -42,15 +42,10 @@
"from mpl_toolkits.axes_grid1 import make_axes_locatable\n",
"#import mpld3\n",
"\n",
"import subprocess\n",
"import socket\n",
"import time\n",
"import os\n",
"import gc\n",
"import datetime\n",
"import logging\n",
"\n",
"import pycuda.driver as cuda\n",
"\n",
"try:\n",
" from StringIO import StringIO\n",
@@ -65,7 +60,8 @@
"outputs": [],
"source": [
"#Finally, import our simulator\n",
"from GPUSimulators import Common, LxF, FORCE, HLL, HLL2, KP07, KP07_dimsplit, WAF, IPythonMagic\n",
"from GPUSimulators import LxF, FORCE, HLL, HLL2, KP07, KP07_dimsplit, WAF\n",
"from GPUSimulators.common import common\n",
"from GPUSimulators.helpers import InitialConditions"
]
},
@@ -250,8 +246,8 @@
" sim.simulate(1.0, dt=dt)\n",
" sim.check()\n",
" \n",
" nt = sim.simSteps()\n",
" dt = sim.simTime() / nt\n",
" nt = sim.sim_steps()\n",
" dt = sim.sim_time() / nt\n",
" h, hu, hv = sim.download()\n",
" \n",
" if (transpose):\n",

File diff suppressed because one or more lines are too long

View File

@@ -29,15 +29,159 @@ from tqdm.auto import tqdm
import pycuda.driver as cuda
from GPUSimulators import Common, Simulator
from GPUSimulators import Simulator
from GPUSimulators.common import common, Timer
from GPUSimulators.gpu import CudaContext
def run_benchmark(simulator, arguments, timesteps=10, warmup_timesteps=2):
"""
Runs a benchmark, and returns the number of megacells achieved
"""
logger = logging.getLogger(__name__)
# Initialize simulator
try:
sim = simulator(**arguments)
except:
# An exception raised - not possible to continue
logger.debug("Failed creating %s with arguments %s", simulator.__name__, str(arguments))
# raise RuntimeError("Failed creating %s with arguments %s", simulator.__name__, str(arguments))
return np.nan
# Create timer events
start = cuda.Event()
end = cuda.Event()
# Warmup
for i in range(warmup_timesteps):
sim.substep(sim.dt, i)
# Run simulation with timer
start.record(sim.stream)
for i in range(timesteps):
sim.substep(sim.dt, i)
end.record(sim.stream)
# Synchronize end event
end.synchronize()
# Compute megacells
gpu_elapsed = end.time_since(start) * 1.0e-3
megacells = (sim.nx * sim.ny * timesteps / (1000 * 1000)) / gpu_elapsed
# Sanity check solution
h, hu, hv = sim.download()
sane = True
sane = sane and sanity_check(h, 0.3, 0.7)
sane = sane and sanity_check(hu, -0.2, 0.2)
sane = sane and sanity_check(hv, -0.2, 0.2)
if sane:
logger.debug("%s [%d x %d] succeeded: %f megacells, gpu elapsed %f", simulator.__name__,
arguments["block_width"], arguments["block_height"], megacells, gpu_elapsed)
return megacells
else:
logger.debug("%s [%d x %d] failed: gpu elapsed %f", simulator.__name__, arguments["block_width"],
arguments["block_height"], gpu_elapsed)
# raise RuntimeError("%s [%d x %d] failed: gpu elapsed %f", simulator.__name__, arguments["block_width"], arguments["block_height"], gpu_elapsed)
return np.nan
def gen_test_data(nx, ny, g):
"""
Generates test dataset
"""
width = 100.0
height = 100.0
dx = width / float(nx)
dy = height / float(ny)
x_center = dx * nx / 2.0
y_center = dy * ny / 2.0
# Create a gaussian "dam break" that will not form shocks
size = width / 5.0
dt = 10 ** 10
h = np.zeros((ny, nx), dtype=np.float32)
hu = np.zeros((ny, nx), dtype=np.float32)
hv = np.zeros((ny, nx), dtype=np.float32)
extent = 1.0 / np.sqrt(2.0)
x = (dx * (np.arange(0, nx, dtype=np.float32) + 0.5) - x_center) / size
y = (dy * (np.arange(0, ny, dtype=np.float32) + 0.5) - y_center) / size
xv, yv = np.meshgrid(x, y, sparse=False, indexing='xy')
r = np.minimum(1.0, np.sqrt(xv ** 2 + yv ** 2))
xv = None
yv = None
gc.collect()
# Generate highres
cos = np.cos(np.pi * r)
h = 0.5 + 0.1 * 0.5 * (1.0 + cos)
hu = 0.1 * 0.5 * (1.0 + cos)
hv = hu.copy()
scale = 0.7
max_h_estimate = 0.6
max_u_estimate = 0.1 * np.sqrt(2.0)
dx = width / nx
dy = height / ny
dt = scale * min(dx, dy) / (max_u_estimate + np.sqrt(g * max_h_estimate))
return h, hu, hv, dx, dy, dt
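The time step chosen at the end of `gen_test_data` is a CFL-type bound for the shallow-water system, built from the rough estimates defined just above it (this is a reading of the code above, not new behaviour):

$$\Delta t \;=\; C\,\frac{\min(\Delta x,\,\Delta y)}{u_{\max} + \sqrt{g\,h_{\max}}},\qquad C = 0.7,\quad u_{\max} \approx 0.1\sqrt{2},\quad h_{\max} \approx 0.6$$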
def sanity_check(variable, bound_min, bound_max):
"""
Checks that a variable is "sane"
"""
maxval = np.amax(variable)
minval = np.amin(variable)
if (np.isnan(maxval)
or np.isnan(minval)
or maxval > bound_max
or minval < bound_min):
return False
else:
return True
def benchmark_single_simulator(simulator, arguments, block_widths, block_heights):
"""
Runs a set of benchmarks for a single simulator
"""
logger = logging.getLogger(__name__)
megacells = np.empty((len(block_heights), len(block_widths)))
megacells.fill(np.nan)
logger.debug("Running %d benchmarks with %s", len(block_heights) * len(block_widths), simulator.__name__)
sim_arguments = arguments.copy()
with Timer(simulator.__name__) as t:
for j, block_height in enumerate(tqdm(block_heights, desc='Autotuner Progress')):
sim_arguments.update({'block_height': block_height})
for i, block_width in enumerate(tqdm(block_widths, desc=f'Iteration {j} Progress', leave=False)):
sim_arguments.update({'block_width': block_width})
megacells[j, i] = run_benchmark(simulator, sim_arguments)
logger.debug("Completed %s in %f seconds", simulator.__name__, t.secs)
return megacells
class Autotuner:
def __init__(self,
nx=2048, ny=2048,
block_widths=range(8, 32, 1),
block_heights=range(8, 32, 1)):
def __init__(self,
nx=2048, ny=2048,
block_widths=range(8, 32, 1),
block_heights=range(8, 32, 1)):
logger = logging.getLogger(__name__)
self.filename = "autotuning_data_" + gethostname() + ".npz"
self.nx = nx
@@ -48,50 +192,51 @@ class Autotuner:
def benchmark(self, simulator, force=False):
logger = logging.getLogger(__name__)
#Run through simulators and benchmark
# Run through simulators and benchmark
key = str(simulator.__name__)
logger.info("Benchmarking %s to %s", key, self.filename)
#If this simulator has been benchmarked already, skip it
if (force==False and os.path.isfile(self.filename)):
# If this simulator has been benchmarked already, skip it
if force == False and os.path.isfile(self.filename):
with np.load(self.filename) as data:
if key in data["simulators"]:
logger.info("%s already benchmarked - skipping", key)
return
# Set arguments to send to the simulators during construction
context = CudaContext.CudaContext(autotuning=False)
context = CudaContext(autotuning=False)
g = 9.81
h0, hu0, hv0, dx, dy, dt = Autotuner.gen_test_data(nx=self.nx, ny=self.ny, g=g)
h0, hu0, hv0, dx, dy, dt = gen_test_data(nx=self.nx, ny=self.ny, g=g)
arguments = {
'context': context,
'h0': h0, 'hu0': hu0, 'hv0': hv0,
'nx': self.nx, 'ny': self.ny,
'dx': dx, 'dy': dy, 'dt': 0.9*dt,
'dx': dx, 'dy': dy, 'dt': 0.9 * dt,
'g': g,
'compile_opts': ['-Wno-deprecated-gpu-targets']
}
}
# Load existing data into memory
benchmark_data = {
"simulators": [],
"simulators": [],
}
if (os.path.isfile(self.filename)):
if os.path.isfile(self.filename):
with np.load(self.filename) as data:
for k, v in data.items():
benchmark_data[k] = v
# Run benchmark
benchmark_data[key + "_megacells"] = Autotuner.benchmark_single_simulator(simulator, arguments, self.block_widths, self.block_heights)
benchmark_data[key + "_megacells"] = benchmark_single_simulator(arguments, self.block_widths,
self.block_heights)
benchmark_data[key + "_block_widths"] = self.block_widths
benchmark_data[key + "_block_heights"] = self.block_heights
benchmark_data[key + "_arguments"] = str(arguments)
existing_sims = benchmark_data["simulators"]
if (isinstance(existing_sims, np.ndarray)):
if isinstance(existing_sims, np.ndarray):
existing_sims = existing_sims.tolist()
if (key not in existing_sims):
if key not in existing_sims:
benchmark_data["simulators"] = existing_sims + [key]
# Save to file
@@ -104,178 +249,40 @@ class Autotuner:
"""
logger = logging.getLogger(__name__)
assert issubclass(simulator, Simulator.BaseSimulator)
key = simulator.__name__
if (key in self.performance):
if key in self.performance:
return self.performance[key]
else:
#Run simulation if required
if (not os.path.isfile(self.filename)):
# Run simulation if required
if not os.path.isfile(self.filename):
logger.debug("Could not get autotuned peak performance for %s: benchmarking", key)
self.benchmark(simulator)
with np.load(self.filename) as data:
if key not in data['simulators']:
logger.debug("Could not get autotuned peak performance for %s: benchmarking", key)
data.close()
self.benchmark(simulator)
data = np.load(self.filename)
def find_max_index(megacells):
max_index = np.nanargmax(megacells)
return np.unravel_index(max_index, megacells.shape)
megacells = data[key + '_megacells']
block_widths = data[key + '_block_widths']
block_heights = data[key + '_block_heights']
j, i = find_max_index(megacells)
self.performance[key] = { "block_width": block_widths[i],
self.performance[key] = {"block_width": block_widths[i],
"block_height": block_heights[j],
"megacells": megacells[j, i] }
"megacells": megacells[j, i]}
logger.debug("Returning %s as peak performance parameters", self.performance[key])
return self.performance[key]
#This should never happen
# This should never happen
raise "Something wrong: Could not get autotuning data!"
return None
def benchmark_single_simulator(simulator, arguments, block_widths, block_heights):
"""
Runs a set of benchmarks for a single simulator
"""
logger = logging.getLogger(__name__)
megacells = np.empty((len(block_heights), len(block_widths)))
megacells.fill(np.nan)
logger.debug("Running %d benchmarks with %s", len(block_heights)*len(block_widths), simulator.__name__)
sim_arguments = arguments.copy()
with Common.Timer(simulator.__name__) as t:
for j, block_height in enumerate(tqdm(block_heights, desc='Autotuner Progress')):
sim_arguments.update({'block_height': block_height})
for i, block_width in enumerate(tqdm(block_widths, desc=f'Iteration {j} Progress', leave=False)):
sim_arguments.update({'block_width': block_width})
megacells[j, i] = Autotuner.run_benchmark(simulator, sim_arguments)
logger.debug("Completed %s in %f seconds", simulator.__name__, t.secs)
return megacells
def run_benchmark(simulator, arguments, timesteps=10, warmup_timesteps=2):
"""
Runs a benchmark, and returns the number of megacells achieved
"""
logger = logging.getLogger(__name__)
#Initialize simulator
try:
sim = simulator(**arguments)
except:
#An exception raised - not possible to continue
logger.debug("Failed creating %s with arguments %s", simulator.__name__, str(arguments))
# raise RuntimeError("Failed creating %s with arguments %s", simulator.__name__, str(arguments))
return np.nan
#Create timer events
start = cuda.Event()
end = cuda.Event()
#Warmup
for i in range(warmup_timesteps):
sim.substep(sim.dt, i)
#Run simulation with timer
start.record(sim.stream)
for i in range(timesteps):
sim.substep(sim.dt, i)
end.record(sim.stream)
#Synchronize end event
end.synchronize()
#Compute megacells
gpu_elapsed = end.time_since(start)*1.0e-3
megacells = (sim.nx*sim.ny*timesteps / (1000*1000)) / gpu_elapsed
#Sanity check solution
h, hu, hv = sim.download()
sane = True
sane = sane and Autotuner.sanity_check(h, 0.3, 0.7)
sane = sane and Autotuner.sanity_check(hu, -0.2, 0.2)
sane = sane and Autotuner.sanity_check(hv, -0.2, 0.2)
if (sane):
logger.debug("%s [%d x %d] succeeded: %f megacells, gpu elapsed %f", simulator.__name__, arguments["block_width"], arguments["block_height"], megacells, gpu_elapsed)
return megacells
else:
logger.debug("%s [%d x %d] failed: gpu elapsed %f", simulator.__name__, arguments["block_width"], arguments["block_height"], gpu_elapsed)
# raise RuntimeError("%s [%d x %d] failed: gpu elapsed %f", simulator.__name__, arguments["block_width"], arguments["block_height"], gpu_elapsed)
return np.nan
def gen_test_data(nx, ny, g):
"""
Generates test dataset
"""
width = 100.0
height = 100.0
dx = width / float(nx)
dy = height / float(ny)
x_center = dx*nx/2.0
y_center = dy*ny/2.0
#Create a gaussian "dam break" that will not form shocks
size = width / 5.0
dt = 10**10
h = np.zeros((ny, nx), dtype=np.float32);
hu = np.zeros((ny, nx), dtype=np.float32);
hv = np.zeros((ny, nx), dtype=np.float32);
extent = 1.0/np.sqrt(2.0)
x = (dx*(np.arange(0, nx, dtype=np.float32)+0.5) - x_center) / size
y = (dy*(np.arange(0, ny, dtype=np.float32)+0.5) - y_center) / size
xv, yv = np.meshgrid(x, y, sparse=False, indexing='xy')
r = np.minimum(1.0, np.sqrt(xv**2 + yv**2))
xv = None
yv = None
gc.collect()
#Generate highres
cos = np.cos(np.pi*r)
h = 0.5 + 0.1*0.5*(1.0 + cos)
hu = 0.1*0.5*(1.0 + cos)
hv = hu.copy()
scale = 0.7
max_h_estimate = 0.6
max_u_estimate = 0.1*np.sqrt(2.0)
dx = width/nx
dy = height/ny
dt = scale * min(dx, dy) / (max_u_estimate + np.sqrt(g*max_h_estimate))
return h, hu, hv, dx, dy, dt
def sanity_check(variable, bound_min, bound_max):
"""
Checks that a variable is "sane"
"""
maxval = np.amax(variable)
minval = np.amin(variable)
if (np.isnan(maxval)
or np.isnan(minval)
or maxval > bound_max
or minval < bound_min):
return False
else:
return True
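After the split, the former `Autotuner` static methods are module-level functions in `GPUSimulators/Autotuner.py`. A minimal usage sketch under that assumption (the grid size and simulator choice here are arbitrary illustrations):

```python
from GPUSimulators import LxF
from GPUSimulators.Autotuner import Autotuner, gen_test_data, run_benchmark
from GPUSimulators.gpu import CudaContext

g = 9.81
h0, hu0, hv0, dx, dy, dt = gen_test_data(nx=256, ny=256, g=g)

context = CudaContext(autotuning=False)
arguments = {
    'context': context,
    'h0': h0, 'hu0': hu0, 'hv0': hv0,
    'nx': 256, 'ny': 256,
    'dx': dx, 'dy': dy, 'dt': 0.9 * dt,
    'g': g,
    'block_width': 16, 'block_height': 16,
}

# One benchmark run; returns np.nan if construction or the sanity checks fail.
megacells = run_benchmark(LxF.LxF, arguments)

# The Autotuner class still drives the full block-size sweep and caches
# results to autotuning_data_<hostname>.npz.
tuner = Autotuner(nx=2048, ny=2048)
tuner.benchmark(LxF.LxF)
print(tuner.get_peak_performance(LxF.LxF))
```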

View File

@@ -1,758 +0,0 @@
# -*- coding: utf-8 -*-
"""
This python module implements the different helper functions and
classes
Copyright (C) 2018 SINTEF ICT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import os
import numpy as np
import time
import signal
import subprocess
import tempfile
import re
import io
import hashlib
import logging
import gc
import netCDF4
import json
import pycuda.compiler as cuda_compiler
import pycuda.gpuarray
import pycuda.driver as cuda
from pycuda.tools import PageLockedMemoryPool
def safeCall(cmd):
logger = logging.getLogger(__name__)
try:
#git rev-parse HEAD
current_dir = os.path.dirname(os.path.realpath(__file__))
params = dict()
params['stderr'] = subprocess.STDOUT
params['cwd'] = current_dir
params['universal_newlines'] = True #text=True in more recent python
params['shell'] = False
if os.name == 'nt':
params['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
stdout = subprocess.check_output(cmd, **params)
except subprocess.CalledProcessError as e:
output = e.output
logger.error("Git failed, \nReturn code: " + str(e.returncode) + "\nOutput: " + output)
raise e
return stdout
def getGitHash():
return safeCall(["git", "rev-parse", "HEAD"])
def getGitStatus():
return safeCall(["git", "status", "--porcelain", "-uno"])
def toJson(in_dict, compressed=True):
"""
Creates JSON string from a dictionary
"""
logger = logging.getLogger(__name__)
out_dict = in_dict.copy()
for key in out_dict:
if isinstance(out_dict[key], np.ndarray):
out_dict[key] = out_dict[key].tolist()
else:
try:
json.dumps(out_dict[key])
except:
value = str(out_dict[key])
logger.warning("JSON: Converting {:s} to string ({:s})".format(key, value))
out_dict[key] = value
return json.dumps(out_dict)
def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names=[], dt=None):
"""
Runs a simulation, and stores output in netcdf file. Stores the times given in
save_times, and saves all of the variables in list save_var_names. Elements in
save_var_names can be set to None if you do not want to save them
"""
profiling_data_sim_runner = { 'start': {}, 'end': {} }
profiling_data_sim_runner["start"]["t_sim_init"] = 0
profiling_data_sim_runner["end"]["t_sim_init"] = 0
profiling_data_sim_runner["start"]["t_nc_write"] = 0
profiling_data_sim_runner["end"]["t_nc_write"] = 0
profiling_data_sim_runner["start"]["t_full_step"] = 0
profiling_data_sim_runner["end"]["t_full_step"] = 0
profiling_data_sim_runner["start"]["t_sim_init"] = time.time()
logger = logging.getLogger(__name__)
assert len(save_times) > 0, "Need to specify which times to save"
with Timer("construct") as t:
sim = simulator(**simulator_args)
logger.info("Constructed in " + str(t.secs) + " seconds")
#Create netcdf file and simulate
with DataDumper(outfile, mode='w', clobber=False) as outdata:
#Create attributes (metadata)
outdata.ncfile.created = time.ctime(time.time())
outdata.ncfile.git_hash = getGitHash()
outdata.ncfile.git_status = getGitStatus()
outdata.ncfile.simulator = str(simulator)
# do not write fields to attributes (they are too large)
simulator_args_for_ncfile = simulator_args.copy()
del simulator_args_for_ncfile["rho"]
del simulator_args_for_ncfile["rho_u"]
del simulator_args_for_ncfile["rho_v"]
del simulator_args_for_ncfile["E"]
outdata.ncfile.sim_args = toJson(simulator_args_for_ncfile)
#Create dimensions
outdata.ncfile.createDimension('time', len(save_times))
outdata.ncfile.createDimension('x', simulator_args['nx'])
outdata.ncfile.createDimension('y', simulator_args['ny'])
#Create variables for dimensions
ncvars = {}
ncvars['time'] = outdata.ncfile.createVariable('time', np.dtype('float32').char, 'time')
ncvars['x'] = outdata.ncfile.createVariable( 'x', np.dtype('float32').char, 'x')
ncvars['y'] = outdata.ncfile.createVariable( 'y', np.dtype('float32').char, 'y')
#Fill variables with proper values
ncvars['time'][:] = save_times
extent = sim.getExtent()
ncvars['x'][:] = np.linspace(extent[0], extent[1], simulator_args['nx'])
ncvars['y'][:] = np.linspace(extent[2], extent[3], simulator_args['ny'])
#Choose which variables to download (prune None from list, but keep the index)
download_vars = []
for i, var_name in enumerate(save_var_names):
if var_name is not None:
download_vars += [i]
save_var_names = list(save_var_names[i] for i in download_vars)
#Create variables
for var_name in save_var_names:
ncvars[var_name] = outdata.ncfile.createVariable(var_name, np.dtype('float32').char, ('time', 'y', 'x'), zlib=True, least_significant_digit=3)
#Create step sizes between each save
t_steps = np.empty_like(save_times)
t_steps[0] = save_times[0]
t_steps[1:] = save_times[1:] - save_times[0:-1]
profiling_data_sim_runner["end"]["t_sim_init"] = time.time()
#Start simulation loop
progress_printer = ProgressPrinter(save_times[-1], print_every=10)
for k in range(len(save_times)):
#Get target time and step size there
t_step = t_steps[k]
t_end = save_times[k]
#Sanity check simulator
try:
sim.check()
except AssertionError as e:
logger.error("Error after {:d} steps (t={:f}: {:s}".format(sim.simSteps(), sim.simTime(), str(e)))
return outdata.filename
profiling_data_sim_runner["start"]["t_full_step"] += time.time()
#Simulate
if (t_step > 0.0):
sim.simulate(t_step, dt)
profiling_data_sim_runner["end"]["t_full_step"] += time.time()
profiling_data_sim_runner["start"]["t_nc_write"] += time.time()
#Download
save_vars = sim.download(download_vars)
#Save to file
for i, var_name in enumerate(save_var_names):
ncvars[var_name][k, :] = save_vars[i]
profiling_data_sim_runner["end"]["t_nc_write"] += time.time()
#Write progress to screen
print_string = progress_printer.getPrintString(t_end)
if (print_string):
logger.debug(print_string)
logger.debug("Simulated to t={:f} in {:d} timesteps (average dt={:f})".format(t_end, sim.simSteps(), sim.simTime() / sim.simSteps()))
return outdata.filename, profiling_data_sim_runner, sim.profiling_data_mpi
class Timer(object):
"""
Class which keeps track of time spent for a section of code
"""
def __init__(self, tag, log_level=logging.DEBUG):
self.tag = tag
self.log_level = log_level
self.logger = logging.getLogger(__name__)
def __enter__(self):
self.start = time.time()
return self
def __exit__(self, *args):
self.end = time.time()
self.secs = self.end - self.start
self.msecs = self.secs * 1000 # millisecs
self.logger.log(self.log_level, "%s: %f ms", self.tag, self.msecs)
def elapsed(self):
return time.time() - self.start
class PopenFileBuffer(object):
"""
Simple class for holding a set of tempfiles
for communicating with a subprocess
"""
def __init__(self):
self.stdout = tempfile.TemporaryFile(mode='w+t')
self.stderr = tempfile.TemporaryFile(mode='w+t')
def __del__(self):
self.stdout.close()
self.stderr.close()
def read(self):
self.stdout.seek(0)
cout = self.stdout.read()
self.stdout.seek(0, 2)
self.stderr.seek(0)
cerr = self.stderr.read()
self.stderr.seek(0, 2)
return cout, cerr
class IPEngine(object):
"""
Class for starting IPEngines for MPI processing in IPython
"""
def __init__(self, n_engines):
self.logger = logging.getLogger(__name__)
#Start ipcontroller
self.logger.info("Starting IPController")
self.c_buff = PopenFileBuffer()
c_cmd = ["ipcontroller", "--ip='*'"]
c_params = dict()
c_params['stderr'] = self.c_buff.stderr
c_params['stdout'] = self.c_buff.stdout
c_params['shell'] = False
if os.name == 'nt':
c_params['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
self.c = subprocess.Popen(c_cmd, **c_params)
#Wait until controller is running
time.sleep(3)
#Start engines
self.logger.info("Starting IPEngines")
self.e_buff = PopenFileBuffer()
e_cmd = ["mpiexec", "-n", str(n_engines), "ipengine", "--mpi"]
e_params = dict()
e_params['stderr'] = self.e_buff.stderr
e_params['stdout'] = self.e_buff.stdout
e_params['shell'] = False
if os.name == 'nt':
e_params['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
self.e = subprocess.Popen(e_cmd, **e_params)
# attach to a running cluster
import ipyparallel
self.cluster = ipyparallel.Client()#profile='mpi')
time.sleep(3)
while(len(self.cluster.ids) != n_engines):
time.sleep(0.5)
self.logger.info("Waiting for cluster...")
self.cluster = ipyparallel.Client()#profile='mpi')
self.logger.info("Done")
def __del__(self):
self.shutdown()
def shutdown(self):
if (self.e is not None):
if (os.name == 'nt'):
self.logger.warn("Sending CTRL+C to IPEngine")
self.e.send_signal(signal.CTRL_C_EVENT)
try:
self.e.communicate(timeout=3)
self.e.kill()
except subprocess.TimeoutExpired:
self.logger.warn("Killing IPEngine")
self.e.kill()
self.e.communicate()
self.e = None
cout, cerr = self.e_buff.read()
self.logger.info("IPEngine cout: {:s}".format(cout))
self.logger.info("IPEngine cerr: {:s}".format(cerr))
self.e_buff = None
gc.collect()
if (self.c is not None):
if (os.name == 'nt'):
self.logger.warn("Sending CTRL+C to IPController")
self.c.send_signal(signal.CTRL_C_EVENT)
try:
self.c.communicate(timeout=3)
self.c.kill()
except subprocess.TimeoutExpired:
self.logger.warn("Killing IPController")
self.c.kill()
self.c.communicate()
self.c = None
cout, cerr = self.c_buff.read()
self.logger.info("IPController cout: {:s}".format(cout))
self.logger.info("IPController cerr: {:s}".format(cerr))
self.c_buff = None
gc.collect()
class DataDumper(object):
"""
Simple class for holding a netCDF4 object
(handles opening and closing in a nice way)
Use as
with DataDumper("filename") as data:
...
"""
def __init__(self, filename, *args, **kwargs):
self.logger = logging.getLogger(__name__)
#Create directory if needed
filename = os.path.abspath(filename)
dirname = os.path.dirname(filename)
if dirname and not os.path.isdir(dirname):
self.logger.info("Creating directory " + dirname)
os.makedirs(dirname)
#Get mode of file if we have that
mode = None
if (args):
mode = args[0]
elif (kwargs and 'mode' in kwargs.keys()):
mode = kwargs['mode']
#Create new unique file if writing
if (mode):
if (("w" in mode) or ("+" in mode) or ("a" in mode)):
i = 0
stem, ext = os.path.splitext(filename)
while (os.path.isfile(filename)):
filename = "{:s}_{:04d}{:s}".format(stem, i, ext)
i = i+1
self.filename = os.path.abspath(filename)
#Save arguments
self.args = args
self.kwargs = kwargs
#Log output
self.logger.info("Initialized " + self.filename)
def __enter__(self):
self.logger.info("Opening " + self.filename)
if (self.args):
self.logger.info("Arguments: " + str(self.args))
if (self.kwargs):
self.logger.info("Keyword arguments: " + str(self.kwargs))
self.ncfile = netCDF4.Dataset(self.filename, *self.args, **self.kwargs)
return self
def __exit__(self, *args):
self.logger.info("Closing " + self.filename)
self.ncfile.close()
def toJson(in_dict):
out_dict = in_dict.copy()
for key in out_dict:
if isinstance(out_dict[key], np.ndarray):
out_dict[key] = out_dict[key].tolist()
else:
try:
json.dumps(out_dict[key])
except:
out_dict[key] = str(out_dict[key])
return json.dumps(out_dict)
class ProgressPrinter(object):
"""
Small helper class for printing simulation progress
"""
def __init__(self, total_steps, print_every=5):
self.logger = logging.getLogger(__name__)
self.start = time.time()
self.total_steps = total_steps
self.print_every = print_every
self.next_print_time = self.print_every
self.last_step = 0
self.secs_per_iter = None
def getPrintString(self, step):
elapsed = time.time() - self.start
if (elapsed > self.next_print_time):
dt = elapsed - (self.next_print_time - self.print_every)
dsteps = step - self.last_step
steps_remaining = self.total_steps - step
if (dsteps == 0):
return
self.last_step = step
self.next_print_time = elapsed + self.print_every
if not self.secs_per_iter:
self.secs_per_iter = dt / dsteps
self.secs_per_iter = 0.2*self.secs_per_iter + 0.8*(dt / dsteps)
remaining_time = steps_remaining * self.secs_per_iter
return "{:s}. Total: {:s}, elapsed: {:s}, remaining: {:s}".format(
ProgressPrinter.progressBar(step, self.total_steps),
ProgressPrinter.timeString(elapsed + remaining_time),
ProgressPrinter.timeString(elapsed),
ProgressPrinter.timeString(remaining_time))
def timeString(seconds):
seconds = int(max(seconds, 1))
minutes, seconds = divmod(seconds, 60)
hours, minutes = divmod(minutes, 60)
periods = [('h', hours), ('m', minutes), ('s', seconds)]
time_string = ' '.join('{}{}'.format(value, name)
for name, value in periods
if value)
return time_string
def progressBar(step, total_steps, width=30):
progress = np.round(width * step / total_steps).astype(np.int32)
progressbar = "0% [" + "#"*(progress) + "="*(width-progress) + "] 100%"
return progressbar
class CudaArray2D:
"""
Class that holds 2D data
"""
def __init__(self, stream, nx, ny, x_halo, y_halo, cpu_data=None, dtype=np.float32):
"""
Uploads initial data to the CUDA device
"""
self.logger = logging.getLogger(__name__)
self.nx = nx
self.ny = ny
self.x_halo = x_halo
self.y_halo = y_halo
nx_halo = nx + 2*x_halo
ny_halo = ny + 2*y_halo
#self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny)
#Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
self.data = pycuda.gpuarray.zeros((ny_halo, nx_halo), dtype)
#For returning to download
self.memorypool = PageLockedMemoryPool()
#If we don't have any data, just allocate and return
if cpu_data is None:
return
#Make sure data is in proper format
assert cpu_data.shape == (ny_halo, nx_halo) or cpu_data.shape == (self.ny, self.nx), "Wrong shape of data %s vs %s / %s" % (str(cpu_data.shape), str((self.ny, self.nx)), str((ny_halo, nx_halo)))
assert cpu_data.itemsize == 4, "Wrong size of data type"
assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"
#Create copy object from host to device
x = (nx_halo - cpu_data.shape[1]) // 2
y = (ny_halo - cpu_data.shape[0]) // 2
self.upload(stream, cpu_data, extent=[x, y, cpu_data.shape[1], cpu_data.shape[0]])
#self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
def __del__(self, *args):
#self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
self.data.gpudata.free()
self.data = None
def download(self, stream, cpu_data=None, asynch=False, extent=None):
"""
Enables downloading data from GPU to Python
"""
if (extent is None):
x = self.x_halo
y = self.y_halo
nx = self.nx
ny = self.ny
else:
x, y, nx, ny = extent
if (cpu_data is None):
#self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
#Allocate host memory
#The following fails, don't know why (crashes python)
cpu_data = cuda.pagelocked_empty((int(ny), int(nx)), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE)
#Non-pagelocked: cpu_data = np.empty((ny, nx), dtype=np.float32)
#cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32)
assert nx == cpu_data.shape[1]
assert ny == cpu_data.shape[0]
assert x+nx <= self.nx + 2*self.x_halo
assert y+ny <= self.ny + 2*self.y_halo
#Create copy object from device to host
copy = cuda.Memcpy2D()
copy.set_src_device(self.data.gpudata)
copy.set_dst_host(cpu_data)
#Set offsets and pitch of source
copy.src_x_in_bytes = int(x)*self.data.strides[1]
copy.src_y = int(y)
copy.src_pitch = self.data.strides[0]
#Set width in bytes to copy for each row and
#number of rows to copy
copy.width_in_bytes = int(nx)*cpu_data.itemsize
copy.height = int(ny)
copy(stream)
if asynch==False:
stream.synchronize()
return cpu_data
def upload(self, stream, cpu_data, extent=None):
if (extent is None):
x = self.x_halo
y = self.y_halo
nx = self.nx
ny = self.ny
else:
x, y, nx, ny = extent
assert(nx == cpu_data.shape[1])
assert(ny == cpu_data.shape[0])
assert(x+nx <= self.nx + 2*self.x_halo)
assert(y+ny <= self.ny + 2*self.y_halo)
#Create copy object from device to host
copy = cuda.Memcpy2D()
copy.set_dst_device(self.data.gpudata)
copy.set_src_host(cpu_data)
#Set offsets and pitch of source
copy.dst_x_in_bytes = int(x)*self.data.strides[1]
copy.dst_y = int(y)
copy.dst_pitch = self.data.strides[0]
#Set width in bytes to copy for each row and
#number of rows to copy
copy.width_in_bytes = int(nx)*cpu_data.itemsize
copy.height = int(ny)
copy(stream)
class CudaArray3D:
"""
Class that holds 3D data
"""
def __init__(self, stream, nx, ny, nz, x_halo, y_halo, z_halo, cpu_data=None, dtype=np.float32):
"""
Uploads initial data to the CUDA device
"""
self.logger = logging.getLogger(__name__)
self.nx = nx
self.ny = ny
self.nz = nz
self.x_halo = x_halo
self.y_halo = y_halo
self.z_halo = z_halo
nx_halo = nx + 2*x_halo
ny_halo = ny + 2*y_halo
nz_halo = nz + 2*z_halo
#self.logger.debug("Allocating [%dx%dx%d] buffer", self.nx, self.ny, self.nz)
#Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
self.data = pycuda.gpuarray.zeros((nz_halo, ny_halo, nx_halo), dtype)
#For returning to download
self.memorypool = PageLockedMemoryPool()
#If we don't have any data, just allocate and return
if cpu_data is None:
return
#Make sure data is in proper format
assert cpu_data.shape == (nz_halo, ny_halo, nx_halo) or cpu_data.shape == (self.nz, self.ny, self.nx), "Wrong shape of data %s vs %s / %s" % (str(cpu_data.shape), str((self.nz, self.ny, self.nx)), str((nz_halo, ny_halo, nx_halo)))
assert cpu_data.itemsize == 4, "Wrong size of data type"
assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"
#Create copy object from host to device
copy = cuda.Memcpy3D()
copy.set_src_host(cpu_data)
copy.set_dst_device(self.data.gpudata)
#Set offsets of destination
x_offset = (nx_halo - cpu_data.shape[2]) // 2
y_offset = (ny_halo - cpu_data.shape[1]) // 2
z_offset = (nz_halo - cpu_data.shape[0]) // 2
copy.dst_x_in_bytes = x_offset*self.data.strides[1]
copy.dst_y = y_offset
copy.dst_z = z_offset
#Set pitch of destination
copy.dst_pitch = self.data.strides[0]
#Set width in bytes to copy for each row and
#number of rows to copy
width = max(self.nx, cpu_data.shape[2])
height = max(self.ny, cpu_data.shape[1])
depth = max(self.nz, cpu_data.shape[0])
copy.width_in_bytes = width*cpu_data.itemsize
copy.height = height
copy.depth = depth
#Perform the copy
copy(stream)
#self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
def __del__(self, *args):
#self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
self.data.gpudata.free()
self.data = None
def download(self, stream, asynch=False):
"""
Enables downloading data from GPU to Python
"""
#self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
#Allocate host memory
#cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)
#cpu_data = np.empty((self.nz, self.ny, self.nx), dtype=np.float32)
cpu_data = self.memorypool.allocate((self.nz, self.ny, self.nx), dtype=np.float32)
#Create copy object from device to host
copy = cuda.Memcpy2D()
copy.set_src_device(self.data.gpudata)
copy.set_dst_host(cpu_data)
#Set offsets and pitch of source
copy.src_x_in_bytes = self.x_halo*self.data.strides[1]
copy.src_y = self.y_halo
copy.src_z = self.z_halo
copy.src_pitch = self.data.strides[0]
#Set width in bytes to copy for each row and
#number of rows to copy
copy.width_in_bytes = self.nx*cpu_data.itemsize
copy.height = self.ny
copy.depth = self.nz
copy(stream)
if asynch==False:
stream.synchronize()
return cpu_data
class ArakawaA2D:
"""
A class representing an Arakawa A type (unstaggered, logically Cartesian) grid
"""
def __init__(self, stream, nx, ny, halo_x, halo_y, cpu_variables):
"""
Uploads initial data to the GPU device
"""
self.logger = logging.getLogger(__name__)
self.gpu_variables = []
for cpu_variable in cpu_variables:
self.gpu_variables += [CudaArray2D(stream, nx, ny, halo_x, halo_y, cpu_variable)]
def __getitem__(self, key):
assert type(key) == int, "Indexing is int based"
if (key >= len(self.gpu_variables) or key < 0):
raise IndexError("Out of bounds")
return self.gpu_variables[key]
def download(self, stream, variables=None):
"""
Enables downloading data from the GPU device to Python
"""
if variables is None:
variables=range(len(self.gpu_variables))
cpu_variables = []
for i in variables:
assert i < len(self.gpu_variables), "Variable {:d} is out of range".format(i)
cpu_variables += [self.gpu_variables[i].download(stream, asynch=True)]
#stream.synchronize()
return cpu_variables
def check(self):
"""
Checks that data is still sane
"""
for i, gpu_variable in enumerate(self.gpu_variables):
var_sum = pycuda.gpuarray.sum(gpu_variable.data).get()
self.logger.debug("Data %d with size [%d x %d] has average %f", i, gpu_variable.nx, gpu_variable.ny, var_sum / (gpu_variable.nx * gpu_variable.ny))
assert np.isnan(var_sum) == False, "Data contains NaN values!"
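The classes removed above are relocated rather than dropped: elsewhere in this commit `Timer`, `ArakawaA2D`, and the `common` submodule are imported from the new `GPUSimulators.common` package. A short sketch of the relocated `Timer` context manager (behaviour taken from the class above; import path as used by the refactored `Autotuner.py`):

```python
import time
from GPUSimulators.common import Timer  # previously GPUSimulators.Common.Timer

# Timer logs the elapsed wall-clock time of a code block and exposes
# .secs / .msecs after the block exits.
with Timer("example-section") as t:
    time.sleep(0.1)  # stand-in for real work
print(f"{t.secs:.3f} s ({t.msecs:.1f} ms)")
```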

View File

@@ -19,29 +19,29 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators.common import ArakawaA2D
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
class EE2D_KP07_dimsplit (BaseSimulator):
class EE2D_KP07_dimsplit(BaseSimulator):
"""
Class that solves the Euler equations using the dimensionally split KP07 scheme
"""
def __init__(self,
context,
rho, rho_u, rho_v, E,
nx, ny,
dx, dy,
g,
gamma,
theta=1.3,
def __init__(self,
context,
rho, rho_u, rho_v, E,
nx, ny,
dx, dy,
g,
gamma,
theta=1.3,
cfl_scale=0.9,
boundary_conditions=BoundaryCondition(),
boundary_conditions=BoundaryCondition(),
block_width=16, block_height=8):
"""
Initialization routine
@@ -60,77 +60,76 @@ class EE2D_KP07_dimsplit (BaseSimulator):
gamma: Gas constant
p: pressure
"""
# Call super constructor
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
2,
block_width, block_height)
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
2,
block_width, block_height)
self.g = np.float32(g)
self.gamma = np.float32(gamma)
self.theta = np.float32(theta)
self.theta = np.float32(theta)
#Get kernels
module = context.get_module("cuda/EE2D_KP07_dimsplit.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"],
},
jit_compile_args={})
# Get kernels
module = context.get_module("cuda/EE2D_KP07_dimsplit.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"],
},
jit_compile_args={})
self.kernel = module.get_function("KP07DimsplitKernel")
self.kernel.prepare("iiffffffiiPiPiPiPiPiPiPiPiPiiii")
#Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream,
nx, ny,
2, 2,
[rho, rho_u, rho_v, E])
self.u1 = Common.ArakawaA2D(self.stream,
nx, ny,
2, 2,
[None, None, None, None])
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
nx, ny,
2, 2,
[rho, rho_u, rho_v, E])
self.u1 = ArakawaA2D(self.stream,
nx, ny,
2, 2,
[None, None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
dt_x = np.min(self.dx / (np.abs(rho_u/rho) + np.sqrt(gamma*rho)))
dt_y = np.min(self.dy / (np.abs(rho_v/rho) + np.sqrt(gamma*rho)))
dt_x = np.min(self.dx / (np.abs(rho_u / rho) + np.sqrt(gamma * rho)))
dt_y = np.min(self.dy / (np.abs(rho_v / rho) + np.sqrt(gamma * rho)))
self.dt = min(dt_x, dt_y)
self.cfl_data.fill(self.dt, stream=self.stream)
def substep(self, dt, step_number, external=True, internal=True):
self.substepDimsplit(0.5*dt, step_number, external, internal)
def substepDimsplit(self, dt, substep, external, internal):
if external and internal:
#print("COMPLETE DOMAIN (dt=" + str(dt) + ")")
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
def substep(self, dt, step_number, external=True, internal=True):
self.substep_dimsplit(0.5 * dt, step_number, external, internal)
def substep_dimsplit(self, dt, substep, external, internal):
if external and internal:
# print("COMPLETE DOMAIN (dt=" + str(dt) + ")")
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
return
if external and not internal:
###################################
# XXX: Corners are treated twice! #
@@ -141,136 +140,135 @@ class EE2D_KP07_dimsplit (BaseSimulator):
# NORTH
# (x0, y0) x (x1, y1)
# (0, ny-y_halo) x (nx, ny)
self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
0, self.ny - int(self.u0[0].y_halo),
self.nx, self.ny)
self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
0, self.ny - int(self.u0[0].y_halo),
self.nx, self.ny)
# SOUTH
# (x0, y0) x (x1, y1)
# (0, 0) x (nx, y_halo)
self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, int(self.u0[0].y_halo))
self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, int(self.u0[0].y_halo))
we_grid_size = (1, self.grid_size[1])
# WEST
# (x0, y0) x (x1, y1)
# (0, 0) x (x_halo, ny)
self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
0, 0,
int(self.u0[0].x_halo), self.ny)
self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
0, 0,
int(self.u0[0].x_halo), self.ny)
# EAST
# (x0, y0) x (x1, y1)
# (nx-x_halo, 0) x (nx, ny)
self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
self.nx - int(self.u0[0].x_halo), 0,
self.nx, self.ny)
self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
self.nx - int(self.u0[0].x_halo), 0,
self.nx, self.ny)
return
if internal and not external:
# INTERNAL DOMAIN
# (x0, y0) x (x1, y1)
# (x_halo, y_halo) x (nx - x_halo, ny - y_halo)
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.internal_stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
int(self.u0[0].x_halo), int(self.u0[0].y_halo),
self.nx - int(self.u0[0].x_halo), self.ny - int(self.u0[0].y_halo))
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.internal_stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
int(self.u0[0].x_halo), int(self.u0[0].y_halo),
self.nx - int(self.u0[0].x_halo), self.ny - int(self.u0[0].y_halo))
return
def swapBuffers(self):
def swap_buffers(self):
self.u0, self.u1 = self.u1, self.u0
return
def getOutput(self):
def get_output(self):
return self.u0
def check(self):
self.u0.check()
self.u1.check()
return
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt*0.5
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
return max_dt * 0.5
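
The external-only branch above launches one kernel per boundary strip, and the extents in the comments translate directly into (x0, y0, x1, y1) rectangles. A small pure-Python sketch of those rectangles (the helper name and the example sizes are illustrative only, not part of the module):

def boundary_strips(nx, ny, gc_x, gc_y):
    # Mirrors the NORTH/SOUTH/WEST/EAST extents used in substep_dimsplit above.
    return {
        "north": (0, ny - gc_y, nx, ny),
        "south": (0, 0, nx, gc_y),
        "west": (0, 0, gc_x, ny),
        "east": (nx - gc_x, 0, nx, ny),
    }

print(boundary_strips(nx=64, ny=64, gc_x=2, gc_y=2))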

View File

@ -20,30 +20,31 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators.common import ArakawaA2D
from GPUSimulators import Simulator
from GPUSimulators.Simulator import BoundaryCondition
class FORCE (Simulator.BaseSimulator):
class FORCE(Simulator.BaseSimulator):
"""
Class that solves the SW equations
"""
def __init__(self,
context,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
def __init__(self,
context,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
cfl_scale=0.9,
boundary_conditions=BoundaryCondition(),
boundary_conditions=BoundaryCondition(),
block_width=16, block_height=16,
dt: float=None,
compile_opts: list[str]=[]):
dt: float = None,
compile_opts: list[str] = []):
"""
Initialization routine
@ -59,76 +60,76 @@ class FORCE (Simulator.BaseSimulator):
g: Gravitational acceleration (9.81 m/s^2)
compile_opts: Pass a list of nvcc compiler options
"""
# Call super constructor
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
1,
block_width, block_height)
self.g = np.float32(g)
#Get kernels
# Call super constructor
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
1,
block_width, block_height)
self.g = np.float32(g)
# Get kernels
module = context.get_module("cuda/SWE2D_FORCE.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
self.kernel = module.get_function("FORCEKernel")
self.kernel.prepare("iiffffiPiPiPiPiPiPiPiiii")
#Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream,
nx, ny,
1, 1,
[h0, hu0, hv0])
self.u1 = Common.ArakawaA2D(self.stream,
nx, ny,
1, 1,
[None, None, None])
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
nx, ny,
1, 1,
[h0, hu0, hv0])
self.u1 = ArakawaA2D(self.stream,
nx, ny,
1, 1,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
if dt == None:
dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
if dt is None:
dt_x = np.min(self.dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
dt_y = np.min(self.dy / (np.abs(hv0 / h0) + np.sqrt(g * h0)))
self.dt = min(dt_x, dt_y)
else:
self.dt = dt
self.cfl_data.fill(self.dt, stream=self.stream)
def substep(self, dt, step_number):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.u0, self.u1 = self.u1, self.u0
def getOutput(self):
def get_output(self):
return self.u0
def check(self):
self.u0.check()
self.u1.check()
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
return max_dt
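
The dt_x/dt_y expressions in the constructor are the usual shallow-water CFL estimate: cell size divided by the maximum signal speed |u| + sqrt(g*h). A standalone sketch of the same arithmetic (values are illustrative):

import numpy as np

g, dx, dy = 9.81, 10.0, 10.0
h0 = np.full((4, 4), 2.0)    # water depth at rest
hu0 = np.zeros_like(h0)      # x-momentum
hv0 = np.zeros_like(h0)      # y-momentum

dt_x = np.min(dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
dt_y = np.min(dy / (np.abs(hv0 / h0) + np.sqrt(g * h0)))
print(min(dt_x, dt_y))       # ~2.26 s for this resting state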

View File

@ -19,30 +19,31 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators import Simulator
from GPUSimulators.common import ArakawaA2D
from GPUSimulators.Simulator import BoundaryCondition
class HLL (Simulator.BaseSimulator):
class HLL(Simulator.BaseSimulator):
"""
Class that solves the SW equations using the Harten-Lax-van Leer approximate Riemann solver
"""
def __init__(self,
def __init__(self,
context,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
cfl_scale=0.9,
boundary_conditions=BoundaryCondition(),
boundary_conditions=BoundaryCondition(),
block_width=16, block_height=16,
dt: float=None,
compile_opts: list[str]=[]):
dt: float = None,
compile_opts: list[str] = []):
"""
Initialization routine
@ -58,74 +59,74 @@ class HLL (Simulator.BaseSimulator):
g: Gravitational acceleration (9.81 m/s^2)
compile_opts: Pass a list of nvcc compiler options
"""
# Call super constructor
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
1,
block_width, block_height);
self.g = np.float32(g)
#Get kernels
module = context.get_module("cuda/SWE2D_HLL.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
# Call super constructor
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
1,
block_width, block_height)
self.g = np.float32(g)
# Get kernels
module = context.get_module("cuda/SWE2D_HLL.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
self.kernel = module.get_function("HLLKernel")
self.kernel.prepare("iiffffiPiPiPiPiPiPiPiiii")
#Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream,
nx, ny,
1, 1,
[h0, hu0, hv0])
self.u1 = Common.ArakawaA2D(self.stream,
nx, ny,
1, 1,
[None, None, None])
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
nx, ny,
1, 1,
[h0, hu0, hv0])
self.u1 = ArakawaA2D(self.stream,
nx, ny,
1, 1,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
if dt == None:
dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
if dt is None:
dt_x = np.min(self.dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
dt_y = np.min(self.dy / (np.abs(hv0 / h0) + np.sqrt(g * h0)))
self.dt = min(dt_x, dt_y)
else:
self.dt = dt
self.cfl_data.fill(self.dt, stream=self.stream)
def substep(self, dt, step_number):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.u0, self.u1 = self.u1, self.u0
def getOutput(self):
def get_output(self):
return self.u0
def check(self):
self.u0.check()
self.u1.check()
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt*0.5
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
return max_dt * 0.5

View File

@ -19,31 +19,32 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators import Simulator
from GPUSimulators.common import ArakawaA2D
from GPUSimulators.Simulator import BoundaryCondition
class HLL2 (Simulator.BaseSimulator):
class HLL2(Simulator.BaseSimulator):
"""
Class that solves the SW equations using the Forward-Backward linear scheme
"""
def __init__(self,
context,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
theta=1.8,
def __init__(self,
context,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
theta=1.8,
cfl_scale=0.9,
boundary_conditions=BoundaryCondition(),
boundary_conditions=BoundaryCondition(),
block_width=16, block_height=16,
dt: float=None,
compile_opts: list[str]=[]):
dt: float = None,
compile_opts: list[str] = []):
"""
Initialization routine
@ -59,81 +60,81 @@ class HLL2 (Simulator.BaseSimulator):
g: Gravitational acceleration (9.81 m/s^2)
compile_opts: Pass a list of nvcc compiler options
"""
# Call super constructor
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
2,
block_width, block_height);
self.g = np.float32(g)
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
2,
block_width, block_height)
self.g = np.float32(g)
self.theta = np.float32(theta)
#Get kernels
module = context.get_module("cuda/SWE2D_HLL2.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
# Get kernels
module = context.get_module("cuda/SWE2D_HLL2.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
self.kernel = module.get_function("HLL2Kernel")
self.kernel.prepare("iifffffiiPiPiPiPiPiPiPiiii")
#Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream,
nx, ny,
2, 2,
[h0, hu0, hv0])
self.u1 = Common.ArakawaA2D(self.stream,
nx, ny,
2, 2,
[None, None, None])
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
nx, ny,
2, 2,
[h0, hu0, hv0])
self.u1 = ArakawaA2D(self.stream,
nx, ny,
2, 2,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
if dt == None:
dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
if dt is None:
dt_x = np.min(self.dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
dt_y = np.min(self.dy / (np.abs(hv0 / h0) + np.sqrt(g * h0)))
self.dt = min(dt_x, dt_y)
else:
self.dt = dt
self.cfl_data.fill(self.dt, stream=self.stream)
def substep(self, dt, step_number):
self.substepDimsplit(dt*0.5, step_number)
def substepDimsplit(self, dt, substep):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.substep_dimsplit(dt * 0.5, step_number)
def substep_dimsplit(self, dt, substep):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.u0, self.u1 = self.u1, self.u0
def getOutput(self):
def get_output(self):
return self.u0
def check(self):
self.u0.check()
self.u1.check()
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt*0.5
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
return max_dt * 0.5

View File

@ -26,12 +26,12 @@ from IPython.core import magic_arguments
from IPython.core.magic import line_magic, Magics, magics_class
import pycuda.driver as cuda
from GPUSimulators import Common
from GPUSimulators.common import IPEngine
from GPUSimulators.gpu import CudaContext
@magics_class
class MagicCudaContext(Magics):
class MagicCudaContext(Magics):
@line_magic
@magic_arguments.magic_arguments()
@magic_arguments.argument(
@ -44,14 +44,14 @@ class MagicCudaContext(Magics):
'--no_autotuning', '-na', action="store_true", help='Disable autotuning of kernels')
def cuda_context_handler(self, line):
args = magic_arguments.parse_argstring(self.cuda_context_handler, line)
self.logger = logging.getLogger(__name__)
self.logger = logging.getLogger(__name__)
self.logger.info("Registering %s in user workspace", args.name)
context_flags = None
if (args.blocking):
if args.blocking:
context_flags = cuda.ctx_flags.SCHED_BLOCKING_SYNC
if args.name in self.shell.user_ns.keys():
self.logger.debug("Context already registered! Ignoring")
return
@ -59,12 +59,13 @@ class MagicCudaContext(Magics):
self.logger.debug("Creating context")
use_cache = False if args.no_cache else True
use_autotuning = False if args.no_autotuning else True
self.shell.user_ns[args.name] = CudaContext.CudaContext(context_flags=context_flags, use_cache=use_cache, autotuning=use_autotuning)
self.shell.user_ns[args.name] = CudaContext(context_flags=context_flags, use_cache=use_cache,
autotuning=use_autotuning)
# this function will be called on exceptions in any cell
def custom_exc(shell, etype, evalue, tb, tb_offset=None):
self.logger.exception("Exception caught: Resetting to CUDA context %s", args.name)
while (cuda.Context.get_current() != None):
while cuda.Context.get_current() is not None:
context = cuda.Context.get_current()
self.logger.info("Popping <%s>", str(context.handle))
cuda.Context.pop()
@ -77,36 +78,30 @@ class MagicCudaContext(Magics):
self.logger.error("CUDA will not work now")
self.logger.debug("==================================================================")
# still show the error within the notebook, don't just swallow it
shell.showtraceback((etype, evalue, tb), tb_offset=tb_offset)
# this registers a custom exception handler for the whole current notebook
get_ipython().set_custom_exc((Exception,), custom_exc)
# Handle CUDA context when exiting python
import atexit
def exitfunc():
self.logger.info("Exitfunc: Resetting CUDA context stack")
while (cuda.Context.get_current() != None):
while cuda.Context.get_current() is not None:
context = cuda.Context.get_current()
self.logger.info("`-> Popping <%s>", str(context.handle))
cuda.Context.pop()
self.logger.debug("==================================================================")
atexit.register(exitfunc)
@magics_class
class MagicLogger(Magics):
class MagicLogger(Magics):
logger_initialized = False
@line_magic
@magic_arguments.magic_arguments()
@magic_arguments.argument(
@ -118,51 +113,47 @@ class MagicLogger(Magics):
@magic_arguments.argument(
'--file_level', '-f', type=int, default=10, help='The level of logging to file [0, 50]')
def setup_logging(self, line):
if (self.logger_initialized):
if self.logger_initialized:
logging.getLogger('GPUSimulators').info("Global logger already initialized!")
return;
return
else:
self.logger_initialized = True
args = magic_arguments.parse_argstring(self.setup_logging, line)
import sys
#Get root logger
# Get root logger
logger = logging.getLogger('GPUSimulators')
logger.setLevel(min(args.level, args.file_level))
#Add log to screen
# Add log to screen
ch = logging.StreamHandler()
ch.setLevel(args.level)
logger.addHandler(ch)
logger.log(args.level, "Console logger using level %s", logging.getLevelName(args.level))
#Get the outfilename (try to evaluate if Python expression...)
# Get the outfilename (try to evaluate if Python expression...)
try:
outfile = eval(args.out, self.shell.user_global_ns, self.shell.user_ns)
except:
outfile = args.out
#Add log to file
# Add log to file
logger.log(args.level, "File logger using level %s to %s", logging.getLevelName(args.file_level), outfile)
fh = logging.FileHandler(outfile)
formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s')
fh.setFormatter(formatter)
fh.setLevel(args.file_level)
logger.addHandler(fh)
logger.info("Python version %s", sys.version)
self.shell.user_ns[args.name] = logger
@magics_class
class MagicMPI(Magics):
class MagicMPI(Magics):
@line_magic
@magic_arguments.magic_arguments()
@magic_arguments.argument(
@ -177,13 +168,7 @@ class MagicMPI(Magics):
self.shell.user_ns[args.name].shutdown()
self.shell.user_ns[args.name] = None
gc.collect()
self.shell.user_ns[args.name] = Common.IPEngine(args.num_engines)
self.shell.user_ns[args.name] = IPEngine(args.num_engines)
# Register
@ -191,4 +176,3 @@ ip = get_ipython()
ip.register_magics(MagicCudaContext)
ip.register_magics(MagicLogger)
ip.register_magics(MagicMPI)
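
For reference, these handlers register as line magics named after the decorated methods; the invocations below are illustrative only, and whether the name is positional is determined by the partly elided magic_arguments declarations above:

# In a notebook cell, after importing GPUSimulators.IPythonMagic:
#   %cuda_context_handler my_context        # creates a CudaContext in the user namespace
#   %setup_logging --file_level 20 my_logger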

View File

@ -24,32 +24,33 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators import Simulator
from GPUSimulators.common import ArakawaA2D
from GPUSimulators.Simulator import BoundaryCondition
class KP07 (Simulator.BaseSimulator):
class KP07(Simulator.BaseSimulator):
"""
Class that solves the SW equations using the Forward-Backward linear scheme
"""
def __init__(self,
context,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
theta=1.3,
def __init__(self,
context,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
theta=1.3,
cfl_scale=0.9,
order=2,
boundary_conditions=BoundaryCondition(),
boundary_conditions=BoundaryCondition(),
block_width=16, block_height=16,
dt: float=None,
compile_opts: list[str]=[]):
dt: float = None,
compile_opts: list[str] = []):
"""
Initialization routine
@ -65,84 +66,82 @@ class KP07 (Simulator.BaseSimulator):
g: Gravitational acceleration (9.81 m/s^2)
compile_opts: Pass a list of nvcc compiler options
"""
# Call super constructor
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
order,
block_width, block_height);
self.g = np.float32(g)
self.theta = np.float32(theta)
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
order,
block_width, block_height)
self.g = np.float32(g)
self.theta = np.float32(theta)
self.order = np.int32(order)
#Get kernels
module = context.get_module("cuda/SWE2D_KP07.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
# Get kernels
module = context.get_module("cuda/SWE2D_KP07.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
self.kernel = module.get_function("KP07Kernel")
self.kernel.prepare("iifffffiiPiPiPiPiPiPiPiiii")
#Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream,
nx, ny,
2, 2,
[h0, hu0, hv0])
self.u1 = Common.ArakawaA2D(self.stream,
nx, ny,
2, 2,
[None, None, None])
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
nx, ny,
2, 2,
[h0, hu0, hv0])
self.u1 = ArakawaA2D(self.stream,
nx, ny,
2, 2,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
if dt == None:
dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
if dt is None:
dt_x = np.min(self.dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
dt_y = np.min(self.dy / (np.abs(hv0 / h0) + np.sqrt(g * h0)))
self.dt = min(dt_x, dt_y)
else:
self.dt = dt
self.cfl_data.fill(self.dt, stream=self.stream)
def substep(self, dt, step_number):
self.substepRK(dt, step_number)
def substepRK(self, dt, substep):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.theta,
Simulator.stepOrderToCodedInt(step=substep, order=self.order),
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.cfl_data.fill(self.dt, stream=self.stream)
def substep(self, dt, step_number):
self.substep_rk(dt, step_number)
def substep_rk(self, dt, substep):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.theta,
Simulator.step_order_to_coded_int(step=substep, order=self.order),
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.u0, self.u1 = self.u1, self.u0
def getOutput(self):
def get_output(self):
return self.u0
def check(self):
self.u0.check()
self.u1.check()
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt*0.5**(self.order-1)
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
return max_dt * 0.5 ** (self.order - 1)
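
The 0.5 ** (order - 1) factor leaves a first-order step untouched and halves the CFL-limited timestep for the two-stage second-order scheme; a quick check of the scaling:

for order in (1, 2):
    print(order, 0.5 ** (order - 1))   # 1 -> 1.0, 2 -> 0.5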

View File

@ -24,31 +24,32 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators import Simulator
from GPUSimulators.common import ArakawaA2D
from GPUSimulators.Simulator import BoundaryCondition
class KP07_dimsplit(Simulator.BaseSimulator):
"""
Class that solves the SW equations using the dimensionally split KP07 scheme
"""
def __init__(self,
context,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
theta=1.3,
def __init__(self,
context,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
theta=1.3,
cfl_scale=0.9,
boundary_conditions=BoundaryCondition(),
boundary_conditions=BoundaryCondition(),
block_width=16, block_height=16,
dt: float=None,
compile_opts: list[str]=[]):
dt: float = None,
compile_opts: list[str] = []):
"""
Initialization routine
@ -64,83 +65,83 @@ class KP07_dimsplit(Simulator.BaseSimulator):
g: Gravitational acceleration (9.81 m/s^2)
compile_opts: Pass a list of nvcc compiler options
"""
# Call super constructor
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
2,
block_width, block_height)
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
2,
block_width, block_height)
self.gc_x = 2
self.gc_y = 2
self.g = np.float32(g)
self.theta = np.float32(theta)
#Get kernels
module = context.get_module("cuda/SWE2D_KP07_dimsplit.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
# Get kernels
module = context.get_module("cuda/SWE2D_KP07_dimsplit.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
self.kernel = module.get_function("KP07DimsplitKernel")
self.kernel.prepare("iifffffiiPiPiPiPiPiPiPiiii")
#Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream,
nx, ny,
self.gc_x, self.gc_y,
[h0, hu0, hv0])
self.u1 = Common.ArakawaA2D(self.stream,
nx, ny,
self.gc_x, self.gc_y,
[None, None, None])
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
nx, ny,
self.gc_x, self.gc_y,
[h0, hu0, hv0])
self.u1 = ArakawaA2D(self.stream,
nx, ny,
self.gc_x, self.gc_y,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
if dt == None:
dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
if dt is None:
dt_x = np.min(self.dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
dt_y = np.min(self.dy / (np.abs(hv0 / h0) + np.sqrt(g * h0)))
self.dt = min(dt_x, dt_y)
else:
self.dt = dt
self.cfl_data.fill(self.dt, stream=self.stream)
def substep(self, dt, step_number):
self.substepDimsplit(dt*0.5, step_number)
def substepDimsplit(self, dt, substep):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.substep_dimsplit(dt * 0.5, step_number)
def substep_dimsplit(self, dt, substep):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.u0, self.u1 = self.u1, self.u0
def getOutput(self):
def get_output(self):
return self.u0
def check(self):
self.u0.check()
self.u1.check()
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt*0.5
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
return max_dt * 0.5

View File

@ -20,16 +20,17 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
from GPUSimulators import Simulator, Common
from GPUSimulators.gpu import CudaContext
from GPUSimulators.Simulator import BoundaryCondition
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators import Simulator
from GPUSimulators.common import ArakawaA2D
from GPUSimulators.gpu import CudaContext
from GPUSimulators.Simulator import BoundaryCondition
class LxF (Simulator.BaseSimulator):
class LxF(Simulator.BaseSimulator):
"""
Class that solves the SW equations using the Lax-Friedrichs scheme
"""
@ -40,11 +41,11 @@ class LxF (Simulator.BaseSimulator):
nx: int, ny: int,
dx: float, dy: float,
g: float,
cfl_scale: float=0.9,
cfl_scale: float = 0.9,
boundary_conditions=BoundaryCondition(),
block_width: int=16, block_height: int=16,
dt: float=None,
compile_opts: list[str]=[]):
block_width: int = 16, block_height: int = 16,
dt: float = None,
compile_opts: list[str] = []):
"""
Initialization routine
@ -60,80 +61,80 @@ class LxF (Simulator.BaseSimulator):
g: Gravitational acceleration (9.81 m/s^2)
compile_opts: Pass a list of nvcc compiler options
"""
# Call super constructor
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
1,
block_width, block_height)
self.g = np.float32(g)
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
1,
block_width, block_height)
self.g = np.float32(g)
# Get kernels
module = context.get_module("cuda/SWE2D_LxF.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
module = context.get_module("cuda/SWE2D_LxF.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
self.kernel = module.get_function("LxFKernel")
self.kernel.prepare("iiffffiPiPiPiPiPiPiPiiii")
#Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream,
nx, ny,
1, 1,
[h0, hu0, hv0])
self.u1 = Common.ArakawaA2D(self.stream,
nx, ny,
1, 1,
[None, None, None])
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
nx, ny,
1, 1,
[h0, hu0, hv0])
self.u1 = ArakawaA2D(self.stream,
nx, ny,
1, 1,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
if dt == None:
dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
if dt is None:
dt_x = np.min(self.dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
dt_y = np.min(self.dy / (np.abs(hv0 / h0) + np.sqrt(g * h0)))
self.dt = min(dt_x, dt_y)
else:
self.dt = dt
self.cfl_data.fill(self.dt, stream=self.stream)
def substep(self, dt, step_number):
"""
Args:
dt: Size of each timestep (seconds)
"""
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.u0, self.u1 = self.u1, self.u0
def getOutput(self):
def get_output(self):
return self.u0
def check(self):
self.u0.check()
self.u1.check()
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt*0.5
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
return max_dt * 0.5
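
Taken together, a typical end-to-end use of one of these simulators looks roughly as follows. This is a hedged sketch, not part of the commit: it needs an NVIDIA GPU with PyCUDA, the array shapes and run length are illustrative, and the time-loop method is assumed to be the one defined in Simulator.py later in this diff.

# from GPUSimulators.gpu import CudaContext
# from GPUSimulators import LxF
# import numpy as np
#
# ctx = CudaContext(autotuning=False)
# nx = ny = 128
# h0 = np.ones((ny, nx), dtype=np.float32)    # consult CudaArray2D for halo handling
# hu0 = np.zeros_like(h0)
# hv0 = np.zeros_like(h0)
# sim = LxF.LxF(ctx, h0, hu0, hv0, nx, ny, dx=10.0, dy=10.0, g=9.81)
# sim.simulate(10.0)                          # advance 10 seconds of model time
# h, hu, hv = sim.download()                  # copy the conserved variables to host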

View File

@ -222,7 +222,7 @@ class MPISimulator(Simulator.BaseSimulator):
autotuner = sim.context.autotuner
sim.context.autotuner = None;
boundary_conditions = sim.getBoundaryConditions()
boundary_conditions = sim.get_boundary_conditions()
super().__init__(sim.context,
sim.nx, sim.ny,
sim.dx, sim.dy,
@ -263,14 +263,14 @@ class MPISimulator(Simulator.BaseSimulator):
if (gj == grid.grid[1]-1 and boundary_conditions.north != Simulator.BoundaryCondition.Type.Periodic):
self.north = None
new_boundary_conditions.north = boundary_conditions.north;
sim.setBoundaryConditions(new_boundary_conditions)
sim.set_boundary_conditions(new_boundary_conditions)
#Get number of variables
self.nvars = len(self.getOutput().gpu_variables)
self.nvars = len(self.get_output().gpu_variables)
#Shorthands for computing extents and sizes
gc_x = int(self.sim.getOutput()[0].x_halo)
gc_y = int(self.sim.getOutput()[0].y_halo)
gc_x = int(self.sim.get_output()[0].x_halo)
gc_y = int(self.sim.get_output()[0].y_halo)
nx = int(self.sim.nx)
ny = int(self.sim.ny)
@ -322,7 +322,7 @@ class MPISimulator(Simulator.BaseSimulator):
#nvtx.mark("substep full", color="blue")
#self.sim.substep(dt, step_number, external=True, internal=True)
self.sim.swapBuffers()
self.sim.swap_buffers()
self.profiling_data_mpi["end"]["t_mpi_step"] += time.time()
@ -336,8 +336,8 @@ class MPISimulator(Simulator.BaseSimulator):
self.profiling_data_mpi["n_time_steps"] += 1
def getOutput(self):
return self.sim.getOutput()
def get_output(self):
return self.sim.get_output()
def synchronize(self):
self.sim.synchronize()
@ -345,14 +345,14 @@ class MPISimulator(Simulator.BaseSimulator):
def check(self):
return self.sim.check()
def computeDt(self):
local_dt = np.array([np.float32(self.sim.computeDt())]);
def compute_dt(self):
local_dt = np.array([np.float32(self.sim.compute_dt())])
global_dt = np.empty(1, dtype=np.float32)
self.grid.comm.Allreduce(local_dt, global_dt, op=MPI.MIN)
self.logger.debug("Local dt: {:f}, global dt: {:f}".format(local_dt[0], global_dt[0]))
return global_dt[0]
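
The reduction above is the standard pattern for agreeing on one global timestep across ranks; a standalone mpi4py sketch of the same call (run under mpirun; values illustrative):

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
local_dt = np.array([0.01 * (comm.Get_rank() + 1)], dtype=np.float32)
global_dt = np.empty(1, dtype=np.float32)
comm.Allreduce(local_dt, global_dt, op=MPI.MIN)
print(comm.Get_rank(), global_dt[0])   # every rank prints 0.01, the global minimum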
def getExtent(self):
def get_extent(self):
"""
Function which returns the extent of node with rank
rank in the grid

View File

@ -45,7 +45,7 @@ class SHMEMSimulator(Simulator.BaseSimulator):
# This would also eliminate the need for all the array bookkeeping in this class.
autotuner = sims[0].context.autotuner
sims[0].context.autotuner = None
boundary_conditions = sims[0].getBoundaryConditions()
boundary_conditions = sims[0].get_boundary_conditions()
super().__init__(sims[0].context,
sims[0].nx, sims[0].ny,
sims[0].dx, sims[0].dy,
@ -108,14 +108,14 @@ class SHMEMSimulator(Simulator.BaseSimulator):
if (gj == grid.grid[1]-1 and boundary_conditions.north != Simulator.BoundaryCondition.Type.Periodic):
self.north = None
new_boundary_conditions.north = boundary_conditions.north;
sim.setBoundaryConditions(new_boundary_conditions)
sim.set_boundary_conditions(new_boundary_conditions)
#Get number of variables
self.nvars[i] = len(sim.getOutput().gpu_variables)
self.nvars[i] = len(sim.get_output().gpu_variables)
#Shorthands for computing extents and sizes
gc_x = int(sim.getOutput()[0].x_halo)
gc_y = int(sim.getOutput()[0].y_halo)
gc_x = int(sim.get_output()[0].x_halo)
gc_y = int(sim.get_output()[0].y_halo)
nx = int(sim.nx)
ny = int(sim.ny)
@ -150,10 +150,10 @@ class SHMEMSimulator(Simulator.BaseSimulator):
for i, sim in enumerate(self.sims):
sim.substep(dt, step_number)
def getOutput(self):
def get_output(self):
# XXX: Does not return what we would expect.
# Returns first subdomain, but we want the whole domain.
return self.sims[0].getOutput()
return self.sims[0].get_output()
def synchronize(self):
for sim in self.sims:
@ -164,14 +164,14 @@ class SHMEMSimulator(Simulator.BaseSimulator):
# Checks only first subdomain, but we want to check the whole domain.
return self.sims[0].check()
def computeDt(self):
def compute_dt(self):
global_dt = float("inf")
for sim in self.sims:
sim.context.synchronize()
for sim in self.sims:
local_dt = sim.computeDt()
local_dt = sim.compute_dt()
if local_dt < global_dt:
global_dt = local_dt
self.logger.debug("Local dt: {:f}".format(local_dt))
@ -179,7 +179,7 @@ class SHMEMSimulator(Simulator.BaseSimulator):
self.logger.debug("Global dt: {:f}".format(global_dt))
return global_dt
def getExtent(self, index=0):
def get_extent(self, index=0):
"""
Function which returns the extent of the subdomain with index
index in the grid

View File

@ -62,8 +62,8 @@ class SHMEMGrid(object):
for i in range(self.ngpus):
# XXX: disabled for testing on single-GPU system
#self.cuda_contexts.append(CudaContext.CudaContext(device=i, autotuning=False))
self.cuda_contexts.append(CudaContext.CudaContext(device=0, autotuning=False))
#self.cuda_contexts.append(CudaContext(device=i, autotuning=False))
self.cuda_contexts.append(CudaContext(device=0, autotuning=False))
def getCoordinate(self, index):
i = (index % self.grid[0])
@ -180,7 +180,7 @@ class SHMEMSimulatorGroup(object):
autotuner = sims[0].context.autotuner
sims[0].context.autotuner = None
boundary_conditions = sims[0].getBoundaryConditions()
boundary_conditions = sims[0].get_boundary_conditions()
super().__init__(sims[0].context,
sims[0].nx, sims[0].ny,
sims[0].dx, sims[0].dy,
@ -243,14 +243,14 @@ class SHMEMSimulatorGroup(object):
if (gj == grid.grid[1]-1 and boundary_conditions.north != Simulator.BoundaryCondition.Type.Periodic):
self.north = None
new_boundary_conditions.north = boundary_conditions.north;
sim.setBoundaryConditions(new_boundary_conditions)
sim.set_boundary_conditions(new_boundary_conditions)
#Get number of variables
self.nvars[i] = len(sim.getOutput().gpu_variables)
self.nvars[i] = len(sim.get_output().gpu_variables)
#Shorthands for computing extents and sizes
gc_x = int(sim.getOutput()[0].x_halo)
gc_y = int(sim.getOutput()[0].y_halo)
gc_x = int(sim.get_output()[0].x_halo)
gc_y = int(sim.get_output()[0].y_halo)
nx = int(sim.nx)
ny = int(sim.ny)
@ -287,7 +287,7 @@ class SHMEMSimulatorGroup(object):
def getOutput(self):
# XXX: Does not return what we would expect.
# Returns first subdomain, but we want the whole domain.
return self.sims[0].getOutput()
return self.sims[0].get_output()
def synchronize(self):
for sim in self.sims:
@ -305,7 +305,7 @@ class SHMEMSimulatorGroup(object):
sim.context.synchronize()
for sim in self.sims:
local_dt = sim.computeDt()
local_dt = sim.compute_dt()
if local_dt < global_dt:
global_dt = local_dt
self.logger.debug("Local dt: {:f}".format(local_dt))

View File

@ -20,18 +20,38 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
# Import packages we need
import numpy as np
import logging
from enum import IntEnum
import pycuda.driver as cuda
from GPUSimulators import Common
from GPUSimulators.common import ProgressPrinter
from GPUSimulators.gpu import CudaContext
class BoundaryCondition(object):
def get_types(bc):
types = {'north': BoundaryCondition.Type((bc >> 24) & 0x0000000F),
'south': BoundaryCondition.Type((bc >> 16) & 0x0000000F),
'east': BoundaryCondition.Type((bc >> 8) & 0x0000000F),
'west': BoundaryCondition.Type((bc >> 0) & 0x0000000F)}
return types
def step_order_to_coded_int(step, order):
"""
Helper function which packs the step and order into a single integer
"""
step_order = (step << 16) | (order & 0x0000ffff)
# print("Step: {0:032b}".format(step))
# print("Order: {0:032b}".format(order))
# print("Mix: {0:032b}".format(step_order))
return np.int32(step_order)
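
Unpacking mirrors the shifts used here; a minimal round trip (the decode helper below is illustrative and does not exist in the module):

import numpy as np

def decode_step_order(step_order):
    value = int(step_order)
    return value >> 16, value & 0x0000ffff

packed = (1 << 16) | (2 & 0x0000ffff)   # same packing as step_order_to_coded_int(1, 2)
assert decode_step_order(np.int32(packed)) == (1, 2)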
class BoundaryCondition(object):
"""
Class for holding boundary conditions for global boundaries
"""
@ -47,12 +67,7 @@ class BoundaryCondition(object):
Periodic = 2,
Reflective = 3
def __init__(self, types={
'north': Type.Reflective,
'south': Type.Reflective,
'east': Type.Reflective,
'west': Type.Reflective
}):
def __init__(self, types: dict[str, Type] = {'north': Type.Reflective, 'south': Type.Reflective,
                                             'east': Type.Reflective, 'west': Type.Reflective}):
"""
Constructor
"""
@ -61,17 +76,18 @@ class BoundaryCondition(object):
self.south = types['south']
self.east = types['east']
self.west = types['west']
if (self.north == BoundaryCondition.Type.Neumann \
or self.south == BoundaryCondition.Type.Neumann \
or self.east == BoundaryCondition.Type.Neumann \
or self.west == BoundaryCondition.Type.Neumann):
raise(NotImplementedError("Neumann boundary condition not supported"))
def __str__(self):
return '[north={:s}, south={:s}, east={:s}, west={:s}]'.format(str(self.north), str(self.south), str(self.east), str(self.west))
def asCodedInt(self):
if (self.north == BoundaryCondition.Type.Neumann
or self.south == BoundaryCondition.Type.Neumann
or self.east == BoundaryCondition.Type.Neumann
or self.west == BoundaryCondition.Type.Neumann):
raise (NotImplementedError("Neumann boundary condition not supported"))
def __str__(self):
return '[north={:s}, south={:s}, east={:s}, west={:s}]'.format(str(self.north), str(self.south), str(self.east),
str(self.west))
def as_coded_int(self):
"""
Helper function which packs four boundary conditions into one integer
"""
@ -79,26 +95,18 @@ class BoundaryCondition(object):
bc = 0
bc = bc | (self.north & 0x0000000F) << 24
bc = bc | (self.south & 0x0000000F) << 16
bc = bc | (self.east & 0x0000000F) << 8
bc = bc | (self.west & 0x0000000F) << 0
#for t in types:
bc = bc | (self.east & 0x0000000F) << 8
bc = bc | (self.west & 0x0000000F) << 0
# for t in types:
# print("{0:s}, {1:d}, {1:032b}, {1:08b}".format(t, types[t]))
#print("bc: {0:032b}".format(bc))
# print("bc: {0:032b}".format(bc))
return np.int32(bc)
def getTypes(bc):
types = {}
types['north'] = BoundaryCondition.Type((bc >> 24) & 0x0000000F)
types['south'] = BoundaryCondition.Type((bc >> 16) & 0x0000000F)
types['east'] = BoundaryCondition.Type((bc >> 8) & 0x0000000F)
types['west'] = BoundaryCondition.Type((bc >> 0) & 0x0000000F)
return types
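
The packing in as_coded_int and the module-level get_types are inverses of each other; a plain-integer round trip (the numeric values follow the Type enum above):

north, south, east, west = 3, 3, 2, 2   # Reflective, Reflective, Periodic, Periodic
bc = (north << 24) | (south << 16) | (east << 8) | (west << 0)
assert ((bc >> 24) & 0xF, (bc >> 16) & 0xF, (bc >> 8) & 0xF, bc & 0xF) == (3, 3, 2, 2)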
class BaseSimulator(object):
def __init__(self,
context: CudaContext,
nx: int, ny: int,
@ -125,40 +133,40 @@ class BaseSimulator(object):
num_substeps: Number of substeps to perform for a full step
"""
#Get logger
# Get logger
self.logger = logging.getLogger(__name__ + "." + self.__class__.__name__)
#Save input parameters
#Notice that we need to specify them in the correct dataformat for the
#GPU kernel
# Save input parameters
# Notice that we need to specify them in the correct data format for the
# GPU kernel
self.context = context
self.nx = np.int32(nx)
self.ny = np.int32(ny)
self.dx = np.float32(dx)
self.dy = np.float32(dy)
self.setBoundaryConditions(boundary_conditions)
self.set_boundary_conditions(boundary_conditions)
self.cfl_scale = cfl_scale
self.num_substeps = num_substeps
#Handle autotuning block size
# Handle autotuning block size
if self.context.autotuner:
peak_configuration = self.context.autotuner.get_peak_performance(self.__class__)
block_width = int(peak_configuration["block_width"])
block_height = int(peak_configuration["block_height"])
self.logger.debug("Used autotuning to get block size [%d x %d]", block_width, block_height)
#Compute kernel launch parameters
self.block_size = (block_width, block_height, 1)
self.grid_size = (
int(np.ceil(self.nx / float(self.block_size[0]))),
int(np.ceil(self.ny / float(self.block_size[1])))
)
#Create a CUDA stream
# Compute kernel launch parameters
self.block_size = (block_width, block_height, 1)
self.grid_size = (
int(np.ceil(self.nx / float(self.block_size[0]))),
int(np.ceil(self.ny / float(self.block_size[1])))
)
# Create a CUDA stream
self.stream = cuda.Stream()
self.internal_stream = cuda.Stream()
#Keep track of simulation time and number of timesteps
# Keep track of simulation time and number of timesteps
self.t = 0.0
self.nt = 0
@ -171,41 +179,41 @@ class BaseSimulator(object):
Requires that the step() function is implemented in the subclasses
"""
printer = Common.ProgressPrinter(t)
t_start = self.simTime()
printer = ProgressPrinter(t)
t_start = self.sim_time()
t_end = t_start + t
update_dt = True
if (dt is not None):
if dt is not None:
update_dt = False
self.dt = dt
while(self.simTime() < t_end):
while self.sim_time() < t_end:
# Update dt every 100 timesteps and cross your fingers it works
# for the next 100
if (update_dt and (self.simSteps() % 100 == 0)):
self.dt = self.computeDt()*self.cfl_scale
if update_dt and (self.sim_steps() % 100 == 0):
self.dt = self.compute_dt() * self.cfl_scale
# Compute timestep for "this" iteration (i.e., shorten last timestep)
current_dt = np.float32(min(self.dt, t_end-self.simTime()))
current_dt = np.float32(min(self.dt, t_end - self.sim_time()))
# Stop if end reached (should not happen)
if (current_dt <= 0.0):
self.logger.warning("Timestep size {:d} is less than or equal to zero!".format(self.simSteps()))
if current_dt <= 0.0:
self.logger.warning("Timestep size {:d} is less than or equal to zero!".format(self.sim_steps()))
break
# Step forward in time
self.step(current_dt)
#Print info
print_string = printer.getPrintString(self.simTime() - t_start)
if (print_string):
# Print info
print_string = printer.get_print_string(self.sim_time() - t_start)
if print_string:
self.logger.info("%s: %s", self, print_string)
try:
self.check()
except AssertionError as e:
e.args += ("Step={:d}, time={:f}".format(self.simSteps(), self.simTime()),)
e.args += ("Step={:d}, time={:f}".format(self.sim_steps(), self.sim_time()),)
raise
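
The loop above shortens only the final step so the requested end time is hit exactly; a pure-Python sketch of that clamping (a constant dt is assumed for brevity):

t_end, dt, t, steps = 1.0, 0.3, 0.0, 0
while t < t_end:
    current_dt = min(dt, t_end - t)   # shorten the last step
    t += current_dt
    steps += 1
print(t, steps)                       # 1.0 4  (0.3 + 0.3 + 0.3 + 0.1)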
def step(self, dt: int):
@ -218,57 +226,45 @@ class BaseSimulator(object):
for i in range(self.num_substeps):
self.substep(dt, i)
self.t += dt
self.nt += 1
def download(self, variables=None):
return self.getOutput().download(self.stream, variables)
return self.get_output().download(self.stream, variables)
def synchronize(self):
self.stream.synchronize()
def simTime(self):
def sim_time(self):
return self.t
def simSteps(self):
def sim_steps(self):
return self.nt
def getExtent(self):
return [0, 0, self.nx*self.dx, self.ny*self.dy]
def setBoundaryConditions(self, boundary_conditions):
def get_extent(self):
return [0, 0, self.nx * self.dx, self.ny * self.dy]
def set_boundary_conditions(self, boundary_conditions):
self.logger.debug("Boundary conditions set to {:s}".format(str(boundary_conditions)))
self.boundary_conditions = boundary_conditions.asCodedInt()
def getBoundaryConditions(self):
return BoundaryCondition(BoundaryCondition.getTypes(self.boundary_conditions))
self.boundary_conditions = boundary_conditions.as_coded_int()
def get_boundary_conditions(self):
return BoundaryCondition(get_types(self.boundary_conditions))
def substep(self, dt, step_number):
"""
Function which performs one single substep with stepsize dt
"""
raise(NotImplementedError("Needs to be implemented in subclass"))
def getOutput(self):
raise(NotImplementedError("Needs to be implemented in subclass"))
raise (NotImplementedError("Needs to be implemented in subclass"))
def get_output(self):
raise (NotImplementedError("Needs to be implemented in subclass"))
def check(self):
self.logger.warning("check() is not implemented - please implement")
#raise(NotImplementedError("Needs to be implemented in subclass"))
def computeDt(self):
raise(NotImplementedError("Needs to be implemented in subclass"))
# raise(NotImplementedError("Needs to be implemented in subclass"))
def stepOrderToCodedInt(step, order):
"""
Helper function which packs the step and order into a single integer
"""
step_order = (step << 16) | (order & 0x0000ffff)
#print("Step: {0:032b}".format(step))
#print("Order: {0:032b}".format(order))
#print("Mix: {0:032b}".format(step_order))
return np.int32(step_order)
def compute_dt(self):
raise (NotImplementedError("Needs to be implemented in subclass"))

View File

@ -20,30 +20,31 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
#Import packages we need
from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
# Import packages we need
import numpy as np
from pycuda import gpuarray
from GPUSimulators import Simulator
from GPUSimulators.common import ArakawaA2D
from GPUSimulators.Simulator import BoundaryCondition
class WAF (Simulator.BaseSimulator):
class WAF(Simulator.BaseSimulator):
"""
Class that solves the SW equations using the Forward-Backward linear scheme
"""
def __init__(self,
def __init__(self,
context,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
h0, hu0, hv0,
nx, ny,
dx, dy,
g,
cfl_scale=0.9,
boundary_conditions=BoundaryCondition(),
boundary_conditions=BoundaryCondition(),
block_width=16, block_height=16,
dt: float=None,
compile_opts: list[str]=[]):
dt: float = None,
compile_opts: list[str] = []):
"""
Initialization routine
@ -59,79 +60,79 @@ class WAF (Simulator.BaseSimulator):
g: Gravitational acceleration (9.81 m/s^2)
compile_opts: Pass a list of nvcc compiler options
"""
# Call super constructor
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
2,
block_width, block_height);
self.g = np.float32(g)
#Get kernels
module = context.get_module("cuda/SWE2D_WAF.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
# Call super constructor
super().__init__(context,
nx, ny,
dx, dy,
boundary_conditions,
cfl_scale,
2,
block_width, block_height)
self.g = np.float32(g)
# Get kernels
module = context.get_module("cuda/SWE2D_WAF.cu",
defines={
'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1]
},
compile_args={
'no_extern_c': True,
'options': ["--use_fast_math"] + compile_opts,
},
jit_compile_args={})
self.kernel = module.get_function("WAFKernel")
self.kernel.prepare("iiffffiiPiPiPiPiPiPiPiiii")
#Create data by uploading to device
self.u0 = Common.ArakawaA2D(self.stream,
nx, ny,
2, 2,
[h0, hu0, hv0])
self.u1 = Common.ArakawaA2D(self.stream,
nx, ny,
2, 2,
[None, None, None])
# Create data by uploading to the device
self.u0 = ArakawaA2D(self.stream,
nx, ny,
2, 2,
[h0, hu0, hv0])
self.u1 = ArakawaA2D(self.stream,
nx, ny,
2, 2,
[None, None, None])
self.cfl_data = gpuarray.GPUArray(self.grid_size, dtype=np.float32)
if dt == None:
dt_x = np.min(self.dx / (np.abs(hu0/h0) + np.sqrt(g*h0)))
dt_y = np.min(self.dy / (np.abs(hv0/h0) + np.sqrt(g*h0)))
if dt is None:
dt_x = np.min(self.dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
dt_y = np.min(self.dy / (np.abs(hv0 / h0) + np.sqrt(g * h0)))
self.dt = min(dt_x, dt_y)
else:
self.dt = dt
self.cfl_data.fill(self.dt, stream=self.stream)
def substep(self, dt, step_number):
self.substepDimsplit(dt*0.5, step_number)
def substepDimsplit(self, dt, substep):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.substep_dimsplit(dt * 0.5, step_number)
def substep_dimsplit(self, dt, substep):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.u0, self.u1 = self.u1, self.u0
def getOutput(self):
def get_output(self):
return self.u0
def check(self):
self.u0.check()
self.u1.check()
def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
return max_dt*0.5
def compute_dt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get()
return max_dt * 0.5
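The initial time step above follows the shallow-water CFL condition: the fastest signal in each cell travels at |u| + sqrt(g*h), so dt must not exceed the cell size divided by that speed. A minimal stand-alone sketch of the same estimate (the helper name estimate_sw_dt is illustrative, not part of the module):

import numpy as np

def estimate_sw_dt(h0, hu0, hv0, dx, dy, g=9.81):
    # Fastest shallow-water wave speed per cell: |velocity| + sqrt(g*h)
    dt_x = np.min(dx / (np.abs(hu0 / h0) + np.sqrt(g * h0)))
    dt_y = np.min(dy / (np.abs(hv0 / h0) + np.sqrt(g * h0)))
    return min(dt_x, dt_y)

# Example: a 0.5 m deep lake at rest on 10 m cells gives dt of roughly 4.5 s
h = np.full((64, 64), 0.5, dtype=np.float32)
zero = np.zeros_like(h)
print(estimate_sw_dt(h, zero, zero, 10.0, 10.0))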

View File

@ -0,0 +1,9 @@
from .arkawa_2d import ArakawaA2D
from .common import *
from .cuda_array_2d import CudaArray2D
from .cuda_array_3d import CudaArray3D
from .data_dumper import DataDumper
from .ip_engine import IPEngine
from .popen_file_buffer import PopenFileBuffer
from .progress_printer import ProgressPrinter
from .timer import Timer
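With this __init__.py, callers can either pick up the classes re-exported at the package root or import the helper-function module itself. A minimal sketch, using only names defined in this package:

# Classes re-exported by the package __init__.py
from GPUSimulators.common import ArakawaA2D, CudaArray2D, Timer

# The helper-function module (git helpers, to_json, run_simulation, ...)
from GPUSimulators.common import common

print(common.get_git_hash().strip())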

View File

@ -0,0 +1,57 @@
import logging
import numpy as np
import pycuda.gpuarray
from GPUSimulators.common.cuda_array_2d import CudaArray2D
class ArakawaA2D:
"""
A class representing an Arakawa A type (unstaggered, logically Cartesian) grid
"""
def __init__(self, stream, nx, ny, halo_x, halo_y, cpu_variables):
"""
Uploads initial data to the GPU device
"""
self.logger = logging.getLogger(__name__)
self.gpu_variables = []
for cpu_variable in cpu_variables:
self.gpu_variables += [CudaArray2D(stream, nx, ny, halo_x, halo_y, cpu_variable)]
def __getitem__(self, key):
if type(key) != int:
raise TypeError("Indexing is int based")
if key >= len(self.gpu_variables) or key < 0:
raise IndexError("Out of bounds")
return self.gpu_variables[key]
def download(self, stream, variables=None):
"""
Enables downloading data from the GPU device to Python
"""
if variables is None:
variables = range(len(self.gpu_variables))
cpu_variables = []
for i in variables:
if i >= len(self.gpu_variables):
raise IndexError(f"Variable {i} is out of range")
cpu_variables += [self.gpu_variables[i].download(stream, asynch=True)]
# stream.synchronize()
return cpu_variables
def check(self):
"""
Checks that data is still sane
"""
for i, gpu_variable in enumerate(self.gpu_variables):
var_sum = pycuda.gpuarray.sum(gpu_variable.data).get()
self.logger.debug(f"Data {i} with size [{gpu_variable.nx} x {gpu_variable.ny}] "
+ f"has average {var_sum / (gpu_variable.nx * gpu_variable.ny)}")
if np.isnan(var_sum):
raise ValueError("Data contains NaN values!")

View File

@ -0,0 +1,205 @@
# -*- coding: utf-8 -*-
"""
This Python module implements various helper functions and classes.
Copyright (C) 2018 SINTEF ICT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import os
import numpy as np
import time
import subprocess
import logging
import json
from GPUSimulators.common.data_dumper import DataDumper
from GPUSimulators.common.progress_printer import ProgressPrinter
from GPUSimulators.common.timer import Timer
def safe_call(cmd):
logger = logging.getLogger(__name__)
try:
#git rev-parse HEAD
current_dir = os.path.dirname(os.path.realpath(__file__))
params = dict()
params['stderr'] = subprocess.STDOUT
params['cwd'] = current_dir
params['universal_newlines'] = True #text=True in more recent python
params['shell'] = False
if os.name == 'nt':
params['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
stdout = subprocess.check_output(cmd, **params)
except subprocess.CalledProcessError as e:
output = e.output
logger.error("Git failed, \nReturn code: " + str(e.returncode) + "\nOutput: " + output)
raise e
return stdout
def get_git_hash():
return safe_call(["git", "rev-parse", "HEAD"])
def get_git_status():
return safe_call(["git", "status", "--porcelain", "-uno"])
def to_json(in_dict, compressed=True):
"""
Creates JSON string from a dictionary
"""
logger = logging.getLogger(__name__)
out_dict = in_dict.copy()
for key in out_dict:
if isinstance(out_dict[key], np.ndarray):
out_dict[key] = out_dict[key].tolist()
else:
try:
json.dumps(out_dict[key])
except (TypeError, ValueError):
value = str(out_dict[key])
logger.warning("JSON: Converting {:s} to string ({:s})".format(key, value))
out_dict[key] = value
return json.dumps(out_dict)
def run_simulation(simulator, simulator_args, outfile, save_times, save_var_names=[], dt=None):
"""
Runs a simulation and stores the output in a NetCDF file. Output is written at the
times given in save_times, for all the variables listed in save_var_names. Elements in
save_var_names can be set to None if you do not want to save them.
"""
profiling_data_sim_runner = { 'start': {}, 'end': {} }
profiling_data_sim_runner["start"]["t_sim_init"] = 0
profiling_data_sim_runner["end"]["t_sim_init"] = 0
profiling_data_sim_runner["start"]["t_nc_write"] = 0
profiling_data_sim_runner["end"]["t_nc_write"] = 0
profiling_data_sim_runner["start"]["t_full_step"] = 0
profiling_data_sim_runner["end"]["t_full_step"] = 0
profiling_data_sim_runner["start"]["t_sim_init"] = time.time()
logger = logging.getLogger(__name__)
if len(save_times) <= 0:
raise ValueError("Need to specify which times to save")
with Timer("construct") as t:
sim = simulator(**simulator_args)
logger.info(f"Constructed in {str(t.secs)} seconds")
#Create a netcdf file and simulate
with DataDumper(outfile, mode='w', clobber=False) as outdata:
#Create attributes (metadata)
outdata.ncfile.created = time.ctime(time.time())
outdata.ncfile.git_hash = get_git_hash()
outdata.ncfile.git_status = get_git_status()
outdata.ncfile.simulator = str(simulator)
# do not write fields to attributes (they are too large)
simulator_args_for_ncfile = simulator_args.copy()
del simulator_args_for_ncfile["rho"]
del simulator_args_for_ncfile["rho_u"]
del simulator_args_for_ncfile["rho_v"]
del simulator_args_for_ncfile["E"]
outdata.ncfile.sim_args = to_json(simulator_args_for_ncfile)
#Create dimensions
outdata.ncfile.createDimension('time', len(save_times))
outdata.ncfile.createDimension('x', simulator_args['nx'])
outdata.ncfile.createDimension('y', simulator_args['ny'])
#Create variables for dimensions
ncvars = {'time': outdata.ncfile.createVariable('time', np.dtype('float32').char, 'time'),
'x': outdata.ncfile.createVariable('x', np.dtype('float32').char, 'x'),
'y': outdata.ncfile.createVariable('y', np.dtype('float32').char, 'y')}
#Fill variables with proper values
ncvars['time'][:] = save_times
extent = sim.get_extent()
ncvars['x'][:] = np.linspace(extent[0], extent[1], simulator_args['nx'])
ncvars['y'][:] = np.linspace(extent[2], extent[3], simulator_args['ny'])
#Choose which variables to download (prune None from the list, but keep the index)
download_vars = []
for i, var_name in enumerate(save_var_names):
if var_name is not None:
download_vars += [i]
save_var_names = list(save_var_names[i] for i in download_vars)
#Create variables
for var_name in save_var_names:
ncvars[var_name] = outdata.ncfile.createVariable(
var_name, np.dtype('float32').char, ('time', 'y', 'x'), zlib=True, least_significant_digit=3)
#Create step sizes between each save
t_steps = np.empty_like(save_times)
t_steps[0] = save_times[0]
t_steps[1:] = save_times[1:] - save_times[0:-1]
profiling_data_sim_runner["end"]["t_sim_init"] = time.time()
# Start simulation loop
progress_printer = ProgressPrinter(save_times[-1], print_every=10)
for k in range(len(save_times)):
# Get target time and step size there
t_step = t_steps[k]
t_end = save_times[k]
# Sanity check simulator
try:
sim.check()
except AssertionError as e:
logger.error(f"Error after {sim.sim_steps()} steps (t={sim.sim_time()}: {str(e)}")
return outdata.filename
profiling_data_sim_runner["start"]["t_full_step"] += time.time()
# Simulate
if t_step > 0.0:
sim.simulate(t_step, dt)
profiling_data_sim_runner["end"]["t_full_step"] += time.time()
profiling_data_sim_runner["start"]["t_nc_write"] += time.time()
#Download
save_vars = sim.download(download_vars)
#Save to file
for i, var_name in enumerate(save_var_names):
ncvars[var_name][k, :] = save_vars[i]
profiling_data_sim_runner["end"]["t_nc_write"] += time.time()
#Write progress to screen
print_string = progress_printer.get_print_string(t_end)
if print_string:
logger.debug(print_string)
logger.debug(f"Simulated to t={t_end} in "
+ f"{sim.sim_steps()} timesteps (average dt={sim.sim_time() / sim.sim_steps()})")
return outdata.filename, profiling_data_sim_runner, sim.profiling_data_mpi
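A minimal sketch of driving run_simulation for the Euler simulator used elsewhere in this commit; simulator_args stands in for the full constructor-argument dict (it must contain nx, ny and the rho/rho_u/rho_v/E fields that are stripped before the metadata is written):

import numpy as np
from GPUSimulators import EE2D_KP07_dimsplit
from GPUSimulators.common.common import run_simulation

save_times = np.linspace(0.0, 1.0, 11)              # write 11 snapshots up to t=1.0
save_var_names = ['rho', 'rho_u', 'rho_v', 'E']     # None entries would be skipped

outfile, runner_profiling, mpi_profiling = run_simulation(
    EE2D_KP07_dimsplit.EE2D_KP07_dimsplit,          # constructed as simulator(**simulator_args)
    simulator_args,                                 # assumed dict of constructor kwargs
    "output/euler_test.nc", save_times, save_var_names)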

View File

@ -0,0 +1,139 @@
import logging
import numpy as np
import pycuda.gpuarray
import pycuda.driver as cuda
from pycuda.tools import PageLockedMemoryPool
class CudaArray2D:
"""
Class that holds 2D CUDA data
"""
def __init__(self, stream, nx, ny, x_halo, y_halo, cpu_data=None, dtype=np.float32):
"""
Uploads initial data to the CUDA device
"""
self.logger = logging.getLogger(__name__)
self.nx = nx
self.ny = ny
self.x_halo = x_halo
self.y_halo = y_halo
nx_halo = nx + 2 * x_halo
ny_halo = ny + 2 * y_halo
# self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny)
# Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
self.data = pycuda.gpuarray.zeros((ny_halo, nx_halo), dtype)
# For returning to download
self.memorypool = PageLockedMemoryPool()
# If we don't have any data, just allocate and return
if cpu_data is None:
return
# Make sure data is in proper format
if cpu_data.shape != (ny_halo, nx_halo) and cpu_data.shape != (self.ny, self.nx):
raise ValueError(
f"Wrong shape of data {str(cpu_data.shape)} vs {str((self.ny, self.nx))} / {str((ny_halo, nx_halo))}")
if cpu_data.itemsize != 4:
raise ValueError("Wrong size of data type")
if np.isfortran(cpu_data):
raise TypeError("Wrong datatype (Fortran, expected C)")
# Create a copy object from host to device
x = (nx_halo - cpu_data.shape[1]) // 2
y = (ny_halo - cpu_data.shape[0]) // 2
self.upload(stream, cpu_data, extent=[x, y, cpu_data.shape[1], cpu_data.shape[0]])
# self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
def __del__(self, *args):
# self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
self.data.gpudata.free()
self.data = None
def download(self, stream, cpu_data=None, asynch=False, extent=None):
"""
Enables downloading data from GPU to Python
"""
if extent is None:
x = self.x_halo
y = self.y_halo
nx = self.nx
ny = self.ny
else:
x, y, nx, ny = extent
if cpu_data is None:
# self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
# Allocate host memory
# The following fails, don't know why (crashes python)
cpu_data = cuda.pagelocked_empty((int(ny), int(nx)), dtype=np.float32,
mem_flags=cuda.host_alloc_flags.PORTABLE)
# Non-pagelocked: cpu_data = np.empty((ny, nx), dtype=np.float32)
# cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32)
assert nx == cpu_data.shape[1]
assert ny == cpu_data.shape[0]
assert x + nx <= self.nx + 2 * self.x_halo
assert y + ny <= self.ny + 2 * self.y_halo
# Create a copy object from device to host
copy = cuda.Memcpy2D()
copy.set_src_device(self.data.gpudata)
copy.set_dst_host(cpu_data)
# Set offsets and pitch of a source
copy.src_x_in_bytes = int(x) * self.data.strides[1]
copy.src_y = int(y)
copy.src_pitch = self.data.strides[0]
# Set width in bytes to copy for each row and
# number of rows to copy
copy.width_in_bytes = int(nx) * cpu_data.itemsize
copy.height = int(ny)
copy(stream)
if not asynch:
stream.synchronize()
return cpu_data
def upload(self, stream, cpu_data, extent=None):
if extent is None:
x = self.x_halo
y = self.y_halo
nx = self.nx
ny = self.ny
else:
x, y, nx, ny = extent
assert (nx == cpu_data.shape[1])
assert (ny == cpu_data.shape[0])
assert (x + nx <= self.nx + 2 * self.x_halo)
assert (y + ny <= self.ny + 2 * self.y_halo)
# Create a copy object from host to device
copy = cuda.Memcpy2D()
copy.set_dst_device(self.data.gpudata)
copy.set_src_host(cpu_data)
# Set offsets and pitch of the destination
copy.dst_x_in_bytes = int(x) * self.data.strides[1]
copy.dst_y = int(y)
copy.dst_pitch = self.data.strides[0]
# Set width in bytes to copy for each row and
# number of rows to copy
copy.width_in_bytes = int(nx) * cpu_data.itemsize
copy.height = int(ny)
copy(stream)
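A minimal round-trip sketch for CudaArray2D, assuming an initialized CUDA context: host data is uploaded into the interior of the padded device buffer and read back into a preallocated array with the blocking download:

import numpy as np
import pycuda.autoinit  # creates a default CUDA context (assumption for this sketch)
import pycuda.driver as cuda

nx, ny, halo = 8, 8, 2
stream = cuda.Stream()

cpu_in = np.arange(nx * ny, dtype=np.float32).reshape(ny, nx)
arr = CudaArray2D(stream, nx, ny, halo, halo, cpu_in)   # interior of a 12x12 device buffer

cpu_out = np.empty((ny, nx), dtype=np.float32)
arr.download(stream, cpu_data=cpu_out)                  # asynch=False, so this synchronizes
assert np.array_equal(cpu_in, cpu_out)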

View File

@ -0,0 +1,120 @@
import logging
import numpy as np
import pycuda.gpuarray
import pycuda.driver as cuda
from pycuda.tools import PageLockedMemoryPool
class CudaArray3D:
"""
Class that holds 3D data
"""
def __init__(self, stream, nx, ny, nz, x_halo, y_halo, z_halo, cpu_data=None, dtype=np.float32):
"""
Uploads initial data to the CUDA device
"""
self.logger = logging.getLogger(__name__)
self.nx = nx
self.ny = ny
self.nz = nz
self.x_halo = x_halo
self.y_halo = y_halo
self.z_halo = z_halo
nx_halo = nx + 2 * x_halo
ny_halo = ny + 2 * y_halo
nz_halo = nz + 2 * z_halo
# self.logger.debug("Allocating [%dx%dx%d] buffer", self.nx, self.ny, self.nz)
# Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
self.data = pycuda.gpuarray.zeros((nz_halo, ny_halo, nx_halo), dtype)
# For returning to download
self.memorypool = PageLockedMemoryPool()
# If we don't have any data, just allocate and return
if cpu_data is None:
return
# Make sure data is in proper format
if (cpu_data.shape != (nz_halo, ny_halo, nx_halo)
and cpu_data.shape != (self.nz, self.ny, self.nx)):
raise ValueError(f"Wrong shape of data {str(cpu_data.shape)} vs {str((self.nz, self.ny, self.nx))} / {str((nz_halo, ny_halo, nx_halo))}")
if cpu_data.itemsize != 4:
raise ValueError("Wrong size of data type")
if np.isfortran(cpu_data):
raise TypeError("Wrong datatype (Fortran, expected C)")
# Create a copy object from host to device
copy = cuda.Memcpy3D()
copy.set_src_host(cpu_data)
copy.set_dst_device(self.data.gpudata)
# Set offsets of destination
x_offset = (nx_halo - cpu_data.shape[2]) // 2
y_offset = (ny_halo - cpu_data.shape[1]) // 2
z_offset = (nz_halo - cpu_data.shape[0]) // 2
copy.dst_x_in_bytes = x_offset * self.data.strides[1]
copy.dst_y = y_offset
copy.dst_z = z_offset
# Set pitch of destination
copy.dst_pitch = self.data.strides[0]
# Set width in bytes to copy for each row and
# number of rows to copy
width = max(self.nx, cpu_data.shape[2])
height = max(self.ny, cpu_data.shape[1])
depth = max(self.nz, cpu_data.shape[0])
copy.width_in_bytes = width * cpu_data.itemsize
copy.height = height
copy.depth = depth
# Perform the copy
copy(stream)
# self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
def __del__(self, *args):
# self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
self.data.gpudata.free()
self.data = None
def download(self, stream, asynch=False):
"""
Enables downloading data from GPU to Python
"""
# self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
# Allocate host memory
# cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)
# cpu_data = np.empty((self.nz, self.ny, self.nx), dtype=np.float32)
cpu_data = self.memorypool.allocate((self.nz, self.ny, self.nx), dtype=np.float32)
# Create a copy object from device to host
copy = cuda.Memcpy2D()
copy.set_src_device(self.data.gpudata)
copy.set_dst_host(cpu_data)
# Set offsets and pitch of a source
copy.src_x_in_bytes = self.x_halo * self.data.strides[1]
copy.src_y = self.y_halo
copy.src_z = self.z_halo
copy.src_pitch = self.data.strides[0]
# Set width in bytes to copy for each row and
# number of rows to copy
copy.width_in_bytes = self.nx * cpu_data.itemsize
copy.height = self.ny
copy.depth = self.nz
copy(stream)
if not asynch:
stream.synchronize()
return cpu_data

View File

@ -0,0 +1,79 @@
import json
import logging
import os
import netCDF4
import numpy as np
def to_json(in_dict):
out_dict = in_dict.copy()
for key in out_dict:
if isinstance(out_dict[key], np.ndarray):
out_dict[key] = out_dict[key].tolist()
else:
try:
json.dumps(out_dict[key])
except (TypeError, ValueError):
out_dict[key] = str(out_dict[key])
return json.dumps(out_dict)
class DataDumper(object):
"""
Simple class for holding a netCDF4 object
(handles opening and closing nicely)
Use as
with DataDumper("filename") as data:
...
"""
def __init__(self, filename, *args, **kwargs):
self.logger = logging.getLogger(__name__)
# Create directory if needed
filename = os.path.abspath(filename)
dirname = os.path.dirname(filename)
if dirname and not os.path.isdir(dirname):
self.logger.info("Creating directory " + dirname)
os.makedirs(dirname)
# Get mode of a file if we have that
mode = None
if args:
mode = args[0]
elif kwargs and 'mode' in kwargs.keys():
mode = kwargs['mode']
# Create a new unique file if writing
if mode:
if ("w" in mode) or ("+" in mode) or ("a" in mode):
i = 0
stem, ext = os.path.splitext(filename)
while os.path.isfile(filename):
filename = f"{stem}_{str(i).zfill(4)}{ext}"
i = i + 1
self.filename = os.path.abspath(filename)
# Save arguments
self.args = args
self.kwargs = kwargs
# Log output
self.logger.info("Initialized " + self.filename)
def __enter__(self):
self.logger.info("Opening " + self.filename)
if self.args:
self.logger.info("Arguments: " + str(self.args))
if self.kwargs:
self.logger.info("Keyword arguments: " + str(self.kwargs))
self.ncfile = netCDF4.Dataset(self.filename, *self.args, **self.kwargs)
return self
def __exit__(self, *args):
self.logger.info("Closing " + self.filename)
self.ncfile.close()
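A minimal sketch of writing a small dataset through DataDumper (path and variable names are illustrative); if the target file already exists, a new uniquely numbered filename is chosen before opening:

import numpy as np

with DataDumper("output/example.nc", mode='w') as outdata:
    outdata.ncfile.createDimension('x', 16)
    h = outdata.ncfile.createVariable('h', np.dtype('float32').char, ('x',))
    h[:] = np.linspace(0.0, 1.0, 16)
# The netCDF file is closed on exit; outdata.filename holds the actual path used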

View File

@ -0,0 +1,101 @@
import gc
import logging
import os
import signal
import subprocess
import time
from GPUSimulators.common.popen_file_buffer import PopenFileBuffer
class IPEngine(object):
"""
Class for starting IPEngines for MPI processing in IPython
"""
def __init__(self, n_engines):
self.logger = logging.getLogger(__name__)
# Start ipcontroller
self.logger.info("Starting IPController")
self.c_buff = PopenFileBuffer()
c_cmd = ["ipcontroller", "--ip='*'"]
c_params = dict()
c_params['stderr'] = self.c_buff.stderr
c_params['stdout'] = self.c_buff.stdout
c_params['shell'] = False
if os.name == 'nt':
c_params['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
self.c = subprocess.Popen(c_cmd, **c_params)
# Wait until the controller is running
time.sleep(3)
# Start engines
self.logger.info("Starting IPEngines")
self.e_buff = PopenFileBuffer()
e_cmd = ["mpiexec", "-n", str(n_engines), "ipengine", "--mpi"]
e_params = dict()
e_params['stderr'] = self.e_buff.stderr
e_params['stdout'] = self.e_buff.stdout
e_params['shell'] = False
if os.name == 'nt':
e_params['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
self.e = subprocess.Popen(e_cmd, **e_params)
# attach to a running cluster
import ipyparallel
self.cluster = ipyparallel.Client() # profile='mpi')
time.sleep(3)
while len(self.cluster.ids) != n_engines:
time.sleep(0.5)
self.logger.info("Waiting for cluster...")
self.cluster = ipyparallel.Client() # profile='mpi')
self.logger.info("Done")
def __del__(self):
self.shutdown()
def shutdown(self):
if self.e is not None:
if os.name == 'nt':
self.logger.warning("Sending CTRL+C to IPEngine")
self.e.send_signal(signal.CTRL_C_EVENT)
try:
self.e.communicate(timeout=3)
self.e.kill()
except subprocess.TimeoutExpired:
self.logger.warning("Killing IPEngine")
self.e.kill()
self.e.communicate()
self.e = None
cout, cerr = self.e_buff.read()
self.logger.info(f"IPEngine cout: {cout}")
self.logger.info(f"IPEngine cerr: {cerr}")
self.e_buff = None
gc.collect()
if self.c is not None:
if os.name == 'nt':
self.logger.warning("Sending CTRL+C to IPController")
self.c.send_signal(signal.CTRL_C_EVENT)
try:
self.c.communicate(timeout=3)
self.c.kill()
except subprocess.TimeoutExpired:
self.logger.warning("Killing IPController")
self.c.kill()
self.c.communicate()
self.c = None
cout, cerr = self.c_buff.read()
self.logger.info(f"IPController cout: {cout}")
self.logger.info(f"IPController cerr: {cerr}")
self.c_buff = None
gc.collect()
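A minimal usage sketch for IPEngine; it assumes ipcontroller, ipengine, and mpiexec are available on PATH and that the caller shuts the cluster down explicitly when done:

engines = IPEngine(4)               # starts one controller and four MPI-backed engines
view = engines.cluster[:]           # DirectView over all engines
print(view.apply_sync(lambda: "ready"))
engines.shutdown()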

View File

@ -0,0 +1,27 @@
import tempfile
class PopenFileBuffer(object):
"""
Simple class for holding a set of temp files
for communicating with a subprocess
"""
def __init__(self):
self.stdout = tempfile.TemporaryFile(mode='w+t')
self.stderr = tempfile.TemporaryFile(mode='w+t')
def __del__(self):
self.stdout.close()
self.stderr.close()
def read(self):
self.stdout.seek(0)
cout = self.stdout.read()
self.stdout.seek(0, 2)
self.stderr.seek(0)
cerr = self.stderr.read()
self.stderr.seek(0, 2)
return cout, cerr

View File

@ -0,0 +1,62 @@
import logging
import time
import numpy as np
def time_string(seconds):
seconds = int(max(seconds, 1))
minutes, seconds = divmod(seconds, 60)
hours, minutes = divmod(minutes, 60)
periods = [('h', hours), ('m', minutes), ('s', seconds)]
return_string = ' '.join('{}{}'.format(value, name)
for name, value in periods
if value)
return return_string
def progress_bar(step, total_steps, width=30):
progress = np.round(width * step / total_steps).astype(np.int32)
progressbar = "0% [" + "#" * progress + "=" * (width - progress) + "] 100%"
return progressbar
class ProgressPrinter(object):
"""
Small helper class for creating a progress bar
"""
def __init__(self, total_steps, print_every=5):
self.logger = logging.getLogger(__name__)
self.start = time.time()
self.total_steps = total_steps
self.print_every = print_every
self.next_print_time = self.print_every
self.last_step = 0
self.secs_per_iter = None
def get_print_string(self, step):
elapsed = time.time() - self.start
if elapsed > self.next_print_time:
dt = elapsed - (self.next_print_time - self.print_every)
dsteps = step - self.last_step
steps_remaining = self.total_steps - step
if dsteps == 0:
return None
self.last_step = step
self.next_print_time = elapsed + self.print_every
if not self.secs_per_iter:
self.secs_per_iter = dt / dsteps
self.secs_per_iter = 0.2 * self.secs_per_iter + 0.8 * (dt / dsteps)
remaining_time = steps_remaining * self.secs_per_iter
return (f"{progress_bar(step, self.total_steps)}. "
+ f"Total: {time_string(elapsed + remaining_time)}, "
+ f"elapsed: {time_string(elapsed)}, "
+ f"remaining: {time_string(remaining_time)}")
return None
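A minimal sketch of driving ProgressPrinter from a loop; get_print_string only returns a string once per print_every seconds, so most iterations produce nothing:

import time

printer = ProgressPrinter(total_steps=100, print_every=1)
for step in range(1, 101):
    time.sleep(0.05)                       # stand-in for real work
    line = printer.get_print_string(step)
    if line:
        print(line)                        # progress bar plus total/elapsed/remaining times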

View File

@ -0,0 +1,26 @@
import logging
import time
class Timer(object):
"""
Class which keeps track of time spent for a section of code
"""
def __init__(self, tag, log_level=logging.DEBUG):
self.tag = tag
self.log_level = log_level
self.logger = logging.getLogger(__name__)
def __enter__(self):
self.start = time.time()
return self
def __exit__(self, *args):
self.end = time.time()
self.secs = self.end - self.start
self.msecs = self.secs * 1000 # milliseconds
self.logger.log(self.log_level, f"{self.tag}: {self.msecs} ms")
def elapsed(self):
return time.time() - self.start
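A minimal usage sketch for Timer; the elapsed time is logged when the block exits and remains available on the object afterwards:

import logging
logging.basicConfig(level=logging.DEBUG)   # so the DEBUG-level timing message is visible

with Timer("example") as t:
    total = sum(i * i for i in range(100_000))   # stand-in for real work

print(f"{t.secs:.4f} s ({t.msecs:.1f} ms), result={total}")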

View File

@ -0,0 +1,2 @@
from .cuda_context import CudaContext
from .hip_context import HIPContext

View File

@ -21,8 +21,6 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
import numpy as np
import time
import re
import io
import hashlib
@ -33,8 +31,8 @@ import pycuda.compiler as cuda_compiler
import pycuda.gpuarray
import pycuda.driver as cuda
from GPUSimulators import Autotuner, Common
from GPUSimulators.gpu.Context import Context
from GPUSimulators import Autotuner
from GPUSimulators.common import common
class CudaContext(object):

View File

@ -3,10 +3,10 @@ import io
import os.path
import hip as hip_main
from hip import hip, hiprtc
from hip import hip
from GPUSimulators import Common
from GPUSimulators.gpu.Context import Context
from GPUSimulators.common import common
from GPUSimulators.gpu.context import Context
class HIPContext(Context):

View File

@ -52,9 +52,7 @@
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from GPUSimulators import IPythonMagic"
]
"source": ""
},
{
"cell_type": "code",
@ -115,10 +113,10 @@
"import numpy as np\n",
"from matplotlib import pyplot as plt\n",
"from mpi4py import MPI\n",
"import time\n",
"import json\n",
"\n",
"from GPUSimulators import IPythonMagic, MPISimulator, Common"
"from GPUSimulators import MPISimulator\n",
"from GPUSimulators.common import common"
]
},
{
@ -317,7 +315,6 @@
"%%px\n",
"\n",
"from GPUSimulators.helpers import InitialConditions\n",
"from GPUSimulators.Simulator import BoundaryCondition\n",
"\n",
"my_context.autotuner = None\n",
"\n",
@ -348,7 +345,7 @@
" return sim\n",
"\n",
"\n",
"outfile = Common.runSimulation(genSim, arguments, outfile, save_times, save_var_names)"
"outfile = Common.run_simulation(genSim, arguments, outfile, save_times, save_var_names)"
]
},
{
@ -657,7 +654,7 @@
" sim = MPISimulator.MPISimulator(local_sim, grid)\n",
" return sim\n",
"\n",
"outfile = Common.runSimulation(genSim, arguments, outfile, save_times, save_var_names)"
"outfile = Common.run_simulation(genSim, arguments, outfile, save_times, save_var_names)"
]
},
{

View File

@ -13,19 +13,10 @@
"%load_ext line_profiler\n",
"\n",
"#Import packages we need\n",
"import numpy as np\n",
"from matplotlib import animation, rc\n",
"from matplotlib import pyplot as plt\n",
"\n",
"import subprocess\n",
"import os\n",
"import gc\n",
"import datetime\n",
"import importlib\n",
"import logging\n",
"\n",
"import pycuda.driver as cuda\n",
"import pycuda.compiler\n",
"\n",
"try:\n",
" from StringIO import StringIO\n",
@ -37,7 +28,7 @@
"rc('figure', figsize=(16.0, 12.0))\n",
"rc('animation', html='html5')\n",
"\n",
"from GPUSimulators import Common, IPythonMagic\n",
"from GPUSimulators.common import common\n",
"from GPUSimulators.helpers import InitialConditions"
]
},
@ -129,7 +120,7 @@
" h = sim.u0[0].download(sim.stream)\n",
" \n",
" plt.figure()\n",
" plt.title(str(sim) + \", t=\" + str(sim.simTime()) + \", nt=\" + str(sim.simSteps()))\n",
" plt.title(str(sim) + \", t=\" + str(sim.sim_time()) + \", nt=\" + str(sim.sim_steps()))\n",
" extent = [0, sim.dx*sim.nx, 0, sim.dy*sim.ny]\n",
" plt.imshow(h, vmin=0.49, vmax=0.52, extent=extent)\n",
" plt.colorbar()"
@ -292,16 +283,16 @@
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[10], line 5\u001b[0m\n\u001b[1;32m 2\u001b[0m importlib\u001b[38;5;241m.\u001b[39mreload(KP07)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m Common\u001b[38;5;241m.\u001b[39mTimer(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mconstruct\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m t:\n\u001b[0;32m----> 5\u001b[0m sim \u001b[38;5;241m=\u001b[39m \u001b[43mKP07\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mKP07\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43marguments\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m Common\u001b[38;5;241m.\u001b[39mTimer(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstep\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m t:\n\u001b[1;32m 8\u001b[0m t \u001b[38;5;241m=\u001b[39m sim\u001b[38;5;241m.\u001b[39msimulate(t_end)\n",
"File \u001b[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/KP07.py:70\u001b[0m, in \u001b[0;36mKP07.__init__\u001b[0;34m(self, context, h0, hu0, hv0, nx, ny, dx, dy, g, theta, cfl_scale, order, boundary_conditions, block_width, block_height, dt, compile_opts)\u001b[0m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 54\u001b[0m \u001b[38;5;124;03mInitialization routine\u001b[39;00m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;124;03m\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[38;5;124;03m compile_opts: Pass a list of nvcc compiler options\u001b[39;00m\n\u001b[1;32m 67\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 69\u001b[0m \u001b[38;5;66;03m# Call super constructor\u001b[39;00m\n\u001b[0;32m---> 70\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcontext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 71\u001b[0m \u001b[43m \u001b[49m\u001b[43mnx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mny\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 72\u001b[0m \u001b[43m \u001b[49m\u001b[43mdx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 73\u001b[0m \u001b[43m \u001b[49m\u001b[43mboundary_conditions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 74\u001b[0m \u001b[43m \u001b[49m\u001b[43mcfl_scale\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 75\u001b[0m \u001b[43m \u001b[49m\u001b[43morder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 76\u001b[0m \u001b[43m \u001b[49m\u001b[43mblock_width\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mblock_height\u001b[49m\u001b[43m)\u001b[49m;\n\u001b[1;32m 77\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mg \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mfloat32(g) \n\u001b[1;32m 78\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtheta \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mfloat32(theta) \n",
"File \u001b[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Simulator.py:146\u001b[0m, in \u001b[0;36mBaseSimulator.__init__\u001b[0;34m(self, context, nx, ny, dx, dy, boundary_conditions, cfl_scale, num_substeps, block_width, block_height)\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[38;5;66;03m#Handle autotuning block size\u001b[39;00m\n\u001b[1;32m 145\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontext\u001b[38;5;241m.\u001b[39mautotuner:\n\u001b[0;32m--> 146\u001b[0m peak_configuration \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautotuner\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_peak_performance\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;18;43m__class__\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 147\u001b[0m block_width \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mint\u001b[39m(peak_configuration[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mblock_width\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 148\u001b[0m block_height \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mint\u001b[39m(peak_configuration[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mblock_height\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n",
"File \u001b[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:121\u001b[0m, in \u001b[0;36mAutotuner.get_peak_performance\u001b[0;34m(self, simulator)\u001b[0m\n\u001b[1;32m 119\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCould not get autotuned peak performance for \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m: benchmarking\u001b[39m\u001b[38;5;124m\"\u001b[39m, key)\n\u001b[1;32m 120\u001b[0m data\u001b[38;5;241m.\u001b[39mclose()\n\u001b[0;32m--> 121\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbenchmark\u001b[49m\u001b[43m(\u001b[49m\u001b[43msimulator\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 122\u001b[0m data \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mload(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfilename)\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfind_max_index\u001b[39m(megacells):\n",
"File \u001b[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:84\u001b[0m, in \u001b[0;36mAutotuner.benchmark\u001b[0;34m(self, simulator, force)\u001b[0m\n\u001b[1;32m 81\u001b[0m benchmark_data[k] \u001b[38;5;241m=\u001b[39m v\n\u001b[1;32m 83\u001b[0m \u001b[38;5;66;03m# Run benchmark\u001b[39;00m\n\u001b[0;32m---> 84\u001b[0m benchmark_data[key \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_megacells\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mAutotuner\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbenchmark_single_simulator\u001b[49m\u001b[43m(\u001b[49m\u001b[43msimulator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marguments\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mblock_widths\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mblock_heights\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 85\u001b[0m benchmark_data[key \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_block_widths\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mblock_widths\n\u001b[1;32m 86\u001b[0m benchmark_data[key \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_block_heights\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mblock_heights\n",
"File \u001b[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:162\u001b[0m, in \u001b[0;36mAutotuner.benchmark_single_simulator\u001b[0;34m(simulator, arguments, block_widths, block_heights)\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, block_width \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(block_widths):\n\u001b[1;32m 161\u001b[0m sim_arguments\u001b[38;5;241m.\u001b[39mupdate({\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mblock_width\u001b[39m\u001b[38;5;124m'\u001b[39m: block_width})\n\u001b[0;32m--> 162\u001b[0m megacells[j, i] \u001b[38;5;241m=\u001b[39m \u001b[43mAutotuner\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_benchmark\u001b[49m\u001b[43m(\u001b[49m\u001b[43msimulator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msim_arguments\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 165\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCompleted \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m in \u001b[39m\u001b[38;5;132;01m%f\u001b[39;00m\u001b[38;5;124m seconds\u001b[39m\u001b[38;5;124m\"\u001b[39m, simulator\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, t\u001b[38;5;241m.\u001b[39msecs)\n\u001b[1;32m 167\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m megacells\n",
"File \u001b[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:200\u001b[0m, in \u001b[0;36mAutotuner.run_benchmark\u001b[0;34m(simulator, arguments, timesteps, warmup_timesteps)\u001b[0m\n\u001b[1;32m 197\u001b[0m end\u001b[38;5;241m.\u001b[39mrecord(sim\u001b[38;5;241m.\u001b[39mstream)\n\u001b[1;32m 199\u001b[0m \u001b[38;5;66;03m#Synchronize end event\u001b[39;00m\n\u001b[0;32m--> 200\u001b[0m \u001b[43mend\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msynchronize\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 202\u001b[0m \u001b[38;5;66;03m#Compute megacells\u001b[39;00m\n\u001b[1;32m 203\u001b[0m gpu_elapsed \u001b[38;5;241m=\u001b[39m end\u001b[38;5;241m.\u001b[39mtime_since(start)\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m1.0e-3\u001b[39m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)",
"Cell \u001B[0;32mIn[10], line 5\u001B[0m\n\u001B[1;32m 2\u001B[0m importlib\u001B[38;5;241m.\u001B[39mreload(KP07)\n\u001B[1;32m 4\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m Common\u001B[38;5;241m.\u001B[39mTimer(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mconstruct\u001B[39m\u001B[38;5;124m\"\u001B[39m) \u001B[38;5;28;01mas\u001B[39;00m t:\n\u001B[0;32m----> 5\u001B[0m sim \u001B[38;5;241m=\u001B[39m \u001B[43mKP07\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mKP07\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43marguments\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 7\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m Common\u001B[38;5;241m.\u001B[39mTimer(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mstep\u001B[39m\u001B[38;5;124m\"\u001B[39m) \u001B[38;5;28;01mas\u001B[39;00m t:\n\u001B[1;32m 8\u001B[0m t \u001B[38;5;241m=\u001B[39m sim\u001B[38;5;241m.\u001B[39msimulate(t_end)\n",
"File \u001B[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/KP07.py:70\u001B[0m, in \u001B[0;36mKP07.__init__\u001B[0;34m(self, context, h0, hu0, hv0, nx, ny, dx, dy, g, theta, cfl_scale, order, boundary_conditions, block_width, block_height, dt, compile_opts)\u001B[0m\n\u001B[1;32m 53\u001B[0m \u001B[38;5;250m\u001B[39m\u001B[38;5;124;03m\"\"\"\u001B[39;00m\n\u001B[1;32m 54\u001B[0m \u001B[38;5;124;03mInitialization routine\u001B[39;00m\n\u001B[1;32m 55\u001B[0m \u001B[38;5;124;03m\u001B[39;00m\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 66\u001B[0m \u001B[38;5;124;03m compile_opts: Pass a list of nvcc compiler options\u001B[39;00m\n\u001B[1;32m 67\u001B[0m \u001B[38;5;124;03m\"\"\"\u001B[39;00m\n\u001B[1;32m 69\u001B[0m \u001B[38;5;66;03m# Call super constructor\u001B[39;00m\n\u001B[0;32m---> 70\u001B[0m \u001B[38;5;28;43msuper\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[38;5;21;43m__init__\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mcontext\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\n\u001B[1;32m 71\u001B[0m \u001B[43m \u001B[49m\u001B[43mnx\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mny\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\n\u001B[1;32m 72\u001B[0m \u001B[43m \u001B[49m\u001B[43mdx\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mdy\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\n\u001B[1;32m 73\u001B[0m \u001B[43m \u001B[49m\u001B[43mboundary_conditions\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 74\u001B[0m \u001B[43m \u001B[49m\u001B[43mcfl_scale\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 75\u001B[0m \u001B[43m \u001B[49m\u001B[43morder\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 76\u001B[0m \u001B[43m \u001B[49m\u001B[43mblock_width\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mblock_height\u001B[49m\u001B[43m)\u001B[49m;\n\u001B[1;32m 77\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mg \u001B[38;5;241m=\u001B[39m np\u001B[38;5;241m.\u001B[39mfloat32(g) \n\u001B[1;32m 78\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mtheta \u001B[38;5;241m=\u001B[39m np\u001B[38;5;241m.\u001B[39mfloat32(theta) \n",
"File \u001B[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Simulator.py:146\u001B[0m, in \u001B[0;36mBaseSimulator.__init__\u001B[0;34m(self, context, nx, ny, dx, dy, boundary_conditions, cfl_scale, num_substeps, block_width, block_height)\u001B[0m\n\u001B[1;32m 144\u001B[0m \u001B[38;5;66;03m#Handle autotuning block size\u001B[39;00m\n\u001B[1;32m 145\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcontext\u001B[38;5;241m.\u001B[39mautotuner:\n\u001B[0;32m--> 146\u001B[0m peak_configuration \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcontext\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mautotuner\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_peak_performance\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[38;5;18;43m__class__\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[1;32m 147\u001B[0m block_width \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mint\u001B[39m(peak_configuration[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mblock_width\u001B[39m\u001B[38;5;124m\"\u001B[39m])\n\u001B[1;32m 148\u001B[0m block_height \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mint\u001B[39m(peak_configuration[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mblock_height\u001B[39m\u001B[38;5;124m\"\u001B[39m])\n",
"File \u001B[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:121\u001B[0m, in \u001B[0;36mAutotuner.get_peak_performance\u001B[0;34m(self, simulator)\u001B[0m\n\u001B[1;32m 119\u001B[0m logger\u001B[38;5;241m.\u001B[39mdebug(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mCould not get autotuned peak performance for \u001B[39m\u001B[38;5;132;01m%s\u001B[39;00m\u001B[38;5;124m: benchmarking\u001B[39m\u001B[38;5;124m\"\u001B[39m, key)\n\u001B[1;32m 120\u001B[0m data\u001B[38;5;241m.\u001B[39mclose()\n\u001B[0;32m--> 121\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mbenchmark\u001B[49m\u001B[43m(\u001B[49m\u001B[43msimulator\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 122\u001B[0m data \u001B[38;5;241m=\u001B[39m np\u001B[38;5;241m.\u001B[39mload(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mfilename)\n\u001B[1;32m 124\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mfind_max_index\u001B[39m(megacells):\n",
"File \u001B[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:84\u001B[0m, in \u001B[0;36mAutotuner.benchmark\u001B[0;34m(self, simulator, force)\u001B[0m\n\u001B[1;32m 81\u001B[0m benchmark_data[k] \u001B[38;5;241m=\u001B[39m v\n\u001B[1;32m 83\u001B[0m \u001B[38;5;66;03m# Run benchmark\u001B[39;00m\n\u001B[0;32m---> 84\u001B[0m benchmark_data[key \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m_megacells\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;241m=\u001B[39m \u001B[43mAutotuner\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mbenchmark_single_simulator\u001B[49m\u001B[43m(\u001B[49m\u001B[43msimulator\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43marguments\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mblock_widths\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mblock_heights\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 85\u001B[0m benchmark_data[key \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m_block_widths\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mblock_widths\n\u001B[1;32m 86\u001B[0m benchmark_data[key \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m_block_heights\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mblock_heights\n",
"File \u001B[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:162\u001B[0m, in \u001B[0;36mAutotuner.benchmark_single_simulator\u001B[0;34m(simulator, arguments, block_widths, block_heights)\u001B[0m\n\u001B[1;32m 160\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m i, block_width \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28menumerate\u001B[39m(block_widths):\n\u001B[1;32m 161\u001B[0m sim_arguments\u001B[38;5;241m.\u001B[39mupdate({\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mblock_width\u001B[39m\u001B[38;5;124m'\u001B[39m: block_width})\n\u001B[0;32m--> 162\u001B[0m megacells[j, i] \u001B[38;5;241m=\u001B[39m \u001B[43mAutotuner\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mrun_benchmark\u001B[49m\u001B[43m(\u001B[49m\u001B[43msimulator\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43msim_arguments\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 165\u001B[0m logger\u001B[38;5;241m.\u001B[39mdebug(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mCompleted \u001B[39m\u001B[38;5;132;01m%s\u001B[39;00m\u001B[38;5;124m in \u001B[39m\u001B[38;5;132;01m%f\u001B[39;00m\u001B[38;5;124m seconds\u001B[39m\u001B[38;5;124m\"\u001B[39m, simulator\u001B[38;5;241m.\u001B[39m\u001B[38;5;18m__name__\u001B[39m, t\u001B[38;5;241m.\u001B[39msecs)\n\u001B[1;32m 167\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m megacells\n",
"File \u001B[0;32m~/PycharmProjects/FiniteVolumeGPU/GPUSimulators/Autotuner.py:200\u001B[0m, in \u001B[0;36mAutotuner.run_benchmark\u001B[0;34m(simulator, arguments, timesteps, warmup_timesteps)\u001B[0m\n\u001B[1;32m 197\u001B[0m end\u001B[38;5;241m.\u001B[39mrecord(sim\u001B[38;5;241m.\u001B[39mstream)\n\u001B[1;32m 199\u001B[0m \u001B[38;5;66;03m#Synchronize end event\u001B[39;00m\n\u001B[0;32m--> 200\u001B[0m \u001B[43mend\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43msynchronize\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 202\u001B[0m \u001B[38;5;66;03m#Compute megacells\u001B[39;00m\n\u001B[1;32m 203\u001B[0m gpu_elapsed \u001B[38;5;241m=\u001B[39m end\u001B[38;5;241m.\u001B[39mtime_since(start)\u001B[38;5;241m*\u001B[39m\u001B[38;5;241m1.0e-3\u001B[39m\n",
"\u001B[0;31mKeyboardInterrupt\u001B[0m: "
]
}
],

View File

@ -34,8 +34,9 @@ from mpi4py import MPI
import pycuda.driver as cuda
# Simulator engine etc
from GPUSimulators import MPISimulator, Common
from GPUSimulators.gpu import CudaContext
from GPUSimulators import MPISimulator
from GPUSimulators.common import common
from GPUSimulators.gpu import cuda_context
from GPUSimulators import EE2D_KP07_dimsplit
from GPUSimulators.helpers import InitialConditions as IC
@ -147,7 +148,7 @@ def genSim(grid, **kwargs):
return sim
outfile, sim_runner_profiling_data, sim_profiling_data = Common.runSimulation(
outfile, sim_runner_profiling_data, sim_profiling_data = common.run_simulation(
genSim, arguments, outfile, save_times, save_var_names, dt)
if(args.profile):
@ -183,8 +184,8 @@ if(args.profile and MPI.COMM_WORLD.rank == 0):
profiling_data["slurm_job_id"] = job_id
profiling_data["n_cuda_devices"] = str(num_cuda_devices)
profiling_data["n_processes"] = str(MPI.COMM_WORLD.size)
profiling_data["git_hash"] = Common.getGitHash()
profiling_data["git_status"] = Common.getGitStatus()
profiling_data["git_hash"] = Common.get_git_hash()
profiling_data["git_status"] = Common.get_git_status()
with open(profiling_file, "w") as write_file:
json.dump(profiling_data, write_file)

View File

@ -25,7 +25,8 @@ import gc
import logging
#Simulator engine etc
from GPUSimulators import SHMEMSimulatorGroup, Common
from GPUSimulators import SHMEMSimulatorGroup
from GPUSimulators.common import common
from GPUSimulators import EE2D_KP07_dimsplit
from GPUSimulators.helpers import InitialConditions as IC
@ -99,7 +100,7 @@ def genSim(sims, grid, **kwargs):
sim = SHMEMSimulatorGroup.SHMEMSimulatorGroup(sims, grid)
return sim
outfile = Common.runSimulation(genSim, arguments, outfile, save_times, save_var_names)
outfile = common.run_simulation(genSim, arguments, outfile, save_times, save_var_names)

View File

@ -28,8 +28,8 @@ import logging
import pycuda.driver as cuda
# Simulator engine etc
from GPUSimulators import Common
from GPUSimulators.gpu import CudaContext
from GPUSimulators.common import common
from GPUSimulators.gpu import cuda_context
from GPUSimulators import EE2D_KP07_dimsplit
from GPUSimulators.helpers import InitialConditions as IC
@ -104,7 +104,7 @@ def genSim(**kwargs):
return local_sim
outfile = Common.runSimulation(
outfile = common.run_simulation(
genSim, arguments, outfile, save_times, save_var_names)
####