diff --git a/Figures.ipynb b/Figures.ipynb index d7144f3..904b155 100644 --- a/Figures.ipynb +++ b/Figures.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -36,123 +36,9 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 5, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " dt n_cuda_devices n_processes n_time_steps nx ny \\\n", - "11 0.00001 1 1 202.0 4096.0 4096.0 \n", - "7 0.00001 2 2 202.0 4096.0 4096.0 \n", - "14 0.00001 3 3 202.0 4096.0 4096.0 \n", - "0 0.00001 4 4 202.0 4096.0 4096.0 \n", - "5 0.00001 5 5 202.0 4096.0 4096.0 \n", - "8 0.00001 6 6 202.0 4096.0 4096.0 \n", - "2 0.00001 7 7 202.0 4096.0 4096.0 \n", - "1 0.00001 8 8 202.0 4096.0 4096.0 \n", - "15 0.00001 9 9 202.0 4096.0 4096.0 \n", - "9 0.00001 10 10 202.0 4096.0 4096.0 \n", - "6 0.00001 11 11 202.0 4096.0 4096.0 \n", - "12 0.00001 12 12 202.0 4096.0 4096.0 \n", - "3 0.00001 13 13 202.0 4096.0 4096.0 \n", - "4 0.00001 14 14 202.0 4096.0 4096.0 \n", - "10 0.00001 15 15 202.0 4096.0 4096.0 \n", - "13 0.00001 16 16 202.0 4096.0 4096.0 \n", - "\n", - " outfile slurm_job_id t_init \\\n", - "11 /work/martinls/220879/ShallowWaterGPU/mpi_out_... 220879.0 5.507971 \n", - "7 /work/martinls/220880/ShallowWaterGPU/mpi_out_... 220880.0 5.721808 \n", - "14 /work/martinls/220881/ShallowWaterGPU/mpi_out_... 220881.0 5.717571 \n", - "0 /work/martinls/220882/ShallowWaterGPU/mpi_out_... 220882.0 6.074582 \n", - "5 /work/martinls/220883/ShallowWaterGPU/mpi_out_... 220883.0 6.108083 \n", - "8 /work/martinls/220884/ShallowWaterGPU/mpi_out_... 
220884.0 5.634954 \n", - "2 /work/martinls/220885/ShallowWaterGPU/mpi_out_... 220885.0 5.070476 \n", - "1 /work/martinls/220886/ShallowWaterGPU/mpi_out_... 220886.0 4.180240 \n", - "15 /work/martinls/220887/ShallowWaterGPU/mpi_out_... 220887.0 4.341784 \n", - "9 /work/martinls/220888/ShallowWaterGPU/mpi_out_... 220888.0 5.299585 \n", - "6 /work/martinls/220889/ShallowWaterGPU/mpi_out_... 220889.0 6.026560 \n", - "12 /work/martinls/220890/ShallowWaterGPU/mpi_out_... 220890.0 6.275554 \n", - "3 /work/martinls/220891/ShallowWaterGPU/mpi_out_... 220891.0 7.103848 \n", - "4 /work/martinls/220892/ShallowWaterGPU/mpi_out_... 220892.0 8.413395 \n", - "10 /work/martinls/220893/ShallowWaterGPU/mpi_out_... 220893.0 9.477692 \n", - "13 /work/martinls/220894/ShallowWaterGPU/mpi_out_... 220894.0 9.881657 \n", - "\n", - " t_nc_write t_sim_init t_sim_mpi_init t_step t_step_mpi \\\n", - "11 3.687433 41.936742 0.002744 1.654766 1.635376 \n", - "7 3.692994 22.107036 0.002273 23.791452 1.607910 \n", - "14 3.710237 37.115577 0.002727 10.057041 1.693726 \n", - "0 3.825206 45.152544 0.002787 1.858922 1.732544 \n", - "5 3.833911 45.100539 0.002747 1.893753 1.798096 \n", - "8 3.779569 21.815326 0.003427 20.156193 1.755737 \n", - "2 3.902169 41.552375 0.003021 1.923522 1.733032 \n", - "1 3.880948 25.252051 0.003020 1.858470 1.778198 \n", - "15 3.870645 25.595168 0.002387 1.857388 1.771851 \n", - "9 3.858673 25.574244 0.003054 1.863774 1.784302 \n", - "6 3.944965 26.100656 0.002582 1.899902 1.745728 \n", - "12 3.932847 29.110549 0.003453 4.618246 1.801758 \n", - "3 3.973085 33.206754 0.003094 1.944592 1.778198 \n", - "4 3.987867 28.638798 0.003963 2.088098 1.781616 \n", - "10 3.998516 28.688051 0.002921 1.876371 1.801880 \n", - "13 3.955760 30.579091 0.002603 1.924463 1.780518 \n", - "\n", - " t_step_mpi_halo_exchange t_step_mpi_halo_exchange_download \\\n", - "11 0.0 0.038208 \n", - "7 0.0 0.040833 \n", - "14 0.0 0.039368 \n", - "0 0.0 0.038696 \n", - "5 0.0 0.037964 \n", - "8 0.0 0.038269 
\n", - "2 0.0 0.038757 \n", - "1 0.0 0.038025 \n", - "15 0.0 0.037292 \n", - "9 0.0 0.037842 \n", - "6 0.0 0.038269 \n", - "12 0.0 0.037842 \n", - "3 0.0 0.037415 \n", - "4 0.0 0.038818 \n", - "10 0.0 0.038269 \n", - "13 0.0 0.038330 \n", - "\n", - " t_step_mpi_halo_exchange_sendreceive t_step_mpi_halo_exchange_upload \\\n", - "11 0.016968 0.039429 \n", - "7 0.159668 0.046143 \n", - "14 0.139648 0.041931 \n", - "0 0.083252 0.044800 \n", - "5 0.036621 0.040771 \n", - "8 0.090393 0.043579 \n", - "2 0.100586 0.042480 \n", - "1 0.054260 0.041016 \n", - "15 0.061646 0.039795 \n", - "9 0.066345 0.039734 \n", - "6 0.152771 0.040283 \n", - "12 0.051697 0.040100 \n", - "3 0.163818 0.039673 \n", - "4 0.111450 0.041382 \n", - "10 0.072327 0.040405 \n", - "13 0.062927 0.041321 \n", - "\n", - " t_total \n", - "11 53.170382 \n", - "7 55.784859 \n", - "14 57.047498 \n", - "0 57.329107 \n", - "5 57.349928 \n", - "8 51.902805 \n", - "2 52.942312 \n", - "1 35.602447 \n", - "15 36.094733 \n", - "9 36.997667 \n", - "6 38.400811 \n", - "12 44.374620 \n", - "3 46.639225 \n", - "4 43.558686 \n", - "10 44.496923 \n", - "13 46.773383 \n" - ] - } - ], + "outputs": [], "source": [ "def read_profiling_files(profile_dir_path=\".\"):\n", " profiling_data = pd.DataFrame()\n", @@ -162,14 +48,12 @@ " for json_filename in json_filenames:\n", " with open(os.path.join(profile_dir_path, json_filename)) as json_file:\n", " profiling_data = profiling_data.append(json.load(json_file), ignore_index=True)\n", + " profiling_data = profiling_data.sort_values(by=[\"n_processes\"], ignore_index=True)\n", "\n", " profiling_data.n_processes = profiling_data.n_processes.astype(int)\n", - " profiling_data = profiling_data.sort_values(by=[\"n_processes\"])\n", + " profiling_data = profiling_data.sort_values(by=[\"n_processes\"], ignore_index=True)\n", "\n", - " return profiling_data\n", - "\n", - "profiling_data = read_profiling_files(\"output_dgx-2/2022-04-26T155207/\")\n", - "print(profiling_data)" + " return 
profiling_data" ] }, { @@ -181,96 +65,94 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.083251953125\n", - "11 0.016968\n", - "7 0.159668\n", - "14 0.139648\n", - "0 0.083252\n", - "5 0.036621\n", - "8 0.090393\n", - "2 0.100586\n", - "1 0.054260\n", - "15 0.061646\n", - "9 0.066345\n", - "6 0.152771\n", - "12 0.051697\n", - "3 0.163818\n", - "4 0.111450\n", - "10 0.072327\n", - "13 0.062927\n", - "Name: t_step_mpi_halo_exchange_sendreceive, dtype: float64\n", - "11 4.906475\n", - "7 0.521407\n", - "14 0.596154\n", - "0 1.000000\n", - "5 2.273333\n", - "8 0.920999\n", - "2 0.827670\n", - "1 1.534308\n", - "15 1.350495\n", - "9 1.254830\n", - "6 0.544946\n", - "12 1.610390\n", - "3 0.508197\n", - "4 0.746988\n", - "10 1.151055\n", - "13 1.322987\n", - "Name: t_step_mpi_halo_exchange_sendreceive, dtype: float64\n" + " t_init t_total outfile \\\n", + "0 1.248385 20.045867 /work/martinls/230527/ShallowWaterGPU/mpi_out_... \n", + "1 1.687006 21.810200 /work/martinls/230528/ShallowWaterGPU/mpi_out_... \n", + "2 2.178354 24.593490 /work/martinls/230530/ShallowWaterGPU/mpi_out_... \n", + "3 2.690906 25.624513 /work/martinls/230531/ShallowWaterGPU/mpi_out_... \n", + "4 3.629718 26.697773 /work/martinls/230532/ShallowWaterGPU/mpi_out_... \n", + "5 4.364927 27.958164 /work/martinls/230533/ShallowWaterGPU/mpi_out_... \n", + "6 5.628270 29.105025 /work/martinls/230534/ShallowWaterGPU/mpi_out_... \n", + "7 6.777608 30.504384 /work/martinls/230535/ShallowWaterGPU/mpi_out_... 
\n", + "\n", + " t_sim_init t_nc_write t_full_step t_mpi_halo_exchange \\\n", + "0 1.880793 12.403532 4.007889 0.0 \n", + "1 2.117109 12.576457 4.909249 0.0 \n", + "2 2.050483 12.774502 7.045701 0.0 \n", + "3 2.216515 13.096246 7.060501 0.0 \n", + "4 2.259021 13.178762 7.057118 0.0 \n", + "5 2.455840 13.353797 7.218295 0.0 \n", + "6 2.354878 13.576300 6.985424 0.0 \n", + "7 2.688699 13.492631 6.996821 0.0 \n", + "\n", + " t_mpi_halo_exchange_download t_mpi_halo_exchange_upload \\\n", + "0 3.883057 0.025879 \n", + "1 4.622559 0.027954 \n", + "2 3.596680 0.027832 \n", + "3 6.201660 0.028931 \n", + "4 3.875732 0.027222 \n", + "5 4.124268 0.028076 \n", + "6 4.145630 0.028564 \n", + "7 5.710327 0.030151 \n", + "\n", + " t_mpi_halo_exchange_sendreceive t_mpi_step nx ny dt \\\n", + "0 0.015381 0.017944 8192.0 8192.0 0.000001 \n", + "1 0.122803 0.018860 8192.0 8192.0 0.000001 \n", + "2 3.337158 0.019775 8192.0 8192.0 0.000001 \n", + "3 0.542480 0.019165 8192.0 8192.0 0.000001 \n", + "4 0.423584 0.020264 8192.0 8192.0 0.000001 \n", + "5 2.685059 0.019531 8192.0 8192.0 0.000001 \n", + "6 0.510254 0.019775 8192.0 8192.0 0.000001 \n", + "7 0.753418 0.018982 8192.0 8192.0 0.000001 \n", + "\n", + " n_time_steps slurm_job_id n_cuda_devices n_processes \\\n", + "0 200.0 230527.0 1 1 \n", + "1 200.0 230528.0 2 2 \n", + "2 200.0 230530.0 3 3 \n", + "3 200.0 230531.0 4 4 \n", + "4 200.0 230532.0 5 5 \n", + "5 200.0 230533.0 6 6 \n", + "6 200.0 230534.0 7 7 \n", + "7 200.0 230535.0 8 8 \n", + "\n", + " git_hash \\\n", + "0 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "1 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "2 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "3 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "4 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "5 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "6 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "7 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "\n", + " git_status \n", + "0 M 
Figures.ipynb\\n M dgx-2_strong_scaling_benc... \n", + "1 M Figures.ipynb\\n M dgx-2_strong_scaling_benc... \n", + "2 M Figures.ipynb\\n M dgx-2_strong_scaling_benc... \n", + "3 M Figures.ipynb\\n M dgx-2_strong_scaling_benc... \n", + "4 M Figures.ipynb\\n M dgx-2_strong_scaling_benc... \n", + "5 M Figures.ipynb\\n M dgx-2_strong_scaling_benc... \n", + "6 M Figures.ipynb\\n M dgx-2_strong_scaling_benc... \n", + "7 M Figures.ipynb\\n M dgx-2_strong_scaling_benc... \n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - ":23: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.\n", - " fig.show()\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" } ], "source": [ - "fig, ax = plt.subplots(figsize=(8,6))\n", + "# DGX-2\n", + "#weak_scaling_profiling_data = read_profiling_files(\"output_dgx-2/weak_scaling/2022-06-09T134809/\")\n", "\n", - "# FIXME! Sort AND give new indices\n", - "print(profiling_data[\"t_step_mpi_halo_exchange_sendreceive\"][0])\n", - "print(profiling_data[\"t_step_mpi_halo_exchange_sendreceive\"])\n", - "print(speedup(profiling_data[\"t_step_mpi_halo_exchange_sendreceive\"][0], profiling_data[\"t_step_mpi_halo_exchange_sendreceive\"]))\n", + "# HGX\n", + "weak_scaling_profiling_data = read_profiling_files(\"output_hgx/weak_scaling/2022-06-16T162931/\")\n", + "##weak_scaling_profiling_data = read_profiling_files(\"output_hgx/weak_scaling/2022-06-16T170630/\")\n", "\n", - "ax.plot(profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", - " speedup(profiling_data[\"t_total\"][0], profiling_data[\"t_total\"]), label=\"Total\")\n", - "\n", - "ax.plot(profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", - " speedup(profiling_data[\"t_step_mpi_halo_exchange_sendreceive\"][0], profiling_data[\"t_step_mpi_halo_exchange_sendreceive\"]), label=\"MPI send/recv\")\n", - "\n", - "ax.plot(profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", - " speedup(profiling_data[\"t_step\"][0], profiling_data[\"t_step\"]), label=\"Simulate\")\n", - "\n", - "#ax.plot(nproc, speedup, label=\"Actual\")\n", - "\n", - "ax.plot(nproc, np.ones(len(nproc)), label=\"Ideal (constant)\", linestyle=\"dotted\")\n", - "\n", - "ax.set_xlabel(\"Number of cores/GPUs\")\n", - "ax.set_ylabel(\"Efficiency\")\n", - "ax.legend(loc=\"upper left\")\n", - "fig.show()" + "print(weak_scaling_profiling_data)" ] }, { @@ -282,7 +164,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -293,7 +175,7 @@ }, { "cell_type": "code", - "execution_count": 7, + 
"execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -304,22 +186,91 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 9, "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - ":25: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.\n", - " fig.show()\n" + " t_init t_total outfile \\\n", + "0 8.105802 127.329448 /work/martinls/230507/ShallowWaterGPU/mpi_out_... \n", + "1 8.391940 106.173041 /work/martinls/230508/ShallowWaterGPU/mpi_out_... \n", + "2 8.316061 89.259504 /work/martinls/230509/ShallowWaterGPU/mpi_out_... \n", + "3 9.480870 82.180610 /work/martinls/230510/ShallowWaterGPU/mpi_out_... \n", + "4 9.948056 74.482449 /work/martinls/230511/ShallowWaterGPU/mpi_out_... \n", + "\n", + " t_sim_init t_nc_write t_full_step t_mpi_halo_exchange \\\n", + "0 5.656313 88.769145 23.461966 0.0 \n", + "1 5.297291 72.174575 19.195057 0.0 \n", + "2 5.045456 58.199751 16.024106 0.0 \n", + "3 5.172412 52.463597 13.905023 0.0 \n", + "4 4.827947 46.293962 12.370357 0.0 \n", + "\n", + " t_mpi_halo_exchange_download t_mpi_halo_exchange_upload \\\n", + "0 21.429688 0.028931 \n", + "1 15.628418 0.031372 \n", + "2 13.573486 0.030273 \n", + "3 11.412964 0.030151 \n", + "4 10.445801 0.030762 \n", + "\n", + " t_mpi_halo_exchange_sendreceive t_mpi_step nx ny dt \\\n", + "0 1.946533 0.019531 41984.0 10496.0 0.000001 \n", + "1 2.726074 0.021606 41984.0 8396.0 0.000001 \n", + "2 1.489014 0.020386 41984.0 6997.0 0.000001 \n", + "3 1.407959 0.019775 41984.0 5997.0 0.000001 \n", + "4 1.264648 0.021240 41984.0 5248.0 0.000001 \n", + "\n", + " n_time_steps slurm_job_id n_cuda_devices n_processes \\\n", + "0 200.0 230507.0 4 4 \n", + "1 200.0 230508.0 5 5 \n", + "2 200.0 230509.0 6 6 \n", + "3 200.0 230510.0 7 7 \n", + "4 200.0 230511.0 8 8 \n", + "\n", + " git_hash \\\n", + "0 
0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "1 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "2 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "3 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "4 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "\n", + " git_status \n", + "0 M Figures.ipynb\\n M dgx-2_strong_scaling_benc... \n", + "1 M Figures.ipynb\\n M dgx-2_strong_scaling_benc... \n", + "2 M Figures.ipynb\\n M dgx-2_strong_scaling_benc... \n", + "3 M Figures.ipynb\\n M dgx-2_strong_scaling_benc... \n", + "4 M Figures.ipynb\\n M dgx-2_strong_scaling_benc... \n" ] - }, + } + ], + "source": [ + "# DGX-2\n", + "#strong_scaling_profiling_data = read_profiling_files(\"output_dgx-2/strong_scaling/2022-06-09T160712/\")\n", + "\n", + "# HGX\n", + "strong_scaling_profiling_data = read_profiling_files(\"output_hgx/strong_scaling/2022-06-16T152945/\")\n", + "\n", + "print(strong_scaling_profiling_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Plotting" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ - "
" + "
" ] }, "metadata": { @@ -329,30 +280,108 @@ } ], "source": [ - "fig, ax = plt.subplots(figsize=(8,6))\n", + "fig, (ax_weak, ax_strong) = plt.subplots(1, 2, figsize=(16,6))\n", "\n", - "ax.plot(profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", - " speedup(profiling_data[\"t_total\"][0], profiling_data[\"t_total\"]), label=\"Total\")\n", + "t_total_no_init_or_file_io = weak_scaling_profiling_data[\"t_total\"] \\\n", + " -weak_scaling_profiling_data[\"t_init\"] \\\n", + " -weak_scaling_profiling_data[\"t_nc_write\"] \\\n", + " -weak_scaling_profiling_data[\"t_sim_init\"]\n", "\n", - "ax.plot(profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", - " speedup(profiling_data[\"t_step_mpi_halo_exchange\"][0], profiling_data[\"t_step_mpi_halo_exchange\"]), label=\"MPI\")\n", + "t_total_halo_exchange = weak_scaling_profiling_data[\"t_mpi_halo_exchange_download\"] \\\n", + " +weak_scaling_profiling_data[\"t_mpi_halo_exchange_sendreceive\"] \\\n", + " +weak_scaling_profiling_data[\"t_mpi_halo_exchange_upload\"]\n", "\n", - "ax.plot(profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", - " speedup(profiling_data[\"t_step_mpi\"][0], profiling_data[\"t_step_mpi\"]), label=\"Simulate\")\n", + "#ax_weak.plot(weak_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + "# speedup(t_total_no_init_or_file_io[0], t_total_no_init_or_file_io), label=\"Total (no init or file I/O)\")\n", "\n", - "#ax.plot(nproc, amdahls_speedup(0.9, nproc), label=\"Amdahls 90%\", linestyle=\"dashed\")\n", - "#ax.plot(nproc, amdahls_speedup(0.5, nproc), label=\"Amdahls 50%\", linestyle=\"dashed\")\n", - "#ax.plot(nproc, amdahls_speedup(0.1, nproc), label=\"Amdahls 10%\", linestyle=\"dashed\")\n", + "ax_weak.plot(weak_scaling_profiling_data[\"n_processes\"][0:].to_numpy(dtype=\"int\"), \n", + " speedup(weak_scaling_profiling_data[\"t_full_step\"][0], weak_scaling_profiling_data[\"t_full_step\"][0:]), label=\"Total (no init or file I/O)\")\n", "\n", - 
"#ax.plot(nproc, gustafsons_speedup(0.9, nproc), label=\"Gustafsons 90%\")\n", - "#ax.plot(nproc, gustafsons_speedup(0.5, nproc), label=\"Gustafsons 50%\")\n", - "#ax.plot(nproc, gustafsons_speedup(0.1, nproc), label=\"Gustafsons 10%\")\n", + "ax_weak.plot(weak_scaling_profiling_data[\"n_processes\"][0:].to_numpy(dtype=\"int\"), \n", + " speedup(t_total_halo_exchange[0], t_total_halo_exchange[0:]), label=\"Halo exchange (D/E/U)\", linestyle=\"dashed\")\n", "\n", - "ax.plot(nproc, nproc, label=\"Ideal (linear)\", linestyle=\"dotted\")\n", + "\"\"\"\n", + "ax_weak.plot(weak_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(weak_scaling_profiling_data[\"t_total\"][0], weak_scaling_profiling_data[\"t_total\"]), label=\"Total\")\n", "\n", - "ax.set_xlabel(\"Number of cores/GPUs\")\n", - "ax.set_ylabel(\"Speedup\")\n", - "ax.legend(loc=\"upper left\")\n", + "ax_weak.plot(weak_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(weak_scaling_profiling_data[\"t_mpi_halo_exchange_sendreceive\"][0], weak_scaling_profiling_data[\"t_mpi_halo_exchange_sendreceive\"]), label=\"MPI send/recv\")\n", + "\n", + "ax_weak.plot(weak_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(weak_scaling_profiling_data[\"t_mpi_halo_exchange_download\"][0], weak_scaling_profiling_data[\"t_mpi_halo_exchange_download\"]), label=\"Download (GPU->CPU)\")\n", + "\n", + "ax_weak.plot(weak_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(weak_scaling_profiling_data[\"t_mpi_halo_exchange_upload\"][0], weak_scaling_profiling_data[\"t_mpi_halo_exchange_upload\"]), label=\"Upload (CPU->GPU)\")\n", + "\n", + "ax_weak.plot(weak_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(weak_scaling_profiling_data[\"t_nc_write\"][0], weak_scaling_profiling_data[\"t_nc_write\"]), label=\"Write to file\")\n", + "\n", + 
"ax_weak.plot(weak_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(weak_scaling_profiling_data[\"t_init\"][0], weak_scaling_profiling_data[\"t_init\"]), label=\"Init\")\n", + "\"\"\"\n", + "\n", + "ax_weak.plot(nproc, np.ones(len(nproc)), label=\"Ideal (constant)\", linestyle=\"dotted\")\n", + "\n", + "ax_weak.set_xlabel(\"Number of ranks/GPUs\")\n", + "ax_weak.set_ylabel(\"Efficiency\")\n", + "ax_weak.legend(loc=\"lower right\")\n", + "#fig.show()\n", + "\n", + "##############################################\n", + "\n", + "#fig, ax = plt.subplots(figsize=(8,6))\n", + "\n", + "t_total_no_init_or_file_io = strong_scaling_profiling_data[\"t_total\"] \\\n", + " -strong_scaling_profiling_data[\"t_init\"] \\\n", + " -strong_scaling_profiling_data[\"t_nc_write\"] \\\n", + " -strong_scaling_profiling_data[\"t_sim_init\"]\n", + "\n", + "t_total_halo_exchange = strong_scaling_profiling_data[\"t_mpi_halo_exchange_download\"] \\\n", + " +strong_scaling_profiling_data[\"t_mpi_halo_exchange_sendreceive\"] \\\n", + " +strong_scaling_profiling_data[\"t_mpi_halo_exchange_upload\"]\n", + "\n", + "#ax_strong.plot(strong_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + "# speedup(t_total_no_init_or_file_io[0], t_total_no_init_or_file_io)*4, label=\"Total (no init or file I/O)\")\n", + "\n", + "ax_strong.plot(strong_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(strong_scaling_profiling_data[\"t_full_step\"][0], strong_scaling_profiling_data[\"t_full_step\"])*4, label=\"Total (no init or file I/O)\")\n", + "\n", + "ax_strong.plot(strong_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(t_total_halo_exchange[0], t_total_halo_exchange)*4, label=\"Halo exchange (D/E/U)\", linestyle=\"dashed\")\n", + "\n", + "\"\"\"\n", + "ax_strong.plot(strong_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " 
speedup(strong_scaling_profiling_data[\"t_total\"][0], strong_scaling_profiling_data[\"t_total\"])*4, label=\"Total\")\n", + " \n", + "ax_strong.plot(strong_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(strong_scaling_profiling_data[\"t_mpi_halo_exchange_sendreceive\"][0], strong_scaling_profiling_data[\"t_mpi_halo_exchange_sendreceive\"]), label=\"MPI send/recv\")\n", + "\n", + "ax_strong.plot(strong_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(strong_scaling_profiling_data[\"t_mpi_halo_exchange_download\"][0], strong_scaling_profiling_data[\"t_mpi_halo_exchange_download\"]), label=\"Download (GPU->CPU)\")\n", + "\n", + "ax_strong.plot(strong_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(strong_scaling_profiling_data[\"t_mpi_halo_exchange_upload\"][0], strong_scaling_profiling_data[\"t_mpi_halo_exchange_upload\"]), label=\"Upload (CPU->GPU)\")\n", + "\n", + "ax_strong.plot(strong_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(strong_scaling_profiling_data[\"t_nc_write\"][0], strong_scaling_profiling_data[\"t_nc_write\"]), label=\"Write to file\")\n", + "\n", + "ax_strong.plot(strong_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(strong_scaling_profiling_data[\"t_init\"][0], strong_scaling_profiling_data[\"t_init\"]), label=\"Init\")\n", + "\"\"\"\n", + "\n", + "#ax_strong.plot(nproc, amdahls_speedup(0.9, nproc), label=\"Amdahls 90%\", linestyle=\"dashed\")\n", + "#ax_strong.plot(nproc, amdahls_speedup(0.5, nproc), label=\"Amdahls 50%\", linestyle=\"dashed\")\n", + "#ax_strong.plot(nproc, amdahls_speedup(0.1, nproc), label=\"Amdahls 10%\", linestyle=\"dashed\")\n", + "\n", + "#ax_strong.plot(nproc, gustafsons_speedup(0.9, nproc), label=\"Gustafsons 90%\")\n", + "#ax_strong.plot(nproc, gustafsons_speedup(0.5, nproc), label=\"Gustafsons 50%\")\n", + "#ax_strong.plot(nproc, 
gustafsons_speedup(0.1, nproc), label=\"Gustafsons 10%\")\n", + "\n", + "ax_strong.plot(nproc[3:], nproc[3:], label=\"Ideal (linear)\", linestyle=\"dotted\")\n", + "\n", + "ax_strong.set_xlabel(\"Number of ranks/GPUs\")\n", + "ax_strong.set_ylabel(\"Speedup\")\n", + "ax_strong.legend(loc=\"upper left\")\n", "fig.show()" ] }, @@ -495,10 +524,10 @@ ], "metadata": { "interpreter": { - "hash": "d80e56d67bdb125526bdf12740b058f9c8b2e6eb30981cd0c9aaae49693d1172" + "hash": "5ec8a684eb355694b427c525a814c01edbb663f485e9b356374be21a7726d858" }, "kernelspec": { - "display_name": "ShallowWaterGPU_HPC", + "display_name": "Python 3.7.12 ('ShallowWaterGPU')", "language": "python", "name": "python3" }, @@ -512,7 +541,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.8" + "version": "3.7.12" } }, "nbformat": 4, diff --git a/dgx-2_strong_scaling_benchmark.sh b/dgx-2_strong_scaling_benchmark.sh index e90f015..7414a7b 100644 --- a/dgx-2_strong_scaling_benchmark.sh +++ b/dgx-2_strong_scaling_benchmark.sh @@ -3,20 +3,36 @@ TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S") # one node: 1-16 GPUs -sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=4096,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=2731,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=2048,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=1638,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=1365,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=1170,NOW=$TIMESTAMP 
dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=1024,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=4096,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=2731,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=2048,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=1638,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=1365,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=1170,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=1024,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +# +#sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=8192,NY=910,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=8192,NY=819,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=8192,NY=745,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=8192,NY=683,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=8192,NY=630,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=8192,NY=585,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 
--gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=8192,NY=546,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=8192,NY=512,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=8192,NY=910,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=8192,NY=819,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=8192,NY=745,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=8192,NY=683,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=8192,NY=630,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=8192,NY=585,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=8192,NY=546,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=8192,NY=512,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +# one node: 4-16 GPUs +sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=41984,NY=10496,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=41984,NY=8396,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=41984,NY=6997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=41984,NY=5997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=41984,NY=5248,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job + +sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 
--export=ALL,NX=41984,NY=4664,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=41984,NY=4198,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=41984,NY=3816,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=41984,NY=3498,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=41984,NY=3229,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=41984,NY=2998,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=41984,NY=2798,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=41984,NY=2624,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job diff --git a/dgx-2_weak_scaling_benchmark.sh b/dgx-2_weak_scaling_benchmark.sh index 4146f07..a24ee65 100644 --- a/dgx-2_weak_scaling_benchmark.sh +++ b/dgx-2_weak_scaling_benchmark.sh @@ -3,20 +3,39 @@ TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S") # one node: 1-16 GPUs -sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP 
dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +# +#sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 
--gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +# one node: 1-16 GPUs +sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=41984,NY=41984,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=41984,NY=41984,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=41984,NY=41984,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=41984,NY=41984,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=5 
--ntasks-per-node=5 --export=ALL,NX=41984,NY=41984,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=41984,NY=41984,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=41984,NY=41984,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=41984,NY=41984,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job + +sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=41984,NY=41984,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=41984,NY=41984,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=41984,NY=41984,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=41984,NY=41984,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=41984,NY=41984,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=41984,NY=41984,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=41984,NY=41984,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=41984,NY=41984,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job \ No newline at end of file diff --git a/hgx_scaling_benchmark.job b/hgx_scaling_benchmark.job new file mode 100644 index 0000000..f072fcf --- /dev/null +++ b/hgx_scaling_benchmark.job @@ -0,0 +1,58 @@ +#!/bin/bash +# See http://wiki.ex3.simula.no before changing the values below +#SBATCH -p hgx2q # partition (GPU queue) +#SBATCH -w g002 # HGX node +#SBATCH -t 0-00:10 # time (D-HH:MM) +#SBATCH -o slurm.%N.%j.out # STDOUT +#SBATCH -e slurm.%N.%j.err # 
STDERR +#SBATCH --reservation=martinls_11 + + +# For Linux 64, Open MPI is built with CUDA awareness but this support is disabled by default. +# To enable it, please set the environment variable OMPI_MCA_opal_cuda_support=true before +# launching your MPI processes. Equivalently, you can set the MCA parameter in the command line: +# mpiexec --mca opal_cuda_support 1 ... +# +# In addition, the UCX support is also built but disabled by default. +# To enable it, first install UCX (conda install -c conda-forge ucx). Then, set the environment +# variables OMPI_MCA_pml="ucx" OMPI_MCA_osc="ucx" before launching your MPI processes. +# Equivalently, you can set the MCA parameters in the command line: +# mpiexec --mca pml ucx --mca osc ucx ... +# Note that you might also need to set UCX_MEMTYPE_CACHE=n for CUDA awareness via UCX. +# Please consult UCX's documentation for detail. + +ulimit -s 10240 +module load slurm/20.02.7 +module load cuda11.2/toolkit/11.2.2 +module load openmpi4-cuda11.2-ofed50-gcc8/4.1.0 + +# Check how many gpu's your job got +#nvidia-smi + +mkdir -p output_hgx/$NOW + +## Copy input files to the work directory (':?' aborts the job if USER or SLURM_JOB_ID is unset or empty): +mkdir -p "/work/${USER:?}/${SLURM_JOB_ID:?}/ShallowWaterGPU" +cp -r . /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU + +# Run job +# (Assumes Miniconda is installed in user root dir.) 
+cd "/work/$USER/$SLURM_JOB_ID/ShallowWaterGPU" || exit 1 # never run the benchmark from the submit directory +#mpirun --mca btl_openib_if_include mlx5_0 --mca btl_openib_warn_no_device_params_found 0 $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile +#nsys profile -t nvtx,cuda mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile +#mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile + +export OMPI_MCA_opal_cuda_support=true +mpirun -np $SLURM_NTASKS $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile + +cd $HOME/src/ShallowWaterGPU + +## Copy files from work directory: +# (NOTE: Copying is not performed if job fails!) +mkdir -p output_hgx/$NOW/$SLURM_JOB_ID +mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.log ./output_hgx/$NOW/$SLURM_JOB_ID +mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.nc ./output_hgx/$NOW/$SLURM_JOB_ID +mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.json ./output_hgx/$NOW +mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.qdrep ./output_hgx/$NOW + +rm -rf "/work/${USER:?}/${SLURM_JOB_ID:?}" # ':?' aborts instead of expanding to 'rm -rf /work/$USER/' when a variable is empty diff --git a/hgx_strong_scaling_benchmark.sh b/hgx_strong_scaling_benchmark.sh new file mode 100644 index 0000000..f92c611 --- /dev/null +++ b/hgx_strong_scaling_benchmark.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S") + +# one node: 1-8 GPUs +#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=4096,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=2731,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=2048,NOW=$TIMESTAMP 
hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=1638,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=1365,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=1170,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=1024,NOW=$TIMESTAMP hgx_scaling_benchmark.job + +# one node: 4-8 GPUs +sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=41984,NY=10496,NOW=$TIMESTAMP hgx_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=41984,NY=8396,NOW=$TIMESTAMP hgx_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=41984,NY=6997,NOW=$TIMESTAMP hgx_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=41984,NY=5997,NOW=$TIMESTAMP hgx_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=41984,NY=5248,NOW=$TIMESTAMP hgx_scaling_benchmark.job diff --git a/hgx_weak_scaling_benchmark.sh b/hgx_weak_scaling_benchmark.sh new file mode 100644 index 0000000..11b91d5 --- /dev/null +++ b/hgx_weak_scaling_benchmark.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S") + +# one node: 1-16 GPUs +#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 
--gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job + +# one node: 1-8 GPUs +sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job