diff --git a/Figures.ipynb b/Figures.ipynb index d7144f3..dc48b2b 100644 --- a/Figures.ipynb +++ b/Figures.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -36,140 +36,30 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 5, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " dt n_cuda_devices n_processes n_time_steps nx ny \\\n", - "11 0.00001 1 1 202.0 4096.0 4096.0 \n", - "7 0.00001 2 2 202.0 4096.0 4096.0 \n", - "14 0.00001 3 3 202.0 4096.0 4096.0 \n", - "0 0.00001 4 4 202.0 4096.0 4096.0 \n", - "5 0.00001 5 5 202.0 4096.0 4096.0 \n", - "8 0.00001 6 6 202.0 4096.0 4096.0 \n", - "2 0.00001 7 7 202.0 4096.0 4096.0 \n", - "1 0.00001 8 8 202.0 4096.0 4096.0 \n", - "15 0.00001 9 9 202.0 4096.0 4096.0 \n", - "9 0.00001 10 10 202.0 4096.0 4096.0 \n", - "6 0.00001 11 11 202.0 4096.0 4096.0 \n", - "12 0.00001 12 12 202.0 4096.0 4096.0 \n", - "3 0.00001 13 13 202.0 4096.0 4096.0 \n", - "4 0.00001 14 14 202.0 4096.0 4096.0 \n", - "10 0.00001 15 15 202.0 4096.0 4096.0 \n", - "13 0.00001 16 16 202.0 4096.0 4096.0 \n", - "\n", - " outfile slurm_job_id t_init \\\n", - "11 /work/martinls/220879/ShallowWaterGPU/mpi_out_... 220879.0 5.507971 \n", - "7 /work/martinls/220880/ShallowWaterGPU/mpi_out_... 220880.0 5.721808 \n", - "14 /work/martinls/220881/ShallowWaterGPU/mpi_out_... 220881.0 5.717571 \n", - "0 /work/martinls/220882/ShallowWaterGPU/mpi_out_... 220882.0 6.074582 \n", - "5 /work/martinls/220883/ShallowWaterGPU/mpi_out_... 220883.0 6.108083 \n", - "8 /work/martinls/220884/ShallowWaterGPU/mpi_out_... 
220884.0 5.634954 \n", - "2 /work/martinls/220885/ShallowWaterGPU/mpi_out_... 220885.0 5.070476 \n", - "1 /work/martinls/220886/ShallowWaterGPU/mpi_out_... 220886.0 4.180240 \n", - "15 /work/martinls/220887/ShallowWaterGPU/mpi_out_... 220887.0 4.341784 \n", - "9 /work/martinls/220888/ShallowWaterGPU/mpi_out_... 220888.0 5.299585 \n", - "6 /work/martinls/220889/ShallowWaterGPU/mpi_out_... 220889.0 6.026560 \n", - "12 /work/martinls/220890/ShallowWaterGPU/mpi_out_... 220890.0 6.275554 \n", - "3 /work/martinls/220891/ShallowWaterGPU/mpi_out_... 220891.0 7.103848 \n", - "4 /work/martinls/220892/ShallowWaterGPU/mpi_out_... 220892.0 8.413395 \n", - "10 /work/martinls/220893/ShallowWaterGPU/mpi_out_... 220893.0 9.477692 \n", - "13 /work/martinls/220894/ShallowWaterGPU/mpi_out_... 220894.0 9.881657 \n", - "\n", - " t_nc_write t_sim_init t_sim_mpi_init t_step t_step_mpi \\\n", - "11 3.687433 41.936742 0.002744 1.654766 1.635376 \n", - "7 3.692994 22.107036 0.002273 23.791452 1.607910 \n", - "14 3.710237 37.115577 0.002727 10.057041 1.693726 \n", - "0 3.825206 45.152544 0.002787 1.858922 1.732544 \n", - "5 3.833911 45.100539 0.002747 1.893753 1.798096 \n", - "8 3.779569 21.815326 0.003427 20.156193 1.755737 \n", - "2 3.902169 41.552375 0.003021 1.923522 1.733032 \n", - "1 3.880948 25.252051 0.003020 1.858470 1.778198 \n", - "15 3.870645 25.595168 0.002387 1.857388 1.771851 \n", - "9 3.858673 25.574244 0.003054 1.863774 1.784302 \n", - "6 3.944965 26.100656 0.002582 1.899902 1.745728 \n", - "12 3.932847 29.110549 0.003453 4.618246 1.801758 \n", - "3 3.973085 33.206754 0.003094 1.944592 1.778198 \n", - "4 3.987867 28.638798 0.003963 2.088098 1.781616 \n", - "10 3.998516 28.688051 0.002921 1.876371 1.801880 \n", - "13 3.955760 30.579091 0.002603 1.924463 1.780518 \n", - "\n", - " t_step_mpi_halo_exchange t_step_mpi_halo_exchange_download \\\n", - "11 0.0 0.038208 \n", - "7 0.0 0.040833 \n", - "14 0.0 0.039368 \n", - "0 0.0 0.038696 \n", - "5 0.0 0.037964 \n", - "8 0.0 0.038269 
\n", - "2 0.0 0.038757 \n", - "1 0.0 0.038025 \n", - "15 0.0 0.037292 \n", - "9 0.0 0.037842 \n", - "6 0.0 0.038269 \n", - "12 0.0 0.037842 \n", - "3 0.0 0.037415 \n", - "4 0.0 0.038818 \n", - "10 0.0 0.038269 \n", - "13 0.0 0.038330 \n", - "\n", - " t_step_mpi_halo_exchange_sendreceive t_step_mpi_halo_exchange_upload \\\n", - "11 0.016968 0.039429 \n", - "7 0.159668 0.046143 \n", - "14 0.139648 0.041931 \n", - "0 0.083252 0.044800 \n", - "5 0.036621 0.040771 \n", - "8 0.090393 0.043579 \n", - "2 0.100586 0.042480 \n", - "1 0.054260 0.041016 \n", - "15 0.061646 0.039795 \n", - "9 0.066345 0.039734 \n", - "6 0.152771 0.040283 \n", - "12 0.051697 0.040100 \n", - "3 0.163818 0.039673 \n", - "4 0.111450 0.041382 \n", - "10 0.072327 0.040405 \n", - "13 0.062927 0.041321 \n", - "\n", - " t_total \n", - "11 53.170382 \n", - "7 55.784859 \n", - "14 57.047498 \n", - "0 57.329107 \n", - "5 57.349928 \n", - "8 51.902805 \n", - "2 52.942312 \n", - "1 35.602447 \n", - "15 36.094733 \n", - "9 36.997667 \n", - "6 38.400811 \n", - "12 44.374620 \n", - "3 46.639225 \n", - "4 43.558686 \n", - "10 44.496923 \n", - "13 46.773383 \n" - ] - } - ], + "outputs": [], "source": [ - "def read_profiling_files(profile_dir_path=\".\"):\n", + "def read_profiling_files(profile_dir_path=\".\", drop_multinode=False, drop_singlenode=False):\n", " profiling_data = pd.DataFrame()\n", "\n", " json_filenames = [file for file in os.listdir(profile_dir_path) if file.endswith(\"_profiling.json\")]\n", "\n", + " if(drop_singlenode):\n", + " json_filenames = [file for file in json_filenames if \"1_nodes\" not in file]\n", + "\n", + " if(drop_multinode):\n", + " json_filenames = [file for file in json_filenames if \"1_nodes\" in file]\n", + "\n", " for json_filename in json_filenames:\n", " with open(os.path.join(profile_dir_path, json_filename)) as json_file:\n", " profiling_data = profiling_data.append(json.load(json_file), ignore_index=True)\n", + " profiling_data = 
profiling_data.sort_values(by=[\"n_processes\"], ignore_index=True)\n", "\n", " profiling_data.n_processes = profiling_data.n_processes.astype(int)\n", - " profiling_data = profiling_data.sort_values(by=[\"n_processes\"])\n", + " profiling_data = profiling_data.sort_values(by=[\"n_processes\"], ignore_index=True)\n", "\n", - " return profiling_data\n", - "\n", - "profiling_data = read_profiling_files(\"output_dgx-2/2022-04-26T155207/\")\n", - "print(profiling_data)" + " return profiling_data" ] }, { @@ -181,96 +71,106 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.083251953125\n", - "11 0.016968\n", - "7 0.159668\n", - "14 0.139648\n", - "0 0.083252\n", - "5 0.036621\n", - "8 0.090393\n", - "2 0.100586\n", - "1 0.054260\n", - "15 0.061646\n", - "9 0.066345\n", - "6 0.152771\n", - "12 0.051697\n", - "3 0.163818\n", - "4 0.111450\n", - "10 0.072327\n", - "13 0.062927\n", - "Name: t_step_mpi_halo_exchange_sendreceive, dtype: float64\n", - "11 4.906475\n", - "7 0.521407\n", - "14 0.596154\n", - "0 1.000000\n", - "5 2.273333\n", - "8 0.920999\n", - "2 0.827670\n", - "1 1.534308\n", - "15 1.350495\n", - "9 1.254830\n", - "6 0.544946\n", - "12 1.610390\n", - "3 0.508197\n", - "4 0.746988\n", - "10 1.151055\n", - "13 1.322987\n", - "Name: t_step_mpi_halo_exchange_sendreceive, dtype: float64\n" + " t_init t_total outfile \\\n", + "0 5.327343 72.445652 /cluster/work/jobs/5977262/ShallowWaterGPU/mpi... \n", + "1 5.766222 70.390925 /cluster/work/jobs/5977264/ShallowWaterGPU/mpi... \n", + "2 6.594021 72.797283 /cluster/work/jobs/5977265/ShallowWaterGPU/mpi... \n", + "3 13.090770 98.327636 /cluster/work/jobs/5977266/ShallowWaterGPU/mpi... 
\n", + "\n", + " t_sim_init t_nc_write t_full_step t_mpi_halo_exchange \\\n", + "0 6.785504 34.131567 24.233159 0.0 \n", + "1 6.297029 31.896560 24.437577 0.0 \n", + "2 6.115570 33.620830 24.389490 0.0 \n", + "3 23.062950 35.326106 24.479295 0.0 \n", + "\n", + " t_mpi_halo_exchange_download t_mpi_halo_exchange_upload \\\n", + "0 23.969971 0.039307 \n", + "1 23.959106 0.035278 \n", + "2 23.961182 0.036865 \n", + "3 23.963623 0.038574 \n", + "\n", + " t_mpi_halo_exchange_sendreceive t_mpi_step nx ny dt \\\n", + "0 0.049072 0.027039 12288.0 12288.0 0.000001 \n", + "1 0.200195 0.022766 12288.0 12288.0 0.000001 \n", + "2 0.150146 0.025269 12288.0 12288.0 0.000001 \n", + "3 0.281494 0.028137 12288.0 12288.0 0.000001 \n", + "\n", + " n_time_steps slurm_job_id n_cuda_devices n_processes \\\n", + "0 200.0 5977262.0 1 1 \n", + "1 200.0 5977264.0 2 2 \n", + "2 200.0 5977265.0 3 3 \n", + "3 200.0 5977266.0 4 4 \n", + "\n", + " git_hash \\\n", + "0 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "1 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "2 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "3 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "\n", + " git_status \n", + "0 M conda_environment.yml\\n M conda_environment... \n", + "1 M conda_environment.yml\\n M conda_environment... \n", + "2 M conda_environment.yml\\n M conda_environment... \n", + "3 M saga_scaling_benchmark.job\\n M saga_strong_... \n", + " t_init t_total outfile \\\n", + "0 5.409974 74.459357 /cluster/work/jobs/5977267/ShallowWaterGPU/mpi... \n", + "1 10.191378 87.734289 /cluster/work/jobs/5977268/ShallowWaterGPU/mpi... \n", + "2 10.992114 92.724516 /cluster/work/jobs/5977269/ShallowWaterGPU/mpi... 
\n", + "\n", + " t_sim_init t_nc_write t_full_step t_mpi_halo_exchange \\\n", + "0 8.215068 31.199623 27.619763 0.0 \n", + "1 18.097157 30.747718 26.639607 0.0 \n", + "2 18.207139 32.633317 28.926713 0.0 \n", + "\n", + " t_mpi_halo_exchange_download t_mpi_halo_exchange_upload \\\n", + "0 23.962158 0.035278 \n", + "1 23.961914 0.035278 \n", + "2 23.966187 0.037476 \n", + "\n", + " t_mpi_halo_exchange_sendreceive t_mpi_step nx ny dt \\\n", + "0 3.625488 0.022888 12288.0 12288.0 0.000001 \n", + "1 2.509521 0.022278 12288.0 12288.0 0.000001 \n", + "2 4.665771 0.023193 12288.0 12288.0 0.000001 \n", + "\n", + " n_time_steps slurm_job_id n_cuda_devices n_processes \\\n", + "0 200.0 5977267.0 1 2 \n", + "1 200.0 5977268.0 1 3 \n", + "2 200.0 5977269.0 1 4 \n", + "\n", + " git_hash \\\n", + "0 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "1 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "2 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "\n", + " git_status \n", + "0 M conda_environment.yml\\n M conda_environment... \n", + "1 M saga_scaling_benchmark.job\\n M saga_strong_... \n", + "2 M saga_scaling_benchmark.job\\n M saga_strong_... \n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - ":23: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.\n", - " fig.show()\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" } ], "source": [ - "fig, ax = plt.subplots(figsize=(8,6))\n", + "# DGX-2\n", + "#weak_scaling_profiling_data = read_profiling_files(\"output_dgx-2/weak_scaling/2022-06-09T134809/\")\n", + "#weak_scaling_profiling_data = read_profiling_files(\"output_dgx-2/weak_scaling/2022-06-23T154025/\")\n", "\n", - "# FIXME! Sort AND give new indices\n", - "print(profiling_data[\"t_step_mpi_halo_exchange_sendreceive\"][0])\n", - "print(profiling_data[\"t_step_mpi_halo_exchange_sendreceive\"])\n", - "print(speedup(profiling_data[\"t_step_mpi_halo_exchange_sendreceive\"][0], profiling_data[\"t_step_mpi_halo_exchange_sendreceive\"]))\n", + "# HGX\n", + "#weak_scaling_profiling_data = read_profiling_files(\"output_hgx/weak_scaling/2022-06-16T162931/\")\n", + "##weak_scaling_profiling_data = read_profiling_files(\"output_hgx/weak_scaling/2022-06-16T170630/\")\n", "\n", - "ax.plot(profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", - " speedup(profiling_data[\"t_total\"][0], profiling_data[\"t_total\"]), label=\"Total\")\n", + "# Saga\n", + "singlenode_weak_scaling_profiling_data = read_profiling_files(\"output_saga/weak_scaling/2022-06-16T151516/\", drop_multinode=True)\n", + "multinode_weak_scaling_profiling_data = read_profiling_files(\"output_saga/weak_scaling/2022-06-16T151516/\", drop_singlenode=True)\n", "\n", - "ax.plot(profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", - " speedup(profiling_data[\"t_step_mpi_halo_exchange_sendreceive\"][0], profiling_data[\"t_step_mpi_halo_exchange_sendreceive\"]), label=\"MPI send/recv\")\n", - "\n", - "ax.plot(profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", - " speedup(profiling_data[\"t_step\"][0], profiling_data[\"t_step\"]), label=\"Simulate\")\n", - "\n", - "#ax.plot(nproc, speedup, label=\"Actual\")\n", - "\n", - "ax.plot(nproc, np.ones(len(nproc)), label=\"Ideal (constant)\", linestyle=\"dotted\")\n", - 
"\n", - "ax.set_xlabel(\"Number of cores/GPUs\")\n", - "ax.set_ylabel(\"Efficiency\")\n", - "ax.legend(loc=\"upper left\")\n", - "fig.show()" + "print(singlenode_weak_scaling_profiling_data)\n", + "print(multinode_weak_scaling_profiling_data)" ] }, { @@ -282,7 +182,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -293,7 +193,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -304,22 +204,110 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 9, "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - ":25: UserWarning: Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.\n", - " fig.show()\n" + " t_init t_total outfile \\\n", + "0 9.692163 80.12349 /cluster/work/jobs/5977971/ShallowWaterGPU/mpi... \n", + "\n", + " t_sim_init t_nc_write t_full_step t_mpi_halo_exchange \\\n", + "0 8.455713 35.275914 24.448944 0.0 \n", + "\n", + " t_mpi_halo_exchange_download t_mpi_halo_exchange_upload \\\n", + "0 23.929565 0.03894 \n", + "\n", + " t_mpi_halo_exchange_sendreceive t_mpi_step nx ny dt \\\n", + "0 0.340088 0.028564 24576.0 6144.0 0.000001 \n", + "\n", + " n_time_steps slurm_job_id n_cuda_devices n_processes \\\n", + "0 200.0 5977971.0 4 4 \n", + "\n", + " git_hash \\\n", + "0 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "\n", + " git_status \n", + "0 M saga_scaling_benchmark.job\\n M saga_strong_... \n", + " t_init t_total outfile \\\n", + "0 10.973809 93.593265 /cluster/work/jobs/5977972/ShallowWaterGPU/mpi... \n", + "1 4.248161 39.835643 /cluster/work/jobs/5977974/ShallowWaterGPU/mpi... \n", + "2 11.035480 60.120367 /cluster/work/jobs/5983711/ShallowWaterGPU/mpi... \n", + "3 9.521014 44.935236 /cluster/work/jobs/5983714/ShallowWaterGPU/mpi... 
\n", + "\n", + " t_sim_init t_nc_write t_full_step t_mpi_halo_exchange \\\n", + "0 18.225805 32.501692 29.925707 0.0 \n", + "1 4.393575 15.181573 13.800955 0.0 \n", + "2 26.829786 10.607348 9.628182 0.0 \n", + "3 17.313007 8.706373 7.796057 0.0 \n", + "\n", + " t_mpi_halo_exchange_download t_mpi_halo_exchange_upload \\\n", + "0 23.949829 0.037476 \n", + "1 12.015137 0.035522 \n", + "2 8.051514 0.038574 \n", + "3 6.057861 0.042480 \n", + "\n", + " t_mpi_halo_exchange_sendreceive t_mpi_step nx ny dt \\\n", + "0 5.814209 0.025513 24576.0 6144.0 0.000001 \n", + "1 1.679688 0.023071 24576.0 3072.0 0.000001 \n", + "2 1.506348 0.025513 24576.0 2048.0 0.000001 \n", + "3 1.665527 0.029907 24576.0 1536.0 0.000001 \n", + "\n", + " n_time_steps slurm_job_id n_cuda_devices n_processes \\\n", + "0 200.0 5977972.0 1 4 \n", + "1 200.0 5977974.0 2 8 \n", + "2 200.0 5983711.0 3 12 \n", + "3 200.0 5983714.0 4 16 \n", + "\n", + " git_hash \\\n", + "0 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "1 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "2 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "3 0f0cbad2dd661c59f9a2c43740eda12d90cca413\\n \n", + "\n", + " git_status \n", + "0 M saga_scaling_benchmark.job\\n M saga_strong_... \n", + "1 M saga_scaling_benchmark.job\\n M saga_strong_... \n", + "2 M saga_scaling_benchmark.job\\n M saga_strong_... \n", + "3 M saga_scaling_benchmark.job\\n M saga_strong_... 
\n" ] - }, + } + ], + "source": [ + "# DGX-2\n", + "#strong_scaling_profiling_data = read_profiling_files(\"output_dgx-2/strong_scaling/2022-06-09T160712/\")\n", + "#strong_scaling_profiling_data = read_profiling_files(\"output_dgx-2/strong_scaling/2022-06-23T172838/\")\n", + "\n", + "# HGX\n", + "#strong_scaling_profiling_data = read_profiling_files(\"output_hgx/strong_scaling/2022-06-16T152945/\")\n", + "\n", + "# Saga\n", + "singlenode_strong_scaling_profiling_data = read_profiling_files(\"output_saga/strong_scaling/2022-06-16T190721/\", drop_multinode=True)\n", + "multinode_strong_scaling_profiling_data = read_profiling_files(\"output_saga/strong_scaling/2022-06-16T190721/\", drop_singlenode=True)\n", + "\n", + "print(singlenode_strong_scaling_profiling_data)\n", + "print(multinode_strong_scaling_profiling_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Plotting" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ - "
" + "
" ] }, "metadata": { @@ -329,31 +317,192 @@ } ], "source": [ - "fig, ax = plt.subplots(figsize=(8,6))\n", + "plt.rcParams['font.size'] = 16\n", + "plt.rcParams['legend.fontsize'] = 14\n", + "plt.rcParams['axes.linewidth'] = 2\n", + "plt.rcParams['lines.linewidth'] = 2\n", "\n", - "ax.plot(profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", - " speedup(profiling_data[\"t_total\"][0], profiling_data[\"t_total\"]), label=\"Total\")\n", + "fig, (ax_weak, ax_strong) = plt.subplots(1, 2, figsize=(16,6))\n", "\n", - "ax.plot(profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", - " speedup(profiling_data[\"t_step_mpi_halo_exchange\"][0], profiling_data[\"t_step_mpi_halo_exchange\"]), label=\"MPI\")\n", + "t_total_no_init_or_file_io = weak_scaling_profiling_data[\"t_total\"] \\\n", + " -weak_scaling_profiling_data[\"t_init\"] \\\n", + " -weak_scaling_profiling_data[\"t_nc_write\"] \\\n", + " -weak_scaling_profiling_data[\"t_sim_init\"]\n", "\n", - "ax.plot(profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", - " speedup(profiling_data[\"t_step_mpi\"][0], profiling_data[\"t_step_mpi\"]), label=\"Simulate\")\n", + "t_total_halo_exchange = weak_scaling_profiling_data[\"t_mpi_halo_exchange_download\"] \\\n", + " +weak_scaling_profiling_data[\"t_mpi_halo_exchange_sendreceive\"] \\\n", + " +weak_scaling_profiling_data[\"t_mpi_halo_exchange_upload\"]\n", "\n", - "#ax.plot(nproc, amdahls_speedup(0.9, nproc), label=\"Amdahls 90%\", linestyle=\"dashed\")\n", - "#ax.plot(nproc, amdahls_speedup(0.5, nproc), label=\"Amdahls 50%\", linestyle=\"dashed\")\n", - "#ax.plot(nproc, amdahls_speedup(0.1, nproc), label=\"Amdahls 10%\", linestyle=\"dashed\")\n", + "#ax_weak.plot(weak_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + "# speedup(t_total_no_init_or_file_io[0], t_total_no_init_or_file_io), label=\"Total (no init or file I/O)\")\n", "\n", - "#ax.plot(nproc, gustafsons_speedup(0.9, nproc), label=\"Gustafsons 90%\")\n", - "#ax.plot(nproc, 
gustafsons_speedup(0.5, nproc), label=\"Gustafsons 50%\")\n", - "#ax.plot(nproc, gustafsons_speedup(0.1, nproc), label=\"Gustafsons 10%\")\n", + "ax_weak.plot(weak_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(weak_scaling_profiling_data[\"t_full_step\"][0], weak_scaling_profiling_data[\"t_full_step\"]), label=\"Runtime (except init and file I/O)\")\n", + "ax_weak.locator_params(axis=\"x\", nbins=16)\n", "\n", - "ax.plot(nproc, nproc, label=\"Ideal (linear)\", linestyle=\"dotted\")\n", + "\"\"\"\n", + "ax_weak.plot(weak_scaling_profiling_data[\"n_processes\"][0:].to_numpy(dtype=\"int\"), \n", + " speedup(t_total_halo_exchange[0], t_total_halo_exchange[0:]), label=\"Halo exchange (D/E/U)\", linestyle=\"dashed\")\n", "\n", - "ax.set_xlabel(\"Number of cores/GPUs\")\n", - "ax.set_ylabel(\"Speedup\")\n", - "ax.legend(loc=\"upper left\")\n", - "fig.show()" + "ax_weak.plot(weak_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(weak_scaling_profiling_data[\"t_total\"][0], weak_scaling_profiling_data[\"t_total\"]), label=\"Total\")\n", + "\n", + "ax_weak.plot(weak_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(weak_scaling_profiling_data[\"t_mpi_halo_exchange_sendreceive\"][0], weak_scaling_profiling_data[\"t_mpi_halo_exchange_sendreceive\"]), label=\"MPI send/recv\")\n", + "\n", + "ax_weak.plot(weak_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(weak_scaling_profiling_data[\"t_mpi_halo_exchange_download\"][0], weak_scaling_profiling_data[\"t_mpi_halo_exchange_download\"]), label=\"Download (GPU->CPU)\")\n", + "\n", + "ax_weak.plot(weak_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(weak_scaling_profiling_data[\"t_mpi_halo_exchange_upload\"][0], weak_scaling_profiling_data[\"t_mpi_halo_exchange_upload\"]), label=\"Upload (CPU->GPU)\")\n", + "\n", + 
"ax_weak.plot(weak_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(weak_scaling_profiling_data[\"t_nc_write\"][0], weak_scaling_profiling_data[\"t_nc_write\"]), label=\"Write to file\")\n", + "\n", + "ax_weak.plot(weak_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(weak_scaling_profiling_data[\"t_init\"][0], weak_scaling_profiling_data[\"t_init\"]), label=\"Init\")\n", + "\"\"\"\n", + "\n", + "ax_weak.plot(nproc, np.ones(len(nproc)), label=\"Ideal runtime (constant)\", linestyle=\"dotted\")\n", + "\n", + "ax_weak.set_xlabel(\"Number of ranks/GPUs\")\n", + "ax_weak.set_ylabel(\"Efficiency\")\n", + "ax_weak.legend(loc=\"upper right\", bbox_to_anchor=[1.0, 0.95])\n", + "#fig.show()\n", + "\n", + "##############################################\n", + "\n", + "#fig, ax = plt.subplots(figsize=(8,6))\n", + "\n", + "t_total_no_init_or_file_io = strong_scaling_profiling_data[\"t_total\"] \\\n", + " -strong_scaling_profiling_data[\"t_init\"] \\\n", + " -strong_scaling_profiling_data[\"t_nc_write\"] \\\n", + " -strong_scaling_profiling_data[\"t_sim_init\"]\n", + "\n", + "t_total_halo_exchange = strong_scaling_profiling_data[\"t_mpi_halo_exchange_download\"] \\\n", + " +strong_scaling_profiling_data[\"t_mpi_halo_exchange_sendreceive\"] \\\n", + " +strong_scaling_profiling_data[\"t_mpi_halo_exchange_upload\"]\n", + "\n", + "#ax_strong.plot(strong_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + "# speedup(t_total_no_init_or_file_io[0], t_total_no_init_or_file_io)*4, label=\"Total (no init or file I/O)\")\n", + "\n", + "ax_strong.plot(strong_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(strong_scaling_profiling_data[\"t_full_step\"][0], strong_scaling_profiling_data[\"t_full_step\"])*4, label=\"Runtime (except init and file I/O)\")\n", + "\n", + "\"\"\"\n", + "ax_strong.plot(strong_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " 
speedup(t_total_halo_exchange[0], t_total_halo_exchange)*4, label=\"Halo exchange (D/E/U)\", linestyle=\"dashed\")\n", + "\n", + "ax_strong.plot(strong_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(strong_scaling_profiling_data[\"t_total\"][0], strong_scaling_profiling_data[\"t_total\"])*4, label=\"Total\")\n", + " \n", + "ax_strong.plot(strong_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(strong_scaling_profiling_data[\"t_mpi_halo_exchange_sendreceive\"][0], strong_scaling_profiling_data[\"t_mpi_halo_exchange_sendreceive\"]), label=\"MPI send/recv\")\n", + "\n", + "ax_strong.plot(strong_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(strong_scaling_profiling_data[\"t_mpi_halo_exchange_download\"][0], strong_scaling_profiling_data[\"t_mpi_halo_exchange_download\"]), label=\"Download (GPU->CPU)\")\n", + "\n", + "ax_strong.plot(strong_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(strong_scaling_profiling_data[\"t_mpi_halo_exchange_upload\"][0], strong_scaling_profiling_data[\"t_mpi_halo_exchange_upload\"]), label=\"Upload (CPU->GPU)\")\n", + "\n", + "ax_strong.plot(strong_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(strong_scaling_profiling_data[\"t_nc_write\"][0], strong_scaling_profiling_data[\"t_nc_write\"]), label=\"Write to file\")\n", + "\n", + "ax_strong.plot(strong_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " speedup(strong_scaling_profiling_data[\"t_init\"][0], strong_scaling_profiling_data[\"t_init\"]), label=\"Init\")\n", + "\"\"\"\n", + "\n", + "#ax_strong.plot(nproc, amdahls_speedup(0.9, nproc), label=\"Amdahls 90%\", linestyle=\"dashed\")\n", + "#ax_strong.plot(nproc, amdahls_speedup(0.5, nproc), label=\"Amdahls 50%\", linestyle=\"dashed\")\n", + "#ax_strong.plot(nproc, amdahls_speedup(0.1, nproc), label=\"Amdahls 10%\", linestyle=\"dashed\")\n", + 
"\n", + "#ax_strong.plot(nproc, gustafsons_speedup(0.9, nproc), label=\"Gustafsons 90%\")\n", + "#ax_strong.plot(nproc, gustafsons_speedup(0.5, nproc), label=\"Gustafsons 50%\")\n", + "#ax_strong.plot(nproc, gustafsons_speedup(0.1, nproc), label=\"Gustafsons 10%\")\n", + "\n", + "ax_strong.plot(nproc[3:], nproc[3:], label=\"Ideal runtime (linear)\", linestyle=\"dotted\")\n", + "\n", + "ax_strong.set_xlabel(\"Number of ranks/GPUs\")\n", + "ax_strong.set_ylabel(\"Speedup\")\n", + "ax_strong.legend(loc=\"upper left\")\n", + "fig.show()\n", + "\n", + "fig.savefig(\"dgx-2-scaling.pdf\", bbox_inches='tight')" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/cluster/projects/nn9882k/martinls/.conda/envs/ShallowWaterGPU_HPC/lib/python3.7/site-packages/ipykernel_launcher.py:45: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "###\n", + "### Saga\n", + "###\n", + "\n", + "plt.rcParams['font.size'] = 16\n", + "plt.rcParams['legend.fontsize'] = 14\n", + "plt.rcParams['axes.linewidth'] = 2\n", + "plt.rcParams['lines.linewidth'] = 2\n", + "\n", + "fig, (ax_weak, ax_strong) = plt.subplots(1, 2, figsize=(16,6))\n", + "\n", + "ax_weak.plot(singlenode_weak_scaling_profiling_data[\"n_processes\"][0:4].to_numpy(dtype=\"int\"), \n", + " speedup(singlenode_weak_scaling_profiling_data[\"t_full_step\"][0], singlenode_weak_scaling_profiling_data[\"t_full_step\"][0:4]), \n", + " label=\"Single-node runtime (no init or file I/O)\", marker=\"x\")\n", + "\n", + "ax_weak.plot(multinode_weak_scaling_profiling_data[\"n_processes\"][0:3].to_numpy(dtype=\"int\"), \n", + " speedup(singlenode_weak_scaling_profiling_data[\"t_full_step\"][0], multinode_weak_scaling_profiling_data[\"t_full_step\"][0:3]), \n", + " label=\"2–4 nodes runtime (no init or file I/O)\", marker=\"o\", color=\"green\")\n", + "\n", + "ax_weak.locator_params(axis=\"x\", nbins=4)\n", + "\n", + "ax_weak.plot(nproc[0:4], np.ones(len(nproc[0:4])), label=\"Ideal runtime (constant)\", linestyle=\"dotted\", color=\"orange\")\n", + "\n", + "ax_weak.set_xlabel(\"Number of ranks/GPUs\")\n", + "ax_weak.set_ylabel(\"Efficiency\")\n", + "ax_weak.legend(loc=\"upper left\", bbox_to_anchor=[0.0, 0.8])\n", + "\n", + "##############################################\n", + "\n", + "#ax_strong.plot(singlenode_strong_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + "# speedup(singlenode_strong_scaling_profiling_data[\"t_full_step\"][0], singlenode_strong_scaling_profiling_data[\"t_full_step\"])*4, \n", + "# label=\"Single-node (no init or file I/O)\", marker=\"x\")\n", + "\n", + "ax_strong.plot(multinode_strong_scaling_profiling_data[\"n_processes\"].to_numpy(dtype=\"int\"), \n", + " 
speedup(multinode_strong_scaling_profiling_data[\"t_full_step\"][0], multinode_strong_scaling_profiling_data[\"t_full_step\"])*4, \n", + " label=\"Four nodes runtime (no init or file I/O)\", marker=\"o\")\n", + "\n", + "ax_strong.locator_params(axis=\"x\", nbins=16)\n", + "\n", + "ax_strong.plot(nproc[0:], nproc[0:], label=\"Ideal runtime (linear)\", linestyle=\"dotted\")\n", + "\n", + "ax_strong.set_xlabel(\"Number of ranks/GPUs\")\n", + "ax_strong.set_ylabel(\"Speedup\")\n", + "ax_strong.legend(loc=\"upper left\")\n", + "fig.show()\n", + "\n", + "fig.savefig(\"saga-scaling.pdf\", bbox_inches='tight')" ] }, { @@ -494,11 +643,8 @@ } ], "metadata": { - "interpreter": { - "hash": "d80e56d67bdb125526bdf12740b058f9c8b2e6eb30981cd0c9aaae49693d1172" - }, "kernelspec": { - "display_name": "ShallowWaterGPU_HPC", + "display_name": "Python 3.7.12 ('ShallowWaterGPU_HPC')", "language": "python", "name": "python3" }, @@ -512,7 +658,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.8" + "version": "3.7.12" + }, + "vscode": { + "interpreter": { + "hash": "cb8fa661d82d1ec49918052345889e962ab1d5f5f5cbd9596ba31c436e222a26" + } } }, "nbformat": 4, diff --git a/GPUSimulators/Common.py b/GPUSimulators/Common.py index a965231..76902c5 100644 --- a/GPUSimulators/Common.py +++ b/GPUSimulators/Common.py @@ -100,8 +100,8 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names profiling_data_sim_runner["end"]["t_sim_init"] = 0 profiling_data_sim_runner["start"]["t_nc_write"] = 0 profiling_data_sim_runner["end"]["t_nc_write"] = 0 - profiling_data_sim_runner["start"]["t_step"] = 0 - profiling_data_sim_runner["end"]["t_step"] = 0 + profiling_data_sim_runner["start"]["t_full_step"] = 0 + profiling_data_sim_runner["end"]["t_full_step"] = 0 profiling_data_sim_runner["start"]["t_sim_init"] = time.time() @@ -121,7 +121,14 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names 
outdata.ncfile.git_hash = getGitHash() outdata.ncfile.git_status = getGitStatus() outdata.ncfile.simulator = str(simulator) - outdata.ncfile.sim_args = toJson(simulator_args) + + # do not write fields to attributes (they are to large) + simulator_args_for_ncfile = simulator_args.copy() + del simulator_args_for_ncfile["rho"] + del simulator_args_for_ncfile["rho_u"] + del simulator_args_for_ncfile["rho_v"] + del simulator_args_for_ncfile["E"] + outdata.ncfile.sim_args = toJson(simulator_args_for_ncfile) #Create dimensions outdata.ncfile.createDimension('time', len(save_times)) @@ -172,13 +179,13 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names logger.error("Error after {:d} steps (t={:f}: {:s}".format(sim.simSteps(), sim.simTime(), str(e))) return outdata.filename - profiling_data_sim_runner["start"]["t_step"] += time.time() + profiling_data_sim_runner["start"]["t_full_step"] += time.time() #Simulate if (t_step > 0.0): sim.simulate(t_step, dt) - profiling_data_sim_runner["end"]["t_step"] += time.time() + profiling_data_sim_runner["end"]["t_full_step"] += time.time() profiling_data_sim_runner["start"]["t_nc_write"] += time.time() diff --git a/GPUSimulators/EE2D_KP07_dimsplit.py b/GPUSimulators/EE2D_KP07_dimsplit.py index cc15c9c..2c3f810 100644 --- a/GPUSimulators/EE2D_KP07_dimsplit.py +++ b/GPUSimulators/EE2D_KP07_dimsplit.py @@ -138,9 +138,9 @@ class EE2D_KP07_dimsplit (BaseSimulator): return if external and not internal: - ############################################################# - # XXX: Only treating north and south external cells for now # - ############################################################# + ################################### + # XXX: Corners are treated twice! 
# + ################################### ns_grid_size = (self.grid_size[0], 1) @@ -189,14 +189,58 @@ class EE2D_KP07_dimsplit (BaseSimulator): self.cfl_data.gpudata, 0, 0, self.nx, int(self.u0[0].y_halo)) + + we_grid_size = (1, self.grid_size[1]) + + # WEST + # (x0, y0) x (x1, y1) + # (0, 0) x (x_halo, ny) + self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream, + self.nx, self.ny, + self.dx, self.dy, dt, + self.g, + self.gamma, + self.theta, + substep, + self.boundary_conditions, + self.u0[0].data.gpudata, self.u0[0].data.strides[0], + self.u0[1].data.gpudata, self.u0[1].data.strides[0], + self.u0[2].data.gpudata, self.u0[2].data.strides[0], + self.u0[3].data.gpudata, self.u0[3].data.strides[0], + self.u1[0].data.gpudata, self.u1[0].data.strides[0], + self.u1[1].data.gpudata, self.u1[1].data.strides[0], + self.u1[2].data.gpudata, self.u1[2].data.strides[0], + self.u1[3].data.gpudata, self.u1[3].data.strides[0], + self.cfl_data.gpudata, + 0, 0, + int(self.u0[0].x_halo), self.ny) + + # EAST + # (x0, y0) x (x1, y1) + # (nx-x_halo, 0) x (nx, ny) + self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream, + self.nx, self.ny, + self.dx, self.dy, dt, + self.g, + self.gamma, + self.theta, + substep, + self.boundary_conditions, + self.u0[0].data.gpudata, self.u0[0].data.strides[0], + self.u0[1].data.gpudata, self.u0[1].data.strides[0], + self.u0[2].data.gpudata, self.u0[2].data.strides[0], + self.u0[3].data.gpudata, self.u0[3].data.strides[0], + self.u1[0].data.gpudata, self.u1[0].data.strides[0], + self.u1[1].data.gpudata, self.u1[1].data.strides[0], + self.u1[2].data.gpudata, self.u1[2].data.strides[0], + self.u1[3].data.gpudata, self.u1[3].data.strides[0], + self.cfl_data.gpudata, + self.nx - int(self.u0[0].x_halo), 0, + self.nx, self.ny) return if internal and not external: - ############################################################# - # XXX: Only treating north and south external cells for now # - # So we need to include west 
and east boundary here! # - ############################################################# - + # INTERNAL DOMAIN # (x0, y0) x (x1, y1) # (x_halo, y_halo) x (nx - x_halo, ny - y_halo) @@ -217,8 +261,8 @@ class EE2D_KP07_dimsplit (BaseSimulator): self.u1[2].data.gpudata, self.u1[2].data.strides[0], self.u1[3].data.gpudata, self.u1[3].data.strides[0], self.cfl_data.gpudata, - 0, int(self.u0[0].y_halo), - self.nx, self.ny - int(self.u0[0].y_halo)) + int(self.u0[0].x_halo), int(self.u0[0].y_halo), + self.nx - int(self.u0[0].x_halo), self.ny - int(self.u0[0].y_halo)) return def swapBuffers(self): diff --git a/GPUSimulators/MPISimulator.py b/GPUSimulators/MPISimulator.py index 3e2a7e0..c092c01 100644 --- a/GPUSimulators/MPISimulator.py +++ b/GPUSimulators/MPISimulator.py @@ -27,7 +27,7 @@ from mpi4py import MPI import time import pycuda.driver as cuda -import nvtx +#import nvtx @@ -208,18 +208,17 @@ class MPISimulator(Simulator.BaseSimulator): """ def __init__(self, sim, grid): self.profiling_data_mpi = { 'start': {}, 'end': {} } - self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange"] = 0 - self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange"] = 0 - self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_download"] = 0 - self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_download"] = 0 - self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_upload"] = 0 - self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_upload"] = 0 - self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] = 0 - self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] = 0 - self.profiling_data_mpi["start"]["t_step_mpi"] = 0 - self.profiling_data_mpi["end"]["t_step_mpi"] = 0 + self.profiling_data_mpi["start"]["t_mpi_halo_exchange"] = 0 + self.profiling_data_mpi["end"]["t_mpi_halo_exchange"] = 0 + self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] = 0 + self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] = 0 + 
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] = 0 + self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] = 0 + self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] = 0 + self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] = 0 + self.profiling_data_mpi["start"]["t_mpi_step"] = 0 + self.profiling_data_mpi["end"]["t_mpi_step"] = 0 self.profiling_data_mpi["n_time_steps"] = 0 - self.profiling_data_mpi["start"]["t_sim_mpi_init"] = time.time() self.logger = logging.getLogger(__name__) autotuner = sim.context.autotuner @@ -297,43 +296,43 @@ class MPISimulator(Simulator.BaseSimulator): #Note that east and west also transfer ghost cells #whilst north/south only transfer internal cells #Reuses the width/height defined in the read-extets above - self.in_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_e[3], self.read_e[2]), dtype=np.float32) - self.in_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_w[3], self.read_w[2]), dtype=np.float32) - self.in_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_n[3], self.read_n[2]), dtype=np.float32) - self.in_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_s[3], self.read_s[2]), dtype=np.float32) + self.in_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32) #np.empty((self.nvars, self.read_e[3], self.read_e[2]), dtype=np.float32) + self.in_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), 
int(self.read_w[2])), dtype=np.float32) #np.empty((self.nvars, self.read_w[3], self.read_w[2]), dtype=np.float32) + self.in_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32) #np.empty((self.nvars, self.read_n[3], self.read_n[2]), dtype=np.float32) + self.in_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32) #np.empty((self.nvars, self.read_s[3], self.read_s[2]), dtype=np.float32) #Allocate data for sending - self.out_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_e) - self.out_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_w) - self.out_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_n) - self.out_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_s) + self.out_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32) #np.empty_like(self.in_e) + self.out_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32) #np.empty_like(self.in_w) + self.out_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32) #np.empty_like(self.in_n) + self.out_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32) #np.empty_like(self.in_s) self.logger.debug("Simlator rank {:d} initialized on {:s}".format(self.grid.comm.rank, MPI.Get_processor_name())) - self.profiling_data_mpi["end"]["t_sim_mpi_init"] = 
time.time() - self.old_exchange() + self.full_exchange() + sim.context.synchronize() def substep(self, dt, step_number): - nvtx.mark("substep start", color="yellow") + #nvtx.mark("substep start", color="yellow") - self.profiling_data_mpi["start"]["t_step_mpi"] += time.time() + self.profiling_data_mpi["start"]["t_mpi_step"] += time.time() - nvtx.mark("substep internal", color="red") - self.sim.substep(dt, step_number, internal=True, external=False) # "internal ghost cells" excluded - - nvtx.mark("substep external", color="blue") + #nvtx.mark("substep external", color="blue") self.sim.substep(dt, step_number, external=True, internal=False) # only "internal ghost cells" + + #nvtx.mark("substep internal", color="red") + self.sim.substep(dt, step_number, internal=True, external=False) # "internal ghost cells" excluded #nvtx.mark("substep full", color="blue") #self.sim.substep(dt, step_number, external=True, internal=True) self.sim.swapBuffers() - self.profiling_data_mpi["end"]["t_step_mpi"] += time.time() + self.profiling_data_mpi["end"]["t_mpi_step"] += time.time() - nvtx.mark("exchange", color="blue") - self.old_exchange() + #nvtx.mark("exchange", color="blue") + self.full_exchange() #nvtx.mark("download", color="blue") #self.download_for_exchange(self.sim.u0) @@ -344,10 +343,10 @@ class MPISimulator(Simulator.BaseSimulator): #nvtx.mark("upload", color="blue") #self.upload_for_exchange(self.sim.u0) - nvtx.mark("sync start", color="blue") + #nvtx.mark("sync start", color="blue") self.sim.stream.synchronize() self.sim.internal_stream.synchronize() - nvtx.mark("sync end", color="blue") + #nvtx.mark("sync end", color="blue") self.profiling_data_mpi["n_time_steps"] += 1 @@ -383,8 +382,7 @@ class MPISimulator(Simulator.BaseSimulator): return [x0, x1, y0, y1] def download_for_exchange(self, u): - if self.profiling_data_mpi["n_time_steps"] > 0: - self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_download"] += time.time() + 
self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] += time.time() # North-south if self.north is not None: @@ -406,12 +404,10 @@ class MPISimulator(Simulator.BaseSimulator): u[k].download(self.sim.stream, cpu_data=self.out_w[k,:,:], asynch=True, extent=self.read_w) #self.out_w[k,:,:] = u[k].download(self.sim.stream, asynch=True, extent=self.read_w) - if self.profiling_data_mpi["n_time_steps"] > 0: - self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_download"] += time.time() + self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] += time.time() def exchange(self): - if self.profiling_data_mpi["n_time_steps"] > 0: - self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time() + self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time() #Send/receive to north/south neighbours comm_send = [] @@ -441,12 +437,10 @@ class MPISimulator(Simulator.BaseSimulator): for comm in comm_send: comm.wait() - if self.profiling_data_mpi["n_time_steps"] > 0: - self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time() + self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time() def upload_for_exchange(self, u): - if self.profiling_data_mpi["n_time_steps"] > 0: - self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_upload"] += time.time() + self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] += time.time() # North-south if self.north is not None: @@ -464,15 +458,11 @@ class MPISimulator(Simulator.BaseSimulator): for k in range(self.nvars): u[k].upload(self.sim.stream, self.in_w[k,:,:], extent=self.write_w) - if self.profiling_data_mpi["n_time_steps"] > 0: - self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_upload"] += time.time() - + self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] += time.time() - - - def old_exchange(self): + def full_exchange(self): #### # FIXME: This function can be optimized using persitent 
communications. # Also by overlapping some of the communications north/south and east/west of GPU and intra-node @@ -484,8 +474,7 @@ class MPISimulator(Simulator.BaseSimulator): #### #Download from the GPU - if self.profiling_data_mpi["n_time_steps"] > 0: - self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_download"] += time.time() + self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] += time.time() if self.north is not None: for k in range(self.nvars): @@ -495,10 +484,10 @@ class MPISimulator(Simulator.BaseSimulator): self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_s[k,:,:], asynch=True, extent=self.read_s) self.sim.stream.synchronize() + self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] += time.time() + #Send/receive to north/south neighbours - if self.profiling_data_mpi["n_time_steps"] > 0: - self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_download"] += time.time() - self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time() + self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time() comm_send = [] comm_recv = [] @@ -513,10 +502,10 @@ class MPISimulator(Simulator.BaseSimulator): for comm in comm_recv: comm.wait() + self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time() + #Upload to the GPU - if self.profiling_data_mpi["n_time_steps"] > 0: - self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time() - self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_upload"] += time.time() + self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] += time.time() if self.north is not None: for k in range(self.nvars): @@ -524,25 +513,23 @@ class MPISimulator(Simulator.BaseSimulator): if self.south is not None: for k in range(self.nvars): self.sim.u0[k].upload(self.sim.stream, self.in_s[k,:,:], extent=self.write_s) + + self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] += time.time() #Wait for 
sending to complete - if self.profiling_data_mpi["n_time_steps"] > 0: - self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_upload"] += time.time() - self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time() + self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time() for comm in comm_send: comm.wait() - + self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time() #### # Then transfer east-west including ghost cells that have been filled in by north-south transfer above #### #Download from the GPU - if self.profiling_data_mpi["n_time_steps"] > 0: - self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time() - self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_download"] += time.time() + self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] += time.time() if self.east is not None: for k in range(self.nvars): @@ -552,10 +539,10 @@ class MPISimulator(Simulator.BaseSimulator): self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_w[k,:,:], asynch=True, extent=self.read_w) self.sim.stream.synchronize() + self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] += time.time() + #Send/receive to east/west neighbours - if self.profiling_data_mpi["n_time_steps"] > 0: - self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_download"] += time.time() - self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time() + self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time() comm_send = [] comm_recv = [] @@ -566,15 +553,14 @@ class MPISimulator(Simulator.BaseSimulator): comm_send += [self.grid.comm.Isend(self.out_w, dest=self.west, tag=4*self.nt + 3)] comm_recv += [self.grid.comm.Irecv(self.in_w, source=self.west, tag=4*self.nt + 2)] - #Wait for incoming transfers to complete for comm in comm_recv: comm.wait() + self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time() 
+ #Upload to the GPU - if self.profiling_data_mpi["n_time_steps"] > 0: - self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time() - self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_upload"] += time.time() + self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] += time.time() if self.east is not None: for k in range(self.nvars): @@ -583,13 +569,12 @@ class MPISimulator(Simulator.BaseSimulator): for k in range(self.nvars): self.sim.u0[k].upload(self.sim.stream, self.in_w[k,:,:], extent=self.write_w) + self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] += time.time() + #Wait for sending to complete - if self.profiling_data_mpi["n_time_steps"] > 0: - self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_upload"] += time.time() - self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time() + self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time() for comm in comm_send: comm.wait() - - if self.profiling_data_mpi["n_time_steps"] > 0: - self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time() + + self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time() diff --git a/dgx-2_strong_scaling_benchmark.job b/dgx-2_scaling_benchmark.job similarity index 62% rename from dgx-2_strong_scaling_benchmark.job rename to dgx-2_scaling_benchmark.job index d607feb..d4c7cb5 100644 --- a/dgx-2_strong_scaling_benchmark.job +++ b/dgx-2_scaling_benchmark.job @@ -6,6 +6,21 @@ #SBATCH -t 0-00:10 # time (D-HH:MM) #SBATCH -o slurm.%N.%j.out # STDOUT #SBATCH -e slurm.%N.%j.err # STDERR +#SBATCH --reservation=martinls_17 + + +# For Linux 64, Open MPI is built with CUDA awareness but this support is disabled by default. +# To enable it, please set the environment variable OMPI_MCA_opal_cuda_support=true before +# launching your MPI processes. 
Equivalently, you can set the MCA parameter in the command line: +# mpiexec --mca opal_cuda_support 1 ... +# +# In addition, the UCX support is also built but disabled by default. +# To enable it, first install UCX (conda install -c conda-forge ucx). Then, set the environment +# variables OMPI_MCA_pml="ucx" OMPI_MCA_osc="ucx" before launching your MPI processes. +# Equivalently, you can set the MCA parameters in the command line: +# mpiexec --mca pml ucx --mca osc ucx ... +# Note that you might also need to set UCX_MEMTYPE_CACHE=n for CUDA awareness via UCX. +# Please consult UCX's documentation for detail. ulimit -s 10240 module load slurm/20.02.7 @@ -26,7 +41,11 @@ cp -r . /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU cd /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU #mpirun --mca btl_openib_if_include mlx5_0 --mca btl_openib_warn_no_device_params_found 0 $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile #nsys profile -t nvtx,cuda mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile -mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile +#mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile + +export OMPI_MCA_opal_cuda_support=true +mpirun -np $SLURM_NTASKS $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile + cd $HOME/src/ShallowWaterGPU ## Copy files from work directory: diff --git a/dgx-2_strong_scaling_benchmark.sh b/dgx-2_strong_scaling_benchmark.sh index 9c94602..cf6121d 100644 --- a/dgx-2_strong_scaling_benchmark.sh +++ b/dgx-2_strong_scaling_benchmark.sh @@ -2,21 +2,72 @@ TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S") -# one node: 1-8 GPUs -sbatch --nodes=1 --gpus-per-node=1 
--ntasks-per-node=1 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=4096,NY=2048,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=4096,NY=1365,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=4096,NY=1024,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=4096,NY=819,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=4096,NY=683,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=4096,NY=585,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job +# one node: 1-16 GPUs +#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=4096,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=2731,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=2048,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=1638,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=1365,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=1170,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 
--gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=1024,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +# +#sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=8192,NY=910,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=8192,NY=819,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=8192,NY=745,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=8192,NY=683,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=8192,NY=630,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=8192,NY=585,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=8192,NY=546,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=8192,NY=512,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 
--gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job +# one node: 4-16 GPUs +#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=41984,NY=10496,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=41984,NY=8396,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=41984,NY=6997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=41984,NY=5997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=41984,NY=5248,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +# +#sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=41984,NY=4664,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=41984,NY=4198,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=41984,NY=3816,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=41984,NY=3498,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=41984,NY=3229,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=41984,NY=2998,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=41984,NY=2798,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=41984,NY=2624,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job + +# one node: 
1-16 GPUs +sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=22528,NY=11264,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=22528,NY=7509,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=22528,NY=5632,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=22528,NY=4505,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=22528,NY=3754,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=22528,NY=3218,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=22528,NY=2816,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job + +sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=22528,NY=2503,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=22528,NY=2252,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=22528,NY=2048,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=22528,NY=1877,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=22528,NY=1732,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=22528,NY=1609,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=22528,NY=1501,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=16 
--ntasks-per-node=16 --export=ALL,NX=22528,NY=1408,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job + +# one node: 4-16 GPUs (strong scaling: NX=45056 is fixed, NY=floor(45056/ranks) so the total domain stays constant) +sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=45056,NY=11264,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=45056,NY=9011,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=45056,NY=7509,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=45056,NY=6436,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=45056,NY=5632,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job + +sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=45056,NY=5006,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=45056,NY=4505,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=45056,NY=4096,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=45056,NY=3754,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=45056,NY=3465,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=45056,NY=3218,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=45056,NY=3003,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=45056,NY=2816,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job diff --git a/dgx-2_weak_scaling_benchmark.sh b/dgx-2_weak_scaling_benchmark.sh index 4bb0772..fddabf9 100644 --- 
a/dgx-2_weak_scaling_benchmark.sh +++ b/dgx-2_weak_scaling_benchmark.sh 
@@ -2,21 +2,40 @@ TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S") -# one node: 1-8 GPUs -sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job +# one node: 1-16 GPUs +#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=7 
--ntasks-per-node=7 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +# +#sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=14 
--ntasks-per-node=14 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job +# one node: 1-16 GPUs +sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job + +sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 
--gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job \ No newline at end of file diff --git a/hgx_scaling_benchmark.job b/hgx_scaling_benchmark.job new file mode 100644 index 0000000..f072fcf --- /dev/null +++ b/hgx_scaling_benchmark.job @@ -0,0 +1,58 @@ +#!/bin/bash +# See http://wiki.ex3.simula.no before changing the values below +#SBATCH -p hgx2q # partition (GPU queue) +#SBATCH -w g002 # HGX node +#SBATCH -t 0-00:10 # time (D-HH:MM) +#SBATCH -o slurm.%N.%j.out # STDOUT +#SBATCH -e slurm.%N.%j.err # STDERR +#SBATCH --reservation=martinls_11 + + +# For Linux 64, Open MPI is built with CUDA awareness but this support is disabled by default. +# To enable it, please set the environment variable OMPI_MCA_opal_cuda_support=true before +# launching your MPI processes. Equivalently, you can set the MCA parameter in the command line: +# mpiexec --mca opal_cuda_support 1 ... +# +# In addition, the UCX support is also built but disabled by default. +# To enable it, first install UCX (conda install -c conda-forge ucx). Then, set the environment +# variables OMPI_MCA_pml="ucx" OMPI_MCA_osc="ucx" before launching your MPI processes. +# Equivalently, you can set the MCA parameters in the command line: +# mpiexec --mca pml ucx --mca osc ucx ... +# Note that you might also need to set UCX_MEMTYPE_CACHE=n for CUDA awareness via UCX. +# Please consult UCX's documentation for detail. 
+ +ulimit -s 10240 +module load slurm/20.02.7 +module load cuda11.2/toolkit/11.2.2 +module load openmpi4-cuda11.2-ofed50-gcc8/4.1.0 + +# Check how many gpu's your job got +#nvidia-smi + +mkdir -p output_hgx/$NOW + +## Copy input files to the work directory: +mkdir -p /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU +cp -r . /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU + +# Run job +# (Assumes Miniconda is installed in user root dir.) +cd /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU +#mpirun --mca btl_openib_if_include mlx5_0 --mca btl_openib_warn_no_device_params_found 0 $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile +#nsys profile -t nvtx,cuda mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile +#mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile + +export OMPI_MCA_opal_cuda_support=true +mpirun -np $SLURM_NTASKS $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile + +cd $HOME/src/ShallowWaterGPU + +## Copy files from work directory: +# (NOTE: Copying is not performed if job fails!) 
+mkdir -p output_hgx/$NOW/$SLURM_JOB_ID +mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.log ./output_hgx/$NOW/$SLURM_JOB_ID +mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.nc ./output_hgx/$NOW/$SLURM_JOB_ID +mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.json ./output_hgx/$NOW +mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.qdrep ./output_hgx/$NOW + +rm -rf "/work/${USER:?}/${SLURM_JOB_ID:?}" # ':?' aborts instead of wiping /work/$USER (or /work) when a variable is unset diff --git a/hgx_strong_scaling_benchmark.sh b/hgx_strong_scaling_benchmark.sh new file mode 100644 index 0000000..f92c611 --- /dev/null +++ b/hgx_strong_scaling_benchmark.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S") + +# one node: 1-8 GPUs +#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=4096,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=2731,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=2048,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=1638,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=1365,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=1170,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=1024,NOW=$TIMESTAMP hgx_scaling_benchmark.job + +# one node: 4-8 GPUs +sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=41984,NY=10496,NOW=$TIMESTAMP hgx_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=41984,NY=8396,NOW=$TIMESTAMP hgx_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=6
--ntasks-per-node=6 --export=ALL,NX=41984,NY=6997,NOW=$TIMESTAMP hgx_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=41984,NY=5997,NOW=$TIMESTAMP hgx_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=41984,NY=5248,NOW=$TIMESTAMP hgx_scaling_benchmark.job diff --git a/hgx_weak_scaling_benchmark.sh b/hgx_weak_scaling_benchmark.sh new file mode 100644 index 0000000..11b91d5 --- /dev/null +++ b/hgx_weak_scaling_benchmark.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S") + +# one node: 1-8 GPUs (disabled runs with NX=NY=8192) +#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job +#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job + +# one node: 1-8 GPUs (active runs with NX=NY=20480) +sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3
--export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job diff --git a/mpiTesting.py b/mpiTesting.py index 42e3bad..d33116b 100644 --- a/mpiTesting.py +++ b/mpiTesting.py @@ -114,13 +114,13 @@ logger.info("Generating initial conditions") nx = args.nx ny = args.ny -dt = 0.00001 +dt = 0.000001 gamma = 1.4 #save_times = np.linspace(0, 0.000009, 2) #save_times = np.linspace(0, 0.000099, 11) #save_times = np.linspace(0, 0.000099, 2) -save_times = np.linspace(0, 0.000999, 2) +save_times = np.linspace(0, 0.0000999, 2) outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc" save_var_names = ['rho', 'rho_u', 'rho_v', 'E'] @@ -183,6 +183,8 @@ if(args.profile and MPI.COMM_WORLD.rank == 0): profiling_data["slurm_job_id"] = job_id profiling_data["n_cuda_devices"] = str(num_cuda_devices) profiling_data["n_processes"] = str(MPI.COMM_WORLD.size) + profiling_data["git_hash"] = Common.getGitHash() + profiling_data["git_status"] = Common.getGitStatus() with open(profiling_file, "w") as write_file: json.dump(profiling_data, write_file) diff --git a/saga_strong_scaling_benchmark.job b/saga_scaling_benchmark.job similarity index 59% rename from saga_strong_scaling_benchmark.job rename to saga_scaling_benchmark.job index fc61ffb..4ba1297 100644 --- a/saga_strong_scaling_benchmark.job +++ b/saga_scaling_benchmark.job @@ -1,6 +1,6 @@ #!/bin/bash # 
Job name: -#SBATCH --job-name=ShallowWaterGPUStrongScaling +#SBATCH --job-name=ShallowWaterGPUScaling # # Project: #SBATCH --account=nn9882k @@ -16,7 +16,8 @@ #SBATCH --partition=accel # # Max memory usage per task (core) - increasing this will cost more core hours: -#SBATCH --mem-per-cpu=3800M +##SBATCH --mem-per-cpu=3800M +#SBATCH --mem-per-cpu=24G # #SBATCH --qos=devel @@ -26,6 +27,8 @@ module restore system # instead of 'module purge' rather set module environment to the system default module load CUDA/11.4.1 +#module load CUDA/11.1.1-GCC-10.2.0 +#module load OpenMPI/4.0.5-gcccuda-2020b # It is also recommended to to list loaded modules, for easier debugging: module list @@ -40,12 +43,23 @@ cp -r . $SCRATCH/ShallowWaterGPU ## Make sure the results are copied back to the submit directory (see Work Directory below): # chkfile MyResultFile # chkfile is replaced by 'savefile' on Saga -savefile "$SCRATCH/ShallowWaterGPU/*.log" -savefile "$SCRATCH/ShallowWaterGPU/*.nc" -savefile "$SCRATCH/ShallowWaterGPU/*.json" +#savefile "$SCRATCH/ShallowWaterGPU/*.log" +#savefile "$SCRATCH/ShallowWaterGPU/*.nc" +#savefile "$SCRATCH/ShallowWaterGPU/*.json" +#savefile "$SCRATCH/ShallowWaterGPU/*.qdrep" + +cleanup "rm -rf $SCRATCH/ShallowWaterGPU" + +export OMPI_MCA_opal_cuda_support=true ## Do some work: cd $SCRATCH/ShallowWaterGPU -srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version -srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile +srun /cluster/projects/nn9882k/martinls/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version +srun /cluster/projects/nn9882k/martinls/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile +cd $HOME/src/ShallowWaterGPU +mkdir -p output_saga/$NOW/$SLURM_JOB_ID +mv $SCRATCH/ShallowWaterGPU/*.log ./output_saga/$NOW/$SLURM_JOB_ID +mv $SCRATCH/ShallowWaterGPU/*.nc ./output_saga/$NOW/$SLURM_JOB_ID +mv $SCRATCH/ShallowWaterGPU/*.json ./output_saga/$NOW +mv 
$SCRATCH/ShallowWaterGPU/*.qdrep ./output_saga/$NOW diff --git a/saga_strong_scaling_benchmark.sh b/saga_strong_scaling_benchmark.sh index 96ba541..e550aca 100644 --- a/saga_strong_scaling_benchmark.sh +++ b/saga_strong_scaling_benchmark.sh @@ -1,13 +1,30 @@ #!/bin/bash -# one node: 1-4 GPUs -sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=1024,NY=1024 saga_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=1024,NY=512 saga_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=1024,NY=341 saga_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=512,NY=512 saga_strong_scaling_benchmark.job +TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S") -# 2-4 nodes: 1 GPUs per node -sbatch --nodes=2 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=1024,NY=512 saga_strong_scaling_benchmark.job -sbatch --nodes=3 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=1024,NY=341 saga_strong_scaling_benchmark.job -sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=512,NY=512 saga_strong_scaling_benchmark.job +# one node: 1–4 GPUs +sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 1 ranks +sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=10240,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks +sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=6826,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks +sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=5120,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks +# 4 nodes: 1–4 GPUs per node +sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=5120,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks +sbatch --nodes=4 --gpus-per-node=2 --ntasks-per-node=2 
--export=ALL,NX=20480,NY=2560,NOW=$TIMESTAMP saga_scaling_benchmark.job # 8 ranks +sbatch --nodes=4 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=1706,NOW=$TIMESTAMP saga_scaling_benchmark.job # 12 ranks +sbatch --nodes=4 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=1280,NOW=$TIMESTAMP saga_scaling_benchmark.job # 16 ranks + +# 4 nodes: 1–4 GPUs per node +sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=40960,NY=10240,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks +sbatch --nodes=4 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=40960,NY=5120,NOW=$TIMESTAMP saga_scaling_benchmark.job # 8 ranks +sbatch --nodes=4 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=40960,NY=3413,NOW=$TIMESTAMP saga_scaling_benchmark.job # 12 ranks +sbatch --nodes=4 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=40960,NY=2560,NOW=$TIMESTAMP saga_scaling_benchmark.job # 16 ranks + +## one node: 1–4 GPUs +#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=24576,NY=6144,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks +# +## 4 nodes: 1–4 GPUs per node +#sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=24576,NY=6144,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks +#sbatch --nodes=4 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=24576,NY=3072,NOW=$TIMESTAMP saga_scaling_benchmark.job # 8 ranks +#sbatch --nodes=4 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=24576,NY=2048,NOW=$TIMESTAMP saga_scaling_benchmark.job # 12 ranks +#sbatch --nodes=4 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=24576,NY=1536,NOW=$TIMESTAMP saga_scaling_benchmark.job # 16 ranks diff --git a/saga_weak_scaling_benchmark.sh b/saga_weak_scaling_benchmark.sh new file mode 100644 index 0000000..70da66b --- /dev/null +++ b/saga_weak_scaling_benchmark.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S") + +# one node: 1-4 GPUs +sbatch --nodes=1 
--gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 1 ranks +sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks +sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks +sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks + +# 2-4 nodes: 1 GPUs per node +sbatch --nodes=2 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks +sbatch --nodes=3 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks +sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks + +## one node: 1-4 GPUs +#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 1 ranks +#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks +#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks +#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks + +## 2-4 nodes: 1 GPUs per node +#sbatch --nodes=2 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks +#sbatch --nodes=3 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks +#sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP 
saga_scaling_benchmark.job # 4 ranks \ No newline at end of file diff --git a/seymour_strong_scaling_benchmark.sh b/seymour_strong_scaling_benchmark.sh index 33fe534..a894640 100644 --- a/seymour_strong_scaling_benchmark.sh +++ b/seymour_strong_scaling_benchmark.sh @@ -4,35 +4,35 @@ NOW=$(date "+%Y-%m-%dT%H%M%S") mkdir -p output_seymour/$NOW # one node: 1-8 GPUs -mpiexec -n 1 python mpiTesting.py -nx 4096 -ny 4096 --profile && +mpiexec -n 1 python mpiTesting.py -nx 8192 -ny 8192 --profile && mkdir -p output_seymour/$NOW/1_proc && mv *.log output_seymour/$NOW/1_proc/ && mv *.nc output_seymour/$NOW/1_proc/ && -mpiexec -n 2 python mpiTesting.py -nx 4096 -ny 2048 --profile && +mpiexec -n 2 python mpiTesting.py -nx 8192 -ny 4096 --profile && mkdir -p output_seymour/$NOW/2_proc && mv *.log output_seymour/$NOW/2_proc/ && mv *.nc output_seymour/$NOW/2_proc/ && -mpiexec -n 3 python mpiTesting.py -nx 4096 -ny 1365 --profile && +mpiexec -n 3 python mpiTesting.py -nx 8192 -ny 2731 --profile && mkdir -p output_seymour/$NOW/3_proc && mv *.log output_seymour/$NOW/3_proc/ && mv *.nc output_seymour/$NOW/3_proc/ && -mpiexec -n 4 python mpiTesting.py -nx 4096 -ny 1024 --profile && +mpiexec -n 4 python mpiTesting.py -nx 8192 -ny 2048 --profile && mkdir -p output_seymour/$NOW/4_proc && mv *.log output_seymour/$NOW/4_proc/ && mv *.nc output_seymour/$NOW/4_proc/ && -mpiexec -n 5 python mpiTesting.py -nx 4096 -ny 819 --profile && +mpiexec -n 5 python mpiTesting.py -nx 8192 -ny 1638 --profile && mkdir -p output_seymour/$NOW/5_proc && mv *.log output_seymour/$NOW/5_proc/ && mv *.nc output_seymour/$NOW/5_proc/ && -mpiexec -n 6 python mpiTesting.py -nx 4096 -ny 683 --profile && +mpiexec -n 6 python mpiTesting.py -nx 8192 -ny 1365 --profile && mkdir -p output_seymour/$NOW/6_proc && mv *.log output_seymour/$NOW/6_proc/ && mv *.nc output_seymour/$NOW/6_proc/ && -mpiexec -n 7 python mpiTesting.py -nx 4096 -ny 585 --profile && +mpiexec -n 7 python mpiTesting.py -nx 8192 -ny 1170 --profile && mkdir 
-p output_seymour/$NOW/7_proc && mv *.log output_seymour/$NOW/7_proc/ && mv *.nc output_seymour/$NOW/7_proc/ && -mpiexec -n 8 python mpiTesting.py -nx 4096 -ny 512 --profile && +mpiexec -n 8 python mpiTesting.py -nx 8192 -ny 1024 --profile && mkdir -p output_seymour/$NOW/8_proc && mv *.log output_seymour/$NOW/8_proc/ && mv *.nc output_seymour/$NOW/8_proc/ &&