Mirror of https://github.com/smyalygames/FiniteVolumeGPU.git (synced 2025-10-31 20:27:40 +01:00)

Start of HIPIFYcation

commit 19ea8319e8
parent e7cd6ae34a
@@ -1,29 +1,28 @@
# Assumes that conda, pip, build-essentials and cuda are installed
---
name: ShallowWaterGPU
name: FiniteVolumeGPU_HPC
channels:
- conda-forge

dependencies:
- python=3.9
- python=3.13.5
- pip
- numpy
- matplotlib
- jupyter
- mpi4py
- six
- pytools
- netcdf4
- scipy
- nb_conda_kernels
- nbdime
- mpi4py
- ffmpeg
- pycuda
- ipyparallel
- line_profiler
- tqdm
- pip:
  - hip-python==6.4.1.552.39
  - -i https://test.pypi.org/simple/


# Install conda environment (one-time operation):
# $ conda env create -f conda_environment.yml
# Activate environment
# $ conda activate ShallowWaterGPU
# $ conda activate FiniteVolumeGPU_HPC

# OPTIONAL: If you want to compile pycuda yourself, uncomment pycuda under
# "dependencies" above and do the following (one-time operation):
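For reference, the two pip entries above together mean "install hip-python 6.4.1.552.39 from the Test PyPI index". Outside of conda, a roughly equivalent one-liner (an illustration, not part of the commit) would be:

$ pip install -i https://test.pypi.org/simple/ hip-python==6.4.1.552.39

In practice --extra-index-url may be preferable to -i, so that any dependencies of hip-python still resolve from the default PyPI index.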
@@ -1,23 +0,0 @@
# Assumes that conda, pip, build-essentials and cuda are installed
---
name: ShallowWaterGPU_HPC
channels:
- conda-forge

dependencies:
- python=3.7
- numpy
- mpi4py
- six
- pytools
- netcdf4
- scipy
- tqdm

# Install conda environment (one-time operation):
# $ conda env create -f conda_environment_hpc.yml
# Activate environment and install the following packages using pip:
# $ conda activate ShallowWaterGPU_HPC
#  - pycuda: $ pip3 install --no-deps -U pycuda
# on Windows: make sure your visual studio c++ compiler is available in PATH
# PATH should have something like C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\
@@ -1,33 +0,0 @@
#!/bin/bash
#SBATCH -p dgx2q                   # partition (GPU queue)
#SBATCH -N 1                       # number of nodes
#SBATCH -n 1                       # number of cores
#SBATCH -w g001                    # DGX-2 node
#SBATCH --gres=gpu:1               # number of V100's
#SBATCH -t 0-00:10                 # time (D-HH:MM)
#SBATCH -o slurm.%N.%j.out  # STDOUT
#SBATCH -e slurm.%N.%j.err  # STDERR

ulimit -s 10240
module load slurm
module load cuda10.1/toolkit/10.1.243

# Check how many gpu's your job got
#nvidia-smi

## Copy input files to the work directory:
rm -rf /work/$USER/ShallowWaterGPU
mkdir -p /work/$USER/ShallowWaterGPU
cp -r . /work/$USER/ShallowWaterGPU

# Run job
# (Assumes Miniconda is installed in user root dir.)
cd /work/$USER/ShallowWaterGPU
nvprof -o profiler_output $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 shmemTesting.py
cd $HOME/src/ShallowWaterGPU

## Copy files from work directory:
# (NOTE: Copying is not performed if job fails!)
cp /work/$USER/ShallowWaterGPU/*.log .
cp /work/$USER/ShallowWaterGPU/*.nc .
cp /work/$USER/ShallowWaterGPU/profiler_output .
@@ -1,35 +0,0 @@
#!/bin/bash
# See http://wiki.ex3.simula.no before changing the values below
#SBATCH -p dgx2q                   # partition (GPU queue)
#SBATCH -N 1                       # number of nodes
#SBATCH -n 4                       # number of cores
#SBATCH -w g001                    # DGX-2 node
#SBATCH --gres=gpu:4               # number of V100's
#SBATCH -t 0-00:10                 # time (D-HH:MM)
#SBATCH -o slurm.%N.%j.out  # STDOUT
#SBATCH -e slurm.%N.%j.err  # STDERR

ulimit -s 10240
module load slurm
module load openmpi/4.0.1
module load cuda10.1/toolkit/10.1.243

# Check how many gpu's your job got
#nvidia-smi

## Copy input files to the work directory:
rm -rf /work/$USER/ShallowWaterGPU
mkdir -p /work/$USER/ShallowWaterGPU
cp -r . /work/$USER/ShallowWaterGPU

# Run job
# (Assumes Miniconda is installed in user root dir.)
cd /work/$USER/ShallowWaterGPU
mpirun --mca btl_openib_if_include mlx5_0 --mca btl_openib_warn_no_device_params_found 0 $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py
cd $HOME/src/ShallowWaterGPU

## Copy files from work directory:
# (NOTE: Copying is not performed if job fails!)
cp /work/$USER/ShallowWaterGPU/*.log .
cp /work/$USER/ShallowWaterGPU/*.nc .
@@ -1,59 +0,0 @@
#!/bin/bash
# See http://wiki.ex3.simula.no before changing the values below
#SBATCH -p dgx2q                   # partition (GPU queue)
#SBATCH -w g001                    # DGX-2 node
##SBATCH --gres=gpu:4               # number of V100's
#SBATCH -t 0-00:10                 # time (D-HH:MM)
#SBATCH -o slurm.%N.%j.out  # STDOUT
#SBATCH -e slurm.%N.%j.err  # STDERR
#SBATCH --reservation=martinls_17


# For Linux 64, Open MPI is built with CUDA awareness but this support is disabled by default.
# To enable it, please set the environment variable OMPI_MCA_opal_cuda_support=true before
# launching your MPI processes. Equivalently, you can set the MCA parameter in the command line:
# mpiexec --mca opal_cuda_support 1 ...
#
# In addition, the UCX support is also built but disabled by default.
# To enable it, first install UCX (conda install -c conda-forge ucx). Then, set the environment
# variables OMPI_MCA_pml="ucx" OMPI_MCA_osc="ucx" before launching your MPI processes.
# Equivalently, you can set the MCA parameters in the command line:
# mpiexec --mca pml ucx --mca osc ucx ...
# Note that you might also need to set UCX_MEMTYPE_CACHE=n for CUDA awareness via UCX.
# Please consult UCX's documentation for detail.

ulimit -s 10240
module load slurm/20.02.7
module load cuda11.2/toolkit/11.2.2
module load openmpi4-cuda11.2-ofed50-gcc8/4.1.0

# Check how many gpu's your job got
#nvidia-smi

mkdir -p output_dgx-2/$NOW

## Copy input files to the work directory:
mkdir -p /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
cp -r . /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU

# Run job
# (Assumes Miniconda is installed in user root dir.)
cd /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
#mpirun --mca btl_openib_if_include mlx5_0 --mca btl_openib_warn_no_device_params_found 0 $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
#nsys profile -t nvtx,cuda mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
#mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile

export OMPI_MCA_opal_cuda_support=true
mpirun -np $SLURM_NTASKS $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile

cd $HOME/src/ShallowWaterGPU

## Copy files from work directory:
# (NOTE: Copying is not performed if job fails!)
mkdir -p output_dgx-2/$NOW/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.log ./output_dgx-2/$NOW/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.nc ./output_dgx-2/$NOW/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.json ./output_dgx-2/$NOW
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.qdrep ./output_dgx-2/$NOW

rm -rf /work/$USER/$SLURM_JOB_ID
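The job above enables CUDA-aware Open MPI only through OMPI_MCA_opal_cuda_support. The UCX route described in the script's own comment block would look roughly like the following sketch (assuming UCX is actually installed in the ShallowWaterGPU_HPC environment):

export OMPI_MCA_pml=ucx OMPI_MCA_osc=ucx
export UCX_MEMTYPE_CACHE=n   # recommended in the comments for CUDA awareness via UCX
mpirun -np $SLURM_NTASKS $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile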
@@ -1,73 +0,0 @@
#!/bin/bash

TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")

# one node: 1-16 GPUs
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=4096,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=2731,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=2048,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=1638,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=1365,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=1170,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=1024,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#
#sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=8192,NY=910,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=8192,NY=819,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=8192,NY=745,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=8192,NY=683,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=8192,NY=630,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=8192,NY=585,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=8192,NY=546,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=8192,NY=512,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job

# one node: 4-16 GPUs
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=41984,NY=10496,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=41984,NY=8396,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=41984,NY=6997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=41984,NY=5997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=41984,NY=5248,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#
#sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=41984,NY=4664,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=41984,NY=4198,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=41984,NY=3816,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=41984,NY=3498,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=41984,NY=3229,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=41984,NY=2998,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=41984,NY=2798,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=41984,NY=2624,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job

# one node: 1-16 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=22528,NY=11264,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=22528,NY=7509,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=22528,NY=5632,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=22528,NY=4505,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=22528,NY=3754,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=22528,NY=3218,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=22528,NY=2816,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job

sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=22528,NY=2503,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=22528,NY=2252,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=22528,NY=2048,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=22528,NY=1877,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=22528,NY=1732,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=22528,NY=1609,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=22528,NY=1501,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=22528,NY=1408,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job

# one node: 4-16 GPUs
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=45056,NY=11264,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=45056,NY=8396,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=45056,NY=6997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=45056,NY=5997,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=45056,NY=5248,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job

sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=45056,NY=4664,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=45056,NY=4198,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=45056,NY=3816,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=45056,NY=3498,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=45056,NY=3229,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=45056,NY=2998,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=45056,NY=2798,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=45056,NY=2624,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
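Every launch above follows the same pattern: NX stays fixed while NY is NX divided by the GPU count (integer division), so the NY values of the active 22528 series are exactly 22528/n. A hypothetical loop that would generate that series (a sketch, not part of the commit) is:

NX=22528
for n in $(seq 1 16); do
  NY=$((NX / n))   # integer division reproduces the NY values listed above
  sbatch --nodes=1 --gpus-per-node=$n --ntasks-per-node=$n \
         --export=ALL,NX=$NX,NY=$NY,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
done

Note that the final 45056-wide series reuses the NY values of the commented-out 41984-wide series for 5-16 GPUs (for example 45056/5 is 9011, not 8396), which looks like a copy-paste slip in the original script.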
@@ -1,41 +0,0 @@
#!/bin/bash

TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")

# one node: 1-16 GPUs
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#
#sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job

# one node: 1-16 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job

sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=22528,NY=22528,NOW=$TIMESTAMP dgx-2_scaling_benchmark.job
@@ -1,58 +0,0 @@
#!/bin/bash
# See http://wiki.ex3.simula.no before changing the values below
#SBATCH -p hgx2q                   # partition (GPU queue)
#SBATCH -w g002                    # HGX node
#SBATCH -t 0-00:10                 # time (D-HH:MM)
#SBATCH -o slurm.%N.%j.out  # STDOUT
#SBATCH -e slurm.%N.%j.err  # STDERR
#SBATCH --reservation=martinls_11


# For Linux 64, Open MPI is built with CUDA awareness but this support is disabled by default.
# To enable it, please set the environment variable OMPI_MCA_opal_cuda_support=true before
# launching your MPI processes. Equivalently, you can set the MCA parameter in the command line:
# mpiexec --mca opal_cuda_support 1 ...
#
# In addition, the UCX support is also built but disabled by default.
# To enable it, first install UCX (conda install -c conda-forge ucx). Then, set the environment
# variables OMPI_MCA_pml="ucx" OMPI_MCA_osc="ucx" before launching your MPI processes.
# Equivalently, you can set the MCA parameters in the command line:
# mpiexec --mca pml ucx --mca osc ucx ...
# Note that you might also need to set UCX_MEMTYPE_CACHE=n for CUDA awareness via UCX.
# Please consult UCX's documentation for detail.

ulimit -s 10240
module load slurm/20.02.7
module load cuda11.2/toolkit/11.2.2
module load openmpi4-cuda11.2-ofed50-gcc8/4.1.0

# Check how many gpu's your job got
#nvidia-smi

mkdir -p output_hgx/$NOW

## Copy input files to the work directory:
mkdir -p /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
cp -r . /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU

# Run job
# (Assumes Miniconda is installed in user root dir.)
cd /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
#mpirun --mca btl_openib_if_include mlx5_0 --mca btl_openib_warn_no_device_params_found 0 $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
#nsys profile -t nvtx,cuda mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
#mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile

export OMPI_MCA_opal_cuda_support=true
mpirun -np $SLURM_NTASKS $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile

cd $HOME/src/ShallowWaterGPU

## Copy files from work directory:
# (NOTE: Copying is not performed if job fails!)
mkdir -p output_hgx/$NOW/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.log ./output_hgx/$NOW/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.nc ./output_hgx/$NOW/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.json ./output_hgx/$NOW
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.qdrep ./output_hgx/$NOW

rm -rf /work/$USER/$SLURM_JOB_ID
@@ -1,20 +0,0 @@
#!/bin/bash

TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")

# one node: 1-8 GPUs
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=4096,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=2731,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=2048,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=1638,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=1365,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=1170,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=1024,NOW=$TIMESTAMP hgx_scaling_benchmark.job

# one node: 4-8 GPUs
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=41984,NY=10496,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=41984,NY=8396,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=41984,NY=6997,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=41984,NY=5997,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=41984,NY=5248,NOW=$TIMESTAMP hgx_scaling_benchmark.job
@@ -1,23 +0,0 @@
#!/bin/bash

TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")

# one node: 1-16 GPUs
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=8192,NY=8192,NOW=$TIMESTAMP hgx_scaling_benchmark.job

# one node: 1-8 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP hgx_scaling_benchmark.job
@@ -1,8 +0,0 @@
#!/bin/bash
module purge
module load git/2.21.0 hdf5/1.10.5-gcc cuda/10.1

conda activate ShallowWaterGPU_HPC

python mpiTesting.py
saga-dev.job (54 lines)
@@ -1,54 +0,0 @@
#!/bin/bash
# Job name:
#SBATCH --job-name=ShallowWaterGPUScalingDev
#
# Project:
#SBATCH --account=nn9882k
#
# Wall clock limit:
#SBATCH --time=00:02:00
#
# NOTE: See https://documentation.sigma2.no/jobs/projects_accounting.html when adjusting the values below
#
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
#SBATCH --partition=accel
#
# Max memory usage per task (core) - increasing this will cost more core hours:
#SBATCH --mem-per-cpu=3800M
#
# Number of tasks:
#SBATCH --nodes=1 --gpus-per-node=1 --ntasks-per-node=1
#
#SBATCH --qos=devel

## Set up job environment: (this is done automatically behind the scenes)
## (make sure to comment '#' or remove the following line 'source ...')
# source /cluster/bin/jobsetup

module restore system   # instead of 'module purge' rather set module environment to the system default
module load CUDA/11.4.1

# It is also recommended to to list loaded modules, for easier debugging:
module list

set -o errexit # exit on errors
set -o nounset # Treat unset variables as errors (added for more easily discovering issues in your batch script)

## Copy input files to the work directory:
mkdir $SCRATCH/ShallowWaterGPU
cp -r . $SCRATCH/ShallowWaterGPU

## Make sure the results are copied back to the submit directory (see Work Directory below):
# chkfile MyResultFileq
# chkfile is replaced by 'savefile' on Saga
savefile "$SCRATCH/ShallowWaterGPU/*.log"
savefile "$SCRATCH/ShallowWaterGPU/*.nc"
savefile "$SCRATCH/ShallowWaterGPU/*.json"

## Do some work:
cd $SCRATCH/ShallowWaterGPU
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 1024 -ny 1024 --profile
@@ -1,52 +0,0 @@
#!/bin/bash
# Job name:
#SBATCH --job-name=ShallowWaterGPUStrongScaling
#
# Project:
#SBATCH --account=nn9882k
#
# Wall clock limit:
#SBATCH --time=24:00:00
#
# NOTE: See https://documentation.sigma2.no/jobs/projects_accounting.html when adjusting the values below
#
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
#SBATCH --partition=accel
#
# Max memory usage per task (core) - increasing this will cost more core hours:
#SBATCH --mem-per-cpu=3800M
#
# Number of tasks:
#SBATCH --nodes=1 --gpus-per-node=1 --ntasks-per-node=1

## Set up job environment: (this is done automatically behind the scenes)
## (make sure to comment '#' or remove the following line 'source ...')
# source /cluster/bin/jobsetup

module restore system   # instead of 'module purge' rather set module environment to the system default
module load CUDA/10.2.89

# It is also recommended to to list loaded modules, for easier debugging:
module list

set -o errexit # exit on errors
set -o nounset # Treat unset variables as errors (added for more easily discovering issues in your batch script)

## Copy input files to the work directory:
mkdir $SCRATCH/ShallowWaterGPU
cp -r . $SCRATCH/ShallowWaterGPU

## Make sure the results are copied back to the submit directory (see Work Directory below):
# chkfile MyResultFile
# chkfile is replaced by 'savefile' on Saga
savefile "$SCRATCH/ShallowWaterGPU/*.log"
savefile "$SCRATCH/ShallowWaterGPU/*.nc"
savefile "$SCRATCH/ShallowWaterGPU/*.json"

## Do some work:
cd $SCRATCH/ShallowWaterGPU
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 8192 -ny 8192 --profile
@@ -1,65 +0,0 @@
#!/bin/bash
# Job name:
#SBATCH --job-name=ShallowWaterGPUScaling
#
# Project:
#SBATCH --account=nn9882k
#
# Wall clock limit:
#SBATCH --time=00:10:00
#
# NOTE: See https://documentation.sigma2.no/jobs/projects_accounting.html when adjusting the values below
#
# Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
# device(s) to use. It will have values '0', '1' or '0,1' corresponding to
# /dev/nvidia0, /dev/nvidia1 or both, respectively.
#SBATCH --partition=accel
#
# Max memory usage per task (core) - increasing this will cost more core hours:
##SBATCH --mem-per-cpu=3800M
#SBATCH --mem-per-cpu=24G
#
#SBATCH --qos=devel

## Set up job environment: (this is done automatically behind the scenes)
## (make sure to comment '#' or remove the following line 'source ...')
# source /cluster/bin/jobsetup

module restore system   # instead of 'module purge' rather set module environment to the system default
module load CUDA/11.4.1
#module load CUDA/11.1.1-GCC-10.2.0
#module load OpenMPI/4.0.5-gcccuda-2020b

# It is also recommended to to list loaded modules, for easier debugging:
module list

set -o errexit # exit on errors
set -o nounset # Treat unset variables as errors (added for more easily discovering issues in your batch script)

## Copy input files to the work directory:
mkdir $SCRATCH/ShallowWaterGPU
cp -r . $SCRATCH/ShallowWaterGPU

## Make sure the results are copied back to the submit directory (see Work Directory below):
# chkfile MyResultFile
# chkfile is replaced by 'savefile' on Saga
#savefile "$SCRATCH/ShallowWaterGPU/*.log"
#savefile "$SCRATCH/ShallowWaterGPU/*.nc"
#savefile "$SCRATCH/ShallowWaterGPU/*.json"
#savefile "$SCRATCH/ShallowWaterGPU/*.qdrep"

cleanup "rm -rf $SCRATCH/ShallowWaterGPU"

export OMPI_MCA_opal_cuda_support=true

## Do some work:
cd $SCRATCH/ShallowWaterGPU
srun /cluster/projects/nn9882k/martinls/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
srun /cluster/projects/nn9882k/martinls/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile

cd $HOME/src/ShallowWaterGPU
mkdir -p output_saga/$NOW/$SLURM_JOB_ID
mv $SCRATCH/ShallowWaterGPU/*.log ./output_saga/$NOW/$SLURM_JOB_ID
mv $SCRATCH/ShallowWaterGPU/*.nc ./output_saga/$NOW/$SLURM_JOB_ID
mv $SCRATCH/ShallowWaterGPU/*.json ./output_saga/$NOW
mv $SCRATCH/ShallowWaterGPU/*.qdrep ./output_saga/$NOW
@@ -1,30 +0,0 @@
#!/bin/bash

TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")

# one node: 1–4 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 1 ranks
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=10240,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=6826,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=5120,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks

# 4 nodes: 1–4 GPUs per node
sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=5120,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
sbatch --nodes=4 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=2560,NOW=$TIMESTAMP saga_scaling_benchmark.job # 8 ranks
sbatch --nodes=4 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=1706,NOW=$TIMESTAMP saga_scaling_benchmark.job # 12 ranks
sbatch --nodes=4 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=1280,NOW=$TIMESTAMP saga_scaling_benchmark.job # 16 ranks

# 4 nodes: 1–4 GPUs per node
sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=40960,NY=10240,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
sbatch --nodes=4 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=40960,NY=5120,NOW=$TIMESTAMP saga_scaling_benchmark.job # 8 ranks
sbatch --nodes=4 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=40960,NY=3413,NOW=$TIMESTAMP saga_scaling_benchmark.job # 12 ranks
sbatch --nodes=4 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=40960,NY=2560,NOW=$TIMESTAMP saga_scaling_benchmark.job # 16 ranks

## one node: 1–4 GPUs
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=24576,NY=6144,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
#
## 4 nodes: 1–4 GPUs per node
#sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=24576,NY=6144,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
#sbatch --nodes=4 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=24576,NY=3072,NOW=$TIMESTAMP saga_scaling_benchmark.job # 8 ranks
#sbatch --nodes=4 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=24576,NY=2048,NOW=$TIMESTAMP saga_scaling_benchmark.job # 12 ranks
#sbatch --nodes=4 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=24576,NY=1536,NOW=$TIMESTAMP saga_scaling_benchmark.job # 16 ranks
@@ -1,25 +0,0 @@
#!/bin/bash

TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")

# one node: 1-4 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 1 ranks
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks

# 2-4 nodes: 1 GPUs per node
sbatch --nodes=2 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
sbatch --nodes=3 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=20480,NY=20480,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks

## one node: 1-4 GPUs
#sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 1 ranks
#sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
#sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
#sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks

## 2-4 nodes: 1 GPUs per node
#sbatch --nodes=2 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 2 ranks
#sbatch --nodes=3 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 3 ranks
#sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=12288,NY=12288,NOW=$TIMESTAMP saga_scaling_benchmark.job # 4 ranks
@@ -1,39 +0,0 @@
#!/bin/bash

NOW=$(date "+%Y-%m-%dT%H%M%S")
mkdir -p output_seymour/$NOW

# one node: 1-8 GPUs
mpiexec -n 1 python mpiTesting.py -nx 8192 -ny 8192 --profile &&
mkdir -p output_seymour/$NOW/1_proc &&
mv *.log output_seymour/$NOW/1_proc/ && mv *.nc output_seymour/$NOW/1_proc/ &&

mpiexec -n 2 python mpiTesting.py -nx 8192 -ny 4096 --profile &&
mkdir -p output_seymour/$NOW/2_proc &&
mv *.log output_seymour/$NOW/2_proc/ && mv *.nc output_seymour/$NOW/2_proc/ &&

mpiexec -n 3 python mpiTesting.py -nx 8192 -ny 2731 --profile &&
mkdir -p output_seymour/$NOW/3_proc &&
mv *.log output_seymour/$NOW/3_proc/ && mv *.nc output_seymour/$NOW/3_proc/ &&

mpiexec -n 4 python mpiTesting.py -nx 8192 -ny 2048 --profile &&
mkdir -p output_seymour/$NOW/4_proc &&
mv *.log output_seymour/$NOW/4_proc/ && mv *.nc output_seymour/$NOW/4_proc/ &&

mpiexec -n 5 python mpiTesting.py -nx 8192 -ny 1638 --profile &&
mkdir -p output_seymour/$NOW/5_proc &&
mv *.log output_seymour/$NOW/5_proc/ && mv *.nc output_seymour/$NOW/5_proc/ &&

mpiexec -n 6 python mpiTesting.py -nx 8192 -ny 1365 --profile &&
mkdir -p output_seymour/$NOW/6_proc &&
mv *.log output_seymour/$NOW/6_proc/ && mv *.nc output_seymour/$NOW/6_proc/ &&

mpiexec -n 7 python mpiTesting.py -nx 8192 -ny 1170 --profile &&
mkdir -p output_seymour/$NOW/7_proc &&
mv *.log output_seymour/$NOW/7_proc/ && mv *.nc output_seymour/$NOW/7_proc/ &&

mpiexec -n 8 python mpiTesting.py -nx 8192 -ny 1024 --profile &&
mkdir -p output_seymour/$NOW/8_proc &&
mv *.log output_seymour/$NOW/8_proc/ && mv *.nc output_seymour/$NOW/8_proc/ &&

for filename in *.json; do mv "$filename" "output_seymour/$NOW/MPI_${NOW}_${filename#????}"; done;
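The final loop renames each profiling output with bash parameter expansion: ${filename#????} strips the first four characters of the filename before it is re-prefixed with MPI_<timestamp>_. A tiny standalone illustration (the filename here is hypothetical, assuming the profiles carry a four-character prefix such as MPI_):

filename="MPI_rank0.json"; NOW="2025-01-01T120000"
echo "output_seymour/$NOW/MPI_${NOW}_${filename#????}"
# -> output_seymour/2025-01-01T120000/MPI_2025-01-01T120000_rank0.json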
Author: Anthony Berg