#!/bin/bash

# Launch multi-GPU benchmarks on TCB surtur
# (HGX-A100: 8x A100-SXM4-40GB, NVSwitch, 2x AMD EPYC 7742 64-Core Processor)
#
# My impression: The AMD Rome CPUs in surtur don't keep up as well as the 
# newer AMD Milan CPUs in puck and portia.
#
# The initial launch attempts are preserved below as commented-out commands:
# 8 cores per GPU (same as GPU-resident) with +setcpuaffinity and no explicit
# +pemap. They gave lackluster performance.

# Path to the NAMD binary launched by every run in this script.
# Export NAMD before invoking the script to benchmark a different build;
# when NAMD is unset or empty, the default build below is used.
NAMD=${NAMD:-/Projects/dhardy/namd_builds/NAMD_3.0b6_Linux-x86_64-multicore-CUDA/namd3}

# --- 1 GPU: +p16 with cores 0-15 in the PE map, device 0 ---
echo "STMV NVE GPU-offload on 1 GPU"
#
# Initial attempt (8 cores, +setcpuaffinity, no explicit +pemap), kept for
# reference:
#$NAMD +p8 +setcpuaffinity +devices 0 stmv_gpuoff_nve.namd |& tee out_stmv_gpuoff_nve_1gpu.log
#
# "$NAMD" is quoted so that a binary path containing spaces stays one word.
"$NAMD" +p16 \
  +pemap 0-15 \
  +devices 0 \
  stmv_gpuoff_nve.namd \
  |& tee out_stmv_gpuoff_nve_1gpu.log
# The pipeline's exit status is tee's, which would hide a failed NAMD run,
# so inspect NAMD's own status and report it without stopping the script.
namd_status=${PIPESTATUS[0]}
if (( namd_status != 0 )); then
  printf 'WARNING: 1-GPU run exited with status %d\n' "$namd_status" >&2
fi

# --- 2 GPUs: +p32, two 16-core ranges in the PE map, devices 0 and 4 ---
echo "STMV NVE GPU-offload on 2 GPUs"
#
# Initial attempt (8 cores per GPU, +setcpuaffinity, no explicit +pemap),
# kept for reference:
#$NAMD +p16 +setcpuaffinity +devices 0,1 stmv_gpuoff_nve.namd |& tee out_stmv_gpuoff_nve_2gpu.log
#
# "$NAMD" is quoted so that a binary path containing spaces stays one word.
# One +pemap range is given per device ID listed in +devices.
"$NAMD" +p32 \
  +pemap 0-15,64-79 \
  +devices 0,4 \
  stmv_gpuoff_nve.namd \
  |& tee out_stmv_gpuoff_nve_2gpu.log
# The pipeline's exit status is tee's, which would hide a failed NAMD run,
# so inspect NAMD's own status and report it without stopping the script.
namd_status=${PIPESTATUS[0]}
if (( namd_status != 0 )); then
  printf 'WARNING: 2-GPU run exited with status %d\n' "$namd_status" >&2
fi

# --- 4 GPUs: +p64, four 16-core ranges in the PE map, devices 0,4,2,6 ---
echo "STMV NVE GPU-offload on 4 GPUs"
#
# Initial attempt (8 cores per GPU, +setcpuaffinity, no explicit +pemap),
# kept for reference:
#$NAMD +p32 +setcpuaffinity +devices 0,1,2,3 stmv_gpuoff_nve.namd |& tee out_stmv_gpuoff_nve_4gpu.log
#
# "$NAMD" is quoted so that a binary path containing spaces stays one word.
# One +pemap range is given per device ID listed in +devices.
"$NAMD" +p64 \
  +pemap 0-15,64-79,32-47,96-111 \
  +devices 0,4,2,6 \
  stmv_gpuoff_nve.namd \
  |& tee out_stmv_gpuoff_nve_4gpu.log
# The pipeline's exit status is tee's, which would hide a failed NAMD run,
# so inspect NAMD's own status and report it without stopping the script.
namd_status=${PIPESTATUS[0]}
if (( namd_status != 0 )); then
  printf 'WARNING: 4-GPU run exited with status %d\n' "$namd_status" >&2
fi

# --- 8 GPUs: +p64, eight 8-core ranges in the PE map, all eight devices ---
echo "STMV NVE GPU-offload on 8 GPUs"
#
# Initial attempt (8 cores per GPU, +setcpuaffinity, no explicit +pemap),
# kept for reference:
#$NAMD +p64 +setcpuaffinity +devices 0,1,2,3,4,5,6,7 stmv_gpuoff_nve.namd |& tee out_stmv_gpuoff_nve_8gpu.log
#
# XXX note that the following 128-core run (16 cores per GPU) doesn't work
#$NAMD +p128 +pemap 0-15,64-79,32-47,96-111,16-31,80-95,48-63,112-127 +devices 0,4,2,6,1,5,3,7 stmv_gpuoff_nve.namd |& tee out_stmv_gpuoff_nve_16core_8gpu.log
#
# XXX also tried 96-core run with similarly poor performance
# NOTE(review): the first range below, 0-12, spans 13 cores while the other
# seven span 12, so the map lists 97 cores for +p96; 0-11 was probably
# intended. The command is left as recorded.
#$NAMD +p96 +pemap 0-12,64-75,32-43,96-107,16-27,80-91,48-59,112-123 +devices 0,4,2,6,1,5,3,7 stmv_gpuoff_nve.namd |& tee out_stmv_gpuoff_nve_12core_8gpu.log
#
# instead redistribute 64 cores over the 8 GPUs (8 cores each, one +pemap
# range per device ID listed in +devices).
# "$NAMD" is quoted so that a binary path containing spaces stays one word.
"$NAMD" +p64 \
  +pemap 0-7,64-71,32-39,96-103,16-23,80-87,48-55,112-119 \
  +devices 0,4,2,6,1,5,3,7 \
  stmv_gpuoff_nve.namd \
  |& tee out_stmv_gpuoff_nve_8gpu.log
# The pipeline's exit status is tee's, which would hide a failed NAMD run,
# so inspect NAMD's own status and report it without stopping the script.
namd_status=${PIPESTATUS[0]}
if (( namd_status != 0 )); then
  printf 'WARNING: 8-GPU run exited with status %d\n' "$namd_status" >&2
fi

