#!/usr/bin/env python3

import os
import grp
import argparse
import subprocess
from subprocess import Popen, PIPE
#from math import floor

#
# Define job submission defaults
#

# Use the group name as the default account
# For Purdue Anvil, peel off the "x-" prefix
acctdef = grp.getgrgid(os.getgid()).gr_name[2:]
jobdef = 'NAMD'
walldef = '24:00:00'
maxtime_debug = 2*60*60
maxtime_whole = 96*60*60
maxtime_wide = 12*60*60

# end job submission defaults

parser = argparse.ArgumentParser(description="Launch NAMD on Anvil CPU nodes")
parser.add_argument('namd_conf', metavar='NAMD_CONFIG', type=str,
                    help="NAMD input configuration file")
parser.add_argument('logfile', metavar='NAMD_LOGFILE', type=str,
                    help="NAMD output log file")
parser.add_argument('nodes', metavar='NODES', type=int,
                    help="number of nodes requested")
parser.add_argument('-j', '--jobname', type=str, default=jobdef,
                    help="job name (defaults to \"%s\")" % jobdef)
parser.add_argument('-w', '--walltime', type=str, default=walldef,
                    help="job running time (defaults to \"%s\")" % walldef)
parser.add_argument('-q', '--queue', type=str,
                    help="Slurm partition: wholenode, wide, or debug")
parser.add_argument('-a', '--account', type=str, default=acctdef,
                    help="account name (defaults to group name \"%s\")" % acctdef)
parser.add_argument('-r', '--replicas', type=int,
                    help="number of replicas")
parser.add_argument('-c', '--cmdline', type=str, default='',
                    help='additional command line options for NAMD (e.g. "+stdout %%j.log")')
parser.add_argument('-e', '--email', type=str,
                    help="user email for job notifications")
parser.add_argument('-d', '--depend', type=str,
                    help="job ID(s) this job depends on (submitted with --dependency=afterany)")
parser.add_argument('-v', '--view', action='store_true',
                    help="view batch script without submitting it")
args = parser.parse_args()

print("")
print("Launch NAMD on Anvil CPU nodes")
print("")

# Validate the HH:MM:SS walltime string and convert it to seconds
if len(args.walltime) != 8 or not (args.walltime[0:2].isdigit()
                                   and args.walltime[3:5].isdigit()
                                   and args.walltime[6:8].isdigit()
                                   and args.walltime[2] == ':'
                                   and args.walltime[5] == ':'):
    raise Exception("Format for walltime must be HH:MM:SS")
wall_hr = int(args.walltime[0:2])
wall_min = int(args.walltime[3:5])
wall_sec = int(args.walltime[6:8])
wall_total = (wall_hr*60 + wall_min)*60 + wall_sec

# Pick a default partition based on the node count
if not args.queue:
    if args.nodes <= 16:
        args.queue = "wholenode"
    else:
        args.queue = "wide"
    print("Queue not specified, choosing \"%s\" for node count %d"
          % (args.queue, args.nodes))

# Enforce per-partition node and walltime limits
if args.queue == "debug":
    if wall_total > maxtime_debug:
        raise Exception("Max walltime for \"debug\" job is 2 hours")
    if args.nodes > 2:
        raise Exception("Max nodes for \"debug\" job is 2")
elif args.queue == "wholenode":
    if wall_total > maxtime_whole:
        raise Exception("Max walltime for \"wholenode\" job is 96 hours")
    if args.nodes > 16:
        raise Exception("Max nodes for \"wholenode\" job is 16")
elif args.queue == "wide":
    if wall_total > maxtime_wide:
        raise Exception("Max walltime for \"wide\" job is 12 hours")
    if args.nodes > 56:
        raise Exception("Max nodes for \"wide\" job is 56")
else:
    raise Exception("Requested unsupported queue \"{}\"".format(args.queue))

print("Number of nodes is set to {}".format(args.nodes))
print("Queue is set to \"{}\"".format(args.queue))
print("Walltime is set to \"{}\"".format(args.walltime))

for filename in [args.namd_conf]:
    if not os.path.isfile(filename):
        raise Exception("NAMD input configuration file \"{}\" not found!".format(filename))
    else:
        print("Found NAMD input config file \"{}\"".format(filename))

for filename in [args.logfile]:
    if os.path.isfile(filename):
        raise Exception("NAMD output log file \"{}\" already exists!".format(filename))
    else:
        print("Storing console output to log file \"{}\"".format(filename))

# Convert float hours to string HH:MM:SS
#def format_walltime(total):
#    print("total = %s" % total)
#    t = total
#    h = floor(t)
#    m = floor((t - h) * 60)
#    s = floor(((t - h) * 60 - m) * 60)
#    return ":".join( ["%02d" % i for i in (h,m,s)] )

# Build the substitution dictionary for the batch script template
data = vars(args)
#data["walltime"] = format_walltime(args.walltime)
data["tasks_per_node"] = 16

if not args.depend:
    data["depend"] = ""
else:
    data["depend"] = "#SBATCH --dependency=afterany:" + args.depend

if args.email and len(args.email) > 0:
    data["mail-type"] = "#SBATCH --mail-type=all"
    data["mail-user"] = "#SBATCH --mail-user={}".format(args.email)
else:
    data["mail-type"] = ""
    data["mail-user"] = ""

if not args.replicas:
    data['replicas'] = ''
else:
    if args.replicas < 1:
        raise ValueError("The number of replicas must be greater than zero")
    if 0 != args.nodes % args.replicas:
        raise ValueError("The number of nodes ({}) must be divisible by "
                         "the number of replicas ({})".format(args.nodes, args.replicas))
    # appended to the namd2 arguments in the batch script below
    data['replicas'] = ' +replicas {}'.format(args.replicas)

data["cwd"] = os.getcwd()
data["fname"] = os.path.splitext(args.logfile)[0]

# near-optimal mapping for AMD EPYC 7003-series CPUs (two per node, SMT disabled)
# follow NUMA layout: every 8 cores share an L3 cache
data["namd_args"] = "+ppn 7 +pemap L1-127:8.7 +commap L0-127:8"

run_script = """#!/bin/bash
#SBATCH -A {account}
#SBATCH -J {jobname}
#SBATCH -N {nodes}   # Total number of nodes
#SBATCH --output="{fname}.%j.out"
#SBATCH --ntasks-per-node={tasks_per_node}
#SBATCH -p {queue}
#SBATCH -t {walltime}
{mail-type}
{mail-user}
{depend}

module load gcc
module load openmpi
module load namd/2.14

echo ""
echo "Listing of loaded modules:"
module list

echo ""
echo "NAMD path:"
which namd2

echo ""
echo "NAMD library linkage:"
ldd `which namd2`

cd {cwd}
pwd

echo ""
echo "Running the following command:"
echo 'srun namd2 {namd_args}{replicas} {cmdline} "{namd_conf}" >& "{logfile}"'

srun namd2 {namd_args}{replicas} {cmdline} "{namd_conf}" >& "{logfile}"
""".format(**data)

print("")
print("Here is the batch submission script:")
print("")
print(run_script)

if args.view:
    print("View mode enabled: batch script was not submitted!")
else:
    # Feed the generated script to sbatch on standard input
    proc = Popen('sbatch', stdin=PIPE, stdout=PIPE, stderr=PIPE)
    out, err = proc.communicate(run_script.encode("utf-8"))
    print(out.decode("utf-8"))
    print(err.decode("utf-8"))
    if proc.returncode != 0:
        raise Exception("Problem submitting script with sbatch")
    else:
        print("""
NAMD is submitted to the queue.

To check on your job, use this command:

    squeue -u %s

Your job is running if there is an R (running) in the S (state) column.

You may monitor progress with this command:

    tail -F %s

You will need to press Control-C to exit the tail command.
""" % (os.getlogin(), args.logfile))
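# ---------------------------------------------------------------------------
# Example invocations (a minimal sketch; the script name "namd_anvil_cpu.py"
# and the NAMD input/log file names below are hypothetical placeholders, not
# files shipped with this script):
#
#   # Preview the generated batch script for a 2-node debug run without
#   # submitting it:
#   ./namd_anvil_cpu.py sim.namd sim.log 2 -q debug -w 01:00:00 -v
#
#   # Submit an 8-node, 4-replica run that starts only after job 123456
#   # finishes, with e-mail notifications:
#   ./namd_anvil_cpu.py remd.namd remd.log 8 -r 4 -d 123456 -e user@example.edu
# ---------------------------------------------------------------------------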