AW: AW: Automatic GPU selection in NAMD ?

From: Norman Geist (norman.geist_at_uni-greifswald.de)
Date: Thu May 19 2011 - 01:41:37 CDT

Yes sure ;)
But I don't know whether it will work for you as it is, because it reads a
machinefile and some environment variables from SGE. You also have to change
the directory names etc.
The script also kills jobs when the CPU selection is not valid for allocating
GPUs, as I explained. For that it uses a passwordless ssh login to a submit
host to execute a qdel; you have to see whether that's a problem for you or
not. Name the three scripts gpu_allocate, gpu_unallocate and functions. Change
the hostnames in the sgeisactive and sgekilljob functions.
Good luck.

Call these scripts within your jobscript like this:

----------------------------------------------------------------------------
-----
devices=$(/your/path/gpu_allocate)        # at the beginning of the script
or
devices=$(/your/path/gpu_allocate "0 2")  # it's also possible to prefer some GPUs;
                                          # they will be used if free

charmrun +p$NSLOTS ... namd2 +devices $devices  # here the GPUs to use are applied
                                                # (a multinode mode is not implemented
                                                # yet because we didn't need it so far)
[YOUR JOB STUFF]

/your/path/gpu_unallocate //free the gpus

THE ALLOCATE SCRIPT
----------------------------------------------------------------------------
-----
#!/bin/bash
# gpu_allocate: picks free GPUs for the current SGE job, records them in the
# per-node *.alloc file and prints the device string (example 0,1,2).
# The helper functions (is_file, calc, arraysearch, sgekilljob, ...) come from
# the sourced functions file.
source /d2/bin/functions
#OPTIONS
# Directory holding the <node>.data files, the <node>.alloc files and the waitfile
datapath="/d2/bin/gpu_allocate.data"
datafileext="data"
allocfileext="alloc"
# Random pause between two looks at the waitfile. The +1 keeps it at least one
# second, so the wait loop can never spin without sleeping.
waittime=$((RANDOM % 20 + 1))
waitfile="active.info"
# A waitfile older than this many minutes is treated as left over and ignored
waitfileexpired=10
#DEFINES
# Filled per node index while the node data is read
cpus=""
gpus=""
nodes=""
# Check which GPUs are available and generate the devicestring (example 0,1,2)

#################### PRINT HELP ####################
# "-h" as first parameter prints the usage text below and ends the script.
case "$1" in
  -h)
    cat <<'HILFE'
##################### NORMAN GEIST'S GPU Allocator
######################################################
#
#
# Param1="-h" Zeigt diese Hilfe an
#
# Param1=STRING("1 4 5") Bevorzugt bestimmte GPU Nummern wenn verfügbar
#
# Dieses Script prüft ob auf gewählten Maschinen ausreichend GPUs frei sind,
belegt sie #
# und gibt den passenden devicestring zurück
#
# INPUT ist ENVVAR JOB_ID, NSLOTS und TMPDIR
#
#
#
############################################################################
#############################
HILFE
    exit
    ;;
esac
################ END PRINT HELP ####################

################ WAIT & SIGN ACTIVE ###########################
# Takes the waitfile that serialises access to the *.alloc files.
# Uses the globals datapath, waitfile, waittime, waitfileexpired and JOB_ID.
gpu_alloc_sign_active()
{
  local lockfile="$datapath/$waitfile"
  local pause=$waittime
  # Never sleep 0 seconds, otherwise the loop below would spin.
  [ "$pause" -gt 0 ] 2> /dev/null || pause=1
  while true
  do
    # With noclobber the redirection fails when the file already exists, so
    # "is it free?" and "take it" happen in one step instead of two.
    if (set -o noclobber; : > "$lockfile") 2> /dev/null; then
      break
    fi
    if [ ! -f "$lockfile" ]; then
      # Creation failed although nobody holds the waitfile (for example the
      # data directory is missing or read-only). Carry on without it, as the
      # plain "touch" of the earlier version effectively did.
      break
    fi
    if [ -n "$(find "$lockfile" -cmin +"$waitfileexpired" 2> /dev/null)" ]; then
      echo "INFO: Waitfile \"$lockfile\" is older then $waitfileexpired minutes!" >> "gpu_alloc.info.$JOB_ID"
      echo "Ignoring waitfile and start allocation!" >> "gpu_alloc.info.$JOB_ID"
      # Refresh the timestamp so other waiters see a fresh waitfile.
      # NOTE: two waiters can still take over the same expired waitfile at once.
      touch "$lockfile"
      break
    fi
    echo "INFO: Waitfile found! Means another allocator is accessing the *.alloc files!" >> "gpu_alloc.info.$JOB_ID"
    echo "I'll wait $pause seconds!" >> "gpu_alloc.info.$JOB_ID"
    sleep "$pause"
  done
}

gpu_alloc_sign_active
###############################################################

# ############### CHECK FOR EXPIRED JOBS ########################
# # check for jobs that aren't still running and unalloc
for allocfile in "$datapath"/*."$allocfileext"
do
  # An unmatched glob stays literal; skip it.
  [ -f "$allocfile" ] || continue
  tempnodename=$(basename "$allocfile" ".$allocfileext")

  # STAMP lines look like "STAMP <jobid> <time> <date>"; field 2 is the job id.
  # The list is read once, before gpu_unallocate rewrites the file.
  for jobid in $(awk '/STAMP/ { print $2 }' "$allocfile")
  do
    sgeisactive "$jobid"
    if [ $? -lt 1 ]; then
      #unalloc
      # gpu_unallocate waits for the waitfile itself, so release it first ...
      rm -f "$datapath/$waitfile"
      /d2/bin/gpu_unallocate "$tempnodename" "$jobid"
      # ... and take it again for the rest of this script.
      gpu_alloc_sign_active
    fi
  done
done
# ###############################################################

################ READ THE MACHINEFILE #################
# Collects every host named in $TMPDIR/machines once, in order of appearance.
# Start from an empty list so a "machines" variable inherited from the
# environment cannot leak into the result.
machines=""
is_file "$TMPDIR/machines"
if [ $? -eq 1 ]; then
  for line in $(cat "$TMPDIR/machines")
  do
    # Compare whole names: a substring test would skip "node1" once "node10"
    # is already in the list.
    case " $machines " in
      *" $line "*) ;;
      *) machines="$machines $line" ;;
    esac
  done
else
  echo "ERROR: Couldn't read the machinefile \"$TMPDIR/machines\"! Job with ID $JOB_ID will be killed!" >> "gpu_alloc.info.$JOB_ID"
  ################### UNSIGN ACTIVE ######################
  rm -f "$datapath/$waitfile"
  ########################################################
  sgekilljob "$JOB_ID"
  exit 1
fi
########################################################

########################################################
################ READ THE NODEDATA #####################
# Get how many gpus can be allocated
# num_gpus=NSLOTS/(cpus/gpus) result should be integer
# otherwise the script will exit and the job be killed
# job will also be killed if there's no datafile for
# an allocated node
########################################################

# Logs the message in $1, releases the waitfile, kills the job and exits.
nodedata_abort()
{
  echo "$1" >> "gpu_alloc.info.$JOB_ID"
  ################### UNSIGN ACTIVE ######################
  rm -f "$datapath/$waitfile"
  ########################################################
  sgekilljob "$JOB_ID"
  exit 1
}

#for every machine in the file...
i=0
for node in $machines
do
  #...get the Nodedata
  is_file "$datapath/$node.$datafileext"
  if [ $? -ne 1 ]; then
    # There is no datafile for an allocated node, the job will be killed
    nodedata_abort "ERROR: No Datafile found for Node \"$node\"! Job with ID $JOB_ID will be killed!"
  fi

  cpus[$i]=""
  gpus[$i]=""
  #For every information (whitespace separated KEY=VALUE words)
  for info in $(cat "$datapath/$node.$datafileext")
  do
    # The value is the text between the first and a possible second "="
    value=""
    case $info in
      *=*)
        value=${info#*=}
        value=${value%%=*}
        ;;
    esac
    case $info in
      #is it the cpu value
      *CPU*) cpus[$i]=$value ;;
      #is it the gpus value
      *GPU*) gpus[$i]=$value ;;
      #datafile is wrong format
      *) nodedata_abort "ERROR: Datafile for \"$node\" has wrong structure! Job with ID $JOB_ID will be killed!" ;;
    esac
  done

  # Both numbers are fed into bc as NSLOTS/(cpus/gpus) later on. A missing,
  # non-numeric or zero value would produce garbage or a division by zero
  # there, so reject such a datafile right away.
  if ! [[ ${cpus[$i]} =~ ^[0-9]+$ && ${gpus[$i]} =~ ^[0-9]+$ ]] \
    || [ "${cpus[$i]}" -le 0 ] || [ "${gpus[$i]}" -le 0 ]; then
    nodedata_abort "ERROR: Datafile for \"$node\" has wrong structure! Job with ID $JOB_ID will be killed!"
  fi
  #######################################
  nodes[$i]=$node
  i=$((i + 1))
done
#SINGLE OR MULTINODE
# $i is the number of distinct nodes counted while reading the node data.
if [ "$i" -eq 1 ]; then
  # GPUs to allocate = NSLOTS / (CPUs per GPU); only a whole number is valid
  gpustoalloc=$(calc "$NSLOTS/(${cpus[0]}/${gpus[0]})")
  isfloat "$gpustoalloc"
  if [ $? -eq 1 ]; then
    temp=$(calc "${cpus[0]}/${gpus[0]}")
    temp=$(intval "$temp")
    echo "ERROR: GPU ALLOC: No valid choice for gpu number! Choose (CPUs/GPUs)CPUs for every GPU to be allocated!" >> "gpu_alloc.info.$JOB_ID"
    echo "For this machine it would be $temp CPUs to choose for every GPU to be allocated!" >> "gpu_alloc.info.$JOB_ID"
    echo "Job with ID $JOB_ID will be killed!" >> "gpu_alloc.info.$JOB_ID"
    ################### UNSIGN ACTIVE ######################
    rm -f "$datapath/$waitfile"
    ########################################################
    sgekilljob "$JOB_ID"
    exit 1
  else
    gpustoalloc=$(intval "$gpustoalloc")
    allocfile="$datapath/${nodes[0]}.$allocfileext"
    #Check available GPUs
    # The ALLOC line lists every GPU number currently taken on this node
    temp=$(grep "ALLOC" "$allocfile" 2> /dev/null)
    temp=${temp/ALLOC}
    temp=($temp)
    #for every available gpu
    i=0
    freegpus=""
    while [ "$i" -lt "${gpus[0]}" ]
    do
      #Is the gpu allocated?
      temp2=$(arraysearch "${temp[*]}" "$i" exact)
      if [ ${#temp2} -lt 1 ]; then
        #This gpu is free
        freegpus="$freegpus $i"
      fi
      i=$((i + 1))
    done
    #Choose the GPUS
    choosengpus=""
    #Preferred mode?
    # Accept the preferred GPUs as one quoted string ("0 2") as well as
    # separate parameters (0 2).
    preferedgpus="$*"
    i=0
    for prefered in $preferedgpus
    do
      #Is the preferred gpu free and not chosen yet
      temp2=$(arraysearch "$freegpus" "$prefered" exact)
      temp3=$(arraysearch "$choosengpus" "$prefered" exact)
      if [ ${#temp2} -gt 0 ] && [ ${#temp3} -lt 1 ] && [ "$i" -lt "$gpustoalloc" ]; then
        #Yes it is free
        choosengpus="$choosengpus $prefered"
        i=$((i + 1))
      fi
    done
    #Fill up
    #Have I chosen enough GPUS
    temp2=($choosengpus)
    if [ ${#temp2[*]} -lt "$gpustoalloc" ]; then
      #No I need more
      #For every free gpu
      for freegpu in $freegpus
      do
        #Is it already chosen?
        temp3=$(arraysearch "${temp2[*]}" "$freegpu" exact)
        if [ ${#temp3} -lt 1 ] && [ ${#temp2[*]} -lt "$gpustoalloc" ]; then
          #No it's still free so lets get it
          choosengpus="$choosengpus $freegpu"
          temp2=($choosengpus)
        fi
      done
    fi
    # The job continues with fewer devices than requested when the node has
    # too few free GPUs; leave a trace of that in the log.
    if [ ${#temp2[*]} -lt "$gpustoalloc" ]; then
      echo "WARNING: Only ${#temp2[*]} of $gpustoalloc needed GPUs are free on ${nodes[0]}!" >> "gpu_alloc.info.$JOB_ID"
    fi
    #Allocate GPUS
    date=$(date +"%H:%M %d.%m.%y")
    # Build the new allocfile next to the old one: the new ALLOC line, the
    # JOB and STAMP lines that are already there, then this job's own lines.
    {
      echo "ALLOC ${temp[*]}$choosengpus"
      grep JOB "$allocfile" 2> /dev/null
      grep STAMP "$allocfile" 2> /dev/null
      echo "JOB $JOB_ID$choosengpus"
      echo "STAMP $JOB_ID $date"
    } > "$allocfile.tmp$JOB_ID"
    # mv replaces the old file in one step, no separate rm needed
    mv -f "$allocfile.tmp$JOB_ID" "$allocfile"
    #Generate the String
    # Comma separated list of the chosen GPU numbers
    devicestring=""
    for choosen in $choosengpus
    do
      devicestring="${devicestring:+$devicestring,}$choosen"
    done
    echo "$devicestring"
    echo "INFO: Allocated the following GPUs: $devicestring on ${nodes[0]}!" >> "gpu_alloc.info.$JOB_ID"
  fi
else
  echo "ERROR: GPU ALLOC: Multi Node Mode is not currently supported! Job with ID $JOB_ID will be killed!" >> "gpu_alloc.info.$JOB_ID"
  # Release the waitfile here as well; otherwise every other allocator is
  # blocked until the waitfile expires.
  ################### UNSIGN ACTIVE ######################
  rm -f "$datapath/$waitfile"
  ########################################################
  sgekilljob "$JOB_ID"
  exit 1
fi
########################################################

################### UNSIGN ACTIVE ######################
rm -f "$datapath/$waitfile"
########################################################
----------------------------------------------------------------------------
-----
END THE ALLOCATE SCRIPT

THE UNALLOC SCRIPT
----------------------------------------------------------------------------
-----
#!/bin/bash
# Pull in the shared helper functions.
. /d2/bin/functions
# Settings
datapath=/d2/bin/gpu_allocate.data
allocfileext=alloc
waittime=$((RANDOM % 20))
waitfile=active.info
waitfileexpired=10
# Releases the GPUs that were allocated for a job id

#################### PRINT HELP ####################
# "-h" as first parameter prints the usage text below and ends the script.
case "$1" in
  -h)
    cat <<'HILFE'
##################### NORMAN GEIST'S GPU Unallcator
#####################################################
#
#
# Param1="-h" Zeigt diese Hilfe an
#
# INPUT ist ENVVAR JOB_ID und TMPDIR
#
# Dieses Script gibt durch gpu_allocate belegte GPUs wieder frei!
#
# Optional
#
# Param1=STRING machinename
#
# Param2=INT JOB_ID
#
#
#
############################################################################
#############################
HILFE
    exit
    ;;
esac
################ END PRINT HELP ####################

################ WAIT & SIGN ACTIVE ###########################
# Take the waitfile that serialises access to the *.alloc files.
# Never sleep 0 seconds, otherwise the loop below would spin.
[ "$waittime" -gt 0 ] 2> /dev/null || waittime=1
while true
do
  # With noclobber the redirection fails when the file already exists, so
  # "is it free?" and "take it" happen in one step instead of two.
  if (set -o noclobber; : > "$datapath/$waitfile") 2> /dev/null; then
    break
  fi
  if [ ! -f "$datapath/$waitfile" ]; then
    # Creation failed although nobody holds the waitfile (for example the data
    # directory is missing or read-only). Carry on without it, as the plain
    # "touch" of the earlier version effectively did.
    break
  fi
  if [ -n "$(find "$datapath/$waitfile" -cmin +"$waitfileexpired" 2> /dev/null)" ]; then
    echo "INFO: Waitfile \"$datapath/$waitfile\" is older then $waitfileexpired minutes!" >> "gpu_unalloc.info.$JOB_ID"
    echo "Ignoring waitfile and start unallocation!" >> "gpu_unalloc.info.$JOB_ID"
    # Refresh the timestamp so other waiters see a fresh waitfile.
    touch "$datapath/$waitfile"
    break
  fi
  echo "INFO: Waitfile found! Means another allocator is accessing the *.alloc files!" >> "gpu_unalloc.info.$JOB_ID"
  echo "I'll wait $waittime seconds!" >> "gpu_unalloc.info.$JOB_ID"
  sleep "$waittime"
done
###############################################################

################ EXECUTE ################################
################ READ THE MACHINEFILE #################
# Two ways to get the machine list and the job id:
#  - both parameters given: $1 is the machine, $2 the job id (this is how
#    gpu_allocate calls the script for jobs that are no longer running)
#  - otherwise: hosts from $TMPDIR/machines and JOB_ID from the environment
is_file "$TMPDIR/machines"
have_machinefile=$?
if [ ${#2} -gt 0 ] && [ ${#1} -gt 0 ]; then
  machines="$1"
  export JOB_ID=$2
  echo "INFO: Unallocation called for inactive Job!" >> "gpu_unalloc.info.$JOB_ID"
  echo "GPU Unallocation executed for $machines with expired Job-ID $JOB_ID!" >> "gpu_unalloc.info.$JOB_ID"
elif [ "$have_machinefile" -eq 1 ]; then
  # Start from an empty list so an inherited "machines" variable cannot leak in
  machines=""
  for line in $(cat "$TMPDIR/machines")
  do
    # Compare whole names: a substring test would skip "node1" once "node10"
    # is already in the list.
    case " $machines " in
      *" $line "*) ;;
      *) machines="$machines $line" ;;
    esac
  done
  echo "INFO: GPU Unallocation called well-planned!" >> "gpu_unalloc.info.$JOB_ID"
  echo "GPU Unallocation executed for $machines with Job-ID $JOB_ID!" >> "gpu_unalloc.info.$JOB_ID"
else
  echo "WARNING: Couldn't read the machinefile \"$TMPDIR/machines\" or parameters! Unable to unallocate GPUs!" >> "gpu_unalloc.info.$JOB_ID"
  ################### UNSIGN ACTIVE ######################
  rm -f "$datapath/$waitfile"
  ########################################################
  exit 1
fi
########################################################
# FOR EVERY MACHINE UNALLOC
for machine in $machines
do
  allocfile="$datapath/$machine.$allocfileext"
  #read the allocfile and get what is allocated from this JOB and what from others
  # Lines have the form "JOB <jobid> <gpu> <gpu> ...". The job id is compared
  # as a whole field, so job 12 no longer matches the line of job 123.
  thisalloc=$(awk -v id="$JOB_ID" \
    '$1 == "JOB" && ($2 "") == (id "") { for (n = 3; n <= NF; n++) printf "%s ", $n }' \
    "$allocfile")
  machinealloc=$(grep ALLOC "$allocfile")
  machinealloc=${machinealloc/ALLOC/}
  #generate new ALLOC
  newalloc=""
  for allocated in $machinealloc
  do
    # Is this allocated GPU part of THIS JOB_ID
    temp=$(arraysearch "$thisalloc" "$allocated" exact)
    if [ ${#temp} -lt 1 ]; then
      newalloc="$newalloc $allocated"
    fi
  done
  # Generate new ALLOCFILE
  # New ALLOC line first, then every other line except the old ALLOC line and
  # the JOB/STAMP lines that belong to this job id.
  {
    echo "ALLOC$newalloc"
    awk -v id="$JOB_ID" '
      $1 == "ALLOC" { next }
      ($1 == "JOB" || $1 == "STAMP") && ($2 "") == (id "") { next }
      { print }' "$allocfile"
  } > "$allocfile.tmp$JOB_ID"
  # mv replaces the old file in one step, no separate rm needed
  mv -f "$allocfile.tmp$JOB_ID" "$allocfile"
done
########################################################

################### UNSIGN ACTIVE ######################
rm -f "$datapath/$waitfile"
########################################################
----------------------------------------------------------------------------
-----
END THE UNALLOC SCRIPT

EXTRA FUNCTIONS
----------------------------------------------------------------------------
-----
#!/bin/bash

# COUNT SUBSTRINGS IN A STRING
# $1 String
# $2 Substring (taken literally, never as a glob pattern)
# echo return count of $2 in $1
# Occurrences are removed one at a time, leftmost first, so an occurrence that
# only forms after an earlier removal is counted as well.
countsubstring()
{
  local rest=$1 needle=$2 shorter
  local count=0
  # Nothing can be removed for an empty substring
  if [ -z "$needle" ]; then
    echo 0
    return
  fi
  while true
  do
    # The quotes make the substring a literal text instead of a pattern
    shorter=${rest/"$needle"/}
    if [ ${#shorter} -ge ${#rest} ]; then
      break
    fi
    count=$((count + 1))
    rest=$shorter
  done
  echo "$count"
}

# Check if a VAR is set
# $1 = value to check
# return 1 if the value contains anything besides blanks, tabs and newlines,
#        0 otherwise (note: 1 means "true" throughout this file)
isset()
{
  # Strip the whitespace instead of running the value through "echo": echo
  # would swallow values such as "-n" or "-e" and report them as not set.
  local stripped=${1//[$' \t\n']/}
  if [ -n "$stripped" ]; then
    return 1
  else
    return 0
  fi
}

# Check if a file exists
# $1 filename
# return 1 if $1 names an existing regular file, 0 otherwise
#        (note: 1 means "true" throughout this file)
is_file()
{
  # The quotes matter: unquoted, an empty name collapses to "[ -f ]", which
  # is true, and a name with blanks falls apart into several words.
  if [ -f "$1" ]; then
    return 1
  else
    return 0
  fi
}

# Calculate via bc
# $1 calcstring like "10*5/40"
# $2 int precision (bc "scale"); anything but a positive integer keeps the
#    default of 2
# echo return the result printed by bc
calc()
{
  # Locals keep "scale" and "res" out of the caller's variables
  local scale=2 res
  if [[ ${2:-} =~ ^[0-9]+$ ]] && [ "$2" -gt 0 ]; then
    scale=$2
  fi
  res=$(echo "scale=$scale; $1" | bc)
  # Left unquoted on purpose: output that bc spreads over several lines is
  # put on a single line by the word splitting.
  echo $res
}

# Transform Float to Int while cutting
# $1 float like 55.713 makes 55
# echo return the part in front of the first "."
intval()
{
  local whole=${1%%.*}
  # bc prints values below one without a leading zero (".50", "-.50").
  # Cutting at the point leaves nothing or a bare sign then, so print 0.
  if [ "$whole" != "$1" ]; then
    case $whole in
      '' | + | -) whole=0 ;;
    esac
  fi
  printf '%s\n' "$whole"
}

# Extract Celldimensions from ACEMD xsc file
# $1 = xscfile
# echo return "x y z"
# All lines of the file that contain a "0" are joined into one record; the
# 2nd, 6th and 10th word of that record are stored in x, y and z and printed.
# NOTE(review): presumably these words are the a_x, b_y and c_z columns of
# the data line of the xsc file - confirm against a real file.
acemd_get_celldimensions_fromxsc()
{
  line=$(cat $1 | grep 0)
  # One awk call picks all three words; read distributes them to x, y, z
  read -r x y z < <(echo $line | awk '{ print $2, $6, $10 }')
  echo $x $y $z
}

# Let String explode at CHAR and return Array
# $1 String
# $2 Separator (taken literally, never as a glob pattern)
# echo return the string with every separator replaced by a blank, ready to
#      be split into an array by the caller
explode()
{
  # The quotes make the separator a literal text; unquoted, a separator such
  # as "*" would match the whole string.
  local pieces=${1//"$2"/ }
  echo "$pieces"
}

# Kill an SGE Job with given ID
# $1 = JOB_ID
# Runs qdel on the submit host through ssh. The login is taken from
# GPU_ALLOC_SUBMIT_HOST when that variable is set, otherwise the built-in
# default is used.
# return the exit status of ssh, or 1 when no job id was given
sgekilljob()
{
  # "user@host" form; the "_at_" spelling only comes from mail archives that
  # mask the @ sign and is not a login ssh can use.
  local submit_host=${GPU_ALLOC_SUBMIT_HOST:-gast@clustermaster}
  if [ -z "$1" ]; then
    echo "sgekilljob: no job ID given" >&2
    return 1
  fi
  ssh -o StrictHostKeyChecking=no "$submit_host" qdel "$1"
}

# Check if Job with given ID is active
# $1 JOB_ID
# Asks the submit host through ssh for "qstat -j <id>". The login is taken
# from GPU_ALLOC_SUBMIT_HOST when that variable is set, otherwise the
# built-in default is used.
# return 1 if the answer contains "job_number" (job is known), 0 if not
#        (note: 1 means "true" throughout this file)
sgeisactive()
{
  # "user@host" form; the "_at_" spelling only comes from mail archives that
  # mask the @ sign and is not a login ssh can use.
  local submit_host=${GPU_ALLOC_SUBMIT_HOST:-gast@clustermaster}
  local report status
  report=$(ssh -o StrictHostKeyChecking=no "$submit_host" qstat -j "$1")
  status=$?
  if [ "$status" -eq 255 ]; then
    # 255 is what ssh reports for its own errors (host unreachable, login
    # refused). Nothing is known about the job then, so call it active:
    # callers free the GPUs of every job reported as inactive.
    return 1
  fi
  if [[ $report == *job_number* ]]; then
    return 1
  else
    return 0
  fi
}

# Check if Var is float
# $1 number
# return 1 if the number has a fractional part other than zero
#        0 otherwise (2, 2.00, empty or non-numeric fractions)
#        (note: 1 means "true" throughout this file)
isfloat()
{
  local fraction
  case $1 in
    *.*) fraction=${1##*.} ;;
    # Without a decimal point there is no fraction. This case must not fall
    # through to the digit check, or a plain "5" would count as float.
    *) return 0 ;;
  esac
  # Look for a non-zero digit instead of comparing numerically, so long
  # fractions cannot overflow an integer test.
  if [[ $fraction =~ ^[0-9]+$ && $fraction =~ [1-9] ]]; then
    return 1
  else
    return 0
  fi
}

# Search Array for something and return the indices
# $1 Array, passed as ONE string of whitespace separated elements
# $2 Search
# $3 Mode=within (element contains $2) || Mode=exact (element equals $2)
# $4 num_results default 1 | give int num or "*" for all
# echo return index (one per line, counted from 0) or if not found nothing
arraysearch()
{
  # Everything is local: callers use names like "i" and "temp" themselves
  local needle=$2 mode=$3 limit=$4
  local -a items=()
  local element
  local index=0 found=0 max=1

  # Split into words without pathname expansion; -d '' lets read take the
  # whole string including newlines.
  IFS=$' \t\n' read -r -d '' -a items <<< "$1"

  if [[ $limit =~ ^[+]?[0-9]+$ ]] && [ "$limit" -gt 0 ]; then
    max=$limit
  fi

  for element in "${items[@]}"
  do
    # "*" means no limit: always leave room for one more result
    if [ "$limit" == "*" ]; then
      max=$((found + 1))
    fi
    if [ "$found" -lt "$max" ]; then
      case $mode in
        exact)
          if [[ $element == "$needle" ]]; then
            echo "$index"
            found=$((found + 1))
          fi
          ;;
        within)
          # An empty search text matches nothing
          if [ -n "$needle" ] && [[ $element == *"$needle"* ]]; then
            echo "$index"
            found=$((found + 1))
          fi
          ;;
      esac
    fi
    index=$((index + 1))
  done
}

# Check whether a point in time has been reached
# give date as 27 11 2010 15 30 (day month year hour minute)
# $1 day, $2 month, $3 year, $4 hour, $5 minute
# return 1 if the current time is at or past the given minute,
#        0 if it is still ahead or a parameter is missing or not a number
#        (note: 1 means "true" throughout this file)
function elapsed()
{
  local -a now=() target=()
  local idx field

  for field in "$1" "$2" "$3" "$4" "$5"
  do
    [[ $field =~ ^[0-9]+$ ]] || return 0
  done

  # Most significant field first, so the first difference decides.
  # A single date call keeps all five fields from the same instant.
  read -r -a now <<< "$(date +"%Y %m %d %H %M")"
  target=("$3" "$2" "$1" "$4" "$5")

  for idx in 0 1 2 3 4
  do
    # 10# forces base ten: date pads with zeros and "08" or "09" would be
    # rejected as octal otherwise. Comparing as numbers also makes "8" and
    # "08" equal, which a string comparison does not.
    if ((10#${now[idx]} > 10#${target[idx]})); then
      return 1
    elif ((10#${now[idx]} < 10#${target[idx]})); then
      return 0
    fi
  done
  # All fields equal: the minute itself counts as reached
  return 1
}

# Check Date if exist
# Give date as "day month year" f.i. "21 12 2012"
# $1 one string; its last three words are taken as day, month and year
# return 1 if that day exists in that month of that year, 0 otherwise
#        (note: 1 means "true" throughout this file)
# The month lengths are computed here with the Gregorian leap year rule
# instead of searching the output of "cal": that search also matched the
# year printed in cal's header and missed days written as "05".
function checkdate()
{
  local -a words=()
  local count day month year last field

  IFS=$' \t\n' read -r -d '' -a words <<< "$1"
  count=${#words[@]}
  if [ "$count" -lt 3 ]; then
    return 0
  fi
  year=${words[count - 1]}
  month=${words[count - 2]}
  day=${words[count - 3]}

  for field in "$day" "$month" "$year"
  do
    [[ $field =~ ^[0-9]+$ ]] || return 0
  done
  # 10# forces base ten so "08" and "09" are not rejected as octal
  day=$((10#$day))
  month=$((10#$month))
  year=$((10#$year))

  if ((year < 1 || month < 1 || month > 12 || day < 1)); then
    return 0
  fi
  case $month in
    4 | 6 | 9 | 11) last=30 ;;
    2)
      if (((year % 4 == 0 && year % 100 != 0) || year % 400 == 0)); then
        last=29
      else
        last=28
      fi
      ;;
    *) last=31 ;;
  esac
  if ((day <= last)); then
    return 1
  else
    return 0
  fi
}

# Convert a number in E notation (mantissa "E" exponent) to plain notation.
# $1 number like 1.5E-3
# echo return mantissa * 10^exponent; results that are not whole numbers are
#      printed with ten decimals
function exp2float()
{
  echo "$1" | awk '
    BEGIN { FS = "E"; OFMT = "%10.10f" }
    {
      mantissa = $1
      exponent = $2
      print mantissa * (10 ^ exponent)
    }'
}

# Evaluate a comparison of floats with bc.
# $* the condition, f.i. "1.5 > 1.2"; all parameters are joined with blanks
# return 1 if bc answers exactly "1", 0 in every other case (false, no
#        parameters, bc error or any other output)
#        (note: 1 means "true" throughout this file)
function float_cond()
{
    local answer=0
    if (( $# > 0 )); then
        answer=$(echo "$*" | bc -q 2>/dev/null)
    fi
    # Only a literal "1" counts as true
    if [[ "$answer" == 1 ]]; then
        return 1
    fi
    return 0
}
----------------------------------------------------------------------------
-----
END EXTRA FUNCTIONS

Mit freundlichen Grüßen

Norman Geist.

This archive was generated by hypermail 2.1.6 : Wed Feb 29 2012 - 15:57:09 CST