From: Norman Geist (norman.geist_at_uni-greifswald.de)
Date: Thu May 19 2011 - 01:41:37 CDT
Yes sure ;)
But I don't know whether it will work for you as it is, because it reads a
machinefile and some environment variables from SGE. You also have to change
the directory names etc.
The script also kills jobs when the CPU selection is not valid for allocating
GPUs, as I explained. For that it uses a passwordless ssh login to a submit
host to execute a qdel; you have to see whether that's a problem for you or
not. Name the three files gpu_allocate, gpu_unallocate and functions. Change
the hostnames in the sgeisactive and sgekilljob functions.
Good luck.
Call these scripts within your jobscript like this:
----------------------------------------------------------------------------
-----
devices=$(your/path/gpu_allocate)        //at the beginning of the script
or
devices=$(your/path/gpu_allocate "0 2")  //it's also possible to prefer some
                                         //gpus; they will be used if free
                                         //(pass the list as ONE quoted argument)
charmrun +p$NSLOTS ... namd2 +devices $devices  //here the gpus to use are applied
//A multinode mode is not implemented yet because we didn't need it yet
[YOUR JOB STUFF]
/your/path/gpu_unallocate //free the gpus
THE ALLOCATE SCRIPT
----------------------------------------------------------------------------
-----
#!/bin/bash
# gpu_allocate: reserve GPUs for the current SGE job and print the resulting
# device string (e.g. "0,1,2") on stdout.
# The helper functions used further down (is_file, sgekilljob, ...) are
# expected to be defined by the sourced file below.
source /d2/bin/functions
#OPTIONS
# Directory holding the per-node "<node>.data" / "<node>.alloc" files and the
# waitfile that serialises access to them.
datapath="/d2/bin/gpu_allocate.data"
datafileext="data"
allocfileext="alloc"
# Random back-off in seconds (1-20) between two looks at the waitfile.
# It must never be 0, otherwise the wait loop spins and floods the info log.
waittime=$((RANDOM % 20 + 1))
waitfile="active.info"
# Age in minutes after which a leftover waitfile is ignored.
waitfileexpired=10
#DEFINES
cpus=""
gpus=""
nodes=""
# Check which GPUs are available and generate the devicestring (example 0,1,2)
#################### PRINT HELP ################################################
if [ "$1" == "-h" ]; then
  # Quoted delimiter: the help text is printed literally, nothing is expanded.
  cat <<'HILFE'
##################### NORMAN GEIST'S GPU Allocator ###########################
#                                                                            #
# Param1="-h" Zeigt diese Hilfe an                                           #
# Param1=STRING("1 4 5") Bevorzugt bestimmte GPU Nummern wenn verfügbar      #
# Dieses Script prüft ob auf gewählten Maschinen ausreichend GPUs frei sind, #
# belegt sie und gibt den passenden devicestring zurück                      #
# INPUT ist ENVVAR JOB_ID, NSLOTS und TMPDIR                                 #
#                                                                            #
##############################################################################
HILFE
  exit
fi
################ END PRINT HELP ################################################
################ WAIT & SIGN ACTIVE ###########################
# Cooperative lock around the *.alloc files: as long as the waitfile exists,
# another allocator/unallocator is assumed to be working on them.
# NOTE(review): the existence test and the final touch are separate steps, so
# two processes can still pass the test at the same moment.
wait="true"
while [ "$wait" == "true" ]; do
  if [ -f "$datapath/$waitfile" ]; then
    echo "INFO: Waitfile found! Means another allocator is accessing the *.alloc files!" >> "gpu_alloc.info.$JOB_ID"
    echo "I'll wait $waittime seconds!" >> "gpu_alloc.info.$JOB_ID"
    sleep "$waittime"
  else
    wait="false"
  fi
  # find prints the path only if the file's status change lies more than
  # $waitfileexpired minutes back; a missing waitfile is not an error here.
  check=$(find "$datapath/$waitfile" -cmin +"$waitfileexpired" 2>/dev/null)
  if [ ${#check} -gt 0 ]; then
    echo "INFO: Waitfile \"$datapath/$waitfile\" is older then $waitfileexpired minutes!" >> "gpu_alloc.info.$JOB_ID"
    echo "Ignoring waitfile and start allocation!" >> "gpu_alloc.info.$JOB_ID"
    wait="false"
  fi
done
# Sign active: from here on the other instances wait for us.
touch "$datapath/$waitfile"
###############################################################
# ############### CHECK FOR EXPIRED JOBS ########################
# Look at every alloc file and release the GPUs that are still booked by jobs
# which are no longer running.
for allocfile in "$datapath"/*."$allocfileext"; do
  # Without any alloc file the pattern stays literal; skip it.
  [ -f "$allocfile" ] || continue
  # The job id is the second field of every STAMP line. The list is built
  # before the loop body runs, because the body may rewrite the alloc file.
  for jobid in $(awk '/STAMP/ { print $2 }' "$allocfile"); do
    sgeisactive "$jobid"
    # sgeisactive returns 0 when the job is not active any more
    if [ $? -lt 1 ]; then
      #unalloc
      tempnodename=$(basename "$allocfile" ".$allocfileext")
      ################### UNSIGN ACTIVE ######################
      # gpu_unallocate takes the lock itself, so release ours first
      rm -f "$datapath/$waitfile"
      ########################################################
      /d2/bin/gpu_unallocate "$tempnodename" "$jobid"
      ################ WAIT & SIGN ACTIVE ###########################
      wait="true"
      while [ "$wait" == "true" ]; do
        if [ -f "$datapath/$waitfile" ]; then
          echo "INFO: Waitfile found! Means another allocator is accessing the *.alloc files!" >> "gpu_alloc.info.$JOB_ID"
          echo "I'll wait $waittime seconds!" >> "gpu_alloc.info.$JOB_ID"
          sleep "$waittime"
        else
          wait="false"
        fi
        # A waitfile older than $waitfileexpired minutes is ignored
        check=$(find "$datapath/$waitfile" -cmin +"$waitfileexpired" 2>/dev/null)
        if [ ${#check} -gt 0 ]; then
          echo "INFO: Waitfile \"$datapath/$waitfile\" is older then $waitfileexpired minutes!" >> "gpu_alloc.info.$JOB_ID"
          echo "Ignoring waitfile and start allocation!" >> "gpu_alloc.info.$JOB_ID"
          wait="false"
        fi
      done
      touch "$datapath/$waitfile"
      ###############################################################
    fi
  done
done
# ###############################################################
################ READ THE MACHINEFILE #################
# Collect the distinct host names listed in $TMPDIR/machines into $machines.
if [ -f "$TMPDIR/machines" ]; then
  for line in $(cat "$TMPDIR/machines"); do
    # Compare whole names: a substring test would drop "node1" as soon as
    # "node10" is already in the list.
    case " $machines " in
      *" $line "*) ;;
      *) machines="$machines $line" ;;
    esac
  done
else
  echo "ERROR: Couldn't read the machinefile \"$TMPDIR/machines\"! Job with ID $JOB_ID will be killed!" >> "gpu_alloc.info.$JOB_ID"
  ################### UNSIGN ACTIVE ######################
  rm -f "$datapath/$waitfile"
  ########################################################
  sgekilljob "$JOB_ID"
  exit 1
fi
########################################################
########################################################
################ READ THE NODEDATA #####################
# Get how many gpus can be allocated
# num_gpus=NSLOTS/(cpus/gpus) result should be integer
# otherwise the script will exit and the job be killed
# job will also be killed if there's no datafile for
# an allocated node
########################################################
# For every machine of the job: read "<node>.<datafileext>" and remember the
# CPU and GPU count of that node at the same index in cpus[] / gpus[] / nodes[].
i=0
for node in $machines; do
  #...get the node data
  if [ -f "$datapath/$node.$datafileext" ]; then
    badformat=0
    # Every whitespace separated entry must be of the form CPU=<n> or GPU=<n>
    for info in $(cat "$datapath/$node.$datafileext"); do
      case "$info" in
        *CPU*)
          temp=(${info//=/ })
          cpus[$i]=${temp[1]}
          ;;
        *GPU*)
          temp=(${info//=/ })
          gpus[$i]=${temp[1]}
          ;;
        *)
          badformat=1
          break
          ;;
      esac
    done
    # Both values are needed as plain numbers for the GPU calculation later on
    if ! [[ ${cpus[$i]} =~ ^[0-9]+$ && ${gpus[$i]} =~ ^[0-9]+$ ]]; then
      badformat=1
    fi
    if [ "$badformat" -eq 1 ]; then
      echo "ERROR: Datafile for \"$node\" has wrong structure! Job with ID $JOB_ID will be killed!" >> "gpu_alloc.info.$JOB_ID"
      ################### UNSIGN ACTIVE ######################
      rm -f "$datapath/$waitfile"
      ########################################################
      sgekilljob "$JOB_ID"
      exit 1
    fi
  else
    # There's no datafile for an allocated node, job will be killed
    echo "ERROR: No Datafile found for Node \"$node\"! Job with ID $JOB_ID will be killed!" >> "gpu_alloc.info.$JOB_ID"
    ################### UNSIGN ACTIVE ######################
    rm -f "$datapath/$waitfile"
    ########################################################
    sgekilljob "$JOB_ID"
    exit 1
  fi
  #######################################
  nodes[$i]=$node
  i=$((i + 1))
done
#SINGLE OR MULTINODE
# $i is the number of distinct nodes of the job at this point.
if [ "$i" -eq 1 ]; then
  allocpath="$datapath/${nodes[0]}.$allocfileext"
  # The job gets NSLOTS/(CPUs/GPUs) GPUs and that has to be a whole number.
  # Done in integer arithmetic as NSLOTS*GPUs/CPUs so that an odd CPU/GPU
  # ratio is not rejected because of rounding.
  number='^[0-9]+$'
  gpustoalloc=""
  percpu=""
  if [[ $NSLOTS =~ $number && ${cpus[0]} =~ $number && ${gpus[0]} =~ $number ]]; then
    # 10# forces base ten, so a value like "08" is not read as octal
    nslots=$((10#$NSLOTS))
    ncpus=$((10#${cpus[0]}))
    ngpus=$((10#${gpus[0]}))
    if [ "$ncpus" -gt 0 ] && [ "$ngpus" -gt 0 ]; then
      percpu=$((ncpus / ngpus))
      if [ $(((nslots * ngpus) % ncpus)) -eq 0 ]; then
        gpustoalloc=$((nslots * ngpus / ncpus))
      fi
    fi
  fi
  if [ -z "$gpustoalloc" ]; then
    echo "ERROR: GPU ALLOC: No valid choice for gpu number! Choose (CPUs/GPUs)CPUs for every GPU to be allocated!" >> "gpu_alloc.info.$JOB_ID"
    echo "For this machine it would be $percpu CPUs to choose for every GPU to be allocated!" >> "gpu_alloc.info.$JOB_ID"
    echo "Job with ID $JOB_ID will be killed!" >> "gpu_alloc.info.$JOB_ID"
    ################### UNSIGN ACTIVE ######################
    rm -f "$datapath/$waitfile"
    ########################################################
    sgekilljob "$JOB_ID"
    exit 1
  else
    #Check available GPUs: the ALLOC line lists the numbers already taken
    temp=$(grep "ALLOC" "$allocpath")
    temp=${temp/ALLOC}
    temp=($temp)
    freegpus=""
    for ((i = 0; i < ngpus; i++)); do
      case " ${temp[*]} " in
        *" $i "*) ;;                   # already allocated
        *) freegpus="$freegpus $i" ;;  # this gpu is free
      esac
    done
    #Choose the GPUs, preferred ones first. All arguments are used, so both
    #'gpu_allocate "0 2"' and 'gpu_allocate 0 2' work.
    choosengpus=""
    count=0
    for prefered in $*; do
      [ "$count" -lt "$gpustoalloc" ] || break
      # is the preferred gpu free ...
      case " $freegpus " in
        *" $prefered "*) ;;
        *) continue ;;
      esac
      # ... and not named twice?
      case " $choosengpus " in
        *" $prefered "*) continue ;;
      esac
      choosengpus="$choosengpus $prefered"
      count=$((count + 1))
    done
    #Fill up with the remaining free GPUs
    for freegpu in $freegpus; do
      [ "$count" -lt "$gpustoalloc" ] || break
      case " $choosengpus " in
        *" $freegpu "*) continue ;;
      esac
      choosengpus="$choosengpus $freegpu"
      count=$((count + 1))
    done
    if [ "$count" -lt "$gpustoalloc" ]; then
      echo "WARNING: Only $count of $gpustoalloc needed GPUs are free on ${nodes[0]}!" >> "gpu_alloc.info.$JOB_ID"
    fi
    #Allocate GPUs: write the new alloc file next to the old one and move it
    #into place in one step.
    stamp=$(date +"%H:%M %d.%m.%y")
    {
      echo "ALLOC ${temp[*]}$choosengpus"
      grep JOB "$allocpath"
      grep STAMP "$allocpath"
      echo "JOB $JOB_ID$choosengpus"
      echo "STAMP $JOB_ID $stamp"
    } > "$allocpath.tmp$JOB_ID"
    mv -f "$allocpath.tmp$JOB_ID" "$allocpath"
    #Generate the comma separated device string
    devicestring=""
    for choosen in $choosengpus; do
      devicestring="${devicestring:+$devicestring,}$choosen"
    done
    # The device string is the ONLY thing written to stdout, because the
    # caller captures it with devices=$(gpu_allocate).
    echo "$devicestring"
    echo "INFO: Allocated the following GPUs: $devicestring on ${nodes[0]}!" >> "gpu_alloc.info.$JOB_ID"
  fi
else
  echo "ERROR: GPU ALLOC: Multi Node Mode is not currently supported! Job with ID $JOB_ID will be killed!" >> "gpu_alloc.info.$JOB_ID"
  ################### UNSIGN ACTIVE ######################
  # release the lock on this exit path as well
  rm -f "$datapath/$waitfile"
  ########################################################
  sgekilljob "$JOB_ID"
  exit 1
fi
########################################################
################### UNSIGN ACTIVE ######################
rm -f "$datapath/$waitfile"
########################################################
----------------------------------------------------------------------------
-----
END THE ALLOCATE SCRIPT
THE UNALLOC SCRIPT
----------------------------------------------------------------------------
-----
#!/bin/bash
# gpu_unallocate: give back the GPUs that gpu_allocate booked for a job.
# The helper functions used further down are expected to be defined by the
# sourced file below.
source /d2/bin/functions
#OPTIONS
# Directory holding the per-node "<node>.alloc" files and the waitfile.
datapath="/d2/bin/gpu_allocate.data"
allocfileext="alloc"
# Random back-off in seconds (1-20) between two looks at the waitfile.
# It must never be 0, otherwise the wait loop spins and floods the info log.
waittime=$((RANDOM % 20 + 1))
waitfile="active.info"
# Age in minutes after which a leftover waitfile is ignored.
waitfileexpired=10
# Unalloc the GPUS for a JOBID
#################### PRINT HELP ################################################
if [ "$1" == "-h" ]; then
  # Quoted delimiter: the help text is printed literally, nothing is expanded.
  cat <<'HILFE'
##################### NORMAN GEIST'S GPU Unallcator ##########################
#                                                                            #
# Param1="-h" Zeigt diese Hilfe an                                           #
# INPUT ist ENVVAR JOB_ID und TMPDIR                                         #
# Dieses Script gibt durch gpu_allocate belegte GPUs wieder frei!            #
# Optional                                                                   #
# Param1=STRING machinename                                                  #
# Param2=INT JOB_ID                                                          #
#                                                                            #
##############################################################################
HILFE
  exit
fi
################ END PRINT HELP ################################################
################ WAIT & SIGN ACTIVE ###########################
# Cooperative lock around the *.alloc files: as long as the waitfile exists,
# another allocator/unallocator is assumed to be working on them.
# NOTE(review): the existence test and the final touch are separate steps, so
# two processes can still pass the test at the same moment.
wait="true"
while [ "$wait" == "true" ]; do
  if [ -f "$datapath/$waitfile" ]; then
    echo "INFO: Waitfile found! Means another allocator is accessing the *.alloc files!" >> "gpu_unalloc.info.$JOB_ID"
    echo "I'll wait $waittime seconds!" >> "gpu_unalloc.info.$JOB_ID"
    sleep "$waittime"
  else
    wait="false"
  fi
  # find prints the path only if the file's status change lies more than
  # $waitfileexpired minutes back; a missing waitfile is not an error here.
  check=$(find "$datapath/$waitfile" -cmin +"$waitfileexpired" 2>/dev/null)
  if [ ${#check} -gt 0 ]; then
    echo "INFO: Waitfile \"$datapath/$waitfile\" is older then $waitfileexpired minutes!" >> "gpu_unalloc.info.$JOB_ID"
    echo "Ignoring waitfile and start unallocation!" >> "gpu_unalloc.info.$JOB_ID"
    wait="false"
  fi
done
# Sign active: from here on the other instances wait for us.
touch "$datapath/$waitfile"
###############################################################
################ EXECUTE #######################################################
################ READ THE MACHINEFILE #################
# Two ways to be called:
#   gpu_unallocate <machine> <jobid>  - clean up after a job that is gone
#   gpu_unallocate                    - regular call from inside the job; the
#                                       machines come from $TMPDIR/machines
if [ ${#2} -gt 0 ] && [ ${#1} -gt 0 ]; then
  machines="$1"
  export JOB_ID=$2
  echo "INFO: Unallocation called for inactive Job!" >> "gpu_unalloc.info.$JOB_ID"
  echo "GPU Unallocation executed for $machines with expired Job-ID $JOB_ID!" >> "gpu_unalloc.info.$JOB_ID"
elif [ -f "$TMPDIR/machines" ]; then
  for line in $(cat "$TMPDIR/machines"); do
    # Compare whole names: a substring test would drop "node1" as soon as
    # "node10" is already in the list.
    case " $machines " in
      *" $line "*) ;;
      *) machines="$machines $line" ;;
    esac
  done
  echo "INFO: GPU Unallocation called well-planned!" >> "gpu_unalloc.info.$JOB_ID"
  echo "GPU Unallocation executed for $machines with Job-ID $JOB_ID!" >> "gpu_unalloc.info.$JOB_ID"
else
  echo "WARNING: Couldn't read the machinefile \"$TMPDIR/machines\" or parameters! Unable to unallocate GPUs!" >> "gpu_unalloc.info.$JOB_ID"
  ################### UNSIGN ACTIVE ######################
  rm -f "$datapath/$waitfile"
  ########################################################
  exit 1
fi
########################################################
# FOR EVERY MACHINE UNALLOC
for machine in $machines; do
  allocpath="$datapath/$machine.$allocfileext"
  if [ ! -f "$allocpath" ]; then
    echo "WARNING: No allocfile found for \"$machine\"! Nothing to unallocate there!" >> "gpu_unalloc.info.$JOB_ID"
    continue
  fi
  # GPUs booked by THIS job: every field after the id on its "JOB <id> ..."
  # lines. The id is compared as a whole field, so job 12 does not also hit
  # job 123 or a time stamp like 12:30.
  thisalloc=$(awk -v id="$JOB_ID" '$1 == "JOB" && $2 == id { for (f = 3; f <= NF; f++) printf "%s ", $f }' "$allocpath")
  # Everything that is allocated on the machine at the moment
  machinealloc=$(grep ALLOC "$allocpath")
  machinealloc=${machinealloc/ALLOC/}
  #generate new ALLOC: keep what does not belong to this job
  newalloc=""
  for allocated in $machinealloc; do
    case " $thisalloc " in
      *" $allocated "*) ;;                    # part of THIS job, drop it
      *) newalloc="$newalloc $allocated" ;;
    esac
  done
  # Generate new ALLOCFILE: new ALLOC line plus the JOB/STAMP lines of all
  # other jobs, written next to the old file and moved into place in one step
  {
    echo "ALLOC$newalloc"
    awk -v id="$JOB_ID" '$1 != "ALLOC" && !(($1 == "JOB" || $1 == "STAMP") && $2 == id)' "$allocpath"
  } > "$allocpath.tmp$JOB_ID"
  mv -f "$allocpath.tmp$JOB_ID" "$allocpath"
done
################################################################################
################### UNSIGN ACTIVE ######################
rm -f "$datapath/$waitfile"
########################################################
----------------------------------------------------------------------------
-----
END THE UNALLOC SCRIPT
EXTRA FUNCTIONS
----------------------------------------------------------------------------
-----
#!/bin/bash
# COUNT SUBSTRINGS IN A STRING
# $1 String
# $2 Substring (taken literally, not as a glob pattern)
# echo return count of non-overlapping occurrences of $2 in $1
#      (0 when $2 is empty)
countsubstring()
{
  local haystack=$1
  local needle=$2
  local stripped
  if [ -z "$needle" ]; then
    echo 0
    return 0
  fi
  # Remove every occurrence in one pass; the number of removed characters
  # divided by the needle length is the number of occurrences.
  stripped=${haystack//"$needle"/}
  echo $(((${#haystack} - ${#stripped}) / ${#needle}))
}
# Check if a value is non-empty
# $1 = value to test (pass the expanded variable)
# return 1 if it contains anything but blanks, tabs and newlines, else 0
isset()
{
  # Strip the whitespace with a parameter expansion instead of `echo $1`,
  # which prints nothing for values such as "-n" and expands wildcards.
  local stripped=${1//[$' \t\n']/}
  if [ -n "$stripped" ]; then
    return 1
  else
    return 0
  fi
}
# Check if a file exists
# $1 filename
# return 1 if $1 is an existing regular file, else 0
is_file()
{
  # Quoted on purpose: unquoted, an empty name turns the test into `[ -f ]`,
  # which is true, and a name with blanks breaks the test.
  if [ -f "$1" ]; then
    return 1
  else
    return 0
  fi
}
# Calculate via bc
# $1 calcstring like "10*5/40"
# $2 int precision (bc scale), optional; anything that is not a positive
#    whole number falls back to the default of 2
# echo return the result as printed by bc
calc()
{
  local expression=$1
  local precision=2
  local result
  if [[ $2 =~ ^[0-9]+$ ]] && [ "$((10#$2))" -gt 0 ]; then
    precision=$((10#$2))
  fi
  result=$(echo "scale=$precision; $expression" | bc)
  # unquoted on purpose: several output lines are joined into one
  echo $result
}
# Transform Float to Int while cutting
# $1 float like 55.713 makes 55; a value without leading digit like .50
#    makes 0 instead of an empty string
# echo return the part in front of the first dot
intval()
{
  local whole=${1%%.*}
  if [ -n "$1" ]; then
    case "$whole" in
      "" | - | +) whole=0 ;;
    esac
  fi
  printf '%s\n' "$whole"
}
# Extract Celldimensions from ACEMD xsc file
# $1 = xscfile
# echo return "x y z": columns 2, 6 and 10 of the first line that is not a
#      '#' comment and has at least ten columns
#      (presumably the diagonal of the cell matrix - confirm against the
#      xsc files in use)
# return 0 if such a line was found, non-zero otherwise
acemd_get_celldimensions_fromxsc()
{
  # Comment lines are skipped explicitly; picking lines by "contains a 0"
  # would also catch a header that happens to contain a zero.
  awk '
    /^[ \t]*#/ { next }
    NF >= 10   { print $2, $6, $10; found = 1; exit }
    END        { exit (found ? 0 : 1) }
  ' "$1"
}
# Let String explode at CHAR and return Array
# $1 String
# $2 Separator (taken literally, not as a glob pattern)
# echo return the string with every separator replaced by a blank, ready to
#      be word-split by the caller: arr=($(explode "a=b" "="))
explode()
{
  local pieces=${1//"$2"/ }
  printf '%s\n' "$pieces"
}
# Kill an SGE Job with given ID
# $1 = JOB_ID
# Runs qdel on the submit host through a passwordless ssh login.
# The login is taken from SGE_SUBMIT_HOST if that is set.
# NOTE(review): the archived copy of this file shows "gast_at_clustermaster",
# which looks like the archive's obfuscation of a user@host login - adjust the
# default to your own submit host.
# return 2 without calling ssh if no id is given, otherwise the status of ssh
sgekilljob()
{
  local jobid=$1
  local submit_host=${SGE_SUBMIT_HOST:-gast@clustermaster}
  if [ -z "$jobid" ]; then
    echo "sgekilljob: no job id given" >&2
    return 2
  fi
  # -n: ssh must not read (and thereby swallow) the caller's stdin
  ssh -n -o StrictHostKeyChecking=no "$submit_host" qdel "$jobid"
}
# Check if Job with given ID is active
# $1 JOB_ID
# Asks "qstat -j" on the submit host through a passwordless ssh login.
# The login is taken from SGE_SUBMIT_HOST if that is set.
# NOTE(review): the archived copy of this file shows "gast_at_clustermaster",
# which looks like the archive's obfuscation of a user@host login - adjust the
# default to your own submit host.
# return 1 if the qstat output contains "job_number" (job is known), or if
#          ssh itself failed and the state is therefore unknown
# return 0 if the job is not known any more
sgeisactive()
{
  local jobid=$1
  local submit_host=${SGE_SUBMIT_HOST:-gast@clustermaster}
  local report
  local status
  # -n: ssh must not read (and thereby swallow) the caller's stdin
  report=$(ssh -n -o StrictHostKeyChecking=no "$submit_host" qstat -j "$jobid")
  status=$?
  # ssh reports its own errors with status 255. Without an answer the job
  # must not be declared dead, because callers free its GPUs in that case.
  if [ "$status" -eq 255 ]; then
    echo "sgeisactive: could not reach $submit_host, treating job $jobid as active" >&2
    return 1
  fi
  case "$report" in
    *job_number*) return 1 ;;
    *) return 0 ;;
  esac
}
# Check if Var is float
# $1 number
# return 1 if $1 has a decimal point followed by a fraction other than zero
#          (1.50, .25, -0.05)
# return 0 otherwise (3, 2.00, empty string, anything that is not a number)
isfloat()
{
  local fraction_pattern='^[-+]?[0-9]*\.[0-9]*[1-9][0-9]*$'
  if [[ $1 =~ $fraction_pattern ]]; then
    return 1
  else
    return 0
  fi
}
# Search Array for something and return the indices
# $1 Array, given as ONE string of whitespace separated elements
# $2 Search
# $3 Mode=within (element contains $2) || Mode=exact (element equals $2)
# $4 num_results default 1 | give int num or "*" for all
# echo return the zero-based index of every hit, one per line, or nothing
#      if not found (or if the mode is unknown)
arraysearch()
{
  local -a elements
  local needle=$2
  local mode=$3
  local limit=$4
  local max=1      # -1 means "no limit"
  local index=0
  local found=0
  local hit
  local element
  # Split on whitespace without expanding wildcards in the elements
  read -r -d '' -a elements <<< "$1"
  if [ "$limit" == "*" ]; then
    max=-1
  elif [[ $limit =~ ^[0-9]+$ ]] && [ "$((10#$limit))" -gt 0 ]; then
    max=$((10#$limit))
  fi
  for element in "${elements[@]}"; do
    hit=0
    case "$mode" in
      exact)
        if [ "$element" == "$needle" ]; then
          hit=1
        fi
        ;;
      within)
        # an empty search string never matches
        if [ -n "$needle" ] && [[ $element == *"$needle"* ]]; then
          hit=1
        fi
        ;;
    esac
    if [ "$hit" -eq 1 ]; then
      if [ "$max" -lt 0 ] || [ "$found" -lt "$max" ]; then
        echo "$index"
        found=$((found + 1))
      fi
    fi
    index=$((index + 1))
  done
  return 0
}
function elapsed() # give date as 27 11 2010 15 30 (day month year hour minute)
{
  # $1 day  $2 month  $3 year  $4 hour  $5 minute
  # return 1 if the current local time has reached or passed that moment
  # return 0 if it lies in the future or an argument is missing/not a number
  local -a now
  local -a wanted
  local field
  local have
  local want
  local k
  for field in "$1" "$2" "$3" "$4" "$5"; do
    [[ $field =~ ^[0-9]+$ ]] || return 0
  done
  # One single date call, so all five values belong to the same moment
  now=($(date +"%Y %m %d %H %M"))
  wanted=("$3" "$2" "$1" "$4" "$5")
  # Compare year, month, day, hour, minute in this order; the first
  # difference decides. 10# forces base ten so that 08 and 09 are numbers.
  for k in 0 1 2 3 4; do
    have=$((10#${now[k]}))
    want=$((10#${wanted[k]}))
    if [ "$have" -gt "$want" ]; then
      return 1
    fi
    if [ "$have" -lt "$want" ]; then
      return 0
    fi
  done
  # Everything equal: the minute is reached
  return 1
}
# Check Date if exist
# Give date as "day month year" f.i. "21 12 2012"
# (one string or three separate arguments; the last three words are used)
# return 1 if the date exists, 0 if not
# Leading zeros ("05 08 2011") are accepted. The Gregorian leap year rule is
# applied to every year.
function checkdate()
{
  local -a parts
  local count
  local day
  local month
  local year
  local last
  local field
  read -r -d '' -a parts <<< "$*"
  count=${#parts[@]}
  if [ "$count" -lt 3 ]; then
    return 0
  fi
  year=${parts[count - 1]}
  month=${parts[count - 2]}
  day=${parts[count - 3]}
  for field in "$day" "$month" "$year"; do
    [[ $field =~ ^[0-9]+$ ]] || return 0
  done
  # 10# forces base ten so that 08 and 09 are numbers
  day=$((10#$day))
  month=$((10#$month))
  year=$((10#$year))
  if [ "$year" -lt 1 ] || [ "$month" -lt 1 ] || [ "$month" -gt 12 ] || [ "$day" -lt 1 ]; then
    return 0
  fi
  # Last day of the month
  case "$month" in
    4 | 6 | 9 | 11)
      last=30
      ;;
    2)
      if (((year % 4 == 0 && year % 100 != 0) || year % 400 == 0)); then
        last=29
      else
        last=28
      fi
      ;;
    *)
      last=31
      ;;
  esac
  if [ "$day" -le "$last" ]; then
    return 1
  else
    return 0
  fi
}
# Convert a number in exponent notation (mantissa "E" exponent) to plain
# notation.
# $1 number like 1.5E-3
# echo return mantissa * 10^exponent; results that are not whole numbers are
#      printed with ten decimals
function exp2float()
{
  # Split at the "E": field 1 is the mantissa, field 2 the exponent.
  # OFMT is the format awk uses when it prints a non-integer number.
  echo "$1" | awk -F E -v OFMT='%10.10f' '{ print $1 * 10 ^ $2 }'
}
# Evaluate a comparison between floating point numbers with bc
# $* condition like "1.5 > 1.2"
# return 1 if bc prints exactly 1 (condition is true)
# return 0 in every other case: no arguments, condition false, bc prints
#          nothing or anything else
function float_cond()
{
  local answer
  # Nothing to evaluate
  if [[ $# -eq 0 ]]; then
    return 0
  fi
  answer=$(echo "$*" | bc -q 2>/dev/null)
  if [[ "$answer" == 1 ]]; then
    return 1
  fi
  return 0
}
----------------------------------------------------------------------------
-----
END EXTRA FUNCTIONS
Mit freundlichen Grüßen
Norman Geist.
This archive was generated by hypermail 2.1.6 : Wed Feb 29 2012 - 15:57:09 CST