# $Id: ams.tcl,v 1.9 2015/07/13 14:09:00 mayne Exp $

# AMS helper functions for concurrent NAMD instances
# Developed as part of CEMRACS 2013
#
# Written by: Christopher G. Mayne
# last updated: 07-12-2015
#
# Version tag of this AMS helper script; initAMS prints it in the startup banner.
# NOTE: `global` has no effect at file scope; it only matters if this file is
# sourced from inside a proc body.
global AMS_verno
set AMS_verno 20150712a

#==============================================================================
# Configuration Information (HOW TO USE)
#==============================================================================
# The NAMD configuration file requires the following items:
#
# NAMD Standard options:
#   The following NAMD options are required and should be preset:
#   -- outputname : should be different from AMS_sharedDataPrefix defined below
#
#   The following NAMD options are optional
#   -- DCDFreq : this will also be the stepsize used to monitor the rc
#              : if not set, AMS_stepSize should be left undefined or set as equal
#              : must be left undefined if AMS_suppressDCD is turned on
#
# AMS-specific: 
#   -- user must provide "ams_measure" proc, called with no arguments & returns
#      the current reaction coordinate value (scalar)
#   e.g., source my_ams_measure.tcl
#
#   -- source this file first
#   source ams.tcl
#   
#   -- set the following global variables (required)
#   set AMS_sharedDataPrefix <filename prefix>       | e.g., "ams.data"
#   set AMS_thresholdMinZ <double>                   | e.g., 3.040
#   set AMS_thresholdMaxZ <double>                   | e.g., 17.0
#   set AMS_wallTimeLimit <integer, in seconds>      | e.g., [expr {60 * 60 * 20.0}]; # 20 hrs
#   set AMS_verbosity <debug|standard>
#   set AMS_toKill <integer>                         | number of replicas to kill per iteration; must be smaller than the number of replicas
#   
#   -- the following variables are optional
#   set AMS_suppressDCD <off|on|0|1>                 | if "on", make sure DCDFreq is undefined
#   set AMS_stepSize <integer>                       | e.g., 500; can also be set by defining dcdfreq
#   set AMS_pollInterval <integer, milliseconds>     | e.g., 10000
#   set AMS_lockInterval <integer, milliseconds>     | e.g., 1000
#   set AMS_ATS <off|on|0|1>
#   set AMS_ATS_proc <proc name>                     | e.g., "computeSS"
#   NOTE: If AMS_ATS is turned on, not only must the proc be specified, but also
#         provided by the user in the configuration file
#	set AMS_repairGlobPattern <pattern for .ams.log> | if unset, defaults to "*.ams.log"
#	set AMS_repairDiagnosticOnly <0|1>               | if unset, defaults to standard repair run
#   
#   -- run the simulation with AMS
#   initAMS
#	runAMS 
#	termAMS
#
#	-- OR to repair an existing abnormally terminated AMS run (e.g., system id 4)
#	initAMS
#	repairAMS 4
#	termAMS


#==============================================================================
# Change Log
#==============================================================================
# updated 03/04/2015 a
# -- added a global variable to track and log version number (AMS_verno)
# -- added a function to repair broken AMS runs (repairAMS)
# updated 05/15/2014 a
# -- updated the method used to measure the reaction coordinate.  now uses a
#    user-provided proc "ams_measure" to report current RC value.
# updated 02/13/2014 a
# -- updated the method used to measure the colvar.  now uses the native, but
#    undocumented, "colvarvalue <colvar name>" function instead of the old
#    harmonic colvarbias trick.  accordingly, the colvar itself should be named
#    "ams_measure" (because there is no longer a harmonic).
# updated 01/09/2014 a
# -- only update .zdat data when a new maxZ is observed
# -- simulation restarts now use position, velocity, and cell data recorded via
#    the new "output" and "reinitatoms" NAMD commands; this requires the new 
#    .frames directory to store the data.  no new user-defined settings are required
# -- DCD suppression added
# -- step size can be set by dcdfreq OR AMS_stepSize; if dcd suppression is turned on
#    AMS_stepSize MUST be defined
# -- added runAMS proc to simplify code launch in NAMD config file
#
# updated 01/02/2014 b
# -- Adaptive Timestepping (ATS) implemented via a user-provided algorithm for
#    computing the stepsize when passed the current value of the reaction coordinate
#
# updated 01/02/2014 a
# -- AMS_outputPrefix is replaced by the more descriptive AMS_sharedDataPrefix
#    to help differentiate it from instance-specific output, such as logging
# -- AMS_logPrefix is deprecated, as the AMS_logOutput file is now named/opened
#    as a modification of the standard NAMD option "outputname" and probed using
#    the new NAMD introspection Tcl commands ([isset outputname], [outputname])
# -- AMS globals are declared as "global" in the amsNAMD_*.tcl file rather than directly
#    in the configuration file.  This means they don't need to be declared in
#    the configuration file (e.g., global <varname>) prior to user-defined
#    setup/customization (e.g., set <varname> <value>)
# -- initAMS proc now has input checking and error handling with logging to the
#    instance-specific AMS log file, or in the event that log output fails,
#    redirects to NAMD stdout
# -- AMS_stepSize is set to equal the standard NAMD option "DCDFreq", and is no
#    longer a required user-defined variable (should not be manually set!)
#
# updated 09/17-18/2013
# -- added an AMS initialization proc to reduce the amount of setup required
#    in the NAMD configuration file
# -- added an AMS termination proc to shutdown the AMS environment
# -- implement custom Tcl logging; divorces AMS output from NAMD log files,
#    allows for verbosity setting (native log package is unavail in NAMD)
#
# updated 08/25/2013
# -- removed references to 1D TEST
# -- implemented wall clock time checking via "exec date +%s"
#
# updated 13/07/2016 - Laura
# -- changed AMS_probaFile to AMS_killFile (the new file AMS_sharedDataPrefix.killed
#    must exist and contain the number of replicas before running AMS). This file
#    will contain the number of replicas killed at each iteration for the calculation
#    of the probability.
# -- added AMS_Brep global variable with the number of replicas that reached B, to be
#    written to the AMS_killFile by the termAMS proc
#
# updated 15/07/2016 - Laura
# -- the status of a replica can also be b, when it enters B. This is here because of the
#    new AMS_Brep variable. This way not only will the final status file give the
#    user the number of replicas that entered B, but the initial number can be
#    calculated without reading the positions of every replica
# -- AMS_Brep is calculated by reading the status file because we can have a replica
#    that was already inside B (entering B in the initialization phase is rare but it
#    happens and the code couldn't handle it)
# -- inserted a new file (.traj) to record the colvars' variables at each step for
#    each instance, to recreate the trajectories without using vmd (this is going to
#    be useful for huge systems when we don't want to store every position)
#
# updated 28/09/2016 - Laura
# -- change in functions inB / inA. Now the user should provide a single function called zone
#    that returns -1 if the system is in A, 0 if not in A or B, and 1 if in B.
# -- added new global variable AMS_toKill, which is the number of replicas to kill at
#    each iteration. The user must give its value in the conf file
# -- changed getKillIds to kill the minimum of AMS_toKill replicas
#
# updated 13/02/2017 - Laura
# -- changed getKillIds to return both the kill ids and the kill level $killLevel
#
#==============================================================================
# Initialize/Shutdown NAMD AMS
#==============================================================================
# Declare AMS Global Variables
# NOTE: `global` has no effect at file scope; these declarations only matter if
# this file is sourced from inside a proc body.
# logging: verbosity level and the channel of the instance-specific AMS log
global AMS_verbosity AMS_logOutput
# prefix from which initAMS derives the shared data file names below
global AMS_sharedDataPrefix
# shared data files (<prefix>.iter, .sys, .status, .rcf, .killed), set by initAMS
global AMS_iterFile AMS_sysFile AMS_statusFile AMS_refCountFile AMS_killFile
# reaction-coordinate thresholds
global AMS_thresholdMinZ AMS_thresholdMaxZ
# simulation step size, NAMD stepspercycle (read in initAMS), DCD suppression flag
global AMS_stepSize AMS_stepsPerCycle AMS_suppressDCD
# polling / lock retry intervals in milliseconds (initAMS defaults: 10000 / 1000)
global AMS_pollInterval AMS_lockInterval
# wall time limit (seconds) and process start time (seconds, from `date +%s`)
global AMS_wallTimeLimit AMS_startTime
# adaptive time stepping switch and the name of the user-provided proc
global AMS_ATS AMS_ATS_proc
# repairAMS settings: log file glob pattern and diagnostic-only switch
global AMS_repairGlobPattern AMS_repairDiagnosticOnly
# number of replicas (lines in the status file), computed in initAMS
global AMS_nbReplicas
# number of replicas whose status is "b", computed in initAMS
global AMS_Brep
# number of replicas to kill per iteration (user-provided, validated in initAMS)
global AMS_toKill

proc initAMS {} {
	# Sets up the AMS environment and validates the user-supplied configuration.
	#
	# Every check appends a {level message} pair to a local list.  The list is
	# written out after all checks have run -- to the AMS log file, or to NAMD
	# stdout when the log file could not be opened -- so that all problems are
	# reported in a single pass.
	#
	# PASSED: nothing
	# RETURNS: nothing.  On any configuration error the proc does not return
	#          normally: it calls termAMS (which exits) when the AMS log is
	#          usable, or exits directly otherwise.

	# localize global variables
	global AMS_verno
	global AMS_verbosity AMS_logOutput
	global AMS_sharedDataPrefix
	global AMS_iterFile AMS_sysFile AMS_statusFile AMS_refCountFile AMS_killFile
	global AMS_thresholdMinZ AMS_thresholdMaxZ
	global AMS_stepSize AMS_stepsPerCycle AMS_suppressDCD
	global AMS_pollInterval AMS_lockInterval
	global AMS_wallTimeLimit AMS_startTime
	global AMS_ATS AMS_ATS_proc
	global AMS_nbReplicas
	global AMS_Brep
	global AMS_toKill

	# local error variables
	set errorFlag 0
	set startupText {}
	# note: logging errors will result in redirection of error information to NAMD stdout
	#       otherwise, errors will be sent to the AMS log file
	set amsLogError 0

	# setup logging (note: logging errors are sent to NAMD stdout)
	if { ![isset outputname] } {
		set amsLogError 1
		set errorFlag 1
		lappend startupText [list error "NAMD outputname is not set prior to executing initAMS proc"]
	} else {
		set namd_output_name [outputname]
		set AMS_logOutput [open ${namd_output_name}.ams.log w]
		lappend startupText [list info "AMS log opened: ${namd_output_name}.ams.log"]
		unset namd_output_name
	}

	# programmatic configuration of some AMS variables
	set AMS_iterFile	${AMS_sharedDataPrefix}.iter
	set AMS_sysFile		${AMS_sharedDataPrefix}.sys
	set AMS_statusFile	${AMS_sharedDataPrefix}.status
	set AMS_refCountFile	${AMS_sharedDataPrefix}.rcf
	set AMS_killFile	${AMS_sharedDataPrefix}.killed

	# check that shared data files exist
	if { ![file exists $AMS_iterFile] } {
		set errorFlag 1
		lappend startupText [list error "Cannot find ITER file: $AMS_iterFile"]
	} else {
		lappend startupText [list info "ITER file found: $AMS_iterFile"]
	}

	if { ![file exists $AMS_sysFile] } {
		set errorFlag 1
		lappend startupText [list error "Cannot find SYS file: $AMS_sysFile"]
	} else {
		lappend startupText [list info "SYS file found: $AMS_sysFile"]
	}

	if { ![file exists $AMS_statusFile] } {
		set errorFlag 1
		lappend startupText [list error "Cannot find STATUS file: $AMS_statusFile"]
	} else {
		lappend startupText [list info "STATUS file found: $AMS_statusFile"]
	}

	if { ![file exists $AMS_refCountFile] } {
		set errorFlag 1
		lappend startupText [list error "Cannot find REFCOUNT file: $AMS_refCountFile"]
	} else {
		lappend startupText [list info "REFCOUNT file found: $AMS_refCountFile"]
	}

	if { ![file exists $AMS_killFile] } {
		set errorFlag 1
		lappend startupText [list error "Cannot find KILL file: $AMS_killFile"]
	} else {
		lappend startupText [list info "KILL file found: $AMS_killFile"]
	}

	# check that ams_measure proc exists
	# NOTE(review): [info complete] only tests that the *string* "ams_measure" is a
	# syntactically complete Tcl command, so the real test here is [info commands].
	if { [llength [info commands ams_measure]] == 0 || ![info complete ams_measure] } {
		set errorFlag 1
		lappend startupText [list error "ams_measure proc not properly set"]
	} else {
		lappend startupText [list info "Valid ams_measure proc FOUND"]
	}

	# note: AMS_thresholdMinZ is not validated here (that check is disabled);
	# only the max Z (Reaction Coordinate) threshold must be set to a real number
	if { ![info exists AMS_thresholdMaxZ] || ![string is double $AMS_thresholdMaxZ] } {
		set errorFlag 1
		lappend startupText [list error "AMS_thresholdMaxZ not properly set"]
	} else {
		lappend startupText [list info "AMS_thresholdMaxZ set to: $AMS_thresholdMaxZ"]
	}


	# three parameters to test here: 1) dcdfreq, 2) AMS_stepSize, and 3) AMS_suppressDCD

	# handle DCD suppression (normalized to 0/1 from here on)
	if { [info exists AMS_suppressDCD] && ([string toupper $AMS_suppressDCD] eq "YES" || $AMS_suppressDCD) } {
		set AMS_suppressDCD 1
		lappend startupText [list info "DCD suppression turned ON"]
	} else {
		set AMS_suppressDCD 0
		lappend startupText [list info "DCD suppression turned OFF (default)"]
	}

	# combinations of dcdfreq and/or AMS_stepSize
	# inside the code dcdfreq will always be defined by AMS_stepSize
	if { ![isset dcdfreq] && ![info exists AMS_stepSize] } {
		# neither are set
		set errorFlag 1
		lappend startupText [list error "DCDFreq and AMS_stepSize are undefined; one or the other must be set"]
	} elseif { ([isset dcdfreq] && [info exists AMS_stepSize]) && [dcdfreq] != $AMS_stepSize } {
		# both are set and in conflict
		# NOTE(review): this branch only warns; it neither changes dcdfreq nor
		# reaches the DCD-suppression check below -- confirm this is intended
		lappend startupText [list warning "DCDFreq and AMS_stepSize are explicitly set to different values; using AMS_stepSize"]
	} elseif { [isset dcdfreq] && $AMS_suppressDCD } {
		# AMS_suppressDCD and dcdfreq are defined
		set errorFlag 1
		lappend startupText [list error "DCDFreq cannot be defined when DCD suppression is turned ON"]
	} else {
		if { [isset dcdfreq] && ![info exists AMS_stepSize] } {
			# dcdfreq was set, AMS_stepSize keys off dcdfreq
			set AMS_stepSize [dcdfreq]
		} else {
			# AMS_stepSize was set, dcdfreq keys off AMS_stepSize when dcd suppression is off
			if { !$AMS_suppressDCD } { dcdfreq $AMS_stepSize }
		}
		lappend startupText [list info "Simulation stepsize set to: $AMS_stepSize"]
	}


	# check that steps per cycle are defined
	if { ![isset stepspercycle] } {
		set errorFlag 1
		lappend startupText [list error "stepspercycle not properly set"]
	} else {
		set AMS_stepsPerCycle [stepspercycle]
		lappend startupText [list info "Simulation stepsize restricted to multiple of: $AMS_stepsPerCycle"]
	}

	# check for interval settings; set defaults (milliseconds) if they haven't been manually set already
	if { ![info exists AMS_pollInterval] || ![string is integer $AMS_pollInterval] } { set AMS_pollInterval 10000 } ; # 10 seconds
	lappend startupText [list info "pollInterval (time between checking the status file) set to: $AMS_pollInterval (ms)"]
	if { ![info exists AMS_lockInterval] || ![string is integer $AMS_lockInterval] } { set AMS_lockInterval 1000  } ; # 1  second
	lappend startupText [list info "lockInterval (time between attempting a file lock) set to: $AMS_lockInterval (ms)"]

	# check wall time limit is set and is a real number
	if { ![info exists AMS_wallTimeLimit] || ![string is double $AMS_wallTimeLimit] } {
		set errorFlag 1
		lappend startupText [list error "AMS_wallTimeLimit not properly set"]
	} else {
		lappend startupText [list info "Walltime limit set to: $AMS_wallTimeLimit (s) = [expr {$AMS_wallTimeLimit / (60 * 60)}] (hrs)"]
	}

	# poll the system clock to set the start time
	set AMS_startTime [exec date +%s]
	if { ![info exists AMS_startTime] || ![string is integer $AMS_startTime] } {
		set errorFlag 1
		lappend startupText [list error "Start time failed to initialize properly"]
	} else {
		lappend startupText [list info "Process start time set to: $AMS_startTime"]
	}

	# Adaptive Time Stepping (ATS)
	if { [info exists AMS_ATS] && ([string toupper $AMS_ATS] eq "ON" || $AMS_ATS == 1) } {
		set AMS_ATS 1
		lappend startupText [list info "Adaptive Timestepping (ATS) turned ON"]
		# the proc name must be set AND name an existing command; the existence
		# guard keeps an unset AMS_ATS_proc from aborting before anything is logged
		if { ![info exists AMS_ATS_proc] || [llength [info commands $AMS_ATS_proc]] == 0 || ![info complete $AMS_ATS_proc] } {
			set errorFlag 1
			lappend startupText [list error "ATS proc error (not defined or incomplete)"]
		} else {
			lappend startupText [list info "ATS proc set to: $AMS_ATS_proc <current rc value>"]
		}
	} else {
		set AMS_ATS 0
		lappend startupText [list info "Adaptive Timesepping (ATS) turned OFF (default)"]
	}


	# get the number of replicas this AMS is running and the number of replicas already in B
	# (one line per replica; the second field of each line is the status, "b" = in B)
	set AMS_nbReplicas 0
	set AMS_Brep 0
	if { [file exists $AMS_statusFile] } {
		set infile [open $AMS_statusFile r]
		set aux [lrange [split [read $infile] "\n"] 0 end-1]
		close $infile
		set AMS_nbReplicas [llength $aux]
		set col2 {}
		foreach {one} $aux {
			lappend col2 [lindex $one 1]
		}
		set AMS_Brep [llength [lsearch -all $col2 b]]
		if { $AMS_nbReplicas == 0 } {
			set errorFlag 1
			lappend startupText [list error "AMS_statusFile empty"]
		}
	}
	# (a missing status file has already been reported above; the counts stay at 0
	#  so that the remaining diagnostics can still be written out)
	lappend startupText [list info  "Number of replicas  : $AMS_nbReplicas, replicas already in B: $AMS_Brep"]

	# check if the number of replicas to kill (AMS_toKill) is an integer and is smaller than the number of replicas
	if { ![info exists AMS_toKill] || ![string is integer $AMS_toKill] || $AMS_toKill >= $AMS_nbReplicas } {
		set errorFlag 1
		lappend startupText [list error "AMS_toKill not properly set"]
	} else {
		lappend startupText [list info  "Minimum number of replicas to kill : $AMS_toKill"]
	}

	# write startup text to log file
	if { !$amsLogError } {
		# ams logging works
		amslog info "##############################################################"
		amslog info "Adaptive Multilevel Splitting (AMS, ${AMS_verno}) Log File"
		amslog info "##############################################################\n"
		foreach ele $startupText {
			lassign $ele lvl msg
			amslog $lvl $msg
		}
	} else {
		# ams logging failed, redirect information to NAMD stdout
		print "\n##############################################################"
		print "Adaptive Multilevel Splitting (AMS, ${AMS_verno}) Startup"
		print "##############################################################\n"
		foreach ele $startupText {
			lassign $ele lvl msg
			print "ams ${lvl}: $msg"
		}
	}

	# on error, exit appropriately, otherwise return normally
	if { $errorFlag && $amsLogError } {
		# brute force quit
		exit
	} elseif { $errorFlag } {
		# shutdown AMS using the termAMS proc to quit gracefully
		termAMS
	} else {
		# no errors, lock settings in with a run 0 and return normally
		run 0
		return
	}
}

proc runAMS {} {
	# Single entry point for NAMD configuration files: drives the AMS iterations.
	#
	# Each pass asks getKillIds for the replicas to kill and the kill level,
	# reserves an iteration number, then for every id waits for its start,
	# initializes, runs and terminates the instance, and finally appends the
	# number of killed replicas to the kill file.
	#
	# PASSED: nothing
	# RETURNS: nothing (the loop has no exit condition of its own)
	# NOTE(review): the append to the kill file is not wrapped in
	#               lockFile/unlockFile -- confirm this is intended.
	global AMS_killFile
	global AMS_nbReplicas

	while { true } {
		lassign [getKillIds] killIds killLevel

		# get iteration number from shared data
		set ams_iter [getNextIter]
		set nKill [llength $killIds]
		amslog debug "Iteration $ams_iter | killing $nKill replicas | kill level $killLevel"

		# count is the position of the replica within this iteration's kill list
		for { set count 0 } { $count < $nKill } { incr count } {
			set id [lindex $killIds $count]
			amslog debug "killing $id"
			# wait until the status is changed from "s" to "p"
			waitForStart $id
			lassign [initInstance $id $killIds $ams_iter $count $killLevel] ams_sysId ams_initZ
			lassign [runInstance $ams_sysId $ams_initZ $ams_iter $count] ams_maxZ ams_maxZFrame
			termInstance $ams_sysId $ams_maxZ $ams_maxZFrame
		}

		#---------------------------------
		# UPDATE THE FILE FOR PROBABILITY
		#---------------------------------
		# one line per iteration: the number of replicas killed
		set killLog [open $AMS_killFile a]
		puts $killLog $nKill
		close $killLog

		amslog debug "Iteration $ams_iter, finished all $nKill replicas"
	}
}

proc termAMS {} {
	# Shuts down the AMS environment: records the number of replicas that
	# reached B in the kill file, logs the runtime, closes the AMS log and
	# exits the process.
	#
	# PASSED: nothing
	# RETURNS: nothing (never returns; ends with exit)

	# localize relevant variables
	global AMS_logOutput AMS_startTime
	global AMS_Brep AMS_killFile

	# poll system for current time in seconds
	set currTime [exec date +%s]

	#---------------------------------
	# UPDATE THE FILE FOR PROBABILITY
	#---------------------------------
	# the number of replicas that reached B is appended as a negative value
	# at the end of the AMS run
	set infile [open $AMS_killFile a]
	puts $infile [expr {-1 * $AMS_Brep}]
	close $infile
	amslog info "shutting down: $AMS_Brep replicas reached B"

	# if start/current times are valid, compute a runtime and log
	# note: if startup can't poll the system clock, the termAMS proc will be
	#       executed without a valid start time--hence this test is necessary.
	#       -strict is required because a plain [string is integer] accepts ""
	if { [info exists AMS_startTime]
	     && [string is integer -strict $AMS_startTime]
	     && [string is integer -strict $currTime] } {
		set runtime [expr {$currTime - $AMS_startTime}]
		set hour [expr {int($runtime/3600)}]
		set min [expr {int(($runtime-3600*$hour)/60)}]
		set sec [expr {int($runtime-3600*$hour-60*$min)}]
		amslog info "shutting down AMS at: $currTime, runtime: $runtime s ($hour h $min min $sec s)\n"
	} else {
		amslog info "shutting down AMS (runtime calculation failed)\n"
	}

	# shutdown
	close $AMS_logOutput
	exit
}

proc repairAMS { sysId } {
	# Repairs an abnormally terminated AMS run for one system.
	#
	# Steps: (1) scan the AMS log files for the one where the start/terminate
	# messages of sysId are unbalanced, (2) pick the most recent restart point
	# (NAMD restart files vs. the last dumped frame), (3) reload the system and
	# continue the simulation until a Z threshold is crossed, (4) update the
	# shared system data and status files.
	#
	# PASSED: system id of the run to repair
	# RETURNS: nothing; problems are written to the AMS log and the proc returns
	#          early.  In diagnostic mode it returns before modifying any file.

	# localize variables
	global AMS_sharedDataPrefix AMS_thresholdMinZ AMS_thresholdMaxZ AMS_stepSize
	global AMS_suppressDCD AMS_ATS
	global AMS_repairGlobPattern AMS_repairDiagnosticOnly

	# identify that this is a repair job
	amslog info "Launching repairAMS to fix abnormally terminated AMS run for system id: $sysId"

	# AMS_repair variables
	if { [info exists AMS_repairDiagnosticOnly] && $AMS_repairDiagnosticOnly == 1 } {
		amslog info "Performing repair in DIAGNOSTIC MODE"
	} else {
		set AMS_repairDiagnosticOnly 0
		amslog info "Performing repair in STANDARD MODE"
	}
	if { [info exists AMS_repairGlobPattern] && $AMS_repairGlobPattern ne "" } {
		amslog info "File glob pattern set to: $AMS_repairGlobPattern"
	} else {
		set AMS_repairGlobPattern "*.ams.log"
		amslog info "File glob pattern defaulting to: *.ams.log"
	}

	# things i don't currently want to mess with, test to bail early
	# DCD output, Adaptive Time Stepping (ATS)
	if { !$AMS_suppressDCD } {
		amslog error "repairAMS doesn't currently handle DCD output"
		return
	} elseif { $AMS_ATS } {
		amslog error "repairAMS doesn't currently support Adaptive Time Stepping"
		return
	}

	# the run loop below compares against both thresholds; verify the lower one
	# here, before anything is modified, so a missing value cannot abort the
	# repair half-way through
	if { ![info exists AMS_thresholdMinZ] || ![string is double -strict $AMS_thresholdMinZ] } {
		amslog error "repairAMS requires AMS_thresholdMinZ to be set to a real number"
		return
	}

	# find the broken ams.log file and identify the iteration that crashed
	amslog info "Searching for abnormally terminated NAMD data..."
	set termLogFile ""; set termIter -1
	# -nocomplain: an empty match falls through to the "no abnormalities" report
	# below instead of raising a Tcl error
	set amsLogList [lsort -dictionary -decreasing [glob -nocomplain $AMS_repairGlobPattern]]
	foreach f $amsLogList {
		set startCount 0; set finishCount 0; set iter -1
		set infile [open $f r]
		while { ![eof $infile] } {
			set inline [string trim [gets $infile]]
			# NOTE(review): these patterns are prefix matches on the id, so sysId 4
			# also matches lines for sysId 40, 41, ... -- tighten once the exact
			# log line format is confirmed
			if { [string match "*starting system. SysId: $sysId*" $inline] } {
				incr startCount
				# presumably the 9th word of the start message is the iteration
				# number; verify against the proc that writes that message
				set iter [lindex $inline 8]
			} elseif { [string match "*Terminating iteration with final data -- sysId: $sysId*" $inline] } {
				incr finishCount
			} else {
				continue
			}
		}; # end of read while loop
		close $infile

		# if the counts don't match, we should have the broken file
		if { $startCount != $finishCount } {
			amslog info "abnormally terminated .ams.log file FOUND!"
			amslog info "error in: [file tail $f] at iteration: $iter"
			set termLogFile $f
			set termIter $iter
			break
		} else {
			# otherwise keep looking
			continue
		}
	}; # end of the ams log loop

	# check to make sure that we've found the error
	if { $termLogFile eq "" } {
		amslog error "No abnormalities found in .ams.log files.  Exiting on error."
		return
	}

	# per-iteration shared data locations
	set frameDir ${AMS_sharedDataPrefix}.${termIter}.0.frames
	set zdatPath ${AMS_sharedDataPrefix}.${termIter}.0.zdat

	# step 2: determine where to restart from
	amslog info "Determining the closest valid restart point (restart vs frame)..."

	# track down the last restart XSC file
	set lastRestartStep -1
	# strip ".ams.log" (two extensions) from the log file name
	set basename [file rootname [file rootname [file tail $termLogFile]]]

	if { [file exists ${basename}.restart.xsc] } {
		set infile [open ${basename}.restart.xsc r]
		# the step number is the first field of the third line
		gets $infile; gets $infile
		set lastRestartStep [lindex [gets $infile] 0]
		close $infile
		amslog info "Restart file FOUND!  Written at step $lastRestartStep"
	} else {
		amslog error "No restart file found.  Will default to restarting from last frame output"
	}

	# track down the last dumped frame data
	set lastFrameStep -1
	# find the frames directory for the correct iteration
	set frameXscList {}
	if { [file exists $frameDir] } {
		set frameXscList [lsort -dictionary -increasing [glob -nocomplain -directory $frameDir *.xsc]]
	}
	if { [llength $frameXscList] == 0 } {
		# missing directory, or a directory without any .xsc file
		amslog error "No frame data found.  Exiting on error!"
		return
	}
	# parse out the step for the last xsc file written
	set lastFrameXsc [lindex $frameXscList end]
	set infile [open $lastFrameXsc r]
	gets $infile; gets $infile
	set lastFrameStep [lindex [gets $infile] 0]
	close $infile
	amslog info "Frame data FOUND!  Last output at step: $lastFrameStep"

	# determine restart mode
	if { $lastRestartStep == -1 || $lastFrameStep >= $lastRestartStep } {
		amslog info "Last viable restart output from recorded FRAME data"
		# reload the system into NAMD
		reinitatoms [file rootname $lastFrameXsc]
		amslog info "System reinitialized from basename: [file rootname $lastFrameXsc]"
		firsttimestep $lastFrameStep
		amslog info "NAMD internal step count set to: $lastFrameStep"
		# redefine the current frame number (important for picking back up on zdat)
		set frameNum [expr {($lastFrameStep / $AMS_stepSize)}]
		amslog info "AMS frame number set to: $frameNum"

	} else {
		amslog info "Last viable restart output from RESTART files"
		# reload the system into NAMD
		reinitatoms ${basename}.restart
		amslog info "System reinitialized from basename: ${basename}.restart"
		firsttimestep $lastRestartStep
		amslog info "NAMD internal step count set to: $lastRestartStep"
		# redefine the current frame number (important for picking back up on zdat)
		set frameNum [expr {$lastRestartStep / $AMS_stepSize}]
		amslog info "AMS frame number set to: $frameNum"
	}

	# initialize the repair-only run

	if { ![file exists $zdatPath] } {
		amslog error "zdat file ($zdatPath) not found!  Exiting on error."
		return
	}
	# maxZ, maxZ frame: taken from the last non-blank line of the zdat file
	set infile [open $zdatPath r]
	set maxZ -1; set maxZFrame -1
	while { ![eof $infile] } {
		set inline [string trim [gets $infile]]
		if { $inline eq "" } { continue } else { lassign $inline maxZFrame maxZ }
	}
	close $infile
	amslog info "zdat read.  MaxZ: $maxZ, MaxZ Frame: $maxZFrame"

	# currZ
	run 0
	set currZ [ams_measure]
	amslog info "current z measured: $currZ"

	# define the frame directory
	amslog info "frame directory set to: $frameDir"

	# if we're in diagnostic mode, throw on the brakes before we've actually modified anything
	if { $AMS_repairDiagnosticOnly } {
		amslog info "DIAGNOSTIC MODE ENDPOINT"
		amslog info "Exiting prior to any modification."
		return
	}

	# reopen the zdat -- in append mode!
	set zDatFile [open $zdatPath a]
	amslog info "zdat reopened in append mode"

	# looks like we're really going to do this!
	updateStatusFile [list $sysId "R" $maxZ]

	# run the simulation until Z leaves the (MinZ, MaxZ) window
	while { $currZ > $AMS_thresholdMinZ && $currZ < $AMS_thresholdMaxZ } {

		# advance the simulation
		run $AMS_stepSize

		# probe Z value
		set currZ [ams_measure]
		amslog info "AMS sys: $sysId , currZ: $currZ"

		if { $currZ > $maxZ } {
			# we really only need to write restart frame data for new maxZ values
			puts $zDatFile "$frameNum\t$currZ"; flush $zDatFile
			# write frame restart data to .frames (requires NAMD 2.9 dev or higher)
			output [file join $frameDir f.${frameNum}]

			# update the max z value
			set maxZ $currZ
			set maxZFrame $frameNum
		}

		# update status file (big R for Repair!)
		updateStatusFile [list $sysId "R" $maxZ]

		# assuming the dcdfreq = stepSize
		# frameNum is 0-based
		incr frameNum
	}

	# gracefully exit and terminate
	amslog info "Termination criterial met at Z: $currZ ($frameNum steps)"
	close $zDatFile

	# pulled from termInstance
	amslog info "Final data -- sysId: $sysId | MaxZ: $maxZ | maxZFrame: $maxZFrame"

	#-------------------
	# UPDATE SYSTEM DATA
	#-------------------
	# read system data
	set sysDataAll [readSysDataFile]

	# parse out and update data for the current system
	# (record layout: {sysId maxZ {trajHist}}; the frame of the last trajectory
	#  history entry is replaced)
	set sysDataCurr [lsearch -inline -index 0 $sysDataAll $sysId]
	lset sysDataCurr 1 $maxZ
	lset sysDataCurr 2 end 1 $maxZFrame

	# write the updated data to file
	updateSysDataFile $sysDataCurr

	#------------------
	# UPDATE THE STATUS
	#------------------
	if { $maxZ > $AMS_thresholdMaxZ } {
		updateStatusFile [list $sysId "c" $maxZ]
	} else {
		updateStatusFile [list $sysId "s" $maxZ]
	}

	amslog info "Repair complete at: [exec date +%s]"
}

proc amslog {level msg} {
	# Minimal logger writing "<level>:\t<msg>" lines to the AMS log channel.
	#
	# With AMS_verbosity set to "debug" every message is written; with any
	# other value only info, warning and error messages are written.
	#
	# PASSED: log level (e.g., info, debug, warning, or error), message text
	# RETURNS: nothing

	global AMS_logOutput AMS_verbosity

	# decide first, write once
	set emit [expr {$AMS_verbosity eq "debug" || $level in {info warning error}}]
	if { $emit } {
		puts $AMS_logOutput "$level:\t$msg"
	}

	# flush unconditionally so the log is current even after a crash
	flush $AMS_logOutput
	return
}

#==============================================================================
# use a file-based data locking mechanism
#==============================================================================
# "locking" a file for one-on-one access generates a lock file (<filename>.lock)
# processes cannot lock a file that is currently locked

proc lockFile { fileName } {
	# Acquires the lock for fileName by creating <fileName>.lock.
	#
	# The lock file is opened with CREAT+EXCL, which fails while the file
	# already exists; on failure the proc sleeps for a random time below
	# AMS_lockInterval (ms) and tries again, without any retry limit.
	#
	# PASSED: name of the file to lock
	# RETURNS: nothing

	global AMS_lockInterval

	set lockPath ${fileName}.lock
	while { 1 } {
		# success: fid holds the channel of the freshly created lock file
		if { ![catch {open $lockPath {CREAT EXCL WRONLY}} fid] } { break }
		after [expr {int(rand()*$AMS_lockInterval)}]
	}

	# only the existence of the lock file matters, not its content
	close $fid
	return
}

proc unlockFile { fileName } {
	# Releases the lock on fileName by removing <fileName>.lock.
	#
	# PASSED: name of the file to unlock
	# RETURNS: nothing
	set lockPath ${fileName}.lock
	file delete $lockPath
	return
}

#==============================================================================


#==============================================================================
# data file reader / writers
#==============================================================================
# there are four files that store data and will require writing/reading/updating
# iterFile - tracks the iteration count (zero-padded integer)
# sysDataFile - contains detailed system data ({ {sysId, maxZ, {trajHist}} ... })
# statusFile - tracks the current status of each system ({ {sysId, status, maxZ} ...})
# refCountFile - tracks the reference count for each trajectory ({ {fileId, reference count} ...})

proc getNextIter {} {
	# Reads the iteration counter from the ITER file and stores the incremented
	# value back, keeping the zero-padded width of the value that was read.
	#
	# PASS: nothing
	# RETURN: the iteration number that was stored in the file before the
	#         increment (i.e., the one the caller should use)
	# ERRORS: any I/O or parse error is re-raised, but only after the channel
	#         has been closed and the lock released, so a failure in one
	#         instance cannot leave the other instances waiting on a stale lock

	# localize relevant global variables
	global AMS_iterFile

	# lock the file
	lockFile $AMS_iterFile

	set infile ""
	set failed [catch {
		# read in the iteration number, process, increment, and update in the file
		set infile [open $AMS_iterFile r+]
		set iter [string trim [gets $infile]]
		set iLength [string length $iter]
		seek $infile 0
		# scan with %d (rather than expr on the raw string) so that leading
		# zeros are not interpreted as an octal prefix
		puts $infile [format "%0${iLength}i" [expr {[scan $iter %d] + 1}]]
		close $infile
		set infile ""
	} errMsg errOpts]

	# clean up whatever is still open, then always release the lock
	if { $failed && $infile ne "" } { catch { close $infile } }
	unlockFile $AMS_iterFile

	if { $failed } { return -options $errOpts $errMsg }
	return $iter
}

proc readSysDataFile {} {
	# Reads the data file used to track system data.
	#
	# Each non-blank line of the file becomes one element of the result.  Blank
	# lines are skipped, and the last record is kept even when the file does
	# not end with a newline.
	#
	# PASS: nothing
	# RETURNS: full system data (list with one element per record line)
	# ERRORS: I/O errors are re-raised after the lock has been released

	# localize relevant global
	global AMS_sysFile

	# lock
	lockFile $AMS_sysFile

	set infile ""
	set failed [catch {
		set infile [open $AMS_sysFile r]
		set raw [read $infile]
		close $infile
		set infile ""
	} errMsg errOpts]

	# clean up whatever is still open, then always release the lock
	if { $failed && $infile ne "" } { catch { close $infile } }
	unlockFile $AMS_sysFile
	if { $failed } { return -options $errOpts $errMsg }

	# format: one record per line, records returned unmodified
	set data {}
	foreach line [split $raw "\n"] {
		if { [string trim $line] eq "" } { continue }
		lappend data $line
	}

	# return system data (full set)
	return $data
}

proc updateSysDataFile { sysData } {
	# Updates the system information for one system.
	#
	# The file is rewritten line by line into <sysFile>.tmp; every record whose
	# first element equals the id of sysData is replaced by sysData, all other
	# non-blank lines are copied.  The temporary file then replaces the
	# original.  If no record matches, the content is copied unchanged.
	#
	# PASS: system data to update {id maxZ {trajHist}}
	# RETURNS: nothing
	# ERRORS: I/O errors are re-raised after the channels have been closed and
	#         the lock released

	# localize relevant globals
	global AMS_sysFile

	# process input arguments
	set searchId [lindex $sysData 0]

	# lock
	lockFile $AMS_sysFile

	set infile ""; set outfile ""
	set failed [catch {
		# open files for IO
		set infile [open $AMS_sysFile r]
		set outfile [open ${AMS_sysFile}.tmp w]

		# read through the file
		while { ![eof $infile] } {
			set inline [string trim [gets $infile]]
			if { $inline eq "" } { continue }
			if { [lindex $inline 0] eq $searchId } {
				puts $outfile $sysData
			} else {
				puts $outfile $inline
			}
		}

		# clean up
		close $infile; set infile ""
		close $outfile; set outfile ""
		file rename -force ${AMS_sysFile}.tmp $AMS_sysFile
	} errMsg errOpts]

	# on failure close whatever is still open; the lock is released either way
	if { $failed } {
		foreach ch [list $infile $outfile] {
			if { $ch ne "" } { catch { close $ch } }
		}
	}

	# unlock
	unlockFile $AMS_sysFile

	if { $failed } { return -options $errOpts $errMsg }
	return
}

proc updateStatusFile { statusData } {
	# Updates the status of one system id in the status file.
	#
	# The file is rewritten line by line into <statusFile>.tmp; every record
	# whose first element equals the id of statusData is replaced by
	# statusData, all other non-blank lines are copied.  The temporary file
	# then replaces the original.  If no record matches, the content is copied
	# unchanged.
	#
	# PASSED: status data to update, { sysId s|p|r|c maxZ }
	# RETURNS: nothing
	# ERRORS: I/O errors are re-raised after the channels have been closed and
	#         the lock released

	# localize relevant globals
	global AMS_statusFile

	# parse input
	set searchId [lindex $statusData 0]

	# lock
	lockFile $AMS_statusFile

	set infile ""; set outfile ""
	set failed [catch {
		# setup file IO
		set infile [open $AMS_statusFile r]
		set outfile [open ${AMS_statusFile}.tmp w]

		# read through file
		while { ![eof $infile] } {
			set inline [string trim [gets $infile]]
			if { $inline eq "" } { continue }
			if { [lindex $inline 0] eq $searchId } {
				puts $outfile $statusData
			} else {
				puts $outfile $inline
			}
		}

		# clean up
		close $infile; set infile ""
		close $outfile; set outfile ""
		file rename -force ${AMS_statusFile}.tmp $AMS_statusFile
	} errMsg errOpts]

	# on failure close whatever is still open; the lock is released either way
	if { $failed } {
		foreach ch [list $infile $outfile] {
			if { $ch ne "" } { catch { close $ch } }
		}
	}

	# unLock
	unlockFile $AMS_statusFile

	if { $failed } { return -options $errOpts $errMsg }
	return
}

proc modifyReferenceCount { trajId modVal } {
	# Modifies (increments or decrements) the reference count associated with
	# a trajectory. Every entry whose count is 0 after the update is dropped
	# from the reference count file and all files/directories matching
	# "<id>.*" are deleted.

	# PASS: file identifier to modify, count change (added to the stored count)
	# RETURNS: nothing

	# localize relevant globals
	global AMS_refCountFile

	# lock
	lockFile $AMS_refCountFile

	# setup file IO; the updated table is written to a temporary file
	set infile [open $AMS_refCountFile r]
	set outfile [open ${AMS_refCountFile}.tmp w]

	# read through the file
	while { ![eof $infile] } {
		lassign [gets $infile] id count
		# skip blank lines (including the empty read at end of file)
		if { [llength $id] == 0 } { continue }

		# apply the change to the matching entry
		if { $id eq $trajId } { incr count $modVal }

		# any files/directories that have zero reference count are deleted
		if { $count == 0 } {
			# -nocomplain: without it glob raises an error when nothing matches,
			# which would abort this proc with the lock still held and the
			# channels left open
			foreach ele [glob -nocomplain -- $id.*] { file delete -force -- $ele }
		} else {
			puts $outfile [list $id $count]
		}
	}

	# clean up
	close $infile; close $outfile
	file rename -force ${AMS_refCountFile}.tmp $AMS_refCountFile

	# unLock
	unlockFile $AMS_refCountFile
}

proc newReferenceCount { trajId } {
	# Registers a new trajectory identifier in the reference count file
	# with an initial count of 1.

	# PASS: file identifier to track
	# RETURNS: nothing

	global AMS_refCountFile

	# the new table entry: identifier plus starting count
	set entry [list $trajId 1]

	# append it while holding the file lock
	lockFile $AMS_refCountFile
	set fh [open $AMS_refCountFile a]
	puts $fh $entry
	close $fh
	unlockFile $AMS_refCountFile

	return
}

#==============================================================================
# NAMD instance control
#==============================================================================

proc getKillIds {} {
	# Reads the status file and selects the replicas to kill.
	#
	# PASS: nothing
	# RETURN: two-element list { killSysIds killLevel }
	#   killSysIds - ids of every system whose z value (third status field)
	#                equals one of the AMS_toKill lowest z values
	#   killLevel  - the highest of those AMS_toKill lowest z values
	#
	# termAMS (defined elsewhere) is called when there are fewer static
	# systems than AMS_toKill, when one of the kill z values exceeds
	# AMS_thresholdMaxZ, or when no system would be left to restart from.
	# NOTE(review): the code after each termAMS call presumably relies on
	# termAMS not returning -- confirm.

	global AMS_statusFile AMS_sharedDataPrefix AMS_toKill AMS_thresholdMaxZ

	# the status file stays locked until just before returning/terminating
	lockFile $AMS_statusFile

	# load the status table, one element per line
	set fh [open $AMS_statusFile r]
	set statusRows [lrange [split [read $fh] "\n"] 0 end-1]
	close $fh

	# CURRENTLY WE CANT HANDLE TERMINATED SYSTEMS
	# if there was a system that was terminated, restart it
	#if {  [lsearch -index 1 $data "t"] != -1 } {
	#	set restartSysId [lindex [lsearch -index 1 $data "t"] 0]
	#	set restartDatFile [glob *.${restartSysId}.*.AMSRESTART.dat]
	#	lockFile $restartDatFile
	#	set infile $restartDatFile
	#	lassign [gets $infile] AMS_outputPrefix iter sysId currZ maxZ maxZFrame
	#	close $infile
	#	unlockFile $restartDatFile
	#	# setup NAMD to restart
	#	# need some sort of flag or way of communicating to initInstance
	#	# that we need a simple restart, not a kill/restart
	#
	#}

	# with fewer static ("s") systems than replicas to kill, shut AMS down
	set numStatic [llength [lsearch -index 1 -all $statusRows "s"]]
	if { $numStatic < $AMS_toKill } {
		unlockFile $AMS_statusFile
		termAMS
	}

	# order by z, largest first, so the lowest values sit at the end
	set survivors [lsort -real -index 2 -decreasing $statusRows]

	# collect the AMS_toKill lowest z values, walking backwards from the end
	set zKillList {}
	set ind 0
	while { $ind < $AMS_toKill } {
		set zKill [lindex $survivors end-$ind 2]
		lappend zKillList $zKill
		# a kill level beyond the max threshold means we are done
		if { $zKill > $AMS_thresholdMaxZ } {
			unlockFile $AMS_statusFile
			termAMS
		}
		incr ind
	}
	# the last value collected is the kill level reported to the caller
	set killLevel $zKill

	# move every system sitting at one of the kill z values from the
	# survivors into the kill list
	set killSysIds {}
	foreach level $zKillList {
		set kept {}
		foreach row $survivors {
			if { [lindex $row 2] == $level } {
				lappend killSysIds [lindex $row 0]
			} else {
				lappend kept $row
			}
		}
		set survivors $kept
	}

	# whatever is left over is what a restart could be drawn from
	set restartSysIds {}
	foreach row $survivors {
		lappend restartSysIds [lindex $row 0]
	}
	# nothing left to restart from: terminate AMS
	if { [llength $restartSysIds] == 0 } {
		unlockFile $AMS_statusFile
		amslog info "shutting down AMS on insuffisant restart replicas ..."
		termAMS
	}

	# release the status file and report the selection
	unlockFile $AMS_statusFile
	return [list $killSysIds $killLevel]
}


proc waitForStart { sysId } {
	# Blocks until sysId is marked static ("s") in the status file, then
	# changes its status to pending ("p") and returns.
	#
	# PASS: id of the system to wait on
	# RETURNS: nothing
	#
	# The status file is polled every AMS_pollInterval milliseconds. Before
	# each poll the elapsed wall time is compared against AMS_wallTimeLimit;
	# when the limit is exceeded termAMS is called so that no new simulation
	# is started shortly before the job would run out of wall clock time.

	# localize relevant global variables
	global AMS_statusFile AMS_pollInterval AMS_startTime AMS_wallTimeLimit

	while { 1 } {

		# current time in epoch seconds; the built-in clock command gives the
		# same value as "date +%s" without spawning a process on every poll
		set currTime [clock seconds]
		if { ($currTime - $AMS_startTime) > $AMS_wallTimeLimit } { termAMS }

		# lock the status file
		lockFile $AMS_statusFile

		# read in the system status data
		set infile [open $AMS_statusFile r]
		set data [lrange [split [read $infile] "\n"] 0 end-1]
		close $infile

		# get status of sysId
		set sysData [lsearch -inline -index 0 $data $sysId]
		set status [lindex $sysData 1]

		# if the status isn't static, release the lock and keep waiting
		if { $status ne "s" } {
			unlockFile $AMS_statusFile
			after $AMS_pollInterval
			continue
		}

		# the status is static: reset it to pending (p) and rewrite the file
		amslog info "AMS: restarting sysID: $sysId at $currTime"
		lset sysData 1 "p"
		set outfile [open ${AMS_statusFile}.tmp w]
		foreach ele $data {
			if { [lindex $ele 0] != $sysId } {
				puts $outfile $ele
			} else {
				puts $outfile $sysData
			}
		}
		close $outfile
		file rename -force ${AMS_statusFile}.tmp $AMS_statusFile
		unlockFile $AMS_statusFile
		return
	}
}


proc initInstance { sysId killIds iter count killLevel} {
	# Initializes the NAMD instance for a killed system by branching it from
	# a randomly chosen system that is not being killed.
	#
	# PASS: sysId     - id of the system being killed and resampled
	#       killIds   - ids of all systems being killed (never used as restart source)
	#       iter      - iteration number, part of the new trajectory id
	#       count     - counter, part of the new trajectory id
	#       killLevel - z level; the branch point is the first zdat entry of the
	#                   restart system's last trajectory with z above this level
	# RETURN: { sysId maxZ } -- system id (killed), z of the branch frame
	#         (initial z of the restart)
	# ERRORS: raises an error when no restart candidate exists or when the
	#         zdat file holds no entry above killLevel

	# debug
	amslog debug "AMS: Initializing iteration at [clock seconds]"

	# localize global variables
	global AMS_sharedDataPrefix AMS_suppressDCD

	# read all system data
	set sysDataAll [readSysDataFile]

	# collect every system id that is not being killed; checked before any
	# reference count is touched so that a failure leaves the counts unchanged
	set possibleRestartList {}
	foreach ele $sysDataAll {
		set tmpid [lindex $ele 0]
		if { [lsearch $killIds $tmpid] < 0 } { lappend possibleRestartList $tmpid }
	}
	if { [llength $possibleRestartList] == 0 } {
		error "AMS: no restart candidate available for sysId $sysId (all systems are in the kill list)"
	}

	# parse out data for the system we're to kill (keyed by sysId)
	set sysDataKill [lsearch -inline -index 0 $sysDataAll $sysId]

	# decrement all files in the trajectory history of the system we're killing
	foreach ele [lindex $sysDataKill 2] { modifyReferenceCount [lindex $ele 0] -1 }

	# randomly choose another system id to restart and parse out that system data
	set sysIdRestart [lindex $possibleRestartList [expr {int(rand()*[llength $possibleRestartList])}]]
	unset possibleRestartList
	set sysDataRestart [lsearch -inline -index 0 $sysDataAll $sysIdRestart]

	# copy the trajectory history from the restart system, and increment the ref count for each entry
	set trajHistNew [lindex $sysDataRestart 2]
	foreach ele $trajHistNew { modifyReferenceCount [lindex $ele 0] 1 }

	# read in the zdat file; frames are 0-based
	set zDatName [lindex $trajHistNew end 0].zdat
	amslog debug "AMS: Searching for restart coordinates (frame) from $zDatName"
	set zDatFile [open $zDatName r]
	while { [gets $zDatFile inline] >= 0 } {
		set inline [string trim $inline]
		if { $inline eq "" } { continue }
		# NOTE : when we're restarting from N_restart minus N_kill, the condition
		# is the same whether we compare with '>' or '>='
		if { [lindex $inline 1] > $killLevel } {
			# note: we are intentionally taking the frame AFTER z is surpassed
			set frameNum [lindex $inline 0]
			set maxZ [lindex $inline 1]
			amslog debug "AMS: Frame FOUND! (frame = [lindex $inline 0], z = [lindex $inline 1])"
			break
		}
	}
	close $zDatFile

	# without a branch frame there is nothing to restart from; fail with a
	# clear message instead of an unset-variable error further down
	if { ![info exists frameNum] } {
		error "AMS: no frame with z above kill level $killLevel found in $zDatName"
	}

	# change the terminal frame of the last traj history file to the found branch point
	lset trajHistNew end 1 $frameNum

	# append the traj id for the new simulation to the traj history and reference count file
	lappend trajHistNew [list ${AMS_sharedDataPrefix}.${iter}.${count} 0]
	newReferenceCount ${AMS_sharedDataPrefix}.${iter}.${count}

	# update the sysData file
	updateSysDataFile [list $sysId ${maxZ} $trajHistNew]

	# NAMD-specific commands
	# load coordinates, velocities, cell information from appropriate frame of the .frames data
	# (end-1 is the branched-from trajectory; the new entry was just appended)
	reinitatoms [file join [lindex $trajHistNew end-1 0].frames f.${frameNum}]
	firsttimestep 0
	run 0

	# if dcd suppression isn't turned on, open the dcd file for writing
	if { !$AMS_suppressDCD } { DCDfile ${AMS_sharedDataPrefix}.${iter}.${count}.dcd }

	# activate sysId in the statusFile
	updateStatusFile [list $sysId "r" ${maxZ}]

	# debug
	amslog debug "AMS: Initialization complete"

	# return relevant information
	return [list $sysId ${maxZ}]
}

proc runInstance { sysId initZ iter count} {
	# Advances the NAMD instance in chunks of AMS_stepSize steps for as long
	# as [zone] reports 0, recording the reaction coordinate along the way.
	#
	# PASS: system id, initial z value, iteration number and counter
	#       (iteration and counter form the trajectory id of the output files)
	# RETURNS: { maxZ maxZFrame } -- maximum observed Z value, frame number of max Z value
	#
	# Output (all prefixed with <AMS_sharedDataPrefix>.<iter>.<count>):
	#   .zdat   - frame number and z for frame 0 and for every new maximum z
	#   .traj   - frame number and [variables] for every frame
	#   .frames - directory with restart data for the frames listed in .zdat

	amslog debug "AMS: starting system. SysId: $sysId | Iteration: $iter | Initial Z: $initZ"

	global AMS_sharedDataPrefix AMS_thresholdMinZ AMS_thresholdMaxZ AMS_stepSize AMS_stepsPerCycle
	global AMS_suppressDCD AMS_ATS AMS_ATS_proc

	# bookkeeping for the running maximum
	set maxZ $initZ
	set currZ $initZ
	set maxZFrame 0
	set frameNum 0

	# output channels and frame directory for this trajectory
	set trajId ${AMS_sharedDataPrefix}.${iter}.${count}
	set zLog [open ${trajId}.zdat w]
	set trajLog [open ${trajId}.traj w]
	set frameDir ${trajId}.frames
	file mkdir $frameDir

	# record frame 0 in all three outputs
	puts $zLog "${frameNum}\t${currZ}"
	flush $zLog
	puts $trajLog "${frameNum}\t[variables]"
	flush $trajLog
	output [file join $frameDir f.${frameNum}]
	incr frameNum

	set where [zone]
	while { $where == 0 } {

		# Adaptive Timestepping: derive the step size from the current z value
		if { $AMS_ATS } {
			# NOTE: stepsize must be 1) positive integer, 2) multiple of stepspercycle
			set css [eval $AMS_ATS_proc $currZ]
			# truncate to a whole number of cycles
			set AMS_stepSize [expr {(int($css) / $AMS_stepsPerCycle) * $AMS_stepsPerCycle}]
			if { $AMS_stepSize >= $AMS_stepsPerCycle } {
				amslog debug "ATS: CurrZ: $currZ,  Computed Stepsize: $css,  Actual Stepsize: $AMS_stepSize"
			} else {
				# computed step size fell below one cycle: clamp and warn
				set AMS_stepSize $AMS_stepsPerCycle
				amslog warning "ATS: stepsize below NAMD stepspercycle threshold, resetting to stepspercycle. (CurrZ: $currZ,  Computed Stepsize: $css)"
			}

			# keep the dcd output frequency in step unless dcd output is suppressed
			if { !$AMS_suppressDCD } { DCDFreq $AMS_stepSize }
		}

		# advance the simulation, then refresh the zone and the z value
		run $AMS_stepSize
		set where [zone]
		set currZ [ams_measure]
		#set currZ [colvarvalue ams_measure]
		amslog info "AMS sys: $sysId , currZ: $currZ"

		# restart frame data is only written for new maximum z values
		if { $currZ > $maxZ } {
			puts $zLog "$frameNum\t$currZ"
			flush $zLog
			# write frame restart data to .frames (requires NAMD 2.9 or higher)
			output [file join $frameDir f.${frameNum}]

			set maxZ $currZ
			set maxZFrame $frameNum
		}

		# the traj file receives every frame (whole trajectory, used for the flux)
		puts $trajLog "${frameNum}\t[variables]"
		flush $trajLog

		# publish the current maximum in the status file
		updateStatusFile [list $sysId "r" $maxZ]

		# frameNum is 0-based; assumes dcdfreq = stepSize
		incr frameNum
	}

	amslog debug "AMS: Termination criteria met at Z: $currZ ($frameNum steps)"

	# close the outputs and report the maximum
	close $zLog
	close $trajLog
	return [list $maxZ $maxZFrame]
}

proc termInstance { sysId maxZ maxZFrame } {
	# Records the outcome of a finished simulation in the system data file
	# and in the status file.

	# PASSED: system id, max Z value observed, frame number of the max Z value
	# RETURNS: nothing

	global AMS_thresholdMaxZ AMS_sharedDataPrefix AMS_Brep

	amslog info "AMS: Terminating iteration with final data -- sysId: $sysId | MaxZ: $maxZ | maxZFrame: $maxZFrame"

	#-------------------
	# UPDATE SYSTEM DATA
	#-------------------
	# pull this system's record, store the new max z and set the terminal
	# frame of the last trajectory history entry, then write it back
	set sysRecord [lsearch -inline -index 0 [readSysDataFile] $sysId]
	lset sysRecord 1 $maxZ
	lset sysRecord 2 end 1 $maxZFrame
	updateSysDataFile $sysRecord

	#------------------
	# UPDATE THE STATUS
	#------------------
	if { $maxZ <= $AMS_thresholdMaxZ } {
		# max threshold not exceeded: the system is marked static
		updateStatusFile [list $sysId "s" $maxZ]
	} elseif { [zone] == 1 } {
		# threshold exceeded and zone 1: counted as having reached B
		updateStatusFile [list $sysId "b" $maxZ]
		amslog info "AMS: sysId $sysId terminated and reached B"
		incr AMS_Brep
	} else {
		# threshold exceeded in any other zone
		updateStatusFile [list $sysId "c" $maxZ]
		amslog info "AMS: sysId $sysId terminated without reaching B"
	}

	amslog info "AMS: Iteration complete at: [exec date +%s]"
}

#==============================================================================
