Difference between revisions of "RAL HTCondor Multicore Jobs Configuration"
From GridPP Wiki
(Created page with "== Defrag daemon setup == Our current configuration for the defrag daemon is: DAEMON_LIST = $(DAEMON_LIST) DEFRAG DEFRAG_INTERVAL = 600 DEFRAG_DRAINING_MACHINES_PER_HOUR = ...") |
|||
Line 24: | Line 24: | ||
MAX_DEFRAG_LOG = 104857600 | MAX_DEFRAG_LOG = 104857600 | ||
MAX_NUM_DEFRAG_LOG = 10 | MAX_NUM_DEFRAG_LOG = 10 | ||
+ | |||
+ | == Adjusting draining based on demand == | ||
+ | |||
+ | The following script is run as a cron every 20 minutes. | ||
+ | #!/bin/bash | ||
+ | # Change condor_defrag daemon parameters depending on how many idle and running multicore jobs | ||
+ | # there are.<br> | ||
+ | function setDefrag () { | ||
+ | # Get the address of the defrag daemon | ||
+ | defrag_address=$(condor_status -any -autoformat MyAddress -constraint 'MyType =?= "Defrag"') | ||
+ | |||
+ | # Log | ||
+ | echo "Setting DEFRAG_MAX_CONCURRENT_DRAINING=$3, DEFRAG_DRAINING_MACHINES_PER_HOUR=$4, DEFRAG_MAX_WHOLE_MACHINES=$5 (idle multicore=$1, running multicore=$2)" | ||
+ | |||
+ | # Set configuration | ||
+ | /usr/bin/condor_config_val -address "$defrag_address" -rset "DEFRAG_MAX_CONCURRENT_DRAINING = $3" >& /dev/null | ||
+ | /usr/bin/condor_config_val -address "$defrag_address" -rset "DEFRAG_DRAINING_MACHINES_PER_HOUR = $4" >& /dev/null | ||
+ | /usr/bin/condor_config_val -address "$defrag_address" -rset "DEFRAG_MAX_WHOLE_MACHINES = $5" >& /dev/null | ||
+ | /usr/sbin/condor_reconfig -daemon defrag >& /dev/null | ||
+ | }<br> | ||
+ | # Get total number of idle multicore jobs | ||
+ | idle_jobs=$(condor_q -global -constraint 'RequestCpus == 8 && JobStatus == 1' -autoformat ClusterId | wc -l)<br> | ||
+ | # Get total number of running multicore jobs | ||
+ | running_jobs=$(condor_q -global -constraint 'RequestCpus == 8 && JobStatus == 2' -autoformat ClusterId | wc -l)<br> | ||
+ | if [ $idle_jobs -gt 20 ] && [ $running_jobs -lt 190 ] | ||
+ | then | ||
+ | setDefrag $idle_jobs $running_jobs 60 40 300 | ||
+ | elif [ $idle_jobs -gt 20 ] && [ $running_jobs -gt 190 ] | ||
+ | then | ||
+ | setDefrag $idle_jobs $running_jobs 8 8 300 | ||
+ | else | ||
+ | setDefrag $idle_jobs $running_jobs 1 1 4 | ||
+ | fi |
Revision as of 08:33, 21 May 2014
Defrag daemon setup
Our current configuration for the defrag daemon is:
DAEMON_LIST = $(DAEMON_LIST) DEFRAG DEFRAG_INTERVAL = 600 DEFRAG_DRAINING_MACHINES_PER_HOUR = 30.0 DEFRAG_MAX_CONCURRENT_DRAINING = 60 DEFRAG_MAX_WHOLE_MACHINES = 300 DEFRAG_SCHEDULE = graceful
## Allow some defrag configuration to be settable DEFRAG.SETTABLE_ATTRS_ADMINISTRATOR = DEFRAG_MAX_CONCURRENT_DRAINING,DEFRAG_DRAINING_MACHINES_PER_HOUR,DEFRAG_MAX_WHOLE_MACHINES ENABLE_RUNTIME_CONFIG = TRUE
## Which machines are more desirable to drain DEFRAG_RANK = ifThenElse(Cpus >= 8, -10, (TotalCpus - Cpus)/(8.0 - Cpus))
# Definition of a "whole" machine: # - anything with 8 cores (since multicore jobs only need 8 cores, don't need to drain whole machines with > 8 cores) # - must be configured to actually start new jobs (otherwise machines which are deliberately being drained will be included) DEFRAG_WHOLE_MACHINE_EXPR = ((Cpus == TotalCpus) || (Cpus >= 8)) && StartJobs =?= True
# Decide which machines to drain # - must not be cloud machines # - must be healthy # - must be configured to actually start new jobs DEFRAG_REQUIREMENTS = PartitionableSlot && Offline =!= True && RalCluster =!= "wn-cloud" && StartJobs =?= True && NODE_IS_HEALTHY =?= True
## Logs MAX_DEFRAG_LOG = 104857600 MAX_NUM_DEFRAG_LOG = 10
Adjusting draining based on demand
The following script is run as a cron every 20 minutes.
#!/bin/bash
# Change condor_defrag daemon parameters depending on how many idle and
# running multicore jobs there are. Intended to run from cron every 20 minutes.

#######################################
# Push new defrag settings to the running condor_defrag daemon.
# Arguments:
#   $1 - number of idle multicore jobs (logging only)
#   $2 - number of running multicore jobs (logging only)
#   $3 - DEFRAG_MAX_CONCURRENT_DRAINING
#   $4 - DEFRAG_DRAINING_MACHINES_PER_HOUR
#   $5 - DEFRAG_MAX_WHOLE_MACHINES
# Outputs: one log line to stdout (captured by cron)
#######################################
function setDefrag () {
    # Get the address of the defrag daemon
    defrag_address=$(condor_status -any -autoformat MyAddress -constraint 'MyType =?= "Defrag"')

    # Log
    echo "Setting DEFRAG_MAX_CONCURRENT_DRAINING=$3, DEFRAG_DRAINING_MACHINES_PER_HOUR=$4, DEFRAG_MAX_WHOLE_MACHINES=$5 (idle multicore=$1, running multicore=$2)"

    # Set configuration at runtime (these attrs must be listed in
    # DEFRAG.SETTABLE_ATTRS_ADMINISTRATOR with ENABLE_RUNTIME_CONFIG = TRUE).
    # NOTE: '>/dev/null 2>&1' replaces the original csh-style '>&' redirection.
    /usr/bin/condor_config_val -address "$defrag_address" -rset "DEFRAG_MAX_CONCURRENT_DRAINING = $3" > /dev/null 2>&1
    /usr/bin/condor_config_val -address "$defrag_address" -rset "DEFRAG_DRAINING_MACHINES_PER_HOUR = $4" > /dev/null 2>&1
    /usr/bin/condor_config_val -address "$defrag_address" -rset "DEFRAG_MAX_WHOLE_MACHINES = $5" > /dev/null 2>&1
    /usr/sbin/condor_reconfig -daemon defrag > /dev/null 2>&1
}

# Get total number of idle multicore jobs
idle_jobs=$(condor_q -global -constraint 'RequestCpus == 8 && JobStatus == 1' -autoformat ClusterId | wc -l)

# Get total number of running multicore jobs
running_jobs=$(condor_q -global -constraint 'RequestCpus == 8 && JobStatus == 2' -autoformat ClusterId | wc -l)

# Choose draining aggressiveness from demand.
# BUG FIX: the original tested '-lt 190' then '-gt 190', so exactly 190
# running jobs with high idle demand fell through to the minimal-draining
# branch; '-ge 190' closes that gap.
if [ "$idle_jobs" -gt 20 ] && [ "$running_jobs" -lt 190 ]; then
    # High demand, few running: drain aggressively
    setDefrag "$idle_jobs" "$running_jobs" 60 40 300
elif [ "$idle_jobs" -gt 20 ] && [ "$running_jobs" -ge 190 ]; then
    # High demand but many already running: drain moderately
    setDefrag "$idle_jobs" "$running_jobs" 8 8 300
else
    # Little or no multicore demand: keep draining minimal
    setDefrag "$idle_jobs" "$running_jobs" 1 1 4
fi