@@ -38,10 +38,10 @@ function die() {
3838 if [[ -n " $NHC_DETACHED " ]]; then
3939 echo " $RET $* " > $RESULTFILE
4040 elif [[ " $NHC_RM " == " sge" ]]; then
41- echo " begin"
42- echo " $HOSTNAME :healthy:false"
43- echo " $HOSTNAME :diagnosis:NHC: $* "
44- echo " end"
41+ echo " begin" > $NHC_FD_OUT
42+ echo " $HOSTNAME :healthy:false" > $NHC_FD_OUT
43+ echo " $HOSTNAME :diagnosis:NHC: $* " > $NHC_FD_OUT
44+ echo " end" > $NHC_FD_OUT
4545 return 77
4646 elif [[ -n " $LOGFILE " ]]; then
4747 oecho " ERROR: $NAME : Health check failed: $* "
@@ -51,7 +51,7 @@ function die() {
5151 return 0
5252 fi
5353 kill_watchdog
54- [[ -n " $LOGFILE " ]] && exec 1>&3 - 2>&4 -
54+ [[ $NHC_FD_OUT -eq 3 ]] && exec 1>&3 - 2>&4 -
5555 exit $RET
5656}
5757
@@ -91,11 +91,7 @@ function oecho() {
9191
9292 if [[ " $SILENT " == " 0" ]]; then
9393 [[ $TS -ne 0 ]] && PREFIX=" [$SECONDS ] - "
94- if [[ -n " $LOGFILE " ]]; then
95- echo " $PREFIX $@ " >&3
96- else
97- echo " $PREFIX $@ "
98- fi
94+ echo " $PREFIX $@ " >& $NHC_FD_OUT
9995 fi
10096}
10197
@@ -105,11 +101,7 @@ function eecho() {
105101
106102 if [[ " $SILENT " == " 0" ]]; then
107103 [[ $TS -ne 0 ]] && PREFIX=" [$SECONDS ] - "
108- if [[ -n " $LOGFILE " ]]; then
109- echo " $PREFIX $@ " >&4
110- else
111- echo " $PREFIX $@ "
112- fi
104+ echo " $PREFIX $@ " >& $NHC_FD_ERR
113105 fi
114106}
115107
@@ -119,11 +111,7 @@ function vecho() {
119111
120112 if [[ " $VERBOSE " == " 1" ]]; then
121113 [[ $TS -ne 0 ]] && PREFIX=" [$SECONDS ] - "
122- if [[ -n " $LOGFILE " ]]; then
123- echo " $PREFIX $@ " >&3
124- else
125- echo " $PREFIX $@ "
126- fi
114+ echo " $PREFIX $@ " >& $NHC_FD_OUT
127115 fi
128116}
129117
@@ -171,8 +159,10 @@ function nhcmain_init_env() {
171159 WATCHDOG_PID=0
172160 FAIL_CNT=0
173161 FORCE_SETSID=0
162+ NHC_FD_OUT=1
163+ NHC_FD_ERR=2
174164 export PATH SYSCONFIGDIR LIBEXECDIR HOSTNAME HOSTNAME_S RET LOGGER_TEXT
175- export NHC_PID NHC_START_TS WATCHDOG_PID FAIL_CNT FORCE_SETSID
165+ export NHC_PID NHC_START_TS WATCHDOG_PID FAIL_CNT FORCE_SETSID NHC_FD_OUT NHC_FD_ERR
176166
177167 # Users may override this in /etc/sysconfig/nhc.
178168 NAME=${0/#* \/ }
@@ -294,12 +284,15 @@ function nhcmain_finalize_env() {
294284 DETACHED_MODE=${DETACHED_MODE:- 0}
295285 DETACHED_MODE_FAIL_NODATA=${DETACHED_MODE_FAIL_NODATA:- 0}
296286 TIMEOUT=${TIMEOUT:- 10}
297- MAX_SYS_UID=${MAX_SYS_UID:- 99}
298287 NHC_CHECK_ALL=${NHC_CHECK_ALL:- 0}
299288 NHC_CHECK_FORKED=${NHC_CHECK_FORKED:- 0}
300289 FORCE_SETSID=${FORCE_SETSID:- 0}
301290 export NHC_SID=0
302291
292+ # Set from system defaults if present.
293+ [[ -z " $MAX_SYS_UID " ]] && nhc_common_get_max_sys_uid
294+ MAX_SYS_UID=${MAX_SYS_UID:- 99}
295+
303296 # Check for session leader.
304297 kill -s 0 -- -$NHC_PID > /dev/null 2>&1
305298 if [[ $? -eq 0 ]]; then
@@ -369,26 +362,27 @@ function nhcmain_find_rm() {
369362 if [[ -d /var/spool/torque ]]; then
370363 NHC_RM=" pbs"
371364 return 0
365+ elif [[ -n " $SGE_ROOT " && -x " $SGE_ROOT /util/arch" ]]; then
366+ # SGE binaries typically won't be on the path defined above in the
367+ # load sensor environment, but SGE_ROOT will be there.
368+ NHC_RM=" sge"
369+ fi
370+
371+ # Search PATH for commands
372+ if type -a -p -f -P pbsnodes >& /dev/null ; then
373+ NHC_RM=" pbs"
374+ return 0
375+ elif type -a -p -f -P scontrol >& /dev/null ; then
376+ NHC_RM=" slurm"
377+ return 0
378+ elif type -a -p -f -P badmin >& /dev/null ; then
379+ NHC_RM=" lsf"
380+ return 0
381+ elif type -a -p -f -P qselect >& /dev/null ; then
382+ NHC_RM=" sge"
383+ return 0
372384 fi
373385
374- IFS=' :'
375- DIRLIST=( $PATH )
376- IFS=$' \t\n '
377- for DIR in " ${DIRLIST[@]} " ; do
378- if [[ -x " $DIR /pbsnodes" ]]; then
379- NHC_RM=" pbs"
380- return 0
381- elif [[ -x " $DIR /scontrol" ]]; then
382- NHC_RM=" slurm"
383- return 0
384- elif [[ -x " $DIR /badmin" ]]; then
385- NHC_RM=" lsf"
386- return 0
387- elif [[ -x " $DIR /qselect" ]]; then
388- NHC_RM=" sge"
389- return 0
390- fi
391- done
392386 if [[ -z " $NHC_RM " ]]; then
393387 log " Unable to detect resource manager."
394388 return 1
@@ -407,6 +401,8 @@ function nhcmain_redirect_output() {
407401 exit 1
408402 else
409403 dbg " Output redirected per LOGFILE variable $LOGFILE "
404+ NHC_FD_OUT=3
405+ NHC_FD_ERR=4
410406 fi
411407 fi
412408}
@@ -506,7 +502,7 @@ function nhcmain_detach() {
506502 nhcmain_redirect_output
507503 ELAPSED=$(( SECONDS- NHC_START_TS))
508504 vlog " Node Health Check detached parent completed successfully (${ELAPSED} s)."
509- [[ -n " $LOGFILE " ]] && exec 1>&3 - 2>&4 -
505+ [[ $NHC_FD_OUT -eq 3 ]] && exec 1>&3 - 2>&4 -
510506 exit 0
511507}
512508
@@ -565,14 +561,14 @@ function nhcmain_finish() {
565561 ELAPSED=$(( SECONDS- NHC_START_TS))
566562 vlog " Node Health Check completed successfully (${ELAPSED} s)."
567563 if [[ " $NHC_RM " == " sge" ]]; then
568- echo " begin"
569- echo " $HOSTNAME :healthy:true"
570- echo " $HOSTNAME :diagnosis:HEALTHY"
571- echo " end"
564+ echo " begin" > $NHC_FD_OUT
565+ echo " $HOSTNAME :healthy:true" > $NHC_FD_OUT
566+ echo " $HOSTNAME :diagnosis:HEALTHY" > $NHC_FD_OUT
567+ echo " end" > $NHC_FD_OUT
572568 return 0
573569 fi
574570 kill_watchdog
575- [[ -n " $LOGFILE " ]] && exec 1>&3 - 2>&4 -
571+ [[ $NHC_FD_OUT -eq 3 ]] && exec 1>&3 - 2>&4 -
576572 exit 0
577573}
578574
0 commit comments