Commit 8e66a2e6 authored by Maiken
Browse files

Merge branch 'slurm_patch_bug_3973' into 'master'

SLURM cpu and endtime errors (Fixes BUGZ-3973)

See merge request nordugrid/arc!1208
parents e0929455 6ae5b84b
......@@ -154,6 +154,36 @@ if [ ! -z "$perflogdir" ]; then
echo "[`date +%Y-%m-%d\ %T`] scan-slurm-job, squeue -a -h -o %i:%T -t all -j: $t" >> $perflogfile
fi
# A special version of interval_to_seconds for Slurm (written for v20.02).
#
# Converts a Slurm time interval to whole seconds and stores the result in
# the global variable return_interval_seconds.
#
# Accepted forms (a trailing fraction ".uuu" is always dropped):
#   ss, mm:ss, hh:mm:ss            plain fields, aligned on the seconds
#   dd:hh:mm:ss                    days given as a fourth colon field
#   dd-hh, dd-hh:mm, dd-hh:mm:ss   days given with a dash; the field right
#                                  after the dash is always the hours
# More than one dash (e.g. a hypothetical [yy-][mm-] prefix in front of the
# days) is not understood and is reported as a formatting error.
#
# Arguments:
#   $1 - the interval string
# Result (in return_interval_seconds):
#   - the number of seconds as a plain decimal integer;
#   - empty when $1 is empty, so that callers which test the value with
#     [ -n ... ] skip it;
#   - 0 when $1 is malformed; a diagnostic is then written to stderr.
slurm_interval_to_seconds () {
    return_interval_seconds=0
    _interval_dhms=$1
    _interval_days=
    _interval_good=yes
    _interval_secs=

    if [ -z "$_interval_dhms" ]; then
        # Nothing to convert: hand back an empty value rather than 0.
        return_interval_seconds=
        unset _interval_dhms _interval_days _interval_good _interval_secs
        return 0
    fi

    # Drop the fractional seconds (".uuu"). Done with parameter expansion so
    # that no GNU-only sed syntax is needed.
    case $_interval_dhms in
        *.*)
            case ${_interval_dhms##*.} in
                '' | *[!0-9]*) _interval_good= ;;
                *) _interval_dhms=${_interval_dhms%.*} ;;
            esac
            ;;
    esac

    # Split off a "dd-" day prefix. It is kept apart from the colon fields
    # because with a day prefix the first remaining field is hours, even
    # when minutes and/or seconds are omitted.
    case $_interval_dhms in
        *-*)
            _interval_days=${_interval_dhms%%-*}
            _interval_dhms=${_interval_dhms#*-}
            case $_interval_days in
                '' | *[!0-9]*) _interval_good= ;;
            esac
            ;;
    esac

    # What is left must be non-empty decimal fields separated by single colons.
    case $_interval_dhms in
        '' | *[!0-9:]* | :* | *: | *::*) _interval_good= ;;
    esac

    # A single awk call does the field split and the arithmetic; awk treats
    # "08" as decimal 8. It exits non-zero when there are too many fields.
    if [ -n "$_interval_good" ] && _interval_secs=$(awk \
        -v days="$_interval_days" -v hms="$_interval_dhms" '
        BEGIN {
            n = split(hms, f, ":")
            if (days != "") {
                if (n > 3) exit 1
                d = days; h = f[1]; m = f[2]; s = f[3]
            } else if (n == 1) {
                s = f[1]
            } else if (n == 2) {
                m = f[1]; s = f[2]
            } else if (n == 3) {
                h = f[1]; m = f[2]; s = f[3]
            } else if (n == 4) {
                d = f[1]; h = f[2]; m = f[3]; s = f[4]
            } else {
                exit 1
            }
            print d * 86400 + h * 3600 + m * 60 + s
        }'); then
        return_interval_seconds=$_interval_secs
    else
        echo "Bad formatting of time interval: $1" 1>&2
    fi

    unset _interval_dhms _interval_days _interval_good _interval_secs
}
handle_commentfile () {
localid=$1
sessiondir=`grep -h '^sessiondir=' $jobfile | sed 's/^sessiondir=\(.*\)/\1/'`
......@@ -183,6 +213,7 @@ function handle_exitcode {
exitcode2=$(echo $jobinfostring|sed -n 's/.*ExitCode=\([0-9]*\):\([0-9]*\).*/\2/p')
fi
if [ -z "$exitcode1" ] && [ -z "$exitcode2" ] ; then
exitcode=$tmpexitcode
elif [ $exitcode2 -ne 0 ]; then
......@@ -202,21 +233,28 @@ function handle_exitcode {
# CANCELLED jobs in SLURM can have 0 exit code.
# This is a temporary workaround, should later be replaced by
# proper fix that determines the reason of failure
#
# If the job was cancelled while still in the queued state, there is no
# .batch step for it, so "sacct -j $localid.batch" does not work, but
# "sacct -j $localid" does. This fix has been applied below.
function handle_exitcode_cancelled {
localid="$1"
tmpexitcode="$2"
reason="$3"
if [ "$use_sacct" ]; then
jobinfostring=$("$sacct" -j $localid.batch -o ExitCode -P | tail -n 1)
jobinfostring=$("$sacct" -j $localid -o ExitCode,State -P | tail -n 1)
exitcode1=$(echo $jobinfostring|awk -F':' '{print $1}')
exitcode2=$(echo $jobinfostring|awk -F':' '{print $2}')
reason=$(echo $jobinfostring|awk -F'|' '{print $2}')
[ -z "$reason" ] || exitcode2=$(echo $exitcode2|awk -F'|' '{print $1}')
else
jobinfostring=$("$scontrol" -o show job $localid)
exitcode1=$(echo $jobinfostring|sed -n 's/.*ExitCode=\([0-9]*\):\([0-9]*\).*/\1/p')
exitcode2=$(echo $jobinfostring|sed -n 's/.*ExitCode=\([0-9]*\):\([0-9]*\).*/\2/p')
fi
if [ -z "$exitcode1" ] && [ -z "$exitcode2" ] ; then
exitcode=$tmpexitcode
elif [ ! -z "$exitcode2" ] && [ "$exitcode2" -ne 0 ]; then
......@@ -228,7 +266,9 @@ function handle_exitcode_cancelled {
fi
if [ $exitcode -eq 0 ]; then
exitcode=15
reason="Job was cancelled by SLURM"
if [ ! -z "$reason" ]; then
reason="Job was cancelled by SLURM"
fi
fi
echo "$exitcode $reason" > "${basenames[$localid]}.lrms_done"
kicklist=(${kicklist[@]} $localid)
......@@ -246,7 +286,7 @@ function handle_diag_file {
job_read_diag
if [ "$use_sacct" ]; then
jobinfostring=$("$sacct" -j $localid.batch -o NCPUS,NNODES,CPUTimeRAW,Start,End,ExitCode,State -P | tail -n 1)
jobinfostring=$("$sacct" -j $localid.batch -o NCPUS,NNODES,CPUTimeRAW,Start,End,UserCPU,SystemCPU,ExitCode,State -P | tail -n 1)
cpus=$(echo "$jobinfostring" | awk -F'|' '{print $1}')
......@@ -254,7 +294,11 @@ function handle_diag_file {
endtime=$(echo "$jobinfostring"|awk -F'|' '{print $5}'| sed 's,\([0-9]\+/[0-9]\+\)-\([0-9:]\+\),\1 \2,g' | sed 's/T/ /g')
cputime=$(echo "$jobinfostring" | awk -F'|' '{print $3}')
# UserCPU,SystemCPU format is [dd-]hh:mm:ss[.uuu]
usercputime=$(echo "$jobinfostring" | awk -F'|' '{print $6}')
kernelcputime=$(echo "$jobinfostring" | awk -F'|' '{print $7}')
[ -z "$usercputime" ] && usercputime="00:00:00"
[ -z "$kernelcputime" ] && kernelcputime="00:00:00"
else
jobinfostring=$("$scontrol" -o show job $localid)
......@@ -270,25 +314,35 @@ function handle_diag_file {
cpus=$(echo "$jobinfostring"|sed -n 's/.*NumCPUs=\([^ ]*\) .*/\1/p')
fi
date_to_utc_seconds "$starttime"
starttime_seconds="$return_date_seconds"
seconds_to_mds_date "$return_date_seconds"
LRMSStartTime=$return_mds_date
date_to_utc_seconds "$endtime"
endtime_seconds="$return_date_seconds"
seconds_to_mds_date "$return_date_seconds"
LRMSEndTime=$return_mds_date
#TODO handle cputime, exitcode etc.
walltime=$(( $endtime_seconds - $starttime_seconds))
#cputime=$(( $walltime * $count))
# Values to write to diag. These will override values already written.
[ -n "$walltime" ] && WallTime=$walltime
[ -n "$cpus" ] && Processors=$cpus
[ -n "$cputime" ] && UserTime=$cputime
#[ -n "$cputime" ] && KernelTime=0
job_write_diag
# if "sacct -j $localid.batch" return string "NCPUS|NNodes..." only, the job has no batch stage, it was killed before start on WN
if [ ! z"$cpus" = "zNCPUS" ] ; then
date_to_utc_seconds "$starttime"
starttime_seconds="$return_date_seconds"
seconds_to_mds_date "$return_date_seconds"
LRMSStartTime=$return_mds_date
date_to_utc_seconds "$endtime"
endtime_seconds="$return_date_seconds"
seconds_to_mds_date "$return_date_seconds"
LRMSEndTime=$return_mds_date
#TODO handle exitcode etc.
walltime=$(( $endtime_seconds - $starttime_seconds))
slurm_interval_to_seconds "$usercputime"
cputime="$return_interval_seconds"
slurm_interval_to_seconds "$kernelcputime"
kernel="$return_interval_seconds"
#cputime=$(( $walltime * $count))
# Values to write to diag. These will override values already written.
[ -n "$walltime" ] && WallTime=$walltime
[ -n "$cpus" ] && Processors=$cpus
[ -n "$cputime" ] && UserTime=$cputime
[ -n "$kernel" ] && KernelTime=$kernel
job_write_diag
fi
}
if [ ! -z "$perflogdir" ]; then
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment