Back to home page

EIC code displayed by LXR

 
 

    


Warning, file /job_submission_condor/scripts/hold_review_and_release.sh was not indexed or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).

#!/bin/bash
#
# Review held HTCondor jobs (JobStatus == 5) one at a time: fetch their
# .err/.out logs from S3, archive a copy under HOLD/, auto-release or
# auto-remove jobs whose logs match known failure signatures, and prompt
# the operator interactively for everything else.
#
# Any arguments are forwarded verbatim to condor_q (e.g. a user name or
# cluster constraint).
#
# NOTE(review): this script is interactive and must be run on a terminal;
# stdin of the loop body is the condor_q pipe, so operator input is read
# via "read <&1" (fd 1 is still attached to the tty — see below).

n=1
# Total held-job count, used only for the "(n/N)" progress display.
# "$@" (not ${*}) so condor_q arguments containing spaces survive intact.
N=$(condor_q "$@" -constraint 'JobStatus == 5' -af ClusterID ProcID | wc -l)

# The while loop runs in a pipeline subshell: $n persists across iterations
# (same subshell) but is not visible after "done" — nothing reads it there.
condor_q "$@" -constraint 'JobStatus == 5' -af ClusterID ProcID NumJobStarts | while read ClusterID ProcID NumJobStarts ; do
  echo "Job ${ClusterID}.${ProcID} ${NumJobStarts} ($n/$N)"
  # Log layout changed at cluster 26930000: newer clusters get a per-cluster
  # subdirectory.  Use the numeric -gt test here — the original "[[ a > b ]]"
  # compares *lexicographically* and misorders IDs with a different number
  # of digits (e.g. "9999999" > "26930000" as strings).
  if [[ ${ClusterID} -gt 26930000 ]] ; then
    prefix="LOG/CONDOR/osg_${ClusterID}/osg_${ClusterID}_${ProcID}"
  else
    prefix="LOG/CONDOR/osg_${ClusterID}_${ProcID}"
  fi

  # Get logs from S3 (best effort: a file may not exist yet, hence "|| true")
  mc cp S3/eictest/EPIC/${prefix}.err ${prefix}.err || true
  mc cp S3/eictest/EPIC/${prefix}.out ${prefix}.out || true

  # Keep a copy of every log under HOLD/ as a record of this hold event
  for i in ${prefix}.* ; do
    j=${i/LOG/HOLD}
    mkdir -p $(dirname ${j})
    cp ${i} ${j}
  done

  # Common errors for automatic release.  Each "if grep ..." deliberately
  # prints the matching line so the operator sees why the job was handled.
  # "read <&1" pauses until Enter is pressed: stdin is the condor_q pipe,
  # so we read from the terminal through fd 1 (stdout), which is the tty.
  if test -f ${prefix}.err ; then
    if grep "FATAL.*bad file descriptor" ${prefix}.err ; then
      condor_release ${ClusterID}.${ProcID}
      n=$((n+1))
      continue
    fi
    if grep "Transport endpoint is not connected" ${prefix}.err ; then
      read <&1
      condor_release ${ClusterID}.${ProcID}
      n=$((n+1))
      continue
    fi
    if grep "Unable to initialize new alias from the provided credentials." ${prefix}.err ; then
      # Show which worker node / route failed before pausing for review
      grep ^resource ${prefix}.out
      grep ^hostname ${prefix}.out
      grep -A20 tracepath ${prefix}.out
      read <&1
      condor_release ${ClusterID}.${ProcID}
      n=$((n+1))
      continue
    fi
    if grep "tracepath: eics3.sdcc.bnl.gov: Temporary failure in name resolution" ${prefix}.err ; then
      grep hostname ${prefix}.out
      grep -A20 tracepath ${prefix}.out
      read <&1
      condor_release ${ClusterID}.${ProcID}
      n=$((n+1))
      continue
    fi
    if grep "mount /hadoop->/hadoop error" ${prefix}.err ; then
      read <&1
      condor_release ${ClusterID}.${ProcID}
      n=$((n+1))
      continue
    fi
    if grep "FATAL: kernel too old" ${prefix}.err ; then
      read <&1
      condor_release ${ClusterID}.${ProcID}
      n=$((n+1))
      continue
    fi
    #if grep -Pzo "Info in <TGeoManager::CloseGeometry>: -+modeler ready-+.*\n.*\[FATAL\] Detected timeout in worker! Stopping." ${prefix}.out ; then
    #  read <&1
    #  condor_release ${ClusterID}.${ProcID}
    #  n=$((n+1))
    #  continue
    #fi
    # NOTE(review): this checks ${prefix}.out although we are inside the
    # ".err exists" branch — presumably intentional, but verify.
    if grep -Pzo "Failed to load ID decoder for HcalBarrelHits" ${prefix}.out ; then
      read <&1
      condor_release ${ClusterID}.${ProcID}
      n=$((n+1))
      continue
    fi
  fi
  if test -f ${prefix}.out ; then
    # Known-fatal geometry overlap: remove (condor_rm) rather than release
    if grep "GeomNav0003" ${prefix}.out && grep "lens_groove" ${prefix}.out ; then
      condor_rm ${ClusterID}.${ProcID}
      n=$((n+1))
      continue
    fi
    if grep "Bus error" ${prefix}.out ; then
      condor_release ${ClusterID}.${ProcID}
      n=$((n+1))
      continue
    fi
    if grep "^SysError.*No such file or directory$" ${prefix}.out ; then
      grep ^resource ${prefix}.out
      grep ^hostname ${prefix}.out
      condor_release ${ClusterID}.${ProcID}
      n=$((n+1))
      continue
    fi
    if grep "Unable to initialize new alias from the provided credentials." ${prefix}.out ; then
      grep ^date ${prefix}.out
      grep ^resource ${prefix}.out
      grep ^hostname ${prefix}.out
      grep -A20 tracepath ${prefix}.out
      read <&1
      condor_release ${ClusterID}.${ProcID}
      n=$((n+1))
      continue
    fi
    if grep "Unable to validate source" ${prefix}.out ; then
      grep ^resource ${prefix}.out
      grep ^hostname ${prefix}.out
      grep -A20 tracepath ${prefix}.out
      read <&1
      condor_release ${ClusterID}.${ProcID}
      n=$((n+1))
      continue
    fi
    if grep "No internet connection." ${prefix}.out ; then
      grep ^resource ${prefix}.out
      grep ^hostname ${prefix}.out
      grep -A20 tracepath ${prefix}.out
      read <&1
      condor_release ${ClusterID}.${ProcID}
      n=$((n+1))
      continue
    fi
    # Heap corruption spam (>10 occurrences): show context, then remove
    if [ $(grep "unsorted double linked list corrupted" ${prefix}.out | wc -l) -gt 10 ] ; then
      grep -B5 "unsorted double linked list corrupted" ${prefix}.out | head -n 10
      read <&1
      condor_rm ${ClusterID}.${ProcID}
      n=$((n+1))
      continue
    fi
    #if grep -B20 "\[FATAL\] Detected timeout in worker! Stopping." ${prefix}.out ; then
    #  read <&1
    #  condor_release ${ClusterID}.${ProcID}
    #  n=$((n+1))
    #  continue
    #fi
  fi
  if test -f ${prefix}.log ; then
    # Memory-limit holds won't succeed on retry with the same request: remove
    if grep "put on hold by SYSTEM_PERIODIC_HOLD due to memory usage" ${prefix}.log ; then
      read <&1
      condor_rm ${ClusterID}.${ProcID}
      n=$((n+1))
      continue
    fi
    #if grep "memory usage exceeded request_memory" ${prefix}.log ; then
    #  read <&1
    #  condor_release ${ClusterID}.${ProcID}
    #  n=$((n+1))
    #  continue
    #fi
  fi

  # No automatic rule matched: show a summary for manual review
  ls -al ${prefix}.*
  for i in ${prefix}.* ; do
    tail -n 5 ${i}
  done
  test -f ${prefix}.out && grep -i error ${prefix}.out | tail -n 10
  test -f ${prefix}.out && grep resource ${prefix}.out
  test -f ${prefix}.out && grep hostname ${prefix}.out

  # Interactive review loop: e/l/o opens the .err/.log/.out in a pager;
  # plain Enter (empty answer) leaves the loop.
  review=x
  while [ -n "${review}" ] ; do
    read -n 1 -p "Job ${ClusterID}.${ProcID} ${NumJobStarts} ($n/$N): review? [e,l,o] " review <&1
    echo
    if [ "${review}" == "e" ] ; then
      less ${prefix}.err
    fi
    if [ "${review}" == "l" ] ; then
      less ${prefix}.log
    fi
    if [ "${review}" == "o" ] ; then
      less ${prefix}.out
    fi
  done

  # Final decision: Enter or "y" releases, "r" removes, anything else skips
  release=x
  read -n 1 -p "Job ${ClusterID}.${ProcID} ${NumJobStarts} ($n/$N): release? [Y,n,r] " release <&1
  echo
  if [ -z "${release}" -o "${release}" == "y" ] ; then
    condor_release ${ClusterID}.${ProcID}
  fi
  if [ "${release}" == "r" ] ; then
    condor_rm ${ClusterID}.${ProcID}
  fi

  n=$((n+1))
done