Warning, file /job_submission_condor/scripts/hold_review_and_release.sh was not indexed
or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).
#!/usr/bin/env bash
#
# Review HTCondor jobs that are on hold (JobStatus == 5) and release or
# remove them.
#
# Usage: hold_review_and_release.sh [condor_q arguments...]
#
# All arguments are passed through to condor_q to select the jobs.
# For every held job the script
#   1. downloads the job's .err/.out files with `mc cp` (failures ignored),
#   2. copies every local log file of the job from LOG/... to HOLD/...,
#   3. greps the logs for a fixed list of failure signatures and, on a
#      match, releases (condor_release) or removes (condor_rm) the job,
#   4. otherwise prints the log tails and asks the operator what to do.
#
# Requires bash. Interactive input is read from /dev/tty so that prompts
# keep working when stdin or stdout of the script is redirected.

# Cluster IDs above this value use a per-cluster log subdirectory.
readonly CLUSTER_DIR_THRESHOLD=26930000
# Remote location (mc alias/bucket/path) mirroring the local LOG tree.
readonly S3_ROOT="S3/eictest/EPIC"
# Device used for all operator prompts.
readonly REVIEW_TTY=/dev/tty

#######################################
# Print the log path prefix (without extension) of a job.
# Arguments: $1 - ClusterID, $2 - ProcID
# Outputs:   the prefix on stdout
#######################################
log_prefix() {
  local cluster=$1 proc=$2
  # Arithmetic comparison on purpose: inside [[ ]] the '>' operator compares
  # strings lexicographically, which picks the wrong layout as soon as the
  # cluster ID has a different number of digits than the threshold.
  if ((cluster > CLUSTER_DIR_THRESHOLD)); then
    printf '%s\n' "LOG/CONDOR/osg_${cluster}/osg_${cluster}_${proc}"
  else
    printf '%s\n' "LOG/CONDOR/osg_${cluster}_${proc}"
  fi
}

#######################################
# Wait for the operator to press Enter. A terminal that cannot be read is
# not fatal: the caller simply proceeds.
#######################################
pause() {
  read -r _ <"${REVIEW_TTY}" || true
}

#######################################
# Print every line of a file that matches each of the given patterns.
# Arguments: $1 - file, $2.. - grep patterns (basic regular expressions)
#######################################
show_lines() {
  local file=$1 pattern
  shift
  for pattern in "$@"; do
    grep "${pattern}" "${file}"
  done
}

#######################################
# Print the lines mentioning tracepath plus the 20 lines following each.
# Arguments: $1 - file
#######################################
show_tracepath() {
  grep -A20 tracepath "$1"
}

#######################################
# Download the job's .err/.out files and copy all of its local log files
# into the parallel HOLD tree (first "LOG" in the path replaced by "HOLD").
# Arguments: $1 - log path prefix
#######################################
fetch_and_archive_logs() {
  local prefix=$1 ext src dst

  for ext in err out; do
    # Best effort: a failed download must not stop the review.
    mc cp "${S3_ROOT}/${prefix}.${ext}" "${prefix}.${ext}" || true
  done

  for src in "${prefix}".*; do
    # An unmatched glob is left as the literal pattern; skip it.
    [[ -e "${src}" ]] || continue
    dst=${src/LOG/HOLD}
    mkdir -p "$(dirname "${dst}")"
    cp "${src}" "${dst}"
  done
}

#######################################
# Resolve a job from known failure signatures in its logs. Matching lines
# are printed (grep is deliberately not silenced) so the operator sees why
# a job was released or removed.
# Arguments: $1 - job id (ClusterID.ProcID), $2 - log path prefix
# Returns:   0 if the job was released or removed,
#            1 if no signature matched.
#######################################
auto_triage() {
  local job=$1 prefix=$2
  local err="${prefix}.err" out="${prefix}.out" log="${prefix}.log"
  local corrupted

  if [[ -f "${err}" ]]; then
    if grep "FATAL.*bad file descriptor" "${err}"; then
      condor_release "${job}"
      return 0
    fi
    if grep "Transport endpoint is not connected" "${err}"; then
      pause
      condor_release "${job}"
      return 0
    fi
    if grep "Unable to initialize new alias from the provided credentials." "${err}"; then
      show_lines "${out}" "^resource" "^hostname"
      show_tracepath "${out}"
      pause
      condor_release "${job}"
      return 0
    fi
    if grep "tracepath: eics3.sdcc.bnl.gov: Temporary failure in name resolution" "${err}"; then
      show_lines "${out}" "hostname"
      show_tracepath "${out}"
      pause
      condor_release "${job}"
      return 0
    fi
    if grep "mount /hadoop->/hadoop error" "${err}"; then
      pause
      condor_release "${job}"
      return 0
    fi
    if grep "FATAL: kernel too old" "${err}"; then
      pause
      condor_release "${job}"
      return 0
    fi
    # NOTE(review): this signature is searched in the .out file but is only
    # reached when the .err file exists - confirm that this is intended.
    if grep -Pzo "Failed to load ID decoder for HcalBarrelHits" "${out}"; then
      pause
      condor_release "${job}"
      return 0
    fi
  fi

  if [[ -f "${out}" ]]; then
    # Both markers must be present; this is the only .out signature besides
    # the heap-corruption one below that removes instead of releasing.
    if grep "GeomNav0003" "${out}" && grep "lens_groove" "${out}"; then
      condor_rm "${job}"
      return 0
    fi
    if grep "Bus error" "${out}"; then
      condor_release "${job}"
      return 0
    fi
    if grep "^SysError.*No such file or directory$" "${out}"; then
      show_lines "${out}" "^resource" "^hostname"
      condor_release "${job}"
      return 0
    fi
    if grep "Unable to initialize new alias from the provided credentials." "${out}"; then
      show_lines "${out}" "^date" "^resource" "^hostname"
      show_tracepath "${out}"
      pause
      condor_release "${job}"
      return 0
    fi
    if grep "Unable to validate source" "${out}"; then
      show_lines "${out}" "^resource" "^hostname"
      show_tracepath "${out}"
      pause
      condor_release "${job}"
      return 0
    fi
    if grep "No internet connection." "${out}"; then
      show_lines "${out}" "^resource" "^hostname"
      show_tracepath "${out}"
      pause
      condor_release "${job}"
      return 0
    fi
    # Remove only when the message is repeated more than 10 times.
    corrupted=$(grep -c "unsorted double linked list corrupted" "${out}") || true
    if ((${corrupted:-0} > 10)); then
      grep -B5 "unsorted double linked list corrupted" "${out}" | head -n 10
      pause
      condor_rm "${job}"
      return 0
    fi
  fi

  if [[ -f "${log}" ]]; then
    if grep "put on hold by SYSTEM_PERIODIC_HOLD due to memory usage" "${log}"; then
      pause
      condor_rm "${job}"
      return 0
    fi
  fi

  return 1
}

#######################################
# Show a summary of the job's logs, let the operator page through them,
# then ask whether to release, keep on hold, or remove the job.
# Arguments: $1 - job id (ClusterID.ProcID), $2 - log path prefix,
#            $3 - label used at the start of each prompt
#######################################
manual_review() {
  local job=$1 prefix=$2 label=$3
  local file review release

  ls -al "${prefix}".*
  for file in "${prefix}".*; do
    # An unmatched glob is left as the literal pattern; skip it.
    [[ -e "${file}" ]] || continue
    tail -n 5 "${file}"
  done
  if [[ -f "${prefix}.out" ]]; then
    grep -i error "${prefix}.out" | tail -n 10
    grep resource "${prefix}.out"
    grep hostname "${prefix}.out"
  fi

  # Keep offering the pager until the operator answers with Enter (empty).
  review=x
  while [[ -n "${review}" ]]; do
    # A failed read (EOF or no terminal) ends the review instead of looping
    # forever on a stale answer.
    read -r -n 1 -p "${label}: review? [e,l,o] " review <"${REVIEW_TTY}" || review=""
    echo
    case "${review}" in
      e) less "${prefix}.err" ;;
      l) less "${prefix}.log" ;;
      o) less "${prefix}.out" ;;
    esac
  done

  # Enter or "y" releases, "r" removes, anything else leaves the job held.
  # If no answer can be read at all the job is left on hold rather than
  # being released by the empty-answer default.
  release=x
  read -r -n 1 -p "${label}: release? [Y,n,r] " release <"${REVIEW_TTY}" || release=n
  echo
  if [[ -z "${release}" || "${release}" == "y" ]]; then
    condor_release "${job}"
  elif [[ "${release}" == "r" ]]; then
    condor_rm "${job}"
  fi
}

#######################################
# Entry point: iterate over all held jobs selected by the condor_q
# arguments and triage each one.
# Arguments: "$@" - extra arguments for condor_q
#######################################
main() {
  local -a held=()
  local line ClusterID ProcID NumJobStarts prefix label
  local n=1 N

  # Query the queue once, so the progress counter matches the list that is
  # iterated and stdin of the loop body is not tied to a pipe.
  while IFS= read -r line; do
    [[ -n "${line}" ]] && held+=("${line}")
  done < <(condor_q "$@" -constraint 'JobStatus == 5' -af ClusterID ProcID NumJobStarts)
  N=${#held[@]}

  for line in "${held[@]}"; do
    read -r ClusterID ProcID NumJobStarts <<<"${line}"
    label="Job ${ClusterID}.${ProcID} ${NumJobStarts} ($n/$N)"
    echo "${label}"

    prefix=$(log_prefix "${ClusterID}" "${ProcID}")
    fetch_and_archive_logs "${prefix}"

    if ! auto_triage "${ClusterID}.${ProcID}" "${prefix}"; then
      manual_review "${ClusterID}.${ProcID}" "${prefix}" "${label}"
    fi

    n=$((n + 1))
  done
}

main "$@"