# File indexing completed on 2026-04-19 08:00:02
0001
0002
0003
0004 import re
0005 from typing import Optional
0006
0007
0008
0009
# Pilot wrapper errors (codes 2xxx): error code -> regex pattern.
# ErrorMessagePatternHandler.get_error_code applies the patterns with re.search,
# so a pattern matches when it occurs anywhere inside the message.
CMPM_Pilot_Wrapper = {
    2001: r"wrapper fault",
    2002: r"wrapper killed stuck pilot",
    2064: r"wrapper got cvmfs repos issue",
    2080: r"pilot proxy invalid or with insufficient timeleft",
}
0016
0017
# HTCondor schedd errors (codes 3xxx): error code -> regex pattern.
# Entry order is significant: ErrorMessagePatternHandler.get_error_code returns
# the code of the first pattern that matches, in insertion order, so every
# specific pattern must stay ahead of the broader pattern that would also match
# the same message (e.g. 3631-3637 ahead of 3639).
CMPM_HTCondor_Schedd = {
    # 31xx: gridmanager / GAHP start-up
    3121: r"Failed to start gridmanager",
    3131: r"Failed to start GAHP",
    3132: r"Failed to provide GAHP with token",
    3190: r"Unspecified gridmanager error",
    # 32xx: schedd connection
    3200: r"Error connecting to schedd .*",
    # 360x: output file transfer between execution point and access point
    3601: r"Transfer output files failure at execution point .* while sending files to access point .*",
    # 363x: stage-in failures, most specific first; 3639 is the stage-in fallback
    3631: r"File stage-in failed: Transfer of .* failed: Forbidden",
    3632: r"File stage-in failed: Transfer of .* failed: Not Found",
    3636: r"File stage-in failed: Transfer of .* failed: HTTP response .*",
    3637: r"File stage-in failed: Transfer of .* failed: curl_easy_perform\(\) failed .*",
    3639: r"File stage-in failed: .*",
    # 364x: stage-out failures, most specific first; 3649 is the stage-out fallback
    3641: r"File stage-out failed: Transfer of .* failed: Forbidden",
    3642: r"File stage-out failed: Transfer of .* failed: Not Found",
    3646: r"File stage-out failed: Transfer of .* failed: HTTP response .*",
    3647: r"File stage-out failed: Transfer of .* failed: curl_easy_perform\(\) failed .*",
    3649: r"File stage-out failed: .*",
    # 368x: curl / HTTP failures that were not claimed by a stage-in/stage-out pattern above
    3681: r"curl_easy_perform.* failed .*",
    3689: r"HTTP response .*",
    # 38xx: job lookup, job status and exit code
    3811: r"condor job .* not found",
    3812: r"cannot get JobStatus of job .*",
    3831: r"Payload execution error: returned non-zero .*",
    3841: r"cannot get ExitCode of job .*",
    3842: r"got invalid ExitCode .* of job .*",
}
0050
0051
# CE errors (codes 4xxx): error code -> regex pattern.
# Entry order is significant: ErrorMessagePatternHandler.get_error_code returns
# the code of the first pattern that matches, in insertion order.
CMPM_CE = {
    # 41xx: remote schedd
    4101: r"Job disappeared from remote schedd",
    4102: r"Error locating schedd .*",
    4103: r"Schedd .* didn't send expected files",
    4104: r"Error receiving files from schedd .*",
    # 42xx: "LRMS error: ..." messages, ordered from most to least specific; 4290 is the LRMS fallback
    4201: r"LRMS error: .* Node fail",
    4211: r"LRMS error: .* job killed: vmem",
    4214: r"LRMS error: .* job killed: cput",
    4215: r"LRMS error: .* job killed: wall",
    4219: r"LRMS error: .* job killed: .*",
    4220: r"LRMS error: .* Job missing from .*",
    4241: r"LRMS error: .* Job failed with unknown exit code",
    4242: r"LRMS error: .* Job failed but .* reported .*",
    4244: r"LRMS error: .* Job failed .*",
    4245: r"LRMS error: .* Job was lost with unknown exit code",
    4247: r"LRMS error: .* Job was killed by .*",
    4248: r"LRMS error: .* Job was .*",
    4249: r"LRMS error: .* Job .*",
    # NOTE(review): an LRMS message that contains " Job " is claimed by 4249 before
    # 4260/4261 are tried - confirm that is the intended precedence.
    4260: r"LRMS error: .* PeriodicRemove evaluated to TRUE",
    4261: r"LRMS error: .* RemoveReason: .*",
    4290: r"LRMS error: .*",
    # 431x: ARC operation timeouts; 4319 is the fallback for any other "ARC_... timed out"
    4311: r"ARC_JOB_NEW timed out",
    4312: r"ARC_JOB_KILL timed out",
    4313: r"ARC_JOB_CLEAN timed out",
    4314: r"ARC_DELEGATION_NEW timed out",
    4315: r"ARC_JOB_STAGE_IN timed out",
    4316: r"ARC_JOB_STAGE_OUT timed out",
    4319: r"ARC_.* timed out",
    # 435x-439x: ARC job failures and related messages
    4351: r"ARC job failed: Job submission to .* failed",
    4352: r"ARC job failed: Job is canceled by external request",
    4353: r"ARC job failed: Failed extracting LRMS ID due to some internal error",
    4360: r"New job submission is not allowed",
    # NOTE(review): bare word matched with re.search, so any message containing
    # "Forbidden" that no earlier pattern claimed ends up here - confirm this breadth is intended.
    4361: r"Forbidden",
    4362: r"Job could not be cleaned",
    4363: r"ARC job has no credentials",
    4364: r"Failed to find valid session directory",
    # 4390 requires the colon after "failed", so it does not shadow 4391
    4390: r"ARC job failed: .*",
    4391: r"ARC job failed for unknown reason",
}
0096
0097
# Batch system errors (codes 5xxx): error code -> regex pattern.
# Entry order is significant: ErrorMessagePatternHandler.get_error_code returns
# the code of the first pattern that matches, in insertion order.
CMPM_Batch_System = {
    # 551x-552x: submission limits and allowed duration; 5519 is the fallback for other "would exceed" limits
    5511: r"Number of submitted jobs would exceed MAX_JOBS_SUBMITTED",
    5512: r"Number of submitted jobs would exceed MAX_JOBS_PER_OWNER",
    5519: r"Number of submitted jobs would exceed .*",
    5520: r"The job exceeded allowed .* duration of .*",
    # 553x: "Error from slot...: " memory limit messages
    5530: r"Error from slot.*: (Job|.* job) has gone over (memory|.* memory) limit of .*",
    5531: r"Error from slot.*: peak memory usage exceeded .*",
    # 554x: other "Error from slot...: " messages; 5549 is the fallback for the prefix
    5541: r"Error from slot.*: Error running docker job: .*",
    5549: r"Error from slot.*: .*",
    # 557x: holds, aborts, runtime/memory overruns and session directory failures
    5570: r"SYSTEM_PERIODIC_HOLD",
    5571: r".* Second start not allowed",
    5572: r"job aborted due to .*",
    5573: r"Job runtime longer than reserved",
    5574: r"Memory usage higher than .*",
    5575: r"Failed to create session directory",
    # 56xx: submission command failures and Slurm messages
    5601: r"submission command failed \(exit code = .*\).*",
    5602: r"no jobId in submission script's output .*",
    5699: r"Slurm .*",
}
0123
0124
0125
0126
# Kubernetes errors (codes 68xx): error code -> regex pattern.
# Entry order is significant: ErrorMessagePatternHandler.get_error_code returns
# the code of the first pattern that matches, in insertion order.
CMPM_Kubernetes = {
    # 680x-681x: job / pod level messages; 6814 and 6819 are fallbacks for their prefixes
    6801: r"Job has reached the specified backoff limit",
    6810: r"Pod was rejected: Node didn't have enough resource: .*",
    6814: r"Pod was rejected: .*",
    6815: r"Pod was active on the node longer than the specified deadline",
    6819: r"Pod was .*",
    # 688x: status lookup and container termination messages
    6881: r"Failed to get status for id=.*",
    6882: r"JOB id=.* not found",
    6883: r"container not terminated yet \(.*\) while pod Succeeded",
    6884: r"container terminated by k8s for reason .*",
}
0139
0140
# Kill / remove reasons (codes 89xx): error code -> regex pattern.
# Entry order is significant: ErrorMessagePatternHandler.get_error_code returns
# the code of the first pattern that matches, in insertion order.
CMPM_Kill_Remove = {
    # 897x: removal actions
    # In 8970, "\.*" accepts any number of literal periods (including none) after "action"
    8970: r"Python-initiated action\.* \(by user .*\)",
    8971: r"via condor_rm .*",
    8979: r"PeriodicRemove .*",
    # 898x-899x: SYSTEM_PERIODIC_REMOVE reasons, specific ones first; 8990 is the fallback for the prefix
    8981: r"removed by SYSTEM_PERIODIC_REMOVE due to job restarted undesirably",
    8982: r"removed by SYSTEM_PERIODIC_REMOVE due to job held time exceeded .*",
    8983: r"removed by SYSTEM_PERIODIC_REMOVE due to job status unchanged time exceeded .*",
    8984: r"removed by SYSTEM_PERIODIC_REMOVE due to job staying in queue time exceeded .*",
    8985: r"removed by SYSTEM_PERIODIC_REMOVE due to job remote status outdated time exceeded .*",
    8990: r"removed by SYSTEM_PERIODIC_REMOVE due to .*",
    8991: r"Remove Reason unknown",
}
0156
# Catch-all: r".*" matches every string (including the empty one) under re.search,
# so this map has to be merged after all other maps - both handlers built in this
# file put it last. 9000 is the same value as WorkerErrors.error_codes["GENERAL_ERROR"].
CMPM_Catchall = {
    9000: r".*",
}
0161
0162
0163
0164
class ErrorMessagePatternHandler(object):
    """
    Class to handle error message patterns and their corresponding error codes.
    This class is used to map error messages to specific error codes for supplemental worker errors.

    Patterns are tried in the insertion order of the code-to-pattern map and the first
    pattern that matches wins, so specific patterns must be listed ahead of broader
    patterns that would match the same message.
    """

    def __init__(self, code_pattern_map: dict):
        """
        Initialize the ErrorMessagePatternHandler with a mapping of error codes to regex patterns.

        Args:
            code_pattern_map (dict): A dictionary mapping error codes to regex patterns.
                The dictionary is copied, so later changes to it do not affect the handler.

        Raises:
            re.error: If a pattern is not a valid regular expression.
        """
        self._code_pattern_map = code_pattern_map.copy()
        # Inverted view, retained only for code that may introspect it. Lookups do not use it:
        # inverting the map silently drops a code whenever two codes share the same pattern.
        self._pattern_code_map = {v: k for k, v in self._code_pattern_map.items()}
        # Compile every pattern once, in map order, instead of handing the raw pattern to
        # re.search on every call; this also surfaces an invalid pattern at construction time.
        self._compiled_patterns = [(re.compile(pattern), code) for code, pattern in self._code_pattern_map.items()]

    def get_error_code(self, message: str) -> Optional[int]:
        """
        Get the error code for a given message based on the defined patterns.

        Patterns are applied with search semantics, i.e. a pattern matches when it
        occurs anywhere in the message, not only at its beginning.

        Args:
            message (str): The error message to check against the patterns.

        Returns:
            Optional[int]: The error code of the first matching pattern, or None if no pattern matches.

        Raises:
            TypeError: If message is not a string (e.g. None).
        """
        for compiled_pattern, code in self._compiled_patterns:
            if compiled_pattern.search(message):
                return code
        return None
0195
0196
class WorkerErrors(object):
    """
    Worker error codes and the message pattern handlers used to derive an error code from an error message.
    """

    # Generic worker error codes keyed by symbolic name.
    error_codes = {
        "SUCCEEDED": 0,
        "UNKNOWN": 1000,
        "PREEMPTED": 1001,
        "GENERAL_ERROR": 9000,
    }

    # Handler built from the HTCondor-related maps. The maps are merged left to right and the
    # handler tries patterns in that order, so CMPM_Catchall (r".*") has to remain the last operand.
    htcondor_message_pattern_handler = ErrorMessagePatternHandler(
        CMPM_Pilot_Wrapper | CMPM_HTCondor_Schedd | CMPM_CE | CMPM_Batch_System | CMPM_Kill_Remove | CMPM_Catchall
    )

    # Handler built from the Kubernetes map, again with the catch-all merged last.
    k8s_message_pattern_handler = ErrorMessagePatternHandler(CMPM_Kubernetes | CMPM_Catchall)


# NOTE(review): the scope of the two handlers (attributes of WorkerErrors vs. module-level
# names) could not be established from the source under review, so the same handler objects
# are exposed at both scopes to keep either calling convention working. Confirm against the
# callers and drop whichever form is unused.
htcondor_message_pattern_handler = WorkerErrors.htcondor_message_pattern_handler
k8s_message_pattern_handler = WorkerErrors.k8s_message_pattern_handler