File indexing completed on 2026-04-10 08:39:14
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011 import re
0012
0013
class ErrorCodes:
    """
    Pilot error codes.

    Note: Error code numbering is the same as in Pilot 1 since that is expected by the PanDA server and monitor.
    Note 2: Add error codes as they are needed in other modules. Do not import the full Pilot 1 list at once as there
    might very well be codes that can be reassigned/removed.

    The class holds the numeric error code constants, the mapping from error code to standard error message and a
    set of helper methods for bookkeeping, reporting and formatting of errors.
    """

    # Error code constants in the 10xx-12xx range
    GENERALERROR = 1008
    NOLOCALSPACE = 1098
    STAGEINFAILED = 1099
    REPLICANOTFOUND = 1100
    NOSUCHFILE = 1103
    USERDIRTOOLARGE = 1104
    STDOUTTOOBIG = 1106
    SETUPFAILURE = 1110
    NFSSQLITE = 1115
    QUEUEDATA = 1116
    QUEUEDATANOTOK = 1117
    OUTPUTFILETOOLARGE = 1124
    NOSTORAGE = 1133
    STAGEOUTFAILED = 1137
    PUTMD5MISMATCH = 1141
    CHMODTRF = 1143
    PANDAKILL = 1144
    GETMD5MISMATCH = 1145
    TRFDOWNLOADFAILURE = 1149
    LOOPINGJOB = 1150
    STAGEINTIMEOUT = 1151
    STAGEOUTTIMEOUT = 1152
    NOPROXY = 1163
    MISSINGOUTPUTFILE = 1165
    SIZETOOLARGE = 1168
    GETADMISMATCH = 1171
    PUTADMISMATCH = 1172
    NOVOMSPROXY = 1177
    GETGLOBUSSYSERR = 1180
    PUTGLOBUSSYSERR = 1181
    NOSOFTWAREDIR = 1186
    NOPAYLOADMETADATA = 1187
    LFNTOOLONG = 1190
    ZEROFILESIZE = 1191
    MKDIR = 1199
    KILLSIGNAL = 1200
    SIGTERM = 1201
    SIGQUIT = 1202
    SIGSEGV = 1203
    SIGXCPU = 1204
    USERKILL = 1205
    SIGBUS = 1206
    SIGUSR1 = 1207
    MISSINGINSTALLATION = 1211
    PAYLOADOUTOFMEMORY = 1212
    REACHEDMAXTIME = 1213
    UNKNOWNPAYLOADFAILURE = 1220
    FILEEXISTS = 1221
    BADALLOC = 1223
    ESRECOVERABLE = 1224
    ESFATAL = 1228
    EXECUTEDCLONEJOB = 1234
    PAYLOADEXCEEDMAXMEM = 1235
    FAILEDBYSERVER = 1236
    ESNOEVENTS = 1238
    MESSAGEHANDLINGFAILURE = 1240
    CHKSUMNOTSUP = 1242
    NORELEASEFOUND = 1244
    NOUSERTARBALL = 1246
    BADXML = 1247

    # Error code constants in the 13xx range
    NOTIMPLEMENTED = 1300
    UNKNOWNEXCEPTION = 1301
    CONVERSIONFAILURE = 1302
    FILEHANDLINGFAILURE = 1303
    PAYLOADEXECUTIONFAILURE = 1305
    SINGULARITYGENERALFAILURE = 1306
    SINGULARITYNOLOOPDEVICES = 1307
    SINGULARITYBINDPOINTFAILURE = 1308
    SINGULARITYIMAGEMOUNTFAILURE = 1309
    PAYLOADEXECUTIONEXCEPTION = 1310
    NOTDEFINED = 1311
    NOTSAMELENGTH = 1312
    NOSTORAGEPROTOCOL = 1313
    UNKNOWNCHECKSUMTYPE = 1314
    UNKNOWNTRFFAILURE = 1315
    RUCIOSERVICEUNAVAILABLE = 1316
    EXCEEDEDMAXWAITTIME = 1317
    COMMUNICATIONFAILURE = 1318
    INTERNALPILOTPROBLEM = 1319
    LOGFILECREATIONFAILURE = 1320
    RUCIOLOCATIONFAILED = 1321
    RUCIOLISTREPLICASFAILED = 1322
    UNKNOWNCOPYTOOL = 1323
    SERVICENOTAVAILABLE = 1324
    SINGULARITYNOTINSTALLED = 1325
    NOREPLICAS = 1326
    UNREACHABLENETWORK = 1327
    PAYLOADSIGSEGV = 1328
    NONDETERMINISTICDDM = 1329
    JSONRETRIEVALTIMEOUT = 1330
    MISSINGINPUTFILE = 1331
    BLACKHOLE = 1332
    NOREMOTESPACE = 1333
    SETUPFATAL = 1334
    MISSINGUSERCODE = 1335
    JOBALREADYRUNNING = 1336
    BADMEMORYMONITORJSON = 1337
    STAGEINAUTHENTICATIONFAILURE = 1338
    DBRELEASEFAILURE = 1339
    SINGULARITYNEWUSERNAMESPACE = 1340
    BADQUEUECONFIGURATION = 1341
    MIDDLEWAREIMPORTFAILURE = 1342
    NOOUTPUTINJOBREPORT = 1343
    RESOURCEUNAVAILABLE = 1344
    SINGULARITYFAILEDUSERNAMESPACE = 1345
    TRANSFORMNOTFOUND = 1346
    UNSUPPORTEDSL5OS = 1347
    SINGULARITYRESOURCEUNAVAILABLE = 1348
    UNRECOGNIZEDTRFARGUMENTS = 1349
    EMPTYOUTPUTFILE = 1350
    UNRECOGNIZEDTRFSTDERR = 1351
    STATFILEPROBLEM = 1352
    NOSUCHPROCESS = 1353
    GENERALCPUCALCPROBLEM = 1354
    COREDUMP = 1355
    PREPROCESSFAILURE = 1356
    POSTPROCESSFAILURE = 1357
    MISSINGRELEASEUNPACKED = 1358
    PANDAQUEUENOTACTIVE = 1359
    IMAGENOTFOUND = 1360
    REMOTEFILECOULDNOTBEOPENED = 1361
    XRDCPERROR = 1362
    KILLPAYLOAD = 1363
    MISSINGCREDENTIALS = 1364
    NOCTYPES = 1365

    # Standard (static) error message for each error code defined above
    _error_messages = {
        GENERALERROR: "General pilot error, consult batch log",
        NOLOCALSPACE: "Not enough local space",
        STAGEINFAILED: "Failed to stage-in file",
        REPLICANOTFOUND: "Replica not found",
        NOSUCHFILE: "No such file or directory",
        USERDIRTOOLARGE: "User work directory too large",
        STDOUTTOOBIG: "Payload log or stdout file too big",
        SETUPFAILURE: "Failed during payload setup",
        NFSSQLITE: "NFS SQLite locking problems",
        QUEUEDATA: "Pilot could not download queuedata",
        QUEUEDATANOTOK: "Pilot found non-valid queuedata",
        OUTPUTFILETOOLARGE: "Output file too large",
        NOSTORAGE: "Fetching default storage failed: no activity related storage defined",
        STAGEOUTFAILED: "Failed to stage-out file",
        PUTMD5MISMATCH: "md5sum mismatch on output file",
        GETMD5MISMATCH: "md5sum mismatch on input file",
        CHMODTRF: "Failed to chmod transform",
        PANDAKILL: "This job was killed by panda server",
        MISSINGOUTPUTFILE: "Local output file is missing",
        SIZETOOLARGE: "Total file size too large",
        TRFDOWNLOADFAILURE: "Transform could not be downloaded",
        LOOPINGJOB: "Looping job killed by pilot",
        STAGEINTIMEOUT: "File transfer timed out during stage-in",
        STAGEOUTTIMEOUT: "File transfer timed out during stage-out",
        NOPROXY: "Grid proxy not valid",
        GETADMISMATCH: "adler32 mismatch on input file",
        PUTADMISMATCH: "adler32 mismatch on output file",
        NOVOMSPROXY: "Voms proxy not valid",
        GETGLOBUSSYSERR: "Globus system error during stage-in",
        PUTGLOBUSSYSERR: "Globus system error during stage-out",
        NOSOFTWAREDIR: "Software directory does not exist",
        NOPAYLOADMETADATA: "Payload metadata does not exist",
        LFNTOOLONG: "LFN too long (exceeding limit of 255 characters)",
        ZEROFILESIZE: "File size cannot be zero",
        MKDIR: "Failed to create local directory",
        KILLSIGNAL: "Job terminated by unknown kill signal",
        SIGTERM: "Job killed by signal: SIGTERM",
        SIGQUIT: "Job killed by signal: SIGQUIT",
        SIGSEGV: "Job killed by signal: SIGSEGV",
        SIGXCPU: "Job killed by signal: SIGXCPU",
        SIGUSR1: "Job killed by signal: SIGUSR1",
        SIGBUS: "Job killed by signal: SIGBUS",
        USERKILL: "Job killed by user",
        MISSINGINSTALLATION: "Missing installation",
        PAYLOADOUTOFMEMORY: "Payload ran out of memory",
        REACHEDMAXTIME: "Reached batch system time limit",
        UNKNOWNPAYLOADFAILURE: "Job failed due to unknown reason (consult log file)",
        FILEEXISTS: "File already exists",
        BADALLOC: "Transform failed due to bad_alloc",
        CHKSUMNOTSUP: "Query checksum is not supported",
        NORELEASEFOUND: "No release candidates found",
        NOUSERTARBALL: "User tarball could not be downloaded from PanDA server",
        BADXML: "Badly formed XML",
        ESRECOVERABLE: "Event service: recoverable error",
        ESFATAL: "Event service: fatal error",
        EXECUTEDCLONEJOB: "Clone job is already executed",
        PAYLOADEXCEEDMAXMEM: "Payload exceeded maximum allowed memory",
        FAILEDBYSERVER: "Failed by server",
        ESNOEVENTS: "Event service: no events",
        MESSAGEHANDLINGFAILURE: "Failed to handle message from payload",
        NOTIMPLEMENTED: "The class or function is not implemented",
        UNKNOWNEXCEPTION: "An unknown pilot exception has occurred",
        CONVERSIONFAILURE: "Failed to convert object data",
        FILEHANDLINGFAILURE: "Failed during file handling",
        PAYLOADEXECUTIONFAILURE: "Failed to execute payload",
        SINGULARITYGENERALFAILURE: "Singularity: general failure",
        SINGULARITYNOLOOPDEVICES: "Singularity: No more available loop devices",
        SINGULARITYBINDPOINTFAILURE: "Singularity: Not mounting requested bind point",
        SINGULARITYIMAGEMOUNTFAILURE: "Singularity: Failed to mount image",
        SINGULARITYNOTINSTALLED: "Singularity: not installed",
        PAYLOADEXECUTIONEXCEPTION: "Exception caught during payload execution",
        NOTDEFINED: "Not defined",
        NOTSAMELENGTH: "Not same length",
        NOSTORAGEPROTOCOL: "No protocol defined for storage endpoint",
        UNKNOWNCHECKSUMTYPE: "Unknown checksum type",
        UNKNOWNTRFFAILURE: "Unknown transform failure",
        RUCIOSERVICEUNAVAILABLE: "Rucio: Service unavailable",
        EXCEEDEDMAXWAITTIME: "Exceeded maximum waiting time",
        COMMUNICATIONFAILURE: "Failed to communicate with server",
        INTERNALPILOTPROBLEM: "An internal Pilot problem has occurred (consult Pilot log)",
        LOGFILECREATIONFAILURE: "Failed during creation of log file",
        RUCIOLOCATIONFAILED: "Failed to get client location for Rucio",
        RUCIOLISTREPLICASFAILED: "Failed to get replicas from Rucio",
        UNKNOWNCOPYTOOL: "Unknown copy tool",
        SERVICENOTAVAILABLE: "Service not available at the moment",
        NOREPLICAS: "No matching replicas were found in list_replicas() output",
        UNREACHABLENETWORK: "Unable to stage-in file since network is unreachable",
        PAYLOADSIGSEGV: "SIGSEGV: Invalid memory reference or a segmentation fault",
        NONDETERMINISTICDDM: "Failed to construct SURL for non-deterministic ddm (update CRIC)",
        JSONRETRIEVALTIMEOUT: "JSON retrieval timed out",
        MISSINGINPUTFILE: "Input file is missing in storage element",
        BLACKHOLE: "Black hole detected in file system (consult Pilot log)",
        NOREMOTESPACE: "No space left on device",
        SETUPFATAL: "Setup failed with a fatal exception (consult Payload log)",
        MISSINGUSERCODE: "User code not available on PanDA server (resubmit task with --useNewCode)",
        JOBALREADYRUNNING: "Job is already running elsewhere",
        BADMEMORYMONITORJSON: "Memory monitor produced bad output",
        STAGEINAUTHENTICATIONFAILURE: "Authentication failure during stage-in",
        DBRELEASEFAILURE: "Local DBRelease handling failed (consult Pilot log)",
        SINGULARITYNEWUSERNAMESPACE: "Singularity: Failed invoking the NEWUSER namespace runtime",
        BADQUEUECONFIGURATION: "Bad queue configuration detected",
        MIDDLEWAREIMPORTFAILURE: "Failed to import middleware (consult Pilot log)",
        NOOUTPUTINJOBREPORT: "Found no output in job report",
        RESOURCEUNAVAILABLE: "Resource temporarily unavailable",
        SINGULARITYFAILEDUSERNAMESPACE: "Singularity: Failed to create user namespace",
        TRANSFORMNOTFOUND: "Transform not found",
        UNSUPPORTEDSL5OS: "Unsupported SL5 OS",
        SINGULARITYRESOURCEUNAVAILABLE: "Singularity: Resource temporarily unavailable",
        UNRECOGNIZEDTRFARGUMENTS: "Unrecognized transform arguments",
        EMPTYOUTPUTFILE: "Empty output file detected",
        UNRECOGNIZEDTRFSTDERR: "Unrecognized fatal error in transform stderr",
        STATFILEPROBLEM: "Failed to stat proc file for CPU consumption calculation",
        NOSUCHPROCESS: "CPU consumption calculation failed: No such process",
        GENERALCPUCALCPROBLEM: "General CPU consumption calculation problem (consult Pilot log)",
        COREDUMP: "Core dump detected",
        PREPROCESSFAILURE: "Pre-process command failed",
        POSTPROCESSFAILURE: "Post-process command failed",
        MISSINGRELEASEUNPACKED: "Missing release setup in unpacked container",
        PANDAQUEUENOTACTIVE: "PanDA queue is not active",
        IMAGENOTFOUND: "Image not found",
        REMOTEFILECOULDNOTBEOPENED: "Remote file could not be opened",
        XRDCPERROR: "Xrdcp was unable to open file",
        KILLPAYLOAD: "Raythena has decided to kill payload",
        MISSINGCREDENTIALS: "Unable to locate credentials for S3 transfer",
        NOCTYPES: "Python module ctypes not available on worker node",
    }

    # Stage-out related error codes (1135 and 1136 have no named constant in this class)
    put_error_codes = [1135, 1136, 1137, 1141, 1152, 1181]
    # Codes accepted by is_recoverable(): 0 plus all stage-out related codes
    recoverable_error_codes = [0] + put_error_codes

    def get_kill_signal_error_code(self, signal):
        """
        Match a kill signal with a corresponding Pilot error code.

        :param signal: signal name (string), e.g. 'SIGTERM'.
        :return: Pilot error code (integer). KILLSIGNAL is returned for signal names that are not in the lookup table.
        """

        signals_dictionary = {
            'SIGTERM': self.SIGTERM,
            'SIGQUIT': self.SIGQUIT,
            'SIGSEGV': self.SIGSEGV,
            'SIGXCPU': self.SIGXCPU,
            'SIGUSR1': self.SIGUSR1,
            'SIGBUS': self.SIGBUS,
        }

        return signals_dictionary.get(signal, self.KILLSIGNAL)

    def get_error_message(self, errorcode):
        """
        Return the error message corresponding to the given error code.

        :param errorcode: pilot error code (integer).
        :return: errormessage (string). "Unknown error code: <errorcode>" is returned for codes without a message.
        """

        try:
            return self._error_messages[errorcode]
        except (KeyError, TypeError):
            # KeyError: unknown code, TypeError: unhashable value. %s (not %d) so that a non-numeric value
            # is reported instead of triggering a formatting exception
            return "Unknown error code: %s" % (errorcode,)

    def add_error_code(self, errorcode, pilot_error_codes=None, pilot_error_diags=None, priority=False, msg=None):
        """
        Add pilot error code to list of error codes.
        This function adds the given error code to the list of all errors that have occurred. This is needed since
        several errors can happen; e.g. a stage-in error can be followed by a stage-out error during the log transfer.
        The full list of errors is dumped to the log, but only the first error is reported to the server.
        The function also sets the corresponding error message.

        Lists given by the caller are updated in place (and returned). If a list is not given, a new empty list is
        created for the call.

        :param errorcode: pilot error code (integer)
        :param pilot_error_codes: list of pilot error codes (list of integers)
        :param pilot_error_diags: list of pilot error diags (list of strings)
        :param priority: if set to True, the new errorcode will be added to the error code list first (highest priority)
        :param msg: error message (more detailed) to overwrite standard error message (string).
        :return: pilot_error_codes, pilot_error_diags
        """

        # create fresh lists per call; a mutable default would be shared (and grow) between unrelated calls
        if pilot_error_codes is None:
            pilot_error_codes = []
        if pilot_error_diags is None:
            pilot_error_diags = []

        # do not add the error code if it is already in the list
        if errorcode not in pilot_error_codes:
            error_msg = msg if msg else self.get_error_message(errorcode)
            if priority:
                pilot_error_codes.insert(0, errorcode)
                pilot_error_diags.insert(0, error_msg)
            else:
                pilot_error_codes.append(errorcode)
                pilot_error_diags.append(error_msg)

        return pilot_error_codes, pilot_error_diags

    def remove_error_code(self, errorcode, pilot_error_codes=None, pilot_error_diags=None):
        """
        Silently remove an error code and its diagnostics from the given error lists.
        There is no warning or exception thrown in case the error code is not present in the lists.
        Only the first occurrence of the error code is removed.

        :param errorcode: error code (int).
        :param pilot_error_codes: list of pilot error codes (list of integers), updated in place.
        :param pilot_error_diags: list of pilot error diags (list of strings), updated in place.
        :return: pilot_error_codes, pilot_error_diags
        """

        if pilot_error_codes is None:
            pilot_error_codes = []
        if pilot_error_diags is None:
            pilot_error_diags = []

        try:
            index = pilot_error_codes.index(errorcode)
        except ValueError:
            # the error code is not in the list; nothing to remove
            pass
        else:
            # remove the code and the diagnostics stored at the same position
            pilot_error_codes.pop(index)
            # guard against a diagnostics list that is shorter than the code list
            if index < len(pilot_error_diags):
                pilot_error_diags.pop(index)

        return pilot_error_codes, pilot_error_diags

    def report_errors(self, pilot_error_codes, pilot_error_diags):
        """
        Report all errors that occurred during running.
        The function should be called towards the end of running a job.

        :param pilot_error_codes: list of pilot error codes (list of integers)
        :param pilot_error_diags: list of pilot error diags (list of strings)
        :return: error_report (string)
        """

        if not pilot_error_codes:
            return "no pilot errors were reported"

        diags = pilot_error_diags or []
        lines = ["Nr.\tError code\tError diagnostics"]
        for number, errorcode in enumerate(pilot_error_codes, 1):
            # fall back to the standard message if there is no diagnostics entry for this error code
            diag = diags[number - 1] if number <= len(diags) else self.get_error_message(errorcode)
            lines.append("%d.\t%d\t%s" % (number, errorcode, diag))

        return "\n".join(lines)

    def resolve_transform_error(self, exit_code, stderr):
        """
        Assign a pilot error code to a specific transform error.
        An exit code of 0 is returned unchanged, unless stderr reports that Singularity is not installed.

        :param exit_code: transform exit code.
        :param stderr: transform stderr
        :return: pilot error code (int)
        """

        # the substring tests below need a string; treat a missing stderr as empty
        stderr = stderr or ""

        if exit_code == 251 and "Not mounting requested bind point" in stderr:
            exit_code = self.SINGULARITYBINDPOINTFAILURE
        elif exit_code == 255 and "No more available loop devices" in stderr:
            exit_code = self.SINGULARITYNOLOOPDEVICES
        elif exit_code == 255 and "Failed to mount image" in stderr:
            exit_code = self.SINGULARITYIMAGEMOUNTFAILURE
        elif exit_code == 255 and "Operation not permitted" in stderr:
            exit_code = self.SINGULARITYGENERALFAILURE
        elif "Singularity is not installed" in stderr:
            # note: this test does not depend on the exit code
            exit_code = self.SINGULARITYNOTINSTALLED
        elif exit_code == 64 and "cannot create directory" in stderr:
            exit_code = self.MKDIR
        elif exit_code == -1:
            exit_code = self.UNKNOWNTRFFAILURE
        elif exit_code != 0:
            # any other non-zero exit code is reported as a general payload execution failure
            exit_code = self.PAYLOADEXECUTIONFAILURE

        return exit_code

    def extract_stderr_error(self, stderr):
        """
        Extract the ERROR message from the payload stderr.
        The entire stderr is returned if it contains "command not found".

        :param stderr: string.
        :return: string (empty if no error message was found).
        """

        if not stderr:
            return ""

        # a "command not found" message is reported as it is, there is no "ERROR:" prefix to look for
        if "command not found" in stderr:
            return stderr

        return self.get_message_for_pattern([r"ERROR\s*:\s*(.*)", r"Error\s*:\s*(.*)", r"error\s*:\s*(.*)"], stderr)

    def extract_stderr_warning(self, stderr):
        """
        Extract the WARNING message from the payload stderr.

        :param stderr: string.
        :return: string (empty if no warning message was found).
        """

        return self.get_message_for_pattern([r"WARNING\s*:\s*(.*)", r"Warning\s*:\s*(.*)", r"warning\s*:\s*(.*)"], stderr)

    def get_message_for_pattern(self, patterns, stderr):
        """
        Return the first match in stderr of the first pattern that matches.
        The patterns are tried in the given order.

        :param patterns: list of patterns.
        :param stderr: string.
        :return: string (empty if none of the patterns matched).
        """

        msg = ""
        if not stderr:
            return msg

        for pattern in patterns:
            found = re.findall(pattern, stderr)
            if found:
                msg = found[0]
                break

        return msg

    def format_diagnostics(self, code, diag):
        """
        Format the error diagnostics by adding the standard error message to the dynamic error diagnostics.
        The standard error message is always placed first and the diagnostics are truncated so that the formatted
        message does not exceed the maximum message length (256 characters).
        If there is any kind of failure handling the diagnostics string, the diagnostics are returned unformatted.

        :param code: standard error code (int).
        :param diag: dynamic error diagnostics (string).
        :return: formatted error diagnostics (string).
        """

        max_message_length = 256
        try:
            standard_message = self._error_messages[code] + ":"
        except (KeyError, TypeError):
            # unknown (or unhashable) error code; there is no standard message to add
            standard_message = ""

        # treat missing diagnostics as empty so that only the standard message is returned
        if diag is None:
            diag = ""

        # extract the relevant info for reporting exceptions
        if "Traceback" in diag:
            found = re.findall('details:(.+)', diag)
            if found:
                diag = found[0]
                # strip the exception class wrappers and squeeze repeated spaces
                diag = re.sub(r'\[?PilotException\(\"?\'?', r'', diag)
                diag = re.sub(r'\[?StageInFailure\(\"?\'?', r'', diag)
                diag = re.sub(r'\[?StageOutFailure\(\"?\'?', r'', diag)
                diag = re.sub(' +', ' ', diag)

        try:
            if diag:
                if standard_message in diag:
                    # the standard message is already part of the diagnostics; put it first and keep the tail
                    if len(diag) > max_message_length:
                        error_message = standard_message + diag[-(max_message_length - len(standard_message)):]
                    else:
                        error_message = standard_message + diag[len(standard_message):][-max_message_length:]
                else:
                    if len(diag) + len(standard_message) > max_message_length:
                        # leave room for the standard message so the total stays within max_message_length
                        error_message = standard_message + diag[:(max_message_length - len(standard_message))]
                    else:
                        error_message = standard_message + diag

                # collapse repeated colons, e.g. "message::details" -> "message:details"
                if '::' in error_message:
                    error_message = re.sub(':+', ':', error_message)

            else:
                error_message = standard_message
        except Exception:
            # deliberate best-effort: hand back the diagnostics as they are
            error_message = diag

        return error_message

    @classmethod
    def is_recoverable(cls, code=0):
        """
        Determine whether code is a recoverable error code or not.

        :param code: Pilot error code (int).
        :return: boolean.
        """

        return code in cls.recoverable_error_codes