Back to home page

EIC code displayed by LXR

 
 

    


Warning, file /pilot2/pilot/user/atlas/diagnose.py was not indexed or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).

0001 #!/usr/bin/env python
0002 # Licensed under the Apache License, Version 2.0 (the "License");
0003 # you may not use this file except in compliance with the License.
0004 # You may obtain a copy of the License at
0005 # http://www.apache.org/licenses/LICENSE-2.0
0006 #
0007 # Authors:
0008 # - Paul Nilsson, paul.nilsson@cern.ch, 2018-2020
0009 
0010 import json
0011 import os
0012 import re
0013 import logging
0014 from glob import glob
0015 
0016 from pilot.common.errorcodes import ErrorCodes
0017 from pilot.common.exception import PilotException, BadXML
0018 from pilot.util.config import config
0019 from pilot.util.filehandling import get_guid, tail, grep, open_file, read_file, scan_file  #, write_file
0020 from pilot.util.math import convert_mb_to_b
0021 from pilot.util.workernode import get_local_disk_space
0022 
0023 from .common import update_job_data, parse_jobreport_data
0024 from .metadata import get_metadata_from_xml, get_total_number_of_events, get_guid_from_xml
0025 
0026 logger = logging.getLogger(__name__)
0027 errors = ErrorCodes()
0028 
0029 
def interpret(job):
    """
    Interpret the payload and look for specific errors in the stdout.

    :param job: job object.
    :return: exit code (payload) (int).
    """

    # extract errors from job report first
    process_job_report(job)
    if job.piloterrorcodes:
        metadata_error_only = (
            len(job.piloterrorcodes) == 1 and errors.NOPAYLOADMETADATA in job.piloterrorcodes
        )
        if metadata_error_only and job.transexitcode != 0:
            # ignore metadata error if trf exit code is non-zero
            logger.warning('ignore metadata error for now')
        else:
            logger.warning('aborting payload error diagnosis since an error has already been set: %s', str(job.piloterrorcodes))
            return -1

    exit_code = job.exitcode if job.exitcode != 0 else 0

    # check for special errors
    if exit_code == 146:
        logger.warning('user tarball was not downloaded (payload exit code %d)', exit_code)
        set_error_nousertarball(job)
    elif exit_code == 160:
        # harmless preprocess exit code - reset all exit codes
        logger.info('ignoring harmless preprocess exit code %d', exit_code)
        job.transexitcode = 0
        job.exitcode = 0
        exit_code = 0

    # extract special information, e.g. number of events
    try:
        extract_special_information(job)
    except PilotException as exc:
        logger.error('PilotException caught while extracting special job information: %s', exc)
        exit_code = exc.get_error_code()
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code)

    # interpret the exit info from the payload
    try:
        interpret_payload_exit_info(job)
    except Exception as exc:
        logger.warning('exception caught while interpreting payload exit info: %s', exc)

    return exit_code
0078 
0079 
def interpret_payload_exit_info(job):
    """
    Interpret the exit info from the payload.

    The payload stdout/stderr is scanned for known failures; the first match
    sets the corresponding pilot error code (with priority) and ends the
    diagnosis.

    :param job: job object.
    :return:
    """

    # known failures, scanned in order of priority
    checks = (
        (is_out_of_memory, errors.PAYLOADOUTOFMEMORY),        # out of memory errors in the stderr
        (is_installation_error, errors.MISSINGINSTALLATION),  # specific errors in the stdout (tail)
        (is_atlassetup_error, errors.SETUPFATAL),             # did AtlasSetup fail?
        (is_out_of_space, errors.NOLOCALSPACE),               # did the payload run out of space?
        (is_nfssqlite_locking_problem, errors.NFSSQLITE),     # specific errors in the stdout (full)
        (is_user_code_missing, errors.MISSINGUSERCODE),       # is the user tarball missing on the server?
    )

    for check, error_code in checks:
        if check(job):
            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error_code, priority=True)
            if error_code == errors.NOLOCALSPACE:
                # double check local space
                spaceleft = convert_mb_to_b(get_local_disk_space(os.getcwd()))  # B (diskspace is in MB)
                logger.info('verifying local space: %d B', spaceleft)
            return

    # set a general Pilot error code if the payload error could not be identified
    if job.transexitcode == 0 and job.exitcode != 0:
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.UNKNOWNPAYLOADFAILURE, priority=True)
0125 
0126 
def is_out_of_memory(job):
    """
    Did the payload run out of memory?

    Scans the payload stderr for the athena out-of-memory FATAL message and the
    stdout for bad_alloc signatures.

    :param job: job object.
    :return: Boolean. (note: True means the error was found)
    """

    out_of_memory = False

    stdout = os.path.join(job.workdir, config.Payload.payloadstdout)
    stderr = os.path.join(job.workdir, config.Payload.payloadstderr)

    files = {stderr: ["FATAL out of memory: taking the application down"], stdout: ["St9bad_alloc", "std::bad_alloc"]}
    for path in files:
        if os.path.exists(path):
            logger.info('looking for out-of-memory errors in %s', os.path.basename(path))
            if os.path.getsize(path) > 0:
                matched_lines = grep(files[path], path)
                if matched_lines:
                    logger.warning("identified an out of memory error in %s %s:", job.payload, os.path.basename(path))
                    for line in matched_lines:
                        logger.info(line)
                    out_of_memory = True
        else:
            # bug fix: the %s argument (path) was previously missing from this call
            logger.warning('file does not exist: %s (cannot look for out-of-memory error in it)', path)

    return out_of_memory
0155 
0156 
def is_user_code_missing(job):
    """
    Is the user code (tarball) missing on the server?

    :param job: job object.
    :return: Boolean. (note: True means the error was found)
    """

    payload_stdout = os.path.join(job.workdir, config.Payload.payloadstdout)
    error_messages = ["ERROR: unable to fetch source tarball from web"]
    warning = "identified an \'%s\' message in %s" % (error_messages[0], os.path.basename(payload_stdout))

    return scan_file(payload_stdout, error_messages, warning_message=warning)
0171 
0172 
def is_out_of_space(job):
    """
    Did the disk run out of space?

    :param job: job object.
    :return: Boolean. (note: True means the error was found)
    """

    payload_stderr = os.path.join(job.workdir, config.Payload.payloadstderr)
    error_messages = ["No space left on device"]
    warning = "identified a \'%s\' message in %s" % (error_messages[0], os.path.basename(payload_stderr))

    return scan_file(payload_stderr, error_messages, warning_message=warning)
0187 
0188 
def is_installation_error(job):
    """
    Did the payload fail to run due to a faulty/missing installation?

    :param job: job object.
    :return: Boolean. (note: True means the error was found)
    """

    stdout_path = os.path.join(job.workdir, config.Payload.payloadstdout)
    # only look at the beginning of the stdout tail
    snippet = tail(stdout_path)[:1024]
    if not snippet.startswith("sh:"):
        return False

    return 'setup.sh' in snippet and 'No such file or directory' in snippet
0201 
0202 
def is_atlassetup_error(job):
    """
    Did AtlasSetup fail with a fatal error?

    :param job: job object.
    :return: Boolean. (note: True means the error was found)
    """

    stdout_path = os.path.join(job.workdir, config.Payload.payloadstdout)
    snippet = tail(stdout_path)[:2048]

    return "AtlasSetup(FATAL): Fatal exception" in snippet
0215 
0216 
def is_nfssqlite_locking_problem(job):
    """
    Were there any NFS SQLite locking problems?

    :param job: job object.
    :return: Boolean. (note: True means the error was found)
    """

    stdout_path = os.path.join(job.workdir, config.Payload.payloadstdout)
    error_messages = ["prepare 5 database is locked", "Error SQLiteStatement"]
    warning = "identified an NFS/Sqlite locking problem in %s" % os.path.basename(stdout_path)

    return scan_file(stdout_path, error_messages, warning_message=warning)
0231 
0232 
def extract_special_information(job):
    """
    Extract special information from different sources, such as number of events and data base fields.

    :param job: job object.
    :return:
    """

    # number(s) of processed events are set directly in the relevant job fields
    find_number_of_events(job)

    # get the DB info from the jobReport; a corrupt report must not be fatal here
    try:
        find_db_info(job)
    except Exception as exc:
        logger.warning('detected problem with parsing job report (in find_db_info()): %s', exc)
0249 
0250 
def find_number_of_events(job):
    """
    Locate the number of events.

    The number of processed events is looked up in several sources, in order:
    the job report, the metadata XML and finally any athena summary file(s).

    :param job: job object.
    :return:
    """

    if job.nevents:
        logger.info('number of events already known: %d', job.nevents)
        return

    logger.info('looking for number of processed events (source #1: jobReport.json)')
    find_number_of_events_in_jobreport(job)
    if job.nevents > 0:
        logger.info('found %d processed events', job.nevents)
        return

    logger.info('looking for number of processed events (source #2: metadata.xml)')
    find_number_of_events_in_xml(job)
    if job.nevents > 0:
        logger.info('found %d processed events', job.nevents)
        return

    logger.info('looking for number of processed events (source #3: athena summary file(s)')
    nevents_read, nevents_written = process_athena_summary(job)
    if nevents_read > 0:
        job.nevents = nevents_read
        logger.info('found %d processed (read) events', job.nevents)
    if nevents_written > 0:
        job.neventsw = nevents_written
        logger.info('found %d processed (written) events', job.neventsw)
0283 
0284 
def find_number_of_events_in_jobreport(job):
    """
    Try to find the number of events in the jobReport.json file.

    Sets job.nevents when a valid 'nEvents' entry is present in the parsed
    job report; leaves it untouched otherwise.

    :param job: job object.
    :return:
    """

    try:
        work_attributes = parse_jobreport_data(job.metadata)
    except Exception as exc:
        logger.warning('exception caught while parsing job report: %s', exc)
        return

    # single .get() lookup instead of membership test + get;
    # also catch TypeError since int() of a non-numeric object raised uncaught
    n_events = work_attributes.get('nEvents')
    if n_events:
        try:
            job.nevents = int(n_events)
        except (ValueError, TypeError) as exc:
            logger.warning('failed to convert number of events to int: %s', exc)
0306 
0307 
def find_number_of_events_in_xml(job):
    """
    Try to find the number of events in the metadata.xml file.

    :param job: job object.
    :raises: BadXML exception if metadata cannot be parsed.
    :return:
    """

    try:
        metadata = get_metadata_from_xml(job.workdir)
    except Exception as exc:
        msg = "Exception caught while interpreting XML: %s" % exc
        # chain the original exception so the root cause survives in tracebacks
        raise BadXML(msg) from exc

    if metadata:
        nevents = get_total_number_of_events(metadata)
        if nevents > 0:
            job.nevents = nevents
0327 
0328 
def process_athena_summary(job):
    """
    Try to find the number of events in the Athena summary file.

    :param job: job object.
    :return: number of read events (int), number of written events (int).
    """

    nevents_read = 0
    nevents_written = 0

    # collect all summary files matching any of the known patterns
    file_list = []
    for file_pattern in ('AthSummary*', 'AthenaSummary*'):
        file_list.extend(glob(os.path.join(job.workdir, file_pattern)))

    if not file_list or file_list == ['']:
        logger.info("did not find any athena summary files")
        return nevents_read, nevents_written

    # find the most recent and the oldest files
    recent_summary_file, recent_time, oldest_summary_file, oldest_time = \
        find_most_recent_and_oldest_summary_files(file_list)
    if oldest_summary_file == recent_summary_file:
        logger.info("summary file %s will be processed for errors and number of events",
                    os.path.basename(oldest_summary_file))
    else:
        logger.info("most recent summary file %s (updated at %d) will be processed for errors [to be implemented]",
                    os.path.basename(recent_summary_file), recent_time)
        logger.info("oldest summary file %s (updated at %d) will be processed for number of events",
                    os.path.basename(oldest_summary_file), oldest_time)

    # get the number of events from the oldest summary file
    nevents_read, nevents_written = get_number_of_events_from_summary_file(oldest_summary_file)
    logger.info("number of events: %d (read)", nevents_read)
    logger.info("number of events: %d (written)", nevents_written)

    return nevents_read, nevents_written
0371 
0372 
def find_most_recent_and_oldest_summary_files(file_list):
    """
    Find the most recent and the oldest athena summary files.

    Files whose modification time cannot be read are logged and skipped.

    :param file_list: list of athena summary files (list of strings).
    :return: most recent summary file (string), recent time (int), oldest summary file (string), oldest time (int).
    """

    recent_summary_file = ""
    oldest_summary_file = ""
    recent_time = 0
    oldest_time = 9999999999

    if len(file_list) > 1:
        for summary_file in file_list:
            # get the modification time
            try:
                modification_time = os.path.getmtime(summary_file)
            except OSError as exc:
                logger.warning("could not read modification time of file %s: %s", summary_file, exc)
                continue
            if modification_time > recent_time:
                recent_time = modification_time
                recent_summary_file = summary_file
            if modification_time < oldest_time:
                oldest_time = modification_time
                oldest_summary_file = summary_file
    else:
        # single file: it is both the oldest and the most recent
        oldest_summary_file = file_list[0]
        recent_summary_file = oldest_summary_file
        try:
            oldest_time = os.path.getmtime(oldest_summary_file)
        except OSError as exc:
            logger.warning("could not read modification time of file %s: %s", oldest_summary_file, exc)
        else:
            recent_time = oldest_time

    return recent_summary_file, recent_time, oldest_summary_file, oldest_time
0409 
0410 
def get_number_of_events_from_summary_file(oldest_summary_file):
    """
    Get the number of events from the oldest summary file.

    :param oldest_summary_file: athena summary file (filename, str).
    :return: number of read events (int), number of written events (int).
    """

    nev1 = 0
    nev2 = 0

    _file = open_file(oldest_summary_file, 'r')
    if _file:
        lines = _file.readlines()
        _file.close()

        if lines:
            for line in lines:
                # use re.search + a None guard: the previous re.match(..).group(1)
                # raised an uncaught AttributeError when the marker text was present
                # but the pattern did not match at the start of the line
                if "Events Read:" in line:
                    match = re.search(r'Events Read\: *(\d+)', line)
                    if match:
                        nev1 = int(match.group(1))
                    else:
                        logger.warning('failed to extract number of read events from line: %s', line)
                if "Events Written:" in line:
                    match = re.search(r'Events Written\: *(\d+)', line)
                    if match:
                        nev2 = int(match.group(1))
                    else:
                        logger.warning('failed to extract number of written events from line: %s', line)
                if nev1 > 0 and nev2 > 0:
                    break
        else:
            logger.warning('failed to get number of events from empty summary file')

    # Get the errors from the most recent summary file
    # ...

    return nev1, nev2
0448 
0449 
def find_db_info(job):
    """
    Find the DB info in the jobReport.

    Sets job.dbtime (int) and job.dbdata when present in the parsed report.

    :param job: job object.
    :return:
    """

    work_attributes = parse_jobreport_data(job.metadata)

    if '__db_time' in work_attributes:
        try:
            job.dbtime = int(work_attributes.get('__db_time'))
        except (ValueError, TypeError) as exc:
            logger.warning('failed to convert dbtime to int: %s', exc)
        else:
            # only log on successful conversion (job.dbtime may otherwise be unset)
            logger.info('dbtime (total): %d', job.dbtime)

    if '__db_data' in work_attributes:
        # plain .get() cannot raise ValueError - the old try/except was a no-op;
        # use %s since dbdata is not converted to an int
        job.dbdata = work_attributes.get('__db_data')
        logger.info('dbdata (total): %s', job.dbdata)
0471 
0472 
def set_error_nousertarball(job):
    """
    Set error code for NOUSERTARBALL.

    :param job: job object.
    :return:
    """

    # get the tail of the stdout since it will contain the URL of the user log
    filename = os.path.join(job.workdir, config.Payload.payloadstdout)
    _tail = tail(filename)
    # removed debug leftover that appended a fake URL ('http://someurl.se/path')
    # to the tail - it forced a bogus tarball URL and made the guard always true
    if _tail:
        # try to extract the tarball url from the tail
        tarball_url = extract_tarball_url(_tail)

        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOUSERTARBALL)
        job.piloterrorcode = errors.NOUSERTARBALL
        job.piloterrordiag = "User tarball %s cannot be downloaded from PanDA server" % tarball_url
0492 
0493 
def extract_tarball_url(_tail):
    """
    Extract the tarball URL for missing user code if possible from stdout tail.

    :param _tail: tail of payload stdout (string).
    :return: url (string).
    """

    # nothing to extract if no URL scheme is present at all
    if "https://" not in _tail and "http://" not in _tail:
        return "(source unknown)"

    found = re.findall(r"(https?\:\/\/.+)", _tail)
    return found[0] if found else "(source unknown)"
0511 
0512 
def process_metadata_from_xml(job):
    """
    Extract necessary metadata from XML when job report is not available.

    :param job: job object.
    :return: [updated job object - return not needed].
    """

    # get the metadata from the xml file instead, which must exist for most production transforms
    path = os.path.join(job.workdir, config.Payload.metadata)
    if os.path.exists(path):
        job.metadata = read_file(path)
    elif not job.is_analysis() and job.transformation != 'Archive_tf.py':
        diagnostics = 'metadata does not exist: %s' % path
        logger.warning(diagnostics)
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOPAYLOADMETADATA)
        job.piloterrorcode = errors.NOPAYLOADMETADATA
        job.piloterrordiag = diagnostics

    # add missing guids
    for dat in job.outdata:
        if dat.guid:
            continue
        # try to read the guid from the metadata before the last resort of generating it
        metadata = None
        try:
            metadata = get_metadata_from_xml(job.workdir)
        except Exception as exc:
            msg = "Exception caught while interpreting XML: %s (ignoring it, but guids must now be generated)" % exc
            logger.warning(msg)
        if metadata:
            dat.guid = get_guid_from_xml(metadata, dat.lfn)
            logger.info('read guid for lfn=%s from xml: %s', dat.lfn, dat.guid)
        else:
            dat.guid = get_guid()
            logger.info('generated guid for lfn=%s: %s', dat.lfn, dat.guid)
0549 
0550 
def process_job_report(job):
    """
    Process the job report produced by the payload/transform if it exists.

    Payload error codes and diagnostics, as well as payload metadata (for output files) and stageout type will be
    extracted. The stageout type is either "all" (i.e. stage-out both output and log files) or "log" (i.e. only log file
    will be staged out).
    Note: some fields might be experiment specific. A call to a user function is therefore also done.

    :param job: job object; updated in place (metadata, exitcode, exitmsg and possibly pilot error fields are set).
    :return:
    """

    # get the job report
    path = os.path.join(job.workdir, config.Payload.jobreport)
    if not os.path.exists(path):
        logger.warning('job report does not exist: %s', path)

        # get the metadata from the xml file instead, which must exist for most production transforms
        process_metadata_from_xml(job)
    else:
        with open(path) as data_file:
            # compulsory field; the payload must produce a job report (see config file for file name), attach it to the
            # job object
            job.metadata = json.load(data_file)

            # update job fields from the parsed job report (see update_job_data in .common)
            update_job_data(job)

            # compulsory fields
            try:
                job.exitcode = job.metadata['exitCode']
            except KeyError as exc:
                logger.warning('could not find compulsory payload exitCode in job report: %s (will be set to 0)', exc)
                job.exitcode = 0
            else:
                logger.info('extracted exit code from job report: %d', job.exitcode)
            try:
                job.exitmsg = job.metadata['exitMsg']
            except KeyError as exc:
                logger.warning('could not find compulsory payload exitMsg in job report: %s '
                               '(will be set to empty string)', exc)
                job.exitmsg = ""
            else:
                # assign special payload error code
                if "got a SIGSEGV signal" in job.exitmsg:
                    diagnostics = 'Invalid memory reference or a segmentation fault in payload: %s (job report)' % \
                                  job.exitmsg
                    logger.warning(diagnostics)
                    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PAYLOADSIGSEGV)
                    job.piloterrorcode = errors.PAYLOADSIGSEGV
                    job.piloterrordiag = diagnostics
                else:
                    logger.info('extracted exit message from job report: %s', job.exitmsg)
                    # any non-OK message is recorded as an executable error
                    if job.exitmsg != 'OK':
                        job.exeerrordiag = job.exitmsg
                        job.exeerrorcode = job.exitcode

            if job.exitcode != 0:
                # get list with identified errors in job report
                job_report_errors = get_job_report_errors(job.metadata)

                # is it a bad_alloc failure?
                bad_alloc, diagnostics = is_bad_alloc(job_report_errors)
                if bad_alloc:
                    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.BADALLOC)
                    job.piloterrorcode = errors.BADALLOC
                    job.piloterrordiag = diagnostics
0618 
0619 
def get_job_report_errors(job_report_dictionary):
    """
    Extract the error list from the jobReport.json dictionary.
    The returned list is scanned for special errors.

    :param job_report_dictionary: parsed jobReport.json (dict).
    :return: job_report_errors list.
    """

    job_report_errors = []

    if 'reportVersion' in job_report_dictionary:
        logger.info("scanning jobReport (v %s) for error info", job_report_dictionary.get('reportVersion'))
    else:
        logger.warning("jobReport does not have the reportVersion key")

    if 'executor' not in job_report_dictionary:
        logger.warning("jobReport does not have the executor key (aborting)")
        return job_report_errors

    try:
        error_details = job_report_dictionary['executor'][0]['logfileReport']['details']['ERROR']
    except (KeyError, TypeError, IndexError) as exc:
        logger.warning("WARNING: aborting jobReport scan: %s", exc)
        return job_report_errors

    if isinstance(error_details, list):
        job_report_errors = [msg['message'] for msg in error_details]
    else:
        logger.warning("did not get a list object: %s", type(error_details))

    return job_report_errors
0650 
0651 
def is_bad_alloc(job_report_errors):
    """
    Check for bad_alloc errors.

    :param job_report_errors: list with errors extracted from the job report.
    :return: bad_alloc (bool), diagnostics (string).
    """

    # report the first bad_alloc error encountered, if any
    for err in job_report_errors:
        if "bad_alloc" in err:
            logger.warning("encountered a bad_alloc error: %s", err)
            return True, err

    return False, ""
0670 
0671 
def get_log_extracts(job, state):
    """
    Extract special warnings and other info from special logs.
    This function also discovers if the payload had any outbound connections.

    :param job: job object.
    :param state: job state (string).
    :return: log extracts (string).
    """

    logger.info("building log extracts (sent to the server as \'pilotLog\')")

    # did the job have any outbound connections?
    # look for the pandatracerlog.txt file, produced if the user payload attempted any outgoing connections
    extracts = get_panda_tracer_log(job)

    # for failed/holding jobs, add extracts from the pilot log file, but always add it to the pilot log itself
    pilot_log_extracts = get_pilot_log_extracts(job)
    if pilot_log_extracts:
        logger.warning('detected the following tail of warning/fatal messages in the pilot log:\n%s', pilot_log_extracts)
        if state in ('failed', 'holding'):
            extracts += pilot_log_extracts

    # add extracts from payload logs
    # (see buildLogExtracts in Pilot 1)

    return extracts
0699 
0700 
def get_panda_tracer_log(job):
    """
    Return the contents of the PanDA tracer log if it exists.
    This file will contain information about outbound connections.

    :param job: job object.
    :return: log extracts from pandatracerlog.txt (string).
    """

    extracts = ""
    tracerlog = os.path.join(job.workdir, "pandatracerlog.txt")

    if not os.path.exists(tracerlog):
        logger.debug("PanDA tracer log does not exist: %s (ignoring)", tracerlog)
        return extracts

    if os.path.getsize(tracerlog) > 0:
        # only add if file is not empty
        message = "PandaID=%s had outbound connections: " % (job.jobid)
        extracts += message
        message = read_file(tracerlog)
        extracts += message
        logger.warning(message)
    else:
        logger.info("PanDA tracer log (%s) has zero size (no outbound connections detected)", tracerlog)

    return extracts
0727 
0728 
def get_pilot_log_extracts(job):
    """
    Get the extracts from the pilot log (warning/fatal messages, as well as tail of the log itself).

    :param job: job object.
    :return: tail of pilot log (string).
    """

    extracts = ""
    path = os.path.join(job.workdir, config.Pilot.pilotlog)

    if not os.path.exists(path):
        logger.warning('pilot log file does not exist: %s', path)
        return extracts

    # get the last 20 lines of the pilot log in case it contains relevant error information
    _tail = tail(path, nlines=20)
    if _tail != "":
        extracts += "- Log from %s -\n" % config.Pilot.pilotlog
        extracts += _tail

    # grep for fatal/critical errors in the pilot log
    #errormsgs = ["FATAL", "CRITICAL", "ERROR"]
    #matched_lines = grep(errormsgs, path)
    #_extracts = ""
    #if len(matched_lines) > 0:
    #    logger.debug("dumping warning messages from %s:\n", os.path.basename(path))
    #    for line in matched_lines:
    #        _extracts += line + "\n"
    #if _extracts != "":
    #    if config.Pilot.error_log != "":
    #        path = os.path.join(job.workdir, config.Pilot.error_log)
    #        write_file(path, _extracts)
    #    extracts += "\n- Error messages from %s -\n" % config.Pilot.pilotlog
    #    extracts += _extracts

    return extracts