Back to home page

EIC code displayed by LXR

 
 

    


Warning, file /pilot2/pilot/user/atlas/diagnose.py was not indexed or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).

0001 #!/usr/bin/env python
0002 # Licensed under the Apache License, Version 2.0 (the "License");
0003 # you may not use this file except in compliance with the License.
0004 # You may obtain a copy of the License at
0005 # http://www.apache.org/licenses/LICENSE-2.0
0006 #
0007 # Authors:
0008 # - Paul Nilsson, paul.nilsson@cern.ch, 2018-2020
0009 
0010 import json
0011 import os
0012 import re
0013 import logging
0014 from glob import glob
0015 
0016 from pilot.common.errorcodes import ErrorCodes
0017 from pilot.common.exception import PilotException, BadXML
0018 from pilot.util.config import config
0019 from pilot.util.filehandling import get_guid, tail, grep, open_file, read_file, scan_file  #, write_file
0020 from pilot.util.math import convert_mb_to_b
0021 from pilot.util.workernode import get_local_disk_space
0022 
0023 from .common import update_job_data, parse_jobreport_data
0024 from .metadata import get_metadata_from_xml, get_total_number_of_events, get_guid_from_xml
0025 
0026 logger = logging.getLogger(__name__)
0027 errors = ErrorCodes()
0028 
0029 
def interpret(job):
    """
    Interpret the payload and look for specific errors in the stdout.

    :param job: job object.
    :return: exit code (payload) (int).
    """

    # extract errors from job report first
    process_job_report(job)
    if job.piloterrorcodes:
        metadata_error_only = (
            len(job.piloterrorcodes) == 1 and errors.NOPAYLOADMETADATA in job.piloterrorcodes
        )
        if metadata_error_only and job.transexitcode != 0:
            # ignore metadata error if trf exit code is non-zero
            logger.warning('ignore metadata error for now')
        else:
            logger.warning('aborting payload error diagnosis since an error has already been set: %s', str(job.piloterrorcodes))
            return -1

    exit_code = job.exitcode if job.exitcode != 0 else 0

    # check for special errors
    if exit_code == 146:
        logger.warning('user tarball was not downloaded (payload exit code %d)', exit_code)
        set_error_nousertarball(job)
    elif exit_code == 160:
        # harmless preprocess exit code - reset all exit codes
        logger.info('ignoring harmless preprocess exit code %d', exit_code)
        job.transexitcode = 0
        job.exitcode = 0
        exit_code = 0

    # extract special information, e.g. number of events
    try:
        extract_special_information(job)
    except PilotException as exc:
        logger.error('PilotException caught while extracting special job information: %s', exc)
        exit_code = exc.get_error_code()
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code)

    # interpret the exit info from the payload
    try:
        interpret_payload_exit_info(job)
    except Exception as exc:
        logger.warning('exception caught while interpreting payload exit info: %s', exc)

    return exit_code
0078 
0079 
def interpret_payload_exit_info(job):
    """
    Interpret the exit info from the payload.

    The payload stdout/stderr is scanned for known failures; the first match
    sets the corresponding pilot error code (with priority) and ends the
    diagnosis.

    :param job: job object.
    :return:
    """

    # known failures, scanned in order of priority
    checks = (
        (is_out_of_memory, errors.PAYLOADOUTOFMEMORY),        # out of memory errors in the stderr
        (is_installation_error, errors.MISSINGINSTALLATION),  # specific errors in the stdout (tail)
        (is_atlassetup_error, errors.SETUPFATAL),             # did AtlasSetup fail?
        (is_out_of_space, errors.NOLOCALSPACE),               # did the payload run out of space?
        (is_nfssqlite_locking_problem, errors.NFSSQLITE),     # specific errors in the stdout (full)
        (is_user_code_missing, errors.MISSINGUSERCODE),       # is the user tarball missing on the server?
    )

    for check, error_code in checks:
        if check(job):
            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error_code, priority=True)
            if error_code == errors.NOLOCALSPACE:
                # double check local space
                spaceleft = convert_mb_to_b(get_local_disk_space(os.getcwd()))  # B (diskspace is in MB)
                logger.info('verifying local space: %d B', spaceleft)
            return

    # set a general Pilot error code if the payload error could not be identified
    if job.transexitcode == 0 and job.exitcode != 0:
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.UNKNOWNPAYLOADFAILURE, priority=True)
0125 
0126 
def is_out_of_memory(job):
    """
    Did the payload run out of memory?

    Scans the payload stderr for the athena out-of-memory FATAL message and the
    stdout for bad_alloc signatures.

    :param job: job object.
    :return: Boolean. (note: True means the error was found)
    """

    out_of_memory = False

    stdout = os.path.join(job.workdir, config.Payload.payloadstdout)
    stderr = os.path.join(job.workdir, config.Payload.payloadstderr)

    files = {stderr: ["FATAL out of memory: taking the application down"], stdout: ["St9bad_alloc", "std::bad_alloc"]}
    for path in files:
        if os.path.exists(path):
            logger.info('looking for out-of-memory errors in %s', os.path.basename(path))
            if os.path.getsize(path) > 0:
                matched_lines = grep(files[path], path)
                if matched_lines:
                    logger.warning("identified an out of memory error in %s %s:", job.payload, os.path.basename(path))
                    for line in matched_lines:
                        logger.info(line)
                    out_of_memory = True
        else:
            # bug fix: the %s argument (path) was previously missing from this call
            logger.warning('file does not exist: %s (cannot look for out-of-memory error in it)', path)

    return out_of_memory
0155 
0156 
def is_user_code_missing(job):
    """
    Is the user code (tarball) missing on the server?

    :param job: job object.
    :return: Boolean. (note: True means the error was found)
    """

    payload_stdout = os.path.join(job.workdir, config.Payload.payloadstdout)
    error_messages = ["ERROR: unable to fetch source tarball from web"]
    warning = "identified an \'%s\' message in %s" % (error_messages[0], os.path.basename(payload_stdout))

    return scan_file(payload_stdout, error_messages, warning_message=warning)
0171 
0172 
def is_out_of_space(job):
    """
    Did the disk run out of space?

    :param job: job object.
    :return: Boolean. (note: True means the error was found)
    """

    payload_stderr = os.path.join(job.workdir, config.Payload.payloadstderr)
    error_messages = ["No space left on device"]
    warning = "identified a \'%s\' message in %s" % (error_messages[0], os.path.basename(payload_stderr))

    return scan_file(payload_stderr, error_messages, warning_message=warning)
0187 
0188 
def is_installation_error(job):
    """
    Did the payload fail to run due to a faulty/missing installation?

    :param job: job object.
    :return: Boolean. (note: True means the error was found)
    """

    stdout_path = os.path.join(job.workdir, config.Payload.payloadstdout)
    # only look at the beginning of the stdout tail
    snippet = tail(stdout_path)[:1024]
    if not snippet.startswith("sh:"):
        return False

    return 'setup.sh' in snippet and 'No such file or directory' in snippet
0201 
0202 
def is_atlassetup_error(job):
    """
    Did AtlasSetup fail with a fatal error?

    :param job: job object.
    :return: Boolean. (note: True means the error was found)
    """

    stdout_path = os.path.join(job.workdir, config.Payload.payloadstdout)
    snippet = tail(stdout_path)[:2048]

    return "AtlasSetup(FATAL): Fatal exception" in snippet
0215 
0216 
def is_nfssqlite_locking_problem(job):
    """
    Were there any NFS SQLite locking problems?

    :param job: job object.
    :return: Boolean. (note: True means the error was found)
    """

    stdout_path = os.path.join(job.workdir, config.Payload.payloadstdout)
    error_messages = ["prepare 5 database is locked", "Error SQLiteStatement"]
    warning = "identified an NFS/Sqlite locking problem in %s" % os.path.basename(stdout_path)

    return scan_file(stdout_path, error_messages, warning_message=warning)
0231 
0232 
def extract_special_information(job):
    """
    Extract special information from different sources, such as number of events and data base fields.

    :param job: job object.
    :return:
    """

    # number(s) of processed events are set directly in the relevant job fields
    find_number_of_events(job)

    # get the DB info from the jobReport; a corrupt report must not be fatal here
    try:
        find_db_info(job)
    except Exception as exc:
        logger.warning('detected problem with parsing job report (in find_db_info()): %s', exc)
0249 
0250 
def find_number_of_events(job):
    """
    Locate the number of events.

    The number of processed events is looked up in several sources, in order:
    the job report, the metadata XML and finally any athena summary file(s).

    :param job: job object.
    :return:
    """

    if job.nevents:
        logger.info('number of events already known: %d', job.nevents)
        return

    logger.info('looking for number of processed events (source #1: jobReport.json)')
    find_number_of_events_in_jobreport(job)
    if job.nevents > 0:
        logger.info('found %d processed events', job.nevents)
        return

    logger.info('looking for number of processed events (source #2: metadata.xml)')
    find_number_of_events_in_xml(job)
    if job.nevents > 0:
        logger.info('found %d processed events', job.nevents)
        return

    logger.info('looking for number of processed events (source #3: athena summary file(s)')
    nevents_read, nevents_written = process_athena_summary(job)
    if nevents_read > 0:
        job.nevents = nevents_read
        logger.info('found %d processed (read) events', job.nevents)
    if nevents_written > 0:
        job.neventsw = nevents_written
        logger.info('found %d processed (written) events', job.neventsw)
0283 
0284 
def find_number_of_events_in_jobreport(job):
    """
    Try to find the number of events in the jobReport.json file.

    Sets job.nevents when a valid 'nEvents' entry is present in the parsed
    job report; leaves it untouched otherwise.

    :param job: job object.
    :return:
    """

    try:
        work_attributes = parse_jobreport_data(job.metadata)
    except Exception as exc:
        logger.warning('exception caught while parsing job report: %s', exc)
        return

    # single .get() lookup instead of membership test + get;
    # also catch TypeError since int() of a non-numeric object raised uncaught
    n_events = work_attributes.get('nEvents')
    if n_events:
        try:
            job.nevents = int(n_events)
        except (ValueError, TypeError) as exc:
            logger.warning('failed to convert number of events to int: %s', exc)
0306 
0307 
def find_number_of_events_in_xml(job):
    """
    Try to find the number of events in the metadata.xml file.

    :param job: job object.
    :raises: BadXML exception if metadata cannot be parsed.
    :return:
    """

    try:
        metadata = get_metadata_from_xml(job.workdir)
    except Exception as exc:
        msg = "Exception caught while interpreting XML: %s" % exc
        # chain the original exception so the root cause survives in tracebacks
        raise BadXML(msg) from exc

    if metadata:
        nevents = get_total_number_of_events(metadata)
        if nevents > 0:
            job.nevents = nevents
0327 
0328 
def process_athena_summary(job):
    """
    Try to find the number of events in the Athena summary file.

    :param job: job object.
    :return: number of read events (int), number of written events (int).
    """

    nevents_read = 0
    nevents_written = 0

    # collect all summary files matching any of the known patterns
    file_list = []
    for file_pattern in ('AthSummary*', 'AthenaSummary*'):
        file_list.extend(glob(os.path.join(job.workdir, file_pattern)))

    if not file_list or file_list == ['']:
        logger.info("did not find any athena summary files")
        return nevents_read, nevents_written

    # find the most recent and the oldest files
    recent_summary_file, recent_time, oldest_summary_file, oldest_time = \
        find_most_recent_and_oldest_summary_files(file_list)
    if oldest_summary_file == recent_summary_file:
        logger.info("summary file %s will be processed for errors and number of events",
                    os.path.basename(oldest_summary_file))
    else:
        logger.info("most recent summary file %s (updated at %d) will be processed for errors [to be implemented]",
                    os.path.basename(recent_summary_file), recent_time)
        logger.info("oldest summary file %s (updated at %d) will be processed for number of events",
                    os.path.basename(oldest_summary_file), oldest_time)

    # get the number of events from the oldest summary file
    nevents_read, nevents_written = get_number_of_events_from_summary_file(oldest_summary_file)
    logger.info("number of events: %d (read)", nevents_read)
    logger.info("number of events: %d (written)", nevents_written)

    return nevents_read, nevents_written
0371 
0372 
def find_most_recent_and_oldest_summary_files(file_list):
    """
    Find the most recent and the oldest athena summary files.

    Files whose modification time cannot be read are logged and skipped.

    :param file_list: list of athena summary files (list of strings).
    :return: most recent summary file (string), recent time (int), oldest summary file (string), oldest time (int).
    """

    recent_summary_file = ""
    oldest_summary_file = ""
    recent_time = 0
    oldest_time = 9999999999

    if len(file_list) > 1:
        for summary_file in file_list:
            # get the modification time
            try:
                modification_time = os.path.getmtime(summary_file)
            except OSError as exc:
                logger.warning("could not read modification time of file %s: %s", summary_file, exc)
                continue
            if modification_time > recent_time:
                recent_time = modification_time
                recent_summary_file = summary_file
            if modification_time < oldest_time:
                oldest_time = modification_time
                oldest_summary_file = summary_file
    else:
        # single file: it is both the oldest and the most recent
        oldest_summary_file = file_list[0]
        recent_summary_file = oldest_summary_file
        try:
            oldest_time = os.path.getmtime(oldest_summary_file)
        except OSError as exc:
            logger.warning("could not read modification time of file %s: %s", oldest_summary_file, exc)
        else:
            recent_time = oldest_time

    return recent_summary_file, recent_time, oldest_summary_file, oldest_time
0409 
0410 
def get_number_of_events_from_summary_file(oldest_summary_file):
    """
    Get the number of events from the oldest summary file.

    :param oldest_summary_file: athena summary file (filename, str).
    :return: number of read events (int), number of written events (int).
    """

    nev1 = 0
    nev2 = 0

    _file = open_file(oldest_summary_file, 'r')
    if _file:
        lines = _file.readlines()
        _file.close()

        if lines:
            for line in lines:
                # use re.search + a None guard: the previous re.match(..).group(1)
                # raised an uncaught AttributeError when the marker text was present
                # but the pattern did not match at the start of the line
                if "Events Read:" in line:
                    match = re.search(r'Events Read\: *(\d+)', line)
                    if match:
                        nev1 = int(match.group(1))
                    else:
                        logger.warning('failed to extract number of read events from line: %s', line)
                if "Events Written:" in line:
                    match = re.search(r'Events Written\: *(\d+)', line)
                    if match:
                        nev2 = int(match.group(1))
                    else:
                        logger.warning('failed to extract number of written events from line: %s', line)
                if nev1 > 0 and nev2 > 0:
                    break
        else:
            logger.warning('failed to get number of events from empty summary file')

    # Get the errors from the most recent summary file
    # ...

    return nev1, nev2
0448 
0449 
def find_db_info(job):
    """
    Find the DB info in the jobReport.

    Sets job.dbtime (int) and job.dbdata when present in the parsed report.

    :param job: job object.
    :return:
    """

    work_attributes = parse_jobreport_data(job.metadata)

    if '__db_time' in work_attributes:
        try:
            job.dbtime = int(work_attributes.get('__db_time'))
        except (ValueError, TypeError) as exc:
            logger.warning('failed to convert dbtime to int: %s', exc)
        else:
            # only log on successful conversion (job.dbtime may otherwise be unset)
            logger.info('dbtime (total): %d', job.dbtime)

    if '__db_data' in work_attributes:
        # plain .get() cannot raise ValueError - the old try/except was a no-op;
        # use %s since dbdata is not converted to an int
        job.dbdata = work_attributes.get('__db_data')
        logger.info('dbdata (total): %s', job.dbdata)
0471 
0472 
def set_error_nousertarball(job):
    """
    Set error code for NOUSERTARBALL.

    :param job: job object.
    :return:
    """

    # get the tail of the stdout since it will contain the URL of the user log
    filename = os.path.join(job.workdir, config.Payload.payloadstdout)
    _tail = tail(filename)
    # removed debug leftover that appended a fake URL ('http://someurl.se/path')
    # to the tail - it forced a bogus tarball URL and made the guard always true
    if _tail:
        # try to extract the tarball url from the tail
        tarball_url = extract_tarball_url(_tail)

        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOUSERTARBALL)
        job.piloterrorcode = errors.NOUSERTARBALL
        job.piloterrordiag = "User tarball %s cannot be downloaded from PanDA server" % tarball_url
0492 
0493 
def extract_tarball_url(_tail):
    """
    Extract the tarball URL for missing user code if possible from stdout tail.

    :param _tail: tail of payload stdout (string).
    :return: url (string).
    """

    # nothing to extract if no URL scheme is present at all
    if "https://" not in _tail and "http://" not in _tail:
        return "(source unknown)"

    found = re.findall(r"(https?\:\/\/.+)", _tail)
    return found[0] if found else "(source unknown)"
0511 
0512 
def process_metadata_from_xml(job):
    """
    Extract necessary metadata from XML when job report is not available.

    :param job: job object.
    :return: [updated job object - return not needed].
    """

    # get the metadata from the xml file instead, which must exist for most production transforms
    path = os.path.join(job.workdir, config.Payload.metadata)
    if os.path.exists(path):
        job.metadata = read_file(path)
    elif not job.is_analysis() and job.transformation != 'Archive_tf.py':
        diagnostics = 'metadata does not exist: %s' % path
        logger.warning(diagnostics)
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOPAYLOADMETADATA)
        job.piloterrorcode = errors.NOPAYLOADMETADATA
        job.piloterrordiag = diagnostics

    # add missing guids
    for dat in job.outdata:
        if dat.guid:
            continue
        # try to read the guid from the metadata before the last resort of generating it
        metadata = None
        try:
            metadata = get_metadata_from_xml(job.workdir)
        except Exception as exc:
            msg = "Exception caught while interpreting XML: %s (ignoring it, but guids must now be generated)" % exc
            logger.warning(msg)
        if metadata:
            dat.guid = get_guid_from_xml(metadata, dat.lfn)
            logger.info('read guid for lfn=%s from xml: %s', dat.lfn, dat.guid)
        else:
            dat.guid = get_guid()
            logger.info('generated guid for lfn=%s: %s', dat.lfn, dat.guid)
0549 
0550 
def process_job_report(job):
    """
    Process the job report produced by the payload/transform if it exists.

    Payload error codes and diagnostics, as well as payload metadata (for output files) and stageout type will be
    extracted. The stageout type is either "all" (i.e. stage-out both output and log files) or "log" (i.e. only log file
    will be staged out).
    Note: some fields might be experiment specific. A call to a user function is therefore also done.

    :param job: job object; updated in place (metadata, exitcode, exitmsg and possibly pilot error fields are set).
    :return:
    """

    # get the job report
    path = os.path.join(job.workdir, config.Payload.jobreport)
    if not os.path.exists(path):
        logger.warning('job report does not exist: %s', path)

        # get the metadata from the xml file instead, which must exist for most production transforms
        process_metadata_from_xml(job)
    else:
        with open(path) as data_file:
            # compulsory field; the payload must produce a job report (see config file for file name), attach it to the
            # job object
            job.metadata = json.load(data_file)

            # update job fields from the parsed job report (see update_job_data in .common)
            update_job_data(job)

            # compulsory fields
            try:
                job.exitcode = job.metadata['exitCode']
            except KeyError as exc:
                logger.warning('could not find compulsory payload exitCode in job report: %s (will be set to 0)', exc)
                job.exitcode = 0
            else:
                logger.info('extracted exit code from job report: %d', job.exitcode)
            try:
                job.exitmsg = job.metadata['exitMsg']
            except KeyError as exc:
                logger.warning('could not find compulsory payload exitMsg in job report: %s '
                               '(will be set to empty string)', exc)
                job.exitmsg = ""
            else:
                # assign special payload error code
                if "got a SIGSEGV signal" in job.exitmsg:
                    diagnostics = 'Invalid memory reference or a segmentation fault in payload: %s (job report)' % \
                                  job.exitmsg
                    logger.warning(diagnostics)
                    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PAYLOADSIGSEGV)
                    job.piloterrorcode = errors.PAYLOADSIGSEGV
                    job.piloterrordiag = diagnostics
                else:
                    logger.info('extracted exit message from job report: %s', job.exitmsg)
                    # any non-OK message is recorded as an executable error
                    if job.exitmsg != 'OK':
                        job.exeerrordiag = job.exitmsg
                        job.exeerrorcode = job.exitcode

            if job.exitcode != 0:
                # get list with identified errors in job report
                job_report_errors = get_job_report_errors(job.metadata)

                # is it a bad_alloc failure?
                bad_alloc, diagnostics = is_bad_alloc(job_report_errors)
                if bad_alloc:
                    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.BADALLOC)
                    job.piloterrorcode = errors.BADALLOC
                    job.piloterrordiag = diagnostics
0618 
0619 
def get_job_report_errors(job_report_dictionary):
    """
    Extract the error list from the jobReport.json dictionary.
    The returned list is scanned for special errors.

    :param job_report_dictionary: parsed jobReport.json (dict).
    :return: job_report_errors list.
    """

    job_report_errors = []

    if 'reportVersion' in job_report_dictionary:
        logger.info("scanning jobReport (v %s) for error info", job_report_dictionary.get('reportVersion'))
    else:
        logger.warning("jobReport does not have the reportVersion key")

    if 'executor' not in job_report_dictionary:
        logger.warning("jobReport does not have the executor key (aborting)")
        return job_report_errors

    try:
        error_details = job_report_dictionary['executor'][0]['logfileReport']['details']['ERROR']
    except (KeyError, TypeError, IndexError) as exc:
        logger.warning("WARNING: aborting jobReport scan: %s", exc)
        return job_report_errors

    if isinstance(error_details, list):
        job_report_errors = [msg['message'] for msg in error_details]
    else:
        logger.warning("did not get a list object: %s", type(error_details))

    return job_report_errors
0650 
0651 
def is_bad_alloc(job_report_errors):
    """
    Check for bad_alloc errors.

    :param job_report_errors: list with errors extracted from the job report.
    :return: bad_alloc (bool), diagnostics (string).
    """

    # report the first bad_alloc error encountered, if any
    for err in job_report_errors:
        if "bad_alloc" in err:
            logger.warning("encountered a bad_alloc error: %s", err)
            return True, err

    return False, ""
0670 
0671 
def get_log_extracts(job, state):
    """
    Extract special warnings and other info from special logs.
    This function also discovers if the payload had any outbound connections.

    :param job: job object.
    :param state: job state (string).
    :return: log extracts (string).
    """

    logger.info("building log extracts (sent to the server as \'pilotLog\')")

    # did the job have any outbound connections?
    # look for the pandatracerlog.txt file, produced if the user payload attempted any outgoing connections
    extracts = get_panda_tracer_log(job)

    # for failed/holding jobs, add extracts from the pilot log file, but always add it to the pilot log itself
    pilot_log_extracts = get_pilot_log_extracts(job)
    if pilot_log_extracts:
        logger.warning('detected the following tail of warning/fatal messages in the pilot log:\n%s', pilot_log_extracts)
        if state in ('failed', 'holding'):
            extracts += pilot_log_extracts

    # add extracts from payload logs
    # (see buildLogExtracts in Pilot 1)

    return extracts
0699 
0700 
def get_panda_tracer_log(job):
    """
    Return the contents of the PanDA tracer log if it exists.
    This file will contain information about outbound connections.

    :param job: job object.
    :return: log extracts from pandatracerlog.txt (string).
    """

    extracts = ""
    tracerlog = os.path.join(job.workdir, "pandatracerlog.txt")

    if not os.path.exists(tracerlog):
        logger.debug("PanDA tracer log does not exist: %s (ignoring)", tracerlog)
        return extracts

    if os.path.getsize(tracerlog) > 0:
        # only add if file is not empty
        message = "PandaID=%s had outbound connections: " % (job.jobid)
        extracts += message
        message = read_file(tracerlog)
        extracts += message
        logger.warning(message)
    else:
        logger.info("PanDA tracer log (%s) has zero size (no outbound connections detected)", tracerlog)

    return extracts
0727 
0728 
def get_pilot_log_extracts(job):
    """
    Get the extracts from the pilot log (warning/fatal messages, as well as tail of the log itself).

    :param job: job object.
    :return: tail of pilot log (string).
    """

    extracts = ""
    path = os.path.join(job.workdir, config.Pilot.pilotlog)

    if not os.path.exists(path):
        logger.warning('pilot log file does not exist: %s', path)
        return extracts

    # get the last 20 lines of the pilot log in case it contains relevant error information
    _tail = tail(path, nlines=20)
    if _tail != "":
        extracts += "- Log from %s -\n" % config.Pilot.pilotlog
        extracts += _tail

    # grep for fatal/critical errors in the pilot log
    #errormsgs = ["FATAL", "CRITICAL", "ERROR"]
    #matched_lines = grep(errormsgs, path)
    #_extracts = ""
    #if len(matched_lines) > 0:
    #    logger.debug("dumping warning messages from %s:\n", os.path.basename(path))
    #    for line in matched_lines:
    #        _extracts += line + "\n"
    #if _extracts != "":
    #    if config.Pilot.error_log != "":
    #        path = os.path.join(job.workdir, config.Pilot.error_log)
    #        write_file(path, _extracts)
    #    extracts += "\n- Error messages from %s -\n" % config.Pilot.pilotlog
    #    extracts += _extracts

    return extracts