Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-11 08:41:05

0001 #!/usr/bin/env python
0002 # Licensed under the Apache License, Version 2.0 (the "License");
0003 # you may not use this file except in compliance with the License.
0004 # You may obtain a copy of the License at
0005 # http://www.apache.org/licenses/LICENSE-2.0
0006 #
0007 # Authors:
0008 # - Paul Nilsson, paul.nilsson@cern.ch, 2020-2021
0009 
0010 import os
0011 import logging
0012 
0013 # from .utilities import get_memory_values
0014 from pilot.util.container import execute
0015 
0016 logger = logging.getLogger(__name__)
0017 
0018 
def get_core_count(job):
    """
    Return the core count for the job, preferring ATHENA_PROC_NUMBER when usable.

    On queues whose catchall contains "HPC_HPC" the environment is not consulted;
    an unset (None) job.corecount is only normalised to 0.
    On all other queues, a valid integer in ATHENA_PROC_NUMBER always replaces
    job.corecount. If the variable is missing or cannot be converted to an int,
    job.corecount is left untouched and a warning is logged (no warning is logged
    when the variable is merely missing and job.corecount already has a value).

    Note: the job object is updated in place (job.corecount).

    :param job: job object.
    :return: core count (int), or the unchanged job.corecount if no value could be determined.
    """

    if "HPC_HPC" in job.infosys.queuedata.catchall:
        if job.corecount is None:
            job.corecount = 0
        return job.corecount

    # read the variable once so that the "unset" and "malformed" cases can be told apart
    athena_proc_number = os.environ.get('ATHENA_PROC_NUMBER')
    if athena_proc_number is None:
        if not job.corecount:
            logger.warning("environment variable ATHENA_PROC_NUMBER is not set. corecount is not set")
        return job.corecount

    # Always use the ATHENA_PROC_NUMBER first, if set
    try:
        job.corecount = int(athena_proc_number)
    except ValueError as exc:
        # environment values are always strings, so ValueError is the only conversion failure
        if job.corecount:
            logger.warning("ATHENA_PROC_NUMBER is not properly set: %s (will use existing job.corecount value)", exc)
        else:
            logger.warning("ATHENA_PROC_NUMBER is not properly set: %s (corecount is not set)", exc)

    return job.corecount
0045 
0046 
def add_core_count(corecount, core_counts=[]):
    """
    Append a core count measurement to a list of measurements and return that list.

    The list is modified in place. Note that the default list is created once, when
    the function is defined, so every call that omits core_counts appends to (and
    returns) the same shared list. set_core_counts() calls this function without
    core_counts and therefore depends on that accumulation - do not replace the
    default without updating that caller.

    :param corecount: current actual core count (int).
    :param core_counts: list of core counts (list); None is treated as a new empty list.
    :return: updated list of core counts (list).
    """

    # protection against an explicit None
    measurements = [] if core_counts is None else core_counts
    measurements.append(corecount)

    return measurements
0061 
0062 
def set_core_counts(job):
    """
    Measure and record the number of cores currently used by the payload.

    The measurement is the output of a ps pipeline that selects the (pgid, psr)
    lines belonging to the payload process group and counts them with wc -l.
    On success, the value is stored in job.actualcorecount and appended to the list
    returned by add_core_count(), which is then stored in job.corecounts.
    Nothing is updated if the process group is not set or if the command output
    cannot be converted to an int.

    :param job: job object.
    :return:
    """

    # something like this could be used if prmon also gave info about ncores
    # (change nprocs -> ncores and add ncores to list in utilities module, get_average_summary_dictionary_prmon())

    #summary_dictionary = get_memory_values(job.workdir, name=job.memorymonitor)
    #if summary_dictionary:
    #    if 'nprocs' in summary_dictionary["Other"]:
    #        try:
    #            job.actualcorecount = int(summary_dictionary["Other"]["nprocs"])
    #        except Exception as exc:
    #            logger.warning('exception caught: %s', exc)
    #        else:
    #            job.corecounts = add_core_count(job.actualcorecount)
    #            logger.debug('current core counts list: %s', str(job.corecounts))
    #    else:
    #        logger.debug('summary_dictionary[Other]=%s', summary_dictionary["Other"])
    #else:
    #    logger.debug('no summary_dictionary')

    # without a process group there is nothing to select the payload processes with
    if not job.pgrp:
        logger.debug('payload process group not set - cannot check number of cores used by payload')
        return

    # for debugging
    #cmd = "ps axo pgid,psr,comm,args | grep %d" % job.pgrp
    #exit_code, stdout, stderr = execute(cmd, mute=True)
    #logger.debug('%s:\n%s\n', cmd, stdout)

    # ps axo pgid,psr -> 154628   8 \n 154628   9 \n 1546280 1 ..
    # sort is redundant; uniq removes any duplicate lines; wc -l gives the final count
    # awk is added to get the pgrp list only and then grep -x makes sure that false positives are removed, e.g. 1546280
    pgrp = job.pgrp
    count_cmd = "ps axo pgid,psr | sort | grep %d | uniq | awk '{print $1}' | grep -x %d | wc -l" % (pgrp, pgrp)
    _exit_code, output, _stderr = execute(count_cmd, mute=True)
    logger.debug('%s: %s', count_cmd, output)

    try:
        job.actualcorecount = int(output)
    except ValueError as exc:
        logger.warning('failed to convert number of actual cores to int: %s', exc)
        return

    # NOTE(review): add_core_count() is called without core_counts, so the measurement is
    # appended to that function's shared default list; measurements from every call in this
    # process end up in the same list - confirm that this is intended
    job.corecounts = add_core_count(job.actualcorecount)  #, core_counts=job.corecounts)
    #logger.debug('current core counts list: %s', str(job.corecounts))
    # check suspicious values
    #if job.actualcorecount > 5:
    #    logger.warning('detected large actualcorecount: %d', job.actualcorecount)
    #    cmd = "ps axo pgid,stat,euid,ruid,tty,tpgid,sess,pgrp,ppid,pid,pcpu,comm | sort | uniq | grep %d" % job.pgrp
    #    exit_code, stdout, stderr = execute(cmd, mute=True)
    #    logger.debug('%s (pgrp=%d): %s', cmd, job.pgrp, stdout)