Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-09 07:58:16

0001 #!/usr/bin/python
0002 
0003 """
0004 check iDDS health
0005 """
0006 
0007 import json
0008 import os
0009 import re
0010 import subprocess
0011 import time
0012 
0013 
def check_command(command, check_string):
    """Run *command* and report whether its output matches *check_string*.

    The command line is tokenised with shell-like quoting rules, so a quoted
    argument such as ``-H "Name: value"`` stays one argument.  ``$VAR`` and
    ``${VAR}`` references inside each token are then replaced from the
    environment (unknown variables are left untouched).  No shell is spawned.

    Only the unexpanded command string is printed, so values pulled from the
    environment do not end up in the output of this function.

    :param command: command line to execute, given as a single string.
    :param check_string: regular expression searched for in the command's
        standard output.
    :returns: 100 if the pattern is found; 0 if it is not found or the
        command could not be parsed or started.
    """
    import shlex  # function-scope import keeps this block self-contained

    print("Checking command : {0}".format(command))
    print("For string : {0}".format(check_string))

    try:
        # Expand variables per token, after splitting, so that a value
        # containing whitespace cannot break one argument into several.
        argv = [os.path.expandvars(token) for token in shlex.split(command)]
        if not argv:
            raise ValueError("empty command")
        process = subprocess.Popen(argv, stdout=subprocess.PIPE)
    except (OSError, ValueError) as ex:
        # Unbalanced quotes, empty command, or executable missing / not
        # runnable: treat as "check failed" rather than crashing the probe.
        print("Failed to run command: {0}".format(ex))
        print("String not found, return 0")
        return 0

    # Be lenient with non-ASCII bytes; they must not abort the check.
    output = process.communicate()[0].decode("utf-8", errors="replace")

    if re.search(check_string, output):
        print("Found the string, return 100")
        return 100

    print("String not found, return 0")
    return 0
0031 
0032 
def is_logrotate_running():
    """Tell whether at least one logrotate process is currently listed by ps.

    Counts the lines of ``ps -eo pgid,args`` that contain ``logrotate``
    (lines containing ``grep`` are excluded so the pipeline does not count
    itself).

    :returns: True if the count is >= 1; False otherwise, including when the
        pipeline output cannot be parsed as an integer.
    """
    output = (
        subprocess.Popen(
            "ps -eo pgid,args | grep logrotate | grep -v grep | wc -l",
            stdout=subprocess.PIPE,
            shell=True,
        )
        .communicate()[0]
        .decode("ascii", errors="replace")
    )

    try:
        n_logrotate_processes = int(output.strip())
    except ValueError:
        print(
            "The string has an unexpected format and couldn't be converted to an integer."
        )
        # Without a usable count we cannot claim logrotate is running.
        # Returning here also avoids using an unbound variable below.
        return False

    # logrotate process found
    if n_logrotate_processes >= 1:
        print("Logrotate is running")
        return True

    return False
0059 
0060 
def is_restarting():
    """Tell whether an http restart process is currently listed by ps.

    Counts the lines of ``ps -eo pgid,args`` that contain both ``restart``
    and ``http`` (lines containing ``grep`` are excluded so the pipeline does
    not count itself).

    :returns: True if the count is >= 1; False otherwise, including when the
        pipeline output cannot be parsed as an integer.
    """
    output = (
        subprocess.Popen(
            "ps -eo pgid,args | grep restart|grep http | grep -v grep | wc -l",
            stdout=subprocess.PIPE,
            shell=True,
        )
        .communicate()[0]
        .decode("ascii", errors="replace")
    )

    try:
        n_restarting_processes = int(output.strip())
    except ValueError:
        print(
            "The string has an unexpected format and couldn't be converted to an integer."
        )
        # Without a usable count we cannot claim a restart is in progress.
        # Returning here also avoids using an unbound variable below.
        return False

    # restart process found
    if n_restarting_processes >= 1:
        print("http is restarting")
        return True

    return False
0087 
0088 
def http_availability(host):
    """Probe the iDDS REST ping endpoint on *host* and rate its availability.

    Up to two curl probes are issued against ``https://<host>:8443/idds/ping``:

    1. An authenticated one, if credentials are configured in the
       environment: an X509 proxy (``X509_USER_PROXY``) is preferred,
       otherwise the ``PANDA_AUTH`` / ``PANDA_AUTH_VO`` /
       ``PANDA_AUTH_ID_TOKEN`` triple.  It passes when the reply contains
       ``"Status": "OK"``.
    2. If that did not pass (or no credentials are set), an anonymous one
       that passes when the reply contains ``IDDSException``.
       NOTE(review): presumably such a reply shows the server is up but
       refusing the unauthenticated call - confirm.

    :param host: host name inserted into the ping URL.
    :returns: the last probe's result (100 or 0 as returned by
        ``check_command``), or 1 when both probes failed while log rotation
        is running and http is restarting.
    """
    # NOTE(review): the $VAR / ${VAR} placeholders below are only replaced
    # if check_command substitutes environment variables - confirm.
    ping_url = "https://%s:8443/idds/ping" % host
    env = os.environ
    avail = 0

    if env.get('X509_USER_PROXY', None):
        proxy_cmd = (
            "curl -i -k --cert $X509_USER_PROXY --key $X509_USER_PROXY"
            " --cacert $X509_USER_PROXY " + ping_url
        )
        avail = check_command(proxy_cmd, '"Status": "OK"')
        print("http check availability (with proxy): %s" % avail)
    elif all(
        env.get(name, None)
        for name in ('PANDA_AUTH', 'PANDA_AUTH_VO', 'PANDA_AUTH_ID_TOKEN')
    ):
        token_cmd = (
            "curl -i -k"
            " -H \"X-IDDS-Auth-Type: ${PANDA_AUTH}\""
            " -H \"X-IDDS-Auth-VO: ${PANDA_AUTH_VO}\""
            " -H \"X-Idds-Auth-Token: ${PANDA_AUTH_ID_TOKEN}\" " + ping_url
        )
        avail = check_command(token_cmd, '"Status": "OK"')
        print("http check availability (with oidc token): %s" % avail)

    if not avail:
        # Fall back to an anonymous request.
        anonymous_cmd = "curl -i -k " + ping_url
        avail = check_command(anonymous_cmd, 'IDDSException')
        print("http check availability (without proxy): %s" % avail)

    if avail:
        return avail

    # Both helpers are always invoked, even if the first one is False.
    rotating = is_logrotate_running()
    http_restarting = is_restarting()
    if rotating and http_restarting:
        print("log rotation is running and http is restarting")
        return 1
    return avail
0112 
0113 
def process_availability():
    """Rate whether an iDDS agent process shows up in the process table.

    Lists processes with ``ps -eo pgid,args`` and keeps the lines that
    mention ``idds/agents/main.py`` (lines containing ``grep`` are dropped).

    :returns: 100 if at least one non-blank line remains, otherwise 0.
    """
    listing_cmd = "ps -eo pgid,args | grep 'idds/agents/main.py' | grep -v grep | uniq"
    proc = subprocess.Popen(listing_cmd, stdout=subprocess.PIPE, shell=True)
    raw_stdout = proc.communicate()[0]
    listing = raw_stdout.decode("ascii")

    # Any line that is not pure whitespace counts as a running agent.
    agent_found = any(row.strip() != "" for row in listing.split("\n"))
    process_avail = 100 if agent_found else 0

    print("agent process check availability: %s" % process_avail)
    return process_avail
0137 
0138 
def heartbeat_availability(log_location, max_age=1800):
    """Rate iDDS agent health from the ``idds_availability`` heartbeat file.

    The file is read from *log_location* and is expected to hold a JSON
    object mapping each agent name to a dict with the integer keys
    ``num_hang_workers`` and ``num_active_workers``.

    :param log_location: directory containing the ``idds_availability`` file.
    :param max_age: maximum age of the file in seconds before it is
        considered stale (default 1800).
    :returns: tuple ``(avail, hang_workers)``.  ``avail`` is

        * 0 if the file is missing or older than *max_age*;
        * 50 if the file cannot be read or parsed;
        * otherwise the lowest per-agent percentage of workers that are not
          hanging (100 when no agent reports hanging workers).

        ``hang_workers`` is the sum of hanging workers over all agents
        processed.
    """
    avail = 100
    hang_workers = 0
    heartbeat_file = os.path.join(log_location, 'idds_availability')

    try:
        mod_time = os.path.getmtime(heartbeat_file)
    except OSError:
        # Missing file (or one that vanished between check and use).
        avail = 0
        print("idds_heartbeat at %s not exist, avail: %s" % (heartbeat_file, avail))
        return avail, hang_workers

    now = time.time()
    print("idds_heartbeat updated at %s (currently is %s, %s seconds ago)" % (mod_time, now, now - mod_time))
    if mod_time < now - max_age:
        avail = 0
        print("idds_heartbeat is older than %s seconds, avail: %s" % (max_age, avail))
        return avail, hang_workers

    try:
        with open(heartbeat_file, 'r') as f:
            agents = json.load(f)
        for agent, info in agents.items():
            num_hang_workers = info['num_hang_workers']
            num_active_workers = info['num_active_workers']
            if num_active_workers > 0 and num_hang_workers > 0:
                hang_workers += num_hang_workers
                # Availability is the share of workers that are NOT hanging.
                # Clamp at 0 in case more hanging than active are reported.
                healthy_workers = max(num_active_workers - num_hang_workers, 0)
                agent_avail = int(healthy_workers * 100 / num_active_workers)
                avail = min(avail, agent_avail)
                print("iDDS agent %s has %s hang workers" % (agent, num_hang_workers))
    except Exception as ex:
        # Deliberately broad: a malformed heartbeat must degrade the score,
        # not crash the health probe.
        print("Failed to parse idds_heartbeat: %s" % str(ex))
        avail = 50

    return avail, hang_workers
0172 
0173 
def idds_availability(host, log_location):
    """Combine the http, agent-process and heartbeat checks into one verdict.

    The three checks are always run, in that order.  The first failing one
    determines the result:

    * http check falsy                -> 0
    * agent process check falsy       -> 50
    * heartbeat check falsy           -> 50
    * heartbeat check below 100       -> that heartbeat value
    * otherwise                       -> the heartbeat value (100)

    :param host: host passed to ``http_availability``.
    :param log_location: directory passed to ``heartbeat_availability``.
    :returns: tuple ``(availability, avail_info, infos)`` where *avail_info*
        is a human-readable summary and *infos* is a dict holding
        ``num_hang_workers``.
    """
    http_avail = http_availability(host)
    print(f"http avail: {http_avail}")

    process_avail = process_availability()
    print(f"agent daemon avail: {process_avail}")

    heartbeat_avail, hang_workers = heartbeat_availability(log_location)
    print(f"heartbeat avail: {heartbeat_avail}, hang workers: {hang_workers}")

    details = {'num_hang_workers': hang_workers}

    if not http_avail:
        score, summary = 0, "iDDS http rest service is not running"
    elif not process_avail:
        score, summary = 50, "iDDS agents are not running"
    elif not heartbeat_avail:
        score, summary = 50, "iDDS agents are running. However heartbeat file is not found (or not renewed)"
    elif heartbeat_avail < 100:
        score, summary = heartbeat_avail, "iDDS agents are running. However there are hanging workers"
    else:
        score, summary = heartbeat_avail, "iDDS is OK"

    print("availability: %s, avail_info: %s, infos: %s" % (score, summary, details))

    return score, summary, details
0206 
0207 
def main():
    """Run the health check for localhost and maintain the health marker.

    When the overall availability is at least 100, the file
    ``/var/log/idds/idds_health`` is (re)written with the text ``OK``.
    Otherwise the file is removed if it exists.
    """
    log_location = '/var/log/idds'
    avail, avail_info, infos = idds_availability('localhost', log_location)

    health_file = os.path.join(log_location, 'idds_health')

    if avail < 100:
        # Not fully healthy: make sure no stale marker is left behind.
        if os.path.exists(health_file):
            os.remove(health_file)
        return

    with open(health_file, 'w') as marker:
        marker.write('OK')
0220 
0221 
# Run the health check only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()