File indexing completed on 2026-04-09 07:58:16
0001
0002
0003 """
0004 check iDDS health
0005 """
0006
0007 import json
0008 import os
0009 import re
0010 import subprocess
0011 import time
0012
0013
def check_command(command, check_string):
    """Run a command and score whether its output matches a pattern.

    The command line is tokenized with shell-like quoting rules and
    ``$VAR`` / ``${VAR}`` references inside each token are replaced from the
    current environment. No shell is started.

    Args:
        command: Command line as a single string.
        check_string: Regular expression searched for (``re.search``) in the
            command's standard output.

    Returns:
        100 if the pattern is found in the output, otherwise 0 (also when
        the command is empty or cannot be started).
    """
    import shlex  # local import keeps this function self-contained

    print("Checking command : {0}".format(command))
    print("For string : {0}".format(check_string))

    # A plain str.split() cuts quoted arguments such as -H "Name: value"
    # into pieces, and without a shell "$VAR" would reach the program
    # literally. Tokenize like a shell and expand variables per token, so
    # that whitespace inside a variable's value never splits an argument.
    try:
        tokens = shlex.split(command)
    except ValueError:
        # Unbalanced quotes: fall back to simple whitespace splitting.
        tokens = command.split()
    argv = [os.path.expandvars(token) for token in tokens]

    if not argv:
        print("String not found, return 0")
        return 0

    try:
        completed = subprocess.run(argv, stdout=subprocess.PIPE, check=False)
    except OSError as ex:
        # E.g. the executable is not installed: report "not available"
        # instead of aborting the whole health check.
        print("Failed to run command: {0}".format(ex))
        print("String not found, return 0")
        return 0

    # Never fail on unexpected bytes in the output; only the pattern
    # match matters.
    output = completed.stdout.decode("utf-8", errors="replace")

    if re.search(check_string, output):
        print("Found the string, return 100")
        return 100

    print("String not found, return 0")
    return 0
0031
0032
def is_logrotate_running():
    """Report whether a logrotate process is currently listed by ``ps``.

    Counts the lines of ``ps -eo pgid,args`` that mention "logrotate",
    excluding lines that mention "grep".

    Returns:
        True if at least one such line is found. False otherwise, including
        when the count produced by the pipeline cannot be parsed.
    """
    completed = subprocess.run(
        "ps -eo pgid,args | grep logrotate | grep -v grep | wc -l",
        stdout=subprocess.PIPE,
        shell=True,
        check=False,
    )
    raw_count = completed.stdout.decode("ascii", errors="replace").strip()

    try:
        n_logrotate_processes = int(raw_count)
    except ValueError:
        print(
            "The string has an unexpected format and couldn't be converted to an integer."
        )
        # No usable count: report "not running" rather than continuing
        # with an unbound variable.
        return False

    if n_logrotate_processes >= 1:
        print("Logrotate is running")
        return True

    return False
0059
0060
def is_restarting():
    """Report whether an http restart process is currently listed by ``ps``.

    Counts the lines of ``ps -eo pgid,args`` that mention both "restart"
    and "http", excluding lines that mention "grep".

    Returns:
        True if at least one such line is found. False otherwise, including
        when the count produced by the pipeline cannot be parsed.
    """
    completed = subprocess.run(
        "ps -eo pgid,args | grep restart|grep http | grep -v grep | wc -l",
        stdout=subprocess.PIPE,
        shell=True,
        check=False,
    )
    raw_count = completed.stdout.decode("ascii", errors="replace").strip()

    try:
        n_restarting_processes = int(raw_count)
    except ValueError:
        print(
            "The string has an unexpected format and couldn't be converted to an integer."
        )
        # No usable count: report "not restarting" rather than continuing
        # with an unbound variable.
        return False

    if n_restarting_processes >= 1:
        print("http is restarting")
        return True

    return False
0087
0088
def http_availability(host):
    """Probe the iDDS REST ping endpoint on ``host`` and score the result.

    An authenticated ping is tried first: with the X509 proxy when
    ``X509_USER_PROXY`` is set, otherwise with an OIDC token when
    ``PANDA_AUTH``, ``PANDA_AUTH_VO`` and ``PANDA_AUTH_ID_TOKEN`` are all
    set. If that yields nothing, an anonymous ping is made and an
    ``IDDSException`` in the reply counts as the service being up.

    Args:
        host: Host name used to build ``https://<host>:8443/idds/ping``.

    Returns:
        The score from ``check_command`` (100 or 0), or 1 when every probe
        failed but both a log rotation and an http restart are in progress.
    """
    env = os.environ
    token_vars = ('PANDA_AUTH', 'PANDA_AUTH_VO', 'PANDA_AUTH_ID_TOKEN')
    score = 0

    if env.get('X509_USER_PROXY'):
        proxy_cmd = "curl -i -k --cert $X509_USER_PROXY --key $X509_USER_PROXY --cacert $X509_USER_PROXY https://%s:8443/idds/ping" % host
        score = check_command(proxy_cmd, '"Status": "OK"')
        print(f"http check availability (with proxy): {score}")
    elif all(env.get(name) for name in token_vars):
        token_cmd = 'curl -i -k -H "X-IDDS-Auth-Type: ${PANDA_AUTH}" -H "X-IDDS-Auth-VO: ${PANDA_AUTH_VO}" -H "X-Idds-Auth-Token: ${PANDA_AUTH_ID_TOKEN}" https://%s:8443/idds/ping' % host
        score = check_command(token_cmd, '"Status": "OK"')
        print(f"http check availability (with oidc token): {score}")

    if not score:
        # Anonymous request: the server answers with an IDDSException,
        # which still shows that it is responding.
        anon_cmd = "curl -i -k https://%s:8443/idds/ping" % host
        score = check_command(anon_cmd, 'IDDSException')
        print(f"http check availability (without proxy): {score}")

    if score:
        return score

    # Both helpers are always called, as each prints its own status.
    rotating = is_logrotate_running()
    restarting = is_restarting()
    if rotating and restarting:
        print("log rotation is running and http is restarting")
        return 1
    return score
0112
0113
def process_availability():
    """Score whether an iDDS agent process is listed by ``ps``.

    Looks for ``idds/agents/main.py`` in the output of ``ps -eo pgid,args``,
    ignoring lines that mention "grep".

    Returns:
        100 if at least one matching line is found, otherwise 0.
    """
    proc = subprocess.Popen(
        "ps -eo pgid,args | grep 'idds/agents/main.py' | grep -v grep | uniq",
        stdout=subprocess.PIPE,
        shell=True,
    )
    stdout_bytes, _ = proc.communicate()
    listing = stdout_bytes.decode("ascii")

    # Any non-blank line is a matching agent process.
    has_agent = any(row.strip() != "" for row in listing.split("\n"))
    process_avail = 100 if has_agent else 0

    print("agent process check availability: %s" % process_avail)
    return process_avail
0137
0138
def heartbeat_availability(log_location):
    """Derive an availability score from the agents' heartbeat file.

    Reads ``<log_location>/idds_availability``, a JSON object that maps each
    agent name to a record with ``num_hang_workers`` and
    ``num_active_workers``.

    Args:
        log_location: Directory that contains the ``idds_availability`` file.

    Returns:
        A tuple ``(avail, hang_workers)``:

        * ``avail`` is 0 when the file is missing or was not modified in
          the last 1800 seconds, 50 when it cannot be read or parsed, and
          otherwise the lowest per-agent percentage of workers that are
          not hanging (100 when no agent reports hanging workers).
        * ``hang_workers`` is the number of hanging workers summed over
          the agents processed.
    """
    avail = 100
    hang_workers = 0
    heartbeat_file = os.path.join(log_location, 'idds_availability')

    # Ask for the mtime directly instead of exists()+getmtime(): the file
    # may disappear between the two calls.
    try:
        mod_time = os.path.getmtime(heartbeat_file)
    except OSError:
        avail = 0
        print("idds_heartbeat at %s not exist, avail: %s" % (heartbeat_file, avail))
        return avail, hang_workers

    now = time.time()
    print("idds_heartbeat updated at %s (currently is %s, %s seconds ago)" % (mod_time, now, now - mod_time))
    if mod_time < now - 1800:
        # The heartbeat was not renewed for more than 30 minutes.
        avail = 0
        return avail, hang_workers

    # Deliberately broad: any problem with the file content degrades the
    # score to 50 instead of aborting the health check.
    try:
        with open(heartbeat_file, 'r') as f:
            agents = json.load(f)
        for agent, info in agents.items():
            num_hang_workers = info['num_hang_workers']
            num_active_workers = info['num_active_workers']
            if num_active_workers > 0 and num_hang_workers > 0:
                hang_workers += num_hang_workers
                # Availability is the share of workers that are NOT hanging;
                # scoring the hanging share would rate "all workers hang"
                # as 100. Assumes num_active_workers includes the hanging
                # ones (TODO confirm); clamped in case it does not.
                healthy_workers = max(0, num_active_workers - num_hang_workers)
                agent_avail = int(healthy_workers * 100 / num_active_workers)
                if agent_avail < avail:
                    avail = agent_avail
                print("iDDS agent %s has %s hang workers" % (agent, num_hang_workers))
    except Exception as ex:
        print("Failed to parse idds_heartbeat: %s" % str(ex))
        avail = 50

    return avail, hang_workers
0172
0173
def idds_availability(host, log_location):
    """Combine the http, agent-process and heartbeat checks into one verdict.

    Args:
        host: Host passed to ``http_availability``.
        log_location: Directory passed to ``heartbeat_availability``.

    Returns:
        A tuple ``(availability, avail_info, infos)``: the overall score, a
        human-readable explanation, and a dict holding ``num_hang_workers``.
    """
    http_score = http_availability(host)
    print(f"http avail: {http_score}")

    agent_score = process_availability()
    print(f"agent daemon avail: {agent_score}")

    beat_score, n_hanging = heartbeat_availability(log_location)
    print(f"heartbeat avail: {beat_score}, hang workers: {n_hanging}")

    infos = {'num_hang_workers': n_hanging}

    # The checks are ranked: no REST service outweighs missing agents,
    # which in turn outweigh heartbeat problems.
    if not http_score:
        availability, avail_info = 0, "iDDS http rest service is not running"
    elif not agent_score:
        availability, avail_info = 50, "iDDS agents are not running"
    elif not beat_score:
        availability = 50
        avail_info = "iDDS agents are running. However heartbeat file is not found (or not renewed)"
    elif beat_score < 100:
        availability = beat_score
        avail_info = "iDDS agents are running. However there are hanging workers"
    else:
        availability, avail_info = beat_score, "iDDS is OK"

    print("availability: %s, avail_info: %s, infos: %s" % (availability, avail_info, infos))

    return availability, avail_info, infos
0206
0207
def main():
    """Run the health check for localhost and update the health marker file.

    Writes ``OK`` to ``/var/log/idds/idds_health`` when the availability is
    at least 100; otherwise removes that file if it exists.
    """
    log_location = '/var/log/idds'
    avail, _avail_info, _infos = idds_availability('localhost', log_location)

    health_file = os.path.join(log_location, 'idds_health')

    if avail < 100:
        # Not fully healthy: make sure no stale marker is left behind.
        if os.path.exists(health_file):
            os.remove(health_file)
        return

    with open(health_file, 'w') as marker:
        marker.write('OK')
0220
0221
# Run the health check only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()