File indexing completed on 2026-04-09 07:58:23
0001
0002
0003 """
0004 Cloned from panda_sls.py (https://gitlab.cern.ch/ai/it-puppet-hostgroup-vopanda/-/raw/master/code/files/pandaserver/panda_sls.py?ref_type=heads)
0005 """
0006 import datetime
0007 import json
0008 import optparse
0009 import os
0010 import re
0011 import smtplib
0012 import socket
0013 import subprocess
0014 import time
0015
0016 from email.mime.text import MIMEText
0017
0018 import sls_document
0019
0020
0021
0022
# Command-line interface. Each entry is (option flags, keyword settings) and
# is registered on the parser in the order listed.
parser = optparse.OptionParser()

_OPTION_SPECS = (
    (
        ("--host",),
        dict(
            dest="host",
            type="string",
            help="Hostname of server to check, default is current machine hostname",
        ),
    ),
    (
        ("-l", "--logs"),
        dict(
            dest="logs",
            type="string",
            help="Location of the panda logs, where to look for information. Default is /var/log/idds",
        ),
    ),
    (
        ("--proxy",),
        dict(
            dest="proxy",
            type="string",
            help="Location of the X509 proxy. Default is $X509_USER_PROXY",
        ),
    ),
    (
        ("--debug",),
        dict(
            action="store_true",
            dest="debug",
            default=False,
            help="Print out debug statements.",
        ),
    ),
)

for _flags, _settings in _OPTION_SPECS:
    parser.add_option(*_flags, **_settings)

# Parsed once at module load; the functions below read ``options`` directly.
options, args = parser.parse_args()


# File holding the timestamps of recently sent notification emails,
# one "%Y-%m-%d %H:%M:%S" entry per line.
filename = "/home/atlpilo1/sls_notification_emails.txt"
0058
0059
def __main__():
    """Resolve host, log location and proxy from the options, then run the check."""
    host = options.host
    if not host:
        # No explicit host: use the leading word of the local hostname
        # (everything from the first non-word character on is dropped).
        host = re.sub(r"^(\w+).*", r"\1", socket.gethostname())

    log_location = options.logs or "/var/log/idds"

    if options.proxy:
        os.environ['X509_USER_PROXY'] = options.proxy

    make_idds(host, log_location)
0076
0077
def make_idds(host, log_location):
    """Build the iDDS SLS document for *host*, notify by email, and send it.

    Returns whatever ``SlsDocument.send_document`` returns.
    """
    if options.debug:
        print("Creating the idds monitoring entry")

    service = "iDDS"
    status, status_info, metrics = idds_availability(host, log_location)

    document = sls_document.SlsDocument()
    document.set_id("%s_%s" % (service, host))
    document.set_status(status)
    document.set_avail_desc(service)
    document.set_avail_info(status_info)

    # Attach every collected metric as a data point on the document.
    for metric_name, metric_value in metrics.items():
        document.add_data(metric_name, metric_value)

    # The notification email is handled before the document is sent.
    email_manager(service, host, status, status_info)

    return document.send_document(options.debug)
0098
0099
def check_command(command, check_string):
    """Run *command* and report whether its stdout matches *check_string*.

    The command is split on whitespace and executed without a shell, so
    shell features (variable expansion, quoting, pipes) are not available.

    Args:
        command: Command line to execute, as a single string.
        check_string: Regular expression searched for in the command's stdout.

    Returns:
        100 if the pattern is found in the output, 0 otherwise.
    """
    if options.debug:
        print("Checking command : {0}".format(command))
        print("For string : {0}".format(check_string))

    tmp_array = command.split()
    raw_output = subprocess.Popen(tmp_array, stdout=subprocess.PIPE).communicate()[0]
    # Decode leniently: a strict ASCII decode raises on any non-ASCII byte in
    # the response and would abort the whole probe instead of reporting a
    # status. ASCII output decodes to exactly the same text as before.
    output = raw_output.decode("utf-8", errors="replace")

    if re.search(check_string, output):
        if options.debug:
            print("Found the string, return 100")
        return 100

    if options.debug:
        print("String not found, return 0")
    return 0
0120
0121
def http_availability(host):
    """Probe the iDDS REST endpoint on *host* with curl.

    First tries an authenticated ping when ``X509_USER_PROXY`` is set in the
    environment, then falls back to an unauthenticated ping that only checks
    that the server answers with an ``IDDSException`` payload.

    Args:
        host: Hostname of the server to query on port 443.

    Returns:
        100 if either check succeeds; 1 if both fail while logrotate or an
        http restart is in progress; 0 otherwise.
    """
    avail = 0
    proxy = os.environ.get('X509_USER_PROXY', None)
    if proxy:
        # check_command() runs the command without a shell, so a literal
        # "$X509_USER_PROXY" would never be expanded and curl would be handed
        # that string as the certificate path. Interpolate the real path.
        # NOTE: the command is split on whitespace, so a proxy path containing
        # spaces is not supported.
        curl = "curl -i -k --cert %s --key %s --cacert %s https://%s:443/idds/ping" % (
            proxy,
            proxy,
            proxy,
            host,
        )
        avail = check_command(curl, '"Status": "OK"')
        if options.debug:
            print("http check availability (with proxy): %s" % avail)

    if not avail:
        curl = "curl -i -k https://%s:443/idds/ping" % host
        avail = check_command(curl, 'IDDSException')
        if options.debug:
            print("http check availability (without proxy): %s" % avail)

    if not avail:
        # Both probes failed: report a minimal non-zero value when the failure
        # coincides with logrotate or a restart.
        logrotate_running = is_logrotate_running()
        restarting = is_restarting()
        if logrotate_running or restarting:
            return 1
    return avail
0142
0143
def process_availability():
    """Return 100 if at least one iDDS agent process is listed by ``ps``, else 0."""
    listing = subprocess.Popen(
        "ps -eo pgid,args | grep 'idds/agents/main.py' | grep -v grep | uniq",
        stdout=subprocess.PIPE,
        shell=True,
    )
    raw_stdout, _ = listing.communicate()
    text = raw_stdout.decode("ascii")

    # Any non-blank line in the listing means an agent is running.
    agent_found = any(entry.strip() for entry in text.split("\n"))
    process_avail = 100 if agent_found else 0

    if options.debug:
        print("agent process check availability: %s" % process_avail)
    return process_avail
0168
0169
def heartbeat_availability(log_location):
    """Derive an availability figure from the iDDS heartbeat file.

    The heartbeat file is ``<log_location>/idds_availability``, a JSON object
    mapping agent names to dicts with ``num_hang_workers`` and
    ``num_active_workers`` entries.

    Args:
        log_location: Directory containing the heartbeat file.

    Returns:
        A tuple ``(avail, hang_workers)``. ``avail`` is 0 when the file is
        missing or older than 30 minutes, 50 when it cannot be parsed, and
        otherwise the lowest per-agent percentage of non-hanging workers
        (100 when nothing hangs). ``hang_workers`` is the total number of
        hanging workers counted.
    """
    avail = 100
    hang_workers = 0
    heartbeat_file = os.path.join(log_location, 'idds_availability')
    if not os.path.exists(heartbeat_file):
        avail = 0
        if options.debug:
            print("idds_heartbeat at %s not exist, avail: %s" % (heartbeat_file, avail))
        return avail, hang_workers

    mod_time = os.path.getmtime(heartbeat_file)
    if options.debug:
        print("idds_heartbeat updated at %s (currently is %s, %s seconds ago)" % (mod_time, time.time(), time.time() - mod_time))
    if mod_time < time.time() - 1800:
        # Heartbeat has not been renewed for 30 minutes: treat as unavailable.
        avail = 0
        return avail, hang_workers

    try:
        with open(heartbeat_file, 'r') as f:
            d = json.load(f)
        for agent, info in d.items():
            num_hang_workers = info['num_hang_workers']
            num_active_workers = info['num_active_workers']
            if num_active_workers > 0 and num_hang_workers > 0:
                hang_workers += num_hang_workers
                hang_percent = int(num_hang_workers * 100 / num_active_workers)
                # Availability is the share of workers that are NOT hanging;
                # using the hanging share directly would report a fully hung
                # agent as 100% available. Clamp in case more workers hang
                # than are reported active.
                agent_avail = max(0, 100 - hang_percent)
                if agent_avail < avail:
                    avail = agent_avail
                if options.debug:
                    print("iDDS agent %s has %s hang workers" % (agent, num_hang_workers))
    except Exception as ex:
        # Unreadable or malformed heartbeat content: report a degraded state.
        print("Failed to parse idds_heartbeat: %s" % str(ex))
        avail = 50

    return avail, hang_workers
0206
0207
def is_logrotate_running():
    """Return True if ``ps`` lists at least one process matching "logrotate"."""
    output = (
        subprocess.Popen(
            "ps -eo pgid,args | grep logrotate | grep -v grep | wc -l",
            stdout=subprocess.PIPE,
            shell=True,
        )
        .communicate()[0]
        .decode("ascii")
    )

    try:
        n_logrotate_processes = int(output.strip())
    except ValueError:
        print(
            "The string has an unexpected format and couldn't be converted to an integer."
        )
        # Without a usable count there is nothing to compare; returning here
        # avoids referencing an unassigned variable below.
        return False

    if n_logrotate_processes >= 1:
        if options.debug:
            print("Logrotate is running")
        return True

    return False
0235
0236
def is_restarting():
    """Return True if ``ps`` lists at least one process matching both "restart" and "http"."""
    output = (
        subprocess.Popen(
            "ps -eo pgid,args | grep restart|grep http | grep -v grep | wc -l",
            stdout=subprocess.PIPE,
            shell=True,
        )
        .communicate()[0]
        .decode("ascii")
    )

    try:
        n_restarting_processes = int(output.strip())
    except ValueError:
        print(
            "The string has an unexpected format and couldn't be converted to an integer."
        )
        # Without a usable count there is nothing to compare; returning here
        # avoids referencing an unassigned variable below.
        return False

    if n_restarting_processes >= 1:
        if options.debug:
            print("http is restarting")
        return True

    return False
0264
0265
def idds_availability(host, log_location):
    """Combine the http, process and heartbeat checks into one status.

    Returns:
        A tuple ``(availability, avail_info, infos)``: the numeric
        availability, a human-readable explanation, and a dict of extra
        metrics (currently ``num_hang_workers``).
    """
    http_avail = http_availability(host)
    process_avail = process_availability()
    heartbeat_avail, hang_workers = heartbeat_availability(log_location)
    infos = {'num_hang_workers': hang_workers}

    # Checks are ranked: http first, then the agent processes, then heartbeat.
    if not http_avail:
        availability, avail_info = 0, "iDDS http rest service is not running"
    elif not process_avail:
        availability, avail_info = 50, "iDDS agents are not running"
    elif not heartbeat_avail:
        availability = 50
        avail_info = "iDDS agents are running. However heartbeat file is not found (or not renewed)"
    else:
        availability = heartbeat_avail
        if heartbeat_avail < 100:
            avail_info = "iDDS agents are running. However there are hanging workers"
        else:
            avail_info = "iDDS is OK"

    if options.debug:
        print("availability: %s, avail_info: %s, infos: %s" % (availability, avail_info, infos))

    return availability, avail_info, infos
0296
0297
def read_last_email_times(path=None):
    """Read the timestamps of previously sent notification emails.

    Args:
        path: File to read; defaults to the module-level ``filename``.

    Returns:
        A list of ``datetime.datetime`` objects in file order. The list is
        empty when the file does not exist. Blank lines and lines that do not
        match ``%Y-%m-%d %H:%M:%S`` are skipped, so one corrupted entry does
        not make the whole history unreadable.
    """
    source = filename if path is None else path
    try:
        with open(source, "r") as f:
            lines = f.readlines()
    except FileNotFoundError:
        return []

    timestamps = []
    for line in lines:
        text = line.strip()
        if not text:
            continue
        try:
            timestamps.append(datetime.datetime.strptime(text, "%Y-%m-%d %H:%M:%S"))
        except ValueError:
            # Ignore entries that cannot be parsed.
            continue
    return timestamps
0308
0309
def update_email_times(timestamps, path=None):
    """Overwrite the notification history with the 10 most recent timestamps.

    Args:
        timestamps: Sequence of ``datetime.datetime`` objects, oldest first.
        path: File to write; defaults to the module-level ``filename``.
    """
    target = filename if path is None else path
    # The file is only written here, so plain write mode is enough.
    with open(target, "w") as f:
        f.writelines(
            ts.strftime("%Y-%m-%d %H:%M:%S") + "\n" for ts in timestamps[-10:]
        )
0315
0316
def send_email(subject, body, to_email):
    """Send a plain-text email through the SMTP server on localhost.

    Args:
        subject: Subject line of the message.
        body: Plain-text message body.
        to_email: Recipient address.

    Raises:
        Any exception raised by ``smtplib`` while connecting or sending.
    """
    from_email = "atlpan@mail.cern.ch"

    msg = MIMEText(body)
    msg["Subject"] = subject
    msg["From"] = from_email
    msg["To"] = to_email

    # The context manager issues QUIT and closes the connection even when
    # sendmail() raises, so a failed send does not leak the socket.
    with smtplib.SMTP("localhost") as server:
        server.sendmail(from_email, to_email, msg.as_string())
0328
0329
def email_manager(service, host, avail, avail_info):
    """Send a notification email when the service is not fully available.

    No email is sent when *avail* is 100, or when the most recent
    notification was sent within the last hour. Sending is best-effort: any
    failure is reported on stdout and never propagated to the caller.

    Args:
        service: Name of the monitored service.
        host: Host the service was checked on.
        avail: Availability value (100 or "100" means healthy).
        avail_info: Human-readable description of the availability.
    """
    if avail in ("100", 100):
        return

    try:
        last_email_times = read_last_email_times()
        now = datetime.datetime.now()
        # Rate limit: at most one notification per hour.
        if last_email_times and now - last_email_times[-1] <= datetime.timedelta(
            hours=1
        ):
            return

        subject = "[SLS] Service issues for {0} on {1}".format(service, host)

        body = """
Service: {0}
Host: {1}
Availability: {2}
Availability info: {3}
""".format(
            service, host, avail, avail_info
        )

        email = "atlas-adc-idds-admins@cern.ch"

        send_email(subject, body, email)

        # Record the send only after it succeeded.
        last_email_times.append(now)
        update_email_times(last_email_times)

        if options.debug:
            print("Email sent.")
    except Exception as ex:
        # A notification failure must not break the availability report, but
        # it should be visible rather than silently discarded.
        print("Failed to send SLS notification email: %s" % str(ex))
0371
0372
0373
# Run the check. There is no `if __name__ == "__main__"` guard, so this runs
# both when the file is executed as a script and when it is imported.
__main__()