Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-09 07:58:23

0001 #!/usr/bin/python
0002 
0003 """
0004 Cloned from panda_sls.py (https://gitlab.cern.ch/ai/it-puppet-hostgroup-vopanda/-/raw/master/code/files/pandaserver/panda_sls.py?ref_type=heads)
0005 """
0006 import datetime
0007 import json
0008 import optparse
0009 import os
0010 import re
0011 import smtplib
0012 import socket
0013 import subprocess
0014 import time
0015 
0016 from email.mime.text import MIMEText
0017 
0018 import sls_document
0019 
###########################################
# define options
###########################################
# The command line is parsed at module level; the resulting ``options``
# object is read as a global by the functions defined below.
parser = optparse.OptionParser()

# Server to probe. When omitted, __main__() falls back to the local hostname.
parser.add_option(
    "--host",
    dest="host",
    type="string",
    help="Hostname of server to check, default is current machine hostname",
)
# Directory that holds the heartbeat file read by heartbeat_availability().
# NOTE(review): the help text says "panda logs" -- presumably a leftover from
# panda_sls.py, which this script was cloned from.
parser.add_option(
    "-l",
    "--logs",
    dest="logs",
    type="string",
    help="Location of the panda logs, where to look for information. Default is /var/log/idds",
)
# When given, __main__() exports this path as X509_USER_PROXY in os.environ.
parser.add_option(
    "--proxy",
    dest="proxy",
    type="string",
    help="Location of the X509 proxy. Default is $X509_USER_PROXY",
)
# Enables the verbose print() statements scattered through the checks.
parser.add_option(
    "--debug",
    action="store_true",
    dest="debug",
    default=False,
    help="Print out debug statements.",
)

(options, args) = parser.parse_args()

# File that stores the timestamps of the notification emails already sent
# (read by read_last_email_times(), written by update_email_times()).
# Previous, home-directory-relative variant:
# filename = os.path.join(os.path.expanduser("~"), "sls_notification_emails.txt")
# The path is hardcoded for now: the home directory is sometimes mapped to AFS
# and the file then conflicts between machines.
filename = "/home/atlpilo1/sls_notification_emails.txt"
0058 
0059 
def __main__():
    """Resolve the run-time settings from the command line and run the check.

    * host: ``--host`` if given, otherwise the leading run of word characters
      of ``socket.gethostname()``.
    * log location: ``--logs`` if given, otherwise ``/var/log/idds``.
    * proxy: ``--proxy``, if given, is exported as ``X509_USER_PROXY``.

    The resolved host and log location are handed to ``make_idds``.
    """
    proxy_path = options.proxy
    if proxy_path:
        os.environ['X509_USER_PROXY'] = proxy_path

    target_host = options.host
    if not target_host:
        # Keep only the first block of word characters of the local hostname.
        target_host = re.sub(r"^(\w+).*", r"\1", socket.gethostname())

    # Backwards compatibility. We plan to migrate the logs
    logs_dir = options.logs if options.logs else "/var/log/idds"

    make_idds(target_host, logs_dir)
0076 
0077 
def make_idds(host, log_location):
    """Build the SLS document for the iDDS service on *host* and send it.

    The availability figures come from ``idds_availability``; every entry of
    the returned ``infos`` mapping is attached to the document as a data item.
    ``email_manager`` is called with the same availability before the document
    is sent.

    Args:
        host: Name of the server being checked.
        log_location: Directory passed on to ``idds_availability``.

    Returns:
        Whatever ``SlsDocument.send_document`` returns.
    """
    service = "iDDS"

    if options.debug:
        print("Creating the idds monitoring entry")

    status, status_text, metrics = idds_availability(host, log_location)

    document = sls_document.SlsDocument()
    document.set_id("%s_%s" % (service, host))
    document.set_status(status)
    document.set_avail_desc(service)
    document.set_avail_info(status_text)
    for metric_name, metric_value in metrics.items():
        document.add_data(metric_name, metric_value)

    email_manager(service, host, status, status_text)

    return document.send_document(options.debug)
0098 
0099 
def check_command(command, check_string):
    """Run *command* and report whether its standard output matches a pattern.

    The command line is split on whitespace and executed directly, without a
    shell, so shell syntax (``$VAR`` expansion, quoting, pipes) is passed to
    the program literally.

    Args:
        command: Command line to execute.
        check_string: Regular expression looked up with ``re.search`` in the
            command's standard output.

    Returns:
        100 if the pattern is found in the output, 0 otherwise. 0 is also
        returned when the command is empty or cannot be started.
    """
    if options.debug:
        print("Checking command : {0}".format(command))
        print("For string : {0}".format(check_string))

    argv = command.split()
    if not argv:
        print("Empty command, nothing to run")
        return 0

    try:
        process = subprocess.Popen(argv, stdout=subprocess.PIPE)
    except OSError as ex:
        # e.g. the executable is not installed: report "not available"
        # instead of aborting the whole monitoring run.
        print("Failed to run command {0}: {1}".format(command, ex))
        return 0

    # The output is produced by an external program (possibly relaying a
    # remote server's reply); undecodable bytes must not crash the check.
    output = process.communicate()[0].decode("ascii", errors="replace")

    if re.search(check_string, output):
        if options.debug:
            print("Found the string, return 100")
        return 100

    if options.debug:
        print("String not found, return 0")
    return 0
0120 
0121 
def http_availability(host):
    """Probe ``https://<host>:443/idds/ping`` with curl.

    Two probes are tried in order:

    1. If ``X509_USER_PROXY`` is set in the environment, an authenticated
       request using that proxy file; success means the reply contains
       ``"Status": "OK"``.
    2. Otherwise, or if the first probe fails, an anonymous request; success
       means the reply contains ``IDDSException`` (presumably the server's
       rejection of an unauthenticated call, which still shows it is alive).

    Args:
        host: Name of the server to query.

    Returns:
        100 when one of the probes succeeds; 1 when both fail while logrotate
        or an http restart is detected; 0 otherwise.
    """
    avail = 0
    proxy = os.environ.get('X509_USER_PROXY', None)
    if proxy:
        # check_command() executes curl without a shell, so a literal
        # "$X509_USER_PROXY" would never be expanded; insert the real path.
        # NOTE: check_command() splits on whitespace, so a proxy path that
        # contains spaces is not supported.
        curl = "curl -i -k --cert %s --key %s --cacert %s https://%s:443/idds/ping" % (
            proxy,
            proxy,
            proxy,
            host,
        )
        avail = check_command(curl, '"Status": "OK"')
        if options.debug:
            print("http check availability (with proxy): %s" % avail)

    if not avail:
        curl = "curl -i -k https://%s:443/idds/ping" % host
        avail = check_command(curl, 'IDDSException')
        if options.debug:
            print("http check availability (without proxy): %s" % avail)

    if not avail:
        # Both helpers are always evaluated so their debug output is kept.
        logrotate_running = is_logrotate_running()
        restarting = is_restarting()
        if logrotate_running or restarting:
            return 1
    return avail
0142 
0143 
def process_availability():
    """Check whether at least one iDDS agent process is running.

    The process table is listed with ``ps`` and filtered for
    ``idds/agents/main.py``; any non-blank line left counts as a running agent.

    Returns:
        100 if at least one matching line is found, otherwise 0.
    """
    listing = subprocess.Popen(
        "ps -eo pgid,args | grep 'idds/agents/main.py' | grep -v grep | uniq",
        stdout=subprocess.PIPE,
        shell=True,
    )
    text = listing.communicate()[0].decode("ascii")

    # Blank lines (e.g. the trailing one) do not represent a process.
    matching_rows = [row for row in text.split("\n") if row.strip() != ""]
    process_avail = 100 if len(matching_rows) >= 1 else 0

    if options.debug:
        print("agent process check availability: %s" % process_avail)
    return process_avail
0168 
0169 
def heartbeat_availability(log_location, max_age_seconds=1800):
    """Derive an availability figure from the iDDS heartbeat file.

    The file ``<log_location>/idds_availability`` is expected to be a JSON
    object mapping an agent name to a dict with the keys
    ``num_hang_workers`` and ``num_active_workers``.

    Args:
        log_location: Directory containing the ``idds_availability`` file.
        max_age_seconds: The file is considered stale when its modification
            time is older than this many seconds (default 1800).

    Returns:
        A tuple ``(avail, hang_workers)``:

        * ``(0, 0)`` if the file is missing or stale;
        * ``avail == 50`` if the file cannot be read or parsed;
        * otherwise ``avail`` is the lowest per-agent availability, where an
          agent's availability is ``100 - <percentage of hanging workers>``,
          never lower than 1, and ``hang_workers`` is the total number of
          hanging workers over all agents with active workers.
    """
    avail = 100
    hang_workers = 0
    heartbeat_file = os.path.join(log_location, 'idds_availability')
    if not os.path.exists(heartbeat_file):
        avail = 0
        if options.debug:
            print("idds_heartbeat at %s not exist, avail: %s" % (heartbeat_file, avail))
        return avail, hang_workers

    mod_time = os.path.getmtime(heartbeat_file)
    now = time.time()
    if options.debug:
        print("idds_heartbeat updated at %s (currently is %s, %s seconds ago)" % (mod_time, now, now - mod_time))
    if mod_time < now - max_age_seconds:
        return 0, hang_workers

    try:
        with open(heartbeat_file, 'r') as f:
            agents = json.load(f)
        for agent, info in agents.items():
            num_hang_workers = info['num_hang_workers']
            num_active_workers = info['num_active_workers']
            if num_active_workers > 0 and num_hang_workers > 0:
                hang_workers += num_hang_workers
                hang_percent = int(num_hang_workers * 100 / num_active_workers)
                # More hanging workers must mean LOWER availability. The
                # floor of 1 keeps 0 reserved for "heartbeat missing/stale",
                # which is how idds_availability() interprets a zero.
                agent_avail = max(1, 100 - hang_percent)
                if agent_avail < avail:
                    avail = agent_avail
                if options.debug:
                    print("iDDS agent %s has %s hang workers" % (agent, num_hang_workers))
    except Exception as ex:
        # Best effort: an unreadable or malformed file degrades the status
        # instead of aborting the monitoring run.
        print("Failed to parse idds_heartbeat: %s" % str(ex))
        avail = 50

    return avail, hang_workers
0206 
0207 
def is_logrotate_running():
    """Tell whether a logrotate process is currently running.

    Counts the lines of the ``ps`` listing that contain ``logrotate``
    (excluding the ``grep`` helpers themselves).

    Returns:
        True if at least one such process is found. False otherwise,
        including when the count cannot be determined.
    """
    output = (
        subprocess.Popen(
            "ps -eo pgid,args | grep logrotate | grep -v grep | wc -l",
            stdout=subprocess.PIPE,
            shell=True,
        )
        .communicate()[0]
        .decode("ascii")
    )

    try:
        n_logrotate_processes = int(output.strip())
    except ValueError:
        print(
            "The string has an unexpected format and couldn't be converted to an integer."
        )
        # Without a usable count there is no evidence that logrotate runs.
        return False

    if n_logrotate_processes >= 1:
        if options.debug:
            print("Logrotate is running")
        return True

    return False
0235 
0236 
def is_restarting():
    """Tell whether an http restart appears to be in progress.

    Counts the lines of the ``ps`` listing that contain both ``restart`` and
    ``http`` (excluding the ``grep`` helpers themselves).

    Returns:
        True if at least one such process is found. False otherwise,
        including when the count cannot be determined.
    """
    output = (
        subprocess.Popen(
            "ps -eo pgid,args | grep restart|grep http | grep -v grep | wc -l",
            stdout=subprocess.PIPE,
            shell=True,
        )
        .communicate()[0]
        .decode("ascii")
    )

    try:
        n_restarting_processes = int(output.strip())
    except ValueError:
        print(
            "The string has an unexpected format and couldn't be converted to an integer."
        )
        # Without a usable count there is no evidence of a restart.
        return False

    if n_restarting_processes >= 1:
        if options.debug:
            print("http is restarting")
        return True

    return False
0264 
0265 
def idds_availability(host, log_location):
    """Combine the http, agent-process and heartbeat probes into one verdict.

    The probes are evaluated in this order of precedence:

    * http probe failed            -> 0
    * no agent process found       -> 50
    * heartbeat probe returned 0   -> 50
    * heartbeat probe below 100    -> that heartbeat value
    * otherwise                    -> the heartbeat value ("iDDS is OK")

    Args:
        host: Server passed to ``http_availability``.
        log_location: Directory passed to ``heartbeat_availability``.

    Returns:
        ``(availability, avail_info, infos)`` where ``avail_info`` is the
        human-readable explanation and ``infos`` is a dict holding
        ``num_hang_workers``.
    """
    http_avail = http_availability(host)
    process_avail = process_availability()
    heartbeat_avail, hang_workers = heartbeat_availability(log_location)

    infos = {'num_hang_workers': hang_workers}

    if not http_avail:
        availability, avail_info = 0, "iDDS http rest service is not running"
    elif not process_avail:
        availability, avail_info = 50, "iDDS agents are not running"
    elif not heartbeat_avail:
        availability = 50
        avail_info = "iDDS agents are running. However heartbeat file is not found (or not renewed)"
    elif heartbeat_avail < 100:
        availability = heartbeat_avail
        avail_info = "iDDS agents are running. However there are hanging workers"
    else:
        availability, avail_info = heartbeat_avail, "iDDS is OK"

    if options.debug:
        print("availability: %s, avail_info: %s, infos: %s" % (availability, avail_info, infos))

    return availability, avail_info, infos
0296 
0297 
def read_last_email_times():
    """Load the timestamps of previously sent notification emails.

    The module-level ``filename`` is read line by line; each line is expected
    to hold one timestamp formatted as ``%Y-%m-%d %H:%M:%S``.

    Blank or unparsable lines are skipped. Otherwise a single corrupt line
    would raise ``ValueError`` on every run, which ``email_manager`` swallows,
    silently disabling all further notifications.

    Returns:
        A list of ``datetime.datetime`` objects in file order; an empty list
        if the file does not exist.
    """
    try:
        with open(filename, "r") as f:
            lines = f.readlines()
    except FileNotFoundError:
        return []

    timestamps = []
    for line in lines:
        text = line.strip()
        if not text:
            continue
        try:
            timestamps.append(
                datetime.datetime.strptime(text, "%Y-%m-%d %H:%M:%S")
            )
        except ValueError:
            # Ignore the corrupt entry and keep the valid ones.
            continue
    return timestamps
0308 
0309 
def update_email_times(timestamps):
    """Overwrite the timestamp file with the newest entries of *timestamps*.

    Only the last ten items are kept; each is written on its own line to the
    module-level ``filename`` using the ``%Y-%m-%d %H:%M:%S`` format.

    Args:
        timestamps: Sequence of ``datetime.datetime`` objects, oldest first.
    """
    with open(filename, "w+") as handle:
        # Save the last 10 timestamps
        handle.writelines(
            stamp.strftime("%Y-%m-%d %H:%M:%S") + "\n"
            for stamp in timestamps[-10:]
        )
0315 
0316 
def send_email(subject, body, to_email):
    """Send a plain-text email through the SMTP server on ``localhost``.

    The sender address is fixed to ``atlpan@mail.cern.ch``.

    Args:
        subject: Subject line of the message.
        body: Plain-text message body.
        to_email: Recipient address.

    Raises:
        Whatever ``smtplib`` raises when connecting or sending fails; the
        connection is closed in either case.
    """
    from_email = "atlpan@mail.cern.ch"

    msg = MIMEText(body)
    msg["Subject"] = subject
    msg["From"] = from_email
    msg["To"] = to_email

    # The context manager issues QUIT and closes the socket even when
    # sendmail() raises, so a failed delivery does not leak the connection.
    with smtplib.SMTP("localhost") as server:
        server.sendmail(from_email, to_email, msg.as_string())
0328 
0329 
def email_manager(service, host, avail, avail_info):
    """Send a throttled notification email when the service is degraded.

    Nothing is sent when *avail* is 100 (int or string), or when the most
    recent recorded notification is at most one hour old. After a successful
    send the current time is appended to the recorded timestamps.

    This is best effort: a failure while reading the history, sending the
    mail or recording the timestamp is reported on stdout but never raised,
    so the monitoring run itself is not interrupted.

    Args:
        service: Service name used in the subject and body.
        host: Host name used in the subject and body.
        avail: Availability value being reported.
        avail_info: Human-readable explanation of the availability.
    """
    # If the server is 100% available, then skip the email
    if avail in ("100", 100):
        return

    try:
        # Get the last email times. Don't send more than one email per hour
        last_email_times = read_last_email_times()
        now = datetime.datetime.now()
        if last_email_times and now - last_email_times[-1] <= datetime.timedelta(
            hours=1
        ):
            return

        # Email subject
        subject = "[SLS] Service issues for {0} on {1}".format(service, host)

        # Email content
        body = """
        Service: {0}
        Host: {1}
        Availability: {2}
        Availability info: {3}
        """.format(
            service, host, avail, avail_info
        )

        email = "atlas-adc-idds-admins@cern.ch"

        send_email(subject, body, email)

        # Record the time of the email
        last_email_times.append(now)
        update_email_times(last_email_times)

        if options.debug:
            print("Email sent.")

    except Exception as ex:
        # Deliberately broad: notification problems must not break the
        # availability report, but they should leave a trace.
        print("Failed to send SLS notification email: %s" % str(ex))
0371 
0372 
# Run the check immediately. There is no ``if __name__ == "__main__"`` guard,
# so the program also executes when this module is imported.
__main__()