Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-10 08:39:08

0001 #!/usr/bin/python
0002 
0003 import datetime
0004 import optparse
0005 import re
0006 import socket
0007 import subprocess
0008 import sys
0009 
0010 import SLSxml
0011 
###########################################
# define options
###########################################
# The command line is parsed at module level; the functions below read the
# resulting module-level ``options`` object (``options.debug``, ``options.use``,
# ``options.host`` and ``options.dir``).
parser = optparse.OptionParser()
parser.add_option(
    "-u",
    "--use",
    dest="use",
    type="string",
    help="Use of xml, allowed values: 'mon', 'server' or 'bamboo'",
)
parser.add_option(
    "--host",
    dest="host",
    type="string",
    help="Hostname of server to check, default is current machine hostname",
)
# ``--dir`` is used as the directory the xml file is written into (the file
# name itself is derived from the use and the host), so the help text says so.
parser.add_option(
    "-d",
    "--dir",
    dest="dir",
    type="string",
    help="Directory of the xml file output.  Default is /data/atlpan/oracle/panda/monitoring",
)
parser.add_option(
    "--debug",
    action="store_true",
    dest="debug",
    default=False,
    help="Print out debug statements.",
)

(options, args) = parser.parse_args()
0022 
0023 
def __main__():
    """Build the monitoring XML selected by ``--use`` and write it to disk.

    The host is taken from ``--host`` or, failing that, from the local
    hostname truncated to its leading run of word characters.  The output
    goes to ``<dir>/<Prefix>_<host>.xml`` where ``<dir>`` is ``--dir`` (or the
    built-in default) and ``<Prefix>`` depends on the chosen use.

    If ``--use`` is missing or not one of the allowed values, an error line
    is printed and nothing is written.
    """
    if options.host:
        host = options.host
    else:
        # keep only the leading word characters of the hostname
        host = re.sub(r"^(\w+).*", r"\1", socket.gethostname())

    # use -> (function producing the xml text, file name prefix)
    builders = {
        "mon": (make_monitor, "PandaMon"),
        "server": (make_server, "PandaServer"),
        "bamboo": (make_bamboo, "PandaBamboo"),
    }
    selected = builders.get(options.use)
    if selected is None:
        print("Err: please choose a use, 'mon', 'server' or 'bamboo'.")
        return
    build_xml, file_part = selected
    tmp_xml = build_xml(host)

    file_dir = options.dir if options.dir else "/data/atlpan/oracle/panda/monitoring"
    file_name = f"{file_dir}/{file_part}_{host}.xml"

    # context manager guarantees the file is closed even if write() fails
    with open(file_name, "w") as tmp_file:
        tmp_file.write(tmp_xml)
0053 
0054 
def make_server(host):
    """Collect the PandaServer metrics for *host* and render them through SLSxml.

    Returns whatever ``SLSxml.xml_doc.print_xml()`` produces for a document
    carrying the server availability plus nine data points.
    """
    if options.debug:
        print("Creating the server monitoring xml")

    # Take all measurements up front, in a fixed order, before touching SLSxml.
    availability = server_availability(host)
    n_add = count_add_processes()
    n_holding = count_holdings()
    data_pct = volume_use("data")
    var_pct = volume_use("var")
    reg_avg = registration_time()
    reg_avg_dq2 = registration_time(onlyDQ2=True)
    lookup_avg = filelookup_time()
    n_by_callback = count_finished_callback()
    n_by_catalog = count_finished_catalogcheck()

    description = f"PandaServer monitoring service at {host}"
    doc = SLSxml.xml_doc()
    doc.set_id(f"PandaServer_{host}")
    doc.set_shortname(description)
    doc.set_fullname(description)
    doc.set_availability(str(availability))

    # (name, description, value) in the order they are added to the document
    data_points = (
        ("AddProcesses", "Number of processes for DQ2+LFC registration", n_add),
        ("HoldingJobs", "Number of holding jobs to be registered", n_holding),
        ("RegistrationTime", "Average time for DQ2+LFC registration in second", reg_avg),
        ("RegistrationTimeDQ2", "Average time for DQ2 registration in second", reg_avg_dq2),
        ("FileLookupTime", "Average time for replica lookup per 100 files in second", lookup_avg),
        ("DataVolumeUse", "Percent use of the local /data volume", data_pct),
        ("VarVolumeUse", "Percent use of the local /var volume", var_pct),
        ("FinishedJobsByCallback", "Number of finished jobs by callbacks", n_by_callback),
        ("FinishedJobsByCatalog", "Number of finished jobs by catalog check", n_by_catalog),
    )
    for name, text, value in data_points:
        doc.add_data(name, text, str(value))

    return doc.print_xml()
0086 
0087 
def make_bamboo(host):
    """Render the PandaBamboo availability for *host* through SLSxml.

    Only the availability reported by ``bamboo_availability`` is set; no
    extra data points are attached.  Returns whatever
    ``SLSxml.xml_doc.print_xml()`` produces.
    """
    if options.debug:
        # this used to print the "server" message, which was misleading
        print("Creating the bamboo monitoring xml")

    server_avail = bamboo_availability(host)

    sls_xml = SLSxml.xml_doc()
    sls_xml.set_id(f"PandaBamboo_{host}")
    sls_xml.set_shortname(f"PandaBamboo monitoring service at {host}")
    sls_xml.set_fullname(f"PandaBamboo monitoring service at {host}")
    sls_xml.set_availability(str(server_avail))
    return sls_xml.print_xml()
0100 
0101 
def make_monitor(host):
    """Collect the Panda monitor metrics for *host* and render them through SLSxml.

    Checks the web server, the squid server and the panda monitor.  If any
    of them reports unavailable, one error mail listing every failure is
    sent through ``error_mail``.  Returns whatever
    ``SLSxml.xml_doc.print_xml()`` produces.
    """
    if options.debug:
        print("Creating the monitor monitoring xml")

    def is_down(availability):
        # The availability helpers return the strings "0" / "100".  The old
        # comparison against the integer 0 was never true, so failures were
        # never mailed.  str() keeps this correct for either representation.
        return str(availability) == "0"

    problems = []

    http_avail = httpd_availability(host)
    if is_down(http_avail):
        problems.append(f"Error: web server on {host} not working\n")

    squid_avail = squid_availability()
    if is_down(squid_avail):
        problems.append(f"Error: squid server on {host} not working\n")

    panda_avail = panda_availability(host)
    if is_down(panda_avail):
        problems.append(f"Error: panda monitor on {host} not working\n")

    http_processes = count_processes()

    data_used = volume_use("data")
    var_used = volume_use("var")

    if problems:
        error_mail(host, "".join(problems))

    if options.debug:
        print(f"web - {http_avail}, squid - {squid_avail}, panda - {panda_avail}")

    sls_xml = SLSxml.xml_doc()
    sls_xml.set_id(f"PandaMon_{host}")
    sls_xml.set_shortname(f"PandaMonitor monitoring service at {host}")
    sls_xml.set_fullname(f"PandaMonitor monitoring service at {host}")
    sls_xml.set_availability(str(panda_avail))

    # adding intervention by hand here
    # sls_xml.add_intervention( "2011-01-16T20:00:00", "PT36H",
    # "Panda services with be out for over a day due to database server changes." )

    sls_xml.add_data("HttpdAvailability", "Availability of the httpd server", str(http_avail))
    sls_xml.add_data("SquidAvailability", "Availability of the squid server", str(squid_avail))
    sls_xml.add_data("PandaAvailability", "Availability of the panda monitor", str(panda_avail))
    sls_xml.add_data("HttpProcesses", "Number of processes for the panda monitor", str(http_processes))
    sls_xml.add_data("DataVolumeUse", "Percent use of the local /data volume", str(data_used))
    sls_xml.add_data("VarVolumeUse", "Percent use of the local /var volume", str(var_used))
    return sls_xml.print_xml()
0152 
0153 
def httpd_availability(host):
    """Probe ``robots.txt`` on *host* and report the result of ``check_url``.

    The page is expected to contain the text "go away".
    """
    return check_url(f"http://{host}.cern.ch/robots.txt", "go away")
0157 
0158 
def squid_availability():
    """Query the local squid info page and report the result of ``check_command``.

    The squidclient output is expected to contain "OK".
    """
    squid_info_query = "/usr/bin/squidclient -p 25980 cache_object://localhost/info"
    expected_text = "OK"
    return check_command(squid_info_query, expected_text)
0162 
0163 
def panda_availability(host, full_check=False):
    """Return "100" if the panda monitor on *host* answers, otherwise "0".

    By default only the ``isAlive`` query is probed.  This is the simpler
    test of the python code, kept for now until the panda monitor migration
    is more stable and all network tweaks are in Quattor, so things are
    stable on reboot/upgrade.  Once that is true the extended tests should be
    put back by calling with ``full_check=True``.

    :param host: host name used to build the monitor URL (port 25980).
    :param full_check: when true, additionally require the prod dashboard,
        clouds dashboard, incidents overview and ddm dashboard pages to
        contain their expected text.  These checks used to sit, unreachable,
        after the early ``return``.
    """
    baseurl = f"http://{host}:25980/server/pandamon/query?"

    if check_url(baseurl + "isAlive", "yes") != "100":
        return "0"
    if not full_check:
        return "100"

    # (query string, text the page must contain), probed in this order;
    # the first failure reports the monitor as unavailable
    page_checks = (
        ("dash=prod", "CERN:OK"),
        ("dash=clouds", "Cloud status"),
        ("overview=incidents", "Recorded incidents"),
        ("dash=ddm", "Space available"),
    )
    for query, expected in page_checks:
        if check_url(baseurl + query, expected) != "100":
            return "0"

    return "100"
0196 
0197 
def server_availability(host):
    """Return "100" when the panda server ``isAlive`` page on *host* reports alive, else "0"."""
    # The wget flag is passed in front of the URL as part of the same string.
    target = f"--no-check-certificate https://{host}:25443/server/panda/isAlive"
    if check_url(target, "alive=yes") == "100":
        return "100"
    return "0"
0205 
0206 
def bamboo_availability(host):
    """Return "100" when the bamboo ``isAlive`` page on *host* reports alive, else "0"."""
    target = f"http://{host}:25070/bamboo/bamboo/isAlive"
    if check_url(target, "alive=yes") == "100":
        return "100"
    return "0"
0214 
0215 
def check_url(url, check_string):
    """Fetch *url* with wget (quiet, body to stdout) and look for *check_string*.

    Delegates to ``check_command`` and returns its result unchanged.
    """
    wget_command = " ".join(("wget", "-q", "-O", "-", url))
    return check_command(wget_command, check_string)
0219 
0220 
def check_command(command, check_string):
    """Run *command* and report whether its stdout matches *check_string*.

    :param command: command line; it is split on whitespace and executed
        without a shell.
    :param check_string: regular expression searched for in the output.
    :returns: the string "100" when the pattern is found, otherwise "0".
        "0" is also returned when the command cannot be started at all.
    """
    if options.debug:
        print(f"Checking command : {command}")
        print(f"For string : {check_string}")

    try:
        # Decode stdout to text: under Python 3 the pipe yields bytes by
        # default, and searching bytes with a str pattern raises TypeError.
        # Undecodable bytes are replaced rather than aborting the check.
        process = subprocess.Popen(
            command.split(),
            stdout=subprocess.PIPE,
            encoding="utf-8",
            errors="replace",
        )
        output = process.communicate()[0]
    except OSError as exc:
        # e.g. the executable is not installed: nothing could be verified
        print(f"ERROR : could not run '{command}' : {exc}")
        return "0"

    if re.search(check_string, output):
        if options.debug:
            print("Found the string, return 100")
        return "100"

    if options.debug:
        print("String not found, return 0")
    return "0"
0237 
0238 
def count_processes():
    """Count ``ps aux`` lines that start with the panda user and mention "http".

    :returns: the number of matching lines as an int.
    """
    # Decode to text: the default bytes output cannot be split with "\n".
    output = subprocess.Popen(
        ["ps", "aux"],
        stdout=subprocess.PIPE,
        encoding="utf-8",
        errors="replace",
    ).communicate()[0]
    return sum(
        1
        for line in output.split("\n")
        if re.match("@@panda_user@@", line) and "http" in line
    )
0247 
0248 
def count_add_processes():
    """Count the non-empty lines of the ``add.py`` process listing.

    The listing is ``ps -eo pgid,args`` filtered for ``add.py`` (excluding
    the grep itself) and passed through ``uniq``.

    :returns: the number of non-blank output lines as an int.
    """
    # Decode to text: the default bytes output cannot be split with "\n".
    output = subprocess.Popen(
        "ps -eo pgid,args | grep add.py | grep -v grep | uniq",
        stdout=subprocess.PIPE,
        shell=True,
        encoding="utf-8",
        errors="replace",
    ).communicate()[0]
    return sum(1 for line in output.split("\n") if line.strip())
0258 
0259 
def count_holdings():
    """Count entries in ``/var/log/panda/`` whose name contains "finished" or "failed".

    :returns: the number of matching directory entries as an int (0 when
        the listing produces no output).
    """
    # ``grep -E`` replaces the obsolescent ``egrep`` alias.  Decode to text:
    # the default bytes output cannot be split with "\n".
    output = subprocess.Popen(
        "ls /var/log/panda/ | grep -E '(finished|failed)'",
        stdout=subprocess.PIPE,
        shell=True,
        encoding="utf-8",
        errors="replace",
    ).communicate()[0]
    return sum(1 for line in output.split("\n") if line.strip())
0269 
0270 
def registration_time(timeSlice=False, onlyDQ2=False):
    """Average the registration times found in ``panda-Adder.log``.

    Each selected log line contributes the float in its second-to-last
    whitespace-separated field, grouped by the first two characters of its
    second field.  Lines that cannot be parsed are skipped.

    :param timeSlice: when true, scan the whole log and print one average
        per group instead of computing the overall average; when false only
        the last 1000 selected lines are used.
    :param onlyDQ2: when true, select the DQ2-only lines instead of the
        ``LFC+DQ2`` ones.
    :returns: the overall average formatted with one decimal, or "0.0" when
        ``timeSlice`` is true, nothing was parsed, or an error occurred.
    """
    aveRegTime = "0.0"
    try:
        if onlyDQ2:
            # NOTE(review): "registraion" is the pattern as originally written;
            # presumably it matches the spelling used in the log - verify.
            com = "grep registraion /var/log/panda/panda-Adder.log | grep DQ2 | grep -v LFC"
        else:
            com = "grep 'LFC+DQ2' /var/log/panda/panda-Adder.log"
        if not timeSlice:
            com += " | tail -1000"
        # Decode to text: with the default bytes output the split below
        # raised TypeError, which the outer handler turned into "0.0".
        output = subprocess.Popen(
            com,
            stdout=subprocess.PIPE,
            shell=True,
            encoding="utf-8",
            errors="replace",
        ).communicate()[0]

        regtimeMap = {}
        for line in output.split("\n"):
            items = line.split()
            try:
                timestamp = items[1][:2]
                regtime = float(items[-2])
            except (IndexError, ValueError):
                # blank or malformed line
                continue
            bucket = regtimeMap.setdefault(timestamp, {"totalTime": 0.0, "totalReg": 0})
            bucket["totalTime"] += regtime
            bucket["totalReg"] += 1

        if timeSlice:
            for timestamp in sorted(regtimeMap):
                bucket = regtimeMap[timestamp]
                print(f"{timestamp} {bucket['totalTime'] / float(bucket['totalReg']):4.1f}sec")
        else:
            totalTime = 0.0
            totalReg = 0
            for bucket in regtimeMap.values():
                totalTime += bucket["totalTime"]
                totalReg += bucket["totalReg"]
            if totalReg > 0:
                aveRegTime = f"{totalTime / float(totalReg):4.1f}"
    except Exception:
        # best effort: report the problem and fall back to "0.0"
        errtype, ervalue = sys.exc_info()[:2]
        print(f"ERROR : {errtype}:{ervalue} in registration_time")
    return aveRegTime
0310 
0311 
def filelookup_time(timeSlice=False):
    """Average the file lookup time per 100 files from ``panda-broker_util.log``.

    Only lines whose leading ``YYYY-mm-dd HH:MM:SS`` timestamp is at most
    120 minutes older than the current (naive UTC) time are used.  Each such
    line contributes the float in its second-to-last field and the number in
    front of " LFNs", grouped by the first two characters of its second
    field.  Lines that cannot be parsed are skipped.

    :param timeSlice: when true, scan the whole log and print one figure per
        group instead of computing the overall average; when false only the
        last 1000 selected lines are used.
    :returns: ``100 * total time / total files`` formatted with one decimal,
        or "0.0" when ``timeSlice`` is true, nothing was parsed, or an error
        occurred.
    """
    aveRegTime = "0.0"
    # NOTE(review): compared against the log timestamps as naive UTC; assumes
    # the log is written in UTC - confirm.
    timeNow = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    maxAge = datetime.timedelta(minutes=120)
    lfnPattern = re.compile(r" (\d+) LFNs")
    try:
        com = "grep took /var/log/panda/panda-broker_util.log | grep file | grep -v 'for 1 LFNs' | grep -v 'for 2 LFNs'"
        if not timeSlice:
            com += " | tail -1000"
        # Decode to text: with the default bytes output the split below
        # raised TypeError, which the outer handler turned into "0.0".
        output = subprocess.Popen(
            com,
            stdout=subprocess.PIPE,
            shell=True,
            encoding="utf-8",
            errors="replace",
        ).communicate()[0]

        regtimeMap = {}
        for line in output.split("\n"):
            try:
                lineTime = datetime.datetime.strptime(line[:19], "%Y-%m-%d %H:%M:%S")
                if timeNow - lineTime > maxAge:
                    continue
                items = line.split()
                timestamp = items[1][:2]
                regtime = float(items[-2])
            except (IndexError, ValueError):
                # blank or malformed line
                continue
            tmpMatch = lfnPattern.search(line)
            if tmpMatch is None:
                continue
            nFiles = int(tmpMatch.group(1))
            bucket = regtimeMap.setdefault(timestamp, {"totalTime": 0.0, "totalReg": 0})
            bucket["totalTime"] += regtime
            bucket["totalReg"] += nFiles

        if timeSlice:
            for timestamp in sorted(regtimeMap):
                bucket = regtimeMap[timestamp]
                print(f"{timestamp} {100 * bucket['totalTime'] / float(bucket['totalReg']):4.1f}sec")
        else:
            totalTime = 0.0
            totalReg = 0
            for bucket in regtimeMap.values():
                totalTime += bucket["totalTime"]
                totalReg += bucket["totalReg"]
            if totalReg > 0:
                aveRegTime = f"{100 * totalTime / float(totalReg):4.1f}"
    except Exception:
        # best effort: report the problem and fall back to "0.0"
        errtype, ervalue = sys.exc_info()[:2]
        print(f"ERROR : {errtype}:{ervalue} in filelookup_time")
    return aveRegTime
0356 
0357 
def volume_use(volume_name):
    """Return the use percentage ``df`` reports for ``/<volume_name>``.

    :param volume_name: directory name directly under ``/``; it is also used
        as the regular expression that picks the relevant ``df`` output line.
    :returns: the digits in front of ``%`` on the last matching line, as a
        string, or the integer 0 when no line matches.
    """
    used_amount = 0
    # Decode to text: the default bytes output cannot be split with "\n".
    output = subprocess.Popen(
        ["df", "-Pkh", "/" + volume_name],
        stdout=subprocess.PIPE,
        encoding="utf-8",
        errors="replace",
    ).communicate()[0]

    for line in output.split("\n"):
        if not re.search(volume_name, line):
            continue
        percent = re.search(r"(\d+)%", line)
        # a line can mention the volume without carrying a percentage
        if percent is not None:
            used_amount = percent.group(1)

    return used_amount
0369 
0370 
def error_mail(host, message):
    """Send *message* through the ``mail`` command.

    :param host: host name used in the subject line and the first body line.
    :param message: text appended to the body after the heading.
    """
    mail_cmd = ["mail", "-s", f"Problems with {host}", "douglas@cern.ch"]
    text = f"Problems with {host} :\n\n" + message

    # Text-mode pipes: writing a str to the default binary stdin raised
    # TypeError.  communicate() sends the body, closes stdin, drains stdout
    # and waits for the child to exit, so no unreaped process is left behind.
    p = subprocess.Popen(
        mail_cmd,
        stdout=subprocess.PIPE,
        stdin=subprocess.PIPE,
        encoding="utf-8",
        errors="replace",
    )
    p.communicate(text)
0384 
0385 
def count_finished_callback():
    """Count distinct job ids in recent "all files ready" lines of ``panda-Finisher.log``.

    A line counts when its leading ``YYYY-mm-dd HH:MM:SS`` timestamp is at
    most 60 minutes older than the current (naive UTC) time.  The id is the
    fourth-from-last whitespace-separated field.  Lines that cannot be
    parsed are skipped.

    :returns: the number of distinct ids as an int; 0 on any error.
    """
    nJobs = 0
    try:
        # NOTE(review): assumes the log timestamps are UTC - confirm.
        timeNow = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
        maxAge = datetime.timedelta(minutes=60)
        # Decode to text: with the default bytes output the split below
        # raised TypeError, which was silently swallowed so 0 was always
        # returned.
        output = subprocess.Popen(
            "grep 'all files ready' /var/log/panda/panda-Finisher.log",
            stdout=subprocess.PIPE,
            shell=True,
            encoding="utf-8",
            errors="replace",
        ).communicate()[0]

        pandaIDs = set()
        for line in output.split("\n"):
            line = line.strip()
            if line == "":
                continue
            try:
                lineTime = datetime.datetime.strptime(line[:19], "%Y-%m-%d %H:%M:%S")
                if timeNow - lineTime > maxAge:
                    continue
                pandaIDs.add(line.split()[-4])
            except (IndexError, ValueError):
                # malformed line
                continue
        nJobs = len(pandaIDs)
    except Exception:
        # best effort: report the problem and fall back to 0
        errtype, ervalue = sys.exc_info()[:2]
        print(f"ERROR : {errtype}:{ervalue} in count_finished_callback")
    return nJobs
0409 
0410 
def count_finished_catalogcheck():
    """Count distinct job ids in recent "Finish" lines of ``panda-datasetManager.log``.

    Lines containing "Wait" are excluded.  A line counts when its leading
    ``YYYY-mm-dd HH:MM:SS`` timestamp is at most 60 minutes older than the
    current (naive UTC) time.  The id is the fourth-from-last
    whitespace-separated field.  Lines that cannot be parsed are skipped.

    :returns: the number of distinct ids as an int; 0 on any error.
    """
    nJobs = 0
    try:
        # NOTE(review): assumes the log timestamps are UTC - confirm.
        timeNow = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
        maxAge = datetime.timedelta(minutes=60)
        # Decode to text: with the default bytes output the split below
        # raised TypeError, which was silently swallowed so 0 was always
        # returned.
        output = subprocess.Popen(
            "grep Finish /var/log/panda/panda-datasetManager.log | grep -v Wait",
            stdout=subprocess.PIPE,
            shell=True,
            encoding="utf-8",
            errors="replace",
        ).communicate()[0]

        pandaIDs = set()
        for line in output.split("\n"):
            line = line.strip()
            if line == "":
                continue
            try:
                lineTime = datetime.datetime.strptime(line[:19], "%Y-%m-%d %H:%M:%S")
                if timeNow - lineTime > maxAge:
                    continue
                pandaIDs.add(line.split()[-4])
            except (IndexError, ValueError):
                # malformed line
                continue
        nJobs = len(pandaIDs)
    except Exception:
        # best effort: report the problem and fall back to 0
        errtype, ervalue = sys.exc_info()[:2]
        print(f"ERROR : {errtype}:{ervalue} in count_finished_catalogcheck")
    return nJobs
0434 
0435 
# run program
# Guarded so the monitor only runs when the file is executed as a script,
# not as a side effect of importing it.
if __name__ == "__main__":
    __main__()