File indexing completed on 2026-04-10 08:39:08
0001
0002
0003 import datetime
0004 import optparse
0005 import re
0006 import socket
0007 import subprocess
0008 import sys
0009
0010 import SLSxml
0011
0012
0013
0014
# Command-line interface. The parsed result is stored in the module-level
# `options`, which the functions in this file read directly.
parser = optparse.OptionParser()
parser.add_option(
    "-u",
    "--use",
    dest="use",
    type="string",
    help="Use of xml, allowed values: 'mon', 'server' or 'bamboo'",
)
parser.add_option(
    "--host",
    dest="host",
    type="string",
    help="Hostname of server to check, default is current machine hostname",
)
# The value is used as the directory into which "<kind>_<host>.xml" is
# written, so the help text describes a directory rather than a file name.
parser.add_option(
    "-d",
    "--dir",
    dest="dir",
    type="string",
    help="Directory for the xml file output. Default is /data/atlpan/oracle/panda/monitoring",
)
parser.add_option(
    "--debug",
    action="store_true",
    dest="debug",
    default=False,
    help="Print out debug statements.",
)

(options, args) = parser.parse_args()
0022
0023
def __main__():
    """Build the XML selected by ``--use`` and write it to the output directory.

    The host comes from ``--host`` when given, otherwise from
    ``socket.gethostname()`` reduced to its leading run of word characters.
    The document is written to ``<dir>/<file_part>_<host>.xml`` where ``<dir>``
    is ``--dir`` or ``/data/atlpan/oracle/panda/monitoring``.

    When ``--use`` is not ``mon``, ``server`` or ``bamboo`` an error line is
    printed and nothing is written.
    """
    if options.host:
        host = options.host
    else:
        # NOTE(review): the short-name reduction is applied to the local
        # hostname only; confirm an explicit --host should be used verbatim.
        host = re.sub(r"^(\w+).*", r"\1", socket.gethostname())

    # Map each supported use to its XML builder and output file prefix.
    builders = {
        "mon": (make_monitor, "PandaMon"),
        "server": (make_server, "PandaServer"),
        "bamboo": (make_bamboo, "PandaBamboo"),
    }
    selected = builders.get(options.use)
    if selected is None:
        print("Err: please choose a use, 'mon', 'server' or 'bamboo'.")
        return
    build_xml, file_part = selected
    tmp_xml = build_xml(host)

    file_dir = options.dir if options.dir else "/data/atlpan/oracle/panda/monitoring"
    file_name = f"{file_dir}/{file_part}_{host}.xml"

    # The context manager closes the file even if the write fails.
    with open(file_name, "w") as tmp_file:
        tmp_file.write(tmp_xml)
0053
0054
def make_server(host):
    """Return the PandaServer SLS XML text for *host*.

    All metrics are collected first, then an ``SLSxml.xml_doc`` is filled with
    the service identity, the availability and one data entry per metric.
    """
    if options.debug:
        print("Creating the server monitoring xml")

    # Collect the metrics up front, in a fixed order.
    availability = server_availability(host)
    n_add_processes = count_add_processes()
    n_holdings = count_holdings()
    data_pct = volume_use("data")
    var_pct = volume_use("var")
    reg_time = registration_time()
    reg_time_dq2 = registration_time(onlyDQ2=True)
    lookup_time = filelookup_time()
    n_by_callback = count_finished_callback()
    n_by_catalog = count_finished_catalogcheck()

    doc = SLSxml.xml_doc()
    doc.set_id(f"PandaServer_{host}")
    title = f"PandaServer monitoring service at {host}"
    doc.set_shortname(title)
    doc.set_fullname(title)
    doc.set_availability(str(availability))

    # (name, description, value) triples, in the order they are added.
    entries = (
        ("AddProcesses", "Number of processes for DQ2+LFC registration", n_add_processes),
        ("HoldingJobs", "Number of holding jobs to be registered", n_holdings),
        ("RegistrationTime", "Average time for DQ2+LFC registration in second", reg_time),
        ("RegistrationTimeDQ2", "Average time for DQ2 registration in second", reg_time_dq2),
        ("FileLookupTime", "Average time for replica lookup per 100 files in second", lookup_time),
        ("DataVolumeUse", "Percent use of the local /data volume", data_pct),
        ("VarVolumeUse", "Percent use of the local /var volume", var_pct),
        ("FinishedJobsByCallback", "Number of finished jobs by callbacks", n_by_callback),
        ("FinishedJobsByCatalog", "Number of finished jobs by catalog check", n_by_catalog),
    )
    for name, description, value in entries:
        doc.add_data(name, description, str(value))

    return doc.print_xml()
0086
0087
def make_bamboo(host):
    """Return the PandaBamboo SLS XML text for *host*.

    The document carries only the service identity and the availability
    reported by ``bamboo_availability``.
    """
    if options.debug:
        # Previously printed the server message, a copy-paste slip.
        print("Creating the bamboo monitoring xml")

    server_avail = bamboo_availability(host)

    sls_xml = SLSxml.xml_doc()
    sls_xml.set_id(f"PandaBamboo_{host}")
    sls_xml.set_shortname(f"PandaBamboo monitoring service at {host}")
    sls_xml.set_fullname(f"PandaBamboo monitoring service at {host}")
    sls_xml.set_availability(str(server_avail))
    return sls_xml.print_xml()
0100
0101
def make_monitor(host):
    """Return the PandaMon SLS XML text for *host*.

    Probes the web server, the squid server and the panda monitor, gathers the
    http process count and the /data and /var usage, and mails a summary of
    any probe that reported "0". The document's availability is the panda
    monitor result.
    """
    if options.debug:
        print("Creating the monitor monitoring xml")

    http_avail = httpd_availability(host)
    squid_avail = squid_availability()
    panda_avail = panda_availability(host)

    # The probes return the strings "0"/"100". Comparing them with the integer
    # 0 never matched, so failures were never reported; compare as text.
    probes = (
        ("web server", http_avail),
        ("squid server", squid_avail),
        ("panda monitor", panda_avail),
    )
    failed = [label for label, avail in probes if str(avail) == "0"]
    messagetext = "".join(f"Error: {label} on {host} not working\n" for label in failed)

    http_processes = count_processes()

    data_used = volume_use("data")
    var_used = volume_use("var")

    if failed:
        # Notification is best-effort: a mail failure must not stop the XML
        # from being produced, especially while a service is down.
        try:
            error_mail(host, messagetext)
        except Exception as exc:
            print(f"ERROR : could not send error mail for {host}: {exc}")

    if options.debug:
        print(f"web - {http_avail}, squid - {squid_avail}, panda - {panda_avail}")

    sls_xml = SLSxml.xml_doc()
    sls_xml.set_id(f"PandaMon_{host}")
    sls_xml.set_shortname(f"PandaMonitor monitoring service at {host}")
    sls_xml.set_fullname(f"PandaMonitor monitoring service at {host}")
    sls_xml.set_availability(str(panda_avail))

    sls_xml.add_data("HttpdAvailability", "Availability of the httpd server", str(http_avail))
    sls_xml.add_data("SquidAvailability", "Availability of the squid server", str(squid_avail))
    sls_xml.add_data("PandaAvailability", "Availability of the panda monitor", str(panda_avail))
    sls_xml.add_data("HttpProcesses", "Number of processes for the panda monitor", str(http_processes))
    sls_xml.add_data("DataVolumeUse", "Percent use of the local /data volume", str(data_used))
    sls_xml.add_data("VarVolumeUse", "Percent use of the local /var volume", str(var_used))
    return sls_xml.print_xml()
0152
0153
def httpd_availability(host):
    """Check ``http://<host>.cern.ch/robots.txt`` for the text "go away".

    Returns whatever ``check_url`` reports for that probe.
    """
    return check_url(f"http://{host}.cern.ch/robots.txt", "go away")
0157
0158
def squid_availability():
    """Run ``squidclient`` against the local cache info page and look for "OK".

    Returns whatever ``check_command`` reports for that probe.
    """
    probe = "/usr/bin/squidclient -p 25980 cache_object://localhost/info"
    expected = "OK"
    return check_command(probe, expected)
0162
0163
def panda_availability(host, full_check=False):
    """Report whether the panda monitor on *host* answers its probe queries.

    Args:
        host: Host name used to build ``http://<host>:25980/server/pandamon/query?``.
        full_check: When False (the default) only the ``isAlive`` query is
            probed. When True the production, cloud, incident and DDM pages
            are probed as well. Those probes used to sit after an
            unconditional ``return`` and could never run; they are now opt-in.

    Returns:
        "100" when every selected probe succeeds, otherwise "0".
    """
    port = "25980"
    baseurl = "http://" + host + ":" + port + "/server/pandamon/query?"

    # (query, text expected in the response) pairs, checked in order.
    probes = [("isAlive", "yes")]
    if full_check:
        probes += [
            ("dash=prod", "CERN:OK"),
            ("dash=clouds", "Cloud status"),
            ("overview=incidents", "Recorded incidents"),
            ("dash=ddm", "Space available"),
        ]

    for query, expected in probes:
        if check_url(baseurl + query, expected) != "100":
            return "0"

    return "100"
0196
0197
def server_availability(host):
    """Probe the panda server ``isAlive`` page on port 25443 of *host*.

    The wget option ``--no-check-certificate`` travels inside the URL string
    handed to ``check_url``. Returns "100" when the probe reports "100",
    otherwise "0".
    """
    probe = f"--no-check-certificate https://{host}:25443/server/panda/isAlive"
    return "100" if check_url(probe, "alive=yes") == "100" else "0"
0205
0206
def bamboo_availability(host):
    """Probe the bamboo ``isAlive`` page on port 25070 of *host*.

    Returns "100" when ``check_url`` reports "100" for the text "alive=yes",
    otherwise "0".
    """
    probe = f"http://{host}:25070/bamboo/bamboo/isAlive"
    return "100" if check_url(probe, "alive=yes") == "100" else "0"
0214
0215
def check_url(url, check_string):
    """Fetch *url* with ``wget -q -O -`` and look for *check_string* in the output.

    Delegates to ``check_command`` and returns its result unchanged.
    """
    return check_command(f"wget -q -O - {url}", check_string)
0219
0220
def check_command(command, check_string):
    """Run *command* and report whether its standard output matches *check_string*.

    Args:
        command: Command line; split on whitespace and run without a shell.
        check_string: Pattern handed to ``re.search``.

    Returns:
        "100" when the pattern is found in the output, otherwise "0". A
        command that cannot be started also yields "0".
    """
    if options.debug:
        print(f"Checking command : {command}")
        print(f"For string : {check_string}")

    try:
        # Decode the output to text: searching bytes with a str pattern raises
        # TypeError. Undecodable bytes are replaced rather than raising.
        completed = subprocess.run(
            command.split(),
            stdout=subprocess.PIPE,
            encoding="utf-8",
            errors="replace",
            check=False,
        )
    except OSError as exc:
        # A probe whose executable is missing counts as "not available".
        print(f"ERROR : cannot run '{command}': {exc}")
        return "0"

    if re.search(check_string, completed.stdout):
        if options.debug:
            print("Found the string, return 100")
        return "100"

    if options.debug:
        print("String not found, return 0")
    return "0"
0237
0238
def count_processes():
    """Count ``ps aux`` lines that start with the panda user and mention "http".

    Returns:
        The number of matching lines as an int.
    """
    # Decode to text; splitting the raw bytes on a str separator raises TypeError.
    listing = subprocess.run(
        ["ps", "aux"],
        stdout=subprocess.PIPE,
        encoding="utf-8",
        errors="replace",
        check=False,
    ).stdout

    # NOTE(review): "@@panda_user@@" looks like an install-time placeholder for
    # the account name -- confirm it is substituted before deployment.
    return sum(
        1
        for line in listing.split("\n")
        if re.match("@@panda_user@@", line) and "http" in line
    )
0247
0248
def count_add_processes():
    """Count the non-blank lines produced by the ``add.py`` process listing.

    The shell pipeline lists ``pgid,args`` for every process, keeps lines
    containing ``add.py``, drops the grep processes themselves and collapses
    adjacent duplicate lines with ``uniq``.

    Returns:
        The number of remaining non-blank lines as an int.
    """
    # Decode to text; splitting the raw bytes on a str separator raises TypeError.
    listing = subprocess.run(
        "ps -eo pgid,args | grep add.py | grep -v grep | uniq",
        shell=True,
        stdout=subprocess.PIPE,
        encoding="utf-8",
        errors="replace",
        check=False,
    ).stdout
    return sum(1 for line in listing.split("\n") if line.strip())
0258
0259
def count_holdings():
    """Count entries in ``/var/log/panda/`` whose name contains "finished" or "failed".

    Returns:
        The number of matching entries as an int; 0 when the listing is empty.
    """
    # `grep -E` replaces the obsolescent `egrep` alias. The output is decoded
    # to text because splitting raw bytes on a str separator raises TypeError.
    listing = subprocess.run(
        "ls /var/log/panda/ | grep -E '(finished|failed)'",
        shell=True,
        stdout=subprocess.PIPE,
        encoding="utf-8",
        errors="replace",
        check=False,
    ).stdout
    return sum(1 for line in listing.split("\n") if line.strip())
0269
0270
def registration_time(timeSlice=False, onlyDQ2=False):
    """Average the registration times found in ``/var/log/panda/panda-Adder.log``.

    Each selected log line contributes the float in its second-to-last field,
    bucketed by the first two characters of its second field.

    Args:
        timeSlice: When True, print one ``"<bucket> <average>sec"`` line per
            bucket (over the whole log) instead of computing the overall
            average. When False only the last 1000 selected lines are used.
        onlyDQ2: When True, select lines matching "registraion" and "DQ2" but
            not "LFC"; otherwise select lines matching "LFC+DQ2".

    Returns:
        The overall average formatted with one decimal, or "0.0" when
        *timeSlice* is set, no line could be parsed, or an error occurred.
    """
    aveRegTime = "0.0"
    try:
        if onlyDQ2:
            # NOTE(review): "registraion" presumably mirrors the spelling used
            # in the log messages -- confirm before correcting it.
            com = "grep registraion /var/log/panda/panda-Adder.log | grep DQ2 | grep -v LFC"
        else:
            com = "grep 'LFC+DQ2' /var/log/panda/panda-Adder.log"
        if not timeSlice:
            com += " | tail -1000"

        # Decode to text; splitting the raw bytes on a str separator raised a
        # TypeError that the outer handler turned into a constant "0.0".
        output = subprocess.run(
            com,
            shell=True,
            stdout=subprocess.PIPE,
            encoding="utf-8",
            errors="replace",
            check=False,
        ).stdout

        regtimeMap = {}
        for line in output.split("\n"):
            items = line.split()
            try:
                timestamp = items[1][:2]
                regtime = float(items[-2])
            except (IndexError, ValueError):
                # Blank or malformed line: skip it.
                continue
            bucket = regtimeMap.setdefault(timestamp, {"totalTime": 0.0, "totalReg": 0})
            bucket["totalTime"] += regtime
            bucket["totalReg"] += 1

        if timeSlice:
            for timestamp in sorted(regtimeMap):
                print(f"{timestamp} {regtimeMap[timestamp]['totalTime'] / float(regtimeMap[timestamp]['totalReg']):4.1f}sec")
        else:
            totalTime = sum(bucket["totalTime"] for bucket in regtimeMap.values())
            totalReg = sum(bucket["totalReg"] for bucket in regtimeMap.values())
            if totalReg > 0:
                aveRegTime = f"{totalTime / float(totalReg):4.1f}"
    except Exception:
        # Best-effort metric: report the problem and fall back to "0.0".
        errtype, ervalue = sys.exc_info()[:2]
        print(f"ERROR : {errtype}:{ervalue} in registration_time")
    return aveRegTime
0310
0311
def filelookup_time(timeSlice=False):
    """Average the replica lookup time per 100 files from ``panda-broker_util.log``.

    Only log lines whose leading ``YYYY-mm-dd HH:MM:SS`` timestamp is at most
    120 minutes older than the current UTC time are used. Each such line
    contributes the float in its second-to-last field as elapsed time and the
    integer before " LFNs" as the file count.

    Args:
        timeSlice: When True, print one ``"<bucket> <average>sec"`` line per
            bucket (first two characters of the second field) instead of
            computing the overall figure. When False only the last 1000
            selected lines are used.

    Returns:
        ``100 * total time / total files`` formatted with one decimal, or
        "0.0" when *timeSlice* is set, nothing could be parsed, or an error
        occurred.
    """
    aveRegTime = "0.0"
    # NOTE(review): compared as naive UTC -- assumes the log timestamps are UTC.
    timeNow = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    maxAge = datetime.timedelta(minutes=120)
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    lfnPattern = re.compile(r" (\d+) LFNs")
    try:
        com = "grep took /var/log/panda/panda-broker_util.log | grep file | grep -v 'for 1 LFNs' | grep -v 'for 2 LFNs'"
        if not timeSlice:
            com += " | tail -1000"

        # Decode to text; splitting the raw bytes on a str separator raised a
        # TypeError that the outer handler turned into a constant "0.0".
        output = subprocess.run(
            com,
            shell=True,
            stdout=subprocess.PIPE,
            encoding="utf-8",
            errors="replace",
            check=False,
        ).stdout

        regtimeMap = {}
        for line in output.split("\n"):
            try:
                logTime = datetime.datetime.strptime(line[:19], "%Y-%m-%d %H:%M:%S")
                if timeNow - logTime > maxAge:
                    continue
                items = line.split()
                timestamp = items[1][:2]
                regtime = float(items[-2])
            except (ValueError, IndexError):
                # Blank or malformed line: skip it.
                continue
            tmpMatch = lfnPattern.search(line)
            if tmpMatch is None:
                continue
            nFiles = int(tmpMatch.group(1))
            bucket = regtimeMap.setdefault(timestamp, {"totalTime": 0.0, "totalReg": 0})
            bucket["totalTime"] += regtime
            bucket["totalReg"] += nFiles

        if timeSlice:
            for timestamp in sorted(regtimeMap):
                print(f"{timestamp} {100 * regtimeMap[timestamp]['totalTime'] / float(regtimeMap[timestamp]['totalReg']):4.1f}sec")
        else:
            totalTime = sum(bucket["totalTime"] for bucket in regtimeMap.values())
            totalReg = sum(bucket["totalReg"] for bucket in regtimeMap.values())
            if totalReg > 0:
                aveRegTime = f"{100 * totalTime / float(totalReg):4.1f}"
    except Exception:
        # Best-effort metric: report the problem and fall back to "0.0".
        errtype, ervalue = sys.exc_info()[:2]
        print(f"ERROR : {errtype}:{ervalue} in filelookup_time")
    return aveRegTime
0356
0357
def volume_use(volume_name):
    """Return the use percentage ``df`` reports for ``/<volume_name>``.

    Runs ``df -Pkh /<volume_name>`` and, for every output line that matches
    *volume_name* (used as a regular expression), extracts the digits before
    the first ``%`` sign. The last such line wins.

    Args:
        volume_name: Name of the top-level directory, without the leading slash.

    Returns:
        The percentage digits as a str, or the int 0 when no matching line
        with a percentage was found.
    """
    # Decode to text; splitting the raw bytes on a str separator raises TypeError.
    output = subprocess.run(
        ["df", "-Pkh", "/" + volume_name],
        stdout=subprocess.PIPE,
        encoding="utf-8",
        errors="replace",
        check=False,
    ).stdout

    used_amount = 0
    for line in output.split("\n"):
        if re.search(volume_name, line):
            percent = re.search(r"(\d+)%", line)
            # A matching line without a percentage (e.g. the header) is
            # skipped instead of failing on a None match.
            if percent is not None:
                used_amount = percent.group(1)

    return used_amount
0369
0370
def error_mail(host, message, recipient="douglas@cern.ch"):
    """Send *message* about *host* by piping it to the ``mail`` command.

    Args:
        host: Host name used in the subject and the first line of the body.
        message: Text appended to the body after the heading.
        recipient: Destination address; defaults to the previously hard-coded
            one.

    Raises:
        OSError: If the ``mail`` executable cannot be started.
    """
    mail_cmd = ["mail", "-s", f"Problems with {host}", recipient]
    text = f"Problems with {host} :\n\n{message}"

    # `input=` with an encoding sends the body as text (writing a str to the
    # raw pipe raised TypeError) and waits for `mail` to finish, so the child
    # is reaped and the message is handed over before the script exits.
    subprocess.run(
        mail_cmd,
        input=text,
        stdout=subprocess.DEVNULL,
        encoding="utf-8",
        errors="replace",
        check=False,
    )
0384
0385
def _count_recent_panda_ids(command, max_age_minutes=60):
    """Count distinct IDs in recent log lines printed by the shell *command*.

    A line counts when its leading ``YYYY-mm-dd HH:MM:SS`` timestamp is at
    most *max_age_minutes* older than the current UTC time. The ID is the
    fourth field from the end of the line.
    """
    # NOTE(review): compared as naive UTC -- assumes the log timestamps are UTC.
    timeNow = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    maxAge = datetime.timedelta(minutes=max_age_minutes)

    # Decode to text; splitting the raw bytes on a str separator raised a
    # TypeError that was silently swallowed, so the count was always 0.
    output = subprocess.run(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        encoding="utf-8",
        errors="replace",
        check=False,
    ).stdout

    pandaIDs = set()  # a set keeps the de-duplication linear
    for line in output.split("\n"):
        line = line.strip()
        if line == "":
            continue
        try:
            logTime = datetime.datetime.strptime(line[:19], "%Y-%m-%d %H:%M:%S")
            if timeNow - logTime > maxAge:
                continue
            pandaIDs.add(line.split()[-4])
        except (ValueError, IndexError):
            # Line without a leading timestamp or with too few fields: skip it.
            continue
    return len(pandaIDs)


def count_finished_callback():
    """Count distinct jobs finished by callback within the last 60 minutes.

    Reads the "all files ready" lines of ``/var/log/panda/panda-Finisher.log``.

    Returns:
        The number of distinct IDs as an int, or 0 when an error occurred.
    """
    try:
        return _count_recent_panda_ids("grep 'all files ready' /var/log/panda/panda-Finisher.log")
    except Exception:
        # Best-effort metric: report the problem and fall back to 0.
        errtype, ervalue = sys.exc_info()[:2]
        print(f"ERROR : {errtype}:{ervalue} in count_finished_callback")
        return 0


def count_finished_catalogcheck():
    """Count distinct jobs finished by catalog check within the last 60 minutes.

    Reads the lines of ``/var/log/panda/panda-datasetManager.log`` that match
    "Finish" but not "Wait".

    Returns:
        The number of distinct IDs as an int, or 0 when an error occurred.
    """
    try:
        return _count_recent_panda_ids("grep Finish /var/log/panda/panda-datasetManager.log | grep -v Wait")
    except Exception:
        # Best-effort metric: report the problem and fall back to 0.
        errtype, ervalue = sys.exc_info()[:2]
        print(f"ERROR : {errtype}:{ervalue} in count_finished_catalogcheck")
        return 0
0434
0435
0436
# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    __main__()