File indexing completed on 2026-04-25 08:29:08
0001
0002
0003 from pathlib import Path
0004 import matplotlib as mpl
0005 import matplotlib.pyplot as plt
0006 from matplotlib.colors import LogNorm
0007 import numpy as np
0008 import collections
0009 import pickle
0010 import json
0011
0012 import pprint
0013
0014 from argparsing import monitor_args
0015 from sphenixdbutils import test_mode as dbutils_test_mode
0016 from simpleLogger import slogger, CHATTY, DEBUG, INFO, WARN, ERROR, CRITICAL
0017 from sphenixmisc import setup_rot_handler
0018 from sphenixcondortools import base_batchname_from_args, monitor_condor_jobs
0019 import htcondor2 as htcondor
0020
0021 def plot_memory_distribution(memory_usage, request_memory, output_file):
0022 """Generates and saves a histogram of memory usage vs. requested memory."""
0023 if not memory_usage and not request_memory:
0024 INFO("No memory data to plot for distribution.")
0025 return
0026
0027 mpl.rcParams['axes.formatter.useoffset'] = False
0028
0029 plt.style.use('seaborn-v0_8-deep')
0030 fig, ax = plt.subplots(figsize=(12, 7))
0031
0032 max_val = max(np.max(memory_usage) if memory_usage else 0, np.max(request_memory) if request_memory else 0)
0033 bins = np.linspace(0, max_val, 50)
0034
0035 if memory_usage:
0036 ax.hist(memory_usage, bins=bins, alpha=0.7, label=f'Memory Usage (Avg: {np.mean(memory_usage):.0f} MB)')
0037 if request_memory:
0038 ax.hist(request_memory, bins=bins, alpha=0.7, label=f'Requested Memory (Avg: {np.mean(request_memory):.0f} MB)')
0039
0040 ax.set_title('Distribution of Memory Usage and Request for Held Jobs')
0041 ax.set_xlabel('Memory (MB)')
0042 ax.set_ylabel('Number of Jobs')
0043 ax.legend()
0044 ax.grid(True, which='both', linestyle='--', linewidth=0.5)
0045
0046 plt.tight_layout()
0047 plt.savefig(output_file)
0048 plt.close(fig)
0049 INFO(f"Saved memory distribution plot to {output_file}")
0050
0051 def plot_memory_boxplot(memory_usage, request_memory, output_file):
0052 """Generates and saves a boxplot of memory usage vs. requested memory."""
0053 if not memory_usage and not request_memory:
0054 INFO("No memory data to plot for boxplot.")
0055 return
0056
0057 plt.style.use('seaborn-v0_8-deep')
0058 fig, ax = plt.subplots(figsize=(10, 7))
0059
0060 data_to_plot = []
0061 labels = []
0062 if memory_usage:
0063 data_to_plot.append(memory_usage)
0064 labels.append('Memory Usage')
0065 if request_memory:
0066 data_to_plot.append(request_memory)
0067 labels.append('Requested Memory')
0068
0069 ax.boxplot(data_to_plot, patch_artist=True)
0070
0071 ax.set_title('Box Plot of Memory Usage and Request for Held Jobs')
0072 ax.set_ylabel('Memory (MB)')
0073 ax.set_xticklabels(labels)
0074 ax.yaxis.grid(True, linestyle='--', which='major', color='grey', alpha=0.5)
0075
0076 plt.tight_layout()
0077 plt.savefig(output_file)
0078 plt.close(fig)
0079 INFO(f"Saved memory box plot to {output_file}")
0080
0081 def plot_memory_scatterplot(memory_usage, request_memory, output_file):
0082 """Generates and saves a 2D histogram of memory usage vs. requested memory."""
0083 if not memory_usage or not request_memory:
0084 INFO("Not enough data to plot 2D histogram.")
0085 return
0086
0087 plt.style.use('seaborn-v0_8-deep')
0088 fig, ax = plt.subplots(figsize=(11, 10))
0089
0090 x_data = np.array(request_memory)
0091 y_data = np.array(memory_usage)
0092
0093
0094 counts, xedges, yedges, im = ax.hist2d(x_data, y_data, bins=50, cmap='viridis', norm=LogNorm())
0095
0096
0097 fig.colorbar(im, ax=ax, label='Number of Jobs')
0098
0099
0100 max_val = max(np.max(x_data) if len(x_data) > 0 else 0, np.max(y_data) if len(y_data) > 0 else 0)
0101 ax.plot([0, max_val], [0, max_val], 'r--', label='Requested = Used (y=x)')
0102
0103 ax.set_title('Memory Usage vs. Requested Memory for Held Jobs (2D Histogram)')
0104 ax.set_xlabel('Requested Memory (MB)')
0105 ax.set_ylabel('Actual Memory Usage (MB)')
0106 ax.legend()
0107 ax.grid(True, which='both', linestyle='--', linewidth=0.5)
0108 ax.set_aspect('equal', adjustable='box')
0109
0110 plt.tight_layout()
0111 plt.savefig(output_file)
0112 plt.close(fig)
0113 INFO(f"Saved memory 2D histogram to {output_file}")
0114
0115 def main():
0116 args = monitor_args()
0117
0118 test_mode = (
0119 dbutils_test_mode
0120 or args.test_mode
0121
0122 )
0123
0124
0125 sublogdir=setup_rot_handler(args)
0126 slogger.setLevel(args.loglevel)
0127 INFO(f"Logging to {sublogdir}, level {args.loglevel}")
0128
0129 if test_mode:
0130 INFO("Running in testbed mode.")
0131 args.mangle_dirpath = 'production-testbed'
0132 else:
0133 INFO("Running in production mode.")
0134
0135 batch_name=base_batchname_from_args(args)
0136 jobs=monitor_condor_jobs(batch_name=batch_name, dryrun=args.dryrun)
0137
0138
0139 held_jobs_ads = [ad for ad in jobs.values() if ad.get('JobStatus') == 5]
0140
0141 if not held_jobs_ads:
0142 INFO(f"Found {len(jobs)} total jobs, but none are currently held.")
0143 return
0144
0145 INFO(f"Found {len(jobs)} total jobs, {len(held_jobs_ads)} of which are held.")
0146
0147 held_memory_usage = []
0148 held_request_memory = []
0149 kill_suggestion = []
0150 under_memory_hold_reasons = collections.Counter()
0151 for job_ad in held_jobs_ads:
0152
0153 mu = int(job_ad.get('ResidentSetSize', 0))/1024
0154 rm = int(job_ad.get('MemoryProvisioned', 0))
0155 held_memory_usage.append(mu)
0156 held_request_memory.append(rm)
0157
0158 if mu < rm:
0159 hold_reason = job_ad.get('HoldReason', 'Not Available')
0160 job_id = f"{job_ad.get('ClusterId')}.{job_ad.get('ProcId')}"
0161 DEBUG(f"Job {job_id} held with mu ({mu:.0f}MB) < rm ({rm}MB). Reason: {hold_reason}")
0162 reason_code = job_ad.get('LastHoldReasonCode', 0)
0163 if reason_code !=26 :
0164 WARN(f'Job {job_id} held with mu ({mu:.0f}MB) < rm ({rm}MB). Reason Code {reason_code}:\n\t"{hold_reason}"')
0165 under_memory_hold_reasons[reason_code] += 1
0166
0167
0168
0169
0170
0171
0172
0173
0174
0175
0176
0177
0178
0179
0180
0181
0182
0183
0184
0185
0186
0187
0188
0189
0190
0191
0192
0193
0194
0195
0196
0197
0198
0199 if args.plot:
0200 if held_memory_usage or held_request_memory:
0201 dist_plot_file = f"{batch_name}_memory_distribution.png"
0202 box_plot_file = f"{batch_name}_memory_boxplot.png"
0203 scatter_plot_file = f"{batch_name}_memory_scatterplot.png"
0204 plot_memory_distribution(held_memory_usage, held_request_memory, dist_plot_file)
0205 plot_memory_boxplot(held_memory_usage, held_request_memory, box_plot_file)
0206 plot_memory_scatterplot(held_memory_usage, held_request_memory, scatter_plot_file)
0207
0208 if under_memory_hold_reasons:
0209 INFO("Frequency of hold reason codes for jobs held while under memory request:")
0210 pprint.pprint(dict(under_memory_hold_reasons))
0211
0212
0213 if kill_suggestion:
0214 INFO(f"There were {len(kill_suggestion)} jobs that could not be resubmitted due to exceeding max memory.")
0215 if args.kill:
0216 INFO(f"Killing them now as per --kill option.")
0217 if not args.dryrun or True:
0218 schedd = htcondor.Schedd()
0219 with open(f"{batch_name}_killed_jobs.pkl", "wb") as f:
0220 pickle.dump(kill_suggestion, f)
0221 with open(f"{batch_name}_killed_jobs.json", "w") as f:
0222 for job_ad in kill_suggestion:
0223 json.dump(dict(job_ad), f, indent=4)
0224 try:
0225
0226
0227 INFO(f"Killed {len(kill_suggestion)} jobs that exceeded max memory limit of {args.max_memory}MB.")
0228 except Exception as e:
0229 ERROR(f"Failed to kill jobs: {e}")
0230 else:
0231 kill_procs=[f"{job_ad['ClusterId']}.{job_ad['ProcId']}" for job_ad in kill_suggestion]
0232 INFO(f"You may want to kill them manually: \n{', '.join(kill_procs)}")
0233
0234
0235 INFO(f"{Path(__file__).name} DONE.")
0236
0237 if __name__ == '__main__':
0238 main()
0239 exit(0)