epic-lfhcal-tbana/NewStructure/summarize_runs.py

0001 import os
0002 import re
0003 import sys
0004 import glob
0005 import pandas as pd
0006
0007 # Use like this:
0008 #   python3 summarize_runs.py root://dtn-eic.jlab.org/volatile/eic/EPIC/MC_input/LFHCal_BT/raw/CAEN_Sept24_TB_PS/
# This writes run_summary_full.csv to ../configs/ (relative to the current working directory).
# It scrapes all of the information from the Run<N>_Info.txt files in the given directory. In addition, it finds the
# size of every matching Run<N>_list.txt / Run<N>.<part>_list.txt (or .csv) file in that directory and counts the
# triggers recorded in those files.
# All of this is summarized in one (huge but handy) table.
0013
0014 try:
0015     from XRootD import client
0016     from XRootD.client.flags import OpenFlags
0017 except ImportError:
0018     client = None
0019
def clean_key(key):
    """Normalise a parameter name so it can serve as a column label.

    Keys that contain a space are only stripped of surrounding whitespace.
    For all other keys, ``[`` becomes ``_``, ``]`` and ``#`` are removed, and
    any remaining character that is not a word character is dropped.
    """
    if ' ' in key:
        return key.strip()

    flattened = key.replace('[', '_').replace(']', '').replace('#', '')
    return re.sub(r'[^\w_]', '', flattened).strip()
0025
def read_xrdfs_file_lines(xrd_url):
    """Read a remote file over XRootD and return its content as a list of lines.

    Args:
        xrd_url: URL of the remote file, e.g. ``root://server/path`` or
            ``root://server//path``.

    Returns:
        The file content decoded as UTF-8 and split with ``str.splitlines()``.

    Raises:
        ImportError: If the XRootD Python bindings could not be imported.
        IOError: If the file cannot be opened or a read request fails.
    """
    if client is None:
        raise ImportError("XRootD client not installed. Run: pip install XRootD")

    # Rewrite root://server/path into the double-slash form root://server//path.
    fixed_url = re.sub(r'^(root://[^/]+)(/[^/])', r'\1/\2', xrd_url)
    if not re.match(r'root://[^/]+//', fixed_url):
        fixed_url = fixed_url.replace("root://", "root:///", 1)

    f = client.File()
    status, _ = f.open(fixed_url, OpenFlags.READ)
    if not status.ok:
        raise IOError(f"Failed to open {fixed_url}: {status.message}")

    chunks = []
    offset = 0
    chunk_size = 65536
    try:
        while True:
            status, data = f.read(offset, chunk_size)
            if not status.ok:
                # A failed read must not be mistaken for end-of-file, otherwise
                # the caller silently receives truncated content.
                raise IOError(
                    f"Failed to read {fixed_url} at offset {offset}: {status.message}"
                )
            if not data:
                break
            chunks.append(data)
            offset += len(data)
    finally:
        # Always release the remote handle, even when reading fails.
        f.close()

    # Decode once over the whole payload: decoding chunk by chunk breaks when a
    # multi-byte UTF-8 character straddles a chunk boundary.
    return b''.join(chunks).decode('utf-8').splitlines()
0051
def parse_info_file(filepath):
    """Parse a run info file into a flat ``{name: value}`` dictionary.

    Each line is cut at the first ``#``, stripped, and skipped when it is empty
    or starts with ``*``. The remaining lines are interpreted, in this order, as

    * ``key: value``  -- split on the first ``:``; the key is only stripped,
    * ``key = value`` -- split on the first ``=``; the key goes through
      ``clean_key``,
    * ``key value``   -- split on the first run of whitespace; the key goes
      through ``clean_key``.

    Afterwards ``Run``, ``Start Time``, ``Stop Time`` and ``Elapsed (s)`` are
    filled in from regex searches over the whole text when they are missing,
    and ``Run`` is converted to ``int``.

    Args:
        filepath: Local path, or an XRootD URL starting with ``root://``.

    Returns:
        dict mapping parameter names to values. Values are strings except
        ``Run`` (int) and a regex-derived ``Elapsed (s)`` (float).

    Raises:
        ValueError: If a ``Run`` entry is present but contains no digits.
    """
    print(f"Parsing info file: {os.path.basename(filepath)}")
    if filepath.startswith("root://"):
        lines = read_xrdfs_file_lines(filepath)
    else:
        # Close the file deterministically and decode it the same way the
        # remote reader does, so both sources yield newline-free lines.
        with open(filepath, encoding="utf-8") as fh:
            lines = fh.read().splitlines()

    data = {}
    for line in lines:
        line = line.split('#', 1)[0].strip()
        if not line or line.startswith('*'):
            continue
        if ':' in line:
            key, val = line.split(':', 1)
            data[key.strip()] = val.strip()
        elif '=' in line:
            key, val = line.split('=', 1)
            data[clean_key(key)] = val.strip()
        elif re.match(r'\w+\[?\d*\]?\s+', line):
            parts = re.split(r'\s+', line, maxsplit=1)
            if len(parts) == 2:
                data[clean_key(parts[0])] = parts[1].strip()

    # The fallbacks below search the unfiltered text, comments included.
    content = "\n".join(lines)
    if "Run" not in data:
        run_match = re.search(r'Run n\.\s*(\d+)', content)
        if run_match:
            data["Run"] = int(run_match.group(1))

    if "Run" in data:
        digits = re.search(r'\d+', str(data["Run"]))
        if digits is None:
            raise ValueError(
                f"No run number found in 'Run' entry {data['Run']!r} of {filepath}"
            )
        data["Run"] = int(digits.group())

    if "Start Time" not in data:
        match = re.search(r'Start Time:\s*(.+)', content)
        if match:
            data["Start Time"] = match.group(1).strip()
    if "Stop Time" not in data:
        match = re.search(r'Stop Time:\s*(.+)', content)
        if match:
            data["Stop Time"] = match.group(1).strip()
    if "Elapsed (s)" not in data:
        match = re.search(r'Elapsed time\s*=\s*([\d.]+)', content)
        if match:
            data["Elapsed (s)"] = float(match.group(1))

    return data
0095
def count_triggers_in_listfile(filepath):
    """Derive the trigger count of a list file from its last trigger ID.

    Only the final 64 KiB of the file are read. Lines are scanned from the end;
    blank lines and lines starting with ``//`` are skipped. The first line with
    at least six whitespace-separated fields whose sixth field parses as an
    integer supplies the last trigger ID.

    Args:
        filepath: Local path, or an XRootD URL starting with ``root://``.

    Returns:
        ``last trigger ID + 1``, or ``0`` when no such line is found.

    Raises:
        ImportError: For a remote path when the XRootD bindings are missing.
        IOError: If a remote file cannot be opened, stat'ed or read.
    """
    print(f"  Fast counting triggers in file: {os.path.basename(filepath)}")
    last_trgid = None
    chunk_size = 65536

    if filepath.startswith("root://"):
        if client is None:
            raise ImportError("XRootD client not installed.")
        # Rewrite root://server/path into the double-slash form root://server//path.
        fixed_url = re.sub(r'^(root://[^/]+)(/[^/])', r'\1/\2', filepath)
        if not re.match(r'root://[^/]+//', fixed_url):
            fixed_url = fixed_url.replace("root://", "root:///", 1)
        f = client.File()
        status, _ = f.open(fixed_url, OpenFlags.READ)
        if not status.ok:
            raise IOError(f"Failed to open {fixed_url}: {status.message}")
        try:
            status, statinfo = f.stat()
            if not status.ok:
                raise IOError(f"Failed to stat {fixed_url}: {status.message}")
            filesize = statinfo.size
            offset = max(0, filesize - chunk_size)
            status, data = f.read(offset, min(chunk_size, filesize))
            if not status.ok:
                raise IOError(f"Failed to read {fixed_url}: {status.message}")
        finally:
            # Release the remote handle even when stat/read fails.
            f.close()
        tail = data or b""
    else:
        with open(filepath, 'rb') as f:
            f.seek(0, os.SEEK_END)
            filesize = f.tell()
            offset = max(0, filesize - chunk_size)
            f.seek(offset)
            tail = f.read()

    # The tail may begin in the middle of a multi-byte character; a strict
    # decode would raise there, so undecodable bytes are replaced instead.
    lines = tail.decode("utf-8", errors="replace").splitlines()
    if offset > 0 and lines:
        # The first line of a tail chunk is most likely a truncated record whose
        # fields are shifted, so it must not be used as a trigger-ID source.
        lines = lines[1:]

    for line in reversed(lines):
        line = line.strip()
        if not line or line.startswith('//'):
            continue
        parts = re.split(r'\s+', line)
        if len(parts) >= 6:
            try:
                last_trgid = int(parts[5])
                break
            except ValueError:
                continue

    return (last_trgid + 1) if last_trgid is not None else 0
0139
def list_remote_files(xrd_base):
    """List the entries of a remote XRootD directory.

    Args:
        xrd_base: Directory URL, either ``root://server/path`` or
            ``root://server//path``.

    Returns:
        One URL per directory entry, built as ``<xrd_base without trailing
        slashes>/<entry name>``.

    Raises:
        ImportError: If the XRootD Python bindings could not be imported.
        IOError: If the directory listing fails.
    """
    if client is None:
        raise ImportError("XRootD client not installed.")

    # Split off the scheme separator only once. Splitting on every '//' loses
    # the path of the double-slash form root://server//path and would list
    # the server's root directory instead.
    remainder = xrd_base.split('//', 1)[1]
    server, _, remote_path = remainder.partition('/')
    path = '/' + remote_path.lstrip('/')

    fs = client.FileSystem(server)
    status, listing = fs.dirlist(path)
    if not status.ok:
        raise IOError(f"Failed to list {xrd_base}: {status.message}")
    return [f"{xrd_base.rstrip('/')}/{entry.name}" for entry in listing]
0150
def _list_file_size(filepath):
    """Return the size of *filepath* in bytes, or ``None`` if it cannot be determined."""
    if filepath.startswith("root://"):
        # Split off the scheme separator only once so that the double-slash
        # form root://server//path keeps its path component.
        remainder = filepath.split('//', 1)[1]
        server, _, remote_path = remainder.partition('/')
        fs = client.FileSystem(server)
        status, statinfo = fs.stat('/' + remote_path.lstrip('/'))
        return statinfo.size if status.ok else None
    try:
        return os.path.getsize(filepath)
    except OSError:
        return None


def summarize_runs(directory):
    """Build one summary row per run found in *directory*.

    Every ``Run<N>_Info.txt`` file is parsed with ``parse_info_file``. The list
    files belonging to the run (``Run<N>_list.txt|csv`` and
    ``Run<N>.<part>_list.txt|csv``) are then looked up in the same directory
    listing, and their number, summed trigger count and summed size are added
    as ``NumParts``, ``NumTriggers`` and ``TotalSize_MB``.

    Info files that fail to parse or carry no run number are skipped with a
    message, as are runs without any list file.

    Args:
        directory: Local directory, or an XRootD URL starting with ``root://``.

    Returns:
        pandas.DataFrame with one row per summarized run; an empty DataFrame
        when no info file is found.
    """
    summary = []
    if directory.startswith("root://"):
        all_files = list_remote_files(directory)
    else:
        all_files = glob.glob(os.path.join(directory, "*"))

    info_files = sorted(f for f in all_files if re.search(r'Run\d+_Info\.txt$', f))

    if not info_files:
        print("No Run*_Info.txt files found.")
        return pd.DataFrame()

    for info_path in info_files:
        try:
            info_data = parse_info_file(info_path)
            run_number = info_data['Run']
        except Exception as e:
            print(f"  Skipping {info_path}: {e}")
            continue

        base = f"Run{run_number}"
        # One pattern per run covers both the single-file name and the
        # numbered multi-part names.
        list_pattern = re.compile(f".*{base}(\\.\\d+)?_list\\.(txt|csv)$")
        part_files = []

        for f in all_files:
            if list_pattern.match(f):
                print(f"    --> Matched: {os.path.basename(f)}")
                part_files.append(f)

        if not part_files:
            print(f"  Warning: no list files found for {base}")
            continue

        n_parts = len(part_files)
        n_triggers = sum(count_triggers_in_listfile(f) for f in part_files)

        total_size = 0
        for f in part_files:
            size = _list_file_size(f)
            if size is None:
                print(f"    Warning: Could not get size for {f}")
            else:
                total_size += size

        info_data["NumParts"] = n_parts
        info_data["NumTriggers"] = n_triggers
        info_data["TotalSize_MB"] = round(total_size / (1024 * 1024), 2)
        summary.append(info_data)

    return pd.DataFrame(summary)
0209
def main():
    """Command-line entry point: summarize a run directory and write the CSV table.

    The positional ``path`` argument is handed to ``summarize_runs``. The table
    is sorted by ``Run`` when that column exists and written to the file given
    with ``-o/--output`` (default ``../configs/run_summary_full.csv``).
    """
    import argparse
    parser = argparse.ArgumentParser(description="Summarize CAEN DT5202 run files")
    parser.add_argument("path", help="Local directory or XRootD path (e.g. ./data or root://server/path/)")
    parser.add_argument(
        "-o", "--output",
        default="../configs/run_summary_full.csv",
        help="CSV file to write (default: %(default)s)",
    )
    args = parser.parse_args()

    summary_df = summarize_runs(args.path)
    outfile = args.output
    if summary_df.empty:
        # Do not replace an existing summary with an empty table.
        print("No runs summarized; not writing", outfile)
        return

    if "Run" in summary_df.columns:
        summary_df = summary_df.sort_values(by="Run")

    # Create the target directory if needed, so the collected summary is not
    # lost to a missing-directory error at the very end.
    outdir = os.path.dirname(outfile)
    if outdir:
        os.makedirs(outdir, exist_ok=True)

    print("Writing file: ", outfile)
    summary_df.to_csv(outfile, index=False)
    print(summary_df.head())


if __name__ == "__main__":
    main()
0223