source/sphenixprod/create_full_filelist_run_seg.py

0001 #!/usr/bin/env python3
0002
0003 """
0004 Generate a list of files (and one with full paths) with given specifications.
0005 Note: It should only find one file every time.
0006 """
0007
0008 import sys
0009 from simpleLogger import slogger, CHATTY, DEBUG, INFO, WARN, ERROR, CRITICAL  # noqa: F401
0010 from sphenixdbutils import cnxn_string_map, dbQuery # type: ignore
0011
def _sql_in_list(values):
    """Join *values* for use inside a quoted SQL ``IN ( '...' )`` list.

    Embedded single quotes are doubled (standard SQL string-literal
    escaping) so a value cannot terminate its own literal. The outer
    quotes are NOT included; the caller wraps the result in ``'...'``.
    """
    return "','".join(value.replace("'", "''") for value in values)


def _write_lines(path, lines):
    """Write each entry of *lines* to *path*, one per line.

    Returns True on success. On an OS-level error the problem is printed
    and False is returned, so the caller can still attempt other outputs.
    """
    try:
        with open(path, 'w') as f_out:
            f_out.writelines(f"{line}\n" for line in lines)
    except OSError as e:
        print(f"Error writing to file {path}: {e}")
        return False
    return True


def main():
    """Look up files for one run/segment and write them to list files.

    Reads five positional arguments from ``sys.argv``: dataset, intriplet
    (matched against the ``tag`` column), dsttype (comma-separated list),
    runnumber and segment (both integers).

    Queries the ``datasets`` table for matching filenames, then the
    ``files`` table for path, md5, size and host of those filenames.
    Writes the filenames to ``infile.list`` and the per-file details to
    ``infile_paths.list`` in the current directory.

    Exits with status 1 on a wrong argument count, non-integer
    runnumber/segment, an empty result from either query, or a failure
    to write either output file.
    """
    slogger.setLevel("DEBUG")
    if len(sys.argv) != 6:
        ERROR( "usage: [dataset] [intriplet] [dsttype] <runnumber> <segment> ")
        sys.exit(1)
    dataset, intriplet, dsttype, runnumber_str, segment_str = sys.argv[1:6]

    try:
        runnumber = int(runnumber_str)
        segment = int(segment_str)
    except ValueError:
        print(f"Error: runnumber '{runnumber_str}' must be an integer.")
        print(f"     : segment '{segment_str}' must be an integer.")
        sys.exit(1)

    # dsttype comes as a comma-separated list; add ticks for sql.
    # NOTE(review): values are still interpolated into the SQL text. Quotes
    # are escaped here, but bound parameters would be preferable if dbQuery
    # supports them (its signature is not visible from this file).
    dsttype4sql = _sql_in_list(dsttype.split(","))
    tag4sql = intriplet.replace("'", "''")
    dataset4sql = dataset.replace("'", "''")

    #  The following:
    # SELECT datasets.filename,files.full_file_path
    # FROM files,datasets
    # WHERE files.lfn=datasets.filename
    #  is  very slow. So split it into separate queries.
    datasets_query = f"""
    SELECT filename
    FROM datasets
    WHERE datasets.dsttype in ( '{dsttype4sql}' )
    AND datasets.runnumber = {runnumber}
    AND datasets.segment = {segment}
    AND datasets.status=1
    AND tag='{tag4sql}'
    AND dataset = '{dataset4sql}';"""

    print(f"datasets query is {datasets_query}")
    rows = dbQuery( cnxn_string_map['fcr'], datasets_query).fetchall()
    file_list = [row.filename for row in rows]

    if not file_list:
        print("No files found for the given criteria.")
        # sys.exit, not the bare exit() helper: the latter is injected by
        # the site module and is not guaranteed to exist.
        sys.exit(1)

    ### Collect full paths. Note, we can make this optional for combiner jobs.
    filelist_str = _sql_in_list(sorted(file_list))
    files_query = f"""
    SELECT full_file_path,md5,size,full_host_name
    FROM files
    WHERE lfn in ( '{filelist_str}' )
    ;"""
    print(f"files query is {files_query}")
    rows = dbQuery( cnxn_string_map['fcr'], files_query).fetchall()
    full_path_info = [
        f"{full_file_path} {md5} {size} {full_host_name}"
        for full_file_path, md5, size, full_host_name in rows
    ]

    if not full_path_info:
        print("No files found for the given criteria.")
        sys.exit(1)

    # NOTE(review): infile.list keeps the order returned by the datasets
    # query (as before); only the IN-list above is sorted. Confirm whether
    # sorted output was intended.
    # Attempt both writes before deciding on the exit status, so one
    # failure does not prevent the other file from being produced.
    names_written = _write_lines("infile.list", file_list)
    paths_written = _write_lines("infile_paths.list", full_path_info)
    if not (names_written and paths_written):
        # A missing or partial list must not look like success to the caller.
        sys.exit(1)
0099
# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()