File indexing completed on 2026-04-11 08:41:05
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010 import os
0011 import logging
0012 from xml.dom import minidom
0013 from xml.etree import ElementTree
0014
0015 from pilot.util.filehandling import write_file
0016
0017 logger = logging.getLogger(__name__)
0018
0019
def create_input_file_metadata(file_dictionary, workdir, filename="PoolFileCatalog.xml"):
    """
    Create a Pool File Catalog for the files listed in the input dictionary.

    The catalog is built with ElementTree, pretty printed with minidom, given a DOCTYPE declaration in front of
    the root element and finally written to <workdir>/<filename> with write_file().
    Note: no environment variable expansion is done here; the PFNs are written exactly as given (the original
    documentation refers to pilot/control/data::get_input_file_dictionary() for the expansion).

    Format:
        dictionary = {'guid': 'pfn', ..}
        ->
        <?xml version="1.0" ?>
        <!DOCTYPE POOLFILECATALOG SYSTEM "InMemory">
        <POOLFILECATALOG>
         <File ID="guid">
          <physical>
           <pfn filetype="ROOT_All" name="pfn"/>
          </physical>
          <logical/>
         </File>
        </POOLFILECATALOG>

    :param file_dictionary: file dictionary, {guid: pfn, ..}.
    :param workdir: job work directory (string).
    :param filename: PFC file name (string).
    :return: xml (string).
    """

    data = ElementTree.Element('POOLFILECATALOG')

    # one <File> element per guid, holding the PFN and an empty <logical/> element
    for guid, pfn in file_dictionary.items():
        _file = ElementTree.SubElement(data, 'File')
        _file.set('ID', guid)
        _physical = ElementTree.SubElement(_file, 'physical')
        _pfn = ElementTree.SubElement(_physical, 'pfn')
        _pfn.set('filetype', 'ROOT_All')
        _pfn.set('name', pfn)
        ElementTree.SubElement(_file, 'logical')

    # serialize, then round-trip through minidom to get the pretty printed form
    xml = ElementTree.tostring(data, encoding='utf8')
    xml = minidom.parseString(xml).toprettyxml(indent=" ")

    # NOTE(review): as written, both arguments of this replace are the same string, so it leaves the xml
    # unchanged; it is kept verbatim - confirm which escape sequence was intended for ampersands
    if '&' in xml:
        xml = xml.replace('&', '&')

    # put the DOCTYPE declaration in front of the root element; anchoring on the tag name without the closing
    # bracket also covers an empty catalog, which minidom renders as the self-closing <POOLFILECATALOG/>
    xml = xml.replace('<POOLFILECATALOG', '<!DOCTYPE POOLFILECATALOG SYSTEM "InMemory">\n<POOLFILECATALOG', 1)

    write_file(os.path.join(workdir, filename), xml, mute=False)

    return xml
0071
0072
def get_file_info_from_xml(workdir, filename="PoolFileCatalog.xml"):
    """
    Return a file info dictionary based on the metadata in the given XML file.

    The file info dictionary is used to replace the input file LFN list in the job parameters with the full PFNs
    which are needed for direct access in production jobs. The LFN is taken to be the base name of the PFN.

    Example of PoolFileCatalog.xml:

        <?xml version="1.0" ?>
        <POOLFILECATALOG>
         <File ID="4ACC5018-2EA3-B441-BC11-0C0992847FD1">
          <physical>
           <pfn filetype="ROOT_ALL" name="root://dcgftp.usatlas.bnl.gov:1096//../AOD.11164242._001522.pool.root.1"/>
          </physical>
          <logical/>
         </File>
        </POOLFILECATALOG>

    which gives the following dictionary:

        {'AOD.11164242._001522.pool.root.1': ['root://dcgftp.usatlas.bnl.gov:1096//../AOD.11164242._001522.pool.root.1',
                                              '4ACC5018-2EA3-B441-BC11-0C0992847FD1']}

    :param workdir: directory of PoolFileCatalog.xml (string).
    :param filename: file name (default: PoolFileCatalog.xml) (string).
    :raises KeyError: if a top-level element has no ID attribute or a pfn element has no name attribute.
    :return: dictionary { LFN: [PFN, GUID], .. }
    """

    file_info_dictionary = {}
    tree = ElementTree.parse(os.path.join(workdir, filename))

    for file_element in tree.getroot():
        guid = file_element.attrib['ID']
        # only <physical><pfn .../></physical> entries are PFNs; a populated <logical><lfn name=".."/></logical>
        # section must not be read as a PFN, since it would replace the real PFN stored under the same LFN key
        for pfn_element in file_element.iterfind('physical/pfn'):
            pfn = pfn_element.attrib['name']
            lfn = os.path.basename(pfn)
            file_info_dictionary[lfn] = [pfn, guid]

    return file_info_dictionary
0118
0119
def get_metadata_from_xml(workdir, filename="metadata.xml"):
    """
    Parse the payload metadata.xml file.

    Example of metadata.xml:

        <?xml version="1.0" encoding="UTF-8"?>
        <!DOCTYPE POOLFILECATALOG SYSTEM 'InMemory'>
        <POOLFILECATALOG>
          <File ID="D2A6D6F4-ADB2-B140-9C2E-D2D5C099B342">
            <logical>
              <lfn name="RDO_011a43ba-7c98-488d-8741-08da579c5de7.root"/>
            </logical>
            <metadata att_name="geometryVersion" att_value="ATLAS-R2-2015-03-01-00"/>
            <metadata att_name="conditionsTag" att_value="OFLCOND-RUN12-SDR-19"/>
            <metadata att_name="size" att_value="3250143"/>
            <metadata att_name="events" att_value="3"/>
            <metadata att_name="beamType" att_value="collisions"/>
            <metadata att_name="fileType" att_value="RDO"/>
          </File>
        </POOLFILECATALOG>

    which gives the following dictionary:

        {'RDO_011a43ba-7c98-488d-8741-08da579c5de7.root': {'conditionsTag': 'OFLCOND-RUN12-SDR-19',
         'beamType': 'collisions', 'fileType': 'RDO', 'geometryVersion': 'ATLAS-R2-2015-03-01-00', 'events': '3',
         'size': '3250143', 'guid': 'D2A6D6F4-ADB2-B140-9C2E-D2D5C099B342'}}

    The metadata attributes of a File element are attached to the (last) LFN found in its logical section,
    regardless of whether the metadata elements appear before or after it. A File element without any LFN is
    skipped with a warning. All values are kept as strings.

    :param workdir: payload work directory (string).
    :param filename: metadata file name (string).
    :return: metadata dictionary { LFN: { att_name: att_value, .., 'guid': GUID }, .. }. An empty dictionary is
             returned if the file does not exist.
    """

    metadata_dictionary = {}
    path = os.path.join(workdir, filename)
    if not os.path.exists(path):
        logger.warning('file does not exist: %s', path)
        return metadata_dictionary

    root = ElementTree.parse(path).getroot()

    for file_element in root:
        guid = file_element.attrib.get('ID')
        lfn = None
        attributes = {}

        for element in file_element:
            if element.tag == 'logical':
                for lfn_element in element:
                    lfn = lfn_element.attrib.get('name')
                    metadata_dictionary[lfn] = {}
            elif element.tag == 'metadata':
                # collect first and attach below, so the result does not depend on element order
                attributes[element.attrib.get('att_name')] = element.attrib.get('att_value')
            # any other tag is ignored

        if lfn is None:
            # nothing to key the metadata on
            logger.warning('skipping File element without LFN in %s (guid=%s)', path, guid)
            continue

        metadata_dictionary[lfn].update(attributes)
        if guid:
            metadata_dictionary[lfn]['guid'] = guid

    return metadata_dictionary
0188
0189
def get_number_of_events(metadata_dictionary, filename=''):
    """
    Get the number of events for the given file from the metadata dictionary (from metadata.xml).

    :param metadata_dictionary: dictionary from parsed metadata.xml file.
    :param filename: file name for which the number of events relates to (string).
    :return: number of events (int). -1 is returned if the events could not be extracted from the dictionary
             (unknown or empty file name, missing 'events' entry, or a value that is not an integer).
    """

    nevents = -1
    if filename != '' and filename in metadata_dictionary:
        try:
            nevents = int(metadata_dictionary[filename].get('events'))
        except (TypeError, ValueError) as exc:
            # TypeError: no 'events' entry (int(None)); ValueError: the entry is not an integer string
            logger.warning('failed to convert number of events to int: %s', exc)
    else:
        logger.warning('number of events could not be extracted from metadata dictionary (based on metadata.xml)')

    return nevents
0209
0210
def get_total_number_of_events(metadata_dictionary):
    """
    Sum the number of events over all files in the metadata dictionary.

    Files for which get_number_of_events() reports -1 (events could not be extracted) do not contribute.

    :param metadata_dictionary: dictionary from parsed metadata.xml file.
    :return: total number of processed events (int).
    """

    per_file_counts = (get_number_of_events(metadata_dictionary, filename=lfn) for lfn in metadata_dictionary)

    return sum(count for count in per_file_counts if count != -1)
0226
0227
def get_guid(metadata_dictionary, filename=''):
    """
    Get the guid from the metadata dictionary for the given LFN.

    :param metadata_dictionary: dictionary from parsed metadata.xml file.
    :param filename: file name (LFN) for which the guid is wanted (string).
    :return: guid (string, None is returned if guid could not be extracted).
    """

    if filename == '' or filename not in metadata_dictionary:
        logger.warning('guid could not be extracted from metadata dictionary (based on metadata.xml)')
        return None

    # a known file without a 'guid' entry yields None
    return metadata_dictionary[filename].get('guid')
0247
0248
def get_guid_from_xml(metadata_dictionary, lfn):
    """
    Get the guid for the given LFN in the metadata dictionary.

    :param metadata_dictionary: dictionary from parsed metadata.xml file.
    :param lfn: LFN (string).
    :return: guid (string, None is returned if the LFN is unknown or has no guid).
    """

    # direct dictionary lookup; no need to scan all keys for the matching LFN
    if lfn not in metadata_dictionary:
        return None

    return get_guid(metadata_dictionary, filename=lfn)
0263 return guid