# NOTE: /pilot2/pilot/util/default.cfg was reported by the indexing tool as an unsupported language and was not indexed.
0001 # Licensed under the Apache License, Version 2.0 (the "License");
0002 # you may not use this file except in compliance with the License.
0003 # You may obtain a copy of the License at
0004 # http://www.apache.org/licenses/LICENSE-2.0
0005 #
0006 # Authors:
0007 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017
0008 # - Paul Nilsson, paul.nilsson@cern.ch, 2017-2021
0009
0010
0011 ################################
0012 # Experiment specific parameters
0013
0014 [Experiment]
0015
0016 name: ATLAS
0017
0018
0019 ################################
0020 # Pilot parameters
0021
0022 [Pilot]
0023
0024 # Pilot logs
0025 pilotlog: pilotlog.txt
0026 stageinlog: stageinlog.txt
0027 stageoutlog: stageoutlog.txt
0028
0029 # The file name for the job definition
0030 pandajobdata: pandaJobData.out
0031
0032 # Run with a fake test job, no server updates (values: 'fake', 'real'). The test job type can be 'production' or 'user'.
0033 # The test transfer type can be 'direct' or 'NULL'. The test job command can be 'normal' or 'sleep' ('normal' means a standard
0034 # reconstruction job, while 'sleep' means that the payload command is 'sleep 1' with no input or output transfers).
0035 pandajob: real
0036 testjobtype: production
0037 testjobcommand: normal
0038 testtransfertype: NULL
0039
0040 # The URL for the PanDA server
0041 pandaserver: https://pandaserver.cern.ch:25443
0042 # pandaserver: https://aipanda007.cern.ch:25443
0043
0044 # The URL for the iDDS server
0045 iddsserver: https://pandaserver.cern.ch:25443
0046
0047 # The heartbeat period in seconds (30*60 = 1800 s in normal mode; in debug mode the debug_heartbeat value below is used)
0048 heartbeat: 1800
0049 debug_heartbeat: 60
0050
0051 # Heartbeat message file (only used when Pilot is not sending heartbeats to server)
0052 heartbeat_message: heartbeat.json
0053
0054 # Job IDs can be stored to a file that is picked up by the wrapper
0055 jobid_file: pandaIDs.out
0056
0057 # The minimum required disk space for the pilot to run a job
0058 free_space_limit: 2 GB
0059 # The minimum required disk space during running
0060 free_space_limit_running: 1 GB
0061
0062 # The maximum output file size
0063 maximum_output_file_size: 500 GB
0064
0065 # The maximum allowed sum of all input files (files accessed by direct access not counted by pilot)
0066 # (fall-back value, schedconfig value is primarily used)
0067 maximum_input_file_sizes: 14336 MB
0068
0069 # Size limit of the payload stdout during running; the unit is kB (value = 2 * 1024 ** 2)
0070 local_size_limit_stdout: 2097152
0071
0072 # Looping job time limits; if the job does not write anything in N minutes, it is considered to be a looping job
0073 looping_verification_time: 900
0074 # for both production and user analysis jobs, 2*3600
0075 looping_limit_default: 7200
0076 # The minimum allowed looping limit, 2*3600 (reserved for future use)
0077 looping_limit_min_default: 7200
0078
0079 # Kill instruction monitoring time and file name for instruction
0080 kill_instruction_time: 60
0081 kill_instruction_filename: pilot_kill_payload
0082
0083 # Proxy verification time (used by monitoring) in seconds
0084 proxy_verification_time: 600
0085
0086 # Whether the payload proxy should be downloaded from the server
0087 payload_proxy_from_server: True
0088
0089 # Disk space monitoring time
0090 disk_space_verification_time: 60
0091
0092 # Memory usage verification time (how often the memory monitor output will be checked)
0093 memory_usage_verification_time: 60
0094
0095 # Process verification time
0096 process_verification_time: 300
0097
0098 # Output file size verification time
0099 output_verification_time: 300
0100
0101 # The default thread check time in seconds, used by thread monitoring
0102 thread_check: 10
0103
0104 # The default CPU check time in seconds, used by CPU monitoring
0105 cpu_check: 60
0106
0107 # The timing file used to store various timing measurements
0108 timing_file: pilot_timing.json
0109
0110 # Optional error log (leave filename empty if not wanted)
0111 error_log: piloterrorlog.txt
0112
0113 # List of redundant files and directories to be removed prior to log file creation
0114 # For ATLAS, any initial /cvmfs bit will automatically be corrected if ATLAS_LOCAL_ROOT_BASE is set
0115 redundant: /cvmfs/atlas.cern.ch/repo/sw/PandaPilot/config/redundant.txt
0116
0117 # Utility commands that may be launched by the pilot before payload, with payload, after payload or with stagein
0118 # E.g. MemoryMonitor is used as an internal name. The actual command is 'prmon'
0119 utility_before_payload:
0120 utility_with_payload:
0121 utility_after_payload_started: MemoryMonitor
0122 utility_with_stagein:
0123
0124 # HTTP related time-outs
0125 http_connect_timeout: 100
0126 http_maxtime: 120
0127
0128 # Remote file open verification (if not wanted, clear the remotefileverification_log)
0129 remotefileverification_dictionary: remotefileverification_dictionary.json
0130 remotefileverification_log: remotefileslog.txt
0131
0132 # The name of the base trace report (the base trace report is written to file for later use)
0133 base_trace_report: base_trace_report.json
0134
0135 ################################
0136 # Information service parameters
0137
0138 [Information]
0139
0140 # Path to local cache
0141 #cache_dir: /lustre/atlas/proj-shared/csc108/debug/atlas/HPC_pilot_test/queue_cache #for Titan
0142 cache_dir:
0143
0144 # default URL value for primary source of Queuedata (can be overwritten via --queuedata-url option)
0145 queuedata_url: http://pandaserver.cern.ch:25085/cache/schedconfig/{pandaqueue}.all.json
0146 # path to queuedata JSON provided by shared filesystem
0147 queuedata_cvmfs: CVMFS_PATH/atlas.cern.ch/repo/sw/local/etc/cric_pandaqueues.json
0148 # local cache filename of the queuedata json
0149 queuedata_cache: queuedata.json
0150
0151 # URL for the PanDA queues API provided by Information system
0152 queues_url: https://atlas-cric.cern.ch/cache/schedconfig/{pandaqueue}.json
0153 # path to PanDA queues JSON provided by shared filesystem
0154 queues_cvmfs: CVMFS_PATH/atlas.cern.ch/repo/sw/local/etc/cric_pandaqueues.json
0155 # file name of local cache for the PanDA queues JSON
0156 queues_cache: cric_pandaqueues.json
0157
0158 # URL for the DDMEndpoints/storages API provided by Information system
0159 storages_url: https://atlas-cric.cern.ch/cache/ddmendpoints.json
0160 # path to storages JSON cache provided by shared filesystem
0161 storages_cvmfs: CVMFS_PATH/atlas.cern.ch/repo/sw/local/etc/cric_ddmendpoints.json
0162 # file name of local cache for the storages JSON
0163 storages_cache: cric_ddmendpoints.json
0164
0165
0166 # overwrite acopytools for queuedata
0167 #acopytools: {'pr':['rucio']}
0168 #acopytools: {'pr':['rucio'], 'pw':['gfalcopy'], 'pl':['gfalcopy']}
0169 #acopytools: {'pr': ['lsm'], 'pw': ['lsm']}
0170
0171 ################################
0172 # Payload parameters
0173
0174 [Payload]
0175
0176 # File name for the job report produced by the payload
0177 jobreport: jobReport.json
0178
0179 # File name for production job metadata
0180 metadata: metadata.xml
0181
0182 # File names for stdout/stderr
0183 payloadstdout: payload.stdout
0184 payloadstderr: payload.stderr
0185
0186 ################################
0187 # Container parameters
0188
0189 [Container]
0190
0191 # Master parameter (unused)
0192 # Is the pilot allowed to use containers? If False, then any database settings are ignored
0193 # allow_container: False
0194
0195 # The setup type can be either ALRB or (explicit) singularity
0196 setup_type: ALRB
0197
0198 # Name of script file that will contain the payload command to be executed in the container
0199 container_script: container_script.sh
0200
0201 # Name of script file that will contain the setup command for the payload to be executed in the container
0202 release_setup: my_release_setup.sh
0203
0204 # Name of the file that will contain the payload pid
0205 pid_file: pid.txt
0206
0207 # If a middleware container script is listed (e.g. stagein.py), the pilot will perform all stage-in and/or stage-out
0208 # steps in a standard container (to be revised).
0209 # Note: if no middleware container image is specified below, the middleware will still be executed by the specified script
0210 # (without using a container).
0211 middleware_container_stagein_script: stagein.py
0212 middleware_container_stageout_script: stageout.py
0213 # error information and stage-in file status is saved in a json file by the stage-in script and later read by the pilot
0214 stagein_status_dictionary: stagein_status.json
0215 # replica information is passed to the stage-in script using a json file to avoid problems with very long argument lists
0216 stagein_replica_dictionary: stagein_replicas.json
0217 middleware_stagein_stdout: stagein_stdout.txt
0218 middleware_stagein_stderr: stagein_stderr.txt
0219 stageout_status_dictionary: stageout_status.json
0220 middleware_stageout_stdout: stageout_stdout.txt
0221 middleware_stageout_stderr: stageout_stderr.txt
0222
0223 # Name of middleware image
0224 # This image is used if middleware is not found locally on the worker node. Middleware is expected to be present
0225 # in the container image
0226 middleware_container: /cvmfs/unpacked.cern.ch/registry.hub.docker.com/atlas/rucio-clients:default
0227 # On HPC (ALRB will locate the image)
0228 middleware_container_no_path: atlas/rucio-clients:default
0229
0230 ################################
0231 # Harvester parameters
0232
0233 [Harvester]
0234
0235 # Name of the job request file. The pilot places this file in the pilot launch directory when it wants Harvester
0236 # to send another job (placed by Harvester in the same directory)
0237 job_request_file: worker_requestjob.json
0238
0239 # Name of the kill worker file. The pilot places this file in the pilot launch directory when it has finished all jobs
0240 # and wants Harvester to kill the worker (virtual machine)
0241 kill_worker_file: kill_worker
0242
0243 # Name of file with list of IDs of PanDA jobs to be processed by HPC Pilot
0244 jobs_list_file: worker_pandaids.json
0245
0246 # Name of file with PanDA job to be processed by HPC Pilot
0247 pandajob_file: HPCJobs.json
0248
0249 # Name of file with worker report
0250 workerAttributesFile: worker_attributes.json
0251
0252 # Name of file for declaration of stageout
0253 StageOutnFile: event_status.dump.json
0254
0255 ################################
0256 # HPC parameters
0257
0258 [HPC]
0259
0260 # Path to a scratch disk (RAM disk, SSD, etc.) used for placing the job working directory
0261 scratch: /tmp/scratch/
0262
0263 ################################
0264 # Rucio parameters
0265
0266 [Rucio]
0267
0268 # Rucio server URL for traces
0269 url: https://rucio-lb-prod.cern.ch/traces/