// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

/*
 * This file defines SessionOptions Config Keys and the format of the Config Values.
 *
 * The naming convention for a SessionOptions Config Key is
 * "[Area][.[SubArea1].[SubArea2]...].[Keyname]"
 * such as "ep.cuda.use_arena".
 * The Config Key cannot be empty.
 * The maximum length of the Config Key is 1024.
 *
 * The string format of a SessionOptions Config Value is defined individually for each Config.
 * The maximum length of the Config Value is 2048.
 */

// Key to disable prepacking.
// If the config value is set to "1" then prepacking is disabled; otherwise prepacking is enabled (default).
static const char* const kOrtSessionOptionsConfigDisablePrepacking = "session.disable_prepacking";

// A value of "1" means allocators registered in the env will be used. "0" means the allocators created in the session
// will be used. Use this to override the usage of env allocators on a per-session level.
static const char* const kOrtSessionOptionsConfigUseEnvAllocators = "session.use_env_allocators";

// Set to 'ORT' (case sensitive) to load an ORT format model.
// If unset, the model type will default to ONNX unless inferred from the filename ('.ort' == ORT format) or bytes to be ORT.
static const char* const kOrtSessionOptionsConfigLoadModelFormat = "session.load_model_format";

// Set to 'ORT' (case sensitive) to save the optimized model in ORT format when SessionOptions.optimized_model_path is set.
// If unset, the format will default to ONNX unless optimized_model_filepath ends in '.ort'.
static const char* const kOrtSessionOptionsConfigSaveModelFormat = "session.save_model_format";

// If the value is "1", flush-to-zero and denormal-as-zero are applied. The default is "0".
// When multiple sessions are created, the main thread doesn't override changes from succeeding session options,
// but threads in session thread pools follow the option changes.
// When ORT runs with OpenMP, the same rule is applied, i.e. the first session option to set flush-to-zero and
// denormal-as-zero is applied only to the global OpenMP thread pool, which doesn't support per-session thread pools.
// Note that an alternative that avoids using this option at runtime is to train and export a model without denormals;
// that is recommended because turning this option on may hurt model accuracy.
static const char* const kOrtSessionOptionsConfigSetDenormalAsZero = "session.set_denormal_as_zero";

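// The keys in this file are plain strings that are passed, together with a string value, to the session options
// before the session is created. A minimal sketch of how that looks with the C++ API from onnxruntime_cxx_api.h
// (not part of this header; the function name, log id and model path are placeholders):
//
//   #include <onnxruntime_cxx_api.h>
//
//   void CreateConfiguredSession() {
//     Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "config-keys-example");
//     Ort::SessionOptions so;
//     // Config values are always strings, e.g. "1"/"0" for boolean-like options.
//     so.AddConfigEntry(kOrtSessionOptionsConfigSetDenormalAsZero, "1");
//     so.AddConfigEntry(kOrtSessionOptionsConfigUseEnvAllocators, "1");  // only meaningful if allocators were registered on the env
//     Ort::Session session(env, ORT_TSTR("model.onnx"), so);
//   }
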
// Controls whether to run a quantized model in QDQ (QuantizeLinear/DeQuantizeLinear) format or not.
// "0": enabled. ORT does the fusion logic for the QDQ format.
// "1": disabled. ORT doesn't do the fusion logic for the QDQ format.
// The default value is "0" unless the DirectML execution provider is registered, in which case it defaults to "1".
static const char* const kOrtSessionOptionsDisableQuantQDQ = "session.disable_quant_qdq";

// Controls whether to enable the Double QDQ remover and Identical Children Consolidation.
// "0": not disabled. ORT removes the middle two nodes from Q -> (DQ -> Q) -> DQ sequences.
// "1": disabled. ORT doesn't remove the middle two nodes from Q -> (DQ -> Q) -> DQ sequences.
// The default value is "0".
static const char* const kOrtSessionOptionsDisableDoubleQDQRemover = "session.disable_double_qdq_remover";

// If set to "1", enables the removal of QuantizeLinear/DequantizeLinear node pairs once all QDQ handling has been
// completed. e.g. if after all QDQ handling has completed we have -> FloatOp -> Q -> DQ -> FloatOp ->, the
// Q -> DQ could potentially be removed. This will provide a performance benefit by avoiding going from float to
// 8-bit and back to float, but could impact accuracy. The impact on accuracy will be model specific and depend on
// other factors like whether the model was created using Quantization Aware Training or Post Training Quantization.
// As such, it's best to test to determine if enabling this works well for your scenario.
// The default value is "0".
// Available since version 1.11.
static const char* const kOrtSessionOptionsEnableQuantQDQCleanup = "session.enable_quant_qdq_cleanup";

// Enable or disable Gelu approximation in graph optimization. "0": disable; "1": enable. The default is "0".
// GeluApproximation has side effects which may change the inference results. It is disabled by default due to this.
static const char* const kOrtSessionOptionsEnableGeluApproximation = "optimization.enable_gelu_approximation";

// This setting controls whether to enable AheadOfTime function inlining.
// AOT function inlining examines the graph and attempts to inline as many locally defined functions in the model
// as possible with the help of the enabled execution providers.
// This can reduce the number of function calls and improve performance because it is done before
// Level1 optimizers and constant folding. However, under some circumstances, when the EPs are not available,
// one can disable AOT inlining, produce an optimized model, and postpone AOT until run time.
// "0": enable; "1": disable.
// The default value is "0".
static const char* const kOrtSessionOptionsDisableAheadOfTimeFunctionInlining = "session.disable_aot_function_inlining";

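// Because kOrtSessionOptionsEnableQuantQDQCleanup can change results, the guidance above is to measure it for your
// model. A minimal sketch of comparing a baseline session with a cleanup-enabled session (not part of this header;
// names and the model path are placeholders), assuming the C++ API from onnxruntime_cxx_api.h:
//
//   #include <onnxruntime_cxx_api.h>
//
//   void BuildQdqCleanupComparisonSessions() {
//     Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "qdq-cleanup-test");
//
//     Ort::SessionOptions baseline_opts;
//     Ort::Session baseline(env, ORT_TSTR("quantized_model.onnx"), baseline_opts);
//
//     Ort::SessionOptions cleanup_opts;
//     cleanup_opts.AddConfigEntry(kOrtSessionOptionsEnableQuantQDQCleanup, "1");
//     Ort::Session with_cleanup(env, ORT_TSTR("quantized_model.onnx"), cleanup_opts);
//
//     // Run both sessions on the same validation inputs and compare accuracy/latency before enabling in production.
//   }
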
#ifdef ENABLE_TRAINING
// Specifies the path of a file containing a list of memory optimization configurations.
// The value should be a string indicating the file path of the config file.
// The content of the config file is a JSON struct like this:
// [
//   "Gelu+Cast+:1:0",
//   "Dropout+:1:1"
// ]
// Taking the example of "Gelu+Cast+:1:0":
// > "Gelu+Cast+" is the subgraph string; a valid "subgraph string" should be a subgraph representation
//   output by ORT graph transformations.
// > "1" is the "optimization strategy"; valid values: 0 - disabled, 1 - recompute.
// > "0" is the "number of subgraphs to apply", which is used to control how many subgraphs the optimization
//   is applied to, in order to avoid "over-saving" memory.
static const char* const kOrtSessionOptionsMemoryOptimizerApplyConfig = "optimization.memory_optimizer_config";

// Specifies the config for detecting subgraphs for memory footprint reduction.
// The value should be a string containing integers separated by commas. The default value is "0:0".
static const char* const kOrtSessionOptionsMemoryOptimizerProbeConfig = "optimization.enable_memory_probe_recompute_config";
#endif

// This setting, if set, should contain a comma-separated list of optimizer names that should be disabled.
// Optimizers may take time to execute and affect model loading time. If you feel that a specific optimizer
// does not provide runtime benefits but affects your model loading time, you may disable it using this config
// entry. This option is not enabled in an ORT_MINIMAL_BUILD build.
// A list of optimizers is available in onnxruntime/core/optimizer/graph_transformer_utils.cc
//
// Default is an empty string which means no optimizers are disabled.
static const char* const kOrtSessionOptionsDisableSpecifiedOptimizers = "optimization.disable_specified_optimizers";

// Enable or disable using the device allocator for allocating initialized tensor memory. "1": enable; "0": disable. The default is "0".
// Using device allocators means the memory allocation is made using malloc/new.
static const char* const kOrtSessionOptionsUseDeviceAllocatorForInitializers = "session.use_device_allocator_for_initializers";

// Configure whether to allow the inter_op/intra_op threads to spin a number of times before blocking.
// "0": the thread will block if it finds no job to run.
// "1": the thread will spin a number of times before blocking.
// The default is "0" when ORT is built with "ORT_CLIENT_PACKAGE_BUILD" and "1" otherwise.
// Thread spinning is disabled by default for client/on-device workloads to reduce CPU utilization and improve power efficiency.
static const char* const kOrtSessionOptionsConfigAllowInterOpSpinning = "session.inter_op.allow_spinning";
static const char* const kOrtSessionOptionsConfigAllowIntraOpSpinning = "session.intra_op.allow_spinning";

// Key for using model bytes directly for ORT format.
// If a session is created using an input byte array containing the ORT format model data,
// by default we will copy the model bytes at the time of session creation to ensure the model bytes
// buffer is valid.
// Setting this option to "1" will disable copying the model bytes and use them directly. The caller
// has to guarantee that the model bytes are valid until the ORT session using the model bytes is destroyed.
static const char* const kOrtSessionOptionsConfigUseORTModelBytesDirectly = "session.use_ort_model_bytes_directly";

/// <summary>
/// Key for using the ORT format model flatbuffer bytes directly for initializers.
/// This avoids copying the bytes and reduces peak memory usage during model loading and initialization.
/// Requires `session.use_ort_model_bytes_directly` to be true.
/// If set, the flatbuffer bytes provided when creating the InferenceSession MUST remain valid for the entire
/// duration of the InferenceSession.
/// </summary>
static const char* const kOrtSessionOptionsConfigUseORTModelBytesForInitializers =
    "session.use_ort_model_bytes_for_initializers";

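// A minimal sketch of the zero-copy path described above (not part of this header; names are placeholders), assuming
// the C++ API from onnxruntime_cxx_api.h and a caller-owned buffer holding an ORT format model:
//
//   #include <onnxruntime_cxx_api.h>
//   #include <cstdint>
//   #include <vector>
//
//   // The buffer must stay valid for the whole lifetime of the session when these options are set.
//   void CreateSessionFromOrtBytes(Ort::Env& env, const std::vector<uint8_t>& model_bytes) {
//     Ort::SessionOptions so;
//     so.AddConfigEntry(kOrtSessionOptionsConfigLoadModelFormat, "ORT");
//     so.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesDirectly, "1");
//     so.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesForInitializers, "1");
//     Ort::Session session(env, model_bytes.data(), model_bytes.size(), so);
//     // ... run inference while model_bytes remains alive ...
//   }
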
// This should only be specified when exporting an ORT format model for use on a different platform.
// If the ORT format model will be used on ARM platforms set this to "1". For other platforms set it to "0".
// Available since version 1.11.
static const char* const kOrtSessionOptionsQDQIsInt8Allowed = "session.qdqisint8allowed";

// x64 SSE4.1/AVX2/AVX512 (with no VNNI) has an overflow problem with quantized matrix multiplication with U8S8.
// To avoid this we need to use the slower U8U8 matrix multiplication instead. This option, if
// turned on, uses the slower U8U8 matrix multiplications. Only effective on AVX2 or AVX512
// platforms.
static const char* const kOrtSessionOptionsAvx2PrecisionMode = "session.x64quantprecision";

// Specifies how minimal build graph optimizations are handled in a full build.
// These optimizations are at the extended level or higher.
// Possible values and their effects are:
// "save": Save runtime optimizations when saving an ORT format model.
// "apply": Only apply optimizations available in a minimal build.
// ""/<unspecified>: Apply optimizations available in a full build.
// Available since version 1.11.
static const char* const kOrtSessionOptionsConfigMinimalBuildOptimizations =
    "optimization.minimal_build_optimizations";

// Note: The options specific to an EP should be specified prior to appending that EP to the session options object in
// order for them to take effect.

// Specifies a list of stop op types. Nodes of a type in the stop op types and nodes downstream from them will not be
// run by the NNAPI EP.
// The value should be a ","-delimited list of op types. For example, "Add,Sub".
// If not specified, the default set of stop ops is used. To specify an empty stop ops types list and disable stop op
// exclusion, set the value to "".
static const char* const kOrtSessionOptionsConfigNnapiEpPartitioningStopOps = "ep.nnapi.partitioning_stop_ops";

// Enables dynamic block sizing for multithreading.
// With a positive value, the thread pool will split a task of N iterations into blocks of size starting from:
// N / (num_of_threads * dynamic_block_base)
// As execution progresses, the size will decrease according to the diminishing residual of N,
// meaning the task will be distributed at a smaller granularity for better parallelism.
// For some models, it helps to reduce the variance of E2E inference latency and boost performance.
// The feature will not function by default; specify any positive integer, e.g. "4", to enable it.
// Available since version 1.11.
static const char* const kOrtSessionOptionsConfigDynamicBlockBase = "session.dynamic_block_base";

// This option allows decreasing CPU usage between infrequent
// requests by forcing any spinning thread pool threads to stop immediately when the last of the
// concurrent Run() calls returns.
// Spinning is restarted on the next Run() call.
// Applies only to internal thread pools.
static const char* const kOrtSessionOptionsConfigForceSpinningStop = "session.force_spinning_stop";

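// Per the note above, EP-specific options such as kOrtSessionOptionsConfigNnapiEpPartitioningStopOps must be added
// before the EP itself is appended. An illustrative sketch (not part of this header; assumes an Android build where
// the NNAPI provider factory header and OrtSessionOptionsAppendExecutionProvider_Nnapi are available):
//
//   #include <onnxruntime_cxx_api.h>
//   #include <nnapi_provider_factory.h>
//
//   void CreateNnapiSession(Ort::Env& env) {
//     Ort::SessionOptions so;
//     // Stop partitioning at Add/Sub nodes and anything downstream of them; must precede the EP registration.
//     so.AddConfigEntry(kOrtSessionOptionsConfigNnapiEpPartitioningStopOps, "Add,Sub");
//     Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Nnapi(so, 0));
//     Ort::Session session(env, ORT_TSTR("model.ort"), so);
//   }
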
// "1": all inconsistencies encountered during shape and type inference
// will result in failures.
// "0": in some cases warnings will be logged but processing will continue. The default.
// May be useful to expose bugs in models.
static const char* const kOrtSessionOptionsConfigStrictShapeTypeInference = "session.strict_shape_type_inference";

// "1": every model using a more recent opset than the latest released one will fail.
// "0": the model may or may not work if onnxruntime cannot find an implementation; this option
// is used for development purposes.
static const char* const kOrtSessionOptionsConfigStrictAllowReleasedOpsetsOnly = "session.allow_released_opsets_only";

// The file that saves the configuration for partitioning nodes among logic streams.
static const char* const kNodePartitionConfigFile = "session.node_partition_config_file";

// This option allows setting affinities for intra op threads.
// The affinity string follows the format:
// logical_processor_id,logical_processor_id;logical_processor_id,logical_processor_id
// Semicolons separate the configurations of individual threads, while commas separate the logical processors that the i-th thread is expected to attach to.
// e.g. 1,2,3;4,5
// specifies affinities for two threads, with the 1st thread attached to the 1st, 2nd, and 3rd processors, and the 2nd thread to the 4th and 5th.
// To ease the configuration, an "interval" is also allowed:
// e.g. 1-8;9-16;17-24
// specifies that the 1st thread runs on the first eight processors, the 2nd thread runs on the next eight processors, and so forth.
// Note:
// 1. Once set, the number of thread affinities must equal intra_op_num_threads - 1, since ort does not set affinity on the main thread which
//    is started and managed by the calling app;
// 2. For Windows, ort will infer the group id from a logical processor id. For example, assuming there are two groups, each with 64 logical processors,
//    an id of 64 will be inferred as the last processor of the 1st group, while 65 will be interpreted as the 1st processor of the second group.
//    Hence 64-65 is an invalid configuration, because a Windows thread cannot be attached to processors across a group boundary.
static const char* const kOrtSessionOptionsConfigIntraOpThreadAffinities = "session.intra_op_thread_affinities";

// This option will dump out the model to assist debugging any issues with layout transformation,
// and is primarily intended for developer usage. It is only relevant if an execution provider that requests
// NHWC layout is enabled, such as NNAPI, XNNPACK or QNN.
//
// Default is off. Set to "1" to enable.
//
// If modified by layout transformation the model will be dumped after these steps:
//   1) insertion of the layout transformation Transpose nodes
//   2) after those are optimized using the transpose optimizer
//   3) after the L1 transformers are applied to the updated graph
// The model will be saved to filename post_layout_transform_step_<step_number>.onnx.
static const char* const kDebugLayoutTransformation = "session.debug_layout_transformation";

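// A minimal sketch of the affinity format described above (not part of this header; names and the model path are
// placeholders), assuming the C++ API from onnxruntime_cxx_api.h. With intra_op_num_threads = 4, exactly three
// affinity entries are required because ORT does not pin the calling thread:
//
//   #include <onnxruntime_cxx_api.h>
//
//   void CreatePinnedSession(Ort::Env& env) {
//     Ort::SessionOptions so;
//     so.SetIntraOpNumThreads(4);
//     // Three pool threads pinned to logical processors 1, 2 and 3; the calling thread acts as the 4th worker.
//     so.AddConfigEntry(kOrtSessionOptionsConfigIntraOpThreadAffinities, "1;2;3");
//     Ort::Session session(env, ORT_TSTR("model.onnx"), so);
//   }
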
// Graph nodes that are not supported by the execution providers (EPs) explicitly added to the session are
// assigned (i.e., "fallback") to the CPU EP by default.
//
// This option allows the user to disable the fallback of unsupported graph nodes to the CPU EP.
// If this option is set to "1", session creation will fail if the execution providers other than the CPU EP cannot
// fully support all of the nodes in the graph.
//
// It is invalid to set this option and explicitly add the CPU EP to the session. In this case, session creation
// will also fail with an error.
//
// Option values:
// - "0": CPU EP fallback is not disabled. [DEFAULT]
// - "1": CPU EP fallback is disabled.
static const char* const kOrtSessionOptionsDisableCPUEPFallback = "session.disable_cpu_ep_fallback";

// Use this config when serializing a large model after optimization to specify an external initializers file.
static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFileName =
    "session.optimized_model_external_initializers_file_name";

// Use this config to control the minimum size of the initializer when externalizing it during serialization.
static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
    "session.optimized_model_external_initializers_min_size_in_bytes";

// When loading a model from a memory buffer and the model has external initializers,
// use this config to set the external data file folder path.
// All external data files should be in the same folder.
static const char* const kOrtSessionOptionsModelExternalInitializersFileFolderPath =
    "session.model_external_initializers_file_folder_path";

// Use this config when saving pre-packed constant initializers to an external data file.
// This allows you to memory map pre-packed initializers on model load and leave it to
// the OS to manage the amount of memory consumed by the pre-packed initializers. Otherwise,
// pre-packed data resides on the heap.
//
// - "0": Do not save pre-packed initializers to a data file. [DEFAULT]
// - "1": Save pre-packed constant initializers to an external data file.
// Sample usage: sess_options.add_session_config_entry(kOrtSessionOptionsSavePrePackedConstantInitializers, "1")
static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers =
    "session.save_external_prepacked_constant_initializers";

// Use this config when you want to collect memory stats for each node in the graph.
// The file will be created if it does not exist, and will be overwritten if it does.
// The file format is a CSV file with the following columns:
//
//   node_name, initializers_memory, dynamic_outputs_sizes, temp_allocations_size
//
// The content of the file can be used to estimate memory requirements at run time including
// the temporary allocations. This operation is preferably done on a CPU device, as the model may exceed
// device memory limits in constrained environments. When enabling this option, it is important to disable
// memory patterns, as they tend to allocate large blocks to avoid fragmentation and accommodate the needs of multiple
// kernels. Memory patterns may make it difficult to allocate on a device with limited memory.
//
// The collected stats can then be used to partition the graph among the devices in a way that only the
// required memory is allocated on each device.
//
// - "full path to file": there is no default for this option. If the file cannot be opened for writing, an error will be returned.
static const char* const kOrtSessionOptionsCollectNodeMemoryStatsToFile = "session.collect_node_memory_stats_to_file";

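// A minimal sketch of serializing an optimized model with large initializers externalized (not part of this header;
// names and file names are placeholders), assuming the C++ API from onnxruntime_cxx_api.h:
//
//   #include <onnxruntime_cxx_api.h>
//
//   void SaveOptimizedModelWithExternalInitializers(Ort::Env& env) {
//     Ort::SessionOptions so;
//     so.SetOptimizedModelFilePath(ORT_TSTR("optimized_model.onnx"));
//     // Initializers of 1 KB or more are written to the external data file instead of the optimized model itself.
//     so.AddConfigEntry(kOrtSessionOptionsOptimizedModelExternalInitializersFileName, "optimized_model.onnx.data");
//     so.AddConfigEntry(kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes, "1024");
//     Ort::Session session(env, ORT_TSTR("large_model.onnx"), so);  // the optimized model is written during initialization
//   }
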
/// This is a composite CSV setting formatted as "memory limit in kb,file name for collected stats".
/// "limit > 0": enables Capacity Aware Partitioning for the CUDA EP. `limit` is optional and when absent
/// the provider may attempt to figure out the memory available automatically.
/// The setting with no limit is expected to look like: ",file name for collected stats"
/// The EP will place nodes on the device based on the stats in "file name";
/// this file is expected to be found in the same folder as the model. The file contains
/// pre-recorded stats collected when running with kOrtSessionOptionsCollectNodeMemoryStatsToFile enabled (see above).
static const char* const kOrtSessionOptionsResourceCudaPartitioningSettings =
    "session.resource_cuda_partitioning_settings";

// Enable the EP context feature to dump the partitioned graph, which includes the EP context, into an ONNX file.
// The dumped ONNX model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead.
// "0": disable. (default)
// "1": enable.
static const char* const kOrtSessionOptionEpContextEnable = "ep.context_enable";

// Specify the file path for the ONNX model which has the EP context.
// Defaults to original_file_name_ctx.onnx if not specified.
// A folder is not a valid option.
static const char* const kOrtSessionOptionEpContextFilePath = "ep.context_file_path";

// Flag to specify whether to dump the EP context into the ONNX model.
// "0": dump the EP context into a separate file, and keep the file name in the ONNX model. (default)
// "1": dump the EP context into the ONNX model.
static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed_mode";

// Specify the EPContext node name prefix to make it unique,
// in case the user needs to merge/connect multiple EPContext nodes in one model.
static const char* const kOrtSessionOptionEpContextNodeNamePrefix = "ep.context_node_name_prefix";

// Share EP related resources across sessions.
static const char* const kOrtSessionOptionShareEpContexts = "ep.share_ep_contexts";

// Stop sharing EP related resources across sessions from then on.
static const char* const kOrtSessionOptionStopShareEpContexts = "ep.stop_share_ep_contexts";

// Used only for context model generation.
// This configuration is used when some nodes are partitioned on the CPU EP and those nodes have external initializers.
// When generating the EP context model, the new model should not rely on the old external data file used by the source ONNX model.
// Use this setting when dumping the EP context model with an external initializers file.
// If specified, all initializers will be placed inside the external data file.
// Otherwise, all initializers will be embedded inside the generated ONNX file.
// By default, this option is not set, meaning all initializers will be included within the ONNX file.
static const char* const kOrtSessionOptionsEpContextModelExternalInitializersFileName =
    "ep.context_model_external_initializers_file_name";

// Gemm fastmath mode provides fp32 gemm acceleration with bfloat16 based matmul.
// Option values:
// - "0": Gemm FastMath mode is not enabled. [DEFAULT]
// - "1": Gemm FastMath mode is enabled.
static const char* const kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16 = "mlas.enable_gemm_fastmath_arm64_bfloat16";

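// A minimal sketch of generating an EP context ("compiled") model so later runs can skip partitioning/compilation
// (not part of this header; names and file names are placeholders and the compiling EP registration is elided),
// assuming the C++ API from onnxruntime_cxx_api.h:
//
//   #include <onnxruntime_cxx_api.h>
//
//   void DumpEpContextModel(Ort::Env& env) {
//     Ort::SessionOptions so;
//     // ... append the compiling EP (e.g. QNN, OpenVINO) here, with its own provider options ...
//     so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");
//     so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, "model_ctx.onnx");
//     so.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "0");  // keep the EP binary context in a separate file
//     Ort::Session session(env, ORT_TSTR("model.onnx"), so);  // model_ctx.onnx is produced as a side effect
//   }
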
// When converting DQ + MatMul -> MatMulNBits, the accuracy level of the MatMulNBits is controlled by this option.
// Refer to the MatMulNBits op schema for more details.
// If not provided, the default is 4.
static const char* const kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel = "session.qdq_matmulnbits_accuracy_level";

// THIS OPTION IS NOT A REGULAR SESSION OPTION SINCE IT CAN BE MODIFIED AT ANY TIME
// Meant to be used with SetEpDynamicOptions
// Specify the type of workload for this session.
// "Default": OS determines the scheduling priority and processor performance to service this workload. [Default]
// "Efficient": OS treats this workload as efficiency oriented with low scheduling priority and efficient processor performance.
static const char* const kOrtEpDynamicOptionsWorkloadType = "ep.dynamic.workload_type";

// Disables model compilation during session initialization.
//
// If this option is set to "1", inference session creation will fail with error code ORT_MODEL_REQUIRES_COMPILATION
// if compilation is required to run the model on any Execution Provider added to the session.
// Only the following kinds of models are valid when this option is set to "1":
// - Pre-compiled models that have EPContext nodes for the compiling Execution Providers in the session.
// - Non-compiled models that run only on non-compiling Execution Providers, like CPU EP.
//
// See \href https://onnxruntime.ai/docs/execution-providers/EP-Context-Design.html for details about
// compiled models with EPContext nodes.
//
// Option values:
// - "0": EP compile is not disabled. [DEFAULT]
// - "1": EP compile is disabled.
static const char* const kOrtSessionOptionsDisableModelCompile = "session.disable_model_compile";
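
// A minimal sketch of using kOrtSessionOptionsDisableModelCompile to detect that a model still needs compilation
// (not part of this header; names and the model path are placeholders and the EP registration is elided), assuming
// the C++ API from onnxruntime_cxx_api.h and the ORT_MODEL_REQUIRES_COMPILATION error code described above:
//
//   #include <onnxruntime_cxx_api.h>
//   #include <cstdio>
//
//   void CreateSessionWithoutCompilation(Ort::Env& env) {
//     Ort::SessionOptions so;
//     // ... append the desired execution providers here ...
//     so.AddConfigEntry(kOrtSessionOptionsDisableModelCompile, "1");
//     try {
//       Ort::Session session(env, ORT_TSTR("model.onnx"), so);
//     } catch (const Ort::Exception& e) {
//       if (e.GetOrtErrorCode() == ORT_MODEL_REQUIRES_COMPILATION) {
//         std::printf("Model requires compilation; produce an EPContext model first.\n");
//       } else {
//         throw;
//       }
//     }
//   }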