Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-10-25 08:49:40

0001 // Copyright (c) Microsoft Corporation. All rights reserved.
0002 // Licensed under the MIT License.
0003 
0004 #pragma once
0005 
0006 /*
0007  * This file defines SessionOptions Config Keys and format of the Config Values.
0008  *
0009  * The Naming Convention for a SessionOptions Config Key,
0010  * "[Area][.[SubArea1].[SubArea2]...].[Keyname]"
0011  * Such as "ep.cuda.use_arena"
0012  * The Config Key cannot be empty
0013  * The maximum length of the Config Key is 128
0014  *
0015  * The string format of a SessionOptions Config Value is defined individually for each Config.
0016  * The maximum length of the Config Value is 1024
0017  */
0018 
// Key for disabling PrePacking.
// If the config value is set to "1" then the prepacking is disabled, otherwise prepacking is enabled (default value)
static const char* const kOrtSessionOptionsConfigDisablePrepacking = "session.disable_prepacking";

// A value of "1" means allocators registered in the env will be used. "0" means the allocators created in the session
// will be used. Use this to override the usage of env allocators on a per session level.
static const char* const kOrtSessionOptionsConfigUseEnvAllocators = "session.use_env_allocators";

// Set to 'ORT' (case sensitive) to load an ORT format model.
// If unset, model type will default to ONNX unless inferred from filename ('.ort' == ORT format) or bytes to be ORT
static const char* const kOrtSessionOptionsConfigLoadModelFormat = "session.load_model_format";

// Set to 'ORT' (case sensitive) to save the optimized model in ORT format when SessionOptions.optimized_model_path is set.
// If unset, format will default to ONNX unless optimized_model_filepath ends in '.ort'.
static const char* const kOrtSessionOptionsConfigSaveModelFormat = "session.save_model_format";
0034 
// If a value is "1", flush-to-zero and denormal-as-zero are applied. The default is "0".
// When multiple sessions are created, a main thread doesn't override changes from succeeding session options,
// but threads in session thread pools follow option changes.
// When ORT runs with OpenMP, the same rule is applied, i.e. the first session option to flush-to-zero and
// denormal-as-zero is only applied to the global OpenMP thread pool, which doesn't support per-session thread pools.
// Note that an alternative way not using this option at runtime is to train and export a model without denormals,
// and that's recommended because turning this option on may hurt model accuracy.
static const char* const kOrtSessionOptionsConfigSetDenormalAsZero = "session.set_denormal_as_zero";

// It controls whether to run a quantized model in QDQ (QuantizeLinear/DequantizeLinear) format or not.
// "0": enable. ORT does fusion logic for QDQ format.
// "1": disable. ORT doesn't do fusion logic for QDQ format.
// Its default value is "0" unless the DirectML execution provider is registered, in which case it defaults to "1".
static const char* const kOrtSessionOptionsDisableQuantQDQ = "session.disable_quant_qdq";

// It controls whether to enable the Double QDQ remover and Identical Children Consolidation.
// "0": not disabled. ORT does remove the middle 2 nodes from Q->(DQ->Q)->DQ pairs.
// "1": disabled. ORT doesn't remove the middle 2 nodes from Q->(DQ->Q)->DQ pairs.
// Its default value is "0"
static const char* const kOrtSessionOptionsDisableDoubleQDQRemover = "session.disable_double_qdq_remover";

// If set to "1", enables the removal of QuantizeLinear/DequantizeLinear node pairs once all QDQ handling has been
// completed. e.g. If after all QDQ handling has completed and we have -> FloatOp -> Q -> DQ -> FloatOp -> the
// Q -> DQ could potentially be removed. This will provide a performance benefit by avoiding going from float to
// 8-bit and back to float, but could impact accuracy. The impact on accuracy will be model specific and depend on
// other factors like whether the model was created using Quantization Aware Training or Post Training Quantization.
// As such, it's best to test to determine if enabling this works well for your scenario.
// The default value is "0"
// Available since version 1.11.
static const char* const kOrtSessionOptionsEnableQuantQDQCleanup = "session.enable_quant_qdq_cleanup";
0065 
// Enable or disable gelu approximation in graph optimization. "0": disable; "1": enable. The default is "0".
// GeluApproximation has side effects which may change the inference results. It is disabled by default due to this.
static const char* const kOrtSessionOptionsEnableGeluApproximation = "optimization.enable_gelu_approximation";

// This setting controls whether to enable AheadOfTime (AOT) function inlining.
// AOT function inlining examines the graph and attempts to inline as many locally defined functions in the model
// as possible with the help of enabled execution providers.
// This can reduce the number of function calls and improve performance because it is done before
// Level1 optimizers and constant folding. However, under some circumstances, when the EPs are not available,
// one can disable the AOT inlining, produce an optimized model and postpone AOT until run time.
// "0": enable; "1": disable.
// Its default value is "0".
static const char* const kOrtSessionOptionsDisableAheadOfTimeFunctionInlining = "session.disable_aot_function_inlining";

#ifdef ENABLE_TRAINING
// Specifies the path of a file containing a list of memory optimization configurations.
// The value should be a string indicating the file path of the config file.
// The content of the config file is a JSON struct like this:
// [
//   "Gelu+Cast+:1:0",
//   "Dropout+:1:1"
// ]
// Taking the example of "Gelu+Cast+:1:0",
// > "Gelu+Cast+" is the subgraph string; a valid "subgraph string" should be one subgraph representation
//    output by ORT graph transformations.
// > "1" is the "optimization strategy", valid values: 0 - disabled, 1 - recompute.
// > "0" is the "number of subgraphs to apply", which is used to control how many subgraphs to apply the optimization to,
//    to avoid "oversaving" the memory.
static const char* const kOrtSessionOptionsMemoryOptimizerApplyConfig = "optimization.memory_optimizer_config";

// Specifies the config for detecting subgraphs for memory footprint reduction.
// The value should be a string containing ints separated by commas. The default value is "0:0".
static const char* const kOrtSessionOptionsMemoryOptimizerProbeConfig = "optimization.enable_memory_probe_recompute_config";
#endif
0100 
// This setting, if set, should contain a comma-separated list of optimizer names that should be disabled.
// Optimizers may take time to execute and affect model loading time. If you feel that a specific optimizer
// does not provide runtime benefits, but affects your model loading time, you may disable it using this config
// entry. This option is not enabled in ORT_MINIMAL_BUILD build.
// A list of optimizers is available in onnxruntime/core/optimizer/graph_transformer_utils.cc
//
// Default is an empty string which means no optimizers are disabled.
static const char* const kOrtSessionOptionsDisableSpecifiedOptimizers = "optimization.disable_specified_optimizers";

// Enable or disable using device allocator for allocating initialized tensor memory. "1": enable; "0": disable. The default is "0".
// Using device allocators means the memory allocation is made using malloc/new.
static const char* const kOrtSessionOptionsUseDeviceAllocatorForInitializers = "session.use_device_allocator_for_initializers";

// Configure whether to allow the inter_op/intra_op threads to spin a number of times before blocking.
// "0": thread will block if it finds no job to run
// "1": default, thread will spin a number of times before blocking
static const char* const kOrtSessionOptionsConfigAllowInterOpSpinning = "session.inter_op.allow_spinning";
static const char* const kOrtSessionOptionsConfigAllowIntraOpSpinning = "session.intra_op.allow_spinning";
0119 
// Key for using model bytes directly for ORT format.
// If a session is created using an input byte array containing the ORT format model data,
// by default we will copy the model bytes at the time of session creation to ensure the model bytes
// buffer is valid.
// Setting this option to "1" will disable copying the model bytes, and use the model bytes directly. The caller
// has to guarantee that the model bytes are valid until the ORT session using the model bytes is destroyed.
static const char* const kOrtSessionOptionsConfigUseORTModelBytesDirectly = "session.use_ort_model_bytes_directly";

/// <summary>
/// Key for using the ORT format model flatbuffer bytes directly for initializers.
/// This avoids copying the bytes and reduces peak memory usage during model loading and initialization.
/// Requires `session.use_ort_model_bytes_directly` to be true.
/// If set, the flatbuffer bytes provided when creating the InferenceSession MUST remain valid for the entire
/// duration of the InferenceSession.
/// </summary>
static const char* const kOrtSessionOptionsConfigUseORTModelBytesForInitializers =
    "session.use_ort_model_bytes_for_initializers";
0137 
// This should only be specified when exporting an ORT format model for use on a different platform.
// If the ORT format model will be used on ARM platforms set to "1". For other platforms set to "0".
// Available since version 1.11.
static const char* const kOrtSessionOptionsQDQIsInt8Allowed = "session.qdqisint8allowed";

// x64 SSE4.1/AVX2/AVX512 (with no VNNI) has an overflow problem with quantized matrix multiplication with U8S8.
// To avoid this we need to use the slower U8U8 matrix multiplication instead. This option, if
// turned on, uses the slower U8U8 matrix multiplications. Only effective on AVX2 or AVX512
// platforms.
static const char* const kOrtSessionOptionsAvx2PrecisionMode = "session.x64quantprecision";

// Specifies how minimal build graph optimizations are handled in a full build.
// These optimizations are at the extended level or higher.
// Possible values and their effects are:
// "save": Save runtime optimizations when saving an ORT format model.
// "apply": Only apply optimizations available in a minimal build.
// ""/<unspecified>: Apply optimizations available in a full build.
// Available since version 1.11.
static const char* const kOrtSessionOptionsConfigMinimalBuildOptimizations =
    "optimization.minimal_build_optimizations";
0158 
// Note: The options specific to an EP should be specified prior to appending that EP to the session options object in
// order for them to take effect.

// Specifies a list of stop op types. Nodes of a type in the stop op types and nodes downstream from them will not be
// run by the NNAPI EP.
// The value should be a ","-delimited list of op types. For example, "Add,Sub".
// If not specified, the default set of stop ops is used. To specify an empty stop ops types list and disable stop op
// exclusion, set the value to "".
static const char* const kOrtSessionOptionsConfigNnapiEpPartitioningStopOps = "ep.nnapi.partitioning_stop_ops";

// Enable dynamic block-sizing for multithreading.
// With a positive value, the thread pool will split a task of N iterations into blocks of size starting from:
// N / (num_of_threads * dynamic_block_base)
// As execution progresses, the size will decrease according to the diminishing residual of N,
// meaning the task will be distributed in smaller granularity for better parallelism.
// For some models, it helps to reduce the variance of E2E inference latency and boost performance.
// The feature will not function by default; specify any positive integer, e.g. "4", to enable it.
// Available since version 1.11.
static const char* const kOrtSessionOptionsConfigDynamicBlockBase = "session.dynamic_block_base";

// This option allows decreasing CPU usage between infrequent
// requests and forces any spinning TP threads to stop immediately when the last of the
// concurrent Run() calls returns.
// Spinning is restarted on the next Run() call.
// Applies only to internal thread-pools.
static const char* const kOrtSessionOptionsConfigForceSpinningStop = "session.force_spinning_stop";
0185 
// "1": all inconsistencies encountered during shape and type inference
// will result in failures.
// "0": in some cases warnings will be logged but processing will continue. The default.
// May be useful to expose bugs in models.
static const char* const kOrtSessionOptionsConfigStrictShapeTypeInference = "session.strict_shape_type_inference";

// "1": every model using a more recent opset than the latest released one will fail.
// "0": the model may or may not work if onnxruntime cannot find an implementation; this option
// is used for development purposes.
static const char* const kOrtSessionOptionsConfigStrictAllowReleasedOpsetsOnly = "session.allow_released_opsets_only";

// The file that saves the configuration for partitioning nodes among logic streams.
static const char* const kNodePartitionConfigFile = "session.node_partition_config_file";

// This option allows setting affinities for intra op threads.
// The affinity string follows the format:
// logical_processor_id,logical_processor_id;logical_processor_id,logical_processor_id
// Semicolon isolates configurations among threads, while comma splits the processors which the ith thread is expected to attach to.
// e.g. 1,2,3;4,5
// specifies affinities for two threads, with the 1st thread attached to the 1st, 2nd, and 3rd processors, and the 2nd thread to the 4th and 5th.
// To ease the configuration, an "interval" is also allowed:
// e.g. 1-8;8-16;17-24
// orders that the 1st thread runs on the first eight processors, the 2nd thread runs on the next eight processors, and so forth.
// Note:
// 1. Once set, the number of thread affinities must be equal to intra_op_num_threads - 1, since ort does not set affinity on the main thread which
//    is started and managed by the calling app;
// 2. For windows, ort will infer the group id from a logical processor id; for example, assuming there are two groups each with 64 logical processors,
//    an id of 64 will be inferred as the last processor of the 1st group, while 65 will be interpreted as the 1st processor of the second group.
//    Hence 64-65 is an invalid configuration, because a windows thread cannot be attached to processors across a group boundary.
static const char* const kOrtSessionOptionsConfigIntraOpThreadAffinities = "session.intra_op_thread_affinities";
0216 
// This option will dump out the model to assist debugging any issues with layout transformation,
// and is primarily intended for developer usage. It is only relevant if an execution provider that requests
// NHWC layout is enabled, such as NNAPI, XNNPACK or QNN.
//
// Default is off. Set to "1" to enable.
//
// If modified by layout transformation, the model will be dumped after these steps:
//   1) insertion of the layout transformation Transpose nodes
//   2) after those are optimized using the transpose optimizer,
//   3) after the L1 transformers are applied to the updated graph.
// The model will be saved to filename post_layout_transform_step_<step_number>.onnx.
static const char* const kDebugLayoutTransformation = "session.debug_layout_transformation";

// Graph nodes that are not supported by the execution providers (EPs) explicitly added to the session are
// assigned (i.e., "fallback") to the CPU EP by default.
//
// This option allows the user to disable the fallback of unsupported graph nodes to the CPU EP.
// If this option is set to "1", session creation will fail if the execution providers other than the CPU EP cannot
// fully support all of the nodes in the graph.
//
// It is invalid to set this option and explicitly add the CPU EP to the session. In this case, session creation
// will also fail with an error.
//
// Option values:
// - "0": CPU EP fallback is not disabled. [DEFAULT]
// - "1": CPU EP fallback is disabled.
static const char* const kOrtSessionOptionsDisableCPUEPFallback = "session.disable_cpu_ep_fallback";
0244 
// Use this config when serializing a large model after optimization to specify an external initializers file.
static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFileName =
    "session.optimized_model_external_initializers_file_name";

// Use this config to control the minimum size of the initializer when externalizing it during serialization.
static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
    "session.optimized_model_external_initializers_min_size_in_bytes";

// Enable the EP context feature to dump the partitioned graph, which includes the EP context, into an Onnx file.
// The dumped Onnx model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead.
// "0": disable. (default)
// "1": enable.
static const char* const kOrtSessionOptionEpContextEnable = "ep.context_enable";

// Specify the file path for the Onnx model which has EP context.
// Defaults to original_file_name_ctx.onnx if not specified.
static const char* const kOrtSessionOptionEpContextFilePath = "ep.context_file_path";

// Flag to specify whether to dump the EP context into the Onnx model.
// "0": dump the EP context into a separate file, keeping the file name in the Onnx model.
// "1": dump the EP context into the Onnx model. (default).
static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed_mode";

// Specify the EPContext node name prefix to make it unique,
// in case users need to merge/connect multiple EPContext nodes in one model.
static const char* const kOrtSessionOptionEpContextNodeNamePrefix = "ep.context_node_name_prefix";

// Share EP related resources across EPs.
static const char* const kOrtSessionOptionShareEpContexts = "ep.share_ep_contexts";
0274 
// Gemm fastmath mode provides fp32 gemm acceleration with bfloat16 based matmul.
// Option values:
// - "0": Gemm FastMath mode is not enabled. [DEFAULT]
// - "1": Gemm FastMath mode is enabled.
static const char* const kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16 = "mlas.enable_gemm_fastmath_arm64_bfloat16";

// When converting DQ + MatMul -> MatMulNBits, the accuracy level of the MatMulNBits is controlled by this option.
// Refer to the MatMulNBits op schema for more details.
// If not provided, the default is 4.
static const char* const kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel = "session.qdq_matmulnbits_accuracy_level";

// THIS OPTION IS NOT A REGULAR SESSION OPTION SINCE IT CAN BE MODIFIED AT ANY TIME.
// Meant to be used with SetEpDynamicOptions.
// Specify the type of workload for this session.
// "Default": OS determines the scheduling priority and processor performance to service this workload. [Default]
// "Efficient": OS treats this workload as efficiency oriented with low scheduling priority and efficient processor performance.
static const char* const kOrtEpDynamicOptionsWorkloadType = "ep.dynamic.workload_type";