![]() |
|
|||
File indexing completed on 2025-08-28 08:27:00
0001 // Licensed to the Apache Software Foundation (ASF) under one 0002 // or more contributor license agreements. See the NOTICE file 0003 // distributed with this work for additional information 0004 // regarding copyright ownership. The ASF licenses this file 0005 // to you under the Apache License, Version 2.0 (the 0006 // "License"); you may not use this file except in compliance 0007 // with the License. You may obtain a copy of the License at 0008 // 0009 // http://www.apache.org/licenses/LICENSE-2.0 0010 // 0011 // Unless required by applicable law or agreed to in writing, 0012 // software distributed under the License is distributed on an 0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 0014 // KIND, either express or implied. See the License for the 0015 // specific language governing permissions and limitations 0016 // under the License. 0017 0018 #pragma once 0019 0020 #include <cstdint> 0021 #include <optional> 0022 #include <vector> 0023 0024 #include "arrow/io/caching.h" 0025 #include "arrow/ipc/type_fwd.h" 0026 #include "arrow/status.h" 0027 #include "arrow/type_fwd.h" 0028 #include "arrow/util/compression.h" 0029 #include "arrow/util/visibility.h" 0030 0031 namespace arrow { 0032 0033 class MemoryPool; 0034 0035 namespace ipc { 0036 0037 // ARROW-109: We set this number arbitrarily to help catch user mistakes. For 0038 // deeply nested schemas, it is expected the user will indicate explicitly the 0039 // maximum allowed recursion depth 0040 constexpr int kMaxNestingDepth = 64; 0041 0042 /// \brief Options for writing Arrow IPC messages 0043 struct ARROW_EXPORT IpcWriteOptions { 0044 /// \brief If true, allow field lengths that don't fit in a signed 32-bit int. 0045 /// 0046 /// Some implementations may not be able to parse streams created with this option. 0047 bool allow_64bit = false; 0048 0049 /// \brief The maximum permitted schema nesting depth. 0050 int max_recursion_depth = kMaxNestingDepth; 0051 0052 /// \brief Write padding after memory buffers up to this multiple of bytes. 0053 int32_t alignment = 8; 0054 0055 /// \brief Write the pre-0.15.0 IPC message format 0056 /// 0057 /// This legacy format consists of a 4-byte prefix instead of 8-byte. 0058 bool write_legacy_ipc_format = false; 0059 0060 /// \brief The memory pool to use for allocations made during IPC writing 0061 /// 0062 /// While Arrow IPC is predominantly zero-copy, it may have to allocate 0063 /// memory in some cases (for example if compression is enabled). 0064 MemoryPool* memory_pool = default_memory_pool(); 0065 0066 /// \brief Compression codec to use for record batch body buffers 0067 /// 0068 /// May only be UNCOMPRESSED, LZ4_FRAME and ZSTD. 0069 std::shared_ptr<util::Codec> codec; 0070 0071 /// \brief Minimum space savings percentage required for compression to be applied 0072 /// 0073 /// Space savings is calculated as (1.0 - compressed_size / uncompressed_size). 0074 /// 0075 /// For example, if min_space_savings = 0.1, a 100-byte body buffer won't undergo 0076 /// compression if its expected compressed size exceeds 90 bytes. If this option is 0077 /// unset, compression will be used indiscriminately. If no codec was supplied, this 0078 /// option is ignored. 0079 /// 0080 /// Values outside of the range [0,1] are handled as errors. 0081 /// 0082 /// Note that enabling this option may result in unreadable data for Arrow C++ versions 0083 /// prior to 12.0.0. 0084 std::optional<double> min_space_savings; 0085 0086 /// \brief Use global CPU thread pool to parallelize any computational tasks 0087 /// like compression 0088 bool use_threads = true; 0089 0090 /// \brief Whether to emit dictionary deltas 0091 /// 0092 /// If false, a changed dictionary for a given field will emit a full 0093 /// dictionary replacement. 0094 /// If true, a changed dictionary will be compared against the previous 0095 /// version. If possible, a dictionary delta will be emitted, otherwise 0096 /// a full dictionary replacement. 0097 /// 0098 /// Default is false to maximize stream compatibility. 0099 /// 0100 /// Also, note that if a changed dictionary is a nested dictionary, 0101 /// then a delta is never emitted, for compatibility with the read path. 0102 bool emit_dictionary_deltas = false; 0103 0104 /// \brief Whether to unify dictionaries for the IPC file format 0105 /// 0106 /// The IPC file format doesn't support dictionary replacements. 0107 /// Therefore, chunks of a column with a dictionary type must have the same 0108 /// dictionary in each record batch (or an extended dictionary + delta). 0109 /// 0110 /// If this option is true, RecordBatchWriter::WriteTable will attempt 0111 /// to unify dictionaries across each table column. If this option is 0112 /// false, incompatible dictionaries across a table column will simply 0113 /// raise an error. 0114 /// 0115 /// Note that enabling this option has a runtime cost. Also, not all types 0116 /// currently support dictionary unification. 0117 /// 0118 /// This option is ignored for IPC streams, which support dictionary replacement 0119 /// and deltas. 0120 bool unify_dictionaries = false; 0121 0122 /// \brief Format version to use for IPC messages and their metadata. 0123 /// 0124 /// Presently using V5 version (readable by 1.0.0 and later). 0125 /// V4 is also available (readable by 0.8.0 and later). 0126 MetadataVersion metadata_version = MetadataVersion::V5; 0127 0128 static IpcWriteOptions Defaults(); 0129 }; 0130 0131 /// \brief Options for reading Arrow IPC messages 0132 struct ARROW_EXPORT IpcReadOptions { 0133 /// \brief The maximum permitted schema nesting depth. 0134 int max_recursion_depth = kMaxNestingDepth; 0135 0136 /// \brief The memory pool to use for allocations made during IPC reading 0137 /// 0138 /// While Arrow IPC is predominantly zero-copy, it may have to allocate 0139 /// memory in some cases (for example if compression is enabled). 0140 MemoryPool* memory_pool = default_memory_pool(); 0141 0142 /// \brief Top-level schema fields to include when deserializing RecordBatch. 0143 /// 0144 /// If empty (the default), return all deserialized fields. 0145 /// If non-empty, the values are the indices of fields in the top-level schema. 0146 std::vector<int> included_fields; 0147 0148 /// \brief Use global CPU thread pool to parallelize any computational tasks 0149 /// like decompression 0150 bool use_threads = true; 0151 0152 /// \brief Whether to convert incoming data to platform-native endianness 0153 /// 0154 /// If the endianness of the received schema is not equal to platform-native 0155 /// endianness, then all buffers with endian-sensitive data will be byte-swapped. 0156 /// This includes the value buffers of numeric types, temporal types, decimal 0157 /// types, as well as the offset buffers of variable-sized binary and list-like 0158 /// types. 0159 /// 0160 /// Endianness conversion is achieved by the RecordBatchFileReader, 0161 /// RecordBatchStreamReader and StreamDecoder classes. 0162 bool ensure_native_endian = true; 0163 0164 /// \brief Options to control caching behavior when pre-buffering is requested 0165 /// 0166 /// The lazy property will always be reset to true to deliver the expected behavior 0167 io::CacheOptions pre_buffer_cache_options = io::CacheOptions::LazyDefaults(); 0168 0169 static IpcReadOptions Defaults(); 0170 }; 0171 0172 namespace internal { 0173 0174 Status CheckCompressionSupported(Compression::type codec); 0175 0176 } // namespace internal 0177 } // namespace ipc 0178 } // namespace arrow
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
![]() ![]() |