Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-08-28 08:27:00

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <cstdint>
0021 #include <optional>
0022 #include <vector>
0023 
0024 #include "arrow/io/caching.h"
0025 #include "arrow/ipc/type_fwd.h"
0026 #include "arrow/status.h"
0027 #include "arrow/type_fwd.h"
0028 #include "arrow/util/compression.h"
0029 #include "arrow/util/visibility.h"
0030 
0031 namespace arrow {
0032 
0033 class MemoryPool;
0034 
0035 namespace ipc {
0036 
// ARROW-109: Default limit on schema nesting depth. The number is arbitrary
// and exists to help catch user mistakes. For deeply nested schemas, it is
// expected the user will indicate explicitly the maximum allowed recursion
// depth.
//
// Declared `inline` so that every translation unit including this header
// refers to one and the same constant instead of a per-TU internal-linkage
// copy.
inline constexpr int kMaxNestingDepth = 64;
0041 
0042 /// \brief Options for writing Arrow IPC messages
0043 struct ARROW_EXPORT IpcWriteOptions {
0044   /// \brief If true, allow field lengths that don't fit in a signed 32-bit int.
0045   ///
0046   /// Some implementations may not be able to parse streams created with this option.
0047   bool allow_64bit = false;
0048 
0049   /// \brief The maximum permitted schema nesting depth.
0050   int max_recursion_depth = kMaxNestingDepth;
0051 
0052   /// \brief Write padding after memory buffers up to this multiple of bytes.
0053   int32_t alignment = 8;
0054 
0055   /// \brief Write the pre-0.15.0 IPC message format
0056   ///
0057   /// This legacy format consists of a 4-byte prefix instead of 8-byte.
0058   bool write_legacy_ipc_format = false;
0059 
0060   /// \brief The memory pool to use for allocations made during IPC writing
0061   ///
0062   /// While Arrow IPC is predominantly zero-copy, it may have to allocate
0063   /// memory in some cases (for example if compression is enabled).
0064   MemoryPool* memory_pool = default_memory_pool();
0065 
0066   /// \brief Compression codec to use for record batch body buffers
0067   ///
0068   /// May only be UNCOMPRESSED, LZ4_FRAME and ZSTD.
0069   std::shared_ptr<util::Codec> codec;
0070 
0071   /// \brief Minimum space savings percentage required for compression to be applied
0072   ///
0073   /// Space savings is calculated as (1.0 - compressed_size / uncompressed_size).
0074   ///
0075   /// For example, if min_space_savings = 0.1, a 100-byte body buffer won't undergo
0076   /// compression if its expected compressed size exceeds 90 bytes. If this option is
0077   /// unset, compression will be used indiscriminately. If no codec was supplied, this
0078   /// option is ignored.
0079   ///
0080   /// Values outside of the range [0,1] are handled as errors.
0081   ///
0082   /// Note that enabling this option may result in unreadable data for Arrow C++ versions
0083   /// prior to 12.0.0.
0084   std::optional<double> min_space_savings;
0085 
0086   /// \brief Use global CPU thread pool to parallelize any computational tasks
0087   /// like compression
0088   bool use_threads = true;
0089 
0090   /// \brief Whether to emit dictionary deltas
0091   ///
0092   /// If false, a changed dictionary for a given field will emit a full
0093   /// dictionary replacement.
0094   /// If true, a changed dictionary will be compared against the previous
0095   /// version. If possible, a dictionary delta will be emitted, otherwise
0096   /// a full dictionary replacement.
0097   ///
0098   /// Default is false to maximize stream compatibility.
0099   ///
0100   /// Also, note that if a changed dictionary is a nested dictionary,
0101   /// then a delta is never emitted, for compatibility with the read path.
0102   bool emit_dictionary_deltas = false;
0103 
0104   /// \brief Whether to unify dictionaries for the IPC file format
0105   ///
0106   /// The IPC file format doesn't support dictionary replacements.
0107   /// Therefore, chunks of a column with a dictionary type must have the same
0108   /// dictionary in each record batch (or an extended dictionary + delta).
0109   ///
0110   /// If this option is true, RecordBatchWriter::WriteTable will attempt
0111   /// to unify dictionaries across each table column.  If this option is
0112   /// false, incompatible dictionaries across a table column will simply
0113   /// raise an error.
0114   ///
0115   /// Note that enabling this option has a runtime cost. Also, not all types
0116   /// currently support dictionary unification.
0117   ///
0118   /// This option is ignored for IPC streams, which support dictionary replacement
0119   /// and deltas.
0120   bool unify_dictionaries = false;
0121 
0122   /// \brief Format version to use for IPC messages and their metadata.
0123   ///
0124   /// Presently using V5 version (readable by 1.0.0 and later).
0125   /// V4 is also available (readable by 0.8.0 and later).
0126   MetadataVersion metadata_version = MetadataVersion::V5;
0127 
0128   static IpcWriteOptions Defaults();
0129 };
0130 
0131 /// \brief Options for reading Arrow IPC messages
0132 struct ARROW_EXPORT IpcReadOptions {
0133   /// \brief The maximum permitted schema nesting depth.
0134   int max_recursion_depth = kMaxNestingDepth;
0135 
0136   /// \brief The memory pool to use for allocations made during IPC reading
0137   ///
0138   /// While Arrow IPC is predominantly zero-copy, it may have to allocate
0139   /// memory in some cases (for example if compression is enabled).
0140   MemoryPool* memory_pool = default_memory_pool();
0141 
0142   /// \brief Top-level schema fields to include when deserializing RecordBatch.
0143   ///
0144   /// If empty (the default), return all deserialized fields.
0145   /// If non-empty, the values are the indices of fields in the top-level schema.
0146   std::vector<int> included_fields;
0147 
0148   /// \brief Use global CPU thread pool to parallelize any computational tasks
0149   /// like decompression
0150   bool use_threads = true;
0151 
0152   /// \brief Whether to convert incoming data to platform-native endianness
0153   ///
0154   /// If the endianness of the received schema is not equal to platform-native
0155   /// endianness, then all buffers with endian-sensitive data will be byte-swapped.
0156   /// This includes the value buffers of numeric types, temporal types, decimal
0157   /// types, as well as the offset buffers of variable-sized binary and list-like
0158   /// types.
0159   ///
0160   /// Endianness conversion is achieved by the RecordBatchFileReader,
0161   /// RecordBatchStreamReader and StreamDecoder classes.
0162   bool ensure_native_endian = true;
0163 
0164   /// \brief Options to control caching behavior when pre-buffering is requested
0165   ///
0166   /// The lazy property will always be reset to true to deliver the expected behavior
0167   io::CacheOptions pre_buffer_cache_options = io::CacheOptions::LazyDefaults();
0168 
0169   static IpcReadOptions Defaults();
0170 };
0171 
0172 namespace internal {
0173 
/// \brief Check a compression type and report the outcome as a Status.
///
/// \param[in] codec the compression type to check
/// \return a Status describing the outcome of the check
///
/// NOTE(review): only the declaration is visible in this header. Presumably
/// this rejects compression types that cannot be used for IPC body buffers
/// (cf. the restriction documented on IpcWriteOptions::codec) -- confirm
/// against the definition.
Status CheckCompressionSupported(Compression::type codec);
0175 
0176 }  // namespace internal
0177 }  // namespace ipc
0178 }  // namespace arrow