![]() |
|
|||
File indexing completed on 2025-08-28 08:26:59
0001 // Licensed to the Apache Software Foundation (ASF) under one 0002 // or more contributor license agreements. See the NOTICE file 0003 // distributed with this work for additional information 0004 // regarding copyright ownership. The ASF licenses this file 0005 // to you under the Apache License, Version 2.0 (the 0006 // "License"); you may not use this file except in compliance 0007 // with the License. You may obtain a copy of the License at 0008 // 0009 // http://www.apache.org/licenses/LICENSE-2.0 0010 // 0011 // Unless required by applicable law or agreed to in writing, 0012 // software distributed under the License is distributed on an 0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 0014 // KIND, either express or implied. See the License for the 0015 // specific language governing permissions and limitations 0016 // under the License. 0017 0018 #pragma once 0019 0020 #include <cstdint> 0021 #include <memory> 0022 #include <string> 0023 #include <utility> 0024 #include <vector> 0025 0026 #include "arrow/io/interfaces.h" 0027 #include "arrow/util/type_fwd.h" 0028 #include "arrow/util/visibility.h" 0029 0030 namespace arrow { 0031 namespace io { 0032 0033 struct ARROW_EXPORT CacheOptions { 0034 static constexpr double kDefaultIdealBandwidthUtilizationFrac = 0.9; 0035 static constexpr int64_t kDefaultMaxIdealRequestSizeMib = 64; 0036 0037 /// \brief The maximum distance in bytes between two consecutive 0038 /// ranges; beyond this value, ranges are not combined 0039 int64_t hole_size_limit; 0040 /// \brief The maximum size in bytes of a combined range; if 0041 /// combining two consecutive ranges would produce a range of a 0042 /// size greater than this, they are not combined 0043 int64_t range_size_limit; 0044 /// \brief A lazy cache does not perform any I/O until requested. 0045 /// lazy = false: request all byte ranges when PreBuffer or WillNeed is called. 0046 /// lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader 0047 /// needs them. 0048 /// lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the 0049 /// range that is currently being read. 0050 bool lazy; 0051 /// \brief The maximum number of ranges to be prefetched. This is only used 0052 /// for lazy cache to asynchronously read some ranges after reading the target range. 0053 int64_t prefetch_limit = 0; 0054 0055 bool operator==(const CacheOptions& other) const { 0056 return hole_size_limit == other.hole_size_limit && 0057 range_size_limit == other.range_size_limit && lazy == other.lazy && 0058 prefetch_limit == other.prefetch_limit; 0059 } 0060 0061 /// \brief Construct CacheOptions from network storage metrics (e.g. S3). 0062 /// 0063 /// \param[in] time_to_first_byte_millis Seek-time or Time-To-First-Byte (TTFB) in 0064 /// milliseconds, also called call setup latency of a new read request. 0065 /// The value is a positive integer. 0066 /// \param[in] transfer_bandwidth_mib_per_sec Data transfer Bandwidth (BW) in MiB/sec 0067 /// (per connection). 0068 /// The value is a positive integer. 0069 /// \param[in] ideal_bandwidth_utilization_frac Transfer bandwidth utilization fraction 0070 /// (per connection) to maximize the net data load. 0071 /// The value is a positive double precision number less than 1. 0072 /// \param[in] max_ideal_request_size_mib The maximum single data request size (in MiB) 0073 /// to maximize the net data load. 0074 /// The value is a positive integer. 0075 /// \return A new instance of CacheOptions. 0076 static CacheOptions MakeFromNetworkMetrics( 0077 int64_t time_to_first_byte_millis, int64_t transfer_bandwidth_mib_per_sec, 0078 double ideal_bandwidth_utilization_frac = kDefaultIdealBandwidthUtilizationFrac, 0079 int64_t max_ideal_request_size_mib = kDefaultMaxIdealRequestSizeMib); 0080 0081 static CacheOptions Defaults(); 0082 static CacheOptions LazyDefaults(); 0083 }; 0084 0085 namespace internal { 0086 0087 /// \brief A read cache designed to hide IO latencies when reading. 0088 /// 0089 /// This class takes multiple byte ranges that an application expects to read, and 0090 /// coalesces them into fewer, larger read requests, which benefits performance on some 0091 /// filesystems, particularly remote ones like Amazon S3. By default, it also issues 0092 /// these read requests in parallel up front. 0093 /// 0094 /// To use: 0095 /// 1. Cache() the ranges you expect to read in the future. Ideally, these ranges have 0096 /// the exact offset and length that will later be read. The cache will combine those 0097 /// ranges according to parameters (see constructor). 0098 /// 0099 /// By default, the cache will also start fetching the combined ranges in parallel in 0100 /// the background, unless CacheOptions.lazy is set. 0101 /// 0102 /// 2. Call WaitFor() to be notified when the given ranges have been read. If 0103 /// CacheOptions.lazy is set, I/O will be triggered in the background here instead. 0104 /// This can be done in parallel (e.g. if parsing a file, call WaitFor() for each 0105 /// chunk of the file that can be parsed in parallel). 0106 /// 0107 /// 3. Call Read() to retrieve the actual data for the given ranges. 0108 /// A synchronous application may skip WaitFor() and just call Read() - it will still 0109 /// benefit from coalescing and parallel fetching. 0110 class ARROW_EXPORT ReadRangeCache { 0111 public: 0112 static constexpr int64_t kDefaultHoleSizeLimit = 8192; 0113 static constexpr int64_t kDefaultRangeSizeLimit = 32 * 1024 * 1024; 0114 0115 /// Construct a read cache with default 0116 explicit ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx) 0117 : ReadRangeCache(file, file.get(), std::move(ctx), CacheOptions::Defaults()) {} 0118 0119 /// Construct a read cache with given options 0120 explicit ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx, 0121 CacheOptions options) 0122 : ReadRangeCache(file, file.get(), std::move(ctx), options) {} 0123 0124 /// Construct a read cache with an unowned file 0125 ReadRangeCache(RandomAccessFile* file, IOContext ctx, CacheOptions options) 0126 : ReadRangeCache(NULLPTR, file, std::move(ctx), options) {} 0127 0128 ~ReadRangeCache(); 0129 0130 /// \brief Cache the given ranges in the background. 0131 /// 0132 /// The caller must ensure that the ranges do not overlap with each other, 0133 /// nor with previously cached ranges. Otherwise, behaviour will be undefined. 0134 Status Cache(std::vector<ReadRange> ranges); 0135 0136 /// \brief Read a range previously given to Cache(). 0137 Result<std::shared_ptr<Buffer>> Read(ReadRange range); 0138 0139 /// \brief Wait until all ranges added so far have been cached. 0140 Future<> Wait(); 0141 0142 /// \brief Wait until all given ranges have been cached. 0143 Future<> WaitFor(std::vector<ReadRange> ranges); 0144 0145 protected: 0146 struct Impl; 0147 struct LazyImpl; 0148 0149 ReadRangeCache(std::shared_ptr<RandomAccessFile> owned_file, RandomAccessFile* file, 0150 IOContext ctx, CacheOptions options); 0151 0152 std::unique_ptr<Impl> impl_; 0153 }; 0154 0155 } // namespace internal 0156 } // namespace io 0157 } // namespace arrow
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
![]() ![]() |