Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-08-28 08:26:59

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <cstdint>
0021 #include <memory>
0022 #include <string>
0023 #include <utility>
0024 #include <vector>
0025 
0026 #include "arrow/io/interfaces.h"
0027 #include "arrow/util/type_fwd.h"
0028 #include "arrow/util/visibility.h"
0029 
0030 namespace arrow {
0031 namespace io {
0032 
0033 struct ARROW_EXPORT CacheOptions {
0034   static constexpr double kDefaultIdealBandwidthUtilizationFrac = 0.9;
0035   static constexpr int64_t kDefaultMaxIdealRequestSizeMib = 64;
0036 
0037   /// \brief The maximum distance in bytes between two consecutive
0038   ///   ranges; beyond this value, ranges are not combined
0039   int64_t hole_size_limit;
0040   /// \brief The maximum size in bytes of a combined range; if
0041   ///   combining two consecutive ranges would produce a range of a
0042   ///   size greater than this, they are not combined
0043   int64_t range_size_limit;
0044   /// \brief A lazy cache does not perform any I/O until requested.
0045   ///   lazy = false: request all byte ranges when PreBuffer or WillNeed is called.
0046   ///   lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader
0047   ///   needs them.
0048   ///   lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the
0049   ///   range that is currently being read.
0050   bool lazy;
0051   /// \brief The maximum number of ranges to be prefetched. This is only used
0052   ///   for lazy cache to asynchronously read some ranges after reading the target range.
0053   int64_t prefetch_limit = 0;
0054 
0055   bool operator==(const CacheOptions& other) const {
0056     return hole_size_limit == other.hole_size_limit &&
0057            range_size_limit == other.range_size_limit && lazy == other.lazy &&
0058            prefetch_limit == other.prefetch_limit;
0059   }
0060 
0061   /// \brief Construct CacheOptions from network storage metrics (e.g. S3).
0062   ///
0063   /// \param[in] time_to_first_byte_millis Seek-time or Time-To-First-Byte (TTFB) in
0064   ///   milliseconds, also called call setup latency of a new read request.
0065   ///   The value is a positive integer.
0066   /// \param[in] transfer_bandwidth_mib_per_sec Data transfer Bandwidth (BW) in MiB/sec
0067   ///   (per connection).
0068   ///   The value is a positive integer.
0069   /// \param[in] ideal_bandwidth_utilization_frac Transfer bandwidth utilization fraction
0070   ///   (per connection) to maximize the net data load.
0071   ///   The value is a positive double precision number less than 1.
0072   /// \param[in] max_ideal_request_size_mib The maximum single data request size (in MiB)
0073   ///   to maximize the net data load.
0074   ///   The value is a positive integer.
0075   /// \return A new instance of CacheOptions.
0076   static CacheOptions MakeFromNetworkMetrics(
0077       int64_t time_to_first_byte_millis, int64_t transfer_bandwidth_mib_per_sec,
0078       double ideal_bandwidth_utilization_frac = kDefaultIdealBandwidthUtilizationFrac,
0079       int64_t max_ideal_request_size_mib = kDefaultMaxIdealRequestSizeMib);
0080 
0081   static CacheOptions Defaults();
0082   static CacheOptions LazyDefaults();
0083 };
0084 
0085 namespace internal {
0086 
0087 /// \brief A read cache designed to hide IO latencies when reading.
0088 ///
0089 /// This class takes multiple byte ranges that an application expects to read, and
0090 /// coalesces them into fewer, larger read requests, which benefits performance on some
0091 /// filesystems, particularly remote ones like Amazon S3. By default, it also issues
0092 /// these read requests in parallel up front.
0093 ///
0094 /// To use:
0095 /// 1. Cache() the ranges you expect to read in the future. Ideally, these ranges have
0096 ///    the exact offset and length that will later be read. The cache will combine those
0097 ///    ranges according to parameters (see constructor).
0098 ///
0099 ///    By default, the cache will also start fetching the combined ranges in parallel in
0100 ///    the background, unless CacheOptions.lazy is set.
0101 ///
0102 /// 2. Call WaitFor() to be notified when the given ranges have been read. If
0103 ///    CacheOptions.lazy is set, I/O will be triggered in the background here instead.
0104 ///    This can be done in parallel (e.g. if parsing a file, call WaitFor() for each
0105 ///    chunk of the file that can be parsed in parallel).
0106 ///
0107 /// 3. Call Read() to retrieve the actual data for the given ranges.
0108 ///    A synchronous application may skip WaitFor() and just call Read() - it will still
0109 ///    benefit from coalescing and parallel fetching.
0110 class ARROW_EXPORT ReadRangeCache {
0111  public:
0112   static constexpr int64_t kDefaultHoleSizeLimit = 8192;
0113   static constexpr int64_t kDefaultRangeSizeLimit = 32 * 1024 * 1024;
0114 
0115   /// Construct a read cache with default
0116   explicit ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx)
0117       : ReadRangeCache(file, file.get(), std::move(ctx), CacheOptions::Defaults()) {}
0118 
0119   /// Construct a read cache with given options
0120   explicit ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx,
0121                           CacheOptions options)
0122       : ReadRangeCache(file, file.get(), std::move(ctx), options) {}
0123 
0124   /// Construct a read cache with an unowned file
0125   ReadRangeCache(RandomAccessFile* file, IOContext ctx, CacheOptions options)
0126       : ReadRangeCache(NULLPTR, file, std::move(ctx), options) {}
0127 
0128   ~ReadRangeCache();
0129 
0130   /// \brief Cache the given ranges in the background.
0131   ///
0132   /// The caller must ensure that the ranges do not overlap with each other,
0133   /// nor with previously cached ranges.  Otherwise, behaviour will be undefined.
0134   Status Cache(std::vector<ReadRange> ranges);
0135 
0136   /// \brief Read a range previously given to Cache().
0137   Result<std::shared_ptr<Buffer>> Read(ReadRange range);
0138 
0139   /// \brief Wait until all ranges added so far have been cached.
0140   Future<> Wait();
0141 
0142   /// \brief Wait until all given ranges have been cached.
0143   Future<> WaitFor(std::vector<ReadRange> ranges);
0144 
0145  protected:
0146   struct Impl;
0147   struct LazyImpl;
0148 
0149   ReadRangeCache(std::shared_ptr<RandomAccessFile> owned_file, RandomAccessFile* file,
0150                  IOContext ctx, CacheOptions options);
0151 
0152   std::unique_ptr<Impl> impl_;
0153 };
0154 
0155 }  // namespace internal
0156 }  // namespace io
0157 }  // namespace arrow