Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-17 07:59:33

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <memory>
0021 #include <string>
0022 #include <vector>
0023 
0024 #include "arrow/filesystem/filesystem.h"
0025 #include "arrow/util/macros.h"
0026 #include "arrow/util/uri.h"
0027 
     // Forward declarations of types from the Aws:: namespaces (presumably the AWS SDK),
     // so the declarations below can name them without their definitions being visible.
0028 namespace Aws::Auth {
     // Held through std::shared_ptr in S3Options::credentials_provider.
0029 class AWSCredentialsProvider;
     // NOTE(review): not referenced in the visible part of this header; presumably
     // needed by code that includes it -- confirm before removing.
0030 class STSAssumeRoleCredentialsProvider;
0031 }  // namespace Aws::Auth
0032 
0033 namespace Aws::STS {
     // Accepted through std::shared_ptr by the assume-role configuration functions.
0034 class STSClient;
0035 }  // namespace Aws::STS
0036 
0037 namespace arrow::fs {
0038 
0039 /// \brief Options for using a proxy for S3
     ///
     /// The field names match the parts of a proxy URI of the form
     /// scheme://username:password@host:port (see FromUri()).
0040 struct ARROW_EXPORT S3ProxyOptions {
       /// URI scheme of the proxy, e.g. "http" as in the FromUri() examples
0041   std::string scheme;
       /// Host name of the proxy
0042   std::string host;
       /// Port of the proxy
       // NOTE(review): -1 presumably means "not set" -- confirm against the implementation.
0043   int port = -1;
       /// User name for the proxy; the FromUri() examples show it may be absent from the URI
0044   std::string username;
       /// Password for the proxy; the FromUri() examples show it may be absent from the URI
0045   std::string password;
0046 
0047   /// Initialize from URI such as http://username:password@host:port
0048   /// or http://host:port
       ///
       /// Two overloads are provided: one taking the URI as text, the other taking an
       /// already-constructed ::arrow::util::Uri.
0049   static Result<S3ProxyOptions> FromUri(const std::string& uri);
0050   static Result<S3ProxyOptions> FromUri(const ::arrow::util::Uri& uri);
0051 
       /// Compare these options with `other`.
       // NOTE(review): presumably a field-by-field comparison; the definition is not in
       // this header.
0052   bool Equals(const S3ProxyOptions& other) const;
0053 };
0054 
     /// \brief The ways in which S3 credentials can be obtained
     ///
     /// S3Options::credentials_kind holds one of these values.
     // NOTE(review): int8_t comes from <cstdint>, which this header does not include
     // directly; presumably it is pulled in by one of the arrow/ headers above.
0055 enum class S3CredentialsKind : int8_t {
0056   /// Anonymous access (no credentials used)
0057   Anonymous,
0058   /// Use default AWS credentials, configured through environment variables
0059   Default,
0060   /// Use explicitly-provided access key pair
0061   Explicit,
0062   /// Assume role through a role ARN
0063   Role,
0064   /// Use web identity token to assume role, configured through environment variables
0065   WebIdentity
0066 };
0067 
0068 /// \brief Abstract interface for describing custom S3 retry strategies
     ///
     /// Subclasses implement ShouldRetry() and CalculateDelayBeforeNextRetry(), both of
     /// which are pure virtual.  A strategy can be set on S3Options::retry_strategy.
0069 class ARROW_EXPORT S3RetryStrategy {
0070  public:
0071   virtual ~S3RetryStrategy() = default;
0072 
0073   /// Simple struct where each field corresponds to a field in Aws::Client::AWSError
0074   struct AWSErrorDetail {
0075     /// Corresponds to AWSError::GetErrorType()
0076     int error_type;
0077     /// Corresponds to AWSError::GetMessage()
0078     std::string message;
0079     /// Corresponds to AWSError::GetExceptionName()
0080     std::string exception_name;
0081     /// Corresponds to AWSError::ShouldRetry()
0082     bool should_retry;
0083   };
0084   /// Returns true if the S3 request resulting in the provided error should be retried.
       ///
       /// \param error details of the error produced by the failed request
       /// \param attempted_retries NOTE(review): presumably the number of retries already
       ///        made for this request -- confirm against the implementation
0085   virtual bool ShouldRetry(const AWSErrorDetail& error, int64_t attempted_retries) = 0;
0086   /// Returns the time in milliseconds the S3 client should sleep for until retrying.
       ///
       /// Takes the same arguments as ShouldRetry().
0087   virtual int64_t CalculateDelayBeforeNextRetry(const AWSErrorDetail& error,
0088                                                 int64_t attempted_retries) = 0;
0089   /// Returns a stock AWS Default retry strategy.
       ///
       /// \param max_attempts NOTE(review): presumably the maximum number of attempts the
       ///        returned strategy allows -- confirm against the implementation
0090   static std::shared_ptr<S3RetryStrategy> GetAwsDefaultRetryStrategy(
0091       int64_t max_attempts);
0092   /// Returns a stock AWS Standard retry strategy.
       ///
       /// \param max_attempts see GetAwsDefaultRetryStrategy()
0093   static std::shared_ptr<S3RetryStrategy> GetAwsStandardRetryStrategy(
0094       int64_t max_attempts);
0095 };
0096 
0097 /// \brief Options for the S3FileSystem implementation.
     ///
     /// An instance is passed to S3FileSystem::Make().  Besides plain data members, the
     /// struct offers Configure*() methods that set up credentials on an existing
     /// instance and static factory functions that return a new instance.
0098 struct ARROW_EXPORT S3Options {
0099   /// \brief Smart defaults for option values
0100   ///
0101   /// The possible values for this setting are explained in the AWS docs:
0102   /// https://docs.aws.amazon.com/sdkref/latest/guide/feature-smart-config-defaults.html
0103   std::string smart_defaults = "standard";
0104 
0105   /// \brief AWS region to connect to.
0106   ///
0107   /// If unset, the AWS SDK will choose a default value.  The exact algorithm
0108   /// depends on the SDK version.  Before 1.8, the default is hardcoded
0109   /// to "us-east-1".  Since 1.8, several heuristics are used to determine
0110   /// the region (environment variables, configuration profile, EC2 metadata
0111   /// server).
0112   std::string region;
0113 
0114   /// \brief Socket connection timeout, in seconds
0115   ///
0116   /// If negative, the AWS SDK default value is used (typically 1 second).
0117   double connect_timeout = -1;
0118 
0119   /// \brief Socket read timeout on Windows and macOS, in seconds
0120   ///
0121   /// If negative, the AWS SDK default value is used (typically 3 seconds).
0122   /// This option is ignored on non-Windows, non-macOS systems.
0123   double request_timeout = -1;
0124 
0125   /// If non-empty, override region with a connect string such as "localhost:9000"
0126   // XXX perhaps instead take a URL like "http://localhost:9000"?
0127   std::string endpoint_override;
0128   /// S3 connection transport, default "https"
0129   std::string scheme = "https";
0130 
0131   /// ARN of role to assume
0132   std::string role_arn;
0133   /// Optional identifier for an assumed role session.
0134   std::string session_name;
0135   /// Optional external identifier to pass to STS when assuming a role
0136   std::string external_id;
0137   /// Frequency (in seconds) to refresh temporary credentials from assumed role
0138   int load_frequency = 900;
0139 
0140   /// If connection is through a proxy, set options here
0141   S3ProxyOptions proxy_options;
0142 
0143   /// AWS credentials provider
0144   std::shared_ptr<Aws::Auth::AWSCredentialsProvider> credentials_provider;
0145 
0146   /// Type of credentials being used. Set along with credentials_provider.
0147   S3CredentialsKind credentials_kind = S3CredentialsKind::Default;
0148 
0149   /// Whether to use virtual addressing of buckets
0150   ///
0151   /// If true, then virtual addressing is always enabled.
0152   /// If false, then virtual addressing is only enabled if `endpoint_override` is empty.
0153   ///
0154   /// This can be used for non-AWS backends that only support virtual hosted-style access.
0155   bool force_virtual_addressing = false;
0156 
0157   /// Whether OutputStream writes will be issued in the background, without blocking.
0158   bool background_writes = true;
0159 
0160   /// Whether to allow creation of buckets
0161   ///
0162   /// When S3FileSystem creates new buckets, it does not pass any non-default settings.
0163   /// In AWS S3, the bucket and all objects will not be publicly visible, and there
0164   /// will be no bucket policies and no resource tags. To have more control over how
0165   /// buckets are created, use a different API to create them.
0166   bool allow_bucket_creation = false;
0167 
0168   /// Whether to allow deletion of buckets
0169   bool allow_bucket_deletion = false;
0170 
0171   /// Whether to allow pessimistic directory creation in CreateDir function
0172   ///
0173   /// By default, CreateDir function will try to create the directory without checking its
0174   /// existence. It's an optimization to try directory creation and catch the error,
0175   /// rather than issue two dependent I/O calls.
0176   /// However, for key/value storage like Google Cloud Storage, too many creation calls
0177   /// will breach the rate limit for object mutation operations and cause serious
0178   /// consequences. It's also possible you don't have creation access for the parent
0179   /// directory. Set this option to true to address these scenarios.
0180   bool check_directory_existence_before_creation = false;
0181 
0182   /// Whether to allow file-open methods to return before the actual open.
0183   ///
0184   /// Enabling this may reduce the latency of `OpenInputStream`, `OpenOutputStream`,
0185   /// and similar methods, by reducing the number of roundtrips necessary. It may also
0186   /// allow usage of more efficient S3 APIs for small files.
0187   /// The downside is that failure conditions such as attempting to open a file in a
0188   /// non-existing bucket will only be reported when actual I/O is done (at worst,
0189   /// when attempting to close the file).
0190   bool allow_delayed_open = false;
0191 
0192   /// \brief Default metadata for OpenOutputStream.
0193   ///
0194   /// This will be ignored if non-empty metadata is passed to OpenOutputStream.
0195   std::shared_ptr<const KeyValueMetadata> default_metadata;
0196 
0197   /// Optional retry strategy to determine which error types should be retried, and the
0198   /// delay between retries.
0199   std::shared_ptr<S3RetryStrategy> retry_strategy;
0200 
0201   /// Optional customer-provided key for server-side encryption (SSE-C).
0202   ///
0203   /// This should be the 32-byte AES-256 key, unencoded.
0204   std::string sse_customer_key;
0205 
0206   /// Optional path to a single PEM file holding all TLS CA certificates
0207   ///
0208   /// If empty, global filesystem options will be used (see FileSystemGlobalOptions);
0209   /// if the corresponding global filesystem option is also empty, the underlying
0210   /// TLS library's defaults will be used.
0211   ///
0212   /// Note this option may be ignored on some systems (Windows, macOS).
0213   std::string tls_ca_file_path;
0214 
0215   /// Optional path to a directory holding TLS CA certificates
0216   ///
0217   /// The given directory should contain CA certificates as individual PEM files
0218   /// named along the OpenSSL "hashed" format.
0219   ///
0220   /// If empty, global filesystem options will be used (see FileSystemGlobalOptions);
0221   /// if the corresponding global filesystem option is also empty, the underlying
0222   /// TLS library's defaults will be used.
0223   ///
0224   /// Note this option may be ignored on some systems (Windows, macOS).
0225   std::string tls_ca_dir_path;
0226 
0227   /// Whether to verify the S3 endpoint's TLS certificate
0228   ///
0229   /// This option applies if the scheme is "https".
0230   bool tls_verify_certificates = true;
0231 
       /// Default constructor.
       // NOTE(review): defined out of line; whether it configures any credentials beyond
       // the member initializers above is not visible in this header.
0232   S3Options();
0233 
0234   /// Configure with the default AWS credentials provider chain.
0235   void ConfigureDefaultCredentials();
0236 
0237   /// Configure with anonymous credentials.  This will only let you access public buckets.
0238   void ConfigureAnonymousCredentials();
0239 
0240   /// Configure with explicit access and secret key.
       ///
       /// `session_token` is optional and defaults to the empty string.
0241   void ConfigureAccessKey(const std::string& access_key, const std::string& secret_key,
0242                           const std::string& session_token = "");
0243 
0244   /// Configure with credentials from an assumed role.
       ///
       /// The first four parameters share their names and the default of 900 with the
       /// `role_arn`, `session_name`, `external_id` and `load_frequency` members above.
       // NOTE(review): presumably they are stored into those members, and `stsClient`
       // (NULLPTR by default) lets the caller supply the STS client -- confirm against
       // the implementation.
0245   void ConfigureAssumeRoleCredentials(
0246       const std::string& role_arn, const std::string& session_name = "",
0247       const std::string& external_id = "", int load_frequency = 900,
0248       const std::shared_ptr<Aws::STS::STSClient>& stsClient = NULLPTR);
0249 
0250   /// Configure with credentials from role assumed using a web identity token
0251   void ConfigureAssumeRoleWithWebIdentityCredentials();
0252 
       /// Accessors for the access key, secret key and session token.
       // NOTE(review): presumably obtained from `credentials_provider`; what they return
       // for anonymous or unset credentials is not visible in this header.
0253   std::string GetAccessKey() const;
0254   std::string GetSecretKey() const;
0255   std::string GetSessionToken() const;
0256 
       /// Compare these options with `other`.
       // NOTE(review): which members take part in the comparison is not visible here.
0257   bool Equals(const S3Options& other) const;
0258 
0259   /// \brief Initialize with default credentials provider chain
0260   ///
0261   /// This is recommended if you use the standard AWS environment variables
0262   /// and/or configuration file.
0263   static S3Options Defaults();
0264 
0265   /// \brief Initialize with anonymous credentials.
0266   ///
0267   /// This will only let you access public buckets.
0268   static S3Options Anonymous();
0269 
0270   /// \brief Initialize with explicit access and secret key.
0271   ///
0272   /// Optionally, a session token may also be provided for temporary credentials
0273   /// (from STS).
0274   static S3Options FromAccessKey(const std::string& access_key,
0275                                  const std::string& secret_key,
0276                                  const std::string& session_token = "");
0277 
0278   /// \brief Initialize from an assumed role.
       ///
       /// Takes the same parameters, with the same defaults, as
       /// ConfigureAssumeRoleCredentials().
0279   static S3Options FromAssumeRole(
0280       const std::string& role_arn, const std::string& session_name = "",
0281       const std::string& external_id = "", int load_frequency = 900,
0282       const std::shared_ptr<Aws::STS::STSClient>& stsClient = NULLPTR);
0283 
0284   /// \brief Initialize from an assumed role with web-identity.
0285   /// Uses the AWS SDK which uses environment variables to
0286   /// generate temporary credentials.
0287   static S3Options FromAssumeRoleWithWebIdentity();
0288 
       /// \brief Initialize from a URI.
       ///
       /// Two overloads are provided: one taking an already-constructed
       /// ::arrow::util::Uri, the other taking the URI as text.  `out_path` is an
       /// optional output parameter and defaults to NULLPTR.
       // NOTE(review): `out_path` presumably receives the path component of the URI;
       // the recognized URI parameters are not visible in this header.
0289   static Result<S3Options> FromUri(const ::arrow::util::Uri& uri,
0290                                    std::string* out_path = NULLPTR);
0291   static Result<S3Options> FromUri(const std::string& uri,
0292                                    std::string* out_path = NULLPTR);
0293 };
0294 
0295 /// \brief S3-backed FileSystem implementation.
0296 ///
0297 /// Some implementation notes:
0298 /// - buckets are special and the operations available on them may be limited
0299 ///   or more expensive than desired.
     ///
     /// Instances are created through the static Make() function; the constructor is
     /// protected.
0300 class ARROW_EXPORT S3FileSystem : public FileSystem {
0301  public:
0302   ~S3FileSystem() override;
0303 
       /// Return the type name of this filesystem, the literal "s3".
0304   std::string type_name() const override { return "s3"; }
0305 
0306   /// Return the original S3 options used when constructing the filesystem
0307   S3Options options() const;
0308   /// Return the actual region this filesystem connects to
0309   std::string region() const;
0310 
       // Overrides of the FileSystem interface; see the base class for their contracts.
0311   bool Equals(const FileSystem& other) const override;
0312   Result<std::string> PathFromUri(const std::string& uri_string) const override;
0313   Result<std::string> MakeUri(std::string path) const override;
0314 
0315   /// \cond FALSE
       // The overrides declared below would otherwise hide the remaining base-class
       // overloads of the same names; these using-declarations keep them accessible.
0316   using FileSystem::CreateDir;
0317   using FileSystem::DeleteDirContents;
0318   using FileSystem::DeleteDirContentsAsync;
0319   using FileSystem::GetFileInfo;
0320   using FileSystem::OpenAppendStream;
0321   using FileSystem::OpenOutputStream;
0322   /// \endcond
0323 
0324   Result<FileInfo> GetFileInfo(const std::string& path) override;
0325   Result<std::vector<FileInfo>> GetFileInfo(const FileSelector& select) override;
0326 
0327   FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override;
0328 
0329   Status CreateDir(const std::string& path, bool recursive) override;
0330 
0331   Status DeleteDir(const std::string& path) override;
0332   Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override;
0333   Future<> DeleteDirContentsAsync(const std::string& path, bool missing_dir_ok) override;
0334   Status DeleteRootDirContents() override;
0335 
0336   Status DeleteFile(const std::string& path) override;
0337 
0338   Status Move(const std::string& src, const std::string& dest) override;
0339 
0340   Status CopyFile(const std::string& src, const std::string& dest) override;
0341 
0342   /// Create a sequential input stream for reading from an S3 object.
0343   ///
0344   /// NOTE: Reads from the stream will be synchronous and unbuffered.
0345   /// You may want to wrap the stream in a BufferedInputStream or use
0346   /// a custom readahead strategy to avoid idle waits.
0347   Result<std::shared_ptr<io::InputStream>> OpenInputStream(
0348       const std::string& path) override;
0349   /// Create a sequential input stream for reading from an S3 object.
0350   ///
0351   /// This override avoids a HEAD request by assuming the FileInfo
0352   /// contains correct information.
0353   Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;
0354 
0355   /// Create a random access file for reading from an S3 object.
0356   ///
0357   /// See OpenInputStream for performance notes.
0358   Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
0359       const std::string& path) override;
0360   /// Create a random access file for reading from an S3 object.
0361   ///
0362   /// This override avoids a HEAD request by assuming the FileInfo
0363   /// contains correct information.
0364   Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
0365       const FileInfo& info) override;
0366 
0367   /// Create a sequential output stream for writing to an S3 object.
0368   ///
0369   /// NOTE: Writes to the stream will be buffered.  Depending on
0370   /// S3Options.background_writes, they can be synchronous or not.
0371   /// It is recommended to enable background_writes unless you prefer
0372   /// implementing your own background execution strategy.
0373   Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
0374       const std::string& path,
0375       const std::shared_ptr<const KeyValueMetadata>& metadata) override;
0376 
       // NOTE(review): whether appending is actually supported for S3 objects cannot be
       // told from this declaration -- check the implementation before relying on it.
0377   Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
0378       const std::string& path,
0379       const std::shared_ptr<const KeyValueMetadata>& metadata) override;
0380 
0381   /// Create an S3FileSystem instance from the given options.
       ///
       /// The second parameter is the IOContext to use; it defaults to
       /// io::default_io_context().
0382   static Result<std::shared_ptr<S3FileSystem>> Make(
0383       const S3Options& options, const io::IOContext& = io::default_io_context());
0384 
0385  protected:
       // Protected so that instances are obtained through Make() (or by subclasses).
0386   explicit S3FileSystem(const S3Options& options, const io::IOContext&);
0387 
       // Implementation class, only forward-declared here; its definition is not part
       // of this header.
0388   class Impl;
0389   std::shared_ptr<Impl> impl_;
0390 };
0391 
     /// \brief Log levels for S3-originating messages
     ///
     /// A value of this type is stored in S3GlobalOptions::log_level.
0392 enum class S3LogLevel : int8_t { Off, Fatal, Error, Warn, Info, Debug, Trace };
0393 
     /// \brief Process-wide S3 options, passed to InitializeS3()
     ///
     /// Use Defaults() to obtain an instance whose log level honors the
     /// ARROW_S3_LOG_LEVEL environment variable.
0394 struct ARROW_EXPORT S3GlobalOptions {
0395   /// The log level for S3-originating messages.
       ///
       /// Initialized to S3LogLevel::Off so that a default-constructed S3GlobalOptions
       /// never carries an indeterminate level.  Off is the value that
       /// `S3GlobalOptions{}` already produced through value-initialization, so code
       /// relying on that keeps its behavior.
0396   S3LogLevel log_level = S3LogLevel::Off;
0397 
0398   /// The number of threads to configure when creating AWS' I/O event loop
0399   ///
0400   /// Defaults to 1 as recommended by AWS' doc when the # of connections is
0401   /// expected to be, at most, in the hundreds
0402   ///
0403   /// For more details see Aws::Crt::Io::EventLoopGroup
0404   int num_event_loop_threads = 1;
0405 
0406   /// Whether to install a process-wide SIGPIPE handler
0407   ///
0408   /// The AWS SDK may sometimes emit SIGPIPE signals for certain errors;
0409   /// by default, they would abort the current process.
0410   /// This option, if enabled, will install a process-wide signal handler
0411   /// that logs and otherwise ignores incoming SIGPIPE signals.
0412   ///
0413   /// This option has no effect on Windows.
0414   bool install_sigpipe_handler = false;
0415 
0416   /// \brief Initialize with default options
0417   ///
0418   /// For log_level, this method first tries to extract a suitable value from the
0419   /// environment variable ARROW_S3_LOG_LEVEL.
0420   static S3GlobalOptions Defaults();
0421 };
0422 
0423 /// \brief Initialize the S3 APIs with the specified set of options.
0424 ///
0425 /// It is required to call this function at least once before using S3FileSystem.
0426 ///
0427 /// Once this function is called you MUST call FinalizeS3 before the end of the
0428 /// application in order to avoid a segmentation fault at shutdown.
     ///
     /// Calls to this function and to FinalizeS3() should be serialized by the
     /// application.
     ///
     /// \param options the process-wide settings to apply (see S3GlobalOptions)
0429 ARROW_EXPORT
0430 Status InitializeS3(const S3GlobalOptions& options);
0431 
0432 /// \brief Ensure the S3 APIs are initialized, but only if not already done.
0433 ///
0434 /// If necessary, this will call InitializeS3() with some default options.
     ///
     /// Like InitializeS3() and FinalizeS3(), calls to this function should be
     /// serialized by the application.
0435 ARROW_EXPORT
0436 Status EnsureS3Initialized();
0437 
0438 /// \brief Whether S3 was initialized, and not finalized.
     ///
     /// \return true if the S3 APIs have been initialized and not finalized since
0439 ARROW_EXPORT
0440 bool IsS3Initialized();
0441 
0442 /// \brief Whether S3 was finalized.
     ///
     /// \return true if the S3 APIs have been finalized
     // NOTE(review): the result when S3 was never initialized is not stated here --
     // confirm against the implementation.
0443 ARROW_EXPORT
0444 bool IsS3Finalized();
0445 
0446 /// \brief Shut down the S3 APIs.
0447 ///
0448 /// This can wait for some S3 concurrent calls to finish so as to avoid
0449 /// race conditions.
0450 /// After this function has been called, all S3 calls will fail with an error.
0451 ///
0452 /// Calls to InitializeS3() and FinalizeS3() should be serialized by the
0453 /// application (this also applies to EnsureS3Initialized() and
0454 /// EnsureS3Finalized()).
0455 ARROW_EXPORT
0456 Status FinalizeS3();
0457 
0458 /// \brief Ensure the S3 APIs are shut down, but only if not already done.
0459 ///
0460 /// If necessary, this will call FinalizeS3().
     ///
     /// Like InitializeS3() and FinalizeS3(), calls to this function should be
     /// serialized by the application.
0461 ARROW_EXPORT
0462 Status EnsureS3Finalized();
0463 
     /// \brief Resolve the region of the given S3 bucket.
     ///
     /// \param bucket the name of the bucket
     /// \return the region as a string
     // NOTE(review): how the region is looked up (and which failures are reported
     // through the Result) is not visible in this header -- confirm against the
     // implementation.
0464 ARROW_EXPORT
0465 Result<std::string> ResolveS3BucketRegion(const std::string& bucket);
0466 
0467 }  // namespace arrow::fs