|
|
|||
File indexing completed on 2026-04-17 07:59:33
0001 // Licensed to the Apache Software Foundation (ASF) under one 0002 // or more contributor license agreements. See the NOTICE file 0003 // distributed with this work for additional information 0004 // regarding copyright ownership. The ASF licenses this file 0005 // to you under the Apache License, Version 2.0 (the 0006 // "License"); you may not use this file except in compliance 0007 // with the License. You may obtain a copy of the License at 0008 // 0009 // http://www.apache.org/licenses/LICENSE-2.0 0010 // 0011 // Unless required by applicable law or agreed to in writing, 0012 // software distributed under the License is distributed on an 0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 0014 // KIND, either express or implied. See the License for the 0015 // specific language governing permissions and limitations 0016 // under the License. 0017 0018 #pragma once 0019 0020 #include <memory> 0021 #include <string> 0022 #include <vector> 0023 0024 #include "arrow/filesystem/filesystem.h" 0025 #include "arrow/util/macros.h" 0026 #include "arrow/util/uri.h" 0027 0028 namespace Aws::Auth { 0029 class AWSCredentialsProvider; 0030 class STSAssumeRoleCredentialsProvider; 0031 } // namespace Aws::Auth 0032 0033 namespace Aws::STS { 0034 class STSClient; 0035 } // namespace Aws::STS 0036 0037 namespace arrow::fs { 0038 0039 /// Options for using a proxy for S3 0040 struct ARROW_EXPORT S3ProxyOptions { 0041 std::string scheme; 0042 std::string host; 0043 int port = -1; 0044 std::string username; 0045 std::string password; 0046 0047 /// Initialize from URI such as http://username:password@host:port 0048 /// or http://host:port 0049 static Result<S3ProxyOptions> FromUri(const std::string& uri); 0050 static Result<S3ProxyOptions> FromUri(const ::arrow::util::Uri& uri); 0051 0052 bool Equals(const S3ProxyOptions& other) const; 0053 }; 0054 0055 enum class S3CredentialsKind : int8_t { 0056 /// Anonymous access (no credentials used) 0057 Anonymous, 0058 /// Use default AWS credentials, configured through environment variables 0059 Default, 0060 /// Use explicitly-provided access key pair 0061 Explicit, 0062 /// Assume role through a role ARN 0063 Role, 0064 /// Use web identity token to assume role, configured through environment variables 0065 WebIdentity 0066 }; 0067 0068 /// Pure virtual class for describing custom S3 retry strategies 0069 class ARROW_EXPORT S3RetryStrategy { 0070 public: 0071 virtual ~S3RetryStrategy() = default; 0072 0073 /// Simple struct where each field corresponds to a field in Aws::Client::AWSError 0074 struct AWSErrorDetail { 0075 /// Corresponds to AWSError::GetErrorType() 0076 int error_type; 0077 /// Corresponds to AWSError::GetMessage() 0078 std::string message; 0079 /// Corresponds to AWSError::GetExceptionName() 0080 std::string exception_name; 0081 /// Corresponds to AWSError::ShouldRetry() 0082 bool should_retry; 0083 }; 0084 /// Returns true if the S3 request resulting in the provided error should be retried. 0085 virtual bool ShouldRetry(const AWSErrorDetail& error, int64_t attempted_retries) = 0; 0086 /// Returns the time in milliseconds the S3 client should sleep for until retrying. 0087 virtual int64_t CalculateDelayBeforeNextRetry(const AWSErrorDetail& error, 0088 int64_t attempted_retries) = 0; 0089 /// Returns a stock AWS Default retry strategy. 0090 static std::shared_ptr<S3RetryStrategy> GetAwsDefaultRetryStrategy( 0091 int64_t max_attempts); 0092 /// Returns a stock AWS Standard retry strategy. 0093 static std::shared_ptr<S3RetryStrategy> GetAwsStandardRetryStrategy( 0094 int64_t max_attempts); 0095 }; 0096 0097 /// Options for the S3FileSystem implementation. 0098 struct ARROW_EXPORT S3Options { 0099 /// \brief Smart defaults for option values 0100 /// 0101 /// The possible values for this setting are explained in the AWS docs: 0102 /// https://docs.aws.amazon.com/sdkref/latest/guide/feature-smart-config-defaults.html 0103 std::string smart_defaults = "standard"; 0104 0105 /// \brief AWS region to connect to. 0106 /// 0107 /// If unset, the AWS SDK will choose a default value. The exact algorithm 0108 /// depends on the SDK version. Before 1.8, the default is hardcoded 0109 /// to "us-east-1". Since 1.8, several heuristics are used to determine 0110 /// the region (environment variables, configuration profile, EC2 metadata 0111 /// server). 0112 std::string region; 0113 0114 /// \brief Socket connection timeout, in seconds 0115 /// 0116 /// If negative, the AWS SDK default value is used (typically 1 second). 0117 double connect_timeout = -1; 0118 0119 /// \brief Socket read timeout on Windows and macOS, in seconds 0120 /// 0121 /// If negative, the AWS SDK default value is used (typically 3 seconds). 0122 /// This option is ignored on non-Windows, non-macOS systems. 0123 double request_timeout = -1; 0124 0125 /// If non-empty, override region with a connect string such as "localhost:9000" 0126 // XXX perhaps instead take a URL like "http://localhost:9000"? 0127 std::string endpoint_override; 0128 /// S3 connection transport, default "https" 0129 std::string scheme = "https"; 0130 0131 /// ARN of role to assume 0132 std::string role_arn; 0133 /// Optional identifier for an assumed role session. 0134 std::string session_name; 0135 /// Optional external identifier to pass to STS when assuming a role 0136 std::string external_id; 0137 /// Frequency (in seconds) to refresh temporary credentials from assumed role 0138 int load_frequency = 900; 0139 0140 /// If connection is through a proxy, set options here 0141 S3ProxyOptions proxy_options; 0142 0143 /// AWS credentials provider 0144 std::shared_ptr<Aws::Auth::AWSCredentialsProvider> credentials_provider; 0145 0146 /// Type of credentials being used. Set along with credentials_provider. 0147 S3CredentialsKind credentials_kind = S3CredentialsKind::Default; 0148 0149 /// Whether to use virtual addressing of buckets 0150 /// 0151 /// If true, then virtual addressing is always enabled. 0152 /// If false, then virtual addressing is only enabled if `endpoint_override` is empty. 0153 /// 0154 /// This can be used for non-AWS backends that only support virtual hosted-style access. 0155 bool force_virtual_addressing = false; 0156 0157 /// Whether OutputStream writes will be issued in the background, without blocking. 0158 bool background_writes = true; 0159 0160 /// Whether to allow creation of buckets 0161 /// 0162 /// When S3FileSystem creates new buckets, it does not pass any non-default settings. 0163 /// In AWS S3, the bucket and all objects will be not publicly visible, and there 0164 /// will be no bucket policies and no resource tags. To have more control over how 0165 /// buckets are created, use a different API to create them. 0166 bool allow_bucket_creation = false; 0167 0168 /// Whether to allow deletion of buckets 0169 bool allow_bucket_deletion = false; 0170 0171 /// Whether to allow pessimistic directory creation in CreateDir function 0172 /// 0173 /// By default, CreateDir function will try to create the directory without checking its 0174 /// existence. It's an optimization to try directory creation and catch the error, 0175 /// rather than issue two dependent I/O calls. 0176 /// Though for key/value storage like Google Cloud Storage, too many creation calls will 0177 /// breach the rate limit for object mutation operations and cause serious consequences. 0178 /// It's also possible you don't have creation access for the parent directory. Set it 0179 /// to be true to address these scenarios. 0180 bool check_directory_existence_before_creation = false; 0181 0182 /// Whether to allow file-open methods to return before the actual open. 0183 /// 0184 /// Enabling this may reduce the latency of `OpenInputStream`, `OpenOutputStream`, 0185 /// and similar methods, by reducing the number of roundtrips necessary. It may also 0186 /// allow usage of more efficient S3 APIs for small files. 0187 /// The downside is that failure conditions such as attempting to open a file in a 0188 /// non-existing bucket will only be reported when actual I/O is done (at worse, 0189 /// when attempting to close the file). 0190 bool allow_delayed_open = false; 0191 0192 /// \brief Default metadata for OpenOutputStream. 0193 /// 0194 /// This will be ignored if non-empty metadata is passed to OpenOutputStream. 0195 std::shared_ptr<const KeyValueMetadata> default_metadata; 0196 0197 /// Optional retry strategy to determine which error types should be retried, and the 0198 /// delay between retries. 0199 std::shared_ptr<S3RetryStrategy> retry_strategy; 0200 0201 /// Optional customer-provided key for server-side encryption (SSE-C). 0202 /// 0203 /// This should be the 32-byte AES-256 key, unencoded. 0204 std::string sse_customer_key; 0205 0206 /// Optional path to a single PEM file holding all TLS CA certificates 0207 /// 0208 /// If empty, global filesystem options will be used (see FileSystemGlobalOptions); 0209 /// if the corresponding global filesystem option is also empty, the underlying 0210 /// TLS library's defaults will be used. 0211 /// 0212 /// Note this option may be ignored on some systems (Windows, macOS). 0213 std::string tls_ca_file_path; 0214 0215 /// Optional path to a directory holding TLS CA 0216 /// 0217 /// The given directory should contain CA certificates as individual PEM files 0218 /// named along the OpenSSL "hashed" format. 0219 /// 0220 /// If empty, global filesystem options will be used (see FileSystemGlobalOptions); 0221 /// if the corresponding global filesystem option is also empty, the underlying 0222 /// TLS library's defaults will be used. 0223 /// 0224 /// Note this option may be ignored on some systems (Windows, macOS). 0225 std::string tls_ca_dir_path; 0226 0227 /// Whether to verify the S3 endpoint's TLS certificate 0228 /// 0229 /// This option applies if the scheme is "https". 0230 bool tls_verify_certificates = true; 0231 0232 S3Options(); 0233 0234 /// Configure with the default AWS credentials provider chain. 0235 void ConfigureDefaultCredentials(); 0236 0237 /// Configure with anonymous credentials. This will only let you access public buckets. 0238 void ConfigureAnonymousCredentials(); 0239 0240 /// Configure with explicit access and secret key. 0241 void ConfigureAccessKey(const std::string& access_key, const std::string& secret_key, 0242 const std::string& session_token = ""); 0243 0244 /// Configure with credentials from an assumed role. 0245 void ConfigureAssumeRoleCredentials( 0246 const std::string& role_arn, const std::string& session_name = "", 0247 const std::string& external_id = "", int load_frequency = 900, 0248 const std::shared_ptr<Aws::STS::STSClient>& stsClient = NULLPTR); 0249 0250 /// Configure with credentials from role assumed using a web identity token 0251 void ConfigureAssumeRoleWithWebIdentityCredentials(); 0252 0253 std::string GetAccessKey() const; 0254 std::string GetSecretKey() const; 0255 std::string GetSessionToken() const; 0256 0257 bool Equals(const S3Options& other) const; 0258 0259 /// \brief Initialize with default credentials provider chain 0260 /// 0261 /// This is recommended if you use the standard AWS environment variables 0262 /// and/or configuration file. 0263 static S3Options Defaults(); 0264 0265 /// \brief Initialize with anonymous credentials. 0266 /// 0267 /// This will only let you access public buckets. 0268 static S3Options Anonymous(); 0269 0270 /// \brief Initialize with explicit access and secret key. 0271 /// 0272 /// Optionally, a session token may also be provided for temporary credentials 0273 /// (from STS). 0274 static S3Options FromAccessKey(const std::string& access_key, 0275 const std::string& secret_key, 0276 const std::string& session_token = ""); 0277 0278 /// \brief Initialize from an assumed role. 0279 static S3Options FromAssumeRole( 0280 const std::string& role_arn, const std::string& session_name = "", 0281 const std::string& external_id = "", int load_frequency = 900, 0282 const std::shared_ptr<Aws::STS::STSClient>& stsClient = NULLPTR); 0283 0284 /// \brief Initialize from an assumed role with web-identity. 0285 /// Uses the AWS SDK which uses environment variables to 0286 /// generate temporary credentials. 0287 static S3Options FromAssumeRoleWithWebIdentity(); 0288 0289 static Result<S3Options> FromUri(const ::arrow::util::Uri& uri, 0290 std::string* out_path = NULLPTR); 0291 static Result<S3Options> FromUri(const std::string& uri, 0292 std::string* out_path = NULLPTR); 0293 }; 0294 0295 /// S3-backed FileSystem implementation. 0296 /// 0297 /// Some implementation notes: 0298 /// - buckets are special and the operations available on them may be limited 0299 /// or more expensive than desired. 0300 class ARROW_EXPORT S3FileSystem : public FileSystem { 0301 public: 0302 ~S3FileSystem() override; 0303 0304 std::string type_name() const override { return "s3"; } 0305 0306 /// Return the original S3 options when constructing the filesystem 0307 S3Options options() const; 0308 /// Return the actual region this filesystem connects to 0309 std::string region() const; 0310 0311 bool Equals(const FileSystem& other) const override; 0312 Result<std::string> PathFromUri(const std::string& uri_string) const override; 0313 Result<std::string> MakeUri(std::string path) const override; 0314 0315 /// \cond FALSE 0316 using FileSystem::CreateDir; 0317 using FileSystem::DeleteDirContents; 0318 using FileSystem::DeleteDirContentsAsync; 0319 using FileSystem::GetFileInfo; 0320 using FileSystem::OpenAppendStream; 0321 using FileSystem::OpenOutputStream; 0322 /// \endcond 0323 0324 Result<FileInfo> GetFileInfo(const std::string& path) override; 0325 Result<std::vector<FileInfo>> GetFileInfo(const FileSelector& select) override; 0326 0327 FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override; 0328 0329 Status CreateDir(const std::string& path, bool recursive) override; 0330 0331 Status DeleteDir(const std::string& path) override; 0332 Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override; 0333 Future<> DeleteDirContentsAsync(const std::string& path, bool missing_dir_ok) override; 0334 Status DeleteRootDirContents() override; 0335 0336 Status DeleteFile(const std::string& path) override; 0337 0338 Status Move(const std::string& src, const std::string& dest) override; 0339 0340 Status CopyFile(const std::string& src, const std::string& dest) override; 0341 0342 /// Create a sequential input stream for reading from a S3 object. 0343 /// 0344 /// NOTE: Reads from the stream will be synchronous and unbuffered. 0345 /// You way want to wrap the stream in a BufferedInputStream or use 0346 /// a custom readahead strategy to avoid idle waits. 0347 Result<std::shared_ptr<io::InputStream>> OpenInputStream( 0348 const std::string& path) override; 0349 /// Create a sequential input stream for reading from a S3 object. 0350 /// 0351 /// This override avoids a HEAD request by assuming the FileInfo 0352 /// contains correct information. 0353 Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override; 0354 0355 /// Create a random access file for reading from a S3 object. 0356 /// 0357 /// See OpenInputStream for performance notes. 0358 Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile( 0359 const std::string& path) override; 0360 /// Create a random access file for reading from a S3 object. 0361 /// 0362 /// This override avoids a HEAD request by assuming the FileInfo 0363 /// contains correct information. 0364 Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile( 0365 const FileInfo& info) override; 0366 0367 /// Create a sequential output stream for writing to a S3 object. 0368 /// 0369 /// NOTE: Writes to the stream will be buffered. Depending on 0370 /// S3Options.background_writes, they can be synchronous or not. 0371 /// It is recommended to enable background_writes unless you prefer 0372 /// implementing your own background execution strategy. 0373 Result<std::shared_ptr<io::OutputStream>> OpenOutputStream( 0374 const std::string& path, 0375 const std::shared_ptr<const KeyValueMetadata>& metadata) override; 0376 0377 Result<std::shared_ptr<io::OutputStream>> OpenAppendStream( 0378 const std::string& path, 0379 const std::shared_ptr<const KeyValueMetadata>& metadata) override; 0380 0381 /// Create a S3FileSystem instance from the given options. 0382 static Result<std::shared_ptr<S3FileSystem>> Make( 0383 const S3Options& options, const io::IOContext& = io::default_io_context()); 0384 0385 protected: 0386 explicit S3FileSystem(const S3Options& options, const io::IOContext&); 0387 0388 class Impl; 0389 std::shared_ptr<Impl> impl_; 0390 }; 0391 0392 enum class S3LogLevel : int8_t { Off, Fatal, Error, Warn, Info, Debug, Trace }; 0393 0394 struct ARROW_EXPORT S3GlobalOptions { 0395 /// The log level for S3-originating messages. 0396 S3LogLevel log_level; 0397 0398 /// The number of threads to configure when creating AWS' I/O event loop 0399 /// 0400 /// Defaults to 1 as recommended by AWS' doc when the # of connections is 0401 /// expected to be, at most, in the hundreds 0402 /// 0403 /// For more details see Aws::Crt::Io::EventLoopGroup 0404 int num_event_loop_threads = 1; 0405 0406 /// Whether to install a process-wide SIGPIPE handler 0407 /// 0408 /// The AWS SDK may sometimes emit SIGPIPE signals for certain errors; 0409 /// by default, they would abort the current process. 0410 /// This option, if enabled, will install a process-wide signal handler 0411 /// that logs and otherwise ignore incoming SIGPIPE signals. 0412 /// 0413 /// This option has no effect on Windows. 0414 bool install_sigpipe_handler = false; 0415 0416 /// \brief Initialize with default options 0417 /// 0418 /// For log_level, this method first tries to extract a suitable value from the 0419 /// environment variable ARROW_S3_LOG_LEVEL. 0420 static S3GlobalOptions Defaults(); 0421 }; 0422 0423 /// \brief Initialize the S3 APIs with the specified set of options. 0424 /// 0425 /// It is required to call this function at least once before using S3FileSystem. 0426 /// 0427 /// Once this function is called you MUST call FinalizeS3 before the end of the 0428 /// application in order to avoid a segmentation fault at shutdown. 0429 ARROW_EXPORT 0430 Status InitializeS3(const S3GlobalOptions& options); 0431 0432 /// \brief Ensure the S3 APIs are initialized, but only if not already done. 0433 /// 0434 /// If necessary, this will call InitializeS3() with some default options. 0435 ARROW_EXPORT 0436 Status EnsureS3Initialized(); 0437 0438 /// Whether S3 was initialized, and not finalized. 0439 ARROW_EXPORT 0440 bool IsS3Initialized(); 0441 0442 /// Whether S3 was finalized. 0443 ARROW_EXPORT 0444 bool IsS3Finalized(); 0445 0446 /// \brief Shutdown the S3 APIs. 0447 /// 0448 /// This can wait for some S3 concurrent calls to finish so as to avoid 0449 /// race conditions. 0450 /// After this function has been called, all S3 calls will fail with an error. 0451 /// 0452 /// Calls to InitializeS3() and FinalizeS3() should be serialized by the 0453 /// application (this also applies to EnsureS3Initialized() and 0454 /// EnsureS3Finalized()). 0455 ARROW_EXPORT 0456 Status FinalizeS3(); 0457 0458 /// \brief Ensure the S3 APIs are shutdown, but only if not already done. 0459 /// 0460 /// If necessary, this will call FinalizeS3(). 0461 ARROW_EXPORT 0462 Status EnsureS3Finalized(); 0463 0464 ARROW_EXPORT 0465 Result<std::string> ResolveS3BucketRegion(const std::string& bucket); 0466 0467 } // namespace arrow::fs
| [ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
|
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
|