![]() |
|
|||
File indexing completed on 2025-08-28 08:26:58
0001 // Licensed to the Apache Software Foundation (ASF) under one 0002 // or more contributor license agreements. See the NOTICE file 0003 // distributed with this work for additional information 0004 // regarding copyright ownership. The ASF licenses this file 0005 // to you under the Apache License, Version 2.0 (the 0006 // "License"); you may not use this file except in compliance 0007 // with the License. You may obtain a copy of the License at 0008 // 0009 // http://www.apache.org/licenses/LICENSE-2.0 0010 // 0011 // Unless required by applicable law or agreed to in writing, 0012 // software distributed under the License is distributed on an 0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 0014 // KIND, either express or implied. See the License for the 0015 // specific language governing permissions and limitations 0016 // under the License. 0017 0018 #pragma once 0019 0020 #include <memory> 0021 #include <optional> 0022 #include <string> 0023 #include <vector> 0024 0025 #include "arrow/filesystem/filesystem.h" 0026 #include "arrow/util/uri.h" 0027 0028 namespace arrow { 0029 namespace fs { 0030 namespace internal { 0031 0032 // Opaque wrapper for GCS's library credentials to avoid exposing in Arrow headers. 0033 struct GcsCredentialsHolder; 0034 0035 } // namespace internal 0036 0037 class GcsFileSystem; 0038 0039 /// \brief Container for GCS Credentials and information necessary to recreate them. 0040 class ARROW_EXPORT GcsCredentials { 0041 public: 0042 bool Equals(const GcsCredentials& other) const; 0043 bool anonymous() const { return anonymous_; } 0044 const std::string& access_token() const { return access_token_; } 0045 TimePoint expiration() const { return expiration_; } 0046 const std::string& target_service_account() const { return target_service_account_; } 0047 const std::string& json_credentials() const { return json_credentials_; } 0048 const std::shared_ptr<internal::GcsCredentialsHolder>& holder() const { 0049 return holder_; 0050 } 0051 0052 private: 0053 GcsCredentials() = default; 0054 bool anonymous_ = false; 0055 std::string access_token_; 0056 TimePoint expiration_; 0057 std::string target_service_account_; 0058 std::string json_credentials_; 0059 std::shared_ptr<internal::GcsCredentialsHolder> holder_; 0060 friend class GcsFileSystem; 0061 friend struct GcsOptions; 0062 }; 0063 0064 /// Options for the GcsFileSystem implementation. 0065 struct ARROW_EXPORT GcsOptions { 0066 /// \brief Equivalent to GcsOptions::Defaults(). 0067 GcsOptions(); 0068 GcsCredentials credentials; 0069 0070 std::string endpoint_override; 0071 std::string scheme; 0072 /// \brief Location to use for creating buckets. 0073 std::string default_bucket_location; 0074 0075 /// \brief If set used to control total time allowed for retrying underlying 0076 /// errors. 0077 /// 0078 /// The default policy is to retry for up to 15 minutes. 0079 std::optional<double> retry_limit_seconds; 0080 0081 /// \brief Default metadata for OpenOutputStream. 0082 /// 0083 /// This will be ignored if non-empty metadata is passed to OpenOutputStream. 0084 std::shared_ptr<const KeyValueMetadata> default_metadata; 0085 0086 /// \brief The project to use for creating buckets. 0087 /// 0088 /// If not set, the library uses the GOOGLE_CLOUD_PROJECT environment 0089 /// variable. Most I/O operations do not need a project id, only applications 0090 /// that create new buckets need a project id. 0091 std::optional<std::string> project_id; 0092 0093 bool Equals(const GcsOptions& other) const; 0094 0095 /// \brief Initialize with Google Default Credentials 0096 /// 0097 /// Create options configured to use [Application Default Credentials][aip/4110]. The 0098 /// details of this mechanism are too involved to describe here, but suffice is to say 0099 /// that applications can override any defaults using an environment variable 0100 /// (`GOOGLE_APPLICATION_CREDENTIALS`), and that the defaults work with most Google 0101 /// Cloud Platform deployment environments (GCE, GKE, Cloud Run, etc.), and that have 0102 /// the same behavior as the `gcloud` CLI tool on your workstation. 0103 /// 0104 /// \see https://cloud.google.com/docs/authentication 0105 /// 0106 /// [aip/4110]: https://google.aip.dev/auth/4110 0107 static GcsOptions Defaults(); 0108 0109 /// \brief Initialize with anonymous credentials 0110 static GcsOptions Anonymous(); 0111 0112 /// \brief Initialize with access token 0113 /// 0114 /// These credentials are useful when using an out-of-band mechanism to fetch access 0115 /// tokens. Note that access tokens are time limited, you will need to manually refresh 0116 /// the tokens created by the out-of-band mechanism. 0117 static GcsOptions FromAccessToken(const std::string& access_token, 0118 TimePoint expiration); 0119 0120 /// \brief Initialize with service account impersonation 0121 /// 0122 /// Service account impersonation allows one principal (a user or service account) to 0123 /// impersonate a service account. It requires that the calling principal has the 0124 /// necessary permissions *on* the service account. 0125 static GcsOptions FromImpersonatedServiceAccount( 0126 const GcsCredentials& base_credentials, const std::string& target_service_account); 0127 0128 /// Creates service account credentials from a JSON object in string form. 0129 /// 0130 /// The @p json_object is expected to be in the format described by [aip/4112]. Such an 0131 /// object contains the identity of a service account, as well as a private key that can 0132 /// be used to sign tokens, showing the caller was holding the private key. 0133 /// 0134 /// In GCP one can create several "keys" for each service account, and these keys are 0135 /// downloaded as a JSON "key file". The contents of such a file are in the format 0136 /// required by this function. Remember that key files and their contents should be 0137 /// treated as any other secret with security implications, think of them as passwords 0138 /// (because they are!), don't store them or output them where unauthorized persons may 0139 /// read them. 0140 /// 0141 /// Most applications should probably use default credentials, maybe pointing them to a 0142 /// file with these contents. Using this function may be useful when the json object is 0143 /// obtained from a Cloud Secret Manager or a similar service. 0144 /// 0145 /// [aip/4112]: https://google.aip.dev/auth/4112 0146 static GcsOptions FromServiceAccountCredentials(const std::string& json_object); 0147 0148 /// Initialize from URIs such as "gs://bucket/object". 0149 static Result<GcsOptions> FromUri(const arrow::util::Uri& uri, std::string* out_path); 0150 static Result<GcsOptions> FromUri(const std::string& uri, std::string* out_path); 0151 }; 0152 0153 /// \brief GCS-backed FileSystem implementation. 0154 /// 0155 /// GCS (Google Cloud Storage - https://cloud.google.com/storage) is a scalable object 0156 /// storage system for any amount of data. The main abstractions in GCS are buckets and 0157 /// objects. A bucket is a namespace for objects, buckets can store any number of objects, 0158 /// tens of millions and even billions is not uncommon. Each object contains a single 0159 /// blob of data, up to 5TiB in size. Buckets are typically configured to keep a single 0160 /// version of each object, but versioning can be enabled. Versioning is important because 0161 /// objects are immutable, once created one cannot append data to the object or modify the 0162 /// object data in any way. 0163 /// 0164 /// GCS buckets are in a global namespace, if a Google Cloud customer creates a bucket 0165 /// named `foo` no other customer can create a bucket with the same name. Note that a 0166 /// principal (a user or service account) may only list the buckets they are entitled to, 0167 /// and then only within a project. It is not possible to list "all" the buckets. 0168 /// 0169 /// Within each bucket objects are in flat namespace. GCS does not have folders or 0170 /// directories. However, following some conventions it is possible to emulate 0171 /// directories. To this end, this class: 0172 /// 0173 /// - All buckets are treated as directories at the "root" 0174 /// - Creating a root directory results in a new bucket being created, this may be slower 0175 /// than most GCS operations. 0176 /// - The class creates marker objects for a directory, using a metadata attribute to 0177 /// annotate the file. 0178 /// - GCS can list all the objects with a given prefix, this is used to emulate listing 0179 /// of directories. 0180 /// - In object lists GCS can summarize all the objects with a common prefix as a single 0181 /// entry, this is used to emulate non-recursive lists. Note that GCS list time is 0182 /// proportional to the number of objects in the prefix. Listing recursively takes 0183 /// almost the same time as non-recursive lists. 0184 /// 0185 class ARROW_EXPORT GcsFileSystem : public FileSystem { 0186 public: 0187 ~GcsFileSystem() override = default; 0188 0189 std::string type_name() const override; 0190 const GcsOptions& options() const; 0191 0192 bool Equals(const FileSystem& other) const override; 0193 Result<std::string> PathFromUri(const std::string& uri_string) const override; 0194 0195 Result<FileInfo> GetFileInfo(const std::string& path) override; 0196 Result<FileInfoVector> GetFileInfo(const FileSelector& select) override; 0197 0198 Status CreateDir(const std::string& path, bool recursive) override; 0199 0200 Status DeleteDir(const std::string& path) override; 0201 0202 Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override; 0203 0204 /// This is not implemented in GcsFileSystem, as it would be too dangerous. 0205 Status DeleteRootDirContents() override; 0206 0207 Status DeleteFile(const std::string& path) override; 0208 0209 Status Move(const std::string& src, const std::string& dest) override; 0210 0211 Status CopyFile(const std::string& src, const std::string& dest) override; 0212 0213 Result<std::shared_ptr<io::InputStream>> OpenInputStream( 0214 const std::string& path) override; 0215 Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override; 0216 0217 Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile( 0218 const std::string& path) override; 0219 Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile( 0220 const FileInfo& info) override; 0221 0222 Result<std::shared_ptr<io::OutputStream>> OpenOutputStream( 0223 const std::string& path, 0224 const std::shared_ptr<const KeyValueMetadata>& metadata) override; 0225 0226 ARROW_DEPRECATED( 0227 "Deprecated. " 0228 "OpenAppendStream is unsupported on the GCS FileSystem.") 0229 Result<std::shared_ptr<io::OutputStream>> OpenAppendStream( 0230 const std::string& path, 0231 const std::shared_ptr<const KeyValueMetadata>& metadata) override; 0232 0233 /// Create a GcsFileSystem instance from the given options. 0234 static Result<std::shared_ptr<GcsFileSystem>> Make( 0235 const GcsOptions& options, const io::IOContext& = io::default_io_context()); 0236 0237 private: 0238 explicit GcsFileSystem(const GcsOptions& options, const io::IOContext& io_context); 0239 0240 class Impl; 0241 std::shared_ptr<Impl> impl_; 0242 }; 0243 0244 } // namespace fs 0245 } // namespace arrow
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
![]() ![]() |