Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-08-28 08:26:58

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <memory>
0021 #include <optional>
0022 #include <string>
0023 #include <vector>
0024 
0025 #include "arrow/filesystem/filesystem.h"
0026 #include "arrow/util/uri.h"
0027 
0028 namespace arrow {
0029 namespace fs {
0030 namespace internal {
0031 
0032 // Opaque wrapper for the GCS library's credentials, so that the GCS client types are
     // not exposed in Arrow headers. It is only forward-declared here: this header uses it
     // solely through std::shared_ptr (see GcsCredentials::holder()), which does not need
     // the complete type.
0033 struct GcsCredentialsHolder;
0034 
0035 }  // namespace internal
0036 
0037 class GcsFileSystem;  // Forward declaration; GcsCredentials names this class as a friend.
0038 
0039 /// \brief Container for GCS Credentials and information necessary to recreate them.
     ///
     /// The default constructor is private, so only the friends declared at the end of the
     /// class (GcsFileSystem and GcsOptions) can default-construct an instance or touch the
     /// fields directly. All other code reads the fields through the const accessors below.
     /// Accessors that return a reference refer to a member of this object, so the
     /// reference is only valid while the object is alive.
0040 class ARROW_EXPORT GcsCredentials {
0041  public:
       /// Compare against another credentials object. The comparison is defined out of
       /// line, so this header does not show which fields take part in it.
0042   bool Equals(const GcsCredentials& other) const;
       /// Value of the anonymous flag; false in a default-constructed object.
0043   bool anonymous() const { return anonymous_; }
       /// Stored access token; empty in a default-constructed object.
       /// NOTE(review): presumably filled in by GcsOptions::FromAccessToken() -- confirm
       /// in the implementation file.
0044   const std::string& access_token() const { return access_token_; }
       /// Stored expiration time point (returned by value).
       /// NOTE(review): presumably the expiration passed to GcsOptions::FromAccessToken()
       /// -- confirm in the implementation file.
0045   TimePoint expiration() const { return expiration_; }
       /// Stored target service account; empty in a default-constructed object.
       /// NOTE(review): presumably filled in by
       /// GcsOptions::FromImpersonatedServiceAccount() -- confirm.
0046   const std::string& target_service_account() const { return target_service_account_; }
       /// Stored JSON credentials string; empty in a default-constructed object.
       /// NOTE(review): presumably filled in by
       /// GcsOptions::FromServiceAccountCredentials() -- confirm.
0047   const std::string& json_credentials() const { return json_credentials_; }
       /// Shared pointer to the opaque internal credentials wrapper. It is null in a
       /// default-constructed object; the pointee type is only forward-declared in this
       /// header.
0048   const std::shared_ptr<internal::GcsCredentialsHolder>& holder() const {
0049     return holder_;
0050   }
0051 
0052  private:
       // Private on purpose: default construction is reserved for the friends listed below.
0053   GcsCredentials() = default;
       // Backing fields for the accessors above, in the same order.
0054   bool anonymous_ = false;
0055   std::string access_token_;
0056   TimePoint expiration_;
0057   std::string target_service_account_;
0058   std::string json_credentials_;
0059   std::shared_ptr<internal::GcsCredentialsHolder> holder_;
       // The friends are the only code allowed to construct and populate credentials.
0060   friend class GcsFileSystem;
0061   friend struct GcsOptions;
0062 };
0063 
0064 /// Options for the GcsFileSystem implementation.
0065 struct ARROW_EXPORT GcsOptions {
0066   /// \brief Equivalent to GcsOptions::Defaults().
0067   GcsOptions();
       /// \brief Credentials carried by these options (see GcsCredentials).
       ///
       /// GcsOptions is a friend of GcsCredentials, so the static factories below are able
       /// to construct and populate this member.
0068   GcsCredentials credentials;
0069 
       /// NOTE(review): not documented in this header; presumably an alternative service
       /// endpoint used instead of the default one when non-empty -- confirm.
0070   std::string endpoint_override;
       /// NOTE(review): not documented in this header; presumably the URI scheme used to
       /// reach the endpoint -- confirm.
0071   std::string scheme;
0072   /// \brief Location to use for creating buckets.
0073   std::string default_bucket_location;
0074 
0075   /// \brief If set, used to control the total time allowed for retrying underlying
0076   /// errors.
0077   ///
0078   /// The default policy is to retry for up to 15 minutes.
0079   std::optional<double> retry_limit_seconds;
0080 
0081   /// \brief Default metadata for OpenOutputStream.
0082   ///
0083   /// This will be ignored if non-empty metadata is passed to OpenOutputStream.
0084   std::shared_ptr<const KeyValueMetadata> default_metadata;
0085 
0086   /// \brief The project to use for creating buckets.
0087   ///
0088   /// If not set, the library uses the GOOGLE_CLOUD_PROJECT environment
0089   /// variable. Most I/O operations do not need a project id, only applications
0090   /// that create new buckets need a project id.
0091   std::optional<std::string> project_id;
0092 
       /// \brief Compare against another set of options.
       ///
       /// Defined out of line; this header does not show which fields are compared.
0093   bool Equals(const GcsOptions& other) const;
0094 
0095   /// \brief Initialize with Google Default Credentials
0096   ///
0097   /// Create options configured to use [Application Default Credentials][aip/4110]. The
0098   /// details of this mechanism are too involved to describe here, but suffice it to say
0099   /// that applications can override any defaults using an environment variable
0100   /// (`GOOGLE_APPLICATION_CREDENTIALS`), and that the defaults work with most Google
0101   /// Cloud Platform deployment environments (GCE, GKE, Cloud Run, etc.), and that they
0102   /// have the same behavior as the `gcloud` CLI tool on your workstation.
0103   ///
0104   /// \see https://cloud.google.com/docs/authentication
0105   ///
0106   /// [aip/4110]: https://google.aip.dev/auth/4110
0107   static GcsOptions Defaults();
0108 
0109   /// \brief Initialize with anonymous credentials
0110   static GcsOptions Anonymous();
0111 
0112   /// \brief Initialize with access token
0113   ///
0114   /// These credentials are useful when using an out-of-band mechanism to fetch access
0115   /// tokens. Note that access tokens are time limited, you will need to manually refresh
0116   /// the tokens created by the out-of-band mechanism.
0117   static GcsOptions FromAccessToken(const std::string& access_token,
0118                                     TimePoint expiration);
0119 
0120   /// \brief Initialize with service account impersonation
0121   ///
0122   /// Service account impersonation allows one principal (a user or service account) to
0123   /// impersonate a service account. It requires that the calling principal has the
0124   /// necessary permissions *on* the service account.
0125   static GcsOptions FromImpersonatedServiceAccount(
0126       const GcsCredentials& base_credentials, const std::string& target_service_account);
0127 
0128   /// Creates service account credentials from a JSON object in string form.
0129   ///
0130   /// The @p json_object is expected to be in the format described by [aip/4112]. Such an
0131   /// object contains the identity of a service account, as well as a private key that can
0132   /// be used to sign tokens, showing the caller was holding the private key.
0133   ///
0134   /// In GCP one can create several "keys" for each service account, and these keys are
0135   /// downloaded as a JSON "key file". The contents of such a file are in the format
0136   /// required by this function. Remember that key files and their contents should be
0137   /// treated as any other secret with security implications, think of them as passwords
0138   /// (because they are!), don't store them or output them where unauthorized persons may
0139   /// read them.
0140   ///
0141   /// Most applications should probably use default credentials, maybe pointing them to a
0142   /// file with these contents. Using this function may be useful when the json object is
0143   /// obtained from a Cloud Secret Manager or a similar service.
0144   ///
0145   /// [aip/4112]: https://google.aip.dev/auth/4112
0146   static GcsOptions FromServiceAccountCredentials(const std::string& json_object);
0147 
0148   /// Initialize from URIs such as "gs://bucket/object".
       ///
       /// Two overloads are provided: one takes an already-parsed arrow::util::Uri, the
       /// other takes the URI as a string. Both report failure through the returned Result.
       /// NOTE(review): out_path presumably receives the path portion of the URI; whether
       /// a null out_path is accepted is not visible in this header -- confirm against the
       /// implementation.
0149   static Result<GcsOptions> FromUri(const arrow::util::Uri& uri, std::string* out_path);
0150   static Result<GcsOptions> FromUri(const std::string& uri, std::string* out_path);
0151 };
0152 
0153 /// \brief GCS-backed FileSystem implementation.
0154 ///
0155 /// GCS (Google Cloud Storage - https://cloud.google.com/storage) is a scalable object
0156 /// storage system for any amount of data. The main abstractions in GCS are buckets and
0157 /// objects. A bucket is a namespace for objects, buckets can store any number of objects,
0158 /// tens of millions and even billions is not uncommon.  Each object contains a single
0159 /// blob of data, up to 5TiB in size.  Buckets are typically configured to keep a single
0160 /// version of each object, but versioning can be enabled. Versioning is important because
0161 /// objects are immutable, once created one cannot append data to the object or modify the
0162 /// object data in any way.
0163 ///
0164 /// GCS buckets are in a global namespace, if a Google Cloud customer creates a bucket
0165 /// named `foo` no other customer can create a bucket with the same name. Note that a
0166 /// principal (a user or service account) may only list the buckets they are entitled to,
0167 /// and then only within a project. It is not possible to list "all" the buckets.
0168 ///
0169 /// Within each bucket, objects are in a flat namespace. GCS does not have folders or
0170 /// directories. However, following some conventions it is possible to emulate
0171 /// directories. To this end, this class follows these conventions:
0172 ///
0173 /// - All buckets are treated as directories at the "root"
0174 /// - Creating a root directory results in a new bucket being created, this may be slower
0175 ///   than most GCS operations.
0176 /// - The class creates marker objects for a directory, using a metadata attribute to
0177 ///   annotate the file.
0178 /// - GCS can list all the objects with a given prefix, this is used to emulate listing
0179 ///   of directories.
0180 /// - In object lists GCS can summarize all the objects with a common prefix as a single
0181 ///   entry, this is used to emulate non-recursive lists. Note that GCS list time is
0182 ///   proportional to the number of objects in the prefix. Listing recursively takes
0183 ///   almost the same time as non-recursive lists.
0184 ///
     /// Instances are created through the static Make() factory; the constructor is
     /// private.
0185 class ARROW_EXPORT GcsFileSystem : public FileSystem {
0186  public:
0187   ~GcsFileSystem() override = default;
0188 
       // Unless noted otherwise, the member functions below are marked `override`, i.e.
       // they implement virtual members of the FileSystem base class.
0189   std::string type_name() const override;
       /// Options held by this filesystem (not an override).
       /// NOTE(review): presumably the options passed to Make() -- confirm.
0190   const GcsOptions& options() const;
0191 
0192   bool Equals(const FileSystem& other) const override;
0193   Result<std::string> PathFromUri(const std::string& uri_string) const override;
0194 
0195   Result<FileInfo> GetFileInfo(const std::string& path) override;
0196   Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;
0197 
0198   Status CreateDir(const std::string& path, bool recursive) override;
0199 
0200   Status DeleteDir(const std::string& path) override;
0201 
       /// \p missing_dir_ok defaults to false in this declaration.
0202   Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override;
0203 
0204   /// This is not implemented in GcsFileSystem, as it would be too dangerous.
0205   Status DeleteRootDirContents() override;
0206 
0207   Status DeleteFile(const std::string& path) override;
0208 
0209   Status Move(const std::string& src, const std::string& dest) override;
0210 
0211   Status CopyFile(const std::string& src, const std::string& dest) override;
0212 
       // The stream/file openers come in pairs: one overload takes a path string, the
       // other a FileInfo.
0213   Result<std::shared_ptr<io::InputStream>> OpenInputStream(
0214       const std::string& path) override;
0215   Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;
0216 
0217   Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
0218       const std::string& path) override;
0219   Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
0220       const FileInfo& info) override;
0221 
0222   Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
0223       const std::string& path,
0224       const std::shared_ptr<const KeyValueMetadata>& metadata) override;
0225 
       /// Marked deprecated: appending is unsupported on the GCS FileSystem (see the
       /// deprecation message below).
0226   ARROW_DEPRECATED(
0227       "Deprecated. "
0228       "OpenAppendStream is unsupported on the GCS FileSystem.")
0229   Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
0230       const std::string& path,
0231       const std::shared_ptr<const KeyValueMetadata>& metadata) override;
0232 
0233   /// Create a GcsFileSystem instance from the given options.
       ///
       /// The second (unnamed) parameter is the IOContext; when omitted it defaults to
       /// io::default_io_context(). Failure is reported through the returned Result.
0234   static Result<std::shared_ptr<GcsFileSystem>> Make(
0235       const GcsOptions& options, const io::IOContext& = io::default_io_context());
0236 
0237  private:
       // Private constructor: Make() is the only public way to create an instance that is
       // visible in this header.
0238   explicit GcsFileSystem(const GcsOptions& options, const io::IOContext& io_context);
0239 
       // Implementation class, declared here and defined elsewhere; held via shared_ptr.
0240   class Impl;
0241   std::shared_ptr<Impl> impl_;
0242 };
0243 
0244 }  // namespace fs
0245 }  // namespace arrow