Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-08-28 08:26:58

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <memory>
0021 #include <string>
0022 #include <vector>
0023 
0024 #include "arrow/filesystem/filesystem.h"
0025 #include "arrow/util/macros.h"
0026 #include "arrow/util/uri.h"
0027 
0028 namespace Azure::Core::Credentials {
0029 class TokenCredential;
0030 }
0031 
0032 namespace Azure::Storage {
0033 class StorageSharedKeyCredential;
0034 }
0035 
0036 namespace Azure::Storage::Blobs {
0037 class BlobServiceClient;
0038 }
0039 
0040 namespace Azure::Storage::Files::DataLake {
0041 class DataLakeFileSystemClient;
0042 class DataLakeServiceClient;
0043 }  // namespace Azure::Storage::Files::DataLake
0044 
0045 namespace arrow::fs {
0046 
0047 class TestAzureFileSystem;
0048 class TestAzureOptions;
0049 
0050 /// Options for the AzureFileSystem implementation.
0051 ///
0052 /// By default, authentication is handled by the Azure SDK's credential chain
0053 /// which may read from multiple environment variables, such as:
0054 /// - `AZURE_TENANT_ID`
0055 /// - `AZURE_CLIENT_ID`
0056 /// - `AZURE_CLIENT_SECRET`
0057 /// - `AZURE_AUTHORITY_HOST`
0058 /// - `AZURE_CLIENT_CERTIFICATE_PATH`
0059 /// - `AZURE_FEDERATED_TOKEN_FILE`
0060 ///
0061 /// Functions are provided for explicit configuration of credentials if that is preferred.
0062 struct ARROW_EXPORT AzureOptions {
0063   friend class TestAzureOptions;
0064 
0065   /// \brief The name of the Azure Storage Account being accessed.
0066   ///
0067   /// All service URLs will be constructed using this storage account name.
0068   /// `ConfigureAccountKeyCredential` assumes the user wants to authenticate
0069   /// this account.
0070   std::string account_name;
0071 
0072   /// \brief hostname[:port] of the Azure Blob Storage Service.
0073   ///
0074   /// If the hostname is a relative domain name (one that starts with a '.'), then storage
0075   /// account URLs will be constructed by prepending the account name to the hostname.
0076   /// If the hostname is a fully qualified domain name, then the hostname will be used
0077   /// as-is and the account name will follow the hostname in the URL path.
0078   ///
0079   /// Default: ".blob.core.windows.net"
0080   std::string blob_storage_authority = ".blob.core.windows.net";
0081 
0082   /// \brief hostname[:port] of the Azure Data Lake Storage Gen 2 Service.
0083   ///
0084   /// If the hostname is a relative domain name (one that starts with a '.'), then storage
0085   /// account URLs will be constructed by prepending the account name to the hostname.
0086   /// If the hostname is a fully qualified domain name, then the hostname will be used
0087   /// as-is and the account name will follow the hostname in the URL path.
0088   ///
0089   /// Default: ".dfs.core.windows.net"
0090   std::string dfs_storage_authority = ".dfs.core.windows.net";
0091 
0092   /// \brief Azure Blob Storage connection transport.
0093   ///
0094   /// Default: "https"
0095   std::string blob_storage_scheme = "https";
0096 
0097   /// \brief Azure Data Lake Storage Gen 2 connection transport.
0098   ///
0099   /// Default: "https"
0100   std::string dfs_storage_scheme = "https";
0101 
0102   // TODO(GH-38598): Add support for more auth methods.
0103   // std::string connection_string;
0104   // std::string sas_token;
0105 
0106   /// \brief Default metadata for OpenOutputStream.
0107   ///
0108   /// This will be ignored if non-empty metadata is passed to OpenOutputStream.
0109   std::shared_ptr<const KeyValueMetadata> default_metadata;
0110 
0111   /// Whether OutputStream writes will be issued in the background, without blocking.
       ///
       /// Default: true
0112   bool background_writes = true;
0113 
0114  private:
       // The authentication method currently selected for these options.
       // A freshly constructed AzureOptions starts as CredentialKind::kDefault.
0115   enum class CredentialKind {
0116     kDefault,
0117     kAnonymous,
0118     kStorageSharedKey,
0119     kSASToken,
0120     kClientSecret,
0121     kManagedIdentity,
0122     kCLI,
0123     kWorkloadIdentity,
0124     kEnvironment,
0125   } credential_kind_ = CredentialKind::kDefault;
0126 
       // Credential state backing the selected CredentialKind.
       // NOTE(review): presumably each member is filled in by the matching
       // Configure*Credential() call -- confirm against the implementation.
       // token_credential_ is `mutable`, so const member functions may assign it.
0127   std::shared_ptr<Azure::Storage::StorageSharedKeyCredential>
0128       storage_shared_key_credential_;
0129   std::string sas_token_;
0130   mutable std::shared_ptr<Azure::Core::Credentials::TokenCredential> token_credential_;
0131 
0132  public:
0133   AzureOptions();
0134   ~AzureOptions();
0135 
0136  private:
       // URI parsing helpers.
       // NOTE(review): presumably used by FromUri() to populate these options from the
       // scheme/hierarchical part and from the query part of the URI -- confirm.
0137   void ExtractFromUriSchemeAndHierPart(const Uri& uri, std::string* out_path);
0138   Status ExtractFromUriQuery(const Uri& uri);
0139 
0140  public:
0141   /// \brief Construct a new AzureOptions from a URI.
0142   ///
0143   /// Supported formats:
0144   ///
0145   /// 1. abfs[s]://\<account\>.blob.core.windows.net[/\<container\>[/\<path\>]]
0146   /// 2. abfs[s]://\<container\>\@\<account\>.dfs.core.windows.net[/path]
0147   /// 3. abfs[s]://[\<account\>\@]\<host[.domain]\>[\<:port\>][/\<container\>[/path]]
0148   /// 4. abfs[s]://[\<account\>\@]\<container\>[/path]
0149   ///
0150   /// (1) and (2) are compatible with the Azure Data Lake Storage Gen2 URIs
0151   /// [1], (3) is for Azure Blob Storage compatible service including Azurite,
0152   /// and (4) is a shorter version of (1) and (2).
0153   ///
0154   /// Note that there is no difference between abfs and abfss. HTTPS is
0155   /// used with abfs by default. You can force the use of HTTP by specifying
0156   /// the "enable_tls=false" query parameter.
0157   ///
0158   /// Supported query parameters:
0159   ///
0160   /// * blob_storage_authority: Set AzureOptions::blob_storage_authority
0161   /// * dfs_storage_authority: Set AzureOptions::dfs_storage_authority
0162   /// * enable_tls: If it's "false" or "0", HTTP is used instead of HTTPS.
0163   /// * credential_kind: One of "default", "anonymous", "workload_identity",
0164   ///   "environment" or "cli". If "default" is specified, it's
0165   ///   just ignored.  If "anonymous" is specified,
0166   ///   AzureOptions::ConfigureAnonymousCredential() is called. If
0167   ///   "workload_identity" is specified,
0168   ///   AzureOptions::ConfigureWorkloadIdentityCredential() is called. If
0169   ///   "environment" is specified,
0170   ///   AzureOptions::ConfigureEnvironmentCredential() is called. If "cli" is
0171   ///   specified, AzureOptions::ConfigureCLICredential() is called.
0172   /// * tenant_id: You must specify "client_id" and "client_secret"
0173   ///   too. AzureOptions::ConfigureClientSecretCredential() is called.
0174   /// * client_id: If you don't specify "tenant_id" and
0175   ///   "client_secret",
0176   ///   AzureOptions::ConfigureManagedIdentityCredential() is
0177   ///   called. If you specify "tenant_id" and "client_secret" too,
0178   ///   AzureOptions::ConfigureClientSecretCredential() is called.
0179   /// * client_secret: You must specify "tenant_id" and "client_id"
0180   ///   too. AzureOptions::ConfigureClientSecretCredential() is called.
0181   /// * A SAS token is made up of several query parameters. Appending a SAS
0182   ///   token to the URI configures SAS token auth by calling
0183   ///   AzureOptions::ConfigureSASCredential().
0184   ///
0185   /// [1]:
0186   /// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction-abfs-uri
0187   static Result<AzureOptions> FromUri(const Uri& uri, std::string* out_path);
0188   static Result<AzureOptions> FromUri(const std::string& uri, std::string* out_path);
0189 
       // Explicit credential configuration, one function per authentication method.
       // Each call reports success or failure through the returned Status.
       // NOTE(review): presumably a call replaces any previously configured
       // credential -- confirm against the implementation.
0190   Status ConfigureDefaultCredential();
0191   Status ConfigureAnonymousCredential();
0192   Status ConfigureAccountKeyCredential(const std::string& account_key);
0193   Status ConfigureSASCredential(const std::string& sas_token);
0194   Status ConfigureClientSecretCredential(const std::string& tenant_id,
0195                                          const std::string& client_id,
0196                                          const std::string& client_secret);
       // `client_id` is optional and defaults to an empty string.
0197   Status ConfigureManagedIdentityCredential(const std::string& client_id = std::string());
0198   Status ConfigureCLICredential();
0199   Status ConfigureWorkloadIdentityCredential();
0200   Status ConfigureEnvironmentCredential();
0201 
       // Compare these options with `other`.
       // NOTE(review): which fields take part in the comparison is not visible in this
       // header -- check the implementation.
0202   bool Equals(const AzureOptions& other) const;
0203 
       // Return the Blob / Data Lake service URL for `account_name`.
       // NOTE(review): presumably built from the *_storage_scheme and *_storage_authority
       // members as described in their documentation -- confirm.
0204   std::string AccountBlobUrl(const std::string& account_name) const;
0205   std::string AccountDfsUrl(const std::string& account_name) const;
0206 
       // Create Azure SDK service clients; the Result carries an error on failure.
       // NOTE(review): presumably the clients authenticate with the configured
       // credential -- confirm against the implementation.
0207   Result<std::unique_ptr<Azure::Storage::Blobs::BlobServiceClient>>
0208   MakeBlobServiceClient() const;
0209 
0210   Result<std::unique_ptr<Azure::Storage::Files::DataLake::DataLakeServiceClient>>
0211   MakeDataLakeServiceClient() const;
0212 };
0213 
0214 /// \brief FileSystem implementation backed by Azure Blob Storage (ABS) [1] and
0215 /// Azure Data Lake Storage Gen2 (ADLS Gen2) [2].
0216 ///
0217 /// ADLS Gen2 isn't a dedicated service or account type. It's a set of capabilities that
0218 /// support high throughput analytic workloads, built on Azure Blob Storage. All the data
0219 /// ingested via the ADLS Gen2 APIs is persisted as blobs in the storage account.
0220 /// ADLS Gen2 provides filesystem semantics, file-level security, and Hadoop
0221 /// compatibility. ADLS Gen1 exists as a separate object that will be retired on
0222 /// 2024-02-29 and new ADLS accounts use Gen2 instead.
0223 ///
0224 /// ADLS Gen2 and Blob APIs can operate on the same data, but there are
0225 /// some limitations [3]. The ones that are relevant to this
0226 /// implementation are listed here:
0227 ///
0228 /// - You can't use Blob APIs, and ADLS APIs to write to the same instance of a file. If
0229 ///   you write to a file by using ADLS APIs then that file's blocks won't be visible
0230 ///   to calls to the GetBlockList Blob API. The only exception is when you're
0231 ///   overwriting.
0232 /// - When you use the ListBlobs operation without specifying a delimiter, the results
0233 ///   include both directories and blobs. If you choose to use a delimiter, use only a
0234 ///   forward slash (/) \--- the only supported delimiter.
0235 /// - If you use the DeleteBlob API to delete a directory, that directory is deleted only
0236 ///   if it's empty. This means that you can't use the Blob API to delete
0237 ///   directories recursively.
0238 ///
0239 /// [1]: https://azure.microsoft.com/en-us/products/storage/blobs
0240 /// [2]: https://azure.microsoft.com/en-us/products/storage/data-lake-storage
0241 /// [3]:
0242 /// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-known-issues
0243 class ARROW_EXPORT AzureFileSystem : public FileSystem {
0244  private:
       // The implementation lives in Impl, which is only declared in this header.
0245   class Impl;
0246   std::unique_ptr<Impl> impl_;
0247 
       // The constructor is private; Make() is the public way to obtain an instance.
0248   explicit AzureFileSystem(std::unique_ptr<Impl>&& impl);
0249 
       // Private hook reachable only from the TestAzureFileSystem friend class.
       // NOTE(review): the meaning of the `hns_support` integer is not visible in this
       // header -- check the implementation before relying on specific values.
0250   friend class TestAzureFileSystem;
0251   void ForceCachedHierarchicalNamespaceSupport(int hns_support);
0252 
0253  public:
0254   ~AzureFileSystem() override = default;
0255 
       /// \brief Create an AzureFileSystem from the given options.
       ///
       /// The IOContext argument defaults to io::default_io_context().
0256   static Result<std::shared_ptr<AzureFileSystem>> Make(
0257       const AzureOptions& options, const io::IOContext& = io::default_io_context());
0258 
0259   std::string type_name() const override { return "abfs"; }
0260 
0261   /// Return the Azure options that were used to construct the filesystem
0262   const AzureOptions& options() const;
0263 
0264   bool Equals(const FileSystem& other) const override;
0265 
0266   /// \cond FALSE
0267   using FileSystem::CreateDir;
0268   using FileSystem::DeleteDirContents;
0269   using FileSystem::GetFileInfo;
0270   using FileSystem::OpenAppendStream;
0271   using FileSystem::OpenOutputStream;
0272   /// \endcond
0273 
0274   Result<FileInfo> GetFileInfo(const std::string& path) override;
0275 
0276   Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;
0277 
0278   Status CreateDir(const std::string& path, bool recursive) override;
0279 
0280   /// \brief Delete a directory and its contents recursively.
0281   ///
0282   /// Atomicity is guaranteed only on Hierarchical Namespace Storage accounts.
0283   Status DeleteDir(const std::string& path) override;
0284 
0285   /// \brief Non-atomically deletes the contents of a directory.
0286   ///
0287   /// This function can return a bad Status after only partially deleting the
0288   /// contents of the directory.
0289   Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override;
0290 
0291   /// \brief Deletion of all the containers in the storage account (not
0292   /// implemented for safety reasons).
0293   ///
0294   /// \return Status::NotImplemented
0295   Status DeleteRootDirContents() override;
0296 
0297   /// \brief Deletes a file.
0298   ///
0299   /// Supported on both flat namespace and Hierarchical Namespace storage
0300   /// accounts. A check is made to guarantee the parent directory doesn't
0301   /// disappear after the blob is deleted and while this operation is running,
0302   /// no other client can delete the parent directory due to the use of leases.
0303   ///
0304   /// This means applications can safely retry this operation without coordination to
0305   /// guarantee only one client/process is trying to delete the same file.
0306   Status DeleteFile(const std::string& path) override;
0307 
0308   /// \brief Move/rename a file or directory.
0309   ///
0310   /// There are no files immediately at the root directory, so paths like
0311   /// "/segment" always refer to a container of the storage account and are
0312   /// treated as directories.
0313   ///
0314   /// If `dest` exists but the operation fails for some reason, `Move`
0315   /// guarantees `dest` is not lost.
0316   ///
0317   /// Conditions for a successful move:
0318   ///
0319   /// 1. `src` must exist.
0320   /// 2. `dest` can't contain a strict path prefix of `src`. More generally,
0321   ///    a directory can't be made a subdirectory of itself.
0322   /// 3. If `dest` already exists and it's a file, `src` must also be a file.
0323   ///    `dest` is then replaced by `src`.
0324   /// 4. All components of `dest` must exist, except for the last.
0325   /// 5. If `dest` already exists and it's a directory, `src` must also be a
0326   ///    directory and `dest` must be empty. `dest` is then replaced by `src`
0327   ///    and its contents.
0328   ///
0329   /// Leases are used to guarantee the pre-condition checks and the rename
0330   /// operation are atomic: other clients can't invalidate the pre-condition in
0331   /// the time between the checks and the actual rename operation.
0332   ///
0333   /// This is possible because Move() is only supported on storage accounts with
0334   /// Hierarchical Namespace Support enabled.
0335   ///
0336   /// ## Limitations
0337   ///
0338   /// - Moves are not supported on storage accounts without
0339   ///   Hierarchical Namespace support enabled
0340   /// - Moves across different containers are not supported
0341   /// - Moving a path of the form `/container` is not supported as it would
0342   ///   require moving all the files in a container to another container.
0343   ///   The only exception is a `Move("/container_a", "/container_b")` where
0344   ///   both containers are empty or `container_b` doesn't even exist.
0345   ///   The atomicity of the emptiness checks followed by the renaming operation
0346   ///   is guaranteed by the use of leases.
0347   Status Move(const std::string& src, const std::string& dest) override;
0348 
0349   Status CopyFile(const std::string& src, const std::string& dest) override;
0350 
0351   Result<std::shared_ptr<io::InputStream>> OpenInputStream(
0352       const std::string& path) override;
0353 
0354   Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;
0355 
0356   Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
0357       const std::string& path) override;
0358 
0359   Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
0360       const FileInfo& info) override;
0361 
0362   Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
0363       const std::string& path,
0364       const std::shared_ptr<const KeyValueMetadata>& metadata) override;
0365 
0366   Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
0367       const std::string& path,
0368       const std::shared_ptr<const KeyValueMetadata>& metadata) override;
0369 
0370   Result<std::string> PathFromUri(const std::string& uri_string) const override;
0371 };
0372 
0373 }  // namespace arrow::fs