![]() |
|
|||
File indexing completed on 2025-08-28 08:26:58
0001 // Licensed to the Apache Software Foundation (ASF) under one 0002 // or more contributor license agreements. See the NOTICE file 0003 // distributed with this work for additional information 0004 // regarding copyright ownership. The ASF licenses this file 0005 // to you under the Apache License, Version 2.0 (the 0006 // "License"); you may not use this file except in compliance 0007 // with the License. You may obtain a copy of the License at 0008 // 0009 // http://www.apache.org/licenses/LICENSE-2.0 0010 // 0011 // Unless required by applicable law or agreed to in writing, 0012 // software distributed under the License is distributed on an 0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 0014 // KIND, either express or implied. See the License for the 0015 // specific language governing permissions and limitations 0016 // under the License. 0017 0018 #pragma once 0019 0020 #include <memory> 0021 #include <string> 0022 #include <vector> 0023 0024 #include "arrow/filesystem/filesystem.h" 0025 #include "arrow/util/macros.h" 0026 #include "arrow/util/uri.h" 0027 0028 namespace Azure::Core::Credentials { 0029 class TokenCredential; 0030 } 0031 0032 namespace Azure::Storage { 0033 class StorageSharedKeyCredential; 0034 } 0035 0036 namespace Azure::Storage::Blobs { 0037 class BlobServiceClient; 0038 } 0039 0040 namespace Azure::Storage::Files::DataLake { 0041 class DataLakeFileSystemClient; 0042 class DataLakeServiceClient; 0043 } // namespace Azure::Storage::Files::DataLake 0044 0045 namespace arrow::fs { 0046 0047 class TestAzureFileSystem; 0048 class TestAzureOptions; 0049 0050 /// Options for the AzureFileSystem implementation. 
0051 /// 0052 /// By default, authentication is handled by the Azure SDK's credential chain 0053 /// which may read from multiple environment variables, such as: 0054 /// - `AZURE_TENANT_ID` 0055 /// - `AZURE_CLIENT_ID` 0056 /// - `AZURE_CLIENT_SECRET` 0057 /// - `AZURE_AUTHORITY_HOST` 0058 /// - `AZURE_CLIENT_CERTIFICATE_PATH` 0059 /// - `AZURE_FEDERATED_TOKEN_FILE` 0060 /// 0061 /// Functions are provided for explicit configuration of credentials if that is preferred. 0062 struct ARROW_EXPORT AzureOptions { 0063 friend class TestAzureOptions; 0064 0065 /// \brief The name of the Azure Storage Account being accessed. 0066 /// 0067 /// All service URLs will be constructed using this storage account name. 0068 /// `ConfigureAccountKeyCredential` assumes the user wants to authenticate 0069 /// this account. 0070 std::string account_name; 0071 0072 /// \brief hostname[:port] of the Azure Blob Storage Service. 0073 /// 0074 /// If the hostname is a relative domain name (one that starts with a '.'), then storage 0075 /// account URLs will be constructed by prepending the account name to the hostname. 0076 /// If the hostname is a fully qualified domain name, then the hostname will be used 0077 /// as-is and the account name will follow the hostname in the URL path. 0078 /// 0079 /// Default: ".blob.core.windows.net" 0080 std::string blob_storage_authority = ".blob.core.windows.net"; 0081 0082 /// \brief hostname[:port] of the Azure Data Lake Storage Gen 2 Service. 0083 /// 0084 /// If the hostname is a relative domain name (one that starts with a '.'), then storage 0085 /// account URLs will be constructed by prepending the account name to the hostname. 0086 /// If the hostname is a fully qualified domain name, then the hostname will be used 0087 /// as-is and the account name will follow the hostname in the URL path. 
0088 /// 0089 /// Default: ".dfs.core.windows.net" 0090 std::string dfs_storage_authority = ".dfs.core.windows.net"; 0091 0092 /// \brief Azure Blob Storage connection transport. 0093 /// 0094 /// Default: "https" 0095 std::string blob_storage_scheme = "https"; 0096 0097 /// \brief Azure Data Lake Storage Gen 2 connection transport. 0098 /// 0099 /// Default: "https" 0100 std::string dfs_storage_scheme = "https"; 0101 0102 // TODO(GH-38598): Add support for more auth methods. 0103 // std::string connection_string; 0104 // std::string sas_token; 0105 0106 /// \brief Default metadata for OpenOutputStream. 0107 /// 0108 /// This will be ignored if non-empty metadata is passed to OpenOutputStream. 0109 std::shared_ptr<const KeyValueMetadata> default_metadata; 0110 0111 /// Whether OutputStream writes will be issued in the background, without blocking. 0112 bool background_writes = true; 0113 0114 private: 0115 enum class CredentialKind { 0116 kDefault, 0117 kAnonymous, 0118 kStorageSharedKey, 0119 kSASToken, 0120 kClientSecret, 0121 kManagedIdentity, 0122 kCLI, 0123 kWorkloadIdentity, 0124 kEnvironment, 0125 } credential_kind_ = CredentialKind::kDefault; 0126 0127 std::shared_ptr<Azure::Storage::StorageSharedKeyCredential> 0128 storage_shared_key_credential_; 0129 std::string sas_token_; 0130 mutable std::shared_ptr<Azure::Core::Credentials::TokenCredential> token_credential_; 0131 0132 public: 0133 AzureOptions(); 0134 ~AzureOptions(); 0135 0136 private: 0137 void ExtractFromUriSchemeAndHierPart(const Uri& uri, std::string* out_path); 0138 Status ExtractFromUriQuery(const Uri& uri); 0139 0140 public: 0141 /// \brief Construct a new AzureOptions from a URI. 0142 /// 0143 /// Supported formats: 0144 /// 0145 /// 1. abfs[s]://\<account\>.blob.core.windows.net[/\<container\>[/\<path\>]] 0146 /// 2. abfs[s]://\<container\>\@\<account\>.dfs.core.windows.net[/path] 0147 /// 3. abfs[s]://[\<account\>\@]\<host[.domain]\>[\<:port\>][/\<container\>[/path]] 0148 /// 4. 
abfs[s]://[\<account\>\@]\<container\>[/path] 0149 /// 0150 /// (1) and (2) are compatible with the Azure Data Lake Storage Gen2 URIs 0151 /// [1], (3) is for Azure Blob Storage compatible service including Azurite, 0152 /// and (4) is a shorter version of (1) and (2). 0153 /// 0154 /// Note that there is no difference between abfs and abfss. HTTPS is 0155 /// used with abfs by default. You can force the use of HTTP by specifying 0156 /// the "enable_tls=false" query parameter. 0157 /// 0158 /// Supported query parameters: 0159 /// 0160 /// * blob_storage_authority: Set AzureOptions::blob_storage_authority 0161 /// * dfs_storage_authority: Set AzureOptions::dfs_storage_authority 0162 /// * enable_tls: If it's "false" or "0", HTTP not HTTPS is used. 0163 /// * credential_kind: One of "default", "anonymous", "workload_identity", 0164 /// "environment" or "cli". If "default" is specified, it's 0165 /// just ignored. If "anonymous" is specified, 0166 /// AzureOptions::ConfigureAnonymousCredential() is called. If 0167 /// "workload_identity" is specified, 0168 /// AzureOptions::ConfigureWorkloadIdentityCredential() is called. If 0169 /// "environment" is specified, 0170 /// AzureOptions::ConfigureEnvironmentCredential() is called. If "cli" is 0171 /// specified, AzureOptions::ConfigureCLICredential() is called. 0172 /// * tenant_id: You must specify "client_id" and "client_secret" 0173 /// too. AzureOptions::ConfigureClientSecretCredential() is called. 0174 /// * client_id: If you don't specify "tenant_id" and 0175 /// "client_secret", 0176 /// AzureOptions::ConfigureManagedIdentityCredential() is 0177 /// called. If you specify "tenant_id" and "client_secret" too, 0178 /// AzureOptions::ConfigureClientSecretCredential() is called. 0179 /// * client_secret: You must specify "tenant_id" and "client_id" 0180 /// too. AzureOptions::ConfigureClientSecretCredential() is called. 0181 /// * A SAS token is made up of several query parameters. 
Appending a SAS 0182 /// token to the URI configures SAS token auth by calling 0183 /// AzureOptions::ConfigureSASCredential(). 0184 /// 0185 /// [1]: 0186 /// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction-abfs-uri 0187 static Result<AzureOptions> FromUri(const Uri& uri, std::string* out_path); 0188 static Result<AzureOptions> FromUri(const std::string& uri, std::string* out_path); 0189 0190 Status ConfigureDefaultCredential(); 0191 Status ConfigureAnonymousCredential(); 0192 Status ConfigureAccountKeyCredential(const std::string& account_key); 0193 Status ConfigureSASCredential(const std::string& sas_token); 0194 Status ConfigureClientSecretCredential(const std::string& tenant_id, 0195 const std::string& client_id, 0196 const std::string& client_secret); 0197 Status ConfigureManagedIdentityCredential(const std::string& client_id = std::string()); 0198 Status ConfigureCLICredential(); 0199 Status ConfigureWorkloadIdentityCredential(); 0200 Status ConfigureEnvironmentCredential(); 0201 0202 bool Equals(const AzureOptions& other) const; 0203 0204 std::string AccountBlobUrl(const std::string& account_name) const; 0205 std::string AccountDfsUrl(const std::string& account_name) const; 0206 0207 Result<std::unique_ptr<Azure::Storage::Blobs::BlobServiceClient>> 0208 MakeBlobServiceClient() const; 0209 0210 Result<std::unique_ptr<Azure::Storage::Files::DataLake::DataLakeServiceClient>> 0211 MakeDataLakeServiceClient() const; 0212 }; 0213 0214 /// \brief FileSystem implementation backed by Azure Blob Storage (ABS) [1] and 0215 /// Azure Data Lake Storage Gen2 (ADLS Gen2) [2]. 0216 /// 0217 /// ADLS Gen2 isn't a dedicated service or account type. It's a set of capabilities that 0218 /// support high throughput analytic workloads, built on Azure Blob Storage. All the data 0219 /// ingested via the ADLS Gen2 APIs is persisted as blobs in the storage account. 
0220 /// ADLS Gen2 provides filesystem semantics, file-level security, and Hadoop 0221 /// compatibility. ADLS Gen1 exists as a separate object that will be retired on 2024-02-29 0222 /// and new ADLS accounts use Gen2 instead. 0223 /// 0224 /// ADLS Gen2 and Blob APIs can operate on the same data, but there are 0225 /// some limitations [3]. The ones that are relevant to this 0226 /// implementation are listed here: 0227 /// 0228 /// - You can't use Blob APIs, and ADLS APIs to write to the same instance of a file. If 0229 /// you write to a file by using ADLS APIs then that file's blocks won't be visible 0230 /// to calls to the GetBlockList Blob API. The only exception is when you're 0231 /// overwriting. 0232 /// - When you use the ListBlobs operation without specifying a delimiter, the results 0233 /// include both directories and blobs. If you choose to use a delimiter, use only a 0234 /// forward slash (/) \--- the only supported delimiter. 0235 /// - If you use the DeleteBlob API to delete a directory, that directory is deleted only 0236 /// if it's empty. This means that you can't use the Blob API to delete directories 0237 /// recursively. 
0238 /// 0239 /// [1]: https://azure.microsoft.com/en-us/products/storage/blobs 0240 /// [2]: https://azure.microsoft.com/en-us/products/storage/data-lake-storage 0241 /// [3]: 0242 /// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-known-issues 0243 class ARROW_EXPORT AzureFileSystem : public FileSystem { 0244 private: 0245 class Impl; 0246 std::unique_ptr<Impl> impl_; 0247 0248 explicit AzureFileSystem(std::unique_ptr<Impl>&& impl); 0249 0250 friend class TestAzureFileSystem; 0251 void ForceCachedHierarchicalNamespaceSupport(int hns_support); 0252 0253 public: 0254 ~AzureFileSystem() override = default; 0255 0256 static Result<std::shared_ptr<AzureFileSystem>> Make( 0257 const AzureOptions& options, const io::IOContext& = io::default_io_context()); 0258 0259 std::string type_name() const override { return "abfs"; } 0260 0261 /// Return the original Azure options when constructing the filesystem 0262 const AzureOptions& options() const; 0263 0264 bool Equals(const FileSystem& other) const override; 0265 0266 /// \cond FALSE 0267 using FileSystem::CreateDir; 0268 using FileSystem::DeleteDirContents; 0269 using FileSystem::GetFileInfo; 0270 using FileSystem::OpenAppendStream; 0271 using FileSystem::OpenOutputStream; 0272 /// \endcond 0273 0274 Result<FileInfo> GetFileInfo(const std::string& path) override; 0275 0276 Result<FileInfoVector> GetFileInfo(const FileSelector& select) override; 0277 0278 Status CreateDir(const std::string& path, bool recursive) override; 0279 0280 /// \brief Delete a directory and its contents recursively. 0281 /// 0282 /// Atomicity is guaranteed only on Hierarchical Namespace Storage accounts. 0283 Status DeleteDir(const std::string& path) override; 0284 0285 /// \brief Non-atomically deletes the contents of a directory. 0286 /// 0287 /// This function can return a bad Status after only partially deleting the 0288 /// contents of the directory. 
0289 Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override; 0290 0291 /// \brief Deletion of all the containers in the storage account (not 0292 /// implemented for safety reasons). 0293 /// 0294 /// \return Status::NotImplemented 0295 Status DeleteRootDirContents() override; 0296 0297 /// \brief Deletes a file. 0298 /// 0299 /// Supported on both flat namespace and Hierarchical Namespace storage 0300 /// accounts. A check is made to guarantee the parent directory doesn't 0301 /// disappear after the blob is deleted and while this operation is running, 0302 /// no other client can delete the parent directory due to the use of leases. 0303 /// 0304 /// This means applications can safely retry this operation without coordination to 0305 /// guarantee only one client/process is trying to delete the same file. 0306 Status DeleteFile(const std::string& path) override; 0307 0308 /// \brief Move/rename a file or directory. 0309 /// 0310 /// There are no files immediately at the root directory, so paths like 0311 /// "/segment" always refer to a container of the storage account and are 0312 /// treated as directories. 0313 /// 0314 /// If `dest` exists but the operation fails for some reason, `Move` 0315 /// guarantees `dest` is not lost. 0316 /// 0317 /// Conditions for a successful move: 0318 /// 0319 /// 1. `src` must exist. 0320 /// 2. `dest` can't contain a strict path prefix of `src`. More generally, 0321 /// a directory can't be made a subdirectory of itself. 0322 /// 3. If `dest` already exists and it's a file, `src` must also be a file. 0323 /// `dest` is then replaced by `src`. 0324 /// 4. All components of `dest` must exist, except for the last. 0325 /// 5. If `dest` already exists and it's a directory, `src` must also be a 0326 /// directory and `dest` must be empty. `dest` is then replaced by `src` 0327 /// and its contents. 
0328 /// 0329 /// Leases are used to guarantee the pre-condition checks and the rename 0330 /// operation are atomic: other clients can't invalidate the pre-condition in 0331 /// the time between the checks and the actual rename operation. 0332 /// 0333 /// This is possible because Move() is only supported on storage accounts with 0334 /// Hierarchical Namespace Support enabled. 0335 /// 0336 /// ## Limitations 0337 /// 0338 /// - Moves are not supported on storage accounts without 0339 /// Hierarchical Namespace support enabled 0340 /// - Moves across different containers are not supported 0341 /// - Moving a path of the form `/container` is not supported as it would 0342 /// require moving all the files in a container to another container. 0343 /// The only exception is a `Move("/container_a", "/container_b")` where 0344 /// both containers are empty or `container_b` doesn't even exist. 0345 /// The atomicity of the emptiness checks followed by the renaming operation 0346 /// is guaranteed by the use of leases. 
0347 Status Move(const std::string& src, const std::string& dest) override; 0348 0349 Status CopyFile(const std::string& src, const std::string& dest) override; 0350 0351 Result<std::shared_ptr<io::InputStream>> OpenInputStream( 0352 const std::string& path) override; 0353 0354 Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override; 0355 0356 Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile( 0357 const std::string& path) override; 0358 0359 Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile( 0360 const FileInfo& info) override; 0361 0362 Result<std::shared_ptr<io::OutputStream>> OpenOutputStream( 0363 const std::string& path, 0364 const std::shared_ptr<const KeyValueMetadata>& metadata) override; 0365 0366 Result<std::shared_ptr<io::OutputStream>> OpenAppendStream( 0367 const std::string& path, 0368 const std::shared_ptr<const KeyValueMetadata>& metadata) override; 0369 0370 Result<std::string> PathFromUri(const std::string& uri_string) const override; 0371 }; 0372 0373 } // namespace arrow::fs
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
![]() ![]() |