Skip to content

Commit

Permalink
Merge pull request #75 from gdubya/add_abfs_support
Browse files Browse the repository at this point in the history
feat: add support for abfs://
  • Loading branch information
samansmink authored Sep 3, 2024
2 parents eddc484 + 2d21afd commit b0ffe7a
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 10 deletions.
4 changes: 3 additions & 1 deletion src/azure_dfs_filesystem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@
namespace duckdb {
// URL scheme names and the corresponding "<scheme>://" path prefixes used by
// AzureDfsStorageFileSystem. Each *PATH_PREFIX is its *SCHEME followed by "://".
// "abfss" variant.
const string AzureDfsStorageFileSystem::SCHEME = "abfss";
const string AzureDfsStorageFileSystem::PATH_PREFIX = "abfss://";
// "abfs" variant (no trailing 's'); exposed under the UNSECURE_* names.
const string AzureDfsStorageFileSystem::UNSECURE_SCHEME = "abfs";
const string AzureDfsStorageFileSystem::UNSECURE_PATH_PREFIX = "abfs://";

// Returns true when `fpath` starts with one of the DFS path prefixes handled by
// AzureDfsStorageFileSystem: PATH_PREFIX ("abfss://") or UNSECURE_PATH_PREFIX ("abfs://").
//
// Only a single return statement is kept: a leading check against the hard-coded
// "abfss://" literal would return unconditionally and make the "abfs://" check
// unreachable, so unsecure paths would never be recognised.
inline static bool IsDfsScheme(const string &fpath) {
	// rfind(prefix, 0) can only match at index 0, so `== 0` is a starts-with test.
	return fpath.rfind(AzureDfsStorageFileSystem::PATH_PREFIX, 0) == 0 ||
	       fpath.rfind(AzureDfsStorageFileSystem::UNSECURE_PATH_PREFIX, 0) == 0;
}

static void Walk(const Azure::Storage::Files::DataLake::DataLakeFileSystemClient &fs, const std::string &path,
Expand Down
19 changes: 12 additions & 7 deletions src/azure_parsed_url.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,16 @@ namespace duckdb {
AzureParsedUrl ParseUrl(const std::string &url) {
constexpr auto invalid_url_format =
"The URL %s does not match the expected formats: (azure|az)://<container>/[<path>] or the fully qualified one: "
"(abfss|azure|az)://<storage account>.<endpoint>/<container>/[<path>] "
"or abfss://<container>@<storage account>.<endpoint>/[<path>]";
"(abfs[s]|azure|az)://<storage account>.<endpoint>/<container>/[<path>] "
"or abfs[s]://<container>@<storage account>.<endpoint>/[<path>]";
bool is_fully_qualified;
std::string container, storage_account_name, endpoint, prefix, path;

if (url.rfind("azure://", 0) != 0 && url.rfind("az://", 0) != 0 &&
url.rfind(AzureDfsStorageFileSystem::PATH_PREFIX, 0) != 0) {
throw IOException("URL needs to start with azure:// or az:// or %s", AzureDfsStorageFileSystem::PATH_PREFIX);
url.rfind(AzureDfsStorageFileSystem::PATH_PREFIX, 0) != 0 && url.rfind(AzureDfsStorageFileSystem::UNSECURE_PATH_PREFIX, 0) != 0) {
throw IOException("URL needs to start with azure:// or az:// or %s or %s",
AzureDfsStorageFileSystem::PATH_PREFIX,
AzureDfsStorageFileSystem::UNSECURE_PATH_PREFIX);
}
const auto prefix_end_pos = url.find("//") + 2;

Expand All @@ -31,9 +33,12 @@ AzureParsedUrl ParseUrl(const std::string &url) {
if (dot_pos != std::string::npos && dot_pos < slash_pos) {
is_fully_qualified = true;

if (url.rfind(AzureDfsStorageFileSystem::PATH_PREFIX, 0) == 0 &&
if ((
url.rfind(AzureDfsStorageFileSystem::PATH_PREFIX, 0) == 0 ||
url.rfind(AzureDfsStorageFileSystem::UNSECURE_PATH_PREFIX, 0) == 0
) &&
at_pos != std::string::npos) {
// syntax is abfss://<container>@<storage account>.<endpoint>/[<path>]
// syntax is abfs[s]://<container>@<storage account>.<endpoint>/[<path>]
const auto path_slash_pos = url.find('/', prefix_end_pos + 1);
if (path_slash_pos == string::npos) {
throw IOException(invalid_url_format, url);
Expand All @@ -44,7 +49,7 @@ AzureParsedUrl ParseUrl(const std::string &url) {
endpoint = url.substr(dot_pos + 1, path_slash_pos - dot_pos - 1);
path = url.substr(path_slash_pos + 1);
} else {
// syntax is (abfss|azure|az)://<storage account>.<endpoint>/<container>/[<path>]
// syntax is (abfs[s]|azure|az)://<storage account>.<endpoint>/<container>/[<path>]
const auto container_slash_pos = url.find('/', dot_pos);
if (container_slash_pos == string::npos) {
throw IOException(invalid_url_format, url);
Expand Down
4 changes: 4 additions & 0 deletions src/azure_secret.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ static unique_ptr<BaseSecret> CreateAzureSecretFromConfig(ClientContext &context
scope.push_back("azure://");
scope.push_back("az://");
scope.push_back(AzureDfsStorageFileSystem::PATH_PREFIX);
scope.push_back(AzureDfsStorageFileSystem::UNSECURE_PATH_PREFIX);
}

auto result = make_uniq<KeyValueSecret>(scope, input.type, input.provider, input.name);
Expand All @@ -61,6 +62,7 @@ static unique_ptr<BaseSecret> CreateAzureSecretFromCredentialChain(ClientContext
scope.push_back("azure://");
scope.push_back("az://");
scope.push_back(AzureDfsStorageFileSystem::PATH_PREFIX);
scope.push_back(AzureDfsStorageFileSystem::UNSECURE_PATH_PREFIX);
}

auto result = make_uniq<KeyValueSecret>(scope, input.type, input.provider, input.name);
Expand All @@ -85,6 +87,7 @@ static unique_ptr<BaseSecret> CreateAzureSecretFromServicePrincipal(ClientContex
scope.push_back("azure://");
scope.push_back("az://");
scope.push_back(AzureDfsStorageFileSystem::PATH_PREFIX);
scope.push_back(AzureDfsStorageFileSystem::UNSECURE_PATH_PREFIX);
}

auto result = make_uniq<KeyValueSecret>(scope, input.type, input.provider, input.name);
Expand Down Expand Up @@ -114,6 +117,7 @@ static unique_ptr<BaseSecret> CreateAzureSecretFromAccessToken(ClientContext &co
scope.push_back("azure://");
scope.push_back("az://");
scope.push_back(AzureDfsStorageFileSystem::PATH_PREFIX);
scope.push_back(AzureDfsStorageFileSystem::UNSECURE_PATH_PREFIX);
}

auto result = make_uniq<KeyValueSecret>(scope, input.type, input.provider, input.name);
Expand Down
4 changes: 2 additions & 2 deletions src/azure_storage_account_client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -591,8 +591,8 @@ ConnectToDfsStorageAccount(optional_ptr<FileOpener> opener, const std::string &p

if (!azure_parsed_url.is_fully_qualified) {
throw InvalidInputException(
"Cannot identified the storage account from path '%s'. To connect anonymously to a "
"storage account easier a fully qualified path has to be provided or secret must be create.",
"Cannot identify the storage account from path '%s'. To connect anonymously to a "
"storage account easier a fully qualified path has to be provided or secret must be created.",
path);
}

Expand Down
2 changes: 2 additions & 0 deletions src/include/azure_dfs_filesystem.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ class AzureDfsStorageFileSystem : public AzureStorageFileSystem {
public:
static const string SCHEME;
static const string PATH_PREFIX;
static const string UNSECURE_SCHEME;
static const string UNSECURE_PATH_PREFIX;

protected:
// From AzureFilesystem
Expand Down
23 changes: 23 additions & 0 deletions test/sql/cloud/hierarchical_namespace.test
Original file line number Diff line number Diff line change
Expand Up @@ -63,18 +63,36 @@ SELECT count(*) FROM 'abfss://testing-private/partitioned/l_receipmonth=1997/l_s
----
1291

# Check with absolute path using unsecure abfs
query I
SELECT count(*) FROM 'abfs://testing-private/partitioned/l_receipmonth=1997/l_shipmode=TRUCK/data_0.csv';
----
1291

# Check fully qualified name
query I
SELECT count(*) FROM 'abfss://${AZURE_STORAGE_ACCOUNT}.dfs.core.windows.net/testing-private/partitioned/l_receipmonth=*/l_shipmode=TRUCK/*.csv';
----
2317

# Check fully qualified name using unsecure abfs
query I
SELECT count(*) FROM 'abfs://${AZURE_STORAGE_ACCOUNT}.dfs.core.windows.net/testing-private/partitioned/l_receipmonth=*/l_shipmode=TRUCK/*.csv';
----
2317

# Check fully qualified name abfss alternative syntax
query I
SELECT count(*) FROM 'abfss://testing-private@${AZURE_STORAGE_ACCOUNT}.dfs.core.windows.net/partitioned/l_receipmonth=*/l_shipmode=TRUCK/*.csv';
----
2317

# Check fully qualified name abfs alternative syntax
query I
SELECT count(*) FROM 'abfs://testing-private@${AZURE_STORAGE_ACCOUNT}.dfs.core.windows.net/partitioned/l_receipmonth=*/l_shipmode=TRUCK/*.csv';
----
2317

# Enable http info for the explain analyze statement
statement ok
SET azure_http_stats = true;
Expand All @@ -84,6 +102,11 @@ EXPLAIN ANALYZE SELECT count(*) FROM 'abfss://testing-private/partitioned/l_rece
----
analyzed_plan <REGEX>:.*HTTP Stats.*in\: 322\.0 KiB.*\#HEAD\: 1.*GET\: 4.*PUT\: 0.*\#POST\: 0.*

query II
EXPLAIN ANALYZE SELECT count(*) FROM 'abfs://testing-private/partitioned/l_receipmonth=*7/l_shipmode=TRUCK/*.csv';
----
analyzed_plan <REGEX>:.*HTTP Stats.*in\: 322\.0 KiB.*\#HEAD\: 1.*GET\: 4.*PUT\: 0.*\#POST\: 0.*


query II
EXPLAIN ANALYZE SELECT count(*) FROM 'azure://testing-private/partitioned/l_receipmonth=*7/l_shipmode=TRUCK/*.csv';
Expand Down

0 comments on commit b0ffe7a

Please sign in to comment.