From 993a27c67cf95c6b6fe4b09aec62b00fe4c9988a Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sat, 5 Oct 2024 22:31:12 +0100 Subject: [PATCH] GH-44303: [C++][FS][Azure] Fix minor hierarchical namespace bugs (#44307) ### Rationale for this change There are a couple of minor bugs in the `AzureFileSystem` for hierarchical namespaces accounts. These cause failures in `TestAzureHierarchicalNSGeneric.GetFileInfoSelectorWithRecursion` and `TestAzureHierarchicalNSGeneric.Empty` which do not run automatically in CI. ### What changes are included in this PR? - Fix incorrectly returning Not found on recursive get file info on container root. - Implement `selector.max_recursion` for hierarchical namespace. This is implemented completely artificially after `directory_client.ListPaths(/*recursive=*/true)`. - Enable a couple of features on the generic tests that were disabled but are actually supported. ### Are these changes tested? There already failing tests for these but they don't run on CI because they require connect to a real Azure blob storage account. I made sure to run all the tests locally including the ones that connect to real Azure storage, both flat and hierarchical and all the tests passed. ### Are there any user-facing changes? * GitHub Issue: #44303 Lead-authored-by: Thomas Newton Co-authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/src/arrow/filesystem/azurefs.cc | 18 +++++++++++++++--- cpp/src/arrow/filesystem/azurefs_test.cc | 18 +++++++++++++----- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index d9a69800bb87e..78f4ad1edd9a9 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1916,18 +1916,22 @@ class AzureFileSystem::Impl { /// \brief List the paths at the root of a filesystem or some dir in a filesystem. /// /// \pre adlfs_client is the client for the filesystem named like the first - /// segment of select.base_dir. + /// segment of select.base_dir. The filesystem is know to exist. Status GetFileInfoWithSelectorFromFileSystem( const DataLake::DataLakeFileSystemClient& adlfs_client, const Core::Context& context, Azure::Nullable page_size_hint, const FileSelector& select, FileInfoVector* acc_results) { ARROW_ASSIGN_OR_RAISE(auto base_location, AzureLocation::FromString(select.base_dir)); + // The filesystem a.k.a. the container is known to exist so if the path is empty then + // we have already found the base_location, so initialize found to true. + bool found = base_location.path.empty(); + auto directory_client = adlfs_client.GetDirectoryClient(base_location.path); - bool found = false; DataLake::ListPathsOptions options; options.PageSizeHint = page_size_hint; + auto base_path_depth = internal::GetAbstractPathDepth(base_location.path); try { auto list_response = directory_client.ListPaths(select.recursive, options, context); for (; list_response.HasPage(); list_response.MoveToNextPage(context)) { @@ -1939,7 +1943,15 @@ class AzureFileSystem::Impl { if (path.Name == base_location.path && !path.IsDirectory) { return NotADir(base_location); } - acc_results->push_back(FileInfoFromPath(base_location.container, path)); + // Subtract 1 because with `max_recursion=0` we want to list the base path, + // which will produce results with depth 1 greater that the base path's depth. + // NOTE: `select.max_recursion` + anything will cause integer overflows because + // `select.max_recursion` defaults to `INT32_MAX`. Therefore, options to + // rewrite this condition in a more readable way are limited. + if (internal::GetAbstractPathDepth(path.Name) - base_path_depth - 1 <= + select.max_recursion) { + acc_results->push_back(FileInfoFromPath(base_location.container, path)); + } } } } catch (const Storage::StorageException& exception) { diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 3697c3bcc319d..242c2c29505ac 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -364,9 +364,9 @@ class TestGeneric : public ::testing::Test, public GenericFileSystemTest { std::shared_ptr GetEmptyFileSystem() override { return fs_; } bool have_implicit_directories() const override { return true; } - bool allow_write_file_over_dir() const override { return true; } - bool allow_read_dir_as_file() const override { return true; } - bool allow_move_dir() const override { return false; } + bool allow_write_file_over_dir() const override { return false; } + bool allow_read_dir_as_file() const override { return false; } + bool allow_move_dir() const override { return true; } bool allow_move_file() const override { return true; } bool allow_append_to_file() const override { return true; } bool have_directory_mtimes() const override { return true; } @@ -404,7 +404,11 @@ class TestAzuriteGeneric : public TestGeneric { } protected: - // Azurite doesn't support moving files over containers. + // Azurite doesn't block writing files over directories. + bool allow_write_file_over_dir() const override { return true; } + // Azurite doesn't support moving directories. + bool allow_move_dir() const override { return false; } + // Azurite doesn't support moving files. bool allow_move_file() const override { return false; } // Azurite doesn't support directory mtime. bool have_directory_mtimes() const override { return false; } @@ -426,7 +430,11 @@ class TestAzureFlatNSGeneric : public TestGeneric { } protected: - // Flat namespace account doesn't support moving files over containers. + // Flat namespace account doesn't block writing files over directories. + bool allow_write_file_over_dir() const override { return true; } + // Flat namespace account doesn't support moving directories. + bool allow_move_dir() const override { return false; } + // Flat namespace account doesn't support moving files. bool allow_move_file() const override { return false; } // Flat namespace account doesn't support directory mtime. bool have_directory_mtimes() const override { return false; }