apache · jshmchenxi · Feb 10, 2025 · Feb 16, 2025 · Feb 19, 2025
diff --git a/data/src/main/java/org/apache/iceberg/data/TableMigrationUtil.java b/data/src/main/java/org/apache/iceberg/data/TableMigrationUtil.java
@@ -21,6 +21,7 @@
 import java.io.IOException;
 import java.io.UncheckedIOException;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.ExecutorService;
@@ -45,8 +46,11 @@
 import org.apache.iceberg.parquet.ParquetUtil;
 import org.apache.iceberg.util.Tasks;
 import org.apache.iceberg.util.ThreadPools;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 public class TableMigrationUtil {
+  private static final Logger LOG = LoggerFactory.getLogger(TableMigrationUtil.class);
   private static final PathFilter HIDDEN_PATH_FILTER =
       p -> !p.getName().startsWith("_") && !p.getName().startsWith(".");
 
@@ -163,10 +167,19 @@ public static List<DataFile> listPartition(
 
       Path partitionDir = new Path(partitionUri);
       FileSystem fs = partitionDir.getFileSystem(conf);
-      List<FileStatus> fileStatus =
-          Arrays.stream(fs.listStatus(partitionDir, HIDDEN_PATH_FILTER))
-              .filter(FileStatus::isFile)
-              .collect(Collectors.toList());
+      List<FileStatus> fileStatus;
+      if (fs.exists(partitionDir)) {
+        fileStatus =
+            Arrays.stream(fs.listStatus(partitionDir, HIDDEN_PATH_FILTER))
+                .filter(FileStatus::isFile)
+                .collect(Collectors.toList());
+      } else {
+        LOG.info(
+            "Skipping partition {}: location {} does not exist in the filesystem.",
+            partition,
+            partitionUri);
+        fileStatus = Collections.emptyList();
+      }
       DataFile[] datafiles = new DataFile[fileStatus.size()];
       Tasks.Builder<Integer> task =
           Tasks.range(fileStatus.size()).stopOnFailure().throwFailureWhenFinished();

diff --git a/data/src/test/java/org/apache/iceberg/data/TestTableMigrationUtil.java b/data/src/test/java/org/apache/iceberg/data/TestTableMigrationUtil.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.data;
+
+import static org.apache.iceberg.types.Types.NestedField.optional;
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.List;
+import java.util.Map;
+import org.apache.avro.generic.GenericData;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.iceberg.DataFile;
+import org.apache.iceberg.Files;
+import org.apache.iceberg.MetricsConfig;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.avro.RandomAvroData;
+import org.apache.iceberg.io.FileAppender;
+import org.apache.iceberg.parquet.Parquet;
+import org.apache.iceberg.parquet.ParquetAvroWriter;
+import org.apache.iceberg.types.Types;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+class TestTableMigrationUtil {
+  private static final Schema SCHEMA =
+      new Schema(
+          optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get()));
+  private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("id").build();
+  private static final Map<String, String> PARTITION = Map.of("id", "1");
+  private static final String FORMAT = "parquet";
+  private static final Configuration CONF = new Configuration();
+
+  @TempDir protected Path tempTableLocation;
+
+  // A partition directory holding one Parquet file is listed as exactly one DataFile.
+  @Test
+  void testListPartition() throws IOException {
+    Path idDir = tempTableLocation.resolve("id=1");
+    String idDirUri = idDir.toUri().toString();
+    java.nio.file.Files.createDirectories(idDir);
+    writeParquetFile(idDir.toFile());
+
+    assertThat(listPartitionAt(idDirUri))
+        .as("List partition with 1 Parquet file should return 1 DataFile")
+        .hasSize(1);
+  }
+
+  // A partition directory that exists but contains no files is listed as empty.
+  @Test
+  void testListEmptyPartition() throws IOException {
+    Path idDir = tempTableLocation.resolve("id=1");
+    String idDirUri = idDir.toUri().toString();
+    java.nio.file.Files.createDirectories(idDir);
+    List<DataFile> listed = listPartitionAt(idDirUri);
+    assertThat(listed).as("List partition with 0 file should return 0 DataFile").isEmpty();
+  }
+
+  // A partition location that was never created is listed as empty instead of failing.
+  @Test
+  void testListPartitionLocationNotExists() {
+    String missingUri = tempTableLocation.resolve("id=1").toUri().toString();
+    assertThat(listPartitionAt(missingUri))
+        .as("List partition of which location does not exist should return 0 DataFile")
+        .isEmpty();
+  }
+
+  /** Lists {@link #PARTITION} at {@code partitionUri} using the class-level constants. */
+  private static List<DataFile> listPartitionAt(String partitionUri) {
+    return TableMigrationUtil.listPartition(
+        PARTITION, partitionUri, FORMAT, SPEC, CONF, MetricsConfig.getDefault(), null);
+  }
+
+  /** Writes generated {@link #SCHEMA} records to a new Parquet file under {@code outputDir}. */
+  private static void writeParquetFile(File outputDir) throws IOException {
+    // Reserve a unique name, then remove the placeholder before the real file is written.
+    File parquetFile = File.createTempFile("junit", ".parquet", outputDir);
+    assertThat(parquetFile.delete()).as("Delete should succeed").isTrue();
+
+    try (FileAppender<GenericData.Record> appender =
+        Parquet.write(Files.localOutput(parquetFile))
+            .schema(SCHEMA)
+            .createWriterFunc(ParquetAvroWriter::buildWriter)
+            .build()) {
+      appender.addAll(RandomAvroData.generate(SCHEMA, 1000, 54310));
+    }
+  }
+}
diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestSnapshotTableAction.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestSnapshotTableAction.java
@@ -18,7 +18,10 @@
  */
 package org.apache.iceberg.spark.actions;
 
+import java.io.File;
 import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.Map;
 import java.util.concurrent.Executors;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -64,6 +67,23 @@ public void testSnapshotWithParallelTasks() throws IOException {
                   return thread;
                 }))
         .execute();
-    Assert.assertEquals(snapshotThreadsIndex.get(), 2);
+    Assert.assertEquals(2, snapshotThreadsIndex.get());
+  }
+
+  @Test
+  public void testSnapshotWithEmptyPartitionOrLocationMissingPartition() throws IOException {
+    File sourceDir = temp.newFolder();
+    String sourceUri = sourceDir.toURI().toString();
+    sql(
+        "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet PARTITIONED BY (bucket string) LOCATION '%s'",
+        SOURCE_NAME, sourceUri);
+    sql("ALTER TABLE %s ADD PARTITION (bucket = 'foo')", SOURCE_NAME);
+    sql("ALTER TABLE %s ADD PARTITION (bucket = 'bar')", SOURCE_NAME);
+    // 'foo' stays untouched (no rows were inserted); the directory of 'bar' is deleted from disk.
+    Path barPartitionDir = sourceDir.toPath().resolve("bucket=bar");
+    Files.delete(barPartitionDir);
+    // The snapshot must still complete, and the target's partitions table must have 0 rows.
+    SparkActions.get().snapshotTable(SOURCE_NAME).as(tableName).execute();
+    Assert.assertEquals(0L, scalarSql("SELECT count(*) FROM %s.partitions", selectTarget()));
   }
 }
diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSnapshotTableAction.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSnapshotTableAction.java
@@ -22,6 +22,7 @@
 
 import java.io.IOException;
 import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.concurrent.Executors;
 import java.util.concurrent.atomic.AtomicInteger;
 import org.apache.iceberg.ParameterizedTestExtension;
@@ -65,4 +66,23 @@ public void testSnapshotWithParallelTasks() throws IOException {
         .execute();
     assertThat(snapshotThreadsIndex.get()).isEqualTo(2);
   }
+
+  @TestTemplate
+  public void testSnapshotWithEmptyPartitionOrLocationMissingPartition() throws IOException {
+    Path sourceDir = Files.createTempDirectory(temp, "junit");
+    String sourceLocation = sourceDir.toFile().toString();
+    sql(
+        "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet PARTITIONED BY (bucket string) LOCATION '%s'",
+        SOURCE_NAME, sourceLocation);
+    sql("ALTER TABLE %s ADD PARTITION (bucket = 'foo')", SOURCE_NAME);
+    sql("ALTER TABLE %s ADD PARTITION (bucket = 'bar')", SOURCE_NAME);
+    // 'foo' stays untouched (no rows were inserted); the directory of 'bar' is deleted from disk.
+    Path barPartitionDir = sourceDir.resolve("bucket=bar");
+    Files.delete(barPartitionDir);
+    // The snapshot must still complete, and the target's partitions table must have 0 rows.
+    SparkActions.get().snapshotTable(SOURCE_NAME).as(tableName).execute();
+    assertThat(scalarSql("SELECT count(*) FROM %s.partitions", selectTarget()))
+        .as("Should have 0 partition after snapshot")
+        .isEqualTo(0L);
+  }
 }