[SPARK-47050][SQL] Collect and publish partition level metrics
Capture the partition sub-paths, along with the number of files, bytes, and rows per partition for each task.
Steve Vaughan Jr committed Apr 23, 2024
1 parent b9f2270 · commit 2341c22
Showing 11 changed files with 441 additions and 51 deletions.
99 changes: 99 additions & 0 deletions
...atalyst/src/main/java/org/apache/spark/sql/connector/write/PartitionMetricsWriteInfo.java
@@ -0,0 +1,99 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.connector.write;

import java.io.Serializable;
import java.util.Collections;
import java.util.Map;
import java.util.TreeMap;

/**
 * An aggregator of partition metrics collected during write operations.
 * <p>
 * This is patterned after {@code org.apache.spark.util.AccumulatorV2}.
 * </p>
 */
public class PartitionMetricsWriteInfo implements Serializable {

  private final Map<String, PartitionMetrics> metrics = new TreeMap<>();

  /**
   * Merges another same-type accumulator into this one and updates its state, i.e. this should be
   * merge-in-place.
   *
   * @param otherAccumulator Another object containing aggregated partition metrics
   */
  public void merge(PartitionMetricsWriteInfo otherAccumulator) {
    otherAccumulator.metrics.forEach((p, m) ->
        metrics.computeIfAbsent(p, key -> new PartitionMetrics(0L, 0L, 0))
            .merge(m));
  }

  /**
   * Updates the partition metrics for the specified path by adding to the existing state. This
   * will add the partition if it has not been referenced previously.
   *
   * @param partitionPath The path for the written partition
   * @param bytes The number of additional bytes
   * @param records The number of additional records
   * @param files The number of additional files
   */
  public void update(String partitionPath, long bytes, long records, int files) {
    metrics.computeIfAbsent(partitionPath, key -> new PartitionMetrics(0L, 0L, 0))
        .merge(new PartitionMetrics(bytes, records, files));
  }

  /**
   * Updates the partition metrics for the specified path by adding the state from an individual
   * file. This will add the partition if it has not been referenced previously.
   *
   * @param partitionPath The path for the written partition
   * @param bytes The number of additional bytes
   * @param records The number of additional records
   */
  public void updateFile(String partitionPath, long bytes, long records) {
    update(partitionPath, bytes, records, 1);
  }

  /**
   * Converts this instance into an immutable {@code java.util.Map}. This is used for posting to
   * the listener bus.
   *
   * @return an immutable map of partition paths to their metrics
   */
  public Map<String, PartitionMetrics> toMap() {
    return Collections.unmodifiableMap(metrics);
  }

  /**
   * Returns whether this accumulator has a zero value. For a map accumulator this indicates
   * whether the map is empty.
   *
   * @return {@code true} if there are no partition metrics
   */
  boolean isZero() {
    return metrics.isEmpty();
  }

  @Override
  public String toString() {
    return "PartitionMetricsWriteInfo{" +
        "metrics=" + metrics +
        '}';
  }
}
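To make the accumulator's behavior concrete, here is a minimal sketch (not part of this commit) of how per-task metrics could be aggregated and merged on the driver; the Hive-style partition sub-paths and the counts are invented for illustration.

import org.apache.spark.sql.connector.write.PartitionMetricsWriteInfo

// Each task accumulates metrics for the partitions it wrote.
val taskA = new PartitionMetricsWriteInfo()
taskA.updateFile("year=2024/month=04", 1024L, 10L) // first file in this partition
taskA.updateFile("year=2024/month=04", 2048L, 20L) // second file, same partition

val taskB = new PartitionMetricsWriteInfo()
taskB.updateFile("year=2024/month=05", 512L, 5L)

// Merge-in-place on the driver, mirroring AccumulatorV2 semantics.
taskA.merge(taskB)

// taskA.toMap now holds an immutable view:
//   "year=2024/month=04" -> PartitionMetrics(3072, 30, 2)
//   "year=2024/month=05" -> PartitionMetrics(512, 5, 1)

Because the backing TreeMap is keyed by partition sub-path, toMap returns the partitions in sorted order, which keeps the posted event deterministic.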
54 changes: 54 additions & 0 deletions
sql/catalyst/src/main/scala/org/apache/spark/sql/connector/write/PartitionMetrics.scala
@@ -0,0 +1,54 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.connector.write

/**
 * The metrics collected for an individual partition.
 *
 * @param numBytes the number of bytes
 * @param numRecords the number of records (rows)
 * @param numFiles the number of files
 */
case class PartitionMetrics(var numBytes: Long = 0, var numRecords: Long = 0, var numFiles: Int = 0)
  extends Serializable {

  /**
   * Updates the metrics for an individual file.
   *
   * @param bytes the number of bytes
   * @param records the number of records (rows)
   */
  def updateFile(bytes: Long, records: Long): Unit = {
    numBytes += bytes
    numRecords += records
    numFiles += 1
  }

  /**
   * Merges another same-type set of metrics into this one and updates its state, i.e. this should
   * be merge-in-place.
   *
   * @param other Another set of metrics for the same partition
   */
  def merge(other: PartitionMetrics): Unit = {
    numBytes += other.numBytes
    numRecords += other.numRecords
    numFiles += other.numFiles
  }
}
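As a quick illustration of the merge-in-place semantics (values invented):

// Two file writes recorded against the same partition...
val m1 = PartitionMetrics()
m1.updateFile(bytes = 100L, records = 4L) // numFiles is incremented to 1
m1.updateFile(bytes = 300L, records = 6L) // ...and to 2

// ...then merged with metrics for the same partition from another task.
val m2 = PartitionMetrics(numBytes = 50L, numRecords = 1L, numFiles = 1)
m1.merge(m2)
// m1 is now PartitionMetrics(450, 11, 3)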
54 changes: 54 additions & 0 deletions
...rc/main/scala/org/apache/spark/sql/connector/write/SparkListenerSQLPartitionMetrics.scala
@@ -0,0 +1,54 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.connector.write

import org.apache.spark.SparkContext
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler.SparkListenerEvent

@DeveloperApi
case class SparkListenerSQLPartitionMetrics(
    executorId: Long,
    metrics: java.util.Map[String, PartitionMetrics])
  extends SparkListenerEvent

object SQLPartitionMetrics {

  /**
   * Posts any aggregated partition write statistics to the listener bus using a
   * [[SparkListenerSQLPartitionMetrics]] event.
   *
   * @param sc The Spark context
   * @param executionId The identifier for the SQL execution that resulted in the partition writes
   * @param writeInfo The aggregated partition writes for this SQL execution
   */
  def postDriverMetricUpdates(
      sc: SparkContext,
      executionId: String,
      writeInfo: PartitionMetricsWriteInfo): Unit = {
    // Don't bother firing an event if there are no collected metrics
    if (writeInfo.isZero) {
      return
    }

    // There are some cases where we don't care about the metrics and call `SparkPlan.doExecute`
    // directly without setting an execution id. We should be tolerant of that.
    if (executionId != null) {
      sc.listenerBus.post(
        SparkListenerSQLPartitionMetrics(executionId.toLong, writeInfo.toMap))
    }
  }
}
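A listener can pick up the new event through onOtherEvent; the class below is a hypothetical sketch, not part of this commit. Note that the event's executorId field actually carries the SQL execution ID, since postDriverMetricUpdates populates it with executionId.toLong.

import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent}
import org.apache.spark.sql.connector.write.SparkListenerSQLPartitionMetrics

class PartitionMetricsListener extends SparkListener {
  override def onOtherEvent(event: SparkListenerEvent): Unit = event match {
    case e: SparkListenerSQLPartitionMetrics =>
      // executorId holds the SQL execution ID (see postDriverMetricUpdates).
      e.metrics.forEach((path, m) => println(s"execution ${e.executorId}: $path -> $m"))
    case _ => // not a partition-metrics event; ignore
  }
}

// Hypothetical registration:
// spark.sparkContext.addSparkListener(new PartitionMetricsListener)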