From 8bbec5df6e7e53d2a9ffa6798a582c8040885949 Mon Sep 17 00:00:00 2001
From: Amanda Liu
Date: Thu, 16 Jan 2025 20:21:15 +0800
Subject: [PATCH] [SPARK-50795][SQL] Store timestamp as `long` type in
 `describe` LinkedHashMap

### What changes were proposed in this pull request?

When storing table metadata in the `describe` LinkedHashMap object, we retain the timestamp as a `long` data type (instead of converting it to a formatted date `string`) to allow flexibility and extensibility of the `describe` date format. Formatting the date fields is delegated to the caller (e.g. describe table, describe as json, describe column, etc.).

Example date for describe table: `Mon Nov 01 12:00:00 UTC 2021`
Example date for describe as json: `2021-11-01T12:00:00Z`

### Why are the changes needed?

Improve extensibility of `describe` and ensure backwards compatibility.

### Does this PR introduce _any_ user-facing change?

Affects the `describe` output date format.

### How was this patch tested?

Added `describe table` tests for the date format.

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #49513 from asl3/asl3/describetable-dateplaintext.

Lead-authored-by: Amanda Liu
Co-authored-by: Wenchen Fan
Signed-off-by: Wenchen Fan
---
 .../sql/catalyst/catalog/interface.scala      | 29 ++++++++----------
 .../command/DescribeRelationJsonCommand.scala | 30 +++++++++++++++++--
 .../command/v1/DescribeTableSuite.scala       | 15 ++++++++++
 3 files changed, 56 insertions(+), 18 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index 7836e533c8b5c..6963e89cf0418 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.catalog
 
 import java.net.URI
 import java.time.{ZoneId, ZoneOffset}
+import java.util.Date
 
 import scala.collection.mutable
 import scala.util.control.NonFatal
@@ -27,7 +28,7 @@ import com.fasterxml.jackson.annotation.JsonInclude.Include
 import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
 import com.fasterxml.jackson.module.scala.{ClassTagExtensions, DefaultScalaModule}
 import org.apache.commons.lang3.StringUtils
-import org.json4s.JsonAST.{JArray, JBool, JDouble, JInt, JNull, JObject, JString, JValue}
+import org.json4s.JsonAST.{JArray, JBool, JDouble, JInt, JLong, JNull, JObject, JString, JValue}
 import org.json4s.jackson.JsonMethods._
 
 import org.apache.spark.SparkException
@@ -63,6 +64,7 @@ trait MetadataMapSupport {
   protected def jsonToString(
       jsonMap: mutable.LinkedHashMap[String, JValue]): mutable.LinkedHashMap[String, String] = {
     val map = new mutable.LinkedHashMap[String, String]()
+    val timestampKeys = Set("Created Time", "Last Access")
     jsonMap.foreach { case (key, jValue) =>
       val stringValue = jValue match {
         case JString(value) => value
@@ -80,20 +82,18 @@ trait MetadataMapSupport {
             .mkString("[", ", ", "]")
         case JInt(value) => value.toString
         case JDouble(value) => value.toString
+        case JLong(value) =>
+          if (timestampKeys.contains(key)) {
+            new Date(value).toString
+          } else {
+            value.toString
+          }
         case _ => jValue.values.toString
       }
       map.put(key, stringValue)
     }
     map
   }
-
-  val timestampFormatter = new Iso8601TimestampFormatter(
-    pattern = "yyyy-MM-dd'T'HH:mm:ss'Z'",
-    zoneId = ZoneId.of("UTC"),
-    locale = DateFormatter.defaultLocale,
-    legacyFormat = LegacyDateFormats.LENIENT_SIMPLE_DATE_FORMAT,
-    isParsing = true
-  )
 }
@@ -191,12 +191,10 @@ case class CatalogTablePartition(
       map += ("Partition Parameters" -> paramsJson)
     }
 
-    map += ("Created Time" -> JString(
-      timestampFormatter.format(DateTimeUtils.millisToMicros(createTime))))
+    map += ("Created Time" -> JLong(createTime))
 
     val lastAccess = if (lastAccessTime <= 0) JString("UNKNOWN")
-    else JString(
-      timestampFormatter.format(DateTimeUtils.millisToMicros(createTime)))
+    else JLong(lastAccessTime)
     map += ("Last Access" -> lastAccess)
 
     stats.foreach(s => map += ("Partition Statistics" -> JString(s.simpleString)))
@@ -605,7 +603,7 @@ case class CatalogTable(
 
     val lastAccess: JValue =
       if (lastAccessTime <= 0) JString("UNKNOWN")
-      else JString(timestampFormatter.format(DateTimeUtils.millisToMicros(createTime)))
+      else JLong(lastAccessTime)
 
     val viewQueryOutputColumns: JValue =
       if (viewQueryColumnNames.nonEmpty) JArray(viewQueryColumnNames.map(JString).toList)
@@ -617,8 +615,7 @@ case class CatalogTable(
     if (identifier.database.isDefined) map += "Database" -> JString(identifier.database.get)
     map += "Table" -> JString(identifier.table)
     if (Option(owner).exists(_.nonEmpty)) map += "Owner" -> JString(owner)
-    map += "Created Time" ->
-      JString(timestampFormatter.format(DateTimeUtils.millisToMicros(createTime)))
+    map += "Created Time" -> JLong(createTime)
    if (lastAccess != JNull) map += "Last Access" -> lastAccess
     map += "Created By" -> JString(s"Spark $createVersion")
     map += "Type" -> JString(tableType.name)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DescribeRelationJsonCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DescribeRelationJsonCommand.scala
index 6abe34f0ea156..4440a8889c05c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DescribeRelationJsonCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DescribeRelationJsonCommand.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.execution.command
 
+import java.time.ZoneId
+
 import scala.collection.mutable
 
 import org.json4s._
@@ -29,7 +31,13 @@ import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType, Se
 import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.catalyst.util.quoteIfNeeded
+import org.apache.spark.sql.catalyst.util.{
+  quoteIfNeeded,
+  DateFormatter,
+  DateTimeUtils,
+  Iso8601TimestampFormatter,
+  LegacyDateFormats
+}
 import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
 import org.apache.spark.sql.connector.catalog.V1Table
 import org.apache.spark.sql.errors.QueryCompilationErrors
@@ -50,6 +58,13 @@ case class DescribeRelationJsonCommand(
     nullable = false,
     new MetadataBuilder().putString("comment", "JSON metadata of the table").build())()
   )) extends UnaryRunnableCommand {
+  private lazy val timestampFormatter = new Iso8601TimestampFormatter(
+    pattern = "yyyy-MM-dd'T'HH:mm:ss'Z'",
+    zoneId = ZoneId.of("UTC"),
+    locale = DateFormatter.defaultLocale,
+    legacyFormat = LegacyDateFormats.LENIENT_SIMPLE_DATE_FORMAT,
+    isParsing = true
+  )
 
   override def run(sparkSession: SparkSession): Seq[Row] = {
     val jsonMap = mutable.LinkedHashMap[String, JValue]()
@@ -106,11 +121,22 @@ case class DescribeRelationJsonCommand(
       "outputformat" -> "output_format"
     )
 
+    val timestampKeys = Set("created_time", "last_access")
+
     val normalizedKey = key.toLowerCase().replace(" ", "_")
     val renamedKey = renames.getOrElse(normalizedKey, normalizedKey)
 
     if (!jsonMap.contains(renamedKey) && !excludedKeys.contains(renamedKey)) {
-      jsonMap += renamedKey -> value
+      val formattedValue = if (timestampKeys.contains(renamedKey)) {
+        value match {
+          case JLong(timestamp) =>
+            JString(timestampFormatter.format(DateTimeUtils.millisToMicros(timestamp)))
+          case _ => value
+        }
+      } else {
+        value
+      }
+      jsonMap += renamedKey -> formattedValue
     }
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/DescribeTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/DescribeTableSuite.scala
index 3602853e53aa8..eef8e212435c9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/DescribeTableSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/DescribeTableSuite.scala
@@ -657,6 +657,21 @@ class DescribeTableSuite extends DescribeTableSuiteBase with CommandSuiteBase {
         Row("Table Properties", "[bar=baz]", ""),
         Row("Location", "file:/tmp/testcat/table_name", ""),
         Row("Partition Provider", "Catalog", "")))
+
+      // example date format: Mon Nov 01 12:00:00 UTC 2021
+      val dayOfWeek = raw"[A-Z][a-z]{2}"
+      val month = raw"[A-Z][a-z]{2}"
+      val day = raw"\s?[0-9]{1,2}"
+      val time = raw"[0-9]{2}:[0-9]{2}:[0-9]{2}"
+      val timezone = raw"[A-Z]{3,4}"
+      val year = raw"[0-9]{4}"
+
+      val timeRegex = raw"""$dayOfWeek $month $day $time $timezone $year""".r
+
+      val createdTimeValue = descriptionDf.filter("col_name = 'Created Time'")
+        .collect().head.getString(1).trim
+
+      assert(timeRegex.matches(createdTimeValue))
     }
   }