[SPARK-41221][SQL] Add the error class INVALID_FORMAT

### What changes were proposed in this pull request? In the PR, I propose to move out the error sub-classes related to number format issues from `DATATYPE_MISMATCH` to their own error class `INVALID_FORMAT`. Also, the PR deletes the error class `INVALID_LIKE_PATTERN`, and moves its sub-classes to `INVALID_FORMAT`. ### Why are the changes needed? To improve the usability of Spark SQL and to avoid confusing users with error class names that refer to data types when the actual problem is with a value (the format string). ### Does this PR introduce _any_ user-facing change? Yes, the PR changes some user-facing errors. ### How was this patch tested? By running the affected test suites: ``` $ PYSPARK_PYTHON=python3 build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite" $ build/sbt "core/testOnly *SparkThrowableSuite" $ build/sbt "test:testOnly *StringExpressionsSuite" ``` Closes apache#38755 from MaxGekk/refactor-datatype-mismatch. Authored-by: Max Gekk <[email protected]> Signed-off-by: Max Gekk <[email protected]>
beliefer · Dec 18, 2022 · b675081 · b675081
1 parent d15a945
commit b675081
Show file tree

Hide file tree

Showing 11 changed files with 211 additions and 203 deletions.
diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json
@@ -229,46 +229,6 @@
           "Input to <functionName> should all be the same type, but it's <dataType>."
         ]
       },
-      "FORMAT_CONT_THOUSANDS_SEPS" : {
-        "message" : [
-          "Thousands separators (, or G) must have digits in between them in the number format: <format>."
-        ]
-      },
-      "FORMAT_CUR_MUST_BEFORE_DEC" : {
-        "message" : [
-          "Currency characters must appear before any decimal point in the number format: <format>."
-        ]
-      },
-      "FORMAT_CUR_MUST_BEFORE_DIGIT" : {
-        "message" : [
-          "Currency characters must appear before digits in the number format: <format>."
-        ]
-      },
-      "FORMAT_EMPTY" : {
-        "message" : [
-          "The number format string cannot be empty."
-        ]
-      },
-      "FORMAT_THOUSANDS_SEPS_MUST_BEFORE_DEC" : {
-        "message" : [
-          "Thousands separators (, or G) may not appear after the decimal point in the number format: <format>."
-        ]
-      },
-      "FORMAT_UNEXPECTED_TOKEN" : {
-        "message" : [
-          "Unexpected <token> found in the format string <format>; the structure of the format string must match: [MI|S] [$] [0|9|G|,]* [.|D] [0|9]* [$] [PR|MI|S]."
-        ]
-      },
-      "FORMAT_WRONG_NUM_DIGIT" : {
-        "message" : [
-          "The format string requires at least one number digit."
-        ]
-      },
-      "FORMAT_WRONG_NUM_TOKEN" : {
-        "message" : [
-          "At most one <token> is allowed in the number format: <format>."
-        ]
-      },
       "HASH_MAP_TYPE" : {
         "message" : [
           "Input to the function <functionName> cannot contain elements of the \"MAP\" type. In Spark, same maps may have different hashcode, thus hash expressions are prohibited on \"MAP\" elements. To restore previous behavior set \"spark.sql.legacy.allowHashOnMapType\" to \"true\"."
@@ -687,6 +647,63 @@
     ],
     "sqlState" : "42000"
   },
+  "INVALID_FORMAT" : {
+    "message" : [
+      "The format is invalid: <format>."
+    ],
+    "subClass" : {
+      "CONT_THOUSANDS_SEPS" : {
+        "message" : [
+          "Thousands separators (, or G) must have digits in between them in the number format."
+        ]
+      },
+      "CUR_MUST_BEFORE_DEC" : {
+        "message" : [
+          "Currency characters must appear before any decimal point in the number format."
+        ]
+      },
+      "CUR_MUST_BEFORE_DIGIT" : {
+        "message" : [
+          "Currency characters must appear before digits in the number format."
+        ]
+      },
+      "EMPTY" : {
+        "message" : [
+          "The number format string cannot be empty."
+        ]
+      },
+      "ESC_AT_THE_END" : {
+        "message" : [
+          "The escape character is not allowed to end with."
+        ]
+      },
+      "ESC_IN_THE_MIDDLE" : {
+        "message" : [
+          "The escape character is not allowed to precede <char>."
+        ]
+      },
+      "THOUSANDS_SEPS_MUST_BEFORE_DEC" : {
+        "message" : [
+          "Thousands separators (, or G) may not appear after the decimal point in the number format."
+        ]
+      },
+      "UNEXPECTED_TOKEN" : {
+        "message" : [
+          "Found the unexpected <token> in the format string; the structure of the format string must match: [MI|S] [$] [0|9|G|,]* [.|D] [0|9]* [$] [PR|MI|S]."
+        ]
+      },
+      "WRONG_NUM_DIGIT" : {
+        "message" : [
+          "The format string requires at least one number digit."
+        ]
+      },
+      "WRONG_NUM_TOKEN" : {
+        "message" : [
+          "At most one <token> is allowed in the number format."
+        ]
+      }
+    }
+  },
   "INVALID_FRACTION_OF_SECOND" : {
     "message" : [
       "The fraction of sec must be zero. Valid range is [0, 60]. If necessary set <ansiConfig> to \"false\" to bypass this error."
@@ -708,23 +725,6 @@
       "The <joinType> JOIN with LATERAL correlation is not allowed because an OUTER subquery cannot correlate to its join partner. Remove the LATERAL correlation or use an INNER JOIN, or LEFT OUTER JOIN instead."
     ]
   },
-  "INVALID_LIKE_PATTERN" : {
-    "message" : [
-      "The pattern <pattern> is invalid."
-    ],
-    "subClass" : {
-      "ESC_AT_THE_END" : {
-        "message" : [
-          "the escape character is not allowed to end with."
-        ]
-      },
-      "ESC_IN_THE_MIDDLE" : {
-        "message" : [
-          "the escape character is not allowed to precede <char>."
-        ]
-      }
-    }
-  },
   "INVALID_PANDAS_UDF_PLACEMENT" : {
     "message" : [
       "The group aggregate pandas UDF <functionList> cannot be invoked together with as other, non-pandas aggregate functions."

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -52,6 +52,7 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB
   val extendedCheckRules: Seq[LogicalPlan => Unit] = Nil
 
   val DATA_TYPE_MISMATCH_ERROR = TreeNodeTag[Boolean]("dataTypeMismatchError")
+  val INVALID_FORMAT_ERROR = TreeNodeTag[Boolean]("invalidFormatError")
 
   /**
    * Fails the analysis at the point where a specific tree node was parsed using a provided
@@ -221,6 +222,9 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB
                 hof.failAnalysis(
                   errorClass = "_LEGACY_ERROR_TEMP_2314",
                   messageParameters = Map("sqlExpr" -> hof.sql, "msg" -> message))
+              case checkRes: TypeCheckResult.InvalidFormat =>
+                hof.setTagValue(INVALID_FORMAT_ERROR, true)
+                hof.invalidFormat(checkRes)
             }
 
           // If an attribute can't be resolved as a map key of string type, either the key should be
@@ -252,6 +256,9 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB
                     "sqlExpr" -> e.sql,
                     "msg" -> message,
                     "hint" -> extraHintForAnsiTypeCoercionExpression(operator)))
+              case checkRes: TypeCheckResult.InvalidFormat =>
+                e.setTagValue(INVALID_FORMAT_ERROR, true)
+                e.invalidFormat(checkRes)
             }
 
           case c: Cast if !c.resolved =>

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCheckResult.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCheckResult.scala
@@ -55,4 +55,17 @@ object TypeCheckResult {
     extends TypeCheckResult {
     def isSuccess: Boolean = false
   }
+
+  /**
+   * Represents an error of invalid format with the `INVALID_FORMAT` error class.
+   *
+   * @param errorSubClass A sub-class of `INVALID_FORMAT`.
+   * @param messageParameters Parameters of the sub-class error message.
+   */
+  case class InvalidFormat(
+      errorSubClass: String,
+      messageParameters: Map[String, String] = Map.empty)
+    extends TypeCheckResult {
+    def isSuccess: Boolean = false
+  }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/package.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.catalyst
 
 import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch
+import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{DataTypeMismatch, InvalidFormat}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.trees.TreeNode
 import org.apache.spark.sql.catalyst.util.quoteNameParts
@@ -73,6 +73,13 @@ package object analysis {
         origin = t.origin)
     }
 
+    def invalidFormat(invalidFormat: InvalidFormat): Nothing = {
+      throw new AnalysisException(
+        errorClass = s"INVALID_FORMAT.${invalidFormat.errorSubClass}",
+        messageParameters = invalidFormat.messageParameters,
+        origin = t.origin)
+    }
+
     def tableNotFound(name: Seq[String]): Nothing = {
       throw new AnalysisException(
         errorClass = "TABLE_OR_VIEW_NOT_FOUND",

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ToNumberParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ToNumberParser.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.util
 import scala.collection.mutable
 
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
-import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{DataTypeMismatch, TypeCheckSuccess}
+import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{InvalidFormat, TypeCheckSuccess}
 import org.apache.spark.sql.catalyst.expressions.Cast._
 import org.apache.spark.sql.errors.QueryExecutionErrors
 import org.apache.spark.sql.types.{Decimal, DecimalType, StringType}
@@ -281,37 +281,29 @@ class ToNumberParser(numberFormat: String, errorOnFail: Boolean) extends Seriali
 
     // Make sure the format string contains at least one token.
     if (numberFormat.isEmpty) {
-      return DataTypeMismatch(
-        errorSubClass = "FORMAT_EMPTY",
-        messageParameters = Map.empty
-      )
+      return InvalidFormat(
+        errorSubClass = "EMPTY",
+        messageParameters = Map("format" -> toSQLValue(numberFormat, StringType)))
     }
     // Make sure the format string contains at least one digit.
     if (!formatTokens.exists(
       token => token.isInstanceOf[DigitGroups])) {
-      return DataTypeMismatch(
-        errorSubClass = "FORMAT_WRONG_NUM_DIGIT",
-        messageParameters = Map.empty
-      )
+      return InvalidFormat(
+        errorSubClass = "WRONG_NUM_DIGIT",
+        messageParameters = Map("format" -> toSQLValue(numberFormat, StringType)))
     }
     // Make sure that any dollar sign in the format string occurs before any digits.
     if (firstDigitIndex < firstDollarSignIndex) {
-      return DataTypeMismatch(
-        errorSubClass = "FORMAT_CUR_MUST_BEFORE_DIGIT",
-        messageParameters = Map(
-          "format" -> toSQLValue(numberFormat, StringType)
-        )
-      )
+      return InvalidFormat(
+        errorSubClass = "CUR_MUST_BEFORE_DIGIT",
+        messageParameters = Map("format" -> toSQLValue(numberFormat, StringType)))
     }
     // Make sure that any dollar sign in the format string occurs before any decimal point.
     if (firstDecimalPointIndex != -1 &&
       firstDecimalPointIndex < firstDollarSignIndex) {
-      return DataTypeMismatch(
-        errorSubClass = "FORMAT_CUR_MUST_BEFORE_DEC",
-        messageParameters = Map(
-          "format" -> toSQLValue(numberFormat, StringType)
-        )
-      )
+      return InvalidFormat(
+        errorSubClass = "CUR_MUST_BEFORE_DEC",
+        messageParameters = Map("format" -> toSQLValue(numberFormat, StringType)))
     }
     // Make sure that any thousands separators in the format string have digits before and after.
     if (digitGroupsBeforeDecimalPoint.exists {
@@ -327,23 +319,18 @@ class ToNumberParser(numberFormat: String, errorOnFail: Boolean) extends Seriali
             false
         })
     }) {
-      return DataTypeMismatch(
-        errorSubClass = "FORMAT_CONT_THOUSANDS_SEPS",
-        messageParameters = Map(
-          "format" -> toSQLValue(numberFormat, StringType)
-        )
-      )
+      return InvalidFormat(
+        errorSubClass = "CONT_THOUSANDS_SEPS",
+        messageParameters = Map("format" -> toSQLValue(numberFormat, StringType)))
     }
     // Make sure that thousands separators does not appear after the decimal point, if any.
     if (digitGroupsAfterDecimalPoint.exists {
       case DigitGroups(tokens, digits) =>
         tokens.length > digits.length
     }) {
-      return DataTypeMismatch(
-        errorSubClass = "FORMAT_THOUSANDS_SEPS_MUST_BEFORE_DEC",
-        messageParameters = Map(
-          "format" -> toSQLValue(numberFormat, StringType)
-        )
+      return InvalidFormat(
+        errorSubClass = "THOUSANDS_SEPS_MUST_BEFORE_DEC",
+        messageParameters = Map("format" -> toSQLValue(numberFormat, StringType))
       )
     }
     // Make sure that the format string does not contain any prohibited duplicate tokens.
@@ -354,13 +341,11 @@ class ToNumberParser(numberFormat: String, errorOnFail: Boolean) extends Seriali
       DollarSign(),
       ClosingAngleBracket()).foreach {
       token => if (inputTokenCounts.getOrElse(token, 0) > 1) {
-        return DataTypeMismatch(
-          errorSubClass = "FORMAT_WRONG_NUM_TOKEN",
+        return InvalidFormat(
+          errorSubClass = "WRONG_NUM_TOKEN",
           messageParameters = Map(
             "token" -> token.toString,
-            "format" -> toSQLValue(numberFormat, StringType)
-          )
-        )
+            "format" -> toSQLValue(numberFormat, StringType)))
       }
     }
     // Enforce the ordering of tokens in the format string according to this specification:
@@ -393,13 +378,11 @@ class ToNumberParser(numberFormat: String, errorOnFail: Boolean) extends Seriali
       }
     }
     if (formatTokenIndex < formatTokens.length) {
-      return DataTypeMismatch(
-        errorSubClass = "FORMAT_UNEXPECTED_TOKEN",
+      return InvalidFormat(
+        errorSubClass = "UNEXPECTED_TOKEN",
         messageParameters = Map(
           "token" -> formatTokens(formatTokenIndex).toString,
-          "format" -> toSQLValue(numberFormat, StringType)
-        )
-      )
+          "format" -> toSQLValue(numberFormat, StringType)))
     }
     // Validation of the format string finished successfully.
     TypeCheckSuccess

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
@@ -2143,17 +2143,16 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase {
 
   def escapeCharacterInTheMiddleError(pattern: String, char: String): Throwable = {
     new AnalysisException(
-      errorClass = "INVALID_LIKE_PATTERN.ESC_IN_THE_MIDDLE",
+      errorClass = "INVALID_FORMAT.ESC_IN_THE_MIDDLE",
       messageParameters = Map(
-        "pattern" -> toSQLValue(pattern, StringType),
+        "format" -> toSQLValue(pattern, StringType),
         "char" -> toSQLValue(char, StringType)))
   }
 
   def escapeCharacterAtTheEndError(pattern: String): Throwable = {
     new AnalysisException(
-      errorClass = "INVALID_LIKE_PATTERN.ESC_AT_THE_END",
-      messageParameters = Map(
-        "pattern" -> toSQLValue(pattern, StringType)))
+      errorClass = "INVALID_FORMAT.ESC_AT_THE_END",
+      messageParameters = Map("format" -> toSQLValue(pattern, StringType)))
   }
 
   def tableIdentifierExistsError(tableIdentifier: TableIdentifier): Throwable = {

diff --git a/...yst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/...yst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
@@ -154,15 +154,18 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     // scalastyle:on nonascii
 
     // invalid escaping
-    val invalidEscape = intercept[AnalysisException] {
-      evaluateWithoutCodegen("""a""" like """\a""")
-    }
-    assert(invalidEscape.getMessage.contains("pattern"))
-
-    val endEscape = intercept[AnalysisException] {
-      evaluateWithoutCodegen("""a""" like """a\""")
-    }
-    assert(endEscape.getMessage.contains("pattern"))
+    checkError(
+      exception = intercept[AnalysisException] {
+        evaluateWithoutCodegen("""a""" like """\a""")
+      },
+      errorClass = "INVALID_FORMAT.ESC_IN_THE_MIDDLE",
+      parameters = Map("format" -> """'\\a'""", "char" -> "'a'"))
+    checkError(
+      exception = intercept[AnalysisException] {
+        evaluateWithoutCodegen("""a""" like """a\""")
+      },
+      errorClass = "INVALID_FORMAT.ESC_AT_THE_END",
+      parameters = Map("format" -> """'a\\'"""))
 
     // case
     checkLiteralRow("A" like _, "a%", false)
@@ -231,14 +234,12 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       // scalastyle:on nonascii
 
       // invalid escaping
-      val invalidEscape = intercept[AnalysisException] {
-        evaluateWithoutCodegen("""a""" like(s"""${escapeChar}a""", escapeChar))
-      }
-      assert(invalidEscape.getMessage.contains("pattern"))
-      val endEscape = intercept[AnalysisException] {
-        evaluateWithoutCodegen("""a""" like(s"""a$escapeChar""", escapeChar))
-      }
-      assert(endEscape.getMessage.contains("pattern"))
+      checkError(
+        exception = intercept[AnalysisException] {
+          evaluateWithoutCodegen("""a""" like(s"""${escapeChar}a""", escapeChar))
+        },
+        errorClass = "INVALID_FORMAT.ESC_IN_THE_MIDDLE",
+        parameters = Map("format" -> s"'${escapeChar}a'", "char" -> "'a'"))
 
       // case
       checkLiteralRow("A" like(_, escapeChar), "a%", false)