Skip to content

Commit

Permalink
[SPARK-41174][CORE][SQL] Propagate an error class to users for invali…
Browse files Browse the repository at this point in the history
…d `format` of `to_binary()`

### What changes were proposed in this pull request?
This pr overrides the `checkInputDataTypes()` method of `ToBinary` function to propagate error class to users for invalid `format`.

### Why are the changes needed?
Migration onto error classes unifies Spark SQL error messages.

### Does this PR introduce _any_ user-facing change?
Yes. The PR changes user-facing error messages.

### How was this patch tested?
Pass GitHub Actions

Closes apache#38737 from LuciferYang/SPARK-41174.

Authored-by: yangjie01 <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
  • Loading branch information
LuciferYang authored and beliefer committed Dec 18, 2022
1 parent b675081 commit 3ce9a07
Show file tree
Hide file tree
Showing 6 changed files with 204 additions and 45 deletions.
5 changes: 5 additions & 0 deletions core/src/main/resources/error/error-classes.json
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,11 @@
"Input to the function <functionName> cannot contain elements of the \"MAP\" type. In Spark, same maps may have different hashcode, thus hash expressions are prohibited on \"MAP\" elements. To restore previous behavior set \"spark.sql.legacy.allowHashOnMapType\" to \"true\"."
]
},
"INVALID_ARG_VALUE" : {
"message" : [
"The <inputName> value must to be a <requireType> literal of <validValues>, but got <inputValue>."
]
},
"INVALID_JSON_MAP_KEY_TYPE" : {
"message" : [
"Input schema <schema> can only contain STRING as a key type for a MAP."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2620,46 +2620,81 @@ case class ToBinary(
nullOnInvalidFormat: Boolean = false) extends RuntimeReplaceable
with ImplicitCastInputTypes {

override lazy val replacement: Expression = format.map { f =>
assert(f.foldable && (f.dataType == StringType || f.dataType == NullType))
@transient lazy val fmt: String = format.map { f =>
val value = f.eval()
if (value == null) {
Literal(null, BinaryType)
null
} else {
value.asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT) match {
case "hex" => Unhex(expr, failOnError = true)
case "utf-8" | "utf8" => Encode(expr, Literal("UTF-8"))
case "base64" => UnBase64(expr, failOnError = true)
case _ if nullOnInvalidFormat => Literal(null, BinaryType)
case other => throw QueryCompilationErrors.invalidStringLiteralParameter(
"to_binary",
"format",
other,
Some(
"The value has to be a case-insensitive string literal of " +
"'hex', 'utf-8', 'utf8', or 'base64'."))
}
value.asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT)
}
}.getOrElse("hex")

override lazy val replacement: Expression = if (fmt == null) {
Literal(null, BinaryType)
} else {
fmt match {
case "hex" => Unhex(expr, failOnError = true)
case "utf-8" | "utf8" => Encode(expr, Literal("UTF-8"))
case "base64" => UnBase64(expr, failOnError = true)
case _ => Literal(null, BinaryType)
}
}.getOrElse(Unhex(expr, failOnError = true))
}

def this(expr: Expression) = this(expr, None, false)

def this(expr: Expression, format: Expression) =
this(expr, Some({
// We perform this check in the constructor to make it eager and not go through type coercion.
if (format.foldable && (format.dataType == StringType || format.dataType == NullType)) {
format
} else {
throw QueryCompilationErrors.requireLiteralParameter("to_binary", "format", "string")
}
}), false)
this(expr, Some(format), false)

override def prettyName: String = "to_binary"

override def children: Seq[Expression] = expr +: format.toSeq

override def inputTypes: Seq[AbstractDataType] = children.map(_ => StringType)

override def checkInputDataTypes(): TypeCheckResult = {
def isValidFormat: Boolean = {
fmt == null || Set("hex", "utf-8", "utf8", "base64").contains(fmt)
}
format match {
case Some(f) =>
if (f.foldable && (f.dataType == StringType || f.dataType == NullType)) {
if (isValidFormat || nullOnInvalidFormat) {
super.checkInputDataTypes()
} else {
DataTypeMismatch(
errorSubClass = "INVALID_ARG_VALUE",
messageParameters = Map(
"inputName" -> "fmt",
"requireType" -> s"case-insensitive ${toSQLType(StringType)}",
"validValues" -> "'hex', 'utf-8', 'utf8', or 'base64'",
"inputValue" -> toSQLValue(fmt, StringType)
)
)
}
} else if (!f.foldable) {
DataTypeMismatch(
errorSubClass = "NON_FOLDABLE_INPUT",
messageParameters = Map(
"inputName" -> "fmt",
"inputType" -> toSQLType(StringType),
"inputExpr" -> toSQLExpr(f)
)
)
} else {
DataTypeMismatch(
errorSubClass = "INVALID_ARG_VALUE",
messageParameters = Map(
"inputName" -> "fmt",
"requireType" -> s"case-insensitive ${toSQLType(StringType)}",
"validValues" -> "'hex', 'utf-8', 'utf8', or 'base64'",
"inputValue" -> toSQLValue(f.eval(), f.dataType)
)
)
}
case _ => super.checkInputDataTypes()
}
}

override protected def withNewChildrenInternal(
newChildren: IndexedSeq[Expression]): Expression = {
if (format.isDefined) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1256,6 +1256,21 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
)
}

test("ToBinary: fails analysis if fmt is not foldable") {
val wrongFmt = AttributeReference("invalidFormat", StringType)()
val toBinaryExpr = ToBinary(Literal("abc"), Some(wrongFmt))
assert(toBinaryExpr.checkInputDataTypes() ==
DataTypeMismatch(
errorSubClass = "NON_FOLDABLE_INPUT",
messageParameters = Map(
"inputName" -> "fmt",
"inputType" -> toSQLType(wrongFmt.dataType),
"inputExpr" -> toSQLExpr(wrongFmt)
)
)
)
}

test("ToNumber: negative tests (the input string does not match the format string)") {
Seq(
// The input contained more thousands separators than the format string.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -225,3 +225,7 @@ select to_binary(null, cast(null as string));
-- invalid format
select to_binary('abc', 1);
select to_binary('abc', 'invalidFormat');
CREATE TEMPORARY VIEW fmtTable(fmtField) AS SELECT * FROM VALUES ('invalidFormat');
SELECT to_binary('abc', fmtField) FROM fmtTable;
-- Clean up
DROP VIEW IF EXISTS fmtTable;
Original file line number Diff line number Diff line change
Expand Up @@ -1610,11 +1610,13 @@ struct<>
-- !query output
org.apache.spark.sql.AnalysisException
{
"errorClass" : "_LEGACY_ERROR_TEMP_1100",
"errorClass" : "DATATYPE_MISMATCH.INVALID_ARG_VALUE",
"messageParameters" : {
"argName" : "format",
"funcName" : "to_binary",
"requiredType" : "string"
"inputName" : "fmt",
"inputValue" : "'1'",
"requireType" : "case-insensitive \"STRING\"",
"sqlExpr" : "\"to_binary(abc, 1)\"",
"validValues" : "'hex', 'utf-8', 'utf8', or 'base64'"
},
"queryContext" : [ {
"objectType" : "",
Expand All @@ -1633,11 +1635,59 @@ struct<>
-- !query output
org.apache.spark.sql.AnalysisException
{
"errorClass" : "_LEGACY_ERROR_TEMP_1101",
"errorClass" : "DATATYPE_MISMATCH.INVALID_ARG_VALUE",
"messageParameters" : {
"argName" : "format",
"endingMsg" : " The value has to be a case-insensitive string literal of 'hex', 'utf-8', 'utf8', or 'base64'.",
"funcName" : "to_binary",
"invalidValue" : "invalidformat"
}
"inputName" : "fmt",
"inputValue" : "'invalidformat'",
"requireType" : "case-insensitive \"STRING\"",
"sqlExpr" : "\"to_binary(abc, invalidFormat)\"",
"validValues" : "'hex', 'utf-8', 'utf8', or 'base64'"
},
"queryContext" : [ {
"objectType" : "",
"objectName" : "",
"startIndex" : 8,
"stopIndex" : 40,
"fragment" : "to_binary('abc', 'invalidFormat')"
} ]
}


-- !query
CREATE TEMPORARY VIEW fmtTable(fmtField) AS SELECT * FROM VALUES ('invalidFormat')
-- !query schema
struct<>
-- !query output



-- !query
SELECT to_binary('abc', fmtField) FROM fmtTable
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
{
"errorClass" : "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT",
"messageParameters" : {
"inputExpr" : "\"fmtField\"",
"inputName" : "fmt",
"inputType" : "\"STRING\"",
"sqlExpr" : "\"to_binary(abc, fmtField)\""
},
"queryContext" : [ {
"objectType" : "",
"objectName" : "",
"startIndex" : 8,
"stopIndex" : 33,
"fragment" : "to_binary('abc', fmtField)"
} ]
}


-- !query
DROP VIEW IF EXISTS fmtTable
-- !query schema
struct<>
-- !query output

Original file line number Diff line number Diff line change
Expand Up @@ -1542,11 +1542,13 @@ struct<>
-- !query output
org.apache.spark.sql.AnalysisException
{
"errorClass" : "_LEGACY_ERROR_TEMP_1100",
"errorClass" : "DATATYPE_MISMATCH.INVALID_ARG_VALUE",
"messageParameters" : {
"argName" : "format",
"funcName" : "to_binary",
"requiredType" : "string"
"inputName" : "fmt",
"inputValue" : "'1'",
"requireType" : "case-insensitive \"STRING\"",
"sqlExpr" : "\"to_binary(abc, 1)\"",
"validValues" : "'hex', 'utf-8', 'utf8', or 'base64'"
},
"queryContext" : [ {
"objectType" : "",
Expand All @@ -1565,11 +1567,59 @@ struct<>
-- !query output
org.apache.spark.sql.AnalysisException
{
"errorClass" : "_LEGACY_ERROR_TEMP_1101",
"errorClass" : "DATATYPE_MISMATCH.INVALID_ARG_VALUE",
"messageParameters" : {
"argName" : "format",
"endingMsg" : " The value has to be a case-insensitive string literal of 'hex', 'utf-8', 'utf8', or 'base64'.",
"funcName" : "to_binary",
"invalidValue" : "invalidformat"
}
"inputName" : "fmt",
"inputValue" : "'invalidformat'",
"requireType" : "case-insensitive \"STRING\"",
"sqlExpr" : "\"to_binary(abc, invalidFormat)\"",
"validValues" : "'hex', 'utf-8', 'utf8', or 'base64'"
},
"queryContext" : [ {
"objectType" : "",
"objectName" : "",
"startIndex" : 8,
"stopIndex" : 40,
"fragment" : "to_binary('abc', 'invalidFormat')"
} ]
}


-- !query
CREATE TEMPORARY VIEW fmtTable(fmtField) AS SELECT * FROM VALUES ('invalidFormat')
-- !query schema
struct<>
-- !query output



-- !query
SELECT to_binary('abc', fmtField) FROM fmtTable
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
{
"errorClass" : "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT",
"messageParameters" : {
"inputExpr" : "\"fmtField\"",
"inputName" : "fmt",
"inputType" : "\"STRING\"",
"sqlExpr" : "\"to_binary(abc, fmtField)\""
},
"queryContext" : [ {
"objectType" : "",
"objectName" : "",
"startIndex" : 8,
"stopIndex" : 33,
"fragment" : "to_binary('abc', fmtField)"
} ]
}


-- !query
DROP VIEW IF EXISTS fmtTable
-- !query schema
struct<>
-- !query output

0 comments on commit 3ce9a07

Please sign in to comment.