Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/contributor-guide/spark_expressions_support.md
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,7 @@
- [x] contains
- [ ] decode
- [ ] elt
- [ ] encode
- [x] encode
- [x] endswith
- [ ] find_in_set
- [ ] format_number
Expand Down
3 changes: 2 additions & 1 deletion docs/source/user-guide/latest/expressions.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,14 @@ of expressions that can be disabled.
## String Functions

| Expression |
| --------------- |
|-----------------|
| Ascii |
| BitLength |
| Chr |
| Concat |
| ConcatWs |
| Contains |
| Encode |
| EndsWith |
| InitCap |
| Left |
Expand Down
29 changes: 29 additions & 0 deletions spark/src/main/scala/org/apache/comet/serde/strings.scala
Original file line number Diff line number Diff line change
Expand Up @@ -495,4 +495,33 @@ trait CommonStringExprs {
None
}
}

/**
 * Serializes a Spark `encode(value, charset)` call.
 *
 * Only a string-literal charset equal to "utf-8" (compared case-insensitively) is handled; in
 * that case `value` is serialized and wrapped in a cast to `BinaryType`. For every other
 * charset expression, including a non-literal or a NULL literal, `withInfo` is called on `expr`
 * with an explanatory message and `None` is returned.
 *
 * @param expr
 *   the original expression, passed to `withInfo` and to the cast serializer
 * @param charset
 *   the charset argument of the encode call
 * @param value
 *   the string argument of the encode call
 * @param inputs
 *   forwarded unchanged to `exprToProtoInternal`
 * @param binding
 *   forwarded unchanged to `exprToProtoInternal`
 * @return
 *   the result of `CometCast.castToProto` for the supported case, otherwise `None`
 */
def stringEncode(
    expr: Expression,
    charset: Expression,
    value: Expression,
    inputs: Seq[Attribute],
    binding: Boolean): Option[Expr] = {
  charset match {
    // A NULL charset literal carries a null value, so guard before calling toString;
    // without the guard this pattern would throw instead of reaching the fallback arm.
    case Literal(str, DataTypes.StringType)
        if str != null && str.toString.toLowerCase(Locale.ROOT) == "utf-8" =>
      // encode(col, 'utf-8') is byte-equivalent to cast(string AS binary)
      // because Spark's UTF8String already holds valid UTF-8 bytes.
      exprToProtoInternal(value, inputs, binding) match {
        case Some(strExpr) =>
          CometCast.castToProto(
            expr,
            None,
            DataTypes.BinaryType,
            strExpr,
            CometEvalMode.LEGACY)
        case None =>
          // The child could not be serialized; pass it to withInfo alongside expr.
          withInfo(expr, value)
          None
      }
    case _ =>
      withInfo(expr, "Comet only supports encoding with 'utf-8'.")
      None
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ trait CometExprShim extends CommonStringExprs {
case s: StringDecode =>
// Right child is the encoding expression.
stringDecode(expr, s.charset, s.bin, inputs, binding)

case e: Encode =>
stringEncode(expr, e.charset, e.value, inputs, binding)
case _ => None
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ trait CometExprShim extends CommonStringExprs {
case s: StringDecode =>
// Right child is the encoding expression.
stringDecode(expr, s.charset, s.bin, inputs, binding)

case e: Encode =>
stringEncode(expr, e.charset, e.value, inputs, binding)
case expr @ ToPrettyString(child, timeZoneId) =>
val castSupported = CometCast.isSupported(
child.dataType,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,19 @@ trait CometExprShim extends CommonStringExprs {
val Seq(bin, charset, _, _) = s.arguments
stringDecode(expr, charset, bin, inputs, binding)

case s: StaticInvoke
if s.staticObject == classOf[Encode] &&
s.dataType.isInstanceOf[BinaryType] &&
s.functionName == "encode" &&
s.arguments.size == 4 &&
s.inputTypes == Seq(
StringTypeWithCollation(supportsTrimCollation = true),
StringTypeWithCollation(supportsTrimCollation = true),
BooleanType,
BooleanType) =>
val Seq(value, charset, _, _) = s.arguments
stringEncode(expr, charset, value, inputs, binding)
Comment thread
YutaLin marked this conversation as resolved.
Outdated

case expr @ ToPrettyString(child, timeZoneId) =>
val castSupported = CometCast.isSupported(
child.dataType,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,19 @@ trait CometExprShim extends CommonStringExprs {
val Seq(bin, charset, _, _) = s.arguments
stringDecode(expr, charset, bin, inputs, binding)

case s: StaticInvoke
if s.staticObject == classOf[Encode] &&
s.dataType.isInstanceOf[BinaryType] &&
s.functionName == "encode" &&
s.arguments.size == 4 &&
s.inputTypes == Seq(
StringTypeWithCollation(supportsTrimCollation = true),
StringTypeWithCollation(supportsTrimCollation = true),
BooleanType,
BooleanType) =>
val Seq(value, charset, _, _) = s.arguments
stringEncode(expr, charset, value, inputs, binding)

case expr @ ToPrettyString(child, timeZoneId) =>
val castSupported = CometCast.isSupported(
child.dataType,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,19 @@ trait CometExprShim extends CommonStringExprs {
val Seq(bin, charset, _, _) = s.arguments
stringDecode(expr, charset, bin, inputs, binding)

case s: StaticInvoke
if s.staticObject == classOf[Encode] &&
s.dataType.isInstanceOf[BinaryType] &&
s.functionName == "encode" &&
s.arguments.size == 4 &&
s.inputTypes == Seq(
StringTypeWithCollation(supportsTrimCollation = true),
StringTypeWithCollation(supportsTrimCollation = true),
BooleanType,
BooleanType) =>
val Seq(value, charset, _, _) = s.arguments
stringEncode(expr, charset, value, inputs, binding)

case expr @ ToPrettyString(child, timeZoneId) =>
val castSupported = CometCast.isSupported(
child.dataType,
Expand Down
61 changes: 61 additions & 0 deletions spark/src/test/resources/sql-tests/expressions/string/encode.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
-- Licensed to the Apache Software Foundation (ASF) under one
-- or more contributor license agreements. See the NOTICE file
-- distributed with this work for additional information
-- regarding copyright ownership. The ASF licenses this file
-- to you under the Apache License, Version 2.0 (the
-- "License"); you may not use this file except in compliance
-- with the License. You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing,
-- software distributed under the License is distributed on an
-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-- KIND, either express or implied. See the License for the
-- specific language governing permissions and limitations
-- under the License.

-- Tests for the SQL `encode(str, charset)` function.
--
-- Spark 3.x: Encode is a BinaryExpression(value, charset).
-- Spark 4.x+: Encode is RuntimeReplaceable; the analyzer rewrites it to
-- StaticInvoke(classOf[Encode], BinaryType, "encode", ...)

statement
CREATE TABLE test_encode_utf8(s string) USING parquet

statement
INSERT INTO test_encode_utf8 VALUES ('hello'), ('world'), (''), ('café'), (NULL)

query
SELECT encode(s, 'utf-8') FROM test_encode_utf8

query
SELECT encode(s, 'UTF-8') FROM test_encode_utf8

-- Mixed-case charset literal exercises toLowerCase normalization
query
SELECT encode(s, 'Utf-8') FROM test_encode_utf8

query
SELECT encode('hello', 'utf-8'), encode('', 'utf-8'), encode(CAST(NULL AS STRING), 'utf-8')

-- Non-ASCII input in different languages (French, Japanese)
query
SELECT encode('café', 'utf-8'), encode('日本語', 'utf-8')

-- non-UTF-8 falls back to Spark JVM
statement
CREATE TABLE test_encode_charset_safe(s string) USING parquet

statement
INSERT INTO test_encode_charset_safe VALUES ('hello'), ('world'), (''), (NULL)

query expect_fallback(Comet only supports encoding with 'utf-8'.)
SELECT encode(s, 'UTF-16BE') FROM test_encode_charset_safe

query expect_fallback(Comet only supports encoding with 'utf-8'.)
SELECT encode(s, 'US-ASCII') FROM test_encode_charset_safe

query expect_fallback(Comet only supports encoding with 'utf-8'.)
SELECT encode(s, 'ISO-8859-1') FROM test_encode_charset_safe