[SPARK-50579][SQL] Fix truncatedString
### What changes were proposed in this pull request?
In the PR, I propose to respect the `maxFields` argument of `SparkStringUtils.truncatedString` in all cases, independently of the size of the other argument `seq`. If the size of `seq` is greater than or equal to `maxFields`, output exactly `maxFields` elements of `seq`.

Also, if the number of printed elements is zero, don't print a redundant leading separator (as in `[, ... 100 more fields]`), and don't overflow when `maxFields` is `Int.MinValue`.

### Why are the changes needed?
To make output consistent for the same `maxFields`. For example, if `maxFields` is 2:
```scala
truncatedString(Seq(1, 2), "[", ", ", "]", maxFields = 2) -> "[1, 2]"
```
but for an input sequence with more elements, it prints only one:
```scala
truncatedString(Seq(1, 2, 3), "[", ", ", "]", maxFields = 2) -> "[1, ... 2 more fields]"
```
though the expected output is **"[1, 2, ... 1 more fields]"**.
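With the fix, the function prints exactly `maxFields` elements whenever the input has at least that many, and a non-positive `maxFields` prints no elements at all. A sketch of the behavior after this change (outputs derived from the fixed logic in the diff below):
```scala
truncatedString(Seq(1, 2, 3), "[", ", ", "]", maxFields = 2)  // "[1, 2, ... 1 more fields]"
truncatedString(Seq(1, 2, 3), "[", ", ", "]", maxFields = 0)  // "[... 3 more fields]"
truncatedString(Seq(1, 2, 3), "[", ", ", "]", Int.MinValue)   // "[... 3 more fields]" (no overflow)
```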

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
By existing tests from `UtilSuite`, which were moved to `StringUtilsSuite`, and by additional checks:
```
$ build/sbt "test:testOnly *StringUtilsSuite"
```

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #49187 from MaxGekk/fix-truncatedString.

Authored-by: Max Gekk <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
MaxGekk authored and HyukjinKwon committed Dec 16, 2024
1 parent d293ba6 commit 0faf9d5
Showing 12 changed files with 50 additions and 65 deletions.
@@ -73,7 +73,7 @@ object SparkStringUtils extends Logging {
 
   /**
    * Format a sequence with semantics similar to calling .mkString(). Any elements beyond
-   * maxNumToStringFields will be dropped and replaced by a "... N more fields" placeholder.
+   * `maxFields` will be dropped and replaced by a "... N more fields" placeholder.
    *
    * @return
    *   the trimmed and formatted string.
@@ -90,10 +90,11 @@
           "Truncated the string representation of a plan since it was too large. This " +
             s"behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.")
       }
-      val numFields = math.max(0, maxFields - 1)
-      seq
-        .take(numFields)
-        .mkString(start, sep, sep + "... " + (seq.length - numFields) + " more fields" + end)
+      val numFields = math.max(0, maxFields)
+      val restNum = seq.length - numFields
+      val ending = (if (numFields == 0) "" else sep) +
+        (if (restNum == 0) "" else s"... $restNum more fields") + end
+      seq.take(numFields).mkString(start, sep, ending)
     } else {
       seq.mkString(start, sep, end)
     }
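For readers skimming the diff: taken out of context, the fixed branch reduces to the self-contained sketch below. The helper name `truncated` is hypothetical, introduced only for illustration; the real method is `SparkStringUtils.truncatedString` above, which additionally prints a one-time truncation warning.
```scala
// Standalone sketch of the fixed truncation logic (illustrative only).
def truncated[T](seq: Seq[T], start: String, sep: String, end: String, maxFields: Int): String = {
  if (seq.length > maxFields) {
    // Clamp at zero: maxFields = Int.MinValue no longer overflows
    // the way the old `maxFields - 1` did.
    val numFields = math.max(0, maxFields)
    val restNum = seq.length - numFields
    // Skip the separator when no element is printed (avoids "[, ... N more fields]"),
    // and skip the "... N more fields" tail when nothing remains to report.
    val ending = (if (numFields == 0) "" else sep) +
      (if (restNum == 0) "" else s"... $restNum more fields") + end
    seq.take(numFields).mkString(start, sep, ending)
  } else {
    seq.mkString(start, sep, end)
  }
}

truncated(Seq("a", "b"), "", ", ", "", 1)  // "a, ... 1 more fields"
truncated(Seq("a", "b"), "", ", ", "", 2)  // "a, b"
```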
@@ -136,4 +136,19 @@ class StringUtilsSuite extends SparkFunSuite with SQLHelper {
     val expectedOutput = Seq("`c1`", "`v2.c2`", "`v1`.`c2`")
     assert(orderSuggestedIdentifiersBySimilarity(baseString, testStrings) === expectedOutput)
   }
+
+  test("SPARK-50579: truncated string") {
+    assert(truncatedString(Seq.empty, ", ", -1) === "")
+    assert(truncatedString(Seq("a"), ", ", -1) === "... 1 more fields")
+    assert(truncatedString(Seq("B"), "(", ", ", ")", -1) === "(... 1 more fields)")
+    assert(truncatedString(Seq.empty, ", ", 0) === "")
+    assert(truncatedString(Seq.empty, "[", ", ", "]", 0) === "[]")
+    assert(truncatedString(Seq("a", "b"), ", ", 0) === "... 2 more fields")
+    assert(truncatedString(Seq.empty, ",", 1) === "")
+    assert(truncatedString(Seq("a"), ",", 1) === "a")
+    assert(truncatedString(Seq("a", "b"), ", ", 1) === "a, ... 1 more fields")
+    assert(truncatedString(Seq("a", "b"), ", ", 2) === "a, b")
+    assert(truncatedString(Seq("a", "b", "c"), ", ", Int.MaxValue) === "a, b, c")
+    assert(truncatedString(Seq("a", "b", "c"), ", ", Int.MinValue) === "... 3 more fields")
+  }
 }

This file was deleted.

@@ -1,2 +1,2 @@
-Project [id#0L, id#0L, true AS true#0, 68 AS 68#0, 9872 AS 9872#0, -8726532 AS -8726532#0, 7834609328726532 AS 7834609328726532#0L, 2.718281828459045 AS 2.718281828459045#0, -0.8 AS -0.8#0, 89.97620 AS 89.97620#0, 89889.7667231 AS 89889.7667231#0, connect! AS connect!#0, T AS T#0, ABCDEFGHIJ AS ABCDEFGHIJ#0, 0x78797A7B7C7D7E7F808182838485868788898A8B8C8D8E AS X'78797A7B7C7D7E7F808182838485868788898A8B8C8D8E'#0, 0x0806 AS X'0806'#0, [8,6] AS ARRAY(8, 6)#0, null AS NULL#0, 2020-10-10 AS DATE '2020-10-10'#0, 8.997620 AS 8.997620#0, 2023-02-23 04:31:59.808 AS TIMESTAMP '2023-02-23 04:31:59.808'#0, 1969-12-31 16:00:12.345 AS TIMESTAMP '1969-12-31 16:00:12.345'#0, 2023-02-23 20:36:00 AS TIMESTAMP_NTZ '2023-02-23 20:36:00'#0, 2023-02-23 AS DATE '2023-02-23'#0, ... 3 more fields]
+Project [id#0L, id#0L, true AS true#0, 68 AS 68#0, 9872 AS 9872#0, -8726532 AS -8726532#0, 7834609328726532 AS 7834609328726532#0L, 2.718281828459045 AS 2.718281828459045#0, -0.8 AS -0.8#0, 89.97620 AS 89.97620#0, 89889.7667231 AS 89889.7667231#0, connect! AS connect!#0, T AS T#0, ABCDEFGHIJ AS ABCDEFGHIJ#0, 0x78797A7B7C7D7E7F808182838485868788898A8B8C8D8E AS X'78797A7B7C7D7E7F808182838485868788898A8B8C8D8E'#0, 0x0806 AS X'0806'#0, [8,6] AS ARRAY(8, 6)#0, null AS NULL#0, 2020-10-10 AS DATE '2020-10-10'#0, 8.997620 AS 8.997620#0, 2023-02-23 04:31:59.808 AS TIMESTAMP '2023-02-23 04:31:59.808'#0, 1969-12-31 16:00:12.345 AS TIMESTAMP '1969-12-31 16:00:12.345'#0, 2023-02-23 20:36:00 AS TIMESTAMP_NTZ '2023-02-23 20:36:00'#0, 2023-02-23 AS DATE '2023-02-23'#0, INTERVAL '0 00:03:20' DAY TO SECOND AS INTERVAL '0 00:03:20' DAY TO SECOND#0, ... 2 more fields]
 +- LocalRelation <empty>, [id#0L, a#0, b#0]
@@ -1,2 +1,2 @@
-Project [id#0L, id#0L, 1 AS 1#0, null AS NULL#0, true AS true#0, 68 AS 68#0, 9872 AS 9872#0, -8726532 AS -8726532#0, 7834609328726532 AS 7834609328726532#0L, 2.718281828459045 AS 2.718281828459045#0, -0.8 AS -0.8#0, 89.97620 AS 89.97620#0, 89889.7667231 AS 89889.7667231#0, connect! AS connect!#0, T AS T#0, ABCDEFGHIJ AS ABCDEFGHIJ#0, 0x78797A7B7C7D7E7F808182838485868788898A8B8C8D8E AS X'78797A7B7C7D7E7F808182838485868788898A8B8C8D8E'#0, 0x0806 AS X'0806'#0, [8,6] AS ARRAY(8, 6)#0, null AS NULL#0, 2020-10-10 AS DATE '2020-10-10'#0, 8.997620 AS 8.997620#0, 2023-02-23 04:31:59.808 AS TIMESTAMP '2023-02-23 04:31:59.808'#0, 1969-12-31 16:00:12.345 AS TIMESTAMP '1969-12-31 16:00:12.345'#0, ... 19 more fields]
+Project [id#0L, id#0L, 1 AS 1#0, null AS NULL#0, true AS true#0, 68 AS 68#0, 9872 AS 9872#0, -8726532 AS -8726532#0, 7834609328726532 AS 7834609328726532#0L, 2.718281828459045 AS 2.718281828459045#0, -0.8 AS -0.8#0, 89.97620 AS 89.97620#0, 89889.7667231 AS 89889.7667231#0, connect! AS connect!#0, T AS T#0, ABCDEFGHIJ AS ABCDEFGHIJ#0, 0x78797A7B7C7D7E7F808182838485868788898A8B8C8D8E AS X'78797A7B7C7D7E7F808182838485868788898A8B8C8D8E'#0, 0x0806 AS X'0806'#0, [8,6] AS ARRAY(8, 6)#0, null AS NULL#0, 2020-10-10 AS DATE '2020-10-10'#0, 8.997620 AS 8.997620#0, 2023-02-23 04:31:59.808 AS TIMESTAMP '2023-02-23 04:31:59.808'#0, 1969-12-31 16:00:12.345 AS TIMESTAMP '1969-12-31 16:00:12.345'#0, 2023-02-23 20:36:00 AS TIMESTAMP_NTZ '2023-02-23 20:36:00'#0, ... 18 more fields]
 +- LocalRelation <empty>, [id#0L, a#0, b#0]
@@ -630,7 +630,7 @@ Aggregate [a#x, b#x, spark_grouping_id#xL, _gen_grouping_pos#x], [a#x, b#x, coun
 SELECT a, b, count(1) FROM testData GROUP BY a, CUBE(a, b), ROLLUP(a, b), GROUPING SETS((a, b), (a), ())
 -- !query analysis
 Aggregate [a#x, b#x, spark_grouping_id#xL, _gen_grouping_pos#x], [a#x, b#x, count(1) AS count(1)#xL]
-+- Expand [[a#x, b#x, a#x, b#x, 0, 0], [a#x, b#x, a#x, b#x, 0, 1], [a#x, b#x, a#x, b#x, 0, 2], [a#x, b#x, a#x, b#x, 0, 3], [a#x, b#x, a#x, b#x, 0, 4], [a#x, b#x, a#x, b#x, 0, 5], [a#x, b#x, a#x, b#x, 0, 6], [a#x, b#x, a#x, b#x, 0, 7], [a#x, b#x, a#x, b#x, 0, 8], [a#x, b#x, a#x, b#x, 0, 9], [a#x, b#x, a#x, b#x, 0, 10], [a#x, b#x, a#x, b#x, 0, 11], [a#x, b#x, a#x, b#x, 0, 12], [a#x, b#x, a#x, null, 1, 13], [a#x, b#x, a#x, null, 1, 14], [a#x, b#x, a#x, b#x, 0, 15], [a#x, b#x, a#x, null, 1, 16], [a#x, b#x, a#x, null, 1, 17], [a#x, b#x, a#x, b#x, 0, 18], [a#x, b#x, a#x, b#x, 0, 19], [a#x, b#x, a#x, b#x, 0, 20], [a#x, b#x, a#x, b#x, 0, 21], [a#x, b#x, a#x, b#x, 0, 22], [a#x, b#x, a#x, b#x, 0, 23], ... 12 more fields], [a#x, b#x, a#x, b#x, spark_grouping_id#xL, _gen_grouping_pos#x]
++- Expand [[a#x, b#x, a#x, b#x, 0, 0], [a#x, b#x, a#x, b#x, 0, 1], [a#x, b#x, a#x, b#x, 0, 2], [a#x, b#x, a#x, b#x, 0, 3], [a#x, b#x, a#x, b#x, 0, 4], [a#x, b#x, a#x, b#x, 0, 5], [a#x, b#x, a#x, b#x, 0, 6], [a#x, b#x, a#x, b#x, 0, 7], [a#x, b#x, a#x, b#x, 0, 8], [a#x, b#x, a#x, b#x, 0, 9], [a#x, b#x, a#x, b#x, 0, 10], [a#x, b#x, a#x, b#x, 0, 11], [a#x, b#x, a#x, b#x, 0, 12], [a#x, b#x, a#x, null, 1, 13], [a#x, b#x, a#x, null, 1, 14], [a#x, b#x, a#x, b#x, 0, 15], [a#x, b#x, a#x, null, 1, 16], [a#x, b#x, a#x, null, 1, 17], [a#x, b#x, a#x, b#x, 0, 18], [a#x, b#x, a#x, b#x, 0, 19], [a#x, b#x, a#x, b#x, 0, 20], [a#x, b#x, a#x, b#x, 0, 21], [a#x, b#x, a#x, b#x, 0, 22], [a#x, b#x, a#x, b#x, 0, 23], [a#x, b#x, a#x, b#x, 0, 24], ... 11 more fields], [a#x, b#x, a#x, b#x, spark_grouping_id#xL, _gen_grouping_pos#x]
    +- Project [a#x, b#x, a#x AS a#x, b#x AS b#x]
       +- SubqueryAlias testdata
          +- View (`testData`, [a#x, b#x])
@@ -1326,7 +1326,7 @@ Aggregate [count(1) AS count(1)#xL]
 +- Filter unique1#x IN (list#x [])
    :  +- Project [unique1#x]
    :     +- Filter (unique2#x = 42)
-   :        +- Project [unique1#x, unique2#x, two#x, four#x, ten#x, twenty#x, hundred#x, thousand#x, twothousand#x, fivethous#x, tenthous#x, odd#x, even#x, stringu1#x, stringu2#x, string4#x, unique2#x, two#x, four#x, ten#x, twenty#x, hundred#x, thousand#x, twothousand#x, ... 7 more fields]
+   :        +- Project [unique1#x, unique2#x, two#x, four#x, ten#x, twenty#x, hundred#x, thousand#x, twothousand#x, fivethous#x, tenthous#x, odd#x, even#x, stringu1#x, stringu2#x, string4#x, unique2#x, two#x, four#x, ten#x, twenty#x, hundred#x, thousand#x, twothousand#x, fivethous#x, ... 6 more fields]
    :           +- Join Inner, (unique1#x = unique1#x)
    :              :- SubqueryAlias b
    :              :  +- SubqueryAlias spark_catalog.default.tenk1