From 756e5f3c268f5ca8f054e6f51c777f6ad81c6d4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Sun, 24 May 2026 10:52:30 +0000 Subject: [PATCH 01/58] Add classic.SparkSessionProvider --- .../sql/classic/SparkSessionProvider.scala | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionProvider.scala diff --git a/sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionProvider.scala b/sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionProvider.scala new file mode 100644 index 0000000000000..e459250f2d3f4 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionProvider.scala @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.classic + +import org.apache.spark.sql + +/** + * A common trait for test suites that require a classic [[SparkSession]]. 
+ */ +trait SparkSessionProvider extends sql.SparkSessionProvider { + override protected def spark: SparkSession +} From 3b2c7cdd8c65cec527a9a9775519d2de6534d8af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Thu, 28 May 2026 18:35:32 +0000 Subject: [PATCH 02/58] Move test.SharedSparkSessionBase functionality to sql.SharedSparkSession --- .../apache/spark/sql/SharedSparkSession.scala | 147 ++++++++++++++++++ .../spark/sql/test/SharedSparkSession.scala | 126 +-------------- 2 files changed, 151 insertions(+), 122 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/SharedSparkSession.scala diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SharedSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/SharedSparkSession.scala new file mode 100644 index 0000000000000..2d34e6829ddd6 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/SharedSparkSession.scala @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql + +import scala.concurrent.duration._ + +import org.scalatest.{BeforeAndAfterEach, Suite} +import org.scalatest.concurrent.Eventually + +import org.apache.spark.{DebugFilesystem, SparkConf, SparkFunSuite} +import org.apache.spark.internal.config.UNSAFE_EXCEPTION_ON_MEMORY_LEAK +import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode +import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation +import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} +import org.apache.spark.sql.test.TestSparkSession + +trait SharedSparkSession + extends SparkFunSuite + with SparkSessionProvider + with BeforeAndAfterEach + with Eventually { + + protected def sparkConf = { + val conf = new SparkConf() + .set("spark.hadoop.fs.file.impl", classOf[DebugFilesystem].getName) + .set(UNSAFE_EXCEPTION_ON_MEMORY_LEAK, true) + .set(SQLConf.CODEGEN_FALLBACK.key, "false") + .set(SQLConf.CODEGEN_FACTORY_MODE.key, CodegenObjectFactoryMode.CODEGEN_ONLY.toString) + // Disable ConvertToLocalRelation for better test coverage. Test cases built on + // LocalRelation will exercise the optimization rules better by disabling it as + // this rule may potentially block testing of other optimization rules such as + // ConstantPropagation etc. 
+ .set(SQLConf.OPTIMIZER_EXCLUDED_RULES.key, ConvertToLocalRelation.ruleName) + conf.set( + StaticSQLConf.WAREHOUSE_PATH, + conf.get(StaticSQLConf.WAREHOUSE_PATH) + "/" + getClass.getCanonicalName) + conf.set(StaticSQLConf.LOAD_SESSION_EXTENSIONS_FROM_CLASSPATH, false) + conf.set(StaticSQLConf.SHUFFLE_EXCHANGE_MAX_THREAD_THRESHOLD, + sys.env.getOrElse("SPARK_TEST_SQL_SHUFFLE_EXCHANGE_MAX_THREAD_THRESHOLD", + StaticSQLConf.SHUFFLE_EXCHANGE_MAX_THREAD_THRESHOLD.defaultValueString).toInt) + conf.set(StaticSQLConf.RESULT_QUERY_STAGE_MAX_THREAD_THRESHOLD, + sys.env.getOrElse("SPARK_TEST_SQL_RESULT_QUERY_STAGE_MAX_THREAD_THRESHOLD", + StaticSQLConf.RESULT_QUERY_STAGE_MAX_THREAD_THRESHOLD.defaultValueString).toInt) + } + + /** + * The [[TestSparkSession]] to use for all tests in this suite. + * + * By default, the underlying [[org.apache.spark.SparkContext]] will be run in local + * mode with the default test configurations. + */ + private var _spark: classic.SparkSession = null + + protected override def spark: SparkSession = _spark + + /** + * The [[TestSQLContext]] to use for all tests in this suite. + */ + protected implicit def sqlContext: SQLContext = _spark.sqlContext + + protected def createSparkSession: classic.SparkSession = { + classic.SparkSession.cleanupAnyExistingSession() + new TestSparkSession(sparkConf) + } + + protected def sqlConf: SQLConf = _spark.sessionState.conf + + /** + * Initialize the [[TestSparkSession]]. Generally, this is just called from + * beforeAll; however, in test using styles other than FunSuite, there is + * often code that relies on the session between test group constructs and + * the actual tests, which may need this session. It is purely a semantic + * difference, but semantically, it makes more sense to call + * 'initializeSession' between a 'describe' and an 'it' call than it does to + * call 'beforeAll'. 
+ */ + protected def initializeSession(): Unit = { + if (_spark == null) { + _spark = createSparkSession + } + } + + /** + * Make sure the [[TestSparkSession]] is initialized before any tests are run. + */ + protected override def beforeAll(): Unit = { + initializeSession() + + // Ensure we have initialized the context before calling parent code + super.beforeAll() + } + + /** + * Stop the underlying [[org.apache.spark.SparkContext]], if any. + */ + protected override def afterAll(): Unit = { + try { + super.afterAll() + } finally { + try { + if (_spark != null) { + try { + _spark.sessionState.catalog.reset() + } finally { + _spark.stop() + _spark = null + } + } + } finally { + SparkSession.clearActiveSession() + SparkSession.clearDefaultSession() + } + } + } + + protected override def beforeEach(): Unit = { + super.beforeEach() + DebugFilesystem.clearOpenStreams() + } + + protected override def afterEach(): Unit = { + super.afterEach() + // Clear all persistent datasets after each test + _spark.sharedState.cacheManager.clearCache() + // files can be closed from other threads, so wait a bit + // normally this doesn't take more than 1s + eventually(timeout(10.seconds), interval(2.seconds)) { + DebugFilesystem.assertNoOpenStreams() + } + } + +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala index fb26d3311ebef..eccadab760665 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala @@ -19,15 +19,8 @@ package org.apache.spark.sql.test import scala.concurrent.duration._ -import org.scalatest.{BeforeAndAfterEach, Suite} -import org.scalatest.concurrent.Eventually - -import org.apache.spark.{DebugFilesystem, SparkConf} -import org.apache.spark.internal.config.UNSAFE_EXCEPTION_ON_MEMORY_LEAK +import org.apache.spark.sql import 
org.apache.spark.sql.{classic, QueryTest, QueryTestBase, SparkSession, SparkSessionProvider, SQLContext} -import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode -import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation -import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} trait SharedSparkSession extends QueryTest with SharedSparkSessionBase { @@ -87,119 +80,8 @@ trait SharedSparkSession extends QueryTest with SharedSparkSessionBase { */ trait SharedSparkSessionBase extends QueryTestBase - with SparkSessionProvider - with BeforeAndAfterEach - with Eventually { self: Suite => - - protected def sparkConf = { - val conf = new SparkConf() - .set("spark.hadoop.fs.file.impl", classOf[DebugFilesystem].getName) - .set(UNSAFE_EXCEPTION_ON_MEMORY_LEAK, true) - .set(SQLConf.CODEGEN_FALLBACK.key, "false") - .set(SQLConf.CODEGEN_FACTORY_MODE.key, CodegenObjectFactoryMode.CODEGEN_ONLY.toString) - // Disable ConvertToLocalRelation for better test coverage. Test cases built on - // LocalRelation will exercise the optimization rules better by disabling it as - // this rule may potentially block testing of other optimization rules such as - // ConstantPropagation etc. 
- .set(SQLConf.OPTIMIZER_EXCLUDED_RULES.key, ConvertToLocalRelation.ruleName) - conf.set( - StaticSQLConf.WAREHOUSE_PATH, - conf.get(StaticSQLConf.WAREHOUSE_PATH) + "/" + getClass.getCanonicalName) - conf.set(StaticSQLConf.LOAD_SESSION_EXTENSIONS_FROM_CLASSPATH, false) - conf.set(StaticSQLConf.SHUFFLE_EXCHANGE_MAX_THREAD_THRESHOLD, - sys.env.getOrElse("SPARK_TEST_SQL_SHUFFLE_EXCHANGE_MAX_THREAD_THRESHOLD", - StaticSQLConf.SHUFFLE_EXCHANGE_MAX_THREAD_THRESHOLD.defaultValueString).toInt) - conf.set(StaticSQLConf.RESULT_QUERY_STAGE_MAX_THREAD_THRESHOLD, - sys.env.getOrElse("SPARK_TEST_SQL_RESULT_QUERY_STAGE_MAX_THREAD_THRESHOLD", - StaticSQLConf.RESULT_QUERY_STAGE_MAX_THREAD_THRESHOLD.defaultValueString).toInt) - } - - /** - * The [[TestSparkSession]] to use for all tests in this suite. - * - * By default, the underlying [[org.apache.spark.SparkContext]] will be run in local - * mode with the default test configurations. - */ - private var _spark: TestSparkSession = null - - /** - * The [[TestSparkSession]] to use for all tests in this suite. - */ - protected override def spark: classic.SparkSession = _spark - - /** - * The [[TestSQLContext]] to use for all tests in this suite. - */ - protected implicit def sqlContext: SQLContext = _spark.sqlContext - - protected def createSparkSession: TestSparkSession = { - classic.SparkSession.cleanupAnyExistingSession() - new TestSparkSession(sparkConf) - } - - protected def sqlConf: SQLConf = _spark.sessionState.conf - - /** - * Initialize the [[TestSparkSession]]. Generally, this is just called from - * beforeAll; however, in test using styles other than FunSuite, there is - * often code that relies on the session between test group constructs and - * the actual tests, which may need this session. It is purely a semantic - * difference, but semantically, it makes more sense to call - * 'initializeSession' between a 'describe' and an 'it' call than it does to - * call 'beforeAll'. 
- */ - protected def initializeSession(): Unit = { - if (_spark == null) { - _spark = createSparkSession - } - } + with sql.SharedSparkSession { - /** - * Make sure the [[TestSparkSession]] is initialized before any tests are run. - */ - protected override def beforeAll(): Unit = { - initializeSession() - - // Ensure we have initialized the context before calling parent code - super.beforeAll() - } - - /** - * Stop the underlying [[org.apache.spark.SparkContext]], if any. - */ - protected override def afterAll(): Unit = { - try { - super.afterAll() - } finally { - try { - if (_spark != null) { - try { - _spark.sessionState.catalog.reset() - } finally { - _spark.stop() - _spark = null - } - } - } finally { - SparkSession.clearActiveSession() - SparkSession.clearDefaultSession() - } - } - } - - protected override def beforeEach(): Unit = { - super.beforeEach() - DebugFilesystem.clearOpenStreams() - } - - protected override def afterEach(): Unit = { - super.afterEach() - // Clear all persistent datasets after each test - spark.sharedState.cacheManager.clearCache() - // files can be closed from other threads, so wait a bit - // normally this doesn't take more than 1s - eventually(timeout(10.seconds), interval(2.seconds)) { - DebugFilesystem.assertNoOpenStreams() - } - } + protected override def spark: classic.SparkSession = + super.spark.asInstanceOf[classic.SparkSession] } From 67dd5f0d0bf5a36df9ea1a360eb62670b0a5f240 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Thu, 28 May 2026 18:49:56 +0000 Subject: [PATCH 03/58] [API CHANGE]: Move doThreadPreAudit, doThreadPostAudit to sql.SharedSparkSession This is technically an 'api change' as it moves the thread audit stuff from `test.SharedSparkSession` to `test.SharedSparkSessionBase`. This breaks code that implements `SharedSparkSessionBase` to circumvent the thread audit stuff. 
--- .../apache/spark/sql/SharedSparkSession.scala | 20 ++++++++++++++-- .../spark/sql/test/SharedSparkSession.scala | 23 ------------------- 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SharedSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/SharedSparkSession.scala index 2d34e6829ddd6..8cab22ac97182 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SharedSparkSession.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SharedSparkSession.scala @@ -95,10 +95,22 @@ trait SharedSparkSession } } + /** + * Suites extending [[SharedSparkSession]] are sharing resources (e.g. SparkSession) in their + * tests. That trait initializes the spark session in its [[beforeAll()]] implementation before + * the automatic thread snapshot is performed, so the audit code could fail to report threads + * leaked by that shared session. + * + * The behavior is overridden here to take the snapshot before the spark session is initialized. + */ + override protected val enableAutoThreadAudit = false + /** * Make sure the [[TestSparkSession]] is initialized before any tests are run. 
*/ protected override def beforeAll(): Unit = { + doThreadPreAudit() + initializeSession() // Ensure we have initialized the context before calling parent code @@ -122,8 +134,12 @@ trait SharedSparkSession } } } finally { - SparkSession.clearActiveSession() - SparkSession.clearDefaultSession() + try { + SparkSession.clearActiveSession() + SparkSession.clearDefaultSession() + } finally { + doThreadPostAudit() + } } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala index eccadab760665..283ef23bfba16 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala @@ -24,29 +24,6 @@ import org.apache.spark.sql.{classic, QueryTest, QueryTestBase, SparkSession, Sp trait SharedSparkSession extends QueryTest with SharedSparkSessionBase { - /** - * Suites extending [[SharedSparkSession]] are sharing resources (e.g. SparkSession) in their - * tests. That trait initializes the spark session in its [[beforeAll()]] implementation before - * the automatic thread snapshot is performed, so the audit code could fail to report threads - * leaked by that shared session. - * - * The behavior is overridden here to take the snapshot before the spark session is initialized. - */ - override protected val enableAutoThreadAudit = false - - protected override def beforeAll(): Unit = { - doThreadPreAudit() - super.beforeAll() - } - - protected override def afterAll(): Unit = { - try { - super.afterAll() - } finally { - doThreadPostAudit() - } - } - // Runs func (which must trigger exactly one SQL execution) and returns the SQL metrics of that // execution as a map keyed by (planNodeId, planNodeName, metricName) -> metricValue. 
def runAndFetchMetrics(func: => Unit): Map[(Long, String, String), String] = { From 49778adfda87920b0b0a344297dbd5aa7fca1f38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Thu, 28 May 2026 19:13:25 +0000 Subject: [PATCH 04/58] Rename sql.SharedSparkSession to sql.SparkSessionBinder to prevent shadowing --- .../sql/{SharedSparkSession.scala => SparkSessionBinder.scala} | 2 +- .../scala/org/apache/spark/sql/test/SharedSparkSession.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename sql/core/src/test/scala/org/apache/spark/sql/{SharedSparkSession.scala => SparkSessionBinder.scala} (99%) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SharedSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala similarity index 99% rename from sql/core/src/test/scala/org/apache/spark/sql/SharedSparkSession.scala rename to sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala index 8cab22ac97182..f1f70e9b2b109 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SharedSparkSession.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} import org.apache.spark.sql.test.TestSparkSession -trait SharedSparkSession +trait SparkSessionBinder extends SparkFunSuite with SparkSessionProvider with BeforeAndAfterEach diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala index 283ef23bfba16..65bdd029aa65b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala @@ -57,7 +57,7 @@ trait SharedSparkSession extends QueryTest with SharedSparkSessionBase { */ trait SharedSparkSessionBase extends 
QueryTestBase - with sql.SharedSparkSession { + with sql.SparkSessionBinder { protected override def spark: classic.SparkSession = super.spark.asInstanceOf[classic.SparkSession] From 3a6caaa26690d51ddec17229a41eac7fca246094 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Thu, 28 May 2026 19:31:39 +0000 Subject: [PATCH 05/58] Deprecate test.SharedSparkSession --- .../scala/org/apache/spark/sql/test/SharedSparkSession.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala index 65bdd029aa65b..ad764d99c059f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala @@ -22,6 +22,7 @@ import scala.concurrent.duration._ import org.apache.spark.sql import org.apache.spark.sql.{classic, QueryTest, QueryTestBase, SparkSession, SparkSessionProvider, SQLContext} +@deprecated("Use SparkSessionBinder and QueryTest instead") trait SharedSparkSession extends QueryTest with SharedSparkSessionBase { // Runs func (which must trigger exactly one SQL execution) and returns the SQL metrics of that @@ -55,6 +56,7 @@ trait SharedSparkSession extends QueryTest with SharedSparkSessionBase { /** * Helper trait for SQL test suites where all tests share a single [[TestSparkSession]]. 
*/ +@deprecated("Use SparkSessionBinder and QueryTest instead") trait SharedSparkSessionBase extends QueryTestBase with sql.SparkSessionBinder { From cbda4b55efa83889f780521435397c17101f4b3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Mon, 25 May 2026 16:44:56 +0000 Subject: [PATCH 06/58] Add connect.SparkSession{Provider,Binder}, connect.QueryTest and demo --- .../apache/spark/sql/connect/QueryTest.scala | 38 +++++++ .../connect/QueryTestWithConnectSuite.scala | 32 ++++++ .../sql/connect/SparkSessionBinder.scala | 98 +++++++++++++++++++ .../sql/connect/SparkSessionProvider.scala | 28 ++++++ .../org/apache/spark/sql/QueryTest.scala | 2 +- 5 files changed, 197 insertions(+), 1 deletion(-) create mode 100644 sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTest.scala create mode 100644 sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTestWithConnectSuite.scala create mode 100644 sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala create mode 100644 sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionProvider.scala diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTest.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTest.scala new file mode 100644 index 0000000000000..e107eb01f5700 --- /dev/null +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTest.scala @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connect + +import org.apache.spark.{sql => sqlApi} + +/** + * Extends [[sqlApi.QueryTest]] for use with Connect sessions. + * + * Overrides [[checkAnswer]] to avoid classic-only code paths (e.g. `queryExecution`, + * `logicalPlan`, `materializedRdd`) that are not available on Connect DataFrames. + */ +trait QueryTest extends sqlApi.QueryTest with SparkSessionProvider { + + override protected def checkAnswer( + df: => sqlApi.DataFrame, expectedAnswer: Seq[sqlApi.Row]): Unit = { + val sparkAnswer = df.collect().toSeq + sqlApi.QueryTest.sameRows(expectedAnswer, sparkAnswer) match { + case Some(errorMessage) => fail(errorMessage) + case None => + } + } +} diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTestWithConnectSuite.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTestWithConnectSuite.scala new file mode 100644 index 0000000000000..488335daa8ded --- /dev/null +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTestWithConnectSuite.scala @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connect + +import org.apache.spark.sql.QueryTestSuite + +/** + * Runs [[QueryTestSuite]] tests through a Connect session. + * + * This validates the `FooSuite with connect.SparkSessionBinder` pattern: the existing + * [[QueryTestSuite]] tests are inherited unchanged, but execute against a + * [[org.apache.spark.sql.connect.SparkSession connect.SparkSession]] instead of a classic one. + */ +class QueryTestWithConnectSuite + extends QueryTestSuite + with SparkSessionBinder + with QueryTest diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala new file mode 100644 index 0000000000000..0e077cdcdf33e --- /dev/null +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connect + +import java.util.UUID + +import scala.concurrent.duration._ + +import org.scalatest.concurrent.Eventually + +import org.apache.spark.DebugFilesystem +import org.apache.spark.sql +import org.apache.spark.sql.classic +import org.apache.spark.sql.connect.client.SparkConnectClient +import org.apache.spark.sql.connect.common.config.ConnectCommon +import org.apache.spark.sql.connect.config.Connect +import org.apache.spark.sql.connect.service.SparkConnectService + +/** + * A test trait that provides a Connect [[SparkSession]] backed by an in-process gRPC server. + * Extends [[sql.SparkSessionBinder sql.SparkSessionBinder]] (which creates a + * [[classic.SparkSession classic.SparkSession]] and SparkContext), then layers a Connect client + * session on top by starting the gRPC service in-process. + * + * Mix in this trait to exercise existing sql/core test suites through the Connect path: + * {{{ + * class FooWithConnectSuite + * extends FooSuite + * with connect.SparkSessionBinder + * with connect.QueryTest + * }}} + */ +trait SparkSessionBinder extends sql.SparkSessionBinder { + + private val serverPort: Int = + ConnectCommon.CONNECT_GRPC_BINDING_PORT + util.Random.nextInt(1000) + + @volatile private var _connectSpark: SparkSession = _ + + protected override def spark: SparkSession = _connectSpark + + /** The underlying classic session used by the in-process server. 
*/ + private def classicSpark: classic.SparkSession = super.spark.asInstanceOf[classic.SparkSession] + + override def beforeAll(): Unit = { + super.beforeAll() + withSparkEnvConfs((Connect.CONNECT_GRPC_BINDING_PORT.key, serverPort.toString)) { + SparkConnectService.start(classicSpark.sparkContext) + } + val client = SparkConnectClient + .builder() + .port(serverPort) + .sessionId(UUID.randomUUID().toString) + .userId("test") + .build() + _connectSpark = SparkSession + .builder() + .client(client) + .create() + } + + override def afterAll(): Unit = { + try { + if (_connectSpark != null) { + _connectSpark.close() + _connectSpark = null + } + SparkConnectService.stop() + } finally { + super.afterAll() + } + } + + // Cleanup needs `sharedState`, which is not supported on Connect sessions, so this override + // runs it against the classic session instead of delegating to sql.SparkSessionBinder. + protected override def afterEach(): Unit = { + // NOTE: super.afterEach() is not called here, which skips sql.SparkSessionBinder.afterEach. + classicSpark.sharedState.cacheManager.clearCache() + Eventually.eventually(Eventually.timeout(10.seconds), Eventually.interval(2.seconds)) { + DebugFilesystem.assertNoOpenStreams() + } + } +} diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionProvider.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionProvider.scala new file mode 100644 index 0000000000000..d9e456c0fd706 --- /dev/null +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionProvider.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connect + +import org.apache.spark.sql + +/** + * A common trait for test suites or utils that require a connect [[SparkSession]]. + * Use together with e.g. [[SparkSessionBinder]]. + */ +trait SparkSessionProvider extends sql.SparkSessionProvider { + protected override def spark: SparkSession +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index 5a1ea3d9f53cf..5b02cbbd95d43 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -1211,7 +1211,7 @@ object QueryTest extends Assertions { } -class QueryTestSuite extends test.SharedSparkSession { +class QueryTestSuite extends QueryTest with SparkSessionBinder { test("SPARK-16940: checkAnswer should raise TestFailedException for wrong results") { intercept[org.scalatest.exceptions.TestFailedException] { checkAnswer(sql("SELECT 1"), Row(2) :: Nil) From 284c01214b46836b914bb2402f41ace74eaacba4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Thu, 28 May 2026 19:40:10 +0000 Subject: [PATCH 07/58] Add classic.SparkSessionBinder with usage demonstration --- .../sql/classic/SparkSessionBinder.scala | 26 +++++++++++++++++++ .../parquet/ParquetQuerySuite.scala | 6 +++-- 2 files changed, 30 insertions(+), 2 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionBinder.scala diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionBinder.scala b/sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionBinder.scala new file mode 100644 index 0000000000000..9eea2c7cf6965 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionBinder.scala @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.classic + +import org.apache.spark.sql + +trait SparkSessionBinder + extends sql.SparkSessionBinder + with SparkSessionProvider { + override protected def spark: SparkSession = super.spark.asInstanceOf[SparkSession] +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala index b20b6d397fd17..5c2bc6829ea59 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala @@ -30,6 +30,7 @@ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} import org.apache.spark.sql.catalyst.expressions.SpecificInternalRow import org.apache.spark.sql.catalyst.util.ArrayData +import org.apache.spark.sql.classic import org.apache.spark.sql.execution.FileSourceScanExec import org.apache.spark.sql.execution.datasources.{SchemaColumnConvertNotSupportedException, SQLHadoopMapReduceCommitProtocol} import org.apache.spark.sql.execution.datasources.parquet.TestingUDT._ @@ -37,14 +38,15 @@ import org.apache.spark.sql.execution.datasources.v2.BatchScanExec import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScan import org.apache.spark.sql.functions.struct import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ import org.apache.spark.util.Utils /** * A test suite that tests various Parquet queries. 
*/ -abstract class ParquetQuerySuite extends ParquetTest with SharedSparkSession { +abstract class ParquetQuerySuite extends ParquetTest + with QueryTest + with classic.SparkSessionBinder { import testImplicits._ test("simple select queries") { From af8915d4c16146aa061e5c5b7cf2ff77c41b5a8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Thu, 28 May 2026 20:53:24 +0000 Subject: [PATCH 08/58] fixup: fix compile error --- .../org/apache/spark/sql/test/SharedSparkSession.scala | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala index ad764d99c059f..1c506df20634d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.test import scala.concurrent.duration._ import org.apache.spark.sql -import org.apache.spark.sql.{classic, QueryTest, QueryTestBase, SparkSession, SparkSessionProvider, SQLContext} +import org.apache.spark.sql.{classic, QueryTest, QueryTestBase} @deprecated("Use SparkSessionBinder and QueryTest instead") trait SharedSparkSession extends QueryTest with SharedSparkSessionBase { @@ -57,9 +57,7 @@ trait SharedSparkSession extends QueryTest with SharedSparkSessionBase { * Helper trait for SQL test suites where all tests share a single [[TestSparkSession]]. 
*/ @deprecated("Use SparkSessionBinder and QueryTest instead") -trait SharedSparkSessionBase - extends QueryTestBase - with sql.SparkSessionBinder { +trait SharedSparkSessionBase extends sql.SparkSessionBinder with QueryTestBase { protected override def spark: classic.SparkSession = super.spark.asInstanceOf[classic.SparkSession] From d407555b6524d8ff7f21164aeec8e97f6af37e02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Fri, 29 May 2026 17:32:44 +0000 Subject: [PATCH 09/58] Restructure so that SparkSessionBinder implements QueryTest, address nits --- .../apache/spark/sql/connect/QueryTest.scala | 9 ++- .../connect/QueryTestWithConnectSuite.scala | 5 +- .../sql/connect/SparkSessionBinder.scala | 36 ++++-------- .../org/apache/spark/sql/QueryTest.scala | 4 +- .../apache/spark/sql/SparkSessionBinder.scala | 54 ++++++++++-------- .../apache/spark/sql/classic/QueryTest.scala | 55 +++++++++++++++++++ .../sql/classic/SparkSessionBinder.scala | 7 ++- .../sql/classic/SparkSessionProvider.scala | 3 - .../spark/sql/test/SharedSparkSession.scala | 15 +++-- 9 files changed, 123 insertions(+), 65 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/classic/QueryTest.scala diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTest.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTest.scala index e107eb01f5700..ab3bd2c494311 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTest.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTest.scala @@ -20,10 +20,13 @@ package org.apache.spark.sql.connect import org.apache.spark.{sql => sqlApi} /** - * Extends [[sqlApi.QueryTest]] for use with Connect sessions. + * Extends [[sqlApi.QueryTest]] to provide connect-specific overrides to helpers like + * [[checkAnswer]] that avoid classic-only APIs. * - * Overrides [[checkAnswer]] to avoid classic-only code paths (e.g. 
`queryExecution`, - * `logicalPlan`, `materializedRdd`) that are not available on Connect DataFrames. + * Can be used together with [[SparkSessionBinder connect.SparkSessionBinder]] to create a + * 'connect variant' of a test. + * + * Note: broader use will require more overrides. */ trait QueryTest extends sqlApi.QueryTest with SparkSessionProvider { diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTestWithConnectSuite.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTestWithConnectSuite.scala index 488335daa8ded..f13765dc03aa1 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTestWithConnectSuite.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTestWithConnectSuite.scala @@ -22,11 +22,10 @@ import org.apache.spark.sql.QueryTestSuite /** * Runs [[QueryTestSuite]] tests through a Connect session. * - * This validates the `FooSuite with connect.SharedSparkSession` pattern: the existing + * This validates the `FooSuite with connect.QueryTest` pattern: the existing * [[QueryTestSuite]] tests are inherited unchanged, but execute against a - * [[org.apache.spark.sql.connect.SparkSession connect.SparkSession]] instead of a classic one. + * [[SparkSession connect.SparkSession]] instead of a classic one. 
*/ class QueryTestWithConnectSuite extends QueryTestSuite - with SparkSessionBinder with QueryTest diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala index 0e077cdcdf33e..d60f26a22fee5 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala @@ -19,20 +19,15 @@ package org.apache.spark.sql.connect import java.util.UUID -import scala.concurrent.duration._ - -import org.scalatest.concurrent.Eventually - -import org.apache.spark.DebugFilesystem +import org.apache.spark.SparkEnv import org.apache.spark.sql import org.apache.spark.sql.classic import org.apache.spark.sql.connect.client.SparkConnectClient -import org.apache.spark.sql.connect.common.config.ConnectCommon import org.apache.spark.sql.connect.config.Connect import org.apache.spark.sql.connect.service.SparkConnectService /** - * A test trait that provides a Connect [[SparkSession]] backed by an in-process gRPC server. + * Provides a [[SparkSession connect.SparkSession]] backed by an in-process gRPC server. * Extends [[sql.SparkSessionBinder sql.SparkSessionBinder]] (which creates a * [[classic.SparkSession classic.SparkSession]] and SparkContext), then layers a Connect client * session on top by starting the gRPC service in-process. 
@@ -42,15 +37,11 @@ import org.apache.spark.sql.connect.service.SparkConnectService * class FooWithConnectSuite * extends FooSuite * with connect.SparkSessionBinder - * with connect.QueryTest * }}} */ -trait SparkSessionBinder extends sql.SparkSessionBinder { +trait SparkSessionBinder extends sql.SparkSessionBinder with QueryTest { - private val serverPort: Int = - ConnectCommon.CONNECT_GRPC_BINDING_PORT + util.Random.nextInt(1000) - - @volatile private var _connectSpark: SparkSession = _ + private var _connectSpark: SparkSession = _ protected override def spark: SparkSession = _connectSpark @@ -59,12 +50,17 @@ trait SparkSessionBinder extends sql.SparkSessionBinder { override def beforeAll(): Unit = { super.beforeAll() - withSparkEnvConfs((Connect.CONNECT_GRPC_BINDING_PORT.key, serverPort.toString)) { + val prevPort = SparkEnv.get.conf.get(Connect.CONNECT_GRPC_BINDING_PORT) + try { + // set GRPC_BINDING_PORT to 0 so that the server picks a random, freely available port. + SparkEnv.get.conf.set(Connect.CONNECT_GRPC_BINDING_PORT, 0) SparkConnectService.start(classicSpark.sparkContext) + } finally { + SparkEnv.get.conf.set(Connect.CONNECT_GRPC_BINDING_PORT, prevPort) } val client = SparkConnectClient .builder() - .port(serverPort) + .port(SparkConnectService.localPort) .sessionId(UUID.randomUUID().toString) .userId("test") .build() @@ -85,14 +81,4 @@ trait SparkSessionBinder extends sql.SparkSessionBinder { super.afterAll() } } - - // The base SharedSparkSessionBase.afterEach calls spark.sharedState which is not supported - // on Connect. Override to use the classic session for cleanup. 
- protected override def afterEach(): Unit = { - // super.afterEach() from BeforeAndAfterEach (skipping SharedSparkSessionBase) - classicSpark.sharedState.cacheManager.clearCache() - Eventually.eventually(Eventually.timeout(10.seconds), Eventually.interval(2.seconds)) { - DebugFilesystem.assertNoOpenStreams() - } - } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index 5b02cbbd95d43..8f7a0b8f5ddc2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -510,6 +510,7 @@ trait QueryTestBase /** * Strip Spark-side filtering in order to check if a datasource filters rows correctly. */ + @deprecated("Classic-only method, use classic.QueryTest", "4.2.0") protected def stripSparkFilter(df: DataFrame): DataFrame = { val schema = df.schema val withoutFilters = df.queryExecution.executedPlan.transform { @@ -524,6 +525,7 @@ trait QueryTestBase * Turn a logical plan into a `DataFrame`. This should be removed once we have an easier * way to construct `DataFrame` directly out of local data without relying on implicits. 
*/ + @deprecated("Classic-only method, use classic.QueryTest", "4.2.0") protected implicit def logicalPlanToSparkQuery(plan: LogicalPlan): classic.DataFrame = { classic.Dataset.ofRows(spark.asInstanceOf[classic.SparkSession], plan) } @@ -1211,7 +1213,7 @@ object QueryTest extends Assertions { } -class QueryTestSuite extends QueryTest with SparkSessionBinder { +class QueryTestSuite extends QueryTest { test("SPARK-16940: checkAnswer should raise TestFailedException for wrong results") { intercept[org.scalatest.exceptions.TestFailedException] { checkAnswer(sql("SELECT 1"), Row(2) :: Nil) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala index f1f70e9b2b109..a3ca244ca3718 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala @@ -22,18 +22,44 @@ import scala.concurrent.duration._ import org.scalatest.{BeforeAndAfterEach, Suite} import org.scalatest.concurrent.Eventually -import org.apache.spark.{DebugFilesystem, SparkConf, SparkFunSuite} +import org.apache.spark.{DebugFilesystem, SparkConf} import org.apache.spark.internal.config.UNSAFE_EXCEPTION_ON_MEMORY_LEAK import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} import org.apache.spark.sql.test.TestSparkSession -trait SparkSessionBinder - extends SparkFunSuite +trait SparkSessionBinder extends QueryTest with SparkSessionBinderBase { + + /** + * Suites extending this trait are sharing resources (e.g. SparkSession) in their + * tests. This trait initializes the spark session in its [[beforeAll()]] implementation before + * the automatic thread snapshot is performed, so the audit code could fail to report threads + * leaked by that shared session. 
+ * + * The behavior is overridden here to take the snapshot before the spark session is initialized. + */ + override protected val enableAutoThreadAudit = false + + protected override def beforeAll(): Unit = { + doThreadPreAudit() + super.beforeAll() + } + + protected override def afterAll(): Unit = { + try { + super.afterAll() + } finally { + doThreadPostAudit() + } + } +} + +trait SparkSessionBinderBase + extends QueryTestBase with SparkSessionProvider with BeforeAndAfterEach - with Eventually { + with Eventually { self: Suite => protected def sparkConf = { val conf = new SparkConf() @@ -95,22 +121,10 @@ trait SparkSessionBinder } } - /** - * Suites extending [[SharedSparkSession]] are sharing resources (e.g. SparkSession) in their - * tests. That trait initializes the spark session in its [[beforeAll()]] implementation before - * the automatic thread snapshot is performed, so the audit code could fail to report threads - * leaked by that shared session. - * - * The behavior is overridden here to take the snapshot before the spark session is initialized. - */ - override protected val enableAutoThreadAudit = false - /** * Make sure the [[TestSparkSession]] is initialized before any tests are run. 
*/ protected override def beforeAll(): Unit = { - doThreadPreAudit() - initializeSession() // Ensure we have initialized the context before calling parent code @@ -134,12 +148,8 @@ trait SparkSessionBinder } } } finally { - try { - SparkSession.clearActiveSession() - SparkSession.clearDefaultSession() - } finally { - doThreadPostAudit() - } + SparkSession.clearActiveSession() + SparkSession.clearDefaultSession() } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/classic/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/classic/QueryTest.scala new file mode 100644 index 0000000000000..20941dd0c549b --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/classic/QueryTest.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.classic + +import scala.language.implicitConversions + +import org.apache.spark.sql +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.FilterExec + +/** + * Extends [[org.apache.spark.sql.QueryTest sql.QueryTest]] to explicitly provide + * a [[SparkSession classic.SparkSession]] and corresponding helpers. 
+ * + * Use this trait to indicate that a test is classic-only, + * i.e it is not intended to run this test with a + * [[org.apache.spark.sql.connect.QueryTest connect.QueryTest]] override. + */ +trait QueryTest extends sql.QueryTest with SparkSessionProvider { + + /** + * Strip Spark-side filtering in order to check if a datasource filters rows correctly. + */ + protected def stripSparkFilter(df: DataFrame): DataFrame = { + val schema = df.schema + val withoutFilters = df.queryExecution.executedPlan.transform { + case FilterExec(_, child) => child + } + + spark.internalCreateDataFrame(withoutFilters.execute(), schema) + } + + /** + * Turn a logical plan into a `DataFrame`. This should be removed once we have an easier + * way to construct `DataFrame` directly out of local data without relying on implicits. + */ + protected implicit override def logicalPlanToSparkQuery(plan: LogicalPlan): DataFrame = { + Dataset.ofRows(spark, plan) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionBinder.scala b/sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionBinder.scala index 9eea2c7cf6965..e0b4a794d2bb7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionBinder.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionBinder.scala @@ -19,8 +19,9 @@ package org.apache.spark.sql.classic import org.apache.spark.sql -trait SparkSessionBinder - extends sql.SparkSessionBinder - with SparkSessionProvider { +/** + * Overrides [[spark]] to provide a [[SparkSession classic.SparkSession]] + */ +trait SparkSessionBinder extends sql.SparkSessionBinder with QueryTest { override protected def spark: SparkSession = super.spark.asInstanceOf[SparkSession] } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionProvider.scala b/sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionProvider.scala index e459250f2d3f4..77de0db4bf68b 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionProvider.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionProvider.scala @@ -19,9 +19,6 @@ package org.apache.spark.sql.classic import org.apache.spark.sql -/** - * A common trait for test suites that require a classic [[SparkSession]]. - */ trait SparkSessionProvider extends sql.SparkSessionProvider { override protected def spark: SparkSession } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala index 1c506df20634d..6a176805a349a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala @@ -19,11 +19,16 @@ package org.apache.spark.sql.test import scala.concurrent.duration._ +import org.scalatest.Suite + import org.apache.spark.sql -import org.apache.spark.sql.{classic, QueryTest, QueryTestBase} +import org.apache.spark.sql.{classic, QueryTest} + +@deprecated("Use SparkSessionBinder (or classic.SparkSessionBinder if required) instead", "4.2.0") +trait SharedSparkSession extends sql.SparkSessionBinder { -@deprecated("Use SparkSessionBinder and QueryTest instead") -trait SharedSparkSession extends QueryTest with SharedSparkSessionBase { + protected override def spark: classic.SparkSession = + super.spark.asInstanceOf[classic.SparkSession] // Runs func (which must trigger exactly one SQL execution) and returns the SQL metrics of that // execution as a map keyed by (planNodeId, planNodeName, metricName) -> metricValue. @@ -56,8 +61,8 @@ trait SharedSparkSession extends QueryTest with SharedSparkSessionBase { /** * Helper trait for SQL test suites where all tests share a single [[TestSparkSession]]. 
*/ -@deprecated("Use SparkSessionBinder and QueryTest instead") -trait SharedSparkSessionBase extends sql.SparkSessionBinder with QueryTestBase { +@deprecated("Use SparkSessionBinder (or classic.SparkSessionBinder if required) instead", "4.2.0") +trait SharedSparkSessionBase extends sql.SparkSessionBinderBase { self: Suite => protected override def spark: classic.SparkSession = super.spark.asInstanceOf[classic.SparkSession] From 56b9281d28daf4ccae8ab1309d0a48ba2975fd12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Fri, 29 May 2026 17:34:06 +0000 Subject: [PATCH 10/58] fixup --- .../apache/spark/sql/connect/QueryTestWithConnectSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTestWithConnectSuite.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTestWithConnectSuite.scala index f13765dc03aa1..013acba63b80f 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTestWithConnectSuite.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTestWithConnectSuite.scala @@ -22,10 +22,10 @@ import org.apache.spark.sql.QueryTestSuite /** * Runs [[QueryTestSuite]] tests through a Connect session. * - * This validates the `FooSuite with connect.QueryTest` pattern: the existing + * This validates the `FooSuite with connect.SparkSessionBinder` pattern: the existing * [[QueryTestSuite]] tests are inherited unchanged, but execute against a * [[SparkSession connect.SparkSession]] instead of a classic one. 
*/ class QueryTestWithConnectSuite extends QueryTestSuite - with QueryTest + with SparkSessionBinder From 442ff43a459cf2a7f4a0868643ec74a5c773561e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Fri, 29 May 2026 18:04:28 +0000 Subject: [PATCH 11/58] Have SharedSparkSession as empty alias of classic.SparkSessionBinder --- .../sql/classic/SparkSessionBinder.scala | 29 +++++++++++++++ .../spark/sql/test/SharedSparkSession.scala | 36 ++----------------- 2 files changed, 31 insertions(+), 34 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionBinder.scala b/sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionBinder.scala index e0b4a794d2bb7..68920a445e5fc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionBinder.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionBinder.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.classic +import scala.concurrent.duration._ + import org.apache.spark.sql /** @@ -24,4 +26,31 @@ import org.apache.spark.sql */ trait SparkSessionBinder extends sql.SparkSessionBinder with QueryTest { override protected def spark: SparkSession = super.spark.asInstanceOf[SparkSession] + + // Runs func (which must trigger exactly one SQL execution) and returns the SQL metrics of that + // execution as a map keyed by (planNodeId, planNodeName, metricName) -> metricValue. + def runAndFetchMetrics(func: => Unit): Map[(Long, String, String), String] = { + val statusStore = spark.sharedState.statusStore + val oldCount = statusStore.executionsList().size + + func + + // Wait until the new execution is started and being tracked. + eventually(timeout(10.seconds), interval(10.milliseconds)) { + assert(statusStore.executionsCount() >= oldCount) + } + + // Wait for listener to finish computing the metrics for the execution. 
+ eventually(timeout(10.seconds), interval(10.milliseconds)) { + assert(statusStore.executionsList().nonEmpty && + statusStore.executionsList().last.metricValues != null) + } + + val exec = statusStore.executionsList().last + val execId = exec.executionId + val sqlMetrics = statusStore.planGraph(execId).allNodes + .flatMap(n => n.metrics.map(m => (m.accumulatorId, (n.id, n.name, m.name)))) + .toMap + statusStore.executionMetrics(execId).map { case (k, v) => sqlMetrics(k) -> v } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala index 6a176805a349a..c52bcd4aa9c2f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala @@ -17,46 +17,14 @@ package org.apache.spark.sql.test -import scala.concurrent.duration._ - import org.scalatest.Suite import org.apache.spark.sql -import org.apache.spark.sql.{classic, QueryTest} +import org.apache.spark.sql.classic @deprecated("Use SparkSessionBinder (or classic.SparkSessionBinder if required) instead", "4.2.0") -trait SharedSparkSession extends sql.SparkSessionBinder { - - protected override def spark: classic.SparkSession = - super.spark.asInstanceOf[classic.SparkSession] - - // Runs func (which must trigger exactly one SQL execution) and returns the SQL metrics of that - // execution as a map keyed by (planNodeId, planNodeName, metricName) -> metricValue. - def runAndFetchMetrics(func: => Unit): Map[(Long, String, String), String] = { - val statusStore = spark.sharedState.statusStore - val oldCount = statusStore.executionsList().size +trait SharedSparkSession extends classic.SparkSessionBinder - func - - // Wait until the new execution is started and being tracked. 
- eventually(timeout(10.seconds), interval(10.milliseconds)) { - assert(statusStore.executionsCount() >= oldCount) - } - - // Wait for listener to finish computing the metrics for the execution. - eventually(timeout(10.seconds), interval(10.milliseconds)) { - assert(statusStore.executionsList().nonEmpty && - statusStore.executionsList().last.metricValues != null) - } - - val exec = statusStore.executionsList().last - val execId = exec.executionId - val sqlMetrics = statusStore.planGraph(execId).allNodes - .flatMap(n => n.metrics.map(m => (m.accumulatorId, (n.id, n.name, m.name)))) - .toMap - statusStore.executionMetrics(execId).map { case (k, v) => sqlMetrics(k) -> v } - } -} /** * Helper trait for SQL test suites where all tests share a single [[TestSparkSession]]. From 36e2940bc4028d2ad64eefa2f886a23853b075b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Fri, 29 May 2026 22:19:48 +0000 Subject: [PATCH 12/58] fixup --- sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index 8f7a0b8f5ddc2..17212fa30b954 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -1213,7 +1213,7 @@ object QueryTest extends Assertions { } -class QueryTestSuite extends QueryTest { +class QueryTestSuite extends QueryTest with SparkSessionBinder { test("SPARK-16940: checkAnswer should raise TestFailedException for wrong results") { intercept[org.scalatest.exceptions.TestFailedException] { checkAnswer(sql("SELECT 1"), Row(2) :: Nil) From 2603ca7b356b994c5bf770983e89b5d6086a19a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Wed, 3 Jun 2026 17:09:19 +0000 Subject: [PATCH 13/58] partial refactor of connect/classic test --- 
.../DataSourceV2DataFrameConnectSuite.scala | 33 ++--------------- .../sql/connect/SparkSessionBinder.scala | 2 +- .../connector/DSv2CacheTableReadTests.scala | 36 +++++++++---------- .../DSv2ExternalMutationTestBase.scala | 17 --------- .../DataSourceV2DataFrameSuite.scala | 15 -------- 5 files changed, 20 insertions(+), 83 deletions(-) diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala index 1a31e5f8ac1a3..990dedb7435dd 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala @@ -34,7 +34,7 @@ import org.apache.spark.sql.connector.catalog.{CachingInMemoryTableCatalog, InMe * this class only provides the Connect-specific session, catalog access, and result comparison. */ class DataSourceV2DataFrameConnectSuite - extends SparkConnectServerTest + extends SparkSessionBinder with DSv2TempViewWithStoredPlanTests with DSv2RepeatedTableAccessTests with DSv2IncrementallyConstructedQueryTests @@ -55,43 +55,14 @@ class DataSourceV2DataFrameConnectSuite override protected def testPrefix: String = "[connect] " override protected def isConnect: Boolean = true - override protected def withTestSession(fn: SparkSession => Unit): Unit = - withSession(fn) - - // Cannot use QueryTest.checkAnswer directly because it accesses df.logicalPlan, - // df.queryExecution, and df.materializedRdd, which are not available on Connect *client* - // DataFrames (they throw ConnectClientUnsupportedErrors). Note: checkAnswer IS usable from - // Connect server tests that operate on classic server-side DataFrames, but in this suite - // `df` is a Connect client DataFrame returned by session.table() / session.sql(). 
- // Instead, collect the rows and delegate to QueryTest.sameRows, which is the same - // value-based, order-agnostic comparison that checkAnswer uses internally. - override protected def checkRows(df: => DataFrame, expected: Seq[Row]): Unit = - QueryTest.sameRows(expected, df.collect().toSeq).foreach(msg => fail(msg)) - override protected def getTableCatalog[C <: TableCatalog: ClassTag]( session: SparkSession, catalogName: String): C = { - val serverSession = getServerSession(session) - val catalog = serverSession.sessionState.catalogManager.catalog(catalogName) + val catalog = classicSpark.sessionState.catalogManager.catalog(catalogName) val ct = implicitly[ClassTag[C]] require( ct.runtimeClass.isInstance(catalog), s"Expected ${ct.runtimeClass.getName} but got ${catalog.getClass.getName}") catalog.asInstanceOf[C] } - - // No explicit clearCache() for cachingcat is needed here, unlike the classic suite. - // Each withSession call creates a freshly isolated SparkSession on the server side - // (via SparkConnectSessionManager.newIsolatedSession), and afterEach invalidates all - // sessions, so the CachingInMemoryTableCatalog instance is per-test. 
- override protected def withTestTableAndViews( - session: SparkSession, - table: String, - views: Seq[String] = Seq.empty)(fn: => Unit): Unit = { - try { fn } - finally { - views.foreach(v => session.sql(s"DROP VIEW IF EXISTS $v").collect()) - session.sql(s"DROP TABLE IF EXISTS $table").collect() - } - } } diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala index d60f26a22fee5..e306c23b07a77 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala @@ -46,7 +46,7 @@ trait SparkSessionBinder extends sql.SparkSessionBinder with QueryTest { protected override def spark: SparkSession = _connectSpark /** The underlying classic session used by the in-process server. */ - private def classicSpark: classic.SparkSession = super.spark.asInstanceOf[classic.SparkSession] + protected def classicSpark: classic.SparkSession = super.spark.asInstanceOf[classic.SparkSession] override def beforeAll(): Unit = { super.beforeAll() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2CacheTableReadTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2CacheTableReadTests.scala index ac6ffcc6ecc0d..6419c79c8168d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2CacheTableReadTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2CacheTableReadTests.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.connector -import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.connector.catalog.{CachingInMemoryTableCatalog, Column, InMemoryTableCatalog, TableChange, TableInfo} import org.apache.spark.sql.types.IntegerType @@ -49,35 +49,33 @@ import 
org.apache.spark.sql.types.IntegerType * (via the CacheManager), making a session drop+recreate scenario trivially different from * the external variant. * - * NOTE: All `session.sql(...)` calls append `.collect()` because Connect client DataFrames + * NOTE: All `spark.sql(...)` calls append `.collect()` because Connect client DataFrames * are lazy and require an action to trigger execution. In classic mode `.collect()` on * DDL / DML is a no-op (these execute eagerly), so this is harmless. */ trait DSv2CacheTableReadTests extends DSv2ExternalMutationTestBase { - private def assertTableCached(session: SparkSession, tableName: String): Unit = - assert(session.catalog.isCached(tableName)) + private def assertTableCached(tableName: String): Unit = + assert(spark.catalog.isCached(tableName)) test(s"${testPrefix}SPARK-54022: cached table pinned against external data write") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - session.table(testTable).cache() - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100))) + spark.table(testTable).cache() + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100))) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100))) + assertTableCached(testTable) + 
checkAnswer(spark.table(testTable), Seq(Row(1, 100))) - session.sql(s"REFRESH TABLE $testTable").collect() - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100), Row(2, 200))) - } + spark.sql(s"REFRESH TABLE $testTable").collect() + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100), Row(2, 200))) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala index 0b2a50534447c..8d31d19c91807 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala @@ -51,17 +51,6 @@ trait DSv2ExternalMutationTestBase extends QueryTest { /** Prefix for test names, e.g. "" or "[connect] ". */ protected def testPrefix: String - /** Whether this suite runs under Spark Connect. */ - protected def isConnect: Boolean - - /** Execute a test body with a session. */ - protected def withTestSession(fn: SparkSession => Unit): Unit - - /** - * Assert that a DataFrame's rows match the expected rows (order-agnostic). - */ - protected def checkRows(df: => DataFrame, expected: Seq[Row]): Unit - /** * Get a [[TableCatalog]] by name from the underlying session. */ @@ -69,12 +58,6 @@ trait DSv2ExternalMutationTestBase extends QueryTest { session: SparkSession, catalogName: String): C - /** Cleanup wrapper: drop views and the table after the test body, even on failure. */ - protected def withTestTableAndViews( - session: SparkSession, - table: String, - views: Seq[String] = Seq.empty)(fn: => Unit): Unit - /** Appends a row to a DSv2 table via the catalog API, bypassing the session. 
*/ protected def externalAppend( catalog: TableCatalog, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala index 71632e07c78b7..19d3d59bc8242 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala @@ -99,11 +99,6 @@ class DataSourceV2DataFrameSuite override protected def testPrefix: String = "" override protected def isConnect: Boolean = false - override protected def withTestSession(fn: SparkSession => Unit): Unit = fn(spark) - - override protected def checkRows(df: => DataFrame, expected: Seq[Row]): Unit = - checkAnswer(df, expected) - override protected def getTableCatalog[C <: TableCatalog: ClassTag]( session: SparkSession, catalogName: String): C = { @@ -115,16 +110,6 @@ class DataSourceV2DataFrameSuite c.asInstanceOf[C] } - override protected def withTestTableAndViews( - session: SparkSession, - table: String, - views: Seq[String] = Seq.empty)(fn: => Unit): Unit = { - withTable(table) { - try { fn } - finally { views.foreach(v => session.sql(s"DROP VIEW IF EXISTS $v")) } - } - } - override def verifyTable(tableName: String, expected: DataFrame): Unit = { checkAnswer(spark.table(tableName), expected) } From 8d5f2482019d2a669300852fc2b8b1e7d47eaf07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Wed, 3 Jun 2026 17:10:49 +0000 Subject: [PATCH 14/58] rest of refactor --- .../connector/DSv2CacheTableReadTests.scala | 286 ++++++------ .../DSv2RepeatedTableAccessTests.scala | 252 +++++------ .../DSv2TempViewWithStoredPlanTests.scala | 412 +++++++++--------- 3 files changed, 460 insertions(+), 490 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2CacheTableReadTests.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2CacheTableReadTests.scala index 6419c79c8168d..79c101d524a07 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2CacheTableReadTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2CacheTableReadTests.scala @@ -81,189 +81,177 @@ trait DSv2CacheTableReadTests extends DSv2ExternalMutationTestBase { test(s"${testPrefix}SPARK-54022: connector w/ cache: cached table pinned, " + "REFRESH clears both layers") { - withTestSession { session => - withTestTableAndViews(session, cachingTestTable) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() - - session.table(cachingTestTable).cache() - assertTableCached(session, cachingTestTable) - checkRows(session.table(cachingTestTable), Seq(Row(1, 100))) - - val catalog = - getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - - // Both CacheManager and connector cache are stale: external write invisible - assertTableCached(session, cachingTestTable) - checkRows(session.table(cachingTestTable), Seq(Row(1, 100))) - - // REFRESH TABLE calls invalidateTable (clears connector cache) and rebuilds - // the CacheManager entry, so the external write becomes visible. 
- session.sql(s"REFRESH TABLE $cachingTestTable").collect() - assertTableCached(session, cachingTestTable) - checkRows(session.table(cachingTestTable), Seq(Row(1, 100), Row(2, 200))) - } + withTable(cachingTestTable) { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() + + spark.table(cachingTestTable).cache() + assertTableCached(cachingTestTable) + checkAnswer(spark.table(cachingTestTable), Seq(Row(1, 100))) + + val catalog = + getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) + + // Both CacheManager and connector cache are stale: external write invisible + assertTableCached(cachingTestTable) + checkAnswer(spark.table(cachingTestTable), Seq(Row(1, 100))) + + // REFRESH TABLE calls invalidateTable (clears connector cache) and rebuilds + // the CacheManager entry, so the external write becomes visible. 
+ spark.sql(s"REFRESH TABLE $cachingTestTable").collect() + assertTableCached(cachingTestTable) + checkAnswer(spark.table(cachingTestTable), Seq(Row(1, 100), Row(2, 200))) } } test(s"${testPrefix}SPARK-54022: session write invalidates cache, " + "then external write invisible") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - session.table(testTable).cache() - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100))) + spark.table(testTable).cache() + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100))) - session.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100), Row(2, 200))) + spark.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100), Row(2, 200))) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(3, 300)) + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(3, 300)) - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100), Row(2, 200))) + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100), Row(2, 200))) - session.sql(s"REFRESH TABLE $testTable").collect() - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100), Row(2, 200), Row(3, 300))) - } + spark.sql(s"REFRESH TABLE 
$testTable").collect() + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100), Row(2, 200), Row(3, 300))) } } test(s"${testPrefix}SPARK-54022: cached table pinned against external schema change") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - - session.table(testTable).cache() - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100))) - - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") - val addCol = TableChange.addColumn(Array("new_column"), IntegerType, true) - catalog.alterTable(testIdent, addCol) - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) - - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100))) - - session.sql(s"REFRESH TABLE $testTable").collect() - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100, null), Row(2, 200, -1))) - } + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + + spark.table(testTable).cache() + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100))) + + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + val addCol = TableChange.addColumn(Array("new_column"), IntegerType, true) + catalog.alterTable(testIdent, addCol) + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) + + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100))) + + spark.sql(s"REFRESH TABLE $testTable").collect() + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100, null), Row(2, 200, -1))) } } 
test(s"${testPrefix}SPARK-54022: session schema change invalidates cache, " + "external write invisible") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - session.table(testTable).cache() - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100))) + spark.table(testTable).cache() + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100))) - session.sql(s"ALTER TABLE $testTable ADD COLUMN new_column INT").collect() - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100, null))) + spark.sql(s"ALTER TABLE $testTable ADD COLUMN new_column INT").collect() + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100, null))) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100, null))) + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100, null))) - session.sql(s"REFRESH TABLE $testTable").collect() - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100, null), Row(2, 200, -1))) - } + spark.sql(s"REFRESH TABLE $testTable").collect() + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100, null), Row(2, 200, -1))) } } test(s"${testPrefix}SPARK-54022: cached 
table after external drop and " + "recreate sees empty table") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - - session.table(testTable).cache() - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100))) - - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") - val originalTableId = catalog.loadTable(testIdent).id - - catalog.dropTable(testIdent) - catalog.createTable( - testIdent, - new TableInfo.Builder() - .withColumns(Array( - Column.create("id", IntegerType), - Column.create("salary", IntegerType))) - .build()) - - val newTableId = catalog.loadTable(testIdent).id - assert(originalTableId != newTableId) - - val result = session.table(testTable) - assert(result.schema.fieldNames.toSeq == Seq("id", "salary")) - checkRows(result, Seq.empty) - - // External drop+recreate produces a new table identity, so the prior cache entry - // is unreachable via name lookup (unlike external write/schema change where the - // cache stays pinned). 
- assert(!session.catalog.isCached(testTable)) - - session.sql(s"REFRESH TABLE $testTable").collect() - checkRows(session.table(testTable), Seq.empty) - } + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + + spark.table(testTable).cache() + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100))) + + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + val originalTableId = catalog.loadTable(testIdent).id + + catalog.dropTable(testIdent) + catalog.createTable( + testIdent, + new TableInfo.Builder() + .withColumns(Array( + Column.create("id", IntegerType), + Column.create("salary", IntegerType))) + .build()) + + val newTableId = catalog.loadTable(testIdent).id + assert(originalTableId != newTableId) + + val result = spark.table(testTable) + assert(result.schema.fieldNames.toSeq == Seq("id", "salary")) + checkAnswer(result, Seq.empty) + + // External drop+recreate produces a new table identity, so the prior cache entry + // is unreachable via name lookup (unlike external write/schema change where the + // cache stays pinned). 
+ assert(!spark.catalog.isCached(testTable)) + + spark.sql(s"REFRESH TABLE $testTable").collect() + checkAnswer(spark.table(testTable), Seq.empty) } } test(s"${testPrefix}SPARK-54022: connector w/ cache: cached table stale after " + "external drop and recreate") { - withTestSession { session => - withTestTableAndViews(session, cachingTestTable) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() - - session.table(cachingTestTable).cache() - assertTableCached(session, cachingTestTable) - checkRows(session.table(cachingTestTable), Seq(Row(1, 100))) - - val catalog = - getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") - val originalTableId = catalog.loadTable(testIdent).id - - catalog.dropTable(testIdent) - catalog.createTable( - testIdent, - new TableInfo.Builder() - .withColumns(Array( - Column.create("id", IntegerType), - Column.create("salary", IntegerType))) - .build()) - - // CachingInMemoryTableCatalog does not invalidate on drop/create, so loadTable - // still returns the old cached table object. CacheManager still matches and - // serves the stale cached data. - assertTableCached(session, cachingTestTable) - checkRows(session.table(cachingTestTable), Seq(Row(1, 100))) - - // REFRESH TABLE calls invalidateTable (clears connector cache) and rebuilds - // the CacheManager entry, so the new empty table becomes visible. 
- session.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkRows(session.table(cachingTestTable), Seq.empty) - } + withTable(cachingTestTable) { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() + + spark.table(cachingTestTable).cache() + assertTableCached(cachingTestTable) + checkAnswer(spark.table(cachingTestTable), Seq(Row(1, 100))) + + val catalog = + getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") + val originalTableId = catalog.loadTable(testIdent).id + + catalog.dropTable(testIdent) + catalog.createTable( + testIdent, + new TableInfo.Builder() + .withColumns(Array( + Column.create("id", IntegerType), + Column.create("salary", IntegerType))) + .build()) + + // CachingInMemoryTableCatalog does not invalidate on drop/create, so loadTable + // still returns the old cached table object. CacheManager still matches and + // serves the stale cached data. + assertTableCached(cachingTestTable) + checkAnswer(spark.table(cachingTestTable), Seq(Row(1, 100))) + + // REFRESH TABLE calls invalidateTable (clears connector cache) and rebuilds + // the CacheManager entry, so the new empty table becomes visible. 
+ spark.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkAnswer(spark.table(cachingTestTable), Seq.empty) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2RepeatedTableAccessTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2RepeatedTableAccessTests.scala index 533d10a949796..fb22a8bb7ab79 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2RepeatedTableAccessTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2RepeatedTableAccessTests.scala @@ -34,7 +34,7 @@ import org.apache.spark.sql.types.IntegerType * Each scenario includes a session mutation baseline, an external mutation test, and a * caching-connector variant showing stale results until `REFRESH TABLE`. * - * NOTE: All `session.sql(...)` calls append `.collect()` because Connect client DataFrames + * NOTE: All `spark.sql(...)` calls append `.collect()` because Connect client DataFrames * are lazy and require an action to trigger execution. In classic mode `.collect()` on * DDL / DML is a no-op (these execute eagerly), so this is harmless. 
*/ @@ -45,178 +45,160 @@ trait DSv2RepeatedTableAccessTests extends DSv2ExternalMutationTestBase { // Scenario 1: data changes via writes test(s"${testPrefix}repeated sql() reflects session write") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - checkRows(session.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) - - session.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() - checkRows(session.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100), Row(2, 200))) - } + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) + + spark.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() + checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100), Row(2, 200))) } } test(s"${testPrefix}repeated sql() reflects external write") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - checkRows(session.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - 
checkRows(session.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100), Row(2, 200))) - } + checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100), Row(2, 200))) } } test(s"${testPrefix}connector w/ cache: repeated sql() stale after external write") { - withTestSession { session => - withTestTableAndViews(session, cachingTestTable) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() - checkRows(session.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) - - val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - - // Caching connector returns stale table: external write invisible - checkRows(session.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) - - // REFRESH TABLE invalidates the connector cache, external write becomes visible - session.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkRows(session.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100), Row(2, 200))) - } + withTable(cachingTestTable) { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() + checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) + + val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) + + // Caching connector returns stale table: external write invisible + checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) + + // REFRESH TABLE invalidates the connector cache, external write becomes visible + spark.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100), Row(2, 200))) } } // Scenario 2: schema 
changes test(s"${testPrefix}repeated sql() reflects session schema change") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - checkRows(session.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) - - session.sql(s"ALTER TABLE $testTable ADD COLUMN new_col INT").collect() - session.sql(s"INSERT INTO $testTable VALUES (2, 200, -1)").collect() - checkRows( - session.sql(s"SELECT * FROM $testTable"), - Seq(Row(1, 100, null), Row(2, 200, -1))) - } + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) + + spark.sql(s"ALTER TABLE $testTable ADD COLUMN new_col INT").collect() + spark.sql(s"INSERT INTO $testTable VALUES (2, 200, -1)").collect() + checkAnswer( + spark.sql(s"SELECT * FROM $testTable"), + Seq(Row(1, 100, null), Row(2, 200, -1))) } } test(s"${testPrefix}repeated sql() reflects external schema change") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - checkRows(session.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) - - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") - val addCol = TableChange.addColumn(Array("new_col"), IntegerType, true) - catalog.alterTable(testIdent, addCol) - - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) - - checkRows( - session.sql(s"SELECT * FROM $testTable"), - Seq(Row(1, 100, null), Row(2, 200, -1))) - } + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT 
INTO $testTable VALUES (1, 100)").collect() + checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) + + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + val addCol = TableChange.addColumn(Array("new_col"), IntegerType, true) + catalog.alterTable(testIdent, addCol) + + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) + + checkAnswer( + spark.sql(s"SELECT * FROM $testTable"), + Seq(Row(1, 100, null), Row(2, 200, -1))) } } test(s"${testPrefix}connector w/ cache: repeated sql() stale after external schema change") { - withTestSession { session => - withTestTableAndViews(session, cachingTestTable) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() - checkRows(session.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) - - val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") - val addCol = TableChange.addColumn(Array("new_col"), IntegerType, true) - catalog.alterTable(testIdent, addCol) - - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) - - // Caching connector returns stale table: external changes invisible - checkRows(session.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) - - // REFRESH TABLE invalidates the connector cache, schema change + data visible - session.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkRows( - session.sql(s"SELECT * FROM $cachingTestTable"), - Seq(Row(1, 100, null), Row(2, 200, -1))) - } + withTable(cachingTestTable) { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() + checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) + + val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") + val addCol = 
TableChange.addColumn(Array("new_col"), IntegerType, true) + catalog.alterTable(testIdent, addCol) + + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) + + // Caching connector returns stale table: external changes invisible + checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) + + // REFRESH TABLE invalidates the connector cache, schema change + data visible + spark.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkAnswer( + spark.sql(s"SELECT * FROM $cachingTestTable"), + Seq(Row(1, 100, null), Row(2, 200, -1))) } } // Scenario 3: drop and recreate table test(s"${testPrefix}repeated sql() reflects session drop/recreate") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - checkRows(session.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) - - session.sql(s"DROP TABLE $testTable").collect() - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - checkRows(session.sql(s"SELECT * FROM $testTable"), Seq.empty) - } + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) + + spark.sql(s"DROP TABLE $testTable").collect() + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq.empty) } } test(s"${testPrefix}repeated sql() reflects external drop/recreate") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - checkRows(session.sql(s"SELECT * FROM $testTable"), 
Seq(Row(1, 100))) - - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") - catalog.dropTable(testIdent) - catalog.createTable( - testIdent, - new TableInfo.Builder() - .withColumns(Array( - Column.create("id", IntegerType), - Column.create("salary", IntegerType))) - .build()) - - checkRows(session.sql(s"SELECT * FROM $testTable"), Seq.empty) - } + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) + + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + catalog.dropTable(testIdent) + catalog.createTable( + testIdent, + new TableInfo.Builder() + .withColumns(Array( + Column.create("id", IntegerType), + Column.create("salary", IntegerType))) + .build()) + + checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq.empty) } } test(s"${testPrefix}connector w/ cache: repeated sql() stale after external drop/recreate") { - withTestSession { session => - withTestTableAndViews(session, cachingTestTable) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() - checkRows(session.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) - - val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") - catalog.dropTable(testIdent) - catalog.createTable( - testIdent, - new TableInfo.Builder() - .withColumns(Array( - Column.create("id", IntegerType), - Column.create("salary", IntegerType))) - .build()) - - // Caching connector returns stale table: drop/recreate invisible - checkRows(session.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) - - // REFRESH TABLE invalidates the connector cache, new empty table visible - session.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkRows(session.sql(s"SELECT * FROM 
$cachingTestTable"), Seq.empty) - } + withTable(cachingTestTable) { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() + checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) + + val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") + catalog.dropTable(testIdent) + catalog.createTable( + testIdent, + new TableInfo.Builder() + .withColumns(Array( + Column.create("id", IntegerType), + Column.create("salary", IntegerType))) + .build()) + + // Caching connector returns stale table: drop/recreate invisible + checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) + + // REFRESH TABLE invalidates the connector cache, new empty table visible + spark.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq.empty) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2TempViewWithStoredPlanTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2TempViewWithStoredPlanTests.scala index 9f8a93e30550f..e473968794c37 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2TempViewWithStoredPlanTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2TempViewWithStoredPlanTests.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.types.{IntegerType, LongType, StringType} * backed by DSv2 tables correctly handle data changes, schema changes, and table recreation, * both via session SQL and external catalog mutations. * - * NOTE: All `session.sql(...)` calls append `.collect()` because Connect client DataFrames + * NOTE: All `spark.sql(...)` calls append `.collect()` because Connect client DataFrames * are lazy and require an action to trigger execution. In classic mode `.collect()` on DDL * is a no-op (DDL executes eagerly), so this is harmless. 
*/ @@ -35,143 +35,143 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 1.1 (session write) test(s"${testPrefix}temp view with stored plan reflects session write") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - session.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() - checkRows(session.table("v"), Seq(Row(1, 100), Row(2, 200))) + spark.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() + checkAnswer(spark.table("v"), Seq(Row(1, 100), Row(2, 200))) } } } // Scenario 1.2 (external write) test(s"${testPrefix}temp view with stored plan reflects external write") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val 
catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - checkRows(session.table("v"), Seq(Row(1, 100), Row(2, 200))) + checkAnswer(spark.table("v"), Seq(Row(1, 100), Row(2, 200))) } } } // Scenario 1.2 connector w/ cache (external write, caching connector) test(s"${testPrefix}connector w/ cache: temp view stale after external write") { - withTestSession { session => - withTestTableAndViews(session, cachingTestTable, Seq("v")) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() + withTable(cachingTestTable) { + withView("v") { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() - session.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") + val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) // Caching connector returns stale table: external write invisible - checkRows(session.table("v"), Seq(Row(1, 100))) + checkAnswer(spark.table("v"), Seq(Row(1, 100))) // REFRESH TABLE invalidates the connector cache, external write becomes visible - session.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkRows(session.table("v"), Seq(Row(1, 100), Row(2, 200))) + spark.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkAnswer(spark.table("v"), Seq(Row(1, 100), Row(2, 
200))) } } } // Scenario 2.1 (session ADD COLUMN) test(s"${testPrefix}temp view with stored plan preserves schema after session ADD COLUMN") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - session.sql(s"ALTER TABLE $testTable ADD COLUMN new_column INT").collect() - session.sql(s"INSERT INTO $testTable VALUES (2, 200, -1)").collect() + spark.sql(s"ALTER TABLE $testTable ADD COLUMN new_column INT").collect() + spark.sql(s"INSERT INTO $testTable VALUES (2, 200, -1)").collect() // view preserves original 2-column schema, filter still applied - checkRows(session.table("v"), Seq(Row(1, 100), Row(2, 200))) + checkAnswer(spark.table("v"), Seq(Row(1, 100), Row(2, 200))) } } } // Scenario 2.2 (external ADD COLUMN) test(s"${testPrefix}temp view with stored plan preserves schema after external ADD COLUMN") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") 
- checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) // external schema change via catalog API - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") val addCol = TableChange.addColumn(Array("new_column"), IntegerType, true) catalog.alterTable(testIdent, addCol) externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) // view preserves original 2-column schema, filter still applied - checkRows(session.table("v"), Seq(Row(1, 100), Row(2, 200))) + checkAnswer(spark.table("v"), Seq(Row(1, 100), Row(2, 200))) } } } // Scenario 2.2 connector w/ cache (external ADD COLUMN, caching connector) test(s"${testPrefix}connector w/ cache: temp view stale after external ADD COLUMN") { - withTestSession { session => - withTestTableAndViews(session, cachingTestTable, Seq("v")) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() + withTable(cachingTestTable) { + withView("v") { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() - session.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") + val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") val addCol = TableChange.addColumn(Array("new_column"), IntegerType, true) catalog.alterTable(testIdent, addCol) externalAppend(catalog = catalog, 
ident = testIdent, row = InternalRow(2, 200, -1)) // Caching connector returns stale table: external changes invisible - checkRows(session.table("v"), Seq(Row(1, 100))) + checkAnswer(spark.table("v"), Seq(Row(1, 100))) // REFRESH TABLE invalidates the connector cache, view preserves original 2-column schema - session.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkRows(session.table("v"), Seq(Row(1, 100), Row(2, 200))) + spark.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkAnswer(spark.table("v"), Seq(Row(1, 100), Row(2, 200))) } } } // Scenario 3.1 (session column removal) test(s"${testPrefix}temp view with stored plan detects session column removal") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - session.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() + spark.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() checkError( - exception = intercept[AnalysisException] { session.table("v").collect() }, + exception = intercept[AnalysisException] { spark.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -184,20 +184,20 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 3.2 (external column removal) test(s"${testPrefix}temp view with stored plan detects 
external column removal") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") val dropCol = TableChange.deleteColumn(Array("salary"), false) catalog.alterTable(testIdent, dropCol) checkError( - exception = intercept[AnalysisException] { session.table("v").collect() }, + exception = intercept[AnalysisException] { spark.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -210,25 +210,25 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 3.2 connector w/ cache (external column removal, caching connector) test(s"${testPrefix}connector w/ cache: temp view stale after external column removal") { - withTestSession { session => - withTestTableAndViews(session, cachingTestTable, Seq("v")) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() + withTable(cachingTestTable) { + withView("v") { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), 
(10, 1000)").collect() - session.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") + val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") val dropCol = TableChange.deleteColumn(Array("salary"), false) catalog.alterTable(testIdent, dropCol) // Caching connector returns stale table: column removal invisible, no error - checkRows(session.table("v"), Seq(Row(1, 100))) + checkAnswer(spark.table("v"), Seq(Row(1, 100))) // REFRESH TABLE invalidates the connector cache, column removal detected - session.sql(s"REFRESH TABLE $cachingTestTable").collect() + spark.sql(s"REFRESH TABLE $cachingTestTable").collect() checkError( - exception = intercept[AnalysisException] { session.table("v").collect() }, + exception = intercept[AnalysisException] { spark.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -241,43 +241,43 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 4.1 (session drop and recreate table) test(s"${testPrefix}temp view with stored plan resolves to session-recreated table") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - 
checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") val originalTableId = catalog.loadTable(testIdent).id - session.sql(s"DROP TABLE $testTable").collect() - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"DROP TABLE $testTable").collect() + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() val newTableId = catalog.loadTable(testIdent).id assert(originalTableId != newTableId) // view resolves to the new empty table - checkRows(session.table("v"), Seq.empty) + checkAnswer(spark.table("v"), Seq.empty) - session.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() - checkRows(session.table("v"), Seq(Row(2, 200))) + spark.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() + checkAnswer(spark.table("v"), Seq(Row(2, 200))) } } } // Scenario 4.2 (external drop and recreate table) test(s"${testPrefix}temp view with stored plan resolves to externally recreated table") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[InMemoryTableCatalog](session, 
"testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") val originalTableId = catalog.loadTable(testIdent).id catalog.dropTable(testIdent) @@ -293,25 +293,25 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { assert(originalTableId != newTableId) // view resolves to the new empty table - checkRows(session.table("v"), Seq.empty) + checkAnswer(spark.table("v"), Seq.empty) - session.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() - checkRows(session.table("v"), Seq(Row(2, 200))) + spark.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() + checkAnswer(spark.table("v"), Seq(Row(2, 200))) } } } // Scenario 4.2 connector w/ cache (external drop/recreate, caching connector) test(s"${testPrefix}connector w/ cache: temp view stale after external drop/recreate") { - withTestSession { session => - withTestTableAndViews(session, cachingTestTable, Seq("v")) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() + withTable(cachingTestTable) { + withView("v") { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() - session.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") + val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") catalog.dropTable(testIdent) catalog.createTable( testIdent, @@ -322,11 +322,11 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { .build()) // Caching connector returns stale table: drop/recreate invisible 
- checkRows(session.table("v"), Seq(Row(1, 100))) + checkAnswer(spark.table("v"), Seq(Row(1, 100))) // REFRESH TABLE invalidates the connector cache, view resolves to new empty table - session.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkRows(session.table("v"), Seq.empty) + spark.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkAnswer(spark.table("v"), Seq.empty) } } } @@ -334,29 +334,29 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 5.1 (session drop and re-add column with same type, multiple views) test(s"${testPrefix}temp view with stored plan after session drop and re-add column same type" + " with unfiltered view") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v", "v_no_filter", "v_filter_is_null")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - session.table(testTable).createOrReplaceTempView("v_no_filter") - session.table(testTable).filter("salary IS NULL") + withTable(testTable) { + withView("v", "v_no_filter", "v_filter_is_null") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + spark.table(testTable).createOrReplaceTempView("v_no_filter") + spark.table(testTable).filter("salary IS NULL") .createOrReplaceTempView("v_filter_is_null") - checkRows(session.table("v"), Seq(Row(1, 100))) - checkRows(session.table("v_no_filter"), Seq(Row(1, 100), Row(10, 1000))) - checkRows(session.table("v_filter_is_null"), Seq.empty) + checkAnswer(spark.table("v"), Seq(Row(1, 100))) + checkAnswer(spark.table("v_no_filter"), Seq(Row(1, 100), Row(10, 1000))) + 
checkAnswer(spark.table("v_filter_is_null"), Seq.empty) // drop and re-add column with same name and type - session.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() - session.sql(s"ALTER TABLE $testTable ADD COLUMN salary INT").collect() + spark.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() + spark.sql(s"ALTER TABLE $testTable ADD COLUMN salary INT").collect() // salary values are now null, so the filtered view returns nothing - checkRows(session.table("v"), Seq.empty) + checkAnswer(spark.table("v"), Seq.empty) // unfiltered view returns rows with null salary - checkRows(session.table("v_no_filter"), Seq(Row(1, null), Row(10, null))) + checkAnswer(spark.table("v_no_filter"), Seq(Row(1, null), Row(10, null))) // IS NULL filter now matches all rows - checkRows(session.table("v_filter_is_null"), Seq(Row(1, null), Row(10, null))) + checkAnswer(spark.table("v_filter_is_null"), Seq(Row(1, null), Row(10, null))) } } } @@ -364,31 +364,31 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 5.2 (external drop and re-add column with same type) test(s"${testPrefix}temp view with stored plan after external drop and re-add column " + "same type") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v", "v_no_filter", "v_filter_is_null")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - session.table(testTable).createOrReplaceTempView("v_no_filter") - session.table(testTable).filter("salary IS NULL") + withTable(testTable) { + withView("v", "v_no_filter", "v_filter_is_null") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + + spark.table(testTable).filter("salary < 
999").createOrReplaceTempView("v") + spark.table(testTable).createOrReplaceTempView("v_no_filter") + spark.table(testTable).filter("salary IS NULL") .createOrReplaceTempView("v_filter_is_null") - checkRows(session.table("v"), Seq(Row(1, 100))) - checkRows(session.table("v_no_filter"), Seq(Row(1, 100), Row(10, 1000))) - checkRows(session.table("v_filter_is_null"), Seq.empty) + checkAnswer(spark.table("v"), Seq(Row(1, 100))) + checkAnswer(spark.table("v_no_filter"), Seq(Row(1, 100), Row(10, 1000))) + checkAnswer(spark.table("v_filter_is_null"), Seq.empty) // external drop and re-add column via catalog API - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") val dropCol = TableChange.deleteColumn(Array("salary"), false) val addCol = TableChange.addColumn(Array("salary"), IntegerType, true) catalog.alterTable(testIdent, dropCol, addCol) // salary values are now null, so the filtered view returns nothing - checkRows(session.table("v"), Seq.empty) + checkAnswer(spark.table("v"), Seq.empty) // unfiltered view returns rows with null salary - checkRows(session.table("v_no_filter"), Seq(Row(1, null), Row(10, null))) + checkAnswer(spark.table("v_no_filter"), Seq(Row(1, null), Row(10, null))) // IS NULL filter now matches all rows - checkRows(session.table("v_filter_is_null"), Seq(Row(1, null), Row(10, null))) + checkAnswer(spark.table("v_filter_is_null"), Seq(Row(1, null), Row(10, null))) } } } @@ -396,44 +396,44 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 5.2 connector w/ cache (external drop/re-add column, caching connector) test(s"${testPrefix}connector w/ cache: temp view stale after external drop/re-add column " + "same type") { - withTestSession { session => - withTestTableAndViews(session, cachingTestTable, Seq("v")) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT 
INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() + withTable(cachingTestTable) { + withView("v") { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() - session.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") + val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") val dropCol = TableChange.deleteColumn(Array("salary"), false) val addCol = TableChange.addColumn(Array("salary"), IntegerType, true) catalog.alterTable(testIdent, dropCol, addCol) // Caching connector returns stale table: column drop/re-add invisible - checkRows(session.table("v"), Seq(Row(1, 100))) + checkAnswer(spark.table("v"), Seq(Row(1, 100))) // REFRESH TABLE invalidates the connector cache, salary values are null - session.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkRows(session.table("v"), Seq.empty) + spark.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkAnswer(spark.table("v"), Seq.empty) } } } // Scenario 6.1 (session drop and re-add column with different type) test(s"${testPrefix}temp view with stored plan detects session column type change") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - 
session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - session.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() - session.sql(s"ALTER TABLE $testTable ADD COLUMN salary STRING").collect() + spark.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() + spark.sql(s"ALTER TABLE $testTable ADD COLUMN salary STRING").collect() checkError( - exception = intercept[AnalysisException] { session.table("v").collect() }, + exception = intercept[AnalysisException] { spark.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -446,21 +446,21 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 6.2 (external drop and re-add column with different type) test(s"${testPrefix}temp view with stored plan detects external column type change") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") val dropCol = TableChange.deleteColumn(Array("salary"), false) val addCol 
= TableChange.addColumn(Array("salary"), StringType, true) catalog.alterTable(testIdent, dropCol, addCol) checkError( - exception = intercept[AnalysisException] { session.table("v").collect() }, + exception = intercept[AnalysisException] { spark.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -473,26 +473,26 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 6.2 connector w/ cache (external column type change, caching connector) test(s"${testPrefix}connector w/ cache: temp view stale after external column type change") { - withTestSession { session => - withTestTableAndViews(session, cachingTestTable, Seq("v")) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() + withTable(cachingTestTable) { + withView("v") { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() - session.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") + val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") val dropCol = TableChange.deleteColumn(Array("salary"), false) val addCol = TableChange.addColumn(Array("salary"), StringType, true) catalog.alterTable(testIdent, dropCol, addCol) // Caching connector returns stale table: type change invisible, no error - checkRows(session.table("v"), Seq(Row(1, 100))) + checkAnswer(spark.table("v"), Seq(Row(1, 100))) // REFRESH TABLE invalidates the connector cache, type 
change detected - session.sql(s"REFRESH TABLE $cachingTestTable").collect() + spark.sql(s"REFRESH TABLE $cachingTestTable").collect() checkError( - exception = intercept[AnalysisException] { session.table("v").collect() }, + exception = intercept[AnalysisException] { spark.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -505,18 +505,18 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 7.1 (session type widening from INT to BIGINT) test(s"${testPrefix}temp view with stored plan detects session type widening") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - session.sql(s"ALTER TABLE $testTable ALTER COLUMN salary TYPE LONG").collect() + spark.sql(s"ALTER TABLE $testTable ALTER COLUMN salary TYPE LONG").collect() checkError( - exception = intercept[AnalysisException] { session.table("v").collect() }, + exception = intercept[AnalysisException] { spark.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -529,20 +529,20 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 7.2 (external type widening from INT to BIGINT) test(s"${testPrefix}temp view with stored plan 
detects external type widening") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") val updateType = TableChange.updateColumnType(Array("salary"), LongType) catalog.alterTable(testIdent, updateType) checkError( - exception = intercept[AnalysisException] { session.table("v").collect() }, + exception = intercept[AnalysisException] { spark.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -555,25 +555,25 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 7.2 connector w/ cache (external type widening, caching connector) test(s"${testPrefix}connector w/ cache: temp view stale after external type widening") { - withTestSession { session => - withTestTableAndViews(session, cachingTestTable, Seq("v")) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() + withTable(cachingTestTable) { + withView("v") { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable 
VALUES (1, 100), (10, 1000)").collect() - session.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") + val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") val updateType = TableChange.updateColumnType(Array("salary"), LongType) catalog.alterTable(testIdent, updateType) // Caching connector returns stale table: type change invisible, no error - checkRows(session.table("v"), Seq(Row(1, 100))) + checkAnswer(spark.table("v"), Seq(Row(1, 100))) // REFRESH TABLE invalidates the connector cache, type change detected - session.sql(s"REFRESH TABLE $cachingTestTable").collect() + spark.sql(s"REFRESH TABLE $cachingTestTable").collect() checkError( - exception = intercept[AnalysisException] { session.table("v").collect() }, + exception = intercept[AnalysisException] { spark.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", From 749120e34c248cb697a6f458b5819787df4982a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Wed, 3 Jun 2026 20:06:49 +0000 Subject: [PATCH 15/58] Add example suite --- .../sql/connect/ExampleConnectSuite.scala | 22 +++++++++ .../org/apache/spark/sql/ExampleSuite.scala | 47 +++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 sql/connect/server/src/test/scala/org/apache/spark/sql/connect/ExampleConnectSuite.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/ExampleSuite.scala diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/ExampleConnectSuite.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/ExampleConnectSuite.scala new file mode 100644 index 
0000000000000..98e0a923f803b --- /dev/null +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/ExampleConnectSuite.scala @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connect + +import org.apache.spark.sql + +class ExampleConnectSuite extends sql.SparkSessionBinder diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExampleSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExampleSuite.scala new file mode 100644 index 0000000000000..9c354a88d4cd2 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/ExampleSuite.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +class ExampleSuite extends SparkSessionBinder { + + test("replaceWhere with partitioned table preserves all partitions") { + withTable("foo") { + val data = Seq( + (1, "Alice", 29), + (2, "Bob", 35), + (3, "Charlie", 23), + ) + + val df = spark.createDataFrame(data).toDF("id", "name", "age") + + df.write.partitionBy("age").format("delta").saveAsTable("foo") + + val data1 = Seq((1, "Blice", 29)) + + val df1 = spark.createDataFrame(data1).toDF("id", "name", "age") + + df1.write + .format("delta") + .option("replaceWhere", "age = 29") + .mode("overwrite") + .saveAsTable("foo") + + assert(spark.sql("SHOW PARTITIONS foo").count() == 3) + } + } +} From 0b427b4a0c222aac80cd1a0293e05177a3ad4f19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Tue, 9 Jun 2026 13:39:55 +0000 Subject: [PATCH 16/58] Minimize sql.SparkSessionBinder stuff --- .../apache/spark/sql/SparkSessionBinder.scala | 10 +++--- .../sql/classic/SparkSessionBinder.scala | 33 ++--------------- .../spark/sql/test/SharedSparkSession.scala | 36 +++++++++++++++++-- 3 files changed, 40 insertions(+), 39 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala index a3ca244ca3718..b92d073d9a65e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala @@ -19,17 +19,17 @@ package org.apache.spark.sql import scala.concurrent.duration._ 
-import org.scalatest.{BeforeAndAfterEach, Suite} +import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, Suite} import org.scalatest.concurrent.Eventually -import org.apache.spark.{DebugFilesystem, SparkConf} +import org.apache.spark.{DebugFilesystem, SparkConf, SparkFunSuite} import org.apache.spark.internal.config.UNSAFE_EXCEPTION_ON_MEMORY_LEAK import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} import org.apache.spark.sql.test.TestSparkSession -trait SparkSessionBinder extends QueryTest with SparkSessionBinderBase { +trait SparkSessionBinder extends SparkSessionBinderBase { self: SparkFunSuite => /** * Suites extending this trait are sharing resources (e.g. SparkSession) in their @@ -56,9 +56,9 @@ trait SparkSessionBinder extends QueryTest with SparkSessionBinderBase { } trait SparkSessionBinderBase - extends QueryTestBase - with SparkSessionProvider + extends SparkSessionProvider with BeforeAndAfterEach + with BeforeAndAfterAll with Eventually { self: Suite => protected def sparkConf = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionBinder.scala b/sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionBinder.scala index 68920a445e5fc..2f79876d841d8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionBinder.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/classic/SparkSessionBinder.scala @@ -17,40 +17,11 @@ package org.apache.spark.sql.classic -import scala.concurrent.duration._ - -import org.apache.spark.sql +import org.apache.spark.{sql, SparkFunSuite} /** * Overrides [[spark]] to provide a [[SparkSession classic.SparkSession]] */ -trait SparkSessionBinder extends sql.SparkSessionBinder with QueryTest { +trait SparkSessionBinder extends sql.SparkSessionBinder { self: SparkFunSuite => override protected def spark: SparkSession 
= super.spark.asInstanceOf[SparkSession] - - // Runs func (which must trigger exactly one SQL execution) and returns the SQL metrics of that - // execution as a map keyed by (planNodeId, planNodeName, metricName) -> metricValue. - def runAndFetchMetrics(func: => Unit): Map[(Long, String, String), String] = { - val statusStore = spark.sharedState.statusStore - val oldCount = statusStore.executionsList().size - - func - - // Wait until the new execution is started and being tracked. - eventually(timeout(10.seconds), interval(10.milliseconds)) { - assert(statusStore.executionsCount() >= oldCount) - } - - // Wait for listener to finish computing the metrics for the execution. - eventually(timeout(10.seconds), interval(10.milliseconds)) { - assert(statusStore.executionsList().nonEmpty && - statusStore.executionsList().last.metricValues != null) - } - - val exec = statusStore.executionsList().last - val execId = exec.executionId - val sqlMetrics = statusStore.planGraph(execId).allNodes - .flatMap(n => n.metrics.map(m => (m.accumulatorId, (n.id, n.name, m.name)))) - .toMap - statusStore.executionMetrics(execId).map { case (k, v) => sqlMetrics(k) -> v } - } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala index c52bcd4aa9c2f..b6c875dcf1253 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala @@ -17,20 +17,50 @@ package org.apache.spark.sql.test +import scala.concurrent.duration._ + import org.scalatest.Suite -import org.apache.spark.sql +import org.apache.spark.sql.{QueryTestBase, QueryTest, SparkSessionBinderBase} import org.apache.spark.sql.classic @deprecated("Use SparkSessionBinder (or classic.SparkSessionBinder if required) instead", "4.2.0") -trait SharedSparkSession extends classic.SparkSessionBinder +trait SharedSparkSession extends 
QueryTest with classic.SparkSessionBinder { + + // Runs func (which must trigger exactly one SQL execution) and returns the SQL metrics of that + // execution as a map keyed by (planNodeId, planNodeName, metricName) -> metricValue. + def runAndFetchMetrics(func: => Unit): Map[(Long, String, String), String] = { + val statusStore = spark.sharedState.statusStore + val oldCount = statusStore.executionsList().size + + func + + // Wait until the new execution is started and being tracked. + eventually(timeout(10.seconds), interval(10.milliseconds)) { + assert(statusStore.executionsCount() >= oldCount) + } + + // Wait for listener to finish computing the metrics for the execution. + eventually(timeout(10.seconds), interval(10.milliseconds)) { + assert(statusStore.executionsList().nonEmpty && + statusStore.executionsList().last.metricValues != null) + } + + val exec = statusStore.executionsList().last + val execId = exec.executionId + val sqlMetrics = statusStore.planGraph(execId).allNodes + .flatMap(n => n.metrics.map(m => (m.accumulatorId, (n.id, n.name, m.name)))) + .toMap + statusStore.executionMetrics(execId).map { case (k, v) => sqlMetrics(k) -> v } + } +} /** * Helper trait for SQL test suites where all tests share a single [[TestSparkSession]]. 
*/ @deprecated("Use SparkSessionBinder (or classic.SparkSessionBinder if required) instead", "4.2.0") -trait SharedSparkSessionBase extends sql.SparkSessionBinderBase { self: Suite => +trait SharedSparkSessionBase extends QueryTestBase with SparkSessionBinderBase { self: Suite => protected override def spark: classic.SparkSession = super.spark.asInstanceOf[classic.SparkSession] From 6e813768a2ee1010dec163c39ef7378bc685f545 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Tue, 9 Jun 2026 20:47:00 +0000 Subject: [PATCH 17/58] WIP --- .../apache/spark/sql/CheckAnswerHelper.scala | 196 ++++++++++++++++++ .../apache/spark/sql/QueryCleanupHelper.scala | 68 ++++++ .../spark/sql/SessionQueryTestBase.scala | 28 +++ .../spark/sql/SparkSessionProvider.scala | 0 .../DataSourceV2DataFrameConnectSuite.scala | 3 +- .../sql/connect/ExampleConnectSuite.scala | 2 +- ...QueryTest.scala => SessionQueryTest.scala} | 23 +- .../sql/connect/SparkSessionBinder.scala | 11 +- .../apache/spark/sql/CheckAnswerHelper.scala | 56 +++++ .../org/apache/spark/sql/ExampleSuite.scala | 2 +- .../org/apache/spark/sql/QueryTest.scala | 80 +++---- .../apache/spark/sql/SessionQueryTest.scala | 40 ++++ .../apache/spark/sql/SparkSessionBinder.scala | 5 + .../apache/spark/sql/classic/QueryTest.scala | 7 +- .../spark/sql/classic/SessionQueryTest.scala | 42 ++++ .../DSv2ExternalMutationTestBase.scala | 4 +- .../DSv2RepeatedTableAccessTests.scala | 2 +- .../spark/sql/test/SharedSparkSession.scala | 6 +- 18 files changed, 475 insertions(+), 100 deletions(-) create mode 100644 sql/api/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala create mode 100644 sql/api/src/test/scala/org/apache/spark/sql/QueryCleanupHelper.scala create mode 100644 sql/api/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala rename sql/{core => api}/src/test/scala/org/apache/spark/sql/SparkSessionProvider.scala (100%) rename 
sql/connect/server/src/test/scala/org/apache/spark/sql/connect/{QueryTest.scala => SessionQueryTest.scala} (54%) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTest.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/classic/SessionQueryTest.scala diff --git a/sql/api/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala b/sql/api/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala new file mode 100644 index 0000000000000..cc9710e291227 --- /dev/null +++ b/sql/api/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.util.TimeZone + +import scala.jdk.CollectionConverters._ +import scala.language.implicitConversions + +import org.scalatest.Assertions + +import org.apache.spark.util.{SparkErrorUtils, SparkStringUtils} +import org.apache.spark.util.ArrayImplicits._ + +trait CheckAnswerHelper extends Assertions { + + /** + * Runs the plan and makes sure the answer matches the expected result. 
+ * + * @param df the DataFrame to be executed + * @param expectedAnswer the expected result in a Seq of Rows. + */ + protected def checkAnswer(df: => DataFrame, expectedAnswer: Seq[Row]): Unit = { + getErrorMessageInCheckAnswer(df, expectedAnswer) match { + case Some(errorMessage) => fail(errorMessage) + case None => + } + } + + protected def checkAnswer(df: => DataFrame, expectedAnswer: Row): Unit = { + checkAnswer(df, Seq(expectedAnswer)) + } + + protected def checkAnswer(df: => DataFrame, expectedAnswer: DataFrame): Unit = { + checkAnswer(df, expectedAnswer.collect().toImmutableArraySeq) + } + + protected def checkAnswer(df: => DataFrame, expectedAnswer: Array[Row]): Unit = { + checkAnswer(df, expectedAnswer.toImmutableArraySeq) + } + + protected def checkAnswer(df: DataFrame, expectedAnswer: java.util.List[Row]): Unit = { + checkAnswer(df, expectedAnswer.asScala.toSeq) + } + + protected def isDfSorted(df: DataFrame): Boolean + + /** + * Runs the plan and makes sure the answer matches the expected result. + * If there was an exception during the execution or the contents of the DataFrame do not + * match the expected result, an error message will be returned. Otherwise, None will + * be returned. + * + * @param df the DataFrame to be executed + * @param expectedAnswer the expected result in a Seq of Rows.
+ */ + private def getErrorMessageInCheckAnswer( + df: DataFrame, + expectedAnswer: Seq[Row]): Option[String] = { + val sparkAnswer = try df.collect().toSeq catch { + case e: Exception => + val errorMessage = + s""" + |Exception thrown while executing query: + |${df.queryExecution} + |== Exception == + |$e + |${SparkErrorUtils.stackTraceToString(e)} + """.stripMargin + return Some(errorMessage) + } + + sameRows(expectedAnswer, sparkAnswer, isDfSorted(df)).map { results => + s""" + |Results do not match for query: + |Timezone: ${TimeZone.getDefault} + |Timezone Env: ${sys.env.getOrElse("TZ", "")} + | + |${df.queryExecution} + |== Results == + |$results + """.stripMargin + } + } + + private def prepareAnswer(answer: Seq[Row], isSorted: Boolean): Seq[Row] = { + // Converts data to types that we can do equality comparison using Scala collections. + // For BigDecimal type, the Scala type has a better definition of equality test (similar to + // Java's java.math.BigDecimal.compareTo). + // For binary arrays, we convert it to Seq to avoid calling java.util.Arrays.equals for + // equality test. + val converted: Seq[Row] = answer.map(prepareRow) + if (!isSorted) converted.sortBy(_.toString()) else converted + } + + // We need to call prepareRow recursively to handle schemas with struct types. + private def prepareRow(row: Row): Row = { + Row.fromSeq(row.toSeq.map { + case null => null + case bd: java.math.BigDecimal => BigDecimal(bd) + // Equality of WrappedArray differs for AnyVal and AnyRef in Scala 2.12.2+ + case seq: Seq[_] => seq.map { + case b: java.lang.Byte => b.byteValue + case s: java.lang.Short => s.shortValue + case i: java.lang.Integer => i.intValue + case l: java.lang.Long => l.longValue + case f: java.lang.Float => f.floatValue + case d: java.lang.Double => d.doubleValue + case x => x + } + // Convert array to Seq for easy equality check.
+ case b: Array[_] => b.toSeq + case r: Row => prepareRow(r) + // SPARK-51349: "null" and null had the same precedence in sorting + case "null" => "__null_string__" + case o => o + }) + } + + private def genError( + expectedAnswer: Seq[Row], + sparkAnswer: Seq[Row], + isSorted: Boolean = false): String = { + val getRowType: Option[Row] => String = row => + row.map(row => + if (row.schema == null) { + "struct<>" + } else { + s"${row.schema.catalogString}" + }).getOrElse("struct<>") + + s""" + |== Results == + |${ + SparkStringUtils.sideBySide( + s"== Correct Answer - ${expectedAnswer.size} ==" +: + getRowType(expectedAnswer.headOption) +: + prepareAnswer(expectedAnswer, isSorted).map(_.toString()), + s"== Spark Answer - ${sparkAnswer.size} ==" +: + getRowType(sparkAnswer.headOption) +: + prepareAnswer(sparkAnswer, isSorted).map(_.toString())).mkString("\n") + } + """.stripMargin + } + + private def compare(obj1: Any, obj2: Any): Boolean = (obj1, obj2) match { + case (null, null) => true + case (null, _) => false + case (_, null) => false + case (a: Array[_], b: Array[_]) => + a.length == b.length && a.zip(b).forall { case (l, r) => compare(l, r)} + case (a: Map[_, _], b: Map[_, _]) => + a.size == b.size && a.keys.forall { aKey => + b.keys.find(bKey => compare(aKey, bKey)).exists(bKey => compare(a(aKey), b(bKey))) + } + case (a: Iterable[_], b: Iterable[_]) => + a.size == b.size && a.zip(b).forall { case (l, r) => compare(l, r)} + case (a: Product, b: Product) => + compare(a.productIterator.toSeq, b.productIterator.toSeq) + case (a: Row, b: Row) => + compare(a.toSeq, b.toSeq) + // 0.0 == -0.0, turn float/double to bits before comparison, to distinguish 0.0 and -0.0. 
+ // in some hardware NaN can be represented with different bits, so first check for it + case (a: Double, b: Double) => + a.isNaN && b.isNaN || + java.lang.Double.doubleToRawLongBits(a) == java.lang.Double.doubleToRawLongBits(b) + case (a: Float, b: Float) => + a.isNaN && b.isNaN || + java.lang.Float.floatToRawIntBits(a) == java.lang.Float.floatToRawIntBits(b) + case (a, b) => a == b + } + + private def sameRows( expectedAnswer: Seq[Row], + sparkAnswer: Seq[Row], + isSorted: Boolean = false): Option[String] = { + if (!compare(prepareAnswer(expectedAnswer, isSorted), prepareAnswer(sparkAnswer, isSorted))) { + return Some(genError(expectedAnswer, sparkAnswer, isSorted)) + } + None + } +} diff --git a/sql/api/src/test/scala/org/apache/spark/sql/QueryCleanupHelper.scala b/sql/api/src/test/scala/org/apache/spark/sql/QueryCleanupHelper.scala new file mode 100644 index 0000000000000..49658bfe64a78 --- /dev/null +++ b/sql/api/src/test/scala/org/apache/spark/sql/QueryCleanupHelper.scala @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql + +import org.scalatest.Assertions + +import org.apache.spark.annotation.Experimental +import org.apache.spark.util.SparkErrorUtils + +/** + * Provides [[withTable]], [[withView]], and [[withUserDefinedFunction]] + */ +@Experimental +trait QueryCleanupHelper extends SparkSessionProvider with Assertions { + + /** + * Drops the tables named in `tableNames` after calling `f`. + */ + protected def withTable(tableNames: String*)(f: => Unit): Unit = { + SparkErrorUtils.tryWithSafeFinally(f) { + tableNames.foreach { name => + spark.sql(s"DROP TABLE IF EXISTS $name") + } + } + } + + /** + * Drops the views named in `viewNames` after calling `f`. + */ + protected def withView(viewNames: String*)(f: => Unit): Unit = { + SparkErrorUtils.tryWithSafeFinally(f)( + viewNames.foreach { name => + spark.sql(s"DROP VIEW IF EXISTS $name") + } + ) + } + + protected def withUserDefinedFunction(functions: (String, Boolean)*)(f: => Unit): Unit = { + try { + f + } catch { + case cause: Throwable => throw cause + } finally { + functions.foreach { case (functionName, isTemporary) => + val withTemporary = if (isTemporary) "TEMPORARY" else "" + spark.sql(s"DROP $withTemporary FUNCTION IF EXISTS $functionName") + assert( + !spark.catalog.functionExists(functionName), + s"Function $functionName should have been dropped. But, it still exists.") + } + } + } +} diff --git a/sql/api/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala b/sql/api/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala new file mode 100644 index 0000000000000..6a0e4f199665b --- /dev/null +++ b/sql/api/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +// scalastyle:off funsuite +import org.scalatest.funsuite.AnyFunSuite +// scalastyle:on + +trait SessionQueryTestBase + extends AnyFunSuite + with SparkSessionProvider + with CheckAnswerHelper + with QueryCleanupHelper diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionProvider.scala b/sql/api/src/test/scala/org/apache/spark/sql/SparkSessionProvider.scala similarity index 100% rename from sql/core/src/test/scala/org/apache/spark/sql/SparkSessionProvider.scala rename to sql/api/src/test/scala/org/apache/spark/sql/SparkSessionProvider.scala diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala index 990dedb7435dd..edd5236cc9c56 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala @@ -34,7 +34,7 @@ import org.apache.spark.sql.connector.catalog.{CachingInMemoryTableCatalog, InMe * this class only provides the Connect-specific session, catalog access, and result comparison. 
*/ class DataSourceV2DataFrameConnectSuite - extends SparkSessionBinder + extends SessionQueryTest with DSv2TempViewWithStoredPlanTests with DSv2RepeatedTableAccessTests with DSv2IncrementallyConstructedQueryTests @@ -53,7 +53,6 @@ class DataSourceV2DataFrameConnectSuite .set("spark.sql.catalog.nullbothidscat.copyOnLoad", "true") override protected def testPrefix: String = "[connect] " - override protected def isConnect: Boolean = true override protected def getTableCatalog[C <: TableCatalog: ClassTag]( session: SparkSession, diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/ExampleConnectSuite.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/ExampleConnectSuite.scala index 98e0a923f803b..469a57557f1b7 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/ExampleConnectSuite.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/ExampleConnectSuite.scala @@ -19,4 +19,4 @@ package org.apache.spark.sql.connect import org.apache.spark.sql -class ExampleConnectSuite extends sql.SparkSessionBinder +class ExampleConnectSuite extends sql.ExampleSuite with SessionQueryTest diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTest.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala similarity index 54% rename from sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTest.scala rename to sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala index ab3bd2c494311..c0fb38fd78804 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTest.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala @@ -14,28 +14,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - package org.apache.spark.sql.connect -import org.apache.spark.{sql => sqlApi} +import org.apache.spark.sql /** - * Extends [[sqlApi.QueryTest]] to provide connect-specific overrides to helpers like - * [[checkAnswer]] that avoid classic-only APIs. - * - * Can be used together with [[SparkSessionBinder connect.SparkSessionBinder]] to create a - * 'connect variant' of a test. - * - * Note: broader use will require more overrides. + * TODO write docstring */ -trait QueryTest extends sqlApi.QueryTest with SparkSessionProvider { - - override protected def checkAnswer( - df: => sqlApi.DataFrame, expectedAnswer: Seq[sqlApi.Row]): Unit = { - val sparkAnswer = df.collect().toSeq - sqlApi.QueryTest.sameRows(expectedAnswer, sparkAnswer) match { - case Some(errorMessage) => fail(errorMessage) - case None => - } - } +trait SessionQueryTest extends sql.SessionQueryTest with SparkSessionBinder { + override def isDfSorted(df: sql.DataFrame): Boolean = false // TODO } diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala index e306c23b07a77..ee5ec18d74d70 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.connect import java.util.UUID -import org.apache.spark.SparkEnv +import org.apache.spark.{SparkEnv, SparkFunSuite} import org.apache.spark.sql import org.apache.spark.sql.classic import org.apache.spark.sql.connect.client.SparkConnectClient @@ -31,15 +31,8 @@ import org.apache.spark.sql.connect.service.SparkConnectService * Extends [[sql.SparkSessionBinder sql.SparkSessionBinder]] (which creates a * [[classic.SparkSession classic.SparkSession]] and SparkContext), then layers a Connect client * session on top by starting the gRPC service in-process. 
- * - * Mix in this trait to exercise existing sql/core test suites through the Connect path: - * {{{ - * class FooWithConnectSuite - * extends FooSuite - * with connect.SparkSessionBinder - * }}} */ -trait SparkSessionBinder extends sql.SparkSessionBinder with QueryTest { +trait SparkSessionBinder extends sql.SparkSessionBinder { self: SparkFunSuite => private var _connectSpark: SparkSession = _ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala b/sql/core/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala new file mode 100644 index 0000000000000..dfa23dd5a6b0d --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql + +import java.io.File +import java.net.URI +import java.nio.file.Files +import java.util.{Locale, TimeZone, UUID} +import java.util.regex.Pattern + +import scala.concurrent.duration._ +import scala.jdk.CollectionConverters._ +import scala.language.implicitConversions +import scala.util.control.NonFatal + +import org.apache.hadoop.fs.Path +import org.scalactic.source.Position +import org.scalatest.{Assertions, BeforeAndAfterAll, Suite, Tag} +import org.scalatest.concurrent.Eventually + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.ExtendedAnalysisException +import org.apache.spark.sql.catalyst.FunctionIdentifier +import org.apache.spark.sql.catalyst.analysis.{NoSuchTableException, UnresolvedAttribute} +import org.apache.spark.sql.catalyst.catalog.SessionCatalog.DEFAULT_DATABASE +import org.apache.spark.sql.catalyst.plans._ +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} +import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.execution.{FilterExec, QueryExecution, SparkPlan, SQLExecution} +import org.apache.spark.sql.execution.adaptive.DisableAdaptiveExecution +import org.apache.spark.sql.execution.columnar.InMemoryRelation +import org.apache.spark.sql.execution.datasources.DataSourceUtils +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SQLTestData +import org.apache.spark.sql.util.QueryExecutionListener +import org.apache.spark.storage.StorageLevel +import org.apache.spark.util.ArrayImplicits._ +import org.apache.spark.util.UninterruptibleThread +import org.apache.spark.util.Utils + +// TODO docstring diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExampleSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExampleSuite.scala index 9c354a88d4cd2..bff3cdb7a6d77 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ExampleSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/ExampleSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -class ExampleSuite extends SparkSessionBinder { +class ExampleSuite extends SessionQueryTest { test("replaceWhere with partitioned table preserves all partitions") { withTable("foo") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index 17212fa30b954..e6b3065b31fa2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -59,8 +59,13 @@ trait QueryTestBase extends Eventually with BeforeAndAfterAll with SQLTestData + with CheckAnswerHelper + with QueryCleanupHelper with PlanTestBase { self: Suite => + override protected def isDfSorted(df: DataFrame): Boolean = + df.logicalPlan.collectFirst { case s: logical.Sort => s }.nonEmpty + /** * Runs the plan and makes sure the answer contains all of the keywords. */ @@ -156,7 +161,7 @@ trait QueryTestBase * @param df the [[DataFrame]] to be executed * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s. 
*/ - protected def checkAnswer(df: => DataFrame, expectedAnswer: Seq[Row]): Unit = { + override protected def checkAnswer(df: => DataFrame, expectedAnswer: Seq[Row]): Unit = { val analyzedDF = try df catch { case ae: ExtendedAnalysisException => if (ae.plan.isDefined) { @@ -177,11 +182,11 @@ trait QueryTestBase QueryTest.checkAnswer(analyzedDF, expectedAnswer) } - protected def checkAnswer(df: => DataFrame, expectedAnswer: Row): Unit = { + override protected def checkAnswer(df: => DataFrame, expectedAnswer: Row): Unit = { checkAnswer(df, Seq(expectedAnswer)) } - protected def checkAnswer(df: => DataFrame, expectedAnswer: DataFrame): Unit = { + override protected def checkAnswer(df: => DataFrame, expectedAnswer: DataFrame): Unit = { checkAnswer(df, expectedAnswer.collect().toImmutableArraySeq) } @@ -191,7 +196,7 @@ trait QueryTestBase * @param df the [[DataFrame]] to be executed * @param expectedAnswer the expected result in a [[Array]] of [[Row]]s. */ - protected def checkAnswer(df: => DataFrame, expectedAnswer: Array[Row]): Unit = { + override protected def checkAnswer(df: => DataFrame, expectedAnswer: Array[Row]): Unit = { checkAnswer(df, expectedAnswer.toImmutableArraySeq) } @@ -202,6 +207,7 @@ trait QueryTestBase * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s. * @param absTol the absolute tolerance between actual and expected answers. */ + @deprecated("rarely used") protected def checkAggregatesWithTol(dataFrame: DataFrame, expectedAnswer: Seq[Row], absTol: Double): Unit = { @@ -216,6 +222,7 @@ trait QueryTestBase } } + @deprecated("rarely used") protected def checkAggregatesWithTol(dataFrame: DataFrame, expectedAnswer: Row, absTol: Double): Unit = { @@ -322,25 +329,6 @@ trait QueryTestBase } } - /** - * Drops functions after calling `f`. A function is represented by (functionName, isTemporary). 
- */ - protected def withUserDefinedFunction(functions: (String, Boolean)*)(f: => Unit): Unit = { - try { - f - } catch { - case cause: Throwable => throw cause - } finally { - functions.foreach { case (functionName, isTemporary) => - val withTemporary = if (isTemporary) "TEMPORARY" else "" - spark.sql(s"DROP $withTemporary FUNCTION IF EXISTS $functionName") - assert( - !spark.sessionState.catalog.functionExists(FunctionIdentifier(functionName)), - s"Function $functionName should have been dropped. But, it still exists.") - } - } - } - /** * Drops temporary view `viewNames` after calling `f`. */ @@ -367,28 +355,6 @@ trait QueryTestBase } } - /** - * Drops table `tableName` after calling `f`. - */ - protected def withTable(tableNames: String*)(f: => Unit): Unit = { - Utils.tryWithSafeFinally(f) { - tableNames.foreach { name => - spark.sql(s"DROP TABLE IF EXISTS $name") - } - } - } - - /** - * Drops view `viewName` after calling `f`. - */ - protected def withView(viewNames: String*)(f: => Unit): Unit = { - Utils.tryWithSafeFinally(f)( - viewNames.foreach { name => - spark.sql(s"DROP VIEW IF EXISTS $name") - } - ) - } - /** * Drops cache `cacheName` after calling `f`. */ @@ -463,6 +429,7 @@ trait QueryTestBase /** * Restores the current catalog/database after calling `f`. */ + @deprecated("rarely used") protected def withCurrentCatalogAndNamespace(f: => Unit): Unit = { val curCatalog = sql("select current_catalog()").head().getString(0) val curDatabase = sql("select current_database()").head().getString(0) @@ -535,6 +502,7 @@ trait QueryTestBase * does not contain a scheme, this path will not be changed after the default * FileSystem is changed. 
*/ + @deprecated("Classic-only method, use classic.QueryTest", "4.2.0") def makeQualifiedPath(path: String): URI = { val hadoopPath = new Path(path) val fs = hadoopPath.getFileSystem(spark.sessionState.newHadoopConf()) @@ -825,7 +793,8 @@ trait QueryTest extends SparkFunSuite with QueryTestBase { } } -object QueryTest extends Assertions { +@deprecated("superseded by CheckAnswerHelper", since = "4.2") +object QueryTest extends CheckAnswerHelper { /** * Runs the plan and makes sure the answer matches the expected result. * @@ -834,12 +803,18 @@ object QueryTest extends Assertions { * @param checkToRDD whether to verify deserialization to an RDD. This runs the query twice. */ def checkAnswer(df: DataFrame, expectedAnswer: Seq[Row], checkToRDD: Boolean = true): Unit = { - getErrorMessageInCheckAnswer(df, expectedAnswer, checkToRDD) match { - case Some(errorMessage) => fail(errorMessage) - case None => + if (checkToRDD) { + SQLExecution.withSQLConfPropagated(df.sparkSession) { + df.materializedRdd.count() // Also attempt to deserialize as an RDD [SPARK-15791] + } } + + super.checkAnswer(df, expectedAnswer) } + override protected def isDfSorted(df: DataFrame): Boolean = + df.logicalPlan.collectFirst { case s: logical.Sort => s }.nonEmpty + /** * Runs the plan and makes sure the answer matches the expected result. 
* If there was exception during the execution or the contents of the DataFrame does not @@ -1056,13 +1031,6 @@ object QueryTest extends Assertions { } } - def checkAnswer(df: DataFrame, expectedAnswer: java.util.List[Row]): Unit = { - getErrorMessageInCheckAnswer(df, expectedAnswer.asScala.toSeq) match { - case Some(errorMessage) => fail(errorMessage) - case None => - } - } - def withQueryExecutionsCaptured(spark: SparkSession)(thunk: => Unit): Seq[QueryExecution] = { var capturedQueryExecutions = Seq.empty[QueryExecution] diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTest.scala new file mode 100644 index 0000000000000..95c8f86be350b --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTest.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.spark.SparkFunSuite + +/** + * Provides classic/connect-agnostic test utils + * + * {{{ + * // in sql/core + * FooSuite extends SessionQueryTest { + * test("") { ... 
} + * } + * + * // in sql/connect + * FooConnectSuite extends connect.SessionQueryTest + * }}} + */ +trait SessionQueryTest + extends SparkFunSuite + with SessionQueryTestBase + with SparkSessionBinder { + override def isDfSorted(df: DataFrame): Boolean = true // TODO +} \ No newline at end of file diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala index b92d073d9a65e..da80a3b439054 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala @@ -29,6 +29,11 @@ import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} import org.apache.spark.sql.test.TestSparkSession +/** + * Provides a [[spark]] implementation by creating a [[classic.SparkSession]]. + * + * counterpart to [[SparkSessionProvider]], used in [[org.apache.spark.sql.test.SharedSparkSession]] + */ trait SparkSessionBinder extends SparkSessionBinderBase { self: SparkFunSuite => /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/classic/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/classic/QueryTest.scala index 20941dd0c549b..36a52e16314cb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/classic/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/classic/QueryTest.scala @@ -24,12 +24,7 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.FilterExec /** - * Extends [[org.apache.spark.sql.QueryTest sql.QueryTest]] to explicitly provide - * a [[SparkSession classic.SparkSession]] and corresponding helpers. - * - * Use this trait to indicate that a test is classic-only, - * i.e it is not intended to run this test with a - * [[org.apache.spark.sql.connect.QueryTest connect.QueryTest]] override. 
+ * Extends [[org.apache.spark.sql.QueryTest sql.QueryTest]] to provide classic-only helpers. */ trait QueryTest extends sql.QueryTest with SparkSessionProvider { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/classic/SessionQueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/classic/SessionQueryTest.scala new file mode 100644 index 0000000000000..d56146b05d23b --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/classic/SessionQueryTest.scala @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.classic + +import org.apache.spark.sql + +/** + * Override of [[sql.SessionQueryTest]] that provides [[SparkSession classic.SparkSession]]. + * + * Can be used to declare classic-specific tests: + * {{{ + * class FooSuite extends sql.SessionQueryTest { + * // shared classic/connect-agnostic testcases + * } + * + * // no need to extend FooSuite as sql.SessionQueryTest + * // already executes shared tests via classic internally. 
+ * class FooClassicSuite extends classic.SessionQueryTest { + * test("classic-only test") { + * // classic-only APIs are visible here + * spark.sessionState.conf + * } + * } + * }}} + */ +trait SessionQueryTest extends sql.SessionQueryTest with SparkSessionBinder + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala index 8d31d19c91807..73c69f8a9de41 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala @@ -21,7 +21,7 @@ import java.util import scala.reflect.ClassTag -import org.apache.spark.sql.{DataFrame, QueryTest, Row, SparkSession} +import org.apache.spark.sql.{SessionQueryTestBase, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.connector.catalog.{BufferedRows, CatalogV2Util, Identifier, InMemoryBaseTable, TableCatalog, TableWritePrivilege} @@ -37,7 +37,7 @@ import org.apache.spark.sql.connector.catalog.{BufferedRows, CatalogV2Util, Iden * [[DSv2TempViewWithStoredPlanTests]], [[DSv2RepeatedTableAccessTests]], * [[DSv2IncrementallyConstructedQueryTests]], or [[DSv2CacheTableReadTests]]. */ -trait DSv2ExternalMutationTestBase extends QueryTest { +trait DSv2ExternalMutationTestBase extends SessionQueryTestBase { /** Fully qualified table name under the non-caching test catalog. 
*/ protected val testTable: String = "testcat.ns1.ns2.tbl" diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2RepeatedTableAccessTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2RepeatedTableAccessTests.scala index fb22a8bb7ab79..9c4c3324002df 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2RepeatedTableAccessTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2RepeatedTableAccessTests.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.connector -import org.apache.spark.sql.Row +import org.apache.spark.sql.{Row, SessionQueryTest} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.connector.catalog.{CachingInMemoryTableCatalog, Column, InMemoryTableCatalog, TableChange, TableInfo} import org.apache.spark.sql.types.IntegerType diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala index b6c875dcf1253..82718583cc088 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala @@ -21,10 +21,10 @@ import scala.concurrent.duration._ import org.scalatest.Suite -import org.apache.spark.sql.{QueryTestBase, QueryTest, SparkSessionBinderBase} +import org.apache.spark.sql.{QueryTest, QueryTestBase, SparkSessionBinderBase} import org.apache.spark.sql.classic -@deprecated("Use SparkSessionBinder (or classic.SparkSessionBinder if required) instead", "4.2.0") +@deprecated("Use SessionQueryTest (or classic.SessionQueryTest if required) instead", "4.2.0") trait SharedSparkSession extends QueryTest with classic.SparkSessionBinder { // Runs func (which must trigger exactly one SQL execution) and returns the SQL metrics of that @@ -59,7 +59,7 @@ trait SharedSparkSession extends QueryTest with classic.SparkSessionBinder { /** * Helper trait for SQL test 
suites where all tests share a single [[TestSparkSession]]. */ -@deprecated("Use SparkSessionBinder (or classic.SparkSessionBinder if required) instead", "4.2.0") +@deprecated("Use SessionQueryTest (or classic.SessionQueryTest if required) instead", "4.2.0") trait SharedSparkSessionBase extends QueryTestBase with SparkSessionBinderBase { self: Suite => protected override def spark: classic.SparkSession = From 9b0b938972dcc2e194558d3f095cfbc02c23f79a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Tue, 9 Jun 2026 20:55:40 +0000 Subject: [PATCH 18/58] WIP: partially refactor DSv2IncrementallyConstructedQueryTests.scala --- ...v2IncrementallyConstructedQueryTests.scala | 205 ++++++++---------- 1 file changed, 90 insertions(+), 115 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2IncrementallyConstructedQueryTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2IncrementallyConstructedQueryTests.scala index 1dbaad18e3e71..4cf80a3ad4aee 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2IncrementallyConstructedQueryTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2IncrementallyConstructedQueryTests.scala @@ -32,7 +32,7 @@ import org.apache.spark.unsafe.types.UTF8String * mode, resolution is deferred until execution, so both sides of a join always see the * latest table state. * - * NOTE: All `session.sql(...)` calls append `.collect()` because Connect client DataFrames + * NOTE: All `spark.sql(...)` calls append `.collect()` because Connect client DataFrames * are lazy and require an action to trigger execution. In classic mode `.collect()` on * eager statements (DDL, INSERT) is a no-op, so this is harmless. 
*/ @@ -45,44 +45,40 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas test(s"${testPrefix}SPARK-54157: join refreshes both sides after external insert" + " (table with both table and column ID support)") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = session.table(testTable) + val df1 = spark.table(testTable) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - val df2 = session.table(testTable) + val df2 = spark.table(testTable) - checkRows( + checkAnswer( df1.join(df2, df1("id") === df2("id")), Seq(Row(1, 100, 1, 100), Row(2, 200, 2, 200))) } } - } test(s"${testPrefix}SPARK-54157: join refreshes both sides after same-session insert" + " (table with both table and column ID support)") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = session.table(testTable) + val df1 = spark.table(testTable) - session.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() + spark.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() - val df2 = session.table(testTable) + val df2 = spark.table(testTable) - checkRows( + checkAnswer( df1.join(df2, df1("id") 
=== df2("id")), Seq(Row(1, 100, 1, 100), Row(2, 200, 2, 200))) } } - } // --------------------------------------------------------------------------- // Scenario 2: join after ADD COLUMN. @@ -92,70 +88,66 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas test(s"${testPrefix}SPARK-54157: join after external ADD COLUMN" + " (table with both table and column ID support)") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = session.table(testTable) + val df1 = spark.table(testTable) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") catalog.alterTable( testIdent, TableChange.addColumn(Array("new_column"), IntegerType, true)) externalAppend( catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) - val df2 = session.table(testTable) + val df2 = spark.table(testTable) val selfJoin = df1.join(df2, df1("id") === df2("id")) if (isConnect) { // Connect re-resolves df1 with the new 3-column schema (id, salary, new_column). assert(selfJoin.columns.length == 6, s"Expected 6 columns (3 + 3) but got: ${selfJoin.columns.mkString(", ")}") - checkRows(selfJoin, + checkAnswer(selfJoin, Seq(Row(1, 100, null, 1, 100, null), Row(2, 200, -1, 2, 200, -1))) } else { // Classic: df1 keeps its original 2-column schema (id, salary). 
assert(selfJoin.columns.length == 5, s"Expected 5 columns (2 + 3) but got: ${selfJoin.columns.mkString(", ")}") - checkRows(selfJoin, + checkAnswer(selfJoin, Seq(Row(1, 100, 1, 100, null), Row(2, 200, 2, 200, -1))) } } } - } test(s"${testPrefix}SPARK-54157: join after same-session ADD COLUMN" + " (table with both table and column ID support)") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = session.table(testTable) + val df1 = spark.table(testTable) - session.sql(s"ALTER TABLE $testTable ADD COLUMN new_column INT").collect() - session.sql(s"INSERT INTO $testTable VALUES (2, 200, -1)").collect() + spark.sql(s"ALTER TABLE $testTable ADD COLUMN new_column INT").collect() + spark.sql(s"INSERT INTO $testTable VALUES (2, 200, -1)").collect() - val df2 = session.table(testTable) + val df2 = spark.table(testTable) val selfJoin = df1.join(df2, df1("id") === df2("id")) if (isConnect) { // Connect re-resolves df1 with the new 3-column schema (id, salary, new_column). assert(selfJoin.columns.length == 6, s"Expected 6 columns (3 + 3) but got: ${selfJoin.columns.mkString(", ")}") - checkRows(selfJoin, + checkAnswer(selfJoin, Seq(Row(1, 100, null, 1, 100, null), Row(2, 200, -1, 2, 200, -1))) } else { // Classic: df1 keeps its original 2-column schema (id, salary). assert(selfJoin.columns.length == 5, s"Expected 5 columns (2 + 3) but got: ${selfJoin.columns.mkString(", ")}") - checkRows(selfJoin, + checkAnswer(selfJoin, Seq(Row(1, 100, 1, 100, null), Row(2, 200, 2, 200, -1))) } } } - } // --------------------------------------------------------------------------- // Scenario 3: join after DROP COLUMN. 
@@ -165,23 +157,22 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas test(s"${testPrefix}SPARK-54157: join after external DROP COLUMN" + " (table with both table and column ID support)") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = session.table(testTable) + val df1 = spark.table(testTable) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") catalog.alterTable( testIdent, TableChange.deleteColumn(Array("salary"), false)) externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2)) - val df2 = session.table(testTable) + val df2 = spark.table(testTable) if (isConnect) { // Connect re-resolves df1 without the dropped column. 
- checkRows( + checkAnswer( df1.join(df2, df1("id") === df2("id")), Seq(Row(1, 1), Row(2, 2))) } else { @@ -196,25 +187,23 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas } } } - } test(s"${testPrefix}SPARK-54157: join after same-session DROP COLUMN" + " (table with both table and column ID support)") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = session.table(testTable) + val df1 = spark.table(testTable) - session.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() - session.sql(s"INSERT INTO $testTable VALUES (2)").collect() + spark.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() + spark.sql(s"INSERT INTO $testTable VALUES (2)").collect() - val df2 = session.table(testTable) + val df2 = spark.table(testTable) if (isConnect) { // Connect re-resolves df1 without the dropped column. - checkRows( + checkAnswer( df1.join(df2, df1("id") === df2("id")), Seq(Row(1, 1), Row(2, 2))) } else { @@ -229,7 +218,6 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas } } } - } // --------------------------------------------------------------------------- // Scenario 4: external drop and recreate table. 
@@ -240,13 +228,12 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas test(s"${testPrefix}SPARK-54157: join after external table drop and recreate" + " (table with both table and column ID support)") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = session.table(testTable) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val df1 = spark.table(testTable) + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") val originTableId = catalog.loadTable(testIdent).id catalog.dropTable(testIdent) @@ -259,13 +246,13 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas .build()) externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - val df2 = session.table(testTable) + val df2 = spark.table(testTable) val newTableId = catalog.loadTable(testIdent).id assert(originTableId != newTableId) if (isConnect) { // Connect re-resolves both sides to the recreated table. 
- checkRows( + checkAnswer( df1.join(df2, df1("id") === df2("id")), Seq(Row(2, 200, 2, 200))) } else { @@ -283,18 +270,16 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas } } } - } test(s"${testPrefix}SPARK-54157: join after external drop/recreate" + " (table without table ID support, but with column ID support)") { val nullIdT = "nullidcat.ns1.ns2.tbl" - withTestSession { session => - withTestTableAndViews(session, nullIdT) { - session.sql(s"CREATE TABLE $nullIdT (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $nullIdT VALUES (1, 100)").collect() + withTable(nullIdT) { + spark.sql(s"CREATE TABLE $nullIdT (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $nullIdT VALUES (1, 100)").collect() - val df1 = session.table(nullIdT) - val catalog = getTableCatalog[TableCatalog](session, "nullidcat") + val df1 = spark.table(nullIdT) + val catalog = getTableCatalog[TableCatalog](spark, "nullidcat") assert(catalog.loadTable(testIdent).id == null, "NullTableIdInMemoryTableCatalog should produce null table IDs") @@ -308,11 +293,11 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas .build()) externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - val df2 = session.table(nullIdT) + val df2 = spark.table(nullIdT) if (isConnect) { // Connect re-resolves both sides to the recreated table. 
- checkRows( + checkAnswer( df1.join(df2, df1("id") === df2("id")), Seq(Row(2, 200, 2, 200))) } else { @@ -327,19 +312,17 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas } } } - } test(s"${testPrefix}SPARK-54157: join does not detect external table drop and recreate" + " (table without table ID support and without column ID support)") { val nullBothT = "nullbothidscat.ns1.ns2.tbl" - withTestSession { session => - withTestTableAndViews(session, nullBothT) { - session.sql(s"CREATE TABLE $nullBothT (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $nullBothT VALUES (1, 100)").collect() + withTable(nullBothT) { + spark.sql(s"CREATE TABLE $nullBothT (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $nullBothT VALUES (1, 100)").collect() - val df1 = session.table(nullBothT) + val df1 = spark.table(nullBothT) val catalog = getTableCatalog[TableCatalog]( - session, "nullbothidscat") + spark, "nullbothidscat") assert(catalog.loadTable(testIdent).id == null, "NullTableIdAndNullColumnIdInMemoryTableCatalog should produce null table IDs") assert(catalog.loadTable(testIdent).columns().forall(_.id() == null), @@ -355,12 +338,12 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas .build()) externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - val df2 = session.table(nullBothT) + val df2 = spark.table(nullBothT) if (isConnect) { // Connect re-resolves both sides to the recreated table, so the join // sees the row appended after recreate. - checkRows( + checkAnswer( df1.join(df2, df1("id") === df2("id")), Seq(Row(2, 200, 2, 200))) } else { @@ -368,13 +351,12 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas // drop and recreate goes undetected. df1 keeps its pre-drop snapshot // (1, 100) while df2 reads the recreated table (2, 200), so the join finds // no matching ids and returns no rows. 
- checkRows( + checkAnswer( df1.join(df2, df1("id") === df2("id")), Seq.empty) } } } - } // --------------------------------------------------------------------------- // Scenario 5: external drop+re-add column. @@ -385,24 +367,23 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas test(s"${testPrefix}SPARK-54157: join after external drop+re-add column" + " (table without table ID support, but with column ID support)") { val nullIdT = "nullidcat.ns1.ns2.tbl" - withTestSession { session => - withTestTableAndViews(session, nullIdT) { - session.sql(s"CREATE TABLE $nullIdT (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $nullIdT VALUES (1, 100)").collect() + withTable(nullIdT) { + spark.sql(s"CREATE TABLE $nullIdT (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $nullIdT VALUES (1, 100)").collect() - val df1 = session.table(nullIdT) + val df1 = spark.table(nullIdT) - val catalog = getTableCatalog[TableCatalog](session, "nullidcat") + val catalog = getTableCatalog[TableCatalog](spark, "nullidcat") catalog.alterTable( testIdent, TableChange.deleteColumn(Array("salary"), false)) catalog.alterTable( testIdent, TableChange.addColumn(Array("salary"), IntegerType, true)) - val df2 = session.table(nullIdT) + val df2 = spark.table(nullIdT) if (isConnect) { // Connect re-resolves both sides with the new column ID. 
- checkRows( + checkAnswer( df1.join(df2, df1("id") === df2("id")), Seq(Row(1, null, 1, null))) } else { @@ -417,35 +398,31 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas } } } - } test(s"${testPrefix}SPARK-54157: join does not detect external drop+re-add column" + " (table without table ID support and without column ID support)") { val nullBothT = "nullbothidscat.ns1.ns2.tbl" - withTestSession { session => - withTestTableAndViews(session, nullBothT) { - session.sql(s"CREATE TABLE $nullBothT (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $nullBothT VALUES (1, 100)").collect() + withTable(nullBothT) { + spark.sql(s"CREATE TABLE $nullBothT (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $nullBothT VALUES (1, 100)").collect() - val df1 = session.table(nullBothT) + val df1 = spark.table(nullBothT) - val catalog = getTableCatalog[TableCatalog]( - session, "nullbothidscat") + val catalog = getTableCatalog[TableCatalog](spark, "nullbothidscat") catalog.alterTable( testIdent, TableChange.deleteColumn(Array("salary"), false)) catalog.alterTable( testIdent, TableChange.addColumn(Array("salary"), IntegerType, true)) - val df2 = session.table(nullBothT) + val df2 = spark.table(nullBothT) // Neither TABLE_ID_MISMATCH nor COLUMN_ID_MISMATCH fires. // The change goes undetected and the join succeeds. - checkRows( + checkAnswer( df1.join(df2, df1("id") === df2("id")), Seq(Row(1, null, 1, null))) } } - } // --------------------------------------------------------------------------- // Scenario 6: external type change (drop INT column, add STRING column). 
@@ -457,14 +434,13 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas test(s"${testPrefix}SPARK-54157: join after external drop+re-add different-type column" + " (table with both table and column ID support)") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = session.table(testTable) + val df1 = spark.table(testTable) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") catalog.alterTable( testIdent, TableChange.deleteColumn(Array("salary"), false)) catalog.alterTable( @@ -472,11 +448,11 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, UTF8String.fromString("high"))) - val df2 = session.table(testTable) + val df2 = spark.table(testTable) if (isConnect) { // Connect re-resolves both sides with the new column type. 
- checkRows( + checkAnswer( df1.join(df2, df1("id") === df2("id")), Seq(Row(1, null, 1, null), Row(2, "high", 2, "high"))) } else { @@ -491,5 +467,4 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas } } } - } } From 82185e284234d2f659395f4b1318e9f0b2c9b6f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Wed, 10 Jun 2026 08:22:17 +0000 Subject: [PATCH 19/58] Remove extra checkAnswer helpers to have one thing to override --- .../org/apache/spark/sql/CheckAnswerHelper.scala | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/sql/api/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala b/sql/api/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala index cc9710e291227..542ef2cbcfc49 100644 --- a/sql/api/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala +++ b/sql/api/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala @@ -42,22 +42,6 @@ trait CheckAnswerHelper extends Assertions { } } - protected def checkAnswer(df: => DataFrame, expectedAnswer: Row): Unit = { - checkAnswer(df, Seq(expectedAnswer)) - } - - protected def checkAnswer(df: => DataFrame, expectedAnswer: DataFrame): Unit = { - checkAnswer(df, expectedAnswer.collect().toImmutableArraySeq) - } - - protected def checkAnswer(df: => DataFrame, expectedAnswer: Array[Row]): Unit = { - checkAnswer(df, expectedAnswer.toImmutableArraySeq) - } - - protected def checkAnswer(df: DataFrame, expectedAnswer: java.util.List[Row]): Unit = { - checkAnswer(df, expectedAnswer.asScala.toSeq) - } - protected def isDfSorted(df: DataFrame): Boolean /** From 473f4ff89e2a7dbc38b006f13f14555b4fac2a27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Wed, 10 Jun 2026 08:23:09 +0000 Subject: [PATCH 20/58] Add SessionQueryTest::sessionType --- .../spark/sql/SessionQueryTestBase.scala | 19 ++++++++++++++++++- .../spark/sql/connect/SessionQueryTest.scala | 2 ++ .../apache/spark/sql/SessionQueryTest.scala | 2 ++ 3 files
changed, 22 insertions(+), 1 deletion(-) diff --git a/sql/api/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala b/sql/api/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala index 6a0e4f199665b..df317dd8da028 100644 --- a/sql/api/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala +++ b/sql/api/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala @@ -25,4 +25,21 @@ trait SessionQueryTestBase extends AnyFunSuite with SparkSessionProvider with CheckAnswerHelper - with QueryCleanupHelper + with QueryCleanupHelper { + + /** + * Documents the session type in use so that tests can handle session-specific behaviour + * + * {{{ + * test(...) { + * val df = // query with connect-specific behaviour + * if (sessionType == "connect") { + * checkError(...) + * } else { + * checkAnswer(df, ...) + * } + * } + * }}} + */ + def sessionType: String +} diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala index c0fb38fd78804..2ea8fd77d7f36 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala @@ -23,4 +23,6 @@ import org.apache.spark.sql */ trait SessionQueryTest extends sql.SessionQueryTest with SparkSessionBinder { override def isDfSorted(df: sql.DataFrame): Boolean = false // TODO + + override def sessionType: String = "connect" } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTest.scala index 95c8f86be350b..19b5d3ed23409 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTest.scala @@ -37,4 +37,6 @@ trait SessionQueryTest with SessionQueryTestBase with SparkSessionBinder { override def
isDfSorted(df: DataFrame): Boolean = true // TODO + + override def sessionType: String = "classic" } \ No newline at end of file From 33c369a3b1ff636775c9f7e26850dff6c2149b34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Wed, 10 Jun 2026 08:24:01 +0000 Subject: [PATCH 21/58] Fix DSv2ExternalMutationTestBase by replacing 'isConnect' with sessionType and by implementing QueryTest --- .../DSv2ExternalMutationTestBase.scala | 4 ++-- ...Sv2IncrementallyConstructedQueryTests.scala | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala index 73c69f8a9de41..7ffdf225bfa63 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala @@ -21,7 +21,7 @@ import java.util import scala.reflect.ClassTag -import org.apache.spark.sql.{SessionQueryTestBase, SparkSession} +import org.apache.spark.sql.{QueryTest, SessionQueryTestBase, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.connector.catalog.{BufferedRows, CatalogV2Util, Identifier, InMemoryBaseTable, TableCatalog, TableWritePrivilege} @@ -37,7 +37,7 @@ import org.apache.spark.sql.connector.catalog.{BufferedRows, CatalogV2Util, Iden * [[DSv2TempViewWithStoredPlanTests]], [[DSv2RepeatedTableAccessTests]], * [[DSv2IncrementallyConstructedQueryTests]], or [[DSv2CacheTableReadTests]]. */ -trait DSv2ExternalMutationTestBase extends SessionQueryTestBase { +trait DSv2ExternalMutationTestBase extends SessionQueryTestBase with QueryTest { /** Fully qualified table name under the non-caching test catalog. 
*/ protected val testTable: String = "testcat.ns1.ns2.tbl" diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2IncrementallyConstructedQueryTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2IncrementallyConstructedQueryTests.scala index 4cf80a3ad4aee..a6de3a0139452 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2IncrementallyConstructedQueryTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2IncrementallyConstructedQueryTests.scala @@ -103,7 +103,7 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas val df2 = spark.table(testTable) val selfJoin = df1.join(df2, df1("id") === df2("id")) - if (isConnect) { + if (sessionType == "connect") { // Connect re-resolves df1 with the new 3-column schema (id, salary, new_column). assert(selfJoin.columns.length == 6, s"Expected 6 columns (3 + 3) but got: ${selfJoin.columns.mkString(", ")}") @@ -133,7 +133,7 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas val df2 = spark.table(testTable) val selfJoin = df1.join(df2, df1("id") === df2("id")) - if (isConnect) { + if (sessionType == "connect") { // Connect re-resolves df1 with the new 3-column schema (id, salary, new_column). assert(selfJoin.columns.length == 6, s"Expected 6 columns (3 + 3) but got: ${selfJoin.columns.mkString(", ")}") @@ -170,7 +170,7 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas val df2 = spark.table(testTable) - if (isConnect) { + if (sessionType == "connect") { // Connect re-resolves df1 without the dropped column. checkAnswer( df1.join(df2, df1("id") === df2("id")), @@ -201,7 +201,7 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas val df2 = spark.table(testTable) - if (isConnect) { + if (sessionType == "connect") { // Connect re-resolves df1 without the dropped column. 
checkAnswer( df1.join(df2, df1("id") === df2("id")), @@ -250,7 +250,7 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas val newTableId = catalog.loadTable(testIdent).id assert(originTableId != newTableId) - if (isConnect) { + if (sessionType == "connect") { // Connect re-resolves both sides to the recreated table. checkAnswer( df1.join(df2, df1("id") === df2("id")), @@ -295,7 +295,7 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas val df2 = spark.table(nullIdT) - if (isConnect) { + if (sessionType == "connect") { // Connect re-resolves both sides to the recreated table. checkAnswer( df1.join(df2, df1("id") === df2("id")), @@ -340,7 +340,7 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas val df2 = spark.table(nullBothT) - if (isConnect) { + if (sessionType == "connect") { // Connect re-resolves both sides to the recreated table, so the join // sees the row appended after recreate. checkAnswer( @@ -381,7 +381,7 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas val df2 = spark.table(nullIdT) - if (isConnect) { + if (sessionType == "connect") { // Connect re-resolves both sides with the new column ID. checkAnswer( df1.join(df2, df1("id") === df2("id")), @@ -450,7 +450,7 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas val df2 = spark.table(testTable) - if (isConnect) { + if (sessionType == "connect") { // Connect re-resolves both sides with the new column type. 
checkAnswer( df1.join(df2, df1("id") === df2("id")), From c84739274485ba1c6a6ea200890d20b8431d63a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Wed, 10 Jun 2026 08:30:28 +0000 Subject: [PATCH 22/58] Remove unused imports --- .../test/scala/org/apache/spark/sql/CheckAnswerHelper.scala | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sql/api/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala b/sql/api/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala index 542ef2cbcfc49..3ef16279989bc 100644 --- a/sql/api/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala +++ b/sql/api/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala @@ -19,13 +19,9 @@ package org.apache.spark.sql import java.util.TimeZone -import scala.jdk.CollectionConverters._ -import scala.language.implicitConversions - import org.scalatest.Assertions import org.apache.spark.util.{SparkErrorUtils, SparkStringUtils} -import org.apache.spark.util.ArrayImplicits._ trait CheckAnswerHelper extends Assertions { From a9aede76be573d876a42efa318aad0b9399625bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Wed, 10 Jun 2026 17:10:04 +0000 Subject: [PATCH 23/58] WIP --- .../apache/spark/sql/CheckAnswerHelper.scala | 176 --------------- .../DataSourceV2DataFrameConnectSuite.scala | 2 +- .../spark/sql/connect/SessionQueryTest.scala | 14 +- .../sql/connect/SparkSessionBinder.scala | 2 +- .../apache/spark/sql/CheckAnswerHelper.scala | 201 ++++++++++++++---- .../apache/spark/sql/QueryCleanupHelper.scala | 0 .../org/apache/spark/sql/QueryTest.scala | 25 ++- .../apache/spark/sql/SessionQueryTest.scala | 13 +- .../spark/sql/SessionQueryTestBase.scala | 6 + .../apache/spark/sql/SparkSessionBinder.scala | 4 +- .../spark/sql/SparkSessionProvider.scala | 0 .../DSv2ExternalMutationTestBase.scala | 2 +- .../DSv2RepeatedTableAccessTests.scala | 2 +- .../DataSourceV2DataFrameSuite.scala | 4 +- .../spark/sql/hive/SessionQueryTest.scala | 24 +++ 
.../sql/hive/test/TestHiveSingleton.scala | 3 +- 16 files changed, 238 insertions(+), 240 deletions(-) delete mode 100644 sql/api/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala rename sql/{api => core}/src/test/scala/org/apache/spark/sql/QueryCleanupHelper.scala (100%) rename sql/{api => core}/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala (89%) rename sql/{api => core}/src/test/scala/org/apache/spark/sql/SparkSessionProvider.scala (100%) create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/SessionQueryTest.scala diff --git a/sql/api/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala b/sql/api/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala deleted file mode 100644 index 3ef16279989bc..0000000000000 --- a/sql/api/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql - -import java.util.TimeZone - -import org.scalatest.Assertions - -import org.apache.spark.util.{SparkErrorUtils, SparkStringUtils} - -trait CheckAnswerHelper extends Assertions { - - /** - * Runs the plan and makes sure the answer matches the expected result. 
- * - * @param df the DataFrame to be executed - * @param expectedAnswer the expected result in a Seq of Rows. - */ - protected def checkAnswer(df: => DataFrame, expectedAnswer: Seq[Row]): Unit = { - getErrorMessageInCheckAnswer(df, expectedAnswer) match { - case Some(errorMessage) => fail(errorMessage) - case None => - } - } - - protected def isDfSorted(df: DataFrame): Boolean - - /** - * Runs the plan and makes sure the answer matches the expected result. - * If there was exception during the execution or the contents of the DataFrame does not - * match the expected result, an error message will be returned. Otherwise, a None will - * be returned. - * - * @param df the DataFrame to be executed - * @param expectedAnswer the expected result in a Seq of Rows. - */ - private def getErrorMessageInCheckAnswer( - df: DataFrame, - expectedAnswer: Seq[Row]): Option[String] = { - val sparkAnswer = try df.collect().toSeq catch { - case e: Exception => - val errorMessage = - s""" - |Exception thrown while executing query: - |${df.queryExecution} - |== Exception == - |$e - |${SparkErrorUtils.stackTraceToString(e)} - """.stripMargin - return Some(errorMessage) - } - - sameRows(expectedAnswer, sparkAnswer, isDfSorted(df)).map { results => - s""" - |Results do not match for query: - |Timezone: ${TimeZone.getDefault} - |Timezone Env: ${sys.env.getOrElse("TZ", "")} - | - |${df.queryExecution} - |== Results == - |$results - """.stripMargin - } - } - - private def prepareAnswer(answer: Seq[Row], isSorted: Boolean): Seq[Row] = { - // Converts data to types that we can do equality comparison using Scala collections. - // For BigDecimal type, the Scala type has a better definition of equality test (similar to - // Java's java.math.BigDecimal.compareTo). - // For binary arrays, we convert it to Seq to avoid of calling java.util.Arrays.equals for - // equality test. 
- val converted: Seq[Row] = answer.map(prepareRow) - if (!isSorted) converted.sortBy(_.toString()) else converted - } - - // We need to call prepareRow recursively to handle schemas with struct types. - private def prepareRow(row: Row): Row = { - Row.fromSeq(row.toSeq.map { - case null => null - case bd: java.math.BigDecimal => BigDecimal(bd) - // Equality of WrappedArray differs for AnyVal and AnyRef in Scala 2.12.2+ - case seq: Seq[_] => seq.map { - case b: java.lang.Byte => b.byteValue - case s: java.lang.Short => s.shortValue - case i: java.lang.Integer => i.intValue - case l: java.lang.Long => l.longValue - case f: java.lang.Float => f.floatValue - case d: java.lang.Double => d.doubleValue - case x => x - } - // Convert array to Seq for easy equality check. - case b: Array[_] => b.toSeq - case r: Row => prepareRow(r) - // SPARK-51349: "null" and null had the same precedence in sorting - case "null" => "__null_string__" - case o => o - }) - } - - private def genError( - expectedAnswer: Seq[Row], - sparkAnswer: Seq[Row], - isSorted: Boolean = false): String = { - val getRowType: Option[Row] => String = row => - row.map(row => - if (row.schema == null) { - "struct<>" - } else { - s"${row.schema.catalogString}" - }).getOrElse("struct<>") - - s""" - |== Results == - |${ - SparkStringUtils.sideBySide( - s"== Correct Answer - ${expectedAnswer.size} ==" +: - getRowType(expectedAnswer.headOption) +: - prepareAnswer(expectedAnswer, isSorted).map(_.toString()), - s"== Spark Answer - ${sparkAnswer.size} ==" +: - getRowType(sparkAnswer.headOption) +: - prepareAnswer(sparkAnswer, isSorted).map(_.toString())).mkString("\n") - } - """.stripMargin - } - - private def compare(obj1: Any, obj2: Any): Boolean = (obj1, obj2) match { - case (null, null) => true - case (null, _) => false - case (_, null) => false - case (a: Array[_], b: Array[_]) => - a.length == b.length && a.zip(b).forall { case (l, r) => compare(l, r)} - case (a: Map[_, _], b: Map[_, _]) => - a.size == b.size && 
a.keys.forall { aKey => - b.keys.find(bKey => compare(aKey, bKey)).exists(bKey => compare(a(aKey), b(bKey))) - } - case (a: Iterable[_], b: Iterable[_]) => - a.size == b.size && a.zip(b).forall { case (l, r) => compare(l, r)} - case (a: Product, b: Product) => - compare(a.productIterator.toSeq, b.productIterator.toSeq) - case (a: Row, b: Row) => - compare(a.toSeq, b.toSeq) - // 0.0 == -0.0, turn float/double to bits before comparison, to distinguish 0.0 and -0.0. - // in some hardware NaN can be represented with different bits, so first check for it - case (a: Double, b: Double) => - a.isNaN && b.isNaN || - java.lang.Double.doubleToRawLongBits(a) == java.lang.Double.doubleToRawLongBits(b) - case (a: Float, b: Float) => - a.isNaN && b.isNaN || - java.lang.Float.floatToRawIntBits(a) == java.lang.Float.floatToRawIntBits(b) - case (a, b) => a == b - } - - private def sameRows( expectedAnswer: Seq[Row], - sparkAnswer: Seq[Row], - isSorted: Boolean = false): Option[String] = { - if (!compare(prepareAnswer(expectedAnswer, isSorted), prepareAnswer(sparkAnswer, isSorted))) { - return Some(genError(expectedAnswer, sparkAnswer, isSorted)) - } - None - } -} diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala index edd5236cc9c56..dbfdc968d0c9c 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.connect import scala.reflect.ClassTag import org.apache.spark.SparkConf -import org.apache.spark.sql.{DataFrame, QueryTest, Row, SparkSession} +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.connector.{DSv2CacheTableReadTests, DSv2IncrementallyConstructedQueryTests, 
DSv2RepeatedTableAccessTests, DSv2TempViewWithStoredPlanTests} import org.apache.spark.sql.connector.catalog.{CachingInMemoryTableCatalog, InMemoryTableCatalog, NullTableIdAndNullColumnIdInMemoryTableCatalog, NullTableIdInMemoryTableCatalog, TableCatalog} diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala index 2ea8fd77d7f36..1b24f87aad6da 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala @@ -19,10 +19,20 @@ package org.apache.spark.sql.connect import org.apache.spark.sql /** - * TODO write docstring + * Overrides test utils to implement 'connect variants' of suites declared in sql/core: + * {{{ + * // in sql/core + * FooSuite extends SessionQueryTest { test("") { ... } } + * + * // in sql/connect + * FooConnectSuite extends connect.SessionQueryTest + * }}} + * + * This trait overrides [[spark]] to use a [[SparkSession connect.SparkSession]], which executes + * via the gRPC API using an in-process connect server. 
*/ trait SessionQueryTest extends sql.SessionQueryTest with SparkSessionBinder { - override def isDfSorted(df: sql.DataFrame): Boolean = false // TODO + final override def isDfSorted(df: sql.DataFrame): Boolean = false // TODO override def sessionType: String = "connect" } diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala index ee5ec18d74d70..9f7134746e9db 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala @@ -41,7 +41,7 @@ trait SparkSessionBinder extends sql.SparkSessionBinder { self: SparkFunSuite => /** The underlying classic session used by the in-process server. */ protected def classicSpark: classic.SparkSession = super.spark.asInstanceOf[classic.SparkSession] - override def beforeAll(): Unit = { + override protected def beforeAll(): Unit = { super.beforeAll() val prevPort = SparkEnv.get.conf.get(Connect.CONNECT_GRPC_BINDING_PORT) try { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala b/sql/core/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala index dfa23dd5a6b0d..d5114f2c7e1e5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala @@ -17,40 +17,167 @@ package org.apache.spark.sql -import java.io.File -import java.net.URI -import java.nio.file.Files -import java.util.{Locale, TimeZone, UUID} -import java.util.regex.Pattern - -import scala.concurrent.duration._ -import scala.jdk.CollectionConverters._ -import scala.language.implicitConversions -import scala.util.control.NonFatal - -import org.apache.hadoop.fs.Path -import org.scalactic.source.Position -import org.scalatest.{Assertions, BeforeAndAfterAll, Suite, Tag} -import 
org.scalatest.concurrent.Eventually - -import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.catalyst.ExtendedAnalysisException -import org.apache.spark.sql.catalyst.FunctionIdentifier -import org.apache.spark.sql.catalyst.analysis.{NoSuchTableException, UnresolvedAttribute} -import org.apache.spark.sql.catalyst.catalog.SessionCatalog.DEFAULT_DATABASE -import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} -import org.apache.spark.sql.catalyst.util._ -import org.apache.spark.sql.execution.{FilterExec, QueryExecution, SparkPlan, SQLExecution} -import org.apache.spark.sql.execution.adaptive.DisableAdaptiveExecution -import org.apache.spark.sql.execution.columnar.InMemoryRelation -import org.apache.spark.sql.execution.datasources.DataSourceUtils -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.test.SQLTestData -import org.apache.spark.sql.util.QueryExecutionListener -import org.apache.spark.storage.StorageLevel -import org.apache.spark.util.ArrayImplicits._ -import org.apache.spark.util.UninterruptibleThread -import org.apache.spark.util.Utils - -// TODO docstring +import java.util.TimeZone + +import org.scalatest.Assertions + +import org.apache.spark.sql.catalyst.plans.logical +import org.apache.spark.util.{SparkErrorUtils, SparkStringUtils} + +trait CheckAnswerHelper extends Assertions { + + /** + * Runs the plan and makes sure the answer matches the expected result. + * + * @param df the DataFrame to be executed + * @param expectedAnswer the expected result in a Seq of Rows. 
+ */ + protected def checkAnswer(df: => DataFrame, expectedAnswer: Seq[Row]): Unit = { + getErrorMessageInCheckAnswer(df, expectedAnswer) match { + case Some(errorMessage) => fail(errorMessage) + case None => + } + } + + protected def isDfSorted(df: DataFrame): Boolean = { + df match { + case df: classic.DataFrame => + df.logicalPlan.collectFirst { case s: logical.Sort => s }.nonEmpty + case _ => throw new RuntimeException(s"Cannot determine whether df is sorted: $df") + } + } + + /** + * Runs the plan and makes sure the answer matches the expected result. + * If there was exception during the execution or the contents of the DataFrame does not + * match the expected result, an error message will be returned. Otherwise, a None will + * be returned. + * + * @param df the DataFrame to be executed + * @param expectedAnswer the expected result in a Seq of Rows. + */ + private def getErrorMessageInCheckAnswer( + df: DataFrame, + expectedAnswer: Seq[Row]): Option[String] = { + val sparkAnswer = try df.collect().toSeq catch { + case e: Exception => + val errorMessage = + s""" + |Exception thrown while executing query: + |${df.queryExecution} + |== Exception == + |$e + |${SparkErrorUtils.stackTraceToString(e)} + """.stripMargin + return Some(errorMessage) + } + + sameRows(expectedAnswer, sparkAnswer, isDfSorted(df)).map { results => + s""" + |Results do not match for query: + |Timezone: ${TimeZone.getDefault} + |Timezone Env: ${sys.env.getOrElse("TZ", "")} + | + |${df.queryExecution} + |== Results == + |$results + """.stripMargin + } + } + + private def prepareAnswer(answer: Seq[Row], isSorted: Boolean): Seq[Row] = { + // Converts data to types that we can do equality comparison using Scala collections. + // For BigDecimal type, the Scala type has a better definition of equality test (similar to + // Java's java.math.BigDecimal.compareTo). + // For binary arrays, we convert it to Seq to avoid of calling java.util.Arrays.equals for + // equality test. 
+ val converted: Seq[Row] = answer.map(prepareRow) + if (!isSorted) converted.sortBy(_.toString()) else converted + } + + // We need to call prepareRow recursively to handle schemas with struct types. + private def prepareRow(row: Row): Row = { + Row.fromSeq(row.toSeq.map { + case null => null + case bd: java.math.BigDecimal => BigDecimal(bd) + // Equality of WrappedArray differs for AnyVal and AnyRef in Scala 2.12.2+ + case seq: Seq[_] => seq.map { + case b: java.lang.Byte => b.byteValue + case s: java.lang.Short => s.shortValue + case i: java.lang.Integer => i.intValue + case l: java.lang.Long => l.longValue + case f: java.lang.Float => f.floatValue + case d: java.lang.Double => d.doubleValue + case x => x + } + // Convert array to Seq for easy equality check. + case b: Array[_] => b.toSeq + case r: Row => prepareRow(r) + // SPARK-51349: "null" and null had the same precedence in sorting + case "null" => "__null_string__" + case o => o + }) + } + + private def genError( + expectedAnswer: Seq[Row], + sparkAnswer: Seq[Row], + isSorted: Boolean = false): String = { + val getRowType: Option[Row] => String = row => + row.map(row => + if (row.schema == null) { + "struct<>" + } else { + s"${row.schema.catalogString}" + }).getOrElse("struct<>") + + s""" + |== Results == + |${ + SparkStringUtils.sideBySide( + s"== Correct Answer - ${expectedAnswer.size} ==" +: + getRowType(expectedAnswer.headOption) +: + prepareAnswer(expectedAnswer, isSorted).map(_.toString()), + s"== Spark Answer - ${sparkAnswer.size} ==" +: + getRowType(sparkAnswer.headOption) +: + prepareAnswer(sparkAnswer, isSorted).map(_.toString())).mkString("\n") + } + """.stripMargin + } + + private def compare(obj1: Any, obj2: Any): Boolean = (obj1, obj2) match { + case (null, null) => true + case (null, _) => false + case (_, null) => false + case (a: Array[_], b: Array[_]) => + a.length == b.length && a.zip(b).forall { case (l, r) => compare(l, r)} + case (a: Map[_, _], b: Map[_, _]) => + a.size == b.size && 
a.keys.forall { aKey => + b.keys.find(bKey => compare(aKey, bKey)).exists(bKey => compare(a(aKey), b(bKey))) + } + case (a: Iterable[_], b: Iterable[_]) => + a.size == b.size && a.zip(b).forall { case (l, r) => compare(l, r)} + case (a: Product, b: Product) => + compare(a.productIterator.toSeq, b.productIterator.toSeq) + case (a: Row, b: Row) => + compare(a.toSeq, b.toSeq) + // 0.0 == -0.0, turn float/double to bits before comparison, to distinguish 0.0 and -0.0. + // in some hardware NaN can be represented with different bits, so first check for it + case (a: Double, b: Double) => + a.isNaN && b.isNaN || + java.lang.Double.doubleToRawLongBits(a) == java.lang.Double.doubleToRawLongBits(b) + case (a: Float, b: Float) => + a.isNaN && b.isNaN || + java.lang.Float.floatToRawIntBits(a) == java.lang.Float.floatToRawIntBits(b) + case (a, b) => a == b + } + + private def sameRows( expectedAnswer: Seq[Row], + sparkAnswer: Seq[Row], + isSorted: Boolean = false): Option[String] = { + if (!compare(prepareAnswer(expectedAnswer, isSorted), prepareAnswer(sparkAnswer, isSorted))) { + return Some(genError(expectedAnswer, sparkAnswer, isSorted)) + } + None + } +} diff --git a/sql/api/src/test/scala/org/apache/spark/sql/QueryCleanupHelper.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryCleanupHelper.scala similarity index 100% rename from sql/api/src/test/scala/org/apache/spark/sql/QueryCleanupHelper.scala rename to sql/core/src/test/scala/org/apache/spark/sql/QueryCleanupHelper.scala diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index e6b3065b31fa2..b12b315aaa3e3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -30,12 +30,11 @@ import scala.util.control.NonFatal import org.apache.hadoop.fs.Path import org.scalactic.source.Position -import org.scalatest.{Assertions, BeforeAndAfterAll, Suite, 
Tag} +import org.scalatest.{BeforeAndAfterAll, Suite, Tag} import org.scalatest.concurrent.Eventually import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.ExtendedAnalysisException -import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.analysis.{NoSuchTableException, UnresolvedAttribute} import org.apache.spark.sql.catalyst.catalog.SessionCatalog.DEFAULT_DATABASE import org.apache.spark.sql.catalyst.plans._ @@ -63,9 +62,6 @@ trait QueryTestBase with QueryCleanupHelper with PlanTestBase { self: Suite => - override protected def isDfSorted(df: DataFrame): Boolean = - df.logicalPlan.collectFirst { case s: logical.Sort => s }.nonEmpty - /** * Runs the plan and makes sure the answer contains all of the keywords. */ @@ -182,11 +178,11 @@ trait QueryTestBase QueryTest.checkAnswer(analyzedDF, expectedAnswer) } - override protected def checkAnswer(df: => DataFrame, expectedAnswer: Row): Unit = { + protected def checkAnswer(df: => DataFrame, expectedAnswer: Row): Unit = { checkAnswer(df, Seq(expectedAnswer)) } - override protected def checkAnswer(df: => DataFrame, expectedAnswer: DataFrame): Unit = { + protected def checkAnswer(df: => DataFrame, expectedAnswer: DataFrame): Unit = { checkAnswer(df, expectedAnswer.collect().toImmutableArraySeq) } @@ -196,7 +192,7 @@ trait QueryTestBase * @param df the [[DataFrame]] to be executed * @param expectedAnswer the expected result in a [[Array]] of [[Row]]s. */ - override protected def checkAnswer(df: => DataFrame, expectedAnswer: Array[Row]): Unit = { + protected def checkAnswer(df: => DataFrame, expectedAnswer: Array[Row]): Unit = { checkAnswer(df, expectedAnswer.toImmutableArraySeq) } @@ -802,7 +798,11 @@ object QueryTest extends CheckAnswerHelper { * @param expectedAnswer the expected result in a Seq of Rows. * @param checkToRDD whether to verify deserialization to an RDD. This runs the query twice. 
 */ - def checkAnswer(df: DataFrame, expectedAnswer: Seq[Row], checkToRDD: Boolean = true): Unit = { + def checkAnswer(df: DataFrame, expectedAnswer: Seq[Row]): Unit = { + checkAnswer(df, expectedAnswer, checkToRDD = true) + } + + def checkAnswer(df: DataFrame, expectedAnswer: Seq[Row], checkToRDD: Boolean): Unit = { if (checkToRDD) { SQLExecution.withSQLConfPropagated(df.sparkSession) { df.materializedRdd.count() // Also attempt to deserialize as an RDD [SPARK-15791] @@ -812,6 +812,13 @@ object QueryTest extends CheckAnswerHelper { super.checkAnswer(df, expectedAnswer) } + def checkAnswer(df: DataFrame, expectedAnswer: java.util.List[Row]): Unit = { + getErrorMessageInCheckAnswer(df, expectedAnswer.asScala.toSeq) match { + case Some(errorMessage) => fail(errorMessage) + case None => + } + } + override protected def isDfSorted(df: DataFrame): Boolean = df.logicalPlan.collectFirst { case s: logical.Sort => s }.nonEmpty diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTest.scala index 19b5d3ed23409..d449ee3aa9a04 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTest.scala @@ -20,23 +20,24 @@ package org.apache.spark.sql import org.apache.spark.SparkFunSuite /** - * Provides classic/connect-agnostic test utils - * + * Provides connect-compatible test utils to write suites that have 'connect variants': * {{{ * // in sql/core - * FooSuite extends SessionQueryTest { - * test("") { ... } - * } + * FooSuite extends SessionQueryTest { test("") { ... } } * * // in sql/connect * FooConnectSuite extends connect.SessionQueryTest * }}} + * + * While this trait internally uses a [[classic.SparkSession]] when executing tests, + * it is exposed as a [[SparkSession sql.SparkSession]] to allow for overriding on the connect side. + * + * For classic-specific tests, use [[classic.SessionQueryTest]]. 
*/ trait SessionQueryTest extends SparkFunSuite with SessionQueryTestBase with SparkSessionBinder { - override def isDfSorted(df: DataFrame): Boolean = true // TODO override def sessionType: String = "classic" } \ No newline at end of file diff --git a/sql/api/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala similarity index 89% rename from sql/api/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala rename to sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala index df317dd8da028..f95bc4a4cf707 100644 --- a/sql/api/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala @@ -21,6 +21,12 @@ package org.apache.spark.sql import org.scalatest.funsuite.AnyFunSuite // scalastyle:on +/** + * TODO should be moved to sql/api + * + * base for fully sql/core independent tests, i.e. this trait could be moved to sql/api and then + * used in sql/connect/client. + */ trait SessionQueryTestBase extends AnyFunSuite with SparkSessionProvider diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala index da80a3b439054..ae2ba743420ad 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala @@ -66,7 +66,7 @@ trait SparkSessionBinderBase with BeforeAndAfterAll with Eventually { self: Suite => - protected def sparkConf = { + protected def sparkConf: SparkConf = { val conf = new SparkConf() .set("spark.hadoop.fs.file.impl", classOf[DebugFilesystem].getName) .set(UNSAFE_EXCEPTION_ON_MEMORY_LEAK, true) @@ -100,7 +100,7 @@ trait SparkSessionBinderBase protected override def spark: SparkSession = _spark /** - * The [[TestSQLContext]] to use for all tests in this suite. 
+ * The [[test.TestSQLContext]] to use for all tests in this suite. */ protected implicit def sqlContext: SQLContext = _spark.sqlContext diff --git a/sql/api/src/test/scala/org/apache/spark/sql/SparkSessionProvider.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionProvider.scala similarity index 100% rename from sql/api/src/test/scala/org/apache/spark/sql/SparkSessionProvider.scala rename to sql/core/src/test/scala/org/apache/spark/sql/SparkSessionProvider.scala diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala index 7ffdf225bfa63..199c0d1f41afe 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala @@ -37,7 +37,7 @@ import org.apache.spark.sql.connector.catalog.{BufferedRows, CatalogV2Util, Iden * [[DSv2TempViewWithStoredPlanTests]], [[DSv2RepeatedTableAccessTests]], * [[DSv2IncrementallyConstructedQueryTests]], or [[DSv2CacheTableReadTests]]. */ -trait DSv2ExternalMutationTestBase extends SessionQueryTestBase with QueryTest { +trait DSv2ExternalMutationTestBase extends QueryTest with SessionQueryTestBase { /** Fully qualified table name under the non-caching test catalog. 
*/ protected val testTable: String = "testcat.ns1.ns2.tbl" diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2RepeatedTableAccessTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2RepeatedTableAccessTests.scala index 9c4c3324002df..fb22a8bb7ab79 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2RepeatedTableAccessTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2RepeatedTableAccessTests.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.connector -import org.apache.spark.sql.{Row, SessionQueryTest} +import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.connector.catalog.{CachingInMemoryTableCatalog, Column, InMemoryTableCatalog, TableChange, TableInfo} import org.apache.spark.sql.types.IntegerType diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala index 19d3d59bc8242..4d4b96406bd28 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala @@ -24,7 +24,7 @@ import scala.jdk.CollectionConverters._ import scala.reflect.ClassTag import org.apache.spark.{SparkConf, SparkException} -import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SaveMode, SparkSession} +import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SaveMode, SessionQueryTest, SparkSession} import org.apache.spark.sql.QueryTest.withQueryExecutionsCaptured import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException import org.apache.spark.sql.catalyst.plans.logical.{AppendData, CreateTableAsSelect, LogicalPlan, ReplaceTableAsSelect} @@ -47,6 +47,7 @@ import org.apache.spark.unsafe.types.UTF8String class DataSourceV2DataFrameSuite extends 
InsertIntoTests(supportsDynamicOverwrite = true, includeSQLOnlyTests = false) + with SessionQueryTest with DSv2TempViewWithStoredPlanTests with DSv2RepeatedTableAccessTests with DSv2IncrementallyConstructedQueryTests @@ -97,7 +98,6 @@ class DataSourceV2DataFrameSuite // DSv2ExternalMutationTestBase implementations for classic mode override protected def testPrefix: String = "" - override protected def isConnect: Boolean = false override protected def getTableCatalog[C <: TableCatalog: ClassTag]( session: SparkSession, diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/SessionQueryTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/SessionQueryTest.scala new file mode 100644 index 0000000000000..fc4603c546013 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/SessionQueryTest.scala @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hive + +import org.apache.spark.sql.classic + +trait SessionQueryTest extends classic.SessionQueryTest with test.TestHiveSingleton { + override def sessionType: String = "hive" +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHiveSingleton.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHiveSingleton.scala index 47cc9853f754d..172d374385474 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHiveSingleton.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHiveSingleton.scala @@ -18,8 +18,7 @@ package org.apache.spark.sql.hive.test import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.SparkSessionProvider -import org.apache.spark.sql.classic.SparkSession +import org.apache.spark.sql.classic.{SparkSession, SparkSessionProvider} import org.apache.spark.sql.hive.HiveExternalCatalog import org.apache.spark.sql.hive.client.HiveClient From 3ab91571dc67c7d9740cfba644985014a6ac9bea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Fri, 12 Jun 2026 16:27:20 +0000 Subject: [PATCH 24/58] reset connect session in beforeEach/afterEach --- .../spark/sql/connect/SparkSessionBinder.scala | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala index 9f7134746e9db..19795acfce4ac 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala @@ -51,6 +51,9 @@ trait SparkSessionBinder extends sql.SparkSessionBinder { self: SparkFunSuite => } finally { SparkEnv.get.conf.set(Connect.CONNECT_GRPC_BINDING_PORT, prevPort) } + } + + override def beforeEach(): Unit = { val client = SparkConnectClient 
.builder() .port(SparkConnectService.localPort) @@ -61,14 +64,19 @@ trait SparkSessionBinder extends sql.SparkSessionBinder { self: SparkFunSuite => .builder() .client(client) .create() + super.beforeEach() + } + + override def afterEach(): Unit = { + super.afterEach() + if (_connectSpark != null) { + _connectSpark.close() + _connectSpark = null + } } override def afterAll(): Unit = { try { - if (_connectSpark != null) { - _connectSpark.close() - _connectSpark = null - } SparkConnectService.stop() } finally { super.afterAll() From 772a8c6d237a34f287f804c9ad0b99e24777c6e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Fri, 12 Jun 2026 16:28:05 +0000 Subject: [PATCH 25/58] Shutdown SparkConnectServer at end of afterAll, don't silence shutdown exceptions --- .../org/apache/spark/sql/connect/SparkSessionBinder.scala | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala index 19795acfce4ac..3f654615d5f08 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala @@ -76,10 +76,7 @@ trait SparkSessionBinder extends sql.SparkSessionBinder { self: SparkFunSuite => } override def afterAll(): Unit = { - try { - SparkConnectService.stop() - } finally { - super.afterAll() - } + super.afterAll() + SparkConnectService.stop() } } From f7ab7e4587671b5d892d9c941e52dd65e1a2d3eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Fri, 12 Jun 2026 16:46:58 +0000 Subject: [PATCH 26/58] fix server session access in DataSourceV2DataFrameConnectSuite --- .../DataSourceV2DataFrameConnectSuite.scala | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git 
a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala index dbfdc968d0c9c..11946d8e5ac63 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala @@ -20,9 +20,10 @@ package org.apache.spark.sql.connect import scala.reflect.ClassTag import org.apache.spark.SparkConf -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.{classic, connect, SparkSession} import org.apache.spark.sql.connector.{DSv2CacheTableReadTests, DSv2IncrementallyConstructedQueryTests, DSv2RepeatedTableAccessTests, DSv2TempViewWithStoredPlanTests} import org.apache.spark.sql.connector.catalog.{CachingInMemoryTableCatalog, InMemoryTableCatalog, NullTableIdAndNullColumnIdInMemoryTableCatalog, NullTableIdInMemoryTableCatalog, TableCatalog} +import org.apache.spark.sql.connect.service.{SessionKey, SparkConnectService} /** * Connect-mode counterpart of [[org.apache.spark.sql.connector.DataSourceV2DataFrameSuite]]. 
@@ -54,10 +55,21 @@ class DataSourceV2DataFrameConnectSuite override protected def testPrefix: String = "[connect] " + protected def getServerSession(clientSession: SparkSession): classic.SparkSession = { + val connectSession = clientSession.asInstanceOf[connect.SparkSession] + val userId = connectSession.client.userId + val sessionId = connectSession.sessionId + val key = SessionKey(userId, sessionId) + SparkConnectService.sessionManager + .getIsolatedSessionIfPresent(key) + .get + .session + } + override protected def getTableCatalog[C <: TableCatalog: ClassTag]( session: SparkSession, catalogName: String): C = { - val catalog = classicSpark.sessionState.catalogManager.catalog(catalogName) + val catalog = getServerSession(session).sessionState.catalogManager.catalog(catalogName) val ct = implicitly[ClassTag[C]] require( ct.runtimeClass.isInstance(catalog), From 3db84f372afc46e87c53c1ef4d2039ab1c498c93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Mon, 15 Jun 2026 11:50:35 +0000 Subject: [PATCH 27/58] Extract CheckError into CheckErrorHelper --- .../org/apache/spark/CheckErrorHelper.scala | 206 ++++++++++++++++++ .../org/apache/spark/SparkTestSuite.scala | 181 +-------------- 2 files changed, 207 insertions(+), 180 deletions(-) create mode 100644 core/src/test/scala/org/apache/spark/CheckErrorHelper.scala diff --git a/core/src/test/scala/org/apache/spark/CheckErrorHelper.scala b/core/src/test/scala/org/apache/spark/CheckErrorHelper.scala new file mode 100644 index 0000000000000..d01600bb439f1 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/CheckErrorHelper.scala @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark + +import scala.collection.mutable.ListBuffer +import scala.jdk.CollectionConverters._ + +import org.scalatest.Suite + +trait CheckErrorHelper { self: Suite => + + case class ExpectedContext( + contextType: QueryContextType, + objectType: String, + objectName: String, + startIndex: Int, + stopIndex: Int, + fragment: String, + callSitePattern: String + ) + + object ExpectedContext { + def apply(fragment: String, start: Int, stop: Int): ExpectedContext = { + ExpectedContext("", "", start, stop, fragment) + } + + // Check the fragment only. This is only used when the fragment is distinguished within + // the query text + def apply(fragment: String): ExpectedContext = { + ExpectedContext("", "", -1, -1, fragment) + } + + def apply( + objectType: String, + objectName: String, + startIndex: Int, + stopIndex: Int, + fragment: String): ExpectedContext = { + new ExpectedContext(QueryContextType.SQL, objectType, objectName, startIndex, stopIndex, + fragment, "") + } + + def apply(fragment: String, callSitePattern: String): ExpectedContext = { + new ExpectedContext(QueryContextType.DataFrame, "", "", -1, -1, fragment, callSitePattern) + } + } + + /** + * Parameter keys that are omitted from comparison when absent from the expected map. 
+ * For each error condition, the set lists keys that are removed from the actual + * exception parameters before comparison with the expected map. + * Test suites may override this to add or change ignorable parameters per condition. + */ + protected def checkErrorIgnorableParameters: Map[String, Set[String]] = Map( + "TABLE_OR_VIEW_NOT_FOUND" -> Set("searchPath") + ) + + /** + * Checks an exception with an error condition against expected results. + * @param exception The exception to check + * @param condition The expected error condition identifying the error + * @param sqlState Optional the expected SQLSTATE, not verified if not supplied + * @param parameters A map of parameter names and values. The names are as defined + * in the error-classes file. + * @param matchPVals Optionally treat the parameters value as regular expression pattern. + * false if not supplied. + */ + protected def checkError( + exception: SparkThrowable, + condition: String, + sqlState: Option[String] = None, + parameters: Map[String, String] = Map.empty, + matchPVals: Boolean = false, + queryContext: Array[ExpectedContext] = Array.empty): Unit = { + val mismatches = new ListBuffer[String] + + if (exception.getCondition != condition) { + mismatches += s"condition: expected '$condition' but got '${exception.getCondition}'" + } + sqlState.foreach { state => + if (exception.getSqlState != state) { + mismatches += s"sqlState: expected '$state' but got '${exception.getSqlState}'" + } + } + + val actualParameters = exception.getMessageParameters.asScala + val ignorable = checkErrorIgnorableParameters.getOrElse(condition, Set.empty[String]) + val actualParametersToCompare = actualParameters.filter { case (k, _) => + !ignorable.contains(k) || parameters.contains(k) + } + if (matchPVals) { + if (actualParametersToCompare.size != parameters.size) { + mismatches += s"parameters size: expected ${parameters.size} but got" + + s" ${actualParametersToCompare.size}" + } + 
actualParametersToCompare.foreach { case (key, actualVal) => + parameters.get(key) match { + case None => + mismatches += s"parameters: unexpected key '$key' with value '$actualVal'" + case Some(pattern) if !actualVal.matches(pattern) => + mismatches += s"parameters['$key']: value '$actualVal' does not match pattern" + + s" '$pattern'" + case _ => + } + } + parameters.keys.filterNot(actualParametersToCompare.contains).foreach { key => + mismatches += s"parameters: missing expected key '$key'" + } + } else if (actualParametersToCompare != parameters) { + mismatches += s"parameters: expected $parameters but got $actualParametersToCompare" + } + + val actualQueryContext = exception.getQueryContext() + if (actualQueryContext.length != queryContext.length) { + mismatches += s"queryContext.length: expected ${queryContext.length}" + + s" but got ${actualQueryContext.length}" + } + actualQueryContext.zip(queryContext).zipWithIndex.foreach { + case ((actual, expected), idx) => + if (actual.contextType() != expected.contextType) { + mismatches += s"queryContext[$idx].contextType: expected ${expected.contextType}" + + s" but got ${actual.contextType()}" + } + if (actual.contextType() == QueryContextType.SQL) { + if (actual.objectType() != expected.objectType) { + mismatches += s"queryContext[$idx].objectType: expected '${expected.objectType}'" + + s" but got '${actual.objectType()}'" + } + if (actual.objectName() != expected.objectName) { + mismatches += s"queryContext[$idx].objectName: expected '${expected.objectName}'" + + s" but got '${actual.objectName()}'" + } + // If startIndex and stopIndex are -1, it means we simply want to check the + // fragment of the query context. This should be the case when the fragment is + // distinguished within the query text. 
+ if (expected.startIndex != -1 && actual.startIndex() != expected.startIndex) { + mismatches += s"queryContext[$idx].startIndex: expected ${expected.startIndex}" + + s" but got ${actual.startIndex()}" + } + if (expected.stopIndex != -1 && actual.stopIndex() != expected.stopIndex) { + mismatches += s"queryContext[$idx].stopIndex: expected ${expected.stopIndex}" + + s" but got ${actual.stopIndex()}" + } + if (actual.fragment() != expected.fragment) { + mismatches += s"queryContext[$idx].fragment: expected '${expected.fragment}'" + + s" but got '${actual.fragment()}'" + } + } else if (actual.contextType() == QueryContextType.DataFrame) { + if (actual.fragment() != expected.fragment) { + mismatches += s"queryContext[$idx].fragment: expected '${expected.fragment}'" + + s" but got '${actual.fragment()}'" + } + if (expected.callSitePattern.nonEmpty && + !actual.callSite().matches(expected.callSitePattern)) { + mismatches += s"queryContext[$idx].callSite: '${actual.callSite()}'" + + s" does not match pattern '${expected.callSitePattern}'" + } + } + } + + if (mismatches.nonEmpty) { + val sb = new StringBuilder + sb.append(s"checkError found ${mismatches.size} mismatch(es).\n\n") + sb.append("=== Actual Exception State ===\n") + sb.append(s" condition: ${exception.getCondition}\n") + sb.append(s" sqlState: ${exception.getSqlState}\n") + sb.append(s" parameters:\n") + if (actualParameters.isEmpty) { + sb.append(" (empty)\n") + } else { + actualParameters.foreach { case (k, v) => sb.append(s" $k -> $v\n") } + } + actualQueryContext.zipWithIndex.foreach { case (ctx, idx) => + sb.append(s" queryContext[$idx] (${ctx.contextType()}):\n") + if (ctx.contextType() == QueryContextType.SQL) { + sb.append(s" objectType: ${ctx.objectType()}\n") + sb.append(s" objectName: ${ctx.objectName()}\n") + sb.append(s" startIndex: ${ctx.startIndex()}\n") + sb.append(s" stopIndex: ${ctx.stopIndex()}\n") + sb.append(s" fragment: ${ctx.fragment()}\n") + } else if (ctx.contextType() == 
QueryContextType.DataFrame) { + sb.append(s" fragment: ${ctx.fragment()}\n") + sb.append(s" callSite: ${ctx.callSite()}\n") + } + } + sb.append("\n=== Mismatches ===\n") + mismatches.foreach(m => sb.append(s" $m\n")) + fail(sb.toString()) + } + } +} diff --git a/core/src/test/scala/org/apache/spark/SparkTestSuite.scala b/core/src/test/scala/org/apache/spark/SparkTestSuite.scala index 10504684be9fd..2125c3d81a5ee 100644 --- a/core/src/test/scala/org/apache/spark/SparkTestSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkTestSuite.scala @@ -70,6 +70,7 @@ trait SparkTestSuite with BeforeAndAfterEach with ThreadAudit with TimeLimits + with CheckErrorHelper with Logging { // scalastyle:on @@ -274,150 +275,6 @@ trait SparkTestSuite } } - /** - * Parameter keys that are omitted from comparison when absent from the expected map. - * For each error condition, the set lists keys that are removed from the actual - * exception parameters before comparison with the expected map. - * Test suites may override this to add or change ignorable parameters per condition. - */ - protected def checkErrorIgnorableParameters: Map[String, Set[String]] = Map( - "TABLE_OR_VIEW_NOT_FOUND" -> Set("searchPath") - ) - - /** - * Checks an exception with an error condition against expected results. - * @param exception The exception to check - * @param condition The expected error condition identifying the error - * @param sqlState Optional the expected SQLSTATE, not verified if not supplied - * @param parameters A map of parameter names and values. The names are as defined - * in the error-classes file. - * @param matchPVals Optionally treat the parameters value as regular expression pattern. - * false if not supplied. 
- */ - protected def checkError( - exception: SparkThrowable, - condition: String, - sqlState: Option[String] = None, - parameters: Map[String, String] = Map.empty, - matchPVals: Boolean = false, - queryContext: Array[ExpectedContext] = Array.empty): Unit = { - val mismatches = new ListBuffer[String] - - if (exception.getCondition != condition) { - mismatches += s"condition: expected '$condition' but got '${exception.getCondition}'" - } - sqlState.foreach { state => - if (exception.getSqlState != state) { - mismatches += s"sqlState: expected '$state' but got '${exception.getSqlState}'" - } - } - - val actualParameters = exception.getMessageParameters.asScala - val ignorable = checkErrorIgnorableParameters.getOrElse(condition, Set.empty[String]) - val actualParametersToCompare = actualParameters.filter { case (k, _) => - !ignorable.contains(k) || parameters.contains(k) - } - if (matchPVals) { - if (actualParametersToCompare.size != parameters.size) { - mismatches += s"parameters size: expected ${parameters.size} but got" + - s" ${actualParametersToCompare.size}" - } - actualParametersToCompare.foreach { case (key, actualVal) => - parameters.get(key) match { - case None => - mismatches += s"parameters: unexpected key '$key' with value '$actualVal'" - case Some(pattern) if !actualVal.matches(pattern) => - mismatches += s"parameters['$key']: value '$actualVal' does not match pattern" + - s" '$pattern'" - case _ => - } - } - parameters.keys.filterNot(actualParametersToCompare.contains).foreach { key => - mismatches += s"parameters: missing expected key '$key'" - } - } else if (actualParametersToCompare != parameters) { - mismatches += s"parameters: expected $parameters but got $actualParametersToCompare" - } - - val actualQueryContext = exception.getQueryContext() - if (actualQueryContext.length != queryContext.length) { - mismatches += s"queryContext.length: expected ${queryContext.length}" + - s" but got ${actualQueryContext.length}" - } - 
actualQueryContext.zip(queryContext).zipWithIndex.foreach { - case ((actual, expected), idx) => - if (actual.contextType() != expected.contextType) { - mismatches += s"queryContext[$idx].contextType: expected ${expected.contextType}" + - s" but got ${actual.contextType()}" - } - if (actual.contextType() == QueryContextType.SQL) { - if (actual.objectType() != expected.objectType) { - mismatches += s"queryContext[$idx].objectType: expected '${expected.objectType}'" + - s" but got '${actual.objectType()}'" - } - if (actual.objectName() != expected.objectName) { - mismatches += s"queryContext[$idx].objectName: expected '${expected.objectName}'" + - s" but got '${actual.objectName()}'" - } - // If startIndex and stopIndex are -1, it means we simply want to check the - // fragment of the query context. This should be the case when the fragment is - // distinguished within the query text. - if (expected.startIndex != -1 && actual.startIndex() != expected.startIndex) { - mismatches += s"queryContext[$idx].startIndex: expected ${expected.startIndex}" + - s" but got ${actual.startIndex()}" - } - if (expected.stopIndex != -1 && actual.stopIndex() != expected.stopIndex) { - mismatches += s"queryContext[$idx].stopIndex: expected ${expected.stopIndex}" + - s" but got ${actual.stopIndex()}" - } - if (actual.fragment() != expected.fragment) { - mismatches += s"queryContext[$idx].fragment: expected '${expected.fragment}'" + - s" but got '${actual.fragment()}'" - } - } else if (actual.contextType() == QueryContextType.DataFrame) { - if (actual.fragment() != expected.fragment) { - mismatches += s"queryContext[$idx].fragment: expected '${expected.fragment}'" + - s" but got '${actual.fragment()}'" - } - if (expected.callSitePattern.nonEmpty && - !actual.callSite().matches(expected.callSitePattern)) { - mismatches += s"queryContext[$idx].callSite: '${actual.callSite()}'" + - s" does not match pattern '${expected.callSitePattern}'" - } - } - } - - if (mismatches.nonEmpty) { - val sb = 
new StringBuilder - sb.append(s"checkError found ${mismatches.size} mismatch(es).\n\n") - sb.append("=== Actual Exception State ===\n") - sb.append(s" condition: ${exception.getCondition}\n") - sb.append(s" sqlState: ${exception.getSqlState}\n") - sb.append(s" parameters:\n") - if (actualParameters.isEmpty) { - sb.append(" (empty)\n") - } else { - actualParameters.foreach { case (k, v) => sb.append(s" $k -> $v\n") } - } - actualQueryContext.zipWithIndex.foreach { case (ctx, idx) => - sb.append(s" queryContext[$idx] (${ctx.contextType()}):\n") - if (ctx.contextType() == QueryContextType.SQL) { - sb.append(s" objectType: ${ctx.objectType()}\n") - sb.append(s" objectName: ${ctx.objectName()}\n") - sb.append(s" startIndex: ${ctx.startIndex()}\n") - sb.append(s" stopIndex: ${ctx.stopIndex()}\n") - sb.append(s" fragment: ${ctx.fragment()}\n") - } else if (ctx.contextType() == QueryContextType.DataFrame) { - sb.append(s" fragment: ${ctx.fragment()}\n") - sb.append(s" callSite: ${ctx.callSite()}\n") - } - } - sb.append("\n=== Mismatches ===\n") - mismatches.foreach(m => sb.append(s" $m\n")) - fail(sb.toString()) - } - } - protected def checkError( exception: SparkThrowable, condition: String, @@ -524,42 +381,6 @@ trait SparkTestSuite condition = "TABLE_OR_VIEW_ALREADY_EXISTS", parameters = Map("relationName" -> tableName)) - case class ExpectedContext( - contextType: QueryContextType, - objectType: String, - objectName: String, - startIndex: Int, - stopIndex: Int, - fragment: String, - callSitePattern: String - ) - - object ExpectedContext { - def apply(fragment: String, start: Int, stop: Int): ExpectedContext = { - ExpectedContext("", "", start, stop, fragment) - } - - // Check the fragment only. 
This is only used when the fragment is distinguished within - // the query text - def apply(fragment: String): ExpectedContext = { - ExpectedContext("", "", -1, -1, fragment) - } - - def apply( - objectType: String, - objectName: String, - startIndex: Int, - stopIndex: Int, - fragment: String): ExpectedContext = { - new ExpectedContext(QueryContextType.SQL, objectType, objectName, startIndex, stopIndex, - fragment, "") - } - - def apply(fragment: String, callSitePattern: String): ExpectedContext = { - new ExpectedContext(QueryContextType.DataFrame, "", "", -1, -1, fragment, callSitePattern) - } - } - class LogAppender(msg: String = "", maxEvents: Int = 1000) extends AbstractAppender("logAppender", null, null, true, Property.EMPTY_ARRAY) { private val _loggingEvents = new ArrayBuffer[LogEvent]() From 8ded0b269e378f197c1f7fd82cb814d62b801dc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Mon, 15 Jun 2026 18:28:30 +0000 Subject: [PATCH 28/58] smash --- .../sql/connect/DataSourceV2DataFrameConnectSuite.scala | 2 +- .../org/apache/spark/sql/connect/SparkSessionBinder.scala | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala index 11946d8e5ac63..d3294f8af3b4a 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala @@ -21,9 +21,9 @@ import scala.reflect.ClassTag import org.apache.spark.SparkConf import org.apache.spark.sql.{classic, connect, SparkSession} +import org.apache.spark.sql.connect.service.{SessionKey, SparkConnectService} import org.apache.spark.sql.connector.{DSv2CacheTableReadTests, DSv2IncrementallyConstructedQueryTests, 
DSv2RepeatedTableAccessTests, DSv2TempViewWithStoredPlanTests} import org.apache.spark.sql.connector.catalog.{CachingInMemoryTableCatalog, InMemoryTableCatalog, NullTableIdAndNullColumnIdInMemoryTableCatalog, NullTableIdInMemoryTableCatalog, TableCatalog} -import org.apache.spark.sql.connect.service.{SessionKey, SparkConnectService} /** * Connect-mode counterpart of [[org.apache.spark.sql.connector.DataSourceV2DataFrameSuite]]. diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala index 3f654615d5f08..38bf071db5a33 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala @@ -43,6 +43,9 @@ trait SparkSessionBinder extends sql.SparkSessionBinder { self: SparkFunSuite => override protected def beforeAll(): Unit = { super.beforeAll() + // Other suites using mocks leave a mess in the global executionManager, + // shut it down so that it's cleared before starting server. + SparkConnectService.executionManager.shutdown() val prevPort = SparkEnv.get.conf.get(Connect.CONNECT_GRPC_BINDING_PORT) try { // set GRPC_BINDING_PORT to 0 so that the server picks a random, freely available port. 
@@ -76,7 +79,7 @@ trait SparkSessionBinder extends sql.SparkSessionBinder { self: SparkFunSuite => } override def afterAll(): Unit = { - super.afterAll() SparkConnectService.stop() + super.afterAll() } } From 02308de0c72515933dbe4958a0431679cfbd2ec5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Mon, 15 Jun 2026 18:29:46 +0000 Subject: [PATCH 29/58] add, fix scaladoc --- .../org/apache/spark/sql/CheckAnswerHelper.scala | 11 +++++++++++ .../test/scala/org/apache/spark/sql/QueryTest.scala | 8 +++++++- .../org/apache/spark/sql/SessionQueryTestBase.scala | 2 +- .../org/apache/spark/sql/SparkSessionBinder.scala | 2 +- 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala b/sql/core/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala index d5114f2c7e1e5..ff3b989cb9d37 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala @@ -21,9 +21,16 @@ import java.util.TimeZone import org.scalatest.Assertions +import org.apache.spark.annotation.Experimental import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.util.{SparkErrorUtils, SparkStringUtils} +/** + * Provides [[checkAnswer]] helper for SQL- & DataFrame-API tests. + * + * TODO: should be moved to sql/api together with SessionQueryTestBase + */ +@Experimental trait CheckAnswerHelper extends Assertions { /** @@ -39,6 +46,10 @@ trait CheckAnswerHelper extends Assertions { } } + /* + * Note: when moving this to sql/api, implementation should stay in sql/core + * (i.e. 
only have abstract decl in sql/api) + */ protected def isDfSorted(df: DataFrame): Boolean = { df match { case df: classic.DataFrame => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index b12b315aaa3e3..e0480209d9736 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -796,12 +796,18 @@ object QueryTest extends CheckAnswerHelper { * * @param df the DataFrame to be executed * @param expectedAnswer the expected result in a Seq of Rows. - * @param checkToRDD whether to verify deserialization to an RDD. This runs the query twice. */ def checkAnswer(df: DataFrame, expectedAnswer: Seq[Row]): Unit = { checkAnswer(df, expectedAnswer, checkToRDD = true) } + /** + * Runs the plan and makes sure the answer matches the expected result. + * + * @param df the DataFrame to be executed + * @param expectedAnswer the expected result in a Seq of Rows. + * @param checkToRDD whether to verify deserialization to an RDD. This runs the query twice. + */ def checkAnswer(df: DataFrame, expectedAnswer: Seq[Row], checkToRDD: Boolean): Unit = { if (checkToRDD) { SQLExecution.withSQLConfPropagated(df.sparkSession) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala index f95bc4a4cf707..db92029afb9a1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala @@ -39,7 +39,7 @@ trait SessionQueryTestBase * {{{ * test(...) { * val df = // query with connect-specific behaviour - * if (sessionType = 'connect') { + * if (sessionType == "connect") { * checkError(...) * } else { * checkAnswer(df, ...) 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala index ae2ba743420ad..b1eca239fdd13 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala @@ -100,7 +100,7 @@ trait SparkSessionBinderBase protected override def spark: SparkSession = _spark /** - * The [[test.TestSQLContext]] to use for all tests in this suite. + * The [[SQLContext]] to use for all tests in this suite. */ protected implicit def sqlContext: SQLContext = _spark.sqlContext From cd8f516b0cd79ec184df92ebccb6509b86754e25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Mon, 15 Jun 2026 18:30:40 +0000 Subject: [PATCH 30/58] Update, extend deprecation annotations --- .../scala/org/apache/spark/sql/QueryTest.scala | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index e0480209d9736..cda95b6199aaf 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -203,7 +203,7 @@ trait QueryTestBase * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s. * @param absTol the absolute tolerance between actual and expected answers. */ - @deprecated("rarely used") + @deprecated("rarely used", since = "4.2.0") protected def checkAggregatesWithTol(dataFrame: DataFrame, expectedAnswer: Seq[Row], absTol: Double): Unit = { @@ -218,7 +218,7 @@ trait QueryTestBase } } - @deprecated("rarely used") + @deprecated("rarely used", since = "4.2.0") protected def checkAggregatesWithTol(dataFrame: DataFrame, expectedAnswer: Row, absTol: Double): Unit = { @@ -425,7 +425,7 @@ trait QueryTestBase /** * Restores the current catalog/database after calling `f`. 
*/ - @deprecated("rarely used") + @deprecated("rarely used", since = "4.2.0") protected def withCurrentCatalogAndNamespace(f: => Unit): Unit = { val curCatalog = sql("select current_catalog()").head().getString(0) val curDatabase = sql("select current_database()").head().getString(0) @@ -473,7 +473,7 @@ trait QueryTestBase /** * Strip Spark-side filtering in order to check if a datasource filters rows correctly. */ - @deprecated("Classic-only method, use classic.QueryTest", "4.2.0") + @deprecated("Classic-only method, use classic.QueryTest", since = "4.2.0") protected def stripSparkFilter(df: DataFrame): DataFrame = { val schema = df.schema val withoutFilters = df.queryExecution.executedPlan.transform { @@ -488,7 +488,7 @@ trait QueryTestBase * Turn a logical plan into a `DataFrame`. This should be removed once we have an easier * way to construct `DataFrame` directly out of local data without relying on implicits. */ - @deprecated("Classic-only method, use classic.QueryTest", "4.2.0") + @deprecated("Classic-only method, use classic.QueryTest", since = "4.2.0") protected implicit def logicalPlanToSparkQuery(plan: LogicalPlan): classic.DataFrame = { classic.Dataset.ofRows(spark.asInstanceOf[classic.SparkSession], plan) } @@ -498,7 +498,7 @@ trait QueryTestBase * does not contain a scheme, this path will not be changed after the default * FileSystem is changed. 
*/ - @deprecated("Classic-only method, use classic.QueryTest", "4.2.0") + @deprecated("Classic-only method, use classic.QueryTest", since = "4.2.0") def makeQualifiedPath(path: String): URI = { val hadoopPath = new Path(path) val fs = hadoopPath.getFileSystem(spark.sessionState.newHadoopConf()) @@ -789,7 +789,7 @@ trait QueryTest extends SparkFunSuite with QueryTestBase { } } -@deprecated("superseded by CheckAnswerHelper", since = "4.2") +@deprecated("superseded by CheckAnswerHelper", since = "4.2.0") object QueryTest extends CheckAnswerHelper { /** * Runs the plan and makes sure the answer matches the expected result. From 292dee40c374f28145789cda1bfc6e992e2039aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Mon, 15 Jun 2026 18:31:03 +0000 Subject: [PATCH 31/58] Add missing newline at EOF --- .../src/test/scala/org/apache/spark/sql/SessionQueryTest.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTest.scala index d449ee3aa9a04..534c92e9d8627 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTest.scala @@ -40,4 +40,4 @@ trait SessionQueryTest with SparkSessionBinder { override def sessionType: String = "classic" -} \ No newline at end of file +} From 2958be0965d870ea6cc36765d13b27288702010a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Tue, 16 Jun 2026 10:38:05 +0000 Subject: [PATCH 32/58] Remove unused import --- core/src/test/scala/org/apache/spark/SparkTestSuite.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/SparkTestSuite.scala b/core/src/test/scala/org/apache/spark/SparkTestSuite.scala index 2125c3d81a5ee..0fd595bf3fdf3 100644 --- a/core/src/test/scala/org/apache/spark/SparkTestSuite.scala +++ 
b/core/src/test/scala/org/apache/spark/SparkTestSuite.scala @@ -22,8 +22,7 @@ import java.nio.charset.StandardCharsets.UTF_8 import java.nio.file.{Files, Path} import java.util.{Locale, TimeZone} -import scala.collection.mutable.{ArrayBuffer, ListBuffer} -import scala.jdk.CollectionConverters._ +import scala.collection.mutable.ArrayBuffer import org.apache.logging.log4j._ import org.apache.logging.log4j.core.{LogEvent, Logger, LoggerContext} From 88292e4c05a6dde77382355d69a5a4320e786f6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Mon, 22 Jun 2026 16:14:51 +0000 Subject: [PATCH 33/58] fixup: CheckErrorHelper --- .../test/scala/org/apache/spark/sql/SessionQueryTestBase.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala index db92029afb9a1..bc96514118139 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql // scalastyle:off funsuite import org.scalatest.funsuite.AnyFunSuite + +import org.apache.spark.CheckErrorHelper // scalastyle:on /** @@ -31,6 +33,7 @@ trait SessionQueryTestBase extends AnyFunSuite with SparkSessionProvider with CheckAnswerHelper + with CheckErrorHelper with QueryCleanupHelper { /** From bcb97fd8269099272b5f29fd78e6df9336010c59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Mon, 22 Jun 2026 16:15:44 +0000 Subject: [PATCH 34/58] Only use SessionQueryTestBase in DSv2ExternalMutationTestBase --- .../spark/sql/connector/DSv2ExternalMutationTestBase.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala index 199c0d1f41afe..73c69f8a9de41 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala @@ -21,7 +21,7 @@ import java.util import scala.reflect.ClassTag -import org.apache.spark.sql.{QueryTest, SessionQueryTestBase, SparkSession} +import org.apache.spark.sql.{SessionQueryTestBase, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.connector.catalog.{BufferedRows, CatalogV2Util, Identifier, InMemoryBaseTable, TableCatalog, TableWritePrivilege} @@ -37,7 +37,7 @@ import org.apache.spark.sql.connector.catalog.{BufferedRows, CatalogV2Util, Iden * [[DSv2TempViewWithStoredPlanTests]], [[DSv2RepeatedTableAccessTests]], * [[DSv2IncrementallyConstructedQueryTests]], or [[DSv2CacheTableReadTests]]. */ -trait DSv2ExternalMutationTestBase extends QueryTest with SessionQueryTestBase { +trait DSv2ExternalMutationTestBase extends SessionQueryTestBase { /** Fully qualified table name under the non-caching test catalog. 
*/ protected val testTable: String = "testcat.ns1.ns2.tbl" From 6ca5822c2b1d16436e6077a06132125a04b69f42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Mon, 22 Jun 2026 16:16:52 +0000 Subject: [PATCH 35/58] Add SQLConfHelper to SessionQueryTestBase --- .../scala/org/apache/spark/sql/SessionQueryTestBase.scala | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala index bc96514118139..559a8d9400d54 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala @@ -21,6 +21,8 @@ package org.apache.spark.sql import org.scalatest.funsuite.AnyFunSuite import org.apache.spark.CheckErrorHelper +import org.apache.spark.sql.catalyst.SQLConfHelper +import org.apache.spark.sql.internal.SQLConf // scalastyle:on /** @@ -34,8 +36,12 @@ trait SessionQueryTestBase with SparkSessionProvider with CheckAnswerHelper with CheckErrorHelper + with SQLConfHelper with QueryCleanupHelper { + override def conf: SQLConf = + throw new UnsupportedOperationException("TODO: SessionQueryTestBase should not provide conf") + /** * Documents used session so that tests can handle and document session-specific behaviour * From a89b8b011ab68af91be1957ee465e351193ae3df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Mon, 22 Jun 2026 16:55:02 +0000 Subject: [PATCH 36/58] Fix QueryTest::checkAnswer --- .../test/scala/org/apache/spark/sql/QueryTest.scala | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index cda95b6199aaf..24a82fdb18f35 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -173,9 +173,15 @@ trait QueryTestBase } } - assertEmptyMissingInput(analyzedDF) + if (analyzedDF.isInstanceOf[classic.DataFrame]) { + assertEmptyMissingInput(analyzedDF) - QueryTest.checkAnswer(analyzedDF, expectedAnswer) + SQLExecution.withSQLConfPropagated(analyzedDF.sparkSession) { + df.materializedRdd.count() // Also attempt to deserialize as an RDD [SPARK-15791] + } + } + + super.checkAnswer(analyzedDF, expectedAnswer) } protected def checkAnswer(df: => DataFrame, expectedAnswer: Row): Unit = { From fc5b673a29f32817dcde933d375c7b6abb14fafa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Mon, 22 Jun 2026 17:18:23 +0000 Subject: [PATCH 37/58] Catch analysis-time failure in CheckAnswerHelper --- .../apache/spark/sql/CheckAnswerHelper.scala | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala b/sql/core/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala index ff3b989cb9d37..0b9fd24a1ca59 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala @@ -22,6 +22,7 @@ import java.util.TimeZone import org.scalatest.Assertions import org.apache.spark.annotation.Experimental +import org.apache.spark.sql.catalyst.ExtendedAnalysisException import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.util.{SparkErrorUtils, SparkStringUtils} @@ -40,7 +41,23 @@ trait CheckAnswerHelper extends Assertions { * @param expectedAnswer the expected result in a Seq of Rows. 
*/ protected def checkAnswer(df: => DataFrame, expectedAnswer: Seq[Row]): Unit = { - getErrorMessageInCheckAnswer(df, expectedAnswer) match { + + val analyzedDF = try df catch { + case ae: ExtendedAnalysisException => + if (ae.plan.isDefined) { + fail( + s""" + |Failed to analyze query: $ae + |${ae.plan.get} + | + |${SparkErrorUtils.stackTraceToString(ae)} + |""".stripMargin) + } else { + throw ae + } + } + + getErrorMessageInCheckAnswer(analyzedDF, expectedAnswer) match { case Some(errorMessage) => fail(errorMessage) case None => } From 5bc0c8d29b922ad6a6bfbed2d7950388efd44bcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Mon, 22 Jun 2026 17:22:16 +0000 Subject: [PATCH 38/58] CheckAnswerHelper: limit df.queryExec access to classic dfs --- .../test/scala/org/apache/spark/sql/CheckAnswerHelper.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala b/sql/core/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala index 0b9fd24a1ca59..0437a41edd782 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CheckAnswerHelper.scala @@ -92,7 +92,7 @@ trait CheckAnswerHelper extends Assertions { val errorMessage = s""" |Exception thrown while executing query: - |${df.queryExecution} + |${if (df.isInstanceOf[classic.DataFrame]) { df.queryExecution } else df.toString} |== Exception == |$e |${SparkErrorUtils.stackTraceToString(e)} @@ -106,7 +106,7 @@ trait CheckAnswerHelper extends Assertions { |Timezone: ${TimeZone.getDefault} |Timezone Env: ${sys.env.getOrElse("TZ", "")} | - |${df.queryExecution} + |${if (df.isInstanceOf[classic.DataFrame]) { df.queryExecution } else df.toString } |== Results == |$results """.stripMargin From 3a41ba15e933246f7cf8fce1c2f2ce5702e4a243 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Mon, 22 Jun 2026 17:22:31 +0000 Subject: [PATCH 
39/58] fix grammar --- .../src/test/scala/org/apache/spark/sql/SessionQueryTest.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTest.scala index 534c92e9d8627..a80c77ba877db 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTest.scala @@ -30,7 +30,7 @@ import org.apache.spark.SparkFunSuite * }}} * * While this trait internally uses a [[classic.SparkSession]] when executing tests, - * it exposed as a [[SparkSession sql.SparkSession]] to allow for overriding on the connect side. + * it is exposed as a [[SparkSession sql.SparkSession]] to allow for overriding on the connect side. * * For classic-specific tests, use [[classic.SessionQueryTest]]. */ From 9946e076461026be7b5864c8bc0c5e70f90e3425 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Mon, 22 Jun 2026 17:23:05 +0000 Subject: [PATCH 40/58] fix capitalization --- .../test/scala/org/apache/spark/sql/SparkSessionBinder.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala index b1eca239fdd13..82a0d0c166e94 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.test.TestSparkSession /** * Provides a [[spark]] implementation by creating a [[classic.SparkSession]]. 
* - * counterpart to [[SparkSessionProvider]], used in [[org.apache.spark.sql.test.SharedSparkSession]] + * Counterpart to [[SparkSessionProvider]], used in [[org.apache.spark.sql.test.SharedSparkSession]] */ trait SparkSessionBinder extends SparkSessionBinderBase { self: SparkFunSuite => From ec566ff988a9a8f5fc877e3f69910f05aa15384e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Mon, 22 Jun 2026 17:36:49 +0000 Subject: [PATCH 41/58] docstring fix --- .../scala/org/apache/spark/sql/connect/SessionQueryTest.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala index 1b24f87aad6da..344825a347c67 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql * FooSuite extends SessionQueryTest { test("") { ... 
} } * * // in sql/connect - * FooConnectSuite extends connect.SessionQueryTest + * FooConnectSuite extends FooSuite with connect.SessionQueryTest * }}} * * This trait overrides [[spark]] to use a [[SparkSession connect.SparkSession]], which executes From a2872d2d2858e215daece70ac6e9e3ab6a3e6d5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Mon, 22 Jun 2026 17:43:08 +0000 Subject: [PATCH 42/58] Don't add hive.SessionQueryTest (yet) --- .../spark/sql/hive/SessionQueryTest.scala | 24 ------------------- 1 file changed, 24 deletions(-) delete mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/SessionQueryTest.scala diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/SessionQueryTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/SessionQueryTest.scala deleted file mode 100644 index fc4603c546013..0000000000000 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/SessionQueryTest.scala +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.hive - -import org.apache.spark.sql.classic - -trait SessionQueryTest extends classic.SessionQueryTest with test.TestHiveSingleton { - override def sessionType: String = "hive" -} From 53a5b2cd412fa2157bee5ea089dfd66cad8bc136 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Mon, 22 Jun 2026 17:48:45 +0000 Subject: [PATCH 43/58] Add docstring for connect.SessionQueryTest::isDfSorted --- .../org/apache/spark/sql/connect/SessionQueryTest.scala | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala index 344825a347c67..919ddf118b983 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala @@ -32,7 +32,13 @@ import org.apache.spark.sql * via the gRPC API using an in-process connect server. */ trait SessionQueryTest extends sql.SessionQueryTest with SparkSessionBinder { - final override def isDfSorted(df: sql.DataFrame): Boolean = false // TODO + + /** + * TODO add required test-only API to Spark Connect + * This method is used by [[checkAnswer]] internally but cannot yet be implemented in connect. + * Thus we always return `false` for now. 
+ */ + override def isDfSorted(df: sql.DataFrame): Boolean = false override def sessionType: String = "connect" } From ef19ed47716809a19c6e70495026f235cbc4a465 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Mon, 22 Jun 2026 18:01:05 +0000 Subject: [PATCH 44/58] Document that SparkSessionBinderBase is temporary --- .../org/apache/spark/sql/SparkSessionBinder.scala | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala index 82a0d0c166e94..017e35ff4ce4b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBinder.scala @@ -23,6 +23,7 @@ import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, Suite} import org.scalatest.concurrent.Eventually import org.apache.spark.{DebugFilesystem, SparkConf, SparkFunSuite} +import org.apache.spark.annotation.Experimental import org.apache.spark.internal.config.UNSAFE_EXCEPTION_ON_MEMORY_LEAK import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation @@ -60,6 +61,16 @@ trait SparkSessionBinder extends SparkSessionBinderBase { self: SparkFunSuite => } } +/** + * [[SparkSessionBinderBase]] is needed for now as + * [[test.SharedSparkSessionBase SharedSparkSessionBase]] is still used by e.g. + * [[test.GenericWordSpecSuite]]. + * + * This Base might be merged into [[SparkSessionBinder]] once it is not required anymore. 
+ * + * TODO: migrate SharedSparkSessionBase users so this can be removed + */ +@Experimental trait SparkSessionBinderBase extends SparkSessionProvider with BeforeAndAfterEach From 02e8c2834839134ad9ebf0cdc153cdd8ea38accd Mon Sep 17 00:00:00 2001 From: Leonid Lygin Date: Tue, 23 Jun 2026 12:17:27 +0000 Subject: [PATCH 45/58] use precomputed analyzedDF instead of df in checkAnswer --- sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index 24a82fdb18f35..a0ca2d4b49e52 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -177,7 +177,7 @@ trait QueryTestBase assertEmptyMissingInput(analyzedDF) SQLExecution.withSQLConfPropagated(analyzedDF.sparkSession) { - df.materializedRdd.count() // Also attempt to deserialize as an RDD [SPARK-15791] + analyzedDF.materializedRdd.count() // Also attempt to deserialize as an RDD [SPARK-15791] } } From 8e831418e602acc0075aab7a4db16f440471018c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Tue, 23 Jun 2026 11:54:51 +0000 Subject: [PATCH 46/58] make connect.SparkSessionBinder::classicSession private --- ... 
ExampleSessionAgnosticConnectSuite.scala} | 4 +- .../sql/connect/SparkSessionBinder.scala | 2 +- .../sql/ExampleSessionAgnosticSuite.scala | 53 +++++++++++++++++++ .../org/apache/spark/sql/ExampleSuite.scala | 47 ---------------- 4 files changed, 57 insertions(+), 49 deletions(-) rename sql/connect/server/src/test/scala/org/apache/spark/sql/connect/{ExampleConnectSuite.scala => ExampleSessionAgnosticConnectSuite.scala} (88%) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/ExampleSessionAgnosticSuite.scala delete mode 100644 sql/core/src/test/scala/org/apache/spark/sql/ExampleSuite.scala diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/ExampleConnectSuite.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/ExampleSessionAgnosticConnectSuite.scala similarity index 88% rename from sql/connect/server/src/test/scala/org/apache/spark/sql/connect/ExampleConnectSuite.scala rename to sql/connect/server/src/test/scala/org/apache/spark/sql/connect/ExampleSessionAgnosticConnectSuite.scala index 469a57557f1b7..aa5137c840f15 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/ExampleConnectSuite.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/ExampleSessionAgnosticConnectSuite.scala @@ -19,4 +19,6 @@ package org.apache.spark.sql.connect import org.apache.spark.sql -class ExampleConnectSuite extends sql.ExampleSuite with SessionQueryTest +class ExampleConnectSessionAgnosticSuite + extends sql.ExampleSessionAgnosticSuite + with SessionQueryTest diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala index 38bf071db5a33..24b33163c2328 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SparkSessionBinder.scala @@ -39,7 
+39,7 @@ trait SparkSessionBinder extends sql.SparkSessionBinder { self: SparkFunSuite => protected override def spark: SparkSession = _connectSpark /** The underlying classic session used by the in-process server. */ - protected def classicSpark: classic.SparkSession = super.spark.asInstanceOf[classic.SparkSession] + private def classicSpark: classic.SparkSession = super.spark.asInstanceOf[classic.SparkSession] override protected def beforeAll(): Unit = { super.beforeAll() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExampleSessionAgnosticSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExampleSessionAgnosticSuite.scala new file mode 100644 index 0000000000000..0c6cb1c1930ce --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/ExampleSessionAgnosticSuite.scala @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql + +/** + * Example of a test suite that runs against both classic and Connect sessions. + */ +class ExampleSessionAgnosticSuite extends SessionQueryTest { + + test("Example classic/connect-agnostic testcase") { + withTable("t") { + spark.sql(s"CREATE TABLE t (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO t VALUES (1, 100)").collect() + + val df1 = spark.table("t") + + spark.sql(s"ALTER TABLE t ADD COLUMN new_column INT").collect() + spark.sql(s"INSERT INTO t VALUES (2, 200, -1)").collect() + + val df2 = spark.table("t") + val selfJoin = df1.join(df2, df1("id") === df2("id")) + + if (sessionType == "connect") { + // Connect re-resolves df1 with the new 3-column schema (id, salary, new_column). + assert(selfJoin.columns.length == 6, + s"Expected 6 columns (3 + 3) but got: ${selfJoin.columns.mkString(", ")}") + checkAnswer(selfJoin, + Seq(Row(1, 100, null, 1, 100, null), Row(2, 200, -1, 2, 200, -1))) + } else { + // Classic: df1 keeps its original 2-column schema (id, salary). + assert(selfJoin.columns.length == 5, + s"Expected 5 columns (2 + 3) but got: ${selfJoin.columns.mkString(", ")}") + checkAnswer(selfJoin, + Seq(Row(1, 100, 1, 100, null), Row(2, 200, 2, 200, -1))) + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExampleSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExampleSuite.scala deleted file mode 100644 index bff3cdb7a6d77..0000000000000 --- a/sql/core/src/test/scala/org/apache/spark/sql/ExampleSuite.scala +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License.
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql - -class ExampleSuite extends SessionQueryTest { - - test("replaceWhere with partitioned table preserves all partitions") { - withTable("foo") { - val data = Seq( - (1, "Alice", 29), - (2, "Bob", 35), - (3, "Charlie", 23), - ) - - val df = spark.createDataFrame(data).toDF("id", "name", "age") - - df.write.partitionBy("age").format("delta").saveAsTable("foo") - - val data1 = Seq((1, "Blice", 29)) - - val df1 = spark.createDataFrame(data1).toDF("id", "name", "age") - - df1.write - .format("delta") - .option("replaceWhere", "age = 29") - .mode("overwrite") - .saveAsTable("foo") - - assert(spark.sql("SHOW PARTITIONS foo").count() == 3) - } - } -} From a9acd7699a35c494192814e1a753c8eacb67d3d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Tue, 23 Jun 2026 11:55:28 +0000 Subject: [PATCH 47/58] fixup: rename --- .../spark/sql/connect/ExampleSessionAgnosticConnectSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/ExampleSessionAgnosticConnectSuite.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/ExampleSessionAgnosticConnectSuite.scala index aa5137c840f15..ac28afd3b2955 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/ExampleSessionAgnosticConnectSuite.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/ExampleSessionAgnosticConnectSuite.scala @@ -19,6 +19,6 @@ package org.apache.spark.sql.connect import org.apache.spark.sql -class
ExampleConnectSessionAgnosticSuite +class ExampleSessionAgnosticConnectSuite extends sql.ExampleSessionAgnosticSuite with SessionQueryTest From 13da2a0a2f798f46a6b512b9ecb9af8dc8bca620 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Tue, 23 Jun 2026 11:56:11 +0000 Subject: [PATCH 48/58] add example testcase with conf stuff --- .../sql/ExampleSessionAgnosticSuite.scala | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExampleSessionAgnosticSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExampleSessionAgnosticSuite.scala index 0c6cb1c1930ce..d72cae519e001 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ExampleSessionAgnosticSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ExampleSessionAgnosticSuite.scala @@ -17,11 +17,19 @@ package org.apache.spark.sql +import org.apache.spark.SparkConf +import org.apache.spark.sql.connector.catalog.InMemoryPartitionTableCatalog + /** * Example of a test suite that runs against both classic and Connect sessions. 
*/ class ExampleSessionAgnosticSuite extends SessionQueryTest { + override protected def sparkConf: SparkConf = + super.sparkConf + .set("spark.sql.catalog.testcat", classOf[InMemoryPartitionTableCatalog].getName) + .set("spark.sql.defaultCatalog", "testcat") + test("Example classic/connect-agnostic testcase") { withTable("t") { spark.sql(s"CREATE TABLE t (id INT, salary INT) USING foo").collect() @@ -50,4 +58,15 @@ class ExampleSessionAgnosticSuite extends SessionQueryTest { } } } + + test("testcase that uses withConf") { + withConf("spark.sql.charAsVarchar" -> "true") { + withTable("t") { + spark.sql(s"CREATE TABLE t(col CHAR(5)) USING foo") + checkAnswer( + spark.sql(s"desc t").selectExpr("data_type"), + Seq(Row("varchar(5)"))) + } + } + } } From 03f49126721342cd32e3e54cd8f4fe0bddffebdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Tue, 23 Jun 2026 11:56:49 +0000 Subject: [PATCH 49/58] SessionQueryTestBase declares 'withConf' instead of
extending SQLConfHelper --- .../spark/sql/SessionQueryTestBase.scala | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala index 559a8d9400d54..5a75bedc82ac0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SessionQueryTestBase.scala @@ -22,7 +22,6 @@ import org.scalatest.funsuite.AnyFunSuite import org.apache.spark.CheckErrorHelper import org.apache.spark.sql.catalyst.SQLConfHelper -import org.apache.spark.sql.internal.SQLConf // scalastyle:on /** @@ -39,8 +38,28 @@ trait SessionQueryTestBase with SQLConfHelper with QueryCleanupHelper { - override def conf: SQLConf = - throw new UnsupportedOperationException("TODO: SessionQueryTestBase should not provide conf") + /** + * Sets all configurations specified in `pairs`, calls `f`, and then restores all configurations. 
+ */ + protected def withConf[T](pairs: (String, String)*)(f: => T): T = { + val (keys, values) = pairs.unzip + val currentValues = keys.map { key => + if (spark.conf.contains(key)) { + Some(spark.conf.get(key)) + } else { + None + } + } + keys.lazyZip(values).foreach { (k, v) => + spark.conf.set(k, v) + } + try f finally { + keys.zip(currentValues).foreach { + case (key, Some(value)) => spark.conf.set(key, value) + case (key, None) => spark.conf.unset(key) + } + } + } /** * Documents used session so that tests can handle and document session-specific behaviour From ca341c66731864ed6f7df808302b3dbe0913cc5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Tue, 23 Jun 2026 12:20:49 +0000 Subject: [PATCH 50/58] add connect isDfSorted and connect.DataFrame::explainString --- .../apache/spark/sql/connect/Dataset.scala | 23 +++++++++++++++++++ .../spark/sql/connect/SessionQueryTest.scala | 5 +++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/Dataset.scala b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/Dataset.scala index c27a83b79b89f..14124b048e4f8 100644 --- a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/Dataset.scala +++ b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/Dataset.scala @@ -241,6 +241,29 @@ class Dataset[T] private[sql] ( // scalastyle:on println } + private[connect] def explainString(mode: String): String = { + val protoMode = mode.trim.toLowerCase(util.Locale.ROOT) match { + case "simple" => proto.AnalyzePlanRequest.Explain.ExplainMode.EXPLAIN_MODE_SIMPLE + case "extended" => proto.AnalyzePlanRequest.Explain.ExplainMode.EXPLAIN_MODE_EXTENDED + case "codegen" => proto.AnalyzePlanRequest.Explain.ExplainMode.EXPLAIN_MODE_CODEGEN + case "cost" => proto.AnalyzePlanRequest.Explain.ExplainMode.EXPLAIN_MODE_COST + case "formatted" => proto.AnalyzePlanRequest.Explain.ExplainMode.EXPLAIN_MODE_FORMATTED + case _ 
=> throw new IllegalArgumentException("Unsupported explain mode: " + mode) + } + sparkSession + .analyze(plan, proto.AnalyzePlanRequest.AnalyzeCase.EXPLAIN, Some(protoMode)) + .getExplain + .getExplainString + } + + private[connect] def explainString(extended: Boolean): String = if (extended) { + explainString("extended") + } else { + explainString("simple") + } + + private[connect] def explainString(): String = explainString("simple") + /** @inheritdoc */ def isLocal: Boolean = sparkSession .analyze(plan, proto.AnalyzePlanRequest.AnalyzeCase.IS_LOCAL) diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala index 919ddf118b983..9d4aeb3ac2697 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala @@ -38,7 +38,10 @@ trait SessionQueryTest extends sql.SessionQueryTest with SparkSessionBinder { * This method is used by [[checkAnswer]] internally but cannot yet be implemented in connect. * Thus we always return `false` for now. */ - override def isDfSorted(df: sql.DataFrame): Boolean = false + override def isDfSorted(df: sql.DataFrame): Boolean = df match { + case df: DataFrame => df.explainString(true).contains("sort") + case df => super.isDfSorted(df) + } override def sessionType: String = "connect" } From 9d0f4b3597647058aba31bbbb79169c1994313c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Tue, 23 Jun 2026 13:33:58 +0000 Subject: [PATCH 51/58] fixup! 
add connect isDfSorted and connect.DataFrame::explainString --- .../scala/org/apache/spark/sql/connect/SessionQueryTest.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala index 9d4aeb3ac2697..db70815f0dfc5 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala @@ -39,7 +39,7 @@ trait SessionQueryTest extends sql.SessionQueryTest with SparkSessionBinder { * Thus we always return `false` for now. */ override def isDfSorted(df: sql.DataFrame): Boolean = df match { - case df: DataFrame => df.explainString(true).contains("sort") + case df: DataFrame => df.explainString(true).contains("Sort") case df => super.isDfSorted(df) } From cd7183abcdb39385d73246f3d3a16125a3d34535 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Tue, 23 Jun 2026 13:43:37 +0000 Subject: [PATCH 52/58] fixup! 
add connect isDfSorted and connect.DataFrame::explainString --- .../scala/org/apache/spark/sql/QueryTest.scala | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index a0ca2d4b49e52..e08f0e6b5dfcc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -1212,4 +1212,19 @@ class QueryTestSuite extends QueryTest with SparkSessionBinder { "from range(2)"), Seq(Row(Row(null)), Row(Row("null")))) } + + test("checkAnswer demands correct result order for ordered queries") { + val e = intercept[org.scalatest.exceptions.TestFailedException] { + checkAnswer( + sql("SELECT col1 FROM VALUE 1, 2, 1, 3 ORDER BY col 1"), + Seq(Row(3), Row(1), Row(1), Row(2))) + } + assert(e.getMessage().contains("Results do not match for query")) + } + + test("checkAnswer ignores result order for unordered queries") { + checkAnswer( + sql("SELECT col1 FROM VALUE 1, 2, 1, 3"), + Seq(Row(3), Row(1), Row(1), Row(2))) + } } From 895ff8d7caf4961433af7cf5dd3893c943343273 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Tue, 23 Jun 2026 13:44:04 +0000 Subject: [PATCH 53/58] deprecate rarely used method in SharedSparkSession --- .../scala/org/apache/spark/sql/test/SharedSparkSession.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala index 82718583cc088..1c01acd6aef6f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala @@ -29,6 +29,7 @@ trait SharedSparkSession extends QueryTest with classic.SparkSessionBinder { // Runs func (which must trigger exactly one SQL execution) and returns the SQL 
metrics of that // execution as a map keyed by (planNodeId, planNodeName, metricName) -> metricValue. + @deprecated("rarely used", "4.2.0") def runAndFetchMetrics(func: => Unit): Map[(Long, String, String), String] = { val statusStore = spark.sharedState.statusStore val oldCount = statusStore.executionsList().size From deac517d0270e76939f78beeca17a3911a1351af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Tue, 23 Jun 2026 13:58:53 +0000 Subject: [PATCH 54/58] WIP: provide isDfSorted override for connect --- .../apache/spark/sql/connect/QueryTestWithConnectSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTestWithConnectSuite.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTestWithConnectSuite.scala index 013acba63b80f..76c2201756600 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTestWithConnectSuite.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/QueryTestWithConnectSuite.scala @@ -28,4 +28,4 @@ import org.apache.spark.sql.QueryTestSuite */ class QueryTestWithConnectSuite extends QueryTestSuite - with SparkSessionBinder + with SessionQueryTest From 783681f59b336c2b6c55dafdb86ccf791ed8c781 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Tue, 23 Jun 2026 17:16:35 +0000 Subject: [PATCH 55/58] fixup! 
add connect isDfSorted and connect.DataFrame::explainString --- .../org/apache/spark/sql/connect/SessionQueryTest.scala | 6 ++---- .../src/test/scala/org/apache/spark/sql/QueryTest.scala | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala index db70815f0dfc5..11a7369cfcaa1 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/SessionQueryTest.scala @@ -34,12 +34,10 @@ import org.apache.spark.sql trait SessionQueryTest extends sql.SessionQueryTest with SparkSessionBinder { /** - * TODO add required test-only API to Spark Connect - * This method is used by [[checkAnswer]] internally but cannot yet be implemented in connect. - * Thus we always return `false` for now. + * Approximates [[sql.SessionQueryTest.isDfSorted]] by inspecting the explain string. 
*/ override def isDfSorted(df: sql.DataFrame): Boolean = df match { - case df: DataFrame => df.explainString(true).contains("Sort") + case df: DataFrame => df.explainString(extended = true).contains("Sort") case df => super.isDfSorted(df) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index e08f0e6b5dfcc..90f1565c2a722 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -1216,7 +1216,7 @@ class QueryTestSuite extends QueryTest with SparkSessionBinder { test("checkAnswer demands correct result order for ordered queries") { val e = intercept[org.scalatest.exceptions.TestFailedException] { checkAnswer( - sql("SELECT col1 FROM VALUE 1, 2, 1, 3 ORDER BY col 1"), + sql("SELECT col1 FROM VALUES 1, 2, 1, 3 ORDER BY col1"), Seq(Row(3), Row(1), Row(1), Row(2))) } assert(e.getMessage().contains("Results do not match for query")) @@ -1224,7 +1224,7 @@ class QueryTestSuite extends QueryTest with SparkSessionBinder { test("checkAnswer ignores result order for unordered queries") { checkAnswer( - sql("SELECT col1 FROM VALUE 1, 2, 1, 3"), + sql("SELECT col1 FROM VALUES 1, 2, 1, 3"), Seq(Row(3), Row(1), Row(1), Row(2))) } } From c009270c74f5807e12e3c130eb3c12157cde33b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Tue, 23 Jun 2026 17:17:45 +0000 Subject: [PATCH 56/58] Don't deprecate SharedSparkSession[Base] yet --- .../scala/org/apache/spark/sql/test/SharedSparkSession.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala index 1c01acd6aef6f..d8065b21a6188 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala 
@@ -24,7 +24,6 @@ import org.scalatest.Suite import org.apache.spark.sql.{QueryTest, QueryTestBase, SparkSessionBinderBase} import org.apache.spark.sql.classic -@deprecated("Use SessionQueryTest (or classic.SessionQueryTest if required) instead", "4.2.0") trait SharedSparkSession extends QueryTest with classic.SparkSessionBinder { // Runs func (which must trigger exactly one SQL execution) and returns the SQL metrics of that @@ -60,7 +59,6 @@ trait SharedSparkSession extends QueryTest with classic.SparkSessionBinder { /** * Helper trait for SQL test suites where all tests share a single [[TestSparkSession]]. */ -@deprecated("Use SessionQueryTest (or classic.SessionQueryTest if required) instead", "4.2.0") trait SharedSparkSessionBase extends QueryTestBase with SparkSessionBinderBase { self: Suite => protected override def spark: classic.SparkSession = From 176a528b305764ed34fa5f39272c917be2f11781 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Tue, 23 Jun 2026 17:21:03 +0000 Subject: [PATCH 57/58] Don't refactor DSv2 classic/connect tests (yet) --- .../DataSourceV2DataFrameConnectSuite.scala | 46 +- .../connector/DSv2CacheTableReadTests.scala | 322 +++++++------- .../DSv2ExternalMutationTestBase.scala | 21 +- ...v2IncrementallyConstructedQueryTests.scala | 223 +++++----- .../DSv2RepeatedTableAccessTests.scala | 252 ++++++----- .../DSv2TempViewWithStoredPlanTests.scala | 412 +++++++++--------- .../DataSourceV2DataFrameSuite.scala | 19 +- 7 files changed, 701 insertions(+), 594 deletions(-) diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala index d3294f8af3b4a..1a31e5f8ac1a3 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala +++ 
b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala @@ -20,8 +20,7 @@ package org.apache.spark.sql.connect import scala.reflect.ClassTag import org.apache.spark.SparkConf -import org.apache.spark.sql.{classic, connect, SparkSession} -import org.apache.spark.sql.connect.service.{SessionKey, SparkConnectService} +import org.apache.spark.sql.{DataFrame, QueryTest, Row, SparkSession} import org.apache.spark.sql.connector.{DSv2CacheTableReadTests, DSv2IncrementallyConstructedQueryTests, DSv2RepeatedTableAccessTests, DSv2TempViewWithStoredPlanTests} import org.apache.spark.sql.connector.catalog.{CachingInMemoryTableCatalog, InMemoryTableCatalog, NullTableIdAndNullColumnIdInMemoryTableCatalog, NullTableIdInMemoryTableCatalog, TableCatalog} @@ -35,7 +34,7 @@ import org.apache.spark.sql.connector.catalog.{CachingInMemoryTableCatalog, InMe * this class only provides the Connect-specific session, catalog access, and result comparison. */ class DataSourceV2DataFrameConnectSuite - extends SessionQueryTest + extends SparkConnectServerTest with DSv2TempViewWithStoredPlanTests with DSv2RepeatedTableAccessTests with DSv2IncrementallyConstructedQueryTests @@ -54,26 +53,45 @@ class DataSourceV2DataFrameConnectSuite .set("spark.sql.catalog.nullbothidscat.copyOnLoad", "true") override protected def testPrefix: String = "[connect] " + override protected def isConnect: Boolean = true - protected def getServerSession(clientSession: SparkSession): classic.SparkSession = { - val connectSession = clientSession.asInstanceOf[connect.SparkSession] - val userId = connectSession.client.userId - val sessionId = connectSession.sessionId - val key = SessionKey(userId, sessionId) - SparkConnectService.sessionManager - .getIsolatedSessionIfPresent(key) - .get - .session - } + override protected def withTestSession(fn: SparkSession => Unit): Unit = + withSession(fn) + + // Cannot use QueryTest.checkAnswer directly because it accesses df.logicalPlan, 
+ // df.queryExecution, and df.materializedRdd, which are not available on Connect *client* + // DataFrames (they throw ConnectClientUnsupportedErrors). Note: checkAnswer IS usable from + // Connect server tests that operate on classic server-side DataFrames, but in this suite + // `df` is a Connect client DataFrame returned by session.table() / session.sql(). + // Instead, collect the rows and delegate to QueryTest.sameRows, which is the same + // value-based, order-agnostic comparison that checkAnswer uses internally. + override protected def checkRows(df: => DataFrame, expected: Seq[Row]): Unit = + QueryTest.sameRows(expected, df.collect().toSeq).foreach(msg => fail(msg)) override protected def getTableCatalog[C <: TableCatalog: ClassTag]( session: SparkSession, catalogName: String): C = { - val catalog = getServerSession(session).sessionState.catalogManager.catalog(catalogName) + val serverSession = getServerSession(session) + val catalog = serverSession.sessionState.catalogManager.catalog(catalogName) val ct = implicitly[ClassTag[C]] require( ct.runtimeClass.isInstance(catalog), s"Expected ${ct.runtimeClass.getName} but got ${catalog.getClass.getName}") catalog.asInstanceOf[C] } + + // No explicit clearCache() for cachingcat is needed here, unlike the classic suite. + // Each withSession call creates a freshly isolated SparkSession on the server side + // (via SparkConnectSessionManager.newIsolatedSession), and afterEach invalidates all + // sessions, so the CachingInMemoryTableCatalog instance is per-test. 
+ override protected def withTestTableAndViews( + session: SparkSession, + table: String, + views: Seq[String] = Seq.empty)(fn: => Unit): Unit = { + try { fn } + finally { + views.foreach(v => session.sql(s"DROP VIEW IF EXISTS $v").collect()) + session.sql(s"DROP TABLE IF EXISTS $table").collect() + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2CacheTableReadTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2CacheTableReadTests.scala index 79c101d524a07..ac6ffcc6ecc0d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2CacheTableReadTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2CacheTableReadTests.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.connector -import org.apache.spark.sql.Row +import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.connector.catalog.{CachingInMemoryTableCatalog, Column, InMemoryTableCatalog, TableChange, TableInfo} import org.apache.spark.sql.types.IntegerType @@ -49,209 +49,223 @@ import org.apache.spark.sql.types.IntegerType * (via the CacheManager), making a session drop+recreate scenario trivially different from * the external variant. * - * NOTE: All `spark.sql(...)` calls append `.collect()` because Connect client DataFrames + * NOTE: All `session.sql(...)` calls append `.collect()` because Connect client DataFrames * are lazy and require an action to trigger execution. In classic mode `.collect()` on * DDL / DML is a no-op (these execute eagerly), so this is harmless. 
*/ trait DSv2CacheTableReadTests extends DSv2ExternalMutationTestBase { - private def assertTableCached(tableName: String): Unit = - assert(spark.catalog.isCached(tableName)) + private def assertTableCached(session: SparkSession, tableName: String): Unit = + assert(session.catalog.isCached(tableName)) test(s"${testPrefix}SPARK-54022: cached table pinned against external data write") { - withTable(testTable) { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTestSession { session => + withTestTableAndViews(session, testTable) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - spark.table(testTable).cache() - assertTableCached(testTable) - checkAnswer(spark.table(testTable), Seq(Row(1, 100))) + session.table(testTable).cache() + assertTableCached(session, testTable) + checkRows(session.table(testTable), Seq(Row(1, 100))) - val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) + val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - assertTableCached(testTable) - checkAnswer(spark.table(testTable), Seq(Row(1, 100))) + assertTableCached(session, testTable) + checkRows(session.table(testTable), Seq(Row(1, 100))) - spark.sql(s"REFRESH TABLE $testTable").collect() - assertTableCached(testTable) - checkAnswer(spark.table(testTable), Seq(Row(1, 100), Row(2, 200))) + session.sql(s"REFRESH TABLE $testTable").collect() + assertTableCached(session, testTable) + checkRows(session.table(testTable), Seq(Row(1, 100), Row(2, 200))) + } } } test(s"${testPrefix}SPARK-54022: connector w/ cache: cached table pinned, " + "REFRESH clears both layers") { - withTable(cachingTestTable) { - 
spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() - - spark.table(cachingTestTable).cache() - assertTableCached(cachingTestTable) - checkAnswer(spark.table(cachingTestTable), Seq(Row(1, 100))) - - val catalog = - getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - - // Both CacheManager and connector cache are stale: external write invisible - assertTableCached(cachingTestTable) - checkAnswer(spark.table(cachingTestTable), Seq(Row(1, 100))) - - // REFRESH TABLE calls invalidateTable (clears connector cache) and rebuilds - // the CacheManager entry, so the external write becomes visible. - spark.sql(s"REFRESH TABLE $cachingTestTable").collect() - assertTableCached(cachingTestTable) - checkAnswer(spark.table(cachingTestTable), Seq(Row(1, 100), Row(2, 200))) + withTestSession { session => + withTestTableAndViews(session, cachingTestTable) { + session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() + + session.table(cachingTestTable).cache() + assertTableCached(session, cachingTestTable) + checkRows(session.table(cachingTestTable), Seq(Row(1, 100))) + + val catalog = + getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) + + // Both CacheManager and connector cache are stale: external write invisible + assertTableCached(session, cachingTestTable) + checkRows(session.table(cachingTestTable), Seq(Row(1, 100))) + + // REFRESH TABLE calls invalidateTable (clears connector cache) and rebuilds + // the CacheManager entry, so the external write becomes visible. 
+ session.sql(s"REFRESH TABLE $cachingTestTable").collect() + assertTableCached(session, cachingTestTable) + checkRows(session.table(cachingTestTable), Seq(Row(1, 100), Row(2, 200))) + } } } test(s"${testPrefix}SPARK-54022: session write invalidates cache, " + "then external write invisible") { - withTable(testTable) { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTestSession { session => + withTestTableAndViews(session, testTable) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - spark.table(testTable).cache() - assertTableCached(testTable) - checkAnswer(spark.table(testTable), Seq(Row(1, 100))) + session.table(testTable).cache() + assertTableCached(session, testTable) + checkRows(session.table(testTable), Seq(Row(1, 100))) - spark.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() - assertTableCached(testTable) - checkAnswer(spark.table(testTable), Seq(Row(1, 100), Row(2, 200))) + session.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() + assertTableCached(session, testTable) + checkRows(session.table(testTable), Seq(Row(1, 100), Row(2, 200))) - val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(3, 300)) + val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(3, 300)) - assertTableCached(testTable) - checkAnswer(spark.table(testTable), Seq(Row(1, 100), Row(2, 200))) + assertTableCached(session, testTable) + checkRows(session.table(testTable), Seq(Row(1, 100), Row(2, 200))) - spark.sql(s"REFRESH TABLE $testTable").collect() - assertTableCached(testTable) - checkAnswer(spark.table(testTable), Seq(Row(1, 100), Row(2, 200), Row(3, 300))) + session.sql(s"REFRESH TABLE 
$testTable").collect() + assertTableCached(session, testTable) + checkRows(session.table(testTable), Seq(Row(1, 100), Row(2, 200), Row(3, 300))) + } } } test(s"${testPrefix}SPARK-54022: cached table pinned against external schema change") { - withTable(testTable) { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - - spark.table(testTable).cache() - assertTableCached(testTable) - checkAnswer(spark.table(testTable), Seq(Row(1, 100))) - - val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") - val addCol = TableChange.addColumn(Array("new_column"), IntegerType, true) - catalog.alterTable(testIdent, addCol) - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) - - assertTableCached(testTable) - checkAnswer(spark.table(testTable), Seq(Row(1, 100))) - - spark.sql(s"REFRESH TABLE $testTable").collect() - assertTableCached(testTable) - checkAnswer(spark.table(testTable), Seq(Row(1, 100, null), Row(2, 200, -1))) + withTestSession { session => + withTestTableAndViews(session, testTable) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + + session.table(testTable).cache() + assertTableCached(session, testTable) + checkRows(session.table(testTable), Seq(Row(1, 100))) + + val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val addCol = TableChange.addColumn(Array("new_column"), IntegerType, true) + catalog.alterTable(testIdent, addCol) + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) + + assertTableCached(session, testTable) + checkRows(session.table(testTable), Seq(Row(1, 100))) + + session.sql(s"REFRESH TABLE $testTable").collect() + assertTableCached(session, testTable) + checkRows(session.table(testTable), Seq(Row(1, 100, null), Row(2, 200, -1))) + } } } 
test(s"${testPrefix}SPARK-54022: session schema change invalidates cache, " + "external write invisible") { - withTable(testTable) { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTestSession { session => + withTestTableAndViews(session, testTable) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - spark.table(testTable).cache() - assertTableCached(testTable) - checkAnswer(spark.table(testTable), Seq(Row(1, 100))) + session.table(testTable).cache() + assertTableCached(session, testTable) + checkRows(session.table(testTable), Seq(Row(1, 100))) - spark.sql(s"ALTER TABLE $testTable ADD COLUMN new_column INT").collect() - assertTableCached(testTable) - checkAnswer(spark.table(testTable), Seq(Row(1, 100, null))) + session.sql(s"ALTER TABLE $testTable ADD COLUMN new_column INT").collect() + assertTableCached(session, testTable) + checkRows(session.table(testTable), Seq(Row(1, 100, null))) - val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) + val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) - assertTableCached(testTable) - checkAnswer(spark.table(testTable), Seq(Row(1, 100, null))) + assertTableCached(session, testTable) + checkRows(session.table(testTable), Seq(Row(1, 100, null))) - spark.sql(s"REFRESH TABLE $testTable").collect() - assertTableCached(testTable) - checkAnswer(spark.table(testTable), Seq(Row(1, 100, null), Row(2, 200, -1))) + session.sql(s"REFRESH TABLE $testTable").collect() + assertTableCached(session, testTable) + checkRows(session.table(testTable), Seq(Row(1, 100, null), Row(2, 200, -1))) + } } } test(s"${testPrefix}SPARK-54022: cached 
table after external drop and " + "recreate sees empty table") { - withTable(testTable) { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - - spark.table(testTable).cache() - assertTableCached(testTable) - checkAnswer(spark.table(testTable), Seq(Row(1, 100))) - - val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") - val originalTableId = catalog.loadTable(testIdent).id - - catalog.dropTable(testIdent) - catalog.createTable( - testIdent, - new TableInfo.Builder() - .withColumns(Array( - Column.create("id", IntegerType), - Column.create("salary", IntegerType))) - .build()) - - val newTableId = catalog.loadTable(testIdent).id - assert(originalTableId != newTableId) - - val result = spark.table(testTable) - assert(result.schema.fieldNames.toSeq == Seq("id", "salary")) - checkAnswer(result, Seq.empty) - - // External drop+recreate produces a new table identity, so the prior cache entry - // is unreachable via name lookup (unlike external write/schema change where the - // cache stays pinned). 
- assert(!spark.catalog.isCached(testTable)) - - spark.sql(s"REFRESH TABLE $testTable").collect() - checkAnswer(spark.table(testTable), Seq.empty) + withTestSession { session => + withTestTableAndViews(session, testTable) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + + session.table(testTable).cache() + assertTableCached(session, testTable) + checkRows(session.table(testTable), Seq(Row(1, 100))) + + val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val originalTableId = catalog.loadTable(testIdent).id + + catalog.dropTable(testIdent) + catalog.createTable( + testIdent, + new TableInfo.Builder() + .withColumns(Array( + Column.create("id", IntegerType), + Column.create("salary", IntegerType))) + .build()) + + val newTableId = catalog.loadTable(testIdent).id + assert(originalTableId != newTableId) + + val result = session.table(testTable) + assert(result.schema.fieldNames.toSeq == Seq("id", "salary")) + checkRows(result, Seq.empty) + + // External drop+recreate produces a new table identity, so the prior cache entry + // is unreachable via name lookup (unlike external write/schema change where the + // cache stays pinned). 
+ assert(!session.catalog.isCached(testTable)) + + session.sql(s"REFRESH TABLE $testTable").collect() + checkRows(session.table(testTable), Seq.empty) + } } } test(s"${testPrefix}SPARK-54022: connector w/ cache: cached table stale after " + "external drop and recreate") { - withTable(cachingTestTable) { - spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() - - spark.table(cachingTestTable).cache() - assertTableCached(cachingTestTable) - checkAnswer(spark.table(cachingTestTable), Seq(Row(1, 100))) - - val catalog = - getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") - val originalTableId = catalog.loadTable(testIdent).id - - catalog.dropTable(testIdent) - catalog.createTable( - testIdent, - new TableInfo.Builder() - .withColumns(Array( - Column.create("id", IntegerType), - Column.create("salary", IntegerType))) - .build()) - - // CachingInMemoryTableCatalog does not invalidate on drop/create, so loadTable - // still returns the old cached table object. CacheManager still matches and - // serves the stale cached data. - assertTableCached(cachingTestTable) - checkAnswer(spark.table(cachingTestTable), Seq(Row(1, 100))) - - // REFRESH TABLE calls invalidateTable (clears connector cache) and rebuilds - // the CacheManager entry, so the new empty table becomes visible. 
- spark.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkAnswer(spark.table(cachingTestTable), Seq.empty) + withTestSession { session => + withTestTableAndViews(session, cachingTestTable) { + session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() + + session.table(cachingTestTable).cache() + assertTableCached(session, cachingTestTable) + checkRows(session.table(cachingTestTable), Seq(Row(1, 100))) + + val catalog = + getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") + val originalTableId = catalog.loadTable(testIdent).id + + catalog.dropTable(testIdent) + catalog.createTable( + testIdent, + new TableInfo.Builder() + .withColumns(Array( + Column.create("id", IntegerType), + Column.create("salary", IntegerType))) + .build()) + + // CachingInMemoryTableCatalog does not invalidate on drop/create, so loadTable + // still returns the old cached table object. CacheManager still matches and + // serves the stale cached data. + assertTableCached(session, cachingTestTable) + checkRows(session.table(cachingTestTable), Seq(Row(1, 100))) + + // REFRESH TABLE calls invalidateTable (clears connector cache) and rebuilds + // the CacheManager entry, so the new empty table becomes visible. 
+ session.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkRows(session.table(cachingTestTable), Seq.empty) + } } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala index 73c69f8a9de41..0b2a50534447c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala @@ -21,7 +21,7 @@ import java.util import scala.reflect.ClassTag -import org.apache.spark.sql.{SessionQueryTestBase, SparkSession} +import org.apache.spark.sql.{DataFrame, QueryTest, Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.connector.catalog.{BufferedRows, CatalogV2Util, Identifier, InMemoryBaseTable, TableCatalog, TableWritePrivilege} @@ -37,7 +37,7 @@ import org.apache.spark.sql.connector.catalog.{BufferedRows, CatalogV2Util, Iden * [[DSv2TempViewWithStoredPlanTests]], [[DSv2RepeatedTableAccessTests]], * [[DSv2IncrementallyConstructedQueryTests]], or [[DSv2CacheTableReadTests]]. */ -trait DSv2ExternalMutationTestBase extends SessionQueryTestBase { +trait DSv2ExternalMutationTestBase extends QueryTest { /** Fully qualified table name under the non-caching test catalog. */ protected val testTable: String = "testcat.ns1.ns2.tbl" @@ -51,6 +51,17 @@ trait DSv2ExternalMutationTestBase extends SessionQueryTestBase { /** Prefix for test names, e.g. "" or "[connect] ". */ protected def testPrefix: String + /** Whether this suite runs under Spark Connect. */ + protected def isConnect: Boolean + + /** Execute a test body with a session. */ + protected def withTestSession(fn: SparkSession => Unit): Unit + + /** + * Assert that a DataFrame's rows match the expected rows (order-agnostic). 
+ */ + protected def checkRows(df: => DataFrame, expected: Seq[Row]): Unit + /** * Get a [[TableCatalog]] by name from the underlying session. */ @@ -58,6 +69,12 @@ trait DSv2ExternalMutationTestBase extends SessionQueryTestBase { session: SparkSession, catalogName: String): C + /** Cleanup wrapper: drop views and the table after the test body, even on failure. */ + protected def withTestTableAndViews( + session: SparkSession, + table: String, + views: Seq[String] = Seq.empty)(fn: => Unit): Unit + /** Appends a row to a DSv2 table via the catalog API, bypassing the session. */ protected def externalAppend( catalog: TableCatalog, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2IncrementallyConstructedQueryTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2IncrementallyConstructedQueryTests.scala index a6de3a0139452..1dbaad18e3e71 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2IncrementallyConstructedQueryTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2IncrementallyConstructedQueryTests.scala @@ -32,7 +32,7 @@ import org.apache.spark.unsafe.types.UTF8String * mode, resolution is deferred until execution, so both sides of a join always see the * latest table state. * - * NOTE: All `spark.sql(...)` calls append `.collect()` because Connect client DataFrames + * NOTE: All `session.sql(...)` calls append `.collect()` because Connect client DataFrames * are lazy and require an action to trigger execution. In classic mode `.collect()` on * eager statements (DDL, INSERT) is a no-op, so this is harmless. 
*/ @@ -45,40 +45,44 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas test(s"${testPrefix}SPARK-54157: join refreshes both sides after external insert" + " (table with both table and column ID support)") { - withTable(testTable) { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTestSession { session => + withTestTableAndViews(session, testTable) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = spark.table(testTable) + val df1 = session.table(testTable) - val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - val df2 = spark.table(testTable) + val df2 = session.table(testTable) - checkAnswer( + checkRows( df1.join(df2, df1("id") === df2("id")), Seq(Row(1, 100, 1, 100), Row(2, 200, 2, 200))) } } + } test(s"${testPrefix}SPARK-54157: join refreshes both sides after same-session insert" + " (table with both table and column ID support)") { - withTable(testTable) { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTestSession { session => + withTestTableAndViews(session, testTable) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = spark.table(testTable) + val df1 = session.table(testTable) - spark.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() + session.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() - val df2 = spark.table(testTable) + val df2 = session.table(testTable) - checkAnswer( + checkRows( df1.join(df2, df1("id") 
=== df2("id")), Seq(Row(1, 100, 1, 100), Row(2, 200, 2, 200))) } } + } // --------------------------------------------------------------------------- // Scenario 2: join after ADD COLUMN. @@ -88,66 +92,70 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas test(s"${testPrefix}SPARK-54157: join after external ADD COLUMN" + " (table with both table and column ID support)") { - withTable(testTable) { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTestSession { session => + withTestTableAndViews(session, testTable) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = spark.table(testTable) + val df1 = session.table(testTable) - val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") catalog.alterTable( testIdent, TableChange.addColumn(Array("new_column"), IntegerType, true)) externalAppend( catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) - val df2 = spark.table(testTable) + val df2 = session.table(testTable) val selfJoin = df1.join(df2, df1("id") === df2("id")) - if (sessionType == "connect") { + if (isConnect) { // Connect re-resolves df1 with the new 3-column schema (id, salary, new_column). assert(selfJoin.columns.length == 6, s"Expected 6 columns (3 + 3) but got: ${selfJoin.columns.mkString(", ")}") - checkAnswer(selfJoin, + checkRows(selfJoin, Seq(Row(1, 100, null, 1, 100, null), Row(2, 200, -1, 2, 200, -1))) } else { // Classic: df1 keeps its original 2-column schema (id, salary). 
assert(selfJoin.columns.length == 5, s"Expected 5 columns (2 + 3) but got: ${selfJoin.columns.mkString(", ")}") - checkAnswer(selfJoin, + checkRows(selfJoin, Seq(Row(1, 100, 1, 100, null), Row(2, 200, 2, 200, -1))) } } } + } test(s"${testPrefix}SPARK-54157: join after same-session ADD COLUMN" + " (table with both table and column ID support)") { - withTable(testTable) { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTestSession { session => + withTestTableAndViews(session, testTable) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = spark.table(testTable) + val df1 = session.table(testTable) - spark.sql(s"ALTER TABLE $testTable ADD COLUMN new_column INT").collect() - spark.sql(s"INSERT INTO $testTable VALUES (2, 200, -1)").collect() + session.sql(s"ALTER TABLE $testTable ADD COLUMN new_column INT").collect() + session.sql(s"INSERT INTO $testTable VALUES (2, 200, -1)").collect() - val df2 = spark.table(testTable) + val df2 = session.table(testTable) val selfJoin = df1.join(df2, df1("id") === df2("id")) - if (sessionType == "connect") { + if (isConnect) { // Connect re-resolves df1 with the new 3-column schema (id, salary, new_column). assert(selfJoin.columns.length == 6, s"Expected 6 columns (3 + 3) but got: ${selfJoin.columns.mkString(", ")}") - checkAnswer(selfJoin, + checkRows(selfJoin, Seq(Row(1, 100, null, 1, 100, null), Row(2, 200, -1, 2, 200, -1))) } else { // Classic: df1 keeps its original 2-column schema (id, salary). 
assert(selfJoin.columns.length == 5, s"Expected 5 columns (2 + 3) but got: ${selfJoin.columns.mkString(", ")}") - checkAnswer(selfJoin, + checkRows(selfJoin, Seq(Row(1, 100, 1, 100, null), Row(2, 200, 2, 200, -1))) } } } + } // --------------------------------------------------------------------------- // Scenario 3: join after DROP COLUMN. @@ -157,22 +165,23 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas test(s"${testPrefix}SPARK-54157: join after external DROP COLUMN" + " (table with both table and column ID support)") { - withTable(testTable) { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTestSession { session => + withTestTableAndViews(session, testTable) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = spark.table(testTable) + val df1 = session.table(testTable) - val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") catalog.alterTable( testIdent, TableChange.deleteColumn(Array("salary"), false)) externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2)) - val df2 = spark.table(testTable) + val df2 = session.table(testTable) - if (sessionType == "connect") { + if (isConnect) { // Connect re-resolves df1 without the dropped column. 
- checkAnswer( + checkRows( df1.join(df2, df1("id") === df2("id")), Seq(Row(1, 1), Row(2, 2))) } else { @@ -187,23 +196,25 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas } } } + } test(s"${testPrefix}SPARK-54157: join after same-session DROP COLUMN" + " (table with both table and column ID support)") { - withTable(testTable) { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTestSession { session => + withTestTableAndViews(session, testTable) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = spark.table(testTable) + val df1 = session.table(testTable) - spark.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() - spark.sql(s"INSERT INTO $testTable VALUES (2)").collect() + session.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() + session.sql(s"INSERT INTO $testTable VALUES (2)").collect() - val df2 = spark.table(testTable) + val df2 = session.table(testTable) - if (sessionType == "connect") { + if (isConnect) { // Connect re-resolves df1 without the dropped column. - checkAnswer( + checkRows( df1.join(df2, df1("id") === df2("id")), Seq(Row(1, 1), Row(2, 2))) } else { @@ -218,6 +229,7 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas } } } + } // --------------------------------------------------------------------------- // Scenario 4: external drop and recreate table. 
@@ -228,12 +240,13 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas test(s"${testPrefix}SPARK-54157: join after external table drop and recreate" + " (table with both table and column ID support)") { - withTable(testTable) { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTestSession { session => + withTestTableAndViews(session, testTable) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = spark.table(testTable) - val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + val df1 = session.table(testTable) + val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") val originTableId = catalog.loadTable(testIdent).id catalog.dropTable(testIdent) @@ -246,13 +259,13 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas .build()) externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - val df2 = spark.table(testTable) + val df2 = session.table(testTable) val newTableId = catalog.loadTable(testIdent).id assert(originTableId != newTableId) - if (sessionType == "connect") { + if (isConnect) { // Connect re-resolves both sides to the recreated table. 
- checkAnswer( + checkRows( df1.join(df2, df1("id") === df2("id")), Seq(Row(2, 200, 2, 200))) } else { @@ -270,16 +283,18 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas } } } + } test(s"${testPrefix}SPARK-54157: join after external drop/recreate" + " (table without table ID support, but with column ID support)") { val nullIdT = "nullidcat.ns1.ns2.tbl" - withTable(nullIdT) { - spark.sql(s"CREATE TABLE $nullIdT (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $nullIdT VALUES (1, 100)").collect() + withTestSession { session => + withTestTableAndViews(session, nullIdT) { + session.sql(s"CREATE TABLE $nullIdT (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $nullIdT VALUES (1, 100)").collect() - val df1 = spark.table(nullIdT) - val catalog = getTableCatalog[TableCatalog](spark, "nullidcat") + val df1 = session.table(nullIdT) + val catalog = getTableCatalog[TableCatalog](session, "nullidcat") assert(catalog.loadTable(testIdent).id == null, "NullTableIdInMemoryTableCatalog should produce null table IDs") @@ -293,11 +308,11 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas .build()) externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - val df2 = spark.table(nullIdT) + val df2 = session.table(nullIdT) - if (sessionType == "connect") { + if (isConnect) { // Connect re-resolves both sides to the recreated table. 
- checkAnswer( + checkRows( df1.join(df2, df1("id") === df2("id")), Seq(Row(2, 200, 2, 200))) } else { @@ -312,17 +327,19 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas } } } + } test(s"${testPrefix}SPARK-54157: join does not detect external table drop and recreate" + " (table without table ID support and without column ID support)") { val nullBothT = "nullbothidscat.ns1.ns2.tbl" - withTable(nullBothT) { - spark.sql(s"CREATE TABLE $nullBothT (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $nullBothT VALUES (1, 100)").collect() + withTestSession { session => + withTestTableAndViews(session, nullBothT) { + session.sql(s"CREATE TABLE $nullBothT (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $nullBothT VALUES (1, 100)").collect() - val df1 = spark.table(nullBothT) + val df1 = session.table(nullBothT) val catalog = getTableCatalog[TableCatalog]( - spark, "nullbothidscat") + session, "nullbothidscat") assert(catalog.loadTable(testIdent).id == null, "NullTableIdAndNullColumnIdInMemoryTableCatalog should produce null table IDs") assert(catalog.loadTable(testIdent).columns().forall(_.id() == null), @@ -338,12 +355,12 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas .build()) externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - val df2 = spark.table(nullBothT) + val df2 = session.table(nullBothT) - if (sessionType == "connect") { + if (isConnect) { // Connect re-resolves both sides to the recreated table, so the join // sees the row appended after recreate. - checkAnswer( + checkRows( df1.join(df2, df1("id") === df2("id")), Seq(Row(2, 200, 2, 200))) } else { @@ -351,12 +368,13 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas // drop and recreate goes undetected. 
df1 keeps its pre-drop snapshot // (1, 100) while df2 reads the recreated table (2, 200), so the join finds // no matching ids and returns no rows. - checkAnswer( + checkRows( df1.join(df2, df1("id") === df2("id")), Seq.empty) } } } + } // --------------------------------------------------------------------------- // Scenario 5: external drop+re-add column. @@ -367,23 +385,24 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas test(s"${testPrefix}SPARK-54157: join after external drop+re-add column" + " (table without table ID support, but with column ID support)") { val nullIdT = "nullidcat.ns1.ns2.tbl" - withTable(nullIdT) { - spark.sql(s"CREATE TABLE $nullIdT (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $nullIdT VALUES (1, 100)").collect() + withTestSession { session => + withTestTableAndViews(session, nullIdT) { + session.sql(s"CREATE TABLE $nullIdT (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $nullIdT VALUES (1, 100)").collect() - val df1 = spark.table(nullIdT) + val df1 = session.table(nullIdT) - val catalog = getTableCatalog[TableCatalog](spark, "nullidcat") + val catalog = getTableCatalog[TableCatalog](session, "nullidcat") catalog.alterTable( testIdent, TableChange.deleteColumn(Array("salary"), false)) catalog.alterTable( testIdent, TableChange.addColumn(Array("salary"), IntegerType, true)) - val df2 = spark.table(nullIdT) + val df2 = session.table(nullIdT) - if (sessionType == "connect") { + if (isConnect) { // Connect re-resolves both sides with the new column ID. 
- checkAnswer( + checkRows( df1.join(df2, df1("id") === df2("id")), Seq(Row(1, null, 1, null))) } else { @@ -398,31 +417,35 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas } } } + } test(s"${testPrefix}SPARK-54157: join does not detect external drop+re-add column" + " (table without table ID support and without column ID support)") { val nullBothT = "nullbothidscat.ns1.ns2.tbl" - withTable(nullBothT) { - spark.sql(s"CREATE TABLE $nullBothT (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $nullBothT VALUES (1, 100)").collect() + withTestSession { session => + withTestTableAndViews(session, nullBothT) { + session.sql(s"CREATE TABLE $nullBothT (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $nullBothT VALUES (1, 100)").collect() - val df1 = spark.table(nullBothT) + val df1 = session.table(nullBothT) - val catalog = getTableCatalog[TableCatalog](spark, "nullbothidscat") + val catalog = getTableCatalog[TableCatalog]( + session, "nullbothidscat") catalog.alterTable( testIdent, TableChange.deleteColumn(Array("salary"), false)) catalog.alterTable( testIdent, TableChange.addColumn(Array("salary"), IntegerType, true)) - val df2 = spark.table(nullBothT) + val df2 = session.table(nullBothT) // Neither TABLE_ID_MISMATCH nor COLUMN_ID_MISMATCH fires. // The change goes undetected and the join succeeds. - checkAnswer( + checkRows( df1.join(df2, df1("id") === df2("id")), Seq(Row(1, null, 1, null))) } } + } // --------------------------------------------------------------------------- // Scenario 6: external type change (drop INT column, add STRING column). 
@@ -434,13 +457,14 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas test(s"${testPrefix}SPARK-54157: join after external drop+re-add different-type column" + " (table with both table and column ID support)") { - withTable(testTable) { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTestSession { session => + withTestTableAndViews(session, testTable) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = spark.table(testTable) + val df1 = session.table(testTable) - val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") catalog.alterTable( testIdent, TableChange.deleteColumn(Array("salary"), false)) catalog.alterTable( @@ -448,11 +472,11 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, UTF8String.fromString("high"))) - val df2 = spark.table(testTable) + val df2 = session.table(testTable) - if (sessionType == "connect") { + if (isConnect) { // Connect re-resolves both sides with the new column type. 
- checkAnswer( + checkRows( df1.join(df2, df1("id") === df2("id")), Seq(Row(1, null, 1, null), Row(2, "high", 2, "high"))) } else { @@ -467,4 +491,5 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas } } } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2RepeatedTableAccessTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2RepeatedTableAccessTests.scala index fb22a8bb7ab79..533d10a949796 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2RepeatedTableAccessTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2RepeatedTableAccessTests.scala @@ -34,7 +34,7 @@ import org.apache.spark.sql.types.IntegerType * Each scenario includes a session mutation baseline, an external mutation test, and a * caching-connector variant showing stale results until `REFRESH TABLE`. * - * NOTE: All `spark.sql(...)` calls append `.collect()` because Connect client DataFrames + * NOTE: All `session.sql(...)` calls append `.collect()` because Connect client DataFrames * are lazy and require an action to trigger execution. In classic mode `.collect()` on * DDL / DML is a no-op (these execute eagerly), so this is harmless. 
*/ @@ -45,160 +45,178 @@ trait DSv2RepeatedTableAccessTests extends DSv2ExternalMutationTestBase { // Scenario 1: data changes via writes test(s"${testPrefix}repeated sql() reflects session write") { - withTable(testTable) { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) - - spark.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() - checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100), Row(2, 200))) + withTestSession { session => + withTestTableAndViews(session, testTable) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + checkRows(session.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) + + session.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() + checkRows(session.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100), Row(2, 200))) + } } } test(s"${testPrefix}repeated sql() reflects external write") { - withTable(testTable) { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) + withTestSession { session => + withTestTableAndViews(session, testTable) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + checkRows(session.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) - val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) + val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - 
checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100), Row(2, 200))) + checkRows(session.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100), Row(2, 200))) + } } } test(s"${testPrefix}connector w/ cache: repeated sql() stale after external write") { - withTable(cachingTestTable) { - spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() - checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) - - val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - - // Caching connector returns stale table: external write invisible - checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) - - // REFRESH TABLE invalidates the connector cache, external write becomes visible - spark.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100), Row(2, 200))) + withTestSession { session => + withTestTableAndViews(session, cachingTestTable) { + session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() + checkRows(session.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) + + val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) + + // Caching connector returns stale table: external write invisible + checkRows(session.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) + + // REFRESH TABLE invalidates the connector cache, external write becomes visible + session.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkRows(session.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100), Row(2, 200))) + } } } // Scenario 2: schema 
changes test(s"${testPrefix}repeated sql() reflects session schema change") { - withTable(testTable) { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) - - spark.sql(s"ALTER TABLE $testTable ADD COLUMN new_col INT").collect() - spark.sql(s"INSERT INTO $testTable VALUES (2, 200, -1)").collect() - checkAnswer( - spark.sql(s"SELECT * FROM $testTable"), - Seq(Row(1, 100, null), Row(2, 200, -1))) + withTestSession { session => + withTestTableAndViews(session, testTable) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + checkRows(session.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) + + session.sql(s"ALTER TABLE $testTable ADD COLUMN new_col INT").collect() + session.sql(s"INSERT INTO $testTable VALUES (2, 200, -1)").collect() + checkRows( + session.sql(s"SELECT * FROM $testTable"), + Seq(Row(1, 100, null), Row(2, 200, -1))) + } } } test(s"${testPrefix}repeated sql() reflects external schema change") { - withTable(testTable) { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) - - val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") - val addCol = TableChange.addColumn(Array("new_col"), IntegerType, true) - catalog.alterTable(testIdent, addCol) - - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) - - checkAnswer( - spark.sql(s"SELECT * FROM $testTable"), - Seq(Row(1, 100, null), Row(2, 200, -1))) + withTestSession { session => + withTestTableAndViews(session, testTable) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO 
$testTable VALUES (1, 100)").collect() + checkRows(session.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) + + val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val addCol = TableChange.addColumn(Array("new_col"), IntegerType, true) + catalog.alterTable(testIdent, addCol) + + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) + + checkRows( + session.sql(s"SELECT * FROM $testTable"), + Seq(Row(1, 100, null), Row(2, 200, -1))) + } } } test(s"${testPrefix}connector w/ cache: repeated sql() stale after external schema change") { - withTable(cachingTestTable) { - spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() - checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) - - val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") - val addCol = TableChange.addColumn(Array("new_col"), IntegerType, true) - catalog.alterTable(testIdent, addCol) - - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) - - // Caching connector returns stale table: external changes invisible - checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) - - // REFRESH TABLE invalidates the connector cache, schema change + data visible - spark.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkAnswer( - spark.sql(s"SELECT * FROM $cachingTestTable"), - Seq(Row(1, 100, null), Row(2, 200, -1))) + withTestSession { session => + withTestTableAndViews(session, cachingTestTable) { + session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() + checkRows(session.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) + + val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") + val addCol = 
TableChange.addColumn(Array("new_col"), IntegerType, true) + catalog.alterTable(testIdent, addCol) + + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) + + // Caching connector returns stale table: external changes invisible + checkRows(session.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) + + // REFRESH TABLE invalidates the connector cache, schema change + data visible + session.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkRows( + session.sql(s"SELECT * FROM $cachingTestTable"), + Seq(Row(1, 100, null), Row(2, 200, -1))) + } } } // Scenario 3: drop and recreate table test(s"${testPrefix}repeated sql() reflects session drop/recreate") { - withTable(testTable) { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) - - spark.sql(s"DROP TABLE $testTable").collect() - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq.empty) + withTestSession { session => + withTestTableAndViews(session, testTable) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + checkRows(session.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) + + session.sql(s"DROP TABLE $testTable").collect() + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + checkRows(session.sql(s"SELECT * FROM $testTable"), Seq.empty) + } } } test(s"${testPrefix}repeated sql() reflects external drop/recreate") { - withTable(testTable) { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) - - val catalog = 
getTableCatalog[InMemoryTableCatalog](spark, "testcat") - catalog.dropTable(testIdent) - catalog.createTable( - testIdent, - new TableInfo.Builder() - .withColumns(Array( - Column.create("id", IntegerType), - Column.create("salary", IntegerType))) - .build()) - - checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq.empty) + withTestSession { session => + withTestTableAndViews(session, testTable) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + checkRows(session.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) + + val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + catalog.dropTable(testIdent) + catalog.createTable( + testIdent, + new TableInfo.Builder() + .withColumns(Array( + Column.create("id", IntegerType), + Column.create("salary", IntegerType))) + .build()) + + checkRows(session.sql(s"SELECT * FROM $testTable"), Seq.empty) + } } } test(s"${testPrefix}connector w/ cache: repeated sql() stale after external drop/recreate") { - withTable(cachingTestTable) { - spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() - checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) - - val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") - catalog.dropTable(testIdent) - catalog.createTable( - testIdent, - new TableInfo.Builder() - .withColumns(Array( - Column.create("id", IntegerType), - Column.create("salary", IntegerType))) - .build()) - - // Caching connector returns stale table: drop/recreate invisible - checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) - - // REFRESH TABLE invalidates the connector cache, new empty table visible - spark.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq.empty) + 
withTestSession { session => + withTestTableAndViews(session, cachingTestTable) { + session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() + checkRows(session.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) + + val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") + catalog.dropTable(testIdent) + catalog.createTable( + testIdent, + new TableInfo.Builder() + .withColumns(Array( + Column.create("id", IntegerType), + Column.create("salary", IntegerType))) + .build()) + + // Caching connector returns stale table: drop/recreate invisible + checkRows(session.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) + + // REFRESH TABLE invalidates the connector cache, new empty table visible + session.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkRows(session.sql(s"SELECT * FROM $cachingTestTable"), Seq.empty) + } } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2TempViewWithStoredPlanTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2TempViewWithStoredPlanTests.scala index e473968794c37..9f8a93e30550f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2TempViewWithStoredPlanTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2TempViewWithStoredPlanTests.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.types.{IntegerType, LongType, StringType} * backed by DSv2 tables correctly handle data changes, schema changes, and table recreation, * both via session SQL and external catalog mutations. * - * NOTE: All `spark.sql(...)` calls append `.collect()` because Connect client DataFrames + * NOTE: All `session.sql(...)` calls append `.collect()` because Connect client DataFrames * are lazy and require an action to trigger execution. In classic mode `.collect()` on DDL * is a no-op (DDL executes eagerly), so this is harmless. 
*/ @@ -35,143 +35,143 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 1.1 (session write) test(s"${testPrefix}temp view with stored plan reflects session write") { - withTable(testTable) { - withView("v") { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTestSession { session => + withTestTableAndViews(session, testTable, Seq("v")) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkRows(session.table("v"), Seq(Row(1, 100))) - spark.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() - checkAnswer(spark.table("v"), Seq(Row(1, 100), Row(2, 200))) + session.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() + checkRows(session.table("v"), Seq(Row(1, 100), Row(2, 200))) } } } // Scenario 1.2 (external write) test(s"${testPrefix}temp view with stored plan reflects external write") { - withTable(testTable) { - withView("v") { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTestSession { session => + withTestTableAndViews(session, testTable, Seq("v")) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkRows(session.table("v"), Seq(Row(1, 100))) - val 
catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - checkAnswer(spark.table("v"), Seq(Row(1, 100), Row(2, 200))) + checkRows(session.table("v"), Seq(Row(1, 100), Row(2, 200))) } } } // Scenario 1.2 connector w/ cache (external write, caching connector) test(s"${testPrefix}connector w/ cache: temp view stale after external write") { - withTable(cachingTestTable) { - withView("v") { - spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() + withTestSession { session => + withTestTableAndViews(session, cachingTestTable, Seq("v")) { + session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() - spark.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + session.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") + checkRows(session.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") + val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) // Caching connector returns stale table: external write invisible - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + checkRows(session.table("v"), Seq(Row(1, 100))) // REFRESH TABLE invalidates the connector cache, external write becomes visible - spark.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkAnswer(spark.table("v"), Seq(Row(1, 100), Row(2, 200))) + session.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkRows(session.table("v"), Seq(Row(1, 100), Row(2, 
200))) } } } // Scenario 2.1 (session ADD COLUMN) test(s"${testPrefix}temp view with stored plan preserves schema after session ADD COLUMN") { - withTable(testTable) { - withView("v") { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTestSession { session => + withTestTableAndViews(session, testTable, Seq("v")) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkRows(session.table("v"), Seq(Row(1, 100))) - spark.sql(s"ALTER TABLE $testTable ADD COLUMN new_column INT").collect() - spark.sql(s"INSERT INTO $testTable VALUES (2, 200, -1)").collect() + session.sql(s"ALTER TABLE $testTable ADD COLUMN new_column INT").collect() + session.sql(s"INSERT INTO $testTable VALUES (2, 200, -1)").collect() // view preserves original 2-column schema, filter still applied - checkAnswer(spark.table("v"), Seq(Row(1, 100), Row(2, 200))) + checkRows(session.table("v"), Seq(Row(1, 100), Row(2, 200))) } } } // Scenario 2.2 (external ADD COLUMN) test(s"${testPrefix}temp view with stored plan preserves schema after external ADD COLUMN") { - withTable(testTable) { - withView("v") { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTestSession { session => + withTestTableAndViews(session, testTable, Seq("v")) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - 
checkAnswer(spark.table("v"), Seq(Row(1, 100))) + session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkRows(session.table("v"), Seq(Row(1, 100))) // external schema change via catalog API - val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") val addCol = TableChange.addColumn(Array("new_column"), IntegerType, true) catalog.alterTable(testIdent, addCol) externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) // view preserves original 2-column schema, filter still applied - checkAnswer(spark.table("v"), Seq(Row(1, 100), Row(2, 200))) + checkRows(session.table("v"), Seq(Row(1, 100), Row(2, 200))) } } } // Scenario 2.2 connector w/ cache (external ADD COLUMN, caching connector) test(s"${testPrefix}connector w/ cache: temp view stale after external ADD COLUMN") { - withTable(cachingTestTable) { - withView("v") { - spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() + withTestSession { session => + withTestTableAndViews(session, cachingTestTable, Seq("v")) { + session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() - spark.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + session.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") + checkRows(session.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") + val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") val addCol = TableChange.addColumn(Array("new_column"), IntegerType, true) catalog.alterTable(testIdent, addCol) externalAppend(catalog = catalog, 
ident = testIdent, row = InternalRow(2, 200, -1)) // Caching connector returns stale table: external changes invisible - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + checkRows(session.table("v"), Seq(Row(1, 100))) // REFRESH TABLE invalidates the connector cache, view preserves original 2-column schema - spark.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkAnswer(spark.table("v"), Seq(Row(1, 100), Row(2, 200))) + session.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkRows(session.table("v"), Seq(Row(1, 100), Row(2, 200))) } } } // Scenario 3.1 (session column removal) test(s"${testPrefix}temp view with stored plan detects session column removal") { - withTable(testTable) { - withView("v") { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTestSession { session => + withTestTableAndViews(session, testTable, Seq("v")) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkRows(session.table("v"), Seq(Row(1, 100))) - spark.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() + session.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() checkError( - exception = intercept[AnalysisException] { spark.table("v").collect() }, + exception = intercept[AnalysisException] { session.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -184,20 +184,20 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 3.2 (external column removal) test(s"${testPrefix}temp view with stored plan detects 
external column removal") { - withTable(testTable) { - withView("v") { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTestSession { session => + withTestTableAndViews(session, testTable, Seq("v")) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkRows(session.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") val dropCol = TableChange.deleteColumn(Array("salary"), false) catalog.alterTable(testIdent, dropCol) checkError( - exception = intercept[AnalysisException] { spark.table("v").collect() }, + exception = intercept[AnalysisException] { session.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -210,25 +210,25 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 3.2 connector w/ cache (external column removal, caching connector) test(s"${testPrefix}connector w/ cache: temp view stale after external column removal") { - withTable(cachingTestTable) { - withView("v") { - spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() + withTestSession { session => + withTestTableAndViews(session, cachingTestTable, Seq("v")) { + session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), 
(10, 1000)").collect() - spark.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + session.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") + checkRows(session.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") + val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") val dropCol = TableChange.deleteColumn(Array("salary"), false) catalog.alterTable(testIdent, dropCol) // Caching connector returns stale table: column removal invisible, no error - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + checkRows(session.table("v"), Seq(Row(1, 100))) // REFRESH TABLE invalidates the connector cache, column removal detected - spark.sql(s"REFRESH TABLE $cachingTestTable").collect() + session.sql(s"REFRESH TABLE $cachingTestTable").collect() checkError( - exception = intercept[AnalysisException] { spark.table("v").collect() }, + exception = intercept[AnalysisException] { session.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -241,43 +241,43 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 4.1 (session drop and recreate table) test(s"${testPrefix}temp view with stored plan resolves to session-recreated table") { - withTable(testTable) { - withView("v") { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTestSession { session => + withTestTableAndViews(session, testTable, Seq("v")) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - 
checkAnswer(spark.table("v"), Seq(Row(1, 100))) + session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkRows(session.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") val originalTableId = catalog.loadTable(testIdent).id - spark.sql(s"DROP TABLE $testTable").collect() - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"DROP TABLE $testTable").collect() + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() val newTableId = catalog.loadTable(testIdent).id assert(originalTableId != newTableId) // view resolves to the new empty table - checkAnswer(spark.table("v"), Seq.empty) + checkRows(session.table("v"), Seq.empty) - spark.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() - checkAnswer(spark.table("v"), Seq(Row(2, 200))) + session.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() + checkRows(session.table("v"), Seq(Row(2, 200))) } } } // Scenario 4.2 (external drop and recreate table) test(s"${testPrefix}temp view with stored plan resolves to externally recreated table") { - withTable(testTable) { - withView("v") { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTestSession { session => + withTestTableAndViews(session, testTable, Seq("v")) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkRows(session.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[InMemoryTableCatalog](spark, 
"testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") val originalTableId = catalog.loadTable(testIdent).id catalog.dropTable(testIdent) @@ -293,25 +293,25 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { assert(originalTableId != newTableId) // view resolves to the new empty table - checkAnswer(spark.table("v"), Seq.empty) + checkRows(session.table("v"), Seq.empty) - spark.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() - checkAnswer(spark.table("v"), Seq(Row(2, 200))) + session.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() + checkRows(session.table("v"), Seq(Row(2, 200))) } } } // Scenario 4.2 connector w/ cache (external drop/recreate, caching connector) test(s"${testPrefix}connector w/ cache: temp view stale after external drop/recreate") { - withTable(cachingTestTable) { - withView("v") { - spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() + withTestSession { session => + withTestTableAndViews(session, cachingTestTable, Seq("v")) { + session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() - spark.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + session.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") + checkRows(session.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") + val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") catalog.dropTable(testIdent) catalog.createTable( testIdent, @@ -322,11 +322,11 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { .build()) // Caching connector returns stale table: drop/recreate 
invisible - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + checkRows(session.table("v"), Seq(Row(1, 100))) // REFRESH TABLE invalidates the connector cache, view resolves to new empty table - spark.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkAnswer(spark.table("v"), Seq.empty) + session.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkRows(session.table("v"), Seq.empty) } } } @@ -334,29 +334,29 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 5.1 (session drop and re-add column with same type, multiple views) test(s"${testPrefix}temp view with stored plan after session drop and re-add column same type" + " with unfiltered view") { - withTable(testTable) { - withView("v", "v_no_filter", "v_filter_is_null") { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - - spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - spark.table(testTable).createOrReplaceTempView("v_no_filter") - spark.table(testTable).filter("salary IS NULL") + withTestSession { session => + withTestTableAndViews(session, testTable, Seq("v", "v_no_filter", "v_filter_is_null")) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + + session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + session.table(testTable).createOrReplaceTempView("v_no_filter") + session.table(testTable).filter("salary IS NULL") .createOrReplaceTempView("v_filter_is_null") - checkAnswer(spark.table("v"), Seq(Row(1, 100))) - checkAnswer(spark.table("v_no_filter"), Seq(Row(1, 100), Row(10, 1000))) - checkAnswer(spark.table("v_filter_is_null"), Seq.empty) + checkRows(session.table("v"), Seq(Row(1, 100))) + checkRows(session.table("v_no_filter"), Seq(Row(1, 100), Row(10, 1000))) + 
checkRows(session.table("v_filter_is_null"), Seq.empty) // drop and re-add column with same name and type - spark.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() - spark.sql(s"ALTER TABLE $testTable ADD COLUMN salary INT").collect() + session.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() + session.sql(s"ALTER TABLE $testTable ADD COLUMN salary INT").collect() // salary values are now null, so the filtered view returns nothing - checkAnswer(spark.table("v"), Seq.empty) + checkRows(session.table("v"), Seq.empty) // unfiltered view returns rows with null salary - checkAnswer(spark.table("v_no_filter"), Seq(Row(1, null), Row(10, null))) + checkRows(session.table("v_no_filter"), Seq(Row(1, null), Row(10, null))) // IS NULL filter now matches all rows - checkAnswer(spark.table("v_filter_is_null"), Seq(Row(1, null), Row(10, null))) + checkRows(session.table("v_filter_is_null"), Seq(Row(1, null), Row(10, null))) } } } @@ -364,31 +364,31 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 5.2 (external drop and re-add column with same type) test(s"${testPrefix}temp view with stored plan after external drop and re-add column " + "same type") { - withTable(testTable) { - withView("v", "v_no_filter", "v_filter_is_null") { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - - spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - spark.table(testTable).createOrReplaceTempView("v_no_filter") - spark.table(testTable).filter("salary IS NULL") + withTestSession { session => + withTestTableAndViews(session, testTable, Seq("v", "v_no_filter", "v_filter_is_null")) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + + session.table(testTable).filter("salary < 
999").createOrReplaceTempView("v") + session.table(testTable).createOrReplaceTempView("v_no_filter") + session.table(testTable).filter("salary IS NULL") .createOrReplaceTempView("v_filter_is_null") - checkAnswer(spark.table("v"), Seq(Row(1, 100))) - checkAnswer(spark.table("v_no_filter"), Seq(Row(1, 100), Row(10, 1000))) - checkAnswer(spark.table("v_filter_is_null"), Seq.empty) + checkRows(session.table("v"), Seq(Row(1, 100))) + checkRows(session.table("v_no_filter"), Seq(Row(1, 100), Row(10, 1000))) + checkRows(session.table("v_filter_is_null"), Seq.empty) // external drop and re-add column via catalog API - val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") val dropCol = TableChange.deleteColumn(Array("salary"), false) val addCol = TableChange.addColumn(Array("salary"), IntegerType, true) catalog.alterTable(testIdent, dropCol, addCol) // salary values are now null, so the filtered view returns nothing - checkAnswer(spark.table("v"), Seq.empty) + checkRows(session.table("v"), Seq.empty) // unfiltered view returns rows with null salary - checkAnswer(spark.table("v_no_filter"), Seq(Row(1, null), Row(10, null))) + checkRows(session.table("v_no_filter"), Seq(Row(1, null), Row(10, null))) // IS NULL filter now matches all rows - checkAnswer(spark.table("v_filter_is_null"), Seq(Row(1, null), Row(10, null))) + checkRows(session.table("v_filter_is_null"), Seq(Row(1, null), Row(10, null))) } } } @@ -396,44 +396,44 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 5.2 connector w/ cache (external drop/re-add column, caching connector) test(s"${testPrefix}connector w/ cache: temp view stale after external drop/re-add column " + "same type") { - withTable(cachingTestTable) { - withView("v") { - spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 
1000)").collect() + withTestSession { session => + withTestTableAndViews(session, cachingTestTable, Seq("v")) { + session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() - spark.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + session.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") + checkRows(session.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") + val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") val dropCol = TableChange.deleteColumn(Array("salary"), false) val addCol = TableChange.addColumn(Array("salary"), IntegerType, true) catalog.alterTable(testIdent, dropCol, addCol) // Caching connector returns stale table: column drop/re-add invisible - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + checkRows(session.table("v"), Seq(Row(1, 100))) // REFRESH TABLE invalidates the connector cache, salary values are null - spark.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkAnswer(spark.table("v"), Seq.empty) + session.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkRows(session.table("v"), Seq.empty) } } } // Scenario 6.1 (session drop and re-add column with different type) test(s"${testPrefix}temp view with stored plan detects session column type change") { - withTable(testTable) { - withView("v") { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTestSession { session => + withTestTableAndViews(session, testTable, Seq("v")) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - 
spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkRows(session.table("v"), Seq(Row(1, 100))) - spark.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() - spark.sql(s"ALTER TABLE $testTable ADD COLUMN salary STRING").collect() + session.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() + session.sql(s"ALTER TABLE $testTable ADD COLUMN salary STRING").collect() checkError( - exception = intercept[AnalysisException] { spark.table("v").collect() }, + exception = intercept[AnalysisException] { session.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -446,21 +446,21 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 6.2 (external drop and re-add column with different type) test(s"${testPrefix}temp view with stored plan detects external column type change") { - withTable(testTable) { - withView("v") { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTestSession { session => + withTestTableAndViews(session, testTable, Seq("v")) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkRows(session.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") val dropCol = TableChange.deleteColumn(Array("salary"), false) val addCol 
= TableChange.addColumn(Array("salary"), StringType, true) catalog.alterTable(testIdent, dropCol, addCol) checkError( - exception = intercept[AnalysisException] { spark.table("v").collect() }, + exception = intercept[AnalysisException] { session.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -473,26 +473,26 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 6.2 connector w/ cache (external column type change, caching connector) test(s"${testPrefix}connector w/ cache: temp view stale after external column type change") { - withTable(cachingTestTable) { - withView("v") { - spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() + withTestSession { session => + withTestTableAndViews(session, cachingTestTable, Seq("v")) { + session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() - spark.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + session.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") + checkRows(session.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") + val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") val dropCol = TableChange.deleteColumn(Array("salary"), false) val addCol = TableChange.addColumn(Array("salary"), StringType, true) catalog.alterTable(testIdent, dropCol, addCol) // Caching connector returns stale table: type change invisible, no error - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + checkRows(session.table("v"), Seq(Row(1, 100))) // REFRESH TABLE invalidates the connector cache, type 
change detected - spark.sql(s"REFRESH TABLE $cachingTestTable").collect() + session.sql(s"REFRESH TABLE $cachingTestTable").collect() checkError( - exception = intercept[AnalysisException] { spark.table("v").collect() }, + exception = intercept[AnalysisException] { session.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -505,18 +505,18 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 7.1 (session type widening from INT to BIGINT) test(s"${testPrefix}temp view with stored plan detects session type widening") { - withTable(testTable) { - withView("v") { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTestSession { session => + withTestTableAndViews(session, testTable, Seq("v")) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkRows(session.table("v"), Seq(Row(1, 100))) - spark.sql(s"ALTER TABLE $testTable ALTER COLUMN salary TYPE LONG").collect() + session.sql(s"ALTER TABLE $testTable ALTER COLUMN salary TYPE LONG").collect() checkError( - exception = intercept[AnalysisException] { spark.table("v").collect() }, + exception = intercept[AnalysisException] { session.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -529,20 +529,20 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 7.2 (external type widening from INT to BIGINT) test(s"${testPrefix}temp view with stored plan 
detects external type widening") { - withTable(testTable) { - withView("v") { - spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTestSession { session => + withTestTableAndViews(session, testTable, Seq("v")) { + session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkRows(session.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") val updateType = TableChange.updateColumnType(Array("salary"), LongType) catalog.alterTable(testIdent, updateType) checkError( - exception = intercept[AnalysisException] { spark.table("v").collect() }, + exception = intercept[AnalysisException] { session.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -555,25 +555,25 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 7.2 connector w/ cache (external type widening, caching connector) test(s"${testPrefix}connector w/ cache: temp view stale after external type widening") { - withTable(cachingTestTable) { - withView("v") { - spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() + withTestSession { session => + withTestTableAndViews(session, cachingTestTable, Seq("v")) { + session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + session.sql(s"INSERT INTO $cachingTestTable 
VALUES (1, 100), (10, 1000)").collect() - spark.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + session.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") + checkRows(session.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") + val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") val updateType = TableChange.updateColumnType(Array("salary"), LongType) catalog.alterTable(testIdent, updateType) // Caching connector returns stale table: type change invisible, no error - checkAnswer(spark.table("v"), Seq(Row(1, 100))) + checkRows(session.table("v"), Seq(Row(1, 100))) // REFRESH TABLE invalidates the connector cache, type change detected - spark.sql(s"REFRESH TABLE $cachingTestTable").collect() + session.sql(s"REFRESH TABLE $cachingTestTable").collect() checkError( - exception = intercept[AnalysisException] { spark.table("v").collect() }, + exception = intercept[AnalysisException] { session.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala index 4d4b96406bd28..71632e07c78b7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala @@ -24,7 +24,7 @@ import scala.jdk.CollectionConverters._ import scala.reflect.ClassTag import org.apache.spark.{SparkConf, SparkException} -import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SaveMode, SessionQueryTest, SparkSession} +import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SaveMode, SparkSession} import 
org.apache.spark.sql.QueryTest.withQueryExecutionsCaptured import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException import org.apache.spark.sql.catalyst.plans.logical.{AppendData, CreateTableAsSelect, LogicalPlan, ReplaceTableAsSelect} @@ -47,7 +47,6 @@ import org.apache.spark.unsafe.types.UTF8String class DataSourceV2DataFrameSuite extends InsertIntoTests(supportsDynamicOverwrite = true, includeSQLOnlyTests = false) - with SessionQueryTest with DSv2TempViewWithStoredPlanTests with DSv2RepeatedTableAccessTests with DSv2IncrementallyConstructedQueryTests @@ -98,6 +97,12 @@ class DataSourceV2DataFrameSuite // DSv2ExternalMutationTestBase implementations for classic mode override protected def testPrefix: String = "" + override protected def isConnect: Boolean = false + + override protected def withTestSession(fn: SparkSession => Unit): Unit = fn(spark) + + override protected def checkRows(df: => DataFrame, expected: Seq[Row]): Unit = + checkAnswer(df, expected) override protected def getTableCatalog[C <: TableCatalog: ClassTag]( session: SparkSession, @@ -110,6 +115,16 @@ class DataSourceV2DataFrameSuite c.asInstanceOf[C] } + override protected def withTestTableAndViews( + session: SparkSession, + table: String, + views: Seq[String] = Seq.empty)(fn: => Unit): Unit = { + withTable(table) { + try { fn } + finally { views.foreach(v => session.sql(s"DROP VIEW IF EXISTS $v")) } + } + } + override def verifyTable(tableName: String, expected: DataFrame): Unit = { checkAnswer(spark.table(tableName), expected) } From ea62c553484b9e293677a7516d8ed0ed3e2d2ba4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthis=20G=C3=B6rdel?= Date: Tue, 23 Jun 2026 17:21:36 +0000 Subject: [PATCH 58/58] refactor some DSv2 classic/connect tests to use SessionQueryTest This reverts commit 176a528b305764ed34fa5f39272c917be2f11781. 
--- .../DataSourceV2DataFrameConnectSuite.scala | 46 +- .../connector/DSv2CacheTableReadTests.scala | 322 +++++++------- .../DSv2ExternalMutationTestBase.scala | 21 +- ...v2IncrementallyConstructedQueryTests.scala | 223 +++++----- .../DSv2RepeatedTableAccessTests.scala | 252 +++++------ .../DSv2TempViewWithStoredPlanTests.scala | 412 +++++++++--------- .../DataSourceV2DataFrameSuite.scala | 19 +- 7 files changed, 594 insertions(+), 701 deletions(-) diff --git a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala index 1a31e5f8ac1a3..d3294f8af3b4a 100644 --- a/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala +++ b/sql/connect/server/src/test/scala/org/apache/spark/sql/connect/DataSourceV2DataFrameConnectSuite.scala @@ -20,7 +20,8 @@ package org.apache.spark.sql.connect import scala.reflect.ClassTag import org.apache.spark.SparkConf -import org.apache.spark.sql.{DataFrame, QueryTest, Row, SparkSession} +import org.apache.spark.sql.{classic, connect, SparkSession} +import org.apache.spark.sql.connect.service.{SessionKey, SparkConnectService} import org.apache.spark.sql.connector.{DSv2CacheTableReadTests, DSv2IncrementallyConstructedQueryTests, DSv2RepeatedTableAccessTests, DSv2TempViewWithStoredPlanTests} import org.apache.spark.sql.connector.catalog.{CachingInMemoryTableCatalog, InMemoryTableCatalog, NullTableIdAndNullColumnIdInMemoryTableCatalog, NullTableIdInMemoryTableCatalog, TableCatalog} @@ -34,7 +35,7 @@ import org.apache.spark.sql.connector.catalog.{CachingInMemoryTableCatalog, InMe * this class only provides the Connect-specific session, catalog access, and result comparison. 
*/ class DataSourceV2DataFrameConnectSuite - extends SparkConnectServerTest + extends SessionQueryTest with DSv2TempViewWithStoredPlanTests with DSv2RepeatedTableAccessTests with DSv2IncrementallyConstructedQueryTests @@ -53,45 +54,26 @@ class DataSourceV2DataFrameConnectSuite .set("spark.sql.catalog.nullbothidscat.copyOnLoad", "true") override protected def testPrefix: String = "[connect] " - override protected def isConnect: Boolean = true - override protected def withTestSession(fn: SparkSession => Unit): Unit = - withSession(fn) - - // Cannot use QueryTest.checkAnswer directly because it accesses df.logicalPlan, - // df.queryExecution, and df.materializedRdd, which are not available on Connect *client* - // DataFrames (they throw ConnectClientUnsupportedErrors). Note: checkAnswer IS usable from - // Connect server tests that operate on classic server-side DataFrames, but in this suite - // `df` is a Connect client DataFrame returned by session.table() / session.sql(). - // Instead, collect the rows and delegate to QueryTest.sameRows, which is the same - // value-based, order-agnostic comparison that checkAnswer uses internally. 
- override protected def checkRows(df: => DataFrame, expected: Seq[Row]): Unit = - QueryTest.sameRows(expected, df.collect().toSeq).foreach(msg => fail(msg)) + protected def getServerSession(clientSession: SparkSession): classic.SparkSession = { + val connectSession = clientSession.asInstanceOf[connect.SparkSession] + val userId = connectSession.client.userId + val sessionId = connectSession.sessionId + val key = SessionKey(userId, sessionId) + SparkConnectService.sessionManager + .getIsolatedSessionIfPresent(key) + .get + .session + } override protected def getTableCatalog[C <: TableCatalog: ClassTag]( session: SparkSession, catalogName: String): C = { - val serverSession = getServerSession(session) - val catalog = serverSession.sessionState.catalogManager.catalog(catalogName) + val catalog = getServerSession(session).sessionState.catalogManager.catalog(catalogName) val ct = implicitly[ClassTag[C]] require( ct.runtimeClass.isInstance(catalog), s"Expected ${ct.runtimeClass.getName} but got ${catalog.getClass.getName}") catalog.asInstanceOf[C] } - - // No explicit clearCache() for cachingcat is needed here, unlike the classic suite. - // Each withSession call creates a freshly isolated SparkSession on the server side - // (via SparkConnectSessionManager.newIsolatedSession), and afterEach invalidates all - // sessions, so the CachingInMemoryTableCatalog instance is per-test. 
- override protected def withTestTableAndViews( - session: SparkSession, - table: String, - views: Seq[String] = Seq.empty)(fn: => Unit): Unit = { - try { fn } - finally { - views.foreach(v => session.sql(s"DROP VIEW IF EXISTS $v").collect()) - session.sql(s"DROP TABLE IF EXISTS $table").collect() - } - } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2CacheTableReadTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2CacheTableReadTests.scala index ac6ffcc6ecc0d..79c101d524a07 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2CacheTableReadTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2CacheTableReadTests.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.connector -import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.connector.catalog.{CachingInMemoryTableCatalog, Column, InMemoryTableCatalog, TableChange, TableInfo} import org.apache.spark.sql.types.IntegerType @@ -49,223 +49,209 @@ import org.apache.spark.sql.types.IntegerType * (via the CacheManager), making a session drop+recreate scenario trivially different from * the external variant. * - * NOTE: All `session.sql(...)` calls append `.collect()` because Connect client DataFrames + * NOTE: All `spark.sql(...)` calls append `.collect()` because Connect client DataFrames * are lazy and require an action to trigger execution. In classic mode `.collect()` on * DDL / DML is a no-op (these execute eagerly), so this is harmless. 
*/ trait DSv2CacheTableReadTests extends DSv2ExternalMutationTestBase { - private def assertTableCached(session: SparkSession, tableName: String): Unit = - assert(session.catalog.isCached(tableName)) + private def assertTableCached(tableName: String): Unit = + assert(spark.catalog.isCached(tableName)) test(s"${testPrefix}SPARK-54022: cached table pinned against external data write") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - session.table(testTable).cache() - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100))) + spark.table(testTable).cache() + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100))) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100))) + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100))) - session.sql(s"REFRESH TABLE $testTable").collect() - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100), Row(2, 200))) - } + spark.sql(s"REFRESH TABLE $testTable").collect() + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100), Row(2, 200))) } } test(s"${testPrefix}SPARK-54022: connector w/ cache: cached table pinned, " + "REFRESH clears both layers") { - withTestSession { session => - 
withTestTableAndViews(session, cachingTestTable) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() - - session.table(cachingTestTable).cache() - assertTableCached(session, cachingTestTable) - checkRows(session.table(cachingTestTable), Seq(Row(1, 100))) - - val catalog = - getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - - // Both CacheManager and connector cache are stale: external write invisible - assertTableCached(session, cachingTestTable) - checkRows(session.table(cachingTestTable), Seq(Row(1, 100))) - - // REFRESH TABLE calls invalidateTable (clears connector cache) and rebuilds - // the CacheManager entry, so the external write becomes visible. - session.sql(s"REFRESH TABLE $cachingTestTable").collect() - assertTableCached(session, cachingTestTable) - checkRows(session.table(cachingTestTable), Seq(Row(1, 100), Row(2, 200))) - } + withTable(cachingTestTable) { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() + + spark.table(cachingTestTable).cache() + assertTableCached(cachingTestTable) + checkAnswer(spark.table(cachingTestTable), Seq(Row(1, 100))) + + val catalog = + getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) + + // Both CacheManager and connector cache are stale: external write invisible + assertTableCached(cachingTestTable) + checkAnswer(spark.table(cachingTestTable), Seq(Row(1, 100))) + + // REFRESH TABLE calls invalidateTable (clears connector cache) and rebuilds + // the CacheManager entry, so the external write becomes visible. 
+ spark.sql(s"REFRESH TABLE $cachingTestTable").collect() + assertTableCached(cachingTestTable) + checkAnswer(spark.table(cachingTestTable), Seq(Row(1, 100), Row(2, 200))) } } test(s"${testPrefix}SPARK-54022: session write invalidates cache, " + "then external write invisible") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - session.table(testTable).cache() - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100))) + spark.table(testTable).cache() + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100))) - session.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100), Row(2, 200))) + spark.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100), Row(2, 200))) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(3, 300)) + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(3, 300)) - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100), Row(2, 200))) + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100), Row(2, 200))) - session.sql(s"REFRESH TABLE $testTable").collect() - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100), Row(2, 200), Row(3, 300))) - } + spark.sql(s"REFRESH TABLE 
$testTable").collect() + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100), Row(2, 200), Row(3, 300))) } } test(s"${testPrefix}SPARK-54022: cached table pinned against external schema change") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - - session.table(testTable).cache() - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100))) - - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") - val addCol = TableChange.addColumn(Array("new_column"), IntegerType, true) - catalog.alterTable(testIdent, addCol) - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) - - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100))) - - session.sql(s"REFRESH TABLE $testTable").collect() - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100, null), Row(2, 200, -1))) - } + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + + spark.table(testTable).cache() + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100))) + + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + val addCol = TableChange.addColumn(Array("new_column"), IntegerType, true) + catalog.alterTable(testIdent, addCol) + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) + + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100))) + + spark.sql(s"REFRESH TABLE $testTable").collect() + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100, null), Row(2, 200, -1))) } } 
test(s"${testPrefix}SPARK-54022: session schema change invalidates cache, " + "external write invisible") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - session.table(testTable).cache() - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100))) + spark.table(testTable).cache() + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100))) - session.sql(s"ALTER TABLE $testTable ADD COLUMN new_column INT").collect() - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100, null))) + spark.sql(s"ALTER TABLE $testTable ADD COLUMN new_column INT").collect() + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100, null))) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100, null))) + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100, null))) - session.sql(s"REFRESH TABLE $testTable").collect() - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100, null), Row(2, 200, -1))) - } + spark.sql(s"REFRESH TABLE $testTable").collect() + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100, null), Row(2, 200, -1))) } } test(s"${testPrefix}SPARK-54022: cached 
table after external drop and " + "recreate sees empty table") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - - session.table(testTable).cache() - assertTableCached(session, testTable) - checkRows(session.table(testTable), Seq(Row(1, 100))) - - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") - val originalTableId = catalog.loadTable(testIdent).id - - catalog.dropTable(testIdent) - catalog.createTable( - testIdent, - new TableInfo.Builder() - .withColumns(Array( - Column.create("id", IntegerType), - Column.create("salary", IntegerType))) - .build()) - - val newTableId = catalog.loadTable(testIdent).id - assert(originalTableId != newTableId) - - val result = session.table(testTable) - assert(result.schema.fieldNames.toSeq == Seq("id", "salary")) - checkRows(result, Seq.empty) - - // External drop+recreate produces a new table identity, so the prior cache entry - // is unreachable via name lookup (unlike external write/schema change where the - // cache stays pinned). 
- assert(!session.catalog.isCached(testTable)) - - session.sql(s"REFRESH TABLE $testTable").collect() - checkRows(session.table(testTable), Seq.empty) - } + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + + spark.table(testTable).cache() + assertTableCached(testTable) + checkAnswer(spark.table(testTable), Seq(Row(1, 100))) + + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + val originalTableId = catalog.loadTable(testIdent).id + + catalog.dropTable(testIdent) + catalog.createTable( + testIdent, + new TableInfo.Builder() + .withColumns(Array( + Column.create("id", IntegerType), + Column.create("salary", IntegerType))) + .build()) + + val newTableId = catalog.loadTable(testIdent).id + assert(originalTableId != newTableId) + + val result = spark.table(testTable) + assert(result.schema.fieldNames.toSeq == Seq("id", "salary")) + checkAnswer(result, Seq.empty) + + // External drop+recreate produces a new table identity, so the prior cache entry + // is unreachable via name lookup (unlike external write/schema change where the + // cache stays pinned). 
+ assert(!spark.catalog.isCached(testTable)) + + spark.sql(s"REFRESH TABLE $testTable").collect() + checkAnswer(spark.table(testTable), Seq.empty) } } test(s"${testPrefix}SPARK-54022: connector w/ cache: cached table stale after " + "external drop and recreate") { - withTestSession { session => - withTestTableAndViews(session, cachingTestTable) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() - - session.table(cachingTestTable).cache() - assertTableCached(session, cachingTestTable) - checkRows(session.table(cachingTestTable), Seq(Row(1, 100))) - - val catalog = - getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") - val originalTableId = catalog.loadTable(testIdent).id - - catalog.dropTable(testIdent) - catalog.createTable( - testIdent, - new TableInfo.Builder() - .withColumns(Array( - Column.create("id", IntegerType), - Column.create("salary", IntegerType))) - .build()) - - // CachingInMemoryTableCatalog does not invalidate on drop/create, so loadTable - // still returns the old cached table object. CacheManager still matches and - // serves the stale cached data. - assertTableCached(session, cachingTestTable) - checkRows(session.table(cachingTestTable), Seq(Row(1, 100))) - - // REFRESH TABLE calls invalidateTable (clears connector cache) and rebuilds - // the CacheManager entry, so the new empty table becomes visible. 
- session.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkRows(session.table(cachingTestTable), Seq.empty) - } + withTable(cachingTestTable) { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() + + spark.table(cachingTestTable).cache() + assertTableCached(cachingTestTable) + checkAnswer(spark.table(cachingTestTable), Seq(Row(1, 100))) + + val catalog = + getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") + val originalTableId = catalog.loadTable(testIdent).id + + catalog.dropTable(testIdent) + catalog.createTable( + testIdent, + new TableInfo.Builder() + .withColumns(Array( + Column.create("id", IntegerType), + Column.create("salary", IntegerType))) + .build()) + + // CachingInMemoryTableCatalog does not invalidate on drop/create, so loadTable + // still returns the old cached table object. CacheManager still matches and + // serves the stale cached data. + assertTableCached(cachingTestTable) + checkAnswer(spark.table(cachingTestTable), Seq(Row(1, 100))) + + // REFRESH TABLE calls invalidateTable (clears connector cache) and rebuilds + // the CacheManager entry, so the new empty table becomes visible. 
+ spark.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkAnswer(spark.table(cachingTestTable), Seq.empty) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala index 0b2a50534447c..73c69f8a9de41 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2ExternalMutationTestBase.scala @@ -21,7 +21,7 @@ import java.util import scala.reflect.ClassTag -import org.apache.spark.sql.{DataFrame, QueryTest, Row, SparkSession} +import org.apache.spark.sql.{SessionQueryTestBase, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.connector.catalog.{BufferedRows, CatalogV2Util, Identifier, InMemoryBaseTable, TableCatalog, TableWritePrivilege} @@ -37,7 +37,7 @@ import org.apache.spark.sql.connector.catalog.{BufferedRows, CatalogV2Util, Iden * [[DSv2TempViewWithStoredPlanTests]], [[DSv2RepeatedTableAccessTests]], * [[DSv2IncrementallyConstructedQueryTests]], or [[DSv2CacheTableReadTests]]. */ -trait DSv2ExternalMutationTestBase extends QueryTest { +trait DSv2ExternalMutationTestBase extends SessionQueryTestBase { /** Fully qualified table name under the non-caching test catalog. */ protected val testTable: String = "testcat.ns1.ns2.tbl" @@ -51,17 +51,6 @@ trait DSv2ExternalMutationTestBase extends QueryTest { /** Prefix for test names, e.g. "" or "[connect] ". */ protected def testPrefix: String - /** Whether this suite runs under Spark Connect. */ - protected def isConnect: Boolean - - /** Execute a test body with a session. */ - protected def withTestSession(fn: SparkSession => Unit): Unit - - /** - * Assert that a DataFrame's rows match the expected rows (order-agnostic). 
- */ - protected def checkRows(df: => DataFrame, expected: Seq[Row]): Unit - /** * Get a [[TableCatalog]] by name from the underlying session. */ @@ -69,12 +58,6 @@ trait DSv2ExternalMutationTestBase extends QueryTest { session: SparkSession, catalogName: String): C - /** Cleanup wrapper: drop views and the table after the test body, even on failure. */ - protected def withTestTableAndViews( - session: SparkSession, - table: String, - views: Seq[String] = Seq.empty)(fn: => Unit): Unit - /** Appends a row to a DSv2 table via the catalog API, bypassing the session. */ protected def externalAppend( catalog: TableCatalog, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2IncrementallyConstructedQueryTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2IncrementallyConstructedQueryTests.scala index 1dbaad18e3e71..a6de3a0139452 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2IncrementallyConstructedQueryTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2IncrementallyConstructedQueryTests.scala @@ -32,7 +32,7 @@ import org.apache.spark.unsafe.types.UTF8String * mode, resolution is deferred until execution, so both sides of a join always see the * latest table state. * - * NOTE: All `session.sql(...)` calls append `.collect()` because Connect client DataFrames + * NOTE: All `spark.sql(...)` calls append `.collect()` because Connect client DataFrames * are lazy and require an action to trigger execution. In classic mode `.collect()` on * eager statements (DDL, INSERT) is a no-op, so this is harmless. 
*/ @@ -45,44 +45,40 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas test(s"${testPrefix}SPARK-54157: join refreshes both sides after external insert" + " (table with both table and column ID support)") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = session.table(testTable) + val df1 = spark.table(testTable) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - val df2 = session.table(testTable) + val df2 = spark.table(testTable) - checkRows( + checkAnswer( df1.join(df2, df1("id") === df2("id")), Seq(Row(1, 100, 1, 100), Row(2, 200, 2, 200))) } } - } test(s"${testPrefix}SPARK-54157: join refreshes both sides after same-session insert" + " (table with both table and column ID support)") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = session.table(testTable) + val df1 = spark.table(testTable) - session.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() + spark.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() - val df2 = session.table(testTable) + val df2 = spark.table(testTable) - checkRows( + checkAnswer( df1.join(df2, df1("id") 
=== df2("id")), Seq(Row(1, 100, 1, 100), Row(2, 200, 2, 200))) } } - } // --------------------------------------------------------------------------- // Scenario 2: join after ADD COLUMN. @@ -92,70 +88,66 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas test(s"${testPrefix}SPARK-54157: join after external ADD COLUMN" + " (table with both table and column ID support)") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = session.table(testTable) + val df1 = spark.table(testTable) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") catalog.alterTable( testIdent, TableChange.addColumn(Array("new_column"), IntegerType, true)) externalAppend( catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) - val df2 = session.table(testTable) + val df2 = spark.table(testTable) val selfJoin = df1.join(df2, df1("id") === df2("id")) - if (isConnect) { + if (sessionType == "connect") { // Connect re-resolves df1 with the new 3-column schema (id, salary, new_column). assert(selfJoin.columns.length == 6, s"Expected 6 columns (3 + 3) but got: ${selfJoin.columns.mkString(", ")}") - checkRows(selfJoin, + checkAnswer(selfJoin, Seq(Row(1, 100, null, 1, 100, null), Row(2, 200, -1, 2, 200, -1))) } else { // Classic: df1 keeps its original 2-column schema (id, salary). 
assert(selfJoin.columns.length == 5, s"Expected 5 columns (2 + 3) but got: ${selfJoin.columns.mkString(", ")}") - checkRows(selfJoin, + checkAnswer(selfJoin, Seq(Row(1, 100, 1, 100, null), Row(2, 200, 2, 200, -1))) } } } - } test(s"${testPrefix}SPARK-54157: join after same-session ADD COLUMN" + " (table with both table and column ID support)") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = session.table(testTable) + val df1 = spark.table(testTable) - session.sql(s"ALTER TABLE $testTable ADD COLUMN new_column INT").collect() - session.sql(s"INSERT INTO $testTable VALUES (2, 200, -1)").collect() + spark.sql(s"ALTER TABLE $testTable ADD COLUMN new_column INT").collect() + spark.sql(s"INSERT INTO $testTable VALUES (2, 200, -1)").collect() - val df2 = session.table(testTable) + val df2 = spark.table(testTable) val selfJoin = df1.join(df2, df1("id") === df2("id")) - if (isConnect) { + if (sessionType == "connect") { // Connect re-resolves df1 with the new 3-column schema (id, salary, new_column). assert(selfJoin.columns.length == 6, s"Expected 6 columns (3 + 3) but got: ${selfJoin.columns.mkString(", ")}") - checkRows(selfJoin, + checkAnswer(selfJoin, Seq(Row(1, 100, null, 1, 100, null), Row(2, 200, -1, 2, 200, -1))) } else { // Classic: df1 keeps its original 2-column schema (id, salary). 
assert(selfJoin.columns.length == 5, s"Expected 5 columns (2 + 3) but got: ${selfJoin.columns.mkString(", ")}") - checkRows(selfJoin, + checkAnswer(selfJoin, Seq(Row(1, 100, 1, 100, null), Row(2, 200, 2, 200, -1))) } } } - } // --------------------------------------------------------------------------- // Scenario 3: join after DROP COLUMN. @@ -165,23 +157,22 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas test(s"${testPrefix}SPARK-54157: join after external DROP COLUMN" + " (table with both table and column ID support)") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = session.table(testTable) + val df1 = spark.table(testTable) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") catalog.alterTable( testIdent, TableChange.deleteColumn(Array("salary"), false)) externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2)) - val df2 = session.table(testTable) + val df2 = spark.table(testTable) - if (isConnect) { + if (sessionType == "connect") { // Connect re-resolves df1 without the dropped column. 
- checkRows( + checkAnswer( df1.join(df2, df1("id") === df2("id")), Seq(Row(1, 1), Row(2, 2))) } else { @@ -196,25 +187,23 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas } } } - } test(s"${testPrefix}SPARK-54157: join after same-session DROP COLUMN" + " (table with both table and column ID support)") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = session.table(testTable) + val df1 = spark.table(testTable) - session.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() - session.sql(s"INSERT INTO $testTable VALUES (2)").collect() + spark.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() + spark.sql(s"INSERT INTO $testTable VALUES (2)").collect() - val df2 = session.table(testTable) + val df2 = spark.table(testTable) - if (isConnect) { + if (sessionType == "connect") { // Connect re-resolves df1 without the dropped column. - checkRows( + checkAnswer( df1.join(df2, df1("id") === df2("id")), Seq(Row(1, 1), Row(2, 2))) } else { @@ -229,7 +218,6 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas } } } - } // --------------------------------------------------------------------------- // Scenario 4: external drop and recreate table. 
@@ -240,13 +228,12 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas test(s"${testPrefix}SPARK-54157: join after external table drop and recreate" + " (table with both table and column ID support)") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = session.table(testTable) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val df1 = spark.table(testTable) + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") val originTableId = catalog.loadTable(testIdent).id catalog.dropTable(testIdent) @@ -259,13 +246,13 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas .build()) externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - val df2 = session.table(testTable) + val df2 = spark.table(testTable) val newTableId = catalog.loadTable(testIdent).id assert(originTableId != newTableId) - if (isConnect) { + if (sessionType == "connect") { // Connect re-resolves both sides to the recreated table. 
- checkRows( + checkAnswer( df1.join(df2, df1("id") === df2("id")), Seq(Row(2, 200, 2, 200))) } else { @@ -283,18 +270,16 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas } } } - } test(s"${testPrefix}SPARK-54157: join after external drop/recreate" + " (table without table ID support, but with column ID support)") { val nullIdT = "nullidcat.ns1.ns2.tbl" - withTestSession { session => - withTestTableAndViews(session, nullIdT) { - session.sql(s"CREATE TABLE $nullIdT (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $nullIdT VALUES (1, 100)").collect() + withTable(nullIdT) { + spark.sql(s"CREATE TABLE $nullIdT (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $nullIdT VALUES (1, 100)").collect() - val df1 = session.table(nullIdT) - val catalog = getTableCatalog[TableCatalog](session, "nullidcat") + val df1 = spark.table(nullIdT) + val catalog = getTableCatalog[TableCatalog](spark, "nullidcat") assert(catalog.loadTable(testIdent).id == null, "NullTableIdInMemoryTableCatalog should produce null table IDs") @@ -308,11 +293,11 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas .build()) externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - val df2 = session.table(nullIdT) + val df2 = spark.table(nullIdT) - if (isConnect) { + if (sessionType == "connect") { // Connect re-resolves both sides to the recreated table. 
- checkRows( + checkAnswer( df1.join(df2, df1("id") === df2("id")), Seq(Row(2, 200, 2, 200))) } else { @@ -327,19 +312,17 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas } } } - } test(s"${testPrefix}SPARK-54157: join does not detect external table drop and recreate" + " (table without table ID support and without column ID support)") { val nullBothT = "nullbothidscat.ns1.ns2.tbl" - withTestSession { session => - withTestTableAndViews(session, nullBothT) { - session.sql(s"CREATE TABLE $nullBothT (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $nullBothT VALUES (1, 100)").collect() + withTable(nullBothT) { + spark.sql(s"CREATE TABLE $nullBothT (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $nullBothT VALUES (1, 100)").collect() - val df1 = session.table(nullBothT) + val df1 = spark.table(nullBothT) val catalog = getTableCatalog[TableCatalog]( - session, "nullbothidscat") + spark, "nullbothidscat") assert(catalog.loadTable(testIdent).id == null, "NullTableIdAndNullColumnIdInMemoryTableCatalog should produce null table IDs") assert(catalog.loadTable(testIdent).columns().forall(_.id() == null), @@ -355,12 +338,12 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas .build()) externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - val df2 = session.table(nullBothT) + val df2 = spark.table(nullBothT) - if (isConnect) { + if (sessionType == "connect") { // Connect re-resolves both sides to the recreated table, so the join // sees the row appended after recreate. - checkRows( + checkAnswer( df1.join(df2, df1("id") === df2("id")), Seq(Row(2, 200, 2, 200))) } else { @@ -368,13 +351,12 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas // drop and recreate goes undetected. 
df1 keeps its pre-drop snapshot // (1, 100) while df2 reads the recreated table (2, 200), so the join finds // no matching ids and returns no rows. - checkRows( + checkAnswer( df1.join(df2, df1("id") === df2("id")), Seq.empty) } } } - } // --------------------------------------------------------------------------- // Scenario 5: external drop+re-add column. @@ -385,24 +367,23 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas test(s"${testPrefix}SPARK-54157: join after external drop+re-add column" + " (table without table ID support, but with column ID support)") { val nullIdT = "nullidcat.ns1.ns2.tbl" - withTestSession { session => - withTestTableAndViews(session, nullIdT) { - session.sql(s"CREATE TABLE $nullIdT (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $nullIdT VALUES (1, 100)").collect() + withTable(nullIdT) { + spark.sql(s"CREATE TABLE $nullIdT (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $nullIdT VALUES (1, 100)").collect() - val df1 = session.table(nullIdT) + val df1 = spark.table(nullIdT) - val catalog = getTableCatalog[TableCatalog](session, "nullidcat") + val catalog = getTableCatalog[TableCatalog](spark, "nullidcat") catalog.alterTable( testIdent, TableChange.deleteColumn(Array("salary"), false)) catalog.alterTable( testIdent, TableChange.addColumn(Array("salary"), IntegerType, true)) - val df2 = session.table(nullIdT) + val df2 = spark.table(nullIdT) - if (isConnect) { + if (sessionType == "connect") { // Connect re-resolves both sides with the new column ID. 
- checkRows( + checkAnswer( df1.join(df2, df1("id") === df2("id")), Seq(Row(1, null, 1, null))) } else { @@ -417,35 +398,31 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas } } } - } test(s"${testPrefix}SPARK-54157: join does not detect external drop+re-add column" + " (table without table ID support and without column ID support)") { val nullBothT = "nullbothidscat.ns1.ns2.tbl" - withTestSession { session => - withTestTableAndViews(session, nullBothT) { - session.sql(s"CREATE TABLE $nullBothT (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $nullBothT VALUES (1, 100)").collect() + withTable(nullBothT) { + spark.sql(s"CREATE TABLE $nullBothT (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $nullBothT VALUES (1, 100)").collect() - val df1 = session.table(nullBothT) + val df1 = spark.table(nullBothT) - val catalog = getTableCatalog[TableCatalog]( - session, "nullbothidscat") + val catalog = getTableCatalog[TableCatalog](spark, "nullbothidscat") catalog.alterTable( testIdent, TableChange.deleteColumn(Array("salary"), false)) catalog.alterTable( testIdent, TableChange.addColumn(Array("salary"), IntegerType, true)) - val df2 = session.table(nullBothT) + val df2 = spark.table(nullBothT) // Neither TABLE_ID_MISMATCH nor COLUMN_ID_MISMATCH fires. // The change goes undetected and the join succeeds. - checkRows( + checkAnswer( df1.join(df2, df1("id") === df2("id")), Seq(Row(1, null, 1, null))) } } - } // --------------------------------------------------------------------------- // Scenario 6: external type change (drop INT column, add STRING column). 
@@ -457,14 +434,13 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas test(s"${testPrefix}SPARK-54157: join after external drop+re-add different-type column" + " (table with both table and column ID support)") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - val df1 = session.table(testTable) + val df1 = spark.table(testTable) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") catalog.alterTable( testIdent, TableChange.deleteColumn(Array("salary"), false)) catalog.alterTable( @@ -472,11 +448,11 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, UTF8String.fromString("high"))) - val df2 = session.table(testTable) + val df2 = spark.table(testTable) - if (isConnect) { + if (sessionType == "connect") { // Connect re-resolves both sides with the new column type. 
- checkRows( + checkAnswer( df1.join(df2, df1("id") === df2("id")), Seq(Row(1, null, 1, null), Row(2, "high", 2, "high"))) } else { @@ -491,5 +467,4 @@ trait DSv2IncrementallyConstructedQueryTests extends DSv2ExternalMutationTestBas } } } - } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2RepeatedTableAccessTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2RepeatedTableAccessTests.scala index 533d10a949796..fb22a8bb7ab79 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2RepeatedTableAccessTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2RepeatedTableAccessTests.scala @@ -34,7 +34,7 @@ import org.apache.spark.sql.types.IntegerType * Each scenario includes a session mutation baseline, an external mutation test, and a * caching-connector variant showing stale results until `REFRESH TABLE`. * - * NOTE: All `session.sql(...)` calls append `.collect()` because Connect client DataFrames + * NOTE: All `spark.sql(...)` calls append `.collect()` because Connect client DataFrames * are lazy and require an action to trigger execution. In classic mode `.collect()` on * DDL / DML is a no-op (these execute eagerly), so this is harmless. 
*/ @@ -45,178 +45,160 @@ trait DSv2RepeatedTableAccessTests extends DSv2ExternalMutationTestBase { // Scenario 1: data changes via writes test(s"${testPrefix}repeated sql() reflects session write") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - checkRows(session.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) - - session.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() - checkRows(session.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100), Row(2, 200))) - } + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) + + spark.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() + checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100), Row(2, 200))) } } test(s"${testPrefix}repeated sql() reflects external write") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - checkRows(session.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - 
checkRows(session.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100), Row(2, 200))) - } + checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100), Row(2, 200))) } } test(s"${testPrefix}connector w/ cache: repeated sql() stale after external write") { - withTestSession { session => - withTestTableAndViews(session, cachingTestTable) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() - checkRows(session.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) - - val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - - // Caching connector returns stale table: external write invisible - checkRows(session.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) - - // REFRESH TABLE invalidates the connector cache, external write becomes visible - session.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkRows(session.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100), Row(2, 200))) - } + withTable(cachingTestTable) { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() + checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) + + val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) + + // Caching connector returns stale table: external write invisible + checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) + + // REFRESH TABLE invalidates the connector cache, external write becomes visible + spark.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100), Row(2, 200))) } } // Scenario 2: schema 
changes test(s"${testPrefix}repeated sql() reflects session schema change") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - checkRows(session.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) - - session.sql(s"ALTER TABLE $testTable ADD COLUMN new_col INT").collect() - session.sql(s"INSERT INTO $testTable VALUES (2, 200, -1)").collect() - checkRows( - session.sql(s"SELECT * FROM $testTable"), - Seq(Row(1, 100, null), Row(2, 200, -1))) - } + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) + + spark.sql(s"ALTER TABLE $testTable ADD COLUMN new_col INT").collect() + spark.sql(s"INSERT INTO $testTable VALUES (2, 200, -1)").collect() + checkAnswer( + spark.sql(s"SELECT * FROM $testTable"), + Seq(Row(1, 100, null), Row(2, 200, -1))) } } test(s"${testPrefix}repeated sql() reflects external schema change") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - checkRows(session.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) - - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") - val addCol = TableChange.addColumn(Array("new_col"), IntegerType, true) - catalog.alterTable(testIdent, addCol) - - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) - - checkRows( - session.sql(s"SELECT * FROM $testTable"), - Seq(Row(1, 100, null), Row(2, 200, -1))) - } + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT 
INTO $testTable VALUES (1, 100)").collect() + checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) + + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + val addCol = TableChange.addColumn(Array("new_col"), IntegerType, true) + catalog.alterTable(testIdent, addCol) + + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) + + checkAnswer( + spark.sql(s"SELECT * FROM $testTable"), + Seq(Row(1, 100, null), Row(2, 200, -1))) } } test(s"${testPrefix}connector w/ cache: repeated sql() stale after external schema change") { - withTestSession { session => - withTestTableAndViews(session, cachingTestTable) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() - checkRows(session.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) - - val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") - val addCol = TableChange.addColumn(Array("new_col"), IntegerType, true) - catalog.alterTable(testIdent, addCol) - - externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) - - // Caching connector returns stale table: external changes invisible - checkRows(session.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) - - // REFRESH TABLE invalidates the connector cache, schema change + data visible - session.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkRows( - session.sql(s"SELECT * FROM $cachingTestTable"), - Seq(Row(1, 100, null), Row(2, 200, -1))) - } + withTable(cachingTestTable) { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() + checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) + + val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") + val addCol = 
TableChange.addColumn(Array("new_col"), IntegerType, true) + catalog.alterTable(testIdent, addCol) + + externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) + + // Caching connector returns stale table: external changes invisible + checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) + + // REFRESH TABLE invalidates the connector cache, schema change + data visible + spark.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkAnswer( + spark.sql(s"SELECT * FROM $cachingTestTable"), + Seq(Row(1, 100, null), Row(2, 200, -1))) } } // Scenario 3: drop and recreate table test(s"${testPrefix}repeated sql() reflects session drop/recreate") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - checkRows(session.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) - - session.sql(s"DROP TABLE $testTable").collect() - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - checkRows(session.sql(s"SELECT * FROM $testTable"), Seq.empty) - } + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) + + spark.sql(s"DROP TABLE $testTable").collect() + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq.empty) } } test(s"${testPrefix}repeated sql() reflects external drop/recreate") { - withTestSession { session => - withTestTableAndViews(session, testTable) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() - checkRows(session.sql(s"SELECT * FROM $testTable"), 
Seq(Row(1, 100))) - - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") - catalog.dropTable(testIdent) - catalog.createTable( - testIdent, - new TableInfo.Builder() - .withColumns(Array( - Column.create("id", IntegerType), - Column.create("salary", IntegerType))) - .build()) - - checkRows(session.sql(s"SELECT * FROM $testTable"), Seq.empty) - } + withTable(testTable) { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100)").collect() + checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq(Row(1, 100))) + + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") + catalog.dropTable(testIdent) + catalog.createTable( + testIdent, + new TableInfo.Builder() + .withColumns(Array( + Column.create("id", IntegerType), + Column.create("salary", IntegerType))) + .build()) + + checkAnswer(spark.sql(s"SELECT * FROM $testTable"), Seq.empty) } } test(s"${testPrefix}connector w/ cache: repeated sql() stale after external drop/recreate") { - withTestSession { session => - withTestTableAndViews(session, cachingTestTable) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() - checkRows(session.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) - - val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") - catalog.dropTable(testIdent) - catalog.createTable( - testIdent, - new TableInfo.Builder() - .withColumns(Array( - Column.create("id", IntegerType), - Column.create("salary", IntegerType))) - .build()) - - // Caching connector returns stale table: drop/recreate invisible - checkRows(session.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) - - // REFRESH TABLE invalidates the connector cache, new empty table visible - session.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkRows(session.sql(s"SELECT * FROM 
$cachingTestTable"), Seq.empty) - } + withTable(cachingTestTable) { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100)").collect() + checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) + + val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") + catalog.dropTable(testIdent) + catalog.createTable( + testIdent, + new TableInfo.Builder() + .withColumns(Array( + Column.create("id", IntegerType), + Column.create("salary", IntegerType))) + .build()) + + // Caching connector returns stale table: drop/recreate invisible + checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq(Row(1, 100))) + + // REFRESH TABLE invalidates the connector cache, new empty table visible + spark.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkAnswer(spark.sql(s"SELECT * FROM $cachingTestTable"), Seq.empty) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2TempViewWithStoredPlanTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2TempViewWithStoredPlanTests.scala index 9f8a93e30550f..e473968794c37 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2TempViewWithStoredPlanTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DSv2TempViewWithStoredPlanTests.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.types.{IntegerType, LongType, StringType} * backed by DSv2 tables correctly handle data changes, schema changes, and table recreation, * both via session SQL and external catalog mutations. * - * NOTE: All `session.sql(...)` calls append `.collect()` because Connect client DataFrames + * NOTE: All `spark.sql(...)` calls append `.collect()` because Connect client DataFrames * are lazy and require an action to trigger execution. In classic mode `.collect()` on DDL * is a no-op (DDL executes eagerly), so this is harmless. 
*/ @@ -35,143 +35,143 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 1.1 (session write) test(s"${testPrefix}temp view with stored plan reflects session write") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - session.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() - checkRows(session.table("v"), Seq(Row(1, 100), Row(2, 200))) + spark.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() + checkAnswer(spark.table("v"), Seq(Row(1, 100), Row(2, 200))) } } } // Scenario 1.2 (external write) test(s"${testPrefix}temp view with stored plan reflects external write") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val 
catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) - checkRows(session.table("v"), Seq(Row(1, 100), Row(2, 200))) + checkAnswer(spark.table("v"), Seq(Row(1, 100), Row(2, 200))) } } } // Scenario 1.2 connector w/ cache (external write, caching connector) test(s"${testPrefix}connector w/ cache: temp view stale after external write") { - withTestSession { session => - withTestTableAndViews(session, cachingTestTable, Seq("v")) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() + withTable(cachingTestTable) { + withView("v") { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() - session.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") + val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200)) // Caching connector returns stale table: external write invisible - checkRows(session.table("v"), Seq(Row(1, 100))) + checkAnswer(spark.table("v"), Seq(Row(1, 100))) // REFRESH TABLE invalidates the connector cache, external write becomes visible - session.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkRows(session.table("v"), Seq(Row(1, 100), Row(2, 200))) + spark.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkAnswer(spark.table("v"), Seq(Row(1, 100), Row(2, 
200))) } } } // Scenario 2.1 (session ADD COLUMN) test(s"${testPrefix}temp view with stored plan preserves schema after session ADD COLUMN") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - session.sql(s"ALTER TABLE $testTable ADD COLUMN new_column INT").collect() - session.sql(s"INSERT INTO $testTable VALUES (2, 200, -1)").collect() + spark.sql(s"ALTER TABLE $testTable ADD COLUMN new_column INT").collect() + spark.sql(s"INSERT INTO $testTable VALUES (2, 200, -1)").collect() // view preserves original 2-column schema, filter still applied - checkRows(session.table("v"), Seq(Row(1, 100), Row(2, 200))) + checkAnswer(spark.table("v"), Seq(Row(1, 100), Row(2, 200))) } } } // Scenario 2.2 (external ADD COLUMN) test(s"${testPrefix}temp view with stored plan preserves schema after external ADD COLUMN") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") 
- checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) // external schema change via catalog API - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") val addCol = TableChange.addColumn(Array("new_column"), IntegerType, true) catalog.alterTable(testIdent, addCol) externalAppend(catalog = catalog, ident = testIdent, row = InternalRow(2, 200, -1)) // view preserves original 2-column schema, filter still applied - checkRows(session.table("v"), Seq(Row(1, 100), Row(2, 200))) + checkAnswer(spark.table("v"), Seq(Row(1, 100), Row(2, 200))) } } } // Scenario 2.2 connector w/ cache (external ADD COLUMN, caching connector) test(s"${testPrefix}connector w/ cache: temp view stale after external ADD COLUMN") { - withTestSession { session => - withTestTableAndViews(session, cachingTestTable, Seq("v")) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() + withTable(cachingTestTable) { + withView("v") { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() - session.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") + val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") val addCol = TableChange.addColumn(Array("new_column"), IntegerType, true) catalog.alterTable(testIdent, addCol) externalAppend(catalog = catalog, 
ident = testIdent, row = InternalRow(2, 200, -1)) // Caching connector returns stale table: external changes invisible - checkRows(session.table("v"), Seq(Row(1, 100))) + checkAnswer(spark.table("v"), Seq(Row(1, 100))) // REFRESH TABLE invalidates the connector cache, view preserves original 2-column schema - session.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkRows(session.table("v"), Seq(Row(1, 100), Row(2, 200))) + spark.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkAnswer(spark.table("v"), Seq(Row(1, 100), Row(2, 200))) } } } // Scenario 3.1 (session column removal) test(s"${testPrefix}temp view with stored plan detects session column removal") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - session.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() + spark.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() checkError( - exception = intercept[AnalysisException] { session.table("v").collect() }, + exception = intercept[AnalysisException] { spark.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -184,20 +184,20 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 3.2 (external column removal) test(s"${testPrefix}temp view with stored plan detects 
external column removal") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") val dropCol = TableChange.deleteColumn(Array("salary"), false) catalog.alterTable(testIdent, dropCol) checkError( - exception = intercept[AnalysisException] { session.table("v").collect() }, + exception = intercept[AnalysisException] { spark.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -210,25 +210,25 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 3.2 connector w/ cache (external column removal, caching connector) test(s"${testPrefix}connector w/ cache: temp view stale after external column removal") { - withTestSession { session => - withTestTableAndViews(session, cachingTestTable, Seq("v")) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() + withTable(cachingTestTable) { + withView("v") { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), 
(10, 1000)").collect() - session.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") + val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") val dropCol = TableChange.deleteColumn(Array("salary"), false) catalog.alterTable(testIdent, dropCol) // Caching connector returns stale table: column removal invisible, no error - checkRows(session.table("v"), Seq(Row(1, 100))) + checkAnswer(spark.table("v"), Seq(Row(1, 100))) // REFRESH TABLE invalidates the connector cache, column removal detected - session.sql(s"REFRESH TABLE $cachingTestTable").collect() + spark.sql(s"REFRESH TABLE $cachingTestTable").collect() checkError( - exception = intercept[AnalysisException] { session.table("v").collect() }, + exception = intercept[AnalysisException] { spark.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -241,43 +241,43 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 4.1 (session drop and recreate table) test(s"${testPrefix}temp view with stored plan resolves to session-recreated table") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - 
checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") val originalTableId = catalog.loadTable(testIdent).id - session.sql(s"DROP TABLE $testTable").collect() - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"DROP TABLE $testTable").collect() + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() val newTableId = catalog.loadTable(testIdent).id assert(originalTableId != newTableId) // view resolves to the new empty table - checkRows(session.table("v"), Seq.empty) + checkAnswer(spark.table("v"), Seq.empty) - session.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() - checkRows(session.table("v"), Seq(Row(2, 200))) + spark.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() + checkAnswer(spark.table("v"), Seq(Row(2, 200))) } } } // Scenario 4.2 (external drop and recreate table) test(s"${testPrefix}temp view with stored plan resolves to externally recreated table") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[InMemoryTableCatalog](session, 
"testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") val originalTableId = catalog.loadTable(testIdent).id catalog.dropTable(testIdent) @@ -293,25 +293,25 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { assert(originalTableId != newTableId) // view resolves to the new empty table - checkRows(session.table("v"), Seq.empty) + checkAnswer(spark.table("v"), Seq.empty) - session.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() - checkRows(session.table("v"), Seq(Row(2, 200))) + spark.sql(s"INSERT INTO $testTable VALUES (2, 200)").collect() + checkAnswer(spark.table("v"), Seq(Row(2, 200))) } } } // Scenario 4.2 connector w/ cache (external drop/recreate, caching connector) test(s"${testPrefix}connector w/ cache: temp view stale after external drop/recreate") { - withTestSession { session => - withTestTableAndViews(session, cachingTestTable, Seq("v")) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() + withTable(cachingTestTable) { + withView("v") { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() - session.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") + val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") catalog.dropTable(testIdent) catalog.createTable( testIdent, @@ -322,11 +322,11 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { .build()) // Caching connector returns stale table: drop/recreate invisible 
- checkRows(session.table("v"), Seq(Row(1, 100))) + checkAnswer(spark.table("v"), Seq(Row(1, 100))) // REFRESH TABLE invalidates the connector cache, view resolves to new empty table - session.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkRows(session.table("v"), Seq.empty) + spark.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkAnswer(spark.table("v"), Seq.empty) } } } @@ -334,29 +334,29 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 5.1 (session drop and re-add column with same type, multiple views) test(s"${testPrefix}temp view with stored plan after session drop and re-add column same type" + " with unfiltered view") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v", "v_no_filter", "v_filter_is_null")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - session.table(testTable).createOrReplaceTempView("v_no_filter") - session.table(testTable).filter("salary IS NULL") + withTable(testTable) { + withView("v", "v_no_filter", "v_filter_is_null") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + spark.table(testTable).createOrReplaceTempView("v_no_filter") + spark.table(testTable).filter("salary IS NULL") .createOrReplaceTempView("v_filter_is_null") - checkRows(session.table("v"), Seq(Row(1, 100))) - checkRows(session.table("v_no_filter"), Seq(Row(1, 100), Row(10, 1000))) - checkRows(session.table("v_filter_is_null"), Seq.empty) + checkAnswer(spark.table("v"), Seq(Row(1, 100))) + checkAnswer(spark.table("v_no_filter"), Seq(Row(1, 100), Row(10, 1000))) + 
checkAnswer(spark.table("v_filter_is_null"), Seq.empty) // drop and re-add column with same name and type - session.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() - session.sql(s"ALTER TABLE $testTable ADD COLUMN salary INT").collect() + spark.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() + spark.sql(s"ALTER TABLE $testTable ADD COLUMN salary INT").collect() // salary values are now null, so the filtered view returns nothing - checkRows(session.table("v"), Seq.empty) + checkAnswer(spark.table("v"), Seq.empty) // unfiltered view returns rows with null salary - checkRows(session.table("v_no_filter"), Seq(Row(1, null), Row(10, null))) + checkAnswer(spark.table("v_no_filter"), Seq(Row(1, null), Row(10, null))) // IS NULL filter now matches all rows - checkRows(session.table("v_filter_is_null"), Seq(Row(1, null), Row(10, null))) + checkAnswer(spark.table("v_filter_is_null"), Seq(Row(1, null), Row(10, null))) } } } @@ -364,31 +364,31 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 5.2 (external drop and re-add column with same type) test(s"${testPrefix}temp view with stored plan after external drop and re-add column " + "same type") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v", "v_no_filter", "v_filter_is_null")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - session.table(testTable).createOrReplaceTempView("v_no_filter") - session.table(testTable).filter("salary IS NULL") + withTable(testTable) { + withView("v", "v_no_filter", "v_filter_is_null") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + + spark.table(testTable).filter("salary < 
999").createOrReplaceTempView("v") + spark.table(testTable).createOrReplaceTempView("v_no_filter") + spark.table(testTable).filter("salary IS NULL") .createOrReplaceTempView("v_filter_is_null") - checkRows(session.table("v"), Seq(Row(1, 100))) - checkRows(session.table("v_no_filter"), Seq(Row(1, 100), Row(10, 1000))) - checkRows(session.table("v_filter_is_null"), Seq.empty) + checkAnswer(spark.table("v"), Seq(Row(1, 100))) + checkAnswer(spark.table("v_no_filter"), Seq(Row(1, 100), Row(10, 1000))) + checkAnswer(spark.table("v_filter_is_null"), Seq.empty) // external drop and re-add column via catalog API - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") val dropCol = TableChange.deleteColumn(Array("salary"), false) val addCol = TableChange.addColumn(Array("salary"), IntegerType, true) catalog.alterTable(testIdent, dropCol, addCol) // salary values are now null, so the filtered view returns nothing - checkRows(session.table("v"), Seq.empty) + checkAnswer(spark.table("v"), Seq.empty) // unfiltered view returns rows with null salary - checkRows(session.table("v_no_filter"), Seq(Row(1, null), Row(10, null))) + checkAnswer(spark.table("v_no_filter"), Seq(Row(1, null), Row(10, null))) // IS NULL filter now matches all rows - checkRows(session.table("v_filter_is_null"), Seq(Row(1, null), Row(10, null))) + checkAnswer(spark.table("v_filter_is_null"), Seq(Row(1, null), Row(10, null))) } } } @@ -396,44 +396,44 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 5.2 connector w/ cache (external drop/re-add column, caching connector) test(s"${testPrefix}connector w/ cache: temp view stale after external drop/re-add column " + "same type") { - withTestSession { session => - withTestTableAndViews(session, cachingTestTable, Seq("v")) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT 
INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() + withTable(cachingTestTable) { + withView("v") { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() - session.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") + val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") val dropCol = TableChange.deleteColumn(Array("salary"), false) val addCol = TableChange.addColumn(Array("salary"), IntegerType, true) catalog.alterTable(testIdent, dropCol, addCol) // Caching connector returns stale table: column drop/re-add invisible - checkRows(session.table("v"), Seq(Row(1, 100))) + checkAnswer(spark.table("v"), Seq(Row(1, 100))) // REFRESH TABLE invalidates the connector cache, salary values are null - session.sql(s"REFRESH TABLE $cachingTestTable").collect() - checkRows(session.table("v"), Seq.empty) + spark.sql(s"REFRESH TABLE $cachingTestTable").collect() + checkAnswer(spark.table("v"), Seq.empty) } } } // Scenario 6.1 (session drop and re-add column with different type) test(s"${testPrefix}temp view with stored plan detects session column type change") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - 
session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - session.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() - session.sql(s"ALTER TABLE $testTable ADD COLUMN salary STRING").collect() + spark.sql(s"ALTER TABLE $testTable DROP COLUMN salary").collect() + spark.sql(s"ALTER TABLE $testTable ADD COLUMN salary STRING").collect() checkError( - exception = intercept[AnalysisException] { session.table("v").collect() }, + exception = intercept[AnalysisException] { spark.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -446,21 +446,21 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 6.2 (external drop and re-add column with different type) test(s"${testPrefix}temp view with stored plan detects external column type change") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") val dropCol = TableChange.deleteColumn(Array("salary"), false) val addCol 
= TableChange.addColumn(Array("salary"), StringType, true) catalog.alterTable(testIdent, dropCol, addCol) checkError( - exception = intercept[AnalysisException] { session.table("v").collect() }, + exception = intercept[AnalysisException] { spark.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -473,26 +473,26 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 6.2 connector w/ cache (external column type change, caching connector) test(s"${testPrefix}connector w/ cache: temp view stale after external column type change") { - withTestSession { session => - withTestTableAndViews(session, cachingTestTable, Seq("v")) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() + withTable(cachingTestTable) { + withView("v") { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() - session.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") + val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") val dropCol = TableChange.deleteColumn(Array("salary"), false) val addCol = TableChange.addColumn(Array("salary"), StringType, true) catalog.alterTable(testIdent, dropCol, addCol) // Caching connector returns stale table: type change invisible, no error - checkRows(session.table("v"), Seq(Row(1, 100))) + checkAnswer(spark.table("v"), Seq(Row(1, 100))) // REFRESH TABLE invalidates the connector cache, type 
change detected - session.sql(s"REFRESH TABLE $cachingTestTable").collect() + spark.sql(s"REFRESH TABLE $cachingTestTable").collect() checkError( - exception = intercept[AnalysisException] { session.table("v").collect() }, + exception = intercept[AnalysisException] { spark.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -505,18 +505,18 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 7.1 (session type widening from INT to BIGINT) test(s"${testPrefix}temp view with stored plan detects session type widening") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - session.sql(s"ALTER TABLE $testTable ALTER COLUMN salary TYPE LONG").collect() + spark.sql(s"ALTER TABLE $testTable ALTER COLUMN salary TYPE LONG").collect() checkError( - exception = intercept[AnalysisException] { session.table("v").collect() }, + exception = intercept[AnalysisException] { spark.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -529,20 +529,20 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 7.2 (external type widening from INT to BIGINT) test(s"${testPrefix}temp view with stored plan 
detects external type widening") { - withTestSession { session => - withTestTableAndViews(session, testTable, Seq("v")) { - session.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() + withTable(testTable) { + withView("v") { + spark.sql(s"CREATE TABLE $testTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $testTable VALUES (1, 100), (10, 1000)").collect() - session.table(testTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(testTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[InMemoryTableCatalog](session, "testcat") + val catalog = getTableCatalog[InMemoryTableCatalog](spark, "testcat") val updateType = TableChange.updateColumnType(Array("salary"), LongType) catalog.alterTable(testIdent, updateType) checkError( - exception = intercept[AnalysisException] { session.table("v").collect() }, + exception = intercept[AnalysisException] { spark.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", @@ -555,25 +555,25 @@ trait DSv2TempViewWithStoredPlanTests extends DSv2ExternalMutationTestBase { // Scenario 7.2 connector w/ cache (external type widening, caching connector) test(s"${testPrefix}connector w/ cache: temp view stale after external type widening") { - withTestSession { session => - withTestTableAndViews(session, cachingTestTable, Seq("v")) { - session.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() - session.sql(s"INSERT INTO $cachingTestTable VALUES (1, 100), (10, 1000)").collect() + withTable(cachingTestTable) { + withView("v") { + spark.sql(s"CREATE TABLE $cachingTestTable (id INT, salary INT) USING foo").collect() + spark.sql(s"INSERT INTO $cachingTestTable 
VALUES (1, 100), (10, 1000)").collect() - session.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") - checkRows(session.table("v"), Seq(Row(1, 100))) + spark.table(cachingTestTable).filter("salary < 999").createOrReplaceTempView("v") + checkAnswer(spark.table("v"), Seq(Row(1, 100))) - val catalog = getTableCatalog[CachingInMemoryTableCatalog](session, "cachingcat") + val catalog = getTableCatalog[CachingInMemoryTableCatalog](spark, "cachingcat") val updateType = TableChange.updateColumnType(Array("salary"), LongType) catalog.alterTable(testIdent, updateType) // Caching connector returns stale table: type change invisible, no error - checkRows(session.table("v"), Seq(Row(1, 100))) + checkAnswer(spark.table("v"), Seq(Row(1, 100))) // REFRESH TABLE invalidates the connector cache, type change detected - session.sql(s"REFRESH TABLE $cachingTestTable").collect() + spark.sql(s"REFRESH TABLE $cachingTestTable").collect() checkError( - exception = intercept[AnalysisException] { session.table("v").collect() }, + exception = intercept[AnalysisException] { spark.table("v").collect() }, condition = "INCOMPATIBLE_COLUMN_CHANGES_AFTER_VIEW_WITH_PLAN_CREATION", parameters = Map( "viewName" -> "`v`", diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala index 71632e07c78b7..4d4b96406bd28 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala @@ -24,7 +24,7 @@ import scala.jdk.CollectionConverters._ import scala.reflect.ClassTag import org.apache.spark.{SparkConf, SparkException} -import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SaveMode, SparkSession} +import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SaveMode, SessionQueryTest, SparkSession} import 
org.apache.spark.sql.QueryTest.withQueryExecutionsCaptured import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException import org.apache.spark.sql.catalyst.plans.logical.{AppendData, CreateTableAsSelect, LogicalPlan, ReplaceTableAsSelect} @@ -47,6 +47,7 @@ import org.apache.spark.unsafe.types.UTF8String class DataSourceV2DataFrameSuite extends InsertIntoTests(supportsDynamicOverwrite = true, includeSQLOnlyTests = false) + with SessionQueryTest with DSv2TempViewWithStoredPlanTests with DSv2RepeatedTableAccessTests with DSv2IncrementallyConstructedQueryTests @@ -97,12 +98,6 @@ class DataSourceV2DataFrameSuite // DSv2ExternalMutationTestBase implementations for classic mode override protected def testPrefix: String = "" - override protected def isConnect: Boolean = false - - override protected def withTestSession(fn: SparkSession => Unit): Unit = fn(spark) - - override protected def checkRows(df: => DataFrame, expected: Seq[Row]): Unit = - checkAnswer(df, expected) override protected def getTableCatalog[C <: TableCatalog: ClassTag]( session: SparkSession, @@ -115,16 +110,6 @@ class DataSourceV2DataFrameSuite c.asInstanceOf[C] } - override protected def withTestTableAndViews( - session: SparkSession, - table: String, - views: Seq[String] = Seq.empty)(fn: => Unit): Unit = { - withTable(table) { - try { fn } - finally { views.foreach(v => session.sql(s"DROP VIEW IF EXISTS $v")) } - } - } - override def verifyTable(tableName: String, expected: DataFrame): Unit = { checkAnswer(spark.table(tableName), expected) }