diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala index d3f51dfbe2..9b376837f7 100644 --- a/common/src/main/scala/org/apache/comet/CometConf.scala +++ b/common/src/main/scala/org/apache/comet/CometConf.scala @@ -94,12 +94,9 @@ object CometConf extends ShimCometConf { .createWithEnvVarOrDefault("ENABLE_COMET", true) val COMET_NATIVE_SCAN_ENABLED: ConfigEntry[Boolean] = conf("spark.comet.scan.enabled") - .category(CATEGORY_SCAN) - .doc( - "Whether to enable native scans. When this is turned on, Spark will use Comet to " + - "read supported data sources (currently only Parquet is supported natively). Note " + - "that to enable native vectorized execution, both this config and " + - "`spark.comet.exec.enabled` need to be enabled.") + .category(CATEGORY_TESTING) + .doc("Whether to enable native scans. Intended for use in Comet's own test suites to " + + "selectively disable native scans; not intended for production use.") .booleanConf .createWithDefault(true) diff --git a/dev/release/build-release-comet.sh b/dev/release/build-release-comet.sh index 91d02885af..bb38f51f34 100755 --- a/dev/release/build-release-comet.sh +++ b/dev/release/build-release-comet.sh @@ -202,7 +202,10 @@ LOCAL_REPO=$(mktemp -d /tmp/comet-staging-repo-XXXXX) ./mvnw "-Dmaven.repo.local=${LOCAL_REPO}" -P spark-3.4 -P scala-2.13 -DskipTests install ./mvnw "-Dmaven.repo.local=${LOCAL_REPO}" -P spark-3.5 -P scala-2.12 -DskipTests install ./mvnw "-Dmaven.repo.local=${LOCAL_REPO}" -P spark-3.5 -P scala-2.13 -DskipTests install -./mvnw "-Dmaven.repo.local=${LOCAL_REPO}" -P spark-4.0 -P scala-2.13 -DskipTests install +# The spark-4.x profiles pin their own Scala 2.13.x patch versions to match the +# corresponding Spark release, so the scala-2.13 profile is not used here. +./mvnw "-Dmaven.repo.local=${LOCAL_REPO}" -P spark-4.0 -DskipTests install +./mvnw "-Dmaven.repo.local=${LOCAL_REPO}" -P spark-4.1 -DskipTests install echo "Installed to local repo: ${LOCAL_REPO}" diff --git a/docs/source/about/gluten_comparison.md b/docs/source/about/gluten_comparison.md index 86dcad56a0..c7807eabc7 100644 --- a/docs/source/about/gluten_comparison.md +++ b/docs/source/about/gluten_comparison.md @@ -62,8 +62,8 @@ code, then we suggest benchmarking with both solutions and choosing the fastest Both projects target a similar set of Spark releases. -Comet supports Spark 3.4, 3.5, and 4.0 in production builds, with experimental builds also published for -Spark 4.1 and the Spark 4.2 preview. See the [Spark version compatibility guide] for the exact patch versions and +Comet supports Spark 3.4, 3.5, 4.0, and 4.1 in production builds, with an experimental build also published for +the Spark 4.2 preview. See the [Spark version compatibility guide] for the exact patch versions and JDK/Scala combinations. [Spark version compatibility guide]: /user-guide/latest/compatibility/spark-versions.md diff --git a/docs/source/about/index.md b/docs/source/about/index.md deleted file mode 100644 index 43d25f8dbd..0000000000 --- a/docs/source/about/index.md +++ /dev/null @@ -1,73 +0,0 @@ - - -# Comet Overview - -Apache DataFusion Comet is a high-performance accelerator for Apache Spark, built on top of the powerful -[Apache DataFusion] query engine. Comet is designed to significantly enhance the -performance of Apache Spark workloads while leveraging commodity hardware and seamlessly integrating with the -Spark ecosystem without requiring any code changes. 
- -[Apache DataFusion]: https://datafusion.apache.org - -The following diagram provides an overview of Comet's architecture. - -![Comet Overview](/_static/images/comet-overview.png) - -## Architecture - -The following diagram shows how Comet integrates with Apache Spark. - -![Comet System Diagram](/_static/images/comet-system-diagram.png) - -## Feature Parity with Apache Spark - -The project strives to keep feature parity with Apache Spark, that is, -users should expect the same behavior (w.r.t features, configurations, -query results, etc) with Comet turned on or turned off in their Spark -jobs. In addition, Comet extension should automatically detect unsupported -features and fallback to Spark engine. - -## Comparison with other open-source Spark accelerators - -There are two other major open-source Spark accelerators: - -- [Apache Gluten (incubating)](https://github.com/apache/incubator-gluten) -- [NVIDIA Spark RAPIDS](https://github.com/NVIDIA/spark-rapids) - -We have a detailed guide [comparing Apache DataFusion Comet with Apache Gluten]. - -Spark RAPIDS is a solution that provides hardware acceleration on NVIDIA GPUs. Comet does not require specialized -hardware. - -[comparing Apache DataFusion Comet with Apache Gluten]: gluten_comparison.md - -## Getting Started - -Refer to the [Comet Installation Guide] to get started. - -[Comet Installation Guide]: /user-guide/latest/installation.md - -```{toctree} -:maxdepth: 1 -:caption: About -:hidden: - -Comparison with Gluten -``` diff --git a/docs/source/asf/index.md b/docs/source/asf/index.md index 3d7c9810db..e461f68d47 100644 --- a/docs/source/asf/index.md +++ b/docs/source/asf/index.md @@ -19,9 +19,14 @@ under the License. # ASF Links +Apache DataFusion Comet is part of the Apache Software Foundation. The links below point to ASF +resources covering licensing, donations, security reporting, and the Foundation's code of conduct. +Select a link from the navigation menu. 
+ ```{toctree} :maxdepth: 1 :caption: ASF Links +:hidden: Apache Software Foundation License diff --git a/docs/source/conf.py b/docs/source/conf.py index faf6709735..311e6fd754 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -134,16 +134,17 @@ "**": ["docs-sidebar.html"], } -# tell myst_parser to auto-generate anchor links for headers h1, h2, h3 -myst_heading_anchors = 3 +# tell myst_parser to auto-generate anchor links for headers h1, h2, h3, h4 +myst_heading_anchors = 4 # enable nice rendering of checkboxes for the task lists myst_enable_extensions = ["colon_fence", "deflist", "tasklist"] redirects = { - "overview.html": "about/index.html", + "overview.html": "index.html", + "about/index.html": "../index.html", "gluten_comparison.html": "about/gluten_comparison.html", - "user-guide/overview.html": "../about/overview.html", + "user-guide/overview.html": "../index.html", "user-guide/gluten_comparison.html": "../about/gluten_comparison.html", "user-guide/compatibility.html": "latest/compatibility.html", "user-guide/configs.html": "latest/configs.html", diff --git a/docs/source/contributor-guide/benchmarking.md b/docs/source/contributor-guide/benchmarking.md index ecc4fcf277..4bfd5ae571 100644 --- a/docs/source/contributor-guide/benchmarking.md +++ b/docs/source/contributor-guide/benchmarking.md @@ -39,3 +39,13 @@ Available benchmarking guides: - [TPC-DS Benchmarking with spark-sql-perf](benchmarking_spark_sql_perf.md) We also have many micro benchmarks that can be run from an IDE located [here](https://github.com/apache/datafusion-comet/tree/main/spark/src/test/scala/org/apache/spark/sql/benchmark). + +```{toctree} +:hidden: + +benchmark-results/tpc-h +benchmark-results/tpc-ds +benchmarking_macos +benchmarking_aws_ec2 +benchmarking_spark_sql_perf +``` diff --git a/docs/source/contributor-guide/index.md b/docs/source/contributor-guide/index.md index 20e73c7428..77c73d68da 100644 --- a/docs/source/contributor-guide/index.md +++ b/docs/source/contributor-guide/index.md @@ -19,9 +19,21 @@ under the License. # Comet Contributor Guide +The Comet contributor guide is for developers working on Comet itself. It covers the project +architecture, the JVM and native code layout, the Arrow FFI bridge, JVM and native shuffle, and +how data and plans flow between Spark and the DataFusion execution engine. + +It also documents day-to-day workflows including building and testing locally, debugging, +benchmarking, profiling, tracing, running the SQL test suites, adding new operators and +expressions, triaging bugs, and the Comet release process. + +New contributors should start with the Getting Started page. Select a topic from the navigation +menu to read more. + ```{toctree} :maxdepth: 2 :caption: Contributor Guide +:hidden: Getting Started Comet Plugin Overview @@ -30,6 +42,7 @@ JVM Shuffle Native Shuffle Development Guide Debugging Guide +ANSI Error Propagation Benchmarking Guide Adding a New Operator Adding a New Expression diff --git a/docs/source/contributor-guide/roadmap.md b/docs/source/contributor-guide/roadmap.md index 8d293c0686..bae4ddba58 100644 --- a/docs/source/contributor-guide/roadmap.md +++ b/docs/source/contributor-guide/roadmap.md @@ -43,14 +43,16 @@ significant family of Spark expressions in one effort. ## Dynamic Partition Pruning -Both Iceberg table scans and Parquet V1 native scans (`CometNativeScanExec`) support non-AQE Dynamic Partition Pruning -(DPP) filters generated by Spark's `PlanDynamicPruningFilters` optimizer rule ([#3349], [#3511]). 
However, Spark's -`PlanAdaptiveDynamicPruningFilters` optimizer rule runs after Comet's rules, so DPP with Adaptive Query Execution -requires a redesign of Comet's plan translation. This effort can be tracked at [#3510]. +Native Parquet scans (`CometNativeScanExec`) support Dynamic Partition Pruning (DPP) both with and without +Adaptive Query Execution. Non-AQE DPP landed in [#4011] and AQE DPP with broadcast reuse landed in [#4112]. +Iceberg native scans currently support non-AQE DPP only ([#3349], [#3511]); extending broadcast reuse to AQE +DPP for Iceberg is tracked at [#3510]. [#3349]: https://github.com/apache/datafusion-comet/pull/3349 [#3510]: https://github.com/apache/datafusion-comet/issues/3510 [#3511]: https://github.com/apache/datafusion-comet/pull/3511 +[#4011]: https://github.com/apache/datafusion-comet/pull/4011 +[#4112]: https://github.com/apache/datafusion-comet/pull/4112 ## TPC-H and TPC-DS Performance diff --git a/docs/source/contributor-guide/spark_expressions_support.md b/docs/source/contributor-guide/spark_expressions_support.md index 71f4310a8a..588ae5b457 100644 --- a/docs/source/contributor-guide/spark_expressions_support.md +++ b/docs/source/contributor-guide/spark_expressions_support.md @@ -356,15 +356,15 @@ - [x] `/` - [x] abs - [x] acos -- [ ] acosh +- [x] acosh - [x] asin -- [ ] asinh +- [x] asinh - [x] atan - [x] atan2 -- [ ] atanh +- [x] atanh - [x] bin - [ ] bround -- [ ] cbrt +- [x] cbrt - [x] ceil - [x] ceiling - [ ] conv @@ -372,7 +372,7 @@ - [x] cosh - [x] cot - [ ] csc -- [ ] degrees +- [x] degrees - [ ] div - [ ] e - [x] exp @@ -390,12 +390,12 @@ - [x] log2 - [x] mod - [x] negative -- [ ] pi +- [x] pi - [ ] pmod - [x] positive - [x] pow - [x] power -- [ ] radians +- [x] radians - [x] rand - [x] randn - [ ] random diff --git a/docs/source/contributor-guide/sql_error_propagation.md b/docs/source/contributor-guide/sql_error_propagation.md index 7becfabe75..a27408510d 100644 --- a/docs/source/contributor-guide/sql_error_propagation.md +++ b/docs/source/contributor-guide/sql_error_propagation.md @@ -398,8 +398,9 @@ def convertToSparkException(e: CometQueryExecutionException): Throwable = { ### `ShimSparkErrorConverter` calls the real Spark API -Because Spark's `QueryExecutionErrors` API changes between Spark versions (3.4, 3.5, 4.0), -there is a separate implementation per version (in `spark-3.4/`, `spark-3.5/`, `spark-4.0/`). +Because Spark's `QueryExecutionErrors` API changes between Spark versions (3.4, 3.5, and the 4.x line), +there is a separate implementation per branch (in `spark-3.4/`, `spark-3.5/`, and `spark-4.x/`, which is +shared by Spark 4.0, 4.1, and 4.2). ![Shim pattern for per-version Spark API bridging](./shim_pattern.svg) diff --git a/docs/source/index.md b/docs/source/index.md index 75c5db07bd..ba421a6036 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -40,21 +40,14 @@ Comet also accelerates Apache Iceberg, when performing Parquet scans from Spark. Comet delivers a performance speedup for many queries, enabling faster data processing and shorter time-to-insights. -The following chart shows the time it takes to run the 22 TPC-H queries against 100 GB of data in Parquet format -using a single executor with 8 cores. See the [Comet Benchmarking Guide](https://datafusion.apache.org/comet/contributor-guide/benchmarking.html) -for details of the environment used for these benchmarks. +The following charts demonstrate Comet accelerating TPC-H @ 1 TB. 
See the [Comet Benchmarking Guide](https://datafusion.apache.org/comet/contributor-guide/benchmarking.html) +for details. -When using Comet, the overall run time is reduced from 687 seconds to 302 seconds, a 2.2x speedup. - -![](_static/images/benchmark-results/0.11.0/tpch_allqueries.png) +![](_static/images/benchmark-results/0.15.0/tpch_allqueries.png) Here is a breakdown showing relative performance of Spark and Comet for each TPC-H query. -![](_static/images/benchmark-results/0.11.0/tpch_queries_compare.png) - -These benchmarks can be reproduced in any environment using the documentation in the -[Comet Benchmarking Guide](/contributor-guide/benchmarking.md). We encourage -you to run your own benchmarks. +![](_static/images/benchmark-results/0.15.0/tpch_queries_compare.png) ## Use Commodity Hardware @@ -68,12 +61,26 @@ Comet aims for 100% compatibility with all supported versions of Apache Spark, a your existing Spark deployments and workflows seamlessly. With no code changes required, you can immediately harness the benefits of Comet's acceleration capabilities without disrupting your Spark applications. +The project strives to keep feature parity with Apache Spark, that is, users should expect the same behavior (w.r.t +features, configurations, query results, etc) with Comet turned on or turned off in their Spark jobs. In addition, +the Comet extension automatically detects unsupported features and falls back to the Spark engine. + ## Tight Integration with Apache DataFusion Comet tightly integrates with the core Apache DataFusion project, leveraging its powerful execution engine. With seamless interoperability between Comet and DataFusion, you can achieve optimal performance and efficiency in your Spark workloads. +## Architecture + +The following diagram provides an overview of Comet's architecture. + +![Comet Overview](_static/images/comet-overview.png) + +The following diagram shows how Comet integrates with Apache Spark. + +![Comet System Diagram](_static/images/comet-system-diagram.png) + ## Active Community Comet boasts a vibrant and active community of developers, contributors, and users dedicated to advancing the @@ -86,8 +93,6 @@ To get started with Apache DataFusion Comet, follow the [DataFusion Slack and Discord channels](https://datafusion.apache.org/contributor-guide/communication.html) to connect with other users, ask questions, and share your experiences with Comet. -Follow [Apache DataFusion Comet Overview](https://datafusion.apache.org/comet/about/index.html) to get more detailed information - ## Contributing We welcome contributions from the community to help improve and enhance Apache DataFusion Comet. Whether it's fixing @@ -100,8 +105,8 @@ shaping the future of Comet. Check out our :caption: Index :hidden: -Comet Overview User Guide Contributor Guide +Comparison with Gluten ASF Links ``` diff --git a/docs/source/user-guide/index.md b/docs/source/user-guide/index.md index 8fe9f93286..65736c1d46 100644 --- a/docs/source/user-guide/index.md +++ b/docs/source/user-guide/index.md @@ -19,9 +19,17 @@ under the License. # Comet User Guide +The Comet user guide covers installation, configuration, supported data sources, supported operators +and expressions, and tuning advice for running Apache Spark with Comet acceleration. + +User guides are published for each release. The development snapshot tracks the upcoming release and +may include features and fixes that are not yet generally available. Select a version from the +navigation menu to view its guide. 
+ ```{toctree} :maxdepth: 2 :caption: User Guides +:hidden: 0.16.0-SNAPSHOT 0.15.x <0.15/index> diff --git a/docs/source/user-guide/latest/compatibility/expressions/index.md b/docs/source/user-guide/latest/compatibility/expressions/index.md index 9fbec44f0b..ba0c4b5d50 100644 --- a/docs/source/user-guide/latest/compatibility/expressions/index.md +++ b/docs/source/user-guide/latest/compatibility/expressions/index.md @@ -27,6 +27,7 @@ Compatibility notes are grouped by expression category: ```{toctree} :maxdepth: 1 +:hidden: aggregate array diff --git a/docs/source/user-guide/latest/compatibility/index.md b/docs/source/user-guide/latest/compatibility/index.md index 7bda0570d1..1ba9d9e181 100644 --- a/docs/source/user-guide/latest/compatibility/index.md +++ b/docs/source/user-guide/latest/compatibility/index.md @@ -32,6 +32,7 @@ This guide documents areas where Comet's behavior is known to differ from Spark. ```{toctree} :maxdepth: 1 +:hidden: scans floating-point diff --git a/docs/source/user-guide/latest/compatibility/scans.md b/docs/source/user-guide/latest/compatibility/scans.md index d68c59d562..d713dd4e8f 100644 --- a/docs/source/user-guide/latest/compatibility/scans.md +++ b/docs/source/user-guide/latest/compatibility/scans.md @@ -48,7 +48,6 @@ The following features are not supported by either scan implementation, and Come - Spark's Datasource V2 API. When `spark.sql.sources.useV1SourceList` does not include `parquet`, Spark uses the V2 API for Parquet scans. The DataFusion-based implementations only support the V1 API. - Spark metadata columns (e.g., `_metadata.file_path`) -- No support for AQE Dynamic Partition Pruning (DPP). Non-AQE DPP is supported. The following shared limitation may produce incorrect results without falling back to Spark: @@ -81,6 +80,22 @@ requires `spark.comet.exec.enabled=true` because the scan node must be wrapped b are detected at read time and raise a `SparkRuntimeException` with error class `_LEGACY_ERROR_TEMP_2093`, matching Spark's behavior. +The following `native_datafusion` limitations may produce incorrect results on Spark versions prior to 4.0 +without falling back to Spark: + +- Reading `TimestampLTZ` as `TimestampNTZ`. On Spark 3.x, Spark raises an error per + [SPARK-36182](https://issues.apache.org/jira/browse/SPARK-36182) because LTZ encodes UTC-adjusted instants + that cannot be safely reinterpreted as timezone-free values. Comet does not raise this error and instead + returns the raw UTC instant as a `TimestampNTZ` value. This applies to all LTZ physical encodings (INT96, + TIMESTAMP_MICROS, TIMESTAMP_MILLIS). On Spark 4.0+, this read is permitted + ([SPARK-47447](https://issues.apache.org/jira/browse/SPARK-47447)) and Comet matches Spark's behavior. + See [#4219](https://github.com/apache/datafusion-comet/issues/4219). + +- Unsupported Parquet type conversions. Spark raises schema incompatibility errors for certain conversions + (e.g., reading INT32 as BIGINT, reading BINARY as TIMESTAMP, unsupported decimal precision changes). + The `native_datafusion` scan may not detect these mismatches and could return unexpected values instead + of raising an error. See [#3720](https://github.com/apache/datafusion-comet/issues/3720). 
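+
+A minimal sketch that makes the `TimestampLTZ` difference observable on Spark 3.x (the path and the
+sample value are illustrative only):
+
+```scala
+// Write a timestamp column (stored with an LTZ physical encoding), then read it
+// back with a TimestampNTZ schema. Vanilla Spark 3.x rejects this read
+// (SPARK-36182); with the native_datafusion scan, Comet instead returns the raw
+// UTC instant as a TimestampNTZ value. Run in spark-shell, where
+// spark.implicits._ is already imported.
+import org.apache.spark.sql.types.{StructField, StructType, TimestampNTZType}
+
+Seq(java.sql.Timestamp.valueOf("2024-01-01 12:00:00")).toDF("ts")
+  .write.mode("overwrite").parquet("/tmp/ltz_test")
+spark.read.schema(StructType(Seq(StructField("ts", TimestampNTZType))))
+  .parquet("/tmp/ltz_test").show()
+```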
+ ## `native_iceberg_compat` Limitations The `native_iceberg_compat` scan has the following additional limitation that may produce incorrect results diff --git a/docs/source/user-guide/latest/compatibility/spark-versions.md b/docs/source/user-guide/latest/compatibility/spark-versions.md index 115b1595be..663be37a2a 100644 --- a/docs/source/user-guide/latest/compatibility/spark-versions.md +++ b/docs/source/user-guide/latest/compatibility/spark-versions.md @@ -28,10 +28,22 @@ compatibility guide. Spark 3.4.3 is supported with Java 11/17 and Scala 2.12/2.13. +### Known Limitations + +- **Reading `TimestampLTZ` as `TimestampNTZ`**: Spark 3.4 raises an error for this operation + (SPARK-36182), but Comet's `native_datafusion` scan silently returns the raw UTC value instead. + See [Parquet Compatibility](scans.md#native_datafusion-limitations) for details. + ## Spark 3.5 Spark 3.5.8 is supported with Java 11/17 and Scala 2.12/2.13. +### Known Limitations + +- **Reading `TimestampLTZ` as `TimestampNTZ`**: Spark 3.5 raises an error for this operation + (SPARK-36182), but Comet's `native_datafusion` scan silently returns the raw UTC value instead. + See [Parquet Compatibility](scans.md#native_datafusion-limitations) for details. + ## Spark 4.0 Spark 4.0.2 is supported with Java 17 and Scala 2.13. @@ -42,14 +54,9 @@ Spark 4.0.2 is supported with Java 17 and Scala 2.13. [#4051](https://github.com/apache/datafusion-comet/issues/4051)): Spark 4.0 introduced collation support. Non-default collated strings are not yet supported by Comet and will fall back to Spark. -## Spark 4.1 (Experimental) - -Spark 4.1.1 is provided as experimental support with Java 17 and Scala 2.13. +## Spark 4.1 -```{warning} -Spark 4.1 support is experimental and intended for development and testing only. It should not be used -in production. -``` +Spark 4.1.1 is supported with Java 17/21 and Scala 2.13. ### Known Limitations diff --git a/docs/source/user-guide/latest/datasources.md b/docs/source/user-guide/latest/datasources.md index bbf45dba96..065b719ba5 100644 --- a/docs/source/user-guide/latest/datasources.md +++ b/docs/source/user-guide/latest/datasources.md @@ -23,10 +23,9 @@ ### Parquet -When `spark.comet.scan.enabled` is enabled, Parquet scans will be performed natively by Comet if all data types -in the schema are supported. When this option is not enabled, the scan will fall back to Spark. In this case, -enabling `spark.comet.convert.parquet.enabled` will immediately convert the data into Arrow format, allowing native -execution to happen after that, but the process may not be efficient. +Parquet scans are performed natively by Comet if all data types in the schema are supported. When the scan +falls back to Spark, enabling `spark.comet.convert.parquet.enabled` will immediately convert the data into +Arrow format, allowing native execution to happen after that, but the process may not be efficient. ### Apache Iceberg @@ -182,7 +181,7 @@ the `object_store` crate's format. This implementation maintains compatibility with existing Hadoop S3A configurations, so existing code will continue to work as long as the configurations are supported and can be translated without loss of functionality. -#### Root CA Certificates +### Root CA Certificates One major difference between Spark and Comet is the mechanism for discovering Root CA Certificates. 
Spark uses the JVM to read CA Certificates from the Java Trust Store, but native Comet @@ -190,7 +189,7 @@ scans use system Root CA Certificates (typically stored in `/etc/ssl/certs` on Linux). These scans will not be able to interact with S3 if the Root CA Certificates are not installed. -#### Supported Credential Providers +### Supported Credential Providers AWS credential providers can be configured using the `fs.s3a.aws.credentials.provider` configuration. The following table shows the supported credential providers and their configuration options: @@ -209,7 +208,7 @@ AWS credential providers can be configured using the `fs.s3a.aws.credentials.pro Multiple credential providers can be specified in a comma-separated list using the `fs.s3a.aws.credentials.provider` configuration, just as Hadoop AWS supports. If `fs.s3a.aws.credentials.provider` is not configured, Hadoop S3A's default credential provider chain will be used. All configuration options also support bucket-specific overrides using the pattern `fs.s3a.bucket.{bucket-name}.{option}`. -#### Additional S3 Configuration Options +### Additional S3 Configuration Options Beyond credential providers, the `native_datafusion` and `native_iceberg_compat` implementations support additional S3 configuration options: @@ -223,7 +222,7 @@ S3 configuration options: All configuration options support bucket-specific overrides using the pattern `fs.s3a.bucket.{bucket-name}.{option}`. -#### Examples +### Examples The following examples demonstrate how to configure S3 access with the `native_datafusion` and `native_iceberg_compat` Parquet scan implementations using different authentication methods. @@ -256,7 +255,7 @@ $SPARK_HOME/bin/spark-shell \ ... ``` -#### Limitations +### Limitations The S3 support of `native_datafusion` and `native_iceberg_compat` has the following limitations: diff --git a/docs/source/user-guide/latest/iceberg.md b/docs/source/user-guide/latest/iceberg.md index cb6fdab2c9..5c63ae9ad6 100644 --- a/docs/source/user-guide/latest/iceberg.md +++ b/docs/source/user-guide/latest/iceberg.md @@ -25,13 +25,17 @@ Comet's native Iceberg reader relies on reflection to extract `FileScanTask`s fr then serialized to Comet's native execution engine (see [PR #2528](https://github.com/apache/datafusion-comet/pull/2528)). -The example below uses Spark's package downloader to retrieve Comet 0.14.0 and Iceberg +The example below uses Spark's package downloader to retrieve Comet $COMET_VERSION and Iceberg 1.8.1, but Comet has been tested with Iceberg 1.5, 1.7, 1.8, 1.9, and 1.10. The native Iceberg reader is enabled by default. To disable it, set `spark.comet.scan.icebergNative.enabled=false`. +The example uses the Spark 3.5 / Scala 2.12 build of Comet; substitute the Comet artifact +matching your Spark and Scala versions (Comet also ships Spark 3.5 / Scala 2.13 and Spark +4.0/4.1 / Scala 2.13 jars; see the [installation guide](installation.md) for the full list). 
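+
+For example, a Spark 4.0 / Scala 2.13 deployment would instead pass the following coordinate (an
+illustrative substitution; check Maven Central for the published artifacts, and note that the Iceberg
+runtime artifact must also match your Spark version):
+
+```shell
+--packages org.apache.datafusion:comet-spark-spark4.0_2.13:$COMET_VERSION
+```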
+ ```shell $SPARK_HOME/bin/spark-shell \ - --packages org.apache.datafusion:comet-spark-spark4.1_2.13:0.14.0,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1,org.apache.iceberg:iceberg-core:1.8.1 \ + --packages org.apache.datafusion:comet-spark-spark3.5_2.12:$COMET_VERSION,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1,org.apache.iceberg:iceberg-core:1.8.1 \ --repositories https://repo1.maven.org/maven2/ \ --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkCatalog \ @@ -39,7 +43,6 @@ $SPARK_HOME/bin/spark-shell \ --conf spark.sql.catalog.spark_catalog.warehouse=/tmp/warehouse \ --conf spark.plugins=org.apache.spark.CometPlugin \ --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \ - --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions \ --conf spark.comet.explainFallback.enabled=true \ --conf spark.memory.offHeap.enabled=true \ --conf spark.memory.offHeap.size=2g @@ -106,7 +109,7 @@ configure Spark to use a REST catalog with Comet's native Iceberg scan: ```shell $SPARK_HOME/bin/spark-shell \ - --packages org.apache.datafusion:comet-spark-spark4.1_2.13:0.14.0,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1,org.apache.iceberg:iceberg-core:1.8.1 \ + --packages org.apache.datafusion:comet-spark-spark3.5_2.12:$COMET_VERSION,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1,org.apache.iceberg:iceberg-core:1.8.1 \ --repositories https://repo1.maven.org/maven2/ \ --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ --conf spark.sql.catalog.rest_cat=org.apache.iceberg.spark.SparkCatalog \ @@ -115,7 +118,6 @@ $SPARK_HOME/bin/spark-shell \ --conf spark.sql.catalog.rest_cat.warehouse=/tmp/warehouse \ --conf spark.plugins=org.apache.spark.CometPlugin \ --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \ - --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions \ --conf spark.comet.explainFallback.enabled=true \ --conf spark.memory.offHeap.enabled=true \ --conf spark.memory.offHeap.size=2g @@ -141,6 +143,8 @@ The following scenarios will fall back to Spark's native Iceberg reader: - Scans with residual filters using `truncate`, `bucket`, `year`, `month`, `day`, or `hour` transform functions (partition pruning still works, but row-level filtering of these transforms falls back) +- Dynamic Partition Pruning under Adaptive Query Execution (non-AQE DPP is supported); + see [#3510](https://github.com/apache/datafusion-comet/issues/3510) ### Task input metrics diff --git a/docs/source/user-guide/latest/index.rst b/docs/source/user-guide/latest/index.rst index 480ec4f702..314a0a51bd 100644 --- a/docs/source/user-guide/latest/index.rst +++ b/docs/source/user-guide/latest/index.rst @@ -22,10 +22,20 @@ Comet $COMET_VERSION User Guide ================================ +This guide covers Comet $COMET_VERSION: how to install it, build it from source, configure it for +your Spark deployment, and get the best results from it. It also documents the data sources, data +types, operators, and expressions that Comet supports, along with a compatibility guide describing +known differences from Apache Spark. + +Operational topics include reading and understanding Comet query plans, tuning, available metrics, +and integration guides for Apache Iceberg and Kubernetes. Select a topic from the navigation menu +to read more. 
+ .. _toc.user-guide-links-$COMET_VERSION: .. toctree:: :maxdepth: 1 :caption: Comet $COMET_VERSION User Guide + :hidden: Installing Comet Building From Source diff --git a/docs/source/user-guide/latest/installation.md b/docs/source/user-guide/latest/installation.md index 3da84e210d..4b7717b688 100644 --- a/docs/source/user-guide/latest/installation.md +++ b/docs/source/user-guide/latest/installation.md @@ -25,8 +25,14 @@ Make sure the following requirements are met and software installed on your mach ### Supported Operating Systems -- Linux -- Apple macOS (Intel and Apple Silicon) +The published Comet jar files in Maven Central bundle native libraries for Linux only (amd64 and arm64). macOS +users must [build from source](source.md). + +| Operating System | Published Maven Jars | Build from Source | +| --------------------------- | -------------------- | ----------------- | +| Linux (amd64) | Yes | Yes | +| Linux (arm64) | Yes | Yes | +| Apple macOS (Apple Silicon) | No | Yes | ### Supported Spark Versions @@ -34,7 +40,7 @@ Comet $COMET_VERSION supports the following versions of Apache Spark. Refer to t in the [Compatibility Guide] for more information, such as known limitations per Spark version. [Spark Version Compatibility]: compatibility/spark-versions.md -[Compatibility Guide]: compatibility +[Compatibility Guide]: compatibility/index.md We recommend only using Comet with Spark versions where we currently have both Comet and Spark tests enabled in CI. Other versions may work well enough for development and evaluation purposes. @@ -44,6 +50,7 @@ Other versions may work well enough for development and evaluation purposes. | 3.4.3 | 11/17 | 2.12/2.13 | Yes | Yes | | 3.5.8 | 11/17 | 2.12/2.13 | Yes | Yes | | 4.0.2 | 17/21 | 2.13 | Yes | Yes | +| 4.1.1 | 17/21 | 2.13 | Yes | Yes | Note that we do not test the full matrix of supported Java and Scala versions in CI for every Spark version. @@ -52,7 +59,6 @@ use only and should not be used in production yet. | Spark Version | Java Version | Scala Version | Comet Tests in CI | Spark SQL Tests in CI | | -------------- | ------------ | ------------- | ----------------- | --------------------- | -| 4.1.1 | 17 | 2.13 | Yes | No | | 4.2.0-preview4 | 17 | 2.13 | No | No | Note that Comet may not fully work with proprietary forks of Apache Spark such as the Spark versions offered by @@ -86,7 +92,6 @@ Here are the direct links for downloading the Comet $COMET_VERSION jar file. 
- [Comet plugin for Spark 3.5 / Scala 2.13](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark3.5_2.13/$COMET_VERSION/comet-spark-spark3.5_2.13-$COMET_VERSION.jar) - [Comet plugin for Spark 4.0 / Scala 2.13](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark4.0_2.13/$COMET_VERSION/comet-spark-spark4.0_2.13-$COMET_VERSION.jar) - [Comet plugin for Spark 4.1 / Scala 2.13](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark4.1_2.13/$COMET_VERSION/comet-spark-spark4.1_2.13-$COMET_VERSION.jar) -- [Comet plugin for Spark 4.2 / Scala 2.13 (Experimental)](https://repo1.maven.org/maven2/org/apache/datafusion/comet-spark-spark4.2_2.13/$COMET_VERSION/comet-spark-spark4.2_2.13-$COMET_VERSION.jar) ## Building from source @@ -115,7 +120,7 @@ $SPARK_HOME/bin/spark-shell \ --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \ --conf spark.comet.explainFallback.enabled=true \ --conf spark.memory.offHeap.enabled=true \ - --conf spark.memory.offHeap.size=16g + --conf spark.memory.offHeap.size=4g ``` ### Verify Comet enabled for Spark SQL query @@ -126,6 +131,16 @@ Create a test Parquet source scala> (0 until 10).toDF("a").write.mode("overwrite").parquet("/tmp/test") ``` +Comet will log output similar to: + +```shell +INFO core/src/lib.rs: Comet native library version $COMET_VERSION initialized +WARN CometExecRule: Comet cannot execute some parts of this plan natively (set spark.comet.explainFallback.enabled=false to disable this logging): + Execute InsertIntoHadoopFsRelationCommand [COMET: Native support for operator DataWritingCommandExec is disabled. Set spark.comet.parquet.write.enabled=true to enable it.] ++- WriteFiles + +- LocalTableScan [COMET: Native support for operator LocalTableScanExec is disabled. Set spark.comet.exec.localTableScan.enabled=true to enable it.] +``` + Query the data from the test source and check: - INFO message shows the native Comet library has been initialized. @@ -134,24 +149,15 @@ Query the data from the test source and check: ```scala scala> spark.read.parquet("/tmp/test").createOrReplaceTempView("t1") scala> spark.sql("select * from t1 where a > 5").explain -INFO src/lib.rs: Comet native library initialized -== Physical Plan == - *(1) ColumnarToRow - +- CometFilter [a#14], (isnotnull(a#14) AND (a#14 > 5)) - +- CometScan parquet [a#14] Batched: true, DataFilters: [isnotnull(a#14), (a#14 > 5)], - Format: CometParquet, Location: InMemoryFileIndex(1 paths)[file:/tmp/test], PartitionFilters: [], - PushedFilters: [IsNotNull(a), GreaterThan(a,5)], ReadSchema: struct ``` -With the configuration `spark.comet.explainFallback.enabled=true`, Comet will log any reasons that prevent a plan from -being executed natively. 
+Comet will log output similar to:
 
-```scala
-scala> Seq(1,2,3,4).toDF("a").write.parquet("/tmp/test.parquet")
-WARN CometSparkSessionExtensions$CometExecRule: Comet cannot execute some parts of this plan natively because:
-  - LocalTableScan is not supported
-  - WriteFiles is not supported
-  - Execute InsertIntoHadoopFsRelationCommand is not supported
+```shell
+== Physical Plan ==
+CometNativeColumnarToRow
++- CometFilter [a#6], (isnotnull(a#6) AND (a#6 > 5))
+   +- CometNativeScan parquet [a#6] Batched: true, DataFilters: [isnotnull(a#6), (a#6 > 5)], Format: CometParquet, Location: InMemoryFileIndex(1 paths)[file:/tmp/test], PartitionFilters: [], PushedFilters: [IsNotNull(a), GreaterThan(a,5)], ReadSchema: struct
 ```
 
 ## Additional Configuration
 
@@ -160,7 +166,7 @@ Depending on your deployment mode you may also need to set the driver & executor class paths to
 explicitly contain Comet otherwise Spark may use a different class-loader for the Comet components than its
 internal components which will then fail at runtime. For example:
 
-```
+```shell
 --driver-class-path spark/target/comet-spark-spark4.1_2.13-$COMET_VERSION.jar
 ```
diff --git a/docs/source/user-guide/latest/kubernetes.md b/docs/source/user-guide/latest/kubernetes.md
index fd84b7ad9b..c06c469f32 100644
--- a/docs/source/user-guide/latest/kubernetes.md
+++ b/docs/source/user-guide/latest/kubernetes.md
@@ -69,13 +69,13 @@ metadata:
 spec:
   type: Scala
   mode: cluster
-  image: apache/datafusion-comet:0.7.0-spark4.1.1-scala2.13-java17
+  image: apache/datafusion-comet:$COMET_VERSION-spark3.5.5-scala2.12-java11
   imagePullPolicy: IfNotPresent
   mainClass: org.apache.spark.examples.SparkPi
   mainApplicationFile: local:///opt/spark/examples/jars/spark-examples_2.13-4.1.1.jar
   sparkConf:
-    "spark.executor.extraClassPath": "/opt/spark/jars/comet-spark-spark4.1_2.13-0.7.0.jar"
-    "spark.driver.extraClassPath": "/opt/spark/jars/comet-spark-spark4.1_2.13-0.7.0.jar"
+    "spark.executor.extraClassPath": "/opt/spark/jars/comet-spark-spark3.5_2.12-$COMET_VERSION.jar"
+    "spark.driver.extraClassPath": "/opt/spark/jars/comet-spark-spark3.5_2.12-$COMET_VERSION.jar"
     "spark.plugins": "org.apache.spark.CometPlugin"
     "spark.comet.enabled": "true"
     "spark.comet.exec.enabled": "true"
diff --git a/docs/source/user-guide/latest/source.md b/docs/source/user-guide/latest/source.md
index 6ae43be56a..841c4e2084 100644
--- a/docs/source/user-guide/latest/source.md
+++ b/docs/source/user-guide/latest/source.md
@@ -23,6 +23,15 @@
 It is sometimes preferable to build from source for a specific platform.
 
 ## Using a Published Source Release
+
+
+This documentation is for the current development version of Comet. Published source releases are only available for released versions.
+To use this version of Comet, see the following section on building from the GitHub repository.
+
+<!--
+
 Official source releases can be downloaded from https://dist.apache.org/repos/dist/release/datafusion/
 
 ```console
...
```
 
 Build
 
 ```
 make release-nogit PROFILES="-Pspark-4.1"
 ```
 
+-->
+
 ## Building from the GitHub repository
 
 Clone the repository:
 
diff --git a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala
index e35f90e5c7..870fb5e47d 100644
--- a/spark/src/main/scala/org/apache/comet/GenerateDocs.scala
+++ b/spark/src/main/scala/org/apache/comet/GenerateDocs.scala
@@ -237,7 +237,7 @@ object GenerateDocs {
       compat.nonEmpty || incompat.nonEmpty || unsupported.nonEmpty
     }
     for ((name, compat, incompat, unsupported) <- sorted) {
-      w.write(s"\n### $name\n".getBytes)
+      w.write(s"\n## $name\n".getBytes)
       if (compat.nonEmpty) {
         w.write(
           ("\nThe following differences from Spark are always present and do not require" +