diff --git a/.claude/skills/rationalize-deps/SKILL.md b/.claude/skills/rationalize-deps/SKILL.md new file mode 100644 index 00000000000..829a70c67ec --- /dev/null +++ b/.claude/skills/rationalize-deps/SKILL.md @@ -0,0 +1,125 @@ +--- +name: rationalize-deps +description: Analyze Cargo.toml dependencies and attempt to remove unused features to reduce compile times and binary size +--- + +# Rationalize Dependencies + +This skill analyzes Cargo.toml dependencies to identify and remove unused features. + +## Overview + +Many crates enable features by default that may not be needed. This skill: +1. Identifies dependencies with default features enabled +2. Tests if `default-features = false` works +3. Identifies which specific features are actually needed +4. Verifies compilation after changes + +## Step 1: Identify the target + +Ask the user which crate(s) to analyze: +- A specific crate name (e.g., "tokio", "serde") +- A specific workspace member (e.g., "quickwit-search") +- "all" to scan the entire workspace + +## Step 2: Analyze current dependencies + +For the workspace Cargo.toml (`quickwit/Cargo.toml`), list dependencies that: +- Do NOT have `default-features = false` +- Have default features that might be unnecessary + +Run: `cargo tree -p <crate> -f "{p} {f}" --edges features` to see what features are actually used. 
+ +## Step 3: For each candidate dependency + +### 3a: Check the crate's default features + +Look up the crate on crates.io or check its Cargo.toml to understand: +- What features are enabled by default +- What each feature provides + +Use: `cargo metadata --format-version=1 | jq '.packages[] | select(.name == "<crate>") | .features'` + +### 3b: Try disabling default features + +Modify the dependency in `quickwit/Cargo.toml`: + +From: +```toml +some-crate = { version = "1.0" } +``` + +To: +```toml +some-crate = { version = "1.0", default-features = false } +``` + +### 3c: Run cargo check + +Run: `cargo check --workspace` (or target specific packages for faster feedback) + +If compilation fails: +1. Read the error messages to identify which features are needed +2. Add only the required features explicitly: + ```toml + some-crate = { version = "1.0", default-features = false, features = ["needed-feature"] } + ``` +3. Re-run cargo check + +### 3d: Binary search for minimal features + +If there are many default features, use binary search: +1. Start with no features +2. If it fails, add half the default features +3. 
Continue until you find the minimal set + +## Step 4: Document findings + +For each dependency analyzed, report: +- Original configuration +- New configuration (if changed) +- Features that were removed +- Any features that are required + +## Step 5: Verify full build + +After all changes, run: +```bash +cargo check --workspace --all-targets +cargo test --workspace --no-run +``` + +## Common Patterns + +### Serde +Often only needs `derive`: +```toml +serde = { version = "1.0", default-features = false, features = ["derive", "std"] } +``` + +### Tokio +Identify which runtime features are actually used: +```toml +tokio = { version = "1.0", default-features = false, features = ["rt-multi-thread", "macros", "sync"] } +``` + +### Reqwest +Often doesn't need all TLS backends: +```toml +reqwest = { version = "0.11", default-features = false, features = ["rustls-tls", "json"] } +``` + +## Rollback + +If changes cause issues: +```bash +git checkout quickwit/Cargo.toml +cargo check --workspace +``` + +## Tips + +- Start with large crates that have many default features (tokio, reqwest, hyper) +- Use `cargo bloat --crates` to identify large dependencies +- Check `cargo tree -d` for duplicate dependencies that might indicate feature conflicts +- Some features are needed only for tests - consider using `[dev-dependencies]` features diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7af5fbda950..d72638ba08e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -78,6 +78,7 @@ jobs: - quickwit/**/*.rs - quickwit/**/*.toml - quickwit/**/*.proto + - quickwit/Cargo.lock - quickwit/rest-api-tests/** - .github/workflows/ci.yml - name: Setup stable Rust Toolchain @@ -131,6 +132,7 @@ jobs: - quickwit/**/*.rs - quickwit/**/*.toml - quickwit/**/*.proto + - quickwit/Cargo.lock - .github/workflows/ci.yml - name: Install Ubuntu packages if: always() && steps.modified.outputs.rust_src == 'true' @@ -188,28 +190,3 @@ jobs: if: always() && 
steps.modified.outputs.rust_src == 'true' run: cargo +nightly fmt --all -- --check working-directory: ./quickwit - - thirdparty-license: - name: Check Datadog third-party license file - runs-on: ubuntu-latest - permissions: - contents: read - actions: write - steps: - - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - - name: Install Rust toolchain - uses: dtolnay/rust-toolchain@f7ccc83f9ed1e5b9c81d8a67d7ad1a747e22a561 # master - with: - toolchain: stable - - - name: Cache cargo tools - uses: actions/cache@9255dc7a253b0ccc959486e2bca901246202afeb # v5.0.1 - with: - path: ~/.cargo/bin - key: ${{ runner.os }}-cargo-tools-${{ hashFiles('**/Cargo.lock') }} - - - name: Install dd-rust-license-tool - run: dd-rust-license-tool --help || cargo install --git https://github.com/DataDog/rust-license-tool.git --force - - - name: Check Datadog third-party license file - run: dd-rust-license-tool --config quickwit/license-tool.toml --manifest-path quickwit/Cargo.toml check diff --git a/LICENSE-3rdparty.csv b/LICENSE-3rdparty.csv index 13904cb90c2..ed79fbdb132 100644 --- a/LICENSE-3rdparty.csv +++ b/LICENSE-3rdparty.csv @@ -52,7 +52,6 @@ base16ct,https://github.com/RustCrypto/formats/tree/master/base16ct,Apache-2.0 O base64,https://github.com/marshallpierce/rust-base64,MIT OR Apache-2.0,Marshall Pierce base64-simd,https://github.com/Nugine/simd,MIT,The base64-simd Authors base64ct,https://github.com/RustCrypto/formats,Apache-2.0 OR MIT,RustCrypto Developers -bincode,https://github.com/servo/bincode,MIT,"Ty Overby , Francesco Mazzoli , David Tolnay , Zoey Riordan " bit-set,https://github.com/contain-rs/bit-set,Apache-2.0 OR MIT,Alexis Beingessner bit-vec,https://github.com/contain-rs/bit-vec,Apache-2.0 OR MIT,Alexis Beingessner bitflags,https://github.com/bitflags/bitflags,MIT OR Apache-2.0,The Rust Project Developers @@ -104,8 +103,6 @@ crossbeam-utils,https://github.com/crossbeam-rs/crossbeam,MIT OR Apache-2.0,The 
crunchy,https://github.com/eira-fransham/crunchy,MIT,Eira Fransham crypto-bigint,https://github.com/RustCrypto/crypto-bigint,Apache-2.0 OR MIT,RustCrypto Developers crypto-common,https://github.com/RustCrypto/traits,MIT OR Apache-2.0,RustCrypto Developers -csv,https://github.com/BurntSushi/rust-csv,Unlicense OR MIT,Andrew Gallant -csv-core,https://github.com/BurntSushi/rust-csv,Unlicense OR MIT,Andrew Gallant darling,https://github.com/TedDriggs/darling,MIT,Ted Driggs darling_core,https://github.com/TedDriggs/darling,MIT,Ted Driggs darling_macro,https://github.com/TedDriggs/darling,MIT,Ted Driggs @@ -130,15 +127,7 @@ elliptic-curve,https://github.com/RustCrypto/traits/tree/master/elliptic-curve,A embedded-io,https://github.com/embassy-rs/embedded-io,MIT OR Apache-2.0,The embedded-io Authors embedded-io,https://github.com/rust-embedded/embedded-hal,MIT OR Apache-2.0,The embedded-io Authors encode_unicode,https://github.com/tormol/encode_unicode,Apache-2.0 OR MIT,Torbjørn Birch Moltu -encoding,https://github.com/lifthrasiir/rust-encoding,MIT,Kang Seonghoon -encoding-index-japanese,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding-index-korean,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding-index-simpchinese,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding-index-singlebyte,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding-index-tradchinese,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding_index_tests,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon encoding_rs,https://github.com/hsivonen/encoding_rs,(Apache-2.0 OR MIT) AND BSD-3-Clause,Henri Sivonen -encoding_rs_io,https://github.com/BurntSushi/encoding_rs_io,MIT OR Apache-2.0,Andrew Gallant enum-iterator,https://github.com/stephaneyfx/enum-iterator,0BSD,Stephane Raux enum-iterator-derive,https://github.com/stephaneyfx/enum-iterator,0BSD,Stephane Raux 
env_filter,https://github.com/rust-cli/env_logger,MIT OR Apache-2.0,The env_filter Authors @@ -150,7 +139,6 @@ fail,https://github.com/tikv/fail-rs,Apache-2.0,The TiKV Project Developers fastdivide,https://github.com/fulmicoton/fastdivide,zlib-acknowledgement OR MIT,Paul Masurel fastrand,https://github.com/smol-rs/fastrand,Apache-2.0 OR MIT,Stjepan Glavina ff,https://github.com/zkcrypto/ff,MIT OR Apache-2.0,"Sean Bowe , Jack Grigg " -filetime,https://github.com/alexcrichton/filetime,MIT OR Apache-2.0,Alex Crichton find-msvc-tools,https://github.com/rust-lang/cc-rs,MIT OR Apache-2.0,The find-msvc-tools Authors fixedbitset,https://github.com/petgraph/fixedbitset,MIT OR Apache-2.0,bluss flate2,https://github.com/rust-lang/flate2-rs,MIT OR Apache-2.0,"Alex Crichton , Josh Triplett " @@ -224,8 +212,6 @@ is-terminal,https://github.com/sunfishcode/is-terminal,MIT,"softprops -jiff,https://github.com/BurntSushi/jiff,Unlicense OR MIT,Andrew Gallant -jiff-static,https://github.com/BurntSushi/jiff,Unlicense OR MIT,Andrew Gallant jobserver,https://github.com/rust-lang/jobserver-rs,MIT OR Apache-2.0,Alex Crichton js-sys,https://github.com/wasm-bindgen/wasm-bindgen/tree/master/crates/js-sys,MIT OR Apache-2.0,The wasm-bindgen Developers json_comments,https://github.com/tmccombs/json-comments-rs,Apache-2.0,Thayne McCombs @@ -233,19 +219,6 @@ lazy_static,https://github.com/rust-lang-nursery/lazy-static.rs,MIT OR Apache-2. 
levenshtein_automata,https://github.com/tantivy-search/levenshtein-automata,MIT,Paul Masurel libc,https://github.com/rust-lang/libc,MIT OR Apache-2.0,The Rust Project Developers libm,https://github.com/rust-lang/compiler-builtins,MIT,Jorge Aparicio -libredox,https://gitlab.redox-os.org/redox-os/libredox,MIT,4lDO2 <4lDO2@protonmail.com> -lindera-cc-cedict,https://github.com/lindera-morphology/lindera,MIT,The lindera-cc-cedict Authors -lindera-cc-cedict-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-cc-cedict-builder Authors -lindera-core,https://github.com/lindera-morphology/lindera,MIT,The lindera-core Authors -lindera-decompress,https://github.com/lindera-morphology/lindera,MIT,The lindera-decompress Authors -lindera-dictionary,https://github.com/lindera-morphology/lindera,MIT,The lindera-dictionary Authors -lindera-ipadic,https://github.com/lindera-morphology/lindera,MIT,The lindera-ipadic Authors -lindera-ipadic-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-ipadic-builder Authors -lindera-ipadic-neologd-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-ipadic-neologd-builder Authors -lindera-ko-dic,https://github.com/lindera-morphology/lindera,MIT,The lindera-ko-dic Authors -lindera-ko-dic-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-ko-dic-builder Authors -lindera-tokenizer,https://github.com/lindera-morphology/lindera,MIT,The lindera-tokenizer Authors -lindera-unidic-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-unidic-builder Authors linked-hash-map,https://github.com/contain-rs/linked-hash-map,MIT OR Apache-2.0,"Stepan Koltsov , Andrew Paseltiner " linux-raw-sys,https://github.com/sunfishcode/linux-raw-sys,Apache-2.0 WITH LLVM-exception OR Apache-2.0 OR MIT,Dan Gohman litemap,https://github.com/unicode-org/icu4x,Unicode-3.0,The ICU4X Project Developers @@ -330,7 +303,6 @@ pnet_packet,https://github.com/libpnet/libpnet,MIT OR Apache-2.0,Robert 
Clipsham pnet_sys,https://github.com/libpnet/libpnet,MIT OR Apache-2.0,"Robert Clipsham , Linus Färnstrand " pnet_transport,https://github.com/libpnet/libpnet,MIT OR Apache-2.0,Robert Clipsham portable-atomic,https://github.com/taiki-e/portable-atomic,Apache-2.0 OR MIT,The portable-atomic Authors -portable-atomic-util,https://github.com/taiki-e/portable-atomic,Apache-2.0 OR MIT,The portable-atomic-util Authors postcard,https://github.com/jamesmunns/postcard,MIT OR Apache-2.0,James Munns potential_utf,https://github.com/unicode-org/icu4x,Unicode-3.0,The ICU4X Project Developers powerfmt,https://github.com/jhpratt/powerfmt,MIT OR Apache-2.0,Jacob Pratt @@ -353,8 +325,6 @@ prost,https://github.com/tokio-rs/prost,Apache-2.0,"Dan Burkert , Lucio Franco , Casper Meijn , Tokio Contributors " prost-derive,https://github.com/tokio-rs/prost,Apache-2.0,"Dan Burkert , Lucio Franco , Casper Meijn , Tokio Contributors " prost-types,https://github.com/tokio-rs/prost,Apache-2.0,"Dan Burkert , Lucio Franco , Casper Meijn , Tokio Contributors " -protobuf,https://github.com/stepancheg/rust-protobuf,MIT,Stepan Koltsov -protobuf-support,https://github.com/stepancheg/rust-protobuf,MIT,Stepan Koltsov pulldown-cmark,https://github.com/raphlinus/pulldown-cmark,MIT,"Raph Levien , Marcus Klaas de Vries " pulldown-cmark-to-cmark,https://github.com/Byron/pulldown-cmark-to-cmark,Apache-2.0,"Sebastian Thiel , Dylan Owen , Alessandro Ogier , Zixian Cai <2891235+caizixian@users.noreply.github.com>, Andrew Lyjak " quanta,https://github.com/metrics-rs/quanta,MIT,Toby Lawrence @@ -388,7 +358,6 @@ roxmltree,https://github.com/RazrFalcon/roxmltree,MIT OR Apache-2.0,Evgeniy Reiz rust-embed,https://pyrossh.dev/repos/rust-embed,MIT,pyrossh rust-embed-impl,https://pyrossh.dev/repos/rust-embed,MIT,pyrossh rust-embed-utils,https://pyrossh.dev/repos/rust-embed,MIT,pyrossh -rust-stemmers,https://github.com/CurrySoftware/rust-stemmers,MIT OR BSD-3-Clause,"Jakob Demler , CurrySoftware " 
rustc-hash,https://github.com/rust-lang/rustc-hash,Apache-2.0 OR MIT,The Rust Project Developers rustix,https://github.com/bytecodealliance/rustix,Apache-2.0 WITH LLVM-exception OR Apache-2.0 OR MIT,"Dan Gohman , Jakub Konka " rustls,https://github.com/rustls/rustls,Apache-2.0 OR ISC OR MIT,The rustls Authors @@ -448,8 +417,6 @@ syn,https://github.com/dtolnay/syn,MIT OR Apache-2.0,David Tolnay synstructure,https://github.com/mystor/synstructure,MIT,Nika Layzell sysinfo,https://github.com/GuillaumeGomez/sysinfo,MIT,Guillaume Gomez -system-configuration,https://github.com/mullvad/system-configuration-rs,MIT OR Apache-2.0,Mullvad VPN -system-configuration-sys,https://github.com/mullvad/system-configuration-rs,MIT OR Apache-2.0,Mullvad VPN tabled,https://github.com/zhiburt/tabled,MIT,Maxim Zhiburt tabled_derive,https://github.com/zhiburt/tabled,MIT,Maxim Zhiburt tantivy,https://github.com/quickwit-oss/tantivy,MIT,Paul Masurel @@ -545,7 +512,6 @@ wasmtimer,https://github.com/whizsid/wasmtimer-rs,MIT,"WhizSid web-sys,https://github.com/wasm-bindgen/wasm-bindgen/tree/master/crates/web-sys,MIT OR Apache-2.0,The wasm-bindgen Developers web-time,https://github.com/daxpedda/web-time,MIT OR Apache-2.0,The web-time Authors webpki-roots,https://github.com/rustls/webpki-roots,CDLA-Permissive-2.0,The webpki-roots Authors -whichlang,https://github.com/quickwit-oss/whichlang,MIT,"Quickwit, Inc. 
" winapi,https://github.com/retep998/winapi-rs,MIT,Peter Atashian winapi,https://github.com/retep998/winapi-rs,MIT OR Apache-2.0,Peter Atashian winapi-i686-pc-windows-gnu,https://github.com/retep998/winapi-rs,MIT OR Apache-2.0,Peter Atashian @@ -561,7 +527,6 @@ windows-interface,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,The windows-link,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,Microsoft windows-link,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,The windows-link Authors windows-numerics,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,The windows-numerics Authors -windows-registry,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,The windows-registry Authors windows-result,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,Microsoft windows-result,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,The windows-result Authors windows-strings,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,Microsoft @@ -590,9 +555,7 @@ windows_x86_64_msvc,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,Th winnow,https://github.com/winnow-rs/winnow,MIT,The winnow Authors wit-bindgen,https://github.com/bytecodealliance/wit-bindgen,Apache-2.0 WITH LLVM-exception OR Apache-2.0 OR MIT,Alex Crichton writeable,https://github.com/unicode-org/icu4x,Unicode-3.0,The ICU4X Project Developers -xattr,https://github.com/Stebalien/xattr,MIT OR Apache-2.0,Steven Allen xmlparser,https://github.com/RazrFalcon/xmlparser,MIT OR Apache-2.0,Yevhenii Reizner -yada,https://github.com/takuyaa/yada,MIT OR Apache-2.0,Takuya Asano yansi,https://github.com/SergioBenitez/yansi,MIT OR Apache-2.0,Sergio Benitez yoke,https://github.com/unicode-org/icu4x,Unicode-3.0,Manish Goregaokar yoke-derive,https://github.com/unicode-org/icu4x,Unicode-3.0,Manish Goregaokar diff --git a/docs/configuration/index-config.md b/docs/configuration/index-config.md index 24ce8677902..c8f26ded709 100644 --- a/docs/configuration/index-config.md +++ 
b/docs/configuration/index-config.md @@ -94,6 +94,7 @@ The doc mapping defines how a document and the fields it contains are stored and | `tag_fields` | Collection of fields* explicitly defined in `field_mappings` whose values will be stored as part of the `tags` metadata. Allowed types are: `text` (with raw tokenizer), `i64` and `u64`. [Learn more about tags](../overview/concepts/querying.md#tag-pruning). | `[]` | | `store_source` | Whether or not the original JSON document is stored or not in the index. | `false` | | `timestamp_field` | Timestamp field* used for sharding documents in splits. The field has to be of type `datetime`. [Learn more about time sharding](./../overview/architecture.md). | `None` | +| `indexation_time_field` | Field that will hold the indexation time of the document. This field is populated during indexation. The field has to be of type `datetime`. | `None` | | `partition_key` | If set, quickwit will route documents into different splits depending on the field name declared as the `partition_key`. | `null` | | `max_num_partitions` | Limits the number of splits created through partitioning. (See [Partitioning](../overview/concepts/querying.md#partitioning)) | `200` | | `index_field_presence` | `exists` queries are enabled automatically for fast fields. To enable it for all other fields set this parameter to `true`. Enabling it can have a significant CPU-cost on indexing. | false | diff --git a/docs/reference/es_compatible_api.md b/docs/reference/es_compatible_api.md index 32cbdafd761..885ac39e67b 100644 --- a/docs/reference/es_compatible_api.md +++ b/docs/reference/es_compatible_api.md @@ -187,11 +187,12 @@ It is also possible to not supply an order and rely on the default order using t } ``` -If no format is provided for timestamps, timestamps are returned with milliseconds precision. - -If you need nanosecond precision, you can use the `epoch_nanos_int` format. 
Beware this means the resulting -JSON may contain high numbers for which there is loss of precision when using languages where all numbers are -floats, such as JavaScript. +Fields explicitly specified as `datetime` in the doc mapping also support an +output format. If no format is provided, timestamps are returned with +milliseconds precision. If you need nanosecond precision, you can use the +`epoch_nanos_int` format. Beware, this means the resulting JSON may contain high +numbers for which there is loss of precision when using languages where all +numbers are floats, such as JavaScript. ```json { @@ -237,6 +238,40 @@ You can pass the `sort` value of the last hit in a subsequent request where othe This allows you to paginate your results. + +#### Note regarding multi-type pagination + +Pagination can get tricky on fields that have multiple types. In dynamic fields, multiple column types can be present for a given field within a single split. When using doc mapping updates, any type combination can be present across splits. + +First, let's take a look at the various type systems we are working with. + +The JSON representation used for the sort values provides the following primitive types: +- numerical +- bool +- string + +Tantivy uses the following types: +- i64 / u64 / f64 (only one of these can be present in a split) +- datetime +- string +- bool +- ip (not supported in sort yet) +- bytes (not supported in sort yet) + +Elasticsearch can represent date field sort values in various formats. In Quickwit, only integer formats are supported (millisecond or nanosecond). Either way, the fact that datetime can live alongside another type inside a split yields unreliable pagination: +- Because there isn't a simple and efficient common representation in the fast field u64 space, it's hard to represent datetime within the numerical (i64/u64/f64) order. 
+- To paginate separately across numerical and datetime types a strongly typed representation of the json sort key would be necessary. + +The current implementation does the following: +- If the mapping is explicitly set to datetime and never changed, pagination works as expected. +- If the mapping evolved to datetime, pagination fails for splits that contain numerical values (i64, u64, f64 columns). +- If the mapping is a json/dynamic field, pagination fails for splits that contain a datetime column. This can happen because on JSON field Tantivy automatically stores RFC3339 date strings in a datetime column. +- If other types are mixed, the sort will iterate over all values type by type + - Asc: numeric -> string -> boolean -> datetime -> null + - Desc: datetime -> boolean -> string -> numeric -> null +- Quickwit used to support specifying numbers as string in the search after value. That isn't possible anymore. + + ### `_msearch`   Multi search API ``` diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index a0f47b86c7d..79b23eef0b1 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -51,7 +51,7 @@ checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" dependencies = [ "cfg-if", "cipher", - "cpufeatures", + "cpufeatures 0.2.17", ] [[package]] @@ -1186,15 +1186,6 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d809780667f4410e7c41b07f52439b94d2bdf8528eeedc287fa38d3b7f95d82" -[[package]] -name = "bincode" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" -dependencies = [ - "serde", -] - [[package]] name = "bindgen" version = "0.72.1" @@ -1376,9 +1367,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.11.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" dependencies = [ "serde", ] @@ -1501,7 +1492,18 @@ checksum = "c3613f74bd2eac03dad61bd53dbe620703d4371614fe0bc3b9f04dd36fe4e818" dependencies = [ "cfg-if", "cipher", - "cpufeatures", + "cpufeatures 0.2.17", +] + +[[package]] +name = "chacha20" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" +dependencies = [ + "cfg-if", + "cpufeatures 0.3.0", + "rand_core 0.10.1", ] [[package]] @@ -1511,7 +1513,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "10cd79432192d1c0f4e1a0fef9527696cc039165d729fb41b3f4f4f354c2dc35" dependencies = [ "aead", - "chacha20", + "chacha20 0.9.1", "cipher", "poly1305", "zeroize", @@ -1529,16 +1531,16 @@ dependencies = [ [[package]] name = "chitchat" -version = "0.10.0" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "735f8a51f68b353b17e351b38317433d6afcaa9cc04f4d0f6c9e9125c49c1efe" +checksum = "ec9d05384b0f3f305c5a379b1ec8e716cae107a66eae27e46b120f0a91b7f5ef" dependencies = [ "anyhow", "async-trait", "bytes", "itertools 0.14.0", - "lru 0.13.0", - "rand 0.9.2", + "lru 0.17.0", + "rand 0.10.1", "serde", "tokio", "tokio-stream", @@ -1875,6 +1877,15 @@ dependencies = [ "libc", ] +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + [[package]] name = "crc" version = "3.4.0" @@ -2091,7 +2102,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be" dependencies = [ "cfg-if", - "cpufeatures", + "cpufeatures 0.2.17", 
"curve25519-dalek-derive", "digest", "fiat-crypto", @@ -2226,6 +2237,12 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be1e0bca6c3637f992fc1cc7cbc52a78c1ef6db076dbf1059c4323d6a2048376" +[[package]] +name = "datasketches" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c286de4e81ea2590afc24d754e0f83810c566f50a1388fa75ebd57928c0d9745" + [[package]] name = "dbl" version = "0.3.2" @@ -2312,8 +2329,6 @@ checksum = "25f104b501bf2364e78d0d3974cbc774f738f5865306ed128e1e0d7499c0ad96" dependencies = [ "console", "shell-words", - "tempfile", - "zeroize", ] [[package]] @@ -2598,70 +2613,6 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" -[[package]] -name = "encoding" -version = "0.2.33" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" -dependencies = [ - "encoding-index-japanese", - "encoding-index-korean", - "encoding-index-simpchinese", - "encoding-index-singlebyte", - "encoding-index-tradchinese", -] - -[[package]] -name = "encoding-index-japanese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-korean" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-simpchinese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" -dependencies = [ - 
"encoding_index_tests", -] - -[[package]] -name = "encoding-index-singlebyte" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-tradchinese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding_index_tests" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" - [[package]] name = "encoding_rs" version = "0.8.35" @@ -2671,15 +2622,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "encoding_rs_io" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83" -dependencies = [ - "encoding_rs", -] - [[package]] name = "enum-iterator" version = "2.3.0" @@ -2700,29 +2642,6 @@ dependencies = [ "syn 2.0.114", ] -[[package]] -name = "env_filter" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bf3c259d255ca70051b30e2e95b5446cdb8949ac4cd22c0d7fd634d89f568e2" -dependencies = [ - "log", - "regex", -] - -[[package]] -name = "env_logger" -version = "0.11.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f" -dependencies = [ - "anstream", - "anstyle", - "env_filter", - "jiff", - "log", -] - [[package]] name = "equator" version = "0.4.2" @@ -2892,18 +2811,6 @@ version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" 
-[[package]] -name = "filetime" -version = "0.2.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc0505cd1b6fa6580283f6bdf70a73fcf4aba1184038c90902b92b3dd0df63ed" -dependencies = [ - "cfg-if", - "libc", - "libredox", - "windows-sys 0.60.2", -] - [[package]] name = "find-msvc-tools" version = "0.1.6" @@ -3244,11 +3151,25 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "r-efi", + "r-efi 5.3.0", "wasip2", "wasm-bindgen", ] +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", + "rand_core 0.10.1", + "wasip2", + "wasip3", +] + [[package]] name = "gimli" version = "0.32.3" @@ -3479,6 +3400,17 @@ dependencies = [ "foldhash 0.2.0", ] +[[package]] +name = "hashbrown" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", +] + [[package]] name = "hashlink" version = "0.10.0" @@ -3824,21 +3756,9 @@ dependencies = [ "percent-encoding", "pin-project-lite", "socket2 0.6.1", - "system-configuration", "tokio", - "tower-layer", "tower-service", "tracing", - "windows-registry", -] - -[[package]] -name = "hyperloglogplus" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "621debdf94dcac33e50475fdd76d34d5ea9c0362a834b9db08c3024696c1fbe3" -dependencies = [ - "serde", ] [[package]] @@ -3946,6 +3866,12 @@ dependencies = [ "zerovec", ] +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + [[package]] name = "ident_case" version = "1.0.1" @@ -4292,11 +4218,11 @@ dependencies = [ 
[[package]] name = "keccak" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecc2af9a1119c51f12a14607e783cb977bde58bc069ff0c3da1095e635d70654" +checksum = "cb26cec98cce3a3d96cbb7bced3c4b16e3d13f27ec56dbd62cbc8f39cfb9d653" dependencies = [ - "cpufeatures", + "cpufeatures 0.2.17", ] [[package]] @@ -4348,6 +4274,12 @@ dependencies = [ "spin 0.9.8", ] +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + [[package]] name = "levenshtein_automata" version = "0.2.1" @@ -4418,219 +4350,6 @@ dependencies = [ "vcpkg", ] -[[package]] -name = "lindera-cc-cedict" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7595a377b9723e837711366721b02662dac64d734af3dac1c01941e779e95a6b" -dependencies = [ - "bincode", - "byteorder", - "encoding", - "flate2", - "lindera-cc-cedict-builder", - "lindera-core", - "once_cell", - "tar", - "ureq", -] - -[[package]] -name = "lindera-cc-cedict-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c6fbd76a65b5df73574898e871d7cff3e34bf89f544f6e1a1087cba82e25cce" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "yada", -] - -[[package]] -name = "lindera-core" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85af015d15c25cb3b7af82ba181908f4afbec6a2636f0fdfcca6d173c1b2c7fe" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "encoding_rs", - "log", - "once_cell", - "serde", - "thiserror 1.0.69", - "yada", -] - -[[package]] -name = "lindera-decompress" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b3dfc054b2f3f3eb21a24ce062a3d5f969339ddf50652038ea33993b1b97d4ba" -dependencies = [ - "anyhow", - "flate2", - "serde", -] - -[[package]] -name = "lindera-dictionary" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6b1a5d8f4cba37dcca18dc0e827233ff46695a6d878d716f16f755d264d588a" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "lindera-cc-cedict", - "lindera-cc-cedict-builder", - "lindera-core", - "lindera-ipadic", - "lindera-ipadic-builder", - "lindera-ipadic-neologd-builder", - "lindera-ko-dic", - "lindera-ko-dic-builder", - "lindera-unidic-builder", - "serde", -] - -[[package]] -name = "lindera-ipadic" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e5f1d26aba22d8a9193dcd2d087205d89e0ffb19490bc305b341e25c037f353" -dependencies = [ - "bincode", - "byteorder", - "encoding", - "flate2", - "lindera-core", - "lindera-ipadic-builder", - "once_cell", - "tar", - "ureq", -] - -[[package]] -name = "lindera-ipadic-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "184a9769b05ae857bd55f5e8a94b2ae2ba8816c5c6b78c73f161b4d7490c0461" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding_rs", - "encoding_rs_io", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "serde", - "yada", -] - -[[package]] -name = "lindera-ipadic-neologd-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b8cd28b5402425184d0f719d5bd81af87a7e36e2032b5bcceddf55011b1b22c" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding_rs", - "encoding_rs_io", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "serde", - "yada", -] - -[[package]] -name = "lindera-ko-dic" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7a6d718720a28ac5d93b449661d8844f7858b2b71595e3198bc90e437f01e5ce" -dependencies = [ - "bincode", - "byteorder", - "encoding", - "flate2", - "lindera-core", - "lindera-ko-dic-builder", - "once_cell", - "tar", - "ureq", -] - -[[package]] -name = "lindera-ko-dic-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f22de1fcdc33de258037145ae86686125214206b98d04c6dfe01f36c136c0022" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "yada", -] - -[[package]] -name = "lindera-tokenizer" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cca45cbc1af512ce2aa9dea9a1d694430480a53bb53e37165ba143e27e81f7dd" -dependencies = [ - "bincode", - "lindera-core", - "lindera-dictionary", - "once_cell", - "serde", - "serde_json", -] - -[[package]] -name = "lindera-unidic-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "359425c8dff54164ff1b068122d26df358ce18533e4771eb5c5ce68888d988f2" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "yada", -] - [[package]] name = "linked-hash-map" version = "0.5.6" @@ -4681,20 +4400,20 @@ dependencies = [ [[package]] name = "lru" -version = "0.13.0" +version = "0.16.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "227748d55f2f0ab4735d87fd623798cb6b664512fe979705f829c9f81c934465" +checksum = "a1dc47f592c06f33f8e3aea9591776ec7c9f9e4124778ff8a3c3b87159f7e593" dependencies = [ - "hashbrown 0.15.5", + "hashbrown 0.16.1", ] [[package]] name = "lru" -version = "0.16.3" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1dc47f592c06f33f8e3aea9591776ec7c9f9e4124778ff8a3c3b87159f7e593" +checksum = 
"0e0b564323a0fb6d54b864f625ae139de9612e27edb944dda37c109f05aac531" dependencies = [ - "hashbrown 0.16.1", + "hashbrown 0.17.1", ] [[package]] @@ -4731,6 +4450,12 @@ dependencies = [ "twox-hash", ] +[[package]] +name = "lz4_flex" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db9a0d582c2874f68138a16ce1867e0ffde6c0bb0a0df85e1f36d04146db488a" + [[package]] name = "matchers" version = "0.2.0" @@ -5090,9 +4815,9 @@ dependencies = [ [[package]] name = "num-conv" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050" [[package]] name = "num-format" @@ -5297,9 +5022,9 @@ checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] name = "oneshot" -version = "0.1.11" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4ce411919553d3f9fa53a0880544cda985a112117a0444d5ff1e870a893d6ea" +checksum = "269bca4c2591a28585d6bf10d9ed0332b7d76900a1b02bec41bdc3a2cdcda107" [[package]] name = "onig" @@ -5598,7 +5323,7 @@ checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" [[package]] name = "ownedbytes" version = "0.9.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +source = "git+https://github.com/SekoiaLab/tantivy.git?tag=v0.26.1_percentile_offset#32b361587a43bfb285cf3690fb81357d241964e1" dependencies = [ "stable_deref_trait", ] @@ -6109,7 +5834,7 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8159bd90725d2df49889a078b54f4f79e87f1f8a8444194cdca81d38f5393abf" dependencies = [ - "cpufeatures", + "cpufeatures 0.2.17", "opaque-debug", "universal-hash", ] @@ -6364,7 +6089,6 @@ dependencies = [ "memchr", "parking_lot 0.12.5", 
"procfs", - "protobuf", "thiserror 2.0.17", ] @@ -6504,26 +6228,6 @@ dependencies = [ "prost 0.14.1", ] -[[package]] -name = "protobuf" -version = "3.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d65a1d4ddae7d8b5de68153b48f6aa3bba8cb002b243dbdbc55a5afbc98f99f4" -dependencies = [ - "once_cell", - "protobuf-support", - "thiserror 1.0.69", -] - -[[package]] -name = "protobuf-support" -version = "3.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e36c2f31e0a47f9280fb347ef5e461ffcd2c52dd520d8e216b52f93b0b0d7d6" -dependencies = [ - "thiserror 1.0.69", -] - [[package]] name = "psl" version = "2.1.176" @@ -6719,7 +6423,7 @@ dependencies = [ "quickwit-cluster", "quickwit-common", "quickwit-config", - "quickwit-doc-mapper", + "quickwit-directories", "quickwit-index-management", "quickwit-indexing", "quickwit-ingest", @@ -6734,6 +6438,7 @@ dependencies = [ "rustls 0.23.36", "serde_json", "tabled", + "tantivy", "tempfile", "thiserror 2.0.17", "thousands", @@ -6825,7 +6530,6 @@ dependencies = [ "bytesize", "coarsetime", "dyn-clone", - "env_logger", "fnv", "futures", "home", @@ -6856,6 +6560,7 @@ dependencies = [ "tonic 0.14.2", "tower 0.5.2", "tracing", + "tracing-subscriber", ] [[package]] @@ -6897,6 +6602,7 @@ version = "0.8.0" dependencies = [ "anyhow", "async-trait", + "base64 0.22.1", "bytesize", "fnv", "futures", @@ -6905,6 +6611,7 @@ dependencies = [ "mockall", "once_cell", "proptest", + "prost 0.14.1", "quickwit-actors", "quickwit-cluster", "quickwit-common", @@ -6917,6 +6624,7 @@ dependencies = [ "serde", "serde_json", "smallvec", + "time", "tokio", "tracing", "ulid", @@ -6977,6 +6685,7 @@ dependencies = [ "serde_yaml", "siphasher", "tantivy", + "tantivy-fst", "thiserror 2.0.17", "time", "tracing", @@ -7328,9 +7037,6 @@ dependencies = [ "bitpacking", "criterion", "hex", - "lindera-core", - "lindera-dictionary", - "lindera-tokenizer", "once_cell", "proptest", "quickwit-common", @@ -7346,7 +7052,6 
@@ dependencies = [ "thiserror 2.0.17", "time", "tracing", - "whichlang", ] [[package]] @@ -7445,6 +7150,7 @@ dependencies = [ "mockall", "once_cell", "percent-encoding", + "pin-project", "pprof", "prost 0.14.1", "prost-types 0.14.1", @@ -7636,6 +7342,12 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + [[package]] name = "rand" version = "0.7.3" @@ -7670,6 +7382,17 @@ dependencies = [ "rand_core 0.9.3", ] +[[package]] +name = "rand" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" +dependencies = [ + "chacha20 0.10.0", + "getrandom 0.4.2", + "rand_core 0.10.1", +] + [[package]] name = "rand_chacha" version = "0.2.2" @@ -7727,6 +7450,12 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rand_core" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" + [[package]] name = "rand_distr" version = "0.5.1" @@ -8195,16 +7924,6 @@ dependencies = [ "walkdir", ] -[[package]] -name = "rust-stemmers" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54" -dependencies = [ - "serde", - "serde_derive", -] - [[package]] name = "rust_decimal" version = "1.39.0" @@ -8285,7 +8004,7 @@ dependencies = [ "once_cell", "ring", "rustls-pki-types", - "rustls-webpki 0.103.8", + "rustls-webpki 0.103.13", "subtle", "zeroize", ] @@ -8333,9 +8052,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.8" +version = "0.103.13" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ffdfa2f5286e2247234e03f680868ac2815974dc39e00ea15adc445d0aafe52" +checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" dependencies = [ "aws-lc-rs", "ring", @@ -8805,7 +8524,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f5058ada175748e33390e40e872bd0fe59a19f265d0158daa551c5a88a76009c" dependencies = [ "cfg-if", - "cpufeatures", + "cpufeatures 0.2.17", "digest", ] @@ -8816,7 +8535,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" dependencies = [ "cfg-if", - "cpufeatures", + "cpufeatures 0.2.17", "digest", ] @@ -8827,7 +8546,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ "cfg-if", - "cpufeatures", + "cpufeatures 0.2.17", "digest", ] @@ -8956,9 +8675,9 @@ checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" [[package]] name = "sketches-ddsketch" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1e9a774a6c28142ac54bb25d25562e6bcf957493a184f15ad4eebccb23e410a" +checksum = "05e40b6cf54d988dc1a2223531b969c9a9e30906ad90ef64890c27b4bfbb46ea" dependencies = [ "serde", ] @@ -9428,27 +9147,6 @@ dependencies = [ "nom 8.0.0", ] -[[package]] -name = "system-configuration" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" -dependencies = [ - "bitflags 2.10.0", - "core-foundation 0.9.4", - "system-configuration-sys", -] - -[[package]] -name = "system-configuration-sys" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" 
-dependencies = [ - "core-foundation-sys", - "libc", -] - [[package]] name = "tabled" version = "0.20.0" @@ -9483,8 +9181,8 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" [[package]] name = "tantivy" -version = "0.26.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +version = "0.26.1" +source = "git+https://github.com/SekoiaLab/tantivy.git?tag=v0.26.1_percentile_offset#32b361587a43bfb285cf3690fb81357d241964e1" dependencies = [ "aho-corasick", "arc-swap", @@ -9495,6 +9193,7 @@ dependencies = [ "census", "crc32fast", "crossbeam-channel", + "datasketches", "downcast-rs", "fastdivide", "fnv", @@ -9502,19 +9201,17 @@ dependencies = [ "futures-channel", "futures-util", "htmlescape", - "hyperloglogplus", "itertools 0.14.0", "levenshtein_automata", "log", - "lru 0.12.5", - "lz4_flex", + "lru 0.16.3", + "lz4_flex 0.13.0", "measure_time", "memmap2", "once_cell", "oneshot", "rayon", "regex", - "rust-stemmers", "rustc-hash", "serde", "serde_json", @@ -9539,16 +9236,16 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" -version = "0.9.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +version = "0.10.0" +source = "git+https://github.com/SekoiaLab/tantivy.git?tag=v0.26.1_percentile_offset#32b361587a43bfb285cf3690fb81357d241964e1" dependencies = [ "bitpacking", ] [[package]] name = "tantivy-columnar" -version = "0.6.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +version = "0.7.0" +source = "git+https://github.com/SekoiaLab/tantivy.git?tag=v0.26.1_percentile_offset#32b361587a43bfb285cf3690fb81357d241964e1" dependencies = [ "downcast-rs", "fastdivide", @@ -9562,8 +9259,8 @@ dependencies = [ [[package]] name = "tantivy-common" -version = "0.10.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +version 
= "0.11.0" +source = "git+https://github.com/SekoiaLab/tantivy.git?tag=v0.26.1_percentile_offset#32b361587a43bfb285cf3690fb81357d241964e1" dependencies = [ "async-trait", "byteorder", @@ -9584,8 +9281,8 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" -version = "0.25.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +version = "0.26.0" +source = "git+https://github.com/SekoiaLab/tantivy.git?tag=v0.26.1_percentile_offset#32b361587a43bfb285cf3690fb81357d241964e1" dependencies = [ "fnv", "nom 7.1.3", @@ -9596,8 +9293,8 @@ dependencies = [ [[package]] name = "tantivy-sstable" -version = "0.6.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +version = "0.7.0" +source = "git+https://github.com/SekoiaLab/tantivy.git?tag=v0.26.1_percentile_offset#32b361587a43bfb285cf3690fb81357d241964e1" dependencies = [ "futures-util", "itertools 0.14.0", @@ -9609,8 +9306,8 @@ dependencies = [ [[package]] name = "tantivy-stacker" -version = "0.6.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +version = "0.7.0" +source = "git+https://github.com/SekoiaLab/tantivy.git?tag=v0.26.1_percentile_offset#32b361587a43bfb285cf3690fb81357d241964e1" dependencies = [ "murmurhash32", "tantivy-common", @@ -9618,23 +9315,12 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" -version = "0.6.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +version = "0.7.0" +source = "git+https://github.com/SekoiaLab/tantivy.git?tag=v0.26.1_percentile_offset#32b361587a43bfb285cf3690fb81357d241964e1" dependencies = [ "serde", ] -[[package]] -name = "tar" -version = "0.4.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d863878d212c87a19c1a610eb53bb01fe12951c0501cf5a0d65f724914a667a" -dependencies = [ - "filetime", - "libc", 
- "xattr", -] - [[package]] name = "tempfile" version = "3.24.0" @@ -9770,9 +9456,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.44" +version = "0.3.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e7d9e3bb61134e77bde20dd4825b97c010155709965fedf0f49bb138e52a9d" +checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" dependencies = [ "deranged", "itoa", @@ -9781,16 +9467,16 @@ dependencies = [ "num-conv", "num_threads", "powerfmt", - "serde", + "serde_core", "time-core", "time-macros", ] [[package]] name = "time-core" -version = "0.1.6" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40868e7c1d2f0b8d73e4a8c7f0ff63af4f6d19be117e90bd73eb1d62cf831c6b" +checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" [[package]] name = "time-fmt" @@ -9804,9 +9490,9 @@ dependencies = [ [[package]] name = "time-macros" -version = "0.2.24" +version = "0.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30cfb0125f12d9c277f35663a0a33f8c30190f4e4574868a330595412d34ebf3" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" dependencies = [ "num-conv", "time-core", @@ -9950,10 +9636,7 @@ dependencies = [ "futures-core", "futures-io", "futures-sink", - "futures-util", - "hashbrown 0.15.5", "pin-project-lite", - "slab", "tokio", ] @@ -10496,21 +10179,6 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" -[[package]] -name = "ureq" -version = "2.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d" -dependencies = [ - "base64 0.22.1", - "log", - "once_cell", - "rustls 0.23.36", - "rustls-pki-types", - "url", - "webpki-roots 0.26.11", -] - [[package]] name = "url" version = "2.5.8" 
@@ -10679,7 +10347,7 @@ dependencies = [ "jsonschema", "lalrpop", "lalrpop-util", - "lz4_flex", + "lz4_flex 0.11.5", "md-5", "nom 8.0.0", "nom-language", @@ -10828,7 +10496,16 @@ version = "1.0.1+wasi-0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" dependencies = [ - "wit-bindgen", + "wit-bindgen 0.46.0", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen 0.51.0", ] [[package]] @@ -10904,6 +10581,28 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap 2.13.0", + "wasm-encoder", + "wasmparser", +] + [[package]] name = "wasm-streams" version = "0.4.2" @@ -10932,6 +10631,18 @@ dependencies = [ "web-sys", ] +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags 2.10.0", + "hashbrown 0.15.5", + "indexmap 2.13.0", + "semver", +] + [[package]] name = "wasmtimer" version = "0.4.3" @@ -10984,12 +10695,6 @@ dependencies = [ "rustls-pki-types", ] -[[package]] -name = "whichlang" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0b9aa3ad29c3d08283ac6b769e3ec15ad1ddb88af7d2e9bc402c574973b937e7" - [[package]] name = "whoami" version = "1.6.1" @@ -11146,17 +10851,6 @@ dependencies = [ "windows-link 0.1.3", ] -[[package]] -name = "windows-registry" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" -dependencies = [ - "windows-link 0.2.1", - "windows-result 0.4.1", - "windows-strings 0.5.1", -] - [[package]] name = "windows-result" version = "0.3.4" @@ -11480,6 +11174,94 @@ version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck 0.5.0", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck 0.5.0", + "indexmap 2.13.0", + "prettyplease", + "syn 2.0.114", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn 2.0.114", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = 
"wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags 2.10.0", + "indexmap 2.13.0", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap 2.13.0", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + [[package]] name = "woothee" version = "0.13.0" @@ -11496,16 +11278,6 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" -[[package]] -name = "xattr" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" -dependencies = [ - "libc", - "rustix 1.1.3", -] - [[package]] name = "xmlparser" version = "0.13.6" @@ -11518,12 +11290,6 @@ version = "0.8.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" -[[package]] -name = "yada" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd" - [[package]] name = "yansi" version = "1.0.1" diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index 453b5850761..02a23e4f164 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -91,7 +91,7 @@ bitpacking = "0.9.3" bytes = { version = "1", features = ["serde"] } bytesize = { version = "1.3", features = ["serde"] } 
bytestring = "1.5" -chitchat = "0.10.0" +chitchat = "0.11.0" chrono = { version = "0.4", default-features = false, features = [ "clock", "std", @@ -102,11 +102,11 @@ colored = "3.0" console-subscriber = "0.5" criterion = { version = "0.8", features = ["async_tokio"] } cron = "0.15" -dialoguer = "0.12" +dialoguer = { version = "0.12", default-features = false } dotenvy = "0.15" dyn-clone = "1.0" enum-iterator = "2.3" -env_logger = "0.11" +env_logger = { version = "0.11", default-features = false, features = ["auto-color"] } fail = "0.5" flate2 = "1.1" flume = "0.12" @@ -131,23 +131,18 @@ http-serde = "2.1" humantime = "2.3" hyper = { version = "1.8", features = ["client", "http1", "http2", "server"] } hyper-rustls = "0.27" -hyper-util = { version = "0.1", features = ["full"] } +hyper-util = { version = "0.1", default-features = false, features = [ + "client-legacy", + "server-auto", + "server-graceful", + "service", + "tokio", +] } indexmap = { version = "2.12", features = ["serde"] } indicatif = "0.18" itertools = "0.14" json_comments = "0.2" libz-sys = "1.1" -# Lindera tokenizer 0.30+ versions (tested up to 0.32.3) are currently broken due to upstream build failures. -# The dictionary crates attempt to download artifacts from S3 URLs that return 404 Not Found. -# Version 0.29.0 is the latest version that builds correctly. It also explicitly depends on lindera-core 0.29 -# and lindera-dictionary 0.29. 
-lindera-core = "0.29" -lindera-dictionary = "0.29" -lindera-tokenizer = { version = "0.29", features = [ - "cc-cedict", - "ipadic", - "ko-dic", -] } lru = "0.16" matches = "0.1" md5 = "0.8" @@ -175,7 +170,7 @@ pprof = { version = "0.15", features = ["flamegraph"] } predicates = "3" prettyplease = "0.2" proc-macro2 = "1.0" -prometheus = { version = "0.14", features = ["process"] } +prometheus = { version = "0.14", default-features = false, features = ["process"] } proptest = "1" prost = { version = "0.14", default-features = false, features = [ "derive", @@ -245,7 +240,10 @@ tokio = { version = "1.48", features = ["full"] } tokio-metrics = { version = "0.4", features = ["rt"] } tokio-rustls = { version = "0.26", default-features = false } tokio-stream = { version = "0.1", features = ["sync"] } -tokio-util = { version = "0.7", features = ["full"] } +tokio-util = { version = "0.7", default-features = false, features = [ + "compat", + "io-util", +] } toml = "0.9" tonic = { version = "0.14", features = [ "_tls-any", @@ -295,9 +293,8 @@ vrl = { version = "0.29", default-features = false, features = [ "value", ] } warp = { version = "0.4", features = ["server", "test"] } -whichlang = "0.1" wiremock = "0.6" -zstd = "0.13" +zstd = { version = "0.13", default-features = false } aws-config = "1.8" aws-credential-types = { version = "1.2", features = ["hardcoded-credentials"] } @@ -356,7 +353,7 @@ quickwit-storage = { path = "quickwit-storage" } quickwit-telemetry = { path = "quickwit-telemetry" } -tantivy = { git = "https://github.com/SekoiaLab/tantivy/", rev = "e9aede4", default-features = false, features = [ +tantivy = { git = "https://github.com/SekoiaLab/tantivy.git", tag = "v0.26.1_percentile_offset", default-features = false, features = [ "lz4-compression", "mmap", "quickwit", diff --git a/quickwit/quickwit-actors/src/actor.rs b/quickwit/quickwit-actors/src/actor.rs index 2fa32d7f2a5..c24d784017c 100644 --- a/quickwit/quickwit-actors/src/actor.rs +++ 
b/quickwit/quickwit-actors/src/actor.rs @@ -18,7 +18,6 @@ use std::sync::Arc; use async_trait::async_trait; use thiserror::Error; -use tracing::error; use crate::{ActorContext, QueueCapacity, SendError}; @@ -66,7 +65,7 @@ pub enum ActorExitStatus { Killed, /// An unexpected error happened while processing a message. - #[error("failure(cause={0:?})")] + #[error("failure(cause={0:#})")] Failure(Arc), /// The thread or the task executing the actor loop panicked. diff --git a/quickwit/quickwit-actors/src/spawn_builder.rs b/quickwit/quickwit-actors/src/spawn_builder.rs index 6dfc1aa9155..47f8403f25f 100644 --- a/quickwit/quickwit-actors/src/spawn_builder.rs +++ b/quickwit/quickwit-actors/src/spawn_builder.rs @@ -409,10 +409,10 @@ async fn actor_loop( | ActorExitStatus::Quit | ActorExitStatus::DownstreamClosed | ActorExitStatus::Killed => { - info!(actor_id, phase = ?exit_phase, exit_status = ?after_process_exit_status, "actor-exit"); + info!(actor_id, phase = ?exit_phase, exit_status = %after_process_exit_status, "actor-exit"); } ActorExitStatus::Failure(_) | ActorExitStatus::Panicked => { - error!(actor_id, phase = ?exit_phase, exit_status = ?after_process_exit_status, "actor-exit"); + error!(actor_id, phase = ?exit_phase, exit_status = %after_process_exit_status, "actor-exit"); } }; diff --git a/quickwit/quickwit-aws/src/error.rs b/quickwit/quickwit-aws/src/error.rs index 53983e5fcc6..97e44f55f1d 100644 --- a/quickwit/quickwit-aws/src/error.rs +++ b/quickwit/quickwit-aws/src/error.rs @@ -35,7 +35,7 @@ where E: AwsRetryable match self { SdkError::ConstructionFailure(_) => false, SdkError::TimeoutError(_) => true, - SdkError::DispatchFailure(_) => false, + SdkError::DispatchFailure(error) => error.is_io() || error.is_timeout(), SdkError::ResponseError(_) => true, SdkError::ServiceError(error) => error.err().is_retryable(), _ => false, diff --git a/quickwit/quickwit-cli/Cargo.toml b/quickwit/quickwit-cli/Cargo.toml index c595cb7e90a..ce598a417ee 100644 --- 
a/quickwit/quickwit-cli/Cargo.toml +++ b/quickwit/quickwit-cli/Cargo.toml @@ -58,8 +58,8 @@ tracing-subscriber = { workspace = true } quickwit-actors = { workspace = true } quickwit-cluster = { workspace = true } quickwit-common = { workspace = true } +quickwit-directories = { workspace = true } quickwit-config = { workspace = true } -quickwit-doc-mapper = { workspace = true } quickwit-index-management = { workspace = true } quickwit-indexing = { workspace = true } quickwit-ingest = { workspace = true } @@ -70,6 +70,7 @@ quickwit-search = { workspace = true } quickwit-serve = { workspace = true } quickwit-storage = { workspace = true } quickwit-telemetry = { workspace = true } +tantivy = { workspace = true } [dev-dependencies] predicates = { workspace = true } @@ -105,7 +106,6 @@ release-feature-set = [ "quickwit-storage/azure", "quickwit-storage/gcs", "quickwit-metastore/postgres", - "quickwit-doc-mapper/multilang", ] release-feature-vendored-set = [ "jemalloc", @@ -119,7 +119,6 @@ release-feature-vendored-set = [ "quickwit-storage/azure", "quickwit-storage/gcs", "quickwit-metastore/postgres", - "quickwit-doc-mapper/multilang", ] release-macos-feature-vendored-set = [ "jemalloc", @@ -132,13 +131,8 @@ release-macos-feature-vendored-set = [ "quickwit-storage/azure", "quickwit-storage/gcs", "quickwit-metastore/postgres", - "quickwit-doc-mapper/multilang", ] release-jemalloc-profiled = [ "release-feature-set", "jemalloc-profiled", ] - -[package.metadata.cargo-machete] -# used to enable the `multilang` feature -ignored = ["quickwit-doc-mapper"] diff --git a/quickwit/quickwit-cli/src/cli.rs b/quickwit/quickwit-cli/src/cli.rs index 91bb338ae89..9532ad230bc 100644 --- a/quickwit/quickwit-cli/src/cli.rs +++ b/quickwit/quickwit-cli/src/cli.rs @@ -18,6 +18,7 @@ use quickwit_serve::EnvFilterReloadFn; use tracing::Level; use crate::index::{IndexCliCommand, build_index_command}; +use crate::maintenance::{MaintenanceCliCommand, build_maintenance_command}; use 
crate::service::{RunCliCommand, build_run_command}; use crate::source::{SourceCliCommand, build_source_command}; use crate::split::{SplitCliCommand, build_split_command}; @@ -47,6 +48,7 @@ pub fn build_cli() -> Command { .subcommand(build_source_command().display_order(3)) .subcommand(build_split_command().display_order(4)) .subcommand(build_tool_command().display_order(5)) + .subcommand(build_maintenance_command().display_order(6)) .arg_required_else_help(true) .disable_help_subcommand(true) .subcommand_required(true) @@ -59,6 +61,7 @@ pub enum CliCommand { Split(SplitCliCommand), Source(SourceCliCommand), Tool(ToolCliCommand), + Maintenance(MaintenanceCliCommand), } impl CliCommand { @@ -69,6 +72,7 @@ impl CliCommand { CliCommand::Source(_) => Level::ERROR, CliCommand::Split(_) => Level::ERROR, CliCommand::Tool(_) => Level::ERROR, + CliCommand::Maintenance(_) => Level::ERROR, } } @@ -82,6 +86,9 @@ impl CliCommand { "source" => SourceCliCommand::parse_cli_args(submatches).map(CliCommand::Source), "split" => SplitCliCommand::parse_cli_args(submatches).map(CliCommand::Split), "tool" => ToolCliCommand::parse_cli_args(submatches).map(CliCommand::Tool), + "maintenance" => { + MaintenanceCliCommand::parse_cli_args(submatches).map(CliCommand::Maintenance) + } _ => bail!("unknown command `{subcommand}`"), } } @@ -93,6 +100,7 @@ impl CliCommand { CliCommand::Source(subcommand) => subcommand.execute().await, CliCommand::Split(subcommand) => subcommand.execute().await, CliCommand::Tool(subcommand) => subcommand.execute().await, + CliCommand::Maintenance(subcommand) => subcommand.execute().await, } } } diff --git a/quickwit/quickwit-cli/src/lib.rs b/quickwit/quickwit-cli/src/lib.rs index aaeb4da7e9d..e05d5dc25aa 100644 --- a/quickwit/quickwit-cli/src/lib.rs +++ b/quickwit/quickwit-cli/src/lib.rs @@ -50,6 +50,7 @@ pub mod index; #[cfg(feature = "jemalloc")] pub mod jemalloc; pub mod logger; +pub mod maintenance; pub mod metrics; pub mod service; pub mod source; diff --git 
a/quickwit/quickwit-cli/src/maintenance.rs b/quickwit/quickwit-cli/src/maintenance.rs new file mode 100644 index 00000000000..d639d19b4d0 --- /dev/null +++ b/quickwit/quickwit-cli/src/maintenance.rs @@ -0,0 +1,149 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::{Context, bail}; +use clap::{ArgMatches, Command}; +use colored::Colorize; +use tracing::debug; + +use crate::checklist::{GREEN_COLOR, RED_COLOR}; +use crate::{ClientArgs, client_args}; + +pub fn build_maintenance_command() -> Command { + Command::new("maintenance") + .about("Manages cluster maintenance mode for safe rolling upgrades.") + .args(client_args()) + .subcommand(Command::new("enable").about( + "Enables maintenance mode. 
Freezes the indexing plan; metadata mutations are accepted \ + but the plan is not rebuilt.", + )) + .subcommand( + Command::new("disable") + .about("Disables maintenance mode and triggers a full indexing plan rebuild."), + ) + .subcommand(Command::new("status").about("Shows the current maintenance mode status.")) + .subcommand_required(true) + .arg_required_else_help(true) +} + +#[derive(Debug, PartialEq)] +pub struct EnableMaintenanceArgs { + pub client_args: ClientArgs, +} + +#[derive(Debug, PartialEq)] +pub struct DisableMaintenanceArgs { + pub client_args: ClientArgs, +} + +#[derive(Debug, PartialEq)] +pub struct MaintenanceStatusArgs { + pub client_args: ClientArgs, +} + +#[derive(Debug, PartialEq)] +pub enum MaintenanceCliCommand { + Enable(EnableMaintenanceArgs), + Disable(DisableMaintenanceArgs), + Status(MaintenanceStatusArgs), +} + +impl MaintenanceCliCommand { + pub fn parse_cli_args(mut matches: ArgMatches) -> anyhow::Result { + let (subcommand, submatches) = matches + .remove_subcommand() + .context("failed to parse maintenance subcommand")?; + match subcommand.as_str() { + "enable" => Self::parse_enable_args(submatches), + "disable" => Self::parse_disable_args(submatches), + "status" => Self::parse_status_args(submatches), + _ => bail!("unknown maintenance subcommand `{subcommand}`"), + } + } + + fn parse_enable_args(mut matches: ArgMatches) -> anyhow::Result { + let client_args = ClientArgs::parse(&mut matches)?; + Ok(Self::Enable(EnableMaintenanceArgs { client_args })) + } + + fn parse_disable_args(mut matches: ArgMatches) -> anyhow::Result { + let client_args = ClientArgs::parse(&mut matches)?; + Ok(Self::Disable(DisableMaintenanceArgs { client_args })) + } + + fn parse_status_args(mut matches: ArgMatches) -> anyhow::Result { + let client_args = ClientArgs::parse(&mut matches)?; + Ok(Self::Status(MaintenanceStatusArgs { client_args })) + } + + pub fn default_log_level(&self) -> tracing::Level { + tracing::Level::ERROR + } + + pub async fn 
execute(self) -> anyhow::Result<()> { + match self { + Self::Enable(args) => enable_maintenance_cli(args).await, + Self::Disable(args) => disable_maintenance_cli(args).await, + Self::Status(args) => maintenance_status_cli(args).await, + } + } +} + +async fn enable_maintenance_cli(args: EnableMaintenanceArgs) -> anyhow::Result<()> { + debug!(args=?args, "enable-maintenance"); + println!("❯ Enabling maintenance mode..."); + let qw_client = args.client_args.client(); + let response = qw_client.maintenance().enable().await?; + println!( + "{} Maintenance mode enabled. Indexing plan frozen.", + "✔".color(GREEN_COLOR) + ); + debug!(frozen_plan_json_len = response.frozen_plan_json.len()); + Ok(()) +} + +async fn disable_maintenance_cli(args: DisableMaintenanceArgs) -> anyhow::Result<()> { + debug!(args=?args, "disable-maintenance"); + println!("❯ Disabling maintenance mode..."); + let qw_client = args.client_args.client(); + qw_client.maintenance().disable().await?; + println!( + "{} Maintenance mode disabled. 
Indexing plan rebuild triggered.", + "✔".color(GREEN_COLOR) + ); + Ok(()) +} + +async fn maintenance_status_cli(args: MaintenanceStatusArgs) -> anyhow::Result<()> { + debug!(args=?args, "maintenance-status"); + let qw_client = args.client_args.client(); + let status = qw_client.maintenance().status().await?; + if status.is_maintenance_mode { + println!( + "{} Maintenance mode is {}", + "●".color(RED_COLOR), + "ENABLED".color(RED_COLOR).bold() + ); + if let Some(enabled_at) = status.enabled_at { + println!(" Enabled at: {enabled_at}"); + } + } else { + println!( + "{} Maintenance mode is {}", + "●".color(GREEN_COLOR), + "DISABLED".color(GREEN_COLOR).bold() + ); + } + Ok(()) +} diff --git a/quickwit/quickwit-cli/src/tool.rs b/quickwit/quickwit-cli/src/tool.rs index d32db8a9e45..b9670c0b867 100644 --- a/quickwit/quickwit-cli/src/tool.rs +++ b/quickwit/quickwit-cli/src/tool.rs @@ -21,6 +21,7 @@ use std::time::{Duration, Instant}; use std::{env, fmt, io}; use anyhow::{Context, bail}; +use bytesize::ByteSize; use clap::{ArgMatches, Command, arg}; use colored::{ColoredString, Colorize}; use humantime::format_duration; @@ -34,11 +35,13 @@ use quickwit_common::uri::Uri; use quickwit_config::service::QuickwitService; use quickwit_config::{ CLI_SOURCE_ID, IndexerConfig, NodeConfig, SourceConfig, SourceInputFormat, SourceParams, - TransformConfig, VecSourceParams, + TransformConfig, VecSourceParams, is_delete_task_service_disabled, }; +use quickwit_directories::BundleDirectory; use quickwit_index_management::{IndexService, clear_cache_directory}; use quickwit_indexing::IndexingPipeline; use quickwit_indexing::actors::{IndexingService, MergePipeline, MergeSchedulerService}; +use quickwit_indexing::mature_merge::{MatureMergeConfig, merge_mature_all_indexes}; use quickwit_indexing::models::{ DetachIndexingPipeline, DetachMergePipeline, IndexingStatistics, SpawnPipeline, }; @@ -53,6 +56,9 @@ use quickwit_serve::{ BodyFormat, SearchRequestQueryString, SortBy, 
search_request_from_api_request, }; use quickwit_storage::{BundleStorage, Storage}; +use tantivy::Index; +use tantivy::directory::FileSlice; +use tantivy::schema::FieldType; use thousands::Separable; use tracing::{debug, info}; @@ -135,6 +141,16 @@ pub fn build_tool_command() -> Command { arg!(--"target-dir" "Directory to extract the split to."), ]) ) + .subcommand( + Command::new("analyze-split-file") + .about("Analyze a local split file.") + .long_about("Analyzes the contents of a local .split file. Does not require a node config or metastore access.") + .args(&[ + arg!(--"split-file" "Path to the local .split file to analyze.") + .display_order(1) + .required(true), + ]) + ) .subcommand( Command::new("gc") .display_order(10) @@ -163,6 +179,60 @@ pub fn build_tool_command() -> Command { .required(true), ]) ) + .subcommand( + Command::new("merge-mature") + .display_order(10) + .about("Merges mature splits across all indexes and nodes.") + .long_about( + "Scans indexes for merge opportunities in mature Published splits. Considers \ + opportunities across all origin nodes and sources. Runs once and exits." 
+ ) + .args(&[ + arg!(--"dry-run" + "Prints the planned merge operations without executing them.") + .required(false), + arg!(--"max-concurrent-merges" + "Maximum number of merges to run concurrently (default: 10).") + .display_order(1) + .required(false), + arg!(--"retention-safety-buffer-days" + "Splits within this many days of the retention cutoff are excluded (default: 5).") + .display_order(2) + .required(false), + arg!(--"min-merge-group-size" + "Minimum number of splits in a group to trigger a merge (default: 5).") + .display_order(3) + .required(false), + arg!(--"input-split-max-num-docs" + "Maximum number of docs in a split for it to be eligible (default: 10_000).") + .display_order(4) + .required(false), + arg!(--"max-merge-group-size" + "Maximum number of splits per merge operation (default: 100).") + .display_order(5) + .required(false), + arg!(--"split-target-num-docs" + "Maximum total docs per merge operation (default: 5_000_000).") + .display_order(6) + .required(false), + arg!(--"split-timestamp-days-range" + "Group splits that span this many days together (0 = single-day, default: 0).") + .display_order(7) + .required(false), + arg!(--"index-parallelism" + "Number of indexes processed concurrently (default: 50).") + .display_order(8) + .required(false), + arg!(--"index-id-patterns" + "Comma-separated list of index ID patterns to include (default: '*').") + .display_order(9) + .required(false), + arg!(--"metrics" + "Expose Prometheus metrics on the REST listen address during the run.") + .display_order(10) + .required(false), + ]) + ) .arg_required_else_help(true) } @@ -207,6 +277,13 @@ pub struct MergeArgs { pub source_id: SourceId, } +#[derive(Debug, Eq, PartialEq)] +pub struct MatureMergeArgs { + pub config_uri: Uri, + pub merge_config: MatureMergeConfig, + pub serve_metrics: bool, +} + #[derive(Debug, Eq, PartialEq)] pub struct ExtractSplitArgs { pub config_uri: Uri, @@ -215,13 +292,20 @@ pub struct ExtractSplitArgs { pub target_dir: PathBuf, } 
+#[derive(Debug, Eq, PartialEq)] +pub struct AnalyzeSplitFileArgs { + pub split_file: PathBuf, +} + #[derive(Debug, Eq, PartialEq)] pub enum ToolCliCommand { GarbageCollect(GarbageCollectIndexArgs), LocalIngest(LocalIngestDocsArgs), LocalSearch(LocalSearchArgs), Merge(MergeArgs), + MatureMerge(MatureMergeArgs), ExtractSplit(ExtractSplitArgs), + AnalyzeSplitFile(AnalyzeSplitFileArgs), } impl ToolCliCommand { @@ -234,7 +318,9 @@ impl ToolCliCommand { "local-ingest" => Self::parse_local_ingest_args(submatches), "local-search" => Self::parse_local_search_args(submatches), "merge" => Self::parse_merge_args(submatches), + "merge-mature" => Self::parse_mature_merge_args(submatches), "extract-split" => Self::parse_extract_split_args(submatches), + "analyze-split-file" => Self::analyze_split_file_args(submatches), _ => bail!("unknown tool subcommand `{subcommand}`"), } } @@ -385,13 +471,100 @@ impl ToolCliCommand { })) } + fn parse_mature_merge_args(mut matches: ArgMatches) -> anyhow::Result { + let config_uri = matches + .remove_one::("config") + .map(|uri_str| Uri::from_str(&uri_str)) + .expect("`config` should be a required arg.")?; + let defaults = MatureMergeConfig::default(); + let dry_run = matches.get_flag("dry-run"); + let max_concurrent_merges = matches + .remove_one::("max-concurrent-merges") + .map(|s| s.parse::()) + .transpose()? + .unwrap_or(defaults.max_concurrent_merges); + let retention_safety_buffer_days = matches + .remove_one::("retention-safety-buffer-days") + .map(|s| s.parse::()) + .transpose()? + .unwrap_or(defaults.retention_safety_buffer_days); + let min_merge_group_size = matches + .remove_one::("min-merge-group-size") + .map(|s| s.parse::()) + .transpose()? + .unwrap_or(defaults.min_merge_group_size); + let input_split_max_num_docs = matches + .remove_one::("input-split-max-num-docs") + .map(|s| s.parse::()) + .transpose()? 
+ .unwrap_or(defaults.input_split_max_num_docs); + let max_merge_group_size = matches + .remove_one::("max-merge-group-size") + .map(|s| s.parse::()) + .transpose()? + .unwrap_or(defaults.max_merge_group_size); + let split_target_num_docs = matches + .remove_one::("split-target-num-docs") + .map(|s| s.parse::()) + .transpose()? + .unwrap_or(defaults.split_target_num_docs); + let split_timestamp_days_range = matches + .remove_one::("split-timestamp-days-range") + .map(|s| s.parse::()) + .transpose()? + .unwrap_or(defaults.split_timestamp_days_range); + let index_parallelism = matches + .remove_one::("index-parallelism") + .map(|s| s.parse::()) + .transpose()? + .unwrap_or(defaults.index_parallelism); + let index_id_patterns = matches + .remove_one::("index-id-patterns") + .map(|s| s.split(',').map(|p| p.trim().to_string()).collect()) + .unwrap_or(defaults.index_id_patterns); + let serve_metrics = matches.get_flag("metrics"); + + if max_concurrent_merges == 0 { + bail!("`max-concurrent-merges` must be greater than or equal to 1."); + } + if index_parallelism == 0 { + bail!("`index-parallelism` must be greater than or equal to 1."); + } + Ok(Self::MatureMerge(MatureMergeArgs { + config_uri, + serve_metrics, + merge_config: MatureMergeConfig { + dry_run, + max_concurrent_merges, + retention_safety_buffer_days, + min_merge_group_size, + input_split_max_num_docs, + max_merge_group_size, + split_target_num_docs, + split_timestamp_days_range, + index_parallelism, + index_id_patterns, + }, + })) + } + + fn analyze_split_file_args(mut matches: ArgMatches) -> anyhow::Result { + let split_file = matches + .remove_one::("split-file") + .map(PathBuf::from) + .expect("`split-file` should be a required arg."); + Ok(Self::AnalyzeSplitFile(AnalyzeSplitFileArgs { split_file })) + } + pub async fn execute(self) -> anyhow::Result<()> { match self { Self::GarbageCollect(args) => garbage_collect_index_cli(args).await, Self::LocalIngest(args) => local_ingest_docs_cli(args).await, 
Self::LocalSearch(args) => local_search_cli(args).await, Self::Merge(args) => merge_cli(args).await, + Self::MatureMerge(args) => merge_mature_cli(args).await, Self::ExtractSplit(args) => extract_split_cli(args).await, + Self::AnalyzeSplitFile(args) => analyze_split_file_cli(args).await, } } } @@ -459,6 +632,7 @@ pub async fn local_ingest_docs_cli(args: LocalIngestDocsArgs) -> anyhow::Result< IngesterPool::default(), storage_resolver, EventBroker::default(), + is_delete_task_service_disabled(), ) .await?; let (indexing_server_mailbox, indexing_server_handle) = @@ -555,7 +729,7 @@ pub async fn local_search_cli(args: LocalSearchArgs) -> anyhow::Result<()> { split_id: None, }; let search_request = - search_request_from_api_request(vec![args.index_id], search_request_query_string)?; + search_request_from_api_request(vec![args.index_id], search_request_query_string, None)?; debug!(search_request=?search_request, "search-request"); let search_response: SearchResponse = single_node_search(search_request, metastore, storage_resolver).await?; @@ -597,6 +771,7 @@ pub async fn merge_cli(args: MergeArgs) -> anyhow::Result<()> { IngesterPool::default(), storage_resolver, EventBroker::default(), + is_delete_task_service_disabled(), ) .await?; let (indexing_service_mailbox, indexing_service_handle) = @@ -651,6 +826,43 @@ pub async fn merge_cli(args: MergeArgs) -> anyhow::Result<()> { Ok(()) } +pub async fn merge_mature_cli(args: MatureMergeArgs) -> anyhow::Result<()> { + debug!(args=?args, "merge-mature"); + info!(merge_config=?args.merge_config, "merge-mature configuration"); + println!("❯ Scanning all indexes for mature merge opportunities..."); + let config = load_node_config(&args.config_uri).await?; + let (storage_resolver, metastore_resolver) = + get_resolvers(&config.storage_configs, &config.metastore_configs); + let metastore = metastore_resolver.resolve(&config.metastore_uri).await?; + + let runtimes_config = RuntimesConfig::default(); + start_actor_runtimes( + 
runtimes_config, + &HashSet::from_iter([QuickwitService::Indexer]), + )?; + + if args.serve_metrics { + let metrics_addr = config.rest_config.listen_addr; + tokio::spawn(serve_metrics(metrics_addr)); + } + + merge_mature_all_indexes( + metastore, + storage_resolver, + &config.data_dir_path, + args.merge_config.clone(), + config.node_id, + ) + .await?; + + if !args.merge_config.dry_run { + info!("mature splits successfully merged, waiting for explicit termination signal"); + tokio::time::sleep(Duration::MAX).await; + } + + Ok(()) +} + pub async fn garbage_collect_index_cli(args: GarbageCollectIndexArgs) -> anyhow::Result<()> { debug!(args=?args, "garbage-collect-index"); println!("❯ Garbage collecting index..."); @@ -748,6 +960,149 @@ async fn extract_split_cli(args: ExtractSplitArgs) -> anyhow::Result<()> { Ok(()) } +fn print_per_field( + label: &str, + usage: &tantivy::space_usage::PerFieldSpaceUsage, + // Per-JSON-field sub-key breakdown: field_name -> sorted (sub_key, bytes) + json_sub_keys: &std::collections::HashMap>, +) { + let total = usage.total().get_bytes(); + if total == 0 { + return; + } + let mut fields: Vec<_> = usage.fields().collect(); + fields.sort_by_key(|f| std::cmp::Reverse(f.total())); + println!(" {label:<14} {}", ByteSize(total)); + for field in &fields { + println!( + " {:<40} {}", + field.field_name(), + ByteSize(field.total().get_bytes()) + ); + if let Some(sub_keys) = json_sub_keys.get(field.field_name()) { + for (key, bytes) in sub_keys { + println!(" {:<38} {}", key, ByteSize(*bytes)); + } + } + } +} + +async fn analyze_split_file_cli(args: AnalyzeSplitFileArgs) -> anyhow::Result<()> { + debug!(args=?args, "analyze-split-file"); + println!("❯ Analyzing split file..."); + + let split_file_path = args.split_file.canonicalize().with_context(|| { + format!( + "failed to resolve split file path `{}`", + args.split_file.display() + ) + })?; + let split_data_vec = std::fs::read(&split_file_path) + .with_context(|| format!("failed to read 
split file `{}`", split_file_path.display()))?; + + // --- Tantivy space usage analysis --- + let file_slice = FileSlice::from(split_data_vec); + match BundleDirectory::open_split(file_slice) + .and_then(|dir| Index::open(dir).map_err(std::io::Error::other)) + { + Ok(index) => { + let reader = index.reader()?; + let searcher = reader.searcher(); + let schema = index.schema(); + let usage = searcher.space_usage()?; + if let Some(seg) = usage.segments().first() { + let seg_reader = searcher.segment_reader(0); + println!("\n{} docs:", seg.num_docs()); + + // Scan each JSON field's term dictionary and accumulate postings / positions + // bytes separately per full JSON path (segments joined with '.'). + // Result maps: field_name -> sorted Vec<(sub_key, bytes)> + let mut postings_sub_keys: std::collections::HashMap> = + std::collections::HashMap::new(); + let mut positions_sub_keys: std::collections::HashMap> = + std::collections::HashMap::new(); + for (field, field_entry) in schema.fields() { + if !matches!(field_entry.field_type(), FieldType::JsonObject(_)) { + continue; + } + let inv_index = seg_reader.inverted_index(field)?; + let mut stream = inv_index.terms().stream()?; + let mut postings_per_key: std::collections::BTreeMap = + std::collections::BTreeMap::new(); + let mut positions_per_key: std::collections::BTreeMap = + std::collections::BTreeMap::new(); + // Term key layout for JSON fields: [path bytes][0x00][value type][value bytes] + // Path segments are separated by 0x01. No type-code prefix in the SSTable key. + while let Some((key_bytes, term_info)) = stream.next() { + let path_end = key_bytes + .iter() + .position(|&b| b == 0x00) + .unwrap_or(key_bytes.len()); + let path_bytes = &key_bytes[..path_end]; + // Replace 0x01 segment separators with '.' to reconstruct the full path. 
+ let full_path: String = path_bytes + .split(|&b| b == 0x01) + .map(|seg| std::str::from_utf8(seg).unwrap_or("")) + .collect::>() + .join("."); + let top_key = full_path; + *postings_per_key.entry(top_key.clone()).or_default() += + term_info.postings_range.len() as u64; + *positions_per_key.entry(top_key).or_default() += + term_info.positions_range.len() as u64; + } + let field_name = field_entry.name().to_string(); + let mut postings_sorted: Vec<_> = postings_per_key.into_iter().collect(); + postings_sorted.sort_by(|a, b| b.1.cmp(&a.1)); + postings_sub_keys.insert(field_name.clone(), postings_sorted); + let mut positions_sorted: Vec<_> = positions_per_key.into_iter().collect(); + positions_sorted.retain(|(_, b)| *b > 0); + positions_sorted.sort_by(|a, b| b.1.cmp(&a.1)); + positions_sub_keys.insert(field_name, positions_sorted); + } + + print_per_field( + "term dict", + seg.termdict(), + &std::collections::HashMap::new(), + ); + print_per_field("postings", seg.postings(), &postings_sub_keys); + print_per_field("positions", seg.positions(), &positions_sub_keys); + print_per_field( + "fast fields", + seg.fast_fields(), + &std::collections::HashMap::new(), + ); + print_per_field( + "field norms", + seg.fieldnorms(), + &std::collections::HashMap::new(), + ); + let store = seg.store(); + let store_total = store.total().get_bytes(); + if store_total > 0 { + println!(" {:<14} {}", "store", ByteSize(store_total)); + println!( + " {:<40} {}", + "data", + ByteSize(store.data_usage().get_bytes()) + ); + println!( + " {:<40} {}", + "offsets", + ByteSize(store.offsets_usage().get_bytes()) + ); + } + println!(); + } + } + Err(err) => { + bail!("could not open split as tantivy index for space analysis: {err:#}"); + } + } + Ok(()) +} + /// Starts a tokio task that displays the indexing statistics /// every once in awhile. 
pub async fn start_statistics_reporting_loop( @@ -955,3 +1310,48 @@ async fn create_empty_cluster(config: &NodeConfig) -> anyhow::Result { Ok(cluster) } + +/// A shortcut to expose the metrics without loading the whole quickwit_serve +/// machinery. +async fn serve_metrics(addr: std::net::SocketAddr) { + use tokio::io::{AsyncReadExt, AsyncWriteExt}; + let listener = match tokio::net::TcpListener::bind(addr).await { + Ok(l) => l, + Err(err) => { + tracing::warn!("metrics server could not bind to {addr}: {err}"); + return; + } + }; + tracing::info!("metrics server listening on http://{addr}/metrics"); + loop { + let Ok((mut stream, _peer)) = listener.accept().await else { + continue; + }; + tokio::spawn(async move { + let mut buf = [0u8; 4096]; + let n = match stream.read(&mut buf).await { + Ok(n) => n, + Err(_) => return, + }; + let request = std::str::from_utf8(&buf[..n]).unwrap_or(""); + let is_metrics = request.starts_with("GET /metrics"); + let (status, body) = if is_metrics { + match quickwit_common::metrics::metrics_text_payload() { + Ok(payload) => ("200 OK", payload), + Err(e) => { + tracing::error!("failed to encode prometheus metrics: {e}"); + ("500 Internal Server Error", String::new()) + } + } + } else { + ("404 Not Found", String::new()) + }; + let response = format!( + "HTTP/1.1 {status}\r\nContent-Type: text/plain; version=0.0.4\r\nContent-Length: \ + {}\r\nConnection: close\r\n\r\n{body}", + body.len() + ); + let _ = stream.write_all(response.as_bytes()).await; + }); + } +} diff --git a/quickwit/quickwit-cluster/src/change.rs b/quickwit/quickwit-cluster/src/change.rs index 1491212281c..01b953bb6b0 100644 --- a/quickwit/quickwit-cluster/src/change.rs +++ b/quickwit/quickwit-cluster/src/change.rs @@ -139,7 +139,7 @@ async fn compute_cluster_change_events_on_added( client_grpc_config: ClientGrpcConfig, ) -> Vec { let is_self_node = self_chitchat_id == new_chitchat_id; - let new_node_id: NodeId = new_chitchat_id.node_id.clone().into(); + let new_node_id 
= NodeId::from_arc_str(new_chitchat_id.node_id.clone()); let maybe_previous_node_entry = previous_nodes.entry(new_node_id); let mut events = Vec::new(); @@ -160,7 +160,7 @@ async fn compute_cluster_change_events_on_added( let previous_node = previous_node_entry.remove(); verb = "rejoined"; - if previous_node.is_ready() { + if previous_node.is_ready { events.push(ClusterChange::Remove(previous_node)); } } @@ -181,10 +181,10 @@ async fn compute_cluster_change_events_on_added( "node `{}` has {verb} the cluster", new_chitchat_id.node_id, ); - let new_node_id: NodeId = new_node.node_id().into(); + let new_node_id: NodeId = new_node.node_id.clone(); previous_nodes.insert(new_node_id, new_node.clone()); - if new_node.is_ready() { + if new_node.is_ready { info!( node_id=%new_chitchat_id.node_id, generation_id=%new_chitchat_id.generation_id, @@ -204,7 +204,7 @@ async fn compute_cluster_change_events_on_updated( updated_node_state: &NodeState, previous_nodes: &mut BTreeMap, ) -> Option { - let previous_node = previous_nodes.get(&updated_chitchat_id.node_id)?.clone(); + let previous_node = previous_nodes.get(&*updated_chitchat_id.node_id)?.clone(); if previous_node.chitchat_id().generation_id > updated_chitchat_id.generation_id { warn!( @@ -224,10 +224,10 @@ async fn compute_cluster_change_events_on_updated( previous_channel, is_self_node, )?; - let updated_node_id: NodeId = updated_node.chitchat_id().node_id.clone().into(); + let updated_node_id = updated_node.node_id.clone(); previous_nodes.insert(updated_node_id, updated_node.clone()); - if !previous_node.is_ready() && updated_node.is_ready() { + if !previous_node.is_ready && updated_node.is_ready { warmup_channel(updated_node.channel()).await; info!( @@ -237,7 +237,7 @@ async fn compute_cluster_change_events_on_updated( updated_chitchat_id.node_id ); Some(ClusterChange::Add(updated_node)) - } else if previous_node.is_ready() && !updated_node.is_ready() { + } else if previous_node.is_ready && !updated_node.is_ready { 
info!( node_id=%updated_chitchat_id.node_id, generation_id=%updated_chitchat_id.generation_id, @@ -245,7 +245,7 @@ async fn compute_cluster_change_events_on_updated( updated_chitchat_id.node_id ); Some(ClusterChange::Remove(updated_node)) - } else if previous_node.is_ready() && updated_node.is_ready() { + } else if previous_node.is_ready && updated_node.is_ready { Some(ClusterChange::Update(updated_node)) } else { None @@ -256,7 +256,7 @@ fn compute_cluster_change_events_on_removed( removed_chitchat_id: &ChitchatId, previous_nodes: &mut BTreeMap, ) -> Option { - let removed_node_id: NodeId = removed_chitchat_id.node_id.clone().into(); + let removed_node_id = NodeId::from_arc_str(removed_chitchat_id.node_id.clone()); if let Entry::Occupied(previous_node_entry) = previous_nodes.entry(removed_node_id) { let previous_node_ref = previous_node_entry.get(); @@ -270,7 +270,7 @@ fn compute_cluster_change_events_on_removed( ); let previous_node = previous_node_entry.remove(); - if previous_node.is_ready() { + if previous_node.is_ready { return Some(ClusterChange::Remove(previous_node)); } } @@ -476,12 +476,14 @@ pub(crate) mod tests { .await; assert!(events.is_empty()); - let node = previous_nodes.get(&new_chitchat_id.node_id).unwrap(); + let node = previous_nodes + .get(new_chitchat_id.node_id.as_ref()) + .unwrap(); - assert_eq!(node.chitchat_id(), &new_chitchat_id); - assert_eq!(node.grpc_advertise_addr(), grpc_advertise_addr); + assert_eq!(node.chitchat_id(), new_chitchat_id); + assert_eq!(node.grpc_advertise_addr, grpc_advertise_addr); assert!(!node.is_self_node()); - assert!(!node.is_ready()); + assert!(!node.is_ready); } { // New node joins the cluster and is ready. 
@@ -507,11 +509,16 @@ pub(crate) mod tests { let ClusterChange::Add(node) = &events[0] else { panic!("expected `ClusterChange::Add` event, got `{:?}`", events[0]); }; - assert_eq!(node.chitchat_id(), &new_chitchat_id); - assert_eq!(node.grpc_advertise_addr(), grpc_advertise_addr); + assert_eq!(node.chitchat_id(), new_chitchat_id); + assert_eq!(node.grpc_advertise_addr, grpc_advertise_addr); assert!(!node.is_self_node()); - assert!(node.is_ready()); - assert_eq!(previous_nodes.get(&new_chitchat_id.node_id).unwrap(), node); + assert!(node.is_ready); + assert_eq!( + previous_nodes + .get(new_chitchat_id.node_id.as_ref()) + .unwrap(), + node + ); // Node rejoins with same node ID but newer generation ID. let mut rejoined_chitchat_id = ChitchatId::for_local_test(port); @@ -534,14 +541,16 @@ pub(crate) mod tests { events[0] ); }; - assert_eq!(removed_node.chitchat_id(), &new_chitchat_id); + assert_eq!(removed_node.chitchat_id(), new_chitchat_id); let ClusterChange::Add(rejoined_node) = &events[1] else { panic!("expected `ClusterChange::Add` event, got `{:?}`", events[1]); }; - assert_eq!(rejoined_node.chitchat_id(), &rejoined_chitchat_id); + assert_eq!(rejoined_node.chitchat_id(), rejoined_chitchat_id); assert_eq!( - previous_nodes.get(&rejoined_chitchat_id.node_id).unwrap(), + previous_nodes + .get(rejoined_chitchat_id.node_id.as_ref()) + .unwrap(), rejoined_node ); @@ -557,7 +566,9 @@ pub(crate) mod tests { .await; assert!(events.is_empty()); assert_eq!( - previous_nodes.get(&rejoined_chitchat_id.node_id).unwrap(), + previous_nodes + .get(rejoined_chitchat_id.node_id.as_ref()) + .unwrap(), rejoined_node ); } @@ -585,11 +596,16 @@ pub(crate) mod tests { let ClusterChange::Add(node) = &events[0] else { panic!("expected `ClusterChange::Add` event, got `{:?}`", events[0]); }; - assert_eq!(node.chitchat_id(), &new_chitchat_id); - assert_eq!(node.grpc_advertise_addr(), grpc_advertise_addr); + assert_eq!(node.chitchat_id(), new_chitchat_id); + 
assert_eq!(node.grpc_advertise_addr, grpc_advertise_addr); assert!(node.is_self_node()); - assert!(node.is_ready()); - assert_eq!(previous_nodes.get(&new_chitchat_id.node_id).unwrap(), node); + assert!(node.is_ready); + assert_eq!( + previous_nodes + .get(new_chitchat_id.node_id.as_ref()) + .unwrap(), + node + ); } } @@ -603,7 +619,7 @@ pub(crate) mod tests { let port = 1235; let grpc_advertise_addr: SocketAddr = ([127, 0, 0, 1], port + 1).into(); let updated_chitchat_id = ChitchatId::for_local_test(port); - let updated_node_id: NodeId = updated_chitchat_id.node_id.clone().into(); + let updated_node_id: NodeId = NodeId::from_arc_str(updated_chitchat_id.node_id.clone()); let previous_node_state = NodeStateBuilder::default() .with_grpc_advertise_addr(grpc_advertise_addr) .with_readiness(false) @@ -636,12 +652,14 @@ pub(crate) mod tests { let ClusterChange::Add(node) = event else { panic!("expected `ClusterChange::Add` event, got `{event:?}`"); }; - assert_eq!(node.chitchat_id(), &updated_chitchat_id); - assert_eq!(node.grpc_advertise_addr(), grpc_advertise_addr); - assert!(node.is_ready()); + assert_eq!(node.chitchat_id(), updated_chitchat_id); + assert_eq!(node.grpc_advertise_addr, grpc_advertise_addr); + assert!(node.is_ready); assert!(!node.is_self_node()); assert_eq!( - previous_nodes.get(&updated_chitchat_id.node_id).unwrap(), + previous_nodes + .get(updated_chitchat_id.node_id.as_ref()) + .unwrap(), &node ); } @@ -650,7 +668,7 @@ pub(crate) mod tests { let port = 1235; let grpc_advertise_addr: SocketAddr = ([127, 0, 0, 1], port + 1).into(); let updated_chitchat_id = ChitchatId::for_local_test(port); - let updated_node_id: NodeId = updated_chitchat_id.node_id.clone().into(); + let updated_node_id: NodeId = NodeId::from_arc_str(updated_chitchat_id.node_id.clone()); let previous_node_state = NodeStateBuilder::default() .with_grpc_advertise_addr(grpc_advertise_addr) .with_readiness(true) @@ -684,12 +702,14 @@ pub(crate) mod tests { let ClusterChange::Update(node) = 
event else { panic!("expected `ClusterChange::Remove` event, got `{event:?}`"); }; - assert_eq!(node.chitchat_id(), &updated_chitchat_id); - assert_eq!(node.grpc_advertise_addr(), grpc_advertise_addr); + assert_eq!(node.chitchat_id(), updated_chitchat_id); + assert_eq!(node.grpc_advertise_addr, grpc_advertise_addr); assert!(!node.is_self_node()); - assert!(node.is_ready()); + assert!(node.is_ready); assert_eq!( - previous_nodes.get(&updated_chitchat_id.node_id).unwrap(), + previous_nodes + .get(updated_chitchat_id.node_id.as_ref()) + .unwrap(), &node ); } @@ -698,7 +718,7 @@ pub(crate) mod tests { let port = 1235; let grpc_advertise_addr: SocketAddr = ([127, 0, 0, 1], port + 1).into(); let updated_chitchat_id = ChitchatId::for_local_test(port); - let updated_node_id: NodeId = updated_chitchat_id.node_id.clone().into(); + let updated_node_id: NodeId = NodeId::from_arc_str(updated_chitchat_id.node_id.clone()); let previous_node_state = NodeStateBuilder::default() .with_grpc_advertise_addr(grpc_advertise_addr) .with_readiness(true) @@ -731,12 +751,14 @@ pub(crate) mod tests { let ClusterChange::Remove(node) = event else { panic!("expected `ClusterChange::Remove` event, got `{event:?}`"); }; - assert_eq!(node.chitchat_id(), &updated_chitchat_id); - assert_eq!(node.grpc_advertise_addr(), grpc_advertise_addr); + assert_eq!(node.chitchat_id(), updated_chitchat_id); + assert_eq!(node.grpc_advertise_addr, grpc_advertise_addr); assert!(!node.is_self_node()); - assert!(!node.is_ready()); + assert!(!node.is_ready); assert_eq!( - previous_nodes.get(&updated_chitchat_id.node_id).unwrap(), + previous_nodes + .get(updated_chitchat_id.node_id.as_ref()) + .unwrap(), &node ); } @@ -745,7 +767,7 @@ pub(crate) mod tests { let port = 1235; let grpc_advertise_addr: SocketAddr = ([127, 0, 0, 1], port + 1).into(); let updated_chitchat_id = ChitchatId::for_local_test(port); - let updated_node_id: NodeId = updated_chitchat_id.node_id.clone().into(); + let updated_node_id: NodeId = 
NodeId::from_arc_str(updated_chitchat_id.node_id.clone()); let mut previous_chitchat_id = updated_chitchat_id.clone(); previous_chitchat_id.generation_id += 1; let previous_node_state = NodeStateBuilder::default() @@ -780,7 +802,9 @@ pub(crate) mod tests { assert!(event_opt.is_none()); assert_eq!( - previous_nodes.get(&updated_chitchat_id.node_id).unwrap(), + previous_nodes + .get(updated_chitchat_id.node_id.as_ref()) + .unwrap(), &previous_node ); } @@ -803,7 +827,7 @@ pub(crate) mod tests { let port = 1235; let grpc_advertise_addr: SocketAddr = ([127, 0, 0, 1], port + 1).into(); let removed_chitchat_id = ChitchatId::for_local_test(port); - let removed_node_id: NodeId = removed_chitchat_id.node_id.clone().into(); + let removed_node_id: NodeId = NodeId::from_arc_str(removed_chitchat_id.node_id.clone()); let previous_node_state = NodeStateBuilder::default() .with_grpc_advertise_addr(grpc_advertise_addr) .with_readiness(false) @@ -822,14 +846,14 @@ pub(crate) mod tests { let event_opt = compute_cluster_change_events_on_removed(&removed_chitchat_id, &mut previous_nodes); assert!(event_opt.is_none()); - assert!(!previous_nodes.contains_key(&removed_chitchat_id.node_id)); + assert!(!previous_nodes.contains_key(removed_chitchat_id.node_id.as_ref())); } { // Node leaves the cluster in ready state. 
let port = 1235; let grpc_advertise_addr: SocketAddr = ([127, 0, 0, 1], port + 1).into(); let removed_chitchat_id = ChitchatId::for_local_test(port); - let removed_node_id: NodeId = removed_chitchat_id.node_id.clone().into(); + let removed_node_id: NodeId = NodeId::from_arc_str(removed_chitchat_id.node_id.clone()); let removed_node_state = NodeStateBuilder::default() .with_grpc_advertise_addr(grpc_advertise_addr) .with_readiness(true) @@ -851,11 +875,11 @@ pub(crate) mod tests { let ClusterChange::Remove(node) = event else { panic!("expected `ClusterChange::Remove` event, got `{event:?}`"); }; - assert_eq!(node.chitchat_id(), &removed_chitchat_id); - assert_eq!(node.grpc_advertise_addr(), grpc_advertise_addr); + assert_eq!(node.chitchat_id(), removed_chitchat_id); + assert_eq!(node.grpc_advertise_addr, grpc_advertise_addr); assert!(!node.is_self_node()); - assert!(node.is_ready()); - assert!(!previous_nodes.contains_key(&removed_chitchat_id.node_id)); + assert!(node.is_ready); + assert!(!previous_nodes.contains_key(removed_chitchat_id.node_id.as_ref())); } { // Node leaves the cluster in ready state but in the meantime it has rejoined the @@ -866,7 +890,8 @@ pub(crate) mod tests { let mut rejoined_chitchat_id = removed_chitchat_id.clone(); rejoined_chitchat_id.generation_id += 1; - let rejoined_node_id: NodeId = rejoined_chitchat_id.node_id.clone().into(); + let rejoined_node_id: NodeId = + NodeId::from_arc_str(rejoined_chitchat_id.node_id.clone()); let rejoined_node_state = NodeStateBuilder::default() .with_grpc_advertise_addr(grpc_advertise_addr) .with_readiness(true) @@ -897,7 +922,7 @@ pub(crate) mod tests { let cluster_id = "test-cluster".to_string(); let self_port = 1234; let self_chitchat_id = ChitchatId::for_local_test(self_port); - let self_node_id: NodeId = self_chitchat_id.node_id.clone().into(); + let self_node_id: NodeId = NodeId::from_arc_str(self_chitchat_id.node_id.clone()); { let mut previous_nodes = BTreeMap::default(); let previous_node_states = 
BTreeMap::default(); diff --git a/quickwit/quickwit-cluster/src/cluster.rs b/quickwit/quickwit-cluster/src/cluster.rs index 7acb6692717..2ab050a779e 100644 --- a/quickwit/quickwit-cluster/src/cluster.rs +++ b/quickwit/quickwit-cluster/src/cluster.rs @@ -19,6 +19,7 @@ use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +#[cfg(any(test, feature = "testsuite"))] use anyhow::Context; use chitchat::transport::Transport; use chitchat::{ @@ -28,12 +29,12 @@ use chitchat::{ use itertools::Itertools; use quickwit_common::tower::ClientGrpcConfig; use quickwit_proto::indexing::{IndexingPipelineId, IndexingTask, PipelineMetrics}; -use quickwit_proto::types::{NodeId, NodeIdRef, PipelineUid, ShardId}; +use quickwit_proto::types::{NodeId, PipelineUid, ShardId}; use serde::{Deserialize, Serialize}; use tokio::sync::{Mutex, RwLock, mpsc, watch}; +#[cfg(any(test, feature = "testsuite"))] use tokio::time::timeout; use tokio_stream::StreamExt; -use tokio_stream::wrappers::WatchStream; use tracing::{info, warn}; use crate::change::{ClusterChange, ClusterChangeStreamFactory, compute_cluster_change_events}; @@ -41,7 +42,6 @@ use crate::grpc_gossip::spawn_catchup_callback_task; use crate::member::{ ClusterMember, ENABLED_SERVICES_KEY, GRPC_ADVERTISE_ADDR_KEY, NodeStateExt, PIPELINE_METRICS_PREFIX, READINESS_KEY, READINESS_VALUE_NOT_READY, READINESS_VALUE_READY, - build_cluster_member, }; use crate::metrics::spawn_metrics_task; use crate::{ClusterChangeStream, ClusterNode}; @@ -86,6 +86,63 @@ impl Debug for Cluster { } } +#[cfg(any(test, feature = "testsuite"))] +impl Cluster { + pub async fn ready_members(&self) -> Vec { + self.ready_nodes() + .await + .into_iter() + .map(|node: ClusterNode| node.member().clone()) + .collect() + } + + /// Waits until the predicate holds true for the set of ready members. 
+ /// + /// Uses `change_stream_with_initial()` to obtain the current ready set atomically alongside + /// the subscription, so the predicate is evaluated on the full initial set before any future + /// change event arrives. + pub async fn wait_for_ready_members( + &self, + mut predicate: F, + timeout_after: Duration, + ) -> anyhow::Result<()> + where + F: FnMut(&[ClusterMember]) -> bool, + { + let (initial_nodes, mut change_stream) = self.change_stream_with_initial().await; + let mut ready_members: BTreeMap = initial_nodes + .into_iter() + .map(|node| (node.node_id.clone(), node.member().clone())) + .collect(); + let members: Vec = ready_members.values().cloned().collect(); + if predicate(&members) { + return Ok(()); + } + timeout(timeout_after, async move { + while let Some(change) = change_stream.next().await { + match change { + ClusterChange::Add(node) => { + ready_members.insert(node.node_id.clone(), (*node).clone()); + } + ClusterChange::Update(node) => { + ready_members.insert(node.node_id.clone(), (*node).clone()); + } + ClusterChange::Remove(node) => { + ready_members.remove(&node.node_id); + } + } + let members: Vec = ready_members.values().cloned().collect(); + if predicate(&members) { + return; + } + } + }) + .await + .context("deadline has passed before predicate held true")?; + Ok(()) + } +} + impl Cluster { pub fn cluster_id(&self) -> &str { &self.cluster_id @@ -95,8 +152,8 @@ impl Cluster { &self.self_chitchat_id } - pub fn self_node_id(&self) -> &NodeIdRef { - NodeIdRef::from_str(&self.self_chitchat_id.node_id) + pub fn self_node_id(&self) -> NodeId { + NodeId::from_arc_str(self.self_chitchat_id.node_id.clone()) } pub fn gossip_listen_addr(&self) -> SocketAddr { @@ -173,9 +230,6 @@ impl Cluster { let chitchat = chitchat_handle.chitchat(); let chitchat_guard = chitchat.lock().await; let live_nodes_rx = chitchat_guard.live_nodes_watcher(); - let live_nodes_stream = chitchat_guard.live_nodes_watch_stream(); - let (ready_members_tx, ready_members_rx) 
= watch::channel(Vec::new()); - spawn_ready_members_task(cluster_id.clone(), live_nodes_stream, ready_members_tx); drop(chitchat_guard); let weak_chitchat = Arc::downgrade(&chitchat); @@ -197,7 +251,6 @@ impl Cluster { chitchat_handle, live_nodes: BTreeMap::new(), change_stream_subscribers: Vec::new(), - ready_members_rx, }; let cluster = Cluster { cluster_id, @@ -211,28 +264,21 @@ impl Cluster { Ok(cluster) } - /// Deprecated: this is going away soon. - pub async fn ready_members(&self) -> Vec { - self.inner.read().await.ready_members_rx.borrow().clone() - } - - /// Deprecated: this is going away soon. - async fn ready_members_watcher(&self) -> WatchStream> { - WatchStream::new(self.inner.read().await.ready_members_rx.clone()) - } - pub async fn ready_nodes(&self) -> Vec { self.inner .write() .await .live_nodes .values() - .filter(|node| node.is_ready()) + .filter(|node| node.is_ready) .cloned() .collect() } /// Returns a stream of changes affecting the set of ready nodes in the cluster. + /// + /// Replays currently-ready nodes as `Add` events before future changes, under the write lock, + /// so no change can slip through unobserved. pub fn change_stream(&self) -> ClusterChangeStream { let (change_stream, change_stream_tx) = ClusterChangeStream::new_unbounded(); let inner = self.inner.clone(); @@ -240,7 +286,7 @@ impl Cluster { let future = async move { let mut inner = inner.write().await; for node in inner.live_nodes.values() { - if node.is_ready() { + if node.is_ready { change_stream_tx .send(ClusterChange::Add(node.clone())) .expect("receiver end of the channel should be open"); @@ -252,6 +298,25 @@ impl Cluster { change_stream } + /// Returns the current snapshot of ready nodes and a stream of future changes. + /// + /// Unlike `change_stream()`, the initial ready set is returned as a `Vec` rather than replayed + /// as individual `Add` events. 
The write lock is held for the duration of the snapshot so the + /// returned pair is consistent: no change event can arrive between the snapshot and the first + /// stream item. + pub async fn change_stream_with_initial(&self) -> (Vec, ClusterChangeStream) { + let (change_stream, change_stream_tx) = ClusterChangeStream::new_unbounded(); + let mut inner = self.inner.write().await; + let initial_nodes: Vec = inner + .live_nodes + .values() + .filter(|node| node.is_ready) + .cloned() + .collect(); + inner.change_stream_subscribers.push(change_stream_tx); + (initial_nodes, change_stream) + } + /// Returns whether the self node is ready. pub async fn is_self_node_ready(&self) -> bool { self.chitchat() @@ -328,27 +393,6 @@ impl Cluster { .subscribe_event(key_prefix, callback) } - /// Waits until the predicate holds true for the set of ready members. - pub async fn wait_for_ready_members( - &self, - mut predicate: F, - timeout_after: Duration, - ) -> anyhow::Result<()> - where - F: FnMut(&[ClusterMember]) -> bool, - { - timeout( - timeout_after, - self.ready_members_watcher() - .await - .skip_while(|members| !predicate(members)) - .next(), - ) - .await - .context("deadline has passed before predicate held true")?; - Ok(()) - } - /// Returns a snapshot of the cluster state, including the underlying Chitchat state. pub async fn snapshot(&self) -> ClusterSnapshot { let chitchat = self.chitchat().await; @@ -372,7 +416,7 @@ impl Cluster { ClusterSnapshot { cluster_id: self.cluster_id.clone(), - self_node_id: self.self_chitchat_id.node_id.clone(), + self_node_id: self.self_chitchat_id.node_id.to_string(), ready_nodes, live_nodes, dead_nodes, @@ -455,42 +499,6 @@ impl ClusterChangeStreamFactory for Cluster { } } -/// Deprecated: this is going away soon. 
-fn spawn_ready_members_task( - cluster_id: String, - mut live_nodes_stream: WatchStream>, - ready_members_tx: watch::Sender>, -) { - let fut = async move { - while let Some(new_live_nodes) = live_nodes_stream.next().await { - let mut new_ready_members = Vec::with_capacity(new_live_nodes.len()); - - for (chitchat_id, node_state) in new_live_nodes { - let member = match build_cluster_member(chitchat_id, &node_state) { - Ok(member) => member, - Err(error) => { - warn!( - cluster_id=%cluster_id, - error=?error, - "Failed to build cluster member from Chitchat node state." - ); - continue; - } - }; - if member.is_ready { - new_ready_members.push(member); - } - } - if *ready_members_tx.borrow() != new_ready_members - && ready_members_tx.send(new_ready_members).is_err() - { - break; - } - } - }; - tokio::spawn(fut); -} - /// Parses indexing tasks from the chitchat node state. pub fn parse_indexing_tasks(node_state: &NodeState) -> Vec { node_state @@ -623,7 +631,6 @@ struct InnerCluster { chitchat_handle: ChitchatHandle, live_nodes: BTreeMap, change_stream_subscribers: Vec>, - ready_members_rx: watch::Receiver>, } // Not used within the code, used for documentation. @@ -747,7 +754,7 @@ pub async fn create_cluster_for_test( static GOSSIP_ADVERTISE_PORT_SEQUENCE: AtomicU16 = AtomicU16::new(1u16); let gossip_advertise_port = GOSSIP_ADVERTISE_PORT_SEQUENCE.fetch_add(1, Ordering::Relaxed); - let node_id: NodeId = format!("node-{gossip_advertise_port}").into(); + let node_id: NodeId = NodeId::from_str(&format!("node-{gossip_advertise_port}")); let enabled_services = enabled_services .iter() @@ -789,10 +796,10 @@ mod tests { .await .unwrap(); - let mut ready_members_watcher = node.ready_members_watcher().await; - let ready_members = ready_members_watcher.next().await.unwrap(); + // Node starts not-ready: subscribe before any readiness change so we don't miss events. 
+ let mut change_stream = node.change_stream(); - assert!(ready_members.is_empty()); + assert!(node.ready_nodes().await.is_empty()); assert!(!node.is_self_node_ready().await); let cluster_snapshot = node.snapshot().await; @@ -811,8 +818,8 @@ mod tests { node.set_self_node_readiness(true).await; - let ready_members = ready_members_watcher.next().await.unwrap(); - assert_eq!(ready_members.len(), 1); + let change = change_stream.next().await.unwrap(); + assert!(matches!(change, ClusterChange::Add(_))); assert!(node.is_self_node_ready().await); let cluster_snapshot = node.snapshot().await; @@ -831,8 +838,8 @@ mod tests { node.set_self_node_readiness(false).await; - let ready_members = ready_members_watcher.next().await.unwrap(); - assert!(ready_members.is_empty()); + let change = change_stream.next().await.unwrap(); + assert!(matches!(change, ClusterChange::Remove(_))); assert!(!node.is_self_node_ready().await); let cluster_snapshot = node.snapshot().await; @@ -1187,7 +1194,7 @@ mod tests { let transport = ChannelTransport::default(); let cluster1a = create_cluster_for_test_with_id( - "node-11".into(), + NodeId::from_str("node-11"), 11, "cluster1".to_string(), Vec::new(), @@ -1197,7 +1204,7 @@ mod tests { ) .await?; let cluster2a = create_cluster_for_test_with_id( - "node-21".into(), + NodeId::from_str("node-21"), 21, "cluster2".to_string(), vec![cluster1a.gossip_listen_addr.to_string()], @@ -1207,7 +1214,7 @@ mod tests { ) .await?; let cluster1b = create_cluster_for_test_with_id( - "node-12".into(), + NodeId::from_str("node-12"), 12, "cluster1".to_string(), vec![ @@ -1220,7 +1227,7 @@ mod tests { ) .await?; let cluster2b = create_cluster_for_test_with_id( - "node-22".into(), + NodeId::from_str("node-22"), 22, "cluster2".to_string(), vec![ diff --git a/quickwit/quickwit-cluster/src/grpc_gossip.rs b/quickwit/quickwit-cluster/src/grpc_gossip.rs index 10be33970db..e15901a984d 100644 --- a/quickwit/quickwit-cluster/src/grpc_gossip.rs +++ 
b/quickwit/quickwit-cluster/src/grpc_gossip.rs @@ -117,7 +117,7 @@ async fn perform_grpc_gossip_rounds( .chitchat_id .expect("`chitchat_id` should be a required field"); let chitchat_id = ChitchatId { - node_id: proto_chitchat_id.node_id.clone(), + node_id: Arc::from(proto_chitchat_id.node_id.as_str()), generation_id: proto_chitchat_id.generation_id, gossip_advertise_addr: proto_chitchat_id .gossip_advertise_addr @@ -195,7 +195,7 @@ fn select_gossip_candidates( }) .choose_multiple(&mut rand::rng(), MAX_GOSSIP_PEERS) .into_iter() - .map(|(node_id, grpc_addr)| (node_id.clone(), grpc_addr)) + .map(|(node_id, grpc_addr)| (node_id.to_string(), grpc_addr)) .unzip() } @@ -338,7 +338,7 @@ mod tests { let chitchat_mutex_guard = chitchat.lock().await; let chitchat_id = ChitchatId { - node_id: "node-4".to_string(), + node_id: Arc::from("node-4"), generation_id: 0, gossip_advertise_addr: "127.0.0.1:14000".parse().unwrap(), }; diff --git a/quickwit/quickwit-cluster/src/grpc_service.rs b/quickwit/quickwit-cluster/src/grpc_service.rs index 5798a385e27..f984b8c44d4 100644 --- a/quickwit/quickwit-cluster/src/grpc_service.rs +++ b/quickwit/quickwit-cluster/src/grpc_service.rs @@ -72,7 +72,7 @@ impl ClusterService for Cluster { for (chitchat_id, node_state) in chitchat_guard.node_states() { let proto_chitchat_id = ProtoChitchatId { - node_id: chitchat_id.node_id.clone(), + node_id: chitchat_id.node_id.to_string(), generation_id: chitchat_id.generation_id, gossip_advertise_addr: chitchat_id.gossip_advertise_addr.to_string(), }; @@ -135,7 +135,7 @@ mod tests { .unwrap(); let cluster_id = cluster.cluster_id().to_string(); - let node_id = cluster.self_node_id().to_owned(); + let node_id = cluster.self_node_id(); cluster.set_self_key_value("foo", "bar").await; @@ -155,7 +155,7 @@ mod tests { let node_state = &mut fetch_cluster_state_response.node_states[0]; let chitchat_id = node_state.chitchat_id.clone().unwrap(); - assert_eq!(chitchat_id.node_id, node_id); + 
assert_eq!(chitchat_id.node_id.as_str(), node_id.as_str()); assert_eq!(chitchat_id.generation_id, 1); node_state diff --git a/quickwit/quickwit-cluster/src/member.rs b/quickwit/quickwit-cluster/src/member.rs index 4b5d276e96f..31f950d9f2d 100644 --- a/quickwit/quickwit-cluster/src/member.rs +++ b/quickwit/quickwit-cluster/src/member.rs @@ -106,11 +106,27 @@ pub struct ClusterMember { impl ClusterMember { pub fn chitchat_id(&self) -> ChitchatId { ChitchatId::new( - self.node_id.clone().into(), + self.node_id.clone(), self.generation_id.as_u64(), self.gossip_advertise_addr, ) } + + pub fn is_service_enabled(&self, service: QuickwitService) -> bool { + self.enabled_services.contains(&service) + } + + pub fn is_indexer(&self) -> bool { + self.is_service_enabled(QuickwitService::Indexer) + } + + pub fn is_ingester(&self) -> bool { + self.is_service_enabled(QuickwitService::Indexer) + } + + pub fn is_searcher(&self) -> bool { + self.is_service_enabled(QuickwitService::Searcher) + } } impl From for ChitchatId { @@ -153,7 +169,7 @@ pub(crate) fn build_cluster_member( let indexing_tasks = parse_indexing_tasks(node_state); let indexing_cpu_capacity = parse_indexing_cpu_capacity(node_state); let member = ClusterMember { - node_id: chitchat_id.node_id.into(), + node_id: NodeId::from_arc_str(chitchat_id.node_id.clone()), generation_id: chitchat_id.generation_id.into(), is_ready, enabled_services, diff --git a/quickwit/quickwit-cluster/src/node.rs b/quickwit/quickwit-cluster/src/node.rs index 4a8b11dbafc..6cc9b7f2f9c 100644 --- a/quickwit/quickwit-cluster/src/node.rs +++ b/quickwit/quickwit-cluster/src/node.rs @@ -12,18 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::collections::HashSet; use std::fmt::Debug; -use std::net::SocketAddr; use std::sync::Arc; use chitchat::{ChitchatId, NodeState}; use quickwit_config::service::QuickwitService; -use quickwit_proto::indexing::{CpuCapacity, IndexingTask}; -use quickwit_proto::types::NodeIdRef; +#[cfg(any(test, feature = "testsuite"))] +use quickwit_proto::indexing::IndexingTask; use tonic::transport::Channel; -use crate::member::build_cluster_member; +use crate::member::{ClusterMember, build_cluster_member}; #[derive(Clone)] pub struct ClusterNode { @@ -38,15 +36,10 @@ impl ClusterNode { channel: Channel, is_self_node: bool, ) -> anyhow::Result { - let member = build_cluster_member(chitchat_id.clone(), node_state)?; + let member = build_cluster_member(chitchat_id, node_state)?; let inner = InnerNode { - chitchat_id, + member, channel, - enabled_services: member.enabled_services, - grpc_advertise_addr: member.grpc_advertise_addr, - indexing_tasks: member.indexing_tasks, - indexing_capacity: member.indexing_cpu_capacity, - is_ready: member.is_ready, is_self_node, }; let node = ClusterNode { @@ -70,7 +63,7 @@ impl ClusterNode { let gossip_advertise_addr = ([127, 0, 0, 1], port).into(); let grpc_advertise_addr = ([127, 0, 0, 1], port + 1).into(); - let chitchat_id = ChitchatId::new(node_id.to_string(), 0, gossip_advertise_addr); + let chitchat_id = ChitchatId::new(Arc::from(node_id), 0, gossip_advertise_addr); let channel = make_channel(grpc_advertise_addr, ClientGrpcConfig::default()).await; let mut node_state = NodeState::for_test(); node_state.set(ENABLED_SERVICES_KEY, enabled_services.join(",")); @@ -79,67 +72,37 @@ impl ClusterNode { Self::try_new(chitchat_id, &node_state, channel, is_self_node).unwrap() } - pub fn chitchat_id(&self) -> &ChitchatId { - &self.inner.chitchat_id - } - - pub fn node_id(&self) -> &NodeIdRef { - NodeIdRef::from_str(&self.inner.chitchat_id.node_id) + pub fn member(&self) -> &ClusterMember { + &self.inner.member } pub fn channel(&self) -> Channel { 
self.inner.channel.clone() } - pub fn enabled_services(&self) -> &HashSet { - &self.inner.enabled_services - } - - pub fn is_indexer(&self) -> bool { - self.inner - .enabled_services - .contains(&QuickwitService::Indexer) - } - - pub fn is_ingester(&self) -> bool { - self.inner - .enabled_services - .contains(&QuickwitService::Indexer) - } - - pub fn is_searcher(&self) -> bool { - self.inner - .enabled_services - .contains(&QuickwitService::Searcher) - } - - pub fn grpc_advertise_addr(&self) -> SocketAddr { - self.inner.grpc_advertise_addr - } - - pub fn indexing_tasks(&self) -> &[IndexingTask] { - &self.inner.indexing_tasks + pub fn is_service_enabled(&self, service: QuickwitService) -> bool { + self.inner.member.is_service_enabled(service) } - pub fn indexing_capacity(&self) -> CpuCapacity { - self.inner.indexing_capacity + pub fn is_self_node(&self) -> bool { + self.inner.is_self_node } +} - pub fn is_ready(&self) -> bool { - self.inner.is_ready - } +impl std::ops::Deref for ClusterNode { + type Target = ClusterMember; - pub fn is_self_node(&self) -> bool { - self.inner.is_self_node + fn deref(&self) -> &ClusterMember { + self.member() } } impl Debug for ClusterNode { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("Node") - .field("node_id", &self.inner.chitchat_id.node_id) - .field("enabled_services", &self.inner.enabled_services) - .field("is_ready", &self.inner.is_ready) + .field("node_id", &self.inner.member.node_id) + .field("enabled_services", &self.inner.member.enabled_services) + .field("is_ready", &self.inner.member.is_ready) .finish() } } @@ -147,22 +110,13 @@ impl Debug for ClusterNode { #[cfg(test)] impl PartialEq for ClusterNode { fn eq(&self, other: &Self) -> bool { - self.inner.chitchat_id == other.inner.chitchat_id - && self.inner.enabled_services == other.inner.enabled_services - && self.inner.grpc_advertise_addr == other.inner.grpc_advertise_addr - && self.inner.indexing_tasks == other.inner.indexing_tasks - 
&& self.inner.is_ready == other.inner.is_ready + self.inner.member == other.inner.member && self.inner.is_self_node == other.inner.is_self_node } } struct InnerNode { - chitchat_id: ChitchatId, + member: ClusterMember, channel: Channel, - enabled_services: HashSet, - grpc_advertise_addr: SocketAddr, - indexing_tasks: Vec, - indexing_capacity: CpuCapacity, - is_ready: bool, is_self_node: bool, } diff --git a/quickwit/quickwit-codegen/example/src/codegen/hello.rs b/quickwit/quickwit-codegen/example/src/codegen/hello.rs index bacf8b3edac..11a840053d8 100644 --- a/quickwit/quickwit-codegen/example/src/codegen/hello.rs +++ b/quickwit/quickwit-codegen/example/src/codegen/hello.rs @@ -352,9 +352,9 @@ type PingLayer = quickwit_common::tower::BoxLayer< >; #[derive(Debug, Default)] pub struct HelloTowerLayerStack { - hello_layers: Vec, - goodbye_layers: Vec, - ping_layers: Vec, + pub hello_layers: Vec, + pub goodbye_layers: Vec, + pub ping_layers: Vec, } impl HelloTowerLayerStack { pub fn stack_layer(mut self, layer: L) -> Self diff --git a/quickwit/quickwit-codegen/src/codegen.rs b/quickwit/quickwit-codegen/src/codegen.rs index a7d6d311be0..84f15385539 100644 --- a/quickwit/quickwit-codegen/src/codegen.rs +++ b/quickwit/quickwit-codegen/src/codegen.rs @@ -832,7 +832,7 @@ fn generate_layer_stack_types_and_attributes( type #type_alias_name = quickwit_common::tower::BoxLayer, #request_type, #response_type, #error_type>; }; let attribute = quote! 
{ - #attribute_name: Vec<#type_alias_name>, + pub #attribute_name: Vec<#type_alias_name>, }; type_aliases.extend(type_alias); attributes.extend(attribute); diff --git a/quickwit/quickwit-common/Cargo.toml b/quickwit/quickwit-common/Cargo.toml index fe8c066c171..351b456a9ea 100644 --- a/quickwit/quickwit-common/Cargo.toml +++ b/quickwit/quickwit-common/Cargo.toml @@ -18,7 +18,6 @@ backtrace = { workspace = true, optional = true } bytesize = { workspace = true } coarsetime = { workspace = true } dyn-clone = { workspace = true } -env_logger = { workspace = true } fnv = { workspace = true } futures = { workspace = true } home = { workspace = true } @@ -50,9 +49,12 @@ tonic = { workspace = true, features = [ ] } tower = { workspace = true } tracing = { workspace = true } +tracing-subscriber = { workspace = true } [features] -testsuite = ["hyper-util"] +testsuite = [ + "hyper-util", +] named_tasks = ["tokio/tracing"] jemalloc-profiled = [ "named_tasks", diff --git a/quickwit/quickwit-common/src/lib.rs b/quickwit/quickwit-common/src/lib.rs index 0f3af2bc5ba..7feff4976ce 100644 --- a/quickwit/quickwit-common/src/lib.rs +++ b/quickwit/quickwit-common/src/lib.rs @@ -27,6 +27,7 @@ pub mod jemalloc_profiled; mod kill_switch; pub mod metrics; pub mod net; +pub mod numeric_types; mod path_hasher; pub mod pretty; mod progress; @@ -89,7 +90,10 @@ pub fn into_u64_range(range: Range) -> Range { } pub fn setup_logging_for_tests() { - let _ = env_logger::builder().format_timestamp(None).try_init(); + let _ = tracing_subscriber::fmt() + .with_test_writer() + .with_max_level(tracing::Level::INFO) + .try_init(); } pub fn split_file(split_id: impl Display) -> String { diff --git a/quickwit/quickwit-common/src/numeric_types.rs b/quickwit/quickwit-common/src/numeric_types.rs new file mode 100644 index 00000000000..cf4028f2888 --- /dev/null +++ b/quickwit/quickwit-common/src/numeric_types.rs @@ -0,0 +1,470 @@ +// Copyright 2021-Present Datadog, Inc. 
/// Helpers for comparing numerical values of different types (`i64`, `u64`
/// and `f64`) without going through a lossy conversion.
pub mod num_cmp {
    use std::cmp::Ordering;

    /// Compares an `i64` with an `f64`.
    ///
    /// Returns the ordering of `left_i` relative to `right_f`, or an error if
    /// `right_f` is NaN (NaN has no ordering).
    pub fn cmp_i64_f64(left_i: i64, right_f: f64) -> Result<Ordering, String> {
        if right_f.is_nan() {
            return Err("NaN comparison is not supported".to_string());
        }

        // If right_f is < i64::MIN then left_i > right_f (i64::MIN=-2^63 can be
        // exactly represented as f64)
        if right_f < i64::MIN as f64 {
            return Ok(Ordering::Greater);
        }
        // If right_f is >= i64::MAX then left_i < right_f (i64::MAX=2^63-1 cannot
        // be exactly represented as f64, `i64::MAX as f64` is 2^63)
        if right_f >= i64::MAX as f64 {
            return Ok(Ordering::Less);
        }

        // Now right_f is in [i64::MIN, i64::MAX), so `right_f as i64` is
        // well-defined (truncation toward 0)
        let right_as_i = right_f as i64;

        let result = match left_i.cmp(&right_as_i) {
            Ordering::Less => Ordering::Less,
            Ordering::Greater => Ordering::Greater,
            Ordering::Equal => {
                // they have the same integer part, compare the fraction
                let rem = right_f - (right_as_i as f64);
                if rem == 0.0 {
                    Ordering::Equal
                } else if right_f > 0.0 {
                    // positive fraction: right_f is slightly above left_i
                    Ordering::Less
                } else {
                    // negative fraction: truncation moved right_f up toward 0
                    Ordering::Greater
                }
            }
        };
        Ok(result)
    }

    /// Compares a `u64` with an `f64`.
    ///
    /// Returns the ordering of `left_u` relative to `right_f`, or an error if
    /// `right_f` is NaN.
    pub fn cmp_u64_f64(left_u: u64, right_f: f64) -> Result<Ordering, String> {
        if right_f.is_nan() {
            return Err("NaN comparison is not supported".to_string());
        }

        // Negative floats are always less than any u64 >= 0
        if right_f < 0.0 {
            return Ok(Ordering::Greater);
        }

        // `u64::MAX as f64` rounds up to 2^64, which is strictly greater than
        // every u64. The comparison must therefore be `>=`: with `>`, a
        // right_f of exactly 2^64 would fall through, saturate to u64::MAX in
        // the cast below and compare as Equal to u64::MAX.
        let max_as_f = u64::MAX as f64;
        if right_f >= max_as_f {
            return Ok(Ordering::Less);
        }

        // Now right_f is in [0, 2^64), so `right_f as u64` is well-defined
        // (truncation toward 0)
        let right_as_u = right_f as u64;

        let result = match left_u.cmp(&right_as_u) {
            Ordering::Less => Ordering::Less,
            Ordering::Greater => Ordering::Greater,
            Ordering::Equal => {
                // they have the same integer part, compare the fraction
                let rem = right_f - (right_as_u as f64);
                if rem == 0.0 {
                    Ordering::Equal
                } else {
                    // right_f is non-negative here, so any fraction is positive
                    Ordering::Less
                }
            }
        };
        Ok(result)
    }

    /// Compares an `i64` with a `u64`.
    ///
    /// Any negative `left_i` is less than every `u64`; otherwise both values
    /// are compared as `u64`.
    pub fn cmp_i64_u64(left_i: i64, right_u: u64) -> Ordering {
        if left_i < 0 {
            Ordering::Less
        } else {
            let left_as_u = left_i as u64;
            left_as_u.cmp(&right_u)
        }
    }
}
/// Helpers for projecting numerical values onto other numerical types.
///
/// When the target value space cannot exactly represent the source value, the
/// next representable value is returned (or `AfterLast` if the source value is
/// larger than the largest representable value).
///
/// All functions in this module assume that f64 values are not NaN.
pub mod num_proj {
    use std::cmp::Ordering;

    /// Result of projecting a number onto another numerical type.
    #[derive(Debug, PartialEq)]
    pub enum ProjectedNumber<T> {
        /// The source value is exactly representable in the target type.
        Exact(T),
        /// The source value is not representable; this is the smallest target
        /// value that is greater than the source value.
        Next(T),
        /// The source value is larger than every value of the target type.
        AfterLast,
    }

    /// Projects an `i64` onto `u64`. Negative values map to `Next(0)`.
    pub fn i64_to_u64(value: i64) -> ProjectedNumber<u64> {
        if value < 0 {
            ProjectedNumber::Next(0)
        } else {
            ProjectedNumber::Exact(value as u64)
        }
    }

    /// Projects a `u64` onto `i64`. Values above `i64::MAX` map to `AfterLast`.
    pub fn u64_to_i64(value: u64) -> ProjectedNumber<i64> {
        if value > i64::MAX as u64 {
            ProjectedNumber::AfterLast
        } else {
            ProjectedNumber::Exact(value as i64)
        }
    }

    /// Projects an `f64` onto `u64`.
    pub fn f64_to_u64(value: f64) -> ProjectedNumber<u64> {
        if value < 0.0 {
            ProjectedNumber::Next(0)
        } else if value >= u64::MAX as f64 {
            // `u64::MAX as f64` is 2^64, which is already out of range, hence
            // `>=`: with `>`, 2^64 would saturate to u64::MAX in the cast and
            // be reported as an exact match.
            ProjectedNumber::AfterLast
        } else if value.fract() == 0.0 {
            ProjectedNumber::Exact(value as u64)
        } else {
            // casting f64 to u64 truncates toward zero
            ProjectedNumber::Next(value as u64 + 1)
        }
    }

    /// Projects an `f64` onto `i64`.
    pub fn f64_to_i64(value: f64) -> ProjectedNumber<i64> {
        if value < (i64::MIN as f64) {
            ProjectedNumber::Next(i64::MIN)
        } else if value >= (i64::MAX as f64) {
            ProjectedNumber::AfterLast
        } else if value.fract() == 0.0 {
            ProjectedNumber::Exact(value as i64)
        } else if value > 0.0 {
            // casting f64 to i64 truncates toward zero
            ProjectedNumber::Next(value as i64 + 1)
        } else {
            // for negative values truncation toward zero already rounds up
            ProjectedNumber::Next(value as i64)
        }
    }

    /// Projects an `i64` onto `f64`.
    pub fn i64_to_f64(value: i64) -> ProjectedNumber<f64> {
        let value_f = value as f64;
        // Round-trip through i128 rather than i64: `value_f as i64` saturates,
        // so i64::MAX (which rounds up to 2^63 as f64) would round-trip to
        // itself and be wrongly reported as exactly representable.
        match (value_f as i128).cmp(&i128::from(value)) {
            // between -2^53 and 2^53 all i64 are exactly represented as f64
            Ordering::Equal => ProjectedNumber::Exact(value_f),
            // the cast rounded up: value_f is already the next f64
            Ordering::Greater => ProjectedNumber::Next(value_f),
            // the cast rounded down: step to the following f64
            Ordering::Less => ProjectedNumber::Next(value_f.next_up()),
        }
    }

    /// Projects a `u64` onto `f64`.
    pub fn u64_to_f64(value: u64) -> ProjectedNumber<f64> {
        let value_f = value as f64;
        // Round-trip through u128 rather than u64: `value_f as u64` saturates,
        // so u64::MAX (which rounds up to 2^64 as f64) would round-trip to
        // itself and be wrongly reported as exactly representable.
        match (value_f as u128).cmp(&u128::from(value)) {
            // between 0 and 2^53 all u64 are exactly represented as f64
            Ordering::Equal => ProjectedNumber::Exact(value_f),
            // the cast rounded up: value_f is already the next f64
            Ordering::Greater => ProjectedNumber::Next(value_f),
            // the cast rounded down: step to the following f64
            Ordering::Less => ProjectedNumber::Next(value_f.next_up()),
        }
    }
}
+ + #[test] + fn test_cmp_i64_f64() { + // Basic comparisons + assert_eq!(cmp_i64_f64(5, 5.0).unwrap(), Ordering::Equal); + assert_eq!(cmp_i64_f64(5, 6.0).unwrap(), Ordering::Less); + assert_eq!(cmp_i64_f64(6, 5.0).unwrap(), Ordering::Greater); + assert_eq!(cmp_i64_f64(-5, -5.0).unwrap(), Ordering::Equal); + assert_eq!(cmp_i64_f64(-5, -4.0).unwrap(), Ordering::Less); + assert_eq!(cmp_i64_f64(-4, -5.0).unwrap(), Ordering::Greater); + assert_eq!(cmp_i64_f64(-5, 5.0).unwrap(), Ordering::Less); + assert_eq!(cmp_i64_f64(5, -5.0).unwrap(), Ordering::Greater); + assert_eq!(cmp_i64_f64(0, -0.1).unwrap(), Ordering::Greater); + assert_eq!(cmp_i64_f64(0, 0.1).unwrap(), Ordering::Less); + assert_eq!(cmp_i64_f64(-1, -0.5).unwrap(), Ordering::Less); + assert_eq!(cmp_i64_f64(-1, 0.0).unwrap(), Ordering::Less); + assert_eq!(cmp_i64_f64(0, 0.0).unwrap(), Ordering::Equal); + + // Tests with extreme values + assert_eq!(cmp_i64_f64(i64::MAX, 1e20).unwrap(), Ordering::Less); + assert_eq!(cmp_i64_f64(i64::MIN, -1e20).unwrap(), Ordering::Greater); + + // Precision edge cases: large i64 that loses precision when converted to f64 + // => 2^54, exactly represented as f64 + let large_f64 = 18_014_398_509_481_984.0; + let large_i64 = 18_014_398_509_481_984; + // prove that large_i64 is exactly represented as f64 + assert_eq!(large_i64 as f64, large_f64); + assert_eq!(cmp_i64_f64(large_i64, large_f64).unwrap(), Ordering::Equal); + // => (1_i64 << 54) + 1 cannot be exactly represented in f64 + let large_i64_plus_1 = 18_014_398_509_481_985; + // prove that it is represented as f64 by large_f64 + assert_eq!(large_i64_plus_1 as f64, large_f64); + assert_eq!( + cmp_i64_f64(large_i64_plus_1, large_f64).unwrap(), + Ordering::Greater + ); + // => (1_i64 << 54) - 1 cannot be exactly represented in f64 + let large_i64_minus_1 = 18_014_398_509_481_983; + // prove that it is also represented as f64 by large_f64 + assert_eq!(large_i64_minus_1 as f64, large_f64); + assert_eq!( + 
cmp_i64_f64(large_i64_minus_1, large_f64).unwrap(), + Ordering::Less + ); + + // Same precision edge case but with negative values + // => -2^54, exactly represented as f64 + let large_neg_f64 = -18_014_398_509_481_984.0; + let large_neg_i64 = -18_014_398_509_481_984; + // prove that large_neg_i64 is exactly represented as f64 + assert_eq!(large_neg_i64 as f64, large_neg_f64); + assert_eq!( + cmp_i64_f64(large_neg_i64, large_neg_f64).unwrap(), + Ordering::Equal + ); + // => -(2^54 + 1) cannot be exactly represented in f64 + let large_neg_i64_plus_1 = -18_014_398_509_481_985; + // prove that it is represented as f64 by large_neg_f64 + assert_eq!(large_neg_i64_plus_1 as f64, large_neg_f64); + assert_eq!( + cmp_i64_f64(large_neg_i64_plus_1, large_neg_f64).unwrap(), + Ordering::Less + ); + // => -(2^54 - 1) cannot be exactly represented in f64 + let large_neg_i64_minus_1 = -18_014_398_509_481_983; + // prove that it is also represented as f64 by large_neg_f64 + assert_eq!(large_neg_i64_minus_1 as f64, large_neg_f64); + assert_eq!( + cmp_i64_f64(large_neg_i64_minus_1, large_neg_f64).unwrap(), + Ordering::Greater + ); + + // NaN comparison results in an error + assert!(cmp_i64_f64(0, f64::NAN).is_err()); + } + + #[test] + fn test_cmp_i64_u64() { + // Test with negative i64 values (should always be less than any u64) + assert_eq!(cmp_i64_u64(-1, 0), Ordering::Less); + assert_eq!(cmp_i64_u64(i64::MIN, 0), Ordering::Less); + assert_eq!(cmp_i64_u64(i64::MIN, u64::MAX), Ordering::Less); + + // Test with positive i64 values + assert_eq!(cmp_i64_u64(0, 0), Ordering::Equal); + assert_eq!(cmp_i64_u64(1, 0), Ordering::Greater); + assert_eq!(cmp_i64_u64(1, 1), Ordering::Equal); + assert_eq!(cmp_i64_u64(0, 1), Ordering::Less); + assert_eq!(cmp_i64_u64(5, 10), Ordering::Less); + assert_eq!(cmp_i64_u64(10, 5), Ordering::Greater); + + // Test with values near i64::MAX and u64 conversion + assert_eq!(cmp_i64_u64(i64::MAX, i64::MAX as u64), Ordering::Equal); + 
assert_eq!(cmp_i64_u64(i64::MAX, (i64::MAX as u64) + 1), Ordering::Less); + assert_eq!(cmp_i64_u64(i64::MAX, u64::MAX), Ordering::Less); + } +} + +#[cfg(test)] +mod num_proj_tests { + use super::num_proj::{self, ProjectedNumber}; + + #[test] + fn test_i64_to_u64() { + assert_eq!(num_proj::i64_to_u64(-1), ProjectedNumber::Next(0)); + assert_eq!(num_proj::i64_to_u64(i64::MIN), ProjectedNumber::Next(0)); + assert_eq!(num_proj::i64_to_u64(0), ProjectedNumber::Exact(0)); + assert_eq!(num_proj::i64_to_u64(42), ProjectedNumber::Exact(42)); + assert_eq!( + num_proj::i64_to_u64(i64::MAX), + ProjectedNumber::Exact(i64::MAX as u64) + ); + } + + #[test] + fn test_u64_to_i64() { + assert_eq!(num_proj::u64_to_i64(0), ProjectedNumber::Exact(0)); + assert_eq!(num_proj::u64_to_i64(42), ProjectedNumber::Exact(42)); + assert_eq!( + num_proj::u64_to_i64(i64::MAX as u64), + ProjectedNumber::Exact(i64::MAX) + ); + assert_eq!( + num_proj::u64_to_i64((i64::MAX as u64) + 1), + ProjectedNumber::AfterLast + ); + assert_eq!(num_proj::u64_to_i64(u64::MAX), ProjectedNumber::AfterLast); + } + + #[test] + fn test_f64_to_u64() { + assert_eq!(num_proj::f64_to_u64(-1e25), ProjectedNumber::Next(0)); + assert_eq!(num_proj::f64_to_u64(-0.1), ProjectedNumber::Next(0)); + assert_eq!(num_proj::f64_to_u64(1e20), ProjectedNumber::AfterLast); + assert_eq!( + num_proj::f64_to_u64(f64::INFINITY), + ProjectedNumber::AfterLast + ); + assert_eq!(num_proj::f64_to_u64(0.0), ProjectedNumber::Exact(0)); + assert_eq!(num_proj::f64_to_u64(42.0), ProjectedNumber::Exact(42)); + assert_eq!(num_proj::f64_to_u64(0.5), ProjectedNumber::Next(1)); + assert_eq!(num_proj::f64_to_u64(42.1), ProjectedNumber::Next(43)); + } + + #[test] + fn test_f64_to_i64() { + assert_eq!(num_proj::f64_to_i64(-1e20), ProjectedNumber::Next(i64::MIN)); + assert_eq!( + num_proj::f64_to_i64(f64::NEG_INFINITY), + ProjectedNumber::Next(i64::MIN) + ); + assert_eq!(num_proj::f64_to_i64(1e20), ProjectedNumber::AfterLast); + assert_eq!( + 
num_proj::f64_to_i64(f64::INFINITY), + ProjectedNumber::AfterLast + ); + assert_eq!(num_proj::f64_to_i64(0.0), ProjectedNumber::Exact(0)); + assert_eq!(num_proj::f64_to_i64(42.0), ProjectedNumber::Exact(42)); + assert_eq!(num_proj::f64_to_i64(-42.0), ProjectedNumber::Exact(-42)); + assert_eq!(num_proj::f64_to_i64(0.5), ProjectedNumber::Next(1)); + assert_eq!(num_proj::f64_to_i64(42.1), ProjectedNumber::Next(43)); + assert_eq!(num_proj::f64_to_i64(-0.5), ProjectedNumber::Next(0)); + assert_eq!(num_proj::f64_to_i64(-42.1), ProjectedNumber::Next(-42)); + } + + #[test] + fn test_i64_to_f64() { + assert_eq!(num_proj::i64_to_f64(0), ProjectedNumber::Exact(0.0)); + assert_eq!(num_proj::i64_to_f64(42), ProjectedNumber::Exact(42.0)); + assert_eq!(num_proj::i64_to_f64(-42), ProjectedNumber::Exact(-42.0)); + + let max_exact = 9_007_199_254_740_992; // 2^53 + assert_eq!( + num_proj::i64_to_f64(max_exact), + ProjectedNumber::Exact(max_exact as f64) + ); + + // Test values that cannot be exactly represented as f64 (integers above 2^53) + let large_i64 = 9_007_199_254_740_993; // 2^53 + 1 + let closest_f64 = 9_007_199_254_740_992.0; + assert_eq!(large_i64 as f64, closest_f64); + if let ProjectedNumber::Next(val) = num_proj::i64_to_f64(large_i64) { + // Verify that the returned float is different from the direct cast + assert!(val > closest_f64); + assert!(val - closest_f64 < 2. 
* f64::EPSILON * closest_f64); + } else { + panic!("Expected ProjectedNumber::Next for large_i64"); + } + + // Test with very large negative value + let large_neg_i64 = -9_007_199_254_740_993; // -(2^53 + 1) + let closest_neg_f64 = -9_007_199_254_740_992.0; + assert_eq!(large_neg_i64 as f64, closest_neg_f64); + if let ProjectedNumber::Next(val) = num_proj::i64_to_f64(large_neg_i64) { + // Verify that the returned float is the closest representable f64 + assert_eq!(val, closest_neg_f64); + } else { + panic!("Expected ProjectedNumber::Next for large_neg_i64"); + } + } + + #[test] + fn test_u64_to_f64() { + assert_eq!(num_proj::u64_to_f64(0), ProjectedNumber::Exact(0.0)); + assert_eq!(num_proj::u64_to_f64(42), ProjectedNumber::Exact(42.0)); + + // Test the largest u64 value that can be exactly represented as f64 (2^53) + let max_exact = 9_007_199_254_740_992; // 2^53 + assert_eq!( + num_proj::u64_to_f64(max_exact), + ProjectedNumber::Exact(max_exact as f64) + ); + + // Test values that cannot be exactly represented as f64 (integers above 2^53) + let large_u64 = 9_007_199_254_740_993; // 2^53 + 1 + let closest_f64 = 9_007_199_254_740_992.0; + assert_eq!(large_u64 as f64, closest_f64); + if let ProjectedNumber::Next(val) = num_proj::u64_to_f64(large_u64) { + // Verify that the returned float is different from the direct cast + assert!(val > closest_f64); + assert!(val - closest_f64 < 2. 
* f64::EPSILON * closest_f64); + } else { + panic!("Expected ProjectedNumber::Next for large_u64"); + } + } +} diff --git a/quickwit/quickwit-common/src/rate_limited_tracing.rs b/quickwit/quickwit-common/src/rate_limited_tracing.rs index c9a323f9ec2..198c2bf8bdd 100644 --- a/quickwit/quickwit-common/src/rate_limited_tracing.rs +++ b/quickwit/quickwit-common/src/rate_limited_tracing.rs @@ -179,12 +179,13 @@ fn _check_macro_works() { #[doc(hidden)] pub use coarsetime::Instant as CoarsetimeInstant; +pub use rate_limited_debug; +pub use rate_limited_error; +pub use rate_limited_info; +pub use rate_limited_trace; #[doc(hidden)] pub use rate_limited_tracing; -pub use { - rate_limited_debug, rate_limited_error, rate_limited_info, rate_limited_trace, - rate_limited_warn, -}; +pub use rate_limited_warn; #[cfg(test)] mod tests { diff --git a/quickwit/quickwit-config/src/index_config/mod.rs b/quickwit/quickwit-config/src/index_config/mod.rs index e6e7adb3766..1f8af60aa57 100644 --- a/quickwit/quickwit-config/src/index_config/mod.rs +++ b/quickwit/quickwit-config/src/index_config/mod.rs @@ -487,6 +487,7 @@ impl crate::TestableForRegression for IndexConfig { ], timestamp_field: Some("timestamp".to_string()), secondary_timestamp_field: None, + indexation_time_field: None, tag_fields: BTreeSet::from_iter(["tenant_id".to_string(), "log_level".to_string()]), partition_key: Some("tenant_id".to_string()), max_num_partitions: NonZeroU32::new(100).unwrap(), diff --git a/quickwit/quickwit-config/src/lib.rs b/quickwit/quickwit-config/src/lib.rs index b1d675fc1f6..b2703c0d6bd 100644 --- a/quickwit/quickwit-config/src/lib.rs +++ b/quickwit/quickwit-config/src/lib.rs @@ -98,6 +98,12 @@ pub fn disable_ingest_v1() -> bool { *DISABLE_INGEST_V1 } +pub fn is_delete_task_service_disabled() -> bool { + static DISABLE_DELETE_TASK_SERVICE_ENV: Lazy = + Lazy::new(|| get_bool_from_env("QW_DISABLE_DELETE_TASK_SERVICE", false)); + *DISABLE_DELETE_TASK_SERVICE_ENV +} + #[derive(utoipa::OpenApi)] 
#[openapi(components(schemas( ConstWriteAmplificationMergePolicyConfig, diff --git a/quickwit/quickwit-config/src/node_config/mod.rs b/quickwit/quickwit-config/src/node_config/mod.rs index 31e19bce09c..0d70d0a2b77 100644 --- a/quickwit/quickwit-config/src/node_config/mod.rs +++ b/quickwit/quickwit-config/src/node_config/mod.rs @@ -59,6 +59,9 @@ pub struct RestConfig { pub struct GrpcConfig { #[serde(default = "GrpcConfig::default_max_message_size")] pub max_message_size: ByteSize, + /// Search server responses can be larger when returning many hits. + #[serde(default = "GrpcConfig::default_max_search_message_size")] + pub max_search_message_size: ByteSize, #[serde(default)] pub tls: Option, // If set, keeps idle connection alive by periodically perform a @@ -104,6 +107,10 @@ impl GrpcConfig { ByteSize::mib(20) } + fn default_max_search_message_size() -> ByteSize { + ByteSize::mib(60) + } + pub fn validate(&self) -> anyhow::Result<()> { ensure!( self.max_message_size >= ByteSize::mb(1), @@ -118,6 +125,7 @@ impl Default for GrpcConfig { fn default() -> Self { Self { max_message_size: Self::default_max_message_size(), + max_search_message_size: Self::default_max_search_message_size(), tls: None, keep_alive: None, } @@ -846,6 +854,7 @@ mod tests { fn test_grpc_config_validate() { let grpc_config = GrpcConfig { max_message_size: ByteSize::mb(1), + max_search_message_size: ByteSize::mb(1), tls: None, keep_alive: None, }; @@ -853,6 +862,7 @@ mod tests { let grpc_config = GrpcConfig { max_message_size: ByteSize::kb(1), + max_search_message_size: ByteSize::kb(1), tls: None, keep_alive: None, }; diff --git a/quickwit/quickwit-config/src/node_config/serialize.rs b/quickwit/quickwit-config/src/node_config/serialize.rs index 38e01bd635a..52810ee7a9d 100644 --- a/quickwit/quickwit-config/src/node_config/serialize.rs +++ b/quickwit/quickwit-config/src/node_config/serialize.rs @@ -217,7 +217,10 @@ impl NodeConfigBuilder { mut self, env_vars: &HashMap, ) -> anyhow::Result { - let 
node_id = self.node_id.resolve(env_vars).map(NodeId::new)?; + let node_id = self + .node_id + .resolve(env_vars) + .map(|s| NodeId::from_str(&s))?; let enabled_services = self .enabled_services @@ -467,7 +470,7 @@ pub fn node_config_for_tests_from_ports( rest_listen_port: u16, grpc_listen_port: u16, ) -> NodeConfig { - let node_id = NodeId::new(default_node_id().unwrap()); + let node_id = NodeId::from_str(&default_node_id().unwrap()); let enabled_services = QuickwitService::supported_services(); let listen_address = Host::default(); let rest_listen_addr = listen_address @@ -744,7 +747,10 @@ mod tests { .await .unwrap(); assert_eq!(config.cluster_id, DEFAULT_CLUSTER_ID); - assert_eq!(config.node_id, get_short_hostname().unwrap()); + assert_eq!( + config.node_id.as_str(), + get_short_hostname().unwrap().as_str() + ); assert_eq!( config.enabled_services, QuickwitService::supported_services() diff --git a/quickwit/quickwit-config/src/storage_config.rs b/quickwit/quickwit-config/src/storage_config.rs index 52daffdb537..7a9af4b1cdf 100644 --- a/quickwit/quickwit-config/src/storage_config.rs +++ b/quickwit/quickwit-config/src/storage_config.rs @@ -425,6 +425,7 @@ impl fmt::Debug for S3StorageConfig { "disable_multi_object_delete", &self.disable_multi_object_delete, ) + .field("encryption", &self.encryption) .finish() } } diff --git a/quickwit/quickwit-control-plane/Cargo.toml b/quickwit/quickwit-control-plane/Cargo.toml index 2957c9858c4..63cd6138af7 100644 --- a/quickwit/quickwit-control-plane/Cargo.toml +++ b/quickwit/quickwit-control-plane/Cargo.toml @@ -13,6 +13,7 @@ license.workspace = true [dependencies] anyhow = { workspace = true } async-trait = { workspace = true } +base64 = { workspace = true } bytesize = { workspace = true } fnv = { workspace = true } futures = { workspace = true } @@ -20,10 +21,12 @@ itertools = { workspace = true } lru = { workspace = true } mockall = { workspace = true, optional = true } once_cell = { workspace = true } +prost = { workspace 
= true } rand = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } smallvec = { workspace = true } +time = { workspace = true } tokio = { workspace = true } tracing = { workspace = true } ulid = { workspace = true } diff --git a/quickwit/quickwit-control-plane/src/control_plane.rs b/quickwit/quickwit-control-plane/src/control_plane.rs index e4c6995d639..e91f8d689bd 100644 --- a/quickwit/quickwit-control-plane/src/control_plane.rs +++ b/quickwit/quickwit-control-plane/src/control_plane.rs @@ -34,13 +34,15 @@ use quickwit_common::pretty::PrettyDisplay; use quickwit_common::pubsub::EventSubscriber; use quickwit_common::uri::Uri; use quickwit_common::{Progress, shared_consts}; -use quickwit_config::service::QuickwitService; use quickwit_config::{ClusterConfig, IndexConfig, IndexTemplate, SourceConfig}; use quickwit_ingest::{IngesterPool, LocalShardsUpdate}; use quickwit_metastore::{CreateIndexRequestExt, CreateIndexResponseExt, IndexMetadataResponseExt}; use quickwit_proto::control_plane::{ AdviseResetShardsRequest, AdviseResetShardsResponse, ControlPlaneError, ControlPlaneResult, + DisableMaintenanceModeRequest, DisableMaintenanceModeResponse, EnableMaintenanceModeRequest, + EnableMaintenanceModeResponse, GetMaintenanceModeRequest, GetMaintenanceModeResponse, GetOrCreateOpenShardsRequest, GetOrCreateOpenShardsResponse, GetOrCreateOpenShardsSubrequest, + SwapIndexingPipelinesRequest, SwapIndexingPipelinesResponse, }; use quickwit_proto::indexing::ShardPositionsUpdate; use quickwit_proto::metastore::{ @@ -62,6 +64,7 @@ use crate::debouncer::Debouncer; use crate::indexing_scheduler::{IndexingScheduler, IndexingSchedulerState}; use crate::ingest::IngestController; use crate::ingest::ingest_controller::{IngestControllerStats, RebalanceShardsCallback}; +use crate::maintenance::{MaintenanceState, MetastoreKvPersistence, serialize_frozen_plan}; use crate::model::ControlPlaneModel; /// Interval between two controls (or checks) of the desired 
plan VS running plan. @@ -102,6 +105,11 @@ pub struct ControlPlane { readiness_tx: watch::Sender, // Disables the control loop. This is useful for unit testing. disable_control_loop: bool, + /// Maintenance mode state. When active the indexing plan is frozen (not + /// rebuilt on topology changes). + maintenance: MaintenanceState, + /// Persistence backend for maintenance mode state (frozen plan + metadata). + maintenance_persistence: MetastoreKvPersistence, } impl fmt::Debug for ControlPlane { @@ -125,6 +133,7 @@ impl ControlPlane { watch::Receiver, ) { let disable_control_loop = false; + let maintenance_persistence = MetastoreKvPersistence::new(metastore.clone()); Self::spawn_inner( universe, cluster_config, @@ -134,6 +143,7 @@ impl ControlPlane { ingester_pool, metastore, disable_control_loop, + maintenance_persistence, ) } @@ -147,6 +157,7 @@ impl ControlPlane { ingester_pool: IngesterPool, metastore: MetastoreServiceClient, disable_control_loop: bool, + maintenance_persistence: MetastoreKvPersistence, ) -> ( Mailbox, ActorHandle>, @@ -186,6 +197,8 @@ impl ControlPlane { rebuild_plan_debouncer: Debouncer::new(REBUILD_PLAN_COOLDOWN_PERIOD), readiness_tx, disable_control_loop, + maintenance: MaintenanceState::default(), + maintenance_persistence: maintenance_persistence.clone(), } }); (control_plane_mailbox, control_plane_handle, readiness_rx) @@ -199,6 +212,7 @@ pub struct ControlPlaneObservableState { pub num_indexes: usize, pub num_sources: usize, pub readiness: bool, + pub maintenance_mode: bool, } #[async_trait] @@ -216,6 +230,7 @@ impl Actor for ControlPlane { num_indexes: self.model.num_indexes(), num_sources: self.model.num_sources(), readiness: *self.readiness_tx.borrow(), + maintenance_mode: self.maintenance.is_active(), } } @@ -227,7 +242,17 @@ impl Actor for ControlPlane { .await .context("failed to initialize control plane model")?; - let _rebuild_plan_waiter = self.rebuild_plan_debounced(ctx); + self.load_maintenance_state_from_persistence().await; 
+ + if self.maintenance.is_active() { + // In maintenance mode: restore the frozen plan without triggering a rebuild. + info!( + enabled_at = self.maintenance.enabled_at().unwrap_or_default(), + "control plane starting in maintenance mode: indexing plan is frozen" + ); + } else { + let _rebuild_plan_waiter = self.rebuild_plan_debounced(ctx); + } self.ingest_controller.sync_with_all_ingesters(&self.model); @@ -245,6 +270,37 @@ impl Actor for ControlPlane { } impl ControlPlane { + /// Loads maintenance state from the persistence backend. + /// Called during `initialize()`. + async fn load_maintenance_state_from_persistence(&mut self) { + match self.maintenance_persistence.load().await { + Some(persisted) => { + self.maintenance.load_from_metadata(persisted.metadata); + if self.maintenance.is_active() { + crate::metrics::CONTROL_PLANE_METRICS + .maintenance_mode + .set(1); + let num_indexers = persisted.frozen_plan.num_indexers(); + let num_pipelines: usize = persisted + .frozen_plan + .indexing_tasks_per_indexer() + .values() + .map(|tasks| tasks.len()) + .sum(); + info!( + num_indexers, + num_pipelines, "restored frozen indexing plan from persistence" + ); + self.indexing_scheduler + .load_frozen_plan(persisted.frozen_plan); + } + } + None => { + // No maintenance state persisted — normal operation. 
+ } + } + } + async fn auto_create_indexes( &mut self, subrequests: &[GetOrCreateOpenShardsSubrequest], @@ -353,7 +409,7 @@ impl ControlPlane { let physical_indexing_plan: Vec = self .indexing_scheduler .observable_state() - .last_applied_physical_plan + .current_targeted_physical_plan .map(|plan| { plan.indexing_tasks_per_indexer() .iter() @@ -427,7 +483,8 @@ impl Handler for ControlPlane { _message: RebuildPlan, _ctx: &ActorContext, ) -> Result<(), ActorExitStatus> { - self.indexing_scheduler.rebuild_plan(&self.model); + self.indexing_scheduler + .rebuild_plan(&self.model, self.maintenance.is_active()); Ok(()) } } @@ -508,14 +565,21 @@ impl Handler for ControlPlane { if self.disable_control_loop { return Ok(()); } + let is_maintenance = self.maintenance.is_active(); if let Err(metastore_error) = self .ingest_controller - .rebalance_shards(&mut self.model, ctx.mailbox(), ctx.progress()) + .rebalance_shards( + &mut self.model, + ctx.mailbox(), + ctx.progress(), + is_maintenance, + ) .await { return convert_metastore_error::<()>(metastore_error).map(|_| ()); } - self.indexing_scheduler.control_running_plan(&self.model); + self.indexing_scheduler + .control_running_plan(&self.model, is_maintenance); ctx.schedule_self_msg(CONTROL_PLAN_LOOP_INTERVAL, ControlPlanLoop); Ok(()) } @@ -596,7 +660,8 @@ impl DeferableReplyHandler for ControlPlane { // Now, create index can also add sources to support creating indexes automatically from // index and source config templates. - let should_rebuild_plan = !index_metadata.sources.is_empty(); + let should_rebuild_plan = + !index_metadata.sources.is_empty() && !self.maintenance.is_active(); self.model.add_index(index_metadata); if should_rebuild_plan { @@ -646,6 +711,7 @@ impl Handler for ControlPlane { if self .model .update_index_config(&index_uid, index_metadata.index_config)? 
+ && !self.maintenance.is_active() { let _rebuild_plan_notifier = self.rebuild_plan_debounced(ctx); } @@ -688,7 +754,9 @@ impl Handler for ControlPlane { // TODO: Refine the event. Notify index will have the effect to reload the entire state from // the metastore. We should update the state of the control plane. - let _rebuild_plan_waiter = self.rebuild_plan_debounced(ctx); + if !self.maintenance.is_active() { + let _rebuild_plan_waiter = self.rebuild_plan_debounced(ctx); + } info!(%index_uid, "deleted index"); let response = EmptyResponse {}; @@ -730,7 +798,9 @@ impl Handler for ControlPlane { // TODO: Refine the event. Notify index will have the effect to reload the entire state from // the metastore. We should update the state of the control plane. - let _rebuild_plan_waiter = self.rebuild_plan_debounced(ctx); + if !self.maintenance.is_active() { + let _rebuild_plan_waiter = self.rebuild_plan_debounced(ctx); + } let response = EmptyResponse {}; Ok(Ok(response)) @@ -770,7 +840,9 @@ impl Handler for ControlPlane { // TODO: Refine the event. Notify index will have the effect to reload the entire state from // the metastore. We should update the state of the control plane. 
- let _rebuild_plan_waiter = self.rebuild_plan_debounced(ctx); + if !self.maintenance.is_active() { + let _rebuild_plan_waiter = self.rebuild_plan_debounced(ctx); + } info!(%index_uid, source_id, "updated source"); let response = EmptyResponse {}; @@ -806,7 +878,7 @@ impl Handler for ControlPlane { .toggle_source(&index_uid, &source_id, enable) .context("failed to toggle source")?; - if mutation_occurred { + if mutation_occurred && !self.maintenance.is_active() { let _rebuild_plan_waiter = self.rebuild_plan_debounced(ctx); } info!(%index_uid, source_id, enabled=enable, "toggled source"); @@ -861,7 +933,9 @@ impl Handler for ControlPlane { .sync_with_ingesters(&ingesters_needing_resync, &self.model); self.model.delete_source(&source_uid); - let _rebuild_plan_waiter = self.rebuild_plan_debounced(ctx); + if !self.maintenance.is_active() { + let _rebuild_plan_waiter = self.rebuild_plan_debounced(ctx); + } info!( index_uid=%source_uid.index_uid, @@ -916,9 +990,12 @@ impl Handler for ControlPlane { request: GetOrCreateOpenShardsRequest, ctx: &ActorContext, ) -> Result { - if let Err(metastore_error) = self - .auto_create_indexes(&request.subrequests, ctx.progress()) - .await + // In maintenance mode, block auto-create indexes but still allow shard routing + // for existing sources (ingest must continue). 
+ if !self.maintenance.is_active() + && let Err(metastore_error) = self + .auto_create_indexes(&request.subrequests, ctx.progress()) + .await { return convert_metastore_error(metastore_error); } @@ -953,6 +1030,20 @@ impl Handler for ControlPlane { } } +#[async_trait] +impl Handler for ControlPlane { + type Reply = ControlPlaneResult; + + async fn handle( + &mut self, + request: SwapIndexingPipelinesRequest, + _ctx: &ActorContext, + ) -> Result { + let response = self.indexing_scheduler.swap_pipelines(request); + Ok(response) + } +} + #[async_trait] impl Handler for ControlPlane { type Reply = ControlPlaneResult<()>; @@ -962,6 +1053,11 @@ impl Handler for ControlPlane { local_shards_update: LocalShardsUpdate, ctx: &ActorContext, ) -> Result { + if self.maintenance.is_active() { + // In maintenance mode: skip shard scaling to avoid changing the plan. + debug!("maintenance mode: ignoring local shards update (scaling frozen)"); + return Ok(Ok(())); + } if let Err(metastore_error) = self .ingest_controller .handle_local_shards_update(local_shards_update, &mut self.model, ctx.progress()) @@ -1053,19 +1149,34 @@ impl Handler for ControlPlane { message: IndexerJoined, ctx: &ActorContext, ) -> Result { - info!( - "indexer `{}` joined the cluster: rebalancing shards and rebuilding indexing plan", - message.0.node_id() - ); + let is_maintenance = self.maintenance.is_active(); + if is_maintenance { + info!( + "indexer `{}` joined the cluster during maintenance mode", + message.0.node_id + ); + } else { + info!( + "indexer `{}` joined the cluster: rebalancing shards and rebuilding indexing plan", + message.0.node_id + ); + } + // TODO: Update shard table. 
+ if let Err(metastore_error) = self .ingest_controller - .rebalance_shards(&mut self.model, ctx.mailbox(), ctx.progress()) + .rebalance_shards( + &mut self.model, + ctx.mailbox(), + ctx.progress(), + is_maintenance, + ) .await { return convert_metastore_error::<()>(metastore_error).map(|_| ()); } - self.indexing_scheduler.rebuild_plan(&self.model); + self.indexing_scheduler + .rebuild_plan(&self.model, is_maintenance); Ok(()) } } @@ -1083,134 +1194,827 @@ impl Handler for ControlPlane { message: IndexerLeft, ctx: &ActorContext, ) -> Result { + let is_maintenance = self.maintenance.is_active(); + if is_maintenance { + info!( + "indexer `{}` left the cluster during maintenance mode", + message.0.node_id + ); + return Ok(()); + } else { + info!( + "indexer `{}` left the cluster: rebalancing shards and rebuilding indexing plan", + message.0.node_id + ); + } + // TODO: Update shard table. + if let Err(metastore_error) = self + .ingest_controller + .rebalance_shards( + &mut self.model, + ctx.mailbox(), + ctx.progress(), + is_maintenance, + ) + .await + { + return convert_metastore_error::<()>(metastore_error).map(|_| ()); + } + self.indexing_scheduler + .rebuild_plan(&self.model, is_maintenance); + Ok(()) + } +} + +#[async_trait] +impl Handler for ControlPlane { + type Reply = (); + + async fn handle( + &mut self, + message: RebalanceShardsCallback, + _ctx: &ActorContext, + ) -> Result { + let num_closed_shards = message.closed_shards.len(); + debug!("closing {num_closed_shards} shards after rebalance"); + + for closed_shard in message.closed_shards { + let shard_id = closed_shard.shard_id().clone(); + let source_uid = SourceUid { + index_uid: closed_shard.index_uid().clone(), + source_id: closed_shard.source_id, + }; + self.model.close_shards(&source_uid, &[shard_id]); + } + // We drop the rebalance guard explicitly here to put some emphasis on where the rebalance + // lock is released. 
+ drop(message.rebalance_guard); + Ok(()) + } +} + +// -- Maintenance Mode Handlers -- + +#[async_trait] +impl Handler for ControlPlane { + type Reply = ControlPlaneResult; + + async fn handle( + &mut self, + request: EnableMaintenanceModeRequest, + _ctx: &ActorContext, + ) -> Result { + self.handle_enable_maintenance(request).await + } +} + +#[async_trait] +impl Handler for ControlPlane { + type Reply = ControlPlaneResult; + + async fn handle( + &mut self, + _request: DisableMaintenanceModeRequest, + _ctx: &ActorContext, + ) -> Result { + self.handle_disable_maintenance().await + } +} + +#[async_trait] +impl Handler for ControlPlane { + type Reply = ControlPlaneResult; + + async fn handle( + &mut self, + _request: GetMaintenanceModeRequest, + _ctx: &ActorContext, + ) -> Result { + self.handle_get_maintenance() + } +} + +impl ControlPlane { + async fn handle_enable_maintenance( + &mut self, + _request: EnableMaintenanceModeRequest, + ) -> Result, ActorExitStatus> { + if self.maintenance.is_active() { + return Ok(Err(ControlPlaneError::Internal( + "maintenance mode is already enabled".to_string(), + ))); + } + + // Freeze the current plan. + let frozen_plan = self + .indexing_scheduler + .observable_state() + .current_targeted_physical_plan + .unwrap_or_else(|| crate::indexing_plan::PhysicalIndexingPlan::with_indexer_ids(&[])); + + let frozen_plan_json = match serialize_frozen_plan(&frozen_plan) { + Ok(json) => json, + Err(err) => { + return Ok(Err(ControlPlaneError::Internal(format!( + "failed to serialize frozen plan: {err}" + )))); + } + }; + + // Build the metadata (with RFC 3339 datetime). + let metadata = crate::maintenance::MaintenanceModeMetadata::new_now(); + + // Persist to durable storage BEFORE enabling in-memory state. + // This ensures that on restart, the control plane will find the persisted state + // even if it crashes right after this point. 
+ if let Err(err) = self + .maintenance_persistence + .save(&metadata, &frozen_plan) + .await + { + return Ok(Err(ControlPlaneError::Internal(format!( + "failed to persist maintenance state: {err}" + )))); + } + + // Only now enable in-memory state (persistence succeeded). + self.maintenance.load_from_metadata(metadata); + crate::metrics::CONTROL_PLANE_METRICS + .maintenance_mode + .set(1); + info!( - "indexer `{}` left the cluster: rebalancing shards and rebuilding indexing plan", - message.0.node_id() + num_indexers = frozen_plan.num_indexers(), + "maintenance mode enabled: indexing plan frozen" + ); + + Ok(Ok(EnableMaintenanceModeResponse { frozen_plan_json })) + } + + async fn handle_disable_maintenance( + &mut self, + ) -> Result, ActorExitStatus> { + if !self.maintenance.is_active() { + return Ok(Err(ControlPlaneError::Internal( + "maintenance mode is not currently enabled".to_string(), + ))); + } + + // Clear persisted state BEFORE disabling in-memory. + // This ensures that on restart, the control plane will NOT reload maintenance mode + // even if it crashes right after this point. + if let Err(err) = self.maintenance_persistence.clear().await { + return Ok(Err(ControlPlaneError::Internal(format!( + "failed to clear persisted maintenance state: {err}" + )))); + } + + // Only now disable in-memory state (persistence clear succeeded). + self.maintenance.disable(); + crate::metrics::CONTROL_PLANE_METRICS + .maintenance_mode + .set(0); + + // Trigger a full plan rebuild to reconcile the cluster. 
+ info!("maintenance mode disabled: triggering full indexing plan rebuild"); + self.indexing_scheduler.rebuild_plan(&self.model, false); + + Ok(Ok(DisableMaintenanceModeResponse {})) + } + + fn handle_get_maintenance( + &self, + ) -> Result, ActorExitStatus> { + let is_maintenance_mode = self.maintenance.is_active(); + let enabled_at = self.maintenance.enabled_at(); + + Ok(Ok(GetMaintenanceModeResponse { + is_maintenance_mode, + enabled_at, + })) + } +} + +fn spawn_watch_indexers_task( + weak_mailbox: WeakMailbox, + cluster_change_stream: ClusterChangeStream, +) { + tokio::spawn(watcher_indexers(weak_mailbox, cluster_change_stream)); +} + +async fn watcher_indexers( + weak_mailbox: WeakMailbox, + mut cluster_change_stream: ClusterChangeStream, +) { + while let Some(cluster_change) = cluster_change_stream.next().await { + let Some(mailbox) = weak_mailbox.upgrade() else { + return; + }; + match cluster_change { + ClusterChange::Add(node) => { + if node.is_indexer() + && let Err(error) = mailbox.send_message(IndexerJoined(node)).await + { + error!(%error, "failed to forward `IndexerJoined` event to control plane"); + } + } + ClusterChange::Remove(node) => { + if node.is_indexer() + && let Err(error) = mailbox.send_message(IndexerLeft(node)).await + { + error!(%error, "failed to forward `IndexerLeft` event to control plane"); + } + } + ClusterChange::Update(_) => { + // We are not interested in updates (yet). 
+ } + } + } +} + +#[cfg(test)] +mod tests { + use std::num::NonZero; + use std::sync::Arc; + + use futures::FutureExt; + use mockall::Sequence; + use quickwit_actors::{AskError, Observe, SupervisorMetrics}; + use quickwit_cluster::ClusterChangeStreamFactoryForTest; + use quickwit_common::test_utils::wait_until_predicate; + use quickwit_config::{ + CLI_SOURCE_ID, INGEST_V2_SOURCE_ID, IndexConfig, KafkaSourceParams, SourceParams, + }; + use quickwit_indexing::IndexingService; + use quickwit_metastore::{ + CreateIndexRequestExt, IndexMetadata, ListIndexesMetadataResponseExt, + }; + use quickwit_proto::control_plane::{ + GetOrCreateOpenShardsFailureReason, GetOrCreateOpenShardsSubrequest, + SwapIndexingPipelinesEntry, + }; + use quickwit_proto::indexing::{ + ApplyIndexingPlanRequest, ApplyIndexingPlanResponse, CpuCapacity, IndexingServiceClient, + MockIndexingService, + }; + use quickwit_proto::ingest::ingester::{ + IngesterServiceClient, InitShardSuccess, InitShardsResponse, MockIngesterService, + RetainShardsResponse, + }; + use quickwit_proto::ingest::{Shard, ShardPKey, ShardState}; + use quickwit_proto::metastore::{ + DeleteShardsResponse, EmptyResponse, EntityKind, FindIndexTemplateMatchesResponse, + GetKvResponse, ListIndexesMetadataRequest, ListIndexesMetadataResponse, ListShardsRequest, + ListShardsResponse, ListShardsSubresponse, MetastoreError, MockMetastoreService, + OpenShardSubresponse, OpenShardsResponse, SourceType, + }; + use quickwit_proto::types::{DocMappingUid, Position}; + use tokio::sync::Mutex; + + use super::*; + use crate::IndexerNodeInfo; + use crate::indexing_plan::PhysicalIndexingPlan; + use crate::maintenance::MetastoreKvPersistence; + + fn setup_disabled_maintenance(mock_metastore: &mut MockMetastoreService) { + mock_metastore + .expect_get_kv() + .returning(|_| Ok(GetKvResponse { value: None })); + } + + fn setup_maintenance_enable(mock_metastore: &mut MockMetastoreService) { + mock_metastore + .expect_get_kv() + .return_once(|_| 
Ok(GetKvResponse { value: None })); + mock_metastore + .expect_set_kv() + .return_once(|_| Ok(EmptyResponse {})); + } + + async fn observe_current_plan( + control_plane_handle: &ActorHandle>, + ) -> Option { + control_plane_handle + .observe() + .await + .state_opt + .as_ref()? + .indexing_scheduler + .current_targeted_physical_plan + .clone() + } + + #[must_use] + fn add_test_indexer_with_mailbox( + universe: &Universe, + indexer_pool: &IndexerPool, + node_id: NodeId, + ) -> quickwit_actors::Inbox { + let (client_mailbox, client_inbox) = universe.create_test_mailbox(); + let client = IndexingServiceClient::from_mailbox::(client_mailbox); + let indexer_info = IndexerNodeInfo { + node_id: node_id.clone(), + generation_id: 0, + client, + indexing_tasks: Vec::new(), + indexing_capacity: CpuCapacity::from_cpu_millis(4_000), + }; + indexer_pool.insert(node_id, indexer_info); + client_inbox + } + + #[tokio::test] + async fn test_maintenance_mode_allows_create_index_without_rebuild() { + let universe = Universe::with_accelerated_time(); + + let indexer_pool = IndexerPool::default(); + + // Add one indexer to the pool + let node_1: NodeId = NodeId::from_str("test-node-1"); + let _indexing_inbox_1 = + add_test_indexer_with_mailbox(&universe, &indexer_pool, node_1.clone()); + + let ingester_pool = IngesterPool::default(); + + let index_uid: IndexUid = IndexUid::for_test("test-index", 0); + let index_uid_clone = index_uid.clone(); + let mut mock_metastore = MockMetastoreService::new(); + setup_maintenance_enable(&mut mock_metastore); + mock_metastore + .expect_list_indexes_metadata() + .returning(|_| Ok(ListIndexesMetadataResponse::for_test(Vec::new()))); + mock_metastore + .expect_create_index() + .return_once(move |req| { + // re-serialize the received requested config + let index_config = req.deserialize_index_config().unwrap(); + let source_configs = req.deserialize_source_configs().unwrap(); + let mut index_metadata = IndexMetadata::new(index_config); + 
index_metadata.index_uid = index_uid_clone.clone(); + for source_config in source_configs { + index_metadata.add_source(source_config).unwrap(); + } + let index_metadata_json = serde_json::to_string(&index_metadata).unwrap(); + Ok(CreateIndexResponse { + index_uid: Some(index_uid_clone), + index_metadata_json, + }) + }); + + let cluster_config = ClusterConfig::for_test(); + let cluster_change_stream_factory = ClusterChangeStreamFactoryForTest::default(); + let (control_plane_mailbox, control_plane_handle, _readiness_rx) = ControlPlane::spawn( + &universe, + cluster_config, + node_1.clone(), + cluster_change_stream_factory, + indexer_pool.clone(), + ingester_pool, + MetastoreServiceClient::from_mock(mock_metastore), + ); + + // Wait for a first (empty) plan to be calculated. + wait_until_predicate( + || observe_current_plan(&control_plane_handle).map(|plan| plan.is_some()), + Duration::from_secs(5), + Duration::from_millis(100), + ) + .await + .unwrap(); + + // Enable maintenance mode. + control_plane_mailbox + .ask(EnableMaintenanceModeRequest {}) + .await + .unwrap() + .unwrap(); + + let original_physical_plan = observe_current_plan(&control_plane_handle).await; + + // Create index in maintenance mode + let index_config = IndexConfig::for_test("test-index", "ram:///test-index"); + let kafka_source = SourceConfig::for_test( + "kafka-source", + SourceParams::Kafka(KafkaSourceParams { + topic: "test-topic".to_string(), + client_log_level: None, + enable_backfill_mode: false, + client_params: json!({}), + }), + ); + let create_index_request = + CreateIndexRequest::try_from_index_and_source_configs(&index_config, &[kafka_source]) + .unwrap(); + let create_result = control_plane_mailbox + .ask_for_res(create_index_request) + .await; + assert!(create_result.is_ok()); + assert_eq!(create_result.unwrap().index_uid(), &index_uid); + // Check that plan rebuild is skipped + universe.sleep(Duration::from_secs(60)).await; + assert_eq!( + original_physical_plan, + 
observe_current_plan(&control_plane_handle).await, + "physical plan should not change after creating index in maintenance mode" + ); + + // Add another node + let node_2: NodeId = NodeId::from_str("test-node-2"); + let _indexing_inbox_2 = + add_test_indexer_with_mailbox(&universe, &indexer_pool, node_2.clone()); + // Check that the rebuild is still skipped + universe.sleep(Duration::from_secs(60)).await; + assert_eq!( + original_physical_plan, + observe_current_plan(&control_plane_handle).await, + "physical plan should not change after adding new node in maintenance mode" + ); + + universe.assert_quit().await; + } + + #[tokio::test] + async fn test_maintenance_mode_allows_delete_index() { + let universe = Universe::with_accelerated_time(); + let self_node_id: NodeId = NodeId::from_str("test-node"); + let indexer_pool = IndexerPool::default(); + let ingester_pool = IngesterPool::default(); + + let mut mock_metastore = MockMetastoreService::new(); + setup_maintenance_enable(&mut mock_metastore); + mock_metastore + .expect_list_indexes_metadata() + .returning(|_| Ok(ListIndexesMetadataResponse::for_test(Vec::new()))); + mock_metastore + .expect_delete_index() + .return_once(|_| Ok(EmptyResponse {})); + + let cluster_config = ClusterConfig::for_test(); + let cluster_change_stream_factory = ClusterChangeStreamFactoryForTest::default(); + let (control_plane_mailbox, _control_plane_handle, _readiness_rx) = ControlPlane::spawn( + &universe, + cluster_config, + self_node_id, + cluster_change_stream_factory, + indexer_pool, + ingester_pool, + MetastoreServiceClient::from_mock(mock_metastore), + ); + + // Enable maintenance mode. + control_plane_mailbox + .ask(EnableMaintenanceModeRequest {}) + .await + .unwrap() + .unwrap(); + + // Delete index in maintenance mode — should succeed, but plan rebuild is skipped. 
+ let index_uid = IndexUid::for_test("test-index", 0); + let delete_index_request = DeleteIndexRequest { + index_uid: Some(index_uid), + }; + let delete_result = control_plane_mailbox + .ask(delete_index_request) + .await + .unwrap(); + assert!(delete_result.is_ok()); + + universe.assert_quit().await; + } + + #[tokio::test] + async fn test_maintenance_mode_allows_add_source() { + let universe = Universe::with_accelerated_time(); + let self_node_id: NodeId = NodeId::from_str("test-node"); + let indexer_pool = IndexerPool::default(); + let ingester_pool = IngesterPool::default(); + + // Pre-load an index with an enabled ingest_v2 source so that + // `create_or_enable_ingest_v2_sources_if_necessary` does not call `add_source` on + // startup and consume the mock expectation meant for the test's own call. + let mut index_metadata = IndexMetadata::for_test("test-index", "ram:///test-index"); + let mut ingest_v2_source = SourceConfig::ingest_v2(); + ingest_v2_source.enabled = true; + index_metadata.add_source(ingest_v2_source).unwrap(); + let mut mock_metastore = MockMetastoreService::new(); + setup_maintenance_enable(&mut mock_metastore); + mock_metastore + .expect_list_indexes_metadata() + .return_once(move |_| Ok(ListIndexesMetadataResponse::for_test(vec![index_metadata]))); + mock_metastore + .expect_list_shards() + .return_once(|_| Ok(ListShardsResponse::default())); + mock_metastore + .expect_add_source() + .return_once(|_| Ok(EmptyResponse {})); + + let cluster_config = ClusterConfig::for_test(); + let cluster_change_stream_factory = ClusterChangeStreamFactoryForTest::default(); + let (control_plane_mailbox, _control_plane_handle, _readiness_rx) = ControlPlane::spawn( + &universe, + cluster_config, + self_node_id, + cluster_change_stream_factory, + indexer_pool, + ingester_pool, + MetastoreServiceClient::from_mock(mock_metastore), + ); + + // Enable maintenance mode. 
+ control_plane_mailbox + .ask(EnableMaintenanceModeRequest {}) + .await + .unwrap() + .unwrap(); + + // Add source in maintenance mode — should succeed, but plan rebuild is skipped. + let index_uid = IndexUid::for_test("test-index", 0); + let source_config = SourceConfig::for_test("test-source", SourceParams::void()); + let add_source_request = AddSourceRequest { + index_uid: Some(index_uid), + source_config_json: serde_json::to_string(&source_config).unwrap(), + }; + let result = control_plane_mailbox.ask(add_source_request).await.unwrap(); + assert!(result.is_ok()); + + universe.assert_quit().await; + } + + #[tokio::test] + async fn test_maintenance_mode_enable_disable_cycle() { + let universe = Universe::with_accelerated_time(); + let self_node_id: NodeId = NodeId::from_str("test-node"); + let indexer_pool = IndexerPool::default(); + let ingester_pool = IngesterPool::default(); + + let mut mock_metastore = MockMetastoreService::new(); + mock_metastore + .expect_get_kv() + .returning(|_| Ok(GetKvResponse { value: None })); + mock_metastore + .expect_set_kv() + .returning(|_| Ok(EmptyResponse {})); + mock_metastore + .expect_delete_kv() + .returning(|_| Ok(EmptyResponse {})); + mock_metastore + .expect_list_indexes_metadata() + .returning(|_| Ok(ListIndexesMetadataResponse::for_test(Vec::new()))); + + let cluster_config = ClusterConfig::for_test(); + let cluster_change_stream_factory = ClusterChangeStreamFactoryForTest::default(); + let (control_plane_mailbox, _control_plane_handle, _readiness_rx) = ControlPlane::spawn( + &universe, + cluster_config, + self_node_id, + cluster_change_stream_factory, + indexer_pool, + ingester_pool, + MetastoreServiceClient::from_mock(mock_metastore), + ); + + // Initially not in maintenance mode. + let status = control_plane_mailbox + .ask(GetMaintenanceModeRequest {}) + .await + .unwrap() + .unwrap(); + assert!(!status.is_maintenance_mode); + + // Enable. 
+ let enable_resp = control_plane_mailbox + .ask(EnableMaintenanceModeRequest {}) + .await + .unwrap() + .unwrap(); + assert!(!enable_resp.frozen_plan_json.is_empty()); + + // Check status. + let status = control_plane_mailbox + .ask(GetMaintenanceModeRequest {}) + .await + .unwrap() + .unwrap(); + assert!(status.is_maintenance_mode); + assert!(status.enabled_at.is_some()); + + // Enable again — should fail. + let double_enable = control_plane_mailbox + .ask(EnableMaintenanceModeRequest {}) + .await + .unwrap(); + assert!(double_enable.is_err()); + + // Disable. + let disable_resp = control_plane_mailbox + .ask(DisableMaintenanceModeRequest {}) + .await + .unwrap(); + assert!(disable_resp.is_ok()); + + // Check status again. + let status = control_plane_mailbox + .ask(GetMaintenanceModeRequest {}) + .await + .unwrap() + .unwrap(); + assert!(!status.is_maintenance_mode); + + // Disable again — should fail. + let double_disable = control_plane_mailbox + .ask(DisableMaintenanceModeRequest {}) + .await + .unwrap(); + assert!(double_disable.is_err()); + + universe.assert_quit().await; + } + + #[tokio::test] + async fn test_maintenance_mode_observable_state() { + let universe = Universe::with_accelerated_time(); + let self_node_id: NodeId = NodeId::from_str("test-node"); + let indexer_pool = IndexerPool::default(); + let ingester_pool = IngesterPool::default(); + + let mut mock_metastore = MockMetastoreService::new(); + setup_maintenance_enable(&mut mock_metastore); + mock_metastore + .expect_list_indexes_metadata() + .returning(|_| Ok(ListIndexesMetadataResponse::for_test(Vec::new()))); + + let cluster_config = ClusterConfig::for_test(); + let cluster_change_stream_factory = ClusterChangeStreamFactoryForTest::default(); + let (control_plane_mailbox, control_plane_handle, _readiness_rx) = ControlPlane::spawn( + &universe, + cluster_config, + self_node_id, + cluster_change_stream_factory, + indexer_pool, + ingester_pool, + 
MetastoreServiceClient::from_mock(mock_metastore), + ); + + // Observe initial state. + let obs = control_plane_handle.process_pending_and_observe().await; + let state = obs.state_opt.as_ref().unwrap(); + assert!(!state.maintenance_mode); + + // Enable maintenance mode. + control_plane_mailbox + .ask(EnableMaintenanceModeRequest {}) + .await + .unwrap() + .unwrap(); + + // Give the supervisor time to observe the inner actor's updated state. + universe.sleep(Duration::from_secs(1)).await; + + let obs = control_plane_handle.process_pending_and_observe().await; + let state = obs.state_opt.as_ref().unwrap(); + assert!(state.maintenance_mode); + + universe.assert_quit().await; + } + + #[tokio::test] + async fn test_maintenance_mode_allows_toggle_source() { + let universe = Universe::with_accelerated_time(); + let self_node_id: NodeId = NodeId::from_str("test-node"); + let indexer_pool = IndexerPool::default(); + let ingester_pool = IngesterPool::default(); + + // Pre-load an index with the test source and an enabled ingest_v2 source so that + // `create_or_enable_ingest_v2_sources_if_necessary` does not call `add_source` on + // startup and trigger unexpected mock calls. 
+ let mut index_metadata = IndexMetadata::for_test("test-index", "ram:///test-index"); + let test_source_config = SourceConfig::for_test("test-source", SourceParams::void()); + index_metadata.add_source(test_source_config).unwrap(); + let mut ingest_v2_source = SourceConfig::ingest_v2(); + ingest_v2_source.enabled = true; + index_metadata.add_source(ingest_v2_source).unwrap(); + + let mut mock_metastore = MockMetastoreService::new(); + setup_maintenance_enable(&mut mock_metastore); + mock_metastore + .expect_list_indexes_metadata() + .return_once(move |_| Ok(ListIndexesMetadataResponse::for_test(vec![index_metadata]))); + mock_metastore + .expect_list_shards() + .return_once(|_| Ok(ListShardsResponse::default())); + mock_metastore + .expect_toggle_source() + .return_once(|_| Ok(EmptyResponse {})); + + let cluster_config = ClusterConfig::for_test(); + let cluster_change_stream_factory = ClusterChangeStreamFactoryForTest::default(); + let (control_plane_mailbox, _control_plane_handle, _readiness_rx) = ControlPlane::spawn( + &universe, + cluster_config, + self_node_id, + cluster_change_stream_factory, + indexer_pool, + ingester_pool, + MetastoreServiceClient::from_mock(mock_metastore), ); - // TODO: Update shard table. - if let Err(metastore_error) = self - .ingest_controller - .rebalance_shards(&mut self.model, ctx.mailbox(), ctx.progress()) + + // Enable maintenance mode. + control_plane_mailbox + .ask(EnableMaintenanceModeRequest {}) .await - { - return convert_metastore_error::<()>(metastore_error).map(|_| ()); - } - self.indexing_scheduler.rebuild_plan(&self.model); - Ok(()) + .unwrap() + .unwrap(); + + // Toggle source in maintenance mode — should succeed, but plan rebuild is skipped. 
+ let index_uid = IndexUid::for_test("test-index", 0); + let toggle_request = ToggleSourceRequest { + index_uid: Some(index_uid), + source_id: "test-source".to_string(), + enable: false, + }; + let result = control_plane_mailbox.ask(toggle_request).await.unwrap(); + assert!(result.is_ok()); + + universe.assert_quit().await; } -} -#[async_trait] -impl Handler for ControlPlane { - type Reply = (); + #[tokio::test] + async fn test_maintenance_mode_allows_get_or_create_open_shards() { + // In maintenance mode, GetOrCreateOpenShards should still work for existing sources + // (ingest must continue), but auto_create_indexes is skipped. + let universe = Universe::with_accelerated_time(); + let self_node_id: NodeId = NodeId::from_str("test-node"); + let indexer_pool = IndexerPool::default(); + let ingester_pool = IngesterPool::default(); - async fn handle( - &mut self, - message: RebalanceShardsCallback, - _ctx: &ActorContext, - ) -> Result { - let num_closed_shards = message.closed_shards.len(); - debug!("closing {num_closed_shards} shards after rebalance"); + let mut mock_metastore = MockMetastoreService::new(); + setup_maintenance_enable(&mut mock_metastore); + mock_metastore + .expect_list_indexes_metadata() + .returning(|_| Ok(ListIndexesMetadataResponse::for_test(Vec::new()))); + // Note: no expect_find_index_template_matches — if auto_create was NOT skipped, + // this would panic due to unexpected call. - for closed_shard in message.closed_shards { - let shard_id = closed_shard.shard_id().clone(); - let source_uid = SourceUid { - index_uid: closed_shard.index_uid().clone(), - source_id: closed_shard.source_id, - }; - self.model.close_shards(&source_uid, &[shard_id]); - } - // We drop the rebalance guard explicitly here to put some emphasis on where a the rebalance - // lock is released. 
- drop(message.rebalance_guard); - Ok(()) - } -} + let cluster_config = ClusterConfig::for_test(); + let cluster_change_stream_factory = ClusterChangeStreamFactoryForTest::default(); + let (control_plane_mailbox, _control_plane_handle, _readiness_rx) = ControlPlane::spawn( + &universe, + cluster_config, + self_node_id, + cluster_change_stream_factory, + indexer_pool, + ingester_pool, + MetastoreServiceClient::from_mock(mock_metastore), + ); -fn spawn_watch_indexers_task( - weak_mailbox: WeakMailbox, - cluster_change_stream: ClusterChangeStream, -) { - tokio::spawn(watcher_indexers(weak_mailbox, cluster_change_stream)); -} + // Enable maintenance mode. + control_plane_mailbox + .ask(EnableMaintenanceModeRequest {}) + .await + .unwrap() + .unwrap(); -async fn watcher_indexers( - weak_mailbox: WeakMailbox, - mut cluster_change_stream: ClusterChangeStream, -) { - while let Some(cluster_change) = cluster_change_stream.next().await { - let Some(mailbox) = weak_mailbox.upgrade() else { - return; + // Send a GetOrCreateOpenShards with a nonexistent index. + // In maintenance, auto_create is skipped, so the index won't be found. + // The ingest controller will report a failure for unknown indexes, which is expected. 
+ let request = GetOrCreateOpenShardsRequest { + subrequests: vec![GetOrCreateOpenShardsSubrequest { + subrequest_id: 0, + index_id: "nonexistent-index".to_string(), + source_id: "source".to_string(), + }], + closed_shards: Vec::new(), + unavailable_leaders: Vec::new(), }; - match cluster_change { - ClusterChange::Add(node) => { - if node.enabled_services().contains(&QuickwitService::Indexer) - && let Err(error) = mailbox.send_message(IndexerJoined(node)).await - { - error!(%error, "failed to forward `IndexerJoined` event to control plane"); - } - } - ClusterChange::Remove(node) => { - if node.enabled_services().contains(&QuickwitService::Indexer) - && let Err(error) = mailbox.send_message(IndexerLeft(node)).await - { - error!(%error, "failed to forward `IndexerLeft` event to control plane"); - } + let result = control_plane_mailbox.ask(request).await.unwrap(); + // The request should succeed at the handler level. + // It may fail internally because the index doesn't exist, but that's expected. + match result { + Ok(response) => { + // The response should contain a failure for the unknown index. + assert!(!response.failures.is_empty()); + assert_eq!( + response.failures[0].reason(), + GetOrCreateOpenShardsFailureReason::IndexNotFound + ); } - ClusterChange::Update(_) => { - // We are not interested in updates (yet). + Err(_err) => { + // Any internal error is acceptable here (index not found, etc.). 
} } - } -} - -#[cfg(test)] -mod tests { - use std::num::NonZero; - use std::sync::Arc; - - use mockall::Sequence; - use quickwit_actors::{AskError, Observe, SupervisorMetrics}; - use quickwit_cluster::ClusterChangeStreamFactoryForTest; - use quickwit_config::{ - CLI_SOURCE_ID, INGEST_V2_SOURCE_ID, IndexConfig, KafkaSourceParams, SourceParams, - }; - use quickwit_indexing::IndexingService; - use quickwit_metastore::{ - CreateIndexRequestExt, IndexMetadata, ListIndexesMetadataResponseExt, - }; - use quickwit_proto::control_plane::{ - GetOrCreateOpenShardsFailureReason, GetOrCreateOpenShardsSubrequest, - }; - use quickwit_proto::indexing::{ - ApplyIndexingPlanRequest, ApplyIndexingPlanResponse, CpuCapacity, IndexingServiceClient, - MockIndexingService, - }; - use quickwit_proto::ingest::ingester::{ - IngesterServiceClient, InitShardSuccess, InitShardsResponse, MockIngesterService, - RetainShardsResponse, - }; - use quickwit_proto::ingest::{Shard, ShardPKey, ShardState}; - use quickwit_proto::metastore::{ - DeleteShardsResponse, EntityKind, FindIndexTemplateMatchesResponse, - ListIndexesMetadataRequest, ListIndexesMetadataResponse, ListShardsRequest, - ListShardsResponse, ListShardsSubresponse, MetastoreError, MockMetastoreService, - OpenShardSubresponse, OpenShardsResponse, SourceType, - }; - use quickwit_proto::types::{DocMappingUid, Position}; - use tokio::sync::Mutex; - use super::*; - use crate::IndexerNodeInfo; + universe.assert_quit().await; + } #[tokio::test] async fn test_control_plane_create_index() { let universe = Universe::with_accelerated_time(); - let self_node_id: NodeId = "test-node".into(); + let self_node_id: NodeId = NodeId::from_str("test-node"); let indexer_pool = IndexerPool::default(); let ingester_pool = IngesterPool::default(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); let index_uid: IndexUid = IndexUid::for_test("test-index", 0); let index_uid_clone = index_uid.clone(); 
mock_metastore @@ -1262,12 +2066,13 @@ mod tests { #[tokio::test] async fn test_control_plane_delete_index() { let universe = Universe::with_accelerated_time(); - let self_node_id: NodeId = "test-node".into(); + let self_node_id: NodeId = NodeId::from_str("test-node"); let indexer_pool = IndexerPool::default(); let ingester_pool = IngesterPool::default(); let index_uid: IndexUid = IndexUid::for_test("test-index", 0); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); let index_uid_clone = index_uid.clone(); mock_metastore .expect_delete_index() @@ -1304,7 +2109,7 @@ mod tests { #[tokio::test] async fn test_control_plane_add_source() { let universe = Universe::with_accelerated_time(); - let self_node_id: NodeId = "test-node".into(); + let self_node_id: NodeId = NodeId::from_str("test-node"); let indexer_pool = IndexerPool::default(); let ingester_pool = IngesterPool::default(); @@ -1314,6 +2119,7 @@ mod tests { .unwrap(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); mock_metastore .expect_add_source() .withf(|add_source_request| { @@ -1367,7 +2173,7 @@ mod tests { async fn test_control_plane_update_source() { let universe = Universe::with_accelerated_time(); let pipelines_after_update = 3; - let self_node_id: NodeId = "test-node".into(); + let self_node_id: NodeId = NodeId::from_str("test-node"); let indexer_pool = IndexerPool::default(); let mut mock_indexer = MockIndexingService::new(); // call when starting the cp @@ -1411,6 +2217,7 @@ mod tests { .unwrap(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); mock_metastore .expect_update_source() .withf(move |update_source_request| { @@ -1465,7 +2272,7 @@ mod tests { #[tokio::test] async fn test_control_plane_toggle_source() { let universe = Universe::with_accelerated_time(); - let self_node_id: NodeId = "test-node".into(); + let self_node_id: 
NodeId = NodeId::from_str("test-node"); let indexer_pool = IndexerPool::default(); let ingester_pool = IngesterPool::default(); @@ -1478,6 +2285,7 @@ mod tests { index_metadata.add_source(test_source_config).unwrap(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); mock_metastore .expect_list_indexes_metadata() .return_once(|_| Ok(ListIndexesMetadataResponse::for_test(vec![index_metadata]))); @@ -1543,11 +2351,12 @@ mod tests { #[tokio::test] async fn test_control_plane_delete_source() { let universe = Universe::with_accelerated_time(); - let self_node_id: NodeId = "test-node".into(); + let self_node_id: NodeId = NodeId::from_str("test-node"); let indexer_pool = IndexerPool::default(); let ingester_pool = IngesterPool::default(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); let index_uid: IndexUid = IndexUid::for_test("test-index", 0); let index_uid_clone = index_uid.clone(); mock_metastore @@ -1590,12 +2399,13 @@ mod tests { #[tokio::test] async fn test_control_plane_get_or_create_open_shards() { let universe = Universe::with_accelerated_time(); - let self_node_id: NodeId = "test-node".into(); + let self_node_id: NodeId = NodeId::from_str("test-node"); let indexer_pool = IndexerPool::default(); let ingester_pool = IngesterPool::default(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); let index_uid: IndexUid = IndexUid::for_test("test-index", 0); mock_metastore .expect_list_indexes_metadata() @@ -1670,10 +2480,11 @@ mod tests { #[tokio::test] async fn test_control_plane_supervision_reload_from_metastore() { let universe = Universe::default(); - let node_id = NodeId::new("test_node".to_string()); + let node_id = NodeId::from_str("test_node"); let indexer_pool = IndexerPool::default(); let ingester_pool = IngesterPool::default(); let mut mock_metastore = MockMetastoreService::new(); + 
setup_disabled_maintenance(&mut mock_metastore); let mut index_0 = IndexMetadata::for_test("test-index-0", "ram:///test-index-0"); let source = SourceConfig::ingest_v2(); @@ -1806,20 +2617,16 @@ mod tests { #[tokio::test] async fn test_delete_shard_on_eof() { let universe = Universe::with_accelerated_time(); - let node_id = NodeId::new("test-control-plane".to_string()); + let node_id = NodeId::from_str("test-control-plane"); let indexer_pool = IndexerPool::default(); - let (client_mailbox, client_inbox) = universe.create_test_mailbox(); - let client = IndexingServiceClient::from_mailbox::(client_mailbox); - let indexer_node_info = IndexerNodeInfo { - node_id: NodeId::new("test-indexer".to_string()), - generation_id: 0, - client, - indexing_tasks: Vec::new(), - indexing_capacity: CpuCapacity::from_cpu_millis(4_000), - }; - indexer_pool.insert(indexer_node_info.node_id.clone(), indexer_node_info); + let client_inbox = add_test_indexer_with_mailbox( + &universe, + &indexer_pool, + NodeId::from_str("test-indexer"), + ); let ingester_pool = IngesterPool::default(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); let mut index_0 = IndexMetadata::for_test("test-index-0", "ram:///test-index-0"); let mut source = SourceConfig::ingest_v2(); @@ -1906,7 +2713,7 @@ mod tests { control_plane_mailbox.ask(Observe).await.unwrap(); let last_applied_physical_plan = control_plane_obs .indexing_scheduler - .last_applied_physical_plan + .current_targeted_physical_plan .unwrap(); let indexing_tasks = last_applied_physical_plan .indexing_tasks_per_indexer() @@ -1937,7 +2744,7 @@ mod tests { control_plane_mailbox.ask(Observe).await.unwrap(); let last_applied_physical_plan = control_plane_obs .indexing_scheduler - .last_applied_physical_plan + .current_targeted_physical_plan .unwrap(); let indexing_tasks = last_applied_physical_plan .indexing_tasks_per_indexer() @@ -1955,20 +2762,16 @@ mod tests { #[tokio::test] async fn 
test_fill_shard_table_position_from_metastore_on_startup() { let universe = Universe::with_accelerated_time(); - let node_id = NodeId::new("test-control-plane".to_string()); + let node_id = NodeId::from_str("test-control-plane"); let indexer_pool = IndexerPool::default(); - let (client_mailbox, _client_inbox) = universe.create_test_mailbox(); - let client = IndexingServiceClient::from_mailbox::(client_mailbox); - let indexer_node_info = IndexerNodeInfo { - node_id: NodeId::new("test-indexer".to_string()), - generation_id: 0, - client, - indexing_tasks: Vec::new(), - indexing_capacity: CpuCapacity::from_cpu_millis(4_000), - }; - indexer_pool.insert(indexer_node_info.node_id.clone(), indexer_node_info); + let _indexing_inbox = add_test_indexer_with_mailbox( + &universe, + &indexer_pool, + NodeId::from_str("test-indexer"), + ); let ingester_pool = IngesterPool::default(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); let mut index_metadata = IndexMetadata::for_test("test-index", "ram:///test-index"); let mut source_config = SourceConfig::ingest_v2(); @@ -2033,20 +2836,16 @@ mod tests { async fn test_delete_non_existing_shard() { quickwit_common::setup_logging_for_tests(); let universe = Universe::default(); - let node_id = NodeId::new("test-control-plane".to_string()); + let node_id = NodeId::from_str("test-control-plane"); let indexer_pool = IndexerPool::default(); - let (client_mailbox, _client_inbox) = universe.create_test_mailbox(); - let client = IndexingServiceClient::from_mailbox::(client_mailbox); - let indexer_node_info = IndexerNodeInfo { - node_id: NodeId::new("test-indexer".to_string()), - generation_id: 0, - client, - indexing_tasks: Vec::new(), - indexing_capacity: CpuCapacity::from_cpu_millis(4_000), - }; - indexer_pool.insert(indexer_node_info.node_id.clone(), indexer_node_info); + let _indexing_inbox = add_test_indexer_with_mailbox( + &universe, + &indexer_pool, + 
NodeId::from_str("test-indexer"), + ); let ingester_pool = IngesterPool::default(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); let mut index_0 = IndexMetadata::for_test("test-index-0", "ram:///test-index-0"); let mut source = SourceConfig::ingest_v2(); @@ -2126,7 +2925,7 @@ mod tests { async fn test_delete_index() { quickwit_common::setup_logging_for_tests(); let universe = Universe::default(); - let node_id = NodeId::new("test-control-plane".to_string()); + let node_id = NodeId::from_str("test-control-plane"); let indexer_pool = IndexerPool::default(); let ingester_pool = IngesterPool::default(); @@ -2142,6 +2941,7 @@ mod tests { let index_0_clone = index_0.clone(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); mock_metastore .expect_list_indexes_metadata() .times(1) @@ -2211,7 +3011,7 @@ mod tests { Ok(RetainShardsResponse {}) }); let ingester = IngesterServiceClient::from_mock(mock_ingester); - ingester_pool.insert("node1".into(), ingester); + ingester_pool.insert(NodeId::from_str("node1"), ingester); let cluster_config = ClusterConfig::for_test(); let cluster_change_stream_factory = ClusterChangeStreamFactoryForTest::default(); @@ -2240,7 +3040,7 @@ mod tests { async fn test_delete_source() { quickwit_common::setup_logging_for_tests(); let universe = Universe::default(); - let node_id = NodeId::new("test-control-plane".to_string()); + let node_id = NodeId::from_str("test-control-plane"); let indexer_pool = IndexerPool::default(); let ingester_pool = IngesterPool::default(); @@ -2257,12 +3057,13 @@ mod tests { Ok(RetainShardsResponse {}) }); let ingester = IngesterServiceClient::from_mock(mock_ingester); - ingester_pool.insert("node1".into(), ingester); + ingester_pool.insert(NodeId::from_str("node1"), ingester); let mut index_0 = IndexMetadata::for_test("test-index-0", "ram:///test-index-0"); let index_uid_clone = index_0.index_uid.clone(); 
let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); mock_metastore.expect_delete_source().return_once( move |delete_source_request: DeleteSourceRequest| { assert_eq!(delete_source_request.index_uid(), &index_uid_clone); @@ -2340,12 +3141,13 @@ mod tests { let mut cluster_config = ClusterConfig::for_test(); cluster_config.auto_create_indexes = true; - let node_id = NodeId::from("test-node"); + let node_id = NodeId::from_str("test-node"); let cluster_change_stream_factory = ClusterChangeStreamFactoryForTest::default(); let indexer_pool = IndexerPool::default(); let ingester_pool = IngesterPool::default(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); mock_metastore .expect_list_indexes_metadata() @@ -2457,10 +3259,10 @@ mod tests { cluster_change_stream_tx.send(cluster_change).unwrap(); let IndexerJoined(joined) = control_plane_inbox.recv_typed_message().await.unwrap(); - assert_eq!(joined.grpc_advertise_addr().port(), 1516); + assert_eq!(joined.grpc_advertise_addr.port(), 1516); let IndexerLeft(left) = control_plane_inbox.recv_typed_message().await.unwrap(); - assert_eq!(left.grpc_advertise_addr().port(), 1516); + assert_eq!(left.grpc_advertise_addr.port(), 1516); universe.assert_quit().await; } @@ -2470,16 +3272,33 @@ mod tests { let universe = Universe::with_accelerated_time(); let cluster_config = ClusterConfig::for_test(); - let node_id = NodeId::from("test-control-plane"); + let node_id = NodeId::from_str("test-control-plane"); let cluster_change_stream_factory = ClusterChangeStreamFactoryForTest::default(); let indexer_pool = IndexerPool::default(); let ingester_pool = IngesterPool::default(); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); mock_metastore .expect_list_indexes_metadata() .return_once(|_| Ok(ListIndexesMetadataResponse::for_test(Vec::new()))); let metastore = 
MetastoreServiceClient::from_mock(mock_metastore); + + // Create mock maintenance persistence metastore + let mut mock_persistence_metastore = MockMetastoreService::new(); + mock_persistence_metastore + .expect_get_kv() + .returning(|_| Ok(GetKvResponse { value: None })); + mock_persistence_metastore + .expect_set_kv() + .returning(|_| Ok(EmptyResponse {})); + mock_persistence_metastore + .expect_delete_kv() + .returning(|_| Ok(EmptyResponse {})); + let maintenance_persistence = MetastoreKvPersistence::new( + MetastoreServiceClient::from_mock(mock_persistence_metastore), + ); + let disable_control_loop = true; let (_control_plane_mailbox, control_plane_handle, _readiness_rx) = ControlPlane::spawn_inner( @@ -2491,6 +3310,7 @@ mod tests { ingester_pool, metastore, disable_control_loop, + maintenance_persistence, ); let cluster_change_stream_tx = cluster_change_stream_factory.change_stream_tx(); let indexer_node = @@ -2531,12 +3351,12 @@ mod tests { let universe = Universe::with_accelerated_time(); let cluster_config = ClusterConfig::for_test(); - let node_id = NodeId::from("test-control-plane"); + let node_id = NodeId::from_str("test-control-plane"); let cluster_change_stream_factory = ClusterChangeStreamFactoryForTest::default(); let indexer_pool = IndexerPool::default(); let ingester_pool = IngesterPool::default(); - let ingester_id = NodeId::from("test-ingester"); + let ingester_id = NodeId::from_str("test-ingester"); let mut mock_ingester = MockIngesterService::new(); mock_ingester .expect_retain_shards() @@ -2556,6 +3376,7 @@ mod tests { ingester_pool.insert(ingester_id, ingester); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); mock_metastore .expect_list_indexes_metadata() .return_once(|_| Ok(ListIndexesMetadataResponse::for_test(Vec::new()))); @@ -2664,16 +3485,258 @@ mod tests { universe.assert_quit().await; } + #[tokio::test] + async fn test_control_plane_swap_pipelines_applied_on_next_control_loop() 
{ + let universe = Universe::default(); + let node_id = NodeId::from_str("test-control-plane"); + let cluster_change_stream_factory = ClusterChangeStreamFactoryForTest::default(); + let indexer_pool = IndexerPool::default(); + let ingester_pool = IngesterPool::default(); + + // Two mock indexers that accept unlimited apply_indexing_plan calls. + let mut mock_indexer_1 = MockIndexingService::new(); + mock_indexer_1 + .expect_apply_indexing_plan() + .returning(|_| Ok(ApplyIndexingPlanResponse {})); + let mut mock_indexer_2 = MockIndexingService::new(); + mock_indexer_2 + .expect_apply_indexing_plan() + .returning(|_| Ok(ApplyIndexingPlanResponse {})); + + indexer_pool.insert( + NodeId::from_str("indexer-1"), + IndexerNodeInfo { + node_id: NodeId::from_str("indexer-1"), + generation_id: 0, + client: IndexingServiceClient::from_mock(mock_indexer_1), + indexing_tasks: Vec::new(), + indexing_capacity: CpuCapacity::from_cpu_millis(4_000), + }, + ); + indexer_pool.insert( + NodeId::from_str("indexer-2"), + IndexerNodeInfo { + node_id: NodeId::from_str("indexer-2"), + generation_id: 0, + client: IndexingServiceClient::from_mock(mock_indexer_2), + indexing_tasks: Vec::new(), + indexing_capacity: CpuCapacity::from_cpu_millis(4_000), + }, + ); + + // Two indexes, each with a single-pipeline Kafka source and an ingest-v2 source + // (so that `create_or_enable_ingest_v2_sources_if_necessary` does not call `add_source`). 
+ let mut index_a = IndexMetadata::for_test("index-a", "ram:///index-a"); + index_a + .add_source(SourceConfig::for_test( + "kafka-source", + SourceParams::Kafka(KafkaSourceParams { + topic: "topic-a".to_string(), + client_log_level: None, + enable_backfill_mode: false, + client_params: json!({}), + }), + )) + .unwrap(); + index_a.add_source(SourceConfig::ingest_v2()).unwrap(); + + let mut index_b = IndexMetadata::for_test("index-b", "ram:///index-b"); + index_b + .add_source(SourceConfig::for_test( + "kafka-source", + SourceParams::Kafka(KafkaSourceParams { + topic: "topic-b".to_string(), + client_log_level: None, + enable_backfill_mode: false, + client_params: json!({}), + }), + )) + .unwrap(); + index_b.add_source(SourceConfig::ingest_v2()).unwrap(); + + let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); + mock_metastore + .expect_list_indexes_metadata() + .return_once(move |_| { + Ok(ListIndexesMetadataResponse::for_test(vec![ + index_a, index_b, + ])) + }); + mock_metastore + .expect_list_shards() + .return_once(|_| Ok(ListShardsResponse::default())); + + // Create mock maintenance persistence metastore + let mut mock_persistence_metastore = MockMetastoreService::new(); + mock_persistence_metastore + .expect_get_kv() + .returning(|_| Ok(GetKvResponse { value: None })); + mock_persistence_metastore + .expect_set_kv() + .returning(|_| Ok(EmptyResponse {})); + mock_persistence_metastore + .expect_delete_kv() + .returning(|_| Ok(EmptyResponse {})); + let maintenance_persistence = MetastoreKvPersistence::new( + MetastoreServiceClient::from_mock(mock_persistence_metastore), + ); + + let cluster_config = ClusterConfig::for_test(); + let (control_plane_mailbox, _control_plane_handle, _readiness_rx) = + ControlPlane::spawn_inner( + &universe, + cluster_config, + node_id, + cluster_change_stream_factory, + indexer_pool, + ingester_pool, + MetastoreServiceClient::from_mock(mock_metastore), + false, // keep the control 
loop enabled + maintenance_persistence, + ); + + // ── Wait for the initial plan to be built ────────────────────────── + // Use `mailbox.ask(Observe)` to get state directly from the inner + // actor (the supervisor handle only returns a cached snapshot that may + // lag behind). + let initial_state = { + let mut state = None; + for _ in 0..100 { + tokio::time::sleep(Duration::from_millis(50)).await; + let obs: ControlPlaneObservableState = + control_plane_mailbox.ask(Observe).await.unwrap(); + if obs + .indexing_scheduler + .current_targeted_physical_plan + .is_some() + { + state = Some(obs); + break; + } + } + state.expect("initial plan should have been built") + }; + + let initial_plan = initial_state + .indexing_scheduler + .current_targeted_physical_plan + .as_ref() + .unwrap(); + + // Each indexer should have exactly 1 task (4000 mcpu capacity, 3200 mcpu per pipeline). + let i1_tasks = initial_plan.indexer("indexer-1").unwrap(); + let i2_tasks = initial_plan.indexer("indexer-2").unwrap(); + assert_eq!(i1_tasks.len(), 1); + assert_eq!(i2_tasks.len(), 1); + + let idx_on_1 = i1_tasks[0].index_uid().index_id.clone(); + let idx_on_2 = i2_tasks[0].index_uid().index_id.clone(); + assert_ne!(idx_on_1, idx_on_2); + + let num_schedule_before = initial_state.indexing_scheduler.num_schedule_indexing_plan; + + // ── Swap pipelines ───────────────────────────────────────────────── + let response: SwapIndexingPipelinesResponse = control_plane_mailbox + .ask(SwapIndexingPipelinesRequest { + swaps: vec![SwapIndexingPipelinesEntry { + left_node_id: "indexer-1".to_string(), + left_index_id: idx_on_1.clone(), + right_node_id: "indexer-2".to_string(), + right_index_id: Some(idx_on_2.clone()), + }], + }) + .await + .unwrap() + .unwrap(); + assert!(response.results[0].success, "swap must succeed"); + + // Immediately after the swap, the targeted plan should reflect it. 
+ let after_swap: ControlPlaneObservableState = + control_plane_mailbox.ask(Observe).await.unwrap(); + let plan = after_swap + .indexing_scheduler + .current_targeted_physical_plan + .as_ref() + .unwrap(); + assert_eq!( + plan.indexer("indexer-1").unwrap()[0].index_uid().index_id, + idx_on_2, + "indexer-1 should now have the index that was on indexer-2" + ); + assert_eq!( + plan.indexer("indexer-2").unwrap()[0].index_uid().index_id, + idx_on_1, + "indexer-2 should now have the index that was on indexer-1" + ); + + let num_applied_after_swap = after_swap + .indexing_scheduler + .num_applied_physical_indexing_plan; + + // ── Wait for the control loop to re-apply the (swapped) plan ─────── + // `control_running_plan` has a MIN_DURATION_BETWEEN_SCHEDULING cooldown + // (50 ms in tests). The control loop interval is 100 ms. We poll until + // the apply counter increases. + let mut reapplied = false; + for _ in 0..40 { + tokio::time::sleep(Duration::from_millis(100)).await; + let obs: ControlPlaneObservableState = + control_plane_mailbox.ask(Observe).await.unwrap(); + if obs.indexing_scheduler.num_applied_physical_indexing_plan > num_applied_after_swap { + reapplied = true; + break; + } + } + assert!( + reapplied, + "the control loop should have re-applied the plan" + ); + + // ── Verify the swapped plan is still in place after re-apply ─────── + let final_state: ControlPlaneObservableState = + control_plane_mailbox.ask(Observe).await.unwrap(); + let final_plan = final_state + .indexing_scheduler + .current_targeted_physical_plan + .as_ref() + .unwrap(); + + assert_eq!( + final_plan.indexer("indexer-1").unwrap()[0] + .index_uid() + .index_id, + idx_on_2, + "after control loop re-apply, indexer-1 should still have the swapped index" + ); + assert_eq!( + final_plan.indexer("indexer-2").unwrap()[0] + .index_uid() + .index_id, + idx_on_1, + "after control loop re-apply, indexer-2 should still have the swapped index" + ); + + // No rebuild should have happened; only 
re-applies of the existing plan. + assert_eq!( + final_state.indexing_scheduler.num_schedule_indexing_plan, num_schedule_before, + "no rebuild should have happened after the swap – the control loop should only \ + re-apply the existing plan" + ); + + universe.assert_quit().await; + } + #[tokio::test] async fn test_control_plane_get_debug_info() { let universe = Universe::with_accelerated_time(); let cluster_config = ClusterConfig::for_test(); - let node_id = NodeId::from("test-control-plane"); + let node_id = NodeId::from_str("test-control-plane"); let cluster_change_stream_factory = ClusterChangeStreamFactoryForTest::default(); let indexer_pool = IndexerPool::default(); - let ingester_id = NodeId::from("test-ingester"); + let ingester_id = NodeId::from_str("test-ingester"); let mut mock_indexer = MockIndexingService::new(); mock_indexer @@ -2710,6 +3773,7 @@ mod tests { ingester_pool.insert(ingester_id, ingester); let mut mock_metastore = MockMetastoreService::new(); + setup_disabled_maintenance(&mut mock_metastore); mock_metastore .expect_list_indexes_metadata() .return_once(|_| Ok(ListIndexesMetadataResponse::for_test(Vec::new()))); diff --git a/quickwit/quickwit-control-plane/src/indexing_plan.rs b/quickwit/quickwit-control-plane/src/indexing_plan.rs index befeef18232..31cce996dfa 100644 --- a/quickwit/quickwit-control-plane/src/indexing_plan.rs +++ b/quickwit/quickwit-control-plane/src/indexing_plan.rs @@ -14,13 +14,13 @@ use fnv::FnvHashMap; use quickwit_proto::indexing::IndexingTask; -use serde::Serialize; +use serde::{Deserialize, Serialize}; /// A [`PhysicalIndexingPlan`] defines the list of indexing tasks /// each indexer, identified by its node ID, should run. /// TODO(fmassot): a metastore version number will be attached to the plan /// to identify if the plan is up to date with the metastore. 
-#[derive(Debug, PartialEq, Clone, Serialize)] +#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)] pub struct PhysicalIndexingPlan { indexing_tasks_per_indexer_id: FnvHashMap>, } diff --git a/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs b/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs index 7feab6564e7..639f61020e2 100644 --- a/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs +++ b/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs @@ -25,12 +25,18 @@ use fnv::{FnvHashMap, FnvHashSet}; use itertools::Itertools; use once_cell::sync::OnceCell; use quickwit_common::pretty::PrettySample; -use quickwit_config::{FileSourceParams, SourceParams, indexing_pipeline_params_fingerprint}; +use quickwit_config::{ + FileSourceParams, SourceParams, disable_ingest_v1, indexing_pipeline_params_fingerprint, +}; +use quickwit_proto::control_plane::{ + ControlPlaneResult, SwapIndexingPipelinesEntry, SwapIndexingPipelinesRequest, + SwapIndexingPipelinesResponse, SwapIndexingPipelinesResult, +}; use quickwit_proto::indexing::{ ApplyIndexingPlanRequest, CpuCapacity, IndexingService, IndexingTask, PIPELINE_FULL_CAPACITY, PIPELINE_THROUGHPUT, }; -use quickwit_proto::types::NodeId; +use quickwit_proto::types::{NodeId, PipelineUid}; use scheduling::{SourceToSchedule, SourceToScheduleType}; use serde::Serialize; use tracing::{debug, info, warn}; @@ -58,7 +64,7 @@ const MAX_LOAD_PER_PIPELINE: CpuCapacity = CpuCapacity::from_cpu_millis(3_200); pub struct IndexingSchedulerState { pub num_applied_physical_indexing_plan: usize, pub num_schedule_indexing_plan: usize, - pub last_applied_physical_plan: Option, + pub current_targeted_physical_plan: Option, #[serde(skip)] pub last_applied_plan_timestamp: Option, } @@ -218,7 +224,11 @@ fn get_sources_to_schedule(model: &ControlPlaneModel) -> Vec { } SourceParams::IngestApi => { - // TODO ingest v1 is scheduled differently + if disable_ingest_v1() { + // Existing indexes might still have the 
_ingest-api-source + continue; + } + // Note: ingest v1 is scheduled differently sources.push(SourceToSchedule { source_uid, source_type: SourceToScheduleType::IngestV1, @@ -276,6 +286,17 @@ fn get_sources_to_schedule(model: &ControlPlaneModel) -> Vec { sources } +/// Holds the pre-validated tasks to move for a single swap entry. +/// Tasks are collected from the original plan before any modifications. +struct ValidSwapOperation { + left_node_id: String, + left_tasks: Vec, + right_node_id: String, + right_tasks: Vec, + left_index_id: String, + right_index_id: Option, +} + impl IndexingScheduler { pub fn new(cluster_id: String, self_node_id: NodeId, indexer_pool: IndexerPool) -> Self { IndexingScheduler { @@ -291,12 +312,30 @@ impl IndexingScheduler { self.state.clone() } - // Should be called whenever a change in the list of index/shard - // has happened. - // - // Prefer not calling this method directly, and instead call - // `ControlPlane::rebuild_indexing_plan_debounced`. - pub(crate) fn rebuild_plan(&mut self, model: &ControlPlaneModel) { + /// Loads a frozen indexing plan without applying it + /// to indexers or triggering any scheduling logic. + /// + /// This is used during control plane initialization when maintenance mode is active: + /// the frozen plan is restored as the `current_targeted_physical_plan` so that the + /// `control_running_plan` loop can re-apply it to indexers that restart during the + /// maintenance window. + pub(crate) fn load_frozen_plan(&mut self, plan: crate::indexing_plan::PhysicalIndexingPlan) { + self.state.current_targeted_physical_plan = Some(plan); + } + + /// Should be called whenever a change in the list of index/shard has + /// happened. + /// + /// When in maintenance mode (`is_maintenance` is true), this function exits + /// early to keep the indexing plan frozen. This design provides a simple + /// safeguard to prevent unintended plan modifications during maintenance. 
+ /// + /// Prefer not calling this method directly, and instead call + /// `ControlPlane::rebuild_indexing_plan_debounced`. + pub(crate) fn rebuild_plan(&mut self, model: &ControlPlaneModel, is_maintenance: bool) { + if is_maintenance { + return; + } crate::metrics::CONTROL_PLANE_METRICS.schedule_total.inc(); let notify_on_drop = self.next_rebuild_tracker.start_rebuild(); @@ -327,15 +366,15 @@ impl IndexingScheduler { let new_physical_plan = build_physical_indexing_plan( &sources, &indexer_id_to_cpu_capacities, - self.state.last_applied_physical_plan.as_ref(), + self.state.current_targeted_physical_plan.as_ref(), &shard_locations, ); let shard_locality_metrics = get_shard_locality_metrics(&new_physical_plan, &shard_locations); crate::metrics::CONTROL_PLANE_METRICS.set_shard_locality_metrics(shard_locality_metrics); - if let Some(last_applied_plan) = &self.state.last_applied_physical_plan { + if let Some(current_targeted_plan) = &self.state.current_targeted_physical_plan { let plans_diff = get_indexing_plans_diff( - last_applied_plan.indexing_tasks_per_indexer(), + current_targeted_plan.indexing_tasks_per_indexer(), new_physical_plan.indexing_tasks_per_indexer(), ); // No need to apply the new plan as it is the same as the old one. @@ -351,15 +390,15 @@ impl IndexingScheduler { /// chitchat cluster state. If true, do nothing. /// - If node IDs differ, schedule a new indexing plan. /// - If indexing tasks differ, apply again the last plan. - pub(crate) fn control_running_plan(&mut self, model: &ControlPlaneModel) { - let last_applied_plan = - if let Some(last_applied_plan) = &self.state.last_applied_physical_plan { - last_applied_plan + pub(crate) fn control_running_plan(&mut self, model: &ControlPlaneModel, is_maintenance: bool) { + let current_targeted_plan = + if let Some(current_targeted) = &self.state.current_targeted_physical_plan { + current_targeted } else { // If there is no plan, the node is probably starting and the scheduler did not find // indexers yet. 
In this case, we want to schedule as soon as possible to find new // indexers. - self.rebuild_plan(model); + self.rebuild_plan(model, is_maintenance); return; }; if let Some(last_applied_plan_timestamp) = self.state.last_applied_plan_timestamp @@ -376,15 +415,15 @@ impl IndexingScheduler { let indexing_plans_diff = get_indexing_plans_diff( &running_indexing_tasks_by_node_id, - last_applied_plan.indexing_tasks_per_indexer(), + current_targeted_plan.indexing_tasks_per_indexer(), ); if !indexing_plans_diff.has_same_nodes() { info!(plans_diff=?indexing_plans_diff, "running plan and last applied plan node IDs differ: schedule an indexing plan"); - self.rebuild_plan(model); + self.rebuild_plan(model, is_maintenance); } else if !indexing_plans_diff.has_same_tasks() { // Some nodes may have not received their tasks, apply it again. info!(plans_diff=?indexing_plans_diff, "running tasks and last applied tasks differ: reapply last plan"); - self.apply_physical_indexing_plan(&indexers, last_applied_plan.clone(), None); + self.apply_physical_indexing_plan(&indexers, current_targeted_plan.clone(), None); } } @@ -408,7 +447,7 @@ impl IndexingScheduler { tokio::spawn({ let indexer = indexers .iter() - .find(|indexer| indexer.node_id == *node_id) + .find(|indexer| indexer.node_id.as_str() == node_id.as_str()) .expect("This should never happen as the plan was built from these indexers.") .clone(); let indexing_tasks = indexing_tasks.clone(); @@ -432,7 +471,262 @@ impl IndexingScheduler { } self.state.num_applied_physical_indexing_plan += 1; self.state.last_applied_plan_timestamp = Some(Instant::now()); - self.state.last_applied_physical_plan = Some(new_physical_plan); + self.state.current_targeted_physical_plan = Some(new_physical_plan); + } + + /// Swaps indexing pipelines between indexers as requested. + /// + /// The swap is applied in 3 phases: + /// 1. Upfront contradiction check (rejects entire request on failure). + /// 2. 
Per-swap validation against the original (unmodified) plan. + /// 3. Atomic application of all valid swaps to a working copy of the plan. + pub(crate) fn swap_pipelines( + &mut self, + request: SwapIndexingPipelinesRequest, + ) -> ControlPlaneResult { + // Phase 0: Check that a plan exists. + let Some(original_plan) = &mut self.state.current_targeted_physical_plan else { + return Ok(SwapIndexingPipelinesResponse { + results: request + .swaps + .into_iter() + .map(|swap| SwapIndexingPipelinesResult { + swap: Some(swap.clone()), + success: false, + reason: "no indexing plan is currently applied".to_string(), + }) + .collect(), + }); + }; + + // Phase 1: Upfront contradiction check (rejects entire request on failure). + if let Err(error_response) = Self::check_swap_contradictions(&request) { + return Ok(error_response); + } + + // Phase 2: Validate each swap against the ORIGINAL plan and collect + // the tasks to move. + let mut swap_results: Vec = + Vec::with_capacity(request.swaps.len()); + let mut valid_operations: Vec = Vec::new(); + + for swap in &request.swaps { + match Self::validate_single_swap(original_plan, swap) { + Ok(operation) => { + valid_operations.push(operation); + swap_results.push(SwapIndexingPipelinesResult { + swap: Some(swap.clone()), + success: true, + reason: String::new(), + }); + } + Err(reason) => { + swap_results.push(SwapIndexingPipelinesResult { + swap: Some(swap.clone()), + success: false, + reason, + }); + } + } + } + + // Phase 3: Apply all valid swaps atomically to a working copy. + if !valid_operations.is_empty() { + for operation in &valid_operations { + Self::apply_swap_operation(original_plan, operation); + } + } + + Ok(SwapIndexingPipelinesResponse { + results: swap_results, + }) + } + + /// Validates the entire swap request for contradictions. + /// + /// A contradiction exists when: + /// - A swap entry references the same node on both sides. + /// - The same (node_id, index_id) pair appears in more than one swap entry. 
+ /// + /// On contradiction, returns a response with all swaps marked as failed. + fn check_swap_contradictions( + request: &SwapIndexingPipelinesRequest, + ) -> Result<(), SwapIndexingPipelinesResponse> { + let mut seen_slots: FnvHashSet<(&str, &str)> = FnvHashSet::default(); + + let make_error_response = |reason: String| SwapIndexingPipelinesResponse { + results: request + .swaps + .iter() + .map(|s| SwapIndexingPipelinesResult { + swap: Some(s.clone()), + success: false, + reason: reason.clone(), + }) + .collect(), + }; + + for swap in &request.swaps { + // Reject same-node operations. + if swap.left_node_id == swap.right_node_id { + let right_index_desc = swap.right_index_id.as_deref().unwrap_or(""); + let reason = format!( + "request rejected: swap between '{}' (index '{}') and '{}' (index '{}') \ + references the same node", + swap.left_node_id, swap.left_index_id, swap.right_node_id, right_index_desc, + ); + return Err(make_error_response(reason)); + } + + let left_slot = (swap.left_node_id.as_str(), swap.left_index_id.as_str()); + + // Check for duplicate left slots across entries. + if !seen_slots.insert(left_slot) { + let reason = format!( + "request rejected: contradictory swaps — index '{}' on node '{}' is \ + referenced by multiple swap entries", + left_slot.1, left_slot.0, + ); + return Err(make_error_response(reason)); + } + + // Only check right slot for full swaps (when right_index_id is specified). + if let Some(right_index_id) = &swap.right_index_id { + let right_slot = (swap.right_node_id.as_str(), right_index_id.as_str()); + if !seen_slots.insert(right_slot) { + let reason = format!( + "request rejected: contradictory swaps — index '{}' on node '{}' is \ + referenced by multiple swap entries", + right_slot.1, right_slot.0, + ); + return Err(make_error_response(reason)); + } + } + } + + Ok(()) + } + + /// Validates a single swap entry against the original (unmodified) plan. 
+ /// + /// When `right_index_id` is `None`, the operation is a one-way move: the left + /// index's pipelines are moved to the right node without moving any pipelines back. + fn validate_single_swap( + plan: &PhysicalIndexingPlan, + swap: &SwapIndexingPipelinesEntry, + ) -> Result { + // 1. Verify the left indexer exists in the plan. + let left_tasks = plan.indexer(&swap.left_node_id).ok_or_else(|| { + format!( + "indexer '{}' not found in the current plan", + swap.left_node_id + ) + })?; + + // 2. Collect tasks for the left index. + let left_tasks_to_move: Vec = left_tasks + .iter() + .filter(|t| t.index_uid().index_id == swap.left_index_id) + .cloned() + .collect(); + + // 3. Reject if no tasks found on the left side. + if left_tasks_to_move.is_empty() { + return Err(format!( + "no pipelines found for index '{}' on indexer '{}'", + swap.left_index_id, swap.left_node_id, + )); + } + + // 4. For full swaps, validate the right side too. For move-only operations (right_index_id + // is None), just verify the right indexer exists. 
+ let right_tasks_to_move = if let Some(right_index_id) = &swap.right_index_id { + let right_tasks = plan.indexer(&swap.right_node_id).ok_or_else(|| { + format!( + "indexer '{}' not found in the current plan", + swap.right_node_id + ) + })?; + + let right_tasks_to_move: Vec = right_tasks + .iter() + .filter(|t| t.index_uid().index_id == *right_index_id) + .cloned() + .collect(); + + if right_tasks_to_move.is_empty() { + return Err(format!( + "no pipelines found for index '{}' on indexer '{}'", + right_index_id, swap.right_node_id, + )); + } + + if left_tasks_to_move.len() != right_tasks_to_move.len() { + return Err(format!( + "pipeline count mismatch: '{}' has {} pipeline(s) on '{}', but '{}' has {} \ + pipeline(s) on '{}'", + swap.left_index_id, + left_tasks_to_move.len(), + swap.left_node_id, + right_index_id, + right_tasks_to_move.len(), + swap.right_node_id, + )); + } + + right_tasks_to_move + } else { + // Move-only: verify the right indexer exists in the plan. + plan.indexer(&swap.right_node_id).ok_or_else(|| { + format!( + "indexer '{}' not found in the current plan", + swap.right_node_id + ) + })?; + Vec::new() + }; + + Ok(ValidSwapOperation { + left_node_id: swap.left_node_id.clone(), + left_tasks: left_tasks_to_move, + right_node_id: swap.right_node_id.clone(), + right_tasks: right_tasks_to_move, + left_index_id: swap.left_index_id.clone(), + right_index_id: swap.right_index_id.clone(), + }) + } + + /// Applies a validated swap operation to a working copy of the plan. + /// + /// When `right_index_id` is `None`, this is a one-way move: the left index's + /// pipelines are moved to the right node without any pipelines moving back. + fn apply_swap_operation(plan: &mut PhysicalIndexingPlan, operation: &ValidSwapOperation) { + let plan_map = plan.indexing_tasks_per_indexer_mut(); + + // Remove the left index's tasks from the left node. 
+ if let Some(left_node_tasks) = plan_map.get_mut(&operation.left_node_id) { + left_node_tasks.retain(|t| t.index_uid().index_id != operation.left_index_id); + } + // For full swaps, also remove the right index's tasks from the right node. + if let (Some(right_index_id), Some(right_node_tasks)) = ( + &operation.right_index_id, + plan_map.get_mut(&operation.right_node_id), + ) { + right_node_tasks.retain(|t| t.index_uid().index_id != *right_index_id); + } + + // Move left tasks to the right node with fresh pipeline UIDs. + for task in &operation.left_tasks { + let mut moved_task = task.clone(); + moved_task.pipeline_uid = Some(PipelineUid::random()); + plan.add_indexing_task(&operation.right_node_id, moved_task); + } + // For full swaps, move right tasks to the left node with fresh pipeline UIDs. + for task in &operation.right_tasks { + let mut moved_task = task.clone(); + moved_task.pipeline_uid = Some(PipelineUid::random()); + plan.add_indexing_task(&operation.left_node_id, moved_task); + } } } @@ -543,7 +837,10 @@ fn format_indexing_task_map( const MAX_INDEXES: usize = 10; let mut index_displayed = 0; write!(formatter, "{{")?; - let mut indexer_iter = indexing_tasks.iter().enumerate(); + let mut indexer_iter = indexing_tasks + .iter() + .filter(|(_, tasks)| !tasks.is_empty()) + .enumerate(); for (i, (index_name, tasks)) in &mut indexer_iter { if i != 0 { write!(formatter, ", ")?; @@ -610,14 +907,14 @@ fn format_indexing_task_map( /// the last plan applied by the scheduler. fn get_indexing_plans_diff<'a>( running_plan: &'a FnvHashMap>, - last_applied_plan: &'a FnvHashMap>, + current_targeted_plan: &'a FnvHashMap>, ) -> IndexingPlansDiff<'a> { // Nodes diff. 
let running_node_ids: FnvHashSet<&str> = running_plan .keys() .map(|node_id| node_id.as_str()) .collect(); - let planned_node_ids: FnvHashSet<&str> = last_applied_plan + let planned_node_ids: FnvHashSet<&str> = current_targeted_plan .keys() .map(|node_id| node_id.as_str()) .collect(); @@ -638,7 +935,7 @@ fn get_indexing_plans_diff<'a>( .get(*node_id) .map(Vec::as_slice) .unwrap_or_else(|| &[]); - let last_applied_tasks = last_applied_plan + let last_applied_tasks = current_targeted_plan .get(*node_id) .map(Vec::as_slice) .unwrap_or_else(|| &[]); @@ -705,6 +1002,10 @@ mod tests { use proptest::{prop_compose, proptest}; use quickwit_config::{IndexConfig, KafkaSourceParams, SourceConfig, SourceParams}; use quickwit_metastore::IndexMetadata; + use quickwit_proto::control_plane::{SwapIndexingPipelinesEntry, SwapIndexingPipelinesRequest}; + use quickwit_proto::indexing::{ + ApplyIndexingPlanResponse, IndexingServiceClient, MockIndexingService, + }; use quickwit_proto::types::{IndexUid, PipelineUid, ShardId, SourceUid}; use super::*; @@ -871,6 +1172,626 @@ mod tests { } } + fn make_test_task(index_id: &str, source_id: &str, pipeline_uid: u128) -> IndexingTask { + IndexingTask { + index_uid: Some(IndexUid::for_test(index_id, 0)), + source_id: source_id.to_string(), + pipeline_uid: Some(PipelineUid::for_test(pipeline_uid)), + shard_ids: Vec::new(), + params_fingerprint: 0, + } + } + + fn make_swap_entry( + left_node: &str, + left_index: &str, + right_node: &str, + right_index: &str, + ) -> SwapIndexingPipelinesEntry { + SwapIndexingPipelinesEntry { + left_node_id: left_node.to_string(), + left_index_id: left_index.to_string(), + right_node_id: right_node.to_string(), + right_index_id: Some(right_index.to_string()), + } + } + + fn make_move_entry( + left_node: &str, + left_index: &str, + right_node: &str, + ) -> SwapIndexingPipelinesEntry { + SwapIndexingPipelinesEntry { + left_node_id: left_node.to_string(), + left_index_id: left_index.to_string(), + right_node_id: 
right_node.to_string(), + right_index_id: None, + } + } + + fn build_test_scheduler_with_plan(plan: PhysicalIndexingPlan) -> IndexingScheduler { + let indexer_pool = IndexerPool::default(); + for node_id in plan.indexing_tasks_per_indexer().keys() { + let mut mock_indexer = MockIndexingService::new(); + mock_indexer + .expect_apply_indexing_plan() + .returning(|_| Ok(ApplyIndexingPlanResponse {})); + let indexer_info = IndexerNodeInfo { + node_id: NodeId::from_str(node_id.as_str()), + generation_id: 0, + client: IndexingServiceClient::from_mock(mock_indexer), + indexing_tasks: Vec::new(), + indexing_capacity: CpuCapacity::from_cpu_millis(4_000), + }; + indexer_pool.insert(indexer_info.node_id.clone(), indexer_info); + } + let mut scheduler = IndexingScheduler::new( + "test-cluster".to_string(), + NodeId::from_str("test-node"), + indexer_pool, + ); + scheduler.state.current_targeted_physical_plan = Some(plan); + scheduler + } + + #[tokio::test] + async fn test_swap_pipelines_basic() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + ]); + let task_a = make_test_task("index-a", "source-1", 1); + let task_b = make_test_task("index-b", "source-1", 2); + plan.add_indexing_task("indexer-1", task_a.clone()); + plan.add_indexing_task("indexer-2", task_b.clone()); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_swap_entry( + "indexer-1", + "index-a", + "indexer-2", + "index-b", + )], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + assert_eq!(response.results.len(), 1); + assert!(response.results[0].success); + + let new_plan = scheduler + .state + .current_targeted_physical_plan + .as_ref() + .unwrap(); + // index-a should now be on indexer-2 + let indexer_2_tasks = new_plan.indexer("indexer-2").unwrap(); + assert_eq!(indexer_2_tasks.len(), 1); + assert_eq!(indexer_2_tasks[0].index_uid().index_id, 
"index-a"); + // index-b should now be on indexer-1 + let indexer_1_tasks = new_plan.indexer("indexer-1").unwrap(); + assert_eq!(indexer_1_tasks.len(), 1); + assert_eq!(indexer_1_tasks[0].index_uid().index_id, "index-b"); + } + + #[tokio::test] + async fn test_swap_pipelines_count_mismatch() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + ]); + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-1", 1)); + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-1", 2)); + plan.add_indexing_task("indexer-2", make_test_task("index-b", "source-1", 3)); + + let mut scheduler = build_test_scheduler_with_plan(plan.clone()); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_swap_entry( + "indexer-1", + "index-a", + "indexer-2", + "index-b", + )], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + assert_eq!(response.results.len(), 1); + assert!(!response.results[0].success); + assert!( + response.results[0] + .reason + .contains("pipeline count mismatch") + ); + + // Plan should be unchanged. 
+ assert_eq!( + scheduler + .state + .current_targeted_physical_plan + .as_ref() + .unwrap(), + &plan, + ); + } + + /// A full swap whose right node (`indexer-999`) is not in the plan is reported as failed + /// with a reason containing "not found". + #[tokio::test] + async fn test_swap_pipelines_unknown_indexer() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&["indexer-1".to_string()]); + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-1", 1)); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_swap_entry( + "indexer-1", + "index-a", + "indexer-999", + "index-b", + )], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + assert!(!response.results[0].success); + assert!(response.results[0].reason.contains("not found")); + } + + /// A swap naming a left index that has no task on the left node is reported as failed + /// with a reason containing "no pipelines found". + #[tokio::test] + async fn test_swap_pipelines_no_pipelines_for_index() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + ]); + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-1", 1)); + plan.add_indexing_task("indexer-2", make_test_task("index-b", "source-1", 2)); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_swap_entry( + "indexer-1", + "index-NONEXISTENT", + "indexer-2", + "index-b", + )], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + assert!(!response.results[0].success); + assert!(response.results[0].reason.contains("no pipelines found")); + } + + /// With two swaps in one request, the valid one succeeds and is applied while the + /// count-mismatched one is reported as failed. + #[tokio::test] + async fn test_swap_pipelines_multiple_swaps_partial_success() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + "indexer-3".to_string(), + ]); + // Valid swap pair: 1 pipeline each. + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-1", 1)); + plan.add_indexing_task("indexer-2", make_test_task("index-b", "source-1", 2)); + // Invalid swap pair: count mismatch (2 vs 1). 
+ plan.add_indexing_task("indexer-2", make_test_task("index-c", "source-1", 3)); + plan.add_indexing_task("indexer-2", make_test_task("index-c", "source-1", 4)); + plan.add_indexing_task("indexer-3", make_test_task("index-d", "source-1", 5)); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![ + make_swap_entry("indexer-1", "index-a", "indexer-2", "index-b"), + make_swap_entry("indexer-2", "index-c", "indexer-3", "index-d"), + ], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + assert_eq!(response.results.len(), 2); + assert!(response.results[0].success); + assert!(!response.results[1].success); + assert!( + response.results[1] + .reason + .contains("pipeline count mismatch") + ); + + // The first swap should have been applied. + let new_plan = scheduler + .state + .current_targeted_physical_plan + .as_ref() + .unwrap(); + let indexer_1_tasks = new_plan.indexer("indexer-1").unwrap(); + assert_eq!(indexer_1_tasks.len(), 1); + assert_eq!(indexer_1_tasks[0].index_uid().index_id, "index-b"); + } + + /// A swap requested on a freshly constructed scheduler (the test installs no plan) is + /// reported as failed with a reason containing "no indexing plan". + #[tokio::test] + async fn test_swap_pipelines_no_plan() { + let indexer_pool = IndexerPool::default(); + let mut scheduler = IndexingScheduler::new( + "test-cluster".to_string(), + NodeId::from_str("test-node"), + indexer_pool, + ); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_swap_entry( + "indexer-1", + "index-a", + "indexer-2", + "index-b", + )], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + assert!(!response.results[0].success); + assert!(response.results[0].reason.contains("no indexing plan")); + } + + /// A swap whose left and right node IDs are identical is reported as failed with a reason + /// containing "same node". + #[tokio::test] + async fn test_swap_pipelines_same_node_rejected() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&["indexer-1".to_string()]); + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-1", 1)); + plan.add_indexing_task("indexer-1", make_test_task("index-b", "source-1", 2)); + + let mut 
scheduler = build_test_scheduler_with_plan(plan); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_swap_entry( + "indexer-1", + "index-a", + "indexer-1", + "index-b", + )], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + assert!(!response.results[0].success); + assert!(response.results[0].reason.contains("same node")); + } + + /// Two swaps that both move `index-a` off `indexer-1` are both reported as failed; the + /// first result's reason is checked for "contradictory". + #[tokio::test] + async fn test_swap_pipelines_contradiction_same_slot() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + "indexer-3".to_string(), + ]); + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-1", 1)); + plan.add_indexing_task("indexer-2", make_test_task("index-b", "source-1", 2)); + plan.add_indexing_task("indexer-3", make_test_task("index-c", "source-1", 3)); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + // Both swaps try to move index-a from indexer-1. + let request = SwapIndexingPipelinesRequest { + swaps: vec![ + make_swap_entry("indexer-1", "index-a", "indexer-2", "index-b"), + make_swap_entry("indexer-1", "index-a", "indexer-3", "index-c"), + ], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + // ALL swaps should be rejected. 
+ assert_eq!(response.results.len(), 2); + assert!(!response.results[0].success); + assert!(!response.results[1].success); + assert!(response.results[0].reason.contains("contradictory")); + } + + /// The same contradictory request leaves the scheduler's targeted plan equal to the + /// plan it started with. + #[tokio::test] + async fn test_swap_pipelines_contradiction_does_not_apply_any() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + "indexer-3".to_string(), + ]); + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-1", 1)); + plan.add_indexing_task("indexer-2", make_test_task("index-b", "source-1", 2)); + plan.add_indexing_task("indexer-3", make_test_task("index-c", "source-1", 3)); + + let mut scheduler = build_test_scheduler_with_plan(plan.clone()); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![ + make_swap_entry("indexer-1", "index-a", "indexer-2", "index-b"), + make_swap_entry("indexer-1", "index-a", "indexer-3", "index-c"), + ], + }; + let _response = scheduler.swap_pipelines(request).unwrap(); + + // Plan should be completely unchanged. 
+ assert_eq!( + scheduler + .state + .current_targeted_physical_plan + .as_ref() + .unwrap(), + &plan, + ); + } + + /// After a full swap, both moved tasks carry pipeline UIDs that differ from their + /// original UIDs and from each other. + #[tokio::test] + async fn test_swap_pipelines_fresh_pipeline_uids() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + ]); + let task_a = make_test_task("index-a", "source-1", 100); + let task_b = make_test_task("index-b", "source-1", 200); + let original_uid_a = task_a.pipeline_uid; + let original_uid_b = task_b.pipeline_uid; + plan.add_indexing_task("indexer-1", task_a); + plan.add_indexing_task("indexer-2", task_b); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_swap_entry( + "indexer-1", + "index-a", + "indexer-2", + "index-b", + )], + }; + scheduler.swap_pipelines(request).unwrap(); + + let new_plan = scheduler + .state + .current_targeted_physical_plan + .as_ref() + .unwrap(); + let moved_a = &new_plan.indexer("indexer-2").unwrap()[0]; + let moved_b = &new_plan.indexer("indexer-1").unwrap()[0]; + // Pipeline UIDs must be fresh (different from originals). + assert_ne!(moved_a.pipeline_uid, original_uid_a); + assert_ne!(moved_b.pipeline_uid, original_uid_b); + // And different from each other. + assert_ne!(moved_a.pipeline_uid, moved_b.pipeline_uid); + } + + /// A swap moves every task of the named index: here each index has two sources on its + /// node and both tasks end up on the other node. + #[tokio::test] + async fn test_swap_pipelines_multiple_sources_same_index() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + ]); + // index-a has 2 sources on indexer-1. + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-kafka", 1)); + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-ingest", 2)); + // index-b has 2 sources on indexer-2. 
+ plan.add_indexing_task("indexer-2", make_test_task("index-b", "source-kafka", 3)); + plan.add_indexing_task("indexer-2", make_test_task("index-b", "source-ingest", 4)); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_swap_entry( + "indexer-1", + "index-a", + "indexer-2", + "index-b", + )], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + assert!(response.results[0].success); + + let new_plan = scheduler + .state + .current_targeted_physical_plan + .as_ref() + .unwrap(); + // Both sources of index-a should now be on indexer-2. + let indexer_2_tasks = new_plan.indexer("indexer-2").unwrap(); + assert_eq!(indexer_2_tasks.len(), 2); + assert!( + indexer_2_tasks + .iter() + .all(|t| t.index_uid().index_id == "index-a") + ); + // Both sources of index-b should now be on indexer-1. + let indexer_1_tasks = new_plan.indexer("indexer-1").unwrap(); + assert_eq!(indexer_1_tasks.len(), 2); + assert!( + indexer_1_tasks + .iter() + .all(|t| t.index_uid().index_id == "index-b") + ); + } + + /// Tasks of indexes that are not named in the swap (`index-c`, `index-d`) stay on their + /// nodes while `index-a` and `index-b` are exchanged. + #[tokio::test] + async fn test_swap_pipelines_preserves_other_tasks() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + ]); + // Tasks being swapped. + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-1", 1)); + plan.add_indexing_task("indexer-2", make_test_task("index-b", "source-1", 2)); + // Other tasks that should not be affected. 
+ plan.add_indexing_task("indexer-1", make_test_task("index-c", "source-1", 3)); + plan.add_indexing_task("indexer-2", make_test_task("index-d", "source-1", 4)); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_swap_entry( + "indexer-1", + "index-a", + "indexer-2", + "index-b", + )], + }; + scheduler.swap_pipelines(request).unwrap(); + + let new_plan = scheduler + .state + .current_targeted_physical_plan + .as_ref() + .unwrap(); + let indexer_1_tasks = new_plan.indexer("indexer-1").unwrap(); + let indexer_2_tasks = new_plan.indexer("indexer-2").unwrap(); + + // indexer-1 should have index-c (unchanged) and index-b (swapped in). + assert_eq!(indexer_1_tasks.len(), 2); + let indexer_1_index_ids: Vec<&str> = indexer_1_tasks + .iter() + .map(|t| t.index_uid().index_id.as_str()) + .collect(); + assert!(indexer_1_index_ids.contains(&"index-c")); + assert!(indexer_1_index_ids.contains(&"index-b")); + + // indexer-2 should have index-d (unchanged) and index-a (swapped in). + assert_eq!(indexer_2_tasks.len(), 2); + let indexer_2_index_ids: Vec<&str> = indexer_2_tasks + .iter() + .map(|t| t.index_uid().index_id.as_str()) + .collect(); + assert!(indexer_2_index_ids.contains(&"index-d")); + assert!(indexer_2_index_ids.contains(&"index-a")); + } + + /// A move-only entry relocates the left index's task to the right node and moves nothing + /// back: the left node ends up empty and the right node holds both indexes. + #[tokio::test] + async fn test_swap_pipelines_move_without_swap() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + ]); + let task_a = make_test_task("index-a", "source-1", 1); + let task_b = make_test_task("index-b", "source-1", 2); + // The tasks are not reused below, so they are moved into the plan rather than cloned. + plan.add_indexing_task("indexer-1", task_a); + plan.add_indexing_task("indexer-2", task_b); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + // Move index-a from indexer-1 to indexer-2 without swapping anything back. 
+ let request = SwapIndexingPipelinesRequest { + swaps: vec![make_move_entry("indexer-1", "index-a", "indexer-2")], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + assert_eq!(response.results.len(), 1); + assert!( + response.results[0].success, + "{}", + response.results[0].reason + ); + + let new_plan = scheduler + .state + .current_targeted_physical_plan + .as_ref() + .unwrap(); + + // indexer-1 should have no tasks (index-a was moved away). + let indexer_1_tasks = new_plan.indexer("indexer-1").unwrap(); + assert!(indexer_1_tasks.is_empty()); + + // indexer-2 should have both index-b (unchanged) and index-a (moved in). + let indexer_2_tasks = new_plan.indexer("indexer-2").unwrap(); + assert_eq!(indexer_2_tasks.len(), 2); + let indexer_2_index_ids: Vec<&str> = indexer_2_tasks + .iter() + .map(|t| t.index_uid().index_id.as_str()) + .collect(); + assert!(indexer_2_index_ids.contains(&"index-a")); + assert!(indexer_2_index_ids.contains(&"index-b")); + } + + /// A task relocated by a move-only entry carries a pipeline UID different from its + /// original one and keeps its index ID. + #[tokio::test] + async fn test_swap_pipelines_move_fresh_pipeline_uids() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + ]); + let task_a = make_test_task("index-a", "source-1", 100); + let original_uid_a = task_a.pipeline_uid; + plan.add_indexing_task("indexer-1", task_a); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_move_entry("indexer-1", "index-a", "indexer-2")], + }; + scheduler.swap_pipelines(request).unwrap(); + + let new_plan = scheduler + .state + .current_targeted_physical_plan + .as_ref() + .unwrap(); + let moved_a = &new_plan.indexer("indexer-2").unwrap()[0]; + // Pipeline UID must be refreshed after the move. 
+ assert_ne!(moved_a.pipeline_uid, original_uid_a); + assert_eq!(moved_a.index_uid().index_id, "index-a"); + } + + /// A move-only entry whose destination (`indexer-999`) is not in the plan is reported as + /// failed with a reason containing "not found". + #[tokio::test] + async fn test_swap_pipelines_move_unknown_right_indexer() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&["indexer-1".to_string()]); + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-1", 1)); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_move_entry("indexer-1", "index-a", "indexer-999")], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + assert!(!response.results[0].success); + assert!(response.results[0].reason.contains("not found")); + } + + /// A move-only entry leaves the destination's existing task (`index-b`) and the source + /// node's other index (`index-c`) in place; only `index-a` changes node. + #[tokio::test] + async fn test_swap_pipelines_move_preserves_right_node_tasks() { + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + ]); + // indexer-1 has two indexes; only index-a will be moved. + plan.add_indexing_task("indexer-1", make_test_task("index-a", "source-1", 1)); + plan.add_indexing_task("indexer-1", make_test_task("index-c", "source-1", 3)); + // indexer-2 has index-b which should remain untouched. + plan.add_indexing_task("indexer-2", make_test_task("index-b", "source-1", 2)); + + let mut scheduler = build_test_scheduler_with_plan(plan); + + let request = SwapIndexingPipelinesRequest { + swaps: vec![make_move_entry("indexer-1", "index-a", "indexer-2")], + }; + let response = scheduler.swap_pipelines(request).unwrap(); + + assert!( + response.results[0].success, + "{}", + response.results[0].reason + ); + + let new_plan = scheduler + .state + .current_targeted_physical_plan + .as_ref() + .unwrap(); + + // indexer-1 should still have index-c (only index-a was moved). 
+ let indexer_1_tasks = new_plan.indexer("indexer-1").unwrap(); + assert_eq!(indexer_1_tasks.len(), 1); + assert_eq!(indexer_1_tasks[0].index_uid().index_id, "index-c"); + + // indexer-2 should have both index-b (unchanged) and index-a (moved in). + let indexer_2_tasks = new_plan.indexer("indexer-2").unwrap(); + assert_eq!(indexer_2_tasks.len(), 2); + let indexer_2_index_ids: Vec<&str> = indexer_2_tasks + .iter() + .map(|t| t.index_uid().index_id.as_str()) + .collect(); + assert!(indexer_2_index_ids.contains(&"index-a")); + assert!(indexer_2_index_ids.contains(&"index-b")); + } + #[test] fn test_get_sources_to_schedule() { let mut model = ControlPlaneModel::default(); diff --git a/quickwit/quickwit-control-plane/src/indexing_scheduler/scheduling/mod.rs b/quickwit/quickwit-control-plane/src/indexing_scheduler/scheduling/mod.rs index 52ee33f6aae..41026067449 100644 --- a/quickwit/quickwit-control-plane/src/indexing_scheduler/scheduling/mod.rs +++ b/quickwit/quickwit-control-plane/src/indexing_scheduler/scheduling/mod.rs @@ -856,7 +856,7 @@ mod tests { let num_indexers = 10; let num_shards: usize = 1000; let indexers: Vec = (0..num_indexers) - .map(|indexer_id| NodeId::new(format!("indexer{indexer_id}"))) + .map(|indexer_id| NodeId::from_str(&format!("indexer{indexer_id}"))) .collect(); let source_uids: Vec = std::iter::repeat_with(source_id).take(1_000).collect(); let shard_ids: Vec = (0..num_shards as u64).map(ShardId::from).collect(); @@ -1300,11 +1300,11 @@ mod tests { shard2.clone(), shard3.clone(), ]; - let node1 = NodeId::new("node1".to_string()); - let node2 = NodeId::new("node2".to_string()); + let node1 = NodeId::from_str("node1"); + let node2 = NodeId::from_str("node2"); // This node is missing from the capacity map. // It should not be assigned any task despite being present in shard locations. 
- let node_missing = NodeId::new("node_missing".to_string()); + let node_missing = NodeId::from_str("node_missing"); let mut remaining_num_shards_per_node = HashMap::default(); remaining_num_shards_per_node .insert(node1.as_str().to_string(), NonZeroU32::new(3).unwrap()); diff --git a/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs b/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs index 63295d61eca..caa4b348aa5 100644 --- a/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs +++ b/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs @@ -458,7 +458,7 @@ impl IngestController { let unavailable_leaders: FnvHashSet = get_open_shards_request .unavailable_leaders .into_iter() - .map(NodeId::from) + .map(|s| NodeId::from_str(&s)) .collect(); // We do a first pass to identify the shards that are missing from the model and need to be @@ -570,7 +570,7 @@ impl IngestController { } for shard in model.all_shards() { - if shard.is_open() && !unavailable_leaders.contains(&shard.leader_id) { + if shard.is_open() && !unavailable_leaders.contains(shard.leader_id.as_str()) { for ingest_node in shard.ingesters() { if let Some(shard_count) = per_node_num_open_shards.get_mut(ingest_node.as_str()) @@ -639,7 +639,7 @@ impl IngestController { } }) .collect(); - let Some(leader) = self.ingester_pool.get(&leader_id) else { + let Some(leader) = self.ingester_pool.get(leader_id.as_str()) else { warn!("failed to init shards: ingester `{leader_id}` is unavailable"); failures.extend(init_shard_failures); continue; @@ -1006,6 +1006,10 @@ impl IngestController { /// Moving a shard consists of closing the shard on the source ingester and opening a new /// one on the target ingester. /// + /// When in maintenance mode (`is_maintenance` is true), this function exits early to keep + /// the indexing plan frozen. This design provides a simple safeguard to prevent unintended + /// plan modifications during maintenance. 
+ /// /// This method is guarded by a lock to ensure that only one rebalance operation is performed at /// a time. pub(crate) async fn rebalance_shards( @@ -1013,7 +1017,11 @@ impl IngestController { model: &mut ControlPlaneModel, mailbox: &Mailbox, progress: &Progress, + is_maintenance: bool, ) -> MetastoreResult>> { + if is_maintenance { + return Ok(None); + } let Ok(rebalance_guard) = self.rebalance_lock.clone().try_lock_owned() else { debug!("skipping rebalance: another rebalance is already in progress"); return Ok(None); @@ -1202,7 +1210,7 @@ impl IngestController { source_id: shard.source_id, shard_id: shard.shard_id, }; - let leader_id = NodeId::from(shard.leader_id); + let leader_id = NodeId::from_str(&shard.leader_id); per_leader_shards_to_close .entry(leader_id) .or_default() @@ -1294,7 +1302,7 @@ fn find_scale_down_candidate( .max_by_key(|(_leader_id, shard_entries)| (shard_entries.len(), rng.next_u32())) .map(|(leader_id, shard_entries)| { ( - leader_id.clone().into(), + NodeId::from_str(leader_id), shard_entries.choose(&mut rng).unwrap().shard_id().clone(), ) }) @@ -1385,7 +1393,7 @@ mod tests { let ingester = IngesterServiceClient::from_mock(mock_ingester); let ingester_pool = IngesterPool::default(); - ingester_pool.insert(NodeId::from("test-ingester-1"), ingester.clone()); + ingester_pool.insert(NodeId::from_str("test-ingester-1"), ingester.clone()); let mut mock_ingester = MockIngesterService::new(); let index_uid_1_clone = index_uid_1.clone(); @@ -1413,7 +1421,7 @@ mod tests { Ok(response) }); let ingester = IngesterServiceClient::from_mock(mock_ingester); - ingester_pool.insert(NodeId::from("test-ingester-2"), ingester.clone()); + ingester_pool.insert(NodeId::from_str("test-ingester-2"), ingester.clone()); let replication_factor = 2; let mut controller = IngestController::new( @@ -1599,7 +1607,7 @@ mod tests { let ingester = IngesterServiceClient::from_mock(mock_ingester); let ingester_pool = IngesterPool::default(); - 
ingester_pool.insert(NodeId::from("test-ingester-1"), ingester.clone()); + ingester_pool.insert(NodeId::from_str("test-ingester-1"), ingester.clone()); let replication_factor = 1; let mut controller = IngestController::new( @@ -1711,7 +1719,7 @@ mod tests { assert!(leader_follower_pairs_opt.is_none()); ingester_pool.insert( - NodeId::from("test-ingester-1"), + NodeId::from_str("test-ingester-1"), IngesterServiceClient::mocked(), ); @@ -1722,7 +1730,10 @@ mod tests { // find any solution. assert!(leader_follower_pairs_opt.is_none()); - ingester_pool.insert("test-ingester-2".into(), IngesterServiceClient::mocked()); + ingester_pool.insert( + NodeId::from_str("test-ingester-2"), + IngesterServiceClient::mocked(), + ); let leader_follower_pairs = controller .allocate_shards(0, &FnvHashSet::default(), &model) @@ -1741,13 +1752,13 @@ mod tests { if leader_follower_pairs[0].0 == "test-ingester-1" { assert_eq!( leader_follower_pairs[0].1, - Some(NodeId::from("test-ingester-2")) + Some(NodeId::from_str("test-ingester-2")) ); } else { assert_eq!(leader_follower_pairs[0].0, "test-ingester-2"); assert_eq!( leader_follower_pairs[0].1, - Some(NodeId::from("test-ingester-1")) + Some(NodeId::from_str("test-ingester-1")) ); } @@ -1760,13 +1771,13 @@ mod tests { if leader_follower_pair.0 == "test-ingester-1" { assert_eq!( leader_follower_pair.1, - Some(NodeId::from("test-ingester-2")) + Some(NodeId::from_str("test-ingester-2")) ); } else { assert_eq!(leader_follower_pair.0, "test-ingester-2"); assert_eq!( leader_follower_pair.1, - Some(NodeId::from("test-ingester-1")) + Some(NodeId::from_str("test-ingester-1")) ); } } @@ -1795,19 +1806,19 @@ mod tests { assert_eq!(leader_follower_pairs[0].0, "test-ingester-2"); assert_eq!( leader_follower_pairs[0].1, - Some(NodeId::from("test-ingester-1")) + Some(NodeId::from_str("test-ingester-1")) ); assert_eq!(leader_follower_pairs[1].0, "test-ingester-2"); assert_eq!( leader_follower_pairs[1].1, - Some(NodeId::from("test-ingester-1")) + 
Some(NodeId::from_str("test-ingester-1")) ); assert_eq!(leader_follower_pairs[2].0, "test-ingester-2"); assert_eq!( leader_follower_pairs[2].1, - Some(NodeId::from("test-ingester-1")) + Some(NodeId::from_str("test-ingester-1")) ); let open_shards = vec![ @@ -1838,11 +1849,14 @@ mod tests { assert_eq!(leader_follower_pairs[0].0, "test-ingester-2"); assert_eq!( leader_follower_pairs[0].1, - Some(NodeId::from("test-ingester-1")) + Some(NodeId::from_str("test-ingester-1")) ); - ingester_pool.insert("test-ingester-3".into(), IngesterServiceClient::mocked()); - let unavailable_leaders = FnvHashSet::from_iter([NodeId::from("test-ingester-2")]); + ingester_pool.insert( + NodeId::from_str("test-ingester-3"), + IngesterServiceClient::mocked(), + ); + let unavailable_leaders = FnvHashSet::from_iter([NodeId::from_str("test-ingester-2")]); let leader_follower_pairs = controller .allocate_shards(4, &unavailable_leaders, &model) .unwrap(); @@ -1851,25 +1865,25 @@ mod tests { assert_eq!(leader_follower_pairs[0].0, "test-ingester-3"); assert_eq!( leader_follower_pairs[0].1, - Some(NodeId::from("test-ingester-1")) + Some(NodeId::from_str("test-ingester-1")) ); assert_eq!(leader_follower_pairs[1].0, "test-ingester-3"); assert_eq!( leader_follower_pairs[1].1, - Some(NodeId::from("test-ingester-1")) + Some(NodeId::from_str("test-ingester-1")) ); assert_eq!(leader_follower_pairs[2].0, "test-ingester-3"); assert_eq!( leader_follower_pairs[2].1, - Some(NodeId::from("test-ingester-1")) + Some(NodeId::from_str("test-ingester-1")) ); assert_eq!(leader_follower_pairs[3].0, "test-ingester-3"); assert_eq!( leader_follower_pairs[3].1, - Some(NodeId::from("test-ingester-1")) + Some(NodeId::from_str("test-ingester-1")) ); } @@ -1887,7 +1901,7 @@ mod tests { 1.001, ); - let ingester_id_0 = NodeId::from("test-ingester-0"); + let ingester_id_0 = NodeId::from_str("test-ingester-0"); let mut mock_ingester_0 = MockIngesterService::new(); mock_ingester_0 .expect_init_shards() @@ -1936,7 +1950,7 @@ mod 
tests { let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); ingester_pool.insert(ingester_id_0, ingester_0); - let ingester_id_1 = NodeId::from("test-ingester-1"); + let ingester_id_1 = NodeId::from_str("test-ingester-1"); let mut mock_ingester_1 = MockIngesterService::new(); mock_ingester_1 .expect_init_shards() @@ -1958,7 +1972,7 @@ mod tests { let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1); ingester_pool.insert(ingester_id_1, ingester_1); - let ingester_id_2 = NodeId::from("test-ingester-2"); + let ingester_id_2 = NodeId::from_str("test-ingester-2"); let mut mock_ingester_2 = MockIngesterService::new(); mock_ingester_2.expect_init_shards().never(); @@ -2172,7 +2186,7 @@ mod tests { }); ingester_pool.insert( - NodeId::from("test-ingester-1"), + NodeId::from_str("test-ingester-1"), IngesterServiceClient::from_mock(mock_ingester), ); let source_uids: HashMap = HashMap::from_iter([(source_uid.clone(), 1)]); @@ -2287,7 +2301,7 @@ mod tests { long_term_ingestion_rate: RateMibPerSec(1), }]); let local_shards_update = LocalShardsUpdate { - leader_id: "test-ingester".into(), + leader_id: NodeId::from_str("test-ingester"), source_uid: source_uid.clone(), shard_infos, }; @@ -2344,7 +2358,7 @@ mod tests { )) }); let ingester = IngesterServiceClient::from_mock(mock_ingester); - ingester_pool.insert("test-ingester".into(), ingester); + ingester_pool.insert(NodeId::from_str("test-ingester"), ingester); let shard_infos = BTreeSet::from_iter([ ShardInfo { @@ -2361,7 +2375,7 @@ mod tests { }, ]); let local_shards_update = LocalShardsUpdate { - leader_id: "test-ingester".into(), + leader_id: NodeId::from_str("test-ingester"), source_uid: source_uid.clone(), shard_infos, }; @@ -2386,7 +2400,7 @@ mod tests { }, ]); let local_shards_update = LocalShardsUpdate { - leader_id: "test-ingester".into(), + leader_id: NodeId::from_str("test-ingester"), source_uid: source_uid.clone(), shard_infos, }; @@ -2497,7 +2511,7 @@ mod tests { ); let ingester = 
IngesterServiceClient::from_mock(mock_ingester); - ingester_pool.insert("test-ingester".into(), ingester); + ingester_pool.insert(NodeId::from_str("test-ingester"), ingester); let shard_infos = BTreeSet::from_iter([ShardInfo { shard_id: ShardId::from(1), @@ -2506,7 +2520,7 @@ mod tests { long_term_ingestion_rate: RateMibPerSec(4), }]); let local_shards_update = LocalShardsUpdate { - leader_id: "test-ingester".into(), + leader_id: NodeId::from_str("test-ingester"), source_uid: source_uid.clone(), shard_infos, }; @@ -2643,7 +2657,7 @@ mod tests { Ok(response) }); let ingester = IngesterServiceClient::from_mock(mock_ingester); - ingester_pool.insert("test-ingester".into(), ingester); + ingester_pool.insert(NodeId::from_str("test-ingester"), ingester); // Test failed to open shards. controller @@ -2765,7 +2779,7 @@ mod tests { Ok(response) }); let ingester = IngesterServiceClient::from_mock(mock_ingester); - ingester_pool.insert("test-ingester".into(), ingester); + ingester_pool.insert(NodeId::from_str("test-ingester"), ingester); // Test failed to close shard. 
controller @@ -2995,18 +3009,18 @@ mod tests { Ok(RetainShardsResponse {}) }); ingester_pool.insert( - "node-1".into(), + NodeId::from_str("node-1"), IngesterServiceClient::from_mock(mock_ingester_1), ); ingester_pool.insert( - "node-2".into(), + NodeId::from_str("node-2"), IngesterServiceClient::from_mock(mock_ingester_2), ); ingester_pool.insert( - "node-3".into(), + NodeId::from_str("node-3"), IngesterServiceClient::from_mock(mock_ingester_3), ); - let node_id = "node-1".into(); + let node_id: NodeId = NodeId::from_str("node-1"); let wait_handle = controller.sync_with_ingester(&node_id, &model); wait_handle.wait().await; assert_eq!(count_calls.load(Ordering::Acquire), 1); @@ -3106,7 +3120,7 @@ mod tests { let closed_shards = controller.close_shards(Vec::new()).await; assert_eq!(closed_shards.len(), 0); - let ingester_id_0 = NodeId::from("test-ingester-0"); + let ingester_id_0 = NodeId::from_str("test-ingester-0"); let mut mock_ingester_0 = MockIngesterService::new(); mock_ingester_0 .expect_close_shards() @@ -3136,7 +3150,7 @@ mod tests { let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); ingester_pool.insert(ingester_id_0.clone(), ingester_0); - let ingester_id_1 = NodeId::from("test-ingester-1"); + let ingester_id_1 = NodeId::from_str("test-ingester-1"); let mut mock_ingester_1 = MockIngesterService::new(); mock_ingester_1 .expect_close_shards() @@ -3154,7 +3168,7 @@ mod tests { let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1); ingester_pool.insert(ingester_id_1.clone(), ingester_1); - let ingester_id_2 = NodeId::from("test-ingester-2"); + let ingester_id_2 = NodeId::from_str("test-ingester-2"); let mut mock_ingester_2 = MockIngesterService::new(); mock_ingester_2.expect_close_shards().never(); @@ -3262,7 +3276,7 @@ mod tests { let progress = Progress::default(); let close_shards_task_opt = controller - .rebalance_shards(&mut model, &control_plane_mailbox, &progress) + .rebalance_shards(&mut model, &control_plane_mailbox, 
&progress, false) .await .unwrap(); assert!(close_shards_task_opt.is_none()); @@ -3322,7 +3336,7 @@ mod tests { ]; model.insert_shards(&index_uid, &INGEST_V2_SOURCE_ID.to_string(), open_shards); - let ingester_id_0 = NodeId::from("test-ingester-0"); + let ingester_id_0 = NodeId::from_str("test-ingester-0"); let mut mock_ingester_0 = MockIngesterService::new(); mock_ingester_0 .expect_close_shards() @@ -3343,7 +3357,7 @@ mod tests { let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); ingester_pool.insert(ingester_id_0.clone(), ingester_0); - let ingester_id_1 = NodeId::from("test-ingester-1"); + let ingester_id_1 = NodeId::from_str("test-ingester-1"); let mut mock_ingester_1 = MockIngesterService::new(); mock_ingester_1.expect_init_shards().return_once(|request| { assert_eq!(request.subrequests.len(), 2); @@ -3386,7 +3400,7 @@ mod tests { ingester_pool.insert(ingester_id_1.clone(), ingester_1); let close_shards_task = controller - .rebalance_shards(&mut model, &control_plane_mailbox, &progress) + .rebalance_shards(&mut model, &control_plane_mailbox, &progress, false) .await .unwrap() .unwrap(); @@ -3470,7 +3484,7 @@ mod tests { .map(|i| format!("shard-{i}")) .collect(); for (shard, &shard_count) in shards.into_iter().zip(shard_counts.iter()) { - shard_counts_map.insert(NodeId::from(shard), shard_count); + shard_counts_map.insert(NodeId::from_str(&shard), shard_count); } for i in 0..10 { test_allocate_shards_aux_aux(&shard_counts_map, i, false); @@ -3568,7 +3582,7 @@ mod tests { .collect(); for ingester_id in &active_ids { - ingester_pool.insert(NodeId::from(ingester_id.clone()), ingester_client.clone()); + ingester_pool.insert(NodeId::from_str(ingester_id), ingester_client.clone()); } let inactive_ids: Vec = (0..unavailable_ingester_shards.len()) diff --git a/quickwit/quickwit-control-plane/src/lib.rs b/quickwit/quickwit-control-plane/src/lib.rs index 01072f7de16..0f1bc8275b8 100644 --- a/quickwit/quickwit-control-plane/src/lib.rs +++ 
b/quickwit/quickwit-control-plane/src/lib.rs @@ -16,6 +16,7 @@ pub mod control_plane; pub mod indexing_plan; pub mod indexing_scheduler; pub mod ingest; +pub mod maintenance; pub(crate) mod metrics; pub(crate) mod model; diff --git a/quickwit/quickwit-control-plane/src/maintenance.rs b/quickwit/quickwit-control-plane/src/maintenance.rs new file mode 100644 index 00000000000..0149dd453df --- /dev/null +++ b/quickwit/quickwit-control-plane/src/maintenance.rs @@ -0,0 +1,512 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Maintenance mode management for the Quickwit control plane. +//! +//! When maintenance mode is enabled: +//! - Metadata mutations (index/source CRUD) are allowed but the indexing plan is not rebuilt. +//! - The indexing plan is frozen: it is not rebuilt when indexers join or leave. +//! - Shard scaling (up/down) and rebalancing are paused. +//! - The frozen plan and maintenance metadata are persisted to the metastore `kv` table so they +//! survive control plane restarts. +//! +//! # Persistence +//! +//! The state is persisted in the metastore `kv` table under the +//! [`KV_KEY_MAINTENANCE_STATE`] key. The value is a JSON envelope +//! with some basic metadata and the binary-encoded frozen plan.
+ +use base64::Engine as _; +use prost::Message; +use quickwit_proto::control_plane::{MaintenanceFrozenPlan, MaintenanceFrozenPlanForNode}; +use quickwit_proto::metastore::{ + DeleteKvRequest, GetKvRequest, MetastoreService, MetastoreServiceClient, SetKvRequest, +}; +use serde::{Deserialize, Serialize}; +use time::OffsetDateTime; +use time::format_description::well_known::Rfc3339; +use tracing::info; + +use crate::indexing_plan::PhysicalIndexingPlan; + +/// Key in the metastore `kv` table for the combined maintenance state. +pub const KV_KEY_MAINTENANCE_STATE: &str = "control_plane_maintenance_state"; + +pub const LATEST_MAINTENANCE_FROZEN_PLAN_VERSION: MaintenanceFrozenPlanVersion = + MaintenanceFrozenPlanVersion::V1; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum MaintenanceFrozenPlanVersion { + /// The frozen plan is encoded as protobuf and stored under the + /// "frozen_plan" key as a base64 string. + V1 = 1, +} + +/// Metadata persisted alongside the frozen indexing plan. +/// +/// The `enabled_at` field stores a human-readable RFC 3339 datetime string +/// (e.g., `"2024-06-15T14:30:00Z"`), making it easy to inspect directly in the database. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct MaintenanceModeMetadata { + /// RFC 3339 formatted UTC datetime when maintenance mode was enabled. + enabled_at: String, + /// Encoding version of the persisted frozen plan; selects the (de)serializer. + version: MaintenanceFrozenPlanVersion, +} + +impl MaintenanceModeMetadata { + /// Creates metadata stamped with the current UTC time and the latest frozen plan version. + pub fn new_now() -> Self { + Self { + enabled_at: now_rfc3339(), + version: LATEST_MAINTENANCE_FROZEN_PLAN_VERSION, + } + } +} + +/// In-memory maintenance mode state for the control plane. +#[derive(Debug, Clone, Default)] +pub struct MaintenanceState { + /// If `Some`, maintenance mode is active with the given metadata.
+ metadata: Option, +} + +impl MaintenanceState { + /// Returns `true` if maintenance mode is currently active. + pub fn is_active(&self) -> bool { + self.metadata.is_some() + } + + /// Returns the metadata if maintenance mode is active. + pub fn metadata(&self) -> Option<&MaintenanceModeMetadata> { + self.metadata.as_ref() + } + + /// Returns the RFC 3339 `enabled_at` timestamp if maintenance mode is active. + pub fn enabled_at(&self) -> Option { + self.metadata + .as_ref() + .map(|metadata| metadata.enabled_at.clone()) + } + + /// Enables maintenance mode, replacing any existing metadata with a fresh timestamp. + /// Returns the metadata that was set. + pub fn enable(&mut self) -> MaintenanceModeMetadata { + let metadata = MaintenanceModeMetadata { + enabled_at: now_rfc3339(), + version: LATEST_MAINTENANCE_FROZEN_PLAN_VERSION, + }; + self.metadata = Some(metadata.clone()); + info!( + enabled_at = %metadata.enabled_at, + version = ?metadata.version, + "maintenance mode enabled" + ); + metadata + } + + /// Disables maintenance mode. + /// Returns `true` if it was previously active. + pub fn disable(&mut self) -> bool { + let was_active = self.metadata.is_some(); + self.metadata = None; + if was_active { + info!("maintenance mode disabled"); + } + was_active + } + + /// Loads maintenance state from persisted metadata.
+ pub fn load_from_metadata(&mut self, metadata: MaintenanceModeMetadata) { + info!( + enabled_at = %metadata.enabled_at, + "loaded maintenance mode from persisted state" + ); + self.metadata = Some(metadata); + } +} + +// -- Persistence -- + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct MaintenancePersistedState { + pub metadata: MaintenanceModeMetadata, + pub frozen_plan: PhysicalIndexingPlan, +} + +impl MaintenancePersistedState { + pub fn serialize(&self) -> anyhow::Result { + match self.metadata.version { + MaintenanceFrozenPlanVersion::V1 => self.serialize_v1(), + } + } + + pub fn deserialize(encoded: &str) -> anyhow::Result { + let envelope: serde_json::Value = serde_json::from_str(encoded)?; + let metadata: MaintenanceModeMetadata = + serde_json::from_value(envelope["metadata"].clone())?; + let frozen_plan = match metadata.version { + MaintenanceFrozenPlanVersion::V1 => { + Self::deserialize_v1_frozen_plan(envelope["frozen_plan"].as_str().ok_or_else( + || anyhow::anyhow!("missing frozen_plan field in maintenance state"), + )?)?
+ } + }; + Ok(Self { + metadata, + frozen_plan, + }) + } + + fn deserialize_v1_frozen_plan(encoded: &str) -> anyhow::Result { + let decoded = base64::engine::general_purpose::STANDARD + .decode(encoded) + .map_err(|err| anyhow::anyhow!("failed to base64 decode frozen plan: {err}"))?; + let proto_state = MaintenanceFrozenPlan::decode(&decoded[..]) + .map_err(|err| anyhow::anyhow!("failed to decode protobuf frozen plan: {err}"))?; + + // Collect the indexer node IDs (stored in the proto's `index_id` field) to seed the plan. + let indexer_ids: Vec = proto_state + .state_per_node + .iter() + .map(|node_state| node_state.index_id.clone()) + .collect(); + + let mut plan = PhysicalIndexingPlan::with_indexer_ids(&indexer_ids); + + for node_state in proto_state.state_per_node { + for task in node_state.indexing_tasks { + plan.add_indexing_task(&node_state.index_id, task); + } + } + Ok(plan) + } + + fn serialize_v1(&self) -> anyhow::Result { + let proto_state = self.frozen_plan_to_proto(); + + // Encode the protobuf message to binary + let mut buf = Vec::new(); + prost::Message::encode(&proto_state, &mut buf) + .map_err(|err| anyhow::anyhow!("failed to encode protobuf: {err}"))?; + + // Base64 encode the binary data + let base64_encoded = base64::engine::general_purpose::STANDARD.encode(&buf); + + let json_value = serde_json::json!({ + "frozen_plan": base64_encoded, + "metadata": serde_json::to_value(&self.metadata)?, + }); + Ok(serde_json::to_string(&json_value)?) + } + + /// Converts the frozen plan to its protobuf form; each node ID is written to `index_id`. + fn frozen_plan_to_proto(&self) -> MaintenanceFrozenPlan { + let state_per_node: Vec = self + .frozen_plan + .indexing_tasks_per_indexer() + .iter() + .map(|(node_id, tasks)| MaintenanceFrozenPlanForNode { + index_id: node_id.clone(), + indexing_tasks: tasks.clone(), + }) + .collect(); + + MaintenanceFrozenPlan { state_per_node } + } +} + +/// Persists maintenance state using the metastore's `GetKv`/`SetKv`/`DeleteKv` +/// RPCs to the PostgreSQL `kv` table.
+#[derive(Debug, Clone)] +pub struct MetastoreKvPersistence { + metastore: MetastoreServiceClient, +} + +impl MetastoreKvPersistence { + pub fn new(metastore: MetastoreServiceClient) -> Self { + Self { metastore } + } + + /// Loads the maintenance state from persistent storage. Returns `None` if + /// no maintenance state is persisted. + /// + /// Panics if the state can't be fetched or deserialized. + pub async fn load(&self) -> Option { + let response = self + .metastore + .clone() + .get_kv(GetKvRequest { + key: KV_KEY_MAINTENANCE_STATE.to_string(), + }) + .await + .expect("failed to get maintenance state from metastore"); + let encoded = response.value?; // return None if no value is set + let persisted = MaintenancePersistedState::deserialize(&encoded) + .expect("failed to deserialize maintenance state from metastore"); + Some(persisted) + } + + /// Persists the metadata and frozen plan together, as a single `SetKv` on one key. + pub async fn save( + &self, + metadata: &MaintenanceModeMetadata, + frozen_plan: &PhysicalIndexingPlan, + ) -> anyhow::Result<()> { + let persisted = MaintenancePersistedState { + metadata: metadata.clone(), + frozen_plan: frozen_plan.clone(), + }; + let serialized = persisted.serialize()?; + self.metastore + .clone() + .set_kv(SetKvRequest { + key: KV_KEY_MAINTENANCE_STATE.to_string(), + value: serialized, + }) + .await?; + Ok(()) + } + + /// Deletes the persisted maintenance state entry (metadata and frozen plan). + pub async fn clear(&self) -> anyhow::Result<()> { + self.metastore + .clone() + .delete_kv(DeleteKvRequest { + key: KV_KEY_MAINTENANCE_STATE.to_string(), + }) + .await?; + Ok(()) + } +} + +// -- Helper functions -- + +/// Serializes a `PhysicalIndexingPlan` to a JSON string for use in API responses. +pub fn serialize_frozen_plan(plan: &PhysicalIndexingPlan) -> serde_json::Result { + serde_json::to_string(plan) +} + +/// Returns the current UTC time formatted as an RFC 3339 string.
+fn now_rfc3339() -> String { + OffsetDateTime::now_utc() + .format(&Rfc3339) + .expect("formatting OffsetDateTime as RFC 3339 should never fail") +} + +#[cfg(test)] +mod tests { + use quickwit_proto::metastore::{ + EmptyResponse, GetKvResponse, MetastoreServiceClient, MockMetastoreService, + }; + + use super::*; + + #[test] + fn test_maintenance_state_default_is_inactive() { + let state = MaintenanceState::default(); + assert!(!state.is_active()); + assert!(state.metadata().is_none()); + } + + #[test] + fn test_maintenance_state_enable_disable() { + let mut state = MaintenanceState::default(); + + // Enable + let metadata = state.enable(); + assert!(state.is_active()); + assert!(!metadata.enabled_at.is_empty()); + // Should be a valid RFC 3339 datetime + assert!( + OffsetDateTime::parse(&metadata.enabled_at, &Rfc3339).is_ok(), + "enabled_at should be valid RFC 3339: {}", + metadata.enabled_at + ); + + // Disable + let was_active = state.disable(); + assert!(was_active); + assert!(!state.is_active()); + + // Disable again is a no-op + let was_active = state.disable(); + assert!(!was_active); + } + + #[test] + fn test_current_persisted_state_version_roundtrip() { + let metadata = MaintenanceModeMetadata { + enabled_at: "2024-06-15T14:30:00Z".to_string(), + version: LATEST_MAINTENANCE_FROZEN_PLAN_VERSION, + }; + let plan = PhysicalIndexingPlan::with_indexer_ids(&[ + "indexer-1".to_string(), + "indexer-2".to_string(), + ]); + let state = MaintenancePersistedState { + metadata: metadata.clone(), + frozen_plan: plan.clone(), + }; + let serialized = state + .serialize() + .expect("failed to serialize maintenance state"); + let deserialized: MaintenancePersistedState = + MaintenancePersistedState::deserialize(&serialized).unwrap(); + assert_eq!(deserialized, state); + } + + /// Validates that an existing V1 serialization can still be deserialized. 
+ #[test] + fn test_v1_deserialization_stability() { + let metadata = MaintenanceModeMetadata { + enabled_at: "2024-06-15T14:30:00Z".to_string(), + version: MaintenanceFrozenPlanVersion::V1, + }; + let plan = PhysicalIndexingPlan::with_indexer_ids(&["indexer-1".to_string()]); + let expected_state = MaintenancePersistedState { + metadata: metadata.clone(), + frozen_plan: plan.clone(), + }; + // // this was used to generate the `encoded` string + // println!( + // "{}", + // expected_state + // .serialize() + // .expect("failed to serialize expected state") + // ); + let encoded = r#"{"frozen_plan":"EgsKCWluZGV4ZXItMQ==","metadata":{"enabled_at":"2024-06-15T14:30:00Z","version":"V1"}}"#; + let deserialized = MaintenancePersistedState::deserialize(encoded).unwrap(); + assert_eq!(deserialized, expected_state); + } + + #[tokio::test] + async fn test_metastore_persistence_save_and_load() { + let mut mock_metastore = MockMetastoreService::new(); + + // Initially empty + mock_metastore + .expect_get_kv() + .times(1) + .returning(|_| Ok(GetKvResponse { value: None })); + + // Save + mock_metastore + .expect_set_kv() + .times(1) + .returning(|_| Ok(EmptyResponse {})); + + // Load + let metadata = MaintenanceModeMetadata { + enabled_at: "2024-01-15T10:00:00Z".to_string(), + version: MaintenanceFrozenPlanVersion::V1, + }; + let plan = PhysicalIndexingPlan::with_indexer_ids(&["indexer-1".to_string()]); + let expected_state = MaintenancePersistedState { + metadata: metadata.clone(), + frozen_plan: plan.clone(), + }; + let expected_encoded = expected_state.serialize().unwrap(); + + mock_metastore.expect_get_kv().times(1).returning(move |_| { + Ok(GetKvResponse { + value: Some(expected_encoded.clone()), + }) + }); + + // Clear + mock_metastore + .expect_delete_kv() + .times(1) + .returning(|_| Ok(EmptyResponse {})); + + // One final load to verify cleared + mock_metastore + .expect_get_kv() + .times(1) + .returning(|_| Ok(GetKvResponse { value: None })); + + let
metastore_client = MetastoreServiceClient::from_mock(mock_metastore); + let persistence = MetastoreKvPersistence::new(metastore_client); + + // Initially empty + let loaded = persistence.load().await; + assert!(loaded.is_none()); + + // Save + persistence.save(&metadata, &plan).await.unwrap(); + + // Load + let loaded = persistence.load().await.unwrap(); + assert_eq!(loaded.metadata, metadata); + assert_eq!(loaded.frozen_plan, plan); + + // Clear + persistence.clear().await.unwrap(); + let loaded = persistence.load().await; + assert!(loaded.is_none()); + } + + #[tokio::test] + async fn test_metastore_persistence_overwrite() { + let mut mock_metastore = MockMetastoreService::new(); + + let metadata1 = MaintenanceModeMetadata { + enabled_at: "2024-01-01T00:00:00Z".to_string(), + version: MaintenanceFrozenPlanVersion::V1, + }; + let plan1 = PhysicalIndexingPlan::with_indexer_ids(&["a".to_string()]); + + let metadata2 = MaintenanceModeMetadata { + enabled_at: "2024-06-01T12:00:00Z".to_string(), + version: MaintenanceFrozenPlanVersion::V1, + }; + let plan2 = PhysicalIndexingPlan::with_indexer_ids(&["b".to_string()]); + + // First save + mock_metastore + .expect_set_kv() + .times(1) + .returning(|_| Ok(EmptyResponse {})); + + // Second save (overwrite) + mock_metastore + .expect_set_kv() + .times(1) + .returning(|_| Ok(EmptyResponse {})); + + // Load - return the second state + let expected_state2 = MaintenancePersistedState { + metadata: metadata2.clone(), + frozen_plan: plan2.clone(), + }; + let expected_encoded2 = expected_state2.serialize().unwrap(); + + mock_metastore.expect_get_kv().times(1).returning(move |_| { + Ok(GetKvResponse { + value: Some(expected_encoded2.clone()), + }) + }); + + let metastore_client = MetastoreServiceClient::from_mock(mock_metastore); + let persistence = MetastoreKvPersistence::new(metastore_client); + + persistence.save(&metadata1, &plan1).await.unwrap(); + persistence.save(&metadata2, &plan2).await.unwrap(); + + let loaded = 
persistence.load().await.unwrap(); + assert_eq!(loaded.metadata, metadata2); + assert_eq!(loaded.frozen_plan, plan2); + } +} diff --git a/quickwit/quickwit-control-plane/src/metrics.rs b/quickwit/quickwit-control-plane/src/metrics.rs index 5e534c4f176..c3370d9b3a7 100644 --- a/quickwit/quickwit-control-plane/src/metrics.rs +++ b/quickwit/quickwit-control-plane/src/metrics.rs @@ -42,6 +42,9 @@ pub struct ControlPlaneMetrics { // Indexing plan metrics. pub local_shards: IntGauge, pub remote_shards: IntGauge, + + // Maintenance mode. + pub maintenance_mode: IntGauge, } impl ControlPlaneMetrics { @@ -128,6 +131,12 @@ impl Default for ControlPlaneMetrics { ), local_shards, remote_shards, + maintenance_mode: new_gauge( + "maintenance_mode", + "Whether the control plane is in maintenance mode (1 = enabled, 0 = disabled).", + "control_plane", + &[], + ), } } } diff --git a/quickwit/quickwit-control-plane/src/model/shard_table.rs b/quickwit/quickwit-control-plane/src/model/shard_table.rs index 0377d553ac6..a30eb113658 100644 --- a/quickwit/quickwit-control-plane/src/model/shard_table.rs +++ b/quickwit/quickwit-control-plane/src/model/shard_table.rs @@ -443,7 +443,8 @@ impl ShardTable { .shard_entries .values() .filter(|shard_entry| { - shard_entry.shard.is_open() && !unavailable_leaders.contains(&shard_entry.leader_id) + shard_entry.shard.is_open() + && !unavailable_leaders.contains(shard_entry.leader_id.as_str()) }) .cloned() .collect(); @@ -847,7 +848,7 @@ mod tests { assert_eq!(open_shards[0].shard, shard_03); assert_eq!(open_shards[1].shard, shard_04); - unavailable_ingesters.insert("test-leader-0".into()); + unavailable_ingesters.insert(NodeId::from_str("test-leader-0")); let open_shards = shard_table .find_open_shards_sorted(&index_uid, &source_id, &unavailable_ingesters) @@ -1240,8 +1241,8 @@ mod tests { let shard1 = ShardId::from("shard1"); let shard2 = ShardId::from("shard1"); let unlisted_shard = ShardId::from("unlisted"); - let node1 = 
NodeId::new("node1".to_string()); - let node2 = NodeId::new("node2".to_string()); + let node1 = NodeId::from_str("node1"); + let node2 = NodeId::from_str("node2"); let mut shard_locations = ShardLocations::default(); shard_locations.add_location(&shard1, &node1); shard_locations.add_location(&shard1, &node2); @@ -1348,19 +1349,19 @@ mod tests { }; assert_eq!( &get_sorted_locations_for_shard(0u64), - &[&NodeId::from("indexer1"), &NodeId::from("indexer2")] + &[&NodeId::from_str("indexer1"), &NodeId::from_str("indexer2")] ); assert_eq!( &get_sorted_locations_for_shard(1u64), - &[&NodeId::from("indexer1")] + &[&NodeId::from_str("indexer1")] ); assert_eq!( &get_sorted_locations_for_shard(2u64), - &[&NodeId::from("indexer2")] + &[&NodeId::from_str("indexer2")] ); assert_eq!( &get_sorted_locations_for_shard(3u64), - &[&NodeId::from("indexer1"), &NodeId::from("indexer2")] + &[&NodeId::from_str("indexer1"), &NodeId::from_str("indexer2")] ); } } diff --git a/quickwit/quickwit-control-plane/src/tests.rs b/quickwit/quickwit-control-plane/src/tests.rs index 9f0cd97b477..73783c9f4b2 100644 --- a/quickwit/quickwit-control-plane/src/tests.rs +++ b/quickwit/quickwit-control-plane/src/tests.rs @@ -18,10 +18,11 @@ use std::time::Duration; use fnv::FnvHashMap; use futures::{Stream, StreamExt}; use quickwit_actors::{Inbox, Mailbox, Observe, Universe}; -use quickwit_cluster::{ChannelTransport, Cluster, ClusterChange, create_cluster_for_test}; +use quickwit_cluster::{ + ChannelTransport, Cluster, ClusterChange, ClusterMember, create_cluster_for_test, +}; use quickwit_common::test_utils::wait_until_predicate; use quickwit_common::tower::{Change, Pool}; -use quickwit_config::service::QuickwitService; use quickwit_config::{ ClusterConfig, KafkaSourceParams, SourceConfig, SourceInputFormat, SourceParams, }; @@ -29,7 +30,8 @@ use quickwit_indexing::IndexingService; use quickwit_metastore::{IndexMetadata, ListIndexesMetadataResponseExt}; use quickwit_proto::indexing::{ApplyIndexingPlanRequest, 
CpuCapacity, IndexingServiceClient}; use quickwit_proto::metastore::{ - ListIndexesMetadataResponse, ListShardsResponse, MetastoreServiceClient, MockMetastoreService, + GetKvResponse, ListIndexesMetadataResponse, ListShardsResponse, MetastoreServiceClient, + MockMetastoreService, }; use quickwit_proto::types::NodeId; use serde_json::json; @@ -70,12 +72,10 @@ pub fn test_indexer_change_stream( let indexing_clients = indexing_clients.clone(); Box::pin(async move { match cluster_change { - ClusterChange::Add(node) - if node.enabled_services().contains(&QuickwitService::Indexer) => - { - let node_id = node.node_id().to_owned(); + ClusterChange::Add(node) if node.is_indexer() => { + let node_id = node.node_id.clone(); let generation_id = node.chitchat_id().generation_id; - let indexing_tasks = node.indexing_tasks().to_vec(); + let indexing_tasks = node.indexing_tasks.to_vec(); let client_mailbox = indexing_clients.get(&node_id).unwrap().clone(); let client = IndexingServiceClient::from_mailbox(client_mailbox); let change = Change::Insert( @@ -90,7 +90,7 @@ pub fn test_indexer_change_stream( ); Some(change) } - ClusterChange::Remove(node) => Some(Change::Remove(node.node_id().to_owned())), + ClusterChange::Remove(node) => Some(Change::Remove(node.node_id.clone())), _ => None, } }) @@ -121,6 +121,9 @@ async fn start_control_plane( subresponses: Vec::new(), }) }); + mock_metastore + .expect_get_kv() + .returning(|_| Ok(GetKvResponse { value: None })); let mut indexer_inboxes = Vec::new(); let indexer_pool = Pool::default(); @@ -129,7 +132,7 @@ async fn start_control_plane( for indexer in indexers { let (indexing_service_mailbox, indexing_service_inbox) = universe.create_test_mailbox(); - indexing_clients.insert(indexer.self_node_id().to_owned(), indexing_service_mailbox); + indexing_clients.insert(indexer.self_node_id(), indexing_service_mailbox); indexer_inboxes.push(indexing_service_inbox); } let indexer_change_stream = @@ -139,7 +142,7 @@ async fn start_control_plane( 
let mut cluster_config = ClusterConfig::for_test(); cluster_config.cluster_id = cluster.cluster_id().to_string(); - let self_node_id = cluster.self_node_id().to_owned(); + let self_node_id = cluster.self_node_id(); let (control_plane_mailbox, _control_plane_handle, _is_ready_rx) = ControlPlane::spawn( universe, cluster_config, @@ -178,7 +181,7 @@ async fn test_scheduler_scheduling_and_control_loop_apply_plan_again() { indexing_service_inbox.drain_for_test_typed::(); assert_eq!(scheduler_state.num_applied_physical_indexing_plan, 1); assert_eq!(scheduler_state.num_schedule_indexing_plan, 1); - assert!(scheduler_state.last_applied_physical_plan.is_some()); + assert!(scheduler_state.current_targeted_physical_plan.is_some()); assert_eq!(indexing_service_inbox_messages.len(), 1); // After a CONTROL_PLAN_LOOP_INTERVAL, the control loop will check if the desired plan is @@ -266,7 +269,7 @@ async fn test_scheduler_scheduling_no_indexer() { .indexing_scheduler; assert_eq!(scheduler_state.num_applied_physical_indexing_plan, 0); assert_eq!(scheduler_state.num_schedule_indexing_plan, 0); - assert!(scheduler_state.last_applied_physical_plan.is_none()); + assert!(scheduler_state.current_targeted_physical_plan.is_none()); // There is no indexer, we should observe no // scheduling. 
@@ -278,7 +281,7 @@ async fn test_scheduler_scheduling_no_indexer() { .indexing_scheduler; assert_eq!(scheduler_state.num_applied_physical_indexing_plan, 0); assert_eq!(scheduler_state.num_schedule_indexing_plan, 0); - assert!(scheduler_state.last_applied_physical_plan.is_none()); + assert!(scheduler_state.current_targeted_physical_plan.is_none()); universe.assert_quit().await; } @@ -324,16 +327,12 @@ async fn test_scheduler_scheduling_multiple_indexers() { indexing_service_inbox_1.drain_for_test_typed::(); assert_eq!(scheduler_state.num_applied_physical_indexing_plan, 0); assert_eq!(scheduler_state.num_schedule_indexing_plan, 0); - assert!(scheduler_state.last_applied_physical_plan.is_none()); + assert!(scheduler_state.current_targeted_physical_plan.is_none()); assert_eq!(indexing_service_inbox_messages.len(), 0); cluster .wait_for_ready_members( - |members| { - members - .iter() - .any(|member| member.enabled_services.contains(&QuickwitService::Indexer)) - }, + |members| members.iter().any(ClusterMember::is_indexer), Duration::from_secs(5), ) .await @@ -391,13 +390,7 @@ async fn test_scheduler_scheduling_multiple_indexers() { cluster .wait_for_ready_members( - |members| { - members - .iter() - .filter(|member| member.enabled_services.contains(&QuickwitService::Indexer)) - .count() - == 1 - }, + |members| members.iter().filter(|m| m.is_indexer()).count() == 1, Duration::from_secs(5), ) .await diff --git a/quickwit/quickwit-datetime/src/java_date_time_format.rs b/quickwit/quickwit-datetime/src/java_date_time_format.rs index 2ef63f32881..bb224d94cfd 100644 --- a/quickwit/quickwit-datetime/src/java_date_time_format.rs +++ b/quickwit/quickwit-datetime/src/java_date_time_format.rs @@ -18,8 +18,8 @@ use std::sync::OnceLock; use time::error::{Format, TryFromParsed}; use time::format_description::modifier::{ - Day, Hour, Minute, Month as MonthModifier, Padding, Second, Subsecond, SubsecondDigits, - WeekNumber, WeekNumberRepr, Weekday, WeekdayRepr, Year, YearRepr, + Day, 
Hour, Minute, Month as MonthModifier, OffsetHour, OffsetMinute, Padding, Second, + Subsecond, SubsecondDigits, WeekNumber, WeekNumberRepr, Weekday, WeekdayRepr, Year, YearRepr, }; use time::format_description::{Component, OwnedFormatItem}; use time::parsing::Parsed; @@ -38,6 +38,7 @@ const JAVA_DATE_FORMAT_TOKENS: &[&str] = &[ "SSSS", "SSS", "SS", + "XXXXX", // ISO 8601 offset with colon or 'Z' "ZZ", "ww", "w[w]", @@ -99,11 +100,43 @@ fn build_zone_offset(_: &str) -> Option { .into_boxed_slice(); let offset_compound = OwnedFormatItem::Compound(offset_items); + // Offset in '+/-HH' format (abbreviated, hour only) + let offset_hour_only = OwnedFormatItem::Component(Component::OffsetHour(Default::default())); + Some(OwnedFormatItem::First( - vec![z_literal, offset_with_delimiter_compound, offset_compound].into_boxed_slice(), + vec![ + z_literal, + offset_with_delimiter_compound, + offset_compound, + offset_hour_only, + ] + .into_boxed_slice(), )) } +/// Build the ES specific formatting: always outputs '+HH:MM' format +/// +/// NOTE: Unfortunately we cannot have a conditional forwarding that replaces +/// +00:00 with 'Z' for UTC as ES does. We perform this replacement after the +/// formatting. 
+fn build_iso8601_zone_offset_for_formatting() -> Option { + // Configure OffsetHour to always show the sign + let mut offset_hour_mod = OffsetHour::default(); + offset_hour_mod.sign_is_mandatory = true; + offset_hour_mod.padding = Padding::Zero; + + let mut offset_minute_mod = OffsetMinute::default(); + offset_minute_mod.padding = Padding::Zero; + + let offset_with_delimiter_items: Box<[OwnedFormatItem]> = vec![ + OwnedFormatItem::Component(Component::OffsetHour(offset_hour_mod)), + OwnedFormatItem::Literal(Box::from(b":".as_ref())), + OwnedFormatItem::Component(Component::OffsetMinute(offset_minute_mod)), + ] + .into_boxed_slice(); + Some(OwnedFormatItem::Compound(offset_with_delimiter_items)) +} + // There is a `YearRepr::LastTwo` representation in the time crate, but the parser is unreliable, so // we only support `YearRepr::Full` for now. See also https://github.com/time-rs/time/issues/649. const fn year_item() -> Option { @@ -159,14 +192,37 @@ fn build_second_item(ptn: &str) -> Option { Some(OwnedFormatItem::Component(Component::Second(second))) } -fn build_fraction_of_second_item(_ptn: &str) -> Option { +fn build_fraction_of_second_item_for_parsing() -> Option { let mut subsecond: Subsecond = Default::default(); + // For parsing, use OneOrMore to accept variable precision subsecond.digits = SubsecondDigits::OneOrMore; Some(OwnedFormatItem::Component(Component::Subsecond(subsecond))) } +// Build fractional seconds with fixed precision based on pattern length +fn build_fraction_of_second_item_for_formatting(ptn: &str) -> Option { + use time::format_description::modifier::SubsecondDigits; + + let mut subsecond: Subsecond = Default::default(); + // Use pattern length to determine fixed precision for formatting + subsecond.digits = match ptn.len() { + 1 => SubsecondDigits::One, + 2 => SubsecondDigits::Two, + 3 => SubsecondDigits::Three, + 4 => SubsecondDigits::Four, + 5 => SubsecondDigits::Five, + 6 => SubsecondDigits::Six, + 7 => SubsecondDigits::Seven, + 8 => 
SubsecondDigits::Eight, + 9 => SubsecondDigits::Nine, + _ => SubsecondDigits::OneOrMore, + }; + Some(OwnedFormatItem::Component(Component::Subsecond(subsecond))) +} + fn parse_java_datetime_format_items_recursive( chars: &mut std::iter::Peekable, + for_formatting: bool, ) -> Result, String> { let mut items = Vec::new(); @@ -174,7 +230,8 @@ fn parse_java_datetime_format_items_recursive( match c { '[' => { chars.next(); - let optional_items = parse_java_datetime_format_items_recursive(chars)?; + let optional_items = + parse_java_datetime_format_items_recursive(chars, for_formatting)?; items.push(OwnedFormatItem::Optional(Box::new( OwnedFormatItem::Compound(optional_items.into_boxed_slice()), ))); @@ -198,7 +255,7 @@ fn parse_java_datetime_format_items_recursive( items.push(literal(literal_str.as_bytes())); } _ => { - if let Some(format_item) = match_java_date_format_token(chars)? { + if let Some(format_item) = match_java_date_format_token(chars, for_formatting)? { items.push(format_item); } else { // Treat as a literal character @@ -216,6 +273,7 @@ fn parse_java_datetime_format_items_recursive( // here https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-date-format.html fn match_java_date_format_token( chars: &mut std::iter::Peekable, + for_formatting: bool, ) -> Result, String> { if chars.peek().is_none() { return Ok(None); @@ -238,7 +296,18 @@ fn match_java_date_format_token( "mm" | "m" => build_minute_item(token), "ss" | "s" => build_second_item(token), "SSSSSSSSS" | "SSSSSSS" | "SSSSSS" | "SSSSS" | "SSSS" | "SSS" | "SS" | "S" => { - build_fraction_of_second_item(token) + if for_formatting { + build_fraction_of_second_item_for_formatting(token) + } else { + build_fraction_of_second_item_for_parsing() + } + } + "XXXXX" => { + if for_formatting { + build_iso8601_zone_offset_for_formatting() + } else { + return Err("XXXXX pattern is only supported for formatting.".to_string()); + } } "Z" => build_zone_offset(token), "ww" | "w[w]" | "w" => 
build_week_of_year_item(token), @@ -256,19 +325,22 @@ fn match_java_date_format_token( // Java date format it is mapped to, if any. // If the java_datetime_format is not an alias, it is expected to be a // java date time format and should be returned as is. -fn resolve_java_datetime_format_alias(java_datetime_format: &str) -> &str { +fn resolve_java_datetime_format_alias_for_parsing(java_datetime_format: &str) -> &str { static JAVA_DATE_FORMAT_ALIASES: OnceLock> = OnceLock::new(); let java_datetime_format_map = JAVA_DATE_FORMAT_ALIASES.get_or_init(|| { let mut m = HashMap::new(); - m.insert("date_optional_time", "yyyy-MM-dd['T'HH:mm:ss.SSSZ]"); + m.insert( + "date_optional_time", + "yyyy[-MM[-dd['T'HH[Z][:mm[Z][:ss[.SSS][Z]]]]]]", + ); m.insert( "strict_date_optional_time", - "yyyy[-MM[-dd['T'HH[:mm[:ss[.SSS[Z]]]]]]]", + "yyyy[-MM[-dd['T'HH[Z][:mm[Z][:ss[.SSS][Z]]]]]]", ); m.insert( "strict_date_optional_time_nanos", - "yyyy[-MM[-dd['T'HH:mm:ss.SSSSSSZ]]]", + "yyyy[-MM[-dd['T'HH[Z][:mm[Z][:ss[.SSSSSS][Z]]]]]]", ); m.insert("basic_date", "yyyyMMdd"); @@ -294,18 +366,69 @@ fn resolve_java_datetime_format_alias(java_datetime_format: &str) -> &str { .unwrap_or(java_datetime_format) } +fn resolve_java_datetime_format_alias_for_formatting(java_datetime_format: &str) -> &str { + static JAVA_DATE_FORMAT_ALIASES_FORMATTING: OnceLock> = + OnceLock::new(); + let java_datetime_format_map = JAVA_DATE_FORMAT_ALIASES_FORMATTING.get_or_init(|| { + let mut m = HashMap::new(); + // For strict_date_optional_time, format with full date-time and milliseconds + m.insert( + "strict_date_optional_time", + "yyyy-MM-dd'T'HH:mm:ss.SSSXXXXX", + ); + // For strict_date_optional_time_nanos, format with full date-time and nanoseconds + m.insert( + "strict_date_optional_time_nanos", + "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSSXXXXX", + ); + // date_optional_time uses the same format as strict variant + m.insert("date_optional_time", "yyyy-MM-dd'T'HH:mm:ss.SSSXXXXX"); + // Other formats that don't have 
complex optional structures can use their parse patterns + m.insert("basic_date", "yyyyMMdd"); + m.insert("strict_basic_week_date", "xxxx'W'wwe"); + m.insert("basic_week_date", "xxxx'W'wwe"); + m.insert( + "strict_basic_week_date_time", + "xxxx'W'wwe'T'HHmmss.SSSXXXXX", + ); + m.insert("basic_week_date_time", "xxxx'W'wwe'T'HHmmss.SSSXXXXX"); + m.insert( + "strict_basic_week_date_time_no_millis", + "xxxx'W'wwe'T'HHmmssXXXXX", + ); + m.insert("basic_week_date_time_no_millis", "xxxx'W'wwe'T'HHmmssXXXXX"); + m.insert("strict_week_date", "xxxx-'W'ww-e"); + m.insert("week_date", "xxxx-'W'ww-e"); + m + }); + java_datetime_format_map + .get(java_datetime_format) + .copied() + .unwrap_or(java_datetime_format) +} + /// A date time parser that holds the format specification `Vec`. #[derive(Clone)] pub struct StrptimeParser { pub(crate) strptime_format: String, items: Box<[OwnedFormatItem]>, + format_items: Box<[OwnedFormatItem]>, } pub fn parse_java_datetime_format_items( java_datetime_format: &str, ) -> Result, String> { let mut chars = java_datetime_format.chars().peekable(); - let items = parse_java_datetime_format_items_recursive(&mut chars)?; + let items = parse_java_datetime_format_items_recursive(&mut chars, false)?; + Ok(items.into_boxed_slice()) +} + +// Parse format items with fixed precision for formatting output +fn parse_java_datetime_format_items_for_formatting( + java_datetime_format: &str, +) -> Result, String> { + let mut chars = java_datetime_format.chars().peekable(); + let items = parse_java_datetime_format_items_recursive(&mut chars, true)?; Ok(items.into_boxed_slice()) } @@ -323,9 +446,21 @@ impl StrptimeParser { date_time_str: &str, default_offset: UtcOffset, ) -> Result { + // Elasticsearch/OpenSearch support comma as a decimal separator for fractional seconds + // (in addition to period). The time crate only supports period, so we normalize + // comma to period before parsing. 
This is safe because comma doesn't appear in + // other parts of the format. + let normalized_date_time_str; + let date_time_str_to_parse = if date_time_str.contains(',') { + normalized_date_time_str = date_time_str.replace(',', "."); + &normalized_date_time_str + } else { + date_time_str + }; + let mut parsed = Parsed::new(); if !parsed - .parse_items(date_time_str.as_bytes(), &self.items) + .parse_items(date_time_str_to_parse.as_bytes(), &self.items) .map_err(|err| err.to_string())? .is_empty() { @@ -372,7 +507,15 @@ impl StrptimeParser { } pub fn format_date_time(&self, date_time: &OffsetDateTime) -> Result { - date_time.format(&self.items) + let mut formatted = date_time.format(&self.format_items)?; + // For ES/Java ISO 8601 compatibility: replace '+00:00' with 'Z' for UTC. + // The time crate doesn't support conditional 'Z' in format items, so we handle it manually + // here. Since the offset is always a suffix, we can efficiently truncate and append. + if date_time.offset() == UtcOffset::UTC && formatted.ends_with("+00:00") { + formatted.truncate(formatted.len() - 6); + formatted.push('Z'); + } + Ok(formatted) } pub fn from_strptime(strptime_format: &str) -> Result { @@ -382,21 +525,43 @@ impl StrptimeParser { .map(|item| item.into()) .collect::>() .into_boxed_slice(); - Ok(StrptimeParser::new(strptime_format.to_string(), items)) + // For strptime, use the same items for both parsing and formatting + let format_items = items.clone(); + Ok(StrptimeParser::new( + strptime_format.to_string(), + items, + format_items, + )) } pub fn from_java_datetime_format(java_datetime_format: &str) -> Result { let java_datetime_format_resolved = - resolve_java_datetime_format_alias(java_datetime_format); + resolve_java_datetime_format_alias_for_parsing(java_datetime_format); let items: Box<[OwnedFormatItem]> = parse_java_datetime_format_items(java_datetime_format_resolved)?; - Ok(StrptimeParser::new(java_datetime_format.to_string(), items)) + + // Get format-specific pattern 
and create format items + let java_datetime_format_for_formatting = + resolve_java_datetime_format_alias_for_formatting(java_datetime_format); + let format_items: Box<[OwnedFormatItem]> = + parse_java_datetime_format_items_for_formatting(java_datetime_format_for_formatting)?; + + Ok(StrptimeParser::new( + java_datetime_format.to_string(), + items, + format_items, + )) } - fn new(strptime_format: String, items: Box<[OwnedFormatItem]>) -> Self { + fn new( + strptime_format: String, + items: Box<[OwnedFormatItem]>, + format_items: Box<[OwnedFormatItem]>, + ) -> Self { StrptimeParser { strptime_format, items, + format_items, } } } @@ -660,6 +825,7 @@ mod tests { "2019-03-23T21:35:46.123+00:00", "2019-03-23T21:36:46.123+03:00", "2019-03-23T21:37:46.123+0300", + "2019-03-23T21:38:46+00:00", ]; let expected = [ datetime!(2019-01-01 00:00:00 UTC), @@ -671,6 +837,7 @@ mod tests { datetime!(2019-03-23 21:35:46.123 UTC), datetime!(2019-03-23 21:36:46.123 +03:00:00), datetime!(2019-03-23 21:37:46.123 +03:00:00), + datetime!(2019-03-23 21:38:46 UTC), ]; for (date_str, &expected_dt) in dates.iter().zip(expected.iter()) { let parsed_dt = parser @@ -692,6 +859,7 @@ mod tests { "2019-03-23T21:35:46.123456789+00:00", "2019-03-23T21:36:46.123456789+03:00", "2019-03-23T21:37:46.123456789+0300", + "2019-03-23T21:38:46+00:00", ]; let expected = [ datetime!(2019-01-01 00:00:00 UTC), @@ -701,6 +869,7 @@ mod tests { datetime!(2019-03-23 21:35:46.123456789 UTC), datetime!(2019-03-23 21:36:46.123456789 +03:00:00), datetime!(2019-03-23 21:37:46.123456789 +03:00:00), + datetime!(2019-03-23 21:38:46 UTC), ]; for (date_str, &expected_dt) in dates.iter().zip(expected.iter()) { let parsed_dt = parser @@ -768,3 +937,879 @@ mod tests { ); } } + +/// Tests ported from Elasticsearch's `DateFormattersTests.java` to ensure maximum +/// compatibility with their date parsing behavior. 
+#[cfg(test)] +mod tests_parsing_ported_from_es { + use time::macros::datetime; + + use super::*; + + #[test] + fn test_strict_date_optional_time_comprehensive() { + let parser = + StrptimeParser::from_java_datetime_format("strict_date_optional_time").unwrap(); + + // Comprehensive test cases from Elasticsearch + let test_cases = [ + // Date only + ("2018-12-31", datetime!(2018-12-31 00:00:00 UTC)), + // Date with time + ("2010-01-05T02:00", datetime!(2010-01-05 02:00:00 UTC)), + ("2018-12-31T10:15:30", datetime!(2018-12-31 10:15:30 UTC)), + // With UTC timezone + ("2018-12-31T10:15:30Z", datetime!(2018-12-31 10:15:30 UTC)), + ("2015-01-04T00:00Z", datetime!(2015-01-04 00:00:00 UTC)), + // With numeric timezones (compact) + ("2016-11-30T00+01", datetime!(2016-11-30 00:00:00 +01:00:00)), + ( + "2016-11-30T00+0100", + datetime!(2016-11-30 00:00:00 +01:00:00), + ), + ( + "2018-12-31T10:15:30+0100", + datetime!(2018-12-31 10:15:30 +01:00:00), + ), + // With numeric timezones (colon-separated) + ( + "2016-11-30T00+01:00", + datetime!(2016-11-30 00:00:00 +01:00:00), + ), + ( + "2018-12-31T10:15:30+01:00", + datetime!(2018-12-31 10:15:30 +01:00:00), + ), + // With milliseconds + ( + "2018-12-31T10:15:30.1Z", + datetime!(2018-12-31 10:15:30.1 UTC), + ), + ( + "2018-12-31T10:15:30.123Z", + datetime!(2018-12-31 10:15:30.123 UTC), + ), + ( + "2018-12-31T10:15:30.1+0100", + datetime!(2018-12-31 10:15:30.1 +01:00:00), + ), + ( + "2018-12-31T10:15:30.123+0100", + datetime!(2018-12-31 10:15:30.123 +01:00:00), + ), + ( + "2018-12-31T10:15:30.123+01:00", + datetime!(2018-12-31 10:15:30.123 +01:00:00), + ), + // Partial dates + ("2001", datetime!(2001-01-01 00:00:00 UTC)), + ("2001-01", datetime!(2001-01-01 00:00:00 UTC)), + ("2001-01-01", datetime!(2001-01-01 00:00:00 UTC)), + ]; + + for (input, expected) in test_cases.iter() { + let parsed = parser + .parse_date_time(input) + .unwrap_or_else(|err| panic!("failed to parse {input}: {err}")); + assert_eq!(parsed, *expected, "mismatch 
for input: {input}"); + } + } + + #[test] + fn test_strict_date_optional_time_nanos_comprehensive() { + let parser = + StrptimeParser::from_java_datetime_format("strict_date_optional_time_nanos").unwrap(); + + // Comprehensive test cases with nanosecond precision + let test_cases = [ + // From Elasticsearch test suite + ( + "2016-01-01T00:00:00.000", + datetime!(2016-01-01 00:00:00.0 UTC), + ), + ("2018-05-15T17:14:56", datetime!(2018-05-15 17:14:56 UTC)), + ("2018-05-15T17:14:56Z", datetime!(2018-05-15 17:14:56 UTC)), + ( + "2018-05-15T17:14:56+0100", + datetime!(2018-05-15 17:14:56 +01:00:00), + ), + ( + "2018-05-15T17:14:56+01:00", + datetime!(2018-05-15 17:14:56 +01:00:00), + ), + ( + "2022-12-16T10:00:57.149001+00:00", + datetime!(2022-12-16 10:00:57.149001 UTC), + ), + ( + "2018-05-15T17:14:56.123456789+0100", + datetime!(2018-05-15 17:14:56.123456789 +01:00:00), + ), + ( + "2018-05-15T17:14:56.123456789+01:00", + datetime!(2018-05-15 17:14:56.123456789 +01:00:00), + ), + // Fractional second precision tests (1-9 digits) + ( + "2019-05-06T14:52:37.1Z", + datetime!(2019-05-06 14:52:37.1 UTC), + ), + ( + "2019-05-06T14:52:37.12Z", + datetime!(2019-05-06 14:52:37.12 UTC), + ), + ( + "2019-05-06T14:52:37.123Z", + datetime!(2019-05-06 14:52:37.123 UTC), + ), + ( + "2019-05-06T14:52:37.1234Z", + datetime!(2019-05-06 14:52:37.1234 UTC), + ), + ( + "2019-05-06T14:52:37.12345Z", + datetime!(2019-05-06 14:52:37.12345 UTC), + ), + ( + "2019-05-06T14:52:37.123456Z", + datetime!(2019-05-06 14:52:37.123456 UTC), + ), + ( + "2019-05-06T14:52:37.1234567Z", + datetime!(2019-05-06 14:52:37.1234567 UTC), + ), + ( + "2019-05-06T14:52:37.12345678Z", + datetime!(2019-05-06 14:52:37.12345678 UTC), + ), + ( + "2019-05-06T14:52:37.123456789Z", + datetime!(2019-05-06 14:52:37.123456789 UTC), + ), + // Edge case: 1 nanosecond + ( + "1970-01-01T00:00:00.000000001", + datetime!(1970-01-01 00:00:00.000000001 UTC), + ), + ]; + + for (input, expected) in test_cases.iter() { + let parsed = 
parser + .parse_date_time(input) + .unwrap_or_else(|err| panic!("failed to parse {input}: {err}")); + assert_eq!(parsed, *expected, "mismatch for input: {input}"); + } + } + + // Additional comprehensive tests including time variations + #[test] + fn test_strict_date_optional_time_variations() { + let parser = + StrptimeParser::from_java_datetime_format("strict_date_optional_time").unwrap(); + + let test_cases = [ + // Time without timezone + ("2016-11-30T12", datetime!(2016-11-30 12:00:00 UTC)), + ("2016-11-30T12:00", datetime!(2016-11-30 12:00:00 UTC)), + ("2016-11-30T12:00:00", datetime!(2016-11-30 12:00:00 UTC)), + ( + "2016-11-30T12:00:00.000", + datetime!(2016-11-30 12:00:00.0 UTC), + ), + // Hour with timezone (abbreviated formats) + ("2016-11-30T12+01", datetime!(2016-11-30 12:00:00 +01:00:00)), + ( + "2016-11-30T12+0100", + datetime!(2016-11-30 12:00:00 +01:00:00), + ), + ( + "2016-11-30T12+01:00", + datetime!(2016-11-30 12:00:00 +01:00:00), + ), + // Hour:minute with timezone + ( + "2016-11-30T12:00+01", + datetime!(2016-11-30 12:00:00 +01:00:00), + ), + ( + "2016-11-30T12:00+0100", + datetime!(2016-11-30 12:00:00 +01:00:00), + ), + ( + "2016-11-30T12:00+01:00", + datetime!(2016-11-30 12:00:00 +01:00:00), + ), + // Full time with timezone + ( + "2016-11-30T12:00:00+01", + datetime!(2016-11-30 12:00:00 +01:00:00), + ), + ( + "2016-11-30T12:00:00+0100", + datetime!(2016-11-30 12:00:00 +01:00:00), + ), + ( + "2016-11-30T12:00:00+01:00", + datetime!(2016-11-30 12:00:00 +01:00:00), + ), + // Milliseconds with timezone + ( + "2016-11-30T12:00:00.000+01", + datetime!(2016-11-30 12:00:00.0 +01:00:00), + ), + ( + "2016-11-30T12:00:00.000+0100", + datetime!(2016-11-30 12:00:00.0 +01:00:00), + ), + ( + "2016-11-30T12:00:00.000+01:00", + datetime!(2016-11-30 12:00:00.0 +01:00:00), + ), + ]; + + for (input, expected) in test_cases.iter() { + let parsed = parser + .parse_date_time(input) + .unwrap_or_else(|err| panic!("failed to parse {input}: {err}")); + 
assert_eq!(parsed, *expected, "mismatch for input: {input}"); + } + } + + // Test error cases - strict parsing should reject malformed inputs + #[test] + fn test_strict_date_optional_time_error_cases() { + let parser = + StrptimeParser::from_java_datetime_format("strict_date_optional_time").unwrap(); + + // These should all fail to parse + let error_cases = [ + // Timezone without time component + "2016-11-30T+01", + // Non-zero-padded time components + "2018-12-31T9:15:30", // hour not padded + "2018-12-31T10:5:30", // minute not padded + "2018-12-31T10:15:3", // second not padded + // Non-zero-padded date components + "2018-12-1", // day not padded + "2018-1-31", // month not padded + // 5-digit year (out of range) + "10000-01-31", + ]; + + for input in error_cases.iter() { + assert!( + parser.parse_date_time(input).is_err(), + "Expected parsing to fail for input: {input}" + ); + } + } + + #[test] + fn test_strict_date_optional_time_fractional_seconds() { + // the difference between strict_date_optional_time and + // strict_date_optional_time_nanos is formatting, not parsing, ES + // accepts all precisions even with strict_date_optional_time (see + // testFractionalSeconds in ES's DateFormattersTests.java) + let parser = + StrptimeParser::from_java_datetime_format("strict_date_optional_time").unwrap(); + + // Test various fractional second precisions (1-9 digits) + let test_cases = [ + ("2019-05-06T14:52:37.1Z", 100_000_000), + ("2019-05-06T14:52:37.12Z", 120_000_000), + ("2019-05-06T14:52:37.123Z", 123_000_000), + ("2019-05-06T14:52:37.1234Z", 123_400_000), + ("2019-05-06T14:52:37.12345Z", 123_450_000), + ("2019-05-06T14:52:37.123456Z", 123_456_000), + ("2019-05-06T14:52:37.1234567Z", 123_456_700), + ("2019-05-06T14:52:37.12345678Z", 123_456_780), + ("2019-05-06T14:52:37.123456789Z", 123_456_789), + ]; + + for (input, expected_nanos) in test_cases.iter() { + let parsed = parser + .parse_date_time(input) + .unwrap_or_else(|err| panic!("failed to parse {input}: 
{err}")); + assert_eq!( + parsed.nanosecond(), + *expected_nanos, + "mismatch for input: {input}" + ); + } + } + + // Test decimal point variations (comma vs period) + #[test] + fn test_decimal_point_parsing() { + let parser = + StrptimeParser::from_java_datetime_format("strict_date_optional_time").unwrap(); + + // Period as decimal separator (standard) + let result = parser.parse_date_time("2001-01-01T00:00:00.123Z"); + assert!(result.is_ok(), "Failed to parse with period separator"); + assert_eq!(result.unwrap(), datetime!(2001-01-01 00:00:00.123 UTC)); + + // Comma as decimal separator (some locales) + // Elasticsearch/OpenSearch support both period and comma + let result = parser.parse_date_time("2001-01-01T00:00:00,123Z"); + assert!(result.is_ok(), "Failed to parse with comma separator"); + assert_eq!(result.unwrap(), datetime!(2001-01-01 00:00:00.123 UTC)); + + // Test comma with different precision levels + let test_cases = [ + ( + "2019-05-06T14:52:37,1Z", + datetime!(2019-05-06 14:52:37.1 UTC), + ), + ( + "2019-05-06T14:52:37,12Z", + datetime!(2019-05-06 14:52:37.12 UTC), + ), + ( + "2019-05-06T14:52:37,123Z", + datetime!(2019-05-06 14:52:37.123 UTC), + ), + ( + "2018-12-31T10:15:30,123+01:00", + datetime!(2018-12-31 10:15:30.123 +01:00:00), + ), + ]; + + for (input, expected) in test_cases.iter() { + let parsed = parser + .parse_date_time(input) + .unwrap_or_else(|err| panic!("failed to parse {input}: {err}")); + assert_eq!(parsed, *expected, "mismatch for input: {input}"); + } + } + + #[test] + fn test_basic_date() { + let parser = StrptimeParser::from_java_datetime_format("basic_date").unwrap(); + + let test_cases = [ + ("20181126", datetime!(2018-11-26 00:00:00 UTC)), + ("20210101", datetime!(2021-01-01 00:00:00 UTC)), + ("19991231", datetime!(1999-12-31 00:00:00 UTC)), + ]; + + for (input, expected) in test_cases.iter() { + let parsed = parser + .parse_date_time(input) + .unwrap_or_else(|err| panic!("failed to parse {input}: {err}")); + 
assert_eq!(parsed, *expected, "mismatch for input: {input}"); + } + } + + #[test] + fn test_week_date_formats() { + // basic_week_date with 4-digit years + let basic_parser = StrptimeParser::from_java_datetime_format("basic_week_date").unwrap(); + + let basic_cases = [ + ("2018W313", datetime!(2018-08-02 00:00:00 UTC)), + ("2024W011", datetime!(2024-01-02 00:00:00 UTC)), + ]; + + for (input, expected) in basic_cases.iter() { + let parsed = basic_parser + .parse_date_time(input) + .unwrap_or_else(|err| panic!("failed to parse {input}: {err}")); + assert_eq!(parsed, *expected, "mismatch for input: {input}"); + } + + // strict_basic_week_date requires exactly 4-digit year + let strict_parser = + StrptimeParser::from_java_datetime_format("strict_basic_week_date").unwrap(); + + assert!(strict_parser.parse_date_time("2018W313").is_ok()); + assert!(strict_parser.parse_date_time("2024W011").is_ok()); + assert!(strict_parser.parse_date_time("18W313").is_err()); + + // ES allows 1-2 digit years for basic_week_date but our implementation + // currently requires 4 digits like the strict variant + // TODO: implement flexible year parsing to match ES behavior for: + // - "1W313" (1-digit year) + // - "18W313" (2-digit year) + } + + #[test] + fn test_week_date_time_formats() { + let basic_parser = + StrptimeParser::from_java_datetime_format("basic_week_date_time").unwrap(); + + let test_cases = [ + ("2018W313T121212.1Z", datetime!(2018-08-02 12:12:12.1 UTC)), + ( + "2018W313T121212.1+0100", + datetime!(2018-08-02 12:12:12.1 +01:00:00), + ), + ( + "2018W313T121212.123Z", + datetime!(2018-08-02 12:12:12.123 UTC), + ), + ( + "2018W313T121212.123456789Z", + datetime!(2018-08-02 12:12:12.123456789 UTC), + ), + ( + "2018W313T121212.123+0100", + datetime!(2018-08-02 12:12:12.123 +01:00:00), + ), + ( + "2018W313T121212.123+01:00", + datetime!(2018-08-02 12:12:12.123 +01:00:00), + ), + ]; + + for (input, expected) in test_cases.iter() { + let parsed = basic_parser + 
.parse_date_time(input) + .unwrap_or_else(|err| panic!("failed to parse {input}: {err}")); + assert_eq!(parsed, *expected, "mismatch for input: {input}"); + } + + // Test strict variant + let strict_parser = + StrptimeParser::from_java_datetime_format("strict_basic_week_date_time").unwrap(); + + // Additional test case from ES testStrictParsing + let parsed = strict_parser + .parse_date_time("2018W313T121212.1+01:00") + .unwrap(); + assert_eq!(parsed, datetime!(2018-08-02 12:12:12.1 +01:00:00)); + + for (input, expected) in test_cases.iter() { + let parsed = strict_parser + .parse_date_time(input) + .unwrap_or_else(|err| panic!("failed to parse {input}: {err}")); + assert_eq!(parsed, *expected, "mismatch for input: {input}"); + } + + // Test strict error cases - invalid time components + assert!( + strict_parser + .parse_date_time("2018W313T12128.123Z") + .is_err() + ); + assert!( + strict_parser + .parse_date_time("2018W313T81212.123Z") + .is_err() + ); + assert!( + strict_parser + .parse_date_time("2018W313T12812.123Z") + .is_err() + ); + assert!(strict_parser.parse_date_time("2018W313T12812.1Z").is_err()); + assert!( + strict_parser + .parse_date_time("2018W313T12128.123456789Z") + .is_err() + ); + } + + #[test] + fn test_week_date_time_no_millis() { + let basic_parser = + StrptimeParser::from_java_datetime_format("basic_week_date_time_no_millis").unwrap(); + + let test_cases = [ + ("2018W313T121212Z", datetime!(2018-08-02 12:12:12 UTC)), + ( + "2018W313T121212+0100", + datetime!(2018-08-02 12:12:12 +01:00:00), + ), + ( + "2018W313T121212+01:00", + datetime!(2018-08-02 12:12:12 +01:00:00), + ), + ]; + + for (input, expected) in test_cases.iter() { + let parsed = basic_parser + .parse_date_time(input) + .unwrap_or_else(|err| panic!("failed to parse {input}: {err}")); + assert_eq!(parsed, *expected, "mismatch for input: {input}"); + } + + // Test strict variant + let strict_parser = + 
StrptimeParser::from_java_datetime_format("strict_basic_week_date_time_no_millis") + .unwrap(); + + for (input, expected) in test_cases.iter() { + let parsed = strict_parser + .parse_date_time(input) + .unwrap_or_else(|err| panic!("failed to parse {input}: {err}")); + assert_eq!(parsed, *expected, "mismatch for input: {input}"); + } + + // Test strict error cases - invalid time components + assert!(strict_parser.parse_date_time("2018W313T12128Z").is_err()); + assert!(strict_parser.parse_date_time("2018W313T81212Z").is_err()); + assert!(strict_parser.parse_date_time("2018W313T12812Z").is_err()); + assert!( + strict_parser + .parse_date_time("2018W313T12128+0100") + .is_err() + ); + assert!( + strict_parser + .parse_date_time("2018W313T81212+01:00") + .is_err() + ); + assert!( + strict_parser + .parse_date_time("2018W313T12128+01:00") + .is_err() + ); + assert!( + strict_parser + .parse_date_time("2018W313T81212+0100") + .is_err() + ); + assert!( + strict_parser + .parse_date_time("2018W313T12812+0100") + .is_err() + ); + assert!( + strict_parser + .parse_date_time("2018W313T12812+01:00") + .is_err() + ); + } + + #[test] + fn test_strict_week_date_with_separators() { + let parser = StrptimeParser::from_java_datetime_format("strict_week_date").unwrap(); + + let test_cases = [ + ("2012-W48-6", datetime!(2012-12-02 00:00:00 UTC)), + ("2012-W01-6", datetime!(2012-01-08 00:00:00 UTC)), + ("2018-W31-3", datetime!(2018-08-02 00:00:00 UTC)), + ]; + + for (input, expected) in test_cases.iter() { + let parsed = parser + .parse_date_time(input) + .unwrap_or_else(|err| panic!("failed to parse {input}: {err}")); + assert_eq!(parsed, *expected, "mismatch for input: {input}"); + } + + // Test error cases - non-padded week + assert!(parser.parse_date_time("2012-W1-6").is_err()); + // Invalid day of week (should be 1-7) + assert!(parser.parse_date_time("2012-W01-8").is_err()); + // Both errors at once + assert!(parser.parse_date_time("2012-W1-8").is_err()); + } + + #[test] + fn 
test_week_date_non_strict() { + let parser = StrptimeParser::from_java_datetime_format("week_date").unwrap(); + + let test_cases = [ + ("2012-W48-6", datetime!(2012-12-02 00:00:00 UTC)), + ("2012-W01-6", datetime!(2012-01-08 00:00:00 UTC)), + ("2012-W1-6", datetime!(2012-01-08 00:00:00 UTC)), // Non-strict allows W1 + ]; + + for (input, expected) in test_cases.iter() { + let parsed = parser + .parse_date_time(input) + .unwrap_or_else(|err| panic!("failed to parse {input}: {err}")); + assert_eq!(parsed, *expected, "mismatch for input: {input}"); + } + + // Invalid day of week should still fail + assert!(parser.parse_date_time("2012-W1-8").is_err()); + } + + // Test date_optional_time format (non-strict variant) + #[test] + fn test_date_optional_time() { + let parser = StrptimeParser::from_java_datetime_format("date_optional_time").unwrap(); + + let test_cases = [ + // Date only formats + ("2018-05", datetime!(2018-05-01 00:00:00 UTC)), + ("2018-05-30", datetime!(2018-05-30 00:00:00 UTC)), + // With time components (no timezone) + ("2018-05-30T20", datetime!(2018-05-30 20:00:00 UTC)), + ("2018-05-30T20:21", datetime!(2018-05-30 20:21:00 UTC)), + ("2018-05-30T20:21:23", datetime!(2018-05-30 20:21:23 UTC)), + // With fractional seconds (no timezone) + ( + "2018-05-30T20:21:23.1", + datetime!(2018-05-30 20:21:23.1 UTC), + ), + ( + "2018-05-30T20:21:23.123", + datetime!(2018-05-30 20:21:23.123 UTC), + ), + ( + "2018-05-30T20:21:23.123456789", + datetime!(2018-05-30 20:21:23.123456789 UTC), + ), + // With timezone + ( + "2018-05-30T20:21:23.123Z", + datetime!(2018-05-30 20:21:23.123 UTC), + ), + ( + "2018-05-30T20:21:23.123456789Z", + datetime!(2018-05-30 20:21:23.123456789 UTC), + ), + ( + "2018-05-30T20:21:23.1+0100", + datetime!(2018-05-30 20:21:23.1 +01:00:00), + ), + ( + "2018-05-30T20:21:23.123+0100", + datetime!(2018-05-30 20:21:23.123 +01:00:00), + ), + ( + "2018-05-30T20:21:23.1+01:00", + datetime!(2018-05-30 20:21:23.1 +01:00:00), + ), + ( + 
"2018-05-30T20:21:23.123+01:00", + datetime!(2018-05-30 20:21:23.123 +01:00:00), + ), + // Padded time components + ("2018-12-31T10:15:30", datetime!(2018-12-31 10:15:30 UTC)), + ]; + + for (input, expected) in test_cases.iter() { + let parsed = parser + .parse_date_time(input) + .unwrap_or_else(|err| panic!("failed to parse {input}: {err}")); + assert_eq!(parsed, *expected, "mismatch for input: {input}"); + } + + // ES allows non-padded date and time components for date_optional_time but our + // current implementation uses strict padding (dd, HH, mm, ss require 2 digits) + // TODO: implement optional padding to match ES behavior for: + // - "2018-12-1" (non-padded day) + // - "2018-12-31T10:15:3" (non-padded second) + // - "2018-12-31T10:5:30" (non-padded minute) + // - "2018-12-31T1:15:30" (non-padded hour) + } +} + +/// Tests for formatting datetime to strings, ported from Elasticsearch's +/// `DateFormattersTests.java` to ensure maximum compatibility with their +/// date formatting behavior. 
+#[cfg(test)] +mod tests_formatting_ported_from_es { + use time::macros::datetime; + + use super::*; + + #[test] + fn test_strict_date_optional_time_formats_milliseconds() { + // From ES testMinNanos() and testMaxNanos() - strict_date_optional_time + // should format with 3 digits (milliseconds precision) + let formatter = + StrptimeParser::from_java_datetime_format("strict_date_optional_time").unwrap(); + + // Test with various nanosecond values + let test_cases = [ + // Full milliseconds + ( + datetime!(2019-02-08 11:43:00.123 UTC), + "2019-02-08T11:43:00.123Z", + ), + // Zero milliseconds should show .000 + ( + datetime!(2019-02-08 11:43:00.0 UTC), + "2019-02-08T11:43:00.000Z", + ), + // Fewer than 3 fractional digits - should be zero-padded to 3 digits + ( + datetime!(2019-02-08 11:43:00.1 UTC), + "2019-02-08T11:43:00.100Z", + ), + ( + datetime!(2019-02-08 11:43:00.12 UTC), + "2019-02-08T11:43:00.120Z", + ), + // With microseconds - should truncate to milliseconds + ( + datetime!(2019-02-08 11:43:00.123456 UTC), + "2019-02-08T11:43:00.123Z", + ), + // With nanoseconds - should truncate to milliseconds + ( + datetime!(2019-02-08 11:43:00.123456789 UTC), + "2019-02-08T11:43:00.123Z", + ), + ]; + + for (input, expected) in test_cases.iter() { + let formatted = formatter + .format_date_time(input) + .unwrap_or_else(|err| panic!("failed to format {input}: {err}")); + assert_eq!( + &formatted, expected, + "formatting mismatch for {input}: got {formatted}" + ); + } + } + + #[test] + fn test_strict_date_optional_time_nanos_formats_nanoseconds() { + // From ES testMinNanos() and testMaxNanos() - strict_date_optional_time_nanos + // should format with 9 digits (nanosecond precision) + let formatter = + StrptimeParser::from_java_datetime_format("strict_date_optional_time_nanos").unwrap(); + + // Test with various nanosecond values + let test_cases = [ + // Full nanoseconds + ( + datetime!(2019-02-08 11:43:00.123456789 UTC), + "2019-02-08T11:43:00.123456789Z", + ), + // Zero 
nanoseconds should show .000000000 + ( + datetime!(2019-02-08 11:43:00.0 UTC), + "2019-02-08T11:43:00.000000000Z", + ), + // Milliseconds only - should pad to 9 digits + ( + datetime!(2019-02-08 11:43:00.123 UTC), + "2019-02-08T11:43:00.123000000Z", + ), + // Microseconds - should pad to 9 digits + ( + datetime!(2019-02-08 11:43:00.123456 UTC), + "2019-02-08T11:43:00.123456000Z", + ), + // Partial nanoseconds + ( + datetime!(2019-02-08 11:43:00.1 UTC), + "2019-02-08T11:43:00.100000000Z", + ), + ( + datetime!(2019-02-08 11:43:00.12 UTC), + "2019-02-08T11:43:00.120000000Z", + ), + ]; + + for (input, expected) in test_cases.iter() { + let formatted = formatter + .format_date_time(input) + .unwrap_or_else(|err| panic!("failed to format {input}: {err}")); + assert_eq!( + &formatted, expected, + "formatting mismatch for {input}: got {formatted}" + ); + } + } + + #[test] + fn test_strict_date_optional_time_formats_with_timezone() { + let formatter = + StrptimeParser::from_java_datetime_format("strict_date_optional_time").unwrap(); + + let test_cases = [ + // UTC timezone + ( + datetime!(2018-12-31 10:15:30.123 UTC), + "2018-12-31T10:15:30.123Z", + ), + // Positive offset + ( + datetime!(2018-12-31 10:15:30.123 +01:00:00), + "2018-12-31T10:15:30.123+01:00", + ), + // Negative offset + ( + datetime!(2018-12-31 10:15:30.123 -05:00:00), + "2018-12-31T10:15:30.123-05:00", + ), + ]; + + for (input, expected) in test_cases.iter() { + let formatted = formatter + .format_date_time(input) + .unwrap_or_else(|err| panic!("failed to format {input}: {err}")); + assert_eq!( + &formatted, expected, + "formatting mismatch for {input}: got {formatted}" + ); + } + } + + #[test] + fn test_format_and_parse_roundtrip() { + // Verify that formatting and parsing are inverse operations + let formatter = + StrptimeParser::from_java_datetime_format("strict_date_optional_time").unwrap(); + + let test_datetimes = [ + datetime!(2018-12-31 10:15:30.123 UTC), + datetime!(2019-02-08 11:43:00.0 UTC), + 
datetime!(2020-01-01 00:00:00.001 UTC), + datetime!(2018-12-31 10:15:30.123 +01:00:00), + ]; + + for original in test_datetimes.iter() { + let formatted = formatter + .format_date_time(original) + .unwrap_or_else(|err| panic!("failed to format {original}: {err}")); + let parsed = formatter + .parse_date_time(&formatted) + .unwrap_or_else(|err| panic!("failed to parse {formatted}: {err}")); + assert_eq!( + parsed, *original, + "roundtrip failed for {original}: formatted as {formatted}, parsed as {parsed}" + ); + } + } + + #[test] + fn test_format_nanos_roundtrip() { + // Verify that formatting and parsing are inverse operations for nanos format + let formatter = + StrptimeParser::from_java_datetime_format("strict_date_optional_time_nanos").unwrap(); + + let test_datetimes = [ + datetime!(2018-12-31 10:15:30.123456789 UTC), + datetime!(2019-02-08 11:43:00.0 UTC), + datetime!(2020-01-01 00:00:00.000000001 UTC), + datetime!(2018-12-31 10:15:30.123456 UTC), + ]; + + for original in test_datetimes.iter() { + let formatted = formatter + .format_date_time(original) + .unwrap_or_else(|err| panic!("failed to format {original}: {err}")); + let parsed = formatter + .parse_date_time(&formatted) + .unwrap_or_else(|err| panic!("failed to parse {formatted}: {err}")); + assert_eq!( + parsed, *original, + "roundtrip failed for {original}: formatted as {formatted}, parsed as {parsed}" + ); + } + } + + #[test] + fn test_zero_millis_formatted_with_trailing_zeros() { + // From ES test0MillisAreFormatted() - even when milliseconds are zero, + // they should be formatted with .000 + let formatter = + StrptimeParser::from_java_datetime_format("strict_date_optional_time").unwrap(); + + let dt = datetime!(2019-02-08 11:43:00.0 UTC); + let formatted = formatter.format_date_time(&dt).unwrap(); + + // Should format as .000, not omit the fractional seconds + assert_eq!(formatted, "2019-02-08T11:43:00.000Z"); + } +} diff --git a/quickwit/quickwit-doc-mapper/Cargo.toml 
b/quickwit/quickwit-doc-mapper/Cargo.toml index ae0239e53c5..c3eadcd4bd1 100644 --- a/quickwit/quickwit-doc-mapper/Cargo.toml +++ b/quickwit/quickwit-doc-mapper/Cargo.toml @@ -25,6 +25,7 @@ serde_json = { workspace = true } serde_json_borrow = { workspace = true } siphasher = { workspace = true } tantivy = { workspace = true } +tantivy-fst = { workspace = true } thiserror = { workspace = true } tracing = { workspace = true } utoipa = { workspace = true } @@ -42,10 +43,9 @@ serde_yaml = { workspace = true } time = { workspace = true } quickwit-common = { workspace = true, features = ["testsuite"] } -quickwit-query = { workspace = true, features = ["multilang"] } +quickwit-query = { workspace = true } [features] -multilang = ["quickwit-query/multilang"] testsuite = [] [[bench]] diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs index 1eb2cea02d9..1b5dc19b12e 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs @@ -78,6 +78,8 @@ pub struct DocMapper { timestamp_field_path: Option>, /// Secondary timestamp field name. secondary_timestamp_field_name: Option, + /// Indexation time field name. + indexation_time_field_name: Option, /// Root node of the field mapping tree. /// See [`MappingNode`]. 
field_mappings: MappingNode, @@ -128,6 +130,31 @@ fn validate_timestamp_field( Ok(()) } +fn validate_indexation_time_field( + indexation_field_path: &str, + mapping_root_node: &MappingNode, +) -> anyhow::Result<()> { + if indexation_field_path.starts_with('.') || indexation_field_path.starts_with("\\.") { + bail!("indexation_time field `{indexation_field_path}` should not start with a `.`"); + } + if indexation_field_path.ends_with('.') { + bail!("indexation_time field `{indexation_field_path}` should not end with a `.`"); + } + let Some(indexation_time_field_type) = + mapping_root_node.find_field_mapping_type(indexation_field_path) + else { + bail!("could not find indexation_time field `{indexation_field_path}` in field mappings"); + }; + if let FieldMappingType::DateTime(_, cardinality) = &indexation_time_field_type { + if cardinality != &Cardinality::SingleValued { + bail!("indexation_time field `{indexation_field_path}` should be single-valued"); + } + } else { + bail!("indexation_time field `{indexation_field_path}` should be a datetime field"); + } + Ok(()) +} + impl From for DocMapperBuilder { fn from(default_doc_mapper: DocMapper) -> Self { let partition_key_str = default_doc_mapper.partition_key.to_string(); @@ -142,6 +169,7 @@ impl From for DocMapperBuilder { field_mappings: default_doc_mapper.field_mappings.into(), timestamp_field: default_doc_mapper.timestamp_field_name, secondary_timestamp_field: default_doc_mapper.secondary_timestamp_field_name, + indexation_time_field: default_doc_mapper.indexation_time_field_name, tag_fields: default_doc_mapper.tag_field_names, partition_key: partition_key_opt, max_num_partitions: default_doc_mapper.max_num_partitions, @@ -203,6 +231,9 @@ impl TryFrom for DocMapper { } else { None }; + if let Some(indexation_time_field_name) = &doc_mapping.indexation_time_field { + validate_indexation_time_field(indexation_time_field_name, &field_mappings)?; + } let schema = schema_builder.build(); let tokenizer_manager = 
create_default_quickwit_tokenizer_manager(); @@ -293,6 +324,7 @@ impl TryFrom for DocMapper { timestamp_field_name: doc_mapping.timestamp_field, timestamp_field_path, secondary_timestamp_field_name: doc_mapping.secondary_timestamp_field, + indexation_time_field_name: doc_mapping.indexation_time_field, field_mappings, concatenate_dynamic_fields, tag_field_names, @@ -681,6 +713,11 @@ impl DocMapper { self.secondary_timestamp_field_name.as_deref() } + /// Returns the indexation time field name. + pub fn indexation_time_field_name(&self) -> Option<&str> { + self.indexation_time_field_name.as_deref() + } + /// Returns the tag `NameField`s on the current schema. /// Returns an error if a tag field is not found in this schema. pub fn tag_named_fields(&self) -> anyhow::Result> { diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs index ae3388aee32..e69d337a616 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs @@ -1152,7 +1152,7 @@ mod tests { "type": "text", "stored": true, "record": "basic", - "tokenizer": "en_stem" + "tokenizer": "lowercase" } "#, )?; @@ -1161,7 +1161,7 @@ mod tests { FieldMappingType::Text(options, _) => { assert_eq!(options.stored, true); let indexing_options = options.indexing_options.unwrap(); - assert_eq!(indexing_options.tokenizer.name(), "en_stem"); + assert_eq!(indexing_options.tokenizer.name(), "lowercase"); assert_eq!(indexing_options.record, IndexRecordOption::Basic); } _ => panic!("wrong property type"), diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs index bed4b18b90f..370674c9536 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs @@ -26,10 +26,9 @@ use std::collections::{HashMap, HashSet}; use 
std::fmt::Debug; use std::ops::Bound; +use anyhow::bail; pub use doc_mapper_builder::DocMapperBuilder; pub use doc_mapper_impl::DocMapper; -#[cfg(all(test, feature = "multilang"))] -pub(crate) use field_mapping_entry::TextIndexingOptions; pub use field_mapping_entry::{ BinaryFormat, FastFieldOptions, FieldMappingEntry, QuickwitBytesOptions, QuickwitJsonOptions, QuickwitTextNormalizer, @@ -43,6 +42,7 @@ pub use field_mapping_type::FieldMappingType; use serde_json::Value as JsonValue; use tantivy::Term; use tantivy::schema::{Field, FieldType}; +use tantivy_fst::Automaton as TantivyFstAutomaton; pub(crate) use tokenizer_entry::{ NgramTokenizerOption, RegexTokenizerOption, TokenFilterType, TokenizerType, }; @@ -78,10 +78,70 @@ pub struct TermRange { #[derive(Debug, Clone, PartialEq, Eq, Hash)] /// Supported automaton types to warmup pub enum Automaton { - /// A regex in it's str representation as tantivy_fst::Regex isn't PartialEq, and the path if + /// A regex in its str representation as tantivy_fst::Regex isn't PartialEq, and the path if /// inside a json field Regex(Option>, String), - // we could add termset query here, instead of downloading the whole dictionary + /// An exact-match automaton for a TermSet query. + TermSet(ExactSetAutomaton), +} + +/// A byte-level DFA that accepts exactly the strings in a sorted, deduplicated byte-sequence +/// set. State = `(depth, lo, hi)` meaning all terms in `self.terms[lo..hi]` share the first +/// `depth` bytes consumed so far. Transitions are computed via binary search, avoiding any +/// upfront DFA materialisation. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct ExactSetAutomaton { + /// Holds sorted, deduplicated `term.serialized_value_bytes()` for all terms in the set. + /// Using `warm_postings_automaton` coalesces both the SSTable lookup and the postings + /// downloads into a small number of merged range requests. 
+ terms: Vec>, +} + +impl ExactSetAutomaton { + /// Create an `ExactSetAutomaton` from an iterator of terms. + pub fn try_from_terms<'a>(terms: impl IntoIterator) -> anyhow::Result { + let mut sorted_bytes: Vec> = terms + .into_iter() + .map(|term| term.serialized_value_bytes().to_vec()) + .collect(); + if sorted_bytes.is_empty() { + bail!("Cannot create an ExactSetAutomaton from an empty set of terms"); + } + sorted_bytes.sort(); + sorted_bytes.dedup(); + Ok(ExactSetAutomaton { + terms: sorted_bytes, + }) + } +} + +impl TantivyFstAutomaton for ExactSetAutomaton { + /// (depth, lo, hi) + type State = (usize, usize, usize); + + fn start(&self) -> Self::State { + (0, 0, self.terms.len()) + } + + fn is_match(&self, &(depth, lo, hi): &Self::State) -> bool { + lo < hi && self.terms[lo].len() == depth + } + + fn can_match(&self, &(_, lo, hi): &Self::State) -> bool { + lo < hi + } + + fn accept(&self, &(depth, lo, hi): &Self::State, byte: u8) -> Self::State { + // Within [lo, hi), terms are sorted by their bytes. Terms of length == depth (exact + // matches) sort before any extension, so there is at most one such term at index lo. + // Skip it — it has no byte at position `depth`. + let lo = lo + usize::from(lo < hi && self.terms[lo].len() == depth); + // Binary-search for the sub-range where terms[i][depth] == byte. + // All remaining terms in [lo, hi) have length > depth, so indexing [depth] is safe. + let new_lo = lo + self.terms[lo..hi].partition_point(|t| t[depth] < byte); + let new_hi = new_lo + self.terms[new_lo..hi].partition_point(|t| t[depth] <= byte); + (depth + 1, new_lo, new_hi) + } } /// Description of how a fast field should be warmed up @@ -97,9 +157,6 @@ pub struct FastFieldWarmupInfo { /// running the query. 
#[derive(Debug, Default, Clone, PartialEq, Eq)] pub struct WarmupInfo { - /// Name of fields from the term dictionary and posting list which needs to - /// be entirely loaded - pub term_dict_fields: HashSet, /// Fast fields which needs to be loaded pub fast_fields: HashSet, /// Whether to warmup field norms. Used mostly for scoring. @@ -115,7 +172,6 @@ pub struct WarmupInfo { impl WarmupInfo { /// Merge other WarmupInfo into self. pub fn merge(&mut self, other: WarmupInfo) { - self.term_dict_fields.extend(other.term_dict_fields); self.field_norms |= other.field_norms; for fast_field_warmup_info in other.fast_fields.into_iter() { @@ -153,21 +209,6 @@ impl WarmupInfo { /// Simplify a WarmupInfo, removing some redundant tasks pub fn simplify(&mut self) { - self.terms_grouped_by_field.retain(|field, terms| { - if self.term_dict_fields.contains(field) { - // we are already about to full-load this dictionary. We only care about terms - // which needs additional position - terms.retain(|_term, include_position| *include_position); - } - // if no term is left, remove the entry from the hashmap - !terms.is_empty() - }); - self.term_ranges_grouped_by_field.retain(|field, terms| { - if self.term_dict_fields.contains(field) { - terms.retain(|_term, include_position| *include_position); - } - !terms.is_empty() - }); // TODO we could remove from terms_grouped_by_field for ranges with no `limit` in // term_ranges_grouped_by_field } @@ -624,13 +665,6 @@ mod tests { .collect() } - fn hashset_field(elements: &[u32]) -> HashSet { - elements - .iter() - .map(|elem| Field::from_field_id(*elem)) - .collect() - } - fn hashmap(elements: &[(u32, &str, bool)]) -> HashMap> { let mut result: HashMap> = HashMap::new(); for (field, term, pos) in elements { @@ -665,7 +699,6 @@ mod tests { #[test] fn test_warmup_info_merge() { let wi_base = WarmupInfo { - term_dict_fields: hashset_field(&[1, 2]), fast_fields: hashset_fast(&["fast1", "fast2"]), field_norms: false, terms_grouped_by_field: 
hashmap(&[(1, "term1", false), (1, "term2", false)]), @@ -688,7 +721,6 @@ mod tests { let mut wi_base = wi_base; let wi_2 = WarmupInfo { - term_dict_fields: hashset_field(&[2, 3]), fast_fields: hashset_fast(&["fast2", "fast3"]), field_norms: true, terms_grouped_by_field: hashmap(&[(2, "term1", false), (1, "term2", true)]), @@ -705,7 +737,6 @@ mod tests { }; wi_base.merge(wi_2.clone()); - assert_eq!(wi_base.term_dict_fields, hashset_field(&[1, 2, 3])); assert_eq!( wi_base.fast_fields, hashset_fast(&["fast1", "fast2", "fast3"]) @@ -771,7 +802,6 @@ mod tests { #[test] fn test_warmup_info_simplify() { let mut warmup_info = WarmupInfo { - term_dict_fields: hashset_field(&[1]), fast_fields: hashset_fast(&["fast1", "fast2"]), field_norms: false, terms_grouped_by_field: hashmap(&[ @@ -793,11 +823,15 @@ mod tests { .collect(), }; let expected = WarmupInfo { - term_dict_fields: hashset_field(&[1]), fast_fields: hashset_fast(&["fast1", "fast2"]), field_norms: false, - terms_grouped_by_field: hashmap(&[(1, "term2", true), (2, "term3", false)]), + terms_grouped_by_field: hashmap(&[ + (1, "term1", false), + (1, "term2", true), + (2, "term3", false), + ]), term_ranges_grouped_by_field: hashmap_ranges(&[ + (1, "term1", false), (1, "term2", true), (2, "term3", false), ]), @@ -812,55 +846,4 @@ mod tests { warmup_info.simplify(); assert_eq!(warmup_info, expected); } - - #[test] - #[cfg(feature = "multilang")] - fn test_doc_mapper_query_with_multilang_field() { - use quickwit_query::query_ast::TermQuery; - use tantivy::schema::IndexRecordOption; - - use crate::doc_mapper::{ - QuickwitTextOptions, QuickwitTextTokenizer, TextIndexingOptions, TokenizerType, - }; - use crate::{TokenizerConfig, TokenizerEntry}; - let mut doc_mapper_builder = DocMapperBuilder::default(); - doc_mapper_builder - .doc_mapping - .field_mappings - .push(FieldMappingEntry { - name: "multilang".to_string(), - mapping_type: FieldMappingType::Text( - QuickwitTextOptions { - indexing_options: Some(TextIndexingOptions 
{ - tokenizer: QuickwitTextTokenizer::from_static("multilang"), - record: IndexRecordOption::Basic, - fieldnorms: false, - }), - ..Default::default() - }, - Cardinality::SingleValued, - ), - }); - doc_mapper_builder - .doc_mapping - .tokenizers - .push(TokenizerEntry { - name: "multilang".to_string(), - config: TokenizerConfig { - tokenizer_type: TokenizerType::Multilang, - filters: Vec::new(), - }, - }); - let doc_mapper = doc_mapper_builder.try_build().unwrap(); - let schema = doc_mapper.schema(); - let query_ast = quickwit_query::query_ast::QueryAst::Term(TermQuery { - field: "multilang".to_string(), - value: "JPN:す".to_string(), - }); - let (query, _) = doc_mapper.query(schema, query_ast, false, None).unwrap(); - assert_eq!( - format!("{query:?}"), - r#"TermQuery(Term(field=2, type=Str, "JPN:す"))"# - ); - } } diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/tokenizer_entry.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/tokenizer_entry.rs index b9793dc9548..0488d118c9f 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/tokenizer_entry.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/tokenizer_entry.rs @@ -44,10 +44,6 @@ impl TokenizerConfig { pub fn text_analyzer(&self) -> anyhow::Result { let mut text_analyzer_builder = match &self.tokenizer_type { TokenizerType::Simple => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(), - #[cfg(any(test, feature = "multilang"))] - TokenizerType::Multilang => { - TextAnalyzer::builder(quickwit_query::MultiLangTokenizer::default()).dynamic() - } TokenizerType::SourceCode => TextAnalyzer::builder(CodeTokenizer::default()).dynamic(), TokenizerType::Ngram(options) => { let tokenizer = @@ -120,8 +116,6 @@ impl TokenFilterType { #[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, utoipa::ToSchema)] #[serde(tag = "type", rename_all = "snake_case")] pub enum TokenizerType { - #[cfg(any(test, feature = "multilang"))] - Multilang, Ngram(NgramTokenizerOption), Regex(RegexTokenizerOption), 
Simple, diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapping.rs b/quickwit/quickwit-doc-mapper/src/doc_mapping.rs index d8afa4b16e9..8fc1ce8096a 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapping.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapping.rs @@ -133,6 +133,13 @@ pub struct DocMapping { #[serde(skip_serializing_if = "Option::is_none")] pub secondary_timestamp_field: Option, + /// Declares the field which will contain the indexation time for the document. + /// This field is automatically populated by the indexer + /// with the time at which the document is indexed. + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub indexation_time_field: Option, + /// Declares the low cardinality fields for which the values are recorded directly in the /// splits metadata. #[schema(value_type = Vec)] @@ -207,6 +214,7 @@ mod tests { ], timestamp_field: Some("timestamp".to_string()), secondary_timestamp_field: None, + indexation_time_field: None, tag_fields: BTreeSet::from_iter(["level".to_string()]), partition_key: Some("tenant_id".to_string()), max_num_partitions: NonZeroU32::new(100).unwrap(), diff --git a/quickwit/quickwit-doc-mapper/src/lib.rs b/quickwit/quickwit-doc-mapper/src/lib.rs index 8dee8d700ed..1b5a67908e3 100644 --- a/quickwit/quickwit-doc-mapper/src/lib.rs +++ b/quickwit/quickwit-doc-mapper/src/lib.rs @@ -30,9 +30,9 @@ mod routing_expression; pub mod tag_pruning; pub use doc_mapper::{ - Automaton, BinaryFormat, DocMapper, DocMapperBuilder, FastFieldWarmupInfo, FieldMappingEntry, - FieldMappingType, JsonObject, NamedField, QuickwitBytesOptions, QuickwitJsonOptions, TermRange, - TokenizerConfig, TokenizerEntry, WarmupInfo, analyze_text, + Automaton, BinaryFormat, DocMapper, DocMapperBuilder, ExactSetAutomaton, FastFieldWarmupInfo, + FieldMappingEntry, FieldMappingType, JsonObject, NamedField, QuickwitBytesOptions, + QuickwitJsonOptions, TermRange, TokenizerConfig, TokenizerEntry, WarmupInfo, analyze_text, }; use 
doc_mapper::{ FastFieldOptions, FieldMappingEntryForSerialization, IndexRecordOptionSchema, diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index 5900b577795..38d4bab60b1 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -30,7 +30,7 @@ use tantivy::schema::{Field, Schema}; use tracing::error; use crate::doc_mapper::FastFieldWarmupInfo; -use crate::{Automaton, QueryParserError, TermRange, WarmupInfo}; +use crate::{Automaton, ExactSetAutomaton, QueryParserError, TermRange, WarmupInfo}; #[derive(Default)] struct RangeQueryFields { @@ -198,8 +198,7 @@ pub(crate) fn build_query( let query = query_ast.build_tantivy_query(context)?; - let term_set_query_fields = extract_term_set_query_fields(&query_ast, context.schema)?; - let (term_ranges_grouped_by_field, automatons_grouped_by_field) = + let (term_ranges_grouped_by_field, mut automatons_grouped_by_field) = extract_prefix_term_ranges_and_automaton( &query_ast, context.schema, @@ -219,8 +218,13 @@ pub(crate) fn build_query( .or_default() |= need_position; }); + coalesce_multi_term_fields_into_automatons( + &mut terms_grouped_by_field, + &mut automatons_grouped_by_field, + 2, + )?; + let warmup_info = WarmupInfo { - term_dict_fields: term_set_query_fields, terms_grouped_by_field, term_ranges_grouped_by_field, fast_fields, @@ -231,44 +235,54 @@ pub(crate) fn build_query( Ok((query, warmup_info)) } -struct ExtractTermSetFields<'a> { - term_dict_fields_to_warm_up: HashSet, - schema: &'a Schema, -} - -impl<'a> ExtractTermSetFields<'a> { - fn new(schema: &'a Schema) -> Self { - ExtractTermSetFields { - term_dict_fields_to_warm_up: HashSet::new(), - schema, +/// For any field with more than `term_threshold` non-positional terms, moves +/// those terms into an `Automaton::TermSet` and removes them from +/// `terms_grouped_by_field`. 
+/// +/// This enables `warm_postings_automaton` to coalesce both the SSTable block +/// fetches and the postings downloads into a small number of merged range +/// requests, instead of N individual per-term requests. +/// +/// Strictly more than `term_threshold` terms are required because +/// `warm_postings_automaton` has higher per-call overhead than a direct point +/// lookup: spawning a CPU task and traversing the sstable twice. That overhead +/// is only worth paying when there are enough terms to coalesce. +/// +/// Terms that require positions are left in `terms_grouped_by_field` unchanged, +/// as they must be fetched individually. +/// +/// TODO: should positional terms also support some form of grouping? +fn coalesce_multi_term_fields_into_automatons( + terms_grouped_by_field: &mut HashMap>, + automatons_grouped_by_field: &mut HashMap>, + term_threshold: usize, +) -> anyhow::Result<()> { + let fields: Vec = terms_grouped_by_field.keys().copied().collect(); + for field in fields { + let no_pos_terms: Vec<&Term> = terms_grouped_by_field + .get(&field) + .unwrap() + .iter() + .filter(|(_, need_pos)| !**need_pos) + .map(|(term, _)| term) + .collect(); + if no_pos_terms.len() <= term_threshold { + continue; } - } -} - -impl<'a> QueryAstVisitor<'a> for ExtractTermSetFields<'_> { - type Err = anyhow::Error; - - fn visit_term_set(&mut self, term_set_query: &'a TermSetQuery) -> anyhow::Result<()> { - for field in term_set_query.terms_per_field.keys() { - if let Some((field, _field_entry, _path)) = - find_field_or_hit_dynamic(field, self.schema) - { - self.term_dict_fields_to_warm_up.insert(field); - } else { - anyhow::bail!("field does not exist: {}", field); - } + let automaton = ExactSetAutomaton::try_from_terms(no_pos_terms)?; + automatons_grouped_by_field + .entry(field) + .or_default() + .insert(Automaton::TermSet(automaton)); + // Remove the no-position terms: the automaton covers their SSTable lookup + postings. 
+ // Terms still needing positions are kept for warm_up_terms. + let field_terms = terms_grouped_by_field.get_mut(&field).unwrap(); + field_terms.retain(|_, need_pos| *need_pos); + if field_terms.is_empty() { + terms_grouped_by_field.remove(&field); } - Ok(()) } -} - -fn extract_term_set_query_fields( - query_ast: &QueryAst, - schema: &Schema, -) -> anyhow::Result> { - let mut visitor = ExtractTermSetFields::new(schema); - visitor.visit(query_ast)?; - Ok(visitor.term_dict_fields_to_warm_up) + Ok(()) } /// Converts a `prefix` term into the equivalent term range. @@ -437,7 +451,7 @@ mod test { use tantivy::schema::{DateOptions, DateTimePrecision, FAST, INDEXED, STORED, Schema, TEXT}; use super::{ExtractPrefixTermRanges, build_query}; - use crate::{DYNAMIC_FIELD_NAME, SOURCE_FIELD_NAME, TermRange}; + use crate::{Automaton, DYNAMIC_FIELD_NAME, SOURCE_FIELD_NAME, TermRange}; enum TestExpectation<'a> { Err(&'a str), @@ -881,26 +895,96 @@ mod test { #[test] fn test_build_query_warmup_info() { - let query_with_set = query_ast_from_user_text("desc: IN [hello]", None) + let query_with_set = query_ast_from_user_text("desc: IN [alpha beta gamma delta]", None) + .parse_user_query(&[]) + .unwrap(); + let query_with_small_set = query_ast_from_user_text("desc: IN [beta]", None) .parse_user_query(&[]) .unwrap(); - let query_without_set = query_ast_from_user_text("desc:hello", None) + let query_with_many_terms = + query_ast_from_user_text("desc:(hello OR world OR extra OR big)", None) + .parse_user_query(&[]) + .unwrap(); + let query_with_single_term = query_ast_from_user_text("desc:hello", None) .parse_user_query(&[]) .unwrap(); let schema = make_schema(true); let context = BuildTantivyAstContext::for_test(&schema); - let (_, warmup_info) = build_query(query_with_set, &context, None).unwrap(); - assert_eq!(warmup_info.term_dict_fields.len(), 1); + for query in [query_with_many_terms, query_with_set] { + let (_, warmup_info) = build_query(query, &context, None).unwrap(); + 
assert!(warmup_info.terms_grouped_by_field.is_empty()); + assert_eq!(warmup_info.automatons_grouped_by_field.len(), 1); + let automatons = warmup_info + .automatons_grouped_by_field + .values() + .next() + .unwrap(); + assert_eq!(automatons.len(), 1); + assert!(matches!( + automatons.iter().next().unwrap(), + Automaton::TermSet(_) + )); + } + + for query in [query_with_small_set, query_with_single_term] { + let (_, warmup_info) = build_query(query, &context, None).unwrap(); + assert!(warmup_info.automatons_grouped_by_field.is_empty()); + } + } + + #[test] + fn test_build_query_warmup_info_term_set_with_other_queries() { + // Verify that: + // - fields with >= 3 non-positional terms are coalesced into an automaton + // - positional terms on the same field remain in terms_grouped_by_field + // - fields with fewer than 3 non-positional terms are unaffected + let query_ast = query_ast_from_user_text( + r#"desc: IN [alpha beta gamma] AND desc:"world extra" AND title:baz"#, + None, + ) + .parse_user_query(&[]) + .unwrap(); + + let schema = make_schema(false); + let context = BuildTantivyAstContext::for_test(&schema); + let (_, warmup_info) = build_query(query_ast, &context, None).unwrap(); + + let desc_field = schema.get_field("desc").unwrap(); + let title_field = schema.get_field("title").unwrap(); + + // desc: 3 non-positional terms (alpha, beta, gamma) are coalesced into an automaton + let desc_automatons = warmup_info + .automatons_grouped_by_field + .get(&desc_field) + .expect("desc should have an automaton"); + assert_eq!(desc_automatons.len(), 1); + assert!(matches!( + desc_automatons.iter().next().unwrap(), + Automaton::TermSet(_) + )); + + // desc: phrase terms "world" and "extra" stay as positional terms + let desc_terms = warmup_info + .terms_grouped_by_field + .get(&desc_field) + .expect("desc positional terms should still be present"); + assert_eq!(desc_terms.len(), 2); + assert!(desc_terms.values().all(|&need_pos| need_pos)); + + // title: only 1 
non-positional term (below threshold), stays in terms_grouped_by_field assert!( - warmup_info - .term_dict_fields - .contains(&tantivy::schema::Field::from_field_id(2)) + !warmup_info + .automatons_grouped_by_field + .contains_key(&title_field) ); - - let (_, warmup_info) = build_query(query_without_set, &context, None).unwrap(); - assert!(warmup_info.term_dict_fields.is_empty()); + let title_terms = warmup_info + .terms_grouped_by_field + .get(&title_field) + .expect("title terms should be present"); + assert_eq!(title_terms.len(), 1); + assert!(title_terms.values().all(|&need_pos| !need_pos)); } #[test] diff --git a/quickwit/quickwit-doc-mapper/src/tag_pruning.rs b/quickwit/quickwit-doc-mapper/src/tag_pruning.rs index ad6ded9444c..e1d58917823 100644 --- a/quickwit/quickwit-doc-mapper/src/tag_pruning.rs +++ b/quickwit/quickwit-doc-mapper/src/tag_pruning.rs @@ -26,18 +26,63 @@ pub fn match_tag_field_name(field_name: &str, tag: &str) -> bool { && tag.starts_with(field_name) } -/// Tags a user query and returns a TagFilterAst that -/// represents a filtering predicate over a set of tags. +/// Tags a user query and returns a TagFilterAst that represents a filtering +/// predicate over a set of tags. /// -/// If the predicate evaluates to false for a given set of tags -/// associated with a split, we are guaranteed that no documents -/// in the split matches the query. -pub fn extract_tags_from_query(query_ast: QueryAst) -> Option { - let unsimplified_tag_filter_ast = extract_unsimplified_tags_filter_ast(query_ast); - let term_filters_ast = simplify_ast(unsimplified_tag_filter_ast)?; +/// If the predicate evaluates to false for a given set of tags associated with +/// a split, we are guaranteed that no document in the split matches the query. +/// +/// Setting `tag_fields` to `None` will create an AST with all possible tag +/// filters from the query. This ensures that all pruning opportunities are +/// considered, but it can also lead to very large tag filter ASTs. 
This can put +/// a lot of pressure on the metastore. +pub fn extract_tags_from_query( + query_ast: QueryAst, + tag_fields: Option<&BTreeSet>, +) -> Option { + if let Some(tag_fields) = tag_fields + && tag_fields.is_empty() + { + return None; + } + let mut unsimplified = extract_unsimplified_tags_filter_ast(query_ast); + if let Some(tag_fields) = tag_fields { + unsimplified = prune_unsimplified_tag_filter_ast(unsimplified, tag_fields); + } + let term_filters_ast = simplify_ast(unsimplified)?; Some(expand_to_tag_ast(term_filters_ast)) } +/// Replaces every `Tag` node whose field is not in `tag_fields` with +/// `Uninformative`, leaving the rest of the tree intact. +fn prune_unsimplified_tag_filter_ast( + ast: UnsimplifiedTagFilterAst, + tag_fields: &BTreeSet, +) -> UnsimplifiedTagFilterAst { + match ast { + UnsimplifiedTagFilterAst::And(children) => UnsimplifiedTagFilterAst::And( + children + .into_iter() + .map(|child| prune_unsimplified_tag_filter_ast(child, tag_fields)) + .collect(), + ), + UnsimplifiedTagFilterAst::Or(children) => UnsimplifiedTagFilterAst::Or( + children + .into_iter() + .map(|child| prune_unsimplified_tag_filter_ast(child, tag_fields)) + .collect(), + ), + UnsimplifiedTagFilterAst::Tag { ref field, .. } => { + if tag_fields.contains(field) { + ast + } else { + UnsimplifiedTagFilterAst::Uninformative + } + } + UnsimplifiedTagFilterAst::Uninformative => UnsimplifiedTagFilterAst::Uninformative, + } +} + fn extract_unsimplified_tags_filter_ast(query_ast: QueryAst) -> UnsimplifiedTagFilterAst { match query_ast { QueryAst::Bool(bool_query) => { @@ -294,6 +339,10 @@ fn expand_to_tag_ast(terms_filter_ast: TermFilterAst) -> TagFilterAst { TagFilterAst::Or(children.into_iter().map(expand_to_tag_ast).collect()) } TermFilterAst::Term { field, value } => { + // TODO: this is wasteful when the field is targeted many times, e.g + // in TermSetQuery queries + // - (¬user! ∨ user:bart) ∨ (¬user! ∨ user:homer) ∨ (¬user! ∨ user:lisa) + // - (¬user! 
∨ user:bart ∨ user:homer ∨ user:lisa) let field_is_tag = TagFilterAst::Tag { is_present: false, tag: field_tag(&field), @@ -384,13 +433,18 @@ pub fn no_tag(tag: impl ToString) -> TagFilterAst { } #[cfg(test)] mod test { + use std::collections::BTreeSet; + use quickwit_query::BooleanOperand; use quickwit_query::query_ast::{QueryAst, UserInputQuery}; use super::extract_tags_from_query; use crate::tag_pruning::TagFilterAst; - fn extract_tags_from_query_helper(user_query: &str) -> Option { + fn extract_tags_from_query_helper( + user_query: &str, + tag_fields: Option<&[&str]>, + ) -> Option { let query_ast: QueryAst = UserInputQuery { user_text: user_query.to_string(), default_fields: None, @@ -399,55 +453,104 @@ mod test { } .into(); let parsed_query_ast = query_ast.parse_user_query(&[]).unwrap(); - extract_tags_from_query(parsed_query_ast) + let tag_fields_set: Option> = + tag_fields.map(|fields| fields.iter().map(|s| s.to_string()).collect()); + extract_tags_from_query(parsed_query_ast, tag_fields_set.as_ref()) } #[test] fn test_extract_tags_from_query_all() { - assert_eq!(extract_tags_from_query_helper("*"), None); + assert_eq!(extract_tags_from_query_helper("*", None), None); + assert_eq!(extract_tags_from_query_helper("*", Some(&["title"])), None); } #[test] fn test_extract_tags_from_query_range_query() { - assert_eq!(extract_tags_from_query_helper("title:>foo lang:fr"), None); + assert_eq!( + extract_tags_from_query_helper("title:>foo lang:fr", None), + None + ); + assert_eq!( + extract_tags_from_query_helper("title:>foo lang:fr", Some(&["title"])), + None + ); + assert_eq!( + extract_tags_from_query_helper("title:>foo lang:fr", Some(&[])), + None + ); } #[test] fn test_extract_tags_from_query_range_query_conjunction() { assert_eq!( - &extract_tags_from_query_helper("title:>foo AND lang:fr") + &extract_tags_from_query_helper("title:>foo AND lang:fr", None) .unwrap() .to_string(), "(¬lang! 
∨ lang:fr)" ); - } - - #[test] - fn test_extract_tags_from_query_mixed_disjunction() -> anyhow::Result<()> { assert_eq!( - &extract_tags_from_query_helper("title:foo user:bart lang:fr") + extract_tags_from_query_helper("title:>foo AND lang:fr", Some(&["title"])), + None + ); + assert_eq!( + extract_tags_from_query_helper("title:>foo AND lang:fr", Some(&["lang"])) .unwrap() .to_string(), - "((¬title! ∨ title:foo) ∨ (¬user! ∨ user:bart) ∨ (¬lang! ∨ lang:fr))" + "(¬lang! ∨ lang:fr)" + ); + assert_eq!( + extract_tags_from_query_helper("title:>foo AND lang:fr", Some(&[])), + None ); - Ok(()) } #[test] fn test_extract_tags_from_query_and_or() -> anyhow::Result<()> { assert_eq!( - &extract_tags_from_query_helper("title:foo AND (user:bart OR lang:fr)") + &extract_tags_from_query_helper("title:foo AND (user:bart OR lang:fr)", None) .unwrap() .to_string(), "(¬title! ∨ title:foo) ∧ ((¬user! ∨ user:bart) ∨ (¬lang! ∨ lang:fr))" ); + // Non-tag fields in the OR branch make it uninformative; it is then dropped from the AND + assert_eq!( + &extract_tags_from_query_helper( + "title:foo AND (user:bart OR lang:fr)", + Some(&["title", "user"]) + ) + .unwrap() + .to_string(), + "(¬title! ∨ title:foo)" + ); + assert_eq!( + &extract_tags_from_query_helper( + "title:foo AND (user:bart OR lang:fr)", + Some(&["title", "user", "lang"]) + ) + .unwrap() + .to_string(), + "(¬title! ∨ title:foo) ∧ ((¬user! ∨ user:bart) ∨ (¬lang! ∨ lang:fr))" + ); Ok(()) } #[test] fn test_conjunction_of_tags() { assert_eq!( - &extract_tags_from_query_helper("(user:bart AND lang:fr)") + &extract_tags_from_query_helper("(user:bart AND lang:fr)", None) + .unwrap() + .to_string(), + "(¬user! ∨ user:bart) ∧ (¬lang! ∨ lang:fr)" + ); + // Non-tag field is dropped from AND, leaving only the tag field + assert_eq!( + &extract_tags_from_query_helper("(user:bart AND lang:fr)", Some(&["user"])) + .unwrap() + .to_string(), + "(¬user! 
∨ user:bart)" + ); + assert_eq!( + &extract_tags_from_query_helper("(user:bart AND lang:fr)", Some(&["user", "lang"])) .unwrap() .to_string(), "(¬user! ∨ user:bart) ∧ (¬lang! ∨ lang:fr)" @@ -457,7 +560,18 @@ mod test { #[test] fn test_disjunction_of_tags() { assert_eq!( - &extract_tags_from_query_helper("(user:bart OR lang:fr)") + &extract_tags_from_query_helper("(user:bart OR lang:fr)", None) + .unwrap() + .to_string(), + "((¬user! ∨ user:bart) ∨ (¬lang! ∨ lang:fr))" + ); + // A non-tag field in an OR branch makes the whole disjunction uninformative + assert_eq!( + extract_tags_from_query_helper("(user:bart OR lang:fr)", Some(&["user"])), + None + ); + assert_eq!( + &extract_tags_from_query_helper("(user:bart OR lang:fr)", Some(&["user", "lang"])) .unwrap() .to_string(), "((¬user! ∨ user:bart) ∨ (¬lang! ∨ lang:fr))" @@ -467,28 +581,55 @@ mod test { #[test] fn test_disjunction_of_tag_disjunction_with_not_clause() { // ORed negative tags make the result inconclusive. See simplify_ast() for details - assert!(extract_tags_from_query_helper("(user:bart -lang:fr)").is_none()); + assert!(extract_tags_from_query_helper("(user:bart OR -lang:fr)", None).is_none()); + assert!( + extract_tags_from_query_helper("(user:bart OR -lang:fr)", Some(&["user"])).is_none() + ); } #[test] fn test_disjunction_of_tag_conjunction_with_not_clause() { // negative tags are removed from AND clauses. See simplify_ast() for details assert_eq!( - &extract_tags_from_query_helper("user:bart AND NOT lang:fr") + &extract_tags_from_query_helper("user:bart AND NOT lang:fr", None) + .unwrap() + .to_string(), + "(¬user! ∨ user:bart)" + ); + // user is a tag field: NOT lang was already dropped, result is the same + assert_eq!( + &extract_tags_from_query_helper("user:bart AND NOT lang:fr", Some(&["user"])) .unwrap() .to_string(), "(¬user! 
∨ user:bart)" ); + // only lang is a tag field: user becomes uninformative, NOT lang is also dropped + assert_eq!( + extract_tags_from_query_helper("user:bart AND NOT lang:fr", Some(&["lang"])), + None + ); } #[test] fn test_disjunction_of_tag_must_should() { assert_eq!( - &extract_tags_from_query_helper("(+user:bart lang:fr)") + &extract_tags_from_query_helper("(+user:bart lang:fr)", None) + .unwrap() + .to_string(), + "(¬user! ∨ user:bart)" + ); + // Should clauses are dropped when a Must is present; user is a tag field: same result + assert_eq!( + &extract_tags_from_query_helper("(+user:bart lang:fr)", Some(&["user"])) .unwrap() .to_string(), "(¬user! ∨ user:bart)" ); + // user is not a tag field: the Must clause becomes uninformative + assert_eq!( + extract_tags_from_query_helper("(+user:bart lang:fr)", Some(&["lang"])), + None + ); } #[test] diff --git a/quickwit/quickwit-indexing/failpoints/mod.rs b/quickwit/quickwit-indexing/failpoints/mod.rs index d8c5ab0e418..d7f0f0c1cef 100644 --- a/quickwit/quickwit-indexing/failpoints/mod.rs +++ b/quickwit/quickwit-indexing/failpoints/mod.rs @@ -293,7 +293,7 @@ async fn test_merge_executor_controlled_directory_kill_switch() -> anyhow::Resul tantivy_dirs, }; let pipeline_id = MergePipelineId { - node_id: NodeId::from("test-node"), + node_id: NodeId::from_str("test-node"), index_uid: IndexUid::new_with_random_ulid(index_id), source_id: "test-source".to_string(), }; diff --git a/quickwit/quickwit-indexing/src/actors/indexer.rs b/quickwit/quickwit-indexing/src/actors/indexer.rs index 64a08d3f5da..4e46a03497f 100644 --- a/quickwit/quickwit-indexing/src/actors/indexer.rs +++ b/quickwit/quickwit-indexing/src/actors/indexer.rs @@ -44,9 +44,10 @@ use tantivy::schema::{Field, Schema, Value}; use tantivy::store::{Compressor, ZstdCompressor}; use tantivy::tokenizer::TokenizerManager; use tantivy::{DateTime, IndexBuilder, IndexSettings}; +use time::OffsetDateTime; use tokio::runtime::Handle; use tokio::sync::Semaphore; -use 
tracing::{Span, info, info_span, warn}; +use tracing::{Span, error, info, info_span, warn}; use ulid::Ulid; use crate::actors::IndexSerializer; @@ -99,6 +100,8 @@ struct IndexerState { max_num_partitions: NonZeroU32, index_settings: IndexSettings, cooperative_indexing_opt: Option, + indexation_time_field_opt: Option, + is_delete_task_service_disabled: bool, } impl IndexerState { @@ -204,14 +207,28 @@ impl IndexerState { let last_delete_opstamp_request = LastDeleteOpstampRequest { index_uid: Some(self.pipeline_id.index_uid.clone()), }; - let last_delete_opstamp_response = ctx - .protect_future( + let last_delete_opstamp = if self.is_delete_task_service_disabled { + // If the delete task service is disabled, the opstamp is supposed + // to be 0 anyway. If we were to re-enable it, that should be done + // on the indexers first, then the janitor. + 0 + } else { + ctx.protect_future( self.metastore .clone() .last_delete_opstamp(last_delete_opstamp_request), ) - .await?; - let last_delete_opstamp = last_delete_opstamp_response.last_delete_opstamp; + .await + .inspect_err(|error| { + error!( + %error, + index_id=%self.pipeline_id.index_uid.index_id, + source_id=%self.pipeline_id.source_id, + "failed to fetch last delete opstamp from the metastore" + ); + })? 
+ .last_delete_opstamp + }; let checkpoint_delta = IndexCheckpointDelta { source_id: self.pipeline_id.source_id.clone(), @@ -300,7 +317,15 @@ impl IndexerState { .context("batch delta does not follow indexer checkpoint")?; let mut memory_usage_delta: i64 = 0; counters.num_doc_batches_in_workbench += 1; - for doc in batch.docs { + let indexation_time_opt = self + .indexation_time_field_opt + .map(|_| DateTime::from_utc(OffsetDateTime::now_utc())); + for mut doc in batch.docs { + if let (Some(indexation_time), Some(indexation_time_field)) = + (indexation_time_opt, self.indexation_time_field_opt) + { + doc.doc.add_date(indexation_time_field, indexation_time); + } let ProcessedDoc { doc, timestamp_opt, @@ -562,6 +587,7 @@ impl Handler for Indexer { } impl Indexer { + #[allow(clippy::too_many_arguments)] pub fn new( pipeline_id: IndexingPipelineId, doc_mapper: Arc, @@ -570,6 +596,7 @@ impl Indexer { indexing_settings: IndexingSettings, cooperative_indexing_permits_opt: Option>, index_serializer_mailbox: Mailbox, + is_delete_task_service_disabled: bool, ) -> Self { let schema = doc_mapper.schema(); let tokenizer_manager = doc_mapper.tokenizer_manager().clone(); @@ -589,6 +616,17 @@ impl Indexer { cooperative_indexing_permits, ) }); + let indexation_time_field_opt = + doc_mapper + .indexation_time_field_name() + .and_then(|name| match schema.get_field(name) { + Ok(field) => Some(field), + Err(_) => { + warn!("failed to find indexation time field '{}' in schema", name); + None + } + }); + Self { indexer_state: IndexerState { pipeline_id, @@ -604,6 +642,8 @@ impl Indexer { index_settings, max_num_partitions: doc_mapper.max_num_partitions(), cooperative_indexing_opt, + indexation_time_field_opt, + is_delete_task_service_disabled, }, index_serializer_mailbox, indexing_workbench_opt: None, @@ -743,7 +783,7 @@ mod tests { EmptyResponse, LastDeleteOpstampResponse, MockMetastoreService, }; use quickwit_proto::types::{IndexUid, NodeId, PipelineUid}; - use tantivy::{DateTime, doc}; 
+ use tantivy::{DateTime, DocAddress, ReloadPolicy, TantivyDocument, doc}; use super::*; use crate::actors::indexer::{IndexerCounters, record_timestamp}; @@ -783,7 +823,7 @@ mod tests { let pipeline_id = IndexingPipelineId { index_uid: index_uid.clone(), source_id: "test-source".to_string(), - node_id: NodeId::from("test-node"), + node_id: NodeId::from_str("test-node"), pipeline_uid: PipelineUid::default(), }; let doc_mapper = Arc::new(default_doc_mapper_for_test()); @@ -814,6 +854,7 @@ mod tests { indexing_settings, None, index_serializer_mailbox, + false, ); let (indexer_mailbox, indexer_handle) = universe.spawn_builder().spawn(indexer); indexer_mailbox @@ -913,6 +954,64 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_indexer_delete_task_service_disabled() -> anyhow::Result<()> { + let pipeline_id = IndexingPipelineId { + index_uid: IndexUid::new_with_random_ulid("test-index"), + source_id: "test-source".to_string(), + node_id: NodeId::from_str("test-node"), + pipeline_uid: PipelineUid::default(), + }; + let doc_mapper = Arc::new(default_doc_mapper_for_test()); + let schema = doc_mapper.schema(); + let body_field = schema.get_field("body").unwrap(); + let timestamp_field = schema.get_field("timestamp").unwrap(); + let indexing_directory = TempDirectory::for_test(); + let indexing_settings = IndexingSettings::for_test(); + let universe = Universe::with_accelerated_time(); + let (index_serializer_mailbox, index_serializer_inbox) = universe.create_test_mailbox(); + let mut mock_metastore = MockMetastoreService::new(); + // last_delete_opstamp must never be called when delete task service is disabled. 
+ mock_metastore.expect_last_delete_opstamp().never(); + let indexer = Indexer::new( + pipeline_id, + doc_mapper, + MetastoreServiceClient::from_mock(mock_metastore), + indexing_directory, + indexing_settings, + None, + index_serializer_mailbox, + true, // is_delete_task_service_disabled + ); + let (indexer_mailbox, indexer_handle) = universe.spawn_builder().spawn(indexer); + indexer_mailbox + .send_message(ProcessedDocBatch::new( + vec![ProcessedDoc { + doc: doc!( + body_field=>"this is a test document", + timestamp_field=>DateTime::from_timestamp_secs(1_662_529_435) + ), + timestamp_opt: Some(DateTime::from_timestamp_secs(1_662_529_435)), + partition: 1, + num_bytes: 30, + }], + SourceCheckpointDelta::from_range(0..1), + false, + )) + .await?; + universe.send_exit_with_success(&indexer_mailbox).await?; + let (exit_status, _indexer_counters) = indexer_handle.join().await; + assert!(exit_status.is_success()); + let output_messages: Vec = + index_serializer_inbox.drain_for_test_typed(); + assert_eq!(output_messages.len(), 1); + assert_eq!(output_messages[0].splits[0].split_attrs.num_docs, 1); + // delete_opstamp must be 0 when delete task service is disabled. 
+ assert_eq!(output_messages[0].splits[0].split_attrs.delete_opstamp, 0); + universe.assert_quit().await; + Ok(()) + } + #[tokio::test] async fn test_indexer_triggers_commit_on_memory_limit() -> anyhow::Result<()> { let universe = Universe::new(); @@ -920,7 +1019,7 @@ mod tests { let pipeline_id = IndexingPipelineId { index_uid: index_uid.clone(), source_id: "test-source".to_string(), - node_id: NodeId::from("test-node"), + node_id: NodeId::from_str("test-node"), pipeline_uid: PipelineUid::default(), }; let doc_mapper = Arc::new(default_doc_mapper_for_test()); @@ -949,6 +1048,7 @@ mod tests { indexing_settings, None, index_serializer_mailbox, + false, ); let (indexer_mailbox, _indexer_handle) = universe.spawn_builder().spawn(indexer); @@ -997,7 +1097,7 @@ mod tests { let pipeline_id = IndexingPipelineId { index_uid: IndexUid::new_with_random_ulid("test-index"), source_id: "test-source".to_string(), - node_id: NodeId::from("test-node"), + node_id: NodeId::from_str("test-node"), pipeline_uid: PipelineUid::default(), }; let doc_mapper = Arc::new(default_doc_mapper_for_test()); @@ -1025,6 +1125,7 @@ mod tests { indexing_settings, None, index_serializer_mailbox, + false, ); let (indexer_mailbox, indexer_handle) = universe.spawn_builder().spawn(indexer); tokio::task::spawn({ @@ -1081,7 +1182,7 @@ mod tests { let pipeline_id = IndexingPipelineId { index_uid: IndexUid::new_with_random_ulid("test-index"), source_id: "test-source".to_string(), - node_id: NodeId::from("test-node"), + node_id: NodeId::from_str("test-node"), pipeline_uid: PipelineUid::default(), }; let doc_mapper = Arc::new(default_doc_mapper_for_test()); @@ -1107,6 +1208,7 @@ mod tests { indexing_settings, Some(Arc::new(Semaphore::new(1))), index_serializer_mailbox, + false, ); let (indexer_mailbox, indexer_handle) = universe.spawn_builder().spawn(indexer); indexer_mailbox @@ -1169,7 +1271,7 @@ mod tests { let pipeline_id = IndexingPipelineId { index_uid: IndexUid::new_with_random_ulid("test-index"), 
source_id: "test-source".to_string(), - node_id: NodeId::from("test-node"), + node_id: NodeId::from_str("test-node"), pipeline_uid: PipelineUid::default(), }; let doc_mapper = Arc::new(default_doc_mapper_for_test()); @@ -1194,6 +1296,7 @@ mod tests { indexing_settings, None, index_serializer_mailbox, + false, ); let (indexer_mailbox, indexer_handle) = universe.spawn_builder().spawn(indexer); indexer_mailbox @@ -1249,7 +1352,7 @@ mod tests { let pipeline_id = IndexingPipelineId { index_uid: IndexUid::new_with_random_ulid("test-index"), source_id: "test-source".to_string(), - node_id: NodeId::from("test-node"), + node_id: NodeId::from_str("test-node"), pipeline_uid: PipelineUid::default(), }; let doc_mapper: Arc = @@ -1277,6 +1380,7 @@ mod tests { indexing_settings, None, index_serializer_mailbox, + false, ); let (indexer_mailbox, indexer_handle) = universe.spawn_builder().spawn(indexer); indexer_mailbox @@ -1349,7 +1453,7 @@ mod tests { let pipeline_id = IndexingPipelineId { index_uid: IndexUid::new_with_random_ulid("test-index"), source_id: "test-source".to_string(), - node_id: NodeId::from("test-node"), + node_id: NodeId::from_str("test-node"), pipeline_uid: PipelineUid::default(), }; let doc_mapper: Arc = @@ -1373,6 +1477,7 @@ mod tests { indexing_settings, None, index_serializer_mailbox, + false, ); let (indexer_mailbox, indexer_handle) = universe.spawn_builder().spawn(indexer); @@ -1420,7 +1525,7 @@ mod tests { let pipeline_id = IndexingPipelineId { index_uid: IndexUid::new_with_random_ulid("test-index"), source_id: "test-source".to_string(), - node_id: NodeId::from("test-node"), + node_id: NodeId::from_str("test-node"), pipeline_uid: PipelineUid::default(), }; let doc_mapper: Arc = @@ -1444,6 +1549,7 @@ mod tests { indexing_settings, None, index_serializer_mailbox, + false, ); let (indexer_mailbox, indexer_handle) = universe.spawn_builder().spawn(indexer); @@ -1492,7 +1598,7 @@ mod tests { let pipeline_id = IndexingPipelineId { index_uid: 
IndexUid::new_with_random_ulid("test-index"), source_id: "test-source".to_string(), - node_id: NodeId::from("test-node"), + node_id: NodeId::from_str("test-node"), pipeline_uid: PipelineUid::default(), }; let doc_mapper: Arc = @@ -1516,6 +1622,7 @@ mod tests { indexing_settings, None, index_serializer_mailbox, + false, ); let (indexer_mailbox, indexer_handle) = universe.spawn_builder().spawn(indexer); @@ -1557,7 +1664,7 @@ mod tests { let pipeline_id = IndexingPipelineId { index_uid: IndexUid::new_with_random_ulid("test-index"), source_id: "test-source".to_string(), - node_id: NodeId::from("test-node"), + node_id: NodeId::from_str("test-node"), pipeline_uid: PipelineUid::default(), }; let doc_mapper: Arc = @@ -1580,6 +1687,7 @@ mod tests { indexing_settings, None, index_serializer_mailbox, + false, ); let (indexer_mailbox, indexer_handle) = universe.spawn_builder().spawn(indexer); indexer_mailbox @@ -1618,7 +1726,7 @@ mod tests { let pipeline_id = IndexingPipelineId { index_uid: IndexUid::new_with_random_ulid("test-index"), source_id: "test-source".to_string(), - node_id: NodeId::from("test-node"), + node_id: NodeId::from_str("test-node"), pipeline_uid: PipelineUid::default(), }; let doc_mapper = Arc::new(default_doc_mapper_for_test()); @@ -1648,6 +1756,7 @@ mod tests { indexing_settings, None, index_serializer_mailbox, + false, ); let (indexer_mailbox, indexer_handle) = universe.spawn_builder().spawn(indexer); indexer_mailbox @@ -1736,7 +1845,7 @@ mod tests { let pipeline_id = IndexingPipelineId { index_uid: index_uid.clone(), source_id: "test-source".to_string(), - node_id: NodeId::from("test-node"), + node_id: NodeId::from_str("test-node"), pipeline_uid: PipelineUid::default(), }; let doc_mapper = Arc::new(doc_mapper_with_secondary_time()); @@ -1768,6 +1877,7 @@ mod tests { indexing_settings, None, index_serializer_mailbox, + false, ); let (indexer_mailbox, indexer_handle) = universe.spawn_builder().spawn(indexer); indexer_mailbox @@ -1851,4 +1961,162 @@ mod 
tests { universe.assert_quit().await; Ok(()) } + + fn doc_mapper_with_indexation_time() -> DocMapper { + const JSON_CONFIG_VALUE: &str = r#" + { + "store_source": true, + "index_field_presence": true, + "default_search_fields": ["body"], + "timestamp_field": "timestamp", + "indexation_time_field": "indexed_at", + "field_mappings": [ + { + "name": "timestamp", + "type": "datetime", + "output_format": "unix_timestamp_secs", + "fast": true + }, + { + "name": "body", + "type": "text", + "stored": true + }, + { + "name": "indexed_at", + "type": "datetime", + "output_format": "unix_timestamp_secs", + "fast": true, + "stored": true + } + ] + }"#; + serde_json::from_str::(JSON_CONFIG_VALUE).unwrap() + } + + #[tokio::test] + async fn test_indexer_sets_indexation_time() -> anyhow::Result<()> { + let index_uid = IndexUid::new_with_random_ulid("test-index"); + let pipeline_id = IndexingPipelineId { + index_uid: index_uid.clone(), + source_id: "test-source".to_string(), + node_id: NodeId::from_str("test-node"), + pipeline_uid: PipelineUid::default(), + }; + let doc_mapper = Arc::new(doc_mapper_with_indexation_time()); + let last_delete_opstamp = 10; + let schema = doc_mapper.schema(); + let body_field = schema.get_field("body").unwrap(); + let timestamp_field = schema.get_field("timestamp").unwrap(); + let indexed_at_field = schema.get_field("indexed_at").unwrap(); + let indexing_directory = TempDirectory::for_test(); + let mut indexing_settings = IndexingSettings::for_test(); + indexing_settings.split_num_docs_target = 3; + let universe = Universe::with_accelerated_time(); + let (index_serializer_mailbox, index_serializer_inbox) = universe.create_test_mailbox(); + let mut mock_metastore = MockMetastoreService::new(); + mock_metastore + .expect_last_delete_opstamp() + .times(1) + .returning(move |delete_opstamp_request| { + assert_eq!(delete_opstamp_request.index_uid(), &index_uid); + Ok(LastDeleteOpstampResponse::new(last_delete_opstamp)) + }); + 
mock_metastore.expect_publish_splits().never(); + let indexer = Indexer::new( + pipeline_id, + doc_mapper, + MetastoreServiceClient::from_mock(mock_metastore), + indexing_directory, + indexing_settings, + None, + index_serializer_mailbox, + false, + ); + let (indexer_mailbox, indexer_handle) = universe.spawn_builder().spawn(indexer); + + // Send 3 docs in a single batch so they all share the same indexation timestamp + // (the timestamp is sampled once per batch in `index_batch`). + indexer_mailbox + .send_message(ProcessedDocBatch::new( + vec![ + ProcessedDoc { + doc: doc!( + body_field => "document 1", + timestamp_field => DateTime::from_timestamp_secs(1_662_000_001), + ), + timestamp_opt: Some(DateTime::from_timestamp_secs(1_662_000_001)), + partition: 1, + num_bytes: 30, + }, + ProcessedDoc { + doc: doc!( + body_field => "document 2", + timestamp_field => DateTime::from_timestamp_secs(1_662_000_002), + ), + timestamp_opt: Some(DateTime::from_timestamp_secs(1_662_000_002)), + partition: 1, + num_bytes: 30, + }, + ProcessedDoc { + doc: doc!( + body_field => "document 3", + timestamp_field => DateTime::from_timestamp_secs(1_662_000_003), + ), + timestamp_opt: Some(DateTime::from_timestamp_secs(1_662_000_003)), + partition: 1, + num_bytes: 30, + }, + ], + SourceCheckpointDelta::from_range(0..3), + false, + )) + .await?; + + indexer_handle.process_pending_and_observe().await; + let messages: Vec = index_serializer_inbox.drain_for_test_typed(); + assert_eq!(messages.len(), 1); + let batch = messages.into_iter().next().unwrap(); + assert_eq!(batch.commit_trigger, CommitTrigger::NumDocsLimit); + assert_eq!(batch.splits.len(), 1); + assert_eq!(batch.splits[0].split_attrs.num_docs, 3); + + // Finalize the split and open the tantivy index to verify the `indexed_at` field. 
+ let indexed_split = batch.splits.into_iter().next().unwrap().finalize()?; + let reader = indexed_split + .index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + + // Collect every `indexed_at` value present in the split. + let mut indexed_at_values: Vec = Vec::new(); + for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() { + for doc_id in 0..segment_reader.max_doc() { + let doc_address = DocAddress::new(segment_ord as u32, doc_id); + let doc: TantivyDocument = searcher.doc(doc_address)?; + let indexed_at = doc + .get_first(indexed_at_field) + .and_then(|val| val.as_datetime()) + .expect("indexed_at field must be set on every indexed document"); + indexed_at_values.push(indexed_at); + } + } + + // All 3 documents must have been stamped with the indexation time. + assert_eq!(indexed_at_values.len(), 3); + // Because the timestamp is captured once for the whole batch, every document + // in the batch must carry exactly the same `indexed_at` value. 
+ let first = indexed_at_values[0]; + for val in &indexed_at_values { + assert_eq!( + *val, first, + "all documents in the same batch must share the same indexed_at timestamp" + ); + } + + universe.assert_quit().await; + Ok(()) + } } diff --git a/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs b/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs index 99065651db1..88baa52d179 100644 --- a/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs +++ b/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs @@ -159,7 +159,9 @@ impl IndexingPipeline { let indexing_pipelines_gauge = crate::metrics::INDEXER_METRICS .indexing_pipelines .with_label_values([&params.pipeline_id.index_uid.index_id]); - let indexing_pipelines_gauge_guard = OwnedGaugeGuard::from_gauge(indexing_pipelines_gauge); + let mut indexing_pipelines_gauge_guard = + OwnedGaugeGuard::from_gauge(indexing_pipelines_gauge); + indexing_pipelines_gauge_guard.add(1); let params_fingerprint = params.params_fingerprint; IndexingPipeline { params, @@ -409,6 +411,7 @@ impl IndexingPipeline { self.params.indexing_settings.clone(), self.params.cooperative_indexing_permits.clone(), index_serializer_mailbox, + self.params.is_delete_task_service_disabled, ); let (indexer_mailbox, indexer_handle) = ctx .spawn_actor() @@ -600,6 +603,7 @@ pub struct IndexingPipelineParams { pub params_fingerprint: u64, pub event_broker: EventBroker, + pub is_delete_task_service_disabled: bool, } #[cfg(test)] @@ -639,7 +643,7 @@ mod tests { mut num_fails: usize, test_file: &str, ) -> anyhow::Result<()> { - let node_id = NodeId::from("test-node"); + let node_id = NodeId::from_str("test-node"); let index_uid = IndexUid::for_test("test-index", 2); let pipeline_id = IndexingPipelineId { node_id, @@ -729,6 +733,7 @@ mod tests { merge_planner_mailbox, event_broker: EventBroker::default(), params_fingerprint: 42u64, + is_delete_task_service_disabled: false, }; let pipeline = IndexingPipeline::new(pipeline_params); let
(_pipeline_mailbox, pipeline_handle) = universe.spawn_builder().spawn(pipeline); @@ -769,8 +774,11 @@ mod tests { test_indexing_pipeline_num_fails_before_success(1, "data/test_corpus.json.gz").await } - async fn indexing_pipeline_simple(test_file: &str) -> anyhow::Result<()> { - let node_id = NodeId::from("test-node"); + async fn indexing_pipeline_simple( + test_file: &str, + is_delete_task_service_disabled: bool, + ) -> anyhow::Result<()> { + let node_id = NodeId::from_str("test-node"); let index_uid: IndexUid = IndexUid::for_test("test-index", 1); let pipeline_id = IndexingPipelineId { node_id, @@ -803,10 +811,16 @@ mod tests { Ok(IndexMetadataResponse::try_from_index_metadata(&index_metadata).unwrap()) }); let index_uid_clone = index_uid.clone(); - mock_metastore - .expect_last_delete_opstamp() - .withf(move |last_delete_opstamp| last_delete_opstamp.index_uid() == &index_uid_clone) - .returning(move |_| Ok(LastDeleteOpstampResponse::new(10))); + if is_delete_task_service_disabled { + mock_metastore.expect_last_delete_opstamp().never(); + } else { + mock_metastore + .expect_last_delete_opstamp() + .withf(move |last_delete_opstamp| { + last_delete_opstamp.index_uid() == &index_uid_clone + }) + .returning(move |_| Ok(LastDeleteOpstampResponse::new(10))); + } let index_uid_clone = index_uid.clone(); mock_metastore .expect_stage_splits() @@ -853,6 +867,7 @@ mod tests { merge_planner_mailbox, event_broker: Default::default(), params_fingerprint: 42u64, + is_delete_task_service_disabled, }; let pipeline = IndexingPipeline::new(pipeline_params); let (_pipeline_mailbox, pipeline_handler) = universe.spawn_builder().spawn(pipeline); @@ -867,17 +882,22 @@ mod tests { #[tokio::test] async fn test_indexing_pipeline_simple() -> anyhow::Result<()> { - indexing_pipeline_simple("data/test_corpus.json").await + indexing_pipeline_simple("data/test_corpus.json", false).await + } + + #[tokio::test] + async fn test_indexing_pipeline_without_delete_task_service() -> anyhow::Result<()> 
{ + indexing_pipeline_simple("data/test_corpus.json", true).await } #[tokio::test] async fn test_indexing_pipeline_simple_gz() -> anyhow::Result<()> { - indexing_pipeline_simple("data/test_corpus.json.gz").await + indexing_pipeline_simple("data/test_corpus.json.gz", false).await } #[tokio::test] async fn test_merge_pipeline_does_not_stop_on_indexing_pipeline_failure() { - let node_id = NodeId::from("test-node"); + let node_id = NodeId::from_str("test-node"); let pipeline_id = IndexingPipelineId { node_id, index_uid: IndexUid::new_with_random_ulid("test-index"), @@ -954,6 +974,7 @@ mod tests { merge_planner_mailbox: merge_planner_mailbox.clone(), event_broker: Default::default(), params_fingerprint: 42u64, + is_delete_task_service_disabled: false, }; let indexing_pipeline = IndexingPipeline::new(indexing_pipeline_params); let (_indexing_pipeline_mailbox, indexing_pipeline_handler) = @@ -982,7 +1003,7 @@ mod tests { } async fn indexing_pipeline_all_failures_handling(test_file: &str) -> anyhow::Result<()> { - let node_id = NodeId::from("test-node"); + let node_id = NodeId::from_str("test-node"); let index_uid: IndexUid = IndexUid::for_test("test-index", 2); let pipeline_id = IndexingPipelineId { node_id, @@ -1082,6 +1103,7 @@ mod tests { merge_planner_mailbox, params_fingerprint: 42u64, event_broker: Default::default(), + is_delete_task_service_disabled: false, }; let pipeline = IndexingPipeline::new(pipeline_params); let (_pipeline_mailbox, pipeline_handler) = universe.spawn_builder().spawn(pipeline); diff --git a/quickwit/quickwit-indexing/src/actors/indexing_service.rs b/quickwit/quickwit-indexing/src/actors/indexing_service.rs index afd2637c02c..f4ea45c9a50 100644 --- a/quickwit/quickwit-indexing/src/actors/indexing_service.rs +++ b/quickwit/quickwit-indexing/src/actors/indexing_service.rs @@ -16,6 +16,7 @@ use std::collections::{HashMap, HashSet}; use std::fmt::{Debug, Formatter}; use std::path::PathBuf; use std::sync::Arc; +use std::time::{Duration, Instant}; 
use anyhow::Context; use async_trait::async_trait; @@ -115,6 +116,7 @@ pub struct IndexingService { cooperative_indexing_permits: Option>, merge_io_throughput_limiter_opt: Option, event_broker: EventBroker, + is_delete_task_service_disabled: bool, } impl Debug for IndexingService { @@ -142,6 +144,7 @@ impl IndexingService { ingester_pool: IngesterPool, storage_resolver: StorageResolver, event_broker: EventBroker, + is_delete_task_service_disabled: bool, ) -> anyhow::Result { let split_store_space_quota = SplitStoreQuota::try_new( indexer_config.split_store_max_num_splits, @@ -178,6 +181,7 @@ impl IndexingService { merge_io_throughput_limiter_opt, cooperative_indexing_permits, event_broker, + is_delete_task_service_disabled, }) } @@ -379,6 +383,7 @@ impl IndexingService { params_fingerprint, event_broker: self.event_broker.clone(), + is_delete_task_service_disabled: self.is_delete_task_service_disabled, }; let pipeline = IndexingPipeline::new(pipeline_params); let (pipeline_mailbox, pipeline_handle) = ctx.spawn_actor().spawn(pipeline); @@ -402,7 +407,10 @@ impl IndexingService { let index_metadata_response = self .metastore .index_metadata(IndexMetadataRequest::for_index_id(index_id.to_string())) - .await?; + .await + .inspect_err(|error| { + error!(%error, index_id, "failed to fetch index metadata from the metastore"); + })?; let index_metadata = index_metadata_response.deserialize_index_metadata()?; Ok(index_metadata) } @@ -429,7 +437,10 @@ impl IndexingService { let indexes_metadata_response = self .metastore .indexes_metadata(indexes_metadata_request) - .await?; + .await + .inspect_err(|error| { + error!(%error, "failed to fetch indexes metadata from the metastore"); + })?; let indexes_metadata = indexes_metadata_response .deserialize_indexes_metadata() .await?; @@ -468,7 +479,10 @@ impl IndexingService { let mut immature_splits_stream = ctx .protect_future(self.metastore.list_splits(list_splits_request)) - .await?; + .await + .inspect_err(|error| { + 
error!(%error, "failed to list immature splits from the metastore"); + })?; let mut per_merge_pipeline_immature_splits: HashMap> = indexing_pipeline_ids @@ -478,7 +492,14 @@ impl IndexingService { let mut num_immature_splits = 0usize; - while let Some(list_splits_response) = immature_splits_stream.try_next().await? { + while let Some(list_splits_response) = + immature_splits_stream + .try_next() + .await + .inspect_err(|error| { + error!(%error, "failed to fetch a batch of immature splits from the metastore"); + })? + { for split_metadata in list_splits_response.deserialize_splits_metadata().await? { num_immature_splits += 1; @@ -847,7 +868,10 @@ impl IndexingService { let indexes_metadata = self .metastore .list_indexes_metadata(ListIndexesMetadataRequest::all()) - .await? + .await + .inspect_err(|error| { + error!(%error, "failed to list indexes metadata from the metastore"); + })? .deserialize_indexes_metadata() .await?; let index_ids: HashSet = indexes_metadata @@ -895,6 +919,7 @@ impl Handler for IndexingService { msg: ObservePipeline, _ctx: &ActorContext, ) -> Result { + let _slow_handler_guard = SlowHandlerGuard::new("observe_pipeline"); let pipeline_uid = msg.pipeline_id.pipeline_uid; let observation = self.observe_pipeline(&pipeline_uid).await; Ok(observation) @@ -910,6 +935,7 @@ impl Handler for IndexingService { msg: DetachIndexingPipeline, _ctx: &ActorContext, ) -> Result { + let _slow_handler_guard = SlowHandlerGuard::new("detach_indexing_pipeline"); let pipeline_uid = msg.pipeline_id.pipeline_uid; let detach_pipeline_result = self.detach_indexing_pipeline(&pipeline_uid).await; Ok(detach_pipeline_result) @@ -925,6 +951,7 @@ impl Handler for IndexingService { msg: DetachMergePipeline, _ctx: &ActorContext, ) -> Result { + let _slow_handler_guard = SlowHandlerGuard::new("detach_merge_pipeline"); Ok(self.detach_merge_pipeline(&msg.pipeline_id).await) } } @@ -941,6 +968,7 @@ impl Handler for IndexingService { _message: SuperviseLoop, ctx: &ActorContext, ) 
-> Result<(), ActorExitStatus> { + let _slow_handler_guard = SlowHandlerGuard::new("supervise_loop"); self.handle_supervise().await?; ctx.schedule_self_msg(*quickwit_actors::HEARTBEAT, SuperviseLoop); Ok(()) @@ -969,6 +997,7 @@ impl Handler for IndexingService { message: SpawnPipeline, ctx: &ActorContext, ) -> Result, ActorExitStatus> { + let _slow_handler_guard = SlowHandlerGuard::new("spawn_pipeline"); Ok(self .spawn_pipeline( ctx, @@ -989,6 +1018,7 @@ impl Handler for IndexingService { plan_request: ApplyIndexingPlanRequest, ctx: &ActorContext, ) -> Result { + let _slow_handler_guard = SlowHandlerGuard::new("apply_indexing_plan"); Ok(self .apply_indexing_plan(&plan_request.indexing_tasks, ctx) .await @@ -1016,6 +1046,32 @@ struct IndexingPipelineDiff { pipelines_to_spawn: Vec, } +/// Logs a warning every 5 seconds until dropped. Useful to identify slow +/// handlers that might compromise liveness checks. +pub struct SlowHandlerGuard { + _cancel_tx: oneshot::Sender<()>, +} + +impl SlowHandlerGuard { + pub fn new(handler_name: &'static str) -> Self { + let (cancel_tx, mut cancel_rx) = oneshot::channel::<()>(); + let start = Instant::now(); + tokio::spawn(async move { + loop { + tokio::select! 
{ + _ = tokio::time::sleep(Duration::from_secs(5)) => { + warn!(handler=handler_name, elapsed_secs=start.elapsed().as_secs(), "slow indexing service handler"); + } + _ = &mut cancel_rx => { break; } + } + } + }); + Self { + _cancel_tx: cancel_tx, + } + } +} + #[cfg(test)] mod tests { use std::num::NonZeroUsize; @@ -1050,6 +1106,7 @@ mod tests { universe: &Universe, metastore: MetastoreServiceClient, cluster: Cluster, + is_delete_task_service_disabled: bool, ) -> (Mailbox, ActorHandle) { let indexer_config = IndexerConfig::for_test().unwrap(); let num_blocking_threads = 1; @@ -1061,7 +1118,7 @@ mod tests { .unwrap(); let merge_scheduler_mailbox: Mailbox = universe.get_or_spawn_one(); let indexing_server = IndexingService::new( - NodeId::from("test-node"), + NodeId::from_str("test-node"), data_dir_path.to_path_buf(), indexer_config, num_blocking_threads, @@ -1072,6 +1129,7 @@ mod tests { IngesterPool::default(), storage_resolver.clone(), EventBroker::default(), + is_delete_task_service_disabled, ) .await .unwrap(); @@ -1109,7 +1167,8 @@ mod tests { let universe = Universe::with_accelerated_time(); let temp_dir = tempfile::tempdir().unwrap(); let (indexing_service, indexing_service_handle) = - spawn_indexing_service_for_test(temp_dir.path(), &universe, metastore, cluster).await; + spawn_indexing_service_for_test(temp_dir.path(), &universe, metastore, cluster, false) + .await; let observation = indexing_service_handle.observe().await; assert_eq!(observation.num_running_pipelines, 0); assert_eq!(observation.num_failed_pipelines, 0); @@ -1215,7 +1274,8 @@ mod tests { let universe = Universe::new(); let temp_dir = tempfile::tempdir().unwrap(); let (indexing_service, indexing_server_handle) = - spawn_indexing_service_for_test(temp_dir.path(), &universe, metastore, cluster).await; + spawn_indexing_service_for_test(temp_dir.path(), &universe, metastore, cluster, false) + .await; indexing_service .ask_for_res(SpawnPipeline { @@ -1275,6 +1335,7 @@ mod tests { &universe, 
metastore.clone(), cluster.clone(), + false, ) .await; let metadata = metastore @@ -1574,7 +1635,7 @@ mod tests { .unwrap(); let merge_scheduler_service = universe.get_or_spawn_one(); let indexing_server = IndexingService::new( - NodeId::from("test-node"), + NodeId::from_str("test-node"), data_dir_path, indexer_config, num_blocking_threads, @@ -1585,6 +1646,7 @@ mod tests { IngesterPool::default(), storage_resolver.clone(), EventBroker::default(), + false, ) .await .unwrap(); @@ -1702,6 +1764,7 @@ mod tests { &universe, MetastoreServiceClient::from_mock(mock_metastore), cluster, + false, ) .await; let _pipeline_id = indexing_service @@ -1776,7 +1839,7 @@ mod tests { let storage_resolver = StorageResolver::unconfigured(); let merge_scheduler_service: Mailbox = universe.get_or_spawn_one(); let mut indexing_server = IndexingService::new( - NodeId::from("test-ingest-api-gc-node"), + NodeId::from_str("test-ingest-api-gc-node"), data_dir_path, indexer_config, num_blocking_threads, @@ -1787,6 +1850,7 @@ mod tests { IngesterPool::default(), storage_resolver.clone(), EventBroker::default(), + false, ) .await .unwrap(); @@ -1885,6 +1949,7 @@ mod tests { &universe, MetastoreServiceClient::from_mock(mock_metastore), cluster, + false, ) .await; diff --git a/quickwit/quickwit-indexing/src/actors/merge_executor.rs b/quickwit/quickwit-indexing/src/actors/merge_executor.rs index 6b753c7e13b..98557ffe48f 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_executor.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_executor.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::collections::BTreeSet; +use std::collections::{BTreeSet, HashMap}; use std::ops::RangeInclusive; use std::path::Path; use std::sync::Arc; @@ -40,20 +40,40 @@ use quickwit_query::query_ast::QueryAst; use tantivy::directory::{Advice, DirectoryClone, MmapDirectory, RamDirectory}; use tantivy::index::SegmentId; use tantivy::tokenizer::TokenizerManager; -use tantivy::{DateTime, Directory, Index, IndexMeta, IndexWriter, SegmentReader}; +use tantivy::{DateTime, Directory, DocId, Index, IndexMeta, IndexWriter, SegmentReader}; use tokio::runtime::Handle; use tracing::{debug, error, info, instrument, warn}; use crate::actors::Packager; use crate::controlled_directory::ControlledDirectory; use crate::merge_policy::MergeOperationType; -use crate::models::{IndexedSplit, IndexedSplitBatch, MergeScratch, PublishLock, SplitAttrs}; +use crate::models::{ + IndexedSplit, IndexedSplitBatch, MergeScratch, PublishLock, ReplacedSplit, SplitAttrs, +}; +use crate::soft_delete_query::SoftDeletedDocIdsQuery; + +/// The mapping resolution associated with the merge. To perform deletes, a full doc +/// mapper is required. For regular merges, we only need the tokenizer manager. +#[derive(Clone)] +enum MapperContext { + TokenizersOnly(quickwit_query::tokenizers::TokenizerManager), + DocMapper(Arc), +} + +impl MapperContext { + fn tokenizer_manager(&self) -> quickwit_query::tokenizers::TokenizerManager { + match self { + MapperContext::TokenizersOnly(tokenizer_manager) => tokenizer_manager.clone(), + MapperContext::DocMapper(doc_mapper) => doc_mapper.tokenizer_manager().clone(), + } + } +} #[derive(Clone)] pub struct MergeExecutor { pipeline_id: MergePipelineId, metastore: MetastoreServiceClient, - doc_mapper: Arc, + mapper_context: MapperContext, io_controls: IoControls, merge_packager_mailbox: Mailbox, } @@ -106,14 +126,16 @@ impl Handler for MergeExecutor { // A failure in a merge is a bit special. // // Instead of failing the pipeline, we just log it. 
- // The idea is to limit the risk associated with a potential split of death. + // The idea is to limit the risk associated with a potential split of + // death. // - // Such a split is now not tracked by the merge planner and won't undergo a - // merge until the merge pipeline is restarted. + // Such a split is now not tracked by the merge planner and won't + // undergo a merge until the merge pipeline + // is restarted. // - // With a merge policy that marks splits as mature after a day or so, this - // limits the noise associated to those failed - // merges. + // With a merge policy that marks splits as mature after a day or so, + // this limits the noise associated to those + // failed merges. error!(task=?merge_task, err=?err, "failed to merge splits"); return Ok(()); } @@ -171,21 +193,23 @@ fn combine_index_meta(mut index_metas: Vec) -> anyhow::Result>, Vec)>; + fn open_split_directories( // Directories containing the splits to merge tantivy_dirs: &[Box], tokenizer_manager: &TokenizerManager, -) -> anyhow::Result<(IndexMeta, Vec>)> { +) -> OpenSplitDirsResult { let mut directories: Vec> = Vec::new(); - let mut index_metas = Vec::new(); + let mut index_metas: Vec = Vec::new(); for tantivy_dir in tantivy_dirs { directories.push(tantivy_dir.clone()); - let index_meta = open_index(tantivy_dir.clone(), tokenizer_manager)?.load_metas()?; index_metas.push(index_meta); } + let per_split_metas = index_metas.clone(); let union_index_meta = combine_index_meta(index_metas)?; - Ok((union_index_meta, directories)) + Ok((union_index_meta, directories, per_split_metas)) } /// Creates a directory with a single `meta.json` file describe in `index_meta` @@ -278,11 +302,23 @@ pub fn merge_split_attrs( let partition_id = combine_partition_ids_aux(splits.iter().map(|split| split.partition_id)); let time_range: Option> = merge_time_range(splits); let secondary_time_range = merge_secondary_time_range_if_exists(splits); - let uncompressed_docs_size_in_bytes = 
sum_doc_sizes_in_bytes(splits); - let num_docs = sum_num_docs(splits); - let replaced_split_ids: Vec = splits + let total_soft_deleted: u64 = splits + .iter() + .map(|split| split.soft_deleted_doc_ids.len() as u64) + .sum(); + let raw_num_docs = sum_num_docs(splits); + let num_docs = raw_num_docs.saturating_sub(total_soft_deleted); + let uncompressed_docs_size_in_bytes = if raw_num_docs > 0 { + (sum_doc_sizes_in_bytes(splits) as f64 * num_docs as f64 / raw_num_docs as f64) as u64 + } else { + 0 + }; + let replaced_splits = splits .iter() - .map(|split| split.split_id().to_string()) + .map(|split| ReplacedSplit { + split_id: split.split_id.clone(), + soft_deleted_doc_ids: split.soft_deleted_doc_ids.clone(), + }) .collect(); let delete_opstamp = splits .iter() @@ -306,13 +342,13 @@ pub fn merge_split_attrs( doc_mapping_uid, split_id: merge_split_id, partition_id, - replaced_split_ids, time_range, secondary_time_range, num_docs, uncompressed_docs_size_in_bytes, delete_opstamp, num_merge_ops: max_merge_ops(splits) + 1, + replaced_splits, }) } @@ -324,6 +360,16 @@ fn max_merge_ops(splits: &[SplitMetadata]) -> usize { .unwrap_or(0) } +struct MergeDirectoriesInput { + union_index_meta: IndexMeta, + split_directories: Vec>, + delete_tasks: Vec, + /// Required when `delete_tasks` is non-empty; unused otherwise. + doc_mapper_opt: Option>, + /// Maps each segment ID to the sorted list of soft-deleted doc IDs to remove. + soft_deleted_docs: HashMap>, +} + impl MergeExecutor { pub fn new( pipeline_id: MergePipelineId, @@ -335,7 +381,24 @@ impl MergeExecutor { MergeExecutor { pipeline_id, metastore, - doc_mapper, + mapper_context: MapperContext::DocMapper(doc_mapper), + io_controls, + merge_packager_mailbox, + } + } + + /// Creates a simpler MergeExecutor that doesn't support deletes. 
+ pub fn new_with_tokenizers_only( + pipeline_id: MergePipelineId, + metastore: MetastoreServiceClient, + tokenizer_manager: quickwit_query::tokenizers::TokenizerManager, + io_controls: IoControls, + merge_packager_mailbox: Mailbox, + ) -> Self { + MergeExecutor { + pipeline_id, + metastore, + mapper_context: MapperContext::TokenizersOnly(tokenizer_manager), io_controls, merge_packager_mailbox, } @@ -349,18 +412,33 @@ impl MergeExecutor { merge_scratch_directory: TempDirectory, ctx: &ActorContext, ) -> anyhow::Result { - let (union_index_meta, split_directories) = open_split_directories( + let (union_index_meta, split_directories, per_split_metas) = open_split_directories( &tantivy_dirs, - self.doc_mapper.tokenizer_manager().tantivy_manager(), + self.mapper_context.tokenizer_manager().tantivy_manager(), )?; + // Build a mapping from each segment ID to the soft-deleted doc IDs of its parent split. + let soft_deleted_docs: HashMap> = per_split_metas + .iter() + .zip(splits.iter()) + .filter(|(_, split)| !split.soft_deleted_doc_ids.is_empty()) + .flat_map(|(meta, split)| { + let doc_ids: Vec = split.soft_deleted_doc_ids.iter().copied().collect(); + meta.segments + .iter() + .map(move |seg_meta| (seg_meta.id(), doc_ids.clone())) + }) + .collect(); // TODO it would be nice if tantivy could let us run the merge in the current thread. fail_point!("before-merge-split"); let controlled_directory = self .merge_split_directories( - union_index_meta, - split_directories, - Vec::new(), - None, + MergeDirectoriesInput { + union_index_meta, + split_directories, + delete_tasks: Vec::new(), + doc_mapper_opt: None, + soft_deleted_docs, + }, merge_scratch_directory.path(), ctx, ) @@ -371,17 +449,18 @@ impl MergeExecutor { // splits. 
let merged_index = open_index( controlled_directory.clone(), - self.doc_mapper.tokenizer_manager().tantivy_manager(), + self.mapper_context.tokenizer_manager().tantivy_manager(), )?; ctx.record_progress(); let split_attrs = merge_split_attrs(self.pipeline_id.clone(), merge_split_id, &splits)?; - Ok(IndexedSplit { + let indexed_split = IndexedSplit { split_attrs, index: merged_index, split_scratch_directory: merge_scratch_directory, controlled_directory_opt: Some(controlled_directory), - }) + }; + Ok(indexed_split) } async fn process_delete_and_merge( @@ -392,6 +471,9 @@ impl MergeExecutor { merge_scratch_directory: TempDirectory, ctx: &ActorContext, ) -> anyhow::Result> { + let MapperContext::DocMapper(doc_mapper) = &self.mapper_context else { + anyhow::bail!("DocMapper is required to process delete and merge operations"); + }; let list_delete_tasks_request = ListDeleteTasksRequest::new(split.index_uid.clone(), split.delete_opstamp); let delete_tasks = ctx @@ -417,16 +499,34 @@ impl MergeExecutor { num_delete_tasks = delete_tasks.len() ); - let (union_index_meta, split_directories) = open_split_directories( + let (union_index_meta, split_directories, per_split_metas) = open_split_directories( &tantivy_dirs, - self.doc_mapper.tokenizer_manager().tantivy_manager(), + doc_mapper.tokenizer_manager().tantivy_manager(), )?; + // Build a mapping from each segment ID to the soft-deleted doc IDs of the input split. 
+ let soft_deleted_docs: HashMap> = + if split.soft_deleted_doc_ids.is_empty() { + HashMap::new() + } else { + let doc_ids: Vec = split.soft_deleted_doc_ids.iter().copied().collect(); + per_split_metas + .iter() + .flat_map(|meta| { + meta.segments + .iter() + .map(|seg_meta| (seg_meta.id(), doc_ids.clone())) + }) + .collect() + }; let controlled_directory = self .merge_split_directories( - union_index_meta, - split_directories, - delete_tasks, - Some(self.doc_mapper.clone()), + MergeDirectoriesInput { + union_index_meta, + split_directories, + delete_tasks, + doc_mapper_opt: Some(doc_mapper.clone()), + soft_deleted_docs, + }, merge_scratch_directory.path(), ctx, ) @@ -435,12 +535,7 @@ impl MergeExecutor { // This will have the side effect of deleting the directory containing the downloaded split. let mut merged_index = Index::open(controlled_directory.clone())?; ctx.record_progress(); - merged_index.set_tokenizers( - self.doc_mapper - .tokenizer_manager() - .tantivy_manager() - .clone(), - ); + merged_index.set_tokenizers(doc_mapper.tokenizer_manager().tantivy_manager().clone()); merged_index.set_fast_field_tokenizers( get_quickwit_fastfield_normalizer_manager() .tantivy_manager() @@ -473,8 +568,7 @@ impl MergeExecutor { let uncompressed_docs_size_in_bytes = (num_docs as f32 * split.uncompressed_docs_size_in_bytes as f32 / split.num_docs as f32) as u64; - let time_range = if let Some(timestamp_field_name) = self.doc_mapper.timestamp_field_name() - { + let time_range = if let Some(timestamp_field_name) = doc_mapper.timestamp_field_name() { let reader = merged_segment_reader .fast_fields() .date(timestamp_field_name)?; @@ -484,19 +578,22 @@ impl MergeExecutor { }; let indexed_split = IndexedSplit { split_attrs: SplitAttrs { - node_id: NodeId::new(split.node_id), + node_id: NodeId::from_str(&split.node_id), index_uid: split.index_uid, source_id: split.source_id, doc_mapping_uid: split.doc_mapping_uid, split_id: merge_split_id, partition_id: split.partition_id, - 
replaced_split_ids: vec![split.split_id.clone()], time_range, secondary_time_range: None, num_docs, uncompressed_docs_size_in_bytes, delete_opstamp: last_delete_opstamp, num_merge_ops: split.num_merge_ops, + replaced_splits: vec![ReplacedSplit { + split_id: split.split_id.clone(), + soft_deleted_doc_ids: split.soft_deleted_doc_ids.clone(), + }], }, index: merged_index, split_scratch_directory: merge_scratch_directory, @@ -507,13 +604,17 @@ impl MergeExecutor { async fn merge_split_directories( &self, - union_index_meta: IndexMeta, - split_directories: Vec>, - delete_tasks: Vec, - doc_mapper_opt: Option>, + input: MergeDirectoriesInput, output_path: &Path, ctx: &ActorContext, ) -> anyhow::Result { + let MergeDirectoriesInput { + union_index_meta, + split_directories, + delete_tasks, + doc_mapper_opt, + soft_deleted_docs, + } = input; let shadowing_meta_json_directory = create_shadowing_meta_json_directory(union_index_meta)?; // This directory is here to receive the merged split, as well as the final meta.json file. @@ -535,7 +636,7 @@ impl MergeExecutor { let union_directory = UnionDirectory::union_of(directory_stack); let union_index = open_index( union_directory, - self.doc_mapper.tokenizer_manager().tantivy_manager(), + self.mapper_context.tokenizer_manager().tantivy_manager(), )?; ctx.record_progress(); @@ -543,6 +644,12 @@ impl MergeExecutor { let mut index_writer: IndexWriter = union_index.writer_with_num_threads(1, 15_000_000)?; let num_delete_tasks = delete_tasks.len(); + let has_soft_deletes = !soft_deleted_docs.is_empty(); + // Hard-delete soft-deleted doc IDs before applying delete-task queries so that both + // sources of deletion are committed together in a single pass. 
+ if has_soft_deletes { + index_writer.delete_query(Box::new(SoftDeletedDocIdsQuery::new(soft_deleted_docs)))?; + } if num_delete_tasks > 0 { let doc_mapper = doc_mapper_opt .ok_or_else(|| anyhow!("doc mapper must be present if there are delete tasks"))?; @@ -564,6 +671,8 @@ impl MergeExecutor { doc_mapper.query(union_index.schema(), parsed_query_ast, false, None)?; index_writer.delete_query(query)?; } + } + if has_soft_deletes || num_delete_tasks > 0 { debug!("commit-delete-operations"); index_writer.commit()?; } @@ -574,13 +683,13 @@ impl MergeExecutor { .map(|segment_meta| segment_meta.id()) .collect(); - // A merge is useless if there is no delete and only one segment. - if num_delete_tasks == 0 && segment_ids.len() <= 1 { + // A merge is useless if there are no deletions and only one segment. + if !has_soft_deletes && num_delete_tasks == 0 && segment_ids.len() <= 1 { return Ok(output_directory); } - // If after deletion there is no longer any document, don't try to merge. - if num_delete_tasks != 0 && segment_ids.is_empty() { + // If after deletion there are no remaining documents, don't try to merge. 
+ if (has_soft_deletes || num_delete_tasks != 0) && segment_ids.is_empty() { return Ok(output_directory); } @@ -713,6 +822,287 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_merge_executor_with_soft_deleted_docs() -> anyhow::Result<()> { + let doc_mapping_yaml = r#" + field_mappings: + - name: body + type: text + - name: ts + type: datetime + input_formats: + - unix_timestamp + fast: true + timestamp_field: ts + "#; + let test_sandbox = + TestSandbox::create("test-index-soft-delete", doc_mapping_yaml, "", &["body"]).await?; + for split_id in 0..4 { + let single_doc = std::iter::once( + serde_json::json!({"body ": format!("split{split_id}"), "ts": 1631072713u64 + split_id }), + ); + test_sandbox.add_documents(single_doc).await?; + } + let metastore = test_sandbox.metastore(); + let index_uid = test_sandbox.index_uid(); + + // Load the initial split metadata to obtain split IDs. + let split_metas: Vec = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits_metadata() + .await + .unwrap(); + assert_eq!(split_metas.len(), 4); + + // Soft-delete doc_id=0 from the first split. + // Each split contains exactly one document, so doc_id=0 is the only document. + let soft_deleted_split_id = split_metas[0].split_id.clone(); + metastore + .soft_delete_documents(quickwit_proto::metastore::SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![quickwit_proto::metastore::SplitDocIds { + split_id: soft_deleted_split_id, + doc_ids: vec![0], + }], + }) + .await?; + + // Reload split metadata so that soft_deleted_doc_ids is populated. 
+ let split_metas: Vec = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits_metadata() + .await + .unwrap(); + assert_eq!( + split_metas + .iter() + .map(|s| s.soft_deleted_doc_ids.len()) + .sum::(), + 1, + "exactly one doc should be soft-deleted across all splits" + ); + + let merge_scratch_directory = TempDirectory::for_test(); + let downloaded_splits_directory = + merge_scratch_directory.named_temp_child("downloaded-splits-")?; + let mut tantivy_dirs: Vec> = Vec::new(); + for split_meta in &split_metas { + let split_filename = split_file(split_meta.split_id()); + let dest_filepath = downloaded_splits_directory.path().join(&split_filename); + test_sandbox + .storage() + .copy_to_file(Path::new(&split_filename), &dest_filepath) + .await?; + tantivy_dirs.push(get_tantivy_directory_from_split_bundle(&dest_filepath).unwrap()) + } + let merge_operation = MergeOperation::new_merge_operation(split_metas); + let merge_task = MergeTask::from_merge_operation_for_test(merge_operation); + let merge_scratch = MergeScratch { + merge_task, + tantivy_dirs, + merge_scratch_directory, + downloaded_splits_directory, + }; + let pipeline_id = MergePipelineId { + node_id: test_sandbox.node_id(), + index_uid: index_uid.clone(), + source_id: test_sandbox.source_id(), + }; + let (merge_packager_mailbox, merge_packager_inbox) = + test_sandbox.universe().create_test_mailbox(); + let merge_executor = MergeExecutor::new( + pipeline_id, + test_sandbox.metastore(), + test_sandbox.doc_mapper(), + IoControls::default(), + merge_packager_mailbox, + ); + let (merge_executor_mailbox, merge_executor_handle) = test_sandbox + .universe() + .spawn_builder() + .spawn(merge_executor); + merge_executor_mailbox.send_message(merge_scratch).await?; + merge_executor_handle.process_pending_and_observe().await; + + let packager_msgs: Vec = merge_packager_inbox.drain_for_test_typed(); + assert_eq!(packager_msgs.len(), 1); + let 
split_attrs_after_merge = &packager_msgs[0].splits[0].split_attrs; + // One document was soft-deleted, so only 3 docs should remain. + assert_eq!(split_attrs_after_merge.num_docs, 3); + assert_eq!(split_attrs_after_merge.uncompressed_docs_size_in_bytes, 102); + assert_eq!(split_attrs_after_merge.num_merge_ops, 1); + + let reader = packager_msgs[0].splits[0] + .index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + assert_eq!(searcher.segment_readers().len(), 1); + // The merged segment must contain exactly 3 live documents. + let num_live_docs: u32 = searcher + .segment_readers() + .iter() + .map(|r| r.num_docs()) + .sum(); + assert_eq!(num_live_docs, 3); + + test_sandbox.assert_quit().await; + Ok(()) + } + + /// Verifies that when a soft-delete lands on an input split while the + /// merge is running, the merge still succeeds. + #[tokio::test] + async fn test_merge_executor_soft_delete_race_condition() -> anyhow::Result<()> { + let doc_mapping_yaml = r#" + field_mappings: + - name: body + type: text + - name: ts + type: datetime + input_formats: + - unix_timestamp + fast: true + timestamp_field: ts + "#; + let test_sandbox = TestSandbox::create( + "test-index-soft-delete-race", + doc_mapping_yaml, + "", + &["body"], + ) + .await?; + for split_id in 0..4 { + let single_doc = std::iter::once( + serde_json::json!({"body": format!("split{split_id}"), "ts": 1631072713u64 + split_id}), + ); + test_sandbox.add_documents(single_doc).await?; + } + let metastore = test_sandbox.metastore(); + let index_uid = test_sandbox.index_uid(); + + // Read split metadata *before* the soft-delete — this is the stale snapshot that the + // merge task will carry, simulating a race where the delete arrives after the merge + // executor already read the metadata. 
+ let stale_split_metas: Vec = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits_metadata() + .await + .unwrap(); + assert_eq!(stale_split_metas.len(), 4); + + // Soft-delete doc_id=0 from the first split *after* the stale metadata was read. + // This simulates a concurrent user action that arrives while the merge is running. + let racing_split_id = stale_split_metas[0].split_id.clone(); + metastore + .soft_delete_documents(quickwit_proto::metastore::SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![quickwit_proto::metastore::SplitDocIds { + split_id: racing_split_id.clone(), + doc_ids: vec![0], + }], + }) + .await?; + + // Build the merge scratch using the stale metadata (no soft-deletes recorded). + let merge_scratch_directory = TempDirectory::for_test(); + let downloaded_splits_directory = + merge_scratch_directory.named_temp_child("downloaded-splits-")?; + let mut tantivy_dirs: Vec> = Vec::new(); + for split_meta in &stale_split_metas { + let split_filename = split_file(split_meta.split_id()); + let dest_filepath = downloaded_splits_directory.path().join(&split_filename); + test_sandbox + .storage() + .copy_to_file(Path::new(&split_filename), &dest_filepath) + .await?; + tantivy_dirs.push(get_tantivy_directory_from_split_bundle(&dest_filepath).unwrap()); + } + let merge_operation = MergeOperation::new_merge_operation(stale_split_metas); + let merge_task = MergeTask::from_merge_operation_for_test(merge_operation); + let merge_scratch = MergeScratch { + merge_task, + tantivy_dirs, + merge_scratch_directory, + downloaded_splits_directory, + }; + let pipeline_id = MergePipelineId { + node_id: test_sandbox.node_id(), + index_uid: index_uid.clone(), + source_id: test_sandbox.source_id(), + }; + let (merge_packager_mailbox, merge_packager_inbox) = + test_sandbox.universe().create_test_mailbox(); + let merge_executor = MergeExecutor::new( + 
pipeline_id, + test_sandbox.metastore(), + test_sandbox.doc_mapper(), + IoControls::default(), + merge_packager_mailbox, + ); + let (merge_executor_mailbox, merge_executor_handle) = test_sandbox + .universe() + .spawn_builder() + .spawn(merge_executor); + merge_executor_mailbox.send_message(merge_scratch).await?; + merge_executor_handle.process_pending_and_observe().await; + + // The merge must succeed despite the race condition. + let packager_msgs: Vec = merge_packager_inbox.drain_for_test_typed(); + assert_eq!( + packager_msgs.len(), + 1, + "merge must produce exactly one split batch" + ); + + let split_attrs = &packager_msgs[0].splits[0].split_attrs; + // The stale metadata had no soft-deletes, so all 4 docs are present in the merged + // segment. The racing soft-delete was missed. + assert_eq!(split_attrs.num_docs, 4); + assert_eq!(split_attrs.num_merge_ops, 1); + + // The snapshot carried in the batch reflects the stale state (no soft-deletes). + let replaced_splits = &packager_msgs[0].splits[0].split_attrs.replaced_splits; + assert_eq!( + replaced_splits.len(), + 4, + "all 4 input splits must appear in the snapshot" + ); + let racing_split_snapshot = replaced_splits + .iter() + .find(|replaced_split| replaced_split.split_id == racing_split_id) + .expect("racing split must be present in the snapshot"); + assert!( + racing_split_snapshot.soft_deleted_doc_ids.is_empty(), + "racing split had no soft-deletes at merge start (stale read)" + ); + + let reader = packager_msgs[0].splits[0] + .index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + assert_eq!(searcher.segment_readers().len(), 1); + let num_live_docs: u32 = searcher + .segment_readers() + .iter() + .map(|r| r.num_docs()) + .sum(); + // All 4 docs are physically present; the racing soft-delete was not applied. 
+ assert_eq!(num_live_docs, 4); + + test_sandbox.assert_quit().await; + Ok(()) + } + #[test] fn test_combine_partition_ids_singleton_unchanged() { assert_eq!(combine_partition_ids_aux([17]), 17); @@ -950,4 +1340,204 @@ mod tests { ) .await } + + #[tokio::test] + async fn test_delete_and_merge_with_soft_deleted_docs() -> anyhow::Result<()> { + quickwit_common::setup_logging_for_tests(); + let doc_mapping_yaml = r#" + field_mappings: + - name: body + type: text + - name: ts + type: datetime + input_formats: + - unix_timestamp + fast: true + timestamp_field: ts + "#; + let test_sandbox = TestSandbox::create( + "test-delete-and-merge-with-soft-delete", + doc_mapping_yaml, + "", + &["body"], + ) + .await?; + + // Three docs are ingested into a single split. + // doc_id=0 body="soft_delete" → removed by soft-delete + // doc_id=1 body="query_delete" → removed by the delete query + // doc_id=2 body="keep" → must survive both conditions + test_sandbox + .add_documents(vec![ + serde_json::json!({"body": "soft_delete", "ts": 1624928200}), + serde_json::json!({"body": "query_delete", "ts": 1624928201}), + serde_json::json!({"body": "keep", "ts": 1624928202}), + ]) + .await?; + + let metastore = test_sandbox.metastore(); + let index_uid = test_sandbox.index_uid(); + + let splits = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + assert_eq!(splits.len(), 1); + let original_split_id = splits[0].split_metadata.split_id.clone(); + + // Soft-delete doc_id=0 (the "soft_delete" document). + metastore + .soft_delete_documents(quickwit_proto::metastore::SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![quickwit_proto::metastore::SplitDocIds { + split_id: original_split_id.clone(), + doc_ids: vec![0], + }], + }) + .await?; + + // Register a delete task targeting the "query_delete" document. 
+ metastore + .create_delete_task(DeleteQuery { + index_uid: Some(index_uid.clone()), + start_timestamp: None, + end_timestamp: None, + query_ast: quickwit_query::query_ast::qast_json_helper( + "body:query_delete", + &["body"], + ), + }) + .await?; + + // Reload split metadata so that soft_deleted_doc_ids is populated. + let splits = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + assert_eq!(splits.len(), 1); + assert_eq!( + splits[0].split_metadata.soft_deleted_doc_ids.len(), + 1, + "doc_id=0 must be recorded as soft-deleted before staging" + ); + + // Stage a replacement split with num_merge_ops=1. By cloning the freshly-read + // metadata the soft_deleted_doc_ids field is carried over into the merge task, + // which is exactly what process_delete_and_merge relies on. + let mut new_split_metadata = splits[0].split_metadata.clone(); + new_split_metadata.split_id = new_split_id(); + new_split_metadata.num_merge_ops = 1; + let stage_splits_request = + StageSplitsRequest::try_from_split_metadata(index_uid.clone(), &new_split_metadata) + .unwrap(); + metastore.stage_splits(stage_splits_request).await.unwrap(); + let publish_splits_request = PublishSplitsRequest { + index_uid: Some(index_uid.clone()), + staged_split_ids: vec![new_split_metadata.split_id.to_string()], + replaced_split_ids: vec![original_split_id.clone()], + index_checkpoint_delta_json_opt: None, + publish_token_opt: None, + }; + metastore + .publish_splits(publish_splits_request) + .await + .unwrap(); + + // Copy the original split bundle to the new split filename so the executor can open it. 
+ let merge_scratch_directory = TempDirectory::for_test(); + let downloaded_splits_directory = + merge_scratch_directory.named_temp_child("downloaded-splits-")?; + let split_filename = split_file(&original_split_id); + let new_split_filename = split_file(new_split_metadata.split_id()); + let dest_filepath = downloaded_splits_directory.path().join(&new_split_filename); + test_sandbox + .storage() + .copy_to_file(Path::new(&split_filename), &dest_filepath) + .await?; + let tantivy_dir = get_tantivy_directory_from_split_bundle(&dest_filepath).unwrap(); + let merge_operation = MergeOperation::new_delete_and_merge_operation(new_split_metadata); + let merge_task = MergeTask::from_merge_operation_for_test(merge_operation); + let merge_scratch = MergeScratch { + merge_task, + tantivy_dirs: vec![tantivy_dir], + merge_scratch_directory, + downloaded_splits_directory, + }; + let pipeline_id = MergePipelineId { + node_id: test_sandbox.node_id(), + index_uid: test_sandbox.index_uid(), + source_id: test_sandbox.source_id(), + }; + let universe = Universe::with_accelerated_time(); + let (merge_packager_mailbox, merge_packager_inbox) = universe.create_test_mailbox(); + let merge_executor = MergeExecutor::new( + pipeline_id, + metastore, + test_sandbox.doc_mapper(), + IoControls::default(), + merge_packager_mailbox, + ); + let (merge_executor_mailbox, merge_executor_handle) = + universe.spawn_builder().spawn(merge_executor); + merge_executor_mailbox.send_message(merge_scratch).await?; + merge_executor_handle.process_pending_and_observe().await; + + let packager_msgs: Vec = merge_packager_inbox.drain_for_test_typed(); + assert_eq!(packager_msgs.len(), 1); + let split = &packager_msgs[0].splits[0]; + + // 3 docs − 1 soft-deleted − 1 query-deleted = 1 surviving document. + assert_eq!(split.split_attrs.num_docs, 1); + assert_eq!(split.split_attrs.delete_opstamp, 1); + // Delete operations must not increment num_merge_ops. 
+ assert_eq!(split.split_attrs.num_merge_ops, 1); + + let reader = split + .index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + assert_eq!(searcher.segment_readers().len(), 1); + + let num_live_docs: u32 = searcher + .segment_readers() + .iter() + .map(|r| r.num_docs()) + .sum(); + assert_eq!( + num_live_docs, 1, + "exactly one document must remain after all deletions" + ); + + // The surviving document must be the "keep" one. + let documents_left: Vec = searcher + .search( + &tantivy::query::AllQuery, + &tantivy::collector::TopDocs::with_limit(10).order_by_score(), + )? + .into_iter() + .map(|(_, doc_address)| { + let doc: TantivyDocument = searcher.doc(doc_address).unwrap(); + let doc_json = doc.to_json(searcher.schema()); + serde_json::from_str(&doc_json).unwrap() + }) + .collect(); + let expected_doc = serde_json::json!({"body": ["keep"], "ts": ["2021-06-29T00:56:42Z"]}); + assert_eq!( + documents_left, + vec![expected_doc], + "only the 'keep' document must survive both soft-delete and query-delete" + ); + + test_sandbox.assert_quit().await; + universe.assert_quit().await; + Ok(()) + } } diff --git a/quickwit/quickwit-indexing/src/actors/merge_pipeline.rs b/quickwit/quickwit-indexing/src/actors/merge_pipeline.rs index f871fbabb5e..5d1001c4776 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_pipeline.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_pipeline.rs @@ -458,10 +458,26 @@ impl MergePipeline { ListSplitsRequest::try_from_list_splits_query(&list_splits_query)?; let immature_splits_stream = ctx .protect_future(self.params.metastore.list_splits(list_splits_request)) - .await?; + .await + .inspect_err(|error| { + error!( + %error, + index_id=%self.params.pipeline_id.index_uid.index_id, + source_id=%self.params.pipeline_id.source_id, + "failed to list immature splits from the metastore" + ); + })?; let immature_splits = ctx 
.protect_future(immature_splits_stream.collect_splits_metadata()) - .await?; + .await + .inspect_err(|error| { + error!( + %error, + index_id=%self.params.pipeline_id.index_uid.index_id, + source_id=%self.params.pipeline_id.source_id, + "failed to collect immature splits metadata from the metastore" + ); + })?; info!( index_uid=%self.params.pipeline_id.index_uid, source_id=%self.params.pipeline_id.source_id, @@ -599,7 +615,7 @@ mod tests { #[tokio::test] async fn test_merge_pipeline_simple() -> anyhow::Result<()> { - let node_id = NodeId::from("test-node"); + let node_id = NodeId::from_str("test-node"); let index_uid = IndexUid::for_test("test-index", 0); let source_id = "test-source".to_string(); let pipeline_id = MergePipelineId { diff --git a/quickwit/quickwit-indexing/src/actors/merge_planner.rs b/quickwit/quickwit-indexing/src/actors/merge_planner.rs index 479cefcaa23..d9d90bf3bfd 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_planner.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_planner.rs @@ -349,7 +349,7 @@ impl MergePlanner { /// We can only merge splits with the same (node_id, index_id, source_id). 
fn belongs_to_pipeline(pipeline_id: &MergePipelineId, split: &SplitMetadata) -> bool { - pipeline_id.node_id == split.node_id + pipeline_id.node_id.as_str() == split.node_id.as_str() && pipeline_id.index_uid == split.index_uid && pipeline_id.source_id == split.source_id } @@ -408,7 +408,7 @@ mod tests { #[tokio::test] async fn test_merge_planner_with_stable_custom_merge_policy() -> anyhow::Result<()> { - let node_id = NodeId::from("test-node"); + let node_id = NodeId::from_str("test-node"); let index_uid = IndexUid::new_with_random_ulid("test-index"); let source_id = "test-source".to_string(); let [doc_mapping_uid1, doc_mapping_uid2] = { @@ -526,7 +526,7 @@ mod tests { #[tokio::test] async fn test_merge_planner_spawns_merge_over_existing_splits_on_startup() -> anyhow::Result<()> { - let node_id = NodeId::from("test-node"); + let node_id = NodeId::from_str("test-node"); let index_uid = IndexUid::new_with_random_ulid("test-index"); let source_id = "test-source".to_string(); let doc_mapping_uid = DocMappingUid::random(); @@ -611,7 +611,7 @@ mod tests { #[tokio::test] async fn test_merge_planner_dismiss_splits_from_different_pipeline_id() -> anyhow::Result<()> { - let node_id = NodeId::from("test-node"); + let node_id = NodeId::from_str("test-node"); let index_uid = IndexUid::new_with_random_ulid("test-index"); let source_id = "test-source".to_string(); let doc_mapping_uid = DocMappingUid::random(); @@ -682,7 +682,7 @@ mod tests { #[tokio::test] async fn test_merge_planner_inherit_mailbox_with_splits_bug_3847() -> anyhow::Result<()> { - let node_id = NodeId::from("test-node"); + let node_id = NodeId::from_str("test-node"); let index_uid = IndexUid::new_with_random_ulid("test-index"); let source_id = "test-source".to_string(); let doc_mapping_uid = DocMappingUid::random(); diff --git a/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs b/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs index bbe5267d514..3818edd8c73 100644 --- 
a/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs @@ -33,6 +33,15 @@ pub struct MergePermit { } impl MergePermit { + /// Creates a `MergePermit` from an owned semaphore permit, without notifying any + /// `MergeSchedulerService` on drop. Use this when managing concurrency externally. + pub fn new(permit: OwnedSemaphorePermit) -> MergePermit { + MergePermit { + _semaphore_permit: Some(permit), + merge_scheduler_mailbox: None, + } + } + #[cfg(any(test, feature = "testsuite"))] pub fn for_test() -> MergePermit { MergePermit { diff --git a/quickwit/quickwit-indexing/src/actors/merge_split_downloader.rs b/quickwit/quickwit-indexing/src/actors/merge_split_downloader.rs index 5d68bb59285..7d124288288 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_split_downloader.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_split_downloader.rs @@ -17,7 +17,7 @@ use std::path::Path; use async_trait::async_trait; use quickwit_actors::{Actor, ActorContext, ActorExitStatus, Handler, Mailbox, QueueCapacity}; use quickwit_common::io::IoControls; -use quickwit_common::temp_dir::{self, TempDirectory}; +use quickwit_common::temp_dir::TempDirectory; use quickwit_metastore::SplitMetadata; use tantivy::Directory; use tracing::{debug, info, instrument}; @@ -62,14 +62,13 @@ impl Handler for MergeSplitDownloader { merge_task: MergeTask, ctx: &ActorContext, ) -> Result<(), quickwit_actors::ActorExitStatus> { - let merge_scratch_directory = temp_dir::Builder::default() - .join("merge") - .tempdir_in(self.scratch_directory.path()) + let merge_scratch_directory = self + .scratch_directory + .named_temp_child("merge") .map_err(|error| anyhow::anyhow!(error))?; info!(dir=%merge_scratch_directory.path().display(), "download-merge-splits"); - let downloaded_splits_directory = temp_dir::Builder::default() - .join("downloaded-splits") - .tempdir_in(merge_scratch_directory.path()) + let 
downloaded_splits_directory = merge_scratch_directory + .named_temp_child("downloaded-splits") .map_err(|error| anyhow::anyhow!(error))?; let tantivy_dirs = self .download_splits( diff --git a/quickwit/quickwit-indexing/src/actors/packager.rs b/quickwit/quickwit-indexing/src/actors/packager.rs index 18e0bb40d73..9f2575720a4 100644 --- a/quickwit/quickwit-indexing/src/actors/packager.rs +++ b/quickwit/quickwit-indexing/src/actors/packager.rs @@ -509,7 +509,7 @@ mod tests { } let index = index_writer.finalize()?; - let node_id = NodeId::from("test-node"); + let node_id = NodeId::from_str("test-node"); let index_uid = IndexUid::new_with_random_ulid("test-index"); let source_id = "test-source".to_string(); @@ -527,9 +527,9 @@ mod tests { uncompressed_docs_size_in_bytes: num_docs * 15, time_range: timerange_opt, secondary_time_range: None, - replaced_split_ids: Vec::new(), delete_opstamp: 0, num_merge_ops: 0, + replaced_splits: Vec::new(), }, index, split_scratch_directory, diff --git a/quickwit/quickwit-indexing/src/actors/publisher.rs b/quickwit/quickwit-indexing/src/actors/publisher.rs index b05081be706..92a005e9f99 100644 --- a/quickwit/quickwit-indexing/src/actors/publisher.rs +++ b/quickwit/quickwit-indexing/src/actors/publisher.rs @@ -12,22 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::collections::{BTreeSet, HashMap}; + use anyhow::Context; use async_trait::async_trait; use fail::fail_point; use quickwit_actors::{Actor, ActorContext, Handler, Mailbox, QueueCapacity}; -use quickwit_proto::metastore::{MetastoreService, MetastoreServiceClient, PublishSplitsRequest}; +use quickwit_common::Progress; +use quickwit_metastore::{ListSplitsQuery, ListSplitsRequestExt, MetastoreServiceStreamSplitsExt}; +use quickwit_proto::metastore::{ + ListSplitsRequest, MetastoreService, MetastoreServiceClient, PublishSplitsRequest, +}; +use quickwit_proto::types::{IndexUid, SplitId}; use serde::Serialize; -use tracing::{info, instrument, warn}; +use tracing::{error, info, instrument, warn}; use crate::actors::MergePlanner; -use crate::models::{NewSplits, SplitsUpdate}; +use crate::models::{NewSplits, ReplacedSplit, SplitsUpdate}; use crate::source::{SourceActor, SuggestTruncate}; #[derive(Clone, Debug, Default, Serialize)] pub struct PublisherCounters { pub num_published_splits: u64, pub num_replace_operations: u64, + pub num_replaced_splits: u64, pub num_empty_splits: u64, } @@ -127,10 +135,10 @@ impl Handler for Publisher { let SplitsUpdate { index_uid, new_splits, - replaced_split_ids, checkpoint_delta_opt, publish_lock, publish_token_opt, + replaced_splits, .. 
} = split_update; @@ -143,16 +151,33 @@ impl Handler for Publisher { .iter() .map(|split| split.split_id.clone()) .collect(); + let replaced_split_ids = replaced_splits + .iter() + .map(|replaced| replaced.split_id.clone()) + .collect(); if let Some(_guard) = publish_lock.acquire().await { + if !replaced_splits.is_empty() { + warn_if_soft_deletes_changed_during_merge( + &index_uid, + &replaced_splits, + &self.metastore, + ctx.progress(), + ) + .await; + } + let index_id = index_uid.index_id.clone(); let publish_splits_request = PublishSplitsRequest { index_uid: Some(index_uid), staged_split_ids: split_ids.clone(), - replaced_split_ids: replaced_split_ids.clone(), + replaced_split_ids, index_checkpoint_delta_json_opt, publish_token_opt: publish_token_opt.clone(), }; ctx.protect_future(self.metastore.publish_splits(publish_splits_request)) .await + .inspect_err(|error| { + error!(%error, index_id=%index_id, staged_split_ids=?split_ids, "failed to publish splits to the metastore"); + }) .context("failed to publish splits")?; } else { // TODO: Remove the junk right away? @@ -194,10 +219,11 @@ impl Handler for Publisher { .await; } - if replaced_split_ids.is_empty() { + if replaced_splits.is_empty() { self.counters.num_published_splits += 1; } else { self.counters.num_replace_operations += 1; + self.counters.num_replaced_splits += replaced_splits.len() as u64; } } else { self.counters.num_empty_splits += 1; @@ -207,6 +233,73 @@ impl Handler for Publisher { } } +/// Re-reads the soft-deleted doc IDs for all input splits from the metastore and logs an +/// error for each split whose soft-delete set grew while the merge was running. 
+async fn warn_if_soft_deletes_changed_during_merge( + index_uid: &IndexUid, + replaced_splits: &[ReplacedSplit], + metastore: &MetastoreServiceClient, + progress: &Progress, +) { + let query = ListSplitsQuery::for_index(index_uid.clone()).with_split_ids( + replaced_splits + .iter() + .map(|replaced| replaced.split_id.clone()) + .collect(), + ); + + let list_splits_request = match ListSplitsRequest::try_from_list_splits_query(&query) { + Ok(request) => request, + Err(err) => { + warn!(error = ?err, "failed to build list_splits request for soft-delete race detection"); + return; + } + }; + let splits_stream = match progress + .protect_future(metastore.list_splits(list_splits_request)) + .await + { + Ok(stream) => stream, + Err(err) => { + warn!(error = ?err, "failed to list splits for soft-delete race detection"); + return; + } + }; + let fresh_splits = match progress + .protect_future(splits_stream.collect_splits_metadata()) + .await + { + Ok(splits) => splits, + Err(err) => { + warn!(error = ?err, "failed to collect split metadata for soft-delete race detection"); + return; + } + }; + let snapshot: HashMap<&SplitId, &BTreeSet> = replaced_splits + .iter() + .map(|n| (&n.split_id, &n.soft_deleted_doc_ids)) + .collect(); + for fresh_split in &fresh_splits { + let Some(snapshot_ids) = snapshot.get(&fresh_split.split_id) else { + continue; + }; + let missed: BTreeSet = fresh_split + .soft_deleted_doc_ids + .difference(snapshot_ids) + .copied() + .collect(); + if !missed.is_empty() { + // TODO: this means that the merge didn't include some committed + // soft deletes. Those are lost. 
+ error!( + split_id = %fresh_split.split_id, + num_missed_soft_deletes = missed.len(), + "soft-delete race condition detected", + ); + } + } +} + #[cfg(test)] mod tests { use quickwit_actors::Universe; @@ -262,7 +355,6 @@ mod tests { split_id: "split".to_string(), ..Default::default() }], - replaced_split_ids: Vec::new(), checkpoint_delta_opt: Some(IndexCheckpointDelta { source_id: "source".to_string(), source_delta: SourceCheckpointDelta::from_range(1..3), @@ -271,6 +363,7 @@ mod tests { publish_token_opt: None, merge_task: None, parent_span: tracing::Span::none(), + replaced_splits: Vec::new(), }) .await .is_ok() @@ -278,6 +371,7 @@ mod tests { let publisher_observation = publisher_handle.process_pending_and_observe().await.state; assert_eq!(publisher_observation.num_published_splits, 1); + assert_eq!(publisher_observation.num_replaced_splits, 0); let suggest_truncate_checkpoints: Vec = source_inbox .drain_for_test_typed::() @@ -337,7 +431,6 @@ mod tests { .send_message(SplitsUpdate { index_uid: ref_index_uid.clone(), new_splits: Vec::new(), - replaced_split_ids: Vec::new(), checkpoint_delta_opt: Some(IndexCheckpointDelta { source_id: "source".to_string(), source_delta: SourceCheckpointDelta::from_range(1..3), @@ -346,6 +439,7 @@ mod tests { publish_token_opt: None, merge_task: None, parent_span: tracing::Span::none(), + replaced_splits: Vec::new(), }) .await .is_ok() @@ -354,6 +448,7 @@ mod tests { let publisher_observation = publisher_handle.process_pending_and_observe().await.state; assert_eq!(publisher_observation.num_published_splits, 0); assert_eq!(publisher_observation.num_replace_operations, 0); + assert_eq!(publisher_observation.num_replaced_splits, 0); assert_eq!(publisher_observation.num_empty_splits, 1); let suggest_truncate_checkpoints: Vec = source_inbox @@ -381,12 +476,21 @@ mod tests { let mut mock_metastore = MockMetastoreService::new(); let ref_index_uid: IndexUid = IndexUid::for_test("index", 1); let ref_index_uid_clone = 
ref_index_uid.clone(); + mock_metastore.expect_list_splits().times(1).returning(|_| { + use quickwit_common::ServiceStream; + use quickwit_metastore::ListSplitsResponseExt; + use quickwit_proto::metastore::ListSplitsResponse; + let response = ListSplitsResponse::try_from_splits(vec![]).unwrap(); + Ok(ServiceStream::from(vec![Ok(response)])) + }); mock_metastore .expect_publish_splits() .withf(move |publish_splits_requests| { + let mut replaced_split_ids = publish_splits_requests.replaced_split_ids.clone(); + replaced_split_ids.sort(); publish_splits_requests.index_uid() == &ref_index_uid_clone && publish_splits_requests.staged_split_ids[..] == ["split3"] - && publish_splits_requests.replaced_split_ids[..] == ["split1", "split2"] + && replaced_split_ids[..] == ["split1", "split2"] && publish_splits_requests .index_checkpoint_delta_json_opt() .is_empty() @@ -407,12 +511,21 @@ mod tests { split_id: "split3".to_string(), ..Default::default() }], - replaced_split_ids: vec!["split1".to_string(), "split2".to_string()], checkpoint_delta_opt: None, publish_lock: PublishLock::default(), publish_token_opt: None, merge_task: None, parent_span: Span::none(), + replaced_splits: vec![ + ReplacedSplit { + split_id: "split1".to_string(), + ..Default::default() + }, + ReplacedSplit { + split_id: "split2".to_string(), + ..Default::default() + }, + ], }; assert!( publisher_mailbox @@ -423,6 +536,7 @@ mod tests { let publisher_observation = publisher_handle.process_pending_and_observe().await.state; assert_eq!(publisher_observation.num_published_splits, 0); assert_eq!(publisher_observation.num_replace_operations, 1); + assert_eq!(publisher_observation.num_replaced_splits, 2); let merge_planner_msgs = merge_planner_inbox.drain_for_test_typed::(); assert_eq!(merge_planner_msgs.len(), 1); assert_eq!(merge_planner_msgs[0].new_splits.len(), 1); @@ -451,21 +565,99 @@ mod tests { .send_message(SplitsUpdate { index_uid: IndexUid::new_with_random_ulid("index"), new_splits: 
vec![SplitMetadata::for_test("test-split".to_string())], - replaced_split_ids: Vec::new(), checkpoint_delta_opt: None, publish_lock, publish_token_opt: None, merge_task: None, parent_span: Span::none(), + replaced_splits: Vec::new(), }) .await .unwrap(); let publisher_observation = publisher_handle.process_pending_and_observe().await.state; assert_eq!(publisher_observation.num_published_splits, 0); + assert_eq!(publisher_observation.num_replaced_splits, 0); let merger_messages = merge_planner_inbox.drain_for_test(); assert!(merger_messages.is_empty()); universe.assert_quit().await; } + + #[tokio::test] + async fn test_publisher_warns_on_soft_delete_race_condition() { + use std::collections::BTreeSet; + + use quickwit_common::ServiceStream; + use quickwit_metastore::{ListSplitsResponseExt, Split, SplitState}; + use quickwit_proto::metastore::ListSplitsResponse; + + let universe = Universe::with_accelerated_time(); + let ref_index_uid: IndexUid = IndexUid::for_test("index", 1); + let racing_split_id = "racing-split".to_string(); + + let mut mock_metastore = MockMetastoreService::new(); + + // list_splits returns the racing split with a new soft-delete absent from the snapshot. 
+ let racing_split_id_clone = racing_split_id.clone(); + mock_metastore + .expect_list_splits() + .times(1) + .returning(move |_| { + let split = Split { + split_metadata: SplitMetadata { + split_id: racing_split_id_clone.clone(), + soft_deleted_doc_ids: BTreeSet::from([0u32]), + ..Default::default() + }, + split_state: SplitState::Published, + update_timestamp: 0, + publish_timestamp: None, + }; + let response = ListSplitsResponse::try_from_splits(vec![split]).unwrap(); + Ok(ServiceStream::from(vec![Ok(response)])) + }); + + mock_metastore + .expect_publish_splits() + .times(1) + .returning(|_| Ok(EmptyResponse {})); + + let publisher = Publisher::new( + PublisherType::MergePublisher, + MetastoreServiceClient::from_mock(mock_metastore), + None, + None, + ); + let (publisher_mailbox, publisher_handle) = universe.spawn_builder().spawn(publisher); + + // Snapshot shows the racing split had no soft-deletes at merge start (stale read). + let replaced_splits = vec![ReplacedSplit { + split_id: racing_split_id.clone(), + ..Default::default() + }]; + + publisher_mailbox + .send_message(SplitsUpdate { + index_uid: ref_index_uid.clone(), + new_splits: vec![SplitMetadata { + split_id: "merged-split".to_string(), + ..Default::default() + }], + checkpoint_delta_opt: None, + publish_lock: PublishLock::default(), + publish_token_opt: None, + merge_task: None, + parent_span: Span::none(), + replaced_splits, + }) + .await + .unwrap(); + + // Publish must still succeed despite the race condition (warning is non-fatal). 
+ let observation = publisher_handle.process_pending_and_observe().await.state; + assert_eq!(observation.num_replace_operations, 1); + assert_eq!(observation.num_replaced_splits, 1); + universe.assert_quit().await; + } } diff --git a/quickwit/quickwit-indexing/src/actors/uploader.rs b/quickwit/quickwit-indexing/src/actors/uploader.rs index 2a012858587..94db9379ab2 100644 --- a/quickwit/quickwit-indexing/src/actors/uploader.rs +++ b/quickwit/quickwit-indexing/src/actors/uploader.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::HashSet; -use std::iter::FromIterator; use std::mem; use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; @@ -27,11 +25,9 @@ use quickwit_actors::{Actor, ActorContext, ActorExitStatus, Handler, Mailbox, Qu use quickwit_common::pubsub::EventBroker; use quickwit_common::spawn_named_task; use quickwit_config::RetentionPolicy; -use quickwit_metastore::checkpoint::IndexCheckpointDelta; use quickwit_metastore::{SplitMetadata, StageSplitsRequestExt}; use quickwit_proto::metastore::{MetastoreService, MetastoreServiceClient, StageSplitsRequest}; use quickwit_proto::search::{ReportSplit, ReportSplitsRequest}; -use quickwit_proto::types::{IndexUid, PublishToken}; use quickwit_storage::SplitPayloadBuilder; use serde::Serialize; use tokio::sync::oneshot::Sender; @@ -40,10 +36,10 @@ use tracing::{Instrument, Span, debug, info, instrument, warn}; use crate::actors::Publisher; use crate::actors::sequencer::{Sequencer, SequencerCommand}; -use crate::merge_policy::{MergePolicy, MergeTask}; +use crate::merge_policy::MergePolicy; use crate::metrics::INDEXER_METRICS; use crate::models::{ - EmptySplit, PackagedSplit, PackagedSplitBatch, PublishLock, SplitsUpdate, create_split_metadata, + EmptySplit, PackagedSplit, PackagedSplitBatch, SplitsUpdate, create_split_metadata, }; use crate::split_store::IndexingSplitStore; @@ -370,6 +366,7 @@ impl Handler for 
Uploader { event_broker.publish(ReportSplitsRequest { report_splits }); + let mut replaced_splits = Vec::new(); for (packaged_split, metadata) in batch.splits.into_iter().zip(split_metadata_list) { let upload_result = upload_split( &packaged_split, @@ -385,18 +382,24 @@ impl Handler for Uploader { return; } + replaced_splits.extend(packaged_split.split_attrs.replaced_splits.iter().cloned()); packaged_splits_and_metadata.push((packaged_split, metadata)); } - let splits_update = make_publish_operation( + assert!(!packaged_splits_and_metadata.is_empty()); + let splits_update = SplitsUpdate { index_uid, - packaged_splits_and_metadata, - batch.checkpoint_delta_opt, - batch.publish_lock, - batch.publish_token_opt, - batch.merge_task_opt, - batch.batch_parent_span, - ); + new_splits: packaged_splits_and_metadata + .into_iter() + .map(|split_and_meta| split_and_meta.1) + .collect_vec(), + checkpoint_delta_opt: batch.checkpoint_delta_opt, + publish_lock: batch.publish_lock, + publish_token_opt: batch.publish_token_opt, + merge_task: batch.merge_task_opt, + parent_span: batch.batch_parent_span, + replaced_splits, + }; let target = match &split_update_sender { SplitsUpdateSender::Sequencer(_) => "sequencer", @@ -439,12 +442,12 @@ impl Handler for Uploader { let splits_update = SplitsUpdate { index_uid: empty_split.index_uid, new_splits: Vec::new(), - replaced_split_ids: Vec::new(), checkpoint_delta_opt: Some(empty_split.checkpoint_delta), publish_lock: empty_split.publish_lock, publish_token_opt: empty_split.publish_token_opt, merge_task: None, parent_span: empty_split.batch_parent_span, + replaced_splits: Vec::new(), }; split_update_sender.send(splits_update, ctx).await?; @@ -452,35 +455,6 @@ impl Handler for Uploader { } } -fn make_publish_operation( - index_uid: IndexUid, - packaged_splits_and_metadatas: Vec<(PackagedSplit, SplitMetadata)>, - checkpoint_delta_opt: Option, - publish_lock: PublishLock, - publish_token_opt: Option, - merge_task: Option, - parent_span: Span, 
-) -> SplitsUpdate { - assert!(!packaged_splits_and_metadatas.is_empty()); - let replaced_split_ids = packaged_splits_and_metadatas - .iter() - .flat_map(|(split, _)| split.split_attrs.replaced_split_ids.clone()) - .collect::>(); - SplitsUpdate { - index_uid, - new_splits: packaged_splits_and_metadatas - .into_iter() - .map(|split_and_meta| split_and_meta.1) - .collect_vec(), - replaced_split_ids: Vec::from_iter(replaced_split_ids), - checkpoint_delta_opt, - publish_lock, - publish_token_opt, - merge_task, - parent_span, - } -} - #[instrument( level = "info" name = "upload", @@ -512,6 +486,7 @@ async fn upload_split( #[cfg(test)] mod tests { + use std::collections::BTreeSet; use std::path::PathBuf; use std::time::Duration; @@ -520,20 +495,20 @@ mod tests { use quickwit_common::temp_dir::TempDirectory; use quickwit_metastore::checkpoint::{IndexCheckpointDelta, SourceCheckpointDelta}; use quickwit_proto::metastore::{EmptyResponse, MockMetastoreService}; - use quickwit_proto::types::{DocMappingUid, NodeId}; + use quickwit_proto::types::{DocMappingUid, IndexUid, NodeId}; use quickwit_storage::RamStorage; use tantivy::DateTime; use tokio::sync::oneshot; use super::*; use crate::merge_policy::{NopMergePolicy, default_merge_policy}; - use crate::models::{SplitAttrs, SplitsUpdate}; + use crate::models::{PublishLock, ReplacedSplit, SplitAttrs, SplitsUpdate}; #[tokio::test] async fn test_uploader_with_sequencer() -> anyhow::Result<()> { quickwit_common::setup_logging_for_tests(); - let node_id = NodeId::from("test-node"); + let node_id = NodeId::from_str("test-node"); let index_uid = IndexUid::new_with_random_ulid("test-index"); let source_id = "test-source".to_string(); @@ -590,10 +565,10 @@ mod tests { secondary_time_range: None, uncompressed_docs_size_in_bytes: 1_000, num_docs: 10, - replaced_split_ids: Vec::new(), split_id: "test-split".to_string(), delete_opstamp: 10, num_merge_ops: 0, + replaced_splits: Vec::new(), }, serialized_split_fields: Vec::new(), 
split_scratch_directory, @@ -627,7 +602,6 @@ mod tests { index_uid, new_splits, checkpoint_delta_opt, - replaced_split_ids, .. } = publisher_message; @@ -640,7 +614,6 @@ mod tests { checkpoint_delta.source_delta, SourceCheckpointDelta::from_range(3..15) ); - assert!(replaced_split_ids.is_empty()); let mut files = ram_storage.list_files().await; files.sort(); assert_eq!(&files, &[PathBuf::from("test-split.split")]); @@ -650,7 +623,7 @@ mod tests { #[tokio::test] async fn test_uploader_with_sequencer_emits_replace() -> anyhow::Result<()> { - let node_id = NodeId::from("test-node"); + let node_id = NodeId::from_str("test-node"); let index_uid = IndexUid::new_with_random_ulid("test-index"); let source_id = "test-source".to_string(); @@ -703,12 +676,12 @@ mod tests { ..=DateTime::from_timestamp_secs(1_628_203_640), ), secondary_time_range: None, - replaced_split_ids: vec![ - "replaced-split-1".to_string(), - "replaced-split-2".to_string(), - ], delete_opstamp: 0, num_merge_ops: 0, + replaced_splits: Vec::from([ReplacedSplit { + split_id: "replaced-split-1".to_string(), + soft_deleted_doc_ids: BTreeSet::new(), + }]), }, serialized_split_fields: Vec::new(), split_scratch_directory: split_scratch_directory_1, @@ -731,12 +704,12 @@ mod tests { ..=DateTime::from_timestamp_secs(1_628_203_640), ), secondary_time_range: None, - replaced_split_ids: vec![ - "replaced-split-1".to_string(), - "replaced-split-2".to_string(), - ], delete_opstamp: 0, num_merge_ops: 0, + replaced_splits: Vec::from([ReplacedSplit { + split_id: "replaced-split-2".to_string(), + soft_deleted_doc_ids: BTreeSet::new(), + }]), }, serialized_split_fields: Vec::new(), split_scratch_directory: split_scratch_directory_2, @@ -772,21 +745,26 @@ mod tests { let SplitsUpdate { index_uid, new_splits, - mut replaced_split_ids, checkpoint_delta_opt, + replaced_splits, .. } = publisher_message; assert_eq!(index_uid.index_id, "test-index"); // Sort first to avoid test failing. 
- replaced_split_ids.sort(); assert_eq!(new_splits.len(), 2); assert_eq!(new_splits[0].split_id(), "test-split-1"); assert_eq!(new_splits[1].split_id(), "test-split-2"); assert_eq!( - &replaced_split_ids, - &[ - "replaced-split-1".to_string(), - "replaced-split-2".to_string() + &replaced_splits, + &vec![ + ReplacedSplit { + split_id: "replaced-split-1".to_string(), + soft_deleted_doc_ids: BTreeSet::new(), + }, + ReplacedSplit { + split_id: "replaced-split-2".to_string(), + soft_deleted_doc_ids: BTreeSet::new(), + }, ] ); assert!(checkpoint_delta_opt.is_none()); @@ -806,7 +784,7 @@ mod tests { #[tokio::test] async fn test_uploader_without_sequencer() -> anyhow::Result<()> { - let node_id = NodeId::from("test-node"); + let node_id = NodeId::from_str("test-node"); let index_uid = IndexUid::for_test("test-index", 0); let index_uid_clone = index_uid.clone(); let source_id = "test-source".to_string(); @@ -855,9 +833,9 @@ mod tests { secondary_time_range: None, uncompressed_docs_size_in_bytes: 1_000, num_docs: 10, - replaced_split_ids: Vec::new(), delete_opstamp: 10, num_merge_ops: 0, + replaced_splits: Vec::new(), }, serialized_split_fields: Vec::new(), split_scratch_directory, @@ -879,13 +857,13 @@ mod tests { let SplitsUpdate { index_uid, new_splits, - replaced_split_ids, + replaced_splits, .. } = publisher_inbox.recv_typed_message().await.unwrap(); assert_eq!(index_uid.index_id, "test-index"); assert_eq!(new_splits.len(), 1); - assert!(replaced_split_ids.is_empty()); + assert!(replaced_splits.is_empty()); universe.assert_quit().await; Ok(()) } @@ -943,7 +921,7 @@ mod tests { index_uid, new_splits, checkpoint_delta_opt, - replaced_split_ids, + replaced_splits, .. 
} = publisher_message; @@ -955,7 +933,7 @@ mod tests { checkpoint_delta.source_delta, SourceCheckpointDelta::from_range(3..15) ); - assert!(replaced_split_ids.is_empty()); + assert!(replaced_splits.is_empty()); let files = ram_storage.list_files().await; assert!(files.is_empty()); universe.assert_quit().await; @@ -990,7 +968,7 @@ mod tests { // we need to keep the handle alive. let _subscribe_handle = event_broker.subscribe(report_splits_listener); - let node_id = NodeId::from("test-node"); + let node_id = NodeId::from_str("test-node"); let index_uid = IndexUid::new_with_random_ulid("test-index"); let source_id = "test-source".to_string(); @@ -1037,10 +1015,10 @@ mod tests { secondary_time_range: None, uncompressed_docs_size_in_bytes: 1_000, num_docs: 10, - replaced_split_ids: Vec::new(), split_id: SPLIT_ULID_STR.to_string(), delete_opstamp: 10, num_merge_ops: 0, + replaced_splits: Vec::new(), }, serialized_split_fields: Vec::new(), split_scratch_directory, diff --git a/quickwit/quickwit-indexing/src/controlled_directory.rs b/quickwit/quickwit-indexing/src/controlled_directory.rs index b209b4888d6..86e4a5fce0e 100644 --- a/quickwit/quickwit-indexing/src/controlled_directory.rs +++ b/quickwit/quickwit-indexing/src/controlled_directory.rs @@ -93,7 +93,7 @@ impl Directory for ControlledDirectory { self.check_if_alive() .map_err(|io_err| OpenWriteError::wrap_io_error(io_err, path.to_path_buf()))?; - let underlying_wrt: Box = self + let underlying_wrt: Box = self .underlying .open_write(path)? .into_inner() @@ -154,7 +154,9 @@ impl IoControlsAccess for HotswappableIoControls { } // Wrapper to work around the orphan rule. (hence the word "Adopted"). 
-struct AdoptedControlledWrite(ControlledWrite>); +struct AdoptedControlledWrite( + ControlledWrite>, +); impl io::Write for AdoptedControlledWrite { fn write(&mut self, buf: &[u8]) -> io::Result { diff --git a/quickwit/quickwit-indexing/src/lib.rs b/quickwit/quickwit-indexing/src/lib.rs index 2c2b28a09d7..d8e2853025d 100644 --- a/quickwit/quickwit-indexing/src/lib.rs +++ b/quickwit/quickwit-indexing/src/lib.rs @@ -35,9 +35,12 @@ pub use crate::split_store::{IndexingSplitStore, get_tantivy_directory_from_spli pub mod actors; mod controlled_directory; +pub mod mature_merge; +pub mod mature_merge_plan; pub mod merge_policy; mod metrics; pub mod models; +mod soft_delete_query; pub mod source; mod split_store; #[cfg(any(test, feature = "testsuite"))] @@ -69,6 +72,7 @@ pub async fn start_indexing_service( ingester_pool: IngesterPool, storage_resolver: StorageResolver, event_broker: EventBroker, + is_delete_task_service_disabled: bool, ) -> anyhow::Result> { info!("starting indexer service"); let ingest_api_service_mailbox = universe.get_one::(); @@ -88,6 +92,7 @@ pub async fn start_indexing_service( ingester_pool, storage_resolver, event_broker, + is_delete_task_service_disabled, ) .await?; let (indexing_service, _) = universe.spawn_builder().spawn(indexing_service); diff --git a/quickwit/quickwit-indexing/src/mature_merge.rs b/quickwit/quickwit-indexing/src/mature_merge.rs new file mode 100644 index 00000000000..00f9ed0928a --- /dev/null +++ b/quickwit/quickwit-indexing/src/mature_merge.rs @@ -0,0 +1,871 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::Arc;
+
+use anyhow::{Context, bail};
+use bytesize::ByteSize;
+use futures::StreamExt;
+use quickwit_actors::{ActorExitStatus, Universe};
+use quickwit_common::io::IoControls;
+use quickwit_common::{KillSwitch, temp_dir};
+use quickwit_metastore::{
+    IndexMetadata, ListIndexesMetadataResponseExt, ListSplitsQuery, ListSplitsRequestExt,
+    MetastoreServiceStreamSplitsExt, SplitState,
+};
+use quickwit_proto::indexing::MergePipelineId;
+use quickwit_proto::metastore::{
+    ListIndexesMetadataRequest, ListSplitsRequest, MetastoreService, MetastoreServiceClient,
+};
+use quickwit_proto::types::NodeId;
+use quickwit_storage::StorageResolver;
+use tantivy::Inventory;
+use time::OffsetDateTime;
+use tokio::sync::Semaphore;
+use tracing::{error, info, warn};
+
+use crate::actors::{
+    MergeExecutor, MergePermit, MergeSplitDownloader, Packager, Publisher, PublisherType, Uploader,
+    UploaderType,
+};
+use crate::mature_merge_plan::{MATURITY_BUFFER, plan_merge_operations_for_index};
+use crate::merge_policy::{MergeOperation, MergeTask, NopMergePolicy};
+use crate::split_store::{IndexingSplitCache, IndexingSplitStore};
+
+/// Settings driving the planning and the execution of mature merges.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct MatureMergeConfig {
+    /// Splits within this many days of the retention cutoff are left untouched.
+    pub retention_safety_buffer_days: u64,
+    /// Minimum number of splits in a group before a merge operation is emitted.
+    pub min_merge_group_size: usize,
+    /// Maximum number of docs in a split for it to be eligible for mature merging.
+    pub input_split_max_num_docs: usize,
+    /// Maximum number of splits per merge operation.
+    pub max_merge_group_size: usize,
+    /// Target upper bound on the total number of documents per merge operation.
+    pub split_target_num_docs: usize,
+    /// Only splits whose time range ends exactly this many UTC days after it
+    /// starts are considered (0 = start and end fall on the same day).
+    pub split_timestamp_days_range: u8,
+    /// Number of indexes processed concurrently. Lower to avoid fetching splits
+    /// metadata too eagerly.
+    pub index_parallelism: usize,
+    /// Maximum number of merges running concurrently across all indexes.
+    pub max_concurrent_merges: usize,
+    /// Print planned operations without executing them.
+    pub dry_run: bool,
+    /// List of index patterns to include in the mature merge process.
+    pub index_id_patterns: Vec<String>,
+}
+
+impl Default for MatureMergeConfig {
+    fn default() -> Self {
+        Self {
+            retention_safety_buffer_days: 5,
+            min_merge_group_size: 5,
+            input_split_max_num_docs: 10_000,
+            max_merge_group_size: 100,
+            split_target_num_docs: 5_000_000,
+            split_timestamp_days_range: 0, // by default single day splits
+            index_parallelism: 50,
+            max_concurrent_merges: 10,
+            dry_run: false,
+            index_id_patterns: vec!["*".to_string()],
+        }
+    }
+}
+
+/// Statistics for the merges performed on a single index.
+#[derive(Debug, Default)]
+struct IndexMergeOutcome {
+    /// Number of replace operations reported by the merge publisher.
+    num_published_merges: u64,
+    /// Number of input splits replaced, as reported by the merge publisher.
+    num_replaced_splits: u64,
+}
+
+/// Planning figures for a single index, together with the execution outcome
+/// (left at its default value when nothing was executed).
+struct IndexMergeSummary {
+    /// Number of merge operations planned for the index.
+    num_merges_planned: usize,
+    /// Total number of splits across all planned operations.
+    num_input_splits: usize,
+    /// Sum of `uncompressed_docs_size_in_bytes` over all input splits.
+    total_input_bytes: u64,
+    /// What the merge pipeline actually published.
+    outcome: IndexMergeOutcome,
+}
+
+/// Fetches all published splits for the given index from the metastore (no
+/// node-id filter) and calls [`plan_merge_operations_for_index`].
+async fn fetch_splits_and_plan(
+    index_metadata: &IndexMetadata,
+    metastore: &MetastoreServiceClient,
+    now: OffsetDateTime,
+    config: &MatureMergeConfig,
+) -> anyhow::Result<Vec<MergeOperation>> {
+    // Only published splits that were already mature `MATURITY_BUFFER` ago are requested.
+    let index_uid = index_metadata.index_uid.clone();
+    let list_splits_query = ListSplitsQuery::for_index(index_uid)
+        .with_split_state(SplitState::Published)
+        .retain_mature(now - MATURITY_BUFFER);
+    let list_splits_request = ListSplitsRequest::try_from_list_splits_query(&list_splits_query)?;
+    let splits_stream = metastore
+        .list_splits(list_splits_request)
+        .await
+        .inspect_err(|error| {
+            error!(%error, index_id=%index_metadata.index_uid.index_id, "failed to list splits from the metastore");
+        })?;
+    let splits = splits_stream
+        .collect_splits_metadata()
+        .await
+        .inspect_err(|error| {
+            error!(%error, index_id=%index_metadata.index_uid.index_id, "failed to collect splits metadata from the metastore");
+        })?;
+
+    if splits.iter().any(|s| !s.tags.is_empty()) {
+        // with tags and doc mapping evolutions, we might have weird edge cases
+        // -> just refuse them for now
+        bail!("tags not supported in mature merges")
+    }
+
+    // `splits` is moved into the planner, so its length is captured first for the log line.
+    let total_splits = splits.len();
+    let operations =
+        plan_merge_operations_for_index(&index_metadata.index_config, splits, now, config);
+
+    info!(
+        index_id = %index_metadata.index_config.index_id,
+        total_splits,
+        num_planned_merges = operations.len(),
+        "fetched splits for mature merge planning"
+    );
+    Ok(operations)
+}
+
+/// Executes the given merge operations for a single index using the standard
+/// actor pipeline: `MergeSplitDownloader -> MergeExecutor -> Packager ->
+/// Uploader -> Publisher`.
+///
+/// Tags are not supported and we use the default tokenizer manager. In practice
+/// we could use the tags and custom tokenizers from the current doc mapping,
+/// but schema evolutions could lead to unanticipated edge cases.
+#[allow(clippy::too_many_arguments)]
+async fn run_mature_merges_for_index(
+    index_metadata: &IndexMetadata,
+    operations: Vec<MergeOperation>,
+    metastore: MetastoreServiceClient,
+    split_store: IndexingSplitStore,
+    semaphore: Arc<Semaphore>,
+    data_dir_path: &std::path::Path,
+    config: &MatureMergeConfig,
+    node_id: NodeId,
+) -> anyhow::Result<IndexMergeOutcome> {
+    // Nothing to merge: return before creating a temp directory or spawning any actor.
+    if operations.is_empty() {
+        return Ok(IndexMergeOutcome::default());
+    }
+
+    let index_config = &index_metadata.index_config;
+    let index_uid = index_metadata.index_uid.clone();
+
+    let indexing_directory = temp_dir::Builder::default()
+        .join("mature-merge")
+        .tempdir_in(data_dir_path)
+        .context("failed to create temp directory for mature merge")?;
+
+    let pipeline_id = MergePipelineId {
+        node_id,
+        index_uid,
+        source_id: "_mature_merge".to_string(),
+    };
+
+    let universe = Universe::new();
+    let kill_switch = KillSwitch::default();
+
+    // Build chain from publisher inward (each actor gets the next actor's mailbox).
+
+    let merge_publisher = Publisher::new(
+        PublisherType::MergePublisher,
+        metastore.clone(),
+        // No feedback loop to a merge planner.
+        None,
+        None,
+    );
+    let (merge_publisher_mailbox, merge_publisher_handle) = universe
+        .spawn_builder()
+        .set_kill_switch(kill_switch.clone())
+        .spawn(merge_publisher);
+
+    let merge_uploader = Uploader::new(
+        UploaderType::MergeUploader,
+        metastore.clone(),
+        Arc::new(NopMergePolicy),
+        index_config.retention_policy_opt.clone(),
+        split_store.clone(),
+        merge_publisher_mailbox.into(),
+        config.max_concurrent_merges,
+        Default::default(),
+    );
+    let (merge_uploader_mailbox, merge_uploader_handle) = universe
+        .spawn_builder()
+        .set_kill_switch(kill_switch.clone())
+        .spawn(merge_uploader);
+
+    // Tag fields not supported for now
+    let tag_fields = Vec::new();
+    let merge_packager = Packager::new("MaturePackager", tag_fields, merge_uploader_mailbox);
+    let (merge_packager_mailbox, merge_packager_handle) = universe
+        .spawn_builder()
+        .set_kill_switch(kill_switch.clone())
+        .spawn(merge_packager);
+
+    let merge_executor = MergeExecutor::new_with_tokenizers_only(
+        pipeline_id,
+        metastore,
+        // we only support the default tokenizer manager
+        quickwit_query::create_default_quickwit_tokenizer_manager(),
+        IoControls::default().set_component("mature_merger"),
+        merge_packager_mailbox,
+    );
+    let (merge_executor_mailbox, merge_executor_handle) = universe
+        .spawn_builder()
+        .set_kill_switch(kill_switch.clone())
+        .spawn(merge_executor);
+
+    let merge_split_downloader = MergeSplitDownloader {
+        scratch_directory: indexing_directory,
+        split_store,
+        executor_mailbox: merge_executor_mailbox,
+        io_controls: IoControls::default().set_component("mature_split_downloader"),
+    };
+    let (merge_split_downloader_mailbox, merge_split_downloader_handle) = universe
+        .spawn_builder()
+        .set_kill_switch(kill_switch.clone())
+        .spawn(merge_split_downloader);
+
+    // Send all merge tasks to the downloader, gated by the concurrency semaphore.
+    let inventory: Inventory<MergeOperation> = Inventory::default();
+    for operation in operations {
+        let permit = Arc::clone(&semaphore)
+            .acquire_owned()
+            .await
+            .expect("semaphore should not be closed");
+        let merge_task = MergeTask {
+            merge_operation: inventory.track(operation),
+            _merge_permit: MergePermit::new(permit),
+        };
+        if merge_split_downloader_mailbox
+            .send_message(merge_task)
+            .await
+            .is_err()
+        {
+            // Shut down the actors spawned above before reporting the failure,
+            // so none of them is left running once this function has returned.
+            universe.quit().await;
+            bail!("merge split downloader actor died unexpectedly");
+        }
+    }
+
+    // Dropping the downloader mailbox signals no more tasks are coming.
+    // The pipeline will cascade-exit once all pending tasks are processed.
+    drop(merge_split_downloader_mailbox);
+
+    let (downloader_status, _) = merge_split_downloader_handle.join().await;
+    let (executor_status, _) = merge_executor_handle.join().await;
+    let (packager_status, _) = merge_packager_handle.join().await;
+    let (uploader_status, _) = merge_uploader_handle.join().await;
+    let (publisher_status, publisher_counters) = merge_publisher_handle.join().await;
+
+    universe.quit().await;
+
+    for (name, status) in [
+        ("downloader", downloader_status),
+        ("executor", executor_status),
+        ("packager", packager_status),
+        ("uploader", uploader_status),
+        ("publisher", publisher_status),
+    ] {
+        if !matches!(status, ActorExitStatus::Success | ActorExitStatus::Quit) {
+            bail!(
+                "mature merge actor `{}` exited with unexpected status: {:?}",
+                name,
+                status
+            );
+        }
+    }
+
+    Ok(IndexMergeOutcome {
+        num_published_merges: publisher_counters.num_replace_operations,
+        num_replaced_splits: publisher_counters.num_replaced_splits,
+    })
+}
+
+/// Plans and optionally executes mature merges for a single index.
+///
+/// In dry-run mode the planned operations are printed and nothing is executed.
+/// The returned summary always carries the planning figures; its `outcome` is
+/// left at the default value when no merge was run.
+#[allow(clippy::too_many_arguments)]
+async fn merge_mature_single_index(
+    index_metadata: IndexMetadata,
+    metastore: &MetastoreServiceClient,
+    storage_resolver: &StorageResolver,
+    semaphore: Arc<Semaphore>,
+    data_dir_path: &std::path::Path,
+    config: &MatureMergeConfig,
+    node_id: NodeId,
+    now: OffsetDateTime,
+) -> anyhow::Result<IndexMergeSummary> {
+    let index_id = index_metadata.index_config.index_id.clone();
+    let operations = fetch_splits_and_plan(&index_metadata, metastore, now, config).await?;
+    let num_merges_planned = operations.len();
+    let num_input_splits: usize = operations.iter().map(|op| op.splits.len()).sum();
+    let total_input_bytes: u64 = operations
+        .iter()
+        .flat_map(|op| op.splits.iter())
+        .map(|s| s.uncompressed_docs_size_in_bytes)
+        .sum();
+
+    if config.dry_run {
+        for op in &operations {
+            log_op_for_dry_run(op, &index_id);
+        }
+    }
+    // Nothing to execute: all three figures are 0 when no operation was planned.
+    if config.dry_run || operations.is_empty() {
+        return Ok(IndexMergeSummary {
+            num_merges_planned,
+            num_input_splits,
+            total_input_bytes,
+            outcome: IndexMergeOutcome::default(),
+        });
+    }
+
+    let index_uri = index_metadata.index_uri();
+    let remote_storage = storage_resolver
+        .resolve(index_uri)
+        .await
+        .context("failed to resolve index storage")?;
+    let split_store =
+        IndexingSplitStore::new(remote_storage, Arc::new(IndexingSplitCache::no_caching()));
+
+    let outcome = run_mature_merges_for_index(
+        &index_metadata,
+        operations,
+        metastore.clone(),
+        split_store,
+        semaphore,
+        data_dir_path,
+        config,
+        node_id,
+    )
+    .await?;
+
+    // At least one merge was planned here, otherwise we would have returned above.
+    info!(
+        index_id = %index_id,
+        planned = num_merges_planned,
+        published_merges = outcome.num_published_merges,
+        replaced_splits = outcome.num_replaced_splits,
+        input_splits = num_input_splits,
+        input_bytes = total_input_bytes,
+        "mature split merges complete for index"
+    );
+
+    Ok(IndexMergeSummary {
+        num_merges_planned,
+        num_input_splits,
+        total_input_bytes,
+        outcome,
+    })
+}
+
+/// Aggregates per-index results, logs a global summary line, and warns on per-index errors.
+fn log_merge_results(results: Vec<anyhow::Result<IndexMergeSummary>>, dry_run: bool) {
+    let mut total_planned_merges = 0usize;
+    let mut total_input_splits = 0usize;
+    let mut total_input_bytes = 0u64;
+    let mut total_successfully_published_merges = 0u64;
+    let mut total_successfully_replaced_splits = 0u64;
+
+    let mut num_indexes_successfully_merged = 0usize;
+    let mut num_indexes_partially_merged = 0usize;
+    let mut num_indexes_without_opportunity = 0usize;
+    // Indexes with at least one planned merge, whether or not it was executed.
+    let mut num_indexes_with_opportunities = 0usize;
+    // Indexes whose processing returned an error.
+    let mut num_indexes_failed = 0usize;
+
+    for result in results {
+        let summary = match result {
+            Ok(summary) => summary,
+            Err(err) => {
+                num_indexes_failed += 1;
+                warn!(err = ?err, "error processing index during mature merge");
+                continue;
+            }
+        };
+        total_planned_merges += summary.num_merges_planned;
+        total_input_splits += summary.num_input_splits;
+        total_input_bytes += summary.total_input_bytes;
+        total_successfully_published_merges += summary.outcome.num_published_merges;
+        total_successfully_replaced_splits += summary.outcome.num_replaced_splits;
+
+        if summary.num_merges_planned == 0 {
+            num_indexes_without_opportunity += 1;
+            continue;
+        }
+        num_indexes_with_opportunities += 1;
+        if summary.outcome.num_published_merges == (summary.num_merges_planned as u64) {
+            num_indexes_successfully_merged += 1;
+        } else {
+            num_indexes_partially_merged += 1;
+        }
+    }
+    if dry_run {
+        info!(
+            num_indexes_with_opportunities,
+            num_indexes_without_opportunity,
+            num_indexes_failed,
+            total_planned_merges,
+            total_input_splits,
+            total_input_bytes,
+            "mature merge dry-run complete"
+        );
+    } else {
+        info!(
+            num_indexes_successfully_merged,
+            num_indexes_partially_merged,
+            num_indexes_without_opportunity,
+            num_indexes_failed,
+            total_planned_merges,
+            total_successfully_published_merges,
+            total_successfully_replaced_splits,
+            total_input_splits,
+            total_input_bytes,
+            "mature merge complete"
+        );
+    }
+}
+
+/// Prints a one-line description of a planned merge operation: number of
+/// splits, number of docs, summed footer end offsets (displayed as a byte
+/// size) and the covered time span.
+fn log_op_for_dry_run(op: &MergeOperation, index_id: &str) {
+    // Splits without a time range are ignored; 0 is used when none has one.
+    let start_time: i64 = op
+        .splits
+        .iter()
+        .filter_map(|s| s.time_range.as_ref().map(|r| *r.start()))
+        .min()
+        .unwrap_or(0);
+    let end_time: i64 = op
+        .splits
+        .iter()
+        .filter_map(|s| s.time_range.as_ref().map(|r| *r.end()))
+        .max()
+        .unwrap_or(0);
+    // Hour-precision timestamp, falling back to the raw number when it is out of range.
+    let fmt_ts = |ts: i64| {
+        OffsetDateTime::from_unix_timestamp(ts)
+            .map(|dt| {
+                format!(
+                    "{}-{:02}-{:02}T{:02}",
+                    dt.year(),
+                    dt.month() as u8,
+                    dt.day(),
+                    dt.hour()
+                )
+            })
+            .unwrap_or_else(|_| ts.to_string())
+    };
+    // print is better than log because dry-run will be used interactively from the CLI
+    println!(
+        "[dry-run] {index_id}: {} splits | {} docs | {} | {} → {}",
+        op.splits.len(),
+        op.splits.iter().map(|s| s.num_docs).sum::<usize>(),
+        ByteSize(op.splits.iter().map(|s| s.footer_offsets.end).sum::<u64>()),
+        fmt_ts(start_time),
+        fmt_ts(end_time),
+    );
+}
+
+/// Processes all indexes from the metastore, discovering and running mature
+/// merge opportunities.
+///
+/// If `config.dry_run` is `true`, the planned operations are printed but not executed.
+///
+/// Per-index failures do not abort the run: they are logged and the function
+/// still returns `Ok(())`. An error is returned only when the indexes cannot be
+/// listed or when any of them declares tag fields.
+pub async fn merge_mature_all_indexes(
+    metastore: MetastoreServiceClient,
+    storage_resolver: StorageResolver,
+    data_dir_path: &std::path::Path,
+    config: MatureMergeConfig,
+    node_id: NodeId,
+) -> anyhow::Result<()> {
+    let indexes_metadata = metastore
+        .list_indexes_metadata(ListIndexesMetadataRequest {
+            index_id_patterns: config.index_id_patterns.clone(),
+        })
+        .await
+        .inspect_err(|error| {
+            error!(%error, "failed to list indexes metadata from the metastore");
+        })
+        .context("failed to list indexes")?
+        .deserialize_indexes_metadata()
+        .await
+        .context("failed to deserialize indexes metadata")?;
+
+    info!(
+        num_indexes = indexes_metadata.len(),
+        "starting mature merge"
+    );
+
+    // A semaphore with 0 permits would make every merge wait forever: keep at least one.
+    let semaphore = Arc::new(Semaphore::new(config.max_concurrent_merges.max(1)));
+    let metastore_ref = &metastore;
+    let storage_resolver_ref = &storage_resolver;
+    let config_ref = &config;
+
+    if indexes_metadata
+        .iter()
+        .any(|m| !m.index_config.doc_mapping.tag_fields.is_empty())
+    {
+        // with tags and doc mapping evolutions, we might have weird edge cases
+        // -> just refuse them for now
+        bail!("tags not supported in mature merges");
+    }
+
+    let results: Vec<anyhow::Result<IndexMergeSummary>> = futures::stream::iter(indexes_metadata)
+        .map(|index_metadata| {
+            let node_id = node_id.clone();
+            let semaphore = Arc::clone(&semaphore);
+            async move {
+                let index_id = index_metadata.index_config.index_id.clone();
+                let now = OffsetDateTime::now_utc();
+                merge_mature_single_index(
+                    index_metadata,
+                    metastore_ref,
+                    storage_resolver_ref,
+                    semaphore,
+                    data_dir_path,
+                    config_ref,
+                    node_id,
+                    now,
+                )
+                .await
+                // Errors are only logged at the end: record which index they belong to.
+                .with_context(|| format!("mature merge failed for index `{index_id}`"))
+            }
+        })
+        // With a limit of 0 no index would ever be polled: keep at least one in flight.
+        .buffer_unordered(config.index_parallelism.max(1))
+        .collect()
+        .await;
+
+    log_merge_results(results, config.dry_run);
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use quickwit_common::temp_dir::TempDirectory;
+    use quickwit_config::ConfigFormat;
+    use quickwit_metastore::{
+        IndexMetadata, IndexMetadataResponseExt, SplitMaturity, SplitMetadata,
+        UpdateIndexRequestExt,
+    };
+    use quickwit_proto::metastore::{
+        IndexMetadataRequest, ListSplitsRequest, MetastoreService, MetastoreServiceClient,
+        MockMetastoreService, UpdateIndexRequest,
+    };
+    use quickwit_proto::types::NodeId;
+    use quickwit_storage::RamStorage;
+
+    use super::*;
+    use crate::TestSandbox;
+
+    /// Tests the short-circuit path: when no merge operations are planned,
+    /// `run_mature_merges_for_index` returns 0 immediately without spawning any actors.
+ #[tokio::test] + async fn test_run_mature_merges_for_index_no_operations() -> anyhow::Result<()> { + let mock_metastore = MockMetastoreService::new(); + let storage = Arc::new(RamStorage::default()); + let split_store = IndexingSplitStore::create_without_local_store_for_test(storage); + let index_metadata = IndexMetadata::for_test("test-index", "ram:///test-index"); + let data_dir = TempDirectory::for_test(); + let node_id = NodeId::from_str("test-node"); + + let semaphore = Arc::new(Semaphore::new(2)); + let outcome = run_mature_merges_for_index( + &index_metadata, + vec![], + MetastoreServiceClient::from_mock(mock_metastore), + split_store, + semaphore, + data_dir.path(), + &MatureMergeConfig::default(), + node_id, + ) + .await?; + + assert_eq!(outcome.num_published_merges, 0); + assert_eq!(outcome.num_replaced_splits, 0); + Ok(()) + } + + /// Tests the full per index pipeline end-to-end with a single merge operation + #[tokio::test] + async fn test_run_mature_merges_for_index_merges_real_splits() -> anyhow::Result<()> { + let doc_mapping_yaml = r#" + field_mappings: + - name: body + type: text + - name: ts + type: datetime + input_formats: [unix_timestamp] + fast: true + timestamp_field: ts + "#; + let test_sandbox = + TestSandbox::create("test-index-mature2", doc_mapping_yaml, "", &["body"]).await?; + + // each add_documents() call produces 1 split + for i in 0..4u64 { + test_sandbox + .add_documents(std::iter::once( + serde_json::json!({"body": format!("doc{i}"), "ts": 1_631_072_713u64 + i}), + )) + .await?; + } + + let metastore = test_sandbox.metastore(); + let index_uid = test_sandbox.index_uid(); + + let split_metas: Vec = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await? + .collect_splits_metadata() + .await?; + assert_eq!(split_metas.len(), 4); + + let index_metadata = metastore + .index_metadata(IndexMetadataRequest::for_index_id( + index_uid.index_id.to_string(), + )) + .await? 
+ .deserialize_index_metadata()?; + + let merge_op = MergeOperation::new_merge_operation(split_metas); + let split_store = + IndexingSplitStore::create_without_local_store_for_test(test_sandbox.storage()); + let data_dir = TempDirectory::for_test(); + let semaphore = Arc::new(Semaphore::new(2)); + + let outcome = run_mature_merges_for_index( + &index_metadata, + vec![merge_op], + metastore.clone(), + split_store, + semaphore, + data_dir.path(), + &MatureMergeConfig::default(), + test_sandbox.node_id(), + ) + .await?; + + assert_eq!(outcome.num_published_merges, 1); + assert_eq!(outcome.num_replaced_splits, 4); + + // The 4 input splits are now MarkedForDeletion; 1 merged Published split should remain. + let published_after: Vec = metastore + .list_splits(ListSplitsRequest::try_from_list_splits_query( + &ListSplitsQuery::for_index(index_uid).with_split_state(SplitState::Published), + )?) + .await? + .collect_splits_metadata() + .await?; + assert_eq!(published_after.len(), 1); + assert_eq!(published_after[0].num_docs, 4); + assert_eq!(published_after[0].maturity, SplitMaturity::Mature); + assert_eq!( + published_after[0].time_range, + Some(1_631_072_713..=1_631_072_716) + ); + + test_sandbox.assert_quit().await; + Ok(()) + } + + #[tokio::test] + async fn test_merge_mature_single_index_schema_evolution() -> anyhow::Result<()> { + let doc_mapping_v1_yaml = r#" + field_mappings: + - name: ts + type: datetime + input_formats: [unix_timestamp] + fast: true + - name: label + type: text + fast: true + tokenizer: lowercase + timestamp_field: ts + "#; + let test_sandbox = + TestSandbox::create("test-index-schema-evo", doc_mapping_v1_yaml, "", &["label"]) + .await?; + + let base_time = 1_631_072_713i64; // Wednesday, September 8, 2021 at 3:45:13 AM UTC + + // create 3 splits with v1 mapping + for i in 0..3i64 { + test_sandbox + .add_documents(std::iter::once( + serde_json::json!({"label": format!("Doc{i}"), "ts": base_time + i}), + )) + .await?; + } + + let metastore = 
test_sandbox.metastore(); + let index_uid = test_sandbox.index_uid(); + + let v1_splits: Vec = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await? + .collect_splits_metadata() + .await?; + assert_eq!(v1_splits.len(), 3); + let v1_doc_mapping_uid = v1_splits[0].doc_mapping_uid; + + // Update the index config: change tokenizer to `default` and add a secondary timestamp. + let index_metadata_v1 = metastore + .index_metadata(IndexMetadataRequest::for_index_id( + index_uid.index_id.to_string(), + )) + .await? + .deserialize_index_metadata()?; + let doc_mapping_v2 = ConfigFormat::Yaml.parse( + r#" + field_mappings: + - name: ts + type: datetime + input_formats: [unix_timestamp] + fast: true + - name: label + type: text + fast: true + tokenizer: default + - name: ts2 + type: datetime + input_formats: [unix_timestamp] + fast: true + timestamp_field: ts + secondary_timestamp_field: ts2 + "# + .as_bytes(), + )?; + let update_request = UpdateIndexRequest::try_from_updates( + index_uid.clone(), + &doc_mapping_v2, + &index_metadata_v1.index_config.indexing_settings, + &index_metadata_v1.index_config.ingest_settings, + &index_metadata_v1.index_config.search_settings, + &index_metadata_v1.index_config.retention_policy_opt, + )?; + metastore.update_index(update_request).await?; + + // create 3 more splits with v2 mapping + for i in 3..6i64 { + test_sandbox + .add_documents(std::iter::once(serde_json::json!({ + "label": format!("Doc{i}"), + "ts": base_time + i, + "ts2": base_time + i + 1000, + }))) + .await?; + } + + let all_splits: Vec = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await? 
+ .collect_splits_metadata() + .await?; + assert_eq!(all_splits.len(), 6); + let v2_doc_mapping_uid = all_splits + .iter() + .find(|s| s.doc_mapping_uid != v1_doc_mapping_uid) + .unwrap() + .doc_mapping_uid; + assert_eq!( + all_splits + .iter() + .filter(|s| s.doc_mapping_uid == v1_doc_mapping_uid) + .count(), + 3 + ); + assert_eq!( + all_splits + .iter() + .filter(|s| s.doc_mapping_uid == v2_doc_mapping_uid) + .count(), + 3 + ); + + let index_metadata_v2 = metastore + .index_metadata(IndexMetadataRequest::for_index_id( + index_uid.index_id.to_string(), + )) + .await? + .deserialize_index_metadata()?; + let data_dir = TempDirectory::for_test(); + let semaphore = Arc::new(Semaphore::new(2)); + // Splits have the default 48h maturation period. Pass a `now` far enough in the future + // so all splits (both v1 and v2) are mature at `now - MATURITY_BUFFER (6h)`. + let now = OffsetDateTime::now_utc() + time::Duration::days(3); + // Override min_merge_group_size to 2 so that 3-split groups qualify. + let config = MatureMergeConfig { + min_merge_group_size: 2, + ..MatureMergeConfig::default() + }; + + let summary = merge_mature_single_index( + index_metadata_v2, + &metastore, + &test_sandbox.storage_resolver(), + semaphore, + data_dir.path(), + &config, + test_sandbox.node_id(), + now, + ) + .await?; + + // Both the v1 and v2 groups (3 splits each, different doc_mapping_uid) get merged. + assert_eq!(summary.num_merges_planned, 2); + assert_eq!(summary.outcome.num_published_merges, 2); + assert_eq!(summary.outcome.num_replaced_splits, 6); + + let published_after: Vec = metastore + .list_splits(ListSplitsRequest::try_from_list_splits_query( + &ListSplitsQuery::for_index(index_uid).with_split_state(SplitState::Published), + )?) + .await? 
+ .collect_splits_metadata() + .await?; + assert_eq!(published_after.len(), 2); + + // The merged v1 split preserves the original doc_mapping_uid, time range, and has no + // secondary_time_range because the v1 schema had no secondary timestamp field. + let merged_v1 = published_after + .iter() + .find(|s| s.doc_mapping_uid == v1_doc_mapping_uid) + .expect("merged v1 split must exist"); + assert_eq!(merged_v1.num_docs, 3); + assert_eq!(merged_v1.maturity, SplitMaturity::Mature); + assert_eq!(merged_v1.time_range, Some(base_time..=base_time + 2)); + assert_eq!(merged_v1.secondary_time_range, None); + + // The merged v2 split has the updated doc_mapping_uid and a secondary_time_range + // derived from the ts2 field. + let merged_v2 = published_after + .iter() + .find(|s| s.doc_mapping_uid == v2_doc_mapping_uid) + .expect("merged v2 split must exist"); + assert_eq!(merged_v2.num_docs, 3); + assert_eq!(merged_v2.maturity, SplitMaturity::Mature); + assert_eq!(merged_v2.time_range, Some(base_time + 3..=base_time + 5)); + assert_eq!( + merged_v2.secondary_time_range, + Some(base_time + 1003..=base_time + 1005) + ); + + test_sandbox.assert_quit().await; + Ok(()) + } +} diff --git a/quickwit/quickwit-indexing/src/mature_merge_plan.rs b/quickwit/quickwit-indexing/src/mature_merge_plan.rs new file mode 100644 index 00000000000..e71736d1891 --- /dev/null +++ b/quickwit/quickwit-indexing/src/mature_merge_plan.rs @@ -0,0 +1,494 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashMap;
+use std::time::Duration;
+
+use quickwit_config::IndexConfig;
+use quickwit_metastore::SplitMetadata;
+use time::OffsetDateTime;
+
+use crate::mature_merge::MatureMergeConfig;
+use crate::merge_policy::MergeOperation;
+
+pub const SECS_PER_DAY: i64 = 60 * 60 * 24;
+
+/// Wait a couple of hours after the split got mature to be extra sure no merge
+/// process is still running on it.
+pub const MATURITY_BUFFER: Duration = Duration::from_hours(6);
+
+/// Computes the earliest UTC-day midnight (seconds since epoch) that is safe to merge,
+/// given the index's retention policy and the current time.
+///
+/// Returns `None` when the index has no retention policy or when its retention
+/// period cannot be obtained. Returns `Some(i64::MAX)` when the retention period
+/// does not exceed the safety buffer.
+fn retention_safety_cutoff_secs(
+    index_config: &IndexConfig,
+    now_secs: i64,
+    config: &MatureMergeConfig,
+) -> Option<i64> {
+    let retention_policy = index_config.retention_policy_opt.as_ref()?;
+    let period = retention_policy.retention_period().ok()?;
+    let retention_safety_buffer = Duration::from_hours(config.retention_safety_buffer_days * 24);
+    if period <= retention_safety_buffer {
+        // No safe window: exclude every split by returning a cutoff in the far future.
+        return Some(i64::MAX);
+    }
+    let cutoff_raw = now_secs - period.as_secs() as i64 + retention_safety_buffer.as_secs() as i64;
+    // Round up to the next day boundary so we never partially exclude a day bucket.
+    Some((cutoff_raw / SECS_PER_DAY + 1) * SECS_PER_DAY)
+}
+
+/// Converts a single day-bucket group of eligible splits into one or more
+/// [`MergeOperation`]s respecting constraints.
+///
+/// Returns no operation when the group is empty or smaller than
+/// `config.min_merge_group_size`. A `max_merge_group_size` or
+/// `split_target_num_docs` of 0 is treated as 1 instead of dividing by zero.
+fn plan_operations_for_group(
+    mut group_splits: Vec<SplitMetadata>,
+    config: &MatureMergeConfig,
+) -> Vec<MergeOperation> {
+    let num_splits = group_splits.len();
+    // The explicit emptiness check matters when `min_merge_group_size` is 0:
+    // the chunk size computed below would be 0 and `chunks(0)` panics.
+    if num_splits == 0 || num_splits < config.min_merge_group_size {
+        return Vec::new();
+    }
+    // Sort ascending by end time so each sub-operation covers the most compact range.
+    group_splits.sort_by_key(|s| s.time_range.as_ref().map(|r| *r.end()).unwrap_or(0));
+
+    let total_docs: usize = group_splits.iter().map(|s| s.num_docs).sum();
+
+    // Guard the divisors: a limit of 0 in the config would otherwise panic in `div_ceil`.
+    let max_splits_per_operation = config.max_merge_group_size.max(1);
+    let max_docs_per_operation = config.split_target_num_docs.max(1);
+
+    // Minimum number of operations needed to respect both per-operation limits.
+    let num_operations = num_splits
+        .div_ceil(max_splits_per_operation)
+        .max(total_docs.div_ceil(max_docs_per_operation))
+        .max(1);
+
+    // Cut the sorted splits into chunks of `ceil(num_splits / num_operations)`.
+    // Only the last chunk can be smaller than the others, and any chunk below
+    // `min_merge_group_size` is dropped (its splits are left unmerged).
+    let chunk_size = num_splits.div_ceil(num_operations);
+    group_splits
+        .chunks(chunk_size)
+        .filter(|chunk| chunk.len() >= config.min_merge_group_size)
+        .map(|chunk| MergeOperation::new_merge_operation(chunk.to_vec()))
+        .collect()
+}
+
+/// Group by UTC day (floored to midnight in seconds) of the split's time range,
+/// and returns one or more [`MergeOperation`]s per group that meets the size
+/// threshold.
+///
+/// Rules:
+/// - Splits without a `time_range` are skipped (cannot assign a day).
+/// - A split is only assigned to a bucket when the UTC day number of `time_range.end()` minus the
+///   UTC day number of `time_range.start()` equals the configured number of days.
+/// - Immature splits are excluded.
+/// - Splits whose `time_range.end()` falls within the retention safety buffer are excluded.
+///
+/// Important: This plan merges splits across sources. It can be problematic if
+/// the IndexingSettings are different (e.g. different maturation period), which
+/// was made possible on Kafka sources by specifying an override in the
+/// client_params.
+pub fn plan_merge_operations_for_index( + index_config: &IndexConfig, + splits: Vec, + now: OffsetDateTime, + config: &MatureMergeConfig, +) -> Vec { + let now_secs = now.unix_timestamp(); + + let earliest_cutoff_timestamp = retention_safety_cutoff_secs(index_config, now_secs, config); + + // Key: (partition_id, doc_mapping_uid_string, day_bucket_seconds, secondary_day_opt) + let mut groups: HashMap<(u64, String, i64, Option), Vec> = HashMap::new(); + + for split in splits { + // Only splits that have been mature for a while + if !split.is_mature(now - MATURITY_BUFFER) { + continue; + } + + // Enforce the max size for splits to be considered for merging. + if split.num_docs > config.input_split_max_num_docs { + continue; + } + + // The timestamp field is required + let Some(ref time_range) = split.time_range else { + continue; + }; + + let start_day = time_range.start() / SECS_PER_DAY; + let end_day = time_range.end() / SECS_PER_DAY; + + // also group on secondary time range to make sure retention can still be applied + let secondary_day_opt = split + .secondary_time_range + .as_ref() + // In the nominal case, the secondary time (ingest time) is only + // slightly greater than the primary time (event time). Using + // `start()` here decreases the chances of further fragmenting the + // group at the day limits. + .map(|r| r.start() / SECS_PER_DAY); + + // Focus on splits with a specific timestamp range. + if end_day - start_day != (config.split_timestamp_days_range as i64) { + continue; + } + + // Check that we are not too close to the retention cutoff. 
+ if let Some(cutoff) = earliest_cutoff_timestamp + && *time_range.end() < cutoff + { + continue; + } + + let key = ( + split.partition_id, + split.doc_mapping_uid.to_string(), + start_day, + secondary_day_opt, + ); + groups.entry(key).or_default().push(split); + } + + let mut operations = Vec::new(); + for (_key, group_splits) in groups { + operations.extend(plan_operations_for_group(group_splits, config)); + } + operations +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use quickwit_config::{IndexConfig, RetentionPolicy}; + use quickwit_metastore::{SplitMaturity, SplitMetadata}; + use quickwit_proto::types::{DocMappingUid, IndexUid}; + use time::OffsetDateTime; + + use super::*; + + /// Builds a mature [`SplitMetadata`] for use in tests. + /// + /// - `day_bucket`: UTC day expressed as seconds-since-epoch (midnight). For example `day_bucket + /// = 0` means 1970-01-01, `day_bucket = SECS_PER_DAY` means 1970-01-02. + fn mature_split_for_test( + split_id: &str, + index_uid: &IndexUid, + partition_id: u64, + doc_mapping_uid: DocMappingUid, + num_docs: usize, + day_bucket: i64, + ) -> SplitMetadata { + SplitMetadata { + split_id: split_id.to_string(), + index_uid: index_uid.clone(), + partition_id, + num_docs, + doc_mapping_uid, + // Both endpoints on the same UTC day — the split spans one hour. + time_range: Some(day_bucket..=(day_bucket + 3600)), + maturity: SplitMaturity::Mature, + ..Default::default() + } + } + + fn index_config_no_retention() -> IndexConfig { + IndexConfig::for_test("test-index", "s3://test-bucket/test-index") + } + + fn index_config_with_retention(period: &str) -> IndexConfig { + let mut config = index_config_no_retention(); + config.retention_policy_opt = Some(RetentionPolicy { + retention_period: period.to_string(), + evaluation_schedule: "daily".to_string(), + timestamp_type: Default::default(), + }); + config + } + + // UTC day 0 = 1970-01-01. Use a recent-ish day to avoid the retention buffer. 
+ // We use day 20000 (approx 2024-10) so splits are "recent" relative to a "now" we control. + const RECENT_DAY: i64 = 20_000 * SECS_PER_DAY; + + fn now_well_after_recent_day() -> OffsetDateTime { + // 1 day after the splits' day — they are mature but not in a retention buffer. + OffsetDateTime::from_unix_timestamp(RECENT_DAY + SECS_PER_DAY + 1).unwrap() + } + + #[test] + fn test_plan_basic() { + let index_uid = IndexUid::for_test("test-index", 0); + let doc_mapping_uid = DocMappingUid::random(); + let splits: Vec = (0..10) + .map(|i| { + mature_split_for_test( + &format!("split-{i}"), + &index_uid, + 1, + doc_mapping_uid, + 100, + RECENT_DAY, + ) + }) + .collect(); + + let operations = plan_merge_operations_for_index( + &index_config_no_retention(), + splits, + now_well_after_recent_day(), + &MatureMergeConfig::default(), + ); + + assert_eq!(operations.len(), 1); + assert_eq!(operations[0].splits.len(), 10); + } + + #[test] + fn test_plan_below_threshold() { + let index_uid = IndexUid::for_test("test-index", 0); + let doc_mapping_uid = DocMappingUid::random(); + // Only 4 splits — below the min_merge_group_size (5). + let splits: Vec = (0..4) + .map(|i| { + mature_split_for_test( + &format!("split-{i}"), + &index_uid, + 1, + doc_mapping_uid, + 100, + RECENT_DAY, + ) + }) + .collect(); + + let operations = plan_merge_operations_for_index( + &index_config_no_retention(), + splits, + now_well_after_recent_day(), + &MatureMergeConfig { + min_merge_group_size: 5, + ..Default::default() + }, + ); + + assert!(operations.is_empty(), "expected no operations for 4 splits"); + } + + #[test] + fn test_plan_immature_splits_excluded() { + let index_uid = IndexUid::for_test("test-index", 0); + let doc_mapping_uid = DocMappingUid::random(); + let now = now_well_after_recent_day(); + let now_ts = now.unix_timestamp(); + + // All splits are immature (maturation period far in the future). 
+ let splits: Vec = (0..10) + .map(|i| { + let mut split = mature_split_for_test( + &format!("split-{i}"), + &index_uid, + 1, + doc_mapping_uid, + 100, + RECENT_DAY, + ); + split.maturity = SplitMaturity::Immature { + maturation_period: Duration::from_secs(999_999), + }; + // Make sure create_timestamp is recent so the split is truly immature. + split.create_timestamp = now_ts; + split + }) + .collect(); + + let operations = plan_merge_operations_for_index( + &index_config_no_retention(), + splits, + now, + &MatureMergeConfig::default(), + ); + + assert!(operations.is_empty(), "immature splits should be excluded"); + } + + #[test] + fn test_plan_multiday_split_skipped() { + let index_uid = IndexUid::for_test("test-index", 0); + let doc_mapping_uid = DocMappingUid::random(); + + // 10 splits, but each one spans midnight (start on day N, end on day N+1). + let splits: Vec = (0..10) + .map(|i| { + let mut split = mature_split_for_test( + &format!("split-{i}"), + &index_uid, + 1, + doc_mapping_uid, + 100, + RECENT_DAY, + ); + // Extend time_range to cross midnight. + split.time_range = Some(RECENT_DAY - 3600..=RECENT_DAY + 3600); + split + }) + .collect(); + + let operations = plan_merge_operations_for_index( + &index_config_no_retention(), + splits, + now_well_after_recent_day(), + &MatureMergeConfig::default(), + ); + + assert!(operations.is_empty(), "multi-day splits should be skipped"); + } + + #[test] + fn test_plan_retention_safety_buffer() { + let index_uid = IndexUid::for_test("test-index", 0); + let doc_mapping_uid = DocMappingUid::random(); + + // Retention period = 90 days. Safety buffer = 30 days. + // Splits must have time_range.end >= now - 90d + 30d = now - 60d. + // We put splits at RECENT_DAY but set "now" to be RECENT_DAY + 91 days. + // Then: cutoff_raw = (RECENT_DAY + 91d) - 90d + 30d = RECENT_DAY + 31d + // cutoff = RECENT_DAY + 32d (rounded up to next day boundary) + // Because RECENT_DAY + 3600 < cutoff, splits should be excluded. 
+ let now_ts = RECENT_DAY + 91 * SECS_PER_DAY; + let now = OffsetDateTime::from_unix_timestamp(now_ts).unwrap(); + + let splits: Vec = (0..10) + .map(|i| { + mature_split_for_test( + &format!("split-{i}"), + &index_uid, + 1, + doc_mapping_uid, + 100, + RECENT_DAY, + ) + }) + .collect(); + + let config = index_config_with_retention("90 days"); + + let merge_config = MatureMergeConfig { + retention_safety_buffer_days: 30, + ..MatureMergeConfig::default() + }; + let operations = plan_merge_operations_for_index(&config, splits, now, &merge_config); + + assert!( + operations.is_empty(), + "splits within retention safety buffer should be excluded" + ); + } + + #[test] + fn test_plan_retention_period_too_short_skipped() { + let index_uid = IndexUid::for_test("test-index", 0); + let doc_mapping_uid = DocMappingUid::random(); + + let splits: Vec = (0..10) + .map(|i| { + mature_split_for_test( + &format!("split-{i}"), + &index_uid, + 1, + doc_mapping_uid, + 100, + RECENT_DAY, + ) + }) + .collect(); + + // Retention period of 3 days is <= retention_safety_buffer_days (default 5 days) + // so the index should be skipped entirely. + let config = index_config_with_retention("3 days"); + + let operations = plan_merge_operations_for_index( + &config, + splits, + now_well_after_recent_day(), + &MatureMergeConfig::default(), + ); + + assert!( + operations.is_empty(), + "index with short retention should produce no operations" + ); + } + + #[test] + fn test_plan_different_partitions_grouped_separately() { + let index_uid = IndexUid::for_test("test-index", 0); + let doc_mapping_uid = DocMappingUid::random(); + + // 6 splits per partition, two partitions => 2 separate merge operations. 
+ let splits: Vec = (0..12) + .map(|i| { + mature_split_for_test( + &format!("split-{i}"), + &index_uid, + i as u64 / 6, // partition 0 for i in 0..6, partition 1 for i in 6..12 + doc_mapping_uid, + 100, + RECENT_DAY, + ) + }) + .collect(); + + let mut operations = plan_merge_operations_for_index( + &index_config_no_retention(), + splits, + now_well_after_recent_day(), + &MatureMergeConfig::default(), + ); + operations.sort_by_key(|op| op.splits[0].partition_id); + + assert_eq!(operations.len(), 2); + assert!(operations[0].splits.iter().all(|s| s.partition_id == 0)); + assert!(operations[1].splits.iter().all(|s| s.partition_id == 1)); + } + + #[test] + fn test_plan_split_timestamp_days_range_one() { + let index_uid = IndexUid::for_test("test-index", 0); + let doc_mapping_uid = DocMappingUid::random(); + + let splits: Vec = (0..60) + .map(|i| { + let mut split = mature_split_for_test( + &format!("split-{i}"), + &index_uid, + 1, + doc_mapping_uid, + 100, + RECENT_DAY, + ); + split.time_range = Some(RECENT_DAY..=RECENT_DAY + i * 3600); + split + }) + .collect(); + + let config = MatureMergeConfig { + split_timestamp_days_range: 1, + ..MatureMergeConfig::default() + }; + let operations = plan_merge_operations_for_index( + &index_config_no_retention(), + splits, + now_well_after_recent_day(), + &config, + ); + + assert_eq!(operations.len(), 1); + assert_eq!(operations[0].splits.len(), 24); + } +} diff --git a/quickwit/quickwit-indexing/src/merge_policy/mod.rs b/quickwit/quickwit-indexing/src/merge_policy/mod.rs index 9319f8d8498..c9ae4b56aef 100644 --- a/quickwit/quickwit-indexing/src/merge_policy/mod.rs +++ b/quickwit/quickwit-indexing/src/merge_policy/mod.rs @@ -386,7 +386,7 @@ pub mod tests { let merged_split_id = new_split_id(); let tags = merge_tags(splits); let pipeline_id = MergePipelineId { - node_id: NodeId::from("test_node"), + node_id: NodeId::from_str("test_node"), index_uid: IndexUid::new_with_random_ulid("test_index"), source_id: "test_source".to_string(), 
}; @@ -418,7 +418,7 @@ pub mod tests { let pipeline_id = IndexingPipelineId { index_uid: IndexUid::new_with_random_ulid("test-index"), source_id: "test-source".to_string(), - node_id: NodeId::from("test-node"), + node_id: NodeId::from_str("test-node"), pipeline_uid: PipelineUid::default(), }; let merge_planner = MergePlanner::new( diff --git a/quickwit/quickwit-indexing/src/models/indexed_split.rs b/quickwit/quickwit-indexing/src/models/indexed_split.rs index e129feede9b..a622b241da9 100644 --- a/quickwit/quickwit-indexing/src/models/indexed_split.rs +++ b/quickwit/quickwit-indexing/src/models/indexed_split.rs @@ -105,12 +105,12 @@ impl IndexedSplitBuilder { partition_id, split_id, num_docs: 0, - replaced_split_ids: Vec::new(), uncompressed_docs_size_in_bytes: 0, time_range: None, secondary_time_range: None, delete_opstamp: last_delete_opstamp, num_merge_ops: 0, + replaced_splits: Vec::new(), }, index_writer, split_scratch_directory, diff --git a/quickwit/quickwit-indexing/src/models/mod.rs b/quickwit/quickwit-indexing/src/models/mod.rs index 9dfdfde1594..d1642791933 100644 --- a/quickwit/quickwit-indexing/src/models/mod.rs +++ b/quickwit/quickwit-indexing/src/models/mod.rs @@ -47,7 +47,7 @@ use quickwit_proto::types::PublishToken; pub use raw_doc_batch::RawDocBatch; pub(crate) use shard_positions::LocalShardPositionsUpdate; pub use shard_positions::ShardPositionsService; -pub use split_attrs::{SplitAttrs, create_split_metadata}; +pub use split_attrs::{ReplacedSplit, SplitAttrs, create_split_metadata}; #[derive(Debug)] pub struct NewPublishToken(pub PublishToken); diff --git a/quickwit/quickwit-indexing/src/models/publisher_message.rs b/quickwit/quickwit-indexing/src/models/publisher_message.rs index 13182a8f76a..e1ba9eb8ae2 100644 --- a/quickwit/quickwit-indexing/src/models/publisher_message.rs +++ b/quickwit/quickwit-indexing/src/models/publisher_message.rs @@ -22,11 +22,11 @@ use tracing::Span; use crate::merge_policy::MergeTask; use crate::models::PublishLock; 
+use crate::models::split_attrs::ReplacedSplit; pub struct SplitsUpdate { pub index_uid: IndexUid, pub new_splits: Vec, - pub replaced_split_ids: Vec, pub checkpoint_delta_opt: Option, pub publish_lock: PublishLock, pub publish_token_opt: Option, @@ -36,6 +36,7 @@ pub struct SplitsUpdate { /// If `None`, the split batch was built in the `IndexingPipeline`. pub merge_task: Option, pub parent_span: Span, + pub replaced_splits: Vec, } impl fmt::Debug for SplitsUpdate { diff --git a/quickwit/quickwit-indexing/src/models/split_attrs.rs b/quickwit/quickwit-indexing/src/models/split_attrs.rs index dde48fab25a..4a8076c4ed6 100644 --- a/quickwit/quickwit-indexing/src/models/split_attrs.rs +++ b/quickwit/quickwit-indexing/src/models/split_attrs.rs @@ -25,6 +25,14 @@ use time::OffsetDateTime; use crate::merge_policy::MergePolicy; +#[derive(PartialEq, Eq, Debug, Default, Clone)] +pub struct ReplacedSplit { + pub split_id: SplitId, + /// Snapshot of the split's soft-deletes. These will be consolidated into + /// the split during the merge. + pub soft_deleted_doc_ids: BTreeSet, +} + pub struct SplitAttrs { /// ID of the node that produced the split. pub node_id: NodeId, @@ -61,13 +69,13 @@ pub struct SplitAttrs { pub time_range: Option>, pub secondary_time_range: Option>, - pub replaced_split_ids: Vec, - /// Delete opstamp. pub delete_opstamp: u64, // Number of merge operation the split has been through so far. 
pub num_merge_ops: usize, + + pub replaced_splits: Vec, } impl fmt::Debug for SplitAttrs { @@ -75,7 +83,14 @@ impl fmt::Debug for SplitAttrs { f.debug_struct("SplitAttrs") .field("split_id", &self.split_id) .field("partition_id", &self.partition_id) - .field("replaced_split_ids", &self.replaced_split_ids) + .field( + "replaced_split_ids", + &self + .replaced_splits + .iter() + .map(|s| &s.split_id) + .collect::>(), + ) .field("time_range", &self.time_range) .field( "uncompressed_docs_size_in_bytes", @@ -137,6 +152,7 @@ pub fn create_split_metadata( footer_offsets, delete_opstamp: split_attrs.delete_opstamp, num_merge_ops: split_attrs.num_merge_ops, + soft_deleted_doc_ids: BTreeSet::new(), } } diff --git a/quickwit/quickwit-indexing/src/soft_delete_query.rs b/quickwit/quickwit-indexing/src/soft_delete_query.rs new file mode 100644 index 00000000000..fad5ed564b9 --- /dev/null +++ b/quickwit/quickwit-indexing/src/soft_delete_query.rs @@ -0,0 +1,377 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; + +use tantivy::index::SegmentId; +use tantivy::query::{EmptyScorer, EnableScoring, Explanation, Query, Scorer, Weight}; +use tantivy::{DocId, DocSet, Score, SegmentReader, TERMINATED, TantivyError, Term}; + +/// A tantivy [`Query`] that matches specific doc IDs within their respective segments. 
+/// +/// Built from the `soft_deleted_doc_ids` fields of the input [`SplitMetadata`] structs, this +/// query is passed to [`IndexWriter::delete_query`] so that the matched documents are marked for +/// deletion and then physically removed during the subsequent tantivy merge. The query itself only +/// identifies which documents to remove; the actual deletion is performed by the caller. +#[derive(Clone, Debug)] +pub(crate) struct SoftDeletedDocIdsQuery { + /// Maps each segment ID to the **sorted** list of doc IDs to delete within that segment. + docs_per_segment: HashMap>, +} + +impl SoftDeletedDocIdsQuery { + pub(crate) fn new(docs_per_segment: HashMap>) -> Self { + Self { docs_per_segment } + } +} + +impl Query for SoftDeletedDocIdsQuery { + fn weight(&self, _: EnableScoring<'_>) -> tantivy::Result> { + Ok(Box::new(SoftDeletedDocIdsWeight { + docs_per_segment: self.docs_per_segment.clone(), + })) + } + + fn query_terms<'a>(&'a self, _visitor: &mut dyn FnMut(&'a Term, bool)) { + // Doc-ID–based query — no index terms to visit. + } +} + +/// Minimal `DocSet + Scorer` over a pre-sorted, deduplicated list of doc IDs. +/// +/// Starts positioned at the first document (no initial `advance()` call required). +struct SortedDocIdScorer { + doc_ids: Vec, + pos: usize, +} + +impl DocSet for SortedDocIdScorer { + fn advance(&mut self) -> DocId { + self.pos += 1; + self.doc() + } + + fn seek(&mut self, target: DocId) -> DocId { + // Binary-search to the first id >= target. 
+ self.pos = self.doc_ids.partition_point(|&id| id < target); + self.doc() + } + + fn doc(&self) -> DocId { + self.doc_ids.get(self.pos).copied().unwrap_or(TERMINATED) + } + + fn size_hint(&self) -> u32 { + self.doc_ids.len().saturating_sub(self.pos) as u32 + } +} + +impl Scorer for SortedDocIdScorer { + fn score(&mut self) -> Score { + 1.0 + } +} + +struct SoftDeletedDocIdsWeight { + docs_per_segment: HashMap>, +} + +impl Weight for SoftDeletedDocIdsWeight { + fn scorer(&self, reader: &SegmentReader, _boost: Score) -> tantivy::Result> { + let Some(doc_ids) = self.docs_per_segment.get(&reader.segment_id()) else { + return Ok(Box::new(EmptyScorer)); + }; + // Filter defensively: doc IDs must be < max_doc. The BTreeSet source guarantees + // strict ascending order, which SortedDocIdScorer requires. + let doc_ids: Vec = doc_ids + .iter() + .copied() + .filter(|&id| id < reader.max_doc()) + .collect(); + if doc_ids.is_empty() { + return Ok(Box::new(EmptyScorer)); + } + Ok(Box::new(SortedDocIdScorer { doc_ids, pos: 0 })) + } + + fn explain(&self, reader: &SegmentReader, doc: DocId) -> tantivy::Result { + let is_deleted = self + .docs_per_segment + .get(&reader.segment_id()) + .map(|ids| ids.binary_search(&doc).is_ok()) + .unwrap_or(false); + if is_deleted { + Ok(Explanation::new("SoftDeletedDocIdsQuery", 1.0)) + } else { + Err(TantivyError::InvalidArgument(format!( + "Document #{doc} is not soft-deleted in this segment" + ))) + } + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use tantivy::collector::TopDocs; + use tantivy::index::SegmentId; + use tantivy::query::AllQuery; + use tantivy::schema::{STORED, Schema, TEXT, Value}; + use tantivy::{Index, IndexWriter, ReloadPolicy, TantivyDocument, doc}; + + use super::*; + + /// Build an in-RAM single-segment index where each entry in `texts` becomes + /// one stored document. All documents are committed in a single pass so + /// tantivy assigns them contiguous doc IDs starting at 0. 
+ fn make_index(texts: &[&str]) -> tantivy::Result<(Index, tantivy::schema::Field)> { + let mut schema_builder = Schema::builder(); + let body = schema_builder.add_text_field("body", TEXT | STORED); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut writer: IndexWriter = index.writer(15_000_000)?; + for text in texts { + writer.add_document(doc!(body => *text))?; + } + writer.commit()?; + Ok((index, body)) + } + + /// Apply `query` via `IndexWriter::delete_query`, commit, and return a + /// freshly-opened reader that reflects the resulting deletion state. + fn apply_delete_query( + index: &Index, + query: SoftDeletedDocIdsQuery, + ) -> tantivy::Result { + let mut writer: IndexWriter = index.writer(15_000_000)?; + writer.delete_query(Box::new(query))?; + writer.commit()?; + index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into() + } + + /// Collect and sort the stored body values of all live documents so that + /// tests can assert on the exact surviving content, independent of score + /// ordering. 
+ fn live_bodies( + reader: &tantivy::IndexReader, + body: tantivy::schema::Field, + ) -> tantivy::Result> { + let searcher = reader.searcher(); + let top_docs = searcher.search(&AllQuery, &TopDocs::with_limit(1_000).order_by_score())?; + let mut texts: Vec = top_docs + .iter() + .map(|(_, addr)| { + let doc: TantivyDocument = searcher.doc(*addr).unwrap(); + doc.get_first(body) + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string() + }) + .collect(); + texts.sort(); + Ok(texts) + } + + #[test] + fn test_delete_query_removes_targeted_docs() -> tantivy::Result<()> { + let (index, _) = make_index(&["a", "b", "c", "d", "e"])?; + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + let seg_readers = searcher.segment_readers(); + assert_eq!( + seg_readers.len(), + 1, + "expected a single segment after one commit" + ); + let segment_id = seg_readers[0].segment_id(); + drop(searcher); + + // Target doc IDs 1 ("b") and 3 ("d"). + let query = SoftDeletedDocIdsQuery::new(HashMap::from([(segment_id, vec![1u32, 3u32])])); + let reader_after = apply_delete_query(&index, query)?; + let searcher_after = reader_after.searcher(); + let seg = &searcher_after.segment_readers()[0]; + + assert_eq!(seg.num_docs(), 3, "exactly 3 docs must survive"); + Ok(()) + } + + #[test] + fn test_delete_query_leaves_correct_docs_alive() -> tantivy::Result<()> { + let (index, body) = make_index(&["a", "b", "c", "d", "e"])?; + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let segment_id = { + let searcher = reader.searcher(); + searcher.segment_readers()[0].segment_id() + }; + + // Delete docs 1 ("b") and 3 ("d"); "a", "c", "e" must survive. 
+ let query = SoftDeletedDocIdsQuery::new(HashMap::from([(segment_id, vec![1u32, 3u32])])); + let reader_after = apply_delete_query(&index, query)?; + + let surviving = live_bodies(&reader_after, body)?; + assert_eq!(surviving, vec!["a", "c", "e"]); + Ok(()) + } + + #[test] + fn test_delete_query_removes_all_docs() -> tantivy::Result<()> { + let (index, _) = make_index(&["x", "y", "z"])?; + let segment_id = { + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + searcher.segment_readers()[0].segment_id() + }; + + let query = + SoftDeletedDocIdsQuery::new(HashMap::from([(segment_id, vec![0u32, 1u32, 2u32])])); + let reader_after = apply_delete_query(&index, query)?; + let searcher_after = reader_after.searcher(); + + let total_live_docs: u32 = searcher_after + .segment_readers() + .iter() + .map(|r| r.num_docs()) + .sum(); + assert_eq!(total_live_docs, 0, "all docs must be deleted"); + Ok(()) + } + + #[test] + fn test_delete_query_boundary_doc_ids() -> tantivy::Result<()> { + // Deleting the very first (0) and very last (3) doc IDs exercises the boundary + // positions of SortedDocIdScorer. 
+ let (index, body) = make_index(&["a", "b", "c", "d"])?; + let segment_id = { + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + searcher.segment_readers()[0].segment_id() + }; + + let query = SoftDeletedDocIdsQuery::new(HashMap::from([(segment_id, vec![0u32, 3u32])])); + let reader_after = apply_delete_query(&index, query)?; + + let surviving = live_bodies(&reader_after, body)?; + assert_eq!(surviving, vec!["b", "c"]); + Ok(()) + } + + #[test] + fn test_delete_query_single_doc() -> tantivy::Result<()> { + let (index, body) = make_index(&["keep", "remove", "keep-too"])?; + let segment_id = { + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + searcher.segment_readers()[0].segment_id() + }; + + let query = SoftDeletedDocIdsQuery::new(HashMap::from([(segment_id, vec![1u32])])); + let reader_after = apply_delete_query(&index, query)?; + + let surviving = live_bodies(&reader_after, body)?; + assert_eq!(surviving, vec!["keep", "keep-too"]); + Ok(()) + } + + #[test] + fn test_delete_query_unknown_segment_id_has_no_effect() -> tantivy::Result<()> { + let (index, _) = make_index(&["a", "b", "c"])?; + + // Obtain a segment ID that definitely does not belong to `index` by + // creating an independent second index. + let (other_index, _) = make_index(&["z"])?; + let foreign_id: SegmentId = { + let other_reader: tantivy::IndexReader = other_index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let other_searcher = other_reader.searcher(); + other_searcher.segment_readers()[0].segment_id() + }; + + // Targeting all three doc IDs under the foreign segment must not delete anything. 
+ let query = + SoftDeletedDocIdsQuery::new(HashMap::from([(foreign_id, vec![0u32, 1u32, 2u32])])); + let reader_after = apply_delete_query(&index, query)?; + let searcher_after = reader_after.searcher(); + + assert_eq!( + searcher_after.segment_readers()[0].num_docs(), + 3, + "unknown segment ID must leave all docs intact" + ); + Ok(()) + } + + #[test] + fn test_delete_query_out_of_range_doc_ids_are_ignored() -> tantivy::Result<()> { + // The index has 2 docs (max_doc = 2, valid IDs are 0 and 1). + // Providing only out-of-range IDs must not delete anything. + let (index, _) = make_index(&["a", "b"])?; + let segment_id = { + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + searcher.segment_readers()[0].segment_id() + }; + + let query = + SoftDeletedDocIdsQuery::new(HashMap::from([(segment_id, vec![10u32, 20u32, 100u32])])); + let reader_after = apply_delete_query(&index, query)?; + let searcher_after = reader_after.searcher(); + + assert_eq!( + searcher_after.segment_readers()[0].num_docs(), + 2, + "out-of-range doc IDs must be silently ignored" + ); + Ok(()) + } + + #[test] + fn test_delete_query_empty_map_has_no_effect() -> tantivy::Result<()> { + let (index, _) = make_index(&["a", "b", "c"])?; + let query = SoftDeletedDocIdsQuery::new(HashMap::new()); + let reader_after = apply_delete_query(&index, query)?; + let searcher_after = reader_after.searcher(); + + assert_eq!( + searcher_after.segment_readers()[0].num_docs(), + 3, + "empty docs-per-segment map must delete nothing" + ); + Ok(()) + } +} diff --git a/quickwit/quickwit-indexing/src/source/ingest/mod.rs b/quickwit/quickwit-indexing/src/source/ingest/mod.rs index d9e21affb87..3509a60cb10 100644 --- a/quickwit/quickwit-indexing/src/source/ingest/mod.rs +++ b/quickwit/quickwit-indexing/src/source/ingest/mod.rs @@ -158,7 +158,7 @@ impl IngestSource { source_runtime: SourceRuntime, retry_params: RetryParams, ) -> anyhow::Result { 
- let self_node_id: NodeId = source_runtime.node_id().into(); + let self_node_id: NodeId = source_runtime.node_id().to_owned(); let client_id = ClientId::new( self_node_id.clone(), SourceUid { @@ -350,7 +350,7 @@ impl IngestSource { continue; }; let truncate_shards_request = TruncateShardsRequest { - ingester_id: ingester_id.clone().into(), + ingester_id: ingester_id.to_string(), subrequests: truncate_subrequests, }; let truncate_future = async move { @@ -410,9 +410,8 @@ impl IngestSource { .assigned_shards .keys() .filter(|&shard_id| !new_assigned_shard_ids.contains(shard_id)) - .cloned() .any(|removed_shard_id| { - let Some(assigned_shard) = self.assigned_shards.get(&removed_shard_id) else { + let Some(assigned_shard) = self.assigned_shards.get(removed_shard_id) else { return false; }; assigned_shard.status != IndexingStatus::Complete @@ -547,6 +546,14 @@ impl Source for IngestSource { let acquire_shards_response: AcquireShardsResponse = ctx .protect_future(self.metastore.acquire_shards(acquire_shards_request)) .await + .inspect_err(|error| { + error!( + %error, + index_uid=%self.client_id.source_uid.index_uid, + source_id=%self.client_id.source_uid.source_id, + "failed to acquire shards from the metastore" + ); + }) .context("failed to acquire shards")?; if acquire_shards_response.acquired_shards.len() != added_shard_ids.len() { @@ -571,8 +578,9 @@ impl Source for IngestSource { let index_uid = acquired_shard.index_uid().clone(); let shard_id = acquired_shard.shard_id().clone(); let mut current_position_inclusive = acquired_shard.publish_position_inclusive(); - let leader_id: NodeId = acquired_shard.leader_id.into(); - let follower_id_opt: Option = acquired_shard.follower_id.map(Into::into); + let leader_id: NodeId = NodeId::from_str(&acquired_shard.leader_id); + let follower_id_opt: Option = + acquired_shard.follower_id.map(|s| NodeId::from_str(&s)); let source_id: SourceId = acquired_shard.source_id; let partition_id = PartitionId::from(shard_id.as_str()); let 
from_position_exclusive = current_position_inclusive.clone(); @@ -696,7 +704,7 @@ mod tests { #[tokio::test] async fn test_ingest_source_assign_shards() { let pipeline_id = IndexingPipelineId { - node_id: NodeId::from("test-node"), + node_id: NodeId::from_str("test-node"), index_uid: IndexUid::for_test("test-index", 0), source_id: "test-source".to_string(), pipeline_uid: PipelineUid::default(), @@ -931,7 +939,7 @@ mod tests { }); let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); - ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); + ingester_pool.insert(NodeId::from_str("test-ingester-0"), ingester_0.clone()); let event_broker = EventBroker::default(); @@ -1018,7 +1026,7 @@ mod tests { let assigned_shard = source.assigned_shards.get(&ShardId::from(1)).unwrap(); let expected_assigned_shard = AssignedShard { - leader_id: "test-ingester-0".into(), + leader_id: NodeId::from_str("test-ingester-0"), follower_id_opt: None, partition_id: 1u64.into(), current_position_inclusive: Position::offset(11u64), @@ -1028,7 +1036,7 @@ mod tests { let assigned_shard = source.assigned_shards.get(&ShardId::from(2)).unwrap(); let expected_assigned_shard = AssignedShard { - leader_id: "test-ingester-0".into(), + leader_id: NodeId::from_str("test-ingester-0"), follower_id_opt: None, partition_id: 2u64.into(), current_position_inclusive: Position::offset(12u64), @@ -1047,7 +1055,7 @@ mod tests { // - emission of a suggest truncate // - no stream request is emitted let pipeline_id = IndexingPipelineId { - node_id: NodeId::from("test-node"), + node_id: NodeId::from_str("test-node"), index_uid: IndexUid::for_test("test-index", 0), source_id: "test-source".to_string(), pipeline_uid: PipelineUid::default(), @@ -1128,7 +1136,7 @@ mod tests { }); let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); - ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); + ingester_pool.insert(NodeId::from_str("test-ingester-0"), ingester_0.clone()); 
let event_broker = EventBroker::default(); let (shard_positions_update_tx, mut shard_positions_update_rx) = @@ -1192,7 +1200,7 @@ mod tests { // - emission of a suggest truncate // - the stream request emitted does not include the EOF shards let pipeline_id = IndexingPipelineId { - node_id: NodeId::from("test-node"), + node_id: NodeId::from_str("test-node"), index_uid: IndexUid::for_test("test-index", 0), source_id: "test-source".to_string(), pipeline_uid: PipelineUid::default(), @@ -1293,7 +1301,7 @@ mod tests { }); let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); - ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); + ingester_pool.insert(NodeId::from_str("test-ingester-0"), ingester_0.clone()); let event_broker = EventBroker::default(); let (shard_positions_update_tx, mut shard_positions_update_rx) = @@ -1360,7 +1368,7 @@ mod tests { #[tokio::test] async fn test_ingest_source_emit_batches() { let pipeline_id = IndexingPipelineId { - node_id: NodeId::from("test-node"), + node_id: NodeId::from_str("test-node"), index_uid: IndexUid::for_test("test-index", 0), source_id: "test-source".to_string(), pipeline_uid: PipelineUid::default(), @@ -1397,7 +1405,7 @@ mod tests { source.assigned_shards.insert( ShardId::from(1), AssignedShard { - leader_id: "test-ingester-0".into(), + leader_id: NodeId::from_str("test-ingester-0"), follower_id_opt: None, partition_id: 1u64.into(), current_position_inclusive: Position::offset(11u64), @@ -1407,7 +1415,7 @@ mod tests { source.assigned_shards.insert( ShardId::from(2), AssignedShard { - leader_id: "test-ingester-1".into(), + leader_id: NodeId::from_str("test-ingester-1"), follower_id_opt: None, partition_id: 2u64.into(), current_position_inclusive: Position::offset(22u64), @@ -1549,7 +1557,7 @@ mod tests { #[tokio::test] async fn test_ingest_source_emit_batches_shard_not_found() { let pipeline_id = IndexingPipelineId { - node_id: NodeId::from("test-node"), + node_id: NodeId::from_str("test-node"), 
index_uid: IndexUid::for_test("test-index", 0), source_id: "test-source".to_string(), pipeline_uid: PipelineUid::default(), @@ -1601,7 +1609,7 @@ mod tests { }); let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); - ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); + ingester_pool.insert(NodeId::from_str("test-ingester-0"), ingester_0.clone()); let event_broker = EventBroker::default(); let source_runtime = SourceRuntime { @@ -1659,7 +1667,7 @@ mod tests { #[tokio::test] async fn test_ingest_source_suggest_truncate() { let pipeline_id = IndexingPipelineId { - node_id: NodeId::from("test-node"), + node_id: NodeId::from_str("test-node"), index_uid: IndexUid::for_test("test-index", 0), source_id: "test-source".to_string(), pipeline_uid: PipelineUid::default(), @@ -1701,7 +1709,7 @@ mod tests { Ok(TruncateShardsResponse {}) }); let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); - ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); + ingester_pool.insert(NodeId::from_str("test-ingester-0"), ingester_0.clone()); let mut mock_ingester_1 = MockIngesterService::new(); mock_ingester_1 @@ -1728,7 +1736,7 @@ mod tests { Ok(TruncateShardsResponse {}) }); let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1); - ingester_pool.insert("test-ingester-1".into(), ingester_1.clone()); + ingester_pool.insert(NodeId::from_str("test-ingester-1"), ingester_1.clone()); let mut mock_ingester_3 = MockIngesterService::new(); mock_ingester_3 @@ -1748,7 +1756,7 @@ mod tests { Ok(TruncateShardsResponse {}) }); let ingester_3 = IngesterServiceClient::from_mock(mock_ingester_3); - ingester_pool.insert("test-ingester-3".into(), ingester_3.clone()); + ingester_pool.insert(NodeId::from_str("test-ingester-3"), ingester_3.clone()); let event_broker = EventBroker::default(); let (shard_positions_update_tx, mut shard_positions_update_rx) = @@ -1784,7 +1792,7 @@ mod tests { source.assigned_shards.insert( ShardId::from(1), 
AssignedShard { - leader_id: "test-ingester-0".into(), + leader_id: NodeId::from_str("test-ingester-0"), follower_id_opt: None, partition_id: 1u64.into(), current_position_inclusive: Position::offset(11u64), @@ -1794,8 +1802,8 @@ mod tests { source.assigned_shards.insert( ShardId::from(2), AssignedShard { - leader_id: "test-ingester-0".into(), - follower_id_opt: Some("test-ingester-1".into()), + leader_id: NodeId::from_str("test-ingester-0"), + follower_id_opt: Some(NodeId::from_str("test-ingester-1")), partition_id: 2u64.into(), current_position_inclusive: Position::offset(22u64), status: IndexingStatus::Active, @@ -1804,8 +1812,8 @@ mod tests { source.assigned_shards.insert( ShardId::from(3), AssignedShard { - leader_id: "test-ingester-1".into(), - follower_id_opt: Some("test-ingester-0".into()), + leader_id: NodeId::from_str("test-ingester-1"), + follower_id_opt: Some(NodeId::from_str("test-ingester-0")), partition_id: 3u64.into(), current_position_inclusive: Position::offset(33u64), status: IndexingStatus::Active, @@ -1814,8 +1822,8 @@ mod tests { source.assigned_shards.insert( ShardId::from(4), AssignedShard { - leader_id: "test-ingester-2".into(), - follower_id_opt: Some("test-ingester-3".into()), + leader_id: NodeId::from_str("test-ingester-2"), + follower_id_opt: Some(NodeId::from_str("test-ingester-3")), partition_id: 4u64.into(), current_position_inclusive: Position::offset(44u64), status: IndexingStatus::Active, @@ -1824,8 +1832,8 @@ mod tests { source.assigned_shards.insert( ShardId::from(5), AssignedShard { - leader_id: "test-ingester-2".into(), - follower_id_opt: Some("test-ingester-3".into()), + leader_id: NodeId::from_str("test-ingester-2"), + follower_id_opt: Some(NodeId::from_str("test-ingester-3")), partition_id: 5u64.into(), current_position_inclusive: Position::Beginning, status: IndexingStatus::Active, @@ -1867,7 +1875,7 @@ mod tests { // away. 
In that case, the ingester should just ignore the assigned shard, as // opposed to fail as the metastore does not let it `acquire` the shard. let pipeline_id = IndexingPipelineId { - node_id: NodeId::from("test-node"), + node_id: NodeId::from_str("test-node"), index_uid: IndexUid::for_test("test-index", 0), source_id: "test-source".to_string(), pipeline_uid: PipelineUid::default(), diff --git a/quickwit/quickwit-indexing/src/source/kafka_source.rs b/quickwit/quickwit-indexing/src/source/kafka_source.rs index 5f93d0a9344..f1aca45bb98 100644 --- a/quickwit/quickwit-indexing/src/source/kafka_source.rs +++ b/quickwit/quickwit-indexing/src/source/kafka_source.rs @@ -25,7 +25,7 @@ use quickwit_actors::{ActorExitStatus, Mailbox}; use quickwit_config::KafkaSourceParams; use quickwit_metastore::checkpoint::{PartitionId, SourceCheckpoint}; use quickwit_proto::metastore::SourceType; -use quickwit_proto::types::{IndexUid, Position}; +use quickwit_proto::types::{IndexUid, NodeIdRef, Position}; use rdkafka::config::{ClientConfig, RDKafkaLogLevel}; use rdkafka::consumer::{ BaseConsumer, CommitMode, Consumer, ConsumerContext, DefaultConsumerContext, Rebalance, @@ -240,6 +240,7 @@ impl KafkaSource { let (events_tx, events_rx) = mpsc::channel(100); let (truncate_tx, truncate_rx) = watch::channel(SourceCheckpoint::default()); let (client_config, consumer, group_id) = create_consumer( + source_runtime.node_id(), source_runtime.index_uid(), source_runtime.source_id(), source_params, @@ -654,6 +655,7 @@ pub(super) async fn check_connectivity(params: KafkaSourceParams) -> anyhow::Res /// Creates a new `KafkaSourceConsumer`. 
fn create_consumer( + node_id: &NodeIdRef, index_uid: &IndexUid, source_id: &str, params: KafkaSourceParams, @@ -676,6 +678,7 @@ fn create_consumer( params.enable_backfill_mode.to_string(), ) .set("group.id", &group_id) + .set("client.id", node_id.as_str()) .set_log_level(log_level) .create_with_context(RdKafkaContext { topic: params.topic, diff --git a/quickwit/quickwit-indexing/src/source/mod.rs b/quickwit/quickwit-indexing/src/source/mod.rs index ec74a6c30de..b8de1b22217 100644 --- a/quickwit/quickwit-indexing/src/source/mod.rs +++ b/quickwit/quickwit-indexing/src/source/mod.rs @@ -186,7 +186,19 @@ impl SourceRuntime { pub async fn fetch_checkpoint(&self) -> MetastoreResult { let index_uid = self.index_uid().clone(); let request = IndexMetadataRequest::for_index_uid(index_uid); - let response = self.metastore.clone().index_metadata(request).await?; + let response = self + .metastore + .clone() + .index_metadata(request) + .await + .inspect_err(|error| { + error!( + %error, + index_uid=%self.index_uid(), + source_id=%self.source_id(), + "failed to fetch index metadata from the metastore" + ); + })?; let index_metadata = response.deserialize_index_metadata()?; if let Some(checkpoint) = index_metadata @@ -609,7 +621,7 @@ mod tests { SourceRuntime { pipeline_id: IndexingPipelineId { - node_id: NodeId::from("test-node"), + node_id: NodeId::from_str("test-node"), index_uid: self.index_uid, source_id: self.source_config.source_id.clone(), pipeline_uid: PipelineUid::for_test(0u128), diff --git a/quickwit/quickwit-indexing/src/source/queue_sources/coordinator.rs b/quickwit/quickwit-indexing/src/source/queue_sources/coordinator.rs index fadb4282c37..7c0824c3b8a 100644 --- a/quickwit/quickwit-indexing/src/source/queue_sources/coordinator.rs +++ b/quickwit/quickwit-indexing/src/source/queue_sources/coordinator.rs @@ -338,7 +338,7 @@ mod tests { shared_state: QueueSharedState, ) -> QueueCoordinator { let pipeline_id = IndexingPipelineId { - node_id: 
NodeId::from_str("test-node").unwrap(), + node_id: NodeId::from_str("test-node"), index_uid: shared_state.source_uid.index_uid.clone(), source_id: shared_state.source_uid.source_id.clone(), pipeline_uid: PipelineUid::random(), diff --git a/quickwit/quickwit-indexing/src/source/queue_sources/shared_state.rs b/quickwit/quickwit-indexing/src/source/queue_sources/shared_state.rs index b839c968043..38aa6383885 100644 --- a/quickwit/quickwit-indexing/src/source/queue_sources/shared_state.rs +++ b/quickwit/quickwit-indexing/src/source/queue_sources/shared_state.rs @@ -135,7 +135,15 @@ impl QueueSharedState { .open_shards(OpenShardsRequest { subrequests: open_shard_subrequests, }) - .await?; + .await + .inspect_err(|error| { + error!( + %error, + index_uid=%self.source_uid.index_uid, + source_id=%self.source_uid.source_id, + "failed to open shards on the metastore" + ); + })?; let mut shards = Vec::new(); let mut re_acquired_shards = Vec::new(); @@ -172,7 +180,15 @@ impl QueueSharedState { shard_ids: re_acquired_shards, publish_token: publish_token.to_string(), }) - .await?; + .await + .inspect_err(|error| { + error!( + %error, + index_uid=%self.source_uid.index_uid, + source_id=%self.source_uid.source_id, + "failed to re-acquire shards on the metastore" + ); + })?; for shard in acquire_shard_resp.acquired_shards { let partition_id = PartitionId::from(shard.shard_id().as_str()); let position = shard.publish_position_inclusive.unwrap_or_default(); diff --git a/quickwit/quickwit-indexing/src/test_utils.rs b/quickwit/quickwit-indexing/src/test_utils.rs index 82198c820f4..69b54faf1a1 100644 --- a/quickwit/quickwit-indexing/src/test_utils.rs +++ b/quickwit/quickwit-indexing/src/test_utils.rs @@ -74,7 +74,7 @@ impl TestSandbox { indexing_settings_yaml: &str, search_fields: &[&str], ) -> anyhow::Result { - let node_id = NodeId::new(append_random_suffix("test-node")); + let node_id = NodeId::from_str(&append_random_suffix("test-node")); let transport = ChannelTransport::default(); 
let cluster = create_cluster_for_test(Vec::new(), &["indexer"], &transport, true) .await @@ -127,6 +127,7 @@ impl TestSandbox { IngesterPool::default(), storage_resolver.clone(), EventBroker::default(), + false, ) .await?; let (indexing_service, _indexing_service_handle) = diff --git a/quickwit/quickwit-ingest/src/codegen/ingest_service.rs b/quickwit/quickwit-ingest/src/codegen/ingest_service.rs index 435c9b6d2f9..5ae1f6888a2 100644 --- a/quickwit/quickwit-ingest/src/codegen/ingest_service.rs +++ b/quickwit/quickwit-ingest/src/codegen/ingest_service.rs @@ -459,9 +459,9 @@ type TailLayer = quickwit_common::tower::BoxLayer< >; #[derive(Debug, Default)] pub struct IngestServiceTowerLayerStack { - ingest_layers: Vec, - fetch_layers: Vec, - tail_layers: Vec, + pub ingest_layers: Vec, + pub fetch_layers: Vec, + pub tail_layers: Vec, } impl IngestServiceTowerLayerStack { pub fn stack_layer(mut self, layer: L) -> Self diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast.rs index ea0c28cb775..7d497f3432d 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/broadcast.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast.rs @@ -426,7 +426,7 @@ pub async fn setup_local_shards_update_listener( warn!("failed to parse shard infos `{}`", event.value); return; }; - let leader_id: NodeId = event.node.node_id.clone().into(); + let leader_id: NodeId = NodeId::from_arc_str(event.node.node_id.clone()); let local_shards_update = LocalShardsUpdate { leader_id, @@ -619,7 +619,7 @@ mod tests { let queue_id_02 = queue_id(&index_uid, "test-source", &ShardId::from(2)); let mut shard_02 = IngesterShard::new_replica( - NodeId::from("test-leader"), + NodeId::from_str("test-leader"), ShardState::Open, Position::Beginning, Position::Beginning, diff --git a/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs b/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs index 6e8d085e35d..588b3aa3276 100644 --- 
a/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs @@ -1274,25 +1274,28 @@ pub(super) mod tests { #[test] fn test_select_preferred_and_failover_ingesters() { - let self_node_id: NodeId = "test-ingester-0".into(); + let self_node_id: NodeId = NodeId::from_str("test-ingester-0"); - let (preferred, failover) = - select_preferred_and_failover_ingesters(&self_node_id, "test-ingester-0".into(), None); + let (preferred, failover) = select_preferred_and_failover_ingesters( + &self_node_id, + NodeId::from_str("test-ingester-0"), + None, + ); assert_eq!(preferred, "test-ingester-0"); assert!(failover.is_none()); let (preferred, failover) = select_preferred_and_failover_ingesters( &self_node_id, - "test-ingester-0".into(), - Some("test-ingester-1".into()), + NodeId::from_str("test-ingester-0"), + Some(NodeId::from_str("test-ingester-1")), ); assert_eq!(preferred, "test-ingester-0"); assert_eq!(failover.unwrap(), "test-ingester-1"); let (preferred, failover) = select_preferred_and_failover_ingesters( &self_node_id, - "test-ingester-1".into(), - Some("test-ingester-0".into()), + NodeId::from_str("test-ingester-1"), + Some(NodeId::from_str("test-ingester-0")), ); assert_eq!(preferred, "test-ingester-0"); assert_eq!(failover.unwrap(), "test-ingester-1"); @@ -1306,7 +1309,10 @@ pub(super) mod tests { let shard_id = ShardId::from(1); let mut from_position_exclusive = Position::offset(0u64); - let ingester_ids: Vec = vec!["test-ingester-0".into(), "test-ingester-1".into()]; + let ingester_ids: Vec = vec![ + NodeId::from_str("test-ingester-0"), + NodeId::from_str("test-ingester-1"), + ]; let ingester_pool = IngesterPool::default(); let (fetch_message_tx, mut fetch_stream) = ServiceStream::new_bounded(5); @@ -1327,7 +1333,7 @@ pub(super) mod tests { }); let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1); - ingester_pool.insert("test-ingester-1".into(), ingester_1); + 
ingester_pool.insert(NodeId::from_str("test-ingester-1"), ingester_1); let fetch_payload = FetchPayload { index_uid: Some(index_uid.clone()), @@ -1404,7 +1410,10 @@ pub(super) mod tests { let shard_id = ShardId::from(1); let mut from_position_exclusive = Position::offset(0u64); - let ingester_ids: Vec = vec!["test-ingester-0".into(), "test-ingester-1".into()]; + let ingester_ids: Vec = vec![ + NodeId::from_str("test-ingester-0"), + NodeId::from_str("test-ingester-1"), + ]; let ingester_pool = IngesterPool::default(); let (fetch_message_tx, mut fetch_stream) = ServiceStream::new_bounded(5); @@ -1442,8 +1451,8 @@ pub(super) mod tests { }); let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1); - ingester_pool.insert("test-ingester-0".into(), ingester_0); - ingester_pool.insert("test-ingester-1".into(), ingester_1); + ingester_pool.insert(NodeId::from_str("test-ingester-0"), ingester_0); + ingester_pool.insert(NodeId::from_str("test-ingester-1"), ingester_1); let fetch_payload = FetchPayload { index_uid: Some(index_uid.clone()), @@ -1520,7 +1529,10 @@ pub(super) mod tests { let shard_id = ShardId::from(1); let mut from_position_exclusive = Position::offset(0u64); - let ingester_ids: Vec = vec!["test-ingester-0".into(), "test-ingester-1".into()]; + let ingester_ids: Vec = vec![ + NodeId::from_str("test-ingester-0"), + NodeId::from_str("test-ingester-1"), + ]; let ingester_pool = IngesterPool::default(); let (fetch_message_tx, mut fetch_stream) = ServiceStream::new_bounded(5); @@ -1557,8 +1569,8 @@ pub(super) mod tests { }); let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1); - ingester_pool.insert("test-ingester-0".into(), ingester_0); - ingester_pool.insert("test-ingester-1".into(), ingester_1); + ingester_pool.insert(NodeId::from_str("test-ingester-0"), ingester_0); + ingester_pool.insert(NodeId::from_str("test-ingester-1"), ingester_1); let fetch_payload = FetchPayload { index_uid: Some(index_uid.clone()), @@ -1638,7 +1650,10 @@ 
pub(super) mod tests { let shard_id = ShardId::from(1); let mut from_position_exclusive = Position::offset(0u64); - let ingester_ids: Vec = vec!["test-ingester-0".into(), "test-ingester-1".into()]; + let ingester_ids: Vec = vec![ + NodeId::from_str("test-ingester-0"), + NodeId::from_str("test-ingester-1"), + ]; let ingester_pool = IngesterPool::default(); let (fetch_message_tx, mut fetch_stream) = ServiceStream::new_bounded(5); @@ -1659,7 +1674,7 @@ pub(super) mod tests { }) }); let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); - ingester_pool.insert("test-ingester-0".into(), ingester_0); + ingester_pool.insert(NodeId::from_str("test-ingester-0"), ingester_0); fault_tolerant_fetch_stream( client_id, @@ -1694,7 +1709,7 @@ pub(super) mod tests { let shard_id = ShardId::from(1); let from_position_exclusive = Position::offset(0u64); - let ingester_ids: Vec = vec!["test-ingester".into()]; + let ingester_ids: Vec = vec![NodeId::from_str("test-ingester")]; let ingester_pool = IngesterPool::default(); let (fetch_message_tx, mut fetch_stream) = ServiceStream::new_bounded(5); @@ -1748,7 +1763,7 @@ pub(super) mod tests { }); let ingester = IngesterServiceClient::from_mock(mock_ingester); - ingester_pool.insert("test-ingester".into(), ingester); + ingester_pool.insert(NodeId::from_str("test-ingester"), ingester); let fetch_payload = FetchPayload { index_uid: Some(index_uid.clone()), @@ -1863,7 +1878,7 @@ pub(super) mod tests { #[tokio::test] async fn test_multi_fetch_stream() { - let self_node_id: NodeId = "test-node".into(); + let self_node_id: NodeId = NodeId::from_str("test-node"); let client_id = "test-client".to_string(); let ingester_pool = IngesterPool::default(); let retry_params = RetryParams::for_test(); diff --git a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs index 7a84b487758..bdb5558186c 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs +++ 
b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs @@ -141,7 +141,7 @@ impl Ingester { replication_factor: usize, idle_shard_timeout: Duration, ) -> IngestV2Result { - let self_node_id: NodeId = cluster.self_node_id().into(); + let self_node_id: NodeId = cluster.self_node_id(); let state = IngesterState::load(wal_dir_path, rate_limiter_settings); let weak_state = state.weak(); @@ -224,8 +224,8 @@ impl Ingester { .insert(queue_id, (rate_limiter, rate_meter)); let primary_shard = if let Some(follower_id) = &shard.follower_id { - let leader_id: NodeId = shard.leader_id.clone().into(); - let follower_id: NodeId = follower_id.clone().into(); + let leader_id: NodeId = NodeId::from_str(&shard.leader_id); + let follower_id: NodeId = NodeId::from_str(follower_id); let replication_client = self .init_replication_stream( @@ -385,8 +385,8 @@ impl Ingester { Entry::Vacant(entry) => entry, }; let open_request = OpenReplicationStreamRequest { - leader_id: leader_id.clone().into(), - follower_id: follower_id.clone().into(), + leader_id: leader_id.clone().to_string(), + follower_id: follower_id.clone().to_string(), replication_seqno: 0, }; let open_message = SynReplicationMessage::new_open_request(open_request); @@ -457,7 +457,7 @@ impl Ingester { let commit_type = persist_request.commit_type(); let force_commit = commit_type == CommitTypeV2::Force; - let leader_id: NodeId = persist_request.leader_id.into(); + let leader_id: NodeId = NodeId::from_str(&persist_request.leader_id); let mut state_guard = with_lock_metrics!(self.state.lock_fully().await, "persist", "write")?; @@ -476,7 +476,7 @@ impl Ingester { persist_failures.push(persist_failure); } let persist_response = PersistResponse { - leader_id: leader_id.into(), + leader_id: leader_id.to_string(), successes: Vec::new(), failures: persist_failures, }; @@ -848,8 +848,8 @@ impl Ingester { if open_replication_stream_request.follower_id != self.self_node_id { return Err(IngestV2Error::Internal("routing error".to_string())); } - 
let leader_id: NodeId = open_replication_stream_request.leader_id.into(); - let follower_id: NodeId = open_replication_stream_request.follower_id.into(); + let leader_id: NodeId = NodeId::from_str(&open_replication_stream_request.leader_id); + let follower_id: NodeId = NodeId::from_str(&open_replication_stream_request.follower_id); let mut state_guard = self.state.lock_partially().await?; @@ -923,7 +923,7 @@ impl Ingester { let self_node_id = self.self_node_id.clone(); let observation_stream = status_stream.map(move |status| { let observation_message = ObservationMessage { - node_id: self_node_id.clone().into(), + node_id: self_node_id.clone().to_string(), status: status as i32, }; Ok(observation_message) @@ -1345,7 +1345,7 @@ mod tests { let control_plane = ControlPlaneServiceClient::from_mock(mock_control_plane); Self { - node_id: "test-ingester".into(), + node_id: NodeId::from_str("test-ingester"), control_plane, ingester_pool: IngesterPool::default(), disk_capacity: ByteSize::mb(256), @@ -1359,7 +1359,7 @@ mod tests { impl IngesterForTest { pub fn with_node_id(mut self, node_id: &str) -> Self { - self.node_id = node_id.into(); + self.node_id = NodeId::from_str(node_id); self } diff --git a/quickwit/quickwit-ingest/src/ingest_v2/models.rs b/quickwit/quickwit-ingest/src/ingest_v2/models.rs index 750570d55b9..82aef96d755 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/models.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/models.rs @@ -250,7 +250,7 @@ mod tests { let doc_mapper = build_doc_mapper(&doc_mapping, &search_settings).unwrap(); let primary_shard = IngesterShard::new_primary( - "test-follower".into(), + NodeId::from_str("test-follower"), ShardState::Closed, Position::offset(42u64), Position::Beginning, @@ -278,7 +278,7 @@ mod tests { #[test] fn test_new_replica_shard() { let replica_shard = IngesterShard::new_replica( - "test-leader".into(), + NodeId::from_str("test-leader"), ShardState::Closed, Position::offset(42u64), Position::Beginning, diff --git 
a/quickwit/quickwit-ingest/src/ingest_v2/replication.rs b/quickwit/quickwit-ingest/src/ingest_v2/replication.rs index 5e286ec5b84..330e8477d07 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/replication.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/replication.rs @@ -344,8 +344,8 @@ impl ReplicationClient { commit_type: CommitTypeV2, ) -> impl Future> + Send + 'static { let replicate_request = ReplicateRequest { - leader_id: leader_id.into(), - follower_id: follower_id.into(), + leader_id: leader_id.to_string(), + follower_id: follower_id.to_string(), subrequests, commit_type: commit_type as i32, replication_seqno: 0, // replication number are generated further down @@ -466,7 +466,7 @@ impl ReplicationTask { } }; let replica_shard = IngesterShard::new_replica( - replica_shard.leader_id.into(), + NodeId::from_str(&replica_shard.leader_id), ShardState::Open, Position::Beginning, Position::Beginning, @@ -707,7 +707,7 @@ impl ReplicationTask { report_wal_usage(wal_usage); - let follower_id = self.follower_id.clone().into(); + let follower_id = self.follower_id.to_string(); let replicate_response = ReplicateResponse { follower_id, @@ -824,8 +824,8 @@ mod tests { #[tokio::test] async fn test_replication_stream_task_init() { - let leader_id: NodeId = "test-leader".into(); - let follower_id: NodeId = "test-follower".into(); + let leader_id: NodeId = NodeId::from_str("test-leader"); + let follower_id: NodeId = NodeId::from_str("test-follower"); let (syn_replication_stream_tx, mut syn_replication_stream_rx) = mpsc::channel(5); let (ack_replication_stream_tx, ack_replication_stream) = ServiceStream::new_bounded(SYN_REPLICATION_STREAM_CAPACITY); @@ -871,8 +871,8 @@ mod tests { #[tokio::test] async fn test_replication_stream_task_replicate() { - let leader_id: NodeId = "test-leader".into(); - let follower_id: NodeId = "test-follower".into(); + let leader_id: NodeId = NodeId::from_str("test-leader"); + let follower_id: NodeId = NodeId::from_str("test-follower"); let 
(syn_replication_stream_tx, mut syn_replication_stream_rx) = mpsc::channel(5); let (ack_replication_stream_tx, ack_replication_stream) = ServiceStream::new_bounded(SYN_REPLICATION_STREAM_CAPACITY); @@ -997,8 +997,8 @@ mod tests { #[tokio::test] async fn test_replication_stream_replicate_errors() { - let leader_id: NodeId = "test-leader".into(); - let follower_id: NodeId = "test-follower".into(); + let leader_id: NodeId = NodeId::from_str("test-leader"); + let follower_id: NodeId = NodeId::from_str("test-follower"); let (syn_replication_stream_tx, _syn_replication_stream_rx) = mpsc::channel(5); let (_ack_replication_stream_tx, ack_replication_stream) = ServiceStream::new_bounded(SYN_REPLICATION_STREAM_CAPACITY); @@ -1035,8 +1035,8 @@ mod tests { #[tokio::test] async fn test_replication_task_happy_path() { - let leader_id: NodeId = "test-leader".into(); - let follower_id: NodeId = "test-follower".into(); + let leader_id: NodeId = NodeId::from_str("test-leader"); + let follower_id: NodeId = NodeId::from_str("test-follower"); let (_temp_dir, state) = IngesterState::for_test().await; let (syn_replication_stream_tx, syn_replication_stream) = ServiceStream::new_bounded(SYN_REPLICATION_STREAM_CAPACITY); @@ -1298,8 +1298,8 @@ mod tests { #[tokio::test] async fn test_replication_task_shard_closed() { - let leader_id: NodeId = "test-leader".into(); - let follower_id: NodeId = "test-follower".into(); + let leader_id: NodeId = NodeId::from_str("test-leader"); + let follower_id: NodeId = NodeId::from_str("test-follower"); let (_temp_dir, state) = IngesterState::for_test().await; let (syn_replication_stream_tx, syn_replication_stream) = ServiceStream::new_bounded(SYN_REPLICATION_STREAM_CAPACITY); @@ -1375,8 +1375,8 @@ mod tests { #[cfg(not(feature = "failpoints"))] #[tokio::test] async fn test_replication_task_deletes_dangling_shard() { - let leader_id: NodeId = "test-leader".into(); - let follower_id: NodeId = "test-follower".into(); + let leader_id: NodeId = 
NodeId::from_str("test-leader"); + let follower_id: NodeId = NodeId::from_str("test-follower"); let (_temp_dir, state) = IngesterState::for_test().await; let (syn_replication_stream_tx, syn_replication_stream) = ServiceStream::new_bounded(SYN_REPLICATION_STREAM_CAPACITY); @@ -1463,8 +1463,8 @@ mod tests { let scenario = fail::FailScenario::setup(); fail::cfg("ingester:append_records", "return").unwrap(); - let leader_id: NodeId = "test-leader".into(); - let follower_id: NodeId = "test-follower".into(); + let leader_id: NodeId = NodeId::from_str("test-leader"); + let follower_id: NodeId = NodeId::from_str("test-follower"); let (_temp_dir, state) = IngesterState::for_test().await; let (syn_replication_stream_tx, syn_replication_stream) = ServiceStream::new_bounded(SYN_REPLICATION_STREAM_CAPACITY); @@ -1552,8 +1552,8 @@ mod tests { #[tokio::test] async fn test_replication_task_resource_exhausted() { - let leader_id: NodeId = "test-leader".into(); - let follower_id: NodeId = "test-follower".into(); + let leader_id: NodeId = NodeId::from_str("test-leader"); + let follower_id: NodeId = NodeId::from_str("test-follower"); let (_temp_dir, state) = IngesterState::for_test().await; let (syn_replication_stream_tx, syn_replication_stream) = ServiceStream::new_bounded(SYN_REPLICATION_STREAM_CAPACITY); diff --git a/quickwit/quickwit-ingest/src/ingest_v2/router.rs b/quickwit/quickwit-ingest/src/ingest_v2/router.rs index 4cd6fb2b57b..ed1024cbc3a 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/router.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/router.rs @@ -423,7 +423,7 @@ impl IngestRouter { subrequest_ids, }; let persist_request = PersistRequest { - leader_id: leader_id.into(), + leader_id: leader_id.to_string(), subrequests, commit_type: commit_type as i32, }; @@ -724,7 +724,7 @@ mod tests { #[tokio::test] async fn test_router_make_get_or_create_open_shard_request() { - let self_node_id = "test-router".into(); + let self_node_id: NodeId = NodeId::from_str("test-router"); 
let control_plane: ControlPlaneServiceClient = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()); let ingester_pool = IngesterPool::default(); @@ -758,14 +758,14 @@ mod tests { source_id: "test-source".to_string(), shard_id: ShardId::from(1), shard_state: ShardState::Closed, - leader_id: "test-ingester-0".into(), + leader_id: NodeId::from_str("test-ingester-0"), }, RoutingEntry { index_uid: index_uid.clone(), source_id: "test-source".to_string(), shard_id: ShardId::from(2), shard_state: ShardState::Open, - leader_id: "test-ingester-0".into(), + leader_id: NodeId::from_str("test-ingester-0"), }, ], ..Default::default() @@ -839,7 +839,10 @@ mod tests { drop(rendezvous_1); drop(rendezvous_2); - ingester_pool.insert("test-ingester-0".into(), IngesterServiceClient::mocked()); + ingester_pool.insert( + NodeId::from_str("test-ingester-0"), + IngesterServiceClient::mocked(), + ); { // Ingester-0 has been marked as unavailable due to the previous requests. let (get_or_create_open_shard_request_opt, _rendezvous) = router @@ -883,7 +886,7 @@ mod tests { #[tokio::test] async fn test_router_populate_routing_table() { - let self_node_id = "test-router".into(); + let self_node_id: NodeId = NodeId::from_str("test-router"); let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0); let index_uid2: IndexUid = IndexUid::for_test("test-index-1", 0); @@ -1063,7 +1066,7 @@ mod tests { #[tokio::test] async fn test_router_batch_persist_records_no_shards_available_empty_routing_table() { - let self_node_id = "test-router".into(); + let self_node_id: NodeId = NodeId::from_str("test-router"); let mut mock_control_plane = MockControlPlaneService::new(); mock_control_plane .expect_get_or_create_open_shards() @@ -1107,7 +1110,7 @@ mod tests { #[tokio::test] async fn test_router_batch_persist_records_no_shards_available_unavailable_ingester() { - let self_node_id = "test-router".into(); + let self_node_id: NodeId = NodeId::from_str("test-router"); let mut 
mock_control_plane = MockControlPlaneService::new(); mock_control_plane .expect_get_or_create_open_shards() @@ -1129,7 +1132,7 @@ mod tests { source_id: "test-source".to_string(), shard_id: Some(ShardId::from(1)), shard_state: ShardState::Open as i32, - leader_id: "test-ingester".into(), + leader_id: "test-ingester".to_string(), ..Default::default() }], }], @@ -1166,7 +1169,7 @@ mod tests { #[tokio::test] async fn test_router_process_persist_results_record_persist_successes() { - let self_node_id = "test-router".into(); + let self_node_id: NodeId = NodeId::from_str("test-router"); let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()); let ingester_pool = IngesterPool::default(); let replication_factor = 1; @@ -1189,7 +1192,7 @@ mod tests { persist_futures.push(async move { let persist_summary = PersistRequestSummary { - leader_id: "test-ingester-0".into(), + leader_id: NodeId::from_str("test-ingester-0"), subrequest_ids: vec![0], }; let persist_result = Ok::<_, IngestV2Error>(PersistResponse { @@ -1218,7 +1221,7 @@ mod tests { #[tokio::test] async fn test_router_process_persist_results_record_persist_failures() { - let self_node_id = "test-router".into(); + let self_node_id: NodeId = NodeId::from_str("test-router"); let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()); let ingester_pool = IngesterPool::default(); let replication_factor = 1; @@ -1241,7 +1244,7 @@ mod tests { persist_futures.push(async move { let persist_summary = PersistRequestSummary { - leader_id: "test-ingester-0".into(), + leader_id: NodeId::from_str("test-ingester-0"), subrequest_ids: vec![0], }; let persist_result = Ok::<_, IngestV2Error>(PersistResponse { @@ -1270,7 +1273,7 @@ mod tests { #[tokio::test] async fn test_router_process_persist_results_closes_and_deletes_shards() { - let self_node_id = "test-router".into(); + let self_node_id: NodeId = NodeId::from_str("test-router"); let control_plane = 
ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()); let ingester_pool = IngesterPool::default(); let replication_factor = 1; @@ -1310,7 +1313,7 @@ mod tests { persist_futures.push(async { let persist_summary = PersistRequestSummary { - leader_id: "test-ingester-0".into(), + leader_id: NodeId::from_str("test-ingester-0"), subrequest_ids: vec![0], }; let persist_result = Ok::<_, IngestV2Error>(PersistResponse { @@ -1353,12 +1356,18 @@ mod tests { #[tokio::test] async fn test_router_process_persist_results_does_not_remove_unavailable_leaders() { - let self_node_id = "test-router".into(); + let self_node_id: NodeId = NodeId::from_str("test-router"); let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()); let ingester_pool = IngesterPool::default(); - ingester_pool.insert("test-ingester-0".into(), IngesterServiceClient::mocked()); - ingester_pool.insert("test-ingester-1".into(), IngesterServiceClient::mocked()); + ingester_pool.insert( + NodeId::from_str("test-ingester-0"), + IngesterServiceClient::mocked(), + ); + ingester_pool.insert( + NodeId::from_str("test-ingester-1"), + IngesterServiceClient::mocked(), + ); let replication_factor = 1; let router = IngestRouter::new( @@ -1387,7 +1396,7 @@ mod tests { persist_futures.push(async { let persist_summary = PersistRequestSummary { - leader_id: "test-ingester-0".into(), + leader_id: NodeId::from_str("test-ingester-0"), subrequest_ids: vec![0], }; let persist_result = @@ -1407,12 +1416,12 @@ mod tests { assert!( !workbench .unavailable_leaders - .contains(&NodeId::from("test-ingester-1")) + .contains(&NodeId::from_str("test-ingester-1")) ); let persist_futures = FuturesUnordered::new(); persist_futures.push(async { let persist_summary = PersistRequestSummary { - leader_id: "test-ingester-1".into(), + leader_id: NodeId::from_str("test-ingester-1"), subrequest_ids: vec![1], }; let persist_result = @@ -1429,7 +1438,7 @@ mod tests { assert!( workbench .unavailable_leaders - 
.contains(&NodeId::from("test-ingester-1")) + .contains(&NodeId::from_str("test-ingester-1")) ); let subworkbench = workbench.subworkbenches.get(&1).unwrap(); @@ -1441,7 +1450,7 @@ mod tests { #[tokio::test] async fn test_router_ingest() { - let self_node_id = "test-router".into(); + let self_node_id: NodeId = NodeId::from_str("test-router"); let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()); let ingester_pool = IngesterPool::default(); let replication_factor = 1; @@ -1588,7 +1597,7 @@ mod tests { Ok(response) }); let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); - ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); + ingester_pool.insert(NodeId::from_str("test-ingester-0"), ingester_0.clone()); let mut mock_ingester_1 = MockIngesterService::new(); mock_ingester_1 @@ -1625,7 +1634,7 @@ mod tests { Ok(response) }); let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1); - ingester_pool.insert("test-ingester-1".into(), ingester_1); + ingester_pool.insert(NodeId::from_str("test-ingester-1"), ingester_1); let ingest_request = IngestRequestV2 { subrequests: vec![ @@ -1679,7 +1688,7 @@ mod tests { #[tokio::test] async fn test_router_ingest_retry() { - let self_node_id = "test-router".into(); + let self_node_id: NodeId = NodeId::from_str("test-router"); let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()); let ingester_pool = IngesterPool::default(); let replication_factor = 1; @@ -1773,7 +1782,7 @@ mod tests { Ok(response) }); let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); - ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); + ingester_pool.insert(NodeId::from_str("test-ingester-0"), ingester_0.clone()); let ingest_request = IngestRequestV2 { subrequests: vec![IngestSubrequest { @@ -1789,7 +1798,7 @@ mod tests { #[tokio::test] async fn test_router_updates_routing_table_on_chitchat_events() { - let self_node_id = 
"test-router".into(); + let self_node_id: NodeId = NodeId::from_str("test-router"); let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()); let ingester_pool = IngesterPool::default(); let replication_factor = 1; @@ -1819,7 +1828,7 @@ mod tests { drop(state_guard); let local_shards_update = LocalShardsUpdate { - leader_id: "test-ingester".into(), + leader_id: NodeId::from_str("test-ingester"), source_uid: SourceUid { index_uid: index_uid.clone(), source_id: "test-source".to_string(), @@ -1882,7 +1891,7 @@ mod tests { #[tokio::test] async fn test_router_debug_info() { - let self_node_id = "test-router".into(); + let self_node_id: NodeId = NodeId::from_str("test-router"); let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()); let ingester_pool = IngesterPool::default(); let replication_factor = 1; @@ -1932,7 +1941,7 @@ mod tests { #[tokio::test] async fn test_router_does_not_retry_rate_limited_shards() { // We avoid retrying a shard limited shard at the scale of a workbench. 
- let self_node_id = "test-router".into(); + let self_node_id: NodeId = NodeId::from_str("test-router"); let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()); let ingester_pool = IngesterPool::default(); let replication_factor = 1; @@ -2076,7 +2085,7 @@ mod tests { .in_sequence(&mut seq); let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); - ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); + ingester_pool.insert(NodeId::from_str("test-ingester-0"), ingester_0.clone()); let ingest_request = IngestRequestV2 { subrequests: vec![IngestSubrequest { @@ -2092,7 +2101,7 @@ mod tests { #[tokio::test] async fn test_router_returns_rate_limited_failure() { - let self_node_id = "test-router".into(); + let self_node_id: NodeId = NodeId::from_str("test-router"); let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()); let ingester_pool = IngesterPool::default(); let replication_factor = 1; @@ -2152,7 +2161,7 @@ mod tests { Ok(response) }); let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); - ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); + ingester_pool.insert(NodeId::from_str("test-ingester-0"), ingester_0.clone()); let ingest_request = IngestRequestV2 { subrequests: vec![IngestSubrequest { diff --git a/quickwit/quickwit-ingest/src/ingest_v2/routing_table.rs b/quickwit/quickwit-ingest/src/ingest_v2/routing_table.rs index 987d754ed69..c43e7ba0294 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/routing_table.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/routing_table.rs @@ -41,7 +41,7 @@ impl From for RoutingEntry { source_id: shard.source_id, shard_id, shard_state, - leader_id: shard.leader_id.into(), + leader_id: NodeId::from_str(&shard.leader_id), } } } @@ -503,7 +503,7 @@ mod tests { #[test] fn test_routing_table_entry_new() { - let self_node_id: NodeId = "test-node-0".into(); + let self_node_id: NodeId = 
NodeId::from_str("test-node-0"); let index_uid = IndexUid::for_test("test-index", 0); let source_id: SourceId = "test-source".into(); let table_entry = RoutingTableEntry::new( @@ -584,8 +584,14 @@ mod tests { assert!(closed_shard_ids.is_empty()); assert!(unavailable_leaders.is_empty()); - ingester_pool.insert("test-ingester-0".into(), IngesterServiceClient::mocked()); - ingester_pool.insert("test-ingester-1".into(), IngesterServiceClient::mocked()); + ingester_pool.insert( + NodeId::from_str("test-ingester-0"), + IngesterServiceClient::mocked(), + ); + ingester_pool.insert( + NodeId::from_str("test-ingester-1"), + IngesterServiceClient::mocked(), + ); let table_entry = RoutingTableEntry { index_uid: index_uid.clone(), @@ -596,14 +602,14 @@ mod tests { source_id: "test-source".to_string(), shard_id: ShardId::from(1), shard_state: ShardState::Closed, - leader_id: "test-ingester-0".into(), + leader_id: NodeId::from_str("test-ingester-0"), }, RoutingEntry { index_uid: index_uid.clone(), source_id: "test-source".to_string(), shard_id: ShardId::from(2), shard_state: ShardState::Open, - leader_id: "test-ingester-0".into(), + leader_id: NodeId::from_str("test-ingester-0"), }, ], local_round_robin_idx: AtomicUsize::default(), @@ -632,21 +638,21 @@ mod tests { source_id: "test-source".to_string(), shard_id: ShardId::from(1), shard_state: ShardState::Closed, - leader_id: "test-ingester-1".into(), + leader_id: NodeId::from_str("test-ingester-1"), }, RoutingEntry { index_uid: index_uid.clone(), source_id: "test-source".to_string(), shard_id: ShardId::from(2), shard_state: ShardState::Open, - leader_id: "test-ingester-2".into(), + leader_id: NodeId::from_str("test-ingester-2"), }, RoutingEntry { index_uid: index_uid.clone(), source_id: "test-source".to_string(), shard_id: ShardId::from(3), shard_state: ShardState::Open, - leader_id: "test-ingester-1".into(), + leader_id: NodeId::from_str("test-ingester-1"), }, ], remote_round_robin_idx: AtomicUsize::default(), @@ -675,8 +681,14 
@@ mod tests { .unwrap_err(); assert_eq!(error, NextOpenShardError::NoShardsAvailable); - ingester_pool.insert("test-ingester-0".into(), IngesterServiceClient::mocked()); - ingester_pool.insert("test-ingester-1".into(), IngesterServiceClient::mocked()); + ingester_pool.insert( + NodeId::from_str("test-ingester-0"), + IngesterServiceClient::mocked(), + ); + ingester_pool.insert( + NodeId::from_str("test-ingester-1"), + IngesterServiceClient::mocked(), + ); let table_entry = RoutingTableEntry { index_uid: index_uid.clone(), @@ -687,21 +699,21 @@ mod tests { source_id: "test-source".to_string(), shard_id: ShardId::from(1), shard_state: ShardState::Closed, - leader_id: "test-ingester-0".into(), + leader_id: NodeId::from_str("test-ingester-0"), }, RoutingEntry { index_uid: index_uid.clone(), source_id: "test-source".to_string(), shard_id: ShardId::from(2), shard_state: ShardState::Open, - leader_id: "test-ingester-0".into(), + leader_id: NodeId::from_str("test-ingester-0"), }, RoutingEntry { index_uid: index_uid.clone(), source_id: "test-source".to_string(), shard_id: ShardId::from(3), shard_state: ShardState::Open, - leader_id: "test-ingester-0".into(), + leader_id: NodeId::from_str("test-ingester-0"), }, ], local_round_robin_idx: AtomicUsize::default(), @@ -731,7 +743,7 @@ mod tests { source_id: "test-source".to_string(), shard_id: ShardId::from(1), shard_state: ShardState::Closed, - leader_id: "test-ingester-0".into(), + leader_id: NodeId::from_str("test-ingester-0"), }], local_round_robin_idx: AtomicUsize::default(), remote_shards: vec![ @@ -740,28 +752,28 @@ mod tests { source_id: "test-source".to_string(), shard_id: ShardId::from(2), shard_state: ShardState::Open, - leader_id: "test-ingester-1".into(), + leader_id: NodeId::from_str("test-ingester-1"), }, RoutingEntry { index_uid: index_uid.clone(), source_id: "test-source".to_string(), shard_id: ShardId::from(3), shard_state: ShardState::Closed, - leader_id: "test-ingester-1".into(), + leader_id: 
NodeId::from_str("test-ingester-1"), }, RoutingEntry { index_uid: index_uid.clone(), source_id: "test-source".to_string(), shard_id: ShardId::from(4), shard_state: ShardState::Open, - leader_id: "test-ingester-2".into(), + leader_id: NodeId::from_str("test-ingester-2"), }, RoutingEntry { index_uid: index_uid.clone(), source_id: "test-source".to_string(), shard_id: ShardId::from(5), shard_state: ShardState::Open, - leader_id: "test-ingester-1".into(), + leader_id: NodeId::from_str("test-ingester-1"), }, ], remote_round_robin_idx: AtomicUsize::default(), @@ -795,7 +807,10 @@ mod tests { let source_id: SourceId = "test-source".into(); let ingester_pool = IngesterPool::default(); - ingester_pool.insert("test-ingester-0".into(), IngesterServiceClient::mocked()); + ingester_pool.insert( + NodeId::from_str("test-ingester-0"), + IngesterServiceClient::mocked(), + ); let rate_limited_shards = HashSet::from_iter([ShardId::from(1)]); @@ -807,7 +822,7 @@ mod tests { source_id: "test-source".to_string(), shard_id: ShardId::from(1), shard_state: ShardState::Open, - leader_id: "test-ingester-0".into(), + leader_id: NodeId::from_str("test-ingester-0"), }], local_round_robin_idx: AtomicUsize::default(), remote_shards: Vec::new(), @@ -825,8 +840,8 @@ mod tests { let source_id: SourceId = "test-source".into(); let mut table_entry = RoutingTableEntry::empty(index_uid_0.clone(), source_id.clone()); - let local_node_id: NodeId = "test-ingester-0".into(); - let remote_node_id: NodeId = "test-ingester-1".into(); + let local_node_id: NodeId = NodeId::from_str("test-ingester-0"); + let remote_node_id: NodeId = NodeId::from_str("test-ingester-1"); table_entry.insert_open_shards(&local_node_id, &local_node_id, &index_uid_0, &[]); assert_eq!(table_entry.local_shards.len(), 0); @@ -948,21 +963,21 @@ mod tests { source_id: "test-source".to_string(), shard_id: ShardId::from(1), shard_state: ShardState::Open, - leader_id: "test-ingester-0".into(), + leader_id: NodeId::from_str("test-ingester-0"), 
}, RoutingEntry { index_uid: index_uid.clone(), source_id: "test-source".to_string(), shard_id: ShardId::from(2), shard_state: ShardState::Open, - leader_id: "test-ingester-0".into(), + leader_id: NodeId::from_str("test-ingester-0"), }, RoutingEntry { index_uid: index_uid.clone(), source_id: "test-source".to_string(), shard_id: ShardId::from(3), shard_state: ShardState::Open, - leader_id: "test-ingester-0".into(), + leader_id: NodeId::from_str("test-ingester-0"), }, ], local_round_robin_idx: AtomicUsize::default(), @@ -972,21 +987,21 @@ mod tests { source_id: "test-source".to_string(), shard_id: ShardId::from(5), shard_state: ShardState::Open, - leader_id: "test-ingester-1".into(), + leader_id: NodeId::from_str("test-ingester-1"), }, RoutingEntry { index_uid: index_uid.clone(), source_id: "test-source".to_string(), shard_id: ShardId::from(6), shard_state: ShardState::Open, - leader_id: "test-ingester-1".into(), + leader_id: NodeId::from_str("test-ingester-1"), }, RoutingEntry { index_uid: index_uid.clone(), source_id: "test-source".to_string(), shard_id: ShardId::from(7), shard_state: ShardState::Open, - leader_id: "test-ingester-1".into(), + leader_id: NodeId::from_str("test-ingester-1"), }, ], remote_round_robin_idx: AtomicUsize::default(), @@ -1029,21 +1044,21 @@ mod tests { source_id: "test-source".to_string(), shard_id: ShardId::from(1), shard_state: ShardState::Open, - leader_id: "test-ingester-0".into(), + leader_id: NodeId::from_str("test-ingester-0"), }, RoutingEntry { index_uid: index_uid.clone(), source_id: "test-source".to_string(), shard_id: ShardId::from(2), shard_state: ShardState::Open, - leader_id: "test-ingester-0".into(), + leader_id: NodeId::from_str("test-ingester-0"), }, RoutingEntry { index_uid: index_uid.clone(), source_id: "test-source".to_string(), shard_id: ShardId::from(3), shard_state: ShardState::Open, - leader_id: "test-ingester-0".into(), + leader_id: NodeId::from_str("test-ingester-0"), }, ], local_round_robin_idx: 
AtomicUsize::default(), @@ -1053,21 +1068,21 @@ mod tests { source_id: "test-source".to_string(), shard_id: ShardId::from(5), shard_state: ShardState::Open, - leader_id: "test-ingester-1".into(), + leader_id: NodeId::from_str("test-ingester-1"), }, RoutingEntry { index_uid: index_uid.clone(), source_id: "test-source".to_string(), shard_id: ShardId::from(6), shard_state: ShardState::Open, - leader_id: "test-ingester-1".into(), + leader_id: NodeId::from_str("test-ingester-1"), }, RoutingEntry { index_uid: index_uid.clone(), source_id: "test-source".to_string(), shard_id: ShardId::from(7), shard_state: ShardState::Open, - leader_id: "test-ingester-1".into(), + leader_id: NodeId::from_str("test-ingester-1"), }, ], remote_round_robin_idx: AtomicUsize::default(), diff --git a/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs b/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs index de501d3f06d..9e895023fde 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs @@ -785,7 +785,7 @@ mod tests { let mut workbench = IngestWorkbench::new(ingest_subrequests, 1); let persist_error = IngestV2Error::Timeout("request timed out".to_string()); - let leader_id = NodeId::from("test-leader"); + let leader_id = NodeId::from_str("test-leader"); let persist_summary = PersistRequestSummary { leader_id: leader_id.clone(), subrequest_ids: vec![0], @@ -811,7 +811,7 @@ mod tests { let mut workbench = IngestWorkbench::new(ingest_subrequests, 1); let persist_error = IngestV2Error::Unavailable("connection error".to_string()); - let leader_id = NodeId::from("test-leader"); + let leader_id = NodeId::from_str("test-leader"); let persist_summary = PersistRequestSummary { leader_id: leader_id.clone(), subrequest_ids: vec![0], @@ -840,7 +840,7 @@ mod tests { let persist_error = IngestV2Error::Internal("IO error".to_string()); let persist_summary = PersistRequestSummary { - leader_id: NodeId::from("test-leader"), + leader_id: 
NodeId::from_str("test-leader"), subrequest_ids: vec![0], }; workbench.record_persist_error(persist_error, persist_summary); diff --git a/quickwit/quickwit-integration-tests/src/test_utils/cluster_sandbox.rs b/quickwit/quickwit-integration-tests/src/test_utils/cluster_sandbox.rs index 5277e0ac740..d1ca3bd2e42 100644 --- a/quickwit/quickwit-integration-tests/src/test_utils/cluster_sandbox.rs +++ b/quickwit/quickwit-integration-tests/src/test_utils/cluster_sandbox.rs @@ -126,7 +126,7 @@ impl ClusterSandboxBuilder { config.enabled_services.clone_from(&node_builder.services); config.jaeger_config.enable_endpoint = true; config.cluster_id.clone_from(&cluster_id); - config.node_id = NodeId::new(format!("test-node-{node_idx}")); + config.node_id = NodeId::from_str(&format!("test-node-{node_idx}")); config.data_dir_path = root_data_dir.join(config.node_id.as_str()); config.metastore_uri = QuickwitUri::from_str(&format!("ram:///{unique_dir_name}/metastore")).unwrap(); diff --git a/quickwit/quickwit-integration-tests/src/tests/update_tests/mod.rs b/quickwit/quickwit-integration-tests/src/tests/update_tests/mod.rs index ac9e24e517d..5e1b1ec08da 100644 --- a/quickwit/quickwit-integration-tests/src/tests/update_tests/mod.rs +++ b/quickwit/quickwit-integration-tests/src/tests/update_tests/mod.rs @@ -38,7 +38,6 @@ async fn assert_hits_unordered( .await; if let Ok(expected_hits) = expected_result { let resp = search_res.unwrap_or_else(|err| panic!("query: {query}, error: {err}")); - assert_eq!(resp.errors.len(), 0, "query: {query}"); assert_eq!(resp.num_hits, expected_hits.len() as u64, "query: {query}"); for expected_hit in expected_hits { assert!( @@ -49,8 +48,8 @@ async fn assert_hits_unordered( resp.hits ); } - } else if let Ok(search_response) = search_res { - assert!(!search_response.errors.is_empty(), "query: {query}"); + } else { + search_res.unwrap_err(); } } diff --git a/quickwit/quickwit-jaeger/src/lib.rs b/quickwit/quickwit-jaeger/src/lib.rs index 
1b6dfc27d0c..f0171367bab 100644 --- a/quickwit/quickwit-jaeger/src/lib.rs +++ b/quickwit/quickwit-jaeger/src/lib.rs @@ -2718,11 +2718,11 @@ mod tests { num_hits: 2, hits: vec![], elapsed_time_micros: 100, - errors: Vec::new(), aggregation_postcard: Some(aggregation_postcard), scroll_id: None, failed_splits: Vec::new(), num_successful_splits: 1, + splits_by_outcome: None, }) }); diff --git a/quickwit/quickwit-janitor/src/actors/delete_task_pipeline.rs b/quickwit/quickwit-janitor/src/actors/delete_task_pipeline.rs index 9506a587bd4..e7dbe410d59 100644 --- a/quickwit/quickwit-janitor/src/actors/delete_task_pipeline.rs +++ b/quickwit/quickwit-janitor/src/actors/delete_task_pipeline.rs @@ -190,7 +190,7 @@ impl DeleteTaskPipeline { let packager = Packager::new("MergePackager", tag_fields, uploader_mailbox); let (packager_mailbox, packager_supervisor_handler) = ctx.spawn_actor().supervise(packager); let pipeline_id = MergePipelineId { - node_id: NodeId::from("unknown"), + node_id: NodeId::from_str("unknown"), index_uid: self.index_uid.clone(), source_id: "unknown".to_string(), }; diff --git a/quickwit/quickwit-janitor/src/actors/delete_task_planner.rs b/quickwit/quickwit-janitor/src/actors/delete_task_planner.rs index 5e08b7773e6..e3bc3df5279 100644 --- a/quickwit/quickwit-janitor/src/actors/delete_task_planner.rs +++ b/quickwit/quickwit-janitor/src/actors/delete_task_planner.rs @@ -251,7 +251,7 @@ impl DeleteTaskPlanner { // TODO: validate the query at the beginning and return an appropriate error. 
let delete_query_ast = serde_json::from_str(&delete_query.query_ast) .expect("Failed to deserialize query_ast json"); - let tags_filter = extract_tags_from_query(delete_query_ast); + let tags_filter = extract_tags_from_query(delete_query_ast, None); split_time_range_filter(&stale_split.split_metadata, time_range.as_ref()) && split_tag_filter(&stale_split.split_metadata, tags_filter.as_ref()) }) diff --git a/quickwit/quickwit-metastore/src/metastore/control_plane_metastore.rs b/quickwit/quickwit-metastore/src/metastore/control_plane_metastore.rs index bcb07d79020..0c05f982e31 100644 --- a/quickwit/quickwit-metastore/src/metastore/control_plane_metastore.rs +++ b/quickwit/quickwit-metastore/src/metastore/control_plane_metastore.rs @@ -20,20 +20,21 @@ use quickwit_proto::control_plane::{ControlPlaneService, ControlPlaneServiceClie use quickwit_proto::metastore::{ AcquireShardsRequest, AcquireShardsResponse, AddSourceRequest, CreateIndexRequest, CreateIndexResponse, CreateIndexTemplateRequest, DeleteIndexRequest, - DeleteIndexTemplatesRequest, DeleteQuery, DeleteShardsRequest, DeleteShardsResponse, - DeleteSourceRequest, DeleteSplitsRequest, DeleteTask, EmptyResponse, + DeleteIndexTemplatesRequest, DeleteKvRequest, DeleteQuery, DeleteShardsRequest, + DeleteShardsResponse, DeleteSourceRequest, DeleteSplitsRequest, DeleteTask, EmptyResponse, FindIndexTemplateMatchesRequest, FindIndexTemplateMatchesResponse, GetClusterIdentityRequest, - GetClusterIdentityResponse, GetIndexTemplateRequest, GetIndexTemplateResponse, - IndexMetadataRequest, IndexMetadataResponse, IndexesMetadataRequest, IndexesMetadataResponse, - LastDeleteOpstampRequest, LastDeleteOpstampResponse, ListDeleteTasksRequest, - ListDeleteTasksResponse, ListIndexStatsRequest, ListIndexStatsResponse, + GetClusterIdentityResponse, GetIndexTemplateRequest, GetIndexTemplateResponse, GetKvRequest, + GetKvResponse, IndexMetadataRequest, IndexMetadataResponse, IndexesMetadataRequest, + IndexesMetadataResponse, 
LastDeleteOpstampRequest, LastDeleteOpstampResponse, + ListDeleteTasksRequest, ListDeleteTasksResponse, ListIndexStatsRequest, ListIndexStatsResponse, ListIndexTemplatesRequest, ListIndexTemplatesResponse, ListIndexesMetadataRequest, ListIndexesMetadataResponse, ListShardsRequest, ListShardsResponse, ListSplitsRequest, ListSplitsResponse, ListStaleSplitsRequest, MarkSplitsForDeletionRequest, MetastoreResult, MetastoreService, MetastoreServiceClient, MetastoreServiceStream, OpenShardsRequest, OpenShardsResponse, PruneShardsRequest, PublishSplitsRequest, ResetSourceCheckpointRequest, - StageSplitsRequest, ToggleSourceRequest, UpdateIndexRequest, UpdateSourceRequest, - UpdateSplitsDeleteOpstampRequest, UpdateSplitsDeleteOpstampResponse, + SetKvRequest, SoftDeleteDocumentsRequest, SoftDeleteDocumentsResponse, StageSplitsRequest, + ToggleSourceRequest, UpdateIndexRequest, UpdateSourceRequest, UpdateSplitsDeleteOpstampRequest, + UpdateSplitsDeleteOpstampResponse, }; /// A [`MetastoreService`] implementation that proxies some requests to the control plane so it can @@ -188,6 +189,13 @@ impl MetastoreService for ControlPlaneMetastore { self.metastore.delete_splits(request).await } + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, + ) -> MetastoreResult { + self.metastore.soft_delete_documents(request).await + } + async fn reset_source_checkpoint( &self, request: ResetSourceCheckpointRequest, @@ -289,4 +297,16 @@ impl MetastoreService for ControlPlaneMetastore { ) -> MetastoreResult { self.metastore.get_cluster_identity(request).await } + + async fn get_kv(&self, request: GetKvRequest) -> MetastoreResult { + self.metastore.get_kv(request).await + } + + async fn set_kv(&self, request: SetKvRequest) -> MetastoreResult { + self.metastore.set_kv(request).await + } + + async fn delete_kv(&self, request: DeleteKvRequest) -> MetastoreResult { + self.metastore.delete_kv(request).await + } } diff --git 
a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs index 4b53cbf648b..bd1677e89fd 100644 --- a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs @@ -32,7 +32,7 @@ use quickwit_proto::metastore::{ AcquireShardsRequest, AcquireShardsResponse, DeleteQuery, DeleteShardsRequest, DeleteShardsResponse, DeleteTask, EntityKind, IndexStats, ListShardsSubrequest, ListShardsSubresponse, MetastoreError, MetastoreResult, OpenShardSubrequest, - OpenShardSubresponse, PruneShardsRequest, SplitStats, + OpenShardSubresponse, PruneShardsRequest, SplitDocIds, SplitStats, }; use quickwit_proto::types::{IndexUid, PublishToken, SourceId, SplitId}; use serde::{Deserialize, Serialize}; @@ -43,7 +43,7 @@ use tracing::{info, warn}; use super::MutationOccurred; use crate::checkpoint::IndexCheckpointDelta; -use crate::metastore::{SortBy, use_shard_api}; +use crate::metastore::{MAX_SOFT_DELETED_DOCS_PER_SPLIT, SortBy, use_shard_api}; use crate::{IndexMetadata, ListSplitsQuery, Split, SplitMetadata, SplitState, split_tag_filter}; /// A `FileBackedIndex` object carries an index metadata and its split metadata. @@ -498,6 +498,63 @@ impl FileBackedIndex { Ok(()) } + /// Soft-deletes individual documents within published splits. + pub(crate) fn soft_delete_documents( + &mut self, + split_doc_ids: &[SplitDocIds], + ) -> MetastoreResult { + // First pass: validate all splits before making any changes to guarantee atomicity. 
+ for entry in split_doc_ids { + let split = self.splits.get(&entry.split_id).ok_or_else(|| { + MetastoreError::NotFound(EntityKind::Split { + split_id: entry.split_id.clone(), + }) + })?; + if split.split_state != SplitState::Published { + return Err(MetastoreError::FailedPrecondition { + entity: EntityKind::Split { + split_id: entry.split_id.clone(), + }, + message: format!("split `{}` is not in Published state", entry.split_id), + }); + } + let current_count = split.split_metadata.soft_deleted_doc_ids.len(); + let new_unique_count = entry + .doc_ids + .iter() + .filter(|&&id| !split.split_metadata.soft_deleted_doc_ids.contains(&id)) + .count(); + if current_count + new_unique_count > MAX_SOFT_DELETED_DOCS_PER_SPLIT { + return Err(MetastoreError::FailedPrecondition { + entity: EntityKind::Split { + split_id: entry.split_id.clone(), + }, + message: format!( + "split `{}` would exceed the maximum number of soft-deleted documents \ + ({MAX_SOFT_DELETED_DOCS_PER_SPLIT}): current={current_count}, would be={}", + entry.split_id, + current_count + new_unique_count, + ), + }); + } + } + + // Second pass: all splits are valid — apply changes. 
+ let mut num_soft_deleted = 0u64; + for entry in split_doc_ids { + let split = self + .splits + .get_mut(&entry.split_id) + .expect("split existence validated in first pass"); + for &doc_id in &entry.doc_ids { + if split.split_metadata.soft_deleted_doc_ids.insert(doc_id) { + num_soft_deleted += 1; + } + } + } + Ok(num_soft_deleted) + } + /// Gets IndexStats for this index pub(crate) fn get_stats(&self) -> MetastoreResult { let mut staged_stats = SplitStats::default(); @@ -724,6 +781,11 @@ impl Debug for Stamper { } fn split_query_predicate(split: &&Split, query: &ListSplitsQuery) -> bool { + if let Some(split_ids) = &query.split_ids + && !split_ids.contains(&split.split_metadata.split_id) + { + return false; + } if !split_tag_filter(&split.split_metadata, query.tags.as_ref()) { return false; } @@ -814,11 +876,14 @@ mod tests { use quickwit_doc_mapper::tag_pruning::TagFilterAst; use quickwit_proto::ingest::Shard; - use quickwit_proto::metastore::{ListShardsSubrequest, SplitStats}; + use quickwit_proto::metastore::{ + EntityKind, ListShardsSubrequest, MetastoreError, SplitDocIds, SplitStats, + }; use quickwit_proto::types::{IndexUid, SourceId}; use super::FileBackedIndex; use crate::file_backed::file_backed_index::split_query_predicate; + use crate::metastore::MAX_SOFT_DELETED_DOCS_PER_SPLIT; use crate::{IndexMetadata, ListSplitsQuery, Split, SplitMetadata, SplitState}; impl FileBackedIndex { @@ -949,6 +1014,15 @@ mod tests { assert!(split_query_predicate(&&split_1, &query)); assert!(split_query_predicate(&&split_2, &query)); assert!(!split_query_predicate(&&split_3, &query)); + + let query = ListSplitsQuery::for_index(IndexUid::new_with_random_ulid("test-index")) + .with_split_ids(vec![ + split_1.split_metadata.split_id.clone(), + split_2.split_metadata.split_id.clone(), + ]); + assert!(split_query_predicate(&&split_1, &query)); + assert!(split_query_predicate(&&split_2, &query)); + assert!(!split_query_predicate(&&split_3, &query)); } #[test] @@ -1019,4 +1093,151 
@@ mod tests { assert_eq!(stats.published, expected_published); assert_eq!(stats.marked_for_deletion, expected_marked_for_deletion); } + + /// Helper: creates a `FileBackedIndex` with a single published split. + fn make_index_with_published_split(split_id: &str) -> FileBackedIndex { + let index_metadata = + IndexMetadata::for_test("test-index", "file:///qwdata/indexes/test-index"); + let mut index = FileBackedIndex::new(index_metadata, Vec::new(), HashMap::new(), vec![]); + let split_metadata = SplitMetadata { + split_id: split_id.to_string(), + ..Default::default() + }; + index.stage_split(split_metadata).unwrap(); + index + .publish_splits([split_id], Vec::<&str>::new(), None, None) + .unwrap(); + index + } + + #[test] + fn test_soft_delete_documents_basic() { + let mut index = make_index_with_published_split("split-a"); + let split_doc_ids = vec![SplitDocIds { + split_id: "split-a".to_string(), + doc_ids: vec![1, 5, 42], + }]; + let num_deleted = index.soft_delete_documents(&split_doc_ids).unwrap(); + assert_eq!(num_deleted, 3); + + let split = index.splits.get("split-a").unwrap(); + assert_eq!( + split.split_metadata.soft_deleted_doc_ids, + BTreeSet::from([1, 5, 42]) + ); + } + + #[test] + fn test_soft_delete_documents_idempotent() { + let mut index = make_index_with_published_split("split-a"); + + // First call: delete doc IDs 1, 2, 3. + let split_doc_ids = vec![SplitDocIds { + split_id: "split-a".to_string(), + doc_ids: vec![1, 2, 3], + }]; + let num_deleted = index.soft_delete_documents(&split_doc_ids).unwrap(); + assert_eq!(num_deleted, 3); + + // Second call: same IDs plus one new one. + let split_doc_ids = vec![SplitDocIds { + split_id: "split-a".to_string(), + doc_ids: vec![1, 2, 3, 4], + }]; + let num_deleted = index.soft_delete_documents(&split_doc_ids).unwrap(); + // Only doc_id 4 is new. 
+ assert_eq!(num_deleted, 1); + + let split = index.splits.get("split-a").unwrap(); + assert_eq!( + split.split_metadata.soft_deleted_doc_ids, + BTreeSet::from([1, 2, 3, 4]) + ); + } + + #[test] + fn test_soft_delete_documents_non_published_split_fails() { + let index_metadata = + IndexMetadata::for_test("test-index", "file:///qwdata/indexes/test-index"); + let mut index = FileBackedIndex::new(index_metadata, Vec::new(), HashMap::new(), vec![]); + let split_metadata = SplitMetadata { + split_id: "staged-split".to_string(), + ..Default::default() + }; + index.stage_split(split_metadata).unwrap(); + // The split is still in Staged state — not Published. + + let split_doc_ids = vec![SplitDocIds { + split_id: "staged-split".to_string(), + doc_ids: vec![10], + }]; + let error = index.soft_delete_documents(&split_doc_ids).unwrap_err(); + assert!( + matches!( + error, + MetastoreError::FailedPrecondition { + entity: EntityKind::Split { .. }, + .. + } + ), + "expected FailedPrecondition error, got: {error:?}" + ); + } + + #[test] + fn test_soft_delete_documents_unknown_split_fails() { + let index_metadata = + IndexMetadata::for_test("test-index", "file:///qwdata/indexes/test-index"); + let mut index = FileBackedIndex::new(index_metadata, Vec::new(), HashMap::new(), vec![]); + + let split_doc_ids = vec![SplitDocIds { + split_id: "nonexistent-split".to_string(), + doc_ids: vec![1], + }]; + let error = index.soft_delete_documents(&split_doc_ids).unwrap_err(); + assert!( + matches!(error, MetastoreError::NotFound(EntityKind::Split { .. })), + "expected NotFound error, got: {error:?}" + ); + } + + #[test] + fn test_soft_delete_documents_limit_exceeded() { + let mut index = make_index_with_published_split("split-a"); + + // Pre-populate with MAX_SOFT_DELETED_DOCS_PER_SPLIT - 1 soft-deleted doc IDs. 
+ let initial_ids: Vec = (0..MAX_SOFT_DELETED_DOCS_PER_SPLIT as u32 - 1).collect(); + let initial_entries = vec![SplitDocIds { + split_id: "split-a".to_string(), + doc_ids: initial_ids, + }]; + index.soft_delete_documents(&initial_entries).unwrap(); + + // Adding 2 more unique IDs would push the total to MAX + 1 — must fail. + let overflow_entries = vec![SplitDocIds { + split_id: "split-a".to_string(), + doc_ids: vec![ + MAX_SOFT_DELETED_DOCS_PER_SPLIT as u32 - 1, + MAX_SOFT_DELETED_DOCS_PER_SPLIT as u32, + ], + }]; + let error = index.soft_delete_documents(&overflow_entries).unwrap_err(); + assert!( + matches!( + error, + MetastoreError::FailedPrecondition { + entity: EntityKind::Split { .. }, + .. + } + ), + "expected FailedPrecondition error when limit exceeded, got: {error:?}" + ); + + // The split must be unchanged — still at MAX - 1 entries. + let split = index.splits.get("split-a").unwrap(); + assert_eq!( + split.split_metadata.soft_deleted_doc_ids.len(), + MAX_SOFT_DELETED_DOCS_PER_SPLIT - 1 + ); + } } diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/manifest.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/manifest.rs index cec811bd3e4..4e9bf42c972 100644 --- a/quickwit/quickwit-metastore/src/metastore/file_backed/manifest.rs +++ b/quickwit/quickwit-metastore/src/metastore/file_backed/manifest.rs @@ -42,6 +42,7 @@ impl LegacyManifest { indexes: self.indexes, templates: HashMap::new(), identity: Uuid::nil(), + kv_store: HashMap::new(), } } } @@ -67,6 +68,7 @@ pub(crate) struct Manifest { // unnecessary here and we can pass the hash map as is to the `MetastoreState` pub templates: HashMap, pub identity: Uuid, + pub kv_store: HashMap, } #[derive(Clone, Debug, Serialize, Deserialize)] @@ -108,6 +110,8 @@ struct ManifestV0_8 { templates: Vec, #[serde(default, skip_serializing_if = "Uuid::is_nil")] identity: Uuid, + #[serde(default, skip_serializing_if = "HashMap::is_empty")] + kv_store: HashMap, } impl From for ManifestV0_8 { @@ 
-121,6 +125,7 @@ impl From for ManifestV0_8 { indexes: manifest.indexes, templates, identity: manifest.identity, + kv_store: manifest.kv_store, } } } @@ -137,6 +142,7 @@ impl From for Manifest { indexes, templates, identity: manifest.identity, + kv_store: manifest.kv_store, } } } @@ -158,12 +164,14 @@ impl quickwit_config::TestableForRegression for Manifest { indexes, templates, identity: Uuid::nil(), + kv_store: HashMap::new(), } } fn assert_equality(&self, other: &Self) { assert_eq!(self.indexes, other.indexes); assert_eq!(self.templates, other.templates); + assert_eq!(self.kv_store, other.kv_store); } } @@ -338,6 +346,7 @@ mod tests { indexes, templates, identity: Uuid::nil(), + kv_store: HashMap::new(), }; let manifest_json = serde_json::to_string_pretty(&manifest).unwrap(); let manifest_deserialized: Manifest = serde_json::from_str(&manifest_json).unwrap(); diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs index 2542f1db36f..51dcbc424a8 100644 --- a/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs @@ -42,12 +42,13 @@ use quickwit_config::IndexTemplate; use quickwit_proto::metastore::{ AcquireShardsRequest, AcquireShardsResponse, AddSourceRequest, CreateIndexRequest, CreateIndexResponse, CreateIndexTemplateRequest, DeleteIndexRequest, - DeleteIndexTemplatesRequest, DeleteQuery, DeleteShardsRequest, DeleteShardsResponse, - DeleteSourceRequest, DeleteSplitsRequest, DeleteTask, EmptyResponse, EntityKind, - FindIndexTemplateMatchesRequest, FindIndexTemplateMatchesResponse, GetClusterIdentityRequest, - GetClusterIdentityResponse, GetIndexTemplateRequest, GetIndexTemplateResponse, - IndexMetadataFailure, IndexMetadataFailureReason, IndexMetadataRequest, IndexMetadataResponse, - IndexTemplateMatch, IndexesMetadataRequest, IndexesMetadataResponse, LastDeleteOpstampRequest, + DeleteIndexTemplatesRequest, 
DeleteKvRequest, DeleteQuery, DeleteShardsRequest, + DeleteShardsResponse, DeleteSourceRequest, DeleteSplitsRequest, DeleteTask, EmptyResponse, + EntityKind, FindIndexTemplateMatchesRequest, FindIndexTemplateMatchesResponse, + GetClusterIdentityRequest, GetClusterIdentityResponse, GetIndexTemplateRequest, + GetIndexTemplateResponse, GetKvRequest, GetKvResponse, IndexMetadataFailure, + IndexMetadataFailureReason, IndexMetadataRequest, IndexMetadataResponse, IndexTemplateMatch, + IndexesMetadataRequest, IndexesMetadataResponse, LastDeleteOpstampRequest, LastDeleteOpstampResponse, ListDeleteTasksRequest, ListDeleteTasksResponse, ListIndexStatsRequest, ListIndexStatsResponse, ListIndexTemplatesRequest, ListIndexTemplatesResponse, ListIndexesMetadataRequest, ListIndexesMetadataResponse, @@ -55,8 +56,9 @@ use quickwit_proto::metastore::{ ListStaleSplitsRequest, MarkSplitsForDeletionRequest, MetastoreError, MetastoreResult, MetastoreService, MetastoreServiceStream, OpenShardSubrequest, OpenShardsRequest, OpenShardsResponse, PruneShardsRequest, PublishSplitsRequest, ResetSourceCheckpointRequest, - StageSplitsRequest, ToggleSourceRequest, UpdateIndexRequest, UpdateSourceRequest, - UpdateSplitsDeleteOpstampRequest, UpdateSplitsDeleteOpstampResponse, serde_utils, + SetKvRequest, SoftDeleteDocumentsRequest, SoftDeleteDocumentsResponse, StageSplitsRequest, + ToggleSourceRequest, UpdateIndexRequest, UpdateSourceRequest, UpdateSplitsDeleteOpstampRequest, + UpdateSplitsDeleteOpstampResponse, serde_utils, }; use quickwit_proto::types::{IndexId, IndexUid}; use quickwit_storage::Storage; @@ -729,6 +731,23 @@ impl MetastoreService for FileBackedMetastore { Ok(EmptyResponse {}) } + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, + ) -> MetastoreResult { + let index_uid = request.index_uid().clone(); + let num_soft_deleted_doc_ids = self + .mutate(&index_uid, |index| { + let num_soft_deleted_doc_ids = + 
index.soft_delete_documents(&request.split_doc_ids)?; + Ok(MutationOccurred::Yes(num_soft_deleted_doc_ids)) + }) + .await?; + Ok(SoftDeleteDocumentsResponse { + num_soft_deleted_doc_ids, + }) + } + async fn add_source(&self, request: AddSourceRequest) -> MetastoreResult { let source_config = request.deserialize_source_config()?; let index_uid = request.index_uid(); @@ -1275,6 +1294,43 @@ impl MetastoreService for FileBackedMetastore { uuid: state_wlock_guard.identity.hyphenated().to_string(), }) } + + // KV store API + + async fn get_kv(&self, request: GetKvRequest) -> MetastoreResult { + let state = self.state.read().await; + let value = state.kv_store.get(&request.key).cloned(); + Ok(GetKvResponse { value }) + } + + async fn set_kv(&self, request: SetKvRequest) -> MetastoreResult { + let mut state = self.state.write().await; + let previous_value = state.kv_store.insert(request.key.clone(), request.value); + let manifest = state.as_manifest(); + if let Err(error) = save_manifest(&*self.storage, &manifest).await { + // Rollback + match previous_value { + Some(value) => state.kv_store.insert(request.key, value), + None => state.kv_store.remove(&request.key), + }; + return Err(error); + } + Ok(EmptyResponse {}) + } + + async fn delete_kv(&self, request: DeleteKvRequest) -> MetastoreResult { + let mut state = self.state.write().await; + let previous_value = state.kv_store.remove(&request.key); + let manifest = state.as_manifest(); + if let Err(error) = save_manifest(&*self.storage, &manifest).await { + // Rollback + if let Some(value) = previous_value { + state.kv_store.insert(request.key, value); + } + return Err(error); + } + Ok(EmptyResponse {}) + } } impl MetastoreServiceExt for FileBackedMetastore {} diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/state.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/state.rs index 0d42408f430..04235e861cf 100644 --- a/quickwit/quickwit-metastore/src/metastore/file_backed/state.rs +++ 
b/quickwit/quickwit-metastore/src/metastore/file_backed/state.rs @@ -33,6 +33,7 @@ pub(super) struct MetastoreState { pub templates: HashMap, pub template_matcher: IndexTemplateMatcher, pub identity: Uuid, + pub kv_store: HashMap, } impl MetastoreState { @@ -67,6 +68,7 @@ impl MetastoreState { templates: manifest.templates, template_matcher, identity: manifest.identity, + kv_store: manifest.kv_store, }; Ok(state) } @@ -89,6 +91,7 @@ impl MetastoreState { indexes, templates, identity: self.identity, + kv_store: self.kv_store.clone(), } } } diff --git a/quickwit/quickwit-metastore/src/metastore/mod.rs b/quickwit/quickwit-metastore/src/metastore/mod.rs index 98f2f1d5039..187ad1676d9 100644 --- a/quickwit/quickwit-metastore/src/metastore/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/mod.rs @@ -49,6 +49,10 @@ use crate::{Split, SplitMetadata, SplitState}; /// Splits batch size returned by the stream splits API pub(crate) const STREAM_SPLITS_CHUNK_SIZE: usize = 100; +/// Maximum number of soft-deleted document IDs allowed per split. +/// Attempts to soft-delete documents that would push the total above this limit will fail. +pub(crate) const MAX_SOFT_DELETED_DOCS_PER_SPLIT: usize = 10_000; + /// An extended trait for [`MetastoreService`]. #[async_trait] pub trait MetastoreServiceExt: MetastoreService { @@ -640,6 +644,10 @@ pub struct ListSplitsQuery { /// A specific node ID to filter by. pub node_id: Option, + /// A non-empty list of split IDs to fetch, or + /// None to ignore this filter. + pub split_ids: Option>, + /// The maximum number of splits to retrieve. 
pub limit: Option, @@ -739,6 +747,7 @@ impl ListSplitsQuery { mature: Bound::Unbounded, sort_by: SortBy::None, after_split: None, + split_ids: None, } } @@ -765,6 +774,7 @@ impl ListSplitsQuery { mature: Bound::Unbounded, sort_by: SortBy::None, after_split: None, + split_ids: None, }) } @@ -787,6 +797,7 @@ impl ListSplitsQuery { mature: Bound::Unbounded, sort_by: SortBy::None, after_split: None, + split_ids: None, } } @@ -796,6 +807,12 @@ impl ListSplitsQuery { self } + /// Selects only splits with the specified IDs. + pub fn with_split_ids(mut self, split_ids: Vec) -> Self { + self.split_ids = Some(split_ids); + self + } + /// Sets the maximum number of splits to retrieve. pub fn with_limit(mut self, n: usize) -> Self { self.limit = Some(n); diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs index d4296fb7ee6..a2129b8623a 100644 --- a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs +++ b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::collections::HashMap; +use std::collections::{BTreeSet, HashMap}; use std::fmt::{self, Write}; use std::str::FromStr; use std::time::Duration; @@ -30,21 +30,23 @@ use quickwit_proto::ingest::{Shard, ShardState}; use quickwit_proto::metastore::{ AcquireShardsRequest, AcquireShardsResponse, AddSourceRequest, CreateIndexRequest, CreateIndexResponse, CreateIndexTemplateRequest, DeleteIndexRequest, - DeleteIndexTemplatesRequest, DeleteQuery, DeleteShardsRequest, DeleteShardsResponse, - DeleteSourceRequest, DeleteSplitsRequest, DeleteTask, EmptyResponse, EntityKind, - FindIndexTemplateMatchesRequest, FindIndexTemplateMatchesResponse, GetClusterIdentityRequest, - GetClusterIdentityResponse, GetIndexTemplateRequest, GetIndexTemplateResponse, - IndexMetadataFailure, IndexMetadataFailureReason, IndexMetadataRequest, IndexMetadataResponse, - IndexStats, IndexTemplateMatch, IndexesMetadataRequest, IndexesMetadataResponse, - LastDeleteOpstampRequest, LastDeleteOpstampResponse, ListDeleteTasksRequest, - ListDeleteTasksResponse, ListIndexStatsRequest, ListIndexStatsResponse, - ListIndexTemplatesRequest, ListIndexTemplatesResponse, ListIndexesMetadataRequest, - ListIndexesMetadataResponse, ListShardsRequest, ListShardsResponse, ListShardsSubresponse, - ListSplitsRequest, ListSplitsResponse, ListStaleSplitsRequest, MarkSplitsForDeletionRequest, - MetastoreError, MetastoreResult, MetastoreService, MetastoreServiceStream, OpenShardSubrequest, + DeleteIndexTemplatesRequest, DeleteKvRequest, DeleteQuery, DeleteShardsRequest, + DeleteShardsResponse, DeleteSourceRequest, DeleteSplitsRequest, DeleteTask, EmptyResponse, + EntityKind, FindIndexTemplateMatchesRequest, FindIndexTemplateMatchesResponse, + GetClusterIdentityRequest, GetClusterIdentityResponse, GetIndexTemplateRequest, + GetIndexTemplateResponse, GetKvRequest, GetKvResponse, IndexMetadataFailure, + IndexMetadataFailureReason, IndexMetadataRequest, IndexMetadataResponse, IndexStats, + IndexTemplateMatch, 
IndexesMetadataRequest, IndexesMetadataResponse, LastDeleteOpstampRequest, + LastDeleteOpstampResponse, ListDeleteTasksRequest, ListDeleteTasksResponse, + ListIndexStatsRequest, ListIndexStatsResponse, ListIndexTemplatesRequest, + ListIndexTemplatesResponse, ListIndexesMetadataRequest, ListIndexesMetadataResponse, + ListShardsRequest, ListShardsResponse, ListShardsSubresponse, ListSplitsRequest, + ListSplitsResponse, ListStaleSplitsRequest, MarkSplitsForDeletionRequest, MetastoreError, + MetastoreResult, MetastoreService, MetastoreServiceStream, OpenShardSubrequest, OpenShardSubresponse, OpenShardsRequest, OpenShardsResponse, PruneShardsRequest, - PublishSplitsRequest, ResetSourceCheckpointRequest, SplitStats, StageSplitsRequest, - ToggleSourceRequest, UpdateIndexRequest, UpdateSourceRequest, UpdateSplitsDeleteOpstampRequest, + PublishSplitsRequest, ResetSourceCheckpointRequest, SetKvRequest, SoftDeleteDocumentsRequest, + SoftDeleteDocumentsResponse, SplitStats, StageSplitsRequest, ToggleSourceRequest, + UpdateIndexRequest, UpdateSourceRequest, UpdateSplitsDeleteOpstampRequest, UpdateSplitsDeleteOpstampResponse, serde_utils, }; use quickwit_proto::types::{IndexId, IndexUid, Position, PublishToken, ShardId, SourceId}; @@ -72,13 +74,14 @@ use crate::file_backed::MutationOccurred; use crate::metastore::postgres::model::Shards; use crate::metastore::postgres::utils::split_maturity_timestamp; use crate::metastore::{ - IndexesMetadataResponseExt, PublishSplitsRequestExt, STREAM_SPLITS_CHUNK_SIZE, - UpdateSourceRequestExt, use_shard_api, + IndexesMetadataResponseExt, MAX_SOFT_DELETED_DOCS_PER_SPLIT, PublishSplitsRequestExt, + STREAM_SPLITS_CHUNK_SIZE, UpdateSourceRequestExt, use_shard_api, }; use crate::{ AddSourceRequestExt, CreateIndexRequestExt, IndexMetadata, IndexMetadataResponseExt, ListIndexesMetadataResponseExt, ListSplitsRequestExt, ListSplitsResponseExt, - MetastoreServiceExt, Split, SplitState, StageSplitsRequestExt, UpdateIndexRequestExt, + 
MetastoreServiceExt, Split, SplitMetadata, SplitState, StageSplitsRequestExt, + UpdateIndexRequestExt, }; /// PostgreSQL metastore implementation. @@ -1165,6 +1168,124 @@ impl MetastoreService for PostgresqlMetastore { Ok(EmptyResponse {}) } + #[instrument(skip(self))] + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, + ) -> MetastoreResult { + let index_uid: IndexUid = request.index_uid().clone(); + let split_doc_ids = request.split_doc_ids; + + if split_doc_ids.is_empty() { + return Ok(SoftDeleteDocumentsResponse { + num_soft_deleted_doc_ids: 0, + }); + } + + // Fetches current metadata for all requested splits in a single round-trip, locking + // the rows for the duration of the transaction. + const FETCH_SPLITS_METADATA_QUERY: &str = r#" + SELECT split_id, split_metadata_json + FROM splits + WHERE + index_uid = $1 + AND split_id = ANY($2) + AND split_state = 'Published' + FOR UPDATE + "#; + + // Updates all modified splits in a single round-trip via UNNEST. + const UPDATE_SPLITS_METADATA_QUERY: &str = r#" + UPDATE splits + SET + split_metadata_json = updates.split_metadata_json, + update_timestamp = (CURRENT_TIMESTAMP AT TIME ZONE 'UTC') + FROM UNNEST($1::TEXT[], $2::TEXT[]) AS updates(split_id, split_metadata_json) + WHERE + splits.index_uid = $3 + AND splits.split_id = updates.split_id + AND splits.split_state = 'Published' + "#; + + // Build a lookup map: split_id → new doc IDs to add. + let mut new_ids_by_split: HashMap<&str, BTreeSet> = HashMap::new(); + for split in &split_doc_ids { + let entry = new_ids_by_split.entry(split.split_id.as_str()).or_default(); + entry.extend(split.doc_ids.iter().copied()); + } + + let requested_split_ids: Vec<&str> = + split_doc_ids.iter().map(|s| s.split_id.as_str()).collect(); + + run_with_tx!(self.connection_pool, tx, "soft delete documents", { + // Phase 1: fetch and lock all relevant splits, merge new doc IDs, validate limits. 
+ // Any error here causes the transaction to roll back, so no split is modified. + let rows: Vec<(String, String)> = sqlx::query_as(FETCH_SPLITS_METADATA_QUERY) + .bind(&index_uid) + .bind(&requested_split_ids) + .fetch_all(tx.as_mut()) + .await + .map_err(|sqlx_error| convert_sqlx_err(&index_uid.index_id, sqlx_error))?; + + let mut updated_split_ids: Vec = Vec::with_capacity(rows.len()); + let mut updated_metadata_jsons: Vec = Vec::with_capacity(rows.len()); + let mut total_soft_deleted: u64 = 0; + + for (split_id, split_metadata_json) in rows { + let new_ids = new_ids_by_split + .get(split_id.as_str()) + .cloned() + .unwrap_or_default(); + + let mut split_metadata = serde_json::from_str::( + &split_metadata_json, + ) + .map_err(|error| MetastoreError::JsonDeserializeError { + struct_name: "SplitMetadata".to_string(), + message: error.to_string(), + })?; + + let old_count = split_metadata.soft_deleted_doc_ids.len(); + split_metadata.soft_deleted_doc_ids.extend(new_ids); + let new_count = split_metadata.soft_deleted_doc_ids.len(); + if old_count == new_count { + continue; + } + + if new_count > MAX_SOFT_DELETED_DOCS_PER_SPLIT { + return Err(MetastoreError::FailedPrecondition { + entity: EntityKind::Split { + split_id: split_id.clone(), + }, + message: format!( + "split `{split_id}` would exceed the maximum number of soft-deleted \ + documents ({MAX_SOFT_DELETED_DOCS_PER_SPLIT}): would be {new_count}", + ), + }); + } + + updated_metadata_jsons.push(serde_utils::to_json_str(&split_metadata)?); + updated_split_ids.push(split_id); + total_soft_deleted += (new_count - old_count) as u64; + } + + // Phase 2: all validations passed — apply all updates in a single query. 
+ if !updated_split_ids.is_empty() { + sqlx::query(UPDATE_SPLITS_METADATA_QUERY) + .bind(&updated_split_ids) + .bind(&updated_metadata_jsons) + .bind(&index_uid) + .execute(tx.as_mut()) + .await + .map_err(|sqlx_error| convert_sqlx_err(&index_uid.index_id, sqlx_error))?; + } + + Ok(SoftDeleteDocumentsResponse { + num_soft_deleted_doc_ids: total_soft_deleted, + }) + }) + } + #[instrument(skip(self))] async fn add_source(&self, request: AddSourceRequest) -> MetastoreResult { let source_config = request.deserialize_source_config()?; @@ -1765,6 +1886,39 @@ impl MetastoreService for PostgresqlMetastore { Ok(EmptyResponse {}) } + async fn get_kv(&self, request: GetKvRequest) -> MetastoreResult { + let value: Option<(String,)> = sqlx::query_as("SELECT value FROM kv WHERE key = $1") + .bind(&request.key) + .fetch_optional(&self.connection_pool) + .await?; + Ok(GetKvResponse { + value: value.map(|(v,)| v), + }) + } + + async fn set_kv(&self, request: SetKvRequest) -> MetastoreResult { + sqlx::query( + r" + INSERT INTO kv (key, value) + VALUES ($1, $2) + ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value + ", + ) + .bind(&request.key) + .bind(&request.value) + .execute(&self.connection_pool) + .await?; + Ok(EmptyResponse {}) + } + + async fn delete_kv(&self, request: DeleteKvRequest) -> MetastoreResult { + sqlx::query("DELETE FROM kv WHERE key = $1") + .bind(&request.key) + .execute(&self.connection_pool) + .await?; + Ok(EmptyResponse {}) + } + async fn get_cluster_identity( &self, _: GetClusterIdentityRequest, @@ -2241,6 +2395,18 @@ mod tests { sql.to_string(PostgresQueryBuilder), r#"SELECT * FROM "splits" WHERE "time_range_end" <= 42"# ); + + let mut select_statement = Query::select(); + let sql = select_statement.column(Asterisk).from(Splits::Table); + + let query = ListSplitsQuery::for_all_indexes() + .with_split_ids(vec!["split-1".to_string(), "split-2".to_string()]); + append_query_filters_and_order_by(sql, &query); + + assert_eq!( + 
sql.to_string(PostgresQueryBuilder), + r#"SELECT * FROM "splits" WHERE "split_id" IN ('split-1', 'split-2')"# + ); } #[test] diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/model.rs b/quickwit/quickwit-metastore/src/metastore/postgres/model.rs index 86853c531b4..8c605859f55 100644 --- a/quickwit/quickwit-metastore/src/metastore/postgres/model.rs +++ b/quickwit/quickwit-metastore/src/metastore/postgres/model.rs @@ -90,6 +90,7 @@ pub enum Splits { IndexUid, NodeId, DeleteOpstamp, + SoftDeletedDocIds, } pub(super) struct ToTimestampFunc; diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/utils.rs b/quickwit/quickwit-metastore/src/metastore/postgres/utils.rs index b5769201948..f0d87246c0c 100644 --- a/quickwit/quickwit-metastore/src/metastore/postgres/utils.rs +++ b/quickwit/quickwit-metastore/src/metastore/postgres/utils.rs @@ -107,6 +107,10 @@ pub(super) fn append_query_filters_and_order_by( sql.cond_where(Expr::col(Splits::IndexUid).is_in(index_uids)); } + if let Some(split_ids) = &query.split_ids { + sql.cond_where(Expr::col(Splits::SplitId).is_in(split_ids)); + } + if let Some(node_id) = &query.node_id { sql.cond_where(Expr::col(Splits::NodeId).eq(node_id)); }; diff --git a/quickwit/quickwit-metastore/src/split_metadata.rs b/quickwit/quickwit-metastore/src/split_metadata.rs index 829029e5d43..3de6f9122f4 100644 --- a/quickwit/quickwit-metastore/src/split_metadata.rs +++ b/quickwit/quickwit-metastore/src/split_metadata.rs @@ -135,6 +135,9 @@ pub struct SplitMetadata { /// Doc mapping UID used when creating this split. This split may only be merged with other /// splits using the same doc mapping UID. pub doc_mapping_uid: DocMappingUid, + + /// Set of tantivy doc_ids that have been soft-deleted from this split. 
+ pub soft_deleted_doc_ids: BTreeSet, } impl fmt::Debug for SplitMetadata { @@ -180,6 +183,9 @@ impl fmt::Debug for SplitMetadata { debug_struct.field("footer_offsets", &self.footer_offsets); debug_struct.field("delete_opstamp", &self.delete_opstamp); debug_struct.field("num_merge_ops", &self.num_merge_ops); + if !self.soft_deleted_doc_ids.is_empty() { + debug_struct.field("soft_deleted_doc_ids", &self.soft_deleted_doc_ids); + } debug_struct.finish() } } @@ -286,6 +292,7 @@ impl quickwit_config::TestableForRegression for SplitMetadata { footer_offsets: 1000..2000, num_merge_ops: 3, doc_mapping_uid: DocMappingUid::default(), + soft_deleted_doc_ids: BTreeSet::new(), } } @@ -427,6 +434,7 @@ mod tests { delete_opstamp: 0, num_merge_ops: 0, doc_mapping_uid: DocMappingUid::default(), + soft_deleted_doc_ids: BTreeSet::new(), }; let expected_output = diff --git a/quickwit/quickwit-metastore/src/split_metadata_version.rs b/quickwit/quickwit-metastore/src/split_metadata_version.rs index 5f6204c85b7..43b38542133 100644 --- a/quickwit/quickwit-metastore/src/split_metadata_version.rs +++ b/quickwit/quickwit-metastore/src/split_metadata_version.rs @@ -97,6 +97,10 @@ pub(crate) struct SplitMetadataV0_8 { // splits before when updates first appeared are compatible with each other. #[serde(default)] doc_mapping_uid: DocMappingUid, + + /// Set of tantivy doc_ids that have been soft-deleted from this split. 
+ #[serde(default)] + pub soft_deleted_doc_ids: BTreeSet, } impl From for SplitMetadata { @@ -134,6 +138,7 @@ impl From for SplitMetadata { footer_offsets: v8.footer_offsets, num_merge_ops: v8.num_merge_ops, doc_mapping_uid: v8.doc_mapping_uid, + soft_deleted_doc_ids: v8.soft_deleted_doc_ids, } } } @@ -157,6 +162,7 @@ impl From for SplitMetadataV0_8 { footer_offsets: split.footer_offsets, num_merge_ops: split.num_merge_ops, doc_mapping_uid: split.doc_mapping_uid, + soft_deleted_doc_ids: split.soft_deleted_doc_ids, } } } diff --git a/quickwit/quickwit-metastore/src/tests/kv.rs b/quickwit/quickwit-metastore/src/tests/kv.rs new file mode 100644 index 00000000000..32f0834b6f6 --- /dev/null +++ b/quickwit/quickwit-metastore/src/tests/kv.rs @@ -0,0 +1,232 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use quickwit_proto::metastore::{DeleteKvRequest, GetKvRequest, MetastoreService, SetKvRequest}; + +use super::DefaultForTest; + +pub async fn test_metastore_kv_set_get() { + let metastore = MetastoreToTest::default_for_test().await; + + // Set a key-value pair + let set_request = SetKvRequest { + key: "test-key".to_string(), + value: "test-value".to_string(), + }; + metastore.set_kv(set_request).await.unwrap(); + + // Get the key-value pair + let get_request = GetKvRequest { + key: "test-key".to_string(), + }; + let response = metastore.get_kv(get_request).await.unwrap(); + assert_eq!(response.value, Some("test-value".to_string())); +} + +pub async fn test_metastore_kv_get_non_existent< + MetastoreToTest: MetastoreService + DefaultForTest, +>() { + let metastore = MetastoreToTest::default_for_test().await; + + // Try to get a non-existent key + let get_request = GetKvRequest { + key: "non-existent-key".to_string(), + }; + let response = metastore.get_kv(get_request).await.unwrap(); + assert_eq!(response.value, None); +} + +pub async fn test_metastore_kv_set_overwrite() { + let metastore = MetastoreToTest::default_for_test().await; + + // Set a key-value pair + let set_request = SetKvRequest { + key: "test-key".to_string(), + value: "original-value".to_string(), + }; + metastore.set_kv(set_request).await.unwrap(); + + // Overwrite with new value + let set_request = SetKvRequest { + key: "test-key".to_string(), + value: "updated-value".to_string(), + }; + metastore.set_kv(set_request).await.unwrap(); + + // Verify the value was updated + let get_request = GetKvRequest { + key: "test-key".to_string(), + }; + let response = metastore.get_kv(get_request).await.unwrap(); + assert_eq!(response.value, Some("updated-value".to_string())); +} + +pub async fn test_metastore_kv_delete() { + let metastore = MetastoreToTest::default_for_test().await; + + // Set a key-value pair + let set_request = SetKvRequest { + key: "test-key".to_string(), + value: "test-value".to_string(), 
+ }; + metastore.set_kv(set_request).await.unwrap(); + + // Verify it exists + let get_request = GetKvRequest { + key: "test-key".to_string(), + }; + let response = metastore.get_kv(get_request).await.unwrap(); + assert_eq!(response.value, Some("test-value".to_string())); + + // Delete the key + let delete_request = DeleteKvRequest { + key: "test-key".to_string(), + }; + metastore.delete_kv(delete_request).await.unwrap(); + + // Verify it no longer exists + let get_request = GetKvRequest { + key: "test-key".to_string(), + }; + let response = metastore.get_kv(get_request).await.unwrap(); + assert_eq!(response.value, None); +} + +pub async fn test_metastore_kv_delete_non_existent< + MetastoreToTest: MetastoreService + DefaultForTest, +>() { + let metastore = MetastoreToTest::default_for_test().await; + + // Delete a non-existent key (should succeed without error) + let delete_request = DeleteKvRequest { + key: "non-existent-key".to_string(), + }; + metastore.delete_kv(delete_request).await.unwrap(); +} + +pub async fn test_metastore_kv_multiple_keys() { + let metastore = MetastoreToTest::default_for_test().await; + + // Set multiple key-value pairs + let set_request_1 = SetKvRequest { + key: "key-1".to_string(), + value: "value-1".to_string(), + }; + metastore.set_kv(set_request_1).await.unwrap(); + + let set_request_2 = SetKvRequest { + key: "key-2".to_string(), + value: "value-2".to_string(), + }; + metastore.set_kv(set_request_2).await.unwrap(); + + let set_request_3 = SetKvRequest { + key: "key-3".to_string(), + value: "value-3".to_string(), + }; + metastore.set_kv(set_request_3).await.unwrap(); + + // Verify all keys exist + let get_request_1 = GetKvRequest { + key: "key-1".to_string(), + }; + let response_1 = metastore.get_kv(get_request_1).await.unwrap(); + assert_eq!(response_1.value, Some("value-1".to_string())); + + let get_request_2 = GetKvRequest { + key: "key-2".to_string(), + }; + let response_2 = metastore.get_kv(get_request_2).await.unwrap(); + 
assert_eq!(response_2.value, Some("value-2".to_string())); + + let get_request_3 = GetKvRequest { + key: "key-3".to_string(), + }; + let response_3 = metastore.get_kv(get_request_3).await.unwrap(); + assert_eq!(response_3.value, Some("value-3".to_string())); + + // Delete one key + let delete_request = DeleteKvRequest { + key: "key-2".to_string(), + }; + metastore.delete_kv(delete_request).await.unwrap(); + + // Verify key-2 is deleted but others remain + let get_request_1 = GetKvRequest { + key: "key-1".to_string(), + }; + let response_1 = metastore.get_kv(get_request_1).await.unwrap(); + assert_eq!(response_1.value, Some("value-1".to_string())); + + let get_request_2 = GetKvRequest { + key: "key-2".to_string(), + }; + let response_2 = metastore.get_kv(get_request_2).await.unwrap(); + assert_eq!(response_2.value, None); + + let get_request_3 = GetKvRequest { + key: "key-3".to_string(), + }; + let response_3 = metastore.get_kv(get_request_3).await.unwrap(); + assert_eq!(response_3.value, Some("value-3".to_string())); +} + +pub async fn test_metastore_kv_empty_key<MetastoreToTest: MetastoreService + DefaultForTest>() { + let metastore = MetastoreToTest::default_for_test().await; + + // Set a key-value pair with an empty key + let set_request = SetKvRequest { + key: "".to_string(), + value: "empty-key-value".to_string(), + }; + metastore.set_kv(set_request).await.unwrap(); + + // Get the empty key + let get_request = GetKvRequest { + key: "".to_string(), + }; + let response = metastore.get_kv(get_request).await.unwrap(); + assert_eq!(response.value, Some("empty-key-value".to_string())); + + // Delete the empty key + let delete_request = DeleteKvRequest { + key: "".to_string(), + }; + metastore.delete_kv(delete_request).await.unwrap(); + + // Verify it's deleted + let get_request = GetKvRequest { + key: "".to_string(), + }; + let response = metastore.get_kv(get_request).await.unwrap(); + assert_eq!(response.value, None); +} + +pub async fn test_metastore_kv_empty_value<MetastoreToTest: MetastoreService + DefaultForTest>() { + let metastore = 
MetastoreToTest::default_for_test().await; + + // Set a key-value pair with an empty value + let set_request = SetKvRequest { + key: "test-key".to_string(), + value: String::new(), + }; + metastore.set_kv(set_request).await.unwrap(); + + // Get the key with empty value + let get_request = GetKvRequest { + key: "test-key".to_string(), + }; + let response = metastore.get_kv(get_request).await.unwrap(); + assert_eq!(response.value, Some(String::new())); +} diff --git a/quickwit/quickwit-metastore/src/tests/list_splits.rs b/quickwit/quickwit-metastore/src/tests/list_splits.rs index 11afe5955cd..dbc2d6be918 100644 --- a/quickwit/quickwit-metastore/src/tests/list_splits.rs +++ b/quickwit/quickwit-metastore/src/tests/list_splits.rs @@ -1041,7 +1041,7 @@ pub async fn test_metastore_list_splits_by_node_id< metastore.stage_splits(stage_splits_request).await.unwrap(); let list_splits_query = - ListSplitsQuery::for_index(index_uid.clone()).with_node_id(NodeId::from("test-node-1")); + ListSplitsQuery::for_index(index_uid.clone()).with_node_id(NodeId::from_str("test-node-1")); let list_splits_request = ListSplitsRequest::try_from_list_splits_query(&list_splits_query).unwrap(); diff --git a/quickwit/quickwit-metastore/src/tests/mod.rs b/quickwit/quickwit-metastore/src/tests/mod.rs index d6e549baf25..c6177da1b43 100644 --- a/quickwit/quickwit-metastore/src/tests/mod.rs +++ b/quickwit/quickwit-metastore/src/tests/mod.rs @@ -28,6 +28,7 @@ use quickwit_proto::types::IndexUid; pub(crate) mod delete_task; pub(crate) mod get_identity; pub(crate) mod index; +pub(crate) mod kv; pub(crate) mod list_splits; pub(crate) mod shard; pub(crate) mod source; @@ -575,6 +576,94 @@ macro_rules! 
metastore_test_suite { let _ = tracing_subscriber::fmt::try_init(); $crate::tests::get_identity::test_metastore_get_identity::<$metastore_type>().await; } + + /// KV API tests + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_kv_set_get() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::kv::test_metastore_kv_set_get::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_kv_get_non_existent() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::kv::test_metastore_kv_get_non_existent::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_kv_set_overwrite() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::kv::test_metastore_kv_set_overwrite::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_kv_delete() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::kv::test_metastore_kv_delete::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_kv_delete_non_existent() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::kv::test_metastore_kv_delete_non_existent::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_kv_multiple_keys() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::kv::test_metastore_kv_multiple_keys::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_kv_empty_key() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::kv::test_metastore_kv_empty_key::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_kv_empty_value() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::kv::test_metastore_kv_empty_value::<$metastore_type>().await; + } + + /// Soft-delete 
documents API tests + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_soft_delete_documents() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::split::test_metastore_soft_delete_documents::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_soft_delete_documents_idempotent() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::split::test_metastore_soft_delete_documents_idempotent::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_soft_delete_documents_non_published_split() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::split::test_metastore_soft_delete_documents_non_published_split::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_soft_delete_documents_limit_exceeded() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::split::test_metastore_soft_delete_documents_limit_exceeded::<$metastore_type>().await; + } } }; } diff --git a/quickwit/quickwit-metastore/src/tests/split.rs b/quickwit/quickwit-metastore/src/tests/split.rs index 9e6d45265e3..16a96905936 100644 --- a/quickwit/quickwit-metastore/src/tests/split.rs +++ b/quickwit/quickwit-metastore/src/tests/split.rs @@ -20,7 +20,7 @@ use quickwit_config::{IndexConfig, SourceConfig, SourceParams}; use quickwit_proto::metastore::{ CreateIndexRequest, DeleteSplitsRequest, EntityKind, IndexMetadataRequest, ListSplitsRequest, ListStaleSplitsRequest, MarkSplitsForDeletionRequest, MetastoreError, PublishSplitsRequest, - StageSplitsRequest, UpdateSplitsDeleteOpstampRequest, + SoftDeleteDocumentsRequest, SplitDocIds, StageSplitsRequest, UpdateSplitsDeleteOpstampRequest, }; use quickwit_proto::types::{IndexUid, Position}; use time::OffsetDateTime; @@ -29,7 +29,7 @@ use tracing::{error, info}; use super::DefaultForTest; use crate::checkpoint::{IndexCheckpointDelta, PartitionId, 
SourceCheckpointDelta}; -use crate::metastore::MetastoreServiceStreamSplitsExt; +use crate::metastore::{MAX_SOFT_DELETED_DOCS_PER_SPLIT, MetastoreServiceStreamSplitsExt}; use crate::tests::cleanup_index; use crate::{ CreateIndexRequestExt, IndexMetadataResponseExt, ListSplitsQuery, ListSplitsRequestExt, @@ -1806,3 +1806,431 @@ pub async fn test_metastore_update_splits_delete_opstamp< cleanup_index(&mut metastore, index_uid).await; } } + +pub async fn test_metastore_soft_delete_documents< + MetastoreToTest: MetastoreServiceExt + DefaultForTest, +>() { + let mut metastore = MetastoreToTest::default_for_test().await; + + let index_id = append_random_suffix("test-soft-delete-docs"); + let index_uri = format!("ram:///indexes/{index_id}"); + let index_config = IndexConfig::for_test(&index_id, &index_uri); + + let create_index_request = CreateIndexRequest::try_from_index_config(&index_config).unwrap(); + let index_uid: IndexUid = metastore + .create_index(create_index_request) + .await + .unwrap() + .index_uid() + .clone(); + + let split_id = format!("{index_id}--split-1"); + let split_metadata = SplitMetadata { + split_id: split_id.clone(), + index_uid: index_uid.clone(), + ..Default::default() + }; + + let stage_splits_request = + StageSplitsRequest::try_from_split_metadata(index_uid.clone(), &split_metadata).unwrap(); + metastore.stage_splits(stage_splits_request).await.unwrap(); + + let publish_splits_request = PublishSplitsRequest { + index_uid: Some(index_uid.clone()), + staged_split_ids: vec![split_id.clone()], + ..Default::default() + }; + metastore + .publish_splits(publish_splits_request) + .await + .unwrap(); + + let soft_delete_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![SplitDocIds { + split_id: split_id.clone(), + doc_ids: vec![1, 5, 42], + }], + }; + let response = metastore + .soft_delete_documents(soft_delete_request) + .await + .unwrap(); + assert!(response.num_soft_deleted_doc_ids > 0); + + let 
splits = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + + assert_eq!(splits.len(), 1); + let soft_deleted = &splits[0].split_metadata.soft_deleted_doc_ids; + assert!(soft_deleted.contains(&1)); + assert!(soft_deleted.contains(&5)); + assert!(soft_deleted.contains(&42)); + assert_eq!(soft_deleted.len(), 3); + + cleanup_index(&mut metastore, index_uid).await; +} + +pub async fn test_metastore_soft_delete_documents_idempotent< + MetastoreToTest: MetastoreServiceExt + DefaultForTest, +>() { + let mut metastore = MetastoreToTest::default_for_test().await; + + let index_id = append_random_suffix("test-soft-delete-idempotent"); + let index_uri = format!("ram:///indexes/{index_id}"); + let index_config = IndexConfig::for_test(&index_id, &index_uri); + + let create_index_request = CreateIndexRequest::try_from_index_config(&index_config).unwrap(); + let index_uid: IndexUid = metastore + .create_index(create_index_request) + .await + .unwrap() + .index_uid() + .clone(); + + let split_id = format!("{index_id}--split-1"); + let split_metadata = SplitMetadata { + split_id: split_id.clone(), + index_uid: index_uid.clone(), + ..Default::default() + }; + + let stage_splits_request = + StageSplitsRequest::try_from_split_metadata(index_uid.clone(), &split_metadata).unwrap(); + metastore.stage_splits(stage_splits_request).await.unwrap(); + + let publish_splits_request = PublishSplitsRequest { + index_uid: Some(index_uid.clone()), + staged_split_ids: vec![split_id.clone()], + ..Default::default() + }; + metastore + .publish_splits(publish_splits_request) + .await + .unwrap(); + + // First call: soft-delete doc IDs [1, 2, 3]. 
+ let soft_delete_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![SplitDocIds { + split_id: split_id.clone(), + doc_ids: vec![1, 2, 3], + }], + }; + metastore + .soft_delete_documents(soft_delete_request) + .await + .unwrap(); + + // Second call: same doc IDs — must not return an error. + let soft_delete_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![SplitDocIds { + split_id: split_id.clone(), + doc_ids: vec![1, 2, 3], + }], + }; + metastore + .soft_delete_documents(soft_delete_request) + .await + .unwrap(); + + // The set of soft-deleted IDs must still be exactly {1, 2, 3}. + let splits = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + + assert_eq!(splits.len(), 1); + let soft_deleted = &splits[0].split_metadata.soft_deleted_doc_ids; + assert_eq!(soft_deleted.len(), 3); + assert!(soft_deleted.contains(&1)); + assert!(soft_deleted.contains(&2)); + assert!(soft_deleted.contains(&3)); + + // Third call: same IDs plus one new one — must extend the set by exactly one. 
+ let soft_delete_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![SplitDocIds { + split_id: split_id.clone(), + doc_ids: vec![1, 2, 3, 4], + }], + }; + metastore + .soft_delete_documents(soft_delete_request) + .await + .unwrap(); + + let splits = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + + assert_eq!(splits.len(), 1); + let soft_deleted = &splits[0].split_metadata.soft_deleted_doc_ids; + assert_eq!(soft_deleted.len(), 4); + assert!(soft_deleted.contains(&1)); + assert!(soft_deleted.contains(&2)); + assert!(soft_deleted.contains(&3)); + assert!(soft_deleted.contains(&4)); + + cleanup_index(&mut metastore, index_uid).await; +} + +pub async fn test_metastore_soft_delete_documents_non_published_split< + MetastoreToTest: MetastoreServiceExt + DefaultForTest, +>() { + let mut metastore = MetastoreToTest::default_for_test().await; + + let index_id = append_random_suffix("test-soft-delete-unpublished"); + let index_uri = format!("ram:///indexes/{index_id}"); + let index_config = IndexConfig::for_test(&index_id, &index_uri); + + let create_index_request = CreateIndexRequest::try_from_index_config(&index_config).unwrap(); + let index_uid: IndexUid = metastore + .create_index(create_index_request) + .await + .unwrap() + .index_uid() + .clone(); + + // Stage a split but do NOT publish it. + let staged_split_id = format!("{index_id}--split1"); + let staged_split_metadata = SplitMetadata { + split_id: staged_split_id.clone(), + index_uid: index_uid.clone(), + ..Default::default() + }; + let stage_splits_request = + StageSplitsRequest::try_from_split_metadata(index_uid.clone(), &staged_split_metadata) + .unwrap(); + metastore.stage_splits(stage_splits_request).await.unwrap(); + + // Stage, publish, then mark another split for deletion. 
+ let marked_split_id = format!("{index_id}--split2"); + let marked_split_metadata = SplitMetadata { + split_id: marked_split_id.clone(), + index_uid: index_uid.clone(), + ..Default::default() + }; + let stage_splits_request = + StageSplitsRequest::try_from_split_metadata(index_uid.clone(), &marked_split_metadata) + .unwrap(); + metastore.stage_splits(stage_splits_request).await.unwrap(); + + let publish_splits_request = PublishSplitsRequest { + index_uid: Some(index_uid.clone()), + staged_split_ids: vec![marked_split_id.clone()], + ..Default::default() + }; + metastore + .publish_splits(publish_splits_request) + .await + .unwrap(); + + let mark_for_deletion_request = + MarkSplitsForDeletionRequest::new(index_uid.clone(), vec![marked_split_id.clone()]); + metastore + .mark_splits_for_deletion(mark_for_deletion_request) + .await + .unwrap(); + + // Attempt to soft-delete documents on the staged split. + // Implementations may return an error (file-backed) or silently skip (postgres) — both are + // valid. What matters is that the split's soft_deleted_doc_ids remains unmodified. + let soft_delete_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![SplitDocIds { + split_id: staged_split_id.clone(), + doc_ids: vec![10, 20], + }], + }; + let _ = metastore.soft_delete_documents(soft_delete_request).await; + + let list_staged_request = ListSplitsRequest::try_from_list_splits_query( + &ListSplitsQuery::for_index(index_uid.clone()).with_split_state(SplitState::Staged), + ) + .unwrap(); + let staged_splits = metastore + .list_splits(list_staged_request) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + + assert_eq!(staged_splits.len(), 1); + assert!( + staged_splits[0] + .split_metadata + .soft_deleted_doc_ids + .is_empty(), + "staged split must not have any soft-deleted doc IDs" + ); + + // Attempt to soft-delete documents on the marked-for-deletion split. 
+ let soft_delete_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![SplitDocIds { + split_id: marked_split_id.clone(), + doc_ids: vec![30, 40], + }], + }; + let _ = metastore.soft_delete_documents(soft_delete_request).await; + + let list_marked_request = ListSplitsRequest::try_from_list_splits_query( + &ListSplitsQuery::for_index(index_uid.clone()) + .with_split_state(SplitState::MarkedForDeletion), + ) + .unwrap(); + let marked_splits = metastore + .list_splits(list_marked_request) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + + assert_eq!(marked_splits.len(), 1); + assert!( + marked_splits[0] + .split_metadata + .soft_deleted_doc_ids + .is_empty(), + "marked-for-deletion split must not have any soft-deleted doc IDs" + ); + + cleanup_index(&mut metastore, index_uid).await; +} + +pub async fn test_metastore_soft_delete_documents_limit_exceeded< + MetastoreToTest: MetastoreServiceExt + DefaultForTest, +>() { + let mut metastore = MetastoreToTest::default_for_test().await; + + let index_id = append_random_suffix("test-soft-delete-limit"); + let index_uri = format!("ram:///indexes/{index_id}"); + let index_config = IndexConfig::for_test(&index_id, &index_uri); + + let create_index_request = CreateIndexRequest::try_from_index_config(&index_config).unwrap(); + let index_uid: IndexUid = metastore + .create_index(create_index_request) + .await + .unwrap() + .index_uid() + .clone(); + + // Create and publish two splits. 
+ let split_a_id = format!("{index_id}--split-a"); + let split_b_id = format!("{index_id}--split-b"); + + for split_id in [&split_a_id, &split_b_id] { + let split_metadata = SplitMetadata { + split_id: split_id.clone(), + index_uid: index_uid.clone(), + ..Default::default() + }; + let stage_request = + StageSplitsRequest::try_from_split_metadata(index_uid.clone(), &split_metadata) + .unwrap(); + metastore.stage_splits(stage_request).await.unwrap(); + + let publish_request = PublishSplitsRequest { + index_uid: Some(index_uid.clone()), + staged_split_ids: vec![split_id.clone()], + ..Default::default() + }; + metastore.publish_splits(publish_request).await.unwrap(); + } + + // Pre-populate split-b with MAX - 1 soft-deleted doc IDs so one more would be fine but two + // would exceed the limit. + let initial_ids: Vec<u32> = (0..MAX_SOFT_DELETED_DOCS_PER_SPLIT as u32 - 1).collect(); + let pre_populate_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![SplitDocIds { + split_id: split_b_id.clone(), + doc_ids: initial_ids, + }], + }; + metastore + .soft_delete_documents(pre_populate_request) + .await + .unwrap(); + + // Request that would: + // - soft-delete 1 doc on split-a (valid on its own) + // - soft-delete 2 *new* docs on split-b (would push total from MAX-1 to MAX+1) + // The whole request must fail and neither split must be modified. + let overflow_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![ + SplitDocIds { + split_id: split_a_id.clone(), + doc_ids: vec![100], + }, + SplitDocIds { + split_id: split_b_id.clone(), + doc_ids: vec![ + MAX_SOFT_DELETED_DOCS_PER_SPLIT as u32 - 1, + MAX_SOFT_DELETED_DOCS_PER_SPLIT as u32, + ], + }, + ], + }; + let error = metastore + .soft_delete_documents(overflow_request) + .await + .unwrap_err(); + assert!( + matches!( + error, + MetastoreError::FailedPrecondition { + entity: EntityKind::Split { .. }, + .. 
+ } + ), + "expected FailedPrecondition when soft-deleted doc limit is exceeded, got: {error:?}" + ); + + // Verify atomicity: both splits must be unmodified after the failed request. + let splits = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + + let split_a = splits + .iter() + .find(|s| s.split_metadata.split_id == split_a_id) + .expect("split-a must exist"); + assert!( + split_a.split_metadata.soft_deleted_doc_ids.is_empty(), + "split-a must not have been modified (atomicity guarantee)" + ); + + let split_b = splits + .iter() + .find(|s| s.split_metadata.split_id == split_b_id) + .expect("split-b must exist"); + assert_eq!( + split_b.split_metadata.soft_deleted_doc_ids.len(), + MAX_SOFT_DELETED_DOCS_PER_SPLIT - 1, + "split-b must not have been modified (atomicity guarantee)" + ); + + cleanup_index(&mut metastore, index_uid).await; +} diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.7.expected.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.7.expected.json index 9f7f0e27f23..0f708ac16b3 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.7.expected.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.7.expected.json @@ -1,200 +1,201 @@ { - "version": "0.9", + "delete_tasks": [ + { + "create_timestamp": 0, + "delete_query": { + "index_uid": "my-index:00000000000000000000000000", + "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}}]}" + }, + "opstamp": 10 + } + ], "index": { - "version": "0.9", - "index_uid": "my-index:00000000000000000000000000", + "checkpoint": { + "kafka-source": { + "00000000000000000000": 
"00000000000000000042" + } + }, + "create_timestamp": 1789, "index_config": { - "version": "0.9", - "index_id": "my-index", - "index_uri": "s3://quickwit-indexes/my-index", "doc_mapping": { "doc_mapping_uid": "00000000000000000000000000", - "mode": "dynamic", "dynamic_mapping": { - "indexed": true, - "tokenizer": "raw", - "record": "basic", - "stored": true, "expand_dots": true, "fast": { "normalizer": "raw" - } + }, + "indexed": true, + "record": "basic", + "stored": true, + "tokenizer": "raw" }, "field_mappings": [ { + "coerce": true, + "fast": true, + "indexed": true, "name": "tenant_id", - "type": "u64", + "output_format": "number", "stored": true, - "indexed": true, - "fast": true, - "coerce": true, - "output_format": "number" + "type": "u64" }, { - "name": "timestamp", - "type": "datetime", + "fast": true, + "fast_precision": "seconds", + "indexed": true, "input_formats": [ "rfc3339", "unix_timestamp" ], + "name": "timestamp", "output_format": "rfc3339", - "fast_precision": "seconds", - "indexed": true, "stored": true, - "fast": true + "type": "datetime" }, { - "name": "log_level", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": true, - "tokenizer": "raw", + "name": "log_level", "record": "basic", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "raw", + "type": "text" }, { - "name": "message", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": true, - "tokenizer": "default", + "name": "message", "record": "position", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "default", + "type": "text" } ], - "timestamp_field": "timestamp", + "index_field_presence": true, + "max_num_partitions": 100, + "mode": "dynamic", + "partition_key": "tenant_id", + "store_document_size": false, + "store_source": true, "tag_fields": [ "log_level", "tenant_id" ], - "partition_key": "tenant_id", - "max_num_partitions": 100, - "index_field_presence": true, - "store_document_size": false, - 
"store_source": true, + "timestamp_field": "timestamp", "tokenizers": [ { + "filters": [], "name": "custom_tokenizer", - "type": "regex", "pattern": "[^\\p{L}\\p{N}]+", - "filters": [] + "type": "regex" } ] }, + "index_id": "my-index", + "index_uri": "s3://quickwit-indexes/my-index", "indexing_settings": { "commit_timeout_secs": 301, - "docstore_compression_level": 8, "docstore_blocksize": 1000000, - "split_num_docs_target": 10000001, + "docstore_compression_level": 8, "merge_policy": { - "type": "stable_log", - "min_level_num_docs": 100000, - "merge_factor": 9, + "maturation_period": "2days", "max_merge_factor": 11, - "maturation_period": "2days" + "merge_factor": 9, + "min_level_num_docs": 100000, + "type": "stable_log" }, "resources": { "heap_size": "50.0 MB" - } + }, + "split_num_docs_target": 10000001 }, "ingest_settings": { "min_shards": 1 }, + "retention": { + "period": "90 days", + "schedule": "daily" + }, "search_settings": { "default_search_fields": [ "message" ] }, - "retention": { - "period": "90 days", - "schedule": "daily" - } - }, - "checkpoint": { - "kafka-source": { - "00000000000000000000": "00000000000000000042" - } + "version": "0.9" }, - "create_timestamp": 1789, + "index_uid": "my-index:00000000000000000000000000", "sources": [ { - "version": "0.9", - "source_id": "kafka-source", - "num_pipelines": 2, "enabled": true, - "source_type": "kafka", + "input_format": "json", + "num_pipelines": 2, "params": { - "topic": "kafka-topic", - "client_params": {} + "client_params": {}, + "topic": "kafka-topic" }, + "source_id": "kafka-source", + "source_type": "kafka", "transform": { "script": ".message = downcase(string!(.message))", "timezone": "UTC" }, - "input_format": "json" + "version": "0.9" + } + ], + "version": "0.9" + }, + "shards": { + "_ingest-source": [ + { + "doc_mapping_uid": "00000000000000000000000000", + "follower_id": "follower-ingester", + "index_uid": "my-index:00000000000000000000000000", + "leader_id": "leader-ingester", + 
"publish_position_inclusive": "", + "shard_id": "00000000000000000001", + "shard_state": 1, + "source_id": "_ingest-source", + "update_timestamp": 1704067200 } ] }, "splits": [ { - "split_state": "Published", - "update_timestamp": 1789, - "publish_timestamp": 1789, - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000000", - "partition_id": 7, - "source_id": "source", - "node_id": "node", - "num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": "00000000000000000000000000", + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": "my-index:00000000000000000000000000", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "publish_timestamp": 1789, + "soft_deleted_doc_ids": [], + "source_id": "source", + "split_id": "split", + "split_state": "Published", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": "00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "update_timestamp": 1789, + "version": "0.9" } ], - "shards": { - "_ingest-source": [ - { - "index_uid": "my-index:00000000000000000000000000", - "source_id": "_ingest-source", - "shard_id": "00000000000000000001", - "leader_id": "leader-ingester", - "follower_id": "follower-ingester", - "shard_state": 1, - "publish_position_inclusive": "", - "doc_mapping_uid": "00000000000000000000000000", - "update_timestamp": 1704067200 - } - ] - }, - "delete_tasks": [ - { - "create_timestamp": 0, - "opstamp": 10, - "delete_query": { - "index_uid": "my-index:00000000000000000000000000", - "query_ast": 
"{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}}]}" - } - } - ] + "version": "0.9" } diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.expected.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.expected.json index 9f7f0e27f23..0f708ac16b3 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.expected.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.expected.json @@ -1,200 +1,201 @@ { - "version": "0.9", + "delete_tasks": [ + { + "create_timestamp": 0, + "delete_query": { + "index_uid": "my-index:00000000000000000000000000", + "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}}]}" + }, + "opstamp": 10 + } + ], "index": { - "version": "0.9", - "index_uid": "my-index:00000000000000000000000000", + "checkpoint": { + "kafka-source": { + "00000000000000000000": "00000000000000000042" + } + }, + "create_timestamp": 1789, "index_config": { - "version": "0.9", - "index_id": "my-index", - "index_uri": "s3://quickwit-indexes/my-index", "doc_mapping": { "doc_mapping_uid": "00000000000000000000000000", - "mode": "dynamic", "dynamic_mapping": { - "indexed": true, - "tokenizer": "raw", - "record": "basic", - "stored": true, "expand_dots": true, "fast": { "normalizer": "raw" - } + }, + "indexed": true, + "record": "basic", + "stored": true, + "tokenizer": "raw" }, "field_mappings": [ { + "coerce": true, + "fast": true, + "indexed": true, "name": "tenant_id", - "type": "u64", + "output_format": "number", 
"stored": true, - "indexed": true, - "fast": true, - "coerce": true, - "output_format": "number" + "type": "u64" }, { - "name": "timestamp", - "type": "datetime", + "fast": true, + "fast_precision": "seconds", + "indexed": true, "input_formats": [ "rfc3339", "unix_timestamp" ], + "name": "timestamp", "output_format": "rfc3339", - "fast_precision": "seconds", - "indexed": true, "stored": true, - "fast": true + "type": "datetime" }, { - "name": "log_level", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": true, - "tokenizer": "raw", + "name": "log_level", "record": "basic", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "raw", + "type": "text" }, { - "name": "message", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": true, - "tokenizer": "default", + "name": "message", "record": "position", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "default", + "type": "text" } ], - "timestamp_field": "timestamp", + "index_field_presence": true, + "max_num_partitions": 100, + "mode": "dynamic", + "partition_key": "tenant_id", + "store_document_size": false, + "store_source": true, "tag_fields": [ "log_level", "tenant_id" ], - "partition_key": "tenant_id", - "max_num_partitions": 100, - "index_field_presence": true, - "store_document_size": false, - "store_source": true, + "timestamp_field": "timestamp", "tokenizers": [ { + "filters": [], "name": "custom_tokenizer", - "type": "regex", "pattern": "[^\\p{L}\\p{N}]+", - "filters": [] + "type": "regex" } ] }, + "index_id": "my-index", + "index_uri": "s3://quickwit-indexes/my-index", "indexing_settings": { "commit_timeout_secs": 301, - "docstore_compression_level": 8, "docstore_blocksize": 1000000, - "split_num_docs_target": 10000001, + "docstore_compression_level": 8, "merge_policy": { - "type": "stable_log", - "min_level_num_docs": 100000, - "merge_factor": 9, + "maturation_period": "2days", "max_merge_factor": 11, - "maturation_period": "2days" + 
"merge_factor": 9, + "min_level_num_docs": 100000, + "type": "stable_log" }, "resources": { "heap_size": "50.0 MB" - } + }, + "split_num_docs_target": 10000001 }, "ingest_settings": { "min_shards": 1 }, + "retention": { + "period": "90 days", + "schedule": "daily" + }, "search_settings": { "default_search_fields": [ "message" ] }, - "retention": { - "period": "90 days", - "schedule": "daily" - } - }, - "checkpoint": { - "kafka-source": { - "00000000000000000000": "00000000000000000042" - } + "version": "0.9" }, - "create_timestamp": 1789, + "index_uid": "my-index:00000000000000000000000000", "sources": [ { - "version": "0.9", - "source_id": "kafka-source", - "num_pipelines": 2, "enabled": true, - "source_type": "kafka", + "input_format": "json", + "num_pipelines": 2, "params": { - "topic": "kafka-topic", - "client_params": {} + "client_params": {}, + "topic": "kafka-topic" }, + "source_id": "kafka-source", + "source_type": "kafka", "transform": { "script": ".message = downcase(string!(.message))", "timezone": "UTC" }, - "input_format": "json" + "version": "0.9" + } + ], + "version": "0.9" + }, + "shards": { + "_ingest-source": [ + { + "doc_mapping_uid": "00000000000000000000000000", + "follower_id": "follower-ingester", + "index_uid": "my-index:00000000000000000000000000", + "leader_id": "leader-ingester", + "publish_position_inclusive": "", + "shard_id": "00000000000000000001", + "shard_state": 1, + "source_id": "_ingest-source", + "update_timestamp": 1704067200 } ] }, "splits": [ { - "split_state": "Published", - "update_timestamp": 1789, - "publish_timestamp": 1789, - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000000", - "partition_id": 7, - "source_id": "source", - "node_id": "node", - "num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": "00000000000000000000000000", + 
"footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": "my-index:00000000000000000000000000", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "publish_timestamp": 1789, + "soft_deleted_doc_ids": [], + "source_id": "source", + "split_id": "split", + "split_state": "Published", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": "00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "update_timestamp": 1789, + "version": "0.9" } ], - "shards": { - "_ingest-source": [ - { - "index_uid": "my-index:00000000000000000000000000", - "source_id": "_ingest-source", - "shard_id": "00000000000000000001", - "leader_id": "leader-ingester", - "follower_id": "follower-ingester", - "shard_state": 1, - "publish_position_inclusive": "", - "doc_mapping_uid": "00000000000000000000000000", - "update_timestamp": 1704067200 - } - ] - }, - "delete_tasks": [ - { - "create_timestamp": 0, - "opstamp": 10, - "delete_query": { - "index_uid": "my-index:00000000000000000000000000", - "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}}]}" - } - } - ] + "version": "0.9" } diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.expected.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.expected.json index f9ecb6a7bcb..2d60feec007 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.expected.json +++ 
b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.expected.json @@ -1,200 +1,201 @@ { - "version": "0.9", + "delete_tasks": [ + { + "create_timestamp": 0, + "delete_query": { + "index_uid": "my-index:00000000000000000000000001", + "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false}]}" + }, + "opstamp": 10 + } + ], "index": { - "version": "0.9", - "index_uid": "my-index:00000000000000000000000001", + "checkpoint": { + "kafka-source": { + "00000000000000000000": "00000000000000000042" + } + }, + "create_timestamp": 1789, "index_config": { - "version": "0.9", - "index_id": "my-index", - "index_uri": "s3://quickwit-indexes/my-index", "doc_mapping": { "doc_mapping_uid": "00000000000000000000000001", - "mode": "dynamic", "dynamic_mapping": { - "indexed": true, - "tokenizer": "raw", - "record": "basic", - "stored": true, "expand_dots": true, "fast": { "normalizer": "raw" - } + }, + "indexed": true, + "record": "basic", + "stored": true, + "tokenizer": "raw" }, "field_mappings": [ { + "coerce": true, + "fast": true, + "indexed": true, "name": "tenant_id", - "type": "u64", + "output_format": "number", "stored": true, - "indexed": true, - "fast": true, - "coerce": true, - "output_format": "number" + "type": "u64" }, { - "name": "timestamp", - "type": "datetime", + "fast": true, + "fast_precision": "seconds", + "indexed": true, "input_formats": [ "rfc3339", "unix_timestamp" ], + "name": "timestamp", "output_format": "rfc3339", - "fast_precision": "seconds", - "indexed": true, "stored": true, - "fast": true + "type": "datetime" }, { - "name": "log_level", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": true, - "tokenizer": "raw", + "name": "log_level", 
"record": "basic", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "raw", + "type": "text" }, { - "name": "message", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": true, - "tokenizer": "default", + "name": "message", "record": "position", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "default", + "type": "text" } ], - "timestamp_field": "timestamp", + "index_field_presence": true, + "max_num_partitions": 100, + "mode": "dynamic", + "partition_key": "tenant_id", + "store_document_size": false, + "store_source": true, "tag_fields": [ "log_level", "tenant_id" ], - "partition_key": "tenant_id", - "max_num_partitions": 100, - "index_field_presence": true, - "store_document_size": false, - "store_source": true, + "timestamp_field": "timestamp", "tokenizers": [ { + "filters": [], "name": "custom_tokenizer", - "type": "regex", "pattern": "[^\\p{L}\\p{N}]+", - "filters": [] + "type": "regex" } ] }, + "index_id": "my-index", + "index_uri": "s3://quickwit-indexes/my-index", "indexing_settings": { "commit_timeout_secs": 301, - "docstore_compression_level": 8, "docstore_blocksize": 1000000, - "split_num_docs_target": 10000001, + "docstore_compression_level": 8, "merge_policy": { - "type": "stable_log", - "min_level_num_docs": 100000, - "merge_factor": 9, + "maturation_period": "2days", "max_merge_factor": 11, - "maturation_period": "2days" + "merge_factor": 9, + "min_level_num_docs": 100000, + "type": "stable_log" }, "resources": { "heap_size": "50.0 MB" - } + }, + "split_num_docs_target": 10000001 }, "ingest_settings": { "min_shards": 12 }, + "retention": { + "period": "90 days", + "schedule": "daily" + }, "search_settings": { "default_search_fields": [ "message" ] }, - "retention": { - "period": "90 days", - "schedule": "daily" - } - }, - "checkpoint": { - "kafka-source": { - "00000000000000000000": "00000000000000000042" - } + "version": "0.9" }, - "create_timestamp": 1789, + "index_uid": 
"my-index:00000000000000000000000001", "sources": [ { - "version": "0.9", - "source_id": "kafka-source", - "num_pipelines": 2, "enabled": true, - "source_type": "kafka", + "input_format": "json", + "num_pipelines": 2, "params": { - "topic": "kafka-topic", - "client_params": {} + "client_params": {}, + "topic": "kafka-topic" }, + "source_id": "kafka-source", + "source_type": "kafka", "transform": { "script": ".message = downcase(string!(.message))", "timezone": "UTC" }, - "input_format": "json" + "version": "0.9" + } + ], + "version": "0.9" + }, + "shards": { + "_ingest-source": [ + { + "doc_mapping_uid": "00000000000000000000000001", + "follower_id": "follower-ingester", + "index_uid": "my-index:00000000000000000000000001", + "leader_id": "leader-ingester", + "publish_position_inclusive": "", + "shard_id": "00000000000000000001", + "shard_state": 1, + "source_id": "_ingest-source", + "update_timestamp": 1724240908 } ] }, "splits": [ { - "split_state": "Published", - "update_timestamp": 1789, - "publish_timestamp": 1789, - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000001", - "partition_id": 7, - "source_id": "source", - "node_id": "node", - "num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": "00000000000000000000000000", + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": "my-index:00000000000000000000000001", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "publish_timestamp": 1789, + "soft_deleted_doc_ids": [], + "source_id": "source", + "split_id": "split", + "split_state": "Published", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 
130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": "00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "update_timestamp": 1789, + "version": "0.9" } ], - "shards": { - "_ingest-source": [ - { - "index_uid": "my-index:00000000000000000000000001", - "source_id": "_ingest-source", - "shard_id": "00000000000000000001", - "leader_id": "leader-ingester", - "follower_id": "follower-ingester", - "shard_state": 1, - "publish_position_inclusive": "", - "doc_mapping_uid": "00000000000000000000000001", - "update_timestamp": 1724240908 - } - ] - }, - "delete_tasks": [ - { - "create_timestamp": 0, - "opstamp": 10, - "delete_query": { - "index_uid": "my-index:00000000000000000000000001", - "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false}]}" - } - } - ] + "version": "0.9" } diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.json index f9ecb6a7bcb..2d60feec007 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.json @@ -1,200 +1,201 @@ { - "version": "0.9", + "delete_tasks": [ + { + "create_timestamp": 0, + "delete_query": { + "index_uid": "my-index:00000000000000000000000001", + "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false}]}" + }, + "opstamp": 10 + 
} + ], "index": { - "version": "0.9", - "index_uid": "my-index:00000000000000000000000001", + "checkpoint": { + "kafka-source": { + "00000000000000000000": "00000000000000000042" + } + }, + "create_timestamp": 1789, "index_config": { - "version": "0.9", - "index_id": "my-index", - "index_uri": "s3://quickwit-indexes/my-index", "doc_mapping": { "doc_mapping_uid": "00000000000000000000000001", - "mode": "dynamic", "dynamic_mapping": { - "indexed": true, - "tokenizer": "raw", - "record": "basic", - "stored": true, "expand_dots": true, "fast": { "normalizer": "raw" - } + }, + "indexed": true, + "record": "basic", + "stored": true, + "tokenizer": "raw" }, "field_mappings": [ { + "coerce": true, + "fast": true, + "indexed": true, "name": "tenant_id", - "type": "u64", + "output_format": "number", "stored": true, - "indexed": true, - "fast": true, - "coerce": true, - "output_format": "number" + "type": "u64" }, { - "name": "timestamp", - "type": "datetime", + "fast": true, + "fast_precision": "seconds", + "indexed": true, "input_formats": [ "rfc3339", "unix_timestamp" ], + "name": "timestamp", "output_format": "rfc3339", - "fast_precision": "seconds", - "indexed": true, "stored": true, - "fast": true + "type": "datetime" }, { - "name": "log_level", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": true, - "tokenizer": "raw", + "name": "log_level", "record": "basic", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "raw", + "type": "text" }, { - "name": "message", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": true, - "tokenizer": "default", + "name": "message", "record": "position", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "default", + "type": "text" } ], - "timestamp_field": "timestamp", + "index_field_presence": true, + "max_num_partitions": 100, + "mode": "dynamic", + "partition_key": "tenant_id", + "store_document_size": false, + "store_source": true, "tag_fields": [ "log_level", 
"tenant_id" ], - "partition_key": "tenant_id", - "max_num_partitions": 100, - "index_field_presence": true, - "store_document_size": false, - "store_source": true, + "timestamp_field": "timestamp", "tokenizers": [ { + "filters": [], "name": "custom_tokenizer", - "type": "regex", "pattern": "[^\\p{L}\\p{N}]+", - "filters": [] + "type": "regex" } ] }, + "index_id": "my-index", + "index_uri": "s3://quickwit-indexes/my-index", "indexing_settings": { "commit_timeout_secs": 301, - "docstore_compression_level": 8, "docstore_blocksize": 1000000, - "split_num_docs_target": 10000001, + "docstore_compression_level": 8, "merge_policy": { - "type": "stable_log", - "min_level_num_docs": 100000, - "merge_factor": 9, + "maturation_period": "2days", "max_merge_factor": 11, - "maturation_period": "2days" + "merge_factor": 9, + "min_level_num_docs": 100000, + "type": "stable_log" }, "resources": { "heap_size": "50.0 MB" - } + }, + "split_num_docs_target": 10000001 }, "ingest_settings": { "min_shards": 12 }, + "retention": { + "period": "90 days", + "schedule": "daily" + }, "search_settings": { "default_search_fields": [ "message" ] }, - "retention": { - "period": "90 days", - "schedule": "daily" - } - }, - "checkpoint": { - "kafka-source": { - "00000000000000000000": "00000000000000000042" - } + "version": "0.9" }, - "create_timestamp": 1789, + "index_uid": "my-index:00000000000000000000000001", "sources": [ { - "version": "0.9", - "source_id": "kafka-source", - "num_pipelines": 2, "enabled": true, - "source_type": "kafka", + "input_format": "json", + "num_pipelines": 2, "params": { - "topic": "kafka-topic", - "client_params": {} + "client_params": {}, + "topic": "kafka-topic" }, + "source_id": "kafka-source", + "source_type": "kafka", "transform": { "script": ".message = downcase(string!(.message))", "timezone": "UTC" }, - "input_format": "json" + "version": "0.9" + } + ], + "version": "0.9" + }, + "shards": { + "_ingest-source": [ + { + "doc_mapping_uid": 
"00000000000000000000000001", + "follower_id": "follower-ingester", + "index_uid": "my-index:00000000000000000000000001", + "leader_id": "leader-ingester", + "publish_position_inclusive": "", + "shard_id": "00000000000000000001", + "shard_state": 1, + "source_id": "_ingest-source", + "update_timestamp": 1724240908 } ] }, "splits": [ { - "split_state": "Published", - "update_timestamp": 1789, - "publish_timestamp": 1789, - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000001", - "partition_id": 7, - "source_id": "source", - "node_id": "node", - "num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": "00000000000000000000000000", + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": "my-index:00000000000000000000000001", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "publish_timestamp": 1789, + "soft_deleted_doc_ids": [], + "source_id": "source", + "split_id": "split", + "split_state": "Published", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": "00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "update_timestamp": 1789, + "version": "0.9" } ], - "shards": { - "_ingest-source": [ - { - "index_uid": "my-index:00000000000000000000000001", - "source_id": "_ingest-source", - "shard_id": "00000000000000000001", - "leader_id": "leader-ingester", - "follower_id": "follower-ingester", - "shard_state": 1, - "publish_position_inclusive": "", - "doc_mapping_uid": "00000000000000000000000001", - "update_timestamp": 1724240908 - } - ] - }, - 
"delete_tasks": [ - { - "create_timestamp": 0, - "opstamp": 10, - "delete_query": { - "index_uid": "my-index:00000000000000000000000001", - "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false}]}" - } - } - ] + "version": "0.9" } diff --git a/quickwit/quickwit-metastore/test-data/split-metadata/v0.7.expected.json b/quickwit/quickwit-metastore/test-data/split-metadata/v0.7.expected.json index 248baebc68e..fc54c8b931c 100644 --- a/quickwit/quickwit-metastore/test-data/split-metadata/v0.7.expected.json +++ b/quickwit/quickwit-metastore/test-data/split-metadata/v0.7.expected.json @@ -1,30 +1,31 @@ { - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000000", - "partition_id": 7, - "source_id": "source", - "node_id": "node", - "num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": "00000000000000000000000000", + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": "my-index:00000000000000000000000000", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "soft_deleted_doc_ids": [], + "source_id": "source", + "split_id": "split", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": "00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "version": "0.9" } diff --git 
a/quickwit/quickwit-metastore/test-data/split-metadata/v0.8.expected.json b/quickwit/quickwit-metastore/test-data/split-metadata/v0.8.expected.json index 248baebc68e..fc54c8b931c 100644 --- a/quickwit/quickwit-metastore/test-data/split-metadata/v0.8.expected.json +++ b/quickwit/quickwit-metastore/test-data/split-metadata/v0.8.expected.json @@ -1,30 +1,31 @@ { - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000000", - "partition_id": 7, - "source_id": "source", - "node_id": "node", - "num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": "00000000000000000000000000", + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": "my-index:00000000000000000000000000", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "soft_deleted_doc_ids": [], + "source_id": "source", + "split_id": "split", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": "00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "version": "0.9" } diff --git a/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.expected.json b/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.expected.json index 85bdfca81e0..3e2d37292d0 100644 --- a/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.expected.json +++ b/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.expected.json @@ -1,30 +1,31 @@ { - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000001", - "partition_id": 7, - "source_id": "source", - "node_id": "node", - 
"num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": "00000000000000000000000000", + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": "my-index:00000000000000000000000001", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "soft_deleted_doc_ids": [], + "source_id": "source", + "split_id": "split", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": "00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "version": "0.9" } diff --git a/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.json b/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.json index 85bdfca81e0..3e2d37292d0 100644 --- a/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.json +++ b/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.json @@ -1,30 +1,31 @@ { - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000001", - "partition_id": 7, - "source_id": "source", - "node_id": "node", - "num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": "00000000000000000000000000", + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": "my-index:00000000000000000000000001", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "soft_deleted_doc_ids": 
[], + "source_id": "source", + "split_id": "split", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": "00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "version": "0.9" } diff --git a/quickwit/quickwit-proto/protos/quickwit/control_plane.proto b/quickwit/quickwit-proto/protos/quickwit/control_plane.proto index d0850091280..6aa7d28d7da 100644 --- a/quickwit/quickwit-proto/protos/quickwit/control_plane.proto +++ b/quickwit/quickwit-proto/protos/quickwit/control_plane.proto @@ -69,6 +69,21 @@ service ControlPlaneService { // Performs a debounced shard pruning request to the metastore. rpc PruneShards(quickwit.metastore.PruneShardsRequest) returns (quickwit.metastore.EmptyResponse); + + // Swaps indexing pipelines of different indexes between different indexers. + rpc SwapIndexingPipelines(SwapIndexingPipelinesRequest) returns (SwapIndexingPipelinesResponse); + + // Maintenance Mode API + + // Enables maintenance mode on the cluster. When active, the indexing plan is frozen, + // metadata mutations (index/source CRUD) are accepted but the plan is not rebuilt, and shard scaling/rebalancing is paused. + rpc EnableMaintenanceMode(EnableMaintenanceModeRequest) returns (EnableMaintenanceModeResponse); + + // Disables maintenance mode. Triggers a full indexing plan rebuild to reconcile the cluster. + rpc DisableMaintenanceMode(DisableMaintenanceModeRequest) returns (DisableMaintenanceModeResponse); + + // Returns the current maintenance mode status. 
+ rpc GetMaintenanceMode(GetMaintenanceModeRequest) returns (GetMaintenanceModeResponse); } // Shard API @@ -125,3 +140,59 @@ message AdviseResetShardsResponse { repeated quickwit.ingest.ShardIds shards_to_delete = 1; repeated quickwit.ingest.ShardIdPositions shards_to_truncate = 2; } + +message SwapIndexingPipelinesRequest { + repeated SwapIndexingPipelinesEntry swaps = 1; +} + +message SwapIndexingPipelinesEntry { + string left_node_id = 1; + string left_index_id = 2; + string right_node_id = 3; + optional string right_index_id = 4; +} + +message SwapIndexingPipelinesResponse { + repeated SwapIndexingPipelinesResult results = 1; +} + +message SwapIndexingPipelinesResult { + SwapIndexingPipelinesEntry swap = 1; + bool success = 2; + // Human-readable reason when success is false. + string reason = 3; +} + +// Maintenance Mode API + +message EnableMaintenanceModeRequest { +} + +message EnableMaintenanceModeResponse { + // The frozen physical indexing plan serialized as JSON. + string frozen_plan_json = 1; +} + +message DisableMaintenanceModeRequest { +} + +message DisableMaintenanceModeResponse { +} + +message GetMaintenanceModeRequest { +} + +message GetMaintenanceModeResponse { + bool is_maintenance_mode = 1; + // If maintenance mode is active, the RFC 3339 datetime string when it was enabled. + optional string enabled_at = 2; +} + +message MaintenanceFrozenPlanForNode { + string index_id = 1; + repeated quickwit.indexing.IndexingTask indexing_tasks = 2; +} + +message MaintenanceFrozenPlan { + repeated MaintenanceFrozenPlanForNode state_per_node = 2; +} \ No newline at end of file diff --git a/quickwit/quickwit-proto/protos/quickwit/metastore.proto b/quickwit/quickwit-proto/protos/quickwit/metastore.proto index 00680da02d0..97a1287068c 100644 --- a/quickwit/quickwit-proto/protos/quickwit/metastore.proto +++ b/quickwit/quickwit-proto/protos/quickwit/metastore.proto @@ -125,6 +125,9 @@ service MetastoreService { // Deletes splits. 
rpc DeleteSplits(DeleteSplitsRequest) returns (EmptyResponse); + // Soft-deletes individual documents within published splits. + rpc SoftDeleteDocuments(SoftDeleteDocumentsRequest) returns (SoftDeleteDocumentsResponse); + // Adds a source. rpc AddSource(AddSourceRequest) returns (EmptyResponse); @@ -202,6 +205,17 @@ service MetastoreService { // Deletes index templates. rpc DeleteIndexTemplates(DeleteIndexTemplatesRequest) returns (EmptyResponse); + // Key-Value API (for cluster-wide settings like maintenance mode) + + // Gets a value by key from the cluster-wide key-value store. + rpc GetKv(GetKvRequest) returns (GetKvResponse); + + // Sets a key-value pair in the cluster-wide key-value store. + rpc SetKv(SetKvRequest) returns (EmptyResponse); + + // Deletes a key from the cluster-wide key-value store. + rpc DeleteKv(DeleteKvRequest) returns (EmptyResponse); + // Get cluster identity rpc GetClusterIdentity(GetClusterIdentityRequest) returns (GetClusterIdentityResponse); } @@ -348,6 +362,20 @@ message DeleteSplitsRequest { repeated string split_ids = 3; } +message SplitDocIds { + string split_id = 1; + repeated uint32 doc_ids = 2; +} + +message SoftDeleteDocumentsRequest { + quickwit.common.IndexUid index_uid = 1; + repeated SplitDocIds split_doc_ids = 2; +} + +message SoftDeleteDocumentsResponse { + uint64 num_soft_deleted_doc_ids = 1; +} + message AddSourceRequest { quickwit.common.IndexUid index_uid = 1; string source_config_json = 2; @@ -561,6 +589,26 @@ message DeleteIndexTemplatesRequest { repeated string template_ids = 1; } +// Key-Value API + +message GetKvRequest { + string key = 1; +} + +message GetKvResponse { + // Empty if the key does not exist. 
+ optional string value = 1; +} + +message SetKvRequest { + string key = 1; + string value = 2; +} + +message DeleteKvRequest { + string key = 1; +} + message GetClusterIdentityRequest { } diff --git a/quickwit/quickwit-proto/protos/quickwit/search.proto b/quickwit/quickwit-proto/protos/quickwit/search.proto index 7b543e9ed25..7cc168a858b 100644 --- a/quickwit/quickwit-proto/protos/quickwit/search.proto +++ b/quickwit/quickwit-proto/protos/quickwit/search.proto @@ -37,6 +37,11 @@ service SearchService { /// This methods takes `PartialHit`s and returns `Hit`s. rpc FetchDocs(FetchDocsRequest) returns (FetchDocsResponse); + // Streams document contents from the document store. + // This method takes `PartialHit`s and streams back `LeafHit`s in batches + // to avoid hitting gRPC message size limits. + rpc StreamFetchDocs(FetchDocsRequest) returns (stream FetchDocsResponse); + // Root list terms API. // This RPC identifies the set of splits on which the query should run on, // and dispatches the several calls to `LeafListTerms`. @@ -249,6 +254,9 @@ message SearchRequest { bool ignore_missing_indexes = 18; optional string split_id = 19; + + // The user agent of the client that initiated the search request. + optional string user_agent = 20; } enum CountHits { @@ -264,7 +272,7 @@ message SortField { SortOrder sort_order = 2; // Optional sort value format for datetime field only. // If none, the default output format for datetime field is - // unix_timestamp_nanos. + // unix_timestamp_millis. optional SortDatetimeFormat sort_datetime_format = 3; } @@ -292,8 +300,8 @@ message SearchResponse { // server-side and expressed in microseconds. uint64 elapsed_time_micros = 3; - // The searcherrors that occurred formatted as string. - repeated string errors = 4; + // deprecated `errors` field + reserved 4; // used to be json-encoded aggregation reserved 5; @@ -314,6 +322,9 @@ message SearchResponse { // Total number of successful splits searched. 
uint64 num_successful_splits = 8; + + // Statistics on the split outcomes + SplitsByOutcome splits_by_outcome = 10; } message SearchPlanResponse { @@ -351,6 +362,24 @@ message LeafSearchRequest { repeated string index_uris = 9; } +// Split outcome counters +message SplitsByOutcome { + uint64 pruned_before_warmup = 1; + uint64 pruned_after_warmup = 2; + // Cancelled before warmup started (error or timeout) + uint64 cancel_before_warmup = 3; + uint64 processed = 4; + uint64 processed_from_metadata = 5; + // Resolved by the partial request cache + uint64 cache_hit = 6; + // Cancelled during warmup (error or timeout) + uint64 cancel_warmup = 7; + // Cancelled while waiting in the CPU thread pool queue + uint64 cancel_cpu_queue = 8; + // Cancelled during CPU processing (error or timeout) + uint64 cancel_cpu = 9; +} + message ResourceStats { uint64 short_lived_cache_num_bytes = 1; uint64 split_num_docs = 2; @@ -386,6 +415,8 @@ message SplitIdAndFooterOffsets { optional int64 timestamp_end = 5; // The number of docs in the split uint64 num_docs = 6; + // Tantivy doc IDs that have been soft-deleted from this split + repeated uint32 soft_deleted_doc_ids = 7; } // Hits returned by a FetchDocRequest. @@ -461,9 +492,11 @@ message SortByValue { int64 i64 = 2; double f64 = 3; bool boolean = 4; + string str = 5; + int64 datetime = 6; } // Room for eventual future sorted key types. - reserved 5 to 20; + reserved 7 to 20; } message LeafSearchResponse { @@ -493,6 +526,9 @@ message LeafSearchResponse { optional bytes intermediate_aggregation_result = 6; ResourceStats resource_stats = 8; + + // Split outcome counters for all splits targeted by this leaf request. 
+ optional SplitsByOutcome splits_by_outcome = 9; } message SnippetRequest { diff --git a/quickwit/quickwit-proto/src/codegen/jaeger/opentelemetry.proto.trace.v1.rs b/quickwit/quickwit-proto/src/codegen/jaeger/opentelemetry.proto.trace.v1.rs index 6736d97c7e2..afa08ca3c9d 100644 --- a/quickwit/quickwit-proto/src/codegen/jaeger/opentelemetry.proto.trace.v1.rs +++ b/quickwit/quickwit-proto/src/codegen/jaeger/opentelemetry.proto.trace.v1.rs @@ -120,10 +120,12 @@ pub struct Span { /// attributes is a collection of key/value pairs. Note, global attributes /// like server name can be set using the resource API. Examples of attributes: /// - /// "/http/user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36" - /// "/http/server_latency": 300 - /// "abc.com/myattribute": true - /// "abc.com/score": 10.239 + /// ```text + /// "/http/user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36" + /// "/http/server_latency": 300 + /// "abc.com/myattribute": true + /// "abc.com/score": 10.239 + /// ``` /// /// The OpenTelemetry API specification further restricts the allowed value types: /// @@ -276,7 +278,7 @@ pub mod span { } /// The Status type defines a logical error model that is suitable for different /// programming environments, including REST APIs and RPC APIs. -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct Status { /// A developer-facing human readable error message. 
#[prost(string, tag = "2")] diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.cluster.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.cluster.rs index f326b8e7c29..e31a341e359 100644 --- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.cluster.rs +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.cluster.rs @@ -278,7 +278,7 @@ type FetchClusterStateLayer = quickwit_common::tower::BoxLayer< >; #[derive(Debug, Default)] pub struct ClusterServiceTowerLayerStack { - fetch_cluster_state_layers: Vec, + pub fetch_cluster_state_layers: Vec, } impl ClusterServiceTowerLayerStack { pub fn stack_layer(mut self, layer: L) -> Self diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.control_plane.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.control_plane.rs index 09cfbdebf58..dbd6f06e38e 100644 --- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.control_plane.rs +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.control_plane.rs @@ -73,6 +73,83 @@ pub struct AdviseResetShardsResponse { pub shards_to_truncate: ::prost::alloc::vec::Vec, } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct SwapIndexingPipelinesRequest { + #[prost(message, repeated, tag = "1")] + pub swaps: ::prost::alloc::vec::Vec, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct SwapIndexingPipelinesEntry { + #[prost(string, tag = "1")] + pub left_node_id: ::prost::alloc::string::String, + #[prost(string, tag = "2")] + pub left_index_id: ::prost::alloc::string::String, + #[prost(string, tag = "3")] + pub right_node_id: ::prost::alloc::string::String, + #[prost(string, optional, tag = "4")] + pub right_index_id: ::core::option::Option<::prost::alloc::string::String>, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, 
::prost::Message)] +pub struct SwapIndexingPipelinesResponse { + #[prost(message, repeated, tag = "1")] + pub results: ::prost::alloc::vec::Vec, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct SwapIndexingPipelinesResult { + #[prost(message, optional, tag = "1")] + pub swap: ::core::option::Option, + #[prost(bool, tag = "2")] + pub success: bool, + /// Human-readable reason when success is false. + #[prost(string, tag = "3")] + pub reason: ::prost::alloc::string::String, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] +pub struct EnableMaintenanceModeRequest {} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct EnableMaintenanceModeResponse { + /// The frozen physical indexing plan serialized as JSON. + #[prost(string, tag = "1")] + pub frozen_plan_json: ::prost::alloc::string::String, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] +pub struct DisableMaintenanceModeRequest {} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] +pub struct DisableMaintenanceModeResponse {} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] +pub struct GetMaintenanceModeRequest {} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct GetMaintenanceModeResponse { + #[prost(bool, tag = "1")] + pub is_maintenance_mode: bool, + /// If maintenance mode is active, the RFC 3339 datetime string when it was enabled. 
+ #[prost(string, optional, tag = "2")] + pub enabled_at: ::core::option::Option<::prost::alloc::string::String>, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct MaintenanceFrozenPlanForNode { + #[prost(string, tag = "1")] + pub index_id: ::prost::alloc::string::String, + #[prost(message, repeated, tag = "2")] + pub indexing_tasks: ::prost::alloc::vec::Vec, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct MaintenanceFrozenPlan { + #[prost(message, repeated, tag = "2")] + pub state_per_node: ::prost::alloc::vec::Vec, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[serde(rename_all = "snake_case")] #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] @@ -180,6 +257,27 @@ pub trait ControlPlaneService: std::fmt::Debug + Send + Sync + 'static { &self, request: super::metastore::PruneShardsRequest, ) -> crate::control_plane::ControlPlaneResult; + ///Swaps indexing pipelines of different indexes between different indexers. + async fn swap_indexing_pipelines( + &self, + request: SwapIndexingPipelinesRequest, + ) -> crate::control_plane::ControlPlaneResult; + ///Enables maintenance mode on the cluster. When active, the indexing plan is frozen, + ///metadata mutations (index/source CRUD) are accepted but the plan is not rebuilt, and shard scaling/rebalancing is paused. + async fn enable_maintenance_mode( + &self, + request: EnableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult; + ///Disables maintenance mode. Triggers a full indexing plan rebuild to reconcile the cluster. + async fn disable_maintenance_mode( + &self, + request: DisableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult; + ///Returns the current maintenance mode status. 
+ async fn get_maintenance_mode( + &self, + request: GetMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult; } #[derive(Debug, Clone)] pub struct ControlPlaneServiceClient { @@ -352,6 +450,30 @@ impl ControlPlaneService for ControlPlaneServiceClient { ) -> crate::control_plane::ControlPlaneResult { self.inner.0.prune_shards(request).await } + async fn swap_indexing_pipelines( + &self, + request: SwapIndexingPipelinesRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.inner.0.swap_indexing_pipelines(request).await + } + async fn enable_maintenance_mode( + &self, + request: EnableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.inner.0.enable_maintenance_mode(request).await + } + async fn disable_maintenance_mode( + &self, + request: DisableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.inner.0.disable_maintenance_mode(request).await + } + async fn get_maintenance_mode( + &self, + request: GetMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.inner.0.get_maintenance_mode(request).await + } } #[cfg(any(test, feature = "testsuite"))] pub mod mock_control_plane_service { @@ -440,6 +562,38 @@ pub mod mock_control_plane_service { > { self.inner.lock().await.prune_shards(request).await } + async fn swap_indexing_pipelines( + &self, + request: super::SwapIndexingPipelinesRequest, + ) -> crate::control_plane::ControlPlaneResult< + super::SwapIndexingPipelinesResponse, + > { + self.inner.lock().await.swap_indexing_pipelines(request).await + } + async fn enable_maintenance_mode( + &self, + request: super::EnableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult< + super::EnableMaintenanceModeResponse, + > { + self.inner.lock().await.enable_maintenance_mode(request).await + } + async fn disable_maintenance_mode( + &self, + request: super::DisableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult< + 
super::DisableMaintenanceModeResponse, + > { + self.inner.lock().await.disable_maintenance_mode(request).await + } + async fn get_maintenance_mode( + &self, + request: super::GetMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult< + super::GetMaintenanceModeResponse, + > { + self.inner.lock().await.get_maintenance_mode(request).await + } } } pub type BoxFuture = std::pin::Pin< @@ -613,6 +767,70 @@ for InnerControlPlaneServiceClient { Box::pin(fut) } } +impl tower::Service for InnerControlPlaneServiceClient { + type Response = SwapIndexingPipelinesResponse; + type Error = crate::control_plane::ControlPlaneError; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + std::task::Poll::Ready(Ok(())) + } + fn call(&mut self, request: SwapIndexingPipelinesRequest) -> Self::Future { + let svc = self.clone(); + let fut = async move { svc.0.swap_indexing_pipelines(request).await }; + Box::pin(fut) + } +} +impl tower::Service for InnerControlPlaneServiceClient { + type Response = EnableMaintenanceModeResponse; + type Error = crate::control_plane::ControlPlaneError; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + std::task::Poll::Ready(Ok(())) + } + fn call(&mut self, request: EnableMaintenanceModeRequest) -> Self::Future { + let svc = self.clone(); + let fut = async move { svc.0.enable_maintenance_mode(request).await }; + Box::pin(fut) + } +} +impl tower::Service for InnerControlPlaneServiceClient { + type Response = DisableMaintenanceModeResponse; + type Error = crate::control_plane::ControlPlaneError; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + std::task::Poll::Ready(Ok(())) + } + fn call(&mut self, request: DisableMaintenanceModeRequest) -> Self::Future { + let svc = self.clone(); + let fut = async move { 
svc.0.disable_maintenance_mode(request).await }; + Box::pin(fut) + } +} +impl tower::Service for InnerControlPlaneServiceClient { + type Response = GetMaintenanceModeResponse; + type Error = crate::control_plane::ControlPlaneError; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + std::task::Poll::Ready(Ok(())) + } + fn call(&mut self, request: GetMaintenanceModeRequest) -> Self::Future { + let svc = self.clone(); + let fut = async move { svc.0.get_maintenance_mode(request).await }; + Box::pin(fut) + } +} /// A tower service stack is a set of tower services. #[derive(Debug)] struct ControlPlaneServiceTowerServiceStack { @@ -668,6 +886,26 @@ struct ControlPlaneServiceTowerServiceStack { super::metastore::EmptyResponse, crate::control_plane::ControlPlaneError, >, + swap_indexing_pipelines_svc: quickwit_common::tower::BoxService< + SwapIndexingPipelinesRequest, + SwapIndexingPipelinesResponse, + crate::control_plane::ControlPlaneError, + >, + enable_maintenance_mode_svc: quickwit_common::tower::BoxService< + EnableMaintenanceModeRequest, + EnableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + disable_maintenance_mode_svc: quickwit_common::tower::BoxService< + DisableMaintenanceModeRequest, + DisableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + get_maintenance_mode_svc: quickwit_common::tower::BoxService< + GetMaintenanceModeRequest, + GetMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, } #[async_trait::async_trait] impl ControlPlaneService for ControlPlaneServiceTowerServiceStack { @@ -735,6 +973,30 @@ impl ControlPlaneService for ControlPlaneServiceTowerServiceStack { ) -> crate::control_plane::ControlPlaneResult { self.prune_shards_svc.clone().ready().await?.call(request).await } + async fn swap_indexing_pipelines( + &self, + request: SwapIndexingPipelinesRequest, + ) -> crate::control_plane::ControlPlaneResult { + 
self.swap_indexing_pipelines_svc.clone().ready().await?.call(request).await + } + async fn enable_maintenance_mode( + &self, + request: EnableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.enable_maintenance_mode_svc.clone().ready().await?.call(request).await + } + async fn disable_maintenance_mode( + &self, + request: DisableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.disable_maintenance_mode_svc.clone().ready().await?.call(request).await + } + async fn get_maintenance_mode( + &self, + request: GetMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.get_maintenance_mode_svc.clone().ready().await?.call(request).await + } } type CreateIndexLayer = quickwit_common::tower::BoxLayer< quickwit_common::tower::BoxService< @@ -836,18 +1098,62 @@ type PruneShardsLayer = quickwit_common::tower::BoxLayer< super::metastore::EmptyResponse, crate::control_plane::ControlPlaneError, >; +type SwapIndexingPipelinesLayer = quickwit_common::tower::BoxLayer< + quickwit_common::tower::BoxService< + SwapIndexingPipelinesRequest, + SwapIndexingPipelinesResponse, + crate::control_plane::ControlPlaneError, + >, + SwapIndexingPipelinesRequest, + SwapIndexingPipelinesResponse, + crate::control_plane::ControlPlaneError, +>; +type EnableMaintenanceModeLayer = quickwit_common::tower::BoxLayer< + quickwit_common::tower::BoxService< + EnableMaintenanceModeRequest, + EnableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + EnableMaintenanceModeRequest, + EnableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, +>; +type DisableMaintenanceModeLayer = quickwit_common::tower::BoxLayer< + quickwit_common::tower::BoxService< + DisableMaintenanceModeRequest, + DisableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + DisableMaintenanceModeRequest, + DisableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, +>; +type 
GetMaintenanceModeLayer = quickwit_common::tower::BoxLayer< + quickwit_common::tower::BoxService< + GetMaintenanceModeRequest, + GetMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + GetMaintenanceModeRequest, + GetMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, +>; #[derive(Debug, Default)] pub struct ControlPlaneServiceTowerLayerStack { - create_index_layers: Vec, - update_index_layers: Vec, - delete_index_layers: Vec, - add_source_layers: Vec, - update_source_layers: Vec, - toggle_source_layers: Vec, - delete_source_layers: Vec, - get_or_create_open_shards_layers: Vec, - advise_reset_shards_layers: Vec, - prune_shards_layers: Vec, + pub create_index_layers: Vec, + pub update_index_layers: Vec, + pub delete_index_layers: Vec, + pub add_source_layers: Vec, + pub update_source_layers: Vec, + pub toggle_source_layers: Vec, + pub delete_source_layers: Vec, + pub get_or_create_open_shards_layers: Vec, + pub advise_reset_shards_layers: Vec, + pub prune_shards_layers: Vec, + pub swap_indexing_pipelines_layers: Vec, + pub enable_maintenance_mode_layers: Vec, + pub disable_maintenance_mode_layers: Vec, + pub get_maintenance_mode_layers: Vec, } impl ControlPlaneServiceTowerLayerStack { pub fn stack_layer(mut self, layer: L) -> Self @@ -1120,6 +1426,114 @@ impl ControlPlaneServiceTowerLayerStack { >>::Service as tower::Service< super::metastore::PruneShardsRequest, >>::Future: Send + 'static, + L: tower::Layer< + quickwit_common::tower::BoxService< + SwapIndexingPipelinesRequest, + SwapIndexingPipelinesResponse, + crate::control_plane::ControlPlaneError, + >, + > + Clone + Send + Sync + 'static, + , + >>::Service: tower::Service< + SwapIndexingPipelinesRequest, + Response = SwapIndexingPipelinesResponse, + Error = crate::control_plane::ControlPlaneError, + > + Clone + Send + Sync + 'static, + <, + >>::Service as tower::Service< + SwapIndexingPipelinesRequest, + >>::Future: Send + 'static, + L: tower::Layer< + 
quickwit_common::tower::BoxService< + EnableMaintenanceModeRequest, + EnableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + > + Clone + Send + Sync + 'static, + , + >>::Service: tower::Service< + EnableMaintenanceModeRequest, + Response = EnableMaintenanceModeResponse, + Error = crate::control_plane::ControlPlaneError, + > + Clone + Send + Sync + 'static, + <, + >>::Service as tower::Service< + EnableMaintenanceModeRequest, + >>::Future: Send + 'static, + L: tower::Layer< + quickwit_common::tower::BoxService< + DisableMaintenanceModeRequest, + DisableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + > + Clone + Send + Sync + 'static, + , + >>::Service: tower::Service< + DisableMaintenanceModeRequest, + Response = DisableMaintenanceModeResponse, + Error = crate::control_plane::ControlPlaneError, + > + Clone + Send + Sync + 'static, + <, + >>::Service as tower::Service< + DisableMaintenanceModeRequest, + >>::Future: Send + 'static, + L: tower::Layer< + quickwit_common::tower::BoxService< + GetMaintenanceModeRequest, + GetMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + > + Clone + Send + Sync + 'static, + , + >>::Service: tower::Service< + GetMaintenanceModeRequest, + Response = GetMaintenanceModeResponse, + Error = crate::control_plane::ControlPlaneError, + > + Clone + Send + Sync + 'static, + <, + >>::Service as tower::Service< + GetMaintenanceModeRequest, + >>::Future: Send + 'static, { self.create_index_layers .push(quickwit_common::tower::BoxLayer::new(layer.clone())); @@ -1141,6 +1555,14 @@ impl ControlPlaneServiceTowerLayerStack { .push(quickwit_common::tower::BoxLayer::new(layer.clone())); self.prune_shards_layers .push(quickwit_common::tower::BoxLayer::new(layer.clone())); + self.swap_indexing_pipelines_layers + .push(quickwit_common::tower::BoxLayer::new(layer.clone())); + self.enable_maintenance_mode_layers + .push(quickwit_common::tower::BoxLayer::new(layer.clone())); + 
self.disable_maintenance_mode_layers + .push(quickwit_common::tower::BoxLayer::new(layer.clone())); + self.get_maintenance_mode_layers + .push(quickwit_common::tower::BoxLayer::new(layer.clone())); self } pub fn stack_create_index_layer(mut self, layer: L) -> Self @@ -1353,28 +1775,116 @@ impl ControlPlaneServiceTowerLayerStack { self.prune_shards_layers.push(quickwit_common::tower::BoxLayer::new(layer)); self } - pub fn build(self, instance: T) -> ControlPlaneServiceClient + pub fn stack_swap_indexing_pipelines_layer(mut self, layer: L) -> Self where - T: ControlPlaneService, + L: tower::Layer< + quickwit_common::tower::BoxService< + SwapIndexingPipelinesRequest, + SwapIndexingPipelinesResponse, + crate::control_plane::ControlPlaneError, + >, + > + Send + Sync + 'static, + L::Service: tower::Service< + SwapIndexingPipelinesRequest, + Response = SwapIndexingPipelinesResponse, + Error = crate::control_plane::ControlPlaneError, + > + Clone + Send + Sync + 'static, + >::Future: Send + 'static, { - let inner_client = InnerControlPlaneServiceClient(std::sync::Arc::new(instance)); - self.build_from_inner_client(inner_client) - } - pub fn build_from_channel( - self, - addr: std::net::SocketAddr, - channel: tonic::transport::Channel, - max_message_size: bytesize::ByteSize, - compression_encoding_opt: Option, - ) -> ControlPlaneServiceClient { - let client = ControlPlaneServiceClient::from_channel( - addr, - channel, - max_message_size, - compression_encoding_opt, - ); - let inner_client = client.inner; - self.build_from_inner_client(inner_client) + self.swap_indexing_pipelines_layers + .push(quickwit_common::tower::BoxLayer::new(layer)); + self + } + pub fn stack_enable_maintenance_mode_layer(mut self, layer: L) -> Self + where + L: tower::Layer< + quickwit_common::tower::BoxService< + EnableMaintenanceModeRequest, + EnableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + > + Send + Sync + 'static, + L::Service: tower::Service< + 
EnableMaintenanceModeRequest, + Response = EnableMaintenanceModeResponse, + Error = crate::control_plane::ControlPlaneError, + > + Clone + Send + Sync + 'static, + >::Future: Send + 'static, + { + self.enable_maintenance_mode_layers + .push(quickwit_common::tower::BoxLayer::new(layer)); + self + } + pub fn stack_disable_maintenance_mode_layer(mut self, layer: L) -> Self + where + L: tower::Layer< + quickwit_common::tower::BoxService< + DisableMaintenanceModeRequest, + DisableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + > + Send + Sync + 'static, + L::Service: tower::Service< + DisableMaintenanceModeRequest, + Response = DisableMaintenanceModeResponse, + Error = crate::control_plane::ControlPlaneError, + > + Clone + Send + Sync + 'static, + >::Future: Send + 'static, + { + self.disable_maintenance_mode_layers + .push(quickwit_common::tower::BoxLayer::new(layer)); + self + } + pub fn stack_get_maintenance_mode_layer(mut self, layer: L) -> Self + where + L: tower::Layer< + quickwit_common::tower::BoxService< + GetMaintenanceModeRequest, + GetMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + > + Send + Sync + 'static, + L::Service: tower::Service< + GetMaintenanceModeRequest, + Response = GetMaintenanceModeResponse, + Error = crate::control_plane::ControlPlaneError, + > + Clone + Send + Sync + 'static, + >::Future: Send + 'static, + { + self.get_maintenance_mode_layers + .push(quickwit_common::tower::BoxLayer::new(layer)); + self + } + pub fn build(self, instance: T) -> ControlPlaneServiceClient + where + T: ControlPlaneService, + { + let inner_client = InnerControlPlaneServiceClient(std::sync::Arc::new(instance)); + self.build_from_inner_client(inner_client) + } + pub fn build_from_channel( + self, + addr: std::net::SocketAddr, + channel: tonic::transport::Channel, + max_message_size: bytesize::ByteSize, + compression_encoding_opt: Option, + ) -> ControlPlaneServiceClient { + let client = 
ControlPlaneServiceClient::from_channel( + addr, + channel, + max_message_size, + compression_encoding_opt, + ); + let inner_client = client.inner; + self.build_from_inner_client(inner_client) } pub fn build_from_balance_channel( self, @@ -1496,6 +2006,38 @@ impl ControlPlaneServiceTowerLayerStack { quickwit_common::tower::BoxService::new(inner_client.clone()), |svc, layer| layer.layer(svc), ); + let swap_indexing_pipelines_svc = self + .swap_indexing_pipelines_layers + .into_iter() + .rev() + .fold( + quickwit_common::tower::BoxService::new(inner_client.clone()), + |svc, layer| layer.layer(svc), + ); + let enable_maintenance_mode_svc = self + .enable_maintenance_mode_layers + .into_iter() + .rev() + .fold( + quickwit_common::tower::BoxService::new(inner_client.clone()), + |svc, layer| layer.layer(svc), + ); + let disable_maintenance_mode_svc = self + .disable_maintenance_mode_layers + .into_iter() + .rev() + .fold( + quickwit_common::tower::BoxService::new(inner_client.clone()), + |svc, layer| layer.layer(svc), + ); + let get_maintenance_mode_svc = self + .get_maintenance_mode_layers + .into_iter() + .rev() + .fold( + quickwit_common::tower::BoxService::new(inner_client.clone()), + |svc, layer| layer.layer(svc), + ); let tower_svc_stack = ControlPlaneServiceTowerServiceStack { inner: inner_client, create_index_svc, @@ -1508,6 +2050,10 @@ impl ControlPlaneServiceTowerLayerStack { get_or_create_open_shards_svc, advise_reset_shards_svc, prune_shards_svc, + swap_indexing_pipelines_svc, + enable_maintenance_mode_svc, + disable_maintenance_mode_svc, + get_maintenance_mode_svc, }; ControlPlaneServiceClient::new(tower_svc_stack) } @@ -1673,6 +2219,42 @@ where super::metastore::EmptyResponse, crate::control_plane::ControlPlaneError, >, + > + + tower::Service< + SwapIndexingPipelinesRequest, + Response = SwapIndexingPipelinesResponse, + Error = crate::control_plane::ControlPlaneError, + Future = BoxFuture< + SwapIndexingPipelinesResponse, + 
crate::control_plane::ControlPlaneError, + >, + > + + tower::Service< + EnableMaintenanceModeRequest, + Response = EnableMaintenanceModeResponse, + Error = crate::control_plane::ControlPlaneError, + Future = BoxFuture< + EnableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + > + + tower::Service< + DisableMaintenanceModeRequest, + Response = DisableMaintenanceModeResponse, + Error = crate::control_plane::ControlPlaneError, + Future = BoxFuture< + DisableMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, + > + + tower::Service< + GetMaintenanceModeRequest, + Response = GetMaintenanceModeResponse, + Error = crate::control_plane::ControlPlaneError, + Future = BoxFuture< + GetMaintenanceModeResponse, + crate::control_plane::ControlPlaneError, + >, >, { async fn create_index( @@ -1739,6 +2321,30 @@ where ) -> crate::control_plane::ControlPlaneResult { self.clone().call(request).await } + async fn swap_indexing_pipelines( + &self, + request: SwapIndexingPipelinesRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.clone().call(request).await + } + async fn enable_maintenance_mode( + &self, + request: EnableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.clone().call(request).await + } + async fn disable_maintenance_mode( + &self, + request: DisableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.clone().call(request).await + } + async fn get_maintenance_mode( + &self, + request: GetMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.clone().call(request).await + } } #[derive(Debug, Clone)] pub struct ControlPlaneServiceGrpcClientAdapter { @@ -1918,6 +2524,62 @@ where super::metastore::PruneShardsRequest::rpc_name(), )) } + async fn swap_indexing_pipelines( + &self, + request: SwapIndexingPipelinesRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.inner + .clone() + .swap_indexing_pipelines(request) + 
.await + .map(|response| response.into_inner()) + .map_err(|status| crate::error::grpc_status_to_service_error( + status, + SwapIndexingPipelinesRequest::rpc_name(), + )) + } + async fn enable_maintenance_mode( + &self, + request: EnableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.inner + .clone() + .enable_maintenance_mode(request) + .await + .map(|response| response.into_inner()) + .map_err(|status| crate::error::grpc_status_to_service_error( + status, + EnableMaintenanceModeRequest::rpc_name(), + )) + } + async fn disable_maintenance_mode( + &self, + request: DisableMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.inner + .clone() + .disable_maintenance_mode(request) + .await + .map(|response| response.into_inner()) + .map_err(|status| crate::error::grpc_status_to_service_error( + status, + DisableMaintenanceModeRequest::rpc_name(), + )) + } + async fn get_maintenance_mode( + &self, + request: GetMaintenanceModeRequest, + ) -> crate::control_plane::ControlPlaneResult { + self.inner + .clone() + .get_maintenance_mode(request) + .await + .map(|response| response.into_inner()) + .map_err(|status| crate::error::grpc_status_to_service_error( + status, + GetMaintenanceModeRequest::rpc_name(), + )) + } } #[derive(Debug)] pub struct ControlPlaneServiceGrpcServerAdapter { @@ -2049,6 +2711,50 @@ for ControlPlaneServiceGrpcServerAdapter { .map(tonic::Response::new) .map_err(crate::error::grpc_error_to_grpc_status) } + async fn swap_indexing_pipelines( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + self.inner + .0 + .swap_indexing_pipelines(request.into_inner()) + .await + .map(tonic::Response::new) + .map_err(crate::error::grpc_error_to_grpc_status) + } + async fn enable_maintenance_mode( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + self.inner + .0 + .enable_maintenance_mode(request.into_inner()) + .await + .map(tonic::Response::new) + 
.map_err(crate::error::grpc_error_to_grpc_status) + } + async fn disable_maintenance_mode( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + self.inner + .0 + .disable_maintenance_mode(request.into_inner()) + .await + .map(tonic::Response::new) + .map_err(crate::error::grpc_error_to_grpc_status) + } + async fn get_maintenance_mode( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + self.inner + .0 + .get_maintenance_mode(request.into_inner()) + .await + .map(tonic::Response::new) + .map_err(crate::error::grpc_error_to_grpc_status) + } } /// Generated client implementations. pub mod control_plane_service_grpc_client { @@ -2450,6 +3156,127 @@ pub mod control_plane_service_grpc_client { ); self.inner.unary(req, path, codec).await } + /// Swaps indexing pipelines of different indexes between different indexers. + pub async fn swap_indexing_pipelines( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic_prost::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/quickwit.control_plane.ControlPlaneService/SwapIndexingPipelines", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new( + "quickwit.control_plane.ControlPlaneService", + "SwapIndexingPipelines", + ), + ); + self.inner.unary(req, path, codec).await + } + /// Enables maintenance mode on the cluster. When active, the indexing plan is frozen, + /// metadata mutations (index/source CRUD) are accepted but the plan is not rebuilt, and shard scaling/rebalancing is paused. 
+ pub async fn enable_maintenance_mode( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic_prost::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/quickwit.control_plane.ControlPlaneService/EnableMaintenanceMode", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new( + "quickwit.control_plane.ControlPlaneService", + "EnableMaintenanceMode", + ), + ); + self.inner.unary(req, path, codec).await + } + /// Disables maintenance mode. Triggers a full indexing plan rebuild to reconcile the cluster. + pub async fn disable_maintenance_mode( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic_prost::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/quickwit.control_plane.ControlPlaneService/DisableMaintenanceMode", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new( + "quickwit.control_plane.ControlPlaneService", + "DisableMaintenanceMode", + ), + ); + self.inner.unary(req, path, codec).await + } + /// Returns the current maintenance mode status. 
+ pub async fn get_maintenance_mode( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic_prost::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/quickwit.control_plane.ControlPlaneService/GetMaintenanceMode", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new( + "quickwit.control_plane.ControlPlaneService", + "GetMaintenanceMode", + ), + ); + self.inner.unary(req, path, codec).await + } } } /// Generated server implementations. @@ -2546,6 +3373,39 @@ pub mod control_plane_service_grpc_server { tonic::Response, tonic::Status, >; + /// Swaps indexing pipelines of different indexes between different indexers. + async fn swap_indexing_pipelines( + &self, + request: tonic::Request, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; + /// Enables maintenance mode on the cluster. When active, the indexing plan is frozen, + /// metadata mutations (index/source CRUD) are accepted but the plan is not rebuilt, and shard scaling/rebalancing is paused. + async fn enable_maintenance_mode( + &self, + request: tonic::Request, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; + /// Disables maintenance mode. Triggers a full indexing plan rebuild to reconcile the cluster. + async fn disable_maintenance_mode( + &self, + request: tonic::Request, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; + /// Returns the current maintenance mode status. 
+ async fn get_maintenance_mode( + &self, + request: tonic::Request, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; } #[derive(Debug)] pub struct ControlPlaneServiceGrpcServer { @@ -3137,6 +3997,208 @@ pub mod control_plane_service_grpc_server { }; Box::pin(fut) } + "/quickwit.control_plane.ControlPlaneService/SwapIndexingPipelines" => { + #[allow(non_camel_case_types)] + struct SwapIndexingPipelinesSvc( + pub Arc, + ); + impl< + T: ControlPlaneServiceGrpc, + > tonic::server::UnaryService + for SwapIndexingPipelinesSvc { + type Response = super::SwapIndexingPipelinesResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::swap_indexing_pipelines( + &inner, + request, + ) + .await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = SwapIndexingPipelinesSvc(inner); + let codec = tonic_prost::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + "/quickwit.control_plane.ControlPlaneService/EnableMaintenanceMode" => { + #[allow(non_camel_case_types)] + struct EnableMaintenanceModeSvc( + pub Arc, + ); + impl< + T: ControlPlaneServiceGrpc, + > tonic::server::UnaryService + for EnableMaintenanceModeSvc { + type Response = super::EnableMaintenanceModeResponse; + type Future = BoxFuture< + 
tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::enable_maintenance_mode( + &inner, + request, + ) + .await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = EnableMaintenanceModeSvc(inner); + let codec = tonic_prost::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + "/quickwit.control_plane.ControlPlaneService/DisableMaintenanceMode" => { + #[allow(non_camel_case_types)] + struct DisableMaintenanceModeSvc( + pub Arc, + ); + impl< + T: ControlPlaneServiceGrpc, + > tonic::server::UnaryService + for DisableMaintenanceModeSvc { + type Response = super::DisableMaintenanceModeResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::disable_maintenance_mode( + &inner, + request, + ) + .await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = 
DisableMaintenanceModeSvc(inner); + let codec = tonic_prost::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + "/quickwit.control_plane.ControlPlaneService/GetMaintenanceMode" => { + #[allow(non_camel_case_types)] + struct GetMaintenanceModeSvc(pub Arc); + impl< + T: ControlPlaneServiceGrpc, + > tonic::server::UnaryService + for GetMaintenanceModeSvc { + type Response = super::GetMaintenanceModeResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::get_maintenance_mode( + &inner, + request, + ) + .await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = GetMaintenanceModeSvc(inner); + let codec = tonic_prost::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } _ => { Box::pin(async move { let mut response = http::Response::new( diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.developer.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.developer.rs index 38520a259ec..c0816e33d1f 100644 --- 
a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.developer.rs +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.developer.rs @@ -212,7 +212,7 @@ type GetDebugInfoLayer = quickwit_common::tower::BoxLayer< >; #[derive(Debug, Default)] pub struct DeveloperServiceTowerLayerStack { - get_debug_info_layers: Vec, + pub get_debug_info_layers: Vec, } impl DeveloperServiceTowerLayerStack { pub fn stack_layer(mut self, layer: L) -> Self diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.indexing.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.indexing.rs index c22235f36ec..417bf73f7ae 100644 --- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.indexing.rs +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.indexing.rs @@ -223,7 +223,7 @@ type ApplyIndexingPlanLayer = quickwit_common::tower::BoxLayer< >; #[derive(Debug, Default)] pub struct IndexingServiceTowerLayerStack { - apply_indexing_plan_layers: Vec, + pub apply_indexing_plan_layers: Vec, } impl IndexingServiceTowerLayerStack { pub fn stack_layer(mut self, layer: L) -> Self diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs index 018e19a39a9..97de2edf742 100644 --- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs @@ -1199,15 +1199,15 @@ type DecommissionLayer = quickwit_common::tower::BoxLayer< >; #[derive(Debug, Default)] pub struct IngesterServiceTowerLayerStack { - persist_layers: Vec, - open_replication_stream_layers: Vec, - open_fetch_stream_layers: Vec, - open_observation_stream_layers: Vec, - init_shards_layers: Vec, - retain_shards_layers: Vec, - truncate_shards_layers: Vec, - close_shards_layers: Vec, - decommission_layers: Vec, + pub persist_layers: Vec, + pub open_replication_stream_layers: Vec, + pub open_fetch_stream_layers: Vec, + pub 
open_observation_stream_layers: Vec, + pub init_shards_layers: Vec, + pub retain_shards_layers: Vec, + pub truncate_shards_layers: Vec, + pub close_shards_layers: Vec, + pub decommission_layers: Vec, } impl IngesterServiceTowerLayerStack { pub fn stack_layer(mut self, layer: L) -> Self diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.router.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.router.rs index f502783e038..0bf8948cf81 100644 --- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.router.rs +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.router.rs @@ -321,7 +321,7 @@ type IngestLayer = quickwit_common::tower::BoxLayer< >; #[derive(Debug, Default)] pub struct IngestRouterServiceTowerLayerStack { - ingest_layers: Vec, + pub ingest_layers: Vec, } impl IngestRouterServiceTowerLayerStack { pub fn stack_layer(mut self, layer: L) -> Self diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.metastore.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.metastore.rs index ab6d1ddc236..c77b5ef2d9f 100644 --- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.metastore.rs +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.metastore.rs @@ -210,6 +210,28 @@ pub struct DeleteSplitsRequest { } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct SplitDocIds { + #[prost(string, tag = "1")] + pub split_id: ::prost::alloc::string::String, + #[prost(uint32, repeated, tag = "2")] + pub doc_ids: ::prost::alloc::vec::Vec, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct SoftDeleteDocumentsRequest { + #[prost(message, optional, tag = "1")] + pub index_uid: ::core::option::Option, + #[prost(message, repeated, tag = "2")] + pub split_doc_ids: ::prost::alloc::vec::Vec, +} +#[derive(serde::Serialize, serde::Deserialize, 
utoipa::ToSchema)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] +pub struct SoftDeleteDocumentsResponse { + #[prost(uint64, tag = "1")] + pub num_soft_deleted_doc_ids: u64, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct AddSourceRequest { #[prost(message, optional, tag = "1")] pub index_uid: ::core::option::Option, @@ -524,6 +546,33 @@ pub struct DeleteIndexTemplatesRequest { pub template_ids: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct GetKvRequest { + #[prost(string, tag = "1")] + pub key: ::prost::alloc::string::String, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct GetKvResponse { + /// Empty if the key does not exist. + #[prost(string, optional, tag = "1")] + pub value: ::core::option::Option<::prost::alloc::string::String>, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct SetKvRequest { + #[prost(string, tag = "1")] + pub key: ::prost::alloc::string::String, + #[prost(string, tag = "2")] + pub value: ::prost::alloc::string::String, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct DeleteKvRequest { + #[prost(string, tag = "1")] + pub key: ::prost::alloc::string::String, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct GetClusterIdentityRequest {} #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] @@ -693,6 +742,11 @@ impl RpcName for DeleteSplitsRequest { "delete_splits" } } +impl RpcName for SoftDeleteDocumentsRequest { + fn 
rpc_name() -> &'static str { + "soft_delete_documents" + } +} impl RpcName for AddSourceRequest { fn rpc_name() -> &'static str { "add_source" @@ -793,6 +847,21 @@ impl RpcName for DeleteIndexTemplatesRequest { "delete_index_templates" } } +impl RpcName for GetKvRequest { + fn rpc_name() -> &'static str { + "get_kv" + } +} +impl RpcName for SetKvRequest { + fn rpc_name() -> &'static str { + "set_kv" + } +} +impl RpcName for DeleteKvRequest { + fn rpc_name() -> &'static str { + "delete_kv" + } +} impl RpcName for GetClusterIdentityRequest { fn rpc_name() -> &'static str { "get_cluster_identity" @@ -867,6 +936,11 @@ pub trait MetastoreService: std::fmt::Debug + Send + Sync + 'static { &self, request: DeleteSplitsRequest, ) -> crate::metastore::MetastoreResult; + ///Soft-deletes individual documents within published splits. + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, + ) -> crate::metastore::MetastoreResult; ///Adds a source. async fn add_source( &self, @@ -980,6 +1054,21 @@ pub trait MetastoreService: std::fmt::Debug + Send + Sync + 'static { &self, request: DeleteIndexTemplatesRequest, ) -> crate::metastore::MetastoreResult; + ///Gets a value by key from the cluster-wide key-value store. + async fn get_kv( + &self, + request: GetKvRequest, + ) -> crate::metastore::MetastoreResult; + ///Sets a key-value pair in the cluster-wide key-value store. + async fn set_kv( + &self, + request: SetKvRequest, + ) -> crate::metastore::MetastoreResult; + ///Deletes a key from the cluster-wide key-value store. 
+ async fn delete_kv( + &self, + request: DeleteKvRequest, + ) -> crate::metastore::MetastoreResult; ///Get cluster identity async fn get_cluster_identity( &self, @@ -1167,6 +1256,12 @@ impl MetastoreService for MetastoreServiceClient { ) -> crate::metastore::MetastoreResult { self.inner.0.delete_splits(request).await } + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, + ) -> crate::metastore::MetastoreResult { + self.inner.0.soft_delete_documents(request).await + } async fn add_source( &self, request: AddSourceRequest, @@ -1287,6 +1382,24 @@ impl MetastoreService for MetastoreServiceClient { ) -> crate::metastore::MetastoreResult { self.inner.0.delete_index_templates(request).await } + async fn get_kv( + &self, + request: GetKvRequest, + ) -> crate::metastore::MetastoreResult { + self.inner.0.get_kv(request).await + } + async fn set_kv( + &self, + request: SetKvRequest, + ) -> crate::metastore::MetastoreResult { + self.inner.0.set_kv(request).await + } + async fn delete_kv( + &self, + request: DeleteKvRequest, + ) -> crate::metastore::MetastoreResult { + self.inner.0.delete_kv(request).await + } async fn get_cluster_identity( &self, request: GetClusterIdentityRequest, @@ -1383,6 +1496,12 @@ pub mod mock_metastore_service { ) -> crate::metastore::MetastoreResult { self.inner.lock().await.delete_splits(request).await } + async fn soft_delete_documents( + &self, + request: super::SoftDeleteDocumentsRequest, + ) -> crate::metastore::MetastoreResult { + self.inner.lock().await.soft_delete_documents(request).await + } async fn add_source( &self, request: super::AddSourceRequest, @@ -1505,6 +1624,24 @@ pub mod mock_metastore_service { ) -> crate::metastore::MetastoreResult { self.inner.lock().await.delete_index_templates(request).await } + async fn get_kv( + &self, + request: super::GetKvRequest, + ) -> crate::metastore::MetastoreResult { + self.inner.lock().await.get_kv(request).await + } + async fn set_kv( + &self, + request: 
super::SetKvRequest, + ) -> crate::metastore::MetastoreResult { + self.inner.lock().await.set_kv(request).await + } + async fn delete_kv( + &self, + request: super::DeleteKvRequest, + ) -> crate::metastore::MetastoreResult { + self.inner.lock().await.delete_kv(request).await + } async fn get_cluster_identity( &self, request: super::GetClusterIdentityRequest, @@ -1714,6 +1851,22 @@ impl tower::Service for InnerMetastoreServiceClient { Box::pin(fut) } } +impl tower::Service for InnerMetastoreServiceClient { + type Response = SoftDeleteDocumentsResponse; + type Error = crate::metastore::MetastoreError; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + std::task::Poll::Ready(Ok(())) + } + fn call(&mut self, request: SoftDeleteDocumentsRequest) -> Self::Future { + let svc = self.clone(); + let fut = async move { svc.0.soft_delete_documents(request).await }; + Box::pin(fut) + } +} impl tower::Service for InnerMetastoreServiceClient { type Response = EmptyResponse; type Error = crate::metastore::MetastoreError; @@ -2034,6 +2187,54 @@ impl tower::Service for InnerMetastoreServiceClient Box::pin(fut) } } +impl tower::Service for InnerMetastoreServiceClient { + type Response = GetKvResponse; + type Error = crate::metastore::MetastoreError; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + std::task::Poll::Ready(Ok(())) + } + fn call(&mut self, request: GetKvRequest) -> Self::Future { + let svc = self.clone(); + let fut = async move { svc.0.get_kv(request).await }; + Box::pin(fut) + } +} +impl tower::Service for InnerMetastoreServiceClient { + type Response = EmptyResponse; + type Error = crate::metastore::MetastoreError; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + std::task::Poll::Ready(Ok(())) + } + fn call(&mut self, request: SetKvRequest) -> Self::Future { 
+ let svc = self.clone(); + let fut = async move { svc.0.set_kv(request).await }; + Box::pin(fut) + } +} +impl tower::Service for InnerMetastoreServiceClient { + type Response = EmptyResponse; + type Error = crate::metastore::MetastoreError; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + std::task::Poll::Ready(Ok(())) + } + fn call(&mut self, request: DeleteKvRequest) -> Self::Future { + let svc = self.clone(); + let fut = async move { svc.0.delete_kv(request).await }; + Box::pin(fut) + } +} impl tower::Service for InnerMetastoreServiceClient { type Response = GetClusterIdentityResponse; type Error = crate::metastore::MetastoreError; @@ -2115,6 +2316,11 @@ struct MetastoreServiceTowerServiceStack { EmptyResponse, crate::metastore::MetastoreError, >, + soft_delete_documents_svc: quickwit_common::tower::BoxService< + SoftDeleteDocumentsRequest, + SoftDeleteDocumentsResponse, + crate::metastore::MetastoreError, + >, add_source_svc: quickwit_common::tower::BoxService< AddSourceRequest, EmptyResponse, @@ -2215,6 +2421,21 @@ struct MetastoreServiceTowerServiceStack { EmptyResponse, crate::metastore::MetastoreError, >, + get_kv_svc: quickwit_common::tower::BoxService< + GetKvRequest, + GetKvResponse, + crate::metastore::MetastoreError, + >, + set_kv_svc: quickwit_common::tower::BoxService< + SetKvRequest, + EmptyResponse, + crate::metastore::MetastoreError, + >, + delete_kv_svc: quickwit_common::tower::BoxService< + DeleteKvRequest, + EmptyResponse, + crate::metastore::MetastoreError, + >, get_cluster_identity_svc: quickwit_common::tower::BoxService< GetClusterIdentityRequest, GetClusterIdentityResponse, @@ -2295,6 +2516,12 @@ impl MetastoreService for MetastoreServiceTowerServiceStack { ) -> crate::metastore::MetastoreResult { self.delete_splits_svc.clone().ready().await?.call(request).await } + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, + ) -> 
crate::metastore::MetastoreResult { + self.soft_delete_documents_svc.clone().ready().await?.call(request).await + } async fn add_source( &self, request: AddSourceRequest, @@ -2415,6 +2642,24 @@ impl MetastoreService for MetastoreServiceTowerServiceStack { ) -> crate::metastore::MetastoreResult { self.delete_index_templates_svc.clone().ready().await?.call(request).await } + async fn get_kv( + &self, + request: GetKvRequest, + ) -> crate::metastore::MetastoreResult { + self.get_kv_svc.clone().ready().await?.call(request).await + } + async fn set_kv( + &self, + request: SetKvRequest, + ) -> crate::metastore::MetastoreResult { + self.set_kv_svc.clone().ready().await?.call(request).await + } + async fn delete_kv( + &self, + request: DeleteKvRequest, + ) -> crate::metastore::MetastoreResult { + self.delete_kv_svc.clone().ready().await?.call(request).await + } async fn get_cluster_identity( &self, request: GetClusterIdentityRequest, @@ -2548,6 +2793,16 @@ type DeleteSplitsLayer = quickwit_common::tower::BoxLayer< EmptyResponse, crate::metastore::MetastoreError, >; +type SoftDeleteDocumentsLayer = quickwit_common::tower::BoxLayer< + quickwit_common::tower::BoxService< + SoftDeleteDocumentsRequest, + SoftDeleteDocumentsResponse, + crate::metastore::MetastoreError, + >, + SoftDeleteDocumentsRequest, + SoftDeleteDocumentsResponse, + crate::metastore::MetastoreError, +>; type AddSourceLayer = quickwit_common::tower::BoxLayer< quickwit_common::tower::BoxService< AddSourceRequest, @@ -2748,6 +3003,36 @@ type DeleteIndexTemplatesLayer = quickwit_common::tower::BoxLayer< EmptyResponse, crate::metastore::MetastoreError, >; +type GetKvLayer = quickwit_common::tower::BoxLayer< + quickwit_common::tower::BoxService< + GetKvRequest, + GetKvResponse, + crate::metastore::MetastoreError, + >, + GetKvRequest, + GetKvResponse, + crate::metastore::MetastoreError, +>; +type SetKvLayer = quickwit_common::tower::BoxLayer< + quickwit_common::tower::BoxService< + SetKvRequest, + EmptyResponse, + 
crate::metastore::MetastoreError, + >, + SetKvRequest, + EmptyResponse, + crate::metastore::MetastoreError, +>; +type DeleteKvLayer = quickwit_common::tower::BoxLayer< + quickwit_common::tower::BoxService< + DeleteKvRequest, + EmptyResponse, + crate::metastore::MetastoreError, + >, + DeleteKvRequest, + EmptyResponse, + crate::metastore::MetastoreError, +>; type GetClusterIdentityLayer = quickwit_common::tower::BoxLayer< quickwit_common::tower::BoxService< GetClusterIdentityRequest, @@ -2760,39 +3045,43 @@ type GetClusterIdentityLayer = quickwit_common::tower::BoxLayer< >; #[derive(Debug, Default)] pub struct MetastoreServiceTowerLayerStack { - create_index_layers: Vec, - update_index_layers: Vec, - index_metadata_layers: Vec, - indexes_metadata_layers: Vec, - list_indexes_metadata_layers: Vec, - delete_index_layers: Vec, - list_index_stats_layers: Vec, - list_splits_layers: Vec, - stage_splits_layers: Vec, - publish_splits_layers: Vec, - mark_splits_for_deletion_layers: Vec, - delete_splits_layers: Vec, - add_source_layers: Vec, - update_source_layers: Vec, - toggle_source_layers: Vec, - delete_source_layers: Vec, - reset_source_checkpoint_layers: Vec, - last_delete_opstamp_layers: Vec, - create_delete_task_layers: Vec, - update_splits_delete_opstamp_layers: Vec, - list_delete_tasks_layers: Vec, - list_stale_splits_layers: Vec, - open_shards_layers: Vec, - acquire_shards_layers: Vec, - delete_shards_layers: Vec, - prune_shards_layers: Vec, - list_shards_layers: Vec, - create_index_template_layers: Vec, - get_index_template_layers: Vec, - find_index_template_matches_layers: Vec, - list_index_templates_layers: Vec, - delete_index_templates_layers: Vec, - get_cluster_identity_layers: Vec, + pub create_index_layers: Vec, + pub update_index_layers: Vec, + pub index_metadata_layers: Vec, + pub indexes_metadata_layers: Vec, + pub list_indexes_metadata_layers: Vec, + pub delete_index_layers: Vec, + pub list_index_stats_layers: Vec, + pub list_splits_layers: Vec, + pub 
stage_splits_layers: Vec, + pub publish_splits_layers: Vec, + pub mark_splits_for_deletion_layers: Vec, + pub delete_splits_layers: Vec, + pub soft_delete_documents_layers: Vec, + pub add_source_layers: Vec, + pub update_source_layers: Vec, + pub toggle_source_layers: Vec, + pub delete_source_layers: Vec, + pub reset_source_checkpoint_layers: Vec, + pub last_delete_opstamp_layers: Vec, + pub create_delete_task_layers: Vec, + pub update_splits_delete_opstamp_layers: Vec, + pub list_delete_tasks_layers: Vec, + pub list_stale_splits_layers: Vec, + pub open_shards_layers: Vec, + pub acquire_shards_layers: Vec, + pub delete_shards_layers: Vec, + pub prune_shards_layers: Vec, + pub list_shards_layers: Vec, + pub create_index_template_layers: Vec, + pub get_index_template_layers: Vec, + pub find_index_template_matches_layers: Vec, + pub list_index_templates_layers: Vec, + pub delete_index_templates_layers: Vec, + pub get_kv_layers: Vec, + pub set_kv_layers: Vec, + pub delete_kv_layers: Vec, + pub get_cluster_identity_layers: Vec, } impl MetastoreServiceTowerLayerStack { pub fn stack_layer(mut self, layer: L) -> Self @@ -3101,6 +3390,33 @@ impl MetastoreServiceTowerLayerStack { crate::metastore::MetastoreError, >, >>::Service as tower::Service>::Future: Send + 'static, + L: tower::Layer< + quickwit_common::tower::BoxService< + SoftDeleteDocumentsRequest, + SoftDeleteDocumentsResponse, + crate::metastore::MetastoreError, + >, + > + Clone + Send + Sync + 'static, + , + >>::Service: tower::Service< + SoftDeleteDocumentsRequest, + Response = SoftDeleteDocumentsResponse, + Error = crate::metastore::MetastoreError, + > + Clone + Send + Sync + 'static, + <, + >>::Service as tower::Service< + SoftDeleteDocumentsRequest, + >>::Future: Send + 'static, L: tower::Layer< quickwit_common::tower::BoxService< AddSourceRequest, @@ -3613,6 +3929,81 @@ impl MetastoreServiceTowerLayerStack { >>::Service as tower::Service< DeleteIndexTemplatesRequest, >>::Future: Send + 'static, + L: 
tower::Layer< + quickwit_common::tower::BoxService< + GetKvRequest, + GetKvResponse, + crate::metastore::MetastoreError, + >, + > + Clone + Send + Sync + 'static, + , + >>::Service: tower::Service< + GetKvRequest, + Response = GetKvResponse, + Error = crate::metastore::MetastoreError, + > + Clone + Send + Sync + 'static, + <, + >>::Service as tower::Service>::Future: Send + 'static, + L: tower::Layer< + quickwit_common::tower::BoxService< + SetKvRequest, + EmptyResponse, + crate::metastore::MetastoreError, + >, + > + Clone + Send + Sync + 'static, + , + >>::Service: tower::Service< + SetKvRequest, + Response = EmptyResponse, + Error = crate::metastore::MetastoreError, + > + Clone + Send + Sync + 'static, + <, + >>::Service as tower::Service>::Future: Send + 'static, + L: tower::Layer< + quickwit_common::tower::BoxService< + DeleteKvRequest, + EmptyResponse, + crate::metastore::MetastoreError, + >, + > + Clone + Send + Sync + 'static, + , + >>::Service: tower::Service< + DeleteKvRequest, + Response = EmptyResponse, + Error = crate::metastore::MetastoreError, + > + Clone + Send + Sync + 'static, + <, + >>::Service as tower::Service>::Future: Send + 'static, L: tower::Layer< quickwit_common::tower::BoxService< GetClusterIdentityRequest, @@ -3665,6 +4056,8 @@ impl MetastoreServiceTowerLayerStack { .push(quickwit_common::tower::BoxLayer::new(layer.clone())); self.delete_splits_layers .push(quickwit_common::tower::BoxLayer::new(layer.clone())); + self.soft_delete_documents_layers + .push(quickwit_common::tower::BoxLayer::new(layer.clone())); self.add_source_layers .push(quickwit_common::tower::BoxLayer::new(layer.clone())); self.update_source_layers @@ -3705,6 +4098,9 @@ impl MetastoreServiceTowerLayerStack { .push(quickwit_common::tower::BoxLayer::new(layer.clone())); self.delete_index_templates_layers .push(quickwit_common::tower::BoxLayer::new(layer.clone())); + self.get_kv_layers.push(quickwit_common::tower::BoxLayer::new(layer.clone())); + 
self.set_kv_layers.push(quickwit_common::tower::BoxLayer::new(layer.clone())); + self.delete_kv_layers.push(quickwit_common::tower::BoxLayer::new(layer.clone())); self.get_cluster_identity_layers .push(quickwit_common::tower::BoxLayer::new(layer.clone())); self @@ -3943,6 +4339,28 @@ impl MetastoreServiceTowerLayerStack { self.delete_splits_layers.push(quickwit_common::tower::BoxLayer::new(layer)); self } + pub fn stack_soft_delete_documents_layer(mut self, layer: L) -> Self + where + L: tower::Layer< + quickwit_common::tower::BoxService< + SoftDeleteDocumentsRequest, + SoftDeleteDocumentsResponse, + crate::metastore::MetastoreError, + >, + > + Send + Sync + 'static, + L::Service: tower::Service< + SoftDeleteDocumentsRequest, + Response = SoftDeleteDocumentsResponse, + Error = crate::metastore::MetastoreError, + > + Clone + Send + Sync + 'static, + >::Future: Send + 'static, + { + self.soft_delete_documents_layers + .push(quickwit_common::tower::BoxLayer::new(layer)); + self + } pub fn stack_add_source_layer(mut self, layer: L) -> Self where L: tower::Layer< @@ -4344,12 +4762,69 @@ impl MetastoreServiceTowerLayerStack { .push(quickwit_common::tower::BoxLayer::new(layer)); self } - pub fn stack_get_cluster_identity_layer(mut self, layer: L) -> Self + pub fn stack_get_kv_layer(mut self, layer: L) -> Self where L: tower::Layer< quickwit_common::tower::BoxService< - GetClusterIdentityRequest, - GetClusterIdentityResponse, + GetKvRequest, + GetKvResponse, + crate::metastore::MetastoreError, + >, + > + Send + Sync + 'static, + L::Service: tower::Service< + GetKvRequest, + Response = GetKvResponse, + Error = crate::metastore::MetastoreError, + > + Clone + Send + Sync + 'static, + >::Future: Send + 'static, + { + self.get_kv_layers.push(quickwit_common::tower::BoxLayer::new(layer)); + self + } + pub fn stack_set_kv_layer(mut self, layer: L) -> Self + where + L: tower::Layer< + quickwit_common::tower::BoxService< + SetKvRequest, + EmptyResponse, + 
crate::metastore::MetastoreError, + >, + > + Send + Sync + 'static, + L::Service: tower::Service< + SetKvRequest, + Response = EmptyResponse, + Error = crate::metastore::MetastoreError, + > + Clone + Send + Sync + 'static, + >::Future: Send + 'static, + { + self.set_kv_layers.push(quickwit_common::tower::BoxLayer::new(layer)); + self + } + pub fn stack_delete_kv_layer(mut self, layer: L) -> Self + where + L: tower::Layer< + quickwit_common::tower::BoxService< + DeleteKvRequest, + EmptyResponse, + crate::metastore::MetastoreError, + >, + > + Send + Sync + 'static, + L::Service: tower::Service< + DeleteKvRequest, + Response = EmptyResponse, + Error = crate::metastore::MetastoreError, + > + Clone + Send + Sync + 'static, + >::Future: Send + 'static, + { + self.delete_kv_layers.push(quickwit_common::tower::BoxLayer::new(layer)); + self + } + pub fn stack_get_cluster_identity_layer(mut self, layer: L) -> Self + where + L: tower::Layer< + quickwit_common::tower::BoxService< + GetClusterIdentityRequest, + GetClusterIdentityResponse, crate::metastore::MetastoreError, >, > + Send + Sync + 'static, @@ -4522,6 +4997,14 @@ impl MetastoreServiceTowerLayerStack { quickwit_common::tower::BoxService::new(inner_client.clone()), |svc, layer| layer.layer(svc), ); + let soft_delete_documents_svc = self + .soft_delete_documents_layers + .into_iter() + .rev() + .fold( + quickwit_common::tower::BoxService::new(inner_client.clone()), + |svc, layer| layer.layer(svc), + ); let add_source_svc = self .add_source_layers .into_iter() @@ -4682,6 +5165,30 @@ impl MetastoreServiceTowerLayerStack { quickwit_common::tower::BoxService::new(inner_client.clone()), |svc, layer| layer.layer(svc), ); + let get_kv_svc = self + .get_kv_layers + .into_iter() + .rev() + .fold( + quickwit_common::tower::BoxService::new(inner_client.clone()), + |svc, layer| layer.layer(svc), + ); + let set_kv_svc = self + .set_kv_layers + .into_iter() + .rev() + .fold( + 
quickwit_common::tower::BoxService::new(inner_client.clone()), + |svc, layer| layer.layer(svc), + ); + let delete_kv_svc = self + .delete_kv_layers + .into_iter() + .rev() + .fold( + quickwit_common::tower::BoxService::new(inner_client.clone()), + |svc, layer| layer.layer(svc), + ); let get_cluster_identity_svc = self .get_cluster_identity_layers .into_iter() @@ -4704,6 +5211,7 @@ impl MetastoreServiceTowerLayerStack { publish_splits_svc, mark_splits_for_deletion_svc, delete_splits_svc, + soft_delete_documents_svc, add_source_svc, update_source_svc, toggle_source_svc, @@ -4724,6 +5232,9 @@ impl MetastoreServiceTowerLayerStack { find_index_template_matches_svc, list_index_templates_svc, delete_index_templates_svc, + get_kv_svc, + set_kv_svc, + delete_kv_svc, get_cluster_identity_svc, }; MetastoreServiceClient::new(tower_svc_stack) @@ -4879,6 +5390,15 @@ where Error = crate::metastore::MetastoreError, Future = BoxFuture, > + + tower::Service< + SoftDeleteDocumentsRequest, + Response = SoftDeleteDocumentsResponse, + Error = crate::metastore::MetastoreError, + Future = BoxFuture< + SoftDeleteDocumentsResponse, + crate::metastore::MetastoreError, + >, + > + tower::Service< AddSourceRequest, Response = EmptyResponse, @@ -5014,6 +5534,24 @@ where Error = crate::metastore::MetastoreError, Future = BoxFuture, > + + tower::Service< + GetKvRequest, + Response = GetKvResponse, + Error = crate::metastore::MetastoreError, + Future = BoxFuture, + > + + tower::Service< + SetKvRequest, + Response = EmptyResponse, + Error = crate::metastore::MetastoreError, + Future = BoxFuture, + > + + tower::Service< + DeleteKvRequest, + Response = EmptyResponse, + Error = crate::metastore::MetastoreError, + Future = BoxFuture, + > + tower::Service< GetClusterIdentityRequest, Response = GetClusterIdentityResponse, @@ -5096,6 +5634,12 @@ where ) -> crate::metastore::MetastoreResult { self.clone().call(request).await } + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, 
+ ) -> crate::metastore::MetastoreResult { + self.clone().call(request).await + } async fn add_source( &self, request: AddSourceRequest, @@ -5216,6 +5760,24 @@ where ) -> crate::metastore::MetastoreResult { self.clone().call(request).await } + async fn get_kv( + &self, + request: GetKvRequest, + ) -> crate::metastore::MetastoreResult { + self.clone().call(request).await + } + async fn set_kv( + &self, + request: SetKvRequest, + ) -> crate::metastore::MetastoreResult { + self.clone().call(request).await + } + async fn delete_kv( + &self, + request: DeleteKvRequest, + ) -> crate::metastore::MetastoreResult { + self.clone().call(request).await + } async fn get_cluster_identity( &self, request: GetClusterIdentityRequest, @@ -5445,6 +6007,20 @@ where DeleteSplitsRequest::rpc_name(), )) } + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, + ) -> crate::metastore::MetastoreResult { + self.inner + .clone() + .soft_delete_documents(request) + .await + .map(|response| response.into_inner()) + .map_err(|status| crate::error::grpc_status_to_service_error( + status, + SoftDeleteDocumentsRequest::rpc_name(), + )) + } async fn add_source( &self, request: AddSourceRequest, @@ -5725,6 +6301,48 @@ where DeleteIndexTemplatesRequest::rpc_name(), )) } + async fn get_kv( + &self, + request: GetKvRequest, + ) -> crate::metastore::MetastoreResult { + self.inner + .clone() + .get_kv(request) + .await + .map(|response| response.into_inner()) + .map_err(|status| crate::error::grpc_status_to_service_error( + status, + GetKvRequest::rpc_name(), + )) + } + async fn set_kv( + &self, + request: SetKvRequest, + ) -> crate::metastore::MetastoreResult { + self.inner + .clone() + .set_kv(request) + .await + .map(|response| response.into_inner()) + .map_err(|status| crate::error::grpc_status_to_service_error( + status, + SetKvRequest::rpc_name(), + )) + } + async fn delete_kv( + &self, + request: DeleteKvRequest, + ) -> crate::metastore::MetastoreResult { + self.inner + 
.clone() + .delete_kv(request) + .await + .map(|response| response.into_inner()) + .map_err(|status| crate::error::grpc_status_to_service_error( + status, + DeleteKvRequest::rpc_name(), + )) + } async fn get_cluster_identity( &self, request: GetClusterIdentityRequest, @@ -5909,6 +6527,17 @@ for MetastoreServiceGrpcServerAdapter { .map(tonic::Response::new) .map_err(crate::error::grpc_error_to_grpc_status) } + async fn soft_delete_documents( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + self.inner + .0 + .soft_delete_documents(request.into_inner()) + .await + .map(tonic::Response::new) + .map_err(crate::error::grpc_error_to_grpc_status) + } async fn add_source( &self, request: tonic::Request, @@ -6129,6 +6758,39 @@ for MetastoreServiceGrpcServerAdapter { .map(tonic::Response::new) .map_err(crate::error::grpc_error_to_grpc_status) } + async fn get_kv( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + self.inner + .0 + .get_kv(request.into_inner()) + .await + .map(tonic::Response::new) + .map_err(crate::error::grpc_error_to_grpc_status) + } + async fn set_kv( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + self.inner + .0 + .set_kv(request.into_inner()) + .await + .map(tonic::Response::new) + .map_err(crate::error::grpc_error_to_grpc_status) + } + async fn delete_kv( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + self.inner + .0 + .delete_kv(request.into_inner()) + .await + .map(tonic::Response::new) + .map_err(crate::error::grpc_error_to_grpc_status) + } async fn get_cluster_identity( &self, request: tonic::Request, @@ -6619,6 +7281,36 @@ pub mod metastore_service_grpc_client { ); self.inner.unary(req, path, codec).await } + /// Soft-deletes individual documents within published splits. 
+ pub async fn soft_delete_documents( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic_prost::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/quickwit.metastore.MetastoreService/SoftDeleteDocuments", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new( + "quickwit.metastore.MetastoreService", + "SoftDeleteDocuments", + ), + ); + self.inner.unary(req, path, codec).await + } /// Adds a source. pub async fn add_source( &mut self, @@ -7193,6 +7885,74 @@ pub mod metastore_service_grpc_client { ); self.inner.unary(req, path, codec).await } + /// Gets a value by key from the cluster-wide key-value store. + pub async fn get_kv( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result, tonic::Status> { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic_prost::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/quickwit.metastore.MetastoreService/GetKv", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert(GrpcMethod::new("quickwit.metastore.MetastoreService", "GetKv")); + self.inner.unary(req, path, codec).await + } + /// Sets a key-value pair in the cluster-wide key-value store. 
+ pub async fn set_kv( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result, tonic::Status> { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic_prost::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/quickwit.metastore.MetastoreService/SetKv", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert(GrpcMethod::new("quickwit.metastore.MetastoreService", "SetKv")); + self.inner.unary(req, path, codec).await + } + /// Deletes a key from the cluster-wide key-value store. + pub async fn delete_kv( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result, tonic::Status> { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic_prost::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/quickwit.metastore.MetastoreService/DeleteKv", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new("quickwit.metastore.MetastoreService", "DeleteKv"), + ); + self.inner.unary(req, path, codec).await + } /// Get cluster identity pub async fn get_cluster_identity( &mut self, @@ -7325,6 +8085,14 @@ pub mod metastore_service_grpc_server { &self, request: tonic::Request, ) -> std::result::Result, tonic::Status>; + /// Soft-deletes individual documents within published splits. + async fn soft_delete_documents( + &self, + request: tonic::Request, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; /// Adds a source. async fn add_source( &self, @@ -7471,6 +8239,21 @@ pub mod metastore_service_grpc_server { &self, request: tonic::Request, ) -> std::result::Result, tonic::Status>; + /// Gets a value by key from the cluster-wide key-value store. 
+ async fn get_kv( + &self, + request: tonic::Request, + ) -> std::result::Result, tonic::Status>; + /// Sets a key-value pair in the cluster-wide key-value store. + async fn set_kv( + &self, + request: tonic::Request, + ) -> std::result::Result, tonic::Status>; + /// Deletes a key from the cluster-wide key-value store. + async fn delete_kv( + &self, + request: tonic::Request, + ) -> std::result::Result, tonic::Status>; /// Get cluster identity async fn get_cluster_identity( &self, @@ -8176,6 +8959,55 @@ pub mod metastore_service_grpc_server { }; Box::pin(fut) } + "/quickwit.metastore.MetastoreService/SoftDeleteDocuments" => { + #[allow(non_camel_case_types)] + struct SoftDeleteDocumentsSvc(pub Arc); + impl< + T: MetastoreServiceGrpc, + > tonic::server::UnaryService + for SoftDeleteDocumentsSvc { + type Response = super::SoftDeleteDocumentsResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::soft_delete_documents( + &inner, + request, + ) + .await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = SoftDeleteDocumentsSvc(inner); + let codec = tonic_prost::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } "/quickwit.metastore.MetastoreService/AddSource" => { #[allow(non_camel_case_types)] struct 
AddSourceSvc(pub Arc); @@ -9138,6 +9970,140 @@ pub mod metastore_service_grpc_server { }; Box::pin(fut) } + "/quickwit.metastore.MetastoreService/GetKv" => { + #[allow(non_camel_case_types)] + struct GetKvSvc(pub Arc); + impl< + T: MetastoreServiceGrpc, + > tonic::server::UnaryService for GetKvSvc { + type Response = super::GetKvResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::get_kv(&inner, request).await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = GetKvSvc(inner); + let codec = tonic_prost::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + "/quickwit.metastore.MetastoreService/SetKv" => { + #[allow(non_camel_case_types)] + struct SetKvSvc(pub Arc); + impl< + T: MetastoreServiceGrpc, + > tonic::server::UnaryService for SetKvSvc { + type Response = super::EmptyResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::set_kv(&inner, request).await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let 
max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = SetKvSvc(inner); + let codec = tonic_prost::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + "/quickwit.metastore.MetastoreService/DeleteKv" => { + #[allow(non_camel_case_types)] + struct DeleteKvSvc(pub Arc); + impl< + T: MetastoreServiceGrpc, + > tonic::server::UnaryService + for DeleteKvSvc { + type Response = super::EmptyResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::delete_kv(&inner, request) + .await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = DeleteKvSvc(inner); + let codec = tonic_prost::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } "/quickwit.metastore.MetastoreService/GetClusterIdentity" => { #[allow(non_camel_case_types)] struct GetClusterIdentitySvc(pub Arc); diff --git 
a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs index 9c6f7f5b70d..5fef3bfdc29 100644 --- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs @@ -187,6 +187,9 @@ pub struct SearchRequest { pub ignore_missing_indexes: bool, #[prost(string, optional, tag = "19")] pub split_id: ::core::option::Option<::prost::alloc::string::String>, + /// The user agent of the client that initiated the search request. + #[prost(string, optional, tag = "20")] + pub user_agent: ::core::option::Option<::prost::alloc::string::String>, } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] @@ -197,7 +200,7 @@ pub struct SortField { pub sort_order: i32, /// Optional sort value format for datetime field only. /// If none, the default output format for datetime field is - /// unix_timestamp_nanos. + /// unix_timestamp_millis. #[prost(enumeration = "SortDatetimeFormat", optional, tag = "3")] pub sort_datetime_format: ::core::option::Option, } @@ -214,9 +217,6 @@ pub struct SearchResponse { /// server-side and expressed in microseconds. #[prost(uint64, tag = "3")] pub elapsed_time_micros: u64, - /// The searcherrors that occurred formatted as string. - #[prost(string, repeated, tag = "4")] - pub errors: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, /// Postcard-encoded aggregation response #[prost(bytes = "vec", optional, tag = "9")] pub aggregation_postcard: ::core::option::Option<::prost::alloc::vec::Vec>, @@ -234,6 +234,9 @@ pub struct SearchResponse { /// Total number of successful splits searched. 
#[prost(uint64, tag = "8")] pub num_successful_splits: u64, + /// Statistics on the split outcomes + #[prost(message, optional, tag = "10")] + pub splits_by_outcome: ::core::option::Option, } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] @@ -276,6 +279,34 @@ pub struct LeafSearchRequest { #[prost(string, repeated, tag = "9")] pub index_uris: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, } +/// Split outcome counters +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] +pub struct SplitsByOutcome { + #[prost(uint64, tag = "1")] + pub pruned_before_warmup: u64, + #[prost(uint64, tag = "2")] + pub pruned_after_warmup: u64, + /// Cancelled before warmup started (error or timeout) + #[prost(uint64, tag = "3")] + pub cancel_before_warmup: u64, + #[prost(uint64, tag = "4")] + pub processed: u64, + #[prost(uint64, tag = "5")] + pub processed_from_metadata: u64, + /// Resolved by the partial request cache + #[prost(uint64, tag = "6")] + pub cache_hit: u64, + /// Cancelled during warmup (error or timeout) + #[prost(uint64, tag = "7")] + pub cancel_warmup: u64, + /// Cancelled while waiting in the CPU thread pool queue + #[prost(uint64, tag = "8")] + pub cancel_cpu_queue: u64, + /// Cancelled during CPU processing (error or timeout) + #[prost(uint64, tag = "9")] + pub cancel_cpu: u64, +} #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] pub struct ResourceStats { @@ -327,6 +358,9 @@ pub struct SplitIdAndFooterOffsets { /// The number of docs in the split #[prost(uint64, tag = "6")] pub num_docs: u64, + /// Tantivy doc IDs that have been soft-deleted from this split + #[prost(uint32, repeated, tag = "7")] + pub soft_deleted_doc_ids: ::prost::alloc::vec::Vec, } /// Hits returned by a FetchDocRequest. 
/// @@ -407,16 +441,16 @@ pub struct PartialHit { } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[derive(Ord, PartialOrd)] -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, ::prost::Message)] pub struct SortByValue { - #[prost(oneof = "sort_by_value::SortValue", tags = "1, 2, 3, 4")] + #[prost(oneof = "sort_by_value::SortValue", tags = "1, 2, 3, 4, 5, 6")] pub sort_value: ::core::option::Option, } /// Nested message and enum types in `SortByValue`. pub mod sort_by_value { #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[serde(rename_all = "snake_case")] - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, ::prost::Oneof)] pub enum SortValue { #[prost(uint64, tag = "1")] U64(u64), @@ -426,6 +460,10 @@ pub mod sort_by_value { F64(f64), #[prost(bool, tag = "4")] Boolean(bool), + #[prost(string, tag = "5")] + Str(::prost::alloc::string::String), + #[prost(int64, tag = "6")] + Datetime(i64), } } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] @@ -457,6 +495,9 @@ pub struct LeafSearchResponse { >, #[prost(message, optional, tag = "8")] pub resource_stats: ::core::option::Option, + /// Split outcome counters for all splits targeted by this leaf request. + #[prost(message, optional, tag = "9")] + pub splits_by_outcome: ::core::option::Option, } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] @@ -886,6 +927,35 @@ pub mod search_service_client { .insert(GrpcMethod::new("quickwit.search.SearchService", "FetchDocs")); self.inner.unary(req, path, codec).await } + /// Streams document contents from the document store. + /// This method takes `PartialHit`s and streams back `LeafHit`s in batches + /// to avoid hitting gRPC message size limits. 
+ pub async fn stream_fetch_docs( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response>, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic_prost::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/quickwit.search.SearchService/StreamFetchDocs", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new("quickwit.search.SearchService", "StreamFetchDocs"), + ); + self.inner.server_streaming(req, path, codec).await + } /// Root list terms API. /// This RPC identifies the set of splits on which the query should run on, /// and dispatches the several calls to `LeafListTerms`. @@ -1167,6 +1237,22 @@ pub mod search_service_server { tonic::Response, tonic::Status, >; + /// Server streaming response type for the StreamFetchDocs method. + type StreamFetchDocsStream: tonic::codegen::tokio_stream::Stream< + Item = std::result::Result, + > + + std::marker::Send + + 'static; + /// Streams document contents from the document store. + /// This method takes `PartialHit`s and streams back `LeafHit`s in batches + /// to avoid hitting gRPC message size limits. + async fn stream_fetch_docs( + &self, + request: tonic::Request, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; /// Root list terms API. /// This RPC identifies the set of splits on which the query should run on, /// and dispatches the several calls to `LeafListTerms`. 
@@ -1451,6 +1537,53 @@ pub mod search_service_server { }; Box::pin(fut) } + "/quickwit.search.SearchService/StreamFetchDocs" => { + #[allow(non_camel_case_types)] + struct StreamFetchDocsSvc(pub Arc); + impl< + T: SearchService, + > tonic::server::ServerStreamingService + for StreamFetchDocsSvc { + type Response = super::FetchDocsResponse; + type ResponseStream = T::StreamFetchDocsStream; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::stream_fetch_docs(&inner, request) + .await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = StreamFetchDocsSvc(inner); + let codec = tonic_prost::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.server_streaming(method, req).await; + Ok(res) + }; + Box::pin(fut) + } "/quickwit.search.SearchService/RootListTerms" => { #[allow(non_camel_case_types)] struct RootListTermsSvc(pub Arc); diff --git a/quickwit/quickwit-proto/src/control_plane/mod.rs b/quickwit/quickwit-proto/src/control_plane/mod.rs index 4278ec104eb..fd278bc6199 100644 --- a/quickwit/quickwit-proto/src/control_plane/mod.rs +++ b/quickwit/quickwit-proto/src/control_plane/mod.rs @@ -138,6 +138,12 @@ impl RpcName for AdviseResetShardsRequest { } } +impl RpcName for SwapIndexingPipelinesRequest { + fn rpc_name() -> &'static str { + "swap_indexing_pipelines" + } +} + impl 
GetOrCreateOpenShardsFailureReason { pub fn create_failure( &self, @@ -154,6 +160,24 @@ impl GetOrCreateOpenShardsFailureReason { } } +impl RpcName for EnableMaintenanceModeRequest { + fn rpc_name() -> &'static str { + "enable_maintenance_mode" + } +} + +impl RpcName for DisableMaintenanceModeRequest { + fn rpc_name() -> &'static str { + "disable_maintenance_mode" + } +} + +impl RpcName for GetMaintenanceModeRequest { + fn rpc_name() -> &'static str { + "get_maintenance_mode" + } +} + impl From for GetOrCreateOpenShardsSubrequest { fn from(metastore_open_shard_subrequest: OpenShardSubrequest) -> Self { let index_id = metastore_open_shard_subrequest.index_uid().index_id.clone(); diff --git a/quickwit/quickwit-proto/src/getters.rs b/quickwit/quickwit-proto/src/getters.rs index a327c1717a7..73847554b7c 100644 --- a/quickwit/quickwit-proto/src/getters.rs +++ b/quickwit/quickwit-proto/src/getters.rs @@ -136,6 +136,7 @@ generate_getters! { ToggleSourceRequest, UpdateIndexRequest, UpdateSourceRequest, + SoftDeleteDocumentsRequest, UpdateSplitsDeleteOpstampRequest } diff --git a/quickwit/quickwit-proto/src/lib.rs b/quickwit/quickwit-proto/src/lib.rs index f4ddb734d2a..f89fdb97687 100644 --- a/quickwit/quickwit-proto/src/lib.rs +++ b/quickwit/quickwit-proto/src/lib.rs @@ -28,7 +28,8 @@ use tracing_opentelemetry::OpenTelemetrySpanExt; pub mod cluster; pub mod control_plane; -pub use {bytes, tonic}; +pub use bytes; +pub use tonic; pub mod developer; pub mod error; mod getters; diff --git a/quickwit/quickwit-proto/src/search/mod.rs b/quickwit/quickwit-proto/src/search/mod.rs index 307de262a70..caba73828cd 100644 --- a/quickwit/quickwit-proto/src/search/mod.rs +++ b/quickwit/quickwit-proto/src/search/mod.rs @@ -17,6 +17,8 @@ use std::fmt; use std::io::{self, Read}; use prost::Message; +use quickwit_common::numeric_types::num_proj::ProjectedNumber; +use quickwit_common::numeric_types::{num_cmp, num_proj}; pub use sort_by_value::SortValue; 
include!("../codegen/quickwit/quickwit.search.rs"); @@ -83,6 +85,8 @@ impl SortByValue { } } Some(SortValue::Boolean(b)) => Bool(b), + Some(SortValue::Str(s)) => String(s), + Some(SortValue::Datetime(dt)) => Number(dt.into()), None => Null, } } @@ -104,18 +108,7 @@ impl SortByValue { return None; } } - // Strings that can be converted to a number are accepted. - // Some clients (like JS clients) can't easily handle large integers - // without losing precision, so we accept them as strings. - String(value) => { - if let Ok(number) = value.parse::() { - Some(SortValue::I64(number)) - } else if let Ok(number) = value.parse::() { - Some(SortValue::U64(number)) - } else { - return None; - } - } + String(value) => Some(SortValue::Str(value)), Array(_) | Object(_) => return None, }; Some(SortByValue { sort_value }) @@ -132,25 +125,33 @@ impl Eq for SortValue {} impl Ord for SortValue { #[inline] fn cmp(&self, other: &Self) -> Ordering { - // We make sure to end up with a total order. - match (*self, *other) { + match (self, other) { // Same types. - (SortValue::U64(left), SortValue::U64(right)) => left.cmp(&right), - (SortValue::I64(left), SortValue::I64(right)) => left.cmp(&right), - (SortValue::Boolean(left), SortValue::Boolean(right)) => left.cmp(&right), - // We half the logic by making sure we keep - // the "stronger" type on the left. + (SortValue::U64(left), SortValue::U64(right)) => left.cmp(right), + (SortValue::I64(left), SortValue::I64(right)) => left.cmp(right), + (SortValue::Boolean(left), SortValue::Boolean(right)) => left.cmp(right), + (SortValue::Str(left), SortValue::Str(right)) => left.cmp(right), + (SortValue::F64(left), SortValue::F64(right)) => left.total_cmp(right), + (SortValue::Datetime(left), SortValue::Datetime(right)) => left.cmp(right), + // Different numeric types but can still be compared. 
+ (SortValue::U64(left), SortValue::F64(right)) => { + num_cmp::cmp_u64_f64(*left, *right).expect("unexpected float comparison") + } + (SortValue::F64(left), SortValue::U64(right)) => num_cmp::cmp_u64_f64(*right, *left) + .expect("unexpected float comparison") + .reverse(), + (SortValue::I64(left), SortValue::F64(right)) => { + num_cmp::cmp_i64_f64(*left, *right).expect("unexpected float comparison") + } + (SortValue::F64(left), SortValue::I64(right)) => num_cmp::cmp_i64_f64(*right, *left) + .expect("unexpected float comparison") + .reverse(), + (SortValue::I64(left), SortValue::U64(right)) => num_cmp::cmp_i64_u64(*left, *right), (SortValue::U64(left), SortValue::I64(right)) => { - if left > i64::MAX as u64 { - return Ordering::Greater; - } - (left as i64).cmp(&right) + num_cmp::cmp_i64_u64(*right, *left).reverse() } - (SortValue::F64(left), SortValue::F64(right)) => left.total_cmp(&right), - (SortValue::F64(left), SortValue::U64(right)) => left.total_cmp(&(right as f64)), - (SortValue::F64(left), SortValue::I64(right)) => left.total_cmp(&(right as f64)), - (SortValue::Boolean(left), right) => SortValue::U64(left as u64).cmp(&right), - (left, right) => right.cmp(&left).reverse(), + // Incompatible types, they are sorted one after another. + (left, right) => left.type_sort_key().cmp(&right.type_sort_key()), } } } @@ -165,7 +166,7 @@ impl std::hash::Hash for SortValue { fn hash(&self, state: &mut H) { let this = self.normalize(); std::mem::discriminant(&this).hash(state); - match this { + match &this { SortValue::U64(number) => { number.hash(state); } @@ -178,6 +179,12 @@ impl std::hash::Hash for SortValue { SortValue::Boolean(b) => { b.hash(state); } + SortValue::Str(s) => { + s.hash(state); + } + SortValue::Datetime(dt) => { + dt.hash(state); + } } } } @@ -188,27 +195,36 @@ impl SortValue { /// For number, we prefer to represent them, in order, as i64, then as u64 and finally as f64. 
pub fn normalize(&self) -> Self { match self { - SortValue::I64(_) => *self, - SortValue::Boolean(_) => *self, - SortValue::U64(number) => { - if let Ok(number) = (*number).try_into() { - SortValue::I64(number) - } else { - *self - } - } - SortValue::F64(number) => { - let number = *number; - if number.ceil() == number { - // number is not NaN, and is a natural number - if number >= i64::MIN as f64 && number <= i64::MAX as f64 { - return SortValue::I64(number as i64); - } else if number.is_sign_positive() && number <= u64::MAX as f64 { - return SortValue::U64(number as u64); + SortValue::I64(_) => self.clone(), + SortValue::Boolean(_) => self.clone(), + SortValue::Str(_) => self.clone(), + SortValue::U64(number) => match num_proj::u64_to_i64(*number) { + ProjectedNumber::Exact(number) => SortValue::I64(number), + _ => self.clone(), + }, + SortValue::F64(float) => match num_proj::f64_to_i64(*float) { + ProjectedNumber::Exact(number) => SortValue::I64(number), + ProjectedNumber::AfterLast => { + if let ProjectedNumber::Exact(number) = num_proj::f64_to_u64(*float) { + SortValue::U64(number) + } else { + self.clone() } } - *self - } + _ => self.clone(), + }, + SortValue::Datetime(_) => self.clone(), + } + } + + pub fn type_sort_key(&self) -> TypeSortKey { + match self { + SortValue::U64(_) => TypeSortKey::Numeric, + SortValue::I64(_) => TypeSortKey::Numeric, + SortValue::F64(_) => TypeSortKey::Numeric, + SortValue::Boolean(_) => TypeSortKey::Boolean, + SortValue::Str(_) => TypeSortKey::Str, + SortValue::Datetime(_) => TypeSortKey::DateTime, } } } @@ -216,14 +232,26 @@ impl SortValue { impl PartialHit { /// Helper to get access to the 1st sort value pub fn sort_value(&self) -> Option { - if let Some(sort_value) = self.sort_value { - sort_value.sort_value + if let Some(sort_value) = &self.sort_value { + sort_value.sort_value.clone() } else { None } } } +/// Defines the order between types when sorting on a field with multiple types. 
+/// Expected order: +/// - Asc: numeric -> string -> boolean -> datetime +/// - Desc: datetime -> boolean -> string -> numeric +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum TypeSortKey { + Numeric, + Str, + Boolean, + DateTime, +} + /// Serializes the Split fields. /// /// `fields_metadata` has to be sorted. diff --git a/quickwit/quickwit-proto/src/types/mod.rs b/quickwit/quickwit-proto/src/types/mod.rs index f103e5c60f2..4133ecf27b2 100644 --- a/quickwit/quickwit-proto/src/types/mod.rs +++ b/quickwit/quickwit-proto/src/types/mod.rs @@ -13,11 +13,10 @@ // limitations under the License. use std::borrow::Borrow; -use std::convert::Infallible; use std::fmt; use std::fmt::{Display, Formatter}; use std::ops::Deref; -use std::str::FromStr; +use std::sync::Arc; use serde::{Deserialize, Serialize}; use tracing::warn; @@ -94,17 +93,16 @@ impl Display for SourceUid { } #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] -pub struct NodeId(String); +pub struct NodeId(Arc); impl NodeId { - /// Constructs a new [`NodeId`]. - pub const fn new(node_id: String) -> Self { - Self(node_id) + #[allow(clippy::should_implement_trait)] + pub fn from_str(node_id: &str) -> Self { + Self(Arc::from(node_id)) } - /// Takes ownership of the underlying [`String`], consuming `self`. 
- pub fn take(self) -> String { - self.0 + pub fn from_arc_str(node_id: Arc) -> Self { + Self(node_id) } } @@ -120,12 +118,6 @@ impl Borrow for NodeId { } } -impl Borrow for NodeId { - fn borrow(&self) -> &String { - &self.0 - } -} - impl Borrow for NodeId { fn borrow(&self) -> &NodeIdRef { self.deref() @@ -146,50 +138,18 @@ impl Display for NodeId { } } -impl From<&'_ str> for NodeId { - fn from(node_id: &str) -> Self { - Self::new(node_id.to_string()) - } -} - -impl From for NodeId { - fn from(node_id: String) -> Self { - Self::new(node_id) - } -} - -impl From for String { +impl From for Arc { fn from(node_id: NodeId) -> Self { node_id.0 } } -impl From<&'_ NodeIdRef> for NodeId { - fn from(node_id: &NodeIdRef) -> Self { - node_id.to_owned() - } -} - -impl FromStr for NodeId { - type Err = Infallible; - - fn from_str(node_id: &str) -> Result { - Ok(NodeId::new(node_id.to_string())) - } -} - impl PartialEq<&str> for NodeId { fn eq(&self, other: &&str) -> bool { self.as_str() == *other } } -impl PartialEq for NodeId { - fn eq(&self, other: &String) -> bool { - self.as_str() == *other - } -} - #[repr(transparent)] #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct NodeIdRef(str); @@ -272,7 +232,7 @@ impl ToOwned for NodeIdRef { type Owned = NodeId; fn to_owned(&self) -> Self::Owned { - NodeId(self.0.to_string()) + NodeId(Arc::from(&self.0)) } } @@ -321,7 +281,7 @@ mod tests { #[test] fn test_node_id() { - let node_id = NodeId::new("test-node".to_string()); + let node_id = NodeId::from_str("test-node"); assert_eq!(node_id.as_str(), "test-node"); assert_eq!(node_id, NodeIdRef::from_str("test-node")); } @@ -333,7 +293,7 @@ mod tests { node_id: NodeId, } let node = Node { - node_id: NodeId::from("test-node"), + node_id: NodeId::from_str("test-node"), }; let serialized = serde_json::to_string(&node).unwrap(); assert_eq!(serialized, r#"{"node_id":"test-node"}"#); diff --git a/quickwit/quickwit-query/Cargo.toml b/quickwit/quickwit-query/Cargo.toml index 
066c00c0ff7..f24d8662715 100644 --- a/quickwit/quickwit-query/Cargo.toml +++ b/quickwit/quickwit-query/Cargo.toml @@ -15,9 +15,6 @@ anyhow = { workspace = true } base64 = { workspace = true } bitpacking = { workspace = true } hex = { workspace = true } -lindera-core = { workspace = true, optional = true } -lindera-dictionary = { workspace = true, optional = true } -lindera-tokenizer = { workspace = true, optional = true } once_cell = { workspace = true } regex = { workspace = true } serde = { workspace = true } @@ -29,7 +26,6 @@ tracing = { workspace = true } time = { workspace = true } thiserror = { workspace = true } rustc-hash = { workspace = true } -whichlang = { workspace = true, optional = true } quickwit-common = { workspace = true } quickwit-datetime = { workspace = true } @@ -42,19 +38,6 @@ time = { workspace = true } quickwit-common = { workspace = true, features = ["testsuite"] } -[features] -multilang = [ - "lindera-core", - "lindera-dictionary", - "lindera-tokenizer", - "whichlang", - "tantivy/stemmer", -] - [[bench]] name = "tokenizers_bench" harness = false - -[[bench]] -name = "multilang_tokenizers_bench" -harness = false diff --git a/quickwit/quickwit-query/benches/multilang_tokenizers_bench.rs b/quickwit/quickwit-query/benches/multilang_tokenizers_bench.rs deleted file mode 100644 index 61755dea556..00000000000 --- a/quickwit/quickwit-query/benches/multilang_tokenizers_bench.rs +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright 2021-Present Datadog, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -use criterion::{Criterion, Throughput, black_box, criterion_group, criterion_main}; -use quickwit_query::create_default_quickwit_tokenizer_manager; -use tantivy::tokenizer::{TextAnalyzer, Token, TokenStream}; - -// A random ascii string of length 100 chars. -const ASCII_SHORT: &str = "It is a long established fact"; -static ASCII_LONG: &str = r#"It is a long established fact that a reader will be distracted by the readable content of a - page when looking at its layout. The point of using Lorem Ipsum is that it has a - more-or-less normal distribution of letters, as opposed to using 'Content here, content - here', making it look like readable English. Many desktop publishing packages and web page - editors now use Lorem Ipsum as their default model text, and a search for 'lorem ipsum' will - uncover many web sites still in their infancy. Various versions have evolved over the years, - sometimes by accident, sometimes on purpose (injected humour and the like)."#; -const JPN_SHORT: &str = "日本ごです。 とても素敵な言葉ですね"; -const JPN_LONG: &str = r#"日本ごです。 和名の由来は、 - 太陽の動きにつれてその方向を追うように花が回るといわれたことから。 - ただしこの動きは生長に伴うものであるため、 - 実際に太陽を追って動くのは生長が盛んな若い時期だけである。 - 若いヒマワリの茎の上部の葉は太陽に正対になるように動き、 - 朝には東を向いていたのが夕方には西を向く。日没後はまもなく起きあがり、 - 夜明け前にはふたたび東に向く。この運動はつぼみを付ける頃まで続くが、 - つぼみが大きくなり花が開く素敵な言葉ですね."#; -const CMN_SHORT: &str = "滚滚长江东逝水,浪花淘尽英雄。"; -const CMN_LONG: &str = r#"滚滚长江东逝水,浪花淘尽英雄。是非成败转头空,青山依旧在,几度夕阳红。 - 白发渔樵江渚上,惯看秋月春风。一壶浊酒喜相逢,古今多少事,都付笑谈中。 - 是非成败转头空,青山依旧在,惯看秋月春风。一壶浊酒喜相逢,古今多少事, - 滚滚长江东逝水,浪花淘尽英雄。 几度夕阳红。白发渔樵江渚上,都付笑谈中。"#; -const KOR_SHORT: &str = "안녕하세요. 
반갑습니다."; -const KOR_LONG: &str = r#" -포근히 내려오는 눈밭속에서는 -낯이 붉은 處女아이들도 깃들이어 오는 소리… -울고 -웃고 -수구리고 -새파라니 얼어서 -運命들이 모두다 안끼어 드는 소리… -큰놈에겐 큰 눈물자국, 작은놈에겐 작은 웃음 흔적 -큰이얘기 작은이얘기들이 오부록이 도란 그리며 안끼어 오는 소리 -끊임없이 내리는 눈발 속에서는 -山도 山도 靑山도 안끼어 드는 소리 -"#; - -fn process_tokens(analyzer: &mut TextAnalyzer, text: &str) -> Vec { - let mut token_stream = analyzer.token_stream(text); - let mut tokens: Vec = Vec::new(); - token_stream.process(&mut |token: &Token| tokens.push(token.clone())); - tokens -} - -pub fn tokenizers_throughput_benchmark(c: &mut Criterion) { - let mut group = c.benchmark_group("multilang"); - let tokenizer_manager = create_default_quickwit_tokenizer_manager(); - let mut default_tokenizer = tokenizer_manager.get_tokenizer("default").unwrap(); - let mut multilang_tokenizer = tokenizer_manager.get_tokenizer("multilang").unwrap(); - let mut chinese_tokenizer = tokenizer_manager - .get_tokenizer("chinese_compatible") - .unwrap(); - - group - .throughput(Throughput::Bytes(ASCII_SHORT.len() as u64)) - .bench_with_input("default-tokenize-short", ASCII_SHORT, |b, text| { - b.iter(|| process_tokens(&mut default_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(ASCII_LONG.len() as u64)) - .bench_with_input("default-tokenize-long", ASCII_LONG, |b, text| { - b.iter(|| process_tokens(&mut default_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(ASCII_SHORT.len() as u64)) - .bench_with_input("multilang-eng-tokenize-short", ASCII_SHORT, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(ASCII_LONG.len() as u64)) - .bench_with_input("multilang-eng-tokenize-long", ASCII_LONG, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - let short_with_prefix = "ENG:".to_string() + ASCII_SHORT; - group - .throughput(Throughput::Bytes(ASCII_SHORT.len() as u64)) - .bench_with_input( - "multilang-tokenize-short-with-prefix", - 
&short_with_prefix, - |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }, - ); - let long_with_prefix = "ENG:".to_string() + ASCII_LONG; - group - .throughput(Throughput::Bytes(ASCII_LONG.len() as u64)) - .bench_with_input( - "multilang-tokenize-long-with-prefix", - &long_with_prefix, - |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }, - ); - group - .throughput(Throughput::Bytes(JPN_SHORT.len() as u64)) - .bench_with_input("multilang-tokenize-jpn-short", JPN_SHORT, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(JPN_LONG.len() as u64)) - .bench_with_input("multilang-tokenize-jpn-long", JPN_LONG, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(CMN_SHORT.len() as u64)) - .bench_with_input("multilang-tokenize-cmn-short", CMN_SHORT, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(CMN_LONG.len() as u64)) - .bench_with_input("multilang-tokenize-cmn-long", CMN_LONG, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(KOR_SHORT.len() as u64)) - .bench_with_input("multilang-tokenize-kor-short", KOR_SHORT, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(KOR_LONG.len() as u64)) - .bench_with_input("multilang-tokenize-kor-long", KOR_LONG, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(CMN_SHORT.len() as u64)) - .bench_with_input( - "chinese-compatible-tokenize-cmn-short", - CMN_SHORT, - |b, text| { - b.iter(|| process_tokens(&mut chinese_tokenizer, black_box(text))); - }, - ); - group - 
.throughput(Throughput::Bytes(CMN_LONG.len() as u64)) - .bench_with_input( - "chinese-compatible-tokenize-cmn-long", - CMN_LONG, - |b, text| { - b.iter(|| process_tokens(&mut chinese_tokenizer, black_box(text))); - }, - ); -} - -criterion_group!( - tokenizers_throughput_benches, - tokenizers_throughput_benchmark -); -criterion_main!(tokenizers_throughput_benches); diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/string_or_struct.rs b/quickwit/quickwit-query/src/elastic_query_dsl/literal_or_struct.rs similarity index 75% rename from quickwit/quickwit-query/src/elastic_query_dsl/string_or_struct.rs rename to quickwit/quickwit-query/src/elastic_query_dsl/literal_or_struct.rs index c2be03c8367..c290ef3a6c0 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/string_or_struct.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/literal_or_struct.rs @@ -18,7 +18,7 @@ use std::marker::PhantomData; use serde::de::{MapAccess, Visitor}; use serde::{Deserialize, Deserializer, de}; -/// The point of `StringOrStructForSerialization` is to support +/// The point of `LiteralOrStructForSerialization` is to support /// the two following formats for various queries. /// /// `{"field": {"query": "my query", "default_operator": "OR"}}` @@ -26,37 +26,37 @@ use serde::{Deserialize, Deserializer, de}; /// and the shorter. /// `{"field": "my query"}` /// -/// If a integer is passed, we cast it to string. Floats are not supported. +/// If a number or bool is passed, we cast it to string /// /// We don't use untagged enum to support this, in order to keep good errors. 
/// /// The code below is adapted from solution described here: #[derive(Deserialize)] #[serde(transparent)] -pub(crate) struct StringOrStructForSerialization +pub(crate) struct LiteralOrStructForSerialization where T: From, for<'de2> T: Deserialize<'de2>, { - #[serde(deserialize_with = "string_or_struct")] + #[serde(deserialize_with = "literal_or_struct")] pub inner: T, } -struct StringOrStructVisitor { +struct LiteralOrStructVisitor { phantom_data: PhantomData, } -fn string_or_struct<'de, D, T>(deserializer: D) -> Result +fn literal_or_struct<'de, D, T>(deserializer: D) -> Result where D: Deserializer<'de>, T: From + Deserialize<'de>, { - deserializer.deserialize_any(StringOrStructVisitor { + deserializer.deserialize_any(LiteralOrStructVisitor { phantom_data: Default::default(), }) } -impl<'de, T> Visitor<'de> for StringOrStructVisitor +impl<'de, T> Visitor<'de> for LiteralOrStructVisitor where T: From, T: Deserialize<'de>, @@ -68,6 +68,11 @@ where formatter.write_str(&format!("string or map to deserialize {type_str}.")) } + fn visit_bool(self, v: bool) -> Result + where E: de::Error { + self.visit_str(&v.to_string()) + } + fn visit_i64(self, v: i64) -> Result where E: de::Error { self.visit_str(&v.to_string()) @@ -78,6 +83,11 @@ where self.visit_str(&v.to_string()) } + fn visit_f64(self, v: f64) -> Result + where E: de::Error { + self.visit_str(&v.to_string()) + } + fn visit_str(self, query: &str) -> Result where E: serde::de::Error { Ok(T::from(query.to_string())) diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/match_bool_prefix.rs b/quickwit/quickwit-query/src/elastic_query_dsl/match_bool_prefix.rs index 752f4f2c0a2..dbb2491903b 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/match_bool_prefix.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/match_bool_prefix.rs @@ -14,7 +14,7 @@ use serde::Deserialize; -use super::{ElasticQueryDslInner, StringOrStructForSerialization}; +use super::{ElasticQueryDslInner, 
LiteralOrStructForSerialization}; use crate::OneFieldMap; use crate::elastic_query_dsl::match_query::MatchQueryParams; use crate::elastic_query_dsl::{ConvertibleToQueryAst, default_max_expansions}; @@ -23,7 +23,7 @@ use crate::query_ast::{FullTextParams, FullTextQuery, QueryAst}; /// `MatchBoolPrefixQuery` as defined in /// #[derive(Deserialize, Clone, Eq, PartialEq, Debug)] -#[serde(from = "OneFieldMap>")] +#[serde(from = "OneFieldMap>")] pub(crate) struct MatchBoolPrefixQuery { pub(crate) field: String, pub(crate) params: MatchQueryParams, @@ -54,9 +54,9 @@ impl From for ElasticQueryDslInner { } } -impl From>> for MatchBoolPrefixQuery { +impl From>> for MatchBoolPrefixQuery { fn from( - match_query_params: OneFieldMap>, + match_query_params: OneFieldMap>, ) -> Self { let OneFieldMap { field, value } = match_query_params; MatchBoolPrefixQuery { diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/match_phrase_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/match_phrase_query.rs index 1f49929782f..e178a053ea1 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/match_phrase_query.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/match_phrase_query.rs @@ -15,7 +15,7 @@ use serde::Deserialize; use crate::elastic_query_dsl::{ - ConvertibleToQueryAst, ElasticQueryDslInner, StringOrStructForSerialization, + ConvertibleToQueryAst, ElasticQueryDslInner, LiteralOrStructForSerialization, }; use crate::query_ast::{FullTextMode, FullTextParams, FullTextQuery, QueryAst}; use crate::{MatchAllOrNone, OneFieldMap}; @@ -23,7 +23,7 @@ use crate::{MatchAllOrNone, OneFieldMap}; /// `MatchPhraseQuery` as defined in /// #[derive(Deserialize, Clone, Eq, PartialEq, Debug)] -#[serde(from = "OneFieldMap>")] +#[serde(from = "OneFieldMap>")] pub(crate) struct MatchPhraseQuery { pub(crate) field: String, pub(crate) params: MatchPhraseQueryParams, @@ -65,11 +65,11 @@ impl From for ElasticQueryDslInner { } } -impl From>> +impl From>> for MatchPhraseQuery { fn from( - 
match_query_params: OneFieldMap>, + match_query_params: OneFieldMap>, ) -> Self { let OneFieldMap { field, value } = match_query_params; MatchPhraseQuery { diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/match_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/match_query.rs index 9f0f56a3184..5286c9736e0 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/match_query.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/match_query.rs @@ -16,7 +16,7 @@ use serde::Deserialize; use super::LeniencyBool; use crate::elastic_query_dsl::{ - ConvertibleToQueryAst, ElasticQueryDslInner, StringOrStructForSerialization, + ConvertibleToQueryAst, ElasticQueryDslInner, LiteralOrStructForSerialization, }; use crate::query_ast::{FullTextParams, FullTextQuery, QueryAst}; use crate::{BooleanOperand, MatchAllOrNone, OneFieldMap}; @@ -24,7 +24,7 @@ use crate::{BooleanOperand, MatchAllOrNone, OneFieldMap}; /// `MatchQuery` as defined in /// #[derive(Deserialize, Clone, Eq, PartialEq, Debug)] -#[serde(from = "OneFieldMap>")] +#[serde(from = "OneFieldMap>")] pub struct MatchQuery { pub(crate) field: String, pub(crate) params: MatchQueryParams, @@ -64,9 +64,9 @@ impl From for ElasticQueryDslInner { } } -impl From>> for MatchQuery { +impl From>> for MatchQuery { fn from( - match_query_params: OneFieldMap>, + match_query_params: OneFieldMap>, ) -> Self { let OneFieldMap { field, value } = match_query_params; MatchQuery { diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs b/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs index 871032951e2..e4af368534d 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs @@ -16,6 +16,7 @@ use serde::{Deserialize, Serialize}; mod bool_query; mod exists_query; +mod literal_or_struct; mod match_bool_prefix; mod match_phrase_query; mod match_query; @@ -26,18 +27,17 @@ mod prefix_query; mod query_string_query; mod range_query; mod regex_query; -mod 
string_or_struct; mod term_query; mod terms_query; mod wildcard_query; use bool_query::BoolQuery; +pub(crate) use literal_or_struct::LiteralOrStructForSerialization; pub use one_field_map::OneFieldMap; use phrase_prefix_query::MatchPhrasePrefixQuery; use prefix_query::PrefixQuery; pub(crate) use query_string_query::QueryStringQuery; use range_query::RangeQuery; -pub(crate) use string_or_struct::StringOrStructForSerialization; use term_query::TermQuery; use crate::elastic_query_dsl::exists_query::ExistsQuery; diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/prefix_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/prefix_query.rs index f19fad61037..0542c9de954 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/prefix_query.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/prefix_query.rs @@ -15,11 +15,11 @@ use serde::Deserialize; use crate::elastic_query_dsl::one_field_map::OneFieldMap; -use crate::elastic_query_dsl::{ConvertibleToQueryAst, StringOrStructForSerialization}; +use crate::elastic_query_dsl::{ConvertibleToQueryAst, LiteralOrStructForSerialization}; use crate::query_ast::{QueryAst, WildcardQuery as AstWildcardQuery}; #[derive(Deserialize, Clone, Eq, PartialEq, Debug)] -#[serde(from = "OneFieldMap>")] +#[serde(from = "OneFieldMap>")] pub(crate) struct PrefixQuery { pub(crate) field: String, pub(crate) params: PrefixQueryParams, @@ -53,9 +53,9 @@ impl ConvertibleToQueryAst for PrefixQuery { } } -impl From>> for PrefixQuery { +impl From>> for PrefixQuery { fn from( - match_query_params: OneFieldMap>, + match_query_params: OneFieldMap>, ) -> Self { let OneFieldMap { field, value } = match_query_params; PrefixQuery { diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/term_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/term_query.rs index 5fd320a2580..d8b34a5ebbc 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/term_query.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/term_query.rs @@ -14,21 
+14,21 @@ use serde::{Deserialize, Deserializer, Serialize}; -use super::StringOrStructForSerialization; +use super::LiteralOrStructForSerialization; use crate::elastic_query_dsl::one_field_map::OneFieldMap; use crate::elastic_query_dsl::{ConvertibleToQueryAst, ElasticQueryDslInner}; use crate::not_nan_f32::NotNaNf32; use crate::query_ast::{self, QueryAst}; #[derive(Deserialize, Debug, PartialEq, Eq, Clone)] -#[serde(from = "OneFieldMap>")] +#[serde(from = "OneFieldMap>")] pub struct TermQuery { pub field: String, pub value: TermQueryParams, } -impl From>> for TermQuery { - fn from(one_field_map: OneFieldMap>) -> Self { +impl From>> for TermQuery { + fn from(one_field_map: OneFieldMap>) -> Self { TermQuery { field: one_field_map.field, value: one_field_map.value.inner, @@ -52,6 +52,8 @@ enum TermValue { I64(i64), U64(u64), Str(String), + Bool(bool), + F64(f64), } fn deserialize_term_value<'de, D>(deserializer: D) -> Result @@ -61,6 +63,8 @@ where D: Deserializer<'de> { TermValue::I64(i64) => Ok(i64.to_string()), TermValue::U64(u64) => Ok(u64.to_string()), TermValue::Str(str) => Ok(str), + TermValue::Bool(b) => Ok(b.to_string()), + TermValue::F64(f) => Ok(f.to_string()), } } @@ -123,7 +127,7 @@ mod tests { use super::*; #[test] - fn test_term_query_simple() { + fn test_term_query_string() { let term_query_json = r#"{ "product_id": { "value": "61809" } }"#; let term_query: TermQuery = serde_json::from_str(term_query_json).unwrap(); assert_eq!( @@ -133,7 +137,7 @@ mod tests { } #[test] - fn test_term_query_deserialization_in_short_format() { + fn test_term_query_string_short_form() { let term_query: TermQuery = serde_json::from_str( r#"{ "product_id": "61809" @@ -145,4 +149,38 @@ mod tests { &term_query_from_field_value("product_id", "61809") ); } + + #[test] + fn test_term_query_bool() { + let term_query_json = r#"{ "is_product_pretty": { "value": true } }"#; + let term_query: TermQuery = serde_json::from_str(term_query_json).unwrap(); + assert_eq!( + &term_query, + 
&term_query_from_field_value("is_product_pretty", "true") + ); + } + + #[test] + fn test_term_query_bool_short_form() { + let term_query_json = r#"{ "is_product_pretty": true }"#; + let term_query: TermQuery = serde_json::from_str(term_query_json).unwrap(); + assert_eq!( + &term_query, + &term_query_from_field_value("is_product_pretty", "true") + ); + } + + #[test] + fn test_term_query_float() { + let term_query_json = r#"{ "price": { "value": 1.1 } }"#; + let term_query: TermQuery = serde_json::from_str(term_query_json).unwrap(); + assert_eq!(&term_query, &term_query_from_field_value("price", "1.1")); + } + + #[test] + fn test_term_query_float_short_form() { + let term_query_json = r#"{ "price": 1.1 }"#; + let term_query: TermQuery = serde_json::from_str(term_query_json).unwrap(); + assert_eq!(&term_query, &term_query_from_field_value("price", "1.1")); + } } diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/terms_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/terms_query.rs index 48b65e4dd29..ec3d300e18f 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/terms_query.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/terms_query.rs @@ -13,7 +13,9 @@ // limitations under the License. 
use std::collections::{BTreeSet, HashMap}; +use std::sync::LazyLock; +use anyhow::bail; use serde::Deserialize; use crate::elastic_query_dsl::one_field_map::OneFieldMap; @@ -85,8 +87,26 @@ impl TryFrom for TermsQuery { } } +const TERMS_QUERY_WARN_THRESHOLD: usize = 10_000; +static TERMS_QUERY_REJECT_THRESHOLD: LazyLock = + LazyLock::new(|| quickwit_common::get_from_env("QW_MAX_TERMS_QUERY_SIZE", 100_000, false)); + impl ConvertibleToQueryAst for TermsQuery { fn convert_to_query_ast(self) -> anyhow::Result { + if self.values.len() > TERMS_QUERY_WARN_THRESHOLD { + tracing::warn!( + num_terms = self.values.len(), + field = %self.field, + "terms query contains more than {TERMS_QUERY_WARN_THRESHOLD} terms" + ); + } + if self.values.len() > *TERMS_QUERY_REJECT_THRESHOLD { + bail!( + "too many terms ({}>{})", + self.values.len(), + *TERMS_QUERY_REJECT_THRESHOLD + ) + } let mut terms_per_field = HashMap::new(); let values_set: BTreeSet = self.values.into_iter().collect(); terms_per_field.insert(self.field, values_set); diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/wildcard_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/wildcard_query.rs index 3b975e896e9..5fef8c667a8 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/wildcard_query.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/wildcard_query.rs @@ -16,11 +16,11 @@ use serde::Deserialize; use crate::NotNaNf32; use crate::elastic_query_dsl::one_field_map::OneFieldMap; -use crate::elastic_query_dsl::{ConvertibleToQueryAst, StringOrStructForSerialization}; +use crate::elastic_query_dsl::{ConvertibleToQueryAst, LiteralOrStructForSerialization}; use crate::query_ast::{QueryAst, WildcardQuery as AstWildcardQuery}; #[derive(Deserialize, Clone, Eq, PartialEq, Debug)] -#[serde(from = "OneFieldMap>")] +#[serde(from = "OneFieldMap>")] pub(crate) struct WildcardQuery { pub(crate) field: String, pub(crate) params: WildcardQueryParams, @@ -49,9 +49,9 @@ impl ConvertibleToQueryAst for WildcardQuery 
{ } } -impl From>> for WildcardQuery { +impl From>> for WildcardQuery { fn from( - match_query_params: OneFieldMap>, + match_query_params: OneFieldMap>, ) -> Self { let OneFieldMap { field, value } = match_query_params; WildcardQuery { diff --git a/quickwit/quickwit-query/src/lib.rs b/quickwit/quickwit-query/src/lib.rs index b2040f73daa..8f70e155933 100644 --- a/quickwit/quickwit-query/src/lib.rs +++ b/quickwit/quickwit-query/src/lib.rs @@ -38,8 +38,6 @@ pub(crate) use not_nan_f32::NotNaNf32; pub use query_ast::utils::find_field_or_hit_dynamic; use serde::{Deserialize, Serialize}; pub use tantivy::query::Query as TantivyQuery; -#[cfg(feature = "multilang")] -pub use tokenizers::MultiLangTokenizer; pub use tokenizers::{ CodeTokenizer, DEFAULT_REMOVE_TOKEN_LENGTH, create_default_quickwit_tokenizer_manager, get_quickwit_fastfield_normalizer_manager, diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index 84176f4a4aa..7b24a66163d 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -247,7 +247,6 @@ mod tests { "raw_lowercase", "lowercase", "default", - "en_stem", "chinese_compatible", "source_code_default", "source_code_with_hex", @@ -290,7 +289,6 @@ mod tests { "raw_lowercase", "lowercase", "default", - "en_stem", "chinese_compatible", "source_code_default", "source_code_with_hex", @@ -335,7 +333,6 @@ mod tests { "raw_lowercase", "lowercase", "default", - "en_stem", "chinese_compatible", "source_code_default", "source_code_with_hex", @@ -398,7 +395,6 @@ mod tests { "raw_lowercase", "lowercase", "default", - "en_stem", "chinese_compatible", "source_code_default", "source_code_with_hex", diff --git a/quickwit/quickwit-query/src/tokenizers/mod.rs b/quickwit/quickwit-query/src/tokenizers/mod.rs index d086c36a977..5a90715075e 100644 --- a/quickwit/quickwit-query/src/tokenizers/mod.rs +++ 
b/quickwit/quickwit-query/src/tokenizers/mod.rs @@ -14,8 +14,6 @@ mod chinese_compatible; mod code_tokenizer; -#[cfg(feature = "multilang")] -mod multilang; mod tokenizer_manager; use once_cell::sync::Lazy; @@ -26,8 +24,6 @@ use tantivy::tokenizer::{ use self::chinese_compatible::ChineseTokenizer; pub use self::code_tokenizer::CodeTokenizer; -#[cfg(feature = "multilang")] -pub use self::multilang::MultiLangTokenizer; pub use self::tokenizer_manager::{RAW_TOKENIZER_NAME, TokenizerManager}; pub const DEFAULT_REMOVE_TOKEN_LENGTH: usize = 255; @@ -58,17 +54,6 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager { .filter(LowerCaser) .build(); tokenizer_manager.register("default", default_tokenizer, true); - #[cfg(feature = "multilang")] - { - let en_stem_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) - .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) - .filter(LowerCaser) - .filter(tantivy::tokenizer::Stemmer::new( - tantivy::tokenizer::Language::English, - )) - .build(); - tokenizer_manager.register("en_stem", en_stem_tokenizer, true); - } tokenizer_manager.register("whitespace", WhitespaceTokenizer::default(), false); let chinese_tokenizer = TextAnalyzer::builder(ChineseTokenizer) @@ -94,15 +79,6 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager { .build(), true, ); - #[cfg(feature = "multilang")] - tokenizer_manager.register( - "multilang_default", - TextAnalyzer::builder(MultiLangTokenizer::default()) - .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) - .filter(LowerCaser) - .build(), - true, - ); tokenizer_manager } diff --git a/quickwit/quickwit-query/src/tokenizers/multilang.rs b/quickwit/quickwit-query/src/tokenizers/multilang.rs deleted file mode 100644 index a62d2ff151c..00000000000 --- a/quickwit/quickwit-query/src/tokenizers/multilang.rs +++ /dev/null @@ -1,334 +0,0 @@ -// Copyright 2021-Present Datadog, Inc. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use lindera_core::mode::Mode; -use lindera_dictionary::{DictionaryConfig, DictionaryKind, load_dictionary_from_config}; -use lindera_tokenizer::token::Token as LinderaToken; -use lindera_tokenizer::tokenizer::Tokenizer as LinderaTokenizer; -use once_cell::sync::Lazy; -use tantivy::tokenizer::{SimpleTokenStream, SimpleTokenizer, Token, TokenStream, Tokenizer}; -use whichlang::{Lang, detect_language}; - -// Note(fmassot): we use `lindera_tokenizer::tokenizer::Tokenizer` and not -// `use lindera_tantivy::tokenizer::LinderaTokenizer` to avoid -// costly copy of lindera dictionaries each time we clone the `MultiLangTokenizer`. - -/// Mandarin chinese tokenizer. -static CMN_TOKENIZER: Lazy = Lazy::new(|| { - let cmn_dictionary_config = DictionaryConfig { - kind: Some(DictionaryKind::CcCedict), - path: None, - }; - let cmn_dictionary = load_dictionary_from_config(cmn_dictionary_config) - .expect("Lindera `CcCedict` dictionary must be present"); - LinderaTokenizer::new(cmn_dictionary, None, Mode::Normal) -}); - -/// Japanese tokenizer. -static JPN_TOKENIZER: Lazy = Lazy::new(|| { - let jpn_dictionary_config = DictionaryConfig { - kind: Some(DictionaryKind::IPADIC), - path: None, - }; - let jpn_dictionary = load_dictionary_from_config(jpn_dictionary_config) - .expect("Lindera `IPADIC` dictionary must be present"); - LinderaTokenizer::new(jpn_dictionary, None, Mode::Normal) -}); - -/// Korean tokenizer. 
-static KOR_TOKENIZER: Lazy = Lazy::new(|| { - let kor_dictionary_config = DictionaryConfig { - kind: Some(DictionaryKind::KoDic), - path: None, - }; - let kor_dictionary = load_dictionary_from_config(kor_dictionary_config) - .expect("Lindera `KoDic` dictionary must be present"); - LinderaTokenizer::new(kor_dictionary, None, Mode::Normal) -}); - -/// Multilanguage tokenizer that uses the `whichlang` to detect the language of the text -/// and uses the appropriate tokenizer for the detected language: -/// - lindera for Chinese, Japanese, and Korean. -/// - Quickwit's default tokenizer for other languages. -/// -/// It is possible to bypass the language detection by prefixing the text with the language code -/// followed by a colon. For example, `KOR:일본입니다` will be tokenized by the korean tokenizer. -/// Current supported prefix are: -/// - `KOR:` for Korean tokenizer -/// - `JPN:` for Japanese tokenizer -/// - `CMN:` for Chinese tokenizer -/// - `ENG:` for Quickwit's default tokenizer -#[derive(Clone, Default)] -pub struct MultiLangTokenizer { - default_tokenizer: SimpleTokenizer, - token: Token, -} - -impl Tokenizer for MultiLangTokenizer { - type TokenStream<'a> = MultiLanguageTokenStream<'a>; - fn token_stream<'a>(&'a mut self, text: &'a str) -> MultiLanguageTokenStream<'a> { - self.token.reset(); - let (language_prefix, text_to_tokenize) = get_language_from_prefix(text); - // If the text is empty, we return an empty token stream. - // `whichlang::detect_language` panicks if the text is empty. 
- if text.trim().is_empty() { - return MultiLanguageTokenStream::Empty; - } - let language = language_prefix.unwrap_or_else(|| detect_language(text_to_tokenize)); - match language { - Lang::Cmn => { - let lindera_token_stream = LinderaTokenStream { - tokens: CMN_TOKENIZER - .tokenize(text_to_tokenize) - .expect("tokenize method should never fail"), - token: &mut self.token, - }; - MultiLanguageTokenStream::Lindera(lindera_token_stream) - } - Lang::Jpn => { - let lindera_token_stream = LinderaTokenStream { - tokens: JPN_TOKENIZER - .tokenize(text_to_tokenize) - .expect("tokenize method should never fail"), - token: &mut self.token, - }; - MultiLanguageTokenStream::Lindera(lindera_token_stream) - } - Lang::Kor => { - let lindera_token_stream = LinderaTokenStream { - tokens: KOR_TOKENIZER - .tokenize(text_to_tokenize) - .expect("tokenize method should never fail"), - token: &mut self.token, - }; - MultiLanguageTokenStream::Lindera(lindera_token_stream) - } - _ => MultiLanguageTokenStream::Simple( - self.default_tokenizer.token_stream(text_to_tokenize), - ), - } - } -} - -/// Gets the language defined by a prefix `{ID}:text` where ID being the 3-letter language used by -/// whichlang) and returns the language and the text without the prefix. If the prefix is not -/// recognized, the language is `None` and the text is the original. -fn get_language_from_prefix(text: &str) -> (Option, &str) { - let prefix_bytes = &text.as_bytes()[0..std::cmp::min(4, text.len())]; - // TODO: refactor. - let prefix_language = match prefix_bytes { - b"CMN:" => Some(Lang::Cmn), - b"ENG:" => Some(Lang::Eng), - b"JPN:" => Some(Lang::Jpn), - b"KOR:" => Some(Lang::Kor), - _ => None, - }; - let text_without_prefix = if prefix_language.is_some() { - // This is safe as we know that the prefix is made of 4 ascii characters. - &text[4..] 
- } else { - text - }; - (prefix_language, text_without_prefix) -} -pub enum MultiLanguageTokenStream<'a> { - Empty, - Lindera(LinderaTokenStream<'a>), - Simple(SimpleTokenStream<'a>), -} - -impl TokenStream for MultiLanguageTokenStream<'_> { - fn advance(&mut self) -> bool { - match self { - MultiLanguageTokenStream::Empty => false, - MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.advance(), - MultiLanguageTokenStream::Simple(tokenizer) => tokenizer.advance(), - } - } - - fn token(&self) -> &Token { - match self { - MultiLanguageTokenStream::Empty => { - panic!("Cannot call token() on an empty token stream.") - } - MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.token(), - MultiLanguageTokenStream::Simple(tokenizer) => tokenizer.token(), - } - } - - fn token_mut(&mut self) -> &mut Token { - match self { - MultiLanguageTokenStream::Empty => { - panic!("Cannot call token_mut() on an empty token stream.") - } - MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.token_mut(), - MultiLanguageTokenStream::Simple(tokenizer) => tokenizer.token_mut(), - } - } -} - -pub struct LinderaTokenStream<'a> { - pub tokens: Vec>, - pub token: &'a mut Token, -} - -impl TokenStream for LinderaTokenStream<'_> { - fn advance(&mut self) -> bool { - if self.tokens.is_empty() { - return false; - } - let token = self.tokens.remove(0); - self.token.text = token.text.to_string(); - self.token.offset_from = token.byte_start; - self.token.offset_to = token.byte_end; - self.token.position = token.position; - self.token.position_length = token.position_length; - - true - } - - fn token(&self) -> &Token { - self.token - } - - fn token_mut(&mut self) -> &mut Token { - self.token - } -} - -#[cfg(test)] -mod tests { - use tantivy::tokenizer::{Token, TokenStream, Tokenizer}; - - use super::{MultiLangTokenizer, MultiLanguageTokenStream, get_language_from_prefix}; - - fn test_helper(mut tokenizer: MultiLanguageTokenStream) -> Vec { - let mut tokens: Vec = Vec::new(); - 
tokenizer.process(&mut |token: &Token| tokens.push(token.clone())); - tokens - } - - #[test] - fn test_multilanguage_tokenizer_cmn() { - let mut tokenizer = MultiLangTokenizer::default(); - let tokens = test_helper( - tokenizer.token_stream("地址1,包含無效的字元 (包括符號與不標準的asci阿爾發字元"), - ); - assert_eq!(tokens.len(), 19); - { - let token = &tokens[0]; - assert_eq!(token.text, "地址"); - assert_eq!(token.offset_from, 0); - assert_eq!(token.offset_to, 6); - assert_eq!(token.position, 0); - assert_eq!(token.position_length, 1); - } - } - - #[test] - fn test_multilanguage_tokenizer_jpn() { - let mut tokenizer = MultiLangTokenizer::default(); - { - let tokens = test_helper(tokenizer.token_stream("すもももももももものうち")); - assert_eq!(tokens.len(), 7); - { - let token = &tokens[0]; - assert_eq!(token.text, "すもも"); - assert_eq!(token.offset_from, 0); - assert_eq!(token.offset_to, 9); - assert_eq!(token.position, 0); - assert_eq!(token.position_length, 1); - } - } - { - // Force usage of JPN tokenizer. - let tokens = test_helper(tokenizer.token_stream("JPN:すもももももももものうち")); - assert_eq!(tokens.len(), 7); - } - { - // Force usage of ENG tokenizer. - // This tokenizer will return only one token. - let tokens = test_helper(tokenizer.token_stream("ENG:すもももももももものうち")); - assert_eq!(tokens.len(), 1); - } - } - - #[test] - fn test_multilanguage_tokenizer_kor() { - let mut tokenizer = MultiLangTokenizer::default(); - { - let tokens = test_helper(tokenizer.token_stream("일본입니다. 매우 멋진 단어입니다.")); - assert_eq!(tokens.len(), 11); - { - let token = &tokens[0]; - assert_eq!(token.text, "일본"); - assert_eq!(token.offset_from, 0); - assert_eq!(token.offset_to, 6); - assert_eq!(token.position, 0); - assert_eq!(token.position_length, 1); - } - } - { - let tokens = - test_helper(tokenizer.token_stream("KOR:일본입니다. 
매우 멋진 단어입니다.")); - assert_eq!(tokens.len(), 11); - } - { - let tokens = test_helper(tokenizer.token_stream("ENG:일본입니다")); - assert_eq!(tokens.len(), 1); - } - } - - #[test] - fn test_multilanguage_tokenizer_with_empty_string() { - let mut tokenizer = MultiLangTokenizer::default(); - { - let tokens = test_helper(tokenizer.token_stream("")); - assert_eq!(tokens.len(), 0); - } - { - let tokens = test_helper(tokenizer.token_stream(" ")); - assert_eq!(tokens.len(), 0); - } - } - - #[test] - fn test_multilanguage_process_language_prefix() { - { - let (lang, text) = get_language_from_prefix("JPN:すもももももももものうち"); - assert_eq!(lang, Some(whichlang::Lang::Jpn)); - assert_eq!(text, "すもももももももものうち"); - } - { - let (lang, text) = get_language_from_prefix("CMN:地址1,包含無效的字元"); - assert_eq!(lang, Some(whichlang::Lang::Cmn)); - assert_eq!(text, "地址1,包含無效的字元"); - } - { - let (lang, text) = get_language_from_prefix("ENG:my address"); - assert_eq!(lang, Some(whichlang::Lang::Eng)); - assert_eq!(text, "my address"); - } - { - let (lang, text) = get_language_from_prefix("UNK:my address"); - assert!(lang.is_none()); - assert_eq!(text, "UNK:my address"); - } - { - let (lang, text) = get_language_from_prefix(""); - assert!(lang.is_none()); - assert_eq!(text, ""); - } - } -} diff --git a/quickwit/quickwit-rest-client/src/models.rs b/quickwit/quickwit-rest-client/src/models.rs index 2857495803f..cfe570058bd 100644 --- a/quickwit/quickwit-rest-client/src/models.rs +++ b/quickwit/quickwit-rest-client/src/models.rs @@ -83,7 +83,6 @@ pub struct SearchResponseRestClient { pub hits: Vec, pub snippets: Option>, pub elapsed_time_micros: u64, - pub errors: Vec, pub aggregations: Option, } diff --git a/quickwit/quickwit-rest-client/src/rest_client.rs b/quickwit/quickwit-rest-client/src/rest_client.rs index 1fb2b5c9812..ab51a3fd1dd 100644 --- a/quickwit/quickwit-rest-client/src/rest_client.rs +++ b/quickwit/quickwit-rest-client/src/rest_client.rs @@ -25,7 +25,7 @@ use quickwit_proto::ingest::Shard; use 
quickwit_serve::{ ListSplitsQueryParams, ListSplitsResponse, RestIngestResponse, SearchRequestQueryString, }; -use reqwest::header::{CONTENT_TYPE, HeaderMap, HeaderValue}; +use reqwest::header::{CONTENT_TYPE, HeaderMap, HeaderValue, USER_AGENT}; use reqwest::tls::Certificate; use reqwest::{ClientBuilder as ReqwestClientBuilder, Method, StatusCode, Url}; use reqwest_middleware::{ClientBuilder as ReqwestMiddlewareClientBuilder, ClientWithMiddleware}; @@ -112,6 +112,7 @@ impl Transport { } let mut request_headers = HeaderMap::new(); request_headers.insert(CONTENT_TYPE, HeaderValue::from_static(DEFAULT_CONTENT_TYPE)); + request_headers.insert(USER_AGENT, HeaderValue::from_static("qw-rest-client")); if let Some(header_map_val) = header_map { request_headers.extend(header_map_val.into_iter()); } @@ -292,6 +293,10 @@ impl QuickwitClient { ClusterClient::new(&self.transport, self.timeout) } + pub fn maintenance(&self) -> MaintenanceClient<'_> { + MaintenanceClient::new(&self.transport, self.timeout) + } + pub fn node_stats(&self) -> NodeStatsClient<'_> { NodeStatsClient::new(&self.transport, self.timeout) } @@ -780,6 +785,79 @@ impl<'a> NodeHealthClient<'a> { } } +/// Response from the maintenance status endpoint. +#[derive(Debug, serde::Deserialize)] +pub struct MaintenanceStatusResponse { + pub is_maintenance_mode: bool, + pub enabled_at: Option, +} + +/// Response from the enable maintenance endpoint. +#[derive(Debug, serde::Deserialize)] +pub struct EnableMaintenanceResponse { + pub frozen_plan_json: String, +} + +/// Client for maintenance mode APIs. 
+pub struct MaintenanceClient<'a> { + transport: &'a Transport, + timeout: Timeout, +} + +impl<'a> MaintenanceClient<'a> { + fn new(transport: &'a Transport, timeout: Timeout) -> Self { + Self { transport, timeout } + } + + pub async fn status(&self) -> Result { + let response = self + .transport + .send::<()>( + Method::GET, + "cluster/maintenance", + None, + None, + None, + self.timeout, + ) + .await?; + let status = response.deserialize().await?; + Ok(status) + } + + pub async fn enable(&self) -> Result { + let response = self + .transport + .send::<()>( + Method::PUT, + "cluster/maintenance", + None, + None, + None, + self.timeout, + ) + .await?; + let result = response.deserialize().await?; + Ok(result) + } + + pub async fn disable(&self) -> Result<(), Error> { + let response = self + .transport + .send::<()>( + Method::DELETE, + "cluster/maintenance", + None, + None, + None, + self.timeout, + ) + .await?; + response.check().await?; + Ok(()) + } +} + fn header_from_config_format(config_format: ConfigFormat) -> HeaderMap { let mut header_map = HeaderMap::new(); let content_type_value = format!("application/{}", config_format.as_str()); @@ -842,7 +920,6 @@ mod test { snippets: None, aggregations: None, elapsed_time_micros: 100, - errors: Vec::new(), }; Mock::given(method("POST")) .and(path("/api/v1/my-index/search")) diff --git a/quickwit/quickwit-search/src/client.rs b/quickwit/quickwit-search/src/client.rs index 194bf0b2bd0..628325b2efc 100644 --- a/quickwit/quickwit-search/src/client.rs +++ b/quickwit/quickwit-search/src/client.rs @@ -18,6 +18,7 @@ use std::sync::Arc; use std::time::Duration; use bytesize::ByteSize; +use futures::TryStreamExt; use http::Uri; use quickwit_proto::search::{GetKvRequest, PutKvRequest, ReportSplitsRequest}; use quickwit_proto::tonic::Request; @@ -151,12 +152,24 @@ impl SearchServiceClient { ) -> crate::Result { match &mut self.client_impl { SearchServiceClientImpl::Grpc(grpc_client) => { + let nb_docs_fetched = 
request.partial_hits.len(); let tonic_request = Request::new(request); - let tonic_response = grpc_client - .fetch_docs(tonic_request) + let all_hits = grpc_client + .stream_fetch_docs(tonic_request) .await - .map_err(|tonic_error| parse_grpc_error(&tonic_error))?; - Ok(tonic_response.into_inner()) + .map_err(|tonic_error| parse_grpc_error(&tonic_error))? + .into_inner() + // TODO stream item errors are all collapsed into SearchError::Internal + .map_err(|tonic_error| parse_grpc_error(&tonic_error)) + .try_fold( + Vec::with_capacity(nb_docs_fetched), + |mut acc, response| async move { + acc.extend(response.hits); + Ok(acc) + }, + ) + .await?; + Ok(quickwit_proto::search::FetchDocsResponse { hits: all_hits }) } SearchServiceClientImpl::Local(service) => service.fetch_docs(request).await, } diff --git a/quickwit/quickwit-search/src/cluster_client.rs b/quickwit/quickwit-search/src/cluster_client.rs index 79f6ba81702..461ffae7521 100644 --- a/quickwit/quickwit-search/src/cluster_client.rs +++ b/quickwit/quickwit-search/src/cluster_client.rs @@ -26,7 +26,9 @@ use tracing::{debug, error, info, warn}; use crate::retry::search::LeafSearchRetryPolicy; use crate::retry::{DefaultRetryPolicy, RetryPolicy, retry_client}; -use crate::{SearchJobPlacer, SearchServiceClient, merge_resource_stats_it}; +use crate::{ + SearchJobPlacer, SearchServiceClient, merge_resource_stats_it, merge_splits_by_outcome_it, +}; /// Maximum number of put requests emitted to perform a replicated given PUT KV. const MAX_PUT_KV_ATTEMPTS: usize = 6; @@ -260,10 +262,17 @@ fn merge_original_with_retry_leaf_search_response( (Some(left), None) => Some(left), (None, None) => None, }; + let resource_stats = merge_resource_stats_it([ &original_response.resource_stats, &retry_response.resource_stats, ]); + // Splits with "cancel_" outcome will be retried, so they will typically be + // counted twice in the outcome breakdown. 
+ let splits_by_outcome = merge_splits_by_outcome_it([ + original_response.splits_by_outcome, + retry_response.splits_by_outcome, + ]); Ok(LeafSearchResponse { intermediate_aggregation_result, num_hits: original_response.num_hits + retry_response.num_hits, @@ -274,6 +283,7 @@ fn merge_original_with_retry_leaf_search_response( num_successful_splits: original_response.num_successful_splits + retry_response.num_successful_splits, resource_stats, + splits_by_outcome, }) } @@ -328,6 +338,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }], ..Default::default() } @@ -355,6 +366,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }, SplitIdAndFooterOffsets { split_id: "split_2".to_string(), @@ -363,6 +375,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }, ], }], diff --git a/quickwit/quickwit-search/src/collector.rs b/quickwit/quickwit-search/src/collector.rs index ed21fd968ba..5b329c99913 100644 --- a/quickwit/quickwit-search/src/collector.rs +++ b/quickwit/quickwit-search/src/collector.rs @@ -16,12 +16,15 @@ use std::borrow::Cow; use std::cmp::Ordering; use std::collections::HashSet; -use itertools::Itertools; +use itertools::{Either, Itertools}; use quickwit_common::binary_heap::{SortKeyMapper, TopK}; +use quickwit_common::numeric_types::num_proj::{ + ProjectedNumber, f64_to_i64, f64_to_u64, i64_to_f64, i64_to_u64, u64_to_f64, u64_to_i64, +}; use quickwit_doc_mapper::{FastFieldWarmupInfo, WarmupInfo}; use quickwit_proto::search::{ LeafSearchResponse, PartialHit, ResourceStats, SearchRequest, SortByValue, SortOrder, - SortValue, SplitSearchError, + SortValue, SplitSearchError, SplitsByOutcome, TypeSortKey, }; use quickwit_proto::types::SplitId; use serde::Deserialize; @@ -29,14 +32,22 @@ use tantivy::aggregation::agg_req::{Aggregations, get_fast_field_names}; use 
tantivy::aggregation::intermediate_agg_result::IntermediateAggregationResults; use tantivy::aggregation::{AggContextParams, AggregationLimitsGuard, AggregationSegmentCollector}; use tantivy::collector::{Collector, SegmentCollector}; -use tantivy::columnar::{ColumnType, MonotonicallyMappableToU64}; +use tantivy::columnar::{ + ColumnIndex, ColumnType, MonotonicallyMappableToU64, StrColumn, TermOrdHit, +}; use tantivy::fastfield::Column; use tantivy::tokenizer::TokenizerManager; -use tantivy::{DateTime, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError}; +use tantivy::{ + COLLECT_BLOCK_BUFFER_LEN, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError, +}; use crate::find_trace_ids_collector::{FindTraceIdsCollector, FindTraceIdsSegmentCollector, Span}; -use crate::top_k_collector::{QuickwitSegmentTopKCollector, specialized_top_k_segment_collector}; -use crate::{GlobalDocAddress, merge_resource_stats, merge_resource_stats_it}; +use crate::sort_repr::{ElidableU64, InternalSortValueRepr, InternalValueRepr}; +use crate::top_k_collector::QuickwitSegmentTopKCollector; +use crate::{ + GlobalDocAddress, merge_resource_stats, merge_resource_stats_it, merge_splits_by_outcome, + merge_splits_by_outcome_it, +}; #[derive(Clone, Debug)] pub(crate) enum SortByComponent { @@ -51,30 +62,7 @@ pub(crate) enum SortByComponent { order: SortOrder, }, } -impl From for SortByPair { - fn from(value: SortByComponent) -> Self { - Self { - first: value, - second: None, - } - } -} -#[derive(Clone)] -pub(crate) struct SortByPair { - first: SortByComponent, - second: Option, -} -impl SortByPair { - pub fn sort_orders(&self) -> (SortOrder, SortOrder) { - ( - self.first.sort_order(), - self.second - .as_ref() - .map(|sort_by| sort_by.sort_order()) - .unwrap_or(SortOrder::Desc), - ) - } -} + impl SortByComponent { fn to_sorting_field_extractor_component( &self, @@ -83,19 +71,48 @@ impl SortByComponent { match self { SortByComponent::DocId { .. 
} => Ok(SortingFieldExtractorComponent::DocId), SortByComponent::FastField { field_name, .. } => { - let sort_column_opt: Option<(Column, ColumnType)> = - segment_reader.fast_fields().u64_lenient(field_name)?; - let (sort_column, column_type) = sort_column_opt.unwrap_or_else(|| { - ( - Column::build_empty_column(segment_reader.max_doc()), - ColumnType::U64, - ) - }); - let sort_field_type = SortFieldType::try_from(column_type)?; - Ok(SortingFieldExtractorComponent::FastField { - sort_column, - sort_field_type, - }) + let allowed_column_types = [ + ColumnType::I64, + ColumnType::U64, + ColumnType::F64, + ColumnType::Str, + ColumnType::DateTime, + ColumnType::Bool, + // ColumnType::IpAddr Unsupported + // ColumnType::Bytes Unsupported + ]; + let fast_fields = segment_reader.fast_fields(); + let mut sort_columns = fast_fields + .u64_lenient_for_type_all(Some(&allowed_column_types), field_name)? + .into_iter() + .map(|(col, col_typ)| match col_typ { + ColumnType::U64 => Ok((col, SortFieldType::U64)), + ColumnType::I64 => Ok((col, SortFieldType::I64)), + ColumnType::F64 => Ok((col, SortFieldType::F64)), + ColumnType::DateTime => Ok((col, SortFieldType::DateTime)), + ColumnType::Bool => Ok((col, SortFieldType::Bool)), + ColumnType::Str => Ok(( + col, + SortFieldType::String( + fast_fields + .str(field_name)? + .expect("field with str column type should have str column"), + ), + )), + _ => panic!("unsupported"), + }) + .collect::>>()?; + + sort_columns.sort_by_key(|(_, col_typ)| col_typ.type_sort_key()); + + // TODO we could skip the columns that are before the search after + + Ok(SortingFieldExtractorComponent::FastField( + FastFieldExtractor { + sort_columns, + col_scratch: Box::new([None; COLLECT_BLOCK_BUFFER_LEN]), + }, + )) } SortByComponent::Score { .. 
} => Ok(SortingFieldExtractorComponent::Score), } @@ -125,347 +142,568 @@ impl SortByComponent { } } -#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[derive(Clone)] +pub(crate) struct SortByPair { + first: SortByComponent, + second: Option, +} +impl SortByPair { + pub fn sort_orders(&self) -> (SortOrder, SortOrder) { + ( + self.first.sort_order(), + self.second + .as_ref() + .map(|sort_by| sort_by.sort_order()) + .unwrap_or(SortOrder::Desc), + ) + } +} + +#[derive(Clone, Debug)] pub(crate) enum SortFieldType { U64, I64, F64, DateTime, Bool, + String(StrColumn), +} + +impl SortFieldType { + fn type_sort_key(&self) -> TypeSortKey { + match self { + SortFieldType::U64 => TypeSortKey::Numeric, + SortFieldType::I64 => TypeSortKey::Numeric, + SortFieldType::F64 => TypeSortKey::Numeric, + SortFieldType::DateTime => TypeSortKey::DateTime, + SortFieldType::Bool => TypeSortKey::Boolean, + SortFieldType::String(_) => TypeSortKey::Str, + } + } +} + +struct FastFieldExtractor { + /// Sort columns are sorted in the same order as types (TypeSortKey) + sort_columns: Vec<(Column, SortFieldType)>, + col_scratch: Box<[Option; COLLECT_BLOCK_BUFFER_LEN]>, +} + +impl FastFieldExtractor { + fn fill_batch( + &mut self, + docs: &[DocId], + order: SortOrder, + out: &mut [InternalValueRepr], + ) { + let n = docs.len(); + let unique_column = &self.sort_columns[0].0; + if let ColumnIndex::Multivalued(_) = unique_column.index { + // TODO: first_vals() doesn't enforce zeroing for multivalued + // columns. It seems like something that should be fixed in Tantivy? 
+ self.col_scratch[..n].fill(None); + } + self.sort_columns[0] + .0 + .first_vals(docs, &mut self.col_scratch[..n]); + for (repr, val_opt) in out[..n].iter_mut().zip(self.col_scratch[..n].iter()) { + *repr = match val_opt { + Some(val) => InternalValueRepr::new(*val, 0, order), + None => InternalValueRepr::new_missing(), + }; + } + } } /// The `SortingFieldExtractor` is used to extract a score, which can either be a true score, /// a value from a fast field, or nothing (sort by DocId). -pub(crate) enum SortingFieldExtractorComponent { +enum SortingFieldExtractorComponent { /// If undefined, we simply sort by DocIds. DocId, - FastField { - sort_column: Column, - sort_field_type: SortFieldType, - }, + FastField(FastFieldExtractor), Score, } impl SortingFieldExtractorComponent { - pub fn is_score(&self) -> bool { - matches!(self, SortingFieldExtractorComponent::Score) + pub fn is_doc_id(&self) -> bool { + matches!(self, SortingFieldExtractorComponent::DocId) } - pub fn is_fast_field(&self) -> bool { - matches!(self, SortingFieldExtractorComponent::FastField { .. }) - } - /// Loads the fast field values for the given doc_ids in its u64 representation. The returned - /// u64 representation maintains the ordering of the original value. - #[inline] - pub fn extract_typed_sort_values_block(&self, doc_ids: &[DocId], values: &mut [Option]) { - // In the collect block case we don't have scores to extract - if let SortingFieldExtractorComponent::FastField { sort_column, .. } = self { - let values = &mut values[..doc_ids.len()]; - sort_column.first_vals(doc_ids, values); + + /// Currently batch extraction only has a fast path for full columns. That + /// can only happen if there is only one column for the fast field. 
+ fn extractor_for_batch_if_worthwhile(&mut self) -> Option<&mut FastFieldExtractor> { + match self { + SortingFieldExtractorComponent::FastField(extractor) + if extractor.sort_columns.len() == 1 => + { + Some(extractor) + } + _ => None, } } - /// Returns the sort value for the given element in its u64 representation. The returned u64 - /// representation maintains the ordering of the original value. - /// - /// The function returns None if the sort key is a fast field, for which we have no value - /// for the given doc_id, or we sort by DocId. + /// Returns the sort value for the given element in its u64 representation. + /// The returned u64 representation maintains the ordering of the original + /// value. #[inline] - fn extract_typed_sort_value_opt(&self, doc_id: DocId, score: Score) -> Option { + fn project_to_internal_sort_value( + &self, + doc_id: DocId, + score: Score, + order: SortOrder, + ) -> InternalValueRepr { match self { - // Tie breaks are not handled here, but in SegmentPartialHit - SortingFieldExtractorComponent::DocId => None, - SortingFieldExtractorComponent::FastField { sort_column, .. } => { - sort_column.first(doc_id) + SortingFieldExtractorComponent::DocId => { + // Doc id is handled at the compound sort value level + debug_assert!(V::is_elided()); + InternalValueRepr::new_missing() + } + SortingFieldExtractorComponent::FastField(FastFieldExtractor { + sort_columns, .. + }) => { + for (idx, (sort_column, _)) in sort_columns.iter().enumerate() { + if let Some(value) = sort_column.first(doc_id) { + return InternalValueRepr::new(value, idx as u8, order); + } + } + InternalValueRepr::new_missing() + } + SortingFieldExtractorComponent::Score => { + InternalValueRepr::new((score as f64).to_u64(), 0, order) } - SortingFieldExtractorComponent::Score => Some((score as f64).to_u64()), } } - #[inline] - /// Converts u64 fast field values to its correct type. - /// The conversion is delayed for performance reasons. 
- /// - /// This is used to convert `search_after` sort value to a u64 representation that will respect - /// the same order as the `SortValue` representation. - pub fn convert_u64_ff_val_to_sort_value(&self, sort_value: u64) -> SortValue { - let map_fast_field_to_value = |fast_field_value, field_type| match field_type { - SortFieldType::U64 => SortValue::U64(fast_field_value), - SortFieldType::I64 => SortValue::I64(i64::from_u64(fast_field_value)), - SortFieldType::F64 => SortValue::F64(f64::from_u64(fast_field_value)), - SortFieldType::DateTime => SortValue::I64(i64::from_u64(fast_field_value)), - SortFieldType::Bool => SortValue::Boolean(fast_field_value != 0u64), - }; - match self { - SortingFieldExtractorComponent::DocId => SortValue::U64(sort_value), - SortingFieldExtractorComponent::FastField { - sort_field_type, .. - } => map_fast_field_to_value(sort_value, *sort_field_type), - SortingFieldExtractorComponent::Score => SortValue::F64(f64::from_u64(sort_value)), + fn project_from_internal_sort_value( + &self, + internal_repr: InternalValueRepr, + order: SortOrder, + ) -> tantivy::Result> { + if V::is_elided() { + return Ok(None); } + let Some((col_idx, val_as_u64)) = internal_repr.decode(order) else { + return Ok(Some(SortByValue { sort_value: None })); + }; + let sort_value = match self { + SortingFieldExtractorComponent::FastField(FastFieldExtractor { + sort_columns, .. 
+ }) => { + let (_, field_type) = &sort_columns[col_idx as usize]; + match field_type { + SortFieldType::U64 => SortValue::U64(val_as_u64), + SortFieldType::I64 => SortValue::I64(i64::from_u64(val_as_u64)), + SortFieldType::F64 => SortValue::F64(f64::from_u64(val_as_u64)), + SortFieldType::DateTime => SortValue::Datetime(i64::from_u64(val_as_u64)), + SortFieldType::Bool => SortValue::Boolean(val_as_u64 != 0u64), + SortFieldType::String(str_column) => { + let term_dict = str_column.dictionary(); + let mut buffer = Vec::new(); + term_dict.ord_to_term(val_as_u64, &mut buffer)?; + let string_value = String::from_utf8(buffer).map_err(|_| { + tantivy::TantivyError::InternalError( + "term dictionary contains non-UTF-8 bytes".to_string(), + ) + })?; + SortValue::Str(string_value) + } + } + } + SortingFieldExtractorComponent::Score => SortValue::F64(f64::from_u64(val_as_u64)), + SortingFieldExtractorComponent::DocId => { + return Err(tantivy::TantivyError::InternalError( + "value should be elided on doc id sort".to_string(), + )); + } + }; + Ok(Some(SortByValue { + sort_value: Some(sort_value), + })) } - /// Converts fast field values into their u64 fast field representation. - /// - /// Returns None if value is out of bounds of target value. - /// None means that the search_after will be disabled and everything matches. - /// - /// What's currently missing is to signal that _nothing_ matches to generate an optimized - /// query. For now we just choose the max value of the target type. - #[inline] - pub fn convert_to_u64_ff_val( + + fn project_to_internal_search_after( &self, - sort_value: SortValue, + sort_by_value: &SortByValue, sort_order: SortOrder, - ) -> Option { - match self { - SortingFieldExtractorComponent::DocId => match sort_value { - SortValue::U64(val) => Some(val), - _ => panic!("Internal error: Got non-U64 sort value for DocId."), - }, - SortingFieldExtractorComponent::FastField { - sort_field_type, .. 
- } => { - // We need to convert a (potential user provided) value in the correct u64 - // representation of the fast field. - // This requires this weird conversion of first casting into the target type - // (if possible) and then to its u64 presentation. - // - // For the conversion into the target type it's important to know if the target - // type does not cover the whole range of the source type. In that case we need to - // add additional conversion checks, to see if it matches everything - // or nothing. (Which also depends on the sort order). - // Below are the visual representations of the value ranges of the different types. - // Note: DateTime is equal to I64 and omitted. - // - // Bool value range (0, 1): - // <-> - // - // I64 value range (signed 64-bit integer): - // <------------------------------------> - // -2^63 2^63-1 - // U64 value range (unsigned 64-bit integer): - // <------------------------------------> - // 0 2^64-1 - // F64 value range (64-bit floating point, conceptual, not to scale): - // <--------------------------------------------------------------------> - // Very negative numbers Very positive numbers - // - // Those conversions have limited target type value space: - // - [X] U64 -> I64 - // - [X] F64 -> I64 - // - [X] I64 -> U64 - // - [X] F64 -> U64 - // - // - [X] F64 -> Bool - // - [X] I64 -> Bool - // - [X] U64 -> Bool - // - let val = match (sort_value, sort_field_type) { - // Same field type, no conversion needed. - (SortValue::U64(val), SortFieldType::U64) => val, - (SortValue::F64(val), SortFieldType::F64) => val.to_u64(), - (SortValue::Boolean(val), SortFieldType::Bool) => val.to_u64(), - (SortValue::I64(val), SortFieldType::I64) => val.to_u64(), - (SortValue::U64(mut val), SortFieldType::I64) => { - if sort_order == SortOrder::Desc && val > i64::MAX as u64 { - return None; - } - // Add a limit to avoid overflow. 
- val = val.min(i64::MAX as u64); - (val as i64).to_u64() - } - (SortValue::U64(val), SortFieldType::F64) => (val as f64).to_u64(), - (SortValue::U64(mut val), SortFieldType::DateTime) => { - // Match everything - if sort_order == SortOrder::Desc && val > i64::MAX as u64 { - return None; - } - // Add a limit to avoid overflow. - val = val.min(i64::MAX as u64); - DateTime::from_timestamp_nanos(val as i64).to_u64() - } - (SortValue::I64(val), SortFieldType::U64) => { - if val < 0 && sort_order == SortOrder::Asc { - return None; - } - if val < 0 && sort_order == SortOrder::Desc { - u64::MIN // matches nothing as search_after is not inclusive - } else { - val as u64 - } - } - (SortValue::I64(val), SortFieldType::F64) => (val as f64).to_u64(), - (SortValue::I64(val), SortFieldType::DateTime) => { - DateTime::from_timestamp_nanos(val).to_u64() - } - (SortValue::F64(val), SortFieldType::U64) => { - let all_values_ahead1 = - val < u64::MIN as f64 && sort_order == SortOrder::Asc; - let all_values_ahead2 = - val > u64::MAX as f64 && sort_order == SortOrder::Desc; - if all_values_ahead1 || all_values_ahead2 { - return None; - } - // f64 cast already handles under/overflow and clamps the value - (val as u64).to_u64() - } - (SortValue::F64(val), SortFieldType::I64) - | (SortValue::F64(val), SortFieldType::DateTime) => { - let all_values_ahead1 = - val < i64::MIN as f64 && sort_order == SortOrder::Asc; - let all_values_ahead2 = - val > i64::MAX as f64 && sort_order == SortOrder::Desc; - if all_values_ahead1 || all_values_ahead2 { - return None; - } - // f64 cast already handles under/overflow and clamps the value - let val_i64 = val as i64; + ) -> tantivy::Result> { + let SortByValue { + sort_value: sort_value_opt, + } = sort_by_value; + match (self, sort_value_opt) { + (SortingFieldExtractorComponent::DocId, _) => { + // Doc id sorts are handled at the compound sort value level + debug_assert!(V::is_elided()); + Ok(InternalValueRepr::new_missing()) + } + 
(SortingFieldExtractorComponent::FastField(_), None) => { + Ok(InternalValueRepr::new_missing()) + } + ( + SortingFieldExtractorComponent::FastField(FastFieldExtractor { + sort_columns, .. + }), + Some(sort_value), + ) => project_search_after_sort_value(sort_columns, sort_value, sort_order), + (SortingFieldExtractorComponent::Score, Some(SortValue::F64(val))) => { + Ok(InternalValueRepr::new(val.to_u64(), 0, sort_order)) + } + (SortingFieldExtractorComponent::Score, _) => { + Err(tantivy::TantivyError::InvalidArgument( + "got non-F64 sort value for score".to_string(), + )) + } + } + } +} - if *sort_field_type == SortFieldType::DateTime { - DateTime::from_timestamp_nanos(val_i64).to_u64() - } else { - val_i64.to_u64() - } - } - // Not sure when we hit this, it's probably are very rare case. - (SortValue::Boolean(val), SortFieldType::U64) => val as u64, - (SortValue::Boolean(val), SortFieldType::F64) => (val as u64 as f64).to_u64(), - (SortValue::Boolean(val), SortFieldType::I64) => (val as i64).to_u64(), - (SortValue::Boolean(val), SortFieldType::DateTime) => { - DateTime::from_timestamp_nanos(val as i64).to_u64() +fn projected_number_internal_repr( + projected: ProjectedNumber, + order: SortOrder, + accessor_idx: u8, +) -> InternalValueRepr { + match (projected, order) { + (ProjectedNumber::Exact(val), _) => { + InternalValueRepr::new(val.to_u64(), accessor_idx, order) + } + (ProjectedNumber::AfterLast, SortOrder::Asc) => { + InternalValueRepr::new_skip_column(accessor_idx, order) + } + (ProjectedNumber::AfterLast, SortOrder::Desc) => { + InternalValueRepr::new_keep_column(accessor_idx, order) + } + (ProjectedNumber::Next(val), SortOrder::Asc) => { + let val_u64 = val.to_u64(); + if val_u64 == 0 { + InternalValueRepr::new_keep_column(accessor_idx, order) + } else { + InternalValueRepr::new(val_u64 - 1, accessor_idx, order) + } + } + (ProjectedNumber::Next(val), SortOrder::Desc) => { + let val_u64 = val.to_u64(); + if val_u64 == 0 { + 
InternalValueRepr::new_skip_column(accessor_idx, order) + } else { + InternalValueRepr::new(val_u64, accessor_idx, order) + } + } + } +} + +fn project_search_after_sort_value( + sort_columns: &[(Column, SortFieldType)], + sort_value: &SortValue, + sort_order: SortOrder, +) -> tantivy::Result> { + let col_iter = match sort_order { + SortOrder::Asc => Either::Left(sort_columns.iter().enumerate()), + SortOrder::Desc => Either::Right(sort_columns.iter().enumerate().rev()), + }; + for (idx, sort_column) in col_iter { + let internal_repr = match (&sort_column.1, sort_value) { + // project to u64 column + (SortFieldType::U64, SortValue::U64(val)) => { + InternalValueRepr::new(*val, idx as u8, sort_order) + } + (SortFieldType::U64, SortValue::F64(val)) => { + projected_number_internal_repr(f64_to_u64(*val), sort_order, idx as u8) + } + (SortFieldType::U64, SortValue::I64(val)) => { + projected_number_internal_repr(i64_to_u64(*val), sort_order, idx as u8) + } + // project to i64 column + (SortFieldType::I64, SortValue::I64(val)) => { + InternalValueRepr::new(val.to_u64(), idx as u8, sort_order) + } + (SortFieldType::I64, SortValue::F64(val)) => { + projected_number_internal_repr(f64_to_i64(*val), sort_order, idx as u8) + } + (SortFieldType::I64, SortValue::U64(val)) => { + projected_number_internal_repr(u64_to_i64(*val), sort_order, idx as u8) + } + // project to f64 column + (SortFieldType::F64, SortValue::F64(val)) => { + InternalValueRepr::new(val.to_u64(), idx as u8, sort_order) + } + (SortFieldType::F64, SortValue::I64(val)) => { + projected_number_internal_repr(i64_to_f64(*val), sort_order, idx as u8) + } + (SortFieldType::F64, SortValue::U64(val)) => { + projected_number_internal_repr(u64_to_f64(*val), sort_order, idx as u8) + } + // other types + (SortFieldType::DateTime, SortValue::Datetime(val)) => { + InternalValueRepr::new(val.to_u64(), idx as u8, sort_order) + } + (SortFieldType::Bool, SortValue::Boolean(val)) => { + InternalValueRepr::new(val.to_u64(), idx as 
u8, sort_order) + } + (SortFieldType::String(str_column), SortValue::Str(val)) => { + let term_dict = str_column.dictionary(); + let hit = term_dict.term_ord_or_next(val.as_str().as_bytes())?; + match (hit, sort_order) { + (TermOrdHit::Exact(ord), _) => { + InternalValueRepr::new(ord, idx as u8, sort_order) } - (SortValue::U64(mut val), SortFieldType::Bool) => { - let all_values_ahead1 = val > 1 && sort_order == SortOrder::Desc; - if all_values_ahead1 { - return None; - } - // clamp value for comparison - val = val.clamp(0, 1); - (val == 1).to_u64() + (TermOrdHit::Next(ord), SortOrder::Desc) => { + InternalValueRepr::new(ord, idx as u8, sort_order) } - (SortValue::I64(mut val), SortFieldType::Bool) => { - let all_values_ahead1 = val > 1 && sort_order == SortOrder::Desc; - let all_values_ahead2 = val < 0 && sort_order == SortOrder::Asc; - if all_values_ahead1 || all_values_ahead2 { - return None; - } - // clamp value for comparison - val = val.clamp(0, 1); - (val == 1).to_u64() + (TermOrdHit::Next(0), SortOrder::Asc) => { + InternalValueRepr::new_keep_column(idx as u8, sort_order) } - (SortValue::F64(mut val), SortFieldType::Bool) => { - let all_values_ahead1 = val > 1.0 && sort_order == SortOrder::Desc; - let all_values_ahead2 = val < 0.0 && sort_order == SortOrder::Asc; - if all_values_ahead1 || all_values_ahead2 { - return None; - } - val = val.clamp(0.0, 1.0); - (val >= 0.5).to_u64() // Is this correct? 
+ (TermOrdHit::Next(ord), SortOrder::Asc) => { + InternalValueRepr::new(ord - 1, idx as u8, sort_order) } + } + } + // unsupported mixed types + // + // TODO: we need a strongly typed pagination API to support JSON + // fields with datetime and schema evolutions + ( + SortFieldType::I64 | SortFieldType::U64 | SortFieldType::F64, + SortValue::Datetime(_), + ) => { + return Err(TantivyError::SchemaError( + "search after not supported for schema updates to datetime".to_string(), + )); + } + ( + SortFieldType::DateTime, + SortValue::I64(_) | SortValue::U64(_) | SortValue::F64(_), + ) => { + return Err(TantivyError::SchemaError( + "search after not supported on multi-typed fields with datetime".to_string(), + )); + } + // supported mixed types + (sort_field_type, sort_value) => { + let column_key = sort_field_type.type_sort_key(); + let value_key = sort_value.type_sort_key(); + debug_assert_ne!(column_key, value_key); + let column_comes_after = match sort_order { + SortOrder::Desc => column_key < value_key, + SortOrder::Asc => column_key > value_key, }; - Some(val) + if column_comes_after { + InternalValueRepr::new_keep_column(idx as u8, sort_order) + } else { + continue; + } } - SortingFieldExtractorComponent::Score => match sort_value { - SortValue::F64(val) => Some(val.to_u64()), - _ => panic!("Internal error: Got non-F64 sort value for Score."), - }, - } + }; + return Ok(internal_repr); } + Ok(InternalValueRepr::new_skip_all_but_missing()) } -impl From for SortingFieldExtractorPair { - fn from(value: SortingFieldExtractorComponent) -> Self { - Self { - first: value, - second: None, - } - } +pub(crate) struct SortingFieldExtractorPair { + first: SortingFieldExtractorComponent, + second: Option, + first_order: SortOrder, + second_order: SortOrder, + sort1_scratch: Box<[InternalValueRepr; COLLECT_BLOCK_BUFFER_LEN]>, + sort2_scratch: Box<[InternalValueRepr; COLLECT_BLOCK_BUFFER_LEN]>, } -pub(crate) struct SortingFieldExtractorPair { - pub first: 
SortingFieldExtractorComponent, - pub second: Option, -} +impl SortingFieldExtractorPair { + fn doc_id_sort_order(&self) -> SortOrder { + if self.first.is_doc_id() { + self.first_order + } else if let Some(second) = &self.second + && second.is_doc_id() + { + self.second_order + } else { + // TODO this is the current behavior which is weird. QW docs for the + // native search API advertise that the sort order by default is + // reverse(doc_id). In ES _shard_doc is supposed to be always ascending. + self.first_order + } + } -impl SortingFieldExtractorPair { - pub fn is_score(&self) -> bool { - self.first.is_score() - || self - .second + pub(crate) fn search_after_from_partial_hit( + &self, + split_id: &SplitId, + segment_ord: SegmentOrdinal, + partial_hit: &PartialHit, + ) -> tantivy::Result> { + let sort_1 = if let Some(sort_by_value) = &partial_hit.sort_value { + self.first + .project_to_internal_search_after(sort_by_value, self.first_order)? + } else { + InternalValueRepr::new_missing() + }; + let sort_2 = if let Some(sort_by_value) = &partial_hit.sort_value2 { + self.second .as_ref() - .map(|second| second.is_score()) - .unwrap_or(false) + .ok_or_else(|| { + TantivyError::InvalidArgument( + "search after has 2 values but there is only 1 sort dimension".to_string(), + ) + })? + .project_to_internal_search_after(sort_by_value, self.second_order)? + } else { + InternalValueRepr::new_missing() + }; + + let internal_repr = if partial_hit.split_id.is_empty() { + // When split_id is empty, the search_after is a pure sort-value + // boundary (no doc position), any doc with the same sort value must be + // excluded otherwise we risk iterating over and over through the same + // documents. 
+ InternalSortValueRepr::new_skip_doc_ids(sort_1, sort_2) + } else { + let split_cmp = split_id + .as_str() + .cmp(partial_hit.split_id.as_str()) + .then(segment_ord.cmp(&partial_hit.segment_ord)); + match (split_cmp, self.doc_id_sort_order()) { + (Ordering::Less, SortOrder::Asc) | (Ordering::Greater, SortOrder::Desc) => { + InternalSortValueRepr::new_skip_doc_ids(sort_1, sort_2) + } + (Ordering::Less, SortOrder::Desc) | (Ordering::Greater, SortOrder::Asc) => { + InternalSortValueRepr::new_keep_doc_ids(sort_1, sort_2) + } + (Ordering::Equal, doc_id_order) => { + InternalSortValueRepr::new(sort_1, sort_2, partial_hit.doc_id, doc_id_order) + } + } + }; + Ok(internal_repr) } - /// Returns the list of sort values for the given element - /// - /// See also [`SortingFieldExtractorComponent::extract_typed_sort_values_block`] for more - /// information. - #[inline] - pub(crate) fn extract_typed_sort_values( + + pub(crate) fn internal_to_partial_hit( &self, - doc_ids: &[DocId], - values1: &mut [Option], - values2: &mut [Option], - ) { - self.first - .extract_typed_sort_values_block(doc_ids, &mut values1[..doc_ids.len()]); - if let Some(second) = self.second.as_ref() { - second.extract_typed_sort_values_block(doc_ids, &mut values2[..doc_ids.len()]); - } + split_id: &SplitId, + segment_ord: SegmentOrdinal, + internal_repr: InternalSortValueRepr, + ) -> tantivy::Result { + let sort_1 = self + .first + .project_from_internal_sort_value(internal_repr.sort_1(), self.first_order)?; + let sort_2 = self + .second + .as_ref() + .map(|second| { + second.project_from_internal_sort_value(internal_repr.sort_2(), self.second_order) + }) + .transpose()? 
+ .unwrap_or_default(); + Ok(PartialHit { + sort_value: sort_1, + sort_value2: sort_2, + doc_id: internal_repr.doc_id(self.doc_id_sort_order()), + split_id: split_id.clone(), + segment_ord, + }) } + /// Returns the list of sort values for the given element /// /// See also [`SortingFieldExtractorComponent::extract_typed_sort_value_opt`] for more /// information. #[inline] - pub(crate) fn extract_typed_sort_value( + pub(crate) fn project_to_internal_sort_value( &self, doc_id: DocId, score: Score, - ) -> (Option, Option) { - let first = self.first.extract_typed_sort_value_opt(doc_id, score); + ) -> InternalSortValueRepr { + let first = self + .first + .project_to_internal_sort_value(doc_id, score, self.first_order); let second = self .second .as_ref() - .and_then(|second| second.extract_typed_sort_value_opt(doc_id, score)); - (first, second) + .map(|second| second.project_to_internal_sort_value(doc_id, score, self.second_order)) + .unwrap_or_else(InternalValueRepr::new_missing); + InternalSortValueRepr::new(first, second, doc_id, self.doc_id_sort_order()) } -} -impl TryFrom for SortFieldType { - type Error = tantivy::TantivyError; - - fn try_from(column_type: ColumnType) -> tantivy::Result { - match column_type { - ColumnType::U64 => Ok(SortFieldType::U64), - ColumnType::I64 => Ok(SortFieldType::I64), - ColumnType::F64 => Ok(SortFieldType::F64), - ColumnType::DateTime => Ok(SortFieldType::DateTime), - ColumnType::Bool => Ok(SortFieldType::Bool), - _ => Err(TantivyError::InvalidArgument(format!( - "Unsupported sort field type `{column_type:?}`." - ))), + pub(crate) fn project_to_internal_sort_value_block( + &mut self, + docs: &[DocId], + mut f: impl FnMut(InternalSortValueRepr), + ) { + let doc_id_order = self.doc_id_sort_order(); + let first_order = self.first_order; + let second_order = self.second_order; + + let n = docs.len(); + + let SortingFieldExtractorPair { + first, + second, + sort1_scratch, + sort2_scratch, + .. 
+ } = self; + + let first_extractor_opt = first.extractor_for_batch_if_worthwhile(); + let second_extractor_opt = second + .as_mut() + .and_then(|s| s.extractor_for_batch_if_worthwhile()); + match (first_extractor_opt, second_extractor_opt) { + (Some(fst_batch_extr), Some(sec_batch_extr)) => { + fst_batch_extr.fill_batch(docs, first_order, &mut sort1_scratch[..n]); + sec_batch_extr.fill_batch(docs, second_order, &mut sort2_scratch[..n]); + for i in 0..n { + f(InternalSortValueRepr::new( + sort1_scratch[i], + sort2_scratch[i], + docs[i], + doc_id_order, + )); + } + } + (Some(fst_batch_extr), None) => { + fst_batch_extr.fill_batch(docs, first_order, &mut sort1_scratch[..n]); + for i in 0..n { + let sort2 = second + .as_ref() + .map(|s| s.project_to_internal_sort_value(docs[i], 0.0, second_order)) + .unwrap_or_else(InternalValueRepr::new_missing); + f(InternalSortValueRepr::new( + sort1_scratch[i], + sort2, + docs[i], + doc_id_order, + )); + } + } + (None, Some(sec_batch_extr)) => { + sec_batch_extr.fill_batch(docs, second_order, &mut sort2_scratch[..n]); + for i in 0..n { + let sort1 = first.project_to_internal_sort_value(docs[i], 0.0, first_order); + f(InternalSortValueRepr::new( + sort1, + sort2_scratch[i], + docs[i], + doc_id_order, + )); + } + } + (None, None) => { + for &doc_id in docs { + let first = self + .first + .project_to_internal_sort_value(doc_id, 0.0, first_order); + let second = self + .second + .as_ref() + .map(|s| s.project_to_internal_sort_value(doc_id, 0.0, second_order)) + .unwrap_or_else(InternalValueRepr::new_missing); + f(InternalSortValueRepr::new( + first, + second, + doc_id, + doc_id_order, + )); + } + } } } } -/// Takes a user-defined sorting criteria and resolves it to a -/// segment specific `SortingFieldExtractorPair`. 
-fn get_score_extractor( - sort_by: &SortByPair, - segment_reader: &SegmentReader, -) -> tantivy::Result { - Ok(SortingFieldExtractorPair { - first: sort_by - .first - .to_sorting_field_extractor_component(segment_reader)?, - second: sort_by - .second - .as_ref() - .map(|first| first.to_sorting_field_extractor_component(segment_reader)) - .transpose()?, - }) -} - #[allow(clippy::large_enum_variant)] enum AggregationSegmentCollectors { FindTraceIdsSegmentCollector(Box), @@ -474,51 +712,50 @@ enum AggregationSegmentCollectors { /// Quickwit collector working at the scale of the segment. pub struct QuickwitSegmentCollector { - segment_top_k_collector: Option>, + segment_top_k_collector: Option, aggregation: Option, num_hits: u64, } -#[derive(Copy, Clone, Debug)] -pub(crate) struct SegmentPartialHit { - /// Normalized to u64, the typed value can be reconstructed with - /// SortingFieldExtractorComponent. - pub sort_value: Option, - pub sort_value2: Option, - pub doc_id: DocId, -} - -impl SegmentPartialHit { - pub fn into_partial_hit( - self, - split_id: SplitId, - segment_ord: SegmentOrdinal, - first: &SortingFieldExtractorComponent, - second: &Option, - ) -> PartialHit { - PartialHit { - sort_value: self - .sort_value - .map(|sort_value| first.convert_u64_ff_val_to_sort_value(sort_value)) - .map(|sort_value| SortByValue { - sort_value: Some(sort_value), - }), - sort_value2: self - .sort_value2 - .map(|sort_value| { - second - .as_ref() - .expect("Internal error: Got sort_value2, but no sort extractor") - .convert_u64_ff_val_to_sort_value(sort_value) - }) - .map(|sort_value| SortByValue { - sort_value: Some(sort_value), - }), - doc_id: self.doc_id, - split_id, - segment_ord, - } - } +/// Takes a user-defined sorting criteria and resolves it to a +/// segment specific `SortingFieldExtractorPair`. 
+#[allow(clippy::type_complexity)] +fn get_sorting_field_extractors( + sort_by: &SortByPair, + segment_reader: &SegmentReader, + split_id: &SplitId, + segment_ord: SegmentOrdinal, + search_after: &Option, +) -> tantivy::Result<( + SortingFieldExtractorPair, + Option>, +)> { + let extractor = SortingFieldExtractorPair { + first: sort_by + .first + .to_sorting_field_extractor_component(segment_reader)?, + second: sort_by + .second + .as_ref() + .map(|first| first.to_sorting_field_extractor_component(segment_reader)) + .transpose()?, + first_order: sort_by.first.sort_order(), + second_order: sort_by + .second + .as_ref() + .map(|second| second.sort_order()) + // value irrelevant? + .unwrap_or(SortOrder::Desc), + sort1_scratch: Box::new([InternalValueRepr::new_missing(); COLLECT_BLOCK_BUFFER_LEN]), + sort2_scratch: Box::new([InternalValueRepr::new_missing(); COLLECT_BLOCK_BUFFER_LEN]), + }; + let search_after_opt = search_after + .as_ref() + .map(|search_after| { + extractor.search_after_from_partial_hit(split_id, segment_ord, search_after) + }) + .transpose()?; + Ok((extractor, search_after_opt)) } impl SegmentCollector for QuickwitSegmentCollector { @@ -526,7 +763,6 @@ impl SegmentCollector for QuickwitSegmentCollector { #[inline] fn collect_block(&mut self, filtered_docs: &[DocId]) { - // Update results self.num_hits += filtered_docs.len() as u64; if let Some(segment_top_k_collector) = self.segment_top_k_collector.as_mut() { @@ -565,7 +801,7 @@ impl SegmentCollector for QuickwitSegmentCollector { fn harvest(self) -> Self::Fruit { let mut partial_hits: Vec = Vec::new(); if let Some(segment_top_k_collector) = self.segment_top_k_collector { - partial_hits = segment_top_k_collector.get_top_k(); + partial_hits = segment_top_k_collector.get_top_k()?; } let intermediate_aggregation_result = match self.aggregation { @@ -591,6 +827,7 @@ impl SegmentCollector for QuickwitSegmentCollector { num_attempted_splits: 1, num_successful_splits: 1, resource_stats: None, + 
splits_by_outcome: None, }) } } @@ -668,7 +905,7 @@ impl QuickwitIncrementalAggregations { let timestamp = last_elem.span_timestamp.into_timestamp_nanos(); return Some(PartialHit { sort_value: Some(SortByValue { - sort_value: Some(SortValue::I64(timestamp)), + sort_value: Some(SortValue::Datetime(timestamp)), }), sort_value2: None, split_id: SplitId::new(), @@ -792,22 +1029,70 @@ impl Collector for QuickwitCollector { ), None => None, }; - let score_extractor = get_score_extractor(&self.sort_by, segment_reader)?; - let (order1, order2) = self.sort_by.sort_orders(); let segment_top_k_collector = if leaf_max_hits == 0 { None } else { - let coll: Box = specialized_top_k_segment_collector( - self.split_id.clone(), - score_extractor, - leaf_max_hits, - segment_ord, - self.search_after.clone(), - order1, - order2, - ); - Some(coll) + let segment_top_k_collector = match self.sort_by { + SortByPair { + first: SortByComponent::DocId { .. }, + second: None, + } => { + let (extractor, search_after_opt) = get_sorting_field_extractors( + &self.sort_by, + segment_reader, + &self.split_id, + segment_ord, + &self.search_after, + )?; + QuickwitSegmentTopKCollector::new_with_doc_id_sort( + self.split_id.clone(), + segment_ord, + extractor, + leaf_max_hits, + search_after_opt, + ) + } + SortByPair { + first: _, + second: None | Some(SortByComponent::DocId { .. 
}), + } => { + let (extractor, search_after_opt) = get_sorting_field_extractors( + &self.sort_by, + segment_reader, + &self.split_id, + segment_ord, + &self.search_after, + )?; + QuickwitSegmentTopKCollector::new_with_one_dim_sort( + self.split_id.clone(), + segment_ord, + extractor, + leaf_max_hits, + search_after_opt, + ) + } + SortByPair { + first: _, + second: Some(_), + } => { + let (extractor, search_after_opt) = get_sorting_field_extractors( + &self.sort_by, + segment_reader, + &self.split_id, + segment_ord, + &self.search_after, + )?; + QuickwitSegmentTopKCollector::new_with_two_dim_sort( + self.split_id.clone(), + segment_ord, + extractor, + leaf_max_hits, + search_after_opt, + ) + } + }; + Some(segment_top_k_collector) }; Ok(QuickwitSegmentCollector { @@ -929,6 +1214,9 @@ fn merge_leaf_responses( .map(|leaf_response| &leaf_response.resource_stats); let merged_resource_stats = merge_resource_stats_it(resource_stats_it); + let merged_splits_by_outcome = + merge_splits_by_outcome_it(leaf_responses.iter().map(|r| r.splits_by_outcome)); + let merged_intermediate_aggregation_result: Option> = merge_intermediate_aggregation_result( aggregations_opt, @@ -971,6 +1259,7 @@ fn merge_leaf_responses( num_attempted_splits, num_successful_splits, resource_stats: merged_resource_stats, + splits_by_outcome: merged_splits_by_outcome, }) } @@ -1008,14 +1297,20 @@ pub(crate) fn sort_by_from_request(search_request: &SearchRequest) -> SortByPair let num_sort_fields = search_request.sort_fields.len(); if num_sort_fields == 0 { - SortByComponent::DocId { - order: SortOrder::Desc, + SortByPair { + first: SortByComponent::DocId { + order: SortOrder::Desc, + }, + second: None, } - .into() } else if num_sort_fields == 1 { let sort_field = &search_request.sort_fields[0]; let order = SortOrder::try_from(sort_field.sort_order).unwrap_or(SortOrder::Desc); - to_sort_by_component(&sort_field.field_name, order).into() + let first = to_sort_by_component(&sort_field.field_name, order); + 
SortByPair { + first, + second: None, + } } else if num_sort_fields == 2 { let sort_field1 = &search_request.sort_fields[0]; let order1 = SortOrder::try_from(sort_field1.sort_order).unwrap_or(SortOrder::Desc); @@ -1080,44 +1375,6 @@ pub(crate) fn make_merge_collector( }) } -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub struct SegmentPartialHitSortingKey { - sort_value: Option, - sort_value2: Option, - doc_id: DocId, - // TODO This should not be there. - sort_order: SortOrder, - // TODO This should not be there. - sort_order2: SortOrder, -} - -impl Ord for SegmentPartialHitSortingKey { - fn cmp(&self, other: &SegmentPartialHitSortingKey) -> Ordering { - debug_assert_eq!( - self.sort_order, other.sort_order, - "comparing two PartialHitSortingKey of different ordering" - ); - debug_assert_eq!( - self.sort_order2, other.sort_order2, - "comparing two PartialHitSortingKey of different ordering" - ); - let order = self - .sort_order - .compare_opt(&self.sort_value, &other.sort_value); - let order2 = self - .sort_order2 - .compare_opt(&self.sort_value2, &other.sort_value2); - let order_addr = self.sort_order.compare(&self.doc_id, &other.doc_id); - order.then(order2).then(order_addr) - } -} - -impl PartialOrd for SegmentPartialHitSortingKey { - fn partial_cmp(&self, other: &SegmentPartialHitSortingKey) -> Option { - Some(self.cmp(other)) - } -} - #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct PartialHitSortingKey { sort_value: Option, @@ -1169,8 +1426,8 @@ impl SortKeyMapper for HitSortingMapper { type Key = PartialHitSortingKey; fn get_sort_key(&self, partial_hit: &PartialHit) -> PartialHitSortingKey { PartialHitSortingKey { - sort_value: partial_hit.sort_value.and_then(|v| v.sort_value), - sort_value2: partial_hit.sort_value2.and_then(|v| v.sort_value), + sort_value: partial_hit.sort_value.clone().and_then(|v| v.sort_value), + sort_value2: partial_hit.sort_value2.clone().and_then(|v| v.sort_value), address: GlobalDocAddress::from_partial_hit(partial_hit), 
sort_order: self.order1, sort_order2: self.order2, @@ -1178,19 +1435,6 @@ impl SortKeyMapper for HitSortingMapper { } } -impl SortKeyMapper for HitSortingMapper { - type Key = SegmentPartialHitSortingKey; - fn get_sort_key(&self, partial_hit: &SegmentPartialHit) -> SegmentPartialHitSortingKey { - SegmentPartialHitSortingKey { - sort_value: partial_hit.sort_value, - sort_value2: partial_hit.sort_value2, - doc_id: partial_hit.doc_id, - sort_order: self.order1, - sort_order2: self.order2, - } - } -} - /// Incrementally merge segment results. #[derive(Clone)] pub(crate) struct IncrementalCollector { @@ -1202,6 +1446,7 @@ pub(crate) struct IncrementalCollector { num_successful_splits: u64, start_offset: usize, resource_stats: Option, + splits_by_outcome: Option, } impl IncrementalCollector { @@ -1223,6 +1468,7 @@ impl IncrementalCollector { num_attempted_splits: 0, num_successful_splits: 0, resource_stats: None, + splits_by_outcome: None, } } @@ -1236,9 +1482,11 @@ impl IncrementalCollector { intermediate_aggregation_result, num_successful_splits, resource_stats, + splits_by_outcome, } = leaf_response; merge_resource_stats(&resource_stats, &mut self.resource_stats); + merge_splits_by_outcome(splits_by_outcome, &mut self.splits_by_outcome); self.num_hits += num_hits; self.top_k_hits.add_entries(partial_hits.into_iter()); @@ -1290,6 +1538,7 @@ impl IncrementalCollector { num_successful_splits: self.num_successful_splits, intermediate_aggregation_result, resource_stats: self.resource_stats, + splits_by_outcome: self.splits_by_outcome, }) } } @@ -1300,16 +1549,17 @@ mod tests { use quickwit_proto::search::{ LeafSearchResponse, PartialHit, ResourceStats, SearchRequest, SortByValue, SortField, - SortOrder, SortValue, SplitSearchError, + SortOrder, SortValue, SplitSearchError, SplitsByOutcome, }; use tantivy::TantivyDocument; use tantivy::aggregation::agg_req::Aggregations; use tantivy::aggregation::intermediate_agg_result::IntermediateAggregationResults; use 
tantivy::collector::Collector; - use super::{IncrementalCollector, make_merge_collector}; - use crate::QuickwitAggregations; - use crate::collector::{merge_intermediate_aggregation_result, top_k_partial_hits}; + use super::{ + IncrementalCollector, QuickwitAggregations, make_merge_collector, + merge_intermediate_aggregation_result, top_k_partial_hits, + }; #[test] fn test_merge_partial_hits_no_tie() { @@ -1394,66 +1644,52 @@ mod tests { ] } - fn make_request(max_hits: u64, sort_fields: &str) -> SearchRequest { - SearchRequest { - max_hits, - sort_fields: sort_fields - .split(',') - .filter(|field| !field.is_empty()) - .map(|field| { - if let Some(field) = field.strip_prefix('-') { - SortField { - field_name: field.to_string(), - sort_order: SortOrder::Asc.into(), - sort_datetime_format: None, - } - } else { - SortField { - field_name: field.to_string(), - sort_order: SortOrder::Desc.into(), - sort_datetime_format: None, - } + /// Create a list of SortField from a comma-separated list of field names. + /// Field names can be prefixed with - to indicate ascending order. + fn make_sort_fields(sort_fields: &str) -> Vec { + sort_fields + .split(',') + .filter(|field| !field.is_empty()) + .map(|field| { + if let Some(field) = field.strip_prefix('-') { + SortField { + field_name: field.to_string(), + sort_order: SortOrder::Asc.into(), + sort_datetime_format: None, } - }) - .collect(), - ..SearchRequest::default() - } + } else { + SortField { + field_name: field.to_string(), + sort_order: SortOrder::Desc.into(), + sort_datetime_format: None, + } + } + }) + .collect() } - fn make_index() -> tantivy::Index { + /// Build a tantivy index from a JSON dataset. Each element must be a JSON + /// object whose keys match field names in the pre-determined schema. 
+ fn make_index(dataset: &[serde_json::Value]) -> tantivy::Index { use tantivy::Index; use tantivy::indexer::UserOperation; - use tantivy::schema::{NumericOptions, Schema}; - - let dataset = sort_dataset(); + use tantivy::schema::{FAST, NumericOptions, Schema}; let mut schema_builder = Schema::builder(); let opts = NumericOptions::default().set_fast(); - - schema_builder.add_u64_field("sort1", opts.clone()); - schema_builder.add_u64_field("sort2", opts); + schema_builder.add_u64_field("sort_u64_1", opts.clone()); + schema_builder.add_u64_field("sort_u64_2", opts); + schema_builder.add_json_field("kv", FAST); let schema = schema_builder.build(); - let field1 = schema.get_field("sort1").unwrap(); - let field2 = schema.get_field("sort2").unwrap(); - - let index = Index::create_in_ram(schema); + let index = Index::create_in_ram(schema.clone()); let mut index_writer = index.writer(50_000_000).unwrap(); index_writer .run( dataset - .into_iter() - .map(|(val1, val2)| { - let mut doc = TantivyDocument::new(); - if let Some(val1) = val1 { - doc.add_u64(field1, val1); - } - if let Some(val2) = val2 { - doc.add_u64(field2, val2); - } - doc - }) + .iter() + .map(|obj| TantivyDocument::parse_json(&schema, &obj.to_string()).unwrap()) .map(UserOperation::Add), ) .unwrap(); @@ -1463,8 +1699,22 @@ mod tests { } #[test] - fn test_single_split_sorting() { - let index = make_index(); + fn test_single_split_sorting_single_type() { + let raw_dataset = sort_dataset(); + let json_dataset: Vec = raw_dataset + .iter() + .map(|(v1, v2)| { + let mut obj = serde_json::Map::new(); + if let Some(v) = v1 { + obj.insert("sort_u64_1".to_string(), (*v).into()); + } + if let Some(v) = v2 { + obj.insert("sort_u64_2".to_string(), (*v).into()); + } + serde_json::Value::Object(obj) + }) + .collect(); + let index = make_index(&json_dataset); let reader = index.reader().unwrap(); let searcher = reader.searcher(); @@ -1472,7 +1722,7 @@ mod tests { // tuple of DocId and sort value type Doc = (usize, (Option, 
Option)); - let mut dataset: Vec = sort_dataset().into_iter().enumerate().collect(); + let mut dataset: Vec = raw_dataset.into_iter().enumerate().collect(); let reverse_int = |val: &Option| val.as_ref().map(|val| u64::MAX - val); let cmp_doc_id_desc = |a: &Doc, b: &Doc| b.0.cmp(&a.0); @@ -1532,25 +1782,27 @@ mod tests { assert_eq!(data, data_copy); } + // The implicit doc_id tiebreaker follows the order of the first sort field, unlike + // Elasticsearch's _shard_doc, which is always ascending regardless of sort direction. #[allow(clippy::type_complexity)] let sort_orders: Vec<(_, Box Ordering>)> = vec![ ("", Box::new(cmp_doc_id_desc)), ( - "sort1", + "sort_u64_1", Box::new(|a, b| cmp_1_desc(a, b).then(cmp_doc_id_desc(a, b))), ), ( - "-sort1", + "-sort_u64_1", Box::new(|a, b| cmp_1_asc(a, b).then(cmp_doc_id_asc(a, b))), ), ( - "sort1,sort2", + "sort_u64_1,sort_u64_2", Box::new(|a, b| { cmp_1_desc(a, b).then(cmp_2_desc(a, b).then(cmp_doc_id_desc(a, b))) }), ), ( - "-sort1,sort2", + "-sort_u64_1,sort_u64_2", Box::new(|a, b| { cmp_1_asc(a, b) .then(cmp_2_desc(a, b)) @@ -1558,11 +1810,11 @@ mod tests { }), ), ( - "sort1,-sort2", + "sort_u64_1,-sort_u64_2", Box::new(|a, b| cmp_1_desc(a, b).then(cmp_2_asc(a, b).then(cmp_doc_id_desc(a, b)))), ), ( - "-sort1,-sort2", + "-sort_u64_1,-sort_u64_2", Box::new(|a, b| { cmp_1_asc(a, b) .then(cmp_2_asc(a, b)) @@ -1577,7 +1829,11 @@ mod tests { for slice_len in 0..dataset.len() { let collector = super::make_collector_for_split( "fake_split_id".to_string(), - &make_request(slice_len as u64, sort_str), + &SearchRequest { + max_hits: slice_len as u64, + sort_fields: make_sort_fields(sort_str), + ..SearchRequest::default() + }, Default::default(), ) .unwrap(); @@ -1604,8 +1860,8 @@ mod tests { format!( "{} {:?} {:?}", hit.doc_id, - hit.sort_value.and_then(|el| el.sort_value).clone(), - hit.sort_value2.and_then(|el| el.sort_value).clone() + hit.sort_value.clone().and_then(|el| el.sort_value), + hit.sort_value2.clone().and_then(|el| 
el.sort_value) ) }) .collect::>(); @@ -1619,8 +1875,22 @@ mod tests { } #[test] - fn test_search_after() { - let index = make_index(); + fn test_search_after_single_type() { + let raw_dataset = sort_dataset(); + let json_dataset: Vec = raw_dataset + .iter() + .map(|(v1, v2)| { + let mut obj = serde_json::Map::new(); + if let Some(v) = v1 { + obj.insert("sort_u64_1".to_string(), (*v).into()); + } + if let Some(v) = v2 { + obj.insert("sort_u64_2".to_string(), (*v).into()); + } + serde_json::Value::Object(obj) + }) + .collect(); + let index = make_index(&json_dataset); let reader = index.reader().unwrap(); let searcher = reader.searcher(); @@ -1628,7 +1898,7 @@ mod tests { // tuple of DocId and sort value type Doc = (usize, (Option, Option)); - let mut dataset: Vec = sort_dataset().into_iter().enumerate().collect(); + let mut dataset: Vec = raw_dataset.into_iter().enumerate().collect(); let reverse_int = |val: &Option| val.as_ref().map(|val| u64::MAX - val); let cmp_doc_id_desc = |a: &Doc, b: &Doc| b.0.cmp(&a.0); @@ -1658,12 +1928,12 @@ mod tests { max_hits: 1000, sort_fields: vec![ SortField { - field_name: "sort1".to_string(), + field_name: "sort_u64_1".to_string(), sort_order: SortOrder::Desc.into(), sort_datetime_format: None, }, SortField { - field_name: "sort2".to_string(), + field_name: "sort_u64_2".to_string(), sort_order: SortOrder::Asc.into(), sort_datetime_format: None, }, @@ -1752,7 +2022,256 @@ mod tests { } } - fn merge_collector_equal_results( + fn assert_search_after_results( + searcher: &tantivy::Searcher, + index_len: usize, + sort_str: &str, + search_after: PartialHit, + expected_doc_ids: impl AsRef<[u32]>, + label: &str, + ) { + let expected_doc_ids = expected_doc_ids.as_ref(); + let request = SearchRequest { + max_hits: 1000, + sort_fields: make_sort_fields(sort_str), + search_after: Some(search_after.clone()), + ..SearchRequest::default() + }; + let collector = super::make_collector_for_split( + "fake_split_id".to_string(), + &request, + 
Default::default(), + ) + .unwrap(); + let Ok(res) = searcher.search(&tantivy::query::AllQuery, &collector) else { + panic!("search failed for {label} with search_after {search_after:?}"); + }; + // num_hits counts every doc regardless of search_after. + assert_eq!( + res.num_hits, index_len as u64, + "num_hits mismatch for {label}" + ); + assert_eq!( + res.partial_hits.len(), + expected_doc_ids.len(), + "result count mismatch for {label}" + ); + for (expected_doc_id, got) in expected_doc_ids.iter().zip(res.partial_hits.iter()) { + assert_eq!( + *expected_doc_id, got.doc_id, + "doc order mismatch for {label} after {search_after:?}" + ); + } + } + + #[test] + fn test_single_split_search_after_multitype() { + let dataset: Vec = vec![ + serde_json::json!({"kv": {"sort1": false, "sort2": "b"}}), // doc 0 + serde_json::json!({"kv": {"sort1": true, "sort2": "a"}}), // doc 1 + serde_json::json!({"kv": {"sort1": "apple", "sort2": "a"}}), // doc 2 + serde_json::json!({"kv": {"sort1": "banana", "sort2": "b"}}), // doc 3 + serde_json::json!({"kv": {"sort1": 1, "sort2": "b"}}), // doc 4 + serde_json::json!({"kv": {"sort1": 5, "sort2": "a"}}), // doc 5 + serde_json::json!({}), // doc 6: missing + ]; + + let index = make_index(&dataset); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + + for (sort_str, expected_order) in [ + // Desc: booleans (true first) > strings (lex desc) > numbers (largest first) > missing + ("kv.sort1", &[1, 0, 3, 2, 5, 4, 6]), + // Asc: numbers (smallest first) > strings (lex asc) > booleans (false first) > + // missing + ("-kv.sort1", &[4, 5, 2, 3, 0, 1, 6]), + ("", &[6, 5, 4, 3, 2, 1, 0]), + ("_doc", &[6, 5, 4, 3, 2, 1, 0]), + ("-_doc", &[0, 1, 2, 3, 4, 5, 6]), + // sort2 with "b" first then "a" + ("kv.sort2,kv.sort1", &[0, 3, 4, 1, 2, 5, 6]), + // sort2 with "a" first then "b" + ("-kv.sort2,kv.sort1", &[1, 2, 5, 0, 3, 4, 6]), + ] { + // Step 1: full search to collect PartialHits carrying the correct typed SortValues. 
+ let collector = super::make_collector_for_split( + "fake_split_id".to_string(), + &SearchRequest { + max_hits: 1000, + sort_fields: make_sort_fields(sort_str), + ..Default::default() + }, + Default::default(), + ) + .unwrap(); + let full_res = searcher + .search(&tantivy::query::AllQuery, &collector) + .unwrap(); + assert_eq!(full_res.partial_hits.len(), dataset.len()); + for (expected_doc_id, got) in expected_order.iter().zip(full_res.partial_hits.iter()) { + assert_eq!( + *expected_doc_id, got.doc_id, + "sort order mismatch for \"{sort_str}\"" + ); + } + + // Step 2: use each PartialHit as a search_after fence and verify the returned tail. + for (i, search_after) in full_res.partial_hits.iter().enumerate() { + assert_search_after_results( + &searcher, + dataset.len(), + sort_str, + search_after.clone(), + &expected_order[i + 1..], + &format!("\"{sort_str}\" search_after position {i}"), + ); + } + } + } + + #[test] + fn test_single_split_search_after_exogeneous_type() { + let dataset: Vec = vec![ + serde_json::json!({"kv": {"mixed": false, "integer": 1}}), // doc 0 + serde_json::json!({"kv": {"mixed": true, "integer": 4}}), // doc 1 + serde_json::json!({"kv": {"mixed": "banana", "integer": 3}}), // doc 2 + serde_json::json!({"kv": {"mixed": "plum", "integer": 4}}), // doc 3 + ]; + + let index = make_index(&dataset); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + let str_sort_val = |s: &str| SortValue::Str(s.to_string()); + for (sort_str, search_after_value, expected_order) in [ + // Desc: booleans (true first) > strings (lex desc) > numbers (search after) > missing + ("kv.mixed", SortValue::I64(-10), vec![]), + // Asc: numbers (search after) > strings (lex asc) > booleans (false first) > missing + ("-kv.mixed", SortValue::I64(-10), vec![2, 3, 0, 1]), + // project f64 to i64 + ("kv.integer", SortValue::F64(3.5), vec![2, 0]), + ("-kv.integer", SortValue::F64(3.5), vec![1, 3]), + // str not in columns dict, check all possible relative 
position + ("kv.mixed", str_sort_val("c"), vec![2]), + ("-kv.mixed", str_sort_val("c"), vec![3, 0, 1]), + ("kv.mixed", str_sort_val("a"), vec![]), + ("-kv.mixed", str_sort_val("a"), vec![2, 3, 0, 1]), + ("kv.mixed", str_sort_val("z"), vec![3, 2]), + ("-kv.mixed", str_sort_val("z"), vec![0, 1]), + ] { + assert_search_after_results( + &searcher, + dataset.len(), + sort_str, + PartialHit { + sort_value: Some(search_after_value.clone().into()), + sort_value2: None, + ..Default::default() + }, + expected_order, + &format!("\"{sort_str}\""), + ); + } + } + + #[test] + fn test_single_split_search_after_exogeneous_type_with_null() { + let dataset: Vec = vec![ + serde_json::json!({"kv": {"sort": false}}), // doc 0 + serde_json::json!({"kv": {"sort": true}}), // doc 1 + serde_json::json!({"kv": {"sort": "apple"}}), // doc 2 + serde_json::json!({"kv": {"sort": "banana"}}), // doc 3 + serde_json::json!({}), // doc 4: missing + ]; + + let index = make_index(&dataset); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + + let search_after_value = SortValue::I64(-10); + + // Desc: booleans (true first) > strings (lex desc) > numbers (search after) > missing + let desc_order: &[u32] = &[4]; + // Asc: numbers (search after) > strings (lex asc) > booleans (false first) > missing + let asc_order: &[u32] = &[2, 3, 0, 1, 4]; + + for (sort_str, expected_order) in [("kv.sort", desc_order), ("-kv.sort", asc_order)] { + assert_search_after_results( + &searcher, + dataset.len(), + sort_str, + PartialHit { + sort_value: Some(search_after_value.clone().into()), + sort_value2: None, + ..Default::default() + }, + expected_order, + &format!("\"{sort_str}\""), + ); + } + } + + #[test] + fn test_single_split_default_sort() { + let dataset: Vec = vec![ + serde_json::json!({"sort_u64_1": 15}), // doc 0 + serde_json::json!({"sort_u64_1": 13}), // doc 1 + serde_json::json!({"sort_u64_1": 10}), // doc 2 + serde_json::json!({"sort_u64_1": 12}), // doc 3 + 
serde_json::json!({"sort_u64_1": 9}), // doc 4 + ]; + + let index = make_index(&dataset); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + + let request = SearchRequest { + max_hits: 3, + sort_fields: vec![], + search_after: None, + ..SearchRequest::default() + }; + let collector = super::make_collector_for_split( + "fake_split_id".to_string(), + &request, + Default::default(), + ) + .unwrap(); + let res = searcher + .search(&tantivy::query::AllQuery, &collector) + .unwrap(); + // assert the exact hits where in other tests we mostly focus on the order + assert_eq!( + res.partial_hits, + vec![ + PartialHit { + split_id: "fake_split_id".to_string(), + segment_ord: 0, + doc_id: 4, + sort_value: None, + sort_value2: None, + }, + PartialHit { + split_id: "fake_split_id".to_string(), + segment_ord: 0, + doc_id: 3, + sort_value: None, + sort_value2: None, + }, + PartialHit { + split_id: "fake_split_id".to_string(), + segment_ord: 0, + doc_id: 2, + sort_value: None, + sort_value2: None, + }, + ] + ); + } + + /// Merge intermediate results, asserting that both the regular and + /// incremental merge produce the same output. 
+ fn merge_on_both_collectors( request: &SearchRequest, results: Vec, ) -> LeafSearchResponse { @@ -1774,7 +2293,7 @@ mod tests { #[test] fn test_merge_collectors() { - let result = merge_collector_equal_results( + let result = merge_on_both_collectors( &SearchRequest { start_offset: 0, max_hits: 2, @@ -1800,6 +2319,7 @@ mod tests { num_successful_splits: 3, intermediate_aggregation_result: None, resource_stats: None, + splits_by_outcome: None, }], ); @@ -1819,10 +2339,11 @@ mod tests { num_successful_splits: 3, intermediate_aggregation_result: None, resource_stats: None, + splits_by_outcome: None, } ); - let result = merge_collector_equal_results( + let result = merge_on_both_collectors( &SearchRequest { start_offset: 0, max_hits: 2, @@ -1858,6 +2379,7 @@ mod tests { num_successful_splits: 3, intermediate_aggregation_result: None, resource_stats: None, + splits_by_outcome: None, }, LeafSearchResponse { num_hits: 10, @@ -1877,6 +2399,7 @@ mod tests { num_successful_splits: 1, intermediate_aggregation_result: None, resource_stats: None, + splits_by_outcome: None, }, ], ); @@ -1910,11 +2433,12 @@ mod tests { num_successful_splits: 4, intermediate_aggregation_result: None, resource_stats: None, + splits_by_outcome: None, } ); // same request, but we reverse sort order - let result = merge_collector_equal_results( + let result = merge_on_both_collectors( &SearchRequest { start_offset: 0, max_hits: 2, @@ -1953,6 +2477,11 @@ mod tests { cpu_microsecs: 100, ..Default::default() }), + splits_by_outcome: Some(SplitsByOutcome { + processed: 3, + cache_hit: 1, + ..Default::default() + }), }, LeafSearchResponse { num_hits: 10, @@ -1975,6 +2504,11 @@ mod tests { cpu_microsecs: 50, ..Default::default() }), + splits_by_outcome: Some(SplitsByOutcome { + processed: 1, + pruned_before_warmup: 1, + ..Default::default() + }), }, ], ); @@ -2011,6 +2545,12 @@ mod tests { cpu_microsecs: 150, ..Default::default() }), + splits_by_outcome: Some(SplitsByOutcome { + processed: 4, + cache_hit: 
1, + pruned_before_warmup: 1, + ..Default::default() + }), } ); // TODO would be nice to test aggregation too. diff --git a/quickwit/quickwit-search/src/error.rs b/quickwit/quickwit-search/src/error.rs index 21141a3035a..71d67ca4bc8 100644 --- a/quickwit/quickwit-search/src/error.rs +++ b/quickwit/quickwit-search/src/error.rs @@ -46,6 +46,8 @@ pub enum SearchError { Timeout(String), #[error("too many requests")] TooManyRequests, + #[error("too many splits: {0}")] + TooManySplits(String), #[error("service unavailable: {0}")] Unavailable(String), } @@ -87,6 +89,7 @@ impl ServiceError for SearchError { } Self::Timeout(_) => ServiceErrorCode::Timeout, Self::TooManyRequests => ServiceErrorCode::TooManyRequests, + Self::TooManySplits(_) => ServiceErrorCode::BadRequest, Self::Unavailable(_) => ServiceErrorCode::Unavailable, } } diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index 3d9e5d00cce..feb2aae3613 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -52,6 +52,7 @@ use crate::metrics::{SplitSearchOutcomeCounters, queue_label}; use crate::root::is_metadata_count_request_with_ast; use crate::search_permit_provider::{SearchPermit, compute_initial_memory_allocation}; use crate::service::{SearcherContext, deserialize_doc_mapper}; +use crate::soft_delete_query::SoftDeleteQuery; use crate::{QuickwitAggregations, SearchError}; async fn get_split_footer_from_cache_or_fetch( @@ -218,16 +219,10 @@ pub(crate) async fn warmup(searcher: &Searcher, warmup_info: &WarmupInfo) -> any let warm_up_term_ranges_future = warm_up_term_ranges(searcher, &warmup_info.term_ranges_grouped_by_field) .instrument(debug_span!("warm_up_term_ranges")); - let warm_up_term_dict_future = - warm_up_term_dict_fields(searcher, &warmup_info.term_dict_fields) - .instrument(debug_span!("warm_up_term_dicts")); let warm_up_fastfields_future = warm_up_fastfields(searcher, &warmup_info.fast_fields) 
.instrument(debug_span!("warm_up_fastfields")); let warm_up_fieldnorms_future = warm_up_fieldnorms(searcher, warmup_info.field_norms) .instrument(debug_span!("warm_up_fieldnorms")); - // TODO merge warm_up_postings into warm_up_term_dict_fields - let warm_up_postings_future = warm_up_postings(searcher, &warmup_info.term_dict_fields) - .instrument(debug_span!("warm_up_postings")); let warm_up_automatons_future = warm_up_automatons(searcher, &warmup_info.automatons_grouped_by_field) .instrument(debug_span!("warm_up_automatons")); @@ -236,45 +231,13 @@ pub(crate) async fn warmup(searcher: &Searcher, warmup_info: &WarmupInfo) -> any warm_up_terms_future, warm_up_term_ranges_future, warm_up_fastfields_future, - warm_up_term_dict_future, warm_up_fieldnorms_future, - warm_up_postings_future, warm_up_automatons_future, )?; Ok(()) } -async fn warm_up_term_dict_fields( - searcher: &Searcher, - term_dict_fields: &HashSet, -) -> anyhow::Result<()> { - let mut warm_up_futures = Vec::new(); - for field in term_dict_fields { - for segment_reader in searcher.segment_readers() { - let inverted_index = segment_reader.inverted_index(*field)?.clone(); - warm_up_futures.push(async move { - let dict = inverted_index.terms(); - dict.warm_up_dictionary().await - }); - } - } - try_join_all(warm_up_futures).await?; - Ok(()) -} - -async fn warm_up_postings(searcher: &Searcher, fields: &HashSet) -> anyhow::Result<()> { - let mut warm_up_futures = Vec::new(); - for field in fields { - for segment_reader in searcher.segment_readers() { - let inverted_index = segment_reader.inverted_index(*field)?.clone(); - warm_up_futures.push(async move { inverted_index.warm_postings_full(false).await }); - } - } - try_join_all(warm_up_futures).await?; - Ok(()) -} - async fn warm_up_fastfield( fast_field_reader: &FastFieldReaders, fast_field: &FastFieldWarmupInfo, @@ -389,6 +352,10 @@ async fn warm_up_automatons( .await .context("failed to load automaton") } + Automaton::TermSet(automaton) => inv_idx_clone + 
.warm_postings_automaton(automaton.clone(), cpu_intensive_executor) + .await + .context("failed to warm term set"), } }); } @@ -425,6 +392,7 @@ fn get_leaf_resp_from_count(count: u64) -> LeafSearchResponse { num_successful_splits: 1, intermediate_aggregation_result: None, resource_stats: None, + splits_by_outcome: None, } } @@ -474,8 +442,11 @@ async fn leaf_search_single_split( // split can't have better results. // if is_metadata_count_request_with_ast(&query_ast, &search_request) { - leaf_search_state_guard.set_state(SplitSearchState::PrunedBeforeWarmup); - return Ok(Some(get_leaf_resp_from_count(split.num_docs))); + leaf_search_state_guard.set_state(SplitSearchState::ProcessedFromMetadata); + let effective_num_docs = split + .num_docs + .saturating_sub(split.soft_deleted_doc_ids.len() as u64); + return Ok(Some(get_leaf_resp_from_count(effective_num_docs))); } let split_id = split.split_id.to_string(); @@ -526,6 +497,14 @@ async fn leaf_search_single_split( false, predicate_cache, )?; + let query: Box = if split.soft_deleted_doc_ids.is_empty() { + query + } else { + Box::new(SoftDeleteQuery::new( + query, + split.soft_deleted_doc_ids.clone(), + )) + }; let collector_warmup_info = collector.warmup_info(); warmup_info.merge(collector_warmup_info); @@ -548,7 +527,6 @@ async fn leaf_search_single_split( .leaf_search_single_split_warmup_num_bytes .observe(warmup_size.as_u64() as f64); search_permit.update_memory_usage(warmup_size); - search_permit.free_warmup_slot(); let split_num_docs = split.num_docs; @@ -576,7 +554,10 @@ async fn leaf_search_single_split( collector.update_search_param(&simplified_search_request); let mut leaf_search_response: LeafSearchResponse = if is_metadata_count_request_with_ast(&query_ast, &simplified_search_request) { - get_leaf_resp_from_count(searcher.num_docs()) + let num_docs = searcher + .num_docs() + .saturating_sub(split_clone.soft_deleted_doc_ids.len() as u64); + get_leaf_resp_from_count(num_docs) } else if collector.is_count_only() 
{ let count = query.count(&searcher)? as u64; get_leaf_resp_from_count(count) @@ -591,7 +572,10 @@ async fn leaf_search_single_split( cpu_thread_pool_wait_microsecs: cpu_thread_pool_wait_microsecs.as_micros() as u64, }); - leaf_search_state_guard.set_state(SplitSearchState::Success); + // splits by outcome are estimated at the (doc mapping) leaf + // response level to account for all early returns, so it is + // left None here + leaf_search_state_guard.set_state(SplitSearchState::Processed); Result::<_, TantivyError>::Ok(Some(( simplified_search_request, leaf_search_response, @@ -809,28 +793,24 @@ fn remove_redundant_timestamp_range( } } (Bound::Unbounded, Some(_)) => Bound::Unbounded, - (timestamp, None) => timestamp, + (query_bound, None) => query_bound, }; - let final_end_timestamp = match ( - visitor.end_timestamp, - split.timestamp_end.map(DateTime::from_timestamp_secs), - ) { - (Bound::Included(query_ts), Some(split_ts)) => { - if query_ts < split_ts { - Bound::Included(query_ts) - } else { - Bound::Unbounded - } - } - (Bound::Excluded(query_ts), Some(split_ts)) => { - if query_ts <= split_ts { - Bound::Excluded(query_ts) + let final_end_timestamp = match (visitor.end_timestamp, split.timestamp_end) { + ( + query_bound @ (Bound::Included(query_ts) | Bound::Excluded(query_ts)), + Some(split_end), + ) => { + // split.timestamp_end is the truncation of the highest timestamp in the split, + // so the actual known bound for the split is split.timestamp_end+1 (exclusive) + let split_end_exclusive = DateTime::from_timestamp_secs(split_end + 1); + if query_ts < split_end_exclusive { + query_bound } else { Bound::Unbounded } } (Bound::Unbounded, Some(_)) => Bound::Unbounded, - (timestamp, None) => timestamp, + (query_bound, None) => query_bound, }; if final_start_timestamp != Bound::Unbounded || final_end_timestamp != Bound::Unbounded { let range = RangeQuery { @@ -1459,8 +1439,6 @@ pub async fn single_doc_mapping_leaf_search( } } - 
info!(split_outcome_counters=%leaf_search_context.split_outcome_counters, "leaf split search finished"); - // we can't use unwrap_or_clone because mutexes aren't Clone let mut incremental_merge_collector = match Arc::try_unwrap(incremental_merge_collector) { Ok(filter_merger) => filter_merger.into_inner().unwrap(), @@ -1482,19 +1460,26 @@ pub async fn single_doc_mapping_leaf_search( .await .context("failed to merge split search responses"); - Ok(leaf_search_response_reresult??) + let mut leaf_response = leaf_search_response_reresult??; + leaf_response.splits_by_outcome = Some( + leaf_search_context + .split_outcome_counters + .split_by_outcome(), + ); + Ok(leaf_response) } #[derive(Copy, Clone)] enum SplitSearchState { Start, CacheHit, + ProcessedFromMetadata, PrunedBeforeWarmup, WarmUp, PrunedAfterWarmup, CpuQueue, Cpu, - Success, + Processed, } impl SplitSearchState { @@ -1502,12 +1487,13 @@ impl SplitSearchState { match self { SplitSearchState::Start => counters.cancel_before_warmup.inc(), SplitSearchState::CacheHit => counters.cache_hit.inc(), + SplitSearchState::ProcessedFromMetadata => counters.processed_from_metadata.inc(), SplitSearchState::PrunedBeforeWarmup => counters.pruned_before_warmup.inc(), SplitSearchState::WarmUp => counters.cancel_warmup.inc(), SplitSearchState::PrunedAfterWarmup => counters.pruned_after_warmup.inc(), SplitSearchState::CpuQueue => counters.cancel_cpu_queue.inc(), SplitSearchState::Cpu => counters.cancel_cpu.inc(), - SplitSearchState::Success => counters.success.inc(), + SplitSearchState::Processed => counters.processed.inc(), } } } @@ -1688,6 +1674,11 @@ mod tests { }; remove_timestamp_test_case(&search_request, &split, None); + let expected_upper_inclusive = RangeQuery { + field: timestamp_field.to_string(), + lower_bound: Bound::Unbounded, + upper_bound: Bound::Included((time3 * S_TO_NS).into()), + }; let search_request = SearchRequest { query_ast: serde_json::to_string(&QueryAst::Range(RangeQuery { field: 
timestamp_field.to_string(), @@ -1697,7 +1688,7 @@ mod tests { .unwrap(), ..SearchRequest::default() }; - remove_timestamp_test_case(&search_request, &split, None); + remove_timestamp_test_case(&search_request, &split, Some(expected_upper_inclusive)); let search_request = SearchRequest { query_ast: serde_json::to_string(&QueryAst::MatchAll).unwrap(), @@ -1740,10 +1731,10 @@ mod tests { Some(expected_upper_exclusive.clone()), ); - let expected_lower_exclusive = RangeQuery { + let expected_lower_excl_upper_incl = RangeQuery { field: timestamp_field.to_string(), lower_bound: Bound::Excluded((time2 * S_TO_NS).into()), - upper_bound: Bound::Unbounded, + upper_bound: Bound::Included((time3 * S_TO_NS).into()), }; let search_request = SearchRequest { query_ast: serde_json::to_string(&QueryAst::Range(RangeQuery { @@ -1757,10 +1748,22 @@ mod tests { remove_timestamp_test_case( &search_request, &split, - Some(expected_lower_exclusive.clone()), + Some(expected_lower_excl_upper_incl.clone()), ); + } + + #[test] + fn test_remove_timestamp_range_multiple_bounds() { + // When bounds are defined both in the AST and in the search request, + // make sure we take the most restrictive ones. 
+ const S_TO_NS: i64 = 1_000_000_000; + let time1 = 1700001000; + let time2 = 1700002000; + let time3 = 1700003000; + let time4 = 1700004000; + + let timestamp_field = "timestamp".to_string(); - // we take the most restrictive bounds let split = SplitIdAndFooterOffsets { timestamp_start: Some(time1), timestamp_end: Some(time4), @@ -1803,10 +1806,10 @@ mod tests { }; remove_timestamp_test_case(&search_request, &split, Some(expected_upper_2_inc)); - let expected_lower_3 = RangeQuery { + let expected_lower_3_upper_4 = RangeQuery { field: timestamp_field.to_string(), lower_bound: Bound::Included((time3 * S_TO_NS).into()), - upper_bound: Bound::Unbounded, + upper_bound: Bound::Included((time4 * S_TO_NS).into()), }; let search_request = SearchRequest { @@ -1820,7 +1823,11 @@ mod tests { end_timestamp: Some(time4 + 1), ..SearchRequest::default() }; - remove_timestamp_test_case(&search_request, &split, Some(expected_lower_3.clone())); + remove_timestamp_test_case( + &search_request, + &split, + Some(expected_lower_3_upper_4.clone()), + ); let search_request = SearchRequest { query_ast: serde_json::to_string(&QueryAst::Range(RangeQuery { @@ -1833,7 +1840,7 @@ mod tests { end_timestamp: Some(time4 + 1), ..SearchRequest::default() }; - remove_timestamp_test_case(&search_request, &split, Some(expected_lower_3)); + remove_timestamp_test_case(&search_request, &split, Some(expected_lower_3_upper_4)); let mut search_request = SearchRequest { query_ast: serde_json::to_string(&QueryAst::MatchAll).unwrap(), diff --git a/quickwit/quickwit-search/src/leaf_cache.rs b/quickwit/quickwit-search/src/leaf_cache.rs index abc756763ef..3da09bcf129 100644 --- a/quickwit/quickwit-search/src/leaf_cache.rs +++ b/quickwit/quickwit-search/src/leaf_cache.rs @@ -85,6 +85,9 @@ struct CacheKey { /// The effective time range of the request, that is, the intersection of the timerange /// requested, and the timerange covered by the split. 
merged_time_range: HalfOpenRange, + /// The number of soft deleted documents in the split. + /// This assumes that the list of deleted docs is append only for a split. + soft_deleted_docs_len: usize, } impl CacheKey { @@ -106,6 +109,7 @@ impl CacheKey { split_id: split_info.split_id, request: search_request, merged_time_range, + soft_deleted_docs_len: split_info.soft_deleted_doc_ids.len(), } } } @@ -253,6 +257,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let split_2 = SplitIdAndFooterOffsets { @@ -262,6 +267,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let query_1 = SearchRequest { @@ -298,6 +304,7 @@ mod tests { split_id: "split_1".to_string(), }], resource_stats: None, + splits_by_outcome: None, }; assert!(cache.get(split_1.clone(), query_1.clone()).is_none()); @@ -319,6 +326,7 @@ mod tests { timestamp_start: Some(100), timestamp_end: Some(199), num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let split_2 = SplitIdAndFooterOffsets { split_id: "split_2".to_string(), @@ -327,6 +335,7 @@ mod tests { timestamp_start: Some(150), timestamp_end: Some(249), num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let split_3 = SplitIdAndFooterOffsets { split_id: "split_3".to_string(), @@ -335,6 +344,7 @@ mod tests { timestamp_start: Some(150), timestamp_end: Some(249), num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let query_1 = SearchRequest { @@ -389,6 +399,7 @@ mod tests { split_id: "split_1".to_string(), }], resource_stats: Some(ResourceStats::default()), + splits_by_outcome: None, }; // for split_1, 1 and 1bis cover different timestamp ranges diff --git a/quickwit/quickwit-search/src/lib.rs b/quickwit/quickwit-search/src/lib.rs index 33a21664c3f..8476326bd53 100644 --- a/quickwit/quickwit-search/src/lib.rs +++ b/quickwit/quickwit-search/src/lib.rs @@ -35,7 +35,10 @@ mod scroll_context; mod search_job_placer; mod 
search_response_rest; mod service; +mod soft_delete_query; +mod sort_repr; pub(crate) mod top_k_collector; +mod top_k_computer; mod metrics; mod search_permit_provider; @@ -68,6 +71,7 @@ use quickwit_metastore::{ }; use quickwit_proto::search::{ PartialHit, ResourceStats, SearchRequest, SearchResponse, SplitIdAndFooterOffsets, + SplitsByOutcome, }; use quickwit_proto::types::IndexUid; use quickwit_storage::StorageResolver; @@ -172,6 +176,11 @@ fn extract_split_and_footer_offsets(split_metadata: &SplitMetadata) -> SplitIdAn .as_ref() .map(|time_range| *time_range.end()), num_docs: split_metadata.num_docs as u64, + soft_deleted_doc_ids: split_metadata + .soft_deleted_doc_ids + .iter() + .copied() + .collect(), } } @@ -390,6 +399,16 @@ pub(crate) fn merge_resource_stats_it<'a>( acc_stats } +pub(crate) fn merge_splits_by_outcome_it( + outcomes_it: impl IntoIterator>, +) -> Option { + let mut acc: Option = None; + for new in outcomes_it { + merge_splits_by_outcome(new, &mut acc); + } + acc +} + fn merge_resource_stats( new_stats_opt: &Option, stat_accs_opt: &mut Option, @@ -406,6 +425,27 @@ fn merge_resource_stats( } } } + +pub(crate) fn merge_splits_by_outcome( + new_opt: Option, + acc_opt: &mut Option, +) { + if let Some(new) = new_opt { + if let Some(acc) = acc_opt { + acc.pruned_before_warmup += new.pruned_before_warmup; + acc.pruned_after_warmup += new.pruned_after_warmup; + acc.cancel_before_warmup += new.cancel_before_warmup; + acc.cancel_warmup += new.cancel_warmup; + acc.cancel_cpu_queue += new.cancel_cpu_queue; + acc.cancel_cpu += new.cancel_cpu; + acc.processed += new.processed; + acc.processed_from_metadata += new.processed_from_metadata; + acc.cache_hit += new.cache_hit; + } else { + *acc_opt = Some(new); + } + } +} #[cfg(test)] mod stats_merge_tests { use super::*; @@ -501,4 +541,60 @@ mod stats_merge_tests { }) ); } + + #[test] + fn test_merge_splits_by_outcome() { + let mut acc = None; + + // merging None into None stays None + 
merge_splits_by_outcome(None, &mut acc); + assert_eq!(acc, None); + + let outcome = Some(SplitsByOutcome { + processed: 3, + cache_hit: 1, + cancel_warmup: 1, + ..Default::default() + }); + merge_splits_by_outcome(outcome, &mut acc); + assert_eq!( + acc, + Some(SplitsByOutcome { + processed: 3, + cache_hit: 1, + cancel_warmup: 1, + ..Default::default() + }) + ); + + // merging None keeps the accumulator unchanged + merge_splits_by_outcome(None, &mut acc); + assert_eq!( + acc, + Some(SplitsByOutcome { + processed: 3, + cache_hit: 1, + cancel_warmup: 1, + ..Default::default() + }) + ); + + // retry outcome: the failed split succeeded on retry + let retry_outcome = Some(SplitsByOutcome { + processed: 1, + pruned_before_warmup: 2, + ..Default::default() + }); + merge_splits_by_outcome(retry_outcome, &mut acc); + assert_eq!( + acc, + Some(SplitsByOutcome { + processed: 4, + cache_hit: 1, + cancel_warmup: 1, + pruned_before_warmup: 2, + ..Default::default() + }) + ); + } } diff --git a/quickwit/quickwit-search/src/list_fields_cache.rs b/quickwit/quickwit-search/src/list_fields_cache.rs index 681ce7a2e77..c940893b722 100644 --- a/quickwit/quickwit-search/src/list_fields_cache.rs +++ b/quickwit/quickwit-search/src/list_fields_cache.rs @@ -83,6 +83,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let split_2 = SplitIdAndFooterOffsets { @@ -92,6 +93,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let result = ListFieldsEntryResponse { diff --git a/quickwit/quickwit-search/src/metrics.rs b/quickwit/quickwit-search/src/metrics.rs index db4083a7eed..478b3524694 100644 --- a/quickwit/quickwit-search/src/metrics.rs +++ b/quickwit/quickwit-search/src/metrics.rs @@ -23,6 +23,7 @@ use quickwit_common::metrics::{ linear_buckets, new_counter, new_counter_vec, new_gauge, new_gauge_vec, new_histogram, new_histogram_vec, }; +use 
quickwit_proto::search::SplitsByOutcome; fn print_if_not_null( field_name: &'static str, @@ -36,27 +37,32 @@ fn print_if_not_null( Ok(()) } +/// Counters to track the outcome of leaf search splits. +/// +/// Cancellation counters cover two scenarios: errors in splits and timeouts. pub struct SplitSearchOutcomeCounters { pub cancel_before_warmup: IntCounter, pub cache_hit: IntCounter, + pub processed_from_metadata: IntCounter, pub pruned_before_warmup: IntCounter, pub cancel_warmup: IntCounter, pub pruned_after_warmup: IntCounter, pub cancel_cpu_queue: IntCounter, pub cancel_cpu: IntCounter, - pub success: IntCounter, + pub processed: IntCounter, } impl fmt::Display for SplitSearchOutcomeCounters { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { print_if_not_null("cancel_before_warmup", &self.cancel_before_warmup, f)?; print_if_not_null("cache_hit", &self.cache_hit, f)?; + print_if_not_null("processed_from_metadata", &self.processed_from_metadata, f)?; print_if_not_null("pruned_before_warmup", &self.pruned_before_warmup, f)?; print_if_not_null("cancel_warmup", &self.cancel_warmup, f)?; print_if_not_null("pruned_after_warmup", &self.pruned_after_warmup, f)?; print_if_not_null("cancel_cpu_queue", &self.cancel_cpu_queue, f)?; print_if_not_null("cancel_cpu", &self.cancel_cpu, f)?; - print_if_not_null("success", &self.success, f)?; + print_if_not_null("processed", &self.processed, f)?; Ok(()) } } @@ -92,6 +98,8 @@ impl SplitSearchOutcomeCounters { cancel_before_warmup: search_split_outcome_vec .with_label_values(["cancel_before_warmup"]), cache_hit: search_split_outcome_vec.with_label_values(["cache_hit"]), + processed_from_metadata: search_split_outcome_vec + .with_label_values(["processed_from_metadata"]), pruned_before_warmup: search_split_outcome_vec .with_label_values(["pruned_before_warmup"]), cancel_warmup: search_split_outcome_vec.with_label_values(["cancel_warmup"]), @@ -99,15 +107,42 @@ impl SplitSearchOutcomeCounters { 
.with_label_values(["pruned_after_warmup"]), cancel_cpu_queue: search_split_outcome_vec.with_label_values(["cancel_cpu_queue"]), cancel_cpu: search_split_outcome_vec.with_label_values(["cancel_cpu"]), - success: search_split_outcome_vec.with_label_values(["success"]), + processed: search_split_outcome_vec.with_label_values(["processed"]), + } + } + + pub fn split_by_outcome(&self) -> SplitsByOutcome { + // Destructure to make sure we don't forget to update this if we add a + // new state. + let Self { + pruned_before_warmup, + pruned_after_warmup, + cancel_before_warmup, + cancel_warmup, + cancel_cpu_queue, + cancel_cpu, + processed, + processed_from_metadata, + cache_hit, + } = &self; + SplitsByOutcome { + pruned_before_warmup: pruned_before_warmup.get(), + pruned_after_warmup: pruned_after_warmup.get(), + cancel_before_warmup: cancel_before_warmup.get(), + cancel_warmup: cancel_warmup.get(), + cancel_cpu_queue: cancel_cpu_queue.get(), + cancel_cpu: cancel_cpu.get(), + processed: processed.get(), + processed_from_metadata: processed_from_metadata.get(), + cache_hit: cache_hit.get(), } } } pub struct SearchMetrics { - pub root_search_requests_total: IntCounterVec<1>, - pub root_search_request_duration_seconds: HistogramVec<1>, - pub root_search_targeted_splits: HistogramVec<1>, + pub root_search_requests_total: IntCounterVec<2>, + pub root_search_request_duration_seconds: HistogramVec<2>, + pub root_search_targeted_splits: HistogramVec<2>, pub leaf_search_requests_total: IntCounterVec<2>, pub leaf_search_request_duration_seconds: HistogramVec<2>, pub leaf_search_targeted_splits: HistogramVec<2>, @@ -170,14 +205,14 @@ impl Default for SearchMetrics { "Total number of root search gRPC requests processed.", "search", &[("kind", "server")], - ["status"], + ["user_agent", "status"], ), root_search_request_duration_seconds: new_histogram_vec( "root_search_request_duration_seconds", "Duration of root search gRPC requests in seconds.", "search", &[("kind", "server")], - 
["status"], + ["user_agent", "status"], duration_buckets(), ), root_search_targeted_splits: new_histogram_vec( @@ -185,7 +220,7 @@ impl Default for SearchMetrics { "Number of splits targeted per root search GRPC request.", "search", &[], - ["status"], + ["user_agent", "status"], targeted_splits_buckets.clone(), ), leaf_search_requests_total: new_counter_vec( diff --git a/quickwit/quickwit-search/src/metrics_trackers.rs b/quickwit/quickwit-search/src/metrics_trackers.rs index 9539ac2e098..5d88dd31024 100644 --- a/quickwit/quickwit-search/src/metrics_trackers.rs +++ b/quickwit/quickwit-search/src/metrics_trackers.rs @@ -19,7 +19,8 @@ use std::task::{Context, Poll, ready}; use std::time::Instant; use pin_project::{pin_project, pinned_drop}; -use quickwit_proto::search::{LeafSearchResponse, SearchResponse}; +use quickwit_proto::search::{LeafSearchResponse, SearchResponse, SplitsByOutcome}; +use tracing::{Span, record_all}; use crate::SearchError; use crate::metrics::{SEARCH_METRICS, queue_label}; @@ -34,20 +35,26 @@ pub struct SearchPlanMetricsFuture { #[pin] pub tracked: F, pub start: Instant, - pub is_success: Option, + pub status: Option>, + pub user_agent: String, + pub req_span: Span, } #[pinned_drop] impl PinnedDrop for SearchPlanMetricsFuture { fn drop(self: Pin<&mut Self>) { - let status = match self.is_success { + let status = match self.status { // this is a partial success, actual status will be recorded during the search step - Some(true) => return, - Some(false) => "plan-error", - None => "plan-cancelled", + Some(Ok(())) => return, + Some(Err(error)) => error, + None => { + let _guard = self.req_span.enter(); + tracing::info!("root search cancelled"); + "plan-cancelled" + } }; - let label_values = [status]; + let label_values = [normalize_user_agent(&self.user_agent), status]; SEARCH_METRICS .root_search_requests_total .with_label_values(label_values) @@ -68,9 +75,14 @@ where F: Future> let this = self.project(); let response = 
ready!(this.tracked.poll(cx)); if let Err(err) = &response { + let _guard = this.req_span.enter(); tracing::error!(?err, "root search planning failed"); } - *this.is_success = Some(response.is_ok()); + *this.status = match &response { + Ok(_) => Some(Ok(())), + Err(SearchError::TooManySplits(_)) => Some(Err("too-many-splits")), + Err(_) => Some(Err("plan-error")), + }; Poll::Ready(Ok(response?)) } } @@ -85,13 +97,19 @@ pub struct RootSearchMetricsFuture { pub start: Instant, pub num_targeted_splits: usize, pub status: Option<&'static str>, + pub user_agent: String, + pub req_span: Span, } #[pinned_drop] impl PinnedDrop for RootSearchMetricsFuture { fn drop(self: Pin<&mut Self>) { + if self.status.is_none() { + let _guard = self.req_span.enter(); + tracing::info!("root search cancelled"); + } let status = self.status.unwrap_or("cancelled"); - let label_values = [status]; + let label_values = [normalize_user_agent(&self.user_agent), status]; SEARCH_METRICS .root_search_requests_total .with_label_values(label_values) @@ -107,6 +125,48 @@ impl PinnedDrop for RootSearchMetricsFuture { } } +struct SplitsByOutcomeDisp(SplitsByOutcome); + +impl std::fmt::Display for SplitsByOutcomeDisp { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // Destructure to make sure we update this if a state is added + let SplitsByOutcome { + pruned_before_warmup, + pruned_after_warmup, + cancel_before_warmup, + cancel_warmup, + cancel_cpu_queue, + cancel_cpu, + processed, + processed_from_metadata, + cache_hit, + } = self.0; + let mut sep = "{"; + for (name, val) in [ + ("pruned_before_warmup", pruned_before_warmup), + ("pruned_after_warmup", pruned_after_warmup), + ("cancel_before_warmup", cancel_before_warmup), + ("cancel_warmup", cancel_warmup), + ("cancel_cpu_queue", cancel_cpu_queue), + ("cancel_cpu", cancel_cpu), + ("processed", processed), + ("processed_from_metadata", processed_from_metadata), + ("cache_hit", cache_hit), + ] { + if val > 0 { + write!(f, 
"{sep}{name}={val}")?; + sep = ","; + } + } + if sep == "{" { + write!(f, "{{}}")?; + } else { + write!(f, "}}")?; + } + Ok(()) + } +} + impl Future for RootSearchMetricsFuture where F: Future> { @@ -115,18 +175,27 @@ where F: Future> fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { let this = self.project(); let response = ready!(this.tracked.poll(cx)); + record_all!(this.req_span, elapsed_ms = this.start.elapsed().as_millis()); + let _guard = this.req_span.enter(); if let Err(err) = &response { tracing::error!(?err, "root search failed"); - } - if let Ok(resp) = &response { + *this.status = Some("error"); + } else if let Ok(resp) = &response { + let s = resp.splits_by_outcome.unwrap_or_default(); if resp.failed_splits.is_empty() { *this.status = Some("success"); + tracing::info!(splits_by_outcome = %SplitsByOutcomeDisp(s), "root search success"); } else { *this.status = Some("partial-success"); + tracing::error!( + failed_splits = resp.failed_splits.len(), + first_failed_split = ?resp.failed_splits.first().unwrap(), + splits_by_outcome = %SplitsByOutcomeDisp(s), + "root search partial success" + ); } - } else { - *this.status = Some("error"); } + Poll::Ready(Ok(response?)) } } @@ -182,3 +251,96 @@ where F: Future> Poll::Ready(Ok(response?)) } } + +/// Simplify the user agent to limit the metric's cardinality. +pub fn normalize_user_agent(user_agent: &str) -> &str { + let ua = user_agent.trim(); + + // Browsers always start with "Mozilla/" + if ua.starts_with("Mozilla") { + return "browser"; + } + + let lower = ua.to_ascii_lowercase(); + + // Well-known CLI / library prefixes (match on the start of the lower-cased + // string so version numbers don't matter). 
+ const CLI_PREFIXES: &[&str] = &[ + "curl", + "wget", + "python-httpx", + "python-requests", + "elasticsearch-py", + "go-http-client", + "java", + "okhttp", + "axios", + "ruby", + "node-fetch", + "node", + ]; + if let Some(&prefix) = CLI_PREFIXES.iter().find(|p| lower.starts_with(*p)) { + return prefix; + } + + // Keep short service names verbatim; truncate anything exotic. + if ua.len() <= 64 { ua } else { "other" } +} + +#[cfg(test)] +mod tests { + use quickwit_proto::search::SplitsByOutcome; + + use super::SplitsByOutcomeDisp; + + fn disp(s: SplitsByOutcome) -> String { + format!("{}", SplitsByOutcomeDisp(s)) + } + + #[test] + fn test_splits_by_outcome_disp_all_zero() { + assert_eq!(disp(SplitsByOutcome::default()), "{}"); + } + + #[test] + fn test_splits_by_outcome_disp_single_field() { + assert_eq!( + disp(SplitsByOutcome { + processed: 3, + ..Default::default() + }), + "{processed=3}" + ); + } + + #[test] + fn test_splits_by_outcome_disp_multiple_fields() { + assert_eq!( + disp(SplitsByOutcome { + pruned_before_warmup: 2, + processed: 1, + ..Default::default() + }), + "{pruned_before_warmup=2,processed=1}" + ); + } + + #[test] + fn test_splits_by_outcome_disp_all_fields() { + assert_eq!( + disp(SplitsByOutcome { + pruned_before_warmup: 1, + pruned_after_warmup: 2, + cancel_before_warmup: 3, + cancel_warmup: 4, + cancel_cpu_queue: 5, + cancel_cpu: 6, + processed: 7, + processed_from_metadata: 8, + cache_hit: 9, + }), + "{pruned_before_warmup=1,pruned_after_warmup=2,cancel_before_warmup=3,cancel_warmup=4,\ + cancel_cpu_queue=5,cancel_cpu=6,processed=7,processed_from_metadata=8,cache_hit=9}" + ); + } +} diff --git a/quickwit/quickwit-search/src/retry/mod.rs b/quickwit/quickwit-search/src/retry/mod.rs index 996665717cf..a496159d76c 100644 --- a/quickwit/quickwit-search/src/retry/mod.rs +++ b/quickwit/quickwit-search/src/retry/mod.rs @@ -128,6 +128,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let 
client_for_retry = retry_client( &search_job_placer, diff --git a/quickwit/quickwit-search/src/retry/search.rs b/quickwit/quickwit-search/src/retry/search.rs index 696a352de94..7ae744c8625 100644 --- a/quickwit/quickwit-search/src/retry/search.rs +++ b/quickwit/quickwit-search/src/retry/search.rs @@ -93,6 +93,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }, SplitIdAndFooterOffsets { split_id: "split_2".to_string(), @@ -101,6 +102,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }, ], }], diff --git a/quickwit/quickwit-search/src/root.rs b/quickwit/quickwit-search/src/root.rs index 246d3308636..43a68254d5b 100644 --- a/quickwit/quickwit-search/src/root.rs +++ b/quickwit/quickwit-search/src/root.rs @@ -45,7 +45,7 @@ use tantivy::aggregation::agg_result::AggregationResults; use tantivy::aggregation::intermediate_agg_result::IntermediateAggregationResults; use tantivy::collector::Collector; use tantivy::schema::{Field, FieldEntry, FieldType, Schema}; -use tracing::{debug, info, info_span, instrument}; +use tracing::{debug, info_span, instrument, record_all}; use crate::cluster_client::ClusterClient; use crate::collector::{QuickwitAggregations, make_merge_collector}; @@ -161,12 +161,18 @@ pub struct IndexMetasForLeafSearch { pub(crate) type IndexesMetasForLeafSearch = HashMap; +/// Maps to `true` if the field mapping of all indexes is `datetime` for the +/// given sort field. Contains an entry for every sort field. Does not ensure +/// that the field is indeed a datetime in all splits (doc mapping might +/// have been updated). 
+type SortFieldsIsDatetime = HashMap; + #[derive(Debug)] struct RequestMetadata { timestamp_field_opt: Option, query_ast_resolved: QueryAst, indexes_meta_for_leaf_search: IndexesMetasForLeafSearch, - sort_fields_is_datetime: HashMap, + sort_fields_is_datetime: SortFieldsIsDatetime, } /// Validates request against each index's doc mapper and ensures that: @@ -189,11 +195,10 @@ fn validate_request_and_build_metadata( )?; let query_ast: QueryAst = serde_json::from_str(&search_request.query_ast) .map_err(|err| SearchError::InvalidQuery(err.to_string()))?; - let mut indexes_meta_for_leaf_search: HashMap = - HashMap::new(); + let mut indexes_meta_for_leaf_search: IndexesMetasForLeafSearch = HashMap::new(); let mut query_ast_resolved_opt: Option = None; let mut timestamp_field_opt: Option = None; - let mut sort_fields_is_datetime: HashMap = HashMap::new(); + let mut sort_fields_is_datetime: SortFieldsIsDatetime = HashMap::new(); for index_metadata in indexes_metadata { let doc_mapper = build_doc_mapper( @@ -315,7 +320,7 @@ fn validate_secondary_time(index_metadata: &[IndexMetadata]) -> crate::Result, + sort_field_is_datetime: &mut SortFieldsIsDatetime, ) -> crate::Result<()> { for sort_field in sort_fields.iter() { if let Some(sort_field_entry) = get_sort_by_field_entry(&sort_field.field_name, schema)? { @@ -402,6 +407,7 @@ fn simplify_search_request_for_scroll_api(req: &SearchRequest) -> crate::Result< count_hits: quickwit_proto::search::CountHits::Underestimate as i32, ignore_missing_indexes: req.ignore_missing_indexes, split_id: req.split_id.clone(), + user_agent: req.user_agent.clone(), }) } @@ -439,16 +445,10 @@ fn validate_sort_by_fields_and_search_after( } let mut search_after_sort_value_count = 0; - // TODO: we could validate if the search after sort value types of consistent with the sort - // field types. 
- if let Some(sort_by_value) = search_after_partial_hit.sort_value.as_ref() { - sort_by_value.sort_value.context("sort value must be set")?; + if search_after_partial_hit.sort_value.is_some() { search_after_sort_value_count += 1; } - if let Some(sort_by_value_2) = search_after_partial_hit.sort_value2.as_ref() { - sort_by_value_2 - .sort_value - .context("sort value must be set")?; + if search_after_partial_hit.sort_value2.is_some() { search_after_sort_value_count += 1; } if search_after_sort_value_count != sort_fields_without_doc_count { @@ -486,11 +486,6 @@ fn validate_sort_by_field_type( has_timestamp_format: bool, ) -> crate::Result<()> { let field_name = sort_by_field_entry.name(); - if matches!(sort_by_field_entry.field_type(), FieldType::Str(_)) { - return Err(SearchError::InvalidArgument(format!( - "sort by field on type text is currently not supported `{field_name}`" - ))); - } if !sort_by_field_entry.is_fast() { return Err(SearchError::InvalidArgument(format!( "sort by field must be a fast field, please add the fast property to your field \ @@ -638,6 +633,7 @@ async fn search_partial_hits_phase_with_scroll( cached_partial_hits, failed_splits: leaf_search_resp.failed_splits.clone(), num_successful_splits: leaf_search_resp.num_successful_splits, + splits_by_outcome: leaf_search_resp.splits_by_outcome, }; let scroll_key_and_start_offset: ScrollKeyAndStartOffset = ScrollKeyAndStartOffset::new_with_start_offset( @@ -710,13 +706,18 @@ pub fn get_count_from_metadata(split_metadatas: &[SplitMetadata]) -> Vec> = leaf_search_responses.into_iter().map(Ok).collect_vec(); let span = info_span!("merge_fruits"); - let leaf_search_response = crate::search_thread_pool() + let mut leaf_search_response = crate::search_thread_pool() .run_cpu_intensive(move || { let _span_guard = span.enter(); merge_collector.merge_fruits(leaf_search_results) @@ -828,8 +829,8 @@ pub(crate) async fn search_partial_hits_phase( ); } - if !leaf_search_response.failed_splits.is_empty() { - 
quickwit_common::rate_limited_error!(limit_per_min=6, failed_splits = ?leaf_search_response.failed_splits, "leaf search response contains at least one failed split"); + if leaf_search_response.splits_by_outcome.is_none() { + leaf_search_response.splits_by_outcome = Some(Default::default()); } Ok(leaf_search_response) @@ -952,7 +953,7 @@ fn build_hit_with_position( if let Some(sort_by_value) = sort_value_opt && let Some(output_datetime_format) = &sort_field_1_datetime_format_opt { - convert_sort_datetime_value(sort_by_value, *output_datetime_format)?; + convert_sort_datetime_value_from_nanos(sort_by_value, *output_datetime_format)?; } let sort_value_2_opt = partial_hit_ref .sort_value2 @@ -961,7 +962,7 @@ fn build_hit_with_position( if let Some(sort_by_value) = sort_value_2_opt && let Some(output_datetime_format) = &sort_field_2_datetime_format_opt { - convert_sort_datetime_value(sort_by_value, *output_datetime_format)?; + convert_sort_datetime_value_from_nanos(sort_by_value, *output_datetime_format)?; } let position = *hit_order.get(&key).expect("hit order must be present"); let index_id = split_id_to_index_id_map @@ -1043,12 +1044,12 @@ async fn root_search_aux( num_hits: first_phase_result.num_hits, hits, elapsed_time_micros: 0u64, - errors: Vec::new(), scroll_id: scroll_key_and_start_offset_opt .as_ref() .map(ToString::to_string), failed_splits: first_phase_result.failed_splits, num_successful_splits: first_phase_result.num_successful_splits, + splits_by_outcome: first_phase_result.splits_by_outcome, }) } @@ -1146,7 +1147,7 @@ async fn refine_and_list_matches( search_request: &mut SearchRequest, indexes_metadata: Vec, query_ast_resolved: QueryAst, - sort_fields_is_datetime: HashMap, + sort_fields_is_datetime: SortFieldsIsDatetime, timestamp_field_opt: Option, secondary_timestamp_field_opt: Option, ) -> crate::Result> { @@ -1180,7 +1181,15 @@ async fn refine_and_list_matches( ); } - let tag_filter_ast = extract_tags_from_query(query_ast_resolved); + // We might 
miss some pruning opportunities by restricting the tag filter + // AST to the tag fields of the current doc mappings, but sending all + // possible filters to the metastore is too expensive for large queries (e.g + // TermSet queries with thousands of terms). + let tag_field_names: std::collections::BTreeSet = indexes_metadata + .iter() + .flat_map(|meta| meta.index_config.doc_mapping.tag_fields.iter().cloned()) + .collect(); + let tag_filter_ast = extract_tags_from_query(query_ast_resolved, Some(&tag_field_names)); // TODO if search after is set, we sort by timestamp and we don't want to count all results, // we can refine more here. Same if we sort by _shard_doc @@ -1213,6 +1222,7 @@ async fn refine_and_list_matches( async fn plan_splits_for_root_search( search_request: &mut SearchRequest, metastore: &mut MetastoreServiceClient, + max_splits_per_search: Option, ) -> crate::Result<(Vec, IndexesMetasForLeafSearch)> { let list_indexes_metadatas_request = ListIndexesMetadataRequest { index_id_patterns: search_request.index_id_patterns.clone(), @@ -1246,12 +1256,47 @@ async fn plan_splits_for_root_search( secondary_timestamp_field_opt, ) .await?; + + let num_targeted_splits = split_metadatas.len(); + if let Some(max_total_split_searches) = max_splits_per_search + && num_targeted_splits > max_total_split_searches + { + return Err(SearchError::TooManySplits(format!( + "Targeted split limit exceeded ({num_targeted_splits}>{max_total_split_searches})" + ))); + } + Ok(( split_metadatas, request_metadata.indexes_meta_for_leaf_search, )) } +fn record_request_span(search_request: &SearchRequest) -> tracing::Span { + let span = info_span!( + "request", + indexes = ?PrettySample::new(&search_request.index_id_patterns, 5), + user_agent = search_request.user_agent.as_deref().unwrap_or_default(), + query_ast = %search_request.query_ast, + count_required = search_request.count_hits().as_str_name(), + agg = tracing::field::Empty, + ts_range = tracing::field::Empty, + elapsed_ms = 
tracing::field::Empty, + targeted_splits_bytes = tracing::field::Empty, + num_targeted_splits = tracing::field::Empty, + ); + if let Some(agg) = search_request.aggregation_request.as_ref() { + record_all!(span, agg = %agg); + } + if search_request.start_timestamp.is_some() || search_request.end_timestamp.is_some() { + record_all!( + span, + ts_range = ?search_request.start_timestamp..search_request.end_timestamp, + ); + } + span +} + /// Performs a distributed search. /// 1. Sends leaf requests over gRPC to multiple leaf nodes. /// 2. Merges the search results. @@ -1266,45 +1311,31 @@ pub async fn root_search( ) -> crate::Result { let start_instant = Instant::now(); + let req_span = record_request_span(&search_request); + let (split_metadatas, indexes_meta_for_leaf_search) = SearchPlanMetricsFuture { start: start_instant, - tracked: plan_splits_for_root_search(&mut search_request, &mut metastore), - is_success: None, + user_agent: search_request.user_agent.clone().unwrap_or_default(), + tracked: plan_splits_for_root_search( + &mut search_request, + &mut metastore, + searcher_context.searcher_config.max_splits_per_search, + ), + status: None, + req_span: req_span.clone(), } .await?; - let num_docs: usize = split_metadatas.iter().map(|split| split.num_docs).sum(); - let num_splits = split_metadatas.len(); - - // It would have been nice to add those in the context of the trace span, - // but with our current logging setting, it makes logs too verbose. 
- info!( - query_ast = search_request.query_ast.as_str(), - agg = search_request.aggregation_request(), - start_ts = ?(search_request.start_timestamp()..search_request.end_timestamp()), - count_required = search_request.count_hits().as_str_name(), - num_docs = num_docs, - num_splits = num_splits, - "root_search" - ); - - if let Some(max_total_split_searches) = searcher_context.searcher_config.max_splits_per_search - && max_total_split_searches < num_splits - { - tracing::error!( - num_splits, - max_total_split_searches, - index=?PrettySample::new(search_request.index_id_patterns, 5), - query=%search_request.query_ast, - "max total splits exceeded" - ); - return Err(SearchError::InvalidArgument(format!( - "Number of targeted splits {num_splits} exceeds the limit {max_total_split_searches}" - ))); - } + let targeted_splits_bytes: u64 = split_metadatas + .iter() + .map(|split| split.footer_offsets.end) + .sum(); + let num_targeted_splits = split_metadatas.len(); + record_all!(req_span, targeted_splits_bytes, num_targeted_splits); let mut search_response_result = RootSearchMetricsFuture { start: start_instant, + user_agent: search_request.user_agent.clone().unwrap_or_default(), tracked: root_search_aux( searcher_context, &indexes_meta_for_leaf_search, @@ -1313,7 +1344,8 @@ pub async fn root_search( cluster_client, ), status: None, - num_targeted_splits: num_splits, + num_targeted_splits, + req_span, } .await; @@ -1397,12 +1429,11 @@ pub async fn search_plan( } else { 0 }; - let sstable_query_count = warmup_info.term_dict_fields.len() - + warmup_info - .terms_grouped_by_field - .values() - .map(|terms: &HashMap| terms.len()) - .sum::() + let sstable_query_count = warmup_info + .terms_grouped_by_field + .values() + .map(|terms: &HashMap| terms.len()) + .sum::() + warmup_info .term_ranges_grouped_by_field .values() @@ -1448,10 +1479,9 @@ pub async fn search_plan( /// Converts search after with datetime format to nanoseconds (representation in tantivy). 
/// If the sort field is a datetime field and no datetime format is set, the default format is /// milliseconds. -/// `sort_fields_are_datetime_opt` must be of the same length as `search_request.sort_fields`. fn convert_search_after_datetime_values( search_request: &mut SearchRequest, - sort_fields_is_datetime: &HashMap, + sort_fields_is_datetime: &SortFieldsIsDatetime, ) -> crate::Result<()> { for sort_field in search_request.sort_fields.iter_mut() { if *sort_fields_is_datetime @@ -1488,79 +1518,57 @@ fn convert_search_after_datetime_values( Ok(()) } -/// Convert sort values from input datetime format into nanoseconds. -/// The conversion is done only for U64 and I64 sort values, an error is returned for other types. +/// Converts a numerical sort value from the given input datetime format into a `Datetime` sort +/// value (nanoseconds, tantivy's internal datetime representation). +/// Only `U64` and `I64` sort values are accepted; an error is returned for other types. fn convert_sort_datetime_value_into_nanos( sort_value: &mut SortValue, input_format: SortDatetimeFormat, ) -> crate::Result<()> { - match sort_value { - SortValue::U64(value) => match input_format { - SortDatetimeFormat::UnixTimestampMillis => { - *value = value.checked_mul(1_000_000).ok_or_else(|| { - SearchError::Internal(format!( - "sort value defined in milliseconds is too large and cannot be converted \ - into nanoseconds: {value}" - )) - })?; - } - SortDatetimeFormat::UnixTimestampNanos => { - // Nothing to do as the internal format is nanos. - } - }, - SortValue::I64(value) => match input_format { - SortDatetimeFormat::UnixTimestampMillis => { - *value = value.checked_mul(1_000_000).ok_or_else(|| { - SearchError::Internal(format!( - "sort value defined in milliseconds is too large and cannot be converted \ - into nanoseconds: {value}" - )) - })?; - } - SortDatetimeFormat::UnixTimestampNanos => { - // Nothing to do as the internal format is nanos. 
- } - }, + // Normalise to i64, even though in theory the sort value should be parsed as i64 anyway. + let raw: i64 = match sort_value { + SortValue::U64(value) => i64::try_from(*value).map_err(|_| { + SearchError::Internal(format!( + "sort value is too large to be represented as a datetime: {value}" + )) + })?, + SortValue::I64(value) => *value, _ => { return Err(SearchError::Internal(format!( - "datetime conversion are only support for u64 and i64 sort values, not \ + "datetime conversion is only supported for u64 and i64 sort values, not \ `{sort_value:?}`" ))); } - } + }; + let nanos: i64 = match input_format { + SortDatetimeFormat::UnixTimestampMillis => raw.checked_mul(1_000_000).ok_or_else(|| { + SearchError::Internal(format!( + "sort value defined in milliseconds is too large to be a timestamp: {raw}" + )) + })?, + SortDatetimeFormat::UnixTimestampNanos => raw, + }; + *sort_value = SortValue::Datetime(nanos); Ok(()) } -/// Convert sort values from nanoseconds to the requested output format. -/// The conversion is done only for U64 and I64 sort values, an error is returned for other types. -fn convert_sort_datetime_value( +/// Converts a `Datetime` sort value (nanoseconds, tantivy's internal representation) into the +/// requested output format, replacing the value in place. +/// +/// Only the `Datetime` variant is accepted; an error is returned for other types. +fn convert_sort_datetime_value_from_nanos( sort_value: &mut SortValue, output_format: SortDatetimeFormat, ) -> crate::Result<()> { - match sort_value { - SortValue::U64(value) => match output_format { - SortDatetimeFormat::UnixTimestampMillis => { - *value /= 1_000_000; - } - SortDatetimeFormat::UnixTimestampNanos => { - // Nothing todo as the internal format is in nanos. - } - }, - SortValue::I64(value) => match output_format { - SortDatetimeFormat::UnixTimestampMillis => { - *value /= 1_000_000; - } - SortDatetimeFormat::UnixTimestampNanos => { - // Nothing todo as the internal format is in nanos. 
- } - }, - _ => { - return Err(SearchError::Internal(format!( - "datetime conversion are only support for u64 and i64 sort values, not \ - `{sort_value:?}`" - ))); - } - } + let SortValue::Datetime(nanos) = sort_value else { + return Err(SearchError::Internal(format!( + "datetime conversion is only supported for datetime sort values, not `{sort_value:?}`" + ))); + }; + *sort_value = match output_format { + SortDatetimeFormat::UnixTimestampMillis => SortValue::I64(*nanos / 1_000_000), + SortDatetimeFormat::UnixTimestampNanos => SortValue::I64(*nanos), + }; Ok(()) } @@ -1597,19 +1605,16 @@ impl ExtractTimestampRange<'_> { fn update_start_timestamp( &mut self, lower_bound: &quickwit_query::JsonLiteral, - included: bool, + _included: bool, ) { use quickwit_query::InterpretUserInput; - let Some(lower_bound) = tantivy::DateTime::interpret_json(lower_bound) else { + let Some(lower_bound_timestamp) = tantivy::DateTime::interpret_json(lower_bound) else { return; }; - let mut lower_bound = lower_bound.into_timestamp_secs(); - if !included { - // TODO saturating isn't exactly right, we should replace the RangeQuery with - // a match_none, but the visitor doesn't allow mutation. - lower_bound = lower_bound.saturating_add(1); - } + let lower_bound = lower_bound_timestamp.into_timestamp_secs(); + // lower_bound_timestamp has arbitrary precision, so even if it is + // excluded its truncated value must be used a start_timestamp. 
self.start_timestamp = self.start_timestamp.max(Some(lower_bound)); } @@ -2179,27 +2184,65 @@ mod tests { #[test] fn test_convert_sort_datetime_value() { - let mut sort_value = SortValue::U64(1617000000000000000); - convert_sort_datetime_value(&mut sort_value, SortDatetimeFormat::UnixTimestampMillis) - .unwrap(); - assert_eq!(sort_value, SortValue::U64(1617000000000)); - let mut sort_value = SortValue::I64(1617000000000000000); - convert_sort_datetime_value(&mut sort_value, SortDatetimeFormat::UnixTimestampMillis) - .unwrap(); + // millis output + let mut sort_value = SortValue::Datetime(1617000000000000000); + convert_sort_datetime_value_from_nanos( + &mut sort_value, + SortDatetimeFormat::UnixTimestampMillis, + ) + .unwrap(); assert_eq!(sort_value, SortValue::I64(1617000000000)); - // conversion with float values should fail. + // nanos output + let mut sort_value = SortValue::Datetime(1617000000000000000); + convert_sort_datetime_value_from_nanos( + &mut sort_value, + SortDatetimeFormat::UnixTimestampNanos, + ) + .unwrap(); + assert_eq!(sort_value, SortValue::I64(1617000000000000000)); + + // non-datetime values should fail. 
let mut sort_value = SortValue::F64(1617000000000000000.0); - let error = - convert_sort_datetime_value(&mut sort_value, SortDatetimeFormat::UnixTimestampMillis) - .unwrap_err(); + let error = convert_sort_datetime_value_from_nanos( + &mut sort_value, + SortDatetimeFormat::UnixTimestampMillis, + ) + .unwrap_err(); assert_eq!( error.to_string(), - "internal error: `datetime conversion are only support for u64 and i64 sort values, \ - not `F64(1.617e18)``" + "internal error: `datetime conversion is only supported for datetime sort values, not \ + `F64(1.617e18)``" ); } + #[test] + fn test_sort_datetime_value_roundtrip() { + use quickwit_proto::search::SortByValue; + let nanos: i64 = 1617000000000000000; + + for format in [ + SortDatetimeFormat::UnixTimestampMillis, + SortDatetimeFormat::UnixTimestampNanos, + ] { + let mut sort_value = SortValue::Datetime(nanos); + convert_sort_datetime_value_from_nanos(&mut sort_value, format).unwrap(); + + let json = SortByValue::from(sort_value).into_json(); + + let sort_by_value = SortByValue::try_from_json(json).unwrap(); + let mut sort_value = sort_by_value.sort_value.unwrap(); + + convert_sort_datetime_value_into_nanos(&mut sort_value, format).unwrap(); + + assert_eq!( + sort_value, + SortValue::Datetime(nanos), + "roundtrip failed for format {format:?}" + ); + } + } + #[test] fn test_convert_sort_datetime_value_into_nanos() { let mut sort_value = SortValue::U64(1617000000000); @@ -2208,39 +2251,29 @@ mod tests { SortDatetimeFormat::UnixTimestampMillis, ) .unwrap(); - assert_eq!(sort_value, SortValue::U64(1617000000000000000)); + assert_eq!(sort_value, SortValue::Datetime(1617000000000000000)); let mut sort_value = SortValue::I64(1617000000000); convert_sort_datetime_value_into_nanos( &mut sort_value, SortDatetimeFormat::UnixTimestampMillis, ) .unwrap(); - assert_eq!(sort_value, SortValue::I64(1617000000000000000)); + assert_eq!(sort_value, SortValue::Datetime(1617000000000000000)); // conversion with a too large millisecond 
value should fail. let mut sort_value = SortValue::I64(1617000000000000); - let error = convert_sort_datetime_value_into_nanos( + convert_sort_datetime_value_into_nanos( &mut sort_value, SortDatetimeFormat::UnixTimestampMillis, ) .unwrap_err(); - assert_eq!( - error.to_string(), - "internal error: `sort value defined in milliseconds is too large and cannot be \ - converted into nanoseconds: 1617000000000000`" - ); // conversion with float values should fail. let mut sort_value = SortValue::F64(1617000000000000.0); - let error = convert_sort_datetime_value_into_nanos( + convert_sort_datetime_value_into_nanos( &mut sort_value, SortDatetimeFormat::UnixTimestampMillis, ) .unwrap_err(); - assert_eq!( - error.to_string(), - "internal error: `datetime conversion are only support for u64 and i64 sort values, \ - not `F64(1617000000000000.0)``" - ); } #[test] @@ -2411,7 +2444,7 @@ mod tests { let timestamp_field = schema_builder.add_date_field("timestamp", FAST); let id_field = schema_builder.add_u64_field("id", FAST); let no_fast_field = schema_builder.add_u64_field("no_fast", STORED); - let text_field = schema_builder.add_text_field("text", STORED); + let text_field = schema_builder.add_text_field("text", FAST); let schema = schema_builder.build(); { let sort_by_field_entry = schema.get_field_entry(timestamp_field); @@ -2439,11 +2472,7 @@ mod tests { } { let sort_by_field_entry = schema.get_field_entry(text_field); - let error = validate_sort_by_field_type(sort_by_field_entry, true).unwrap_err(); - assert_eq!( - error.to_string(), - "Invalid argument: sort by field on type text is currently not supported `text`" - ); + validate_sort_by_field_type(sort_by_field_entry, false).unwrap(); } } @@ -2987,9 +3016,9 @@ mod tests { query_ast: qast_json_helper("test", &["body"]), max_hits: 10, sort_fields: vec![SortField { - field_name: "response_date".to_string(), + field_name: "response_time".to_string(), sort_order: SortOrder::Asc.into(), - sort_datetime_format: 
Some(SortDatetimeFormat::UnixTimestampNanos as i32), + ..Default::default() }], ..Default::default() }; @@ -3169,9 +3198,9 @@ mod tests { query_ast: qast_json_helper("test", &["body"]), max_hits: 10, sort_fields: vec![SortField { - field_name: "response_date".to_string(), + field_name: "response_time".to_string(), sort_order: SortOrder::Desc.into(), - sort_datetime_format: Some(SortDatetimeFormat::UnixTimestampNanos as i32), + ..Default::default() }], ..Default::default() }; @@ -4449,7 +4478,8 @@ mod tests { timestamp_range_extractor.start_timestamp = None; timestamp_range_extractor.end_timestamp = None; timestamp_range_extractor.visit(&unusual_bounds).unwrap(); - assert_eq!(timestamp_range_extractor.start_timestamp, Some(1618353942)); + // > 2021-04-13T22:45:41Z must include any 2021-04-13T22:45:41.xxxZ + assert_eq!(timestamp_range_extractor.start_timestamp, Some(1618353941)); assert_eq!(timestamp_range_extractor.end_timestamp, Some(1620283880)); let wrong_field = quickwit_query::query_ast::RangeQuery { @@ -4484,6 +4514,32 @@ mod tests { timestamp_range_extractor.visit(&high_precision).unwrap(); assert_eq!(timestamp_range_extractor.start_timestamp, Some(1618353941)); assert_eq!(timestamp_range_extractor.end_timestamp, Some(1620283880)); + + // create bounds from fractional seconds + let narrow_fractional_range = quickwit_query::query_ast::RangeQuery { + field: timestamp_field.to_string(), + lower_bound: Bound::Excluded(JsonLiteral::String( + "2022-12-16T10:00:57.148999Z".to_owned(), + )), + upper_bound: Bound::Included(JsonLiteral::String( + "2022-12-16T10:00:57.149001Z".to_owned(), + )), + } + .into(); + + let mut timestamp_range_extractor = ExtractTimestampRange { + timestamp_field, + start_timestamp: None, + end_timestamp: None, + }; + timestamp_range_extractor + .visit(&narrow_fractional_range) + .unwrap(); + // When we have > 1671184857.148999, we should get >= 1671184857, not >= 1671184858 + // because the second 1671184857 contains values > 
1671184857.148999 + assert_eq!(timestamp_range_extractor.start_timestamp, Some(1671184857)); + // When we have <= 1671184857.149001, we should get < 1671184858 + assert_eq!(timestamp_range_extractor.end_timestamp, Some(1671184858)); } fn create_search_resp( @@ -5356,7 +5412,7 @@ mod tests { ) .await .unwrap_err(); - assert!(matches!(search_error, SearchError::InvalidArgument { .. })); + assert!(matches!(search_error, SearchError::TooManySplits { .. })); Ok(()) } } diff --git a/quickwit/quickwit-search/src/scroll_context.rs b/quickwit/quickwit-search/src/scroll_context.rs index a4a31a856b5..26e8f7cfad3 100644 --- a/quickwit/quickwit-search/src/scroll_context.rs +++ b/quickwit/quickwit-search/src/scroll_context.rs @@ -25,7 +25,9 @@ use base64::prelude::BASE64_STANDARD; use quickwit_common::metrics::GaugeGuard; use quickwit_common::shared_consts::SCROLL_BATCH_LEN; use quickwit_metastore::SplitMetadata; -use quickwit_proto::search::{LeafSearchResponse, PartialHit, SearchRequest, SplitSearchError}; +use quickwit_proto::search::{ + LeafSearchResponse, PartialHit, SearchRequest, SplitSearchError, SplitsByOutcome, +}; use quickwit_proto::types::IndexUid; use serde::{Deserialize, Serialize}; use tokio::sync::RwLock; @@ -55,6 +57,7 @@ pub(crate) struct ScrollContext { pub cached_partial_hits: Vec, pub failed_splits: Vec, pub num_successful_splits: u64, + pub splits_by_outcome: Option, } impl ScrollContext { @@ -117,6 +120,7 @@ impl ScrollContext { .await?; self.cached_partial_hits_start_offset = start_offset; self.cached_partial_hits = leaf_search_response.partial_hits; + self.splits_by_outcome = leaf_search_response.splits_by_outcome; Ok(true) } } diff --git a/quickwit/quickwit-search/src/search_permit_provider.rs b/quickwit/quickwit-search/src/search_permit_provider.rs index fac7c5e2e3e..502c54710a3 100644 --- a/quickwit/quickwit-search/src/search_permit_provider.rs +++ b/quickwit/quickwit-search/src/search_permit_provider.rs @@ -12,7 +12,8 @@ // See the License for the 
specific language governing permissions and // limitations under the License. -use std::collections::VecDeque; +use std::collections::BinaryHeap; +use std::collections::binary_heap::PeekMut; use std::future::Future; use std::pin::Pin; use std::task::{Context, Poll}; @@ -28,11 +29,11 @@ use crate::metrics::SearchTaskMetrics; /// Distributor of permits to perform split search operation. /// -/// Requests are served in order. Each permit initially reserves a slot for the -/// warmup (limit concurrent downloads) and a pessimistic amount of memory. Once -/// the warmup is completed, the actual memory usage is set and the warmup slot -/// is released. Once the search is completed and the permit is dropped, the -/// remaining memory is also released. +/// Requests are served in order. Each permit reserves a slot for concurrent +/// search execution and a pessimistic amount of memory. The slot is held for +/// the entire duration of the search. Once the actual memory usage is known, +/// it can be updated via `update_memory_usage()`. When the permit is dropped, +/// both the search slot and memory are released. 
#[derive(Clone)] pub struct SearchPermitProvider { message_sender: mpsc::UnboundedSender, @@ -48,10 +49,8 @@ pub enum SearchPermitMessage { UpdateMemory { memory_delta: i64, }, - FreeWarmupSlot, Drop { memory_size: u64, - warmup_slot_freed: bool, }, } @@ -81,7 +80,7 @@ pub fn compute_initial_memory_allocation( impl SearchPermitProvider { pub fn new( - num_download_slots: usize, + max_num_concurrent_split_searches: usize, memory_budget: ByteSize, metrics: SearchTaskMetrics, ) -> Self { @@ -91,9 +90,9 @@ impl SearchPermitProvider { let actor = SearchPermitActor { msg_receiver: message_receiver, msg_sender: message_sender.downgrade(), - num_warmup_slots_available: num_download_slots, + num_search_slots_available: max_num_concurrent_split_searches, total_memory_budget: memory_budget.as_u64(), - permits_requests: VecDeque::new(), + permits_requests: BinaryHeap::new(), total_memory_allocated: 0u64, #[cfg(test)] stopped: state_sender, @@ -118,8 +117,11 @@ impl SearchPermitProvider { &self, splits: impl IntoIterator, ) -> Vec { + let permit_sizes: Vec = splits.into_iter().map(|size| size.as_u64()).collect(); + if permit_sizes.is_empty() { + return Vec::new(); + } let (permit_sender, permit_receiver) = oneshot::channel(); - let permit_sizes = splits.into_iter().map(|size| size.as_u64()).collect(); self.message_sender .send(SearchPermitMessage::Request { permit_sender, @@ -136,17 +138,91 @@ struct SearchPermitActor { metrics: SearchTaskMetrics, msg_receiver: mpsc::UnboundedReceiver, msg_sender: mpsc::WeakUnboundedSender, - num_warmup_slots_available: usize, + num_search_slots_available: usize, /// Note it is possible for memory_allocated to exceed memory_budget temporarily, /// if and only if a split leaf search task ended up using more than `initial_allocation`. /// When it happens, new permits will not be assigned until the memory is freed. 
total_memory_budget: u64, total_memory_allocated: u64, - permits_requests: VecDeque<(oneshot::Sender, u64)>, + permits_requests: BinaryHeap, #[cfg(test)] stopped: watch::Sender, } +struct SingleSplitPermitRequest { + permit_sender: oneshot::Sender, + permit_size: u64, +} + +struct LeafPermitRequest { + /// Single split permit requests for this leaf search. + single_split_permit_requests: std::vec::IntoIter, +} + +impl Ord for LeafPermitRequest { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // we compare other with self and not the other way around because we want a min-heap and + // Rust's is a max-heap + other + .single_split_permit_requests + .as_slice() + .len() + .cmp(&self.single_split_permit_requests.as_slice().len()) + } +} + +impl PartialOrd for LeafPermitRequest { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for LeafPermitRequest { + fn eq(&self, other: &Self) -> bool { + self.cmp(other).is_eq() + } +} + +impl Eq for LeafPermitRequest {} + +impl LeafPermitRequest { + fn from_estimated_costs(permit_sizes: Vec) -> (Self, Vec) { + let mut permits = Vec::with_capacity(permit_sizes.len()); + let mut single_split_permit_requests = Vec::with_capacity(permit_sizes.len()); + for permit_size in permit_sizes { + let (tx, rx) = oneshot::channel(); + // we keep our internal list of permits and the returned wait handles in the + // same order to make sure we emit each permit in the right order. 
Doing otherwise + // may cause deadlocks + single_split_permit_requests.push(SingleSplitPermitRequest { + permit_sender: tx, + permit_size, + }); + permits.push(SearchPermitFuture(rx)); + } + ( + LeafPermitRequest { + single_split_permit_requests: single_split_permit_requests.into_iter(), + }, + permits, + ) + } + + fn pop_if_smaller_than(&mut self, max_size: u64) -> Option { + // IntoIter::as_slice() allows us to peek at the next element without consuming it + match self.single_split_permit_requests.as_slice().first() { + Some(request) if request.permit_size <= max_size => { + self.single_split_permit_requests.next() + } + _ => None, + } + } + + fn is_empty(&self) -> bool { + self.single_split_permit_requests.as_slice().is_empty() + } +} + impl SearchPermitActor { async fn run(mut self) { // Stops when the last clone of SearchPermitProvider is dropped. @@ -163,12 +239,14 @@ impl SearchPermitActor { permit_sizes, permit_sender, } => { - let mut permits = Vec::with_capacity(permit_sizes.len()); - for permit_size in permit_sizes { - let (tx, rx) = oneshot::channel(); - self.permits_requests.push_back((tx, permit_size)); - permits.push(SearchPermitFuture(rx)); - } + assert_ne!( + permit_sizes.len(), + 0, + "empty permit request would lead to deadlock" + ); + let (leaf_permit_request, permits) = + LeafPermitRequest::from_estimated_costs(permit_sizes); + self.permits_requests.push(leaf_permit_request); self.assign_available_permits(); // The receiver could be dropped in the (unlikely) situation // where the future requesting these permits is cancelled before @@ -183,17 +261,8 @@ impl SearchPermitActor { (self.total_memory_allocated as i64 + memory_delta) as u64; self.assign_available_permits(); } - SearchPermitMessage::FreeWarmupSlot => { - self.num_warmup_slots_available += 1; - self.assign_available_permits(); - } - SearchPermitMessage::Drop { - memory_size, - warmup_slot_freed, - } => { - if !warmup_slot_freed { - self.num_warmup_slots_available += 1; - } + 
SearchPermitMessage::Drop { memory_size } => { + self.num_search_slots_available += 1; self.total_memory_allocated = self .total_memory_allocated .checked_sub(memory_size) @@ -203,41 +272,48 @@ impl SearchPermitActor { } } - fn pop_next_request_if_serviceable(&mut self) -> Option<(oneshot::Sender, u64)> { - if self.num_warmup_slots_available == 0 { + fn pop_next_request_if_serviceable(&mut self) -> Option { + if self.num_search_slots_available == 0 { return None; } - if let Some((_, next_permit_size)) = self.permits_requests.front() - && self.total_memory_allocated + next_permit_size <= self.total_memory_budget - { - return self.permits_requests.pop_front(); + let available_memory = self + .total_memory_budget + .checked_sub(self.total_memory_allocated)?; + let mut peeked = self.permits_requests.peek_mut()?; + + if let Some(permit_request) = peeked.pop_if_smaller_than(available_memory) { + if peeked.is_empty() { + PeekMut::pop(peeked); + } + return Some(permit_request); } None } fn assign_available_permits(&mut self) { - let ongoing_tasks_metric = self.metrics.ongoing_tasks; - while let Some((permit_requester_tx, next_permit_size)) = - self.pop_next_request_if_serviceable() - { + while let Some(permit_request) = self.pop_next_request_if_serviceable() { + let ongoing_tasks_metric = self.metrics.ongoing_tasks; let mut ongoing_gauge_guard = GaugeGuard::from_gauge(ongoing_tasks_metric); ongoing_gauge_guard.add(1); - self.total_memory_allocated += next_permit_size; - self.num_warmup_slots_available -= 1; - permit_requester_tx + self.total_memory_allocated += permit_request.permit_size; + self.num_search_slots_available -= 1; + permit_request + .permit_sender .send(SearchPermit { _ongoing_gauge_guard: ongoing_gauge_guard, msg_sender: self.msg_sender.clone(), - memory_allocation: next_permit_size, - warmup_slot_freed: false, + memory_allocation: permit_request.permit_size, }) // if the requester dropped its receiver, we drop the newly // created SearchPermit which 
releases the resources .ok(); } - self.metrics - .pending_tasks - .set(self.permits_requests.len() as i64); + let pending_tasks = self + .permits_requests + .iter() + .map(|leaf_req| leaf_req.single_split_permit_requests.as_slice().len() as i64) + .sum(); + self.metrics.pending_tasks.set(pending_tasks); } } @@ -245,7 +321,6 @@ pub struct SearchPermit { _ongoing_gauge_guard: GaugeGuard<'static>, msg_sender: mpsc::WeakUnboundedSender, memory_allocation: u64, - warmup_slot_freed: bool, } impl SearchPermit { @@ -259,16 +334,6 @@ impl SearchPermit { self.send_if_still_running(SearchPermitMessage::UpdateMemory { memory_delta }); } - /// Drop the warmup permit, allowing more downloads to be started. Only one - /// slot is attached to each permit so calling this again has no effect. - pub fn free_warmup_slot(&mut self) { - if self.warmup_slot_freed { - return; - } - self.warmup_slot_freed = true; - self.send_if_still_running(SearchPermitMessage::FreeWarmupSlot); - } - pub fn memory_allocation(&self) -> ByteSize { ByteSize(self.memory_allocation) } @@ -288,7 +353,6 @@ impl Drop for SearchPermit { fn drop(&mut self) { self.send_if_still_running(SearchPermitMessage::Drop { memory_size: self.memory_allocation, - warmup_slot_freed: self.warmup_slot_freed, }); } } @@ -324,6 +388,18 @@ mod tests { SEARCH_METRICS.search_task_metrics() } + #[tokio::test] + async fn test_get_permits_empty() { + let permit_provider = SearchPermitProvider::new(1, ByteSize::mb(100), test_metrics()); + let permits = permit_provider.get_permits(std::iter::empty()).await; + assert!(permits.is_empty()); + + // Subsequent non-empty requests must still be served normally. 
+ let permits = permit_provider.get_permits([ByteSize::mb(10)]).await; + assert_eq!(permits.len(), 1); + let _permit = permits.into_iter().next().unwrap().await; + } + #[tokio::test] async fn test_search_permit_order() { let permit_provider = SearchPermitProvider::new(1, ByteSize::mb(100), test_metrics()); @@ -374,6 +450,75 @@ mod tests { } } + #[tokio::test] + async fn test_search_permit_order_with_concurrent_search() { + let permit_provider = SearchPermitProvider::new(4, ByteSize::mb(100), test_metrics()); + let mut all_futures = Vec::new(); + let first_batch_of_permits = permit_provider + .get_permits(repeat_n(ByteSize::mb(10), 8)) + .await; + assert_eq!(first_batch_of_permits.len(), 8); + all_futures.extend( + first_batch_of_permits + .into_iter() + .enumerate() + .map(move |(i, fut)| ((1, i), fut)), + ); + + let second_batch_of_permits = permit_provider + .get_permits(repeat_n(ByteSize::mb(10), 2)) + .await; + all_futures.extend( + second_batch_of_permits + .into_iter() + .enumerate() + .map(move |(i, fut)| ((2, i), fut)), + ); + + let third_batch_of_permits = permit_provider + .get_permits(repeat_n(ByteSize::mb(10), 6)) + .await; + all_futures.extend( + third_batch_of_permits + .into_iter() + .enumerate() + .map(move |(i, fut)| ((3, i), fut)), + ); + + // not super useful, considering what join set does, but still a tiny bit more sound. 
+ all_futures.shuffle(&mut rand::rng()); + + let mut join_set = JoinSet::new(); + for (res, fut) in all_futures { + join_set.spawn(async move { + let permit = fut.await; + (res, permit) + }); + } + let mut ordered_result: Vec<(usize, usize)> = Vec::with_capacity(20); + while let Some(Ok(((batch_id, order), _permit))) = join_set.join_next().await { + ordered_result.push((batch_id, order)); + } + + let mut counters = [0; 4]; + let expected_result: Vec<(usize, usize)> = [ + 1, 1, 1, 1, // initial 4 permits + 2, 2, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, + ] + .into_iter() + .map(|batch_id| { + let order = counters[batch_id]; + counters[batch_id] += 1; + (batch_id, order) + }) + .collect(); + + // for the first 4 permits, the order is not well defined as they are all granted at once, + // and we poll futures in a random order. We sort them to fix that artifact + ordered_result[..4].sort(); + assert_eq!(ordered_result, expected_result); + } + #[tokio::test] async fn test_search_permit_early_drops() { let permit_provider = SearchPermitProvider::new(1, ByteSize::mb(100), test_metrics()); @@ -446,7 +591,7 @@ mod tests { } #[tokio::test] - async fn test_warmup_slot() { + async fn test_concurrent_search_slots() { let permit_provider = SearchPermitProvider::new(10, ByteSize::mb(100), test_metrics()); let mut permit_futs = permit_provider .get_permits(repeat_n(ByteSize::mb(1), 16)) @@ -458,27 +603,19 @@ mod tests { .buffered(1) .collect() .await; - // the next permit is blocked by the warmup slots + // the next permit is blocked by the concurrent search slots let next_blocked_permit_fut = remaining_permit_futs.next().unwrap(); try_get(next_blocked_permit_fut).await.err().unwrap(); // if we drop one of the permits, we can get a new one permits.drain(0..1); let next_permit_fut = remaining_permit_futs.next().unwrap(); permits.push(try_get(next_permit_fut).await.unwrap()); - // the next permit is blocked again by the warmup slots + // the next permit is blocked again by the concurrent 
search slots let next_blocked_permit_fut = remaining_permit_futs.next().unwrap(); try_get(next_blocked_permit_fut).await.err().unwrap(); - // we can explicitly free the warmup slot on a permit - permits[0].free_warmup_slot(); + // dropping a permit frees up a slot + permits.drain(0..1); let next_permit_fut = remaining_permit_futs.next().unwrap(); permits.push(try_get(next_permit_fut).await.unwrap()); - // dropping that same permit does not free up another slot - permits.drain(0..1); - let next_blocked_permit_fut = remaining_permit_futs.next().unwrap(); - try_get(next_blocked_permit_fut).await.err().unwrap(); - // but dropping a permit for which the slot wasn't explicitly free does free up a slot - permits.drain(0..1); - let next_blocked_permit_fut = remaining_permit_futs.next().unwrap(); - permits.push(try_get(next_blocked_permit_fut).await.unwrap()); } } diff --git a/quickwit/quickwit-search/src/search_response_rest.rs b/quickwit/quickwit-search/src/search_response_rest.rs index 58eddc7b927..c2cbafaf392 100644 --- a/quickwit/quickwit-search/src/search_response_rest.rs +++ b/quickwit/quickwit-search/src/search_response_rest.rs @@ -52,8 +52,6 @@ pub struct SearchResponseRest { pub snippets: Option>, /// Elapsed time. pub elapsed_time_micros: u64, - /// Search errors. - pub errors: Vec, /// Aggregations. 
#[schema(value_type = Object)] #[serde(skip_serializing_if = "Option::is_none")] @@ -107,7 +105,6 @@ impl TryFrom for SearchResponseRest { hits: documents, snippets: snippet_opt, elapsed_time_micros: search_response.elapsed_time_micros, - errors: search_response.errors, aggregations: aggregations_opt, }) } diff --git a/quickwit/quickwit-search/src/service.rs b/quickwit/quickwit-search/src/service.rs index 890052a5053..e62661278fb 100644 --- a/quickwit/quickwit-search/src/service.rs +++ b/quickwit/quickwit-search/src/service.rs @@ -406,10 +406,10 @@ pub(crate) async fn scroll( num_hits: scroll_context.total_num_hits, elapsed_time_micros: start.elapsed().as_micros() as u64, scroll_id: Some(next_scroll_id.to_string()), - errors: Vec::new(), aggregation_postcard: None, failed_splits: scroll_context.failed_splits, num_successful_splits: scroll_context.num_successful_splits, + splits_by_outcome: scroll_context.splits_by_outcome, }) } /// [`SearcherContext`] provides a common set of variables diff --git a/quickwit/quickwit-search/src/soft_delete_query.rs b/quickwit/quickwit-search/src/soft_delete_query.rs new file mode 100644 index 00000000000..8283523359d --- /dev/null +++ b/quickwit/quickwit-search/src/soft_delete_query.rs @@ -0,0 +1,321 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt; +use std::sync::Arc; + +use tantivy::query::{EnableScoring, Exclude, Explanation, Query, QueryClone, Scorer, Weight}; +use tantivy::{DocId, DocSet, Score, SegmentReader, TERMINATED}; + +/// A [`DocSet`] backed by a sorted, deduplicated vector of doc IDs. +/// +/// Used as the excluding [`DocSet`] argument passed to [`Exclude`] when +/// constructing a scorer inside [`SoftDeleteWeight`]. +/// +/// # Invariant +/// +/// The underlying slice must be sorted in strictly ascending order and free of +/// duplicates. This is guaranteed by [`SoftDeleteQuery::new`], which sorts and +/// deduplicates the input before storing it. +struct SortedDocIdSet { + doc_ids: Arc>, + /// Index of the current document inside `doc_ids`. + cursor: usize, +} + +impl SortedDocIdSet { + fn new(doc_ids: Arc>) -> Self { + SortedDocIdSet { doc_ids, cursor: 0 } + } +} + +impl DocSet for SortedDocIdSet { + #[inline] + fn advance(&mut self) -> DocId { + self.cursor += 1; + self.doc() + } + + fn seek(&mut self, target: DocId) -> DocId { + // The DocSet contract guarantees seek() is always called with a + // non-decreasing target, so we only need to scan forward from cursor. + let remaining = self.doc_ids.get(self.cursor..).unwrap_or(&[]); + let offset = remaining.partition_point(|&id| id < target); + self.cursor += offset; + self.doc() + } + + #[inline] + fn doc(&self) -> DocId { + self.doc_ids.get(self.cursor).copied().unwrap_or(TERMINATED) + } + + fn size_hint(&self) -> u32 { + self.doc_ids.len().saturating_sub(self.cursor) as u32 + } +} + +/// [`Weight`] produced by [`SoftDeleteQuery`]. +/// +/// Wraps the inner weight's scorer with [`Exclude`] to filter out +/// soft-deleted doc IDs transparently across all collection paths. 
+struct SoftDeleteWeight { + inner: Box, + deleted_doc_ids: Arc>, +} + +impl Weight for SoftDeleteWeight { + fn scorer(&self, reader: &SegmentReader, boost: Score) -> tantivy::Result> { + let inner_scorer = self.inner.scorer(reader, boost)?; + let excluded = SortedDocIdSet::new(Arc::clone(&self.deleted_doc_ids)); + Ok(Box::new(Exclude::new(inner_scorer, excluded))) + } + + fn explain(&self, reader: &SegmentReader, doc: DocId) -> tantivy::Result { + self.inner.explain(reader, doc) + } +} + +/// A tantivy [`Query`] that wraps another query and excludes a fixed set of +/// soft-deleted doc IDs from every result set it produces. +pub(crate) struct SoftDeleteQuery { + inner: Box, + /// Sorted, deduplicated tantivy doc IDs to exclude. + deleted_doc_ids: Arc>, +} + +impl SoftDeleteQuery { + /// Creates a new [`SoftDeleteQuery`]. + /// + /// `deleted_doc_ids` may be supplied in any order and may contain + /// duplicates; this constructor sorts and deduplicates the input. + pub(crate) fn new(inner: Box, mut deleted_doc_ids: Vec) -> Self { + deleted_doc_ids.sort_unstable(); + deleted_doc_ids.dedup(); + SoftDeleteQuery { + inner, + deleted_doc_ids: Arc::new(deleted_doc_ids), + } + } +} + +impl Clone for SoftDeleteQuery { + fn clone(&self) -> Self { + SoftDeleteQuery { + inner: self.inner.box_clone(), + deleted_doc_ids: Arc::clone(&self.deleted_doc_ids), + } + } +} + +impl fmt::Debug for SoftDeleteQuery { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SoftDeleteQuery") + .field("inner", &self.inner) + .field("num_deleted", &self.deleted_doc_ids.len()) + .finish() + } +} + +impl Query for SoftDeleteQuery { + fn weight(&self, enable_scoring: EnableScoring<'_>) -> tantivy::Result> { + let inner_weight = self.inner.weight(enable_scoring)?; + Ok(Box::new(SoftDeleteWeight { + inner: inner_weight, + deleted_doc_ids: Arc::clone(&self.deleted_doc_ids), + })) + } + + fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a tantivy::Term, bool)) { + 
self.inner.query_terms(visitor); + } +} + +#[cfg(test)] +mod tests { + use tantivy::collector::Count; + use tantivy::query::AllQuery; + use tantivy::schema::{Schema, TEXT}; + use tantivy::{Index, IndexWriter}; + + use super::*; + + /// Creates a single-segment, in-RAM index containing `num_docs` documents. + /// + /// Returns `(index, reader)`. The tantivy doc IDs are 0-based and + /// contiguous inside the single segment, so doc ID `k` corresponds to the + /// (k+1)-th inserted document. + fn make_index(num_docs: usize) -> tantivy::Result<(Index, tantivy::IndexReader)> { + let mut schema_builder = Schema::builder(); + let text_field = schema_builder.add_text_field("text", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut writer: IndexWriter = index.writer(15_000_000)?; + for i in 0..num_docs { + writer.add_document(tantivy::doc!(text_field => format!("doc {i}")))?; + } + writer.commit()?; + let reader = index.reader()?; + Ok((index, reader)) + } + + // ── SortedDocIdSet unit tests ───────────────────────────────────────────── + + #[test] + fn test_sorted_doc_id_set_advance_through_all() { + let ids = Arc::new(vec![2u32, 5, 8, 11]); + let mut ds = SortedDocIdSet::new(ids); + + assert_eq!(ds.doc(), 2); + assert_eq!(ds.advance(), 5); + assert_eq!(ds.advance(), 8); + assert_eq!(ds.advance(), 11); + // Advancing past the last element returns TERMINATED via unwrap_or. + assert_eq!(ds.advance(), TERMINATED); + assert_eq!(ds.doc(), TERMINATED); + // Subsequent advances keep returning TERMINATED: cursor increments past + // doc_ids.len(), get() returns None, unwrap_or yields TERMINATED. 
+ assert_eq!(ds.advance(), TERMINATED); + } + + #[test] + fn test_sorted_doc_id_set_empty() { + let mut ds = SortedDocIdSet::new(Arc::new(vec![])); + assert_eq!(ds.doc(), TERMINATED); + assert_eq!(ds.advance(), TERMINATED); + assert_eq!(ds.seek(0), TERMINATED); + } + + #[test] + fn test_sorted_doc_id_set_seek_exact_hit() { + let ids = Arc::new(vec![1u32, 3, 7, 10, 15]); + let mut ds = SortedDocIdSet::new(ids); + + assert_eq!(ds.seek(7), 7); + assert_eq!(ds.doc(), 7); + + // Seeking to the same target is idempotent. + assert_eq!(ds.seek(7), 7); + + assert_eq!(ds.seek(10), 10); + assert_eq!(ds.doc(), 10); + } + + #[test] + fn test_sorted_doc_id_set_seek_between_entries() { + let ids = Arc::new(vec![1u32, 3, 7, 10, 15]); + let mut ds = SortedDocIdSet::new(ids); + + // Target falls between 3 and 7 → should return 7. + assert_eq!(ds.seek(4), 7); + assert_eq!(ds.doc(), 7); + + // Target falls between 10 and 15 → should return 15. + assert_eq!(ds.seek(11), 15); + assert_eq!(ds.doc(), 15); + } + + #[test] + fn test_sorted_doc_id_set_seek_past_last_entry() { + let ids = Arc::new(vec![1u32, 3, 7]); + let mut ds = SortedDocIdSet::new(ids); + + assert_eq!(ds.seek(100), TERMINATED); + assert_eq!(ds.doc(), TERMINATED); + } + + #[test] + fn test_sorted_doc_id_set_seek_terminated_sentinel() { + let ids = Arc::new(vec![1u32, 3, 7]); + let mut ds = SortedDocIdSet::new(ids); + + assert_eq!(ds.seek(TERMINATED), TERMINATED); + assert_eq!(ds.doc(), TERMINATED); + } + + #[test] + fn test_sorted_doc_id_set_seek_before_current_position() { + // After advancing past the start, seeking to the current doc must not + // go backwards. + let ids = Arc::new(vec![1u32, 5, 9]); + let mut ds = SortedDocIdSet::new(ids); + + ds.advance(); // cursor → 5 + // Seeking to 5 (= current) must keep returning 5. 
+ assert_eq!(ds.seek(5), 5); + assert_eq!(ds.doc(), 5); + } + + #[test] + fn test_sorted_doc_id_set_size_hint_decrements() { + let ids = Arc::new(vec![1u32, 3, 7, 10]); + let mut ds = SortedDocIdSet::new(ids); + + assert_eq!(ds.size_hint(), 4); + ds.advance(); + assert_eq!(ds.size_hint(), 3); + ds.advance(); + ds.advance(); + ds.advance(); // now TERMINATED + assert_eq!(ds.size_hint(), 0); + } + + #[test] + fn test_soft_delete_query_no_deleted_docs() -> tantivy::Result<()> { + let (_index, reader) = make_index(5)?; + let searcher = reader.searcher(); + + let query = SoftDeleteQuery::new(Box::new(AllQuery), vec![]); + assert_eq!(searcher.search(&query, &Count)?, 5); + Ok(()) + } + + #[test] + fn test_soft_delete_query_excludes_subset() -> tantivy::Result<()> { + let (_index, reader) = make_index(5)?; + let searcher = reader.searcher(); + + // Delete doc IDs 1 and 3; 0, 2, 4 should remain. + let query = SoftDeleteQuery::new(Box::new(AllQuery), vec![1, 3]); + assert_eq!(searcher.search(&query, &Count)?, 3); + Ok(()) + } + + #[test] + fn test_soft_delete_query_excludes_all_docs() -> tantivy::Result<()> { + let (_index, reader) = make_index(3)?; + let searcher = reader.searcher(); + + let query = SoftDeleteQuery::new(Box::new(AllQuery), vec![0, 1, 2]); + assert_eq!(searcher.search(&query, &Count)?, 0); + Ok(()) + } + + #[test] + fn test_soft_delete_query_count_method_matches_search() -> tantivy::Result<()> { + let (_index, reader) = make_index(10)?; + let searcher = reader.searcher(); + + // Delete every even doc ID. 
+ let deleted: Vec = (0..10).filter(|x| x % 2 == 0).collect(); + let query = SoftDeleteQuery::new(Box::new(AllQuery), deleted); + + let count_via_search = searcher.search(&query, &Count)?; + let count_via_method = query.count(&searcher)?; + + assert_eq!(count_via_search, 5); + assert_eq!(count_via_method, 5); + Ok(()) + } +} diff --git a/quickwit/quickwit-search/src/sort_repr.rs b/quickwit/quickwit-search/src/sort_repr.rs new file mode 100644 index 00000000000..940e97366c9 --- /dev/null +++ b/quickwit/quickwit-search/src/sort_repr.rs @@ -0,0 +1,409 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::Debug; +use std::ops::Not; + +use quickwit_proto::search::SortOrder; +use tantivy::DocId; + +use crate::top_k_computer::MinValue; + +/// A u64 that can be elided to unit type to save memory. 
+pub(crate) trait ElidableU64: Ord + Copy + Debug + MinValue { + fn value(self) -> u64; + fn from_u64(value: u64) -> Self; + fn is_elided() -> bool; +} + +impl MinValue for u64 { + fn min_value() -> Self { + 0 + } +} + +impl MinValue for () { + fn min_value() -> Self {} +} + +impl ElidableU64 for u64 { + fn from_u64(value: u64) -> Self { + value + } + fn value(self) -> u64 { + self + } + fn is_elided() -> bool { + false + } +} + +impl ElidableU64 for () { + fn from_u64(_value: u64) -> Self {} + fn value(self) -> u64 { + 0 + } + fn is_elided() -> bool { + true + } +} + +/// Encoded representation of the value, the index of its accessor in the list +/// of fast field columns and the sort order. +/// +/// The first u8 encodes the index of the accessor and a sentinel value for +/// missing and search after values: +/// - 0 is a sentinel for skip all +/// - 1 is a sentinel for missing (always last in the sort order) +/// - other odd values encode the index of the accessor in the list of fast field columns (3 for +/// index 0, 5 for index 1, etc.) +/// - even values are sentinels for search after values that keep/skip all documents for a given +/// column (2 to skip all columns but keep missing, 4 only keeps column 0, 6 keeps column 0 and 1, +/// etc.) +/// +/// The following u64 encodes the value itself or its bitwise negation to +/// reverse the sort order when building an ascending sort (keeping in mind that +/// this is fed to a top-k calculator). +#[derive(Clone, Copy)] +pub(crate) struct InternalValueRepr(u8, V); + +/// Inverts the sort order by reversing the bits. +/// +/// Using the bitwise negation is a cheap way to reverse the order while +/// maintaining the type (and memory footprint). It is also reversible +/// (`not(not(value)) == value`) which makes it simply decodable. +/// +/// This wrapper is just an alias to make the code more readable. Using `!value` +/// or `value.not()` inline yields the same result. 
+#[inline] +fn reverse>(value: T) -> T { + value.not() +} + +impl InternalValueRepr { + #[inline] + pub fn new(value: u64, accessor_idx: u8, order: SortOrder) -> Self { + // For Asc, smaller values should win: invert so smaller maps to larger repr + match order { + SortOrder::Asc => Self(reverse(accessor_idx * 2 + 3), V::from_u64(reverse(value))), + SortOrder::Desc => Self(accessor_idx * 2 + 3, V::from_u64(value)), + } + } + /// A sentinel value that can be instantiated as search after boundary to indicate + /// that all documents should be kept for the given column. + pub fn new_keep_column(accessor_idx: u8, order: SortOrder) -> Self { + match order { + SortOrder::Asc => Self(reverse(accessor_idx * 2 + 2), V::from_u64(0)), + SortOrder::Desc => Self(accessor_idx * 2 + 4, V::from_u64(0)), + } + } + #[inline] + pub fn new_missing() -> Self { + // Missing always last in topk, so use the smallest possible value + // (besides the skip_all value) + Self(1, V::from_u64(0)) + } + /// A sentinel value that can be instantiated as search after boundary to indicate + /// that all documents should be skipped for the given column. + pub fn new_skip_column(accessor_idx: u8, order: SortOrder) -> Self { + match order { + SortOrder::Asc => Self(reverse(accessor_idx * 2 + 4), V::from_u64(0)), + SortOrder::Desc => Self(accessor_idx * 2 + 2, V::from_u64(0)), + } + } + /// A sentinel value that can be instantiated as search after boundary to indicate + /// that all documents should be skipped, except those with a missing value. 
+ pub fn new_skip_all_but_missing() -> Self { + Self(2, V::from_u64(0)) + } + #[inline] + pub fn decode(self, order: SortOrder) -> Option<(u8, u64)> { + if self.0 == 1 { + return None; + } + debug_assert_eq!( + match order { + SortOrder::Asc => reverse(self.0), + SortOrder::Desc => self.0, + } % 2, + 1, + "sentinel indexes are not meant to be decoded" + ); + match order { + SortOrder::Asc => Some(((reverse(self.0) - 3) / 2, reverse(V::value(self.1)))), + SortOrder::Desc => Some(((self.0 - 3) / 2, V::value(self.1))), + } + } +} + +/// Ordered representation of the sort values. It is the concatenation of: +/// - the first sort value's internal representation, a (u8, V1) pair +/// - the second sort value's internal representation, a (u8, V2) pair +/// - the doc id, preceded by a sentinel indicating how it should be used for tie-breaking +/// +/// ElidableU64 is used instead of u64 for sort values to reduce the size of the +/// representation when they are not used. The associated sentinels could also +/// be elided, but in practice they don't have an impact on the tuple's size +/// because the doc id and its sentinel (u8, u32) get padded anyway. 
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Default, Hash)] +pub(crate) struct InternalSortValueRepr(u8, V1, u8, V2, u8, u32); + +impl InternalSortValueRepr { + #[inline] + pub fn new( + sort_1: InternalValueRepr, + sort_2: InternalValueRepr, + doc_id: DocId, + doc_id_sort: SortOrder, + ) -> Self { + // For Asc, smaller values should win: invert so smaller maps to larger repr + match doc_id_sort { + SortOrder::Asc => Self(sort_1.0, sort_1.1, sort_2.0, sort_2.1, 1, reverse(doc_id)), + SortOrder::Desc => Self(sort_1.0, sort_1.1, sort_2.0, sort_2.1, 1, doc_id), + } + } + pub fn new_keep_doc_ids(sort_1: InternalValueRepr, sort_2: InternalValueRepr) -> Self { + Self(sort_1.0, sort_1.1, sort_2.0, sort_2.1, 2, 0) + } + pub fn new_skip_doc_ids(sort_1: InternalValueRepr, sort_2: InternalValueRepr) -> Self { + Self(sort_1.0, sort_1.1, sort_2.0, sort_2.1, 0, 0) + } + #[inline] + pub fn sort_1(self) -> InternalValueRepr { + InternalValueRepr(self.0, self.1) + } + #[inline] + pub fn sort_2(self) -> InternalValueRepr { + InternalValueRepr(self.2, self.3) + } + #[inline] + pub fn doc_id(self, order: SortOrder) -> DocId { + debug_assert_eq!(self.4, 1, "doc id sentinel is not meant to be decoded"); + match order { + SortOrder::Asc => reverse(self.5), + SortOrder::Desc => self.5, + } + } + pub fn is_skip_all(&self) -> bool { + *self <= Self(1, V1::min_value(), 1, V2::min_value(), 1, 0) + } +} + +impl MinValue for InternalSortValueRepr { + fn min_value() -> Self { + Self(0, V1::min_value(), 0, V2::min_value(), 1, 0) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_internal_sort_value_repr_ordering_values() { + // Primary sort (Desc v1=10) dominates over secondary (Desc v2=100) and doc_id. 
+ let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new(0, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(5, 0, SortOrder::Desc), + InternalValueRepr::::new(100, 0, SortOrder::Desc), + 999, + SortOrder::Desc, + ); + assert!(lhs > rhs, "primary sort must dominate, desc"); + + // Same values but Asc, the order is reversed + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Asc), + InternalValueRepr::::new(0, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(5, 0, SortOrder::Asc), + InternalValueRepr::::new(100, 0, SortOrder::Desc), + 999, + SortOrder::Desc, + ); + assert!(lhs < rhs, "primary sort must dominate, asc"); + + // Secondary sort (Desc v2) breaks a tie on the primary field. + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new(10, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new(5, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + assert!(lhs > rhs, "secondary sort must break primary tie, desc"); + + // Same values but Asc, the order is reversed. + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new(10, 0, SortOrder::Asc), + 0, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new(5, 0, SortOrder::Asc), + 0, + SortOrder::Desc, + ); + assert!(lhs < rhs, "secondary sort must break primary tie, asc"); + + // Doc-id Desc tiebreaker: higher doc_id wins. 
+ let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new_missing(), + 10, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new_missing(), + 5, + SortOrder::Desc, + ); + assert!(lhs > rhs, "Desc: higher doc_id must win tiebreaker"); + + // Doc-id Asc tiebreaker: lower doc_id wins. + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new_missing(), + 5, + SortOrder::Asc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new_missing(), + 10, + SortOrder::Asc, + ); + assert!(lhs > rhs, "Asc: lower doc_id must win tiebreaker"); + + // Missing values are always smaller + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new_missing(), + InternalValueRepr::::new(10, 0, SortOrder::Desc), + 10, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(5, 0, SortOrder::Desc), + InternalValueRepr::::new(0, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + assert!(lhs < rhs, "missing values are always smaller, desc"); + + // Same but Asc, missing is still smaller. + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new_missing(), + InternalValueRepr::::new(10, 0, SortOrder::Desc), + 10, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(5, 0, SortOrder::Asc), + InternalValueRepr::::new(0, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + assert!(lhs < rhs, "missing values are always smaller, asc"); + } + + #[test] + fn test_internal_sort_value_repr_ordering_sentinels() { + // Doc-id sentinel ordering: skip_doc_ids < normal_doc_id < keep_doc_ids. 
+ let s1 = InternalValueRepr::::new(10, 0, SortOrder::Desc); + let s2 = InternalValueRepr::::new_missing(); + let skip_docs = InternalSortValueRepr::new_skip_doc_ids(s1, s2); + let keep_docs = InternalSortValueRepr::new_keep_doc_ids(s1, s2); + let normal_doc_desc = InternalSortValueRepr::new(s1, s2, 0, SortOrder::Desc); + let normal_doc_asc = InternalSortValueRepr::new(s1, s2, 0, SortOrder::Asc); + assert!( + skip_docs < normal_doc_desc, + "skip_doc_ids must be below normal" + ); + assert!( + normal_doc_desc < keep_docs, + "normal must be below keep_doc_ids" + ); + assert!( + skip_docs < normal_doc_asc, + "skip_doc_ids must be below normal" + ); + assert!( + normal_doc_asc < keep_docs, + "normal must be below keep_doc_ids" + ); + } + + #[test] + fn test_internal_sort_value_repr_ordering_types() { + // Primary accessor ordering dominates all the rest + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(5, 1, SortOrder::Desc), + InternalValueRepr::::new(0, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(15, 0, SortOrder::Desc), + InternalValueRepr::::new(100, 0, SortOrder::Desc), + 999, + SortOrder::Desc, + ); + assert!(lhs > rhs, "primary type sort must dominate, desc"); + + // Same values but Asc, the order is reversed + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(5, 1, SortOrder::Asc), + InternalValueRepr::::new(0, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(15, 0, SortOrder::Asc), + InternalValueRepr::::new(100, 0, SortOrder::Desc), + 999, + SortOrder::Desc, + ); + assert!(lhs < rhs, "primary type sort must dominate, asc"); + } + + #[test] + fn test_memory_footprint() { + // Make sure that the memory representation is efficiently packed. 
For + // instance refactoring to: + // ``` + // struct InternalSortValueRepr(InternalValueRepr,InternalValueRepr,u64) + // ``` + // would cause InternalSortValueRepr to jump to 40 bytes. + + assert_eq!(std::mem::size_of::>(), 24); + assert_eq!(std::mem::size_of::>(), 16); + assert_eq!(std::mem::size_of::>(), 8); + } +} diff --git a/quickwit/quickwit-search/src/tests.rs b/quickwit/quickwit-search/src/tests.rs index dc6dfe9f9cd..1b7cd4f5797 100644 --- a/quickwit/quickwit-search/src/tests.rs +++ b/quickwit/quickwit-search/src/tests.rs @@ -14,6 +14,7 @@ use std::cmp::Ordering; use std::collections::{BTreeMap, BTreeSet}; +use std::vec; use assert_json_diff::{assert_json_eq, assert_json_include}; use quickwit_config::SearcherConfig; @@ -22,8 +23,8 @@ use quickwit_doc_mapper::tag_pruning::extract_tags_from_query; use quickwit_indexing::TestSandbox; use quickwit_opentelemetry::otlp::TraceId; use quickwit_proto::search::{ - LeafListTermsResponse, ListTermsRequest, SearchRequest, SortByValue, SortField, SortOrder, - SortValue, + CountHits, LeafListTermsResponse, ListTermsRequest, PartialHit, SearchRequest, SortByValue, + SortDatetimeFormat, SortField, SortOrder, SortValue, }; use quickwit_query::query_ast::{ QueryAst, qast_helper, qast_json_helper, query_ast_from_user_text, @@ -179,7 +180,8 @@ async fn test_single_search_with_snippet() -> anyhow::Result<()> { Ok(()) } -async fn slop_search_and_check( +/// Search with "body" as default field and assert expected number of matches. 
+async fn search_and_check( test_sandbox: &TestSandbox, index_id: &str, query: &str, @@ -234,33 +236,98 @@ async fn test_slop_queries() { ]; test_sandbox.add_documents(docs.clone()).await.unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"small bird\"~2", 0) + search_and_check(&test_sandbox, index_id, "\"small bird\"~2", 0) .await .unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"red bike\"~2", 1) + search_and_check(&test_sandbox, index_id, "\"red bike\"~2", 1) .await .unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"small blue bike\"~3", 1) + search_and_check(&test_sandbox, index_id, "\"small blue bike\"~3", 1) .await .unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"small bike\"", 1) + search_and_check(&test_sandbox, index_id, "\"small bike\"", 1) .await .unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"small bike\"~1", 2) + search_and_check(&test_sandbox, index_id, "\"small bike\"~1", 2) .await .unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"small bike\"~2", 2) + search_and_check(&test_sandbox, index_id, "\"small bike\"~2", 2) .await .unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"small bike\"~3", 3) + search_and_check(&test_sandbox, index_id, "\"small bike\"~3", 3) .await .unwrap(); - slop_search_and_check(&test_sandbox, index_id, "\"tiny shelter\"~3", 1) + search_and_check(&test_sandbox, index_id, "\"tiny shelter\"~3", 1) .await .unwrap(); test_sandbox.assert_quit().await; } +#[tokio::test] +async fn test_multi_term_queries() { + let index_id = "multi-term-query"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + - name: body + type: text + record: position + "#; + + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", &["body"]) + .await + .unwrap(); + let docs = vec![ + json!({"title": "one", "body": "a red bike"}), + json!({"title": "two", "body": "a small blue bike"}), + json!({"title": "three", "body": "a small, 
rusty, and yellow bike"}), + json!({"title": "four", "body": "fred's small bike"}), + json!({"title": "five", "body": "a tiny shelter"}), + ]; + test_sandbox.add_documents(docs.clone()).await.unwrap(); + + search_and_check( + &test_sandbox, + index_id, + "IN [red blue green yellow pink black]", + 3, + ) + .await + .unwrap(); + + search_and_check(&test_sandbox, index_id, "IN [aaaa]", 0) + .await + .unwrap(); + + search_and_check(&test_sandbox, index_id, "IN [red]", 1) + .await + .unwrap(); + + search_and_check(&test_sandbox, index_id, "IN [zzzz]", 0) + .await + .unwrap(); + + search_and_check( + &test_sandbox, + index_id, + "red OR blue OR green OR yellow OR pink OR black", + 3, + ) + .await + .unwrap(); + + search_and_check(&test_sandbox, index_id, "red AND \"small bike\"", 0) + .await + .unwrap(); + + search_and_check(&test_sandbox, index_id, "bike AND \"small bike\"", 1) + .await + .unwrap(); + + test_sandbox.assert_quit().await; +} + #[tokio::test] async fn test_single_node_several_splits() -> anyhow::Result<()> { let index_id = "single-node-several-splits"; @@ -371,7 +438,8 @@ async fn test_single_node_filtering() -> anyhow::Result<()> { test_sandbox.metastore(), test_sandbox.storage_resolver(), ) - .await?; + .await + .unwrap(); assert_eq!(single_node_response.num_hits, 10); assert_eq!(single_node_response.hits.len(), 10); assert!(&single_node_response.hits[0].json.contains("t:19")); @@ -395,7 +463,8 @@ async fn test_single_node_filtering() -> anyhow::Result<()> { test_sandbox.metastore(), test_sandbox.storage_resolver(), ) - .await?; + .await + .unwrap(); assert_eq!(single_node_response.num_hits, 19); assert_eq!(single_node_response.hits.len(), 19); assert!(&single_node_response.hits[0].json.contains("t:19")); @@ -890,7 +959,7 @@ async fn test_sort_by_2_field() { } #[tokio::test] -async fn test_single_node_invalid_sorting_with_query() { +async fn test_sort_by_text() { let index_id = "single-node-invalid-sorting"; let doc_mapping_yaml = r#" field_mappings: @@ 
-906,7 +975,7 @@ async fn test_single_node_invalid_sorting_with_query() { let mut docs = Vec::new(); for i in 0..30 { - let description = format!("city info-{}", i + 1); + let description = format!("city info-{:02}", i + 1); docs.push(json!({"description": description, "ts": i+1, "temperature": i+32})); } test_sandbox.add_documents(docs).await.unwrap(); @@ -927,13 +996,19 @@ async fn test_single_node_invalid_sorting_with_query() { test_sandbox.metastore(), test_sandbox.storage_resolver(), ) - .await; - assert!(single_node_response.is_err()); - let error_msg = single_node_response.unwrap_err().to_string(); - assert_eq!( - error_msg, - "Invalid argument: sort by field on type text is currently not supported `description`" - ); + .await + .unwrap(); + + assert_eq!(single_node_response.num_hits, 30); + assert_eq!(single_node_response.hits.len(), 15); + assert!(single_node_response.hits.windows(2).all(|hits| { + let hit0: JsonValue = serde_json::from_str(&hits[0].json).unwrap(); + let hit1: JsonValue = serde_json::from_str(&hits[1].json).unwrap(); + hit0["description"].as_str().unwrap() >= hit1["description"].as_str().unwrap() + })); + assert!(single_node_response.hits[0].json.contains("city info-30")); + assert!(single_node_response.hits[14].json.contains("city info-16")); + test_sandbox.assert_quit().await; } @@ -961,16 +1036,26 @@ async fn test_single_node_split_pruning_by_tags() -> anyhow::Result<()> { } let query_ast: QueryAst = qast_helper("owner:francois", &[]); + let tag_fields: BTreeSet = [String::from("owner")].into_iter().collect(); let selected_splits = list_relevant_splits( vec![index_uid.clone()], None, None, - extract_tags_from_query(query_ast), + extract_tags_from_query(query_ast.clone(), Some(&tag_fields)), &mut test_sandbox.metastore(), ) .await?; assert!(selected_splits.is_empty()); + let selected_splits = list_relevant_splits( + vec![index_uid.clone()], + None, + None, + extract_tags_from_query(query_ast, Some(&BTreeSet::new())), + &mut 
test_sandbox.metastore(), + ) + .await?; + assert_eq!(selected_splits.len(), 2); let query_ast: QueryAst = qast_helper("", &[]); @@ -978,7 +1063,17 @@ async fn test_single_node_split_pruning_by_tags() -> anyhow::Result<()> { vec![index_uid.clone()], None, None, - extract_tags_from_query(query_ast), + extract_tags_from_query(query_ast.clone(), Some(&tag_fields)), + &mut test_sandbox.metastore(), + ) + .await?; + assert_eq!(selected_splits.len(), 2); + + let selected_splits = list_relevant_splits( + vec![index_uid.clone()], + None, + None, + extract_tags_from_query(query_ast, Some(&BTreeSet::new())), &mut test_sandbox.metastore(), ) .await?; @@ -990,7 +1085,7 @@ async fn test_single_node_split_pruning_by_tags() -> anyhow::Result<()> { vec![index_uid.clone()], None, None, - extract_tags_from_query(query_ast), + extract_tags_from_query(query_ast, Some(&tag_fields)), &mut test_sandbox.metastore(), ) .await?; @@ -1887,3 +1982,761 @@ fn test_global_doc_address_ser_deser() { let doc_address_deser: GlobalDocAddress = doc_address_string.parse().unwrap(); assert_eq!(doc_address_deser, doc_address); } + +#[tokio::test] +async fn test_single_node_soft_delete_excludes_from_search() -> anyhow::Result<()> { + use quickwit_metastore::IndexMetadataResponseExt; + use quickwit_proto::metastore::{ + IndexMetadataRequest, MetastoreService, SoftDeleteDocumentsRequest, SplitDocIds, + }; + + let index_id = "test-soft-delete-search"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", &["title"]).await?; + let docs = vec![ + json!({"title": "alpha"}), + json!({"title": "beta"}), + json!({"title": "gamma"}), + ]; + test_sandbox.add_documents(docs).await?; + + // Search all — should find 3 + let search_request = SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["title"]), + max_hits: 10, + ..Default::default() + }; + let result = 
single_node_search( + search_request.clone(), + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(result.num_hits, 3); + + // Search for "alpha" specifically to find its doc_id and split_id + let alpha_request = SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("alpha", &["title"]), + max_hits: 10, + ..Default::default() + }; + let alpha_result = single_node_search( + alpha_request, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(alpha_result.num_hits, 1); + let alpha_hit = &alpha_result.hits[0]; + let partial_hit = alpha_hit.partial_hit.as_ref().unwrap(); + let split_id = partial_hit.split_id.clone(); + let doc_id = partial_hit.doc_id; + + // Soft-delete that document via the metastore + let index_uid = test_sandbox + .metastore() + .index_metadata(IndexMetadataRequest::for_index_id(index_id.to_string())) + .await? + .deserialize_index_metadata()? + .index_uid; + + let metastore = test_sandbox.metastore(); + metastore + .soft_delete_documents(SoftDeleteDocumentsRequest { + index_uid: Some(index_uid), + split_doc_ids: vec![SplitDocIds { + split_id: split_id.clone(), + doc_ids: vec![doc_id], + }], + }) + .await?; + + // Search all again — should find only 2 + let result = single_node_search( + search_request, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(result.num_hits, 2); + + // Verify that the soft-deleted document ("alpha") is not in the results + for hit in &result.hits { + let hit_json: JsonValue = serde_json::from_str(&hit.json)?; + assert_ne!(hit_json["title"], "alpha"); + } + + test_sandbox.assert_quit().await; + Ok(()) +} + +#[tokio::test] +async fn test_single_node_soft_delete_count_only() -> anyhow::Result<()> { + use quickwit_metastore::IndexMetadataResponseExt; + use quickwit_proto::metastore::{ + IndexMetadataRequest, MetastoreService, SoftDeleteDocumentsRequest, SplitDocIds, + }; 
+ + let index_id = "test-soft-delete-count-only"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", &["title"]).await?; + let docs = vec![ + json!({"title": "alpha"}), + json!({"title": "beta"}), + json!({"title": "gamma"}), + ]; + test_sandbox.add_documents(docs).await?; + + // Count-only search (max_hits: 0) — should find 3 + let count_request = SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["title"]), + max_hits: 0, + ..Default::default() + }; + let result = single_node_search( + count_request.clone(), + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(result.num_hits, 3); + assert!(result.hits.is_empty()); + + // Find the doc_id for "alpha" so we can soft-delete it + let alpha_request = SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("alpha", &["title"]), + max_hits: 10, + ..Default::default() + }; + let alpha_result = single_node_search( + alpha_request, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(alpha_result.num_hits, 1); + let partial_hit = alpha_result.hits[0].partial_hit.as_ref().unwrap(); + let split_id = partial_hit.split_id.clone(); + let doc_id = partial_hit.doc_id; + + // Soft-delete that document via the metastore + let index_uid = test_sandbox + .metastore() + .index_metadata(IndexMetadataRequest::for_index_id(index_id.to_string())) + .await? + .deserialize_index_metadata()? 
+ .index_uid; + + let metastore = test_sandbox.metastore(); + metastore + .soft_delete_documents(SoftDeleteDocumentsRequest { + index_uid: Some(index_uid), + split_doc_ids: vec![SplitDocIds { + split_id, + doc_ids: vec![doc_id], + }], + }) + .await?; + + // Count-only search again — should find only 2 + let result = single_node_search( + count_request, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(result.num_hits, 2); + assert!(result.hits.is_empty()); + + test_sandbox.assert_quit().await; + Ok(()) +} + +/// Regression test: the `is_count_only` path (non-MatchAll query with max_hits=0) was calling +/// `query.count(&searcher)` which bypasses Quickwit's soft-delete filter entirely. +/// MatchAll + max_hits=0 goes through `is_metadata_count_request_with_ast` (already correct); +/// this test specifically exercises the `is_count_only` branch with a real term query. +#[tokio::test] +async fn test_single_node_soft_delete_count_only_term_query() -> anyhow::Result<()> { + use quickwit_metastore::IndexMetadataResponseExt; + use quickwit_proto::metastore::{ + IndexMetadataRequest, MetastoreService, SoftDeleteDocumentsRequest, SplitDocIds, + }; + + let index_id = "test-soft-delete-count-only-term-query"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", &["title"]).await?; + let docs = vec![ + json!({"title": "alpha"}), + json!({"title": "beta"}), + json!({"title": "gamma"}), + ]; + test_sandbox.add_documents(docs).await?; + + // Use a non-MatchAll query so that the `is_count_only` branch is taken instead of + // `is_metadata_count_request_with_ast`. "alpha OR beta OR gamma" matches all 3 docs + // but is not `QueryAst::MatchAll`. 
+ let count_request = SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("alpha OR beta OR gamma", &["title"]), + max_hits: 0, + ..Default::default() + }; + let result = single_node_search( + count_request.clone(), + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(result.num_hits, 3); + assert!(result.hits.is_empty()); + + // Locate the doc_id for "alpha" so we can soft-delete it. + let alpha_result = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("alpha", &["title"]), + max_hits: 10, + ..Default::default() + }, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(alpha_result.num_hits, 1); + let partial_hit = alpha_result.hits[0].partial_hit.as_ref().unwrap(); + let split_id = partial_hit.split_id.clone(); + let doc_id = partial_hit.doc_id; + + // Soft-delete the "alpha" document. + let index_uid = test_sandbox + .metastore() + .index_metadata(IndexMetadataRequest::for_index_id(index_id.to_string())) + .await? + .deserialize_index_metadata()? + .index_uid; + test_sandbox + .metastore() + .soft_delete_documents(SoftDeleteDocumentsRequest { + index_uid: Some(index_uid), + split_doc_ids: vec![SplitDocIds { + split_id, + doc_ids: vec![doc_id], + }], + }) + .await?; + + // Count-only term query: before the fix this returned 3 (soft-deleted doc was counted); + // after the fix it must return 2. + let result = single_node_search( + count_request, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(result.num_hits, 2); + assert!(result.hits.is_empty()); + + test_sandbox.assert_quit().await; + Ok(()) +} + +/// Tests that when sorting by a datetime field with `sort_datetime_format` set to millis: +/// 1. The sort values returned in `partial_hit` are in milliseconds (not nanoseconds). +/// 2. 
Those values can be fed back as `search_after` to retrieve the next page correctly. +#[tokio::test] +async fn test_sort_by_datetime_format_millis_and_search_after() -> anyhow::Result<()> { + let index_id = "sort-datetime-millis-search-after"; + let doc_mapping_yaml = r#" + field_mappings: + - name: ts + type: datetime + fast: true + - name: body + type: text + timestamp_field: ts + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", &["body"]).await?; + + // Index 10 documents with timestamps 100_000_000_000 .. 100_000_009_000 ms since epoch. + let base_secs: i64 = 100_000_000; + let docs: Vec<_> = (0..10) + .map(|i| json!({"ts": base_secs + i, "body": format!("doc {i}")})) + .collect(); + test_sandbox.add_documents(docs).await?; + + let sort_field = SortField { + field_name: "ts".to_string(), + sort_order: SortOrder::Desc as i32, + sort_datetime_format: Some(SortDatetimeFormat::UnixTimestampMillis as i32), + }; + + // Page 1: top 5 hits sorted by ts desc with millis output + let page1 = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["body"]), + max_hits: 5, + sort_fields: vec![sort_field.clone()], + ..Default::default() + }, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + + assert_eq!(page1.num_hits, 10); + assert_eq!(page1.hits.len(), 5); + + // Verify sort values are in milliseconds (not nanoseconds) + let expected_millis: Vec = (5..10).rev().map(|i| (base_secs + i) * 1_000).collect(); + let actual_millis: Vec = page1 + .hits + .iter() + .map(|hit| { + let partial_hit = hit.partial_hit.as_ref().unwrap(); + match &partial_hit.sort_value.as_ref().unwrap().sort_value { + Some(SortValue::I64(ms)) => *ms, + other => panic!("expected I64 sort value in millis, got {other:?}"), + } + }) + .collect(); + assert_eq!(actual_millis, expected_millis); + + // Page 2: use the last hit's sort value as search_after + let last_hit = 
page1.hits.last().unwrap().partial_hit.as_ref().unwrap(); + let search_after = PartialHit { + sort_value: last_hit.sort_value.clone(), + sort_value2: None, + split_id: String::new(), + segment_ord: 0, + doc_id: 0, + }; + + let page2 = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["body"]), + max_hits: 5, + sort_fields: vec![sort_field], + search_after: Some(search_after), + ..Default::default() + }, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + + assert_eq!(page2.hits.len(), 5); + // Page 2 should contain docs with timestamps base_secs+4 down to base_secs+0 in millis + let expected_millis_page2: Vec = (0..5).rev().map(|i| (base_secs + i) * 1_000).collect(); + let actual_millis_page2: Vec = page2 + .hits + .iter() + .map(|hit| { + let partial_hit = hit.partial_hit.as_ref().unwrap(); + match &partial_hit.sort_value.as_ref().unwrap().sort_value { + Some(SortValue::I64(ms)) => *ms, + other => panic!("expected I64 sort value in millis, got {other:?}"), + } + }) + .collect(); + assert_eq!(actual_millis_page2, expected_millis_page2); + + test_sandbox.assert_quit().await; + Ok(()) +} + +#[tokio::test] +async fn test_sort_by_dynamic_with_datetime_page_fails() -> anyhow::Result<()> { + let index_id = "sort-dynamic-datetime-page-fails"; + let doc_mapping_yaml = r#" + field_mappings: + - name: ts + type: datetime + fast: true + mode: dynamic + dynamic_mapping: + fast: true + timestamp_field: ts + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", &["body"]).await?; + + let docs = [ + json!({"ts": 100_000_001, "my_dynamic_field": 2024}), + json!({"ts": 100_000_002, "my_dynamic_field": "2024-03-30T00:00:00Z"}), + json!({"ts": 100_000_001, "my_dynamic_field": 2025}), + json!({"ts": 100_000_002, "my_dynamic_field": "2025-03-30T00:00:00Z"}), + json!({"ts": 100_000_001, "my_dynamic_field": 2026}), + json!({"ts": 100_000_002, "my_dynamic_field": 
"2026-03-30T00:00:00Z"}), + ]; + test_sandbox.add_documents(docs).await?; + + let sort_field = SortField { + field_name: "my_dynamic_field".to_string(), + sort_order: SortOrder::Desc as i32, + ..Default::default() + }; + + // Page 1: sort should work even on a dynamic field with datetime column + // values, for the first page + let page1 = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["body"]), + max_hits: 5, + sort_fields: vec![sort_field.clone()], + ..Default::default() + }, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + + assert_eq!(page1.num_hits, 6); + assert_eq!(page1.hits.len(), 5); + + // Verify sort values: datetimes (in nanoseconds) first, then integers + let page_1_sort_values: Vec<_> = page1 + .hits + .iter() + .map(|hit| { + &hit.partial_hit + .as_ref() + .unwrap() + .sort_value + .as_ref() + .unwrap() + .sort_value + }) + .collect(); + assert_eq!( + page_1_sort_values, + vec![ + &Some(SortValue::Datetime(1774828800000000000)), + &Some(SortValue::Datetime(1743292800000000000)), + &Some(SortValue::Datetime(1711756800000000000)), + &Some(SortValue::I64(2026)), + &Some(SortValue::I64(2025)), + ] + ); + + // Page 2: search after not yet supported + let last_hit = page1.hits.last().unwrap().partial_hit.as_ref().unwrap(); + let search_after = PartialHit { + sort_value: last_hit.sort_value.clone(), + sort_value2: None, + split_id: String::new(), + segment_ord: 0, + doc_id: 0, + }; + + let page2 = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["body"]), + max_hits: 5, + sort_fields: vec![sort_field], + search_after: Some(search_after), + ..Default::default() + }, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await + .unwrap(); + + assert_eq!(page2.failed_splits.len(), 1); + assert_eq!(page2.hits.len(), 0); + + test_sandbox.assert_quit().await; + Ok(()) +} +
+#[tokio::test] +async fn test_sort_by_two_fields_with_null() -> anyhow::Result<()> { + let index_id = "sort-datetime-millis-search-after"; + let doc_mapping_yaml = r#" + field_mappings: + - name: ts + type: datetime + fast: true + - name: body + type: text + fast: true + timestamp_field: ts + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", &["body"]).await?; + + // timestamps with 10 digits should be interpreted as secs + let docs: Vec<_> = vec![ + json!({"ts": 1_000_000_001i64, "body": format!("doc 9")}), + json!({"ts": 1_000_000_002i64, "body": format!("doc 8")}), + json!({"ts": 1_000_000_003i64, "body": format!("doc 7")}), + json!({"ts": 1_000_000_004i64}), + json!({"ts": 1_000_000_005i64}), + json!({"ts": 1_000_000_006i64}), + ]; + test_sandbox.add_documents(docs).await?; + + let sort_fields = vec![ + SortField { + field_name: "body".to_string(), + sort_order: SortOrder::Asc as i32, + ..Default::default() + }, + SortField { + field_name: "ts".to_string(), + sort_order: SortOrder::Asc as i32, + sort_datetime_format: Some(SortDatetimeFormat::UnixTimestampMillis as i32), + }, + ]; + + let page1 = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["body"]), + max_hits: 5, + sort_fields: sort_fields.clone(), + ..Default::default() + }, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + + assert_eq!(page1.num_hits, 6); + assert_eq!(page1.hits.len(), 5); + let page_1_hits = page1 + .hits + .iter() + .map(|hit| hit.partial_hit.clone().unwrap()) + .collect::>(); + let split_id = page_1_hits[0].split_id.clone(); + // for the timestamp field we convert to sort_datetime_format repr as I64 + assert_eq!( + page_1_hits, + vec![ + PartialHit { + sort_value: Some(SortValue::Str("doc 7".to_string()).into()), + sort_value2: Some(SortValue::I64(1_000_000_003_000).into()), + split_id: split_id.clone(), + segment_ord: 0, + doc_id: 2, + }, + PartialHit 
{ + sort_value: Some(SortValue::Str("doc 8".to_string()).into()), + sort_value2: Some(SortValue::I64(1_000_000_002_000).into()), + split_id: split_id.clone(), + segment_ord: 0, + doc_id: 1, + }, + PartialHit { + sort_value: Some(SortValue::Str("doc 9".to_string()).into()), + sort_value2: Some(SortValue::I64(1_000_000_001_000).into()), + split_id: split_id.clone(), + segment_ord: 0, + doc_id: 0, + }, + PartialHit { + sort_value: Some(SortByValue { sort_value: None }), + sort_value2: Some(SortValue::I64(1_000_000_004_000).into()), + split_id: split_id.clone(), + segment_ord: 0, + doc_id: 3, + }, + PartialHit { + sort_value: Some(SortByValue { sort_value: None }), + sort_value2: Some(SortValue::I64(1_000_000_005_000).into()), + split_id: split_id.clone(), + segment_ord: 0, + doc_id: 4, + }, + ] + ); + + let page2 = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["body"]), + max_hits: 5, + sort_fields: sort_fields.clone(), + search_after: Some(page_1_hits[4].clone()), + ..Default::default() + }, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await + .unwrap(); + + assert_eq!(page2.num_hits, 6); + assert_eq!(page2.hits.len(), 1); + let page_2_hits = page2 + .hits + .iter() + .map(|hit| hit.partial_hit.clone().unwrap()) + .collect::>(); + let split_id = page_2_hits[0].split_id.clone(); + // for the timestamp field we convert to sort_datetime_format repr as I64 + assert_eq!( + page_2_hits, + vec![PartialHit { + sort_value: Some(SortByValue { sort_value: None }), + sort_value2: Some(SortValue::I64(1_000_000_006_000).into()), + split_id: split_id.clone(), + segment_ord: 0, + doc_id: 5, + },] + ); + + test_sandbox.assert_quit().await; + Ok(()) +} + +#[tokio::test] +async fn test_single_node_splits_by_outcome() -> anyhow::Result<()> { + let index_id = "test-splits-by-outcome"; + let doc_mapping_yaml = r#" + field_mappings: + - name: body + type: text + - name: ts + type: 
datetime + input_formats: + - "rfc3339" + - "unix_timestamp" + fast: true + timestamp_field: ts + mode: lenient + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", &["body"]).await?; + + // Three splits with non-overlapping timestamp ranges + let base_ts = OffsetDateTime::now_utc().unix_timestamp(); + test_sandbox + .add_documents(vec![ + json!({"body": "old doc 1", "ts": base_ts - 2_000_000}), + json!({"body": "old doc 2", "ts": base_ts - 1_999_999}), + ]) + .await?; + test_sandbox + .add_documents(vec![ + json!({"body": "mid doc 1", "ts": base_ts - 1_000_000}), + json!({"body": "mid doc 2", "ts": base_ts - 999_999}), + ]) + .await?; + test_sandbox + .add_documents(vec![ + json!({"body": "new doc 1", "ts": base_ts}), + json!({"body": "new doc 2", "ts": base_ts + 1}), + ]) + .await?; + + // All 3 splits should be processed for an unrestricted search. + let response = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("doc", &["body"]), + max_hits: 10, + ..Default::default() + }, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(response.num_hits, 6); + let outcomes = response.splits_by_outcome.unwrap(); + assert_eq!(outcomes.processed, 3, "all 3 splits should be processed"); + assert_eq!(outcomes.pruned_before_warmup, 0); + assert_eq!(outcomes.cancel_before_warmup, 0); + assert_eq!(outcomes.cancel_warmup, 0); + assert_eq!(outcomes.cancel_cpu_queue, 0); + assert_eq!(outcomes.cancel_cpu, 0); + + // With MatchAll, we expect an early optimization that prevents the + // processing of older splits. 
+ let response = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: serde_json::to_string(&QueryAst::MatchAll).unwrap(), + max_hits: 1, + count_hits: CountHits::Underestimate as i32, + sort_fields: vec![SortField { + field_name: "ts".to_string(), + sort_order: SortOrder::Desc as i32, + sort_datetime_format: None, + }], + ..Default::default() + }, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(response.num_hits, 2); + let outcomes = response.splits_by_outcome.unwrap(); + assert_eq!(outcomes.processed, 1); + assert_eq!(outcomes.pruned_before_warmup, 2); + + // MatchAll + max_hits=0 + CountAll triggers the metadata-count fast path: the split's + // stored num_docs is used directly without opening the tantivy index. + let response = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: serde_json::to_string(&QueryAst::MatchAll).unwrap(), + max_hits: 0, + count_hits: CountHits::CountAll as i32, + ..Default::default() + }, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(response.num_hits, 6); + let outcomes = response.splits_by_outcome.unwrap(); + assert_eq!(outcomes.processed_from_metadata, 3); + assert_eq!(outcomes.processed, 0); + assert_eq!(outcomes.pruned_before_warmup, 0); + + // MatchAll with a time range that fully covers 1 split but only partially + // overlaps the 2 others + let response = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: serde_json::to_string(&QueryAst::MatchAll).unwrap(), + max_hits: 0, + count_hits: CountHits::CountAll as i32, + start_timestamp: Some(base_ts - 1_999_999), + end_timestamp: Some(base_ts + 1), + ..Default::default() + }, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + // split 1: 1 doc (base_ts-1_999_999), split 2: 2 docs, split 3: 1 doc (base_ts) + 
assert_eq!(response.num_hits, 4); + let outcomes = response.splits_by_outcome.unwrap(); + assert_eq!(outcomes.processed_from_metadata, 1); + assert_eq!(outcomes.processed, 2); + assert_eq!(outcomes.pruned_before_warmup, 0); + + test_sandbox.assert_quit().await; + Ok(()) +} diff --git a/quickwit/quickwit-search/src/top_k_collector.rs b/quickwit/quickwit-search/src/top_k_collector.rs index f36eb6370e2..3dc9f2bd6f2 100644 --- a/quickwit/quickwit-search/src/top_k_collector.rs +++ b/quickwit/quickwit-search/src/top_k_collector.rs @@ -12,862 +12,179 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::cmp::{Ordering, Reverse}; -use std::fmt::Debug; -use std::marker::PhantomData; +use std::cmp::Ordering; -use quickwit_common::binary_heap::TopK; -use quickwit_proto::search::{PartialHit, SortOrder}; +use quickwit_proto::search::PartialHit; use quickwit_proto::types::SplitId; -use tantivy::{DocId, Score}; +use tantivy::{DocId, Score, SegmentOrdinal}; -use crate::collector::{ - HitSortingMapper, SegmentPartialHit, SegmentPartialHitSortingKey, - SortingFieldExtractorComponent, SortingFieldExtractorPair, -}; +use crate::collector::SortingFieldExtractorPair; +use crate::sort_repr::{ElidableU64, InternalSortValueRepr}; +use crate::top_k_computer::TopKComputer; -pub trait QuickwitSegmentTopKCollector { - fn collect_top_k_block(&mut self, docs: &[DocId]); - fn collect_top_k(&mut self, doc_id: DocId, score: Score); - fn get_top_k(&self) -> Vec; -} - -trait IntoOptionU64 { - #[inline] - fn is_unit_type() -> bool { - false - } - fn into_option_u64(self) -> Option; - fn from_option_u64(value: Option) -> Self; -} -trait MinValue { - fn min_value() -> Self; -} - -impl IntoOptionU64 for Option { - #[inline] - fn into_option_u64(self) -> Option { - self - } - #[inline] - fn from_option_u64(value: Option) -> Self { - value - } -} - -impl MinValue for Option { - #[inline] - fn min_value() -> Self { - None - } -} - -impl 
IntoOptionU64 for Option> { - #[inline] - fn into_option_u64(self) -> Option { - self.map(|el| el.0) - } - #[inline] - fn from_option_u64(value: Option) -> Self { - value.map(Reverse) - } -} -impl MinValue for Option> { - #[inline] - fn min_value() -> Self { - None - } -} - -impl IntoOptionU64 for () { - #[inline] - fn is_unit_type() -> bool { - true - } - #[inline] - fn into_option_u64(self) -> Option { - None - } - #[inline] - fn from_option_u64(_: Option) -> Self {} -} -impl MinValue for () { - #[inline] - fn min_value() -> Self {} -} - -/// Generic hit struct for top k collector. -/// V1 and V2 are the types of the two values to sort by. -/// They are either Option or _statically_ disabled via unit type. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -struct Hit { - doc_id: DocId, - value1: V1, - value2: V2, -} - -impl MinValue for Hit -where - V1: MinValue, - V2: MinValue, -{ - #[inline] - fn min_value() -> Self { - let doc_id = if REVERSE_DOCID { - DocId::MAX - } else { - DocId::MIN - }; - Hit { - doc_id, - value1: V1::min_value(), - value2: V2::min_value(), - } - } -} - -impl std::fmt::Display for Hit -where - V1: Copy + PartialEq + Eq + PartialOrd + Ord + Debug, - V2: Copy + PartialEq + Eq + PartialOrd + Ord + Debug, -{ - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "Hit(doc_id: {}, value1: {:?}, value2: {:?})", - self.doc_id, self.value1, self.value2 - ) - } -} - -impl Ord for Hit -where - V1: Copy + PartialEq + Eq + PartialOrd + Ord + Debug + MinValue, - V2: Copy + PartialEq + Eq + PartialOrd + Ord + Debug + MinValue, -{ - #[inline] - fn cmp(&self, other: &Self) -> Ordering { - let order = self.value1.cmp(&other.value1); - order - .then_with(|| self.value2.cmp(&other.value2)) - .then_with(|| { - if REVERSE_DOCID { - other.doc_id.cmp(&self.doc_id) - } else { - self.doc_id.cmp(&other.doc_id) - } - }) - } -} - -impl PartialOrd for Hit -where - V1: Copy + PartialEq + Eq + PartialOrd + Ord + Debug + MinValue, - V2: Copy 
+ PartialEq + Eq + PartialOrd + Ord + Debug + MinValue, -{ - #[inline] - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl< - V1: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue, - V2: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue, - const REVERSE_DOCID: bool, -> Hit -{ - #[inline] - fn into_segment_partial_hit(self) -> SegmentPartialHit { - SegmentPartialHit { - sort_value: self.value1.into_option_u64(), - sort_value2: self.value2.into_option_u64(), - doc_id: self.doc_id, - } - } -} - -pub fn specialized_top_k_segment_collector( +pub struct QuickwitSegmentTopKCollectorTemplate { split_id: SplitId, - score_extractor: SortingFieldExtractorPair, - leaf_max_hits: usize, - segment_ord: u32, - search_after_option: Option, - order1: SortOrder, - order2: SortOrder, -) -> Box { - // TODO: Add support for search_after to the specialized collector. - // Eventually we may want to remove the generic collector to reduce complexity. 
- if search_after_option.is_some() || score_extractor.is_score() { - return Box::new(GenericQuickwitSegmentTopKCollector::new( - split_id, - score_extractor, - leaf_max_hits, - segment_ord, - search_after_option, - order1, - order2, - )); - } - - let sort_first_by_ff = score_extractor.first.is_fast_field(); - let sort_second_by_ff = score_extractor - .second - .as_ref() - .map(|extr| extr.is_fast_field()) - .unwrap_or(false); - - #[derive(Debug)] - enum SortType { - DocId, - OneFFSort, - TwoFFSorts, - } - let sort_type = match (sort_first_by_ff, sort_second_by_ff) { - (false, false) => SortType::DocId, - (true, false) => SortType::OneFFSort, - (true, true) => SortType::TwoFFSorts, - (false, true) => panic!("Internal error: Got second sort, but no first sort"), - }; - // only check order1 for OneFFSort and DocId, as it's the only sort - // - // REVERSE_DOCID is only used for SortType::DocId and SortType::OneFFSort - match (sort_type, order1, order2) { - (SortType::DocId, SortOrder::Desc, _) => { - Box::new(SpecializedSegmentTopKCollector::<(), (), false>::new( - split_id, - score_extractor, - leaf_max_hits, - segment_ord, - )) - } - (SortType::DocId, SortOrder::Asc, _) => { - Box::new(SpecializedSegmentTopKCollector::<(), (), true>::new( - split_id, - score_extractor, - leaf_max_hits, - segment_ord, - )) - } - (SortType::OneFFSort, SortOrder::Asc, SortOrder::Asc) => { - Box::new(SpecializedSegmentTopKCollector::< - Option>, - (), - true, - >::new( - split_id, score_extractor, leaf_max_hits, segment_ord - )) - } - (SortType::OneFFSort, SortOrder::Desc, SortOrder::Asc) => Box::new( - SpecializedSegmentTopKCollector::, (), false>::new( - split_id, - score_extractor, - leaf_max_hits, - segment_ord, - ), - ), - (SortType::OneFFSort, SortOrder::Asc, SortOrder::Desc) => { - Box::new(SpecializedSegmentTopKCollector::< - Option>, - (), - true, - >::new( - split_id, score_extractor, leaf_max_hits, segment_ord - )) - } - (SortType::OneFFSort, SortOrder::Desc, SortOrder::Desc) 
=> Box::new( - SpecializedSegmentTopKCollector::, (), false>::new( - split_id, - score_extractor, - leaf_max_hits, - segment_ord, - ), - ), - (SortType::TwoFFSorts, SortOrder::Asc, SortOrder::Asc) => { - Box::new(SpecializedSegmentTopKCollector::< - Option>, - Option>, - true, - >::new( - split_id, score_extractor, leaf_max_hits, segment_ord - )) - } - (SortType::TwoFFSorts, SortOrder::Asc, SortOrder::Desc) => { - Box::new(SpecializedSegmentTopKCollector::< - Option>, - Option, - true, - >::new( - split_id, score_extractor, leaf_max_hits, segment_ord - )) - } - (SortType::TwoFFSorts, SortOrder::Desc, SortOrder::Asc) => { - Box::new(SpecializedSegmentTopKCollector::< - Option, - Option>, - false, - >::new( - split_id, score_extractor, leaf_max_hits, segment_ord - )) - } - (SortType::TwoFFSorts, SortOrder::Desc, SortOrder::Desc) => { - Box::new(SpecializedSegmentTopKCollector::< - Option, - Option, - false, - >::new( - split_id, score_extractor, leaf_max_hits, segment_ord - )) - } - } -} - -/// Fast Top K Computation -/// -/// The buffer is truncated to the top_n elements when it reaches the capacity of the Vec. -/// That means capacity has special meaning and should be carried over when cloning or serializing. -/// -/// For TopK == 0, it will be relative expensive. -struct TopKComputer { - /// Reverses sort order to get top-semantics instead of bottom-semantics - buffer: Vec>, - top_n: usize, - pub(crate) threshold: D, -} - -// Custom clone to keep capacity -impl Clone for TopKComputer { - fn clone(&self) -> Self { - let mut buffer_clone = Vec::with_capacity(self.buffer.capacity()); - buffer_clone.extend(self.buffer.iter().cloned()); - - TopKComputer { - buffer: buffer_clone, - top_n: self.top_n, - threshold: self.threshold.clone(), - } - } -} - -impl TopKComputer -where D: Ord + Copy + Debug + MinValue -{ - /// Create a new `TopKComputer`. 
- pub fn new(top_n: usize) -> Self { - // Vec cap can't be 0, since it would panic in push - let vec_cap = top_n.max(1) * 10; - TopKComputer { - buffer: Vec::with_capacity(vec_cap), - top_n, - threshold: D::min_value(), - } + // We track the segment ordinal here, but splits only have 1 segment so this + // should always be 0. + segment_ord: SegmentOrdinal, + hit_fetcher: SortingFieldExtractorPair, + top_k_hits: TopKComputer>, + search_after_opt: Option>, +} + +impl QuickwitSegmentTopKCollectorTemplate { + pub(crate) fn collect_top_k_block(&mut self, docs: &[DocId]) { + let search_after_opt = self.search_after_opt; + let top_k_hits = &mut self.top_k_hits; + self.hit_fetcher + .project_to_internal_sort_value_block(docs, |repr| { + if let Some(search_after) = search_after_opt + && repr.cmp(&search_after) != Ordering::Less + { + return; + } + top_k_hits.push(repr); + }); } - /// Push a new document to the top n. - /// If the document is below the current threshold, it will be ignored. - #[inline] - pub fn push(&mut self, doc: D) { - if doc < self.threshold { + pub(crate) fn collect_top_k(&mut self, doc_id: DocId, score: Score) { + let internal_repr = self + .hit_fetcher + .project_to_internal_sort_value(doc_id, score); + if let Some(search_after) = self.search_after_opt + && internal_repr.cmp(&search_after) != Ordering::Less + { return; } - if self.buffer.len() == self.buffer.capacity() { - let median = self.truncate_top_n(); - self.threshold = median; - } - - // This is faster since it avoids the buffer resizing to be inlined from vec.push() - // (this is in the hot path) - // TODO: Replace with `push_within_capacity` when it's stabilized - let uninit = self.buffer.spare_capacity_mut(); - // This cannot panic, because we truncate_median will at least remove one element, since - // the min capacity is larger than 2. 
- uninit[0].write(Reverse(doc)); - // This is safe because it would panic in the line above - unsafe { - self.buffer.set_len(self.buffer.len() + 1); - } + self.top_k_hits.push(internal_repr); } - #[inline(never)] - fn truncate_top_n(&mut self) -> D { - // Use select_nth_unstable to find the top nth score - let (_, median_el, _) = self.buffer.select_nth_unstable(self.top_n); - - let median_score = *median_el; - // Remove all elements below the top_n - self.buffer.truncate(self.top_n); - - median_score.0 - } - - /// Returns the top n elements in sorted order. - pub fn into_sorted_vec(mut self) -> Vec { - if self.buffer.len() > self.top_n { - self.truncate_top_n(); - } - self.buffer.sort_unstable(); - self.buffer.into_iter().map(|el| el.0).collect() - } - - /// Returns the top n elements in stored order. - /// Useful if you do not need the elements in sorted order, - /// for example when merging the results of multiple segments. - #[allow(dead_code)] - pub fn into_vec(mut self) -> Vec { - if self.buffer.len() > self.top_n { - self.truncate_top_n(); - } - self.buffer.into_iter().map(|el| el.0).collect() - } -} - -pub use tantivy::COLLECT_BLOCK_BUFFER_LEN; -struct SpecSortingFieldExtractor { - _phantom: std::marker::PhantomData<(V1, V2)>, - sort_values1: Box<[Option; COLLECT_BLOCK_BUFFER_LEN]>, - sort_values2: Box<[Option; COLLECT_BLOCK_BUFFER_LEN]>, - - pub first: SortingFieldExtractorComponent, - pub second: Option, -} - -impl< - V1: Copy + PartialEq + PartialOrd + Ord + IntoOptionU64 + Debug, - V2: Copy + PartialEq + PartialOrd + Ord + IntoOptionU64 + Debug, -> SpecSortingFieldExtractor -{ - fn new( - first: SortingFieldExtractorComponent, - second: Option, - ) -> Self { - Self { - _phantom: PhantomData, - sort_values1: vec![None; COLLECT_BLOCK_BUFFER_LEN] - .into_boxed_slice() - .try_into() - .unwrap(), - sort_values2: vec![None; COLLECT_BLOCK_BUFFER_LEN] - .into_boxed_slice() - .try_into() - .unwrap(), - first, - second, - } - } - /// Fetches the sort values for 
the given docs. - /// Does noting when sorting by docid. - fn fetch_data(&mut self, docs: &[DocId]) { - self.first - .extract_typed_sort_values_block(docs, &mut self.sort_values1[..docs.len()]); - if let Some(second) = self.second.as_ref() { - second.extract_typed_sort_values_block(docs, &mut self.sort_values2[..docs.len()]); - } - } - #[inline] - fn iter_hits<'a, const REVERSE_DOCID: bool>( - &'a self, - docs: &'a [DocId], - ) -> impl Iterator> + 'a { - SpecSortingFieldIter::::new( - docs, - &self.sort_values1, - &self.sort_values2, - ) - } -} - -struct SpecSortingFieldIter<'a, V1, V2, const REVERSE_DOCID: bool> { - docs: std::slice::Iter<'a, DocId>, - sort_values1: std::slice::Iter<'a, Option>, - sort_values2: std::slice::Iter<'a, Option>, - _phantom: PhantomData<(V1, V2)>, -} - -impl<'a, V1, V2, const REVERSE_DOCID: bool> SpecSortingFieldIter<'a, V1, V2, REVERSE_DOCID> -where - V1: Copy + PartialEq + PartialOrd + Ord + IntoOptionU64, - V2: Copy + PartialEq + PartialOrd + Ord + IntoOptionU64, -{ - #[inline] - pub fn new( - docs: &'a [DocId], - sort_values1: &'a [Option; COLLECT_BLOCK_BUFFER_LEN], - sort_values2: &'a [Option; COLLECT_BLOCK_BUFFER_LEN], - ) -> Self { - Self { - docs: docs.iter(), - sort_values1: sort_values1.iter(), - sort_values2: sort_values2.iter(), - _phantom: PhantomData, - } - } -} - -impl Iterator for SpecSortingFieldIter<'_, V1, V2, REVERSE_DOCID> -where - V1: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug, - V2: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug, -{ - type Item = Hit; - - #[inline] - fn next(&mut self) -> Option { - let doc_id = *self.docs.next()?; - - let value1 = if !V1::is_unit_type() { - V1::from_option_u64(*self.sort_values1.next()?) - } else { - V1::from_option_u64(None) - }; - - let value2 = if !V2::is_unit_type() { - V2::from_option_u64(*self.sort_values2.next()?) 
- } else { - V2::from_option_u64(None) - }; - - Some(Hit { - doc_id, - value1, - value2, - }) - } -} - -/// No search after handling -/// Quickwit collector working at the scale of the segment. -struct SpecializedSegmentTopKCollector< - V1: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue, - V2: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue, - const REVERSE_DOCID: bool, -> { - split_id: SplitId, - hit_fetcher: SpecSortingFieldExtractor, - top_k_hits: TopKComputer>, - segment_ord: u32, -} - -impl< - V1: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue + 'static, - V2: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue + 'static, - const REVERSE_DOCID: bool, -> SpecializedSegmentTopKCollector -{ - pub fn new( - split_id: SplitId, - score_extractor: SortingFieldExtractorPair, - leaf_max_hits: usize, - segment_ord: u32, - ) -> Self { - let hit_fetcher = - SpecSortingFieldExtractor::new(score_extractor.first, score_extractor.second); - let top_k_hits = TopKComputer::new(leaf_max_hits); - Self { - split_id, - hit_fetcher, - top_k_hits, - segment_ord, - } - } -} -impl< - V1: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue, - V2: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue, - const REVERSE_DOCID: bool, -> QuickwitSegmentTopKCollector for SpecializedSegmentTopKCollector -{ - fn collect_top_k_block(&mut self, docs: &[DocId]) { - self.hit_fetcher.fetch_data(docs); - let iter = self.hit_fetcher.iter_hits::(docs); - for doc_id in iter { - self.top_k_hits.push(doc_id); - } - } - - #[inline] - fn collect_top_k(&mut self, _doc_id: DocId, _score: Score) { - panic!("Internal Error: This collector does not support collect_top_k"); - } - - fn get_top_k(&self) -> Vec { + pub(crate) fn get_top_k(&self) -> tantivy::Result> { self.top_k_hits .clone() .into_sorted_vec() .into_iter() - .map(|el| 
el.into_segment_partial_hit()) - .map(|segment_partial_hit: SegmentPartialHit| { - segment_partial_hit.into_partial_hit( - self.split_id.clone(), + .map(|internal_repr| { + self.hit_fetcher.internal_to_partial_hit( + &self.split_id, self.segment_ord, - &self.hit_fetcher.first, - &self.hit_fetcher.second, + internal_repr, ) }) .collect() } } -/// Quickwit collector working at the scale of the segment. -pub(crate) struct GenericQuickwitSegmentTopKCollector { - split_id: SplitId, - score_extractor: SortingFieldExtractorPair, - // PartialHits in this heap don't contain a split_id yet. - top_k_hits: TopK, - segment_ord: u32, - search_after: Option, - // Precomputed order for search_after for split_id and segment_ord - precomp_search_after_order: Ordering, - sort_values1: Box<[Option; COLLECT_BLOCK_BUFFER_LEN]>, - sort_values2: Box<[Option; COLLECT_BLOCK_BUFFER_LEN]>, +pub enum QuickwitSegmentTopKCollector { + DocIdSort(QuickwitSegmentTopKCollectorTemplate<(), ()>), + OneDimSort(QuickwitSegmentTopKCollectorTemplate), + TwoDimSort(QuickwitSegmentTopKCollectorTemplate), + Noop, } -impl GenericQuickwitSegmentTopKCollector { - pub fn new( +impl QuickwitSegmentTopKCollector { + pub fn new_with_doc_id_sort( split_id: SplitId, - score_extractor: SortingFieldExtractorPair, - leaf_max_hits: usize, - segment_ord: u32, - search_after_option: Option, - order1: SortOrder, - order2: SortOrder, + segment_ord: SegmentOrdinal, + hit_fetcher: SortingFieldExtractorPair<(), ()>, + top_k: usize, + search_after_opt: Option>, ) -> Self { - let sort_key_mapper = HitSortingMapper { order1, order2 }; - let precomp_search_after_order = match &search_after_option { - Some(search_after) if !search_after.split_id.is_empty() => order1 - .compare(&split_id, &search_after.split_id) - .then_with(|| order1.compare(&segment_ord, &search_after.segment_ord)), - // This value isn't actually used. 
- _ => Ordering::Equal, - }; - let search_after = - SearchAfterSegment::new(search_after_option, order1, order2, &score_extractor); - - GenericQuickwitSegmentTopKCollector { - split_id, - score_extractor, - top_k_hits: TopK::new(leaf_max_hits, sort_key_mapper), // Adjusted for context - segment_ord, - search_after, - precomp_search_after_order, - sort_values1: vec![None; COLLECT_BLOCK_BUFFER_LEN] - .into_boxed_slice() - .try_into() - .unwrap(), - sort_values2: vec![None; COLLECT_BLOCK_BUFFER_LEN] - .into_boxed_slice() - .try_into() - .unwrap(), + if let Some(search_after) = &search_after_opt + && search_after.is_skip_all() + { + QuickwitSegmentTopKCollector::Noop + } else { + QuickwitSegmentTopKCollector::DocIdSort(QuickwitSegmentTopKCollectorTemplate { + split_id, + segment_ord, + top_k_hits: TopKComputer::new(top_k), + hit_fetcher, + search_after_opt, + }) } } - #[inline] - /// Generic top k collection, that includes search_after handling - /// - /// Outside of the collector to circumvent lifetime issues. - fn collect_top_k_vals( - doc_id: DocId, - sort_value: Option, - sort_value2: Option, - search_after: &Option, - precomp_search_after_order: Ordering, - top_k_hits: &mut TopK, - ) { - if let Some(search_after) = &search_after { - let search_after_value1 = search_after.sort_value; - let search_after_value2 = search_after.sort_value2; - let orders = &top_k_hits.sort_key_mapper; - let mut cmp_result = orders - .order1 - .compare_opt(&sort_value, &search_after_value1) - .then_with(|| { - orders - .order2 - .compare_opt(&sort_value2, &search_after_value2) - }); - if search_after.compare_on_equal { - // TODO actually it's not first, it should be what's in _shard_doc then first then - // default - let order = orders.order1; - cmp_result = cmp_result - .then(precomp_search_after_order) - // We compare doc_id only if sort_value1, sort_value2, split_id and segment_ord - // are equal. 
- .then_with(|| order.compare(&doc_id, &search_after.doc_id)) - } - if cmp_result != Ordering::Less { - return; - } + pub fn new_with_one_dim_sort( + split_id: SplitId, + segment_ord: SegmentOrdinal, + hit_fetcher: SortingFieldExtractorPair, + top_k: usize, + search_after_opt: Option>, + ) -> Self { + if let Some(search_after) = &search_after_opt + && search_after.is_skip_all() + { + QuickwitSegmentTopKCollector::Noop + } else { + QuickwitSegmentTopKCollector::OneDimSort(QuickwitSegmentTopKCollectorTemplate { + split_id, + segment_ord, + top_k_hits: TopKComputer::new(top_k), + hit_fetcher, + search_after_opt, + }) } - - let hit = SegmentPartialHit { - sort_value, - sort_value2, - doc_id, - }; - top_k_hits.add_entry(hit); } -} -impl QuickwitSegmentTopKCollector for GenericQuickwitSegmentTopKCollector { - fn collect_top_k_block(&mut self, docs: &[DocId]) { - self.score_extractor.extract_typed_sort_values( - docs, - &mut self.sort_values1[..], - &mut self.sort_values2[..], - ); - if self.search_after.is_some() { - // Search after not optimized for block collection yet - for ((doc_id, sort_value), sort_value2) in docs - .iter() - .cloned() - .zip(self.sort_values1.iter().cloned()) - .zip(self.sort_values2.iter().cloned()) - { - Self::collect_top_k_vals( - doc_id, - sort_value, - sort_value2, - &self.search_after, - self.precomp_search_after_order, - &mut self.top_k_hits, - ); - } + + pub fn new_with_two_dim_sort( + split_id: SplitId, + segment_ord: SegmentOrdinal, + hit_fetcher: SortingFieldExtractorPair, + top_k: usize, + search_after_opt: Option>, + ) -> Self { + if let Some(search_after) = &search_after_opt + && search_after.is_skip_all() + { + QuickwitSegmentTopKCollector::Noop } else { - // Probably would make sense to check the fence against e.g. sort_values1 earlier, - // before creating the SegmentPartialHit. - // - // Below are different versions to avoid iterating the caches if they are unused. - // - // No sort values loaded. Sort only by doc_id. 
- if !self.score_extractor.first.is_fast_field() { - for doc_id in docs.iter().cloned() { - let hit = SegmentPartialHit { - sort_value: None, - sort_value2: None, - doc_id, - }; - self.top_k_hits.add_entry(hit); - } - return; + QuickwitSegmentTopKCollector::TwoDimSort(QuickwitSegmentTopKCollectorTemplate { + split_id, + segment_ord, + top_k_hits: TopKComputer::new(top_k), + hit_fetcher, + search_after_opt, + }) + } + } + + pub(crate) fn collect_top_k_block(&mut self, docs: &[DocId]) { + match self { + QuickwitSegmentTopKCollector::DocIdSort(collector) => { + collector.collect_top_k_block(docs) } - let has_no_second_sort = !self - .score_extractor - .second - .as_ref() - .map(|extr| extr.is_fast_field()) - .unwrap_or(false); - // No second sort values => We can skip iterating the second sort values cache. - if has_no_second_sort { - for (doc_id, sort_value) in - docs.iter().cloned().zip(self.sort_values1.iter().cloned()) - { - let hit = SegmentPartialHit { - sort_value, - sort_value2: None, - doc_id, - }; - self.top_k_hits.add_entry(hit); - } - return; + QuickwitSegmentTopKCollector::OneDimSort(collector) => { + collector.collect_top_k_block(docs) } - - for ((doc_id, sort_value), sort_value2) in docs - .iter() - .cloned() - .zip(self.sort_values1.iter().cloned()) - .zip(self.sort_values2.iter().cloned()) - { - let hit = SegmentPartialHit { - sort_value, - sort_value2, - doc_id, - }; - self.top_k_hits.add_entry(hit); + QuickwitSegmentTopKCollector::TwoDimSort(collector) => { + collector.collect_top_k_block(docs) } + QuickwitSegmentTopKCollector::Noop => {} } } - #[inline] - fn collect_top_k(&mut self, doc_id: DocId, score: Score) { - let (sort_value, sort_value2): (Option, Option) = - self.score_extractor.extract_typed_sort_value(doc_id, score); - Self::collect_top_k_vals( - doc_id, - sort_value, - sort_value2, - &self.search_after, - self.precomp_search_after_order, - &mut self.top_k_hits, - ); - } - - fn get_top_k(&self) -> Vec { - self.top_k_hits - .clone() - 
.finalize() - .into_iter() - .map(|segment_partial_hit: SegmentPartialHit| { - segment_partial_hit.into_partial_hit( - self.split_id.clone(), - self.segment_ord, - &self.score_extractor.first, - &self.score_extractor.second, - ) - }) - .collect() - } -} - -/// Search After, but the sort values are converted to the u64 fast field representation. -pub(crate) struct SearchAfterSegment { - sort_value: Option, - sort_value2: Option, - compare_on_equal: bool, - doc_id: DocId, -} -impl SearchAfterSegment { - pub fn new( - search_after_opt: Option, - sort_order1: SortOrder, - sort_order2: SortOrder, - score_extractor: &SortingFieldExtractorPair, - ) -> Option { - let search_after = search_after_opt?; - let mut sort_value = None; - if let Some(search_after_sort_value) = search_after - .sort_value - .and_then(|sort_value| sort_value.sort_value) - { - if let Some(new_value) = score_extractor - .first - .convert_to_u64_ff_val(search_after_sort_value, sort_order1) - { - sort_value = Some(new_value); - } else { - // Value is out of bounds, we ignore sort_value2 and disable the whole - // search_after - return None; + pub(crate) fn collect_top_k(&mut self, doc_id: DocId, score: Score) { + match self { + QuickwitSegmentTopKCollector::DocIdSort(collector) => { + collector.collect_top_k(doc_id, score) } - } - let mut sort_value2 = None; - if let Some(search_after_sort_value) = search_after - .sort_value2 - .and_then(|sort_value2| sort_value2.sort_value) - { - let extractor = score_extractor - .second - .as_ref() - .expect("Internal error: Got sort_value2, but no sort extractor"); - if let Some(new_value) = - extractor.convert_to_u64_ff_val(search_after_sort_value, sort_order2) - { - sort_value2 = Some(new_value); + QuickwitSegmentTopKCollector::OneDimSort(collector) => { + collector.collect_top_k(doc_id, score) + } + QuickwitSegmentTopKCollector::TwoDimSort(collector) => { + collector.collect_top_k(doc_id, score) } + QuickwitSegmentTopKCollector::Noop => {} + } + } + + pub(crate) fn 
get_top_k(&self) -> tantivy::Result> { + match self { + QuickwitSegmentTopKCollector::DocIdSort(collector) => collector.get_top_k(), + QuickwitSegmentTopKCollector::OneDimSort(collector) => collector.get_top_k(), + QuickwitSegmentTopKCollector::TwoDimSort(collector) => collector.get_top_k(), + QuickwitSegmentTopKCollector::Noop => Ok(vec![]), } - Some(Self { - sort_value, - sort_value2, - compare_on_equal: !search_after.split_id.is_empty(), - doc_id: search_after.doc_id, - }) } } diff --git a/quickwit/quickwit-search/src/top_k_computer.rs b/quickwit/quickwit-search/src/top_k_computer.rs new file mode 100644 index 00000000000..8f6ff7c8d07 --- /dev/null +++ b/quickwit/quickwit-search/src/top_k_computer.rs @@ -0,0 +1,111 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::cmp::Reverse; +use std::fmt::Debug; + +pub(crate) trait MinValue { + fn min_value() -> Self; +} + +/// Fast Top K Computation +/// +/// The buffer is truncated to the top_n elements when it reaches the capacity of the Vec. +/// That means capacity has special meaning and should be carried over when cloning or serializing. +/// +/// For TopK == 0, it will be relative expensive. 
+pub(crate) struct TopKComputer { + /// Reverses sort order to get top-semantics instead of bottom-semantics + buffer: Vec>, + top_n: usize, + pub(crate) threshold: D, +} + +// Custom clone to keep capacity +impl Clone for TopKComputer { + fn clone(&self) -> Self { + let mut buffer_clone = Vec::with_capacity(self.buffer.capacity()); + buffer_clone.extend(self.buffer.iter().cloned()); + + TopKComputer { + buffer: buffer_clone, + top_n: self.top_n, + threshold: self.threshold.clone(), + } + } +} + +impl TopKComputer +where D: Ord + Copy + Debug + MinValue +{ + /// Create a new `TopKComputer`. + pub fn new(top_n: usize) -> Self { + let vec_cap = top_n.max(1) * 10; + TopKComputer { + buffer: Vec::with_capacity(vec_cap), + top_n, + threshold: D::min_value(), + } + } +} + +impl TopKComputer +where D: Ord + Copy + Debug +{ + /// Push a new document to the top n. + /// If the document is below the current threshold, it will be ignored. + #[inline] + pub fn push(&mut self, doc: D) { + if doc < self.threshold { + return; + } + if self.buffer.len() == self.buffer.capacity() { + let median = self.truncate_top_n(); + self.threshold = median; + } + + // This is faster since it avoids the buffer resizing to be inlined from vec.push() + // (this is in the hot path) + // TODO: Replace with `push_within_capacity` when it's stabilized + let uninit = self.buffer.spare_capacity_mut(); + // This cannot panic, because truncate_top_n will at least remove one element, since + // the min capacity is larger than 2. 
+ uninit[0].write(Reverse(doc)); + // This is safe because it would panic in the line above + unsafe { + self.buffer.set_len(self.buffer.len() + 1); + } + } + + #[inline(never)] + fn truncate_top_n(&mut self) -> D { + // Use select_nth_unstable to find the top nth score + let (_, median_el, _) = self.buffer.select_nth_unstable(self.top_n); + + let median_score = *median_el; + // Remove all elements below the top_n + self.buffer.truncate(self.top_n); + + median_score.0 + } + + /// Returns the top n elements in sorted order. + pub fn into_sorted_vec(mut self) -> Vec { + if self.buffer.len() > self.top_n { + self.truncate_top_n(); + } + self.buffer.sort_unstable(); + self.buffer.into_iter().map(|el| el.0).collect() + } +} diff --git a/quickwit/quickwit-serve/Cargo.toml b/quickwit/quickwit-serve/Cargo.toml index 363065a3403..a30df6519bf 100644 --- a/quickwit/quickwit-serve/Cargo.toml +++ b/quickwit/quickwit-serve/Cargo.toml @@ -31,6 +31,7 @@ itertools = { workspace = true } mime_guess = { workspace = true } once_cell = { workspace = true } percent-encoding = { workspace = true } +pin-project = { workspace = true } pprof = { workspace = true, optional = true } prost = { workspace = true } prost-types = { workspace = true } diff --git a/quickwit/quickwit-serve/src/cluster_api/rest_handler.rs b/quickwit/quickwit-serve/src/cluster_api/rest_handler.rs index f38ddac1627..071122f799d 100644 --- a/quickwit/quickwit-serve/src/cluster_api/rest_handler.rs +++ b/quickwit/quickwit-serve/src/cluster_api/rest_handler.rs @@ -15,6 +15,11 @@ use std::convert::Infallible; use quickwit_cluster::{Cluster, ClusterSnapshot, NodeIdSchema}; +use quickwit_proto::control_plane::{ + ControlPlaneError, ControlPlaneService, ControlPlaneServiceClient, + DisableMaintenanceModeRequest, EnableMaintenanceModeRequest, EnableMaintenanceModeResponse, + GetMaintenanceModeRequest, GetMaintenanceModeResponse, +}; use warp::{Filter, Rejection}; use crate::format::extract_format_from_qs; @@ -23,22 +28,39 @@ use 
crate::rest_api_response::into_rest_api_response; #[derive(utoipa::OpenApi)] #[openapi( - paths(get_cluster), - components(schemas(ClusterSnapshot, NodeIdSchema,)) + paths( + get_cluster, + get_maintenance_endpoint, + enable_maintenance_endpoint, + disable_maintenance_endpoint + ), + components(schemas( + ClusterSnapshot, + NodeIdSchema, + GetMaintenanceModeResponse, + EnableMaintenanceModeResponse + )) )] pub struct ClusterApi; /// Cluster handler. pub fn cluster_handler( cluster: Cluster, + control_plane_client: ControlPlaneServiceClient, ) -> impl Filter + Clone { - warp::path!("cluster") + let cluster_info_handler = warp::path!("cluster") .and(warp::path::end()) .and(warp::get()) .and(warp::path::end().map(move || cluster.clone())) .then(get_cluster) .and(extract_format_from_qs()) .map(into_rest_api_response) + .boxed(); + + let maintenance_routes = maintenance_handler(control_plane_client); + + cluster_info_handler + .or(maintenance_routes) .recover(recover_fn) .boxed() } @@ -57,3 +79,100 @@ async fn get_cluster(cluster: Cluster) -> Result { let snapshot = cluster.snapshot().await; Ok(snapshot) } + +#[utoipa::path( + get, + tag = "Cluster Info", + path = "/cluster/maintenance", + responses( + (status = 200, description = "Successfully fetched maintenance mode status.", body = GetMaintenanceModeResponse) + ) +)] +async fn get_maintenance_endpoint( + control_plane_client: ControlPlaneServiceClient, +) -> Result { + control_plane_client + .get_maintenance_mode(GetMaintenanceModeRequest {}) + .await +} + +#[utoipa::path( + put, + tag = "Cluster Info", + path = "/cluster/maintenance", + responses( + (status = 200, description = "Successfully enabled maintenance mode.", body = EnableMaintenanceModeResponse) + ) +)] +async fn enable_maintenance_endpoint( + control_plane_client: ControlPlaneServiceClient, +) -> Result { + control_plane_client + .enable_maintenance_mode(EnableMaintenanceModeRequest {}) + .await +} + +#[utoipa::path( + delete, + tag = "Cluster Info", + 
path = "/cluster/maintenance", + responses( + (status = 200, description = "Successfully disabled maintenance mode.") + ) +)] +async fn disable_maintenance_endpoint( + control_plane_client: ControlPlaneServiceClient, +) -> Result<(), ControlPlaneError> { + control_plane_client + .disable_maintenance_mode(DisableMaintenanceModeRequest {}) + .await?; + Ok(()) +} + +fn maintenance_get_filter() -> impl Filter + Clone { + warp::path!("cluster" / "maintenance").and(warp::get()) +} + +fn maintenance_put_filter() -> impl Filter + Clone { + warp::path!("cluster" / "maintenance").and(warp::put()) +} + +fn maintenance_delete_filter() -> impl Filter + Clone { + warp::path!("cluster" / "maintenance").and(warp::delete()) +} + +/// Maintenance mode endpoints handler. +/// +/// - `GET /api/v1/cluster/maintenance` — get maintenance status +/// - `PUT /api/v1/cluster/maintenance` — enable maintenance mode +/// - `DELETE /api/v1/cluster/maintenance` — disable maintenance mode +fn maintenance_handler( + control_plane_client: ControlPlaneServiceClient, +) -> impl Filter + Clone { + let get_client = control_plane_client.clone(); + let put_client = control_plane_client.clone(); + let delete_client = control_plane_client; + + let get_handler = maintenance_get_filter() + .and(warp::any().map(move || get_client.clone())) + .then(get_maintenance_endpoint) + .and(extract_format_from_qs()) + .map(into_rest_api_response) + .boxed(); + + let put_handler = maintenance_put_filter() + .and(warp::any().map(move || put_client.clone())) + .then(enable_maintenance_endpoint) + .and(extract_format_from_qs()) + .map(into_rest_api_response) + .boxed(); + + let delete_handler = maintenance_delete_filter() + .and(warp::any().map(move || delete_client.clone())) + .then(disable_maintenance_endpoint) + .and(extract_format_from_qs()) + .map(into_rest_api_response) + .boxed(); + + get_handler.or(put_handler).or(delete_handler).boxed() +} diff --git a/quickwit/quickwit-serve/src/developer_api/debug.rs 
b/quickwit/quickwit-serve/src/developer_api/debug.rs index 1668af1f94e..503f3947844 100644 --- a/quickwit/quickwit-serve/src/developer_api/debug.rs +++ b/quickwit/quickwit-serve/src/developer_api/debug.rs @@ -104,10 +104,10 @@ async fn get_node_debug_infos( let mut get_debug_info_futures = FuturesUnordered::new(); for ready_node in ready_nodes { - if node_id_patterns.matches(ready_node.node_id()) { - let node_id = ready_node.node_id().to_owned(); + if node_id_patterns.matches(&ready_node.node_id) { + let node_id = ready_node.node_id.clone(); let client = DeveloperServiceClient::from_channel( - ready_node.grpc_advertise_addr(), + ready_node.grpc_advertise_addr, ready_node.channel(), DeveloperApiServer::MAX_GRPC_MESSAGE_SIZE, Some(CompressionEncoding::Zstd), diff --git a/quickwit/quickwit-serve/src/elasticsearch_api/filter.rs b/quickwit/quickwit-serve/src/elasticsearch_api/filter.rs index b8d2343f666..071f080fe81 100644 --- a/quickwit/quickwit-serve/src/elasticsearch_api/filter.rs +++ b/quickwit/quickwit-serve/src/elasticsearch_api/filter.rs @@ -14,6 +14,7 @@ use bytes::Bytes; use bytesize::ByteSize; +use http::HeaderValue; use serde::de::DeserializeOwned; use warp::reject::LengthRequired; use warp::{Filter, Rejection}; @@ -165,14 +166,21 @@ pub(crate) fn elastic_resolve_index_filter() } #[utoipa::path(get, tag = "Count", path = "/{index}/_count")] -pub(crate) fn elastic_index_count_filter() --> impl Filter, SearchQueryParamsCount, SearchBody), Error = Rejection> + Clone -{ +pub(crate) fn elastic_index_count_filter() -> impl Filter< + Extract = ( + Vec, + SearchQueryParamsCount, + SearchBody, + Option, + ), + Error = Rejection, +> + Clone { warp::path!("_elastic" / String / "_count") .and_then(extract_index_id_patterns) .and(warp::get().or(warp::post()).unify()) .and(warp::query()) .and(json_or_empty()) + .and(warp::header::optional::("user-agent")) } #[utoipa::path(delete, tag = "Indexes", path = "/{index}")] @@ -222,23 +230,33 @@ pub(crate) fn 
elastic_cat_indices_filter() } #[utoipa::path(get, tag = "Search", path = "/{index}/_search")] -pub(crate) fn elastic_index_search_filter() --> impl Filter, SearchQueryParams, SearchBody), Error = Rejection> + Clone { +pub(crate) fn elastic_index_search_filter() -> impl Filter< + Extract = ( + Vec, + SearchQueryParams, + SearchBody, + Option, + ), + Error = Rejection, +> + Clone { warp::path!("_elastic" / String / "_search") .and_then(extract_index_id_patterns) .and(warp::get().or(warp::post()).unify()) .and(warp::query()) .and(json_or_empty()) + .and(warp::header::optional::("user-agent")) } #[utoipa::path(post, tag = "Search", path = "/_msearch")] pub(crate) fn elastic_multi_search_filter() --> impl Filter + Clone { +-> impl Filter), Error = Rejection> + Clone +{ warp::path!("_elastic" / "_msearch") .and(warp::body::content_length_limit(BODY_LENGTH_LIMIT.as_u64())) .and(warp::body::bytes()) .and(warp::post()) .and(warp::query()) + .and(warp::header::optional::("user-agent")) } fn merge_scroll_body_params( diff --git a/quickwit/quickwit-serve/src/elasticsearch_api/rest_handler.rs b/quickwit/quickwit-serve/src/elasticsearch_api/rest_handler.rs index a9649200e5b..adf8f1df259 100644 --- a/quickwit/quickwit-serve/src/elasticsearch_api/rest_handler.rs +++ b/quickwit/quickwit-serve/src/elasticsearch_api/rest_handler.rs @@ -21,6 +21,7 @@ use bytes::Bytes; use elasticsearch_dsl::search::Hit as ElasticHit; use elasticsearch_dsl::{HitsMetadata, ShardStatistics, Source, TotalHits, TotalHitsRelation}; use futures_util::StreamExt; +use http::HeaderValue; use itertools::Itertools; use quickwit_cluster::Cluster; use quickwit_common::truncate_str; @@ -30,7 +31,7 @@ use quickwit_metastore::*; use quickwit_proto::metastore::MetastoreServiceClient; use quickwit_proto::search::{ CountHits, ListFieldsResponse, PartialHit, ScrollRequest, SearchResponse, SortByValue, - SortDatetimeFormat, + SortDatetimeFormat, SplitsByOutcome, }; use quickwit_proto::types::IndexUid; use 
quickwit_query::BooleanOperand; @@ -307,6 +308,7 @@ fn build_request_for_es_api( index_id_patterns: Vec, search_params: SearchQueryParams, search_body: SearchBody, + user_agent: Option, ) -> Result<(quickwit_proto::search::SearchRequest, bool), ElasticsearchError> { let default_operator = search_params.default_operator.unwrap_or(BooleanOperand::Or); // The query string, if present, takes priority over what can be in the request @@ -363,10 +365,14 @@ fn build_request_for_es_api( .track_total_hits .or(search_body.track_total_hits) { - None => CountHits::Underestimate, Some(TrackTotalHits::Track(false)) => CountHits::Underestimate, Some(TrackTotalHits::Count(count)) if count <= max_hits as i64 => CountHits::Underestimate, Some(TrackTotalHits::Track(true) | TrackTotalHits::Count(_)) => CountHits::CountAll, + // A query without aggregation and a size set to 0 cannot be used for + // anything else than counting. We avoid setting `Underestimate` in that + // case as it would always return 0. + None if max_hits == 0 && aggregation_request.is_none() => CountHits::CountAll, + None => CountHits::Underestimate, } .into(); @@ -413,6 +419,7 @@ fn build_request_for_es_api( count_hits, ignore_missing_indexes, split_id: None, + user_agent: user_agent.and_then(|h| h.to_str().ok().map(str::to_owned)), }, has_doc_id_field, )) @@ -490,12 +497,13 @@ async fn es_compat_index_count( index_id_patterns: Vec, search_params: SearchQueryParamsCount, search_body: SearchBody, + user_agent: Option, search_service: Arc, ) -> Result { let mut search_params: SearchQueryParams = search_params.into(); search_params.track_total_hits = Some(TrackTotalHits::Track(true)); let (search_request, _append_shard_doc) = - build_request_for_es_api(index_id_patterns, search_params, search_body)?; + build_request_for_es_api(index_id_patterns, search_params, search_body, user_agent)?; let search_response: SearchResponse = search_service.root_search(search_request).await?; let search_response_rest: 
ElasticsearchCountResponse = ElasticsearchCountResponse { count: search_response.num_hits, @@ -507,6 +515,7 @@ async fn es_compat_index_search( index_id_patterns: Vec, search_params: SearchQueryParams, search_body: SearchBody, + user_agent: Option, search_service: Arc, ) -> Result { if search_params.scroll.is_some() && !search_params.allow_partial_search_results() { @@ -520,7 +529,7 @@ async fn es_compat_index_search( let start_instant = Instant::now(); let allow_partial_search_results = search_params.allow_partial_search_results(); let (search_request, append_shard_doc) = - build_request_for_es_api(index_id_patterns, search_params, search_body)?; + build_request_for_es_api(index_id_patterns, search_params, search_body, user_agent)?; let search_response: SearchResponse = search_service.root_search(search_request).await?; let elapsed = start_instant.elapsed(); let mut search_response_rest: ElasticsearchResponse = convert_to_es_search_response( @@ -778,16 +787,16 @@ fn convert_hit( .unwrap_or_else(|_| Source::from_string("{}".to_string()).unwrap()); let mut sort = Vec::new(); - if let Some(partial_hit) = hit.partial_hit { - if let Some(sort_value) = partial_hit.sort_value { - sort.push(sort_value.into_json()); + if let Some(partial_hit) = &hit.partial_hit { + if let Some(sort_value) = &partial_hit.sort_value { + sort.push(sort_value.clone().into_json()); } - if let Some(sort_value2) = partial_hit.sort_value2 { - sort.push(sort_value2.into_json()); + if let Some(sort_value2) = &partial_hit.sort_value2 { + sort.push(sort_value2.clone().into_json()); } if append_shard_doc { sort.push(serde_json::Value::String( - quickwit_search::GlobalDocAddress::from_partial_hit(&partial_hit).to_string(), + quickwit_search::GlobalDocAddress::from_partial_hit(partial_hit).to_string(), )); } } @@ -810,6 +819,7 @@ fn convert_hit( async fn es_compat_index_multi_search( payload: Bytes, multi_search_params: MultiSearchQueryParams, + user_agent: Option, search_service: Arc, ) -> Result { let 
mut search_requests = Vec::new(); @@ -864,8 +874,12 @@ async fn es_compat_index_multi_search( if let Some(extra_filters) = &multi_search_params.extra_filters { search_query_params.extra_filters = Some(extra_filters.to_vec()); } - let es_request = - build_request_for_es_api(index_ids_patterns, search_query_params, search_body)?; + let es_request = build_request_for_es_api( + index_ids_patterns, + search_query_params, + search_body, + user_agent.clone(), + )?; search_requests.push(es_request); } @@ -998,6 +1012,34 @@ fn convert_to_es_stats_response( ElasticsearchStatsResponse { _all, indices } } +fn get_relation_from_split_outcome( + splits_by_outcome: &Option, + num_failed_splits: usize, +) -> TotalHitsRelation { + let Some(splits_by_outcome) = splits_by_outcome else { + return TotalHitsRelation::GreaterThanOrEqualTo; + }; + // Destructure to make sure we update this if a state is added. + let SplitsByOutcome { + cancel_before_warmup: _, + cancel_warmup: _, + cancel_cpu_queue: _, + cancel_cpu: _, + pruned_before_warmup, + pruned_after_warmup, + cache_hit: _, + processed: _, + processed_from_metadata: _, + } = *splits_by_outcome; + // A cancelled split may be retried and eventually succeed, so cancel + // counters alone don't imply an underestimated count. Use reported failed + // splits instead. 
+ if num_failed_splits == 0 && pruned_before_warmup == 0 && pruned_after_warmup == 0 { + return TotalHitsRelation::Equal; + } + TotalHitsRelation::GreaterThanOrEqualTo +} + #[allow(clippy::result_large_err)] fn convert_to_es_search_response( resp: SearchResponse, @@ -1033,12 +1075,16 @@ fn convert_to_es_search_response( let num_failed_splits = resp.failed_splits.len() as u32; let num_successful_splits = resp.num_successful_splits as u32; let num_total_splits = num_successful_splits + num_failed_splits; + + let relation = + get_relation_from_split_outcome(&resp.splits_by_outcome, resp.failed_splits.len()); + Ok(ElasticsearchResponse { timed_out: false, hits: HitsMetadata { total: Some(TotalHits { value: resp.num_hits, - relation: TotalHitsRelation::Equal, + relation, }), max_score: None, hits, diff --git a/quickwit/quickwit-serve/src/grpc.rs b/quickwit/quickwit-serve/src/grpc.rs index 27d370c38aa..351341af895 100644 --- a/quickwit/quickwit-serve/src/grpc.rs +++ b/quickwit/quickwit-serve/src/grpc.rs @@ -188,10 +188,11 @@ pub(crate) async fn start_grpc_server( let search_service = services.search_service.clone(); let grpc_search_service = GrpcSearchAdapter::from(search_service); + let max_message_size_bytes = grpc_config.max_search_message_size.0 as usize; Some( SearchServiceServer::new(grpc_search_service) - .max_decoding_message_size(grpc_config.max_message_size.0 as usize) - .max_encoding_message_size(grpc_config.max_message_size.0 as usize), + .max_decoding_message_size(max_message_size_bytes) + .max_encoding_message_size(max_message_size_bytes), ) } else { None diff --git a/quickwit/quickwit-serve/src/indexing_api/mod.rs b/quickwit/quickwit-serve/src/indexing_api/mod.rs index 9d3740615a3..e9e16d79431 100644 --- a/quickwit/quickwit-serve/src/indexing_api/mod.rs +++ b/quickwit/quickwit-serve/src/indexing_api/mod.rs @@ -14,4 +14,4 @@ mod rest_handler; -pub use rest_handler::{IndexingApi, indexing_get_handler}; +pub use rest_handler::{IndexingApi, 
indexing_get_handler, swap_pipelines_handler}; diff --git a/quickwit/quickwit-serve/src/indexing_api/rest_handler.rs b/quickwit/quickwit-serve/src/indexing_api/rest_handler.rs index 1dcc3cd05df..4412c83c43f 100644 --- a/quickwit/quickwit-serve/src/indexing_api/rest_handler.rs +++ b/quickwit/quickwit-serve/src/indexing_api/rest_handler.rs @@ -16,6 +16,10 @@ use std::convert::Infallible; use quickwit_actors::{AskError, Mailbox, Observe}; use quickwit_indexing::actors::{IndexingService, IndexingServiceCounters}; +use quickwit_proto::control_plane::{ + ControlPlaneError, ControlPlaneService, ControlPlaneServiceClient, SwapIndexingPipelinesEntry, + SwapIndexingPipelinesRequest, SwapIndexingPipelinesResponse, SwapIndexingPipelinesResult, +}; use warp::{Filter, Rejection}; use crate::format::extract_format_from_qs; @@ -24,7 +28,15 @@ use crate::rest::recover_fn; use crate::rest_api_response::into_rest_api_response; #[derive(utoipa::OpenApi)] -#[openapi(paths(indexing_endpoint))] +#[openapi( + paths(indexing_endpoint, swap_pipelines_endpoint), + components(schemas( + SwapIndexingPipelinesRequest, + SwapIndexingPipelinesResponse, + SwapIndexingPipelinesEntry, + SwapIndexingPipelinesResult, + )) +)] pub struct IndexingApi; #[utoipa::path( @@ -59,3 +71,244 @@ pub fn indexing_get_handler( .recover(recover_fn) .boxed() } + +#[utoipa::path( + post, + tag = "Swap pipelines", + path = "/indexing/swap-pipelines", + request_body = SwapIndexingPipelinesRequest, + responses( + (status = 200, description = "Successfully swapped indexing pipelines.", body = SwapIndexingPipelinesResponse) + ) +)] +async fn swap_pipelines_endpoint( + body: SwapIndexingPipelinesRequest, + control_plane_client: ControlPlaneServiceClient, +) -> Result { + control_plane_client.swap_indexing_pipelines(body).await +} + +fn swap_pipelines_post_filter() -> impl Filter + Clone { + warp::path!("indexing" / "swap-pipelines").and(warp::post()) +} + +pub fn swap_pipelines_handler( + control_plane_client: 
ControlPlaneServiceClient, +) -> impl Filter + Clone { + swap_pipelines_post_filter() + .and(warp::body::json()) + .and(warp::any().map(move || control_plane_client.clone())) + .then(swap_pipelines_endpoint) + .and(extract_format_from_qs()) + .map(into_rest_api_response) + .recover(recover_fn) + .boxed() +} + +#[cfg(test)] +mod tests { + use quickwit_proto::control_plane::{ + ControlPlaneServiceClient, MockControlPlaneService, SwapIndexingPipelinesEntry, + SwapIndexingPipelinesRequest, SwapIndexingPipelinesResponse, SwapIndexingPipelinesResult, + }; + use warp::Filter; + + use super::swap_pipelines_handler; + use crate::rest::recover_fn; + + #[tokio::test] + async fn test_swap_pipelines_handler_success() { + let mut mock = MockControlPlaneService::new(); + mock.expect_swap_indexing_pipelines().returning(|request| { + let results = request + .swaps + .iter() + .map(|swap| SwapIndexingPipelinesResult { + swap: Some(swap.clone()), + success: true, + reason: String::new(), + }) + .collect(); + Ok(SwapIndexingPipelinesResponse { results }) + }); + let control_plane_client = ControlPlaneServiceClient::from_mock(mock); + + let handler = swap_pipelines_handler(control_plane_client).recover(recover_fn); + + let body = serde_json::to_vec(&SwapIndexingPipelinesRequest { + swaps: vec![SwapIndexingPipelinesEntry { + left_node_id: "indexer-1".to_string(), + left_index_id: "index-a".to_string(), + right_node_id: "indexer-2".to_string(), + right_index_id: Some("index-b".to_string()), + }], + }) + .unwrap(); + + let resp = warp::test::request() + .method("POST") + .path("/indexing/swap-pipelines") + .header("content-type", "application/json") + .body(body) + .reply(&handler) + .await; + + assert_eq!(resp.status(), 200); + + let response: SwapIndexingPipelinesResponse = serde_json::from_slice(resp.body()).unwrap(); + assert_eq!(response.results.len(), 1); + assert!(response.results[0].success); + let swap = response.results[0].swap.as_ref().unwrap(); + assert_eq!(swap.left_node_id, 
"indexer-1"); + assert_eq!(swap.left_index_id, "index-a"); + assert_eq!(swap.right_node_id, "indexer-2"); + assert_eq!(swap.right_index_id.as_deref(), Some("index-b")); + } + + #[tokio::test] + async fn test_swap_pipelines_handler_partial_failure() { + let mut mock = MockControlPlaneService::new(); + mock.expect_swap_indexing_pipelines().returning(|request| { + let results = request + .swaps + .iter() + .enumerate() + .map(|(i, swap)| { + if i == 0 { + SwapIndexingPipelinesResult { + swap: Some(swap.clone()), + success: true, + reason: String::new(), + } + } else { + SwapIndexingPipelinesResult { + swap: Some(swap.clone()), + success: false, + reason: "pipeline count mismatch".to_string(), + } + } + }) + .collect(); + Ok(SwapIndexingPipelinesResponse { results }) + }); + let control_plane_client = ControlPlaneServiceClient::from_mock(mock); + + let handler = swap_pipelines_handler(control_plane_client).recover(recover_fn); + + let body = serde_json::to_vec(&SwapIndexingPipelinesRequest { + swaps: vec![ + SwapIndexingPipelinesEntry { + left_node_id: "indexer-1".to_string(), + left_index_id: "index-a".to_string(), + right_node_id: "indexer-2".to_string(), + right_index_id: Some("index-b".to_string()), + }, + SwapIndexingPipelinesEntry { + left_node_id: "indexer-3".to_string(), + left_index_id: "index-c".to_string(), + right_node_id: "indexer-4".to_string(), + right_index_id: Some("index-d".to_string()), + }, + ], + }) + .unwrap(); + + let resp = warp::test::request() + .method("POST") + .path("/indexing/swap-pipelines") + .header("content-type", "application/json") + .body(body) + .reply(&handler) + .await; + + assert_eq!(resp.status(), 200); + + let response: SwapIndexingPipelinesResponse = serde_json::from_slice(resp.body()).unwrap(); + assert_eq!(response.results.len(), 2); + assert!(response.results[0].success); + assert!(!response.results[1].success); + assert!( + response.results[1] + .reason + .contains("pipeline count mismatch") + ); + } + + #[tokio::test] + 
async fn test_swap_pipelines_handler_move_without_right_index() { + let mut mock = MockControlPlaneService::new(); + mock.expect_swap_indexing_pipelines().returning(|request| { + let results = request + .swaps + .iter() + .map(|swap| SwapIndexingPipelinesResult { + swap: Some(swap.clone()), + success: true, + reason: String::new(), + }) + .collect(); + Ok(SwapIndexingPipelinesResponse { results }) + }); + let control_plane_client = ControlPlaneServiceClient::from_mock(mock); + + let handler = swap_pipelines_handler(control_plane_client).recover(recover_fn); + + // Send JSON without right_index_id field — should deserialize to None. + let body = r#"{"swaps": [{"left_node_id": "indexer-1", "left_index_id": "index-a", "right_node_id": "indexer-2"}]}"#; + + let resp = warp::test::request() + .method("POST") + .path("/indexing/swap-pipelines") + .header("content-type", "application/json") + .body(body) + .reply(&handler) + .await; + + assert_eq!(resp.status(), 200); + + let response: SwapIndexingPipelinesResponse = serde_json::from_slice(resp.body()).unwrap(); + assert_eq!(response.results.len(), 1); + assert!(response.results[0].success); + let swap = response.results[0].swap.as_ref().unwrap(); + assert_eq!(swap.left_node_id, "indexer-1"); + assert_eq!(swap.left_index_id, "index-a"); + assert_eq!(swap.right_node_id, "indexer-2"); + assert!(swap.right_index_id.is_none()); + } + + #[tokio::test] + async fn test_swap_pipelines_handler_invalid_json_body() { + let mock = MockControlPlaneService::new(); + let control_plane_client = ControlPlaneServiceClient::from_mock(mock); + + let handler = swap_pipelines_handler(control_plane_client).recover(recover_fn); + + let resp = warp::test::request() + .method("POST") + .path("/indexing/swap-pipelines") + .header("content-type", "application/json") + .body(b"not json at all") + .reply(&handler) + .await; + + // Warp returns 400 for invalid JSON bodies. 
+ assert_eq!(resp.status(), 400); + } + + #[tokio::test] + async fn test_swap_pipelines_handler_wrong_method() { + let mock = MockControlPlaneService::new(); + let control_plane_client = ControlPlaneServiceClient::from_mock(mock); + + let handler = swap_pipelines_handler(control_plane_client).recover(recover_fn); + + let resp = warp::test::request() + .method("GET") + .path("/indexing/swap-pipelines") + .reply(&handler) + .await; + + // GET on a POST-only route returns 405. + assert_eq!(resp.status(), 405); + } +} diff --git a/quickwit/quickwit-serve/src/jaeger_api/rest_handler.rs b/quickwit/quickwit-serve/src/jaeger_api/rest_handler.rs index def8a4c6ca7..7755e9e289e 100644 --- a/quickwit/quickwit-serve/src/jaeger_api/rest_handler.rs +++ b/quickwit/quickwit-serve/src/jaeger_api/rest_handler.rs @@ -476,11 +476,11 @@ mod tests { num_hits: 0, hits: Vec::new(), elapsed_time_micros: 0, - errors: Vec::new(), aggregation_postcard: None, scroll_id: None, failed_splits: Vec::new(), num_successful_splits: 1, + splits_by_outcome: None, }) }); let mock_search_service = Arc::new(mock_search_service); @@ -509,11 +509,11 @@ mod tests { num_hits: 0, hits: Vec::new(), elapsed_time_micros: 0, - errors: Vec::new(), aggregation_postcard: None, scroll_id: None, failed_splits: Vec::new(), num_successful_splits: 1, + splits_by_outcome: None, }) }); let mock_search_service = Arc::new(mock_search_service); diff --git a/quickwit/quickwit-serve/src/lib.rs b/quickwit/quickwit-serve/src/lib.rs index 9c7543e2e04..390519d33ae 100644 --- a/quickwit/quickwit-serve/src/lib.rs +++ b/quickwit/quickwit-serve/src/lib.rs @@ -28,6 +28,7 @@ mod indexing_api; mod ingest_api; mod jaeger_api; mod load_shield; + mod metrics; mod metrics_api; mod node_info_handler; @@ -38,6 +39,7 @@ mod rest; mod rest_api_response; mod search_api; pub(crate) mod simple_list; +mod soft_delete_api; pub mod tcp_listener; mod template_api; mod ui_handler; @@ -65,15 +67,17 @@ use quickwit_common::pubsub::{EventBroker, 
EventSubscriptionHandle}; use quickwit_common::rate_limiter::RateLimiterSettings; use quickwit_common::retry::RetryParams; use quickwit_common::runtimes::RuntimesConfig; +use quickwit_common::spawn_named_task; use quickwit_common::tower::{ - BalanceChannel, BoxFutureInfaillible, BufferLayer, Change, CircuitBreakerEvaluator, + BalanceChannel, BoxFutureInfaillible, BoxLayer, BufferLayer, Change, CircuitBreakerEvaluator, ConstantRate, EstimateRateLayer, EventListenerLayer, GrpcMetricsLayer, LoadShedLayer, RateLimitLayer, RetryLayer, RetryPolicy, SmaRateEstimator, TimeoutLayer, }; use quickwit_common::uri::Uri; -use quickwit_common::{get_bool_from_env, spawn_named_task}; use quickwit_config::service::QuickwitService; -use quickwit_config::{ClusterConfig, IngestApiConfig, NodeConfig}; +use quickwit_config::{ + ClusterConfig, IngestApiConfig, NodeConfig, is_delete_task_service_disabled, +}; use quickwit_control_plane::control_plane::{ControlPlane, ControlPlaneEventSubscriber}; use quickwit_control_plane::{IndexerNodeInfo, IndexerPool}; use quickwit_index_management::{IndexService as IndexManager, IndexServiceError}; @@ -101,7 +105,7 @@ use quickwit_proto::ingest::router::IngestRouterServiceClient; use quickwit_proto::ingest::{IngestV2Error, RateLimitingCause}; use quickwit_proto::metastore::{ EntityKind, ListIndexesMetadataRequest, MetastoreError, MetastoreService, - MetastoreServiceClient, + MetastoreServiceClient, MetastoreServiceTowerLayerStack, }; use quickwit_proto::search::ReportSplitsRequest; use quickwit_proto::types::NodeId; @@ -136,7 +140,6 @@ const READINESS_REPORTING_INTERVAL: Duration = if cfg!(any(test, feature = "test const METASTORE_CLIENT_MAX_CONCURRENCY_ENV_KEY: &str = "QW_METASTORE_CLIENT_MAX_CONCURRENCY"; const DEFAULT_METASTORE_CLIENT_MAX_CONCURRENCY: usize = 6; -const DISABLE_DELETE_TASK_SERVICE_ENV_KEY: &str = "QW_DISABLE_DELETE_TASK_SERVICE"; pub type EnvFilterReloadFn = Arc anyhow::Result<()> + Send + Sync>; @@ -152,6 +155,30 @@ fn 
get_metastore_client_max_concurrency() -> usize { ) } +/// Configures per-method retry layers on a metastore tower layer stack. +/// +/// All methods get a standard retry. `stage_splits` and `publish_splits` replace it +/// with a harder retry because those calls happen after significant indexing work and +/// losing them to a transient failure is particularly costly. +fn stack_metastore_retry_layer( + tower: MetastoreServiceTowerLayerStack, +) -> MetastoreServiceTowerLayerStack { + let mut tower = tower.stack_layer(RetryLayer::new(RetryPolicy::from(RetryParams::standard()))); + let harder_retry_params = RetryParams { + base_delay: Duration::from_secs(1), + max_delay: Duration::from_secs(20), + // adding just 2 more retries bumps the retry duration from ~2s to ~12s + max_attempts: 5, + }; + tower.stage_splits_layers = vec![BoxLayer::new(RetryLayer::new(RetryPolicy::from( + harder_retry_params, + )))]; + tower.publish_splits_layers = vec![BoxLayer::new(RetryLayer::new(RetryPolicy::from( + harder_retry_params, + )))]; + tower +} + static CP_GRPC_CLIENT_METRICS_LAYER: Lazy = Lazy::new(|| GrpcMetricsLayer::new("control_plane", "client")); static CP_GRPC_SERVER_METRICS_LAYER: Lazy = @@ -229,27 +256,27 @@ async fn balance_channel_for_service( let service_change_stream = cluster_change_stream.filter_map(move |cluster_change| { Box::pin(async move { match cluster_change { - ClusterChange::Add(node) if node.enabled_services().contains(&service) => { + ClusterChange::Add(node) if node.is_service_enabled(service) => { let chitchat_id = node.chitchat_id(); info!( - node_id = chitchat_id.node_id, + node_id = %chitchat_id.node_id, generation_id = chitchat_id.generation_id, "adding node `{}` to {} pool", chitchat_id.node_id, service.as_str().replace('_', " "), ); - Some(Change::Insert(node.grpc_advertise_addr(), node.channel())) + Some(Change::Insert(node.grpc_advertise_addr, node.channel())) } - ClusterChange::Remove(node) if node.enabled_services().contains(&service) => { + 
ClusterChange::Remove(node) if node.is_service_enabled(service) => { let chitchat_id = node.chitchat_id(); info!( - node_id = chitchat_id.node_id, + node_id = %chitchat_id.node_id, generation_id = chitchat_id.generation_id, "removing node `{}` from {} pool", chitchat_id.node_id, service.as_str().replace('_', " "), ); - Some(Change::Remove(node.grpc_advertise_addr())) + Some(Change::Remove(node.grpc_advertise_addr)) } _ => None, } @@ -317,7 +344,7 @@ async fn start_control_plane_if_needed( ) .await?; - let self_node_id: NodeId = cluster.self_node_id().into(); + let self_node_id: NodeId = cluster.self_node_id(); let control_plane_mailbox = setup_control_plane( universe, @@ -502,8 +529,8 @@ pub async fn serve_quickwit( { bail!("could not find any metastore node in the cluster"); } - MetastoreServiceClient::tower() - .stack_layer(RetryLayer::new(RetryPolicy::from(RetryParams::standard()))) + + stack_metastore_retry_layer(MetastoreServiceClient::tower()) .stack_layer(TimeoutLayer::new(GRPC_METASTORE_SERVICE_TIMEOUT)) .stack_layer(METASTORE_GRPC_CLIENT_METRICS_LAYER.clone()) .stack_layer(tower::limit::GlobalConcurrencyLimitLayer::new( @@ -546,6 +573,7 @@ pub async fn serve_quickwit( ingester_pool.clone(), storage_resolver.clone(), event_broker.clone(), + is_delete_task_service_disabled(), ) .await .context("failed to start indexing service")?; @@ -676,7 +704,7 @@ pub async fn serve_quickwit( search_job_placer, storage_resolver.clone(), event_broker.clone(), - !get_bool_from_env(DISABLE_DELETE_TASK_SERVICE_ENV_KEY, false), + !is_delete_task_service_disabled(), ) .await .context("failed to start janitor service")?; @@ -888,7 +916,7 @@ async fn setup_ingest_v2( ingester_pool: IngesterPool, ) -> anyhow::Result<(IngestRouter, IngestRouterServiceClient, Option)> { // Instantiate ingest router. 
- let self_node_id: NodeId = cluster.self_node_id().into(); + let self_node_id: NodeId = cluster.self_node_id(); let grpc_compression_encoding_opt = node_config.ingest_api_config.grpc_compression_encoding(); let replication_factor = node_config .ingest_api_config @@ -958,12 +986,12 @@ async fn setup_ingest_v2( ClusterChange::Add(node) if node.is_indexer() => { let chitchat_id = node.chitchat_id(); info!( - node_id = chitchat_id.node_id, + node_id = %chitchat_id.node_id, generation_id = chitchat_id.generation_id, "adding node `{}` to ingester pool", chitchat_id.node_id, ); - let node_id: NodeId = node.node_id().into(); + let node_id: NodeId = node.node_id.clone(); if node.is_self_node() { // Here, since the service is available locally, we bypass the network stack @@ -982,7 +1010,7 @@ async fn setup_ingest_v2( .stack_layer(INGEST_GRPC_CLIENT_METRICS_LAYER.clone()) .stack_layer(TimeoutLayer::new(GRPC_INGESTER_SERVICE_TIMEOUT)) .build_from_channel( - node.grpc_advertise_addr(), + node.grpc_advertise_addr, node.channel(), max_message_size, grpc_compression_encoding_opt, @@ -993,12 +1021,12 @@ async fn setup_ingest_v2( ClusterChange::Remove(node) if node.is_indexer() => { let chitchat_id = node.chitchat_id(); info!( - node_id = chitchat_id.node_id, + node_id = %chitchat_id.node_id, generation_id = chitchat_id.generation_id, "removing node `{}` from ingester pool", chitchat_id.node_id, ); - Some(Change::Remove(node.node_id().into())) + Some(Change::Remove(node.node_id.clone())) } _ => None, } @@ -1025,7 +1053,7 @@ async fn setup_searcher( ) .await?; let search_service_clone = search_service.clone(); - let max_message_size = node_config.grpc_config.max_message_size; + let max_message_size = node_config.grpc_config.max_search_message_size; let searcher_change_stream = cluster_change_stream.filter_map(move |cluster_change| { let search_service_clone = search_service_clone.clone(); Box::pin(async move { @@ -1033,12 +1061,12 @@ async fn setup_searcher( 
ClusterChange::Add(node) if node.is_searcher() => { let chitchat_id = node.chitchat_id(); info!( - node_id = chitchat_id.node_id, + node_id = %chitchat_id.node_id, generation_id = chitchat_id.generation_id, "adding node `{}` to searcher pool", chitchat_id.node_id, ); - let grpc_addr = node.grpc_advertise_addr(); + let grpc_addr = node.grpc_advertise_addr; if node.is_self_node() { let search_client = @@ -1058,12 +1086,12 @@ async fn setup_searcher( ClusterChange::Remove(node) if node.is_searcher() => { let chitchat_id = node.chitchat_id(); info!( - node_id = chitchat_id.node_id, + node_id = %chitchat_id.node_id, generation_id = chitchat_id.generation_id, "removing node `{}` from searcher pool", chitchat_id.node_id, ); - Some(Change::Remove(node.grpc_advertise_addr())) + Some(Change::Remove(node.grpc_advertise_addr)) } _ => None, } @@ -1141,7 +1169,7 @@ fn setup_indexer_pool( ClusterChange::Add(node) if node.is_indexer() => { let chitchat_id = node.chitchat_id(); info!( - node_id = chitchat_id.node_id, + node_id = %chitchat_id.node_id, generation_id = chitchat_id.generation_id, "adding node `{}` to indexer pool", chitchat_id.node_id, @@ -1151,9 +1179,9 @@ fn setup_indexer_pool( }; match cluster_change { ClusterChange::Add(node) | ClusterChange::Update(node) if node.is_indexer() => { - let node_id = node.node_id().to_owned(); - let indexing_tasks = node.indexing_tasks().to_vec(); - let indexing_capacity = node.indexing_capacity(); + let node_id = node.node_id.clone(); + let indexing_tasks = node.indexing_tasks.to_vec(); + let indexing_capacity = node.indexing_cpu_capacity; if node.is_self_node() { // Here, since the service is available locally, we bypass the network stack @@ -1185,7 +1213,7 @@ fn setup_indexer_pool( .stack_layer(INDEXING_GRPC_CLIENT_METRICS_LAYER.clone()) .stack_layer(TimeoutLayer::new(GRPC_INDEXING_SERVICE_TIMEOUT)) .build_from_channel( - node.grpc_advertise_addr(), + node.grpc_advertise_addr, node.channel(), max_message_size, None, @@ -1206,12 
+1234,12 @@ fn setup_indexer_pool( ClusterChange::Remove(node) if node.is_indexer() => { let chitchat_id = node.chitchat_id(); info!( - node_id = chitchat_id.node_id, + node_id = %chitchat_id.node_id, generation_id = chitchat_id.generation_id, "removing node `{}` from indexer pool", chitchat_id.node_id, ); - Some(Change::Remove(node.node_id().to_owned())) + Some(Change::Remove(node.node_id.clone())) } _ => None, } @@ -1609,4 +1637,40 @@ mod tests { .unwrap(); assert!(!searcher_client.is_local()); } + + #[tokio::test(start_paused = true)] + async fn test_stack_metastore_retry_layer() { + use quickwit_proto::metastore::{ListIndexesMetadataRequest, MetastoreError}; + + let mut mock = MockMetastoreService::new(); + mock.expect_list_indexes_metadata() + .times(3) + .returning(|_| Err(MetastoreError::Unavailable("transient".to_string()))); + + let client = + stack_metastore_retry_layer(MetastoreServiceClient::tower()).build_from_mock(mock); + let err = client + .list_indexes_metadata(ListIndexesMetadataRequest::default()) + .await + .unwrap_err(); + assert!(matches!(err, MetastoreError::Unavailable(_))); + } + + #[tokio::test(start_paused = true)] + async fn test_stack_metastore_retry_layer_harder_retry() { + use quickwit_proto::metastore::{MetastoreError, StageSplitsRequest}; + + let mut mock = MockMetastoreService::new(); + mock.expect_stage_splits() + .times(5) + .returning(|_| Err(MetastoreError::Unavailable("transient".to_string()))); + + let client = + stack_metastore_retry_layer(MetastoreServiceClient::tower()).build_from_mock(mock); + let err = client + .stage_splits(StageSplitsRequest::default()) + .await + .unwrap_err(); + assert!(matches!(err, MetastoreError::Unavailable(_))); + } } diff --git a/quickwit/quickwit-serve/src/metrics.rs b/quickwit/quickwit-serve/src/metrics.rs index c1e4fa24d93..333e407a8ad 100644 --- a/quickwit/quickwit-serve/src/metrics.rs +++ b/quickwit/quickwit-serve/src/metrics.rs @@ -19,8 +19,8 @@ use quickwit_common::metrics::{ }; pub 
struct ServeMetrics { - pub http_requests_total: IntCounterVec<2>, - pub request_duration_secs: HistogramVec<2>, + pub http_requests_total: IntCounterVec<1>, + pub request_duration_secs: HistogramVec<1>, pub ongoing_requests: IntGaugeVec<1>, pub pending_requests: IntGaugeVec<1>, pub circuit_break_total: IntCounter, @@ -40,14 +40,14 @@ impl Default for ServeMetrics { "Total number of HTTP requests processed.", "", &[], - ["method", "status_code"], + ["status_code"], ), request_duration_secs: new_histogram_vec( "request_duration_secs", "Response time in seconds", "", &[], - ["method", "status_code"], + ["status_code"], // last bucket is 163.84s quickwit_common::metrics::exponential_buckets(0.02, 2.0, 14).unwrap(), ), diff --git a/quickwit/quickwit-serve/src/otlp_api/rest_handler.rs b/quickwit/quickwit-serve/src/otlp_api/rest_handler.rs index 1654a840dad..4ec47c15847 100644 --- a/quickwit/quickwit-serve/src/otlp_api/rest_handler.rs +++ b/quickwit/quickwit-serve/src/otlp_api/rest_handler.rs @@ -25,7 +25,6 @@ use quickwit_proto::opentelemetry::proto::collector::trace::v1::{ use quickwit_proto::types::IndexId; use quickwit_proto::{ServiceError, ServiceErrorCode, tonic}; use serde::{self, Serialize}; -use tracing::error; use warp::{Filter, Rejection}; use crate::decompression::get_body_bytes; diff --git a/quickwit/quickwit-serve/src/rest.rs b/quickwit/quickwit-serve/src/rest.rs index 3f193783b04..96c0f03fefd 100644 --- a/quickwit/quickwit-serve/src/rest.rs +++ b/quickwit/quickwit-serve/src/rest.rs @@ -13,12 +13,17 @@ // limitations under the License. 
use std::fmt::Formatter; +use std::future::Future; use std::io; +use std::pin::Pin; use std::sync::Arc; +use std::task::{Context, Poll, ready}; +use std::time::Instant; use hyper_util::rt::{TokioExecutor, TokioIo}; use hyper_util::server::conn::auto::Builder; use hyper_util::service::TowerToHyperService; +use pin_project::{pin_project, pinned_drop}; use quickwit_common::tower::BoxFutureInfaillible; use quickwit_config::{disable_ingest_v1, enable_ingest_v2}; use quickwit_search::SearchService; @@ -26,12 +31,11 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::{TcpListener, TcpStream}; use tokio_rustls::TlsAcceptor; use tokio_util::either::Either; -use tower::ServiceBuilder; +use tower::{Layer, Service, ServiceBuilder}; use tower_http::compression::CompressionLayer; use tower_http::compression::predicate::{NotForContentType, Predicate, SizeAbove}; use tower_http::cors::{AllowOrigin, CorsLayer}; use tracing::{error, info}; -use warp::filters::log::Info; use warp::hyper::http::HeaderValue; use warp::hyper::{Method, StatusCode, http}; use warp::{Filter, Rejection, Reply, redirect}; @@ -43,7 +47,7 @@ use crate::developer_api::developer_api_routes; use crate::elasticsearch_api::elastic_api_handlers; use crate::health_check_api::health_check_handlers; use crate::index_api::index_management_handlers; -use crate::indexing_api::indexing_get_handler; +use crate::indexing_api::{indexing_get_handler, swap_pipelines_handler}; use crate::ingest_api::ingest_api_handlers; use crate::jaeger_api::jaeger_api_handlers; use crate::metrics_api::metrics_handler; @@ -53,6 +57,7 @@ use crate::rest_api_response::{RestApiError, RestApiResponse}; use crate::search_api::{ search_get_handler, search_plan_get_handler, search_plan_post_handler, search_post_handler, }; +use crate::soft_delete_api::soft_delete_api_handlers; use crate::template_api::index_template_api_handlers; use crate::ui_handler::ui_handler; use crate::{BodyFormat, BuildInfo, QuickwitServices, RuntimeInfo}; @@ -78,6 
+83,111 @@ impl std::fmt::Display for TooManyRequests { } } +/// Tower layer that records HTTP request metrics for every request, including +/// cancelled ones. +#[derive(Clone)] +struct HttpMetricsLayer; + +impl Layer for HttpMetricsLayer { + type Service = HttpMetricsService; + fn layer(&self, inner: S) -> Self::Service { + HttpMetricsService { inner } + } +} + +#[derive(Clone)] +struct HttpMetricsService { + inner: S, +} + +impl Service> for HttpMetricsService +where S: Service< + http::Request, + Response = http::Response, + Error = std::convert::Infallible, + > +{ + type Response = S::Response; + type Error = S::Error; + type Future = HttpMetricsFuture; + + fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll> { + self.inner.poll_ready(cx) + } + + fn call(&mut self, req: http::Request) -> Self::Future { + let method = req.method().to_string(); + let path = req.uri().path().to_string(); + let user_agent = req + .headers() + .get(http::header::USER_AGENT) + .and_then(|h| h.to_str().ok()) + .unwrap_or_default() + .to_string(); + HttpMetricsFuture { + inner: self.inner.call(req), + start: Instant::now(), + method, + status: None, + path, + user_agent, + } + } +} + +#[pin_project(PinnedDrop)] +struct HttpMetricsFuture { + #[pin] + inner: F, + start: Instant, + method: String, + path: String, + user_agent: String, + /// `None` while in-flight (including if dropped before completion). + /// `Some(status)` once the response future resolves. 
+ status: Option, +} + +#[pinned_drop] +impl PinnedDrop for HttpMetricsFuture { + fn drop(self: Pin<&mut Self>) { + let status = self.status.as_deref().unwrap_or("cancelled"); + let duration = self.start.elapsed(); + info!( + method = self.method, + path = self.path, + status = status, + elapsed_ms = duration.as_millis(), + ua = self.user_agent, + "request finished" + ); + crate::SERVE_METRICS + .http_requests_total + .with_label_values([status]) + .inc(); + crate::SERVE_METRICS + .request_duration_secs + .with_label_values([status]) + .observe(duration.as_secs_f64()); + } +} + +impl Future for HttpMetricsFuture +where F: Future, std::convert::Infallible>> +{ + type Output = F::Output; + + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + let result = ready!(this.inner.poll(cx)); + *this.status = Some(match &result { + Ok(response) => response.status().as_str().to_owned(), + Err(infallible) => match *infallible {}, + }); + Poll::Ready(result) + } +} + /// Env variable key to define the minimum size above which a response should be compressed. /// If unset, no compression is applied. 
const QW_MINIMUM_COMPRESSION_SIZE_KEY: &str = "QW_MINIMUM_COMPRESSION_SIZE"; @@ -132,19 +242,6 @@ pub(crate) async fn start_rest_server( readiness_trigger: BoxFutureInfaillible<()>, shutdown_signal: BoxFutureInfaillible<()>, ) -> anyhow::Result<()> { - let request_counter = warp::log::custom(|info: Info| { - let elapsed = info.elapsed(); - let status = info.status(); - let label_values: [&str; 2] = [info.method().as_str(), status.as_str()]; - crate::SERVE_METRICS - .request_duration_secs - .with_label_values(label_values) - .observe(elapsed.as_secs_f64()); - crate::SERVE_METRICS - .http_requests_total - .with_label_values(label_values) - .inc(); - }); // Docs routes let api_doc = warp::path("openapi.json") .and(warp::get()) @@ -199,7 +296,6 @@ pub(crate) async fn start_rest_server( .or(health_check_routes) .or(metrics_routes) .or(developer_routes) - .with(request_counter) .recover(recover_fn_final) .with(extra_headers) .boxed(); @@ -209,6 +305,7 @@ pub(crate) async fn start_rest_server( let cors = build_cors(&quickwit_services.node_config.rest_config.cors_allow_origins); let service = ServiceBuilder::new() + .layer(HttpMetricsLayer) .layer( CompressionLayer::new() .zstd(true) @@ -303,7 +400,10 @@ fn api_v1_routes( !disable_ingest_v1(), enable_ingest_v2(), ) - .or(cluster_handler(quickwit_services.cluster.clone())) + .or(cluster_handler( + quickwit_services.cluster.clone(), + quickwit_services.control_plane_client.clone(), + )) .boxed() .or(node_info_handler( BuildInfo::get(), @@ -315,6 +415,10 @@ fn api_v1_routes( quickwit_services.indexing_service_opt.clone(), )) .boxed() + .or(swap_pipelines_handler( + quickwit_services.control_plane_client.clone(), + )) + .boxed() .or(search_routes(quickwit_services.search_service.clone())) .boxed() .or(ingest_api_handlers( @@ -339,6 +443,11 @@ fn api_v1_routes( quickwit_services.metastore_client.clone(), )) .boxed() + .or(soft_delete_api_handlers( + quickwit_services.search_service.clone(), + 
quickwit_services.metastore_client.clone(), + )) + .boxed() .or(jaeger_api_handlers( quickwit_services.jaeger_service_opt.clone(), )) diff --git a/quickwit/quickwit-serve/src/search_api/grpc_adapter.rs b/quickwit/quickwit-serve/src/search_api/grpc_adapter.rs index c5250ee2465..463c5ca33b5 100644 --- a/quickwit/quickwit-serve/src/search_api/grpc_adapter.rs +++ b/quickwit/quickwit-serve/src/search_api/grpc_adapter.rs @@ -15,15 +15,18 @@ use std::sync::Arc; use async_trait::async_trait; +use futures::stream::{self, StreamExt}; use quickwit_proto::error::convert_to_grpc_result; use quickwit_proto::search::{ GetKvRequest, GetKvResponse, LeafListFieldsRequest, ListFieldsRequest, ListFieldsResponse, ReportSplitsRequest, ReportSplitsResponse, search_service_server as grpc, }; -use quickwit_proto::{set_parent_span_from_request_metadata, tonic}; +use quickwit_proto::{GrpcServiceError, set_parent_span_from_request_metadata, tonic}; use quickwit_search::SearchService; use tracing::instrument; +const FETCH_DOCS_BATCH_SIZE: usize = 500; + #[derive(Clone)] pub struct GrpcSearchAdapter(Arc); @@ -68,6 +71,41 @@ impl grpc::SearchService for GrpcSearchAdapter { convert_to_grpc_result(fetch_docs_result) } + type StreamFetchDocsStream = + quickwit_proto::tonic::codegen::BoxStream; + + #[instrument(skip(self, request))] + async fn stream_fetch_docs( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + set_parent_span_from_request_metadata(request.metadata()); + let fetch_docs_request = request.into_inner(); + + // Call the regular fetch_docs method + let fetch_docs_result = self.0.fetch_docs(fetch_docs_request).await; + + let fetch_docs_response = match fetch_docs_result { + Ok(response) => response, + Err(err) => return Err(err.into_grpc_status()), + }; + + // If there is only one batch, return it directly to avoid copying to a new vec. 
+ if fetch_docs_response.hits.len() <= FETCH_DOCS_BATCH_SIZE { + let batch = quickwit_proto::search::FetchDocsResponse { + hits: fetch_docs_response.hits, + }; + let batch_stream = stream::iter([Ok(batch)]); + return Ok(tonic::Response::new(Box::pin(batch_stream))); + } + + let batch_stream = stream::iter(fetch_docs_response.hits) + .chunks(FETCH_DOCS_BATCH_SIZE) + .map(|batch| Ok(quickwit_proto::search::FetchDocsResponse { hits: batch })); + + Ok(tonic::Response::new(Box::pin(batch_stream))) + } + #[instrument(skip(self, request))] async fn root_list_terms( &self, diff --git a/quickwit/quickwit-serve/src/search_api/rest_handler.rs b/quickwit/quickwit-serve/src/search_api/rest_handler.rs index 671d7a6c2fa..a5192eb76b5 100644 --- a/quickwit/quickwit-serve/src/search_api/rest_handler.rs +++ b/quickwit/quickwit-serve/src/search_api/rest_handler.rs @@ -15,6 +15,7 @@ use std::convert::TryFrom; use std::sync::Arc; +use http::HeaderValue; use percent_encoding::percent_decode_str; use quickwit_config::validate_index_id_pattern; use quickwit_proto::search::{CountHits, SortField, SortOrder}; @@ -246,6 +247,7 @@ mod count_hits_from_bool { pub fn search_request_from_api_request( index_id_patterns: Vec, search_request: SearchRequestQueryString, + user_agent: Option, ) -> Result { // The query ast below may still contain user input query. 
The actual // parsing of the user query will happen in the root service, and might require @@ -269,6 +271,7 @@ pub fn search_request_from_api_request( count_hits: search_request.count_all.into(), ignore_missing_indexes: false, split_id: search_request.split_id, + user_agent, }; Ok(search_request) } @@ -276,10 +279,12 @@ pub fn search_request_from_api_request( async fn search_endpoint( index_id_patterns: Vec, search_request: SearchRequestQueryString, + user_agent: Option, search_service: &dyn SearchService, ) -> Result { let allow_failed_splits = search_request.allow_failed_splits; - let search_request = search_request_from_api_request(index_id_patterns, search_request)?; + let search_request = + search_request_from_api_request(index_id_patterns, search_request, user_agent)?; let search_response = search_service .root_search(search_request) @@ -298,20 +303,24 @@ async fn search_endpoint( } fn search_get_filter() --> impl Filter, SearchRequestQueryString), Error = Rejection> + Clone { +-> impl Filter, SearchRequestQueryString, Option), Error = Rejection> ++ Clone { warp::path!(String / "search") .and_then(extract_index_id_patterns) .and(warp::get()) .and(warp::query()) + .and(warp::header::optional::("user-agent")) } fn search_post_filter() --> impl Filter, SearchRequestQueryString), Error = Rejection> + Clone { +-> impl Filter, SearchRequestQueryString, Option), Error = Rejection> ++ Clone { warp::path!(String / "search") .and_then(extract_index_id_patterns) .and(warp::post()) .and(warp::body::content_length_limit(1024 * 1024)) .and(warp::body::json()) + .and(warp::header::optional::("user-agent")) } fn search_plan_get_filter() @@ -334,11 +343,18 @@ fn search_plan_post_filter() async fn search( index_id_patterns: Vec, search_request: SearchRequestQueryString, + user_agent: Option, search_service: Arc, ) -> impl warp::Reply { info!(request =? 
search_request, "search"); let body_format = search_request.format; - let result = search_endpoint(index_id_patterns, search_request, &*search_service).await; + let result = search_endpoint( + index_id_patterns, + search_request, + user_agent.and_then(|h| h.to_str().ok().map(str::to_owned)), + &*search_service, + ) + .await; into_rest_api_response(result, body_format) } @@ -349,7 +365,8 @@ async fn search_plan( ) -> impl warp::Reply { let body_format = search_request.format; let result: Result = async { - let plan_request = search_request_from_api_request(index_id_patterns, search_request)?; + let plan_request = + search_request_from_api_request(index_id_patterns, search_request, None)?; let plan_response = search_service.search_plan(plan_request).await?; let response = serde_json::from_str(&plan_response.result)?; Ok(response) @@ -503,7 +520,6 @@ mod tests { hits: Vec::new(), snippets: None, elapsed_time_micros: 0u64, - errors: Vec::new(), aggregations: None, }; let search_response_json: JsonValue = serde_json::to_value(search_response)?; @@ -522,7 +538,7 @@ mod tests { #[tokio::test] async fn test_rest_search_api_route_post() { let rest_search_api_filter = search_post_filter(); - let (indexes, req) = warp::test::request() + let (indexes, req, _) = warp::test::request() .method("POST") .path("/quickwit-demo-index/search") .json(&true) @@ -550,7 +566,7 @@ mod tests { #[tokio::test] async fn test_rest_search_api_route_post_multi_indexes() { let rest_search_api_filter = search_post_filter(); - let (indexes, req) = warp::test::request() + let (indexes, req, _) = warp::test::request() .method("POST") .path("/quickwit-demo-index,quickwit-demo,quickwit-demo-index-*/search") .json(&true) @@ -605,7 +621,7 @@ mod tests { #[tokio::test] async fn test_rest_search_api_route_simple() { let rest_search_api_filter = search_get_filter(); - let (indexes, req) = warp::test::request() + let (indexes, req, _) = warp::test::request() .path( 
"/quickwit-demo-index/search?query=*&end_timestamp=1450720000&max_hits=10&\ start_offset=22", @@ -633,7 +649,7 @@ mod tests { #[tokio::test] async fn test_rest_search_api_route_count_all() { let rest_search_api_filter = search_get_filter(); - let (indexes, req) = warp::test::request() + let (indexes, req, _) = warp::test::request() .path("/quickwit-demo-index/search?query=*&count_all=true") .filter(&rest_search_api_filter) .await @@ -651,7 +667,7 @@ mod tests { } ); let rest_search_api_filter = search_get_filter(); - let (indexes, req) = warp::test::request() + let (indexes, req, _) = warp::test::request() .path("/quickwit-demo-index/search?query=*&count_all=false") .filter(&rest_search_api_filter) .await @@ -673,7 +689,7 @@ mod tests { #[tokio::test] async fn test_rest_search_api_route_simple_default_num_hits_default_offset() { let rest_search_api_filter = search_get_filter(); - let (indexes, req) = warp::test::request() + let (indexes, req, _) = warp::test::request() .path( "/quickwit-demo-index/search?query=*&end_timestamp=1450720000&search_field=title,\ body", @@ -701,7 +717,7 @@ mod tests { #[tokio::test] async fn test_rest_search_api_route_simple_format() { let rest_search_api_filter = search_get_filter(); - let (indexes, req) = warp::test::request() + let (indexes, req, _) = warp::test::request() .path("/quickwit-demo-index/search?query=*&format=json") .filter(&rest_search_api_filter) .await @@ -826,7 +842,7 @@ mod tests { "/quickwit-demo-index/search?query=*&format=json&sort_by={sort_by_query_param}" ); let rest_search_api_filter = search_get_filter(); - let (_, req) = warp::test::request() + let (_, req, _) = warp::test::request() .path(&path) .filter(&rest_search_api_filter) .await @@ -840,7 +856,7 @@ mod tests { } let rest_search_api_filter = search_get_filter(); - let (_, req) = warp::test::request() + let (_, req, _) = warp::test::request() .path("/quickwit-demo-index/search?query=*&format=json&sort_by_field=fiel1") .filter(&rest_search_api_filter) 
.await @@ -897,7 +913,6 @@ mod tests { hits: Vec::new(), num_hits: 10, elapsed_time_micros: 16, - errors: Vec::new(), ..Default::default() }) }); @@ -1008,7 +1023,6 @@ mod tests { }], num_hits: 1, elapsed_time_micros: 16, - errors: Vec::new(), ..Default::default() }) }); @@ -1028,7 +1042,6 @@ mod tests { "hits": [{"title": "foo", "body": "foo bar baz"}], "snippets": [{"title": [], "body": ["foo bar baz"]}], "elapsed_time_micros": 16, - "errors": [], }); assert_json_eq!(resp_json, expected_response_json); Ok(()) diff --git a/quickwit/quickwit-serve/src/soft_delete_api/handler.rs b/quickwit/quickwit-serve/src/soft_delete_api/handler.rs new file mode 100644 index 00000000000..b7000237573 --- /dev/null +++ b/quickwit/quickwit-serve/src/soft_delete_api/handler.rs @@ -0,0 +1,373 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use itertools::Itertools; +use quickwit_metastore::IndexMetadataResponseExt; +use quickwit_proto::metastore::{ + IndexMetadataRequest, MetastoreService, MetastoreServiceClient, SoftDeleteDocumentsRequest, + SoftDeleteDocumentsResponse, SplitDocIds, +}; +use quickwit_proto::search::SearchRequest; +use quickwit_proto::types::IndexId; +use quickwit_query::query_ast::query_ast_from_user_text; +use quickwit_search::{SearchError, SearchService}; +use serde::{Deserialize, Serialize}; +use warp::{Filter, Rejection}; + +use crate::format::extract_format_from_qs; +use crate::rest::recover_fn; +use crate::rest_api_response::into_rest_api_response; +use crate::with_arg; + +const MAX_SOFT_DELETED_HITS: u64 = 100; + +#[allow(dead_code)] +#[derive(utoipa::OpenApi)] +#[openapi( + paths(post_soft_delete), + components(schemas(SoftDeleteRequest, SoftDeleteResponse)) +)] +pub struct SoftDeleteApi; + +/// Request body for the soft-delete endpoint. +#[derive(Deserialize, Debug, PartialEq, Eq, Default, utoipa::ToSchema)] +#[serde(deny_unknown_fields)] +pub struct SoftDeleteRequest { + /// Query text in Tantivy query language to match events to soft-delete. + pub query: String, + /// Maximum number of events to soft-delete in a single call (default: 100). + #[serde(default = "default_max_soft_deletes")] + pub max_hits: u64, + /// If set, restrict soft-delete to documents with a `timestamp >= start_timestamp`. + pub start_timestamp: Option, + /// If set, restrict soft-delete to documents with a `timestamp < end_timestamp`. + pub end_timestamp: Option, +} + +fn default_max_soft_deletes() -> u64 { + MAX_SOFT_DELETED_HITS +} + +/// Response from the soft-delete endpoint. +#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, utoipa::ToSchema)] +pub struct SoftDeleteResponse { + /// Total number of doc_ids that were newly soft-deleted across all splits. + pub num_soft_deleted_doc_ids: u64, +} + +/// Top-level filter combining all soft-delete API handlers. 
+pub fn soft_delete_api_handlers( + search_service: Arc, + metastore: MetastoreServiceClient, +) -> impl Filter + Clone { + post_soft_delete_handler(search_service, metastore.clone()) + .recover(recover_fn) + .boxed() +} + +fn post_soft_delete_handler( + search_service: Arc, + metastore: MetastoreServiceClient, +) -> impl Filter + Clone { + warp::path!(String / "soft-delete") + .and(warp::body::json()) + .and(warp::post()) + .and(with_arg(search_service)) + .and(with_arg(metastore)) + .then(post_soft_delete) + .and(extract_format_from_qs()) + .map(into_rest_api_response) +} + +#[utoipa::path( + post, + tag = "Soft Delete", + path = "/{index_id}/soft-delete", + request_body = SoftDeleteRequest, + responses( + (status = 200, description = "Successfully soft-deleted documents.", body = SoftDeleteResponse) + ), + params( + ("index_id" = String, Path, description = "The index ID to soft-delete documents from."), + ) +)] +/// Soft Delete Documents +/// +/// Runs a search query to identify matching documents, then records their internal +/// doc IDs in the metastore so they are excluded from future search results. +pub async fn post_soft_delete( + index_id: IndexId, + request: SoftDeleteRequest, + search_service: Arc, + metastore: MetastoreServiceClient, +) -> Result { + // 1. Build a SearchRequest from the soft-delete query. 
+ // Validate the query and make sure it doesn't require default search fields + let query_ast = query_ast_from_user_text(&request.query, None); + query_ast.clone().parse_user_query(&[])?; + let query_ast_json = serde_json::to_string(&query_ast) + .map_err(|err| SearchError::Internal(format!("failed to serialize query AST: {err}")))?; + + // Enforce a hits limit that guarantees we won't delete + // more than MAX_SOFT_DELETED_HITS per split + let max_hits = if request.max_hits > MAX_SOFT_DELETED_HITS { + MAX_SOFT_DELETED_HITS + } else { + request.max_hits + }; + + let search_request = SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: query_ast_json, + max_hits, + start_timestamp: request.start_timestamp, + end_timestamp: request.end_timestamp, + ..Default::default() + }; + + // 2. Execute root_search to get PartialHits (split_id, doc_id). + let search_response = search_service.root_search(search_request).await?; + + // 3. Group hits by split_id. + let split_doc_ids: Vec = search_response + .hits + .iter() + .filter_map(|hit| hit.partial_hit.as_ref()) + .into_group_map_by(|ph| ph.split_id.clone()) + .into_iter() + .map(|(split_id, hits)| SplitDocIds { + split_id, + doc_ids: hits.into_iter().map(|h| h.doc_id).collect(), + }) + .collect(); + + if split_doc_ids.is_empty() { + return Ok(SoftDeleteResponse { + num_soft_deleted_doc_ids: 0, + }); + } + + // 4. Resolve index_uid. + let index_metadata_request = IndexMetadataRequest::for_index_id(index_id.to_string()); + let index_uid = metastore + .index_metadata(index_metadata_request) + .await + .map_err(|err| SearchError::Internal(format!("failed to fetch index metadata: {err}")))? + .deserialize_index_metadata() + .map_err(|err| { + SearchError::Internal(format!("failed to deserialize index metadata: {err}")) + })? + .index_uid; + + // 5. Store in metastore.
+ let SoftDeleteDocumentsResponse { + num_soft_deleted_doc_ids, + } = metastore + .soft_delete_documents(SoftDeleteDocumentsRequest { + index_uid: Some(index_uid), + split_doc_ids, + }) + .await + .map_err(|err| SearchError::Internal(format!("failed to soft-delete documents: {err}")))?; + + Ok(SoftDeleteResponse { + num_soft_deleted_doc_ids, + }) +} + +#[cfg(test)] +mod tests { + use std::net::{Ipv4Addr, SocketAddr}; + + use quickwit_config::SearcherConfig; + use quickwit_indexing::TestSandbox; + use quickwit_search::{ClusterClient, SearchJobPlacer, SearchServiceImpl, SearcherPool}; + use warp::Filter; + + use super::*; + use crate::rest::recover_fn; + + /// Build a real `Arc` wired to the given `TestSandbox`. + async fn build_search_service(sandbox: &TestSandbox) -> Arc { + let socket_addr = SocketAddr::new(Ipv4Addr::new(127, 0, 0, 1).into(), 7280u16); + let searcher_pool = SearcherPool::default(); + let search_job_placer = SearchJobPlacer::new(searcher_pool.clone()); + let cluster_client = ClusterClient::new(search_job_placer); + let searcher_config = SearcherConfig::default(); + let searcher_context = + Arc::new(quickwit_search::SearcherContext::new(searcher_config, None)); + let search_service: Arc = Arc::new(SearchServiceImpl::new( + sandbox.metastore(), + sandbox.storage_resolver(), + cluster_client, + searcher_context, + )); + let search_service_client = + quickwit_search::SearchServiceClient::from_service(search_service.clone(), socket_addr); + searcher_pool.insert(socket_addr, search_service_client); + search_service + } + + #[tokio::test] + async fn test_soft_delete_api_post_no_matching_docs() { + let index_id = "test-soft-delete-rest"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + - name: body + type: text + mode: lenient + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "", &["title"]) + .await + .unwrap(); + let metastore = test_sandbox.metastore(); + let search_service = 
build_search_service(&test_sandbox).await; + let handler = soft_delete_api_handlers(search_service, metastore).recover(recover_fn); + + // POST a soft-delete query matching no docs → should get 0 + let resp = warp::test::request() + .path("/test-soft-delete-rest/soft-delete") + .method("POST") + .json(&true) + .body(r#"{"query": "title:nonexistent_term_xyz"}"#) + .reply(&handler) + .await; + assert_eq!(resp.status(), 200); + let response: SoftDeleteResponse = serde_json::from_slice(resp.body()).unwrap(); + assert_eq!(response.num_soft_deleted_doc_ids, 0); + + test_sandbox.assert_quit().await; + } + + #[tokio::test] + async fn test_soft_delete_api_post_with_matching_docs() { + let index_id = "test-soft-delete-match"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + mode: lenient + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "", &["title"]) + .await + .unwrap(); + + // Ingest some documents. + let docs = vec![ + serde_json::json!({"title": "apple"}), + serde_json::json!({"title": "banana"}), + serde_json::json!({"title": "cherry"}), + ]; + test_sandbox.add_documents(docs).await.unwrap(); + + let metastore = test_sandbox.metastore(); + let search_service = build_search_service(&test_sandbox).await; + let handler = soft_delete_api_handlers(search_service, metastore).recover(recover_fn); + + // Soft-delete documents matching "apple". 
+ let resp = warp::test::request() + .path("/test-soft-delete-match/soft-delete") + .method("POST") + .json(&true) + .body(r#"{"query": "title:apple"}"#) + .reply(&handler) + .await; + assert_eq!(resp.status(), 200); + let response: SoftDeleteResponse = serde_json::from_slice(resp.body()).unwrap(); + assert_eq!(response.num_soft_deleted_doc_ids, 1); + + test_sandbox.assert_quit().await; + } + + #[tokio::test] + async fn test_soft_delete_api_post_idempotent() { + let index_id = "test-soft-delete-idempotent"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + mode: lenient + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "", &["title"]) + .await + .unwrap(); + + let docs = vec![serde_json::json!({"title": "apple"})]; + test_sandbox.add_documents(docs).await.unwrap(); + + let metastore = test_sandbox.metastore(); + let search_service = build_search_service(&test_sandbox).await; + let handler = soft_delete_api_handlers(search_service, metastore).recover(recover_fn); + + // First soft-delete. + let resp = warp::test::request() + .path("/test-soft-delete-idempotent/soft-delete") + .method("POST") + .json(&true) + .body(r#"{"query": "title:apple"}"#) + .reply(&handler) + .await; + assert_eq!(resp.status(), 200); + let response: SoftDeleteResponse = serde_json::from_slice(resp.body()).unwrap(); + assert_eq!(response.num_soft_deleted_doc_ids, 1); + + // Second soft-delete of same doc — the doc is already excluded from search + // results, so the search won't find it again, yielding 0 new deletions. 
+ let resp = warp::test::request() + .path("/test-soft-delete-idempotent/soft-delete") + .method("POST") + .json(&true) + .body(r#"{"query": "title:apple"}"#) + .reply(&handler) + .await; + assert_eq!(resp.status(), 200); + let response: SoftDeleteResponse = serde_json::from_slice(resp.body()).unwrap(); + assert_eq!(response.num_soft_deleted_doc_ids, 0); + + test_sandbox.assert_quit().await; + } + + #[tokio::test] + async fn test_soft_delete_api_post_deny_unknown_fields() { + let index_id = "test-soft-delete-unknown"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + mode: lenient + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "", &["title"]) + .await + .unwrap(); + let metastore = test_sandbox.metastore(); + let search_service = build_search_service(&test_sandbox).await; + let handler = soft_delete_api_handlers(search_service, metastore).recover(recover_fn); + + // POST with unknown field should fail. + let resp = warp::test::request() + .path("/test-soft-delete-unknown/soft-delete") + .method("POST") + .json(&true) + .body(r#"{"query": "title:apple", "unknown_field": true}"#) + .reply(&handler) + .await; + assert_eq!(resp.status(), 400); + + test_sandbox.assert_quit().await; + } +} diff --git a/quickwit/quickwit-serve/src/soft_delete_api/mod.rs b/quickwit/quickwit-serve/src/soft_delete_api/mod.rs new file mode 100644 index 00000000000..d72811748f5 --- /dev/null +++ b/quickwit/quickwit-serve/src/soft_delete_api/mod.rs @@ -0,0 +1,17 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +mod handler; + +pub use handler::soft_delete_api_handlers; diff --git a/quickwit/quickwit-storage/src/object_storage/policy.rs b/quickwit/quickwit-storage/src/object_storage/policy.rs index 6ce48ab7a94..9f6b0ced2b6 100644 --- a/quickwit/quickwit-storage/src/object_storage/policy.rs +++ b/quickwit/quickwit-storage/src/object_storage/policy.rs @@ -67,9 +67,11 @@ impl MultiPartPolicy { impl Default for MultiPartPolicy { fn default() -> Self { MultiPartPolicy { - // S3 limits part size from 5M to 5GB, we want to end up with as few parts as possible - // since each part is charged as a put request. - target_part_num_bytes: 5_000_000_000, // 5GB + // QW originally used 5GB to limit the number of PUT requests. This + // is a bit excessive, and many cloud providers don't bill by + // request. We don't want it to be too small either because parts + // incur a performance overhead when a range request spans 2 parts. 
+ target_part_num_bytes: 2_000_000_000, // 2GB multipart_threshold_num_bytes: 128 * 1_024 * 1_024, // 128 MiB max_num_parts: 10_000, max_object_num_bytes: 5_000_000_000_000u64, // S3 allows up to 5TB objects diff --git a/quickwit/quickwit-storage/src/object_storage/s3_compatible_storage.rs b/quickwit/quickwit-storage/src/object_storage/s3_compatible_storage.rs index ecce3c795da..d34eedcc5fd 100644 --- a/quickwit/quickwit-storage/src/object_storage/s3_compatible_storage.rs +++ b/quickwit/quickwit-storage/src/object_storage/s3_compatible_storage.rs @@ -17,10 +17,12 @@ use std::ops::Range; use std::path::{Path, PathBuf}; use std::pin::Pin; use std::task::{Context, Poll}; +use std::time::Duration; use std::{fmt, io}; use anyhow::{Context as AnyhhowContext, anyhow}; use async_trait::async_trait; +use aws_config::timeout::TimeoutConfig; use aws_credential_types::provider::SharedCredentialsProvider; use aws_sdk_s3::Client as S3Client; use aws_sdk_s3::config::{Credentials, Region}; @@ -145,7 +147,13 @@ pub async fn create_s3_client(s3_storage_config: &S3StorageConfig) -> S3Client { s3_config.set_retry_config(aws_config.retry_config().cloned()); s3_config.set_sleep_impl(aws_config.sleep_impl()); s3_config.set_stalled_stream_protection(aws_config.stalled_stream_protection()); - s3_config.set_timeout_config(aws_config.timeout_config().cloned()); + s3_config.set_timeout_config(Some( + TimeoutConfig::builder() + .connect_timeout(Duration::from_secs(5)) + .operation_attempt_timeout(Duration::from_secs(900)) // Single attempt timeout + .operation_timeout(Duration::from_secs(1800)) // Total timeout + .build(), + )); if let Some(endpoint) = s3_storage_config.endpoint() { info!(endpoint=%endpoint, "using S3 endpoint defined in storage config or environment variable"); diff --git a/quickwit/quickwit-ui/src/components/IndexSummary.tsx b/quickwit/quickwit-ui/src/components/IndexSummary.tsx index c3eca2da261..7be3b8b01ee 100644 --- a/quickwit/quickwit-ui/src/components/IndexSummary.tsx 
+++ b/quickwit/quickwit-ui/src/components/IndexSummary.tsx @@ -13,7 +13,7 @@ // limitations under the License. import styled from "@emotion/styled"; -import { Paper } from "@mui/material"; +import { Alert, Paper } from "@mui/material"; import dayjs from "dayjs"; import utc from "dayjs/plugin/utc"; import { FC, ReactNode } from "react"; @@ -75,6 +75,12 @@ export function IndexSummary({ index }: { index: Index }) { return ( + {index.split_limit_reached && ( + + Split limit reached. Only the first 10,000 splits were retrieved. + The actual total may be higher. Statistics shown are incomplete. + + )} {dayjs .unix(index.metadata.create_timestamp) diff --git a/quickwit/quickwit-ui/src/services/client.ts b/quickwit/quickwit-ui/src/services/client.ts index cc7643b6687..95baaceed99 100644 --- a/quickwit/quickwit-ui/src/services/client.ts +++ b/quickwit/quickwit-ui/src/services/client.ts @@ -81,7 +81,8 @@ export class Client { ]); return { metadata: metadata, - splits: splits, + splits: splits[0], + split_limit_reached: splits[1], }; } @@ -89,14 +90,16 @@ export class Client { return this.fetch(`${this.apiRoot()}indexes/${indexId}`, {}); } - async getAllSplits(indexId: string): Promise> { + async getAllSplits( + indexId: string, + ): Promise<[Array, boolean]> { // TODO: restrieve all the splits. 
const results: { splits: Array } = await this.fetch( `${this.apiRoot()}indexes/${indexId}/splits?limit=10000`, {}, ); - return results["splits"]; + return [results["splits"], results["splits"].length === 10000]; } async listIndexes(): Promise> { diff --git a/quickwit/quickwit-ui/src/utils/models.ts b/quickwit/quickwit-ui/src/utils/models.ts index 67e77add3de..8abe8acc6e1 100644 --- a/quickwit/quickwit-ui/src/utils/models.ts +++ b/quickwit/quickwit-ui/src/utils/models.ts @@ -282,6 +282,7 @@ export type Range = { export type Index = { metadata: IndexMetadata; splits: SplitMetadata[]; + split_limit_reached: boolean; }; export type Cluster = { diff --git a/quickwit/rest-api-tests/scenarii/aggregations/0001-aggregations.yaml b/quickwit/rest-api-tests/scenarii/aggregations/0001-aggregations.yaml index 40413bbfcec..755e8ae1db1 100644 --- a/quickwit/rest-api-tests/scenarii/aggregations/0001-aggregations.yaml +++ b/quickwit/rest-api-tests/scenarii/aggregations/0001-aggregations.yaml @@ -284,14 +284,16 @@ expected: response: values: - key: 85.0 - value: 100.49456770856702 + value: + $expect: 'abs(val - 100.4945) < 0.1' - doc_count: 2 key: 1422662400000.0 key_as_string: '2015-01-31T00:00:00Z' response: values: - key: 85.0 - value: 30.26717133872237 + value: + $expect: 'abs(val - 30.2617) < 0.1' --- # Test histogram method: [GET] @@ -353,12 +355,16 @@ json: field: "date" expected: aggregations: + # cardinality queries are currently being improved upstream unique_names: - value: 8.0 + value: + $expect: 'abs(val - 8) <= 2' unique_response: - value: 5.0 # TODO: Check. 
The correct number is 6 + value: + $expect: 'abs(val - 6) <= 2' unique_dates: - value: 6.0 + value: + $expect: 'abs(val - 6) <= 3' --- # Test extended stats aggregation method: [GET] diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0006-term_query.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0006-term_query.yaml index 3fe75d61973..9a691dbb75e 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/0006-term_query.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0006-term_query.yaml @@ -61,7 +61,6 @@ expected: relation: "eq" # Also testing numbers, and numbers as string in the JSON query --- -engines: ["elasticsearch"] params: size: 0 json: @@ -170,3 +169,57 @@ expected: total: value: 1 relation: "eq" +--- +params: + size: 0 +json: + track_total_hits: true + query: + term: + payload.commits.distinct: "true" +expected: + hits: + total: + value: 60 + relation: "eq" +--- +params: + size: 0 +json: + track_total_hits: true + query: + term: + payload.commits.distinct: true +expected: + hits: + total: + value: 60 + relation: "eq" +--- +endpoint: "simple_es_compat/_search" +params: + size: 0 +json: + track_total_hits: true + query: + term: + float_field: "1.1" +expected: + hits: + total: + value: 1 + relation: "eq" +--- +endpoint: "simple_es_compat/_search" +params: + size: 0 +json: + track_total_hits: true + query: + term: + float_field: 1.1 +expected: + hits: + total: + value: 1 + relation: "eq" \ No newline at end of file diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0018-search_after.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0018-search_after.yaml index bd24f4fb718..c34cd43d64a 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/0018-search_after.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0018-search_after.yaml @@ -45,24 +45,6 @@ expected: hits: - sort: [9018] --- -# Test with a search after value as string -# Quickwit should convert it to the correct type -json: - 
size: 1 - query: - match_all: {} - sort: - - actor.id: - order: asc - search_after: ["5688"] -expected: - hits: - total: - value: 100 - relation: eq - hits: - - sort: [9018] ---- json: size: 1 query: @@ -93,21 +75,6 @@ expected: hits: $expect: "len(val) == 4" --- -# Quickwit should accept timestamp as string. -json: - size: 100 - track_total_hits: true - query: - match_all: {} - sort: - - created_at: - order: asc - search_after: ["1422748815000"] -expected: - hits: - hits: - $expect: "len(val) == 4" ---- json: size: 100 track_total_hits: true @@ -116,7 +83,7 @@ json: sort: - created_at: order: desc - search_after: ["1422748800001"] + search_after: [1422748800001] expected: hits: hits: diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/_setup.elasticsearch.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/_setup.elasticsearch.yaml index 22d847c8679..553b94d7c81 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/_setup.elasticsearch.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/_setup.elasticsearch.yaml @@ -154,6 +154,6 @@ params: headers: {"Content-Type": "application/json"} ndjson: - {"index":{"_index":"simple_es_compat"}} - - {"keyword_text": "red"} + - {"keyword_text": "red", "float_field": 1.1} - {"index":{"_index":"simple_es_compat"}} - - {"keyword_text": "gold$"} + - {"keyword_text": "gold$", "float_field": 2.2} diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/_setup.quickwit.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/_setup.quickwit.yaml index f84d3587fed..0e1f76b3b28 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/_setup.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/_setup.quickwit.yaml @@ -135,5 +135,5 @@ endpoint: simple_es_compat/ingest params: commit: force ndjson: - - {"keyword_text": "red"} - - {"keyword_text": "gold$"} + - {"keyword_text": "red", "float_field": 1.1} + - {"keyword_text": "gold$", "float_field": 2.2} diff --git 
a/quickwit/rest-api-tests/scenarii/qw_search_api/0001_ts_range.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/0001_ts_range.yaml index dc9765b634e..7dae4d645da 100644 --- a/quickwit/rest-api-tests/scenarii/qw_search_api/0001_ts_range.yaml +++ b/quickwit/rest-api-tests/scenarii/qw_search_api/0001_ts_range.yaml @@ -40,3 +40,9 @@ params: query: "auto_date:>=2023-05-25T00:00:00Z AND auto_date:<2023-05-26T00:00:00Z" expected: num_hits: 2 +--- +endpoint: millisec/search +params: + query: "ts:>=2022-12-16T10:00:57.000Z AND ts:<=2022-12-16T10:00:57.000Z" +expected: + num_hits: 1 \ No newline at end of file diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml index b333ed3c86a..e410ecd96c0 100644 --- a/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml @@ -98,3 +98,31 @@ ndjson: - {"text_raw": "indexed with raw tokenizer dashes"} - {"text_fast": "fast-text-value-dashes"} - {"text_fast": "fast text value whitespaces"} +--- +method: DELETE +endpoint: indexes/millisec +status_code: null +--- +method: POST +endpoint: indexes/ +json: + version: "0.7" + index_id: millisec + doc_mapping: + timestamp_field: ts + mode: strict + field_mappings: + - name: ts + type: datetime + fast: true + input_formats: ["rfc3339"] + fast_precision: milliseconds +--- +method: POST +endpoint: millisec/ingest +params: + commit: force +ndjson: + - {"ts": "2022-12-16T10:00:56.297Z"} + - {"ts": "2022-12-16T10:00:57.000Z"} + - {"ts": "2022-12-16T10:00:57.297Z"} \ No newline at end of file diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml index 56cd2bda8a9..ebfa1c4931b 100644 --- a/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml +++ 
b/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml @@ -3,3 +3,6 @@ endpoint: indexes/simple --- method: DELETE endpoint: indexes/nested +--- +method: DELETE +endpoint: indexes/millisec \ No newline at end of file diff --git a/quickwit/rest-api-tests/scenarii/search_after/0001-search_after_edge_case.yaml b/quickwit/rest-api-tests/scenarii/search_after/0001-search_after_edge_case.yaml index a1e958e0e50..85f6aa999f6 100644 --- a/quickwit/rest-api-tests/scenarii/search_after/0001-search_after_edge_case.yaml +++ b/quickwit/rest-api-tests/scenarii/search_after/0001-search_after_edge_case.yaml @@ -227,9 +227,9 @@ expected: relation: eq hits: - sort: [0] - - sort: [True] - sort: [10.5] - sort: [18000000000000000000] + - sort: [True] --- desc: "search after on mixed column desc match nothing" json: @@ -263,8 +263,8 @@ expected: value: 5 relation: eq hits: - - sort: [True] - sort: [0] - sort: [-10] + diff --git a/quickwit/rust-toolchain.toml b/quickwit/rust-toolchain.toml index e54a09951e9..2a30998f14b 100644 --- a/quickwit/rust-toolchain.toml +++ b/quickwit/rust-toolchain.toml @@ -1,4 +1,4 @@ [toolchain] -channel = "1.91" +channel = "1.93" components = ["cargo", "clippy", "rustfmt", "rust-docs"]