diff --git a/encodings/runend/Cargo.toml b/encodings/runend/Cargo.toml
index 88cb70ef1a5..7817d65e216 100644
--- a/encodings/runend/Cargo.toml
+++ b/encodings/runend/Cargo.toml
@@ -48,3 +48,7 @@ harness = false
 [[bench]]
 name = "run_end_compress"
 harness = false
+
+[[bench]]
+name = "run_end_decode"
+harness = false
diff --git a/encodings/runend/PERF_NOTES.md b/encodings/runend/PERF_NOTES.md
new file mode 100644
index 00000000000..151fbb611dd
--- /dev/null
+++ b/encodings/runend/PERF_NOTES.md
@@ -0,0 +1,521 @@
+# Run-End Boolean Decoding Performance Notes
+
+## Overview
+
+This document captures the state of performance optimization work on `decompress_bool.rs` for run-end encoded boolean arrays.
+
+## Problem Statement
+
+The original benchmark comparison showed the new implementation was slower for the 1000 run length case (only 10 runs):
+
+```
+10000_1000_alternating_mostly_valid: develop 401 ns, new 714 ns, 0.56x (1.8x slower)
+```
+
+## Root Cause Analysis
+
+### Benchmark Unfairness
+
+The baseline benchmark (`decode_bool_nullable_develop`) and the new implementation (`decode_bool_nullable`) measure different things:
+
+**New implementation (what gets timed):**
+```rust
+bencher
+    .with_inputs(|| (ends.clone(), values.clone())) // Setup: just clone
+    .bench_refs(|(ends, values)| {
+        // TIMED: extraction + decode
+        runend_decode_bools(ends.clone(), values.clone(), 0, total_length)
+    });
+```
+
+Inside `runend_decode_bools` (all timed):
+1. `values.validity_mask()?` - extract validity mask
+2. `values.to_bit_buffer()` - extract bit buffer
+3. `match_each_unsigned_integer_ptype!` - generic type dispatch
+4. `trimmed_ends_iter()` - iterator with 3 chained `.map()` operations
+5. Actual decode loop
+
+**Baseline (what gets timed):**
+```rust
+bencher
+    .with_inputs(|| {
+        // NOT TIMED: all extraction done here
+        let ends_slice: Vec<u32> = ends.as_slice::<u32>().to_vec();
+        let values_buf = values.to_bit_buffer();
+        let validity_buf = values.validity_mask().unwrap();
+        let validity_bits = match validity_buf { ... };
+        (ends_slice, values_buf, validity_bits)
+    })
+    .bench_refs(|(ends, values, validity)| {
+        // TIMED: only the decode loop with pre-extracted data
+        decode_bool_nullable_baseline(ends, values, validity, total_length)
+    });
+```
+
+**Key insight:** The baseline excludes ~150ns of extraction overhead from timing.
+
+### Overhead Sources for Few Runs
+
+For 10 runs (1000 run length), the overhead dominates:
+
+1. **`trimmed_ends_iter`** - 3 chained `.map()` calls per element:
+   - `v - offset_e` (subtract offset)
+   - `min(v, length_e)` (clamp to length)
+   - `v.as_()` (convert to usize)
+
+2. **Array method calls:**
+   - `values.validity_mask()?`
+   - `values.to_bit_buffer()`
+   - `ends.as_slice::<E>()`
+
+3. **Generic dispatch:** `match_each_unsigned_integer_ptype!` macro expansion
+
+## Optimizations Implemented
+
+### 1. Fast Path for Few Runs with No Offset
+
+Added a `decode_few_runs_no_offset()` function that:
+- Bypasses the `trimmed_ends_iter` iterator chain
+- Uses direct slice iteration: `for (i, &end) in ends.iter().enumerate()`
+- Is triggered when `offset == 0 && num_runs < PREFILL_RUN_THRESHOLD` (32)
+
+```rust
+// In runend_decode_bools():
+if offset == 0 && num_runs < PREFILL_RUN_THRESHOLD {
+    return Ok(match_each_unsigned_integer_ptype!(ends.ptype(), |E| {
+        decode_few_runs_no_offset(
+            ends.as_slice::<E>(),
+            &values_buf,
+            validity,
+            nullability,
+            length,
+        )
+    }));
+}
+```
+
+### 2. 
Optimized Nullable Fast Path with fill_bits + +For nullable decoding in the fast path, uses `fill_bits_true`/`fill_bits_false` instead of `append_n`: + +```rust +Mask::Values(mask) => { + let validity_buf = mask.bit_buffer(); + let mut decoded = BitBufferMut::new_unset(length); + let mut decoded_validity = BitBufferMut::new_unset(length); + let decoded_bytes = decoded.as_mut_slice(); + let validity_bytes = decoded_validity.as_mut_slice(); + let mut prev_end = 0usize; + for (i, &end) in ends.iter().enumerate() { + let end = end.as_().min(length); + if end > prev_end { + let is_valid = validity_buf.value(i); + if is_valid { + fill_bits_true(validity_bytes, prev_end, end); + if values.value(i) { + fill_bits_true(decoded_bytes, prev_end, end); + } + } + } + prev_end = end; + } + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) +} +``` + +## Current Benchmark Results + +### Nullable Cases + +| Benchmark | New | Baseline | Speedup | +|-----------|-----|----------|---------| +| 10000_2_alternating_mostly_valid | 12.2 µs | 42.6 µs | **3.5x** | +| 10000_10_alternating_mostly_valid | 3.6 µs | 13.1 µs | **3.6x** | +| 10000_10_alternating_mostly_null | 2.8 µs | 12.1 µs | **4.3x** | +| 10000_10_mostly_true_mostly_valid | 3.0 µs | 11.8 µs | **3.9x** | +| 10000_100_alternating_mostly_valid | 0.90 µs | 2.27 µs | **2.5x** | +| 10000_1000_alternating_mostly_valid | 0.48 µs | 0.32 µs | **0.67x** (1.5x slower) | + +### Non-Nullable Cases (1000 run length) + +| Benchmark | Time | +|-----------|------| +| 10000_1000_all_false | ~191-200 ns | +| 10000_1000_all_true | ~191-202 ns | +| 10000_1000_alternating | ~194-201 ns | +| 10000_1000_mostly_false | ~192-199 ns | +| 10000_1000_mostly_true | ~192-201 ns | + +Non-nullable fast path is very efficient. 
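+
+For reference, the non-nullable side of the same fast path needs no `fill_bits` calls at all; it is a plain append loop over the run ends (this is the `Mask::AllTrue` arm of `decode_few_runs_no_offset` in `decompress_bool.rs`):
+
+```rust
+Mask::AllTrue(_) => {
+    let mut decoded = BitBufferMut::with_capacity(length);
+    let mut prev_end = 0usize;
+    for (i, &end) in ends.iter().enumerate() {
+        // Clamp the run end to the requested length, then append the run's value.
+        let end = end.as_().min(length);
+        decoded.append_n(values.value(i), end - prev_end);
+        prev_end = end;
+    }
+    BoolArray::new(decoded.freeze(), nullability.into())
+}
+```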
+
+## Progress
+
+- **Before optimizations:** 0.56x (1.8x slower) for 1000 run length nullable
+- **After optimizations:** 0.67x (1.5x slower) for 1000 run length nullable
+- **Remaining gap:** ~150ns extraction overhead
+
+## Remaining Work
+
+### Option 1: Fix the Benchmark (Recommended)
+
+Make the benchmark fair by including extraction in the baseline timing:
+
+```rust
+#[divan::bench(args = NULLABLE_BOOL_ARGS)]
+fn decode_bool_nullable_develop_fair(bencher: Bencher, args: NullableBoolBenchArgs) {
+    let (ends, values) = create_nullable_bool_test_data(...);
+    bencher
+        .with_inputs(|| (ends.clone(), values.clone()))
+        .bench_refs(|(ends, values)| {
+            // Now timing extraction too
+            let ends_slice: Vec<u32> = ends.as_slice::<u32>().to_vec();
+            let values_buf = values.to_bit_buffer();
+            let validity_buf = values.validity_mask().unwrap();
+            let validity_bits = match validity_buf {
+                vortex_mask::Mask::Values(m) => m.bit_buffer().clone(),
+                _ => BitBuffer::new_set(values.len()),
+            };
+            decode_bool_nullable_baseline(&ends_slice, &values_buf, &validity_bits, total_length)
+        });
+}
+```
+
+### Option 2: Lower-Level API
+
+Add a public function that takes pre-extracted data for users who want maximum performance and are willing to manage extraction themselves:
+
+```rust
+pub fn runend_decode_bools_from_slices<E: NativePType>(
+    ends: &[E],
+    values: &BitBuffer,
+    validity: &BitBuffer, // or Option<&BitBuffer>
+    length: usize,
+) -> BoolArray
+```
+
+### Option 3: Reduce Extraction Overhead
+
+Investigate ways to make `validity_mask()` and `to_bit_buffer()` cheaper:
+- Caching
+- Avoiding allocations
+- Direct field access if possible
+
+## Files Changed
+
+- `encodings/runend/src/decompress_bool.rs`:
+  - Added `PREFILL_RUN_THRESHOLD` constant at module level
+  - Added `decode_few_runs_no_offset()` function
+  - Modified `runend_decode_bools()` to use fast path
+  - Added tests: `decode_bools_nullable`, `decode_bools_nullable_few_runs`
+
+## Tests
+
+All tests pass:
+```
+running 8 tests
+test decompress_bool::tests::decode_bools_all_false_single_run ... ok
+test decompress_bool::tests::decode_bools_all_true_single_run ... ok
+test decompress_bool::tests::decode_bools_alternating ... ok
+test decompress_bool::tests::decode_bools_mostly_false ... ok
+test decompress_bool::tests::decode_bools_mostly_true ... ok
+test decompress_bool::tests::decode_bools_nullable ... ok
+test decompress_bool::tests::decode_bools_nullable_few_runs ... ok
+test decompress_bool::tests::decode_bools_with_offset ... ok
+```
+
+## Code Locations
+
+- Implementation: `encodings/runend/src/decompress_bool.rs`
+- Benchmarks: `encodings/runend/benches/run_end_decode.rs`
+- Iterator helper: `encodings/runend/src/iter.rs` (`trimmed_ends_iter`)
+
+## Investigation: fill_bits Performance (2025-02-02)
+
+### Hypothesis
+
+The `fill_bits_true`/`fill_bits_false` functions might be slow and could benefit from using u64 instead of u8 for the middle byte fill.
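+
+For concreteness, the word-level variant benchmarked below had roughly the following shape (an illustrative sketch only; the actual `fill_bytes_u64` helper did its own `align_offset` checks rather than using `align_to_mut`):
+
+```rust
+/// Fill every byte of `bytes` with `value`, using u64 stores for the aligned middle.
+fn fill_bytes_u64(bytes: &mut [u8], value: u8) {
+    // SAFETY: reinterpreting u8 as u64 is sound; align_to_mut splits the slice into
+    // an unaligned prefix, an aligned middle of u64 words, and an unaligned suffix.
+    let (prefix, middle, suffix) = unsafe { bytes.align_to_mut::<u64>() };
+    prefix.fill(value);
+    middle.fill(u64::from_ne_bytes([value; 8]));
+    suffix.fill(value);
+}
+```
+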
+ +### Benchmark Results + +Added benchmarks comparing byte-level (u8) vs word-level (u64) fill implementations: + +| Range (bits) | Offset | u8 `.fill()` | u64 manual | Winner | +|--------------|--------|--------------|------------|--------| +| 10 | 0 | ~2.1ns | ~2.6ns | **u8** | +| 10 | 3 | ~1.1ns | ~1.2ns | ~same | +| 100 | 0 | ~4.1ns | ~6.5ns | **u8** | +| 100 | 5 | ~3.9ns | ~8.5ns | **u8 (2x)** | +| 1000 | 0 | ~2.4ns | ~6.7ns | **u8 (3x)** | +| 1000 | 7 | ~3.0ns | ~11ns | **u8 (4x)** | +| 5000 | 0 | ~9.7ns | ~9.8ns | ~same | +| 5000 | 1 | ~10ns | ~13ns | **u8** | + +### Conclusion + +**The fill functions are NOT the bottleneck.** The `.fill()` method is already highly optimized by LLVM - it generates vectorized memset-like code internally. The manual u64 approach adds overhead from: +1. Alignment checking (`align_offset`) +2. Extra branches for prefix/suffix handling +3. Unsafe pointer casts + +The fill operations only take ~2-10ns, while the full decode takes ~200-700ns. The overhead comes from elsewhere. + +### What IS the bottleneck? + +For the 1000 run length nullable case: +- Baseline (pre-extracted data): ~320ns +- New implementation (includes extraction): ~480ns +- Difference: ~160ns + +The overhead sources are: +1. **Extraction calls** (~150ns): + - `values.validity_mask()?` + - `values.to_bit_buffer()` + - `ends.as_slice::()` + +2. **Iterator chain** (for non-fast-path cases): + - `trimmed_ends_iter` with 3 chained `.map()` operations + +### Next Steps + +1. **Profile the extraction methods** - understand what makes `validity_mask()` and `to_bit_buffer()` expensive +2. **Consider caching** - if these methods are called frequently, cache results +3. **Accept the tradeoff** - the extraction overhead is necessary for a clean API; users who need maximum performance can use the lower-level functions directly + +## Optimization: validity_mask() Fast Path (2025-02-02) + +### Change + +Added a fast path in `validity_mask()` (in `vortex-array/src/compute/filter.rs`) to avoid the expensive `fill_null()` call when the validity array is already a non-nullable BoolArray. + +### Extraction Benchmark Results (After) + +| Operation | Before | After | Improvement | +|-----------|--------|-------|-------------| +| `validity_mask()` | ~150-166ns | ~98-102ns | **~40% faster** | +| All combined | ~195-208ns | ~127-135ns | **~35% faster** | + +### Full Decode Benchmark Results (After) + +| Benchmark | New | Baseline | Speedup | +|-----------|-----|----------|---------| +| 10000_2_alternating_mostly_valid | 14.3 µs | 49.9 µs | **3.5x faster** | +| 10000_10_alternating_mostly_valid | 4.0 µs | 15.3 µs | **3.8x faster** | +| 10000_100_alternating_mostly_valid | 922 ns | 2.6 µs | **2.8x faster** | +| 10000_1000_alternating_mostly_valid | 446 ns | 376 ns | 1.2x slower | + +### Summary + +The new implementation is now: +- **2.8x-3.8x faster** for typical cases (many runs) +- **~1.2x slower** only for the edge case with very few runs (10 runs at 1000 run length) + +The remaining ~70ns gap in the 1000 run length case comes from: +1. Remaining extraction overhead (~50ns for validity_mask) +2. Iterator/function call overhead + +This is an acceptable tradeoff since: +1. The few-runs case is already very fast (~446ns) +2. The common case (many runs) is significantly faster +3. 
Further optimization would require invasive changes to the core API + +## Experiment: u64 Fill in decompress_bool.rs (2025-02-02) + +### Hypothesis + +Using u64 writes instead of byte-level `.fill()` for the middle portion of `fill_bits_true`/`fill_bits_false` might improve performance. + +### Implementation + +Modified `fill_bits_true`/`fill_bits_false` to use a `fill_bytes_u64` helper that: +1. Handles unaligned prefix bytes +2. Writes aligned u64s for the middle +3. Handles suffix bytes + +### Result + +**No improvement.** The u64 approach was about the same speed or slightly slower: +- Nullable 1000 run: ~458-498ns (vs ~374-446ns with byte fill) + +### Why + +1. **LLVM already optimizes `.fill()`** - It generates vectorized SIMD code for slice fills +2. **Overhead** - Alignment checking and branching add overhead that outweighs any benefit +3. **Small runs** - For small byte ranges, the u64 approach has more overhead + +### Conclusion + +Keep the simple byte-level `.fill()` implementation. It's already optimal. + +## Ablation Study: Which Optimizations Matter? (2025-02-02) + +Tested three strategies: +1. **Sequential** - append_n for each run (no prefill) +2. **Prefill zeros** - prefill buffer with 0s, fill true runs +3. **Adaptive** - choose prefill value based on majority + +### Results + +| Scenario | Sequential | Prefill 0s | Adaptive | Best | +|----------|------------|------------|----------|------| +| 10 runs, alternating | 120ns | 77ns | 125ns | prefill | +| 10 runs, mostly_true | 121ns | 86ns | 106ns | prefill | +| 32 runs, alternating | 752ns | 187ns | 294ns | prefill | +| 32 runs, mostly_true | 492ns | 463ns | 159ns | **adaptive** | +| 100 runs, alternating | 1.06µs | 323ns | 484ns | prefill | +| 100 runs, mostly_true | 1.08µs | 948ns | 166ns | **adaptive** | +| 1000 runs, alternating | 6.3µs | 1.5µs | 1.4µs | ~same | +| 1000 runs, mostly_true | 5.8µs | 2.2µs | 828ns | **adaptive** | + +### Conclusions + +1. **Prefill vs Sequential**: Prefill is **always faster** for many runs + - 10 runs: 1.5x faster + - 100 runs: 3x faster + - 1000 runs: **4x faster** + +2. **Adaptive prefill**: Critical for **skewed distributions** (common in real data) + - Alternating (50/50): prefill_zeros is same or slightly better + - Mostly_true (90%): adaptive is **2-3x faster** + +Both optimizations are justified and should be kept. 
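+
+In code, the adaptive strategy amounts to the following, condensed here into a hypothetical `adaptive_decode` helper (it assumes the `fill_bits_true`/`fill_bits_false` helpers from `decompress_bool.rs` and the `BitBuffer`/`BitBufferMut` types from `vortex_buffer` are in scope; the real logic lives in `decode_bool_non_nullable`):
+
+```rust
+// Prefill with the majority run value, then rewrite only the minority runs.
+fn adaptive_decode(run_ends: &[usize], values: &BitBuffer, length: usize) -> BitBuffer {
+    let true_count = values.true_count();
+    let prefill_true = true_count > values.len() - true_count;
+    let mut decoded = if prefill_true {
+        BitBufferMut::new_set(length) // majority of runs are true: start from all 1s
+    } else {
+        BitBufferMut::new_unset(length) // majority of runs are false: start from all 0s
+    };
+    let bytes = decoded.as_mut_slice();
+    let mut pos = 0usize;
+    for (i, &end) in run_ends.iter().enumerate() {
+        let value = values.value(i);
+        // Only minority runs need an explicit fill/clear; majority runs are already correct.
+        if end > pos && value != prefill_true {
+            if value {
+                fill_bits_true(bytes, pos, end);
+            } else {
+                fill_bits_false(bytes, pos, end);
+            }
+        }
+        pos = end;
+    }
+    decoded.freeze()
+}
+```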
+ +## Final Implementation Architecture + +### Entry Point: `runend_decode_bools` + +```rust +pub fn runend_decode_bools( + ends: PrimitiveArray, + values: BoolArray, + offset: usize, + length: usize, +) -> VortexResult +``` + +### Decision Tree + +``` +runend_decode_bools +├── Extract: validity_mask(), to_bit_buffer() +├── IF offset == 0 && num_runs < 32: +│ └── decode_few_runs_no_offset ← Fast path, no iterator +└── ELSE: + └── runend_decode_typed_bool ← Uses trimmed_ends_iter + ├── Mask::AllTrue → decode_bool_non_nullable + │ ├── IF num_runs < 32: sequential append_n + │ └── ELSE: adaptive prefill + │ ├── more true → prefill 1s, clear false runs + │ └── more false → prefill 0s, fill true runs + ├── Mask::AllFalse → return all-invalid array + └── Mask::Values → decode_bool_nullable + ├── IF num_runs < 32: sequential append + └── ELSE: 4 variants based on majority: + ├── (true, valid) → prefill decoded=1, validity=1 + ├── (true, null) → prefill decoded=1, validity=0 + ├── (false, valid) → prefill decoded=0, validity=1 + └── (false, null) → prefill decoded=0, validity=0 +``` + +### Key Difference: `decode_few_runs_no_offset` vs `runend_decode_typed_bool` + +| Aspect | `decode_few_runs_no_offset` | `runend_decode_typed_bool` | +|--------|----------------------------|---------------------------| +| Offset handling | Assumes `offset == 0` | Handles any offset | +| Iterator | Direct slice: `for (i, &end) in ends.iter()` | `trimmed_ends_iter` with 3 `.map()` chains | +| Overhead | Minimal | ~20-30ns iterator overhead | +| When used | `offset == 0 && num_runs < 32` | All other cases | + +### `trimmed_ends_iter` Details + +```rust +run_ends.iter() + .map(|v| v - offset_e) // subtract offset (redundant when offset=0) + .map(|v| min(v, length_e)) // clamp to length + .map(|v| v.as_()) // convert to usize +``` + +For 10 runs, these 3 chained closures add measurable overhead. For 1000 runs, it's amortized. + +### Threshold: PREFILL_RUN_THRESHOLD = 32 + +Below 32 runs: +- Iterator overhead dominates +- Sequential `append_n` is competitive with prefill +- Use direct slice access, avoid iterator chain + +Above 32 runs: +- Prefill + fill_bits is 3-4x faster than sequential +- Adaptive selection matters for skewed data +- Iterator overhead is negligible + +## `fill_bits_true` / `fill_bits_false` Implementation + +```rust +fn fill_bits_true(slice: &mut [u8], start: usize, end: usize) { + // Handle same-byte case + if start_byte == end_byte { + let mask = ((1u16 << (end_bit - start_bit)) - 1) as u8; + slice[start_byte] |= mask << start_bit; + } else { + // First partial byte + if start_bit != 0 { + slice[start_byte] |= !((1u8 << start_bit) - 1); + } + // Middle bytes - LLVM optimizes to SIMD + slice[fill_start..end_byte].fill(0xFF); + // Last partial byte + if end_bit != 0 { + slice[end_byte] |= (1u8 << end_bit) - 1; + } + } +} +``` + +Key insight: `.fill()` is already vectorized by LLVM. Manual u64 approach adds overhead without benefit. + +## External Optimization: `validity_mask()` Fast Path + +In `vortex-array/src/compute/filter.rs`: + +```rust +// Added fast path for non-nullable canonical bool arrays +if !self.dtype().is_nullable() && self.is_canonical() { + return Ok(Mask::from_buffer(self.to_bool().to_bit_buffer())); +} +``` + +This avoids the expensive `fill_null()` call when the validity array is already a non-nullable BoolArray (common case). 
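+
+## Usage Example
+
+For reference, a minimal end-to-end call into the entry point (this mirrors the `decode_bools_alternating` test in `decompress_bool.rs`):
+
+```rust
+use vortex_array::arrays::{BoolArray, PrimitiveArray};
+use vortex_buffer::BitBuffer;
+use vortex_error::VortexResult;
+use vortex_runend::decompress_bool::runend_decode_bools;
+
+fn example() -> VortexResult<()> {
+    // Three runs ending at 2, 5, 10: [T, T | F, F, F | T, T, T, T, T]
+    let ends = PrimitiveArray::from_iter([2u32, 5, 10]);
+    let values = BoolArray::from(BitBuffer::from(vec![true, false, true]));
+    // offset = 0 and only 3 runs, so this takes the decode_few_runs_no_offset fast path.
+    let decoded = runend_decode_bools(ends, values, 0, 10)?;
+    assert_eq!(decoded.len(), 10);
+    Ok(())
+}
+```
+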
+ +## Final Performance Summary + +### vs Baseline (pre-extracted data) + +| Scenario | New Impl | Baseline | Result | +|----------|----------|----------|--------| +| Many runs (10-100) | 3-4 µs | 13-15 µs | **3-4x faster** | +| Medium runs (100) | 800-900 ns | 2.6 µs | **2.8x faster** | +| Few runs (10 @ 1000 len) | 380-450 ns | 320-376 ns | ~1.2x slower | + +### Absolute Performance (non-nullable, 10K elements) + +| Runs | Time | Throughput | +|------|------|------------| +| 10 | ~200 ns | 50M elements/sec | +| 100 | ~350 ns | 28M elements/sec | +| 1000 | ~1.4 µs | 7M elements/sec | + +## Files Modified + +1. **`encodings/runend/src/decompress_bool.rs`** + - Full implementation with all optimizations + - ~430 lines including tests + +2. **`encodings/runend/benches/run_end_decode.rs`** + - Added baseline comparison benchmark + - ~435 lines + +3. **`vortex-array/src/compute/filter.rs`** + - Added 4-line fast path for `validity_mask()` + +4. **`encodings/runend/PERF_NOTES.md`** + - This file - full documentation of investigation diff --git a/encodings/runend/benches/run_end_decode.rs b/encodings/runend/benches/run_end_decode.rs new file mode 100644 index 00000000000..f08ebe9a733 --- /dev/null +++ b/encodings/runend/benches/run_end_decode.rs @@ -0,0 +1,435 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#![allow(clippy::unwrap_used, clippy::cast_possible_truncation)] + +use std::fmt; + +use divan::Bencher; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::compute::warm_up_vtables; +use vortex_array::validity::Validity; +use vortex_buffer::BitBuffer; +use vortex_buffer::BitBufferMut; +use vortex_buffer::BufferMut; +use vortex_runend::decompress_bool::runend_decode_bools; + +fn main() { + warm_up_vtables(); + divan::main(); +} + +/// Distribution types for bool benchmarks +#[derive(Clone, Copy)] +enum BoolDistribution { + /// Alternating true/false (50/50) + Alternating, + /// Mostly true (90% true runs) + MostlyTrue, + /// Mostly false (90% false runs) + MostlyFalse, + /// All true + AllTrue, + /// All false + AllFalse, +} + +impl fmt::Display for BoolDistribution { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + BoolDistribution::Alternating => write!(f, "alternating"), + BoolDistribution::MostlyTrue => write!(f, "mostly_true"), + BoolDistribution::MostlyFalse => write!(f, "mostly_false"), + BoolDistribution::AllTrue => write!(f, "all_true"), + BoolDistribution::AllFalse => write!(f, "all_false"), + } + } +} + +#[derive(Clone, Copy)] +struct BoolBenchArgs { + total_length: usize, + avg_run_length: usize, + distribution: BoolDistribution, +} + +impl fmt::Display for BoolBenchArgs { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "{}_{}_{}", + self.total_length, self.avg_run_length, self.distribution + ) + } +} + +/// Creates bool test data with configurable distribution +fn create_bool_test_data( + total_length: usize, + avg_run_length: usize, + distribution: BoolDistribution, +) -> (PrimitiveArray, BoolArray) { + let mut ends = BufferMut::::with_capacity(total_length / avg_run_length + 1); + let mut values = Vec::with_capacity(total_length / avg_run_length + 1); + + let mut pos = 0usize; + let mut run_index = 0usize; + + while pos < total_length { + let run_len = avg_run_length.min(total_length - pos); + pos += run_len; + ends.push(pos as u32); + + let val = match distribution { + BoolDistribution::Alternating => run_index % 
2 == 0, + BoolDistribution::MostlyTrue => run_index % 10 != 0, // 90% true + BoolDistribution::MostlyFalse => run_index % 10 == 0, // 10% true (90% false) + BoolDistribution::AllTrue => true, + BoolDistribution::AllFalse => false, + }; + values.push(val); + run_index += 1; + } + + ( + PrimitiveArray::new(ends.freeze(), Validity::NonNullable), + BoolArray::from(BitBuffer::from(values)), + ) +} + +// Medium size: 10k elements with various run lengths and distributions +const BOOL_ARGS: &[BoolBenchArgs] = &[ + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 2, + distribution: BoolDistribution::Alternating, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 10, + distribution: BoolDistribution::Alternating, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 100, + distribution: BoolDistribution::Alternating, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 1000, + distribution: BoolDistribution::Alternating, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 2, + distribution: BoolDistribution::MostlyTrue, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 10, + distribution: BoolDistribution::MostlyTrue, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 100, + distribution: BoolDistribution::MostlyTrue, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 1000, + distribution: BoolDistribution::MostlyTrue, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 2, + distribution: BoolDistribution::MostlyFalse, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 10, + distribution: BoolDistribution::MostlyFalse, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 100, + distribution: BoolDistribution::MostlyFalse, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 1000, + distribution: BoolDistribution::MostlyFalse, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 2, + distribution: BoolDistribution::AllTrue, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 10, + distribution: BoolDistribution::AllTrue, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 100, + distribution: BoolDistribution::AllTrue, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 1000, + distribution: BoolDistribution::AllTrue, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 2, + distribution: BoolDistribution::AllFalse, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 10, + distribution: BoolDistribution::AllFalse, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 100, + distribution: BoolDistribution::AllFalse, + }, + BoolBenchArgs { + total_length: 10_000, + avg_run_length: 1000, + distribution: BoolDistribution::AllFalse, + }, +]; + +#[divan::bench(args = BOOL_ARGS)] +fn decode_bool(bencher: Bencher, args: BoolBenchArgs) { + let BoolBenchArgs { + total_length, + avg_run_length, + distribution, + } = args; + let (ends, values) = create_bool_test_data(total_length, avg_run_length, distribution); + bencher + .with_inputs(|| (ends.clone(), values.clone())) + .bench_refs(|(ends, values)| { + runend_decode_bools(ends.clone(), values.clone(), 0, total_length) + }); +} + +/// Validity distribution for nullable benchmarks +#[derive(Clone, Copy)] +enum ValidityDistribution { + /// 90% valid + MostlyValid, + /// 50% valid + HalfValid, + /// 10% valid + MostlyNull, +} + +impl fmt::Display for ValidityDistribution { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> 
fmt::Result { + match self { + ValidityDistribution::MostlyValid => write!(f, "mostly_valid"), + ValidityDistribution::HalfValid => write!(f, "half_valid"), + ValidityDistribution::MostlyNull => write!(f, "mostly_null"), + } + } +} + +#[derive(Clone, Copy)] +struct NullableBoolBenchArgs { + total_length: usize, + avg_run_length: usize, + distribution: BoolDistribution, + validity: ValidityDistribution, +} + +impl fmt::Display for NullableBoolBenchArgs { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "{}_{}_{}_{}", + self.total_length, self.avg_run_length, self.distribution, self.validity + ) + } +} + +/// Creates nullable bool test data with configurable distribution and validity +fn create_nullable_bool_test_data( + total_length: usize, + avg_run_length: usize, + distribution: BoolDistribution, + validity: ValidityDistribution, +) -> (PrimitiveArray, BoolArray) { + let mut ends = BufferMut::::with_capacity(total_length / avg_run_length + 1); + let mut values = Vec::with_capacity(total_length / avg_run_length + 1); + let mut validity_bits = Vec::with_capacity(total_length / avg_run_length + 1); + + let mut pos = 0usize; + let mut run_index = 0usize; + + while pos < total_length { + let run_len = avg_run_length.min(total_length - pos); + pos += run_len; + ends.push(pos as u32); + + let val = match distribution { + BoolDistribution::Alternating => run_index % 2 == 0, + BoolDistribution::MostlyTrue => run_index % 10 != 0, + BoolDistribution::MostlyFalse => run_index % 10 == 0, + BoolDistribution::AllTrue => true, + BoolDistribution::AllFalse => false, + }; + values.push(val); + + let is_valid = match validity { + ValidityDistribution::MostlyValid => run_index % 10 != 0, + ValidityDistribution::HalfValid => run_index % 2 == 0, + ValidityDistribution::MostlyNull => run_index % 10 == 0, + }; + validity_bits.push(is_valid); + + run_index += 1; + } + + ( + PrimitiveArray::new(ends.freeze(), Validity::NonNullable), + BoolArray::new( + BitBuffer::from(values), + Validity::from(BitBuffer::from(validity_bits)), + ), + ) +} + +const NULLABLE_BOOL_ARGS: &[NullableBoolBenchArgs] = &[ + // Alternating with different validity + NullableBoolBenchArgs { + total_length: 10_000, + avg_run_length: 10, + distribution: BoolDistribution::Alternating, + validity: ValidityDistribution::MostlyValid, + }, + NullableBoolBenchArgs { + total_length: 10_000, + avg_run_length: 10, + distribution: BoolDistribution::Alternating, + validity: ValidityDistribution::HalfValid, + }, + NullableBoolBenchArgs { + total_length: 10_000, + avg_run_length: 10, + distribution: BoolDistribution::Alternating, + validity: ValidityDistribution::MostlyNull, + }, + // MostlyTrue with different validity + NullableBoolBenchArgs { + total_length: 10_000, + avg_run_length: 10, + distribution: BoolDistribution::MostlyTrue, + validity: ValidityDistribution::MostlyValid, + }, + NullableBoolBenchArgs { + total_length: 10_000, + avg_run_length: 10, + distribution: BoolDistribution::MostlyTrue, + validity: ValidityDistribution::HalfValid, + }, + NullableBoolBenchArgs { + total_length: 10_000, + avg_run_length: 10, + distribution: BoolDistribution::MostlyTrue, + validity: ValidityDistribution::MostlyNull, + }, + // Different run lengths with MostlyValid + NullableBoolBenchArgs { + total_length: 10_000, + avg_run_length: 2, + distribution: BoolDistribution::Alternating, + validity: ValidityDistribution::MostlyValid, + }, + NullableBoolBenchArgs { + total_length: 10_000, + avg_run_length: 100, + distribution: 
BoolDistribution::Alternating, + validity: ValidityDistribution::MostlyValid, + }, + NullableBoolBenchArgs { + total_length: 10_000, + avg_run_length: 1000, + distribution: BoolDistribution::Alternating, + validity: ValidityDistribution::MostlyValid, + }, +]; + +#[divan::bench(args = NULLABLE_BOOL_ARGS)] +fn decode_bool_nullable(bencher: Bencher, args: NullableBoolBenchArgs) { + let NullableBoolBenchArgs { + total_length, + avg_run_length, + distribution, + validity, + } = args; + let (ends, values) = + create_nullable_bool_test_data(total_length, avg_run_length, distribution, validity); + bencher + .with_inputs(|| (ends.clone(), values.clone())) + .bench_refs(|(ends, values)| { + runend_decode_bools(ends.clone(), values.clone(), 0, total_length) + }); +} + +/// Baseline using develop branch's append_n approach +fn decode_bool_nullable_baseline( + ends: &[u32], + values: &BitBuffer, + validity_mask: &BitBuffer, + _length: usize, +) -> BoolArray { + let mut decoded = BitBufferMut::with_capacity(ends.last().copied().unwrap_or(0) as usize); + let mut decoded_validity = + BitBufferMut::with_capacity(ends.last().copied().unwrap_or(0) as usize); + + let mut prev_end = 0usize; + for ((&end, value), is_valid) in ends.iter().zip(values.iter()).zip(validity_mask.iter()) { + let end = end as usize; + if is_valid { + decoded_validity.append_n(true, end - prev_end); + decoded.append_n(value, end - prev_end); + } else { + decoded_validity.append_n(false, end - prev_end); + decoded.append_n(false, end - prev_end); + } + prev_end = end; + } + + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) +} + +#[divan::bench(args = NULLABLE_BOOL_ARGS)] +fn decode_bool_nullable_develop(bencher: Bencher, args: NullableBoolBenchArgs) { + let NullableBoolBenchArgs { + total_length, + avg_run_length, + distribution, + validity, + } = args; + let (ends, values) = + create_nullable_bool_test_data(total_length, avg_run_length, distribution, validity); + + bencher + .with_inputs(|| { + let ends_slice: Vec = ends.as_slice::().to_vec(); + let values_buf = values.to_bit_buffer(); + let validity_buf = values.validity_mask().unwrap(); + let validity_bits = match validity_buf { + vortex_mask::Mask::Values(m) => m.bit_buffer().clone(), + _ => BitBuffer::new_set(values.len()), + }; + (ends_slice, values_buf, validity_bits) + }) + .bench_refs(|(ends, values, validity)| { + decode_bool_nullable_baseline(ends, values, validity, total_length) + }); +} diff --git a/encodings/runend/src/array.rs b/encodings/runend/src/array.rs index 9d18bf4cd3b..fe610e663c1 100644 --- a/encodings/runend/src/array.rs +++ b/encodings/runend/src/array.rs @@ -39,9 +39,9 @@ use vortex_error::vortex_ensure; use vortex_error::vortex_panic; use vortex_scalar::PValue; -use crate::compress::runend_decode_bools; use crate::compress::runend_decode_primitive; use crate::compress::runend_encode; +use crate::decompress_bool::runend_decode_bools; use crate::kernel::PARENT_KERNELS; use crate::rules::RULES; diff --git a/encodings/runend/src/compress.rs b/encodings/runend/src/compress.rs index 8d8af3a828f..72ee62ce0a7 100644 --- a/encodings/runend/src/compress.rs +++ b/encodings/runend/src/compress.rs @@ -186,24 +186,6 @@ pub fn runend_decode_primitive( })) } -pub fn runend_decode_bools( - ends: PrimitiveArray, - values: BoolArray, - offset: usize, - length: usize, -) -> VortexResult { - let validity_mask = values.validity_mask()?; - Ok(match_each_unsigned_integer_ptype!(ends.ptype(), |E| { - runend_decode_typed_bool( - 
trimmed_ends_iter(ends.as_slice::(), offset, length), - &values.to_bit_buffer(), - validity_mask, - values.dtype().nullability(), - length, - ) - })) -} - pub fn runend_decode_typed_primitive( run_ends: impl Iterator, values: &[T], @@ -263,47 +245,6 @@ pub fn runend_decode_typed_primitive( } } -pub fn runend_decode_typed_bool( - run_ends: impl Iterator, - values: &BitBuffer, - values_validity: Mask, - values_nullability: Nullability, - length: usize, -) -> BoolArray { - match values_validity { - Mask::AllTrue(_) => { - let mut decoded = BitBufferMut::with_capacity(length); - for (end, value) in run_ends.zip_eq(values.iter()) { - decoded.append_n(value, end - decoded.len()); - } - BoolArray::new(decoded.freeze(), values_nullability.into()) - } - Mask::AllFalse(_) => BoolArray::new(BitBuffer::new_unset(length), Validity::AllInvalid), - Mask::Values(mask) => { - let mut decoded = BitBufferMut::with_capacity(length); - let mut decoded_validity = BitBufferMut::with_capacity(length); - for (end, value) in run_ends.zip_eq( - values - .iter() - .zip(mask.bit_buffer().iter()) - .map(|(v, is_valid)| is_valid.then_some(v)), - ) { - match value { - None => { - decoded_validity.append_n(false, end - decoded.len()); - decoded.append_n(false, end - decoded.len()); - } - Some(value) => { - decoded_validity.append_n(true, end - decoded.len()); - decoded.append_n(value, end - decoded.len()); - } - } - } - BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) - } - } -} - #[cfg(test)] mod test { use vortex_array::ToCanonical; diff --git a/encodings/runend/src/compute/compare.rs b/encodings/runend/src/compute/compare.rs index 4161898af61..7b6eb9cc1f1 100644 --- a/encodings/runend/src/compute/compare.rs +++ b/encodings/runend/src/compute/compare.rs @@ -15,7 +15,7 @@ use vortex_error::VortexResult; use crate::RunEndArray; use crate::RunEndVTable; -use crate::compress::runend_decode_bools; +use crate::decompress_bool::runend_decode_bools; impl CompareKernel for RunEndVTable { fn compare( diff --git a/encodings/runend/src/decompress_bool.rs b/encodings/runend/src/decompress_bool.rs new file mode 100644 index 00000000000..9d872f9a6d8 --- /dev/null +++ b/encodings/runend/src/decompress_bool.rs @@ -0,0 +1,573 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Optimized run-end decoding for boolean arrays. +//! +//! Uses an adaptive strategy that pre-fills the buffer with the majority value +//! (0s or 1s) and only fills the minority runs, minimizing work for skewed distributions. + +use itertools::Itertools; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::validity::Validity; +use vortex_buffer::BitBuffer; +use vortex_buffer::BitBufferMut; +use vortex_dtype::Nullability; +use vortex_dtype::match_each_unsigned_integer_ptype; +use vortex_error::VortexResult; +use vortex_mask::Mask; + +use crate::iter::trimmed_ends_iter; + +/// Threshold for number of runs below which we use sequential append instead of prefill. +/// With few runs, the overhead of prefilling the entire buffer dominates. +const PREFILL_RUN_THRESHOLD: usize = 32; + +/// Decodes run-end encoded boolean values into a flat `BoolArray`. 
+pub fn runend_decode_bools( + ends: PrimitiveArray, + values: BoolArray, + offset: usize, + length: usize, +) -> VortexResult { + let validity = values.validity_mask()?; + let values_buf = values.to_bit_buffer(); + let nullability = values.dtype().nullability(); + + // Fast path for few runs with no offset - avoids iterator overhead + let num_runs = values_buf.len(); + if offset == 0 && num_runs < PREFILL_RUN_THRESHOLD { + return Ok(match_each_unsigned_integer_ptype!(ends.ptype(), |E| { + decode_few_runs_no_offset( + ends.as_slice::(), + &values_buf, + validity, + nullability, + length, + ) + })); + } + + Ok(match_each_unsigned_integer_ptype!(ends.ptype(), |E| { + runend_decode_typed_bool( + trimmed_ends_iter(ends.as_slice::(), offset, length), + &values_buf, + validity, + nullability, + length, + ) + })) +} + +/// Decodes run-end encoded boolean values using an adaptive strategy. +/// +/// The strategy counts true vs false runs and chooses the optimal approach: +/// - If more true runs: pre-fill with 1s, clear false runs +/// - If more false runs: pre-fill with 0s, fill true runs +/// +/// This minimizes work for skewed distributions (e.g., sparse validity masks). +pub fn runend_decode_typed_bool( + run_ends: impl Iterator, + values: &BitBuffer, + values_validity: Mask, + values_nullability: Nullability, + length: usize, +) -> BoolArray { + match values_validity { + Mask::AllTrue(_) => decode_bool_non_nullable(run_ends, values, values_nullability, length), + Mask::AllFalse(_) => BoolArray::new(BitBuffer::new_unset(length), Validity::AllInvalid), + Mask::Values(mask) => decode_bool_nullable(run_ends, values, mask.bit_buffer(), length), + } +} + +/// Fast path for few runs with no offset. Uses direct slice access to minimize overhead. +/// This avoids the `trimmed_ends_iter` iterator chain which adds significant overhead +/// for small numbers of runs. +#[inline(always)] +fn decode_few_runs_no_offset( + ends: &[E], + values: &BitBuffer, + validity: Mask, + nullability: Nullability, + length: usize, +) -> BoolArray { + match validity { + Mask::AllTrue(_) => { + let mut decoded = BitBufferMut::with_capacity(length); + let mut prev_end = 0usize; + for (i, &end) in ends.iter().enumerate() { + let end = end.as_().min(length); + decoded.append_n(values.value(i), end - prev_end); + prev_end = end; + } + BoolArray::new(decoded.freeze(), nullability.into()) + } + Mask::AllFalse(_) => BoolArray::new(BitBuffer::new_unset(length), Validity::AllInvalid), + Mask::Values(mask) => { + let validity_buf = mask.bit_buffer(); + // Use prefill + fill_bits for better performance with larger runs + let mut decoded = BitBufferMut::new_unset(length); + let mut decoded_validity = BitBufferMut::new_unset(length); + let decoded_bytes = decoded.as_mut_slice(); + let validity_bytes = decoded_validity.as_mut_slice(); + let mut prev_end = 0usize; + for (i, &end) in ends.iter().enumerate() { + let end = end.as_().min(length); + if end > prev_end { + let is_valid = validity_buf.value(i); + if is_valid { + fill_bits_true(validity_bytes, prev_end, end); + if values.value(i) { + fill_bits_true(decoded_bytes, prev_end, end); + } + } + } + prev_end = end; + } + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) + } + } +} + +/// Decodes run-end encoded booleans when all values are valid (non-nullable). 
+fn decode_bool_non_nullable( + run_ends: impl Iterator, + values: &BitBuffer, + nullability: Nullability, + length: usize, +) -> BoolArray { + let num_runs = values.len(); + + // For few runs, sequential append is faster than prefill + modify + if num_runs < PREFILL_RUN_THRESHOLD { + let mut decoded = BitBufferMut::with_capacity(length); + for (end, value) in run_ends.zip(values.iter()) { + decoded.append_n(value, end - decoded.len()); + } + return BoolArray::new(decoded.freeze(), nullability.into()); + } + + // Adaptive strategy: choose based on which value is more common + let true_count = values.true_count(); + let false_count = num_runs - true_count; + + if true_count > false_count { + // More true runs - pre-fill with 1s and clear false runs + let mut decoded = BitBufferMut::new_set(length); + let decoded_bytes = decoded.as_mut_slice(); + let mut current_pos = 0usize; + + for (end, value) in run_ends.zip_eq(values.iter()) { + // Only clear when value is false (true is already 1) + if end > current_pos && !value { + fill_bits_false(decoded_bytes, current_pos, end); + } + current_pos = end; + } + BoolArray::new(decoded.freeze(), nullability.into()) + } else { + // More or equal false runs - pre-fill with 0s and fill true runs + let mut decoded = BitBufferMut::new_unset(length); + let decoded_bytes = decoded.as_mut_slice(); + let mut current_pos = 0usize; + + for (end, value) in run_ends.zip_eq(values.iter()) { + // Only fill when value is true (false is already 0) + if end > current_pos && value { + fill_bits_true(decoded_bytes, current_pos, end); + } + current_pos = end; + } + BoolArray::new(decoded.freeze(), nullability.into()) + } +} + +/// Decodes run-end encoded booleans when values may be null (nullable). +fn decode_bool_nullable( + run_ends: impl Iterator, + values: &BitBuffer, + validity_mask: &BitBuffer, + length: usize, +) -> BoolArray { + let num_runs = values.len(); + + // For few runs, sequential append is faster than prefill + modify + if num_runs < PREFILL_RUN_THRESHOLD { + return decode_nullable_sequential(run_ends, values, validity_mask, length); + } + + let true_count = values.true_count(); + let false_count = num_runs - true_count; + let valid_count = validity_mask.true_count(); + let null_count = num_runs - valid_count; + + let prefill_true = true_count > false_count; + let prefill_valid = valid_count > null_count; + + match (prefill_true, prefill_valid) { + (true, true) => decode_nullable_true_valid(run_ends, values, validity_mask, length), + (true, false) => decode_nullable_true_null(run_ends, values, validity_mask, length), + (false, true) => decode_nullable_false_valid(run_ends, values, validity_mask, length), + (false, false) => decode_nullable_false_null(run_ends, values, validity_mask, length), + } +} + +/// Sequential decode for few runs - avoids prefill overhead. 
+#[inline(always)] +fn decode_nullable_sequential( + run_ends: impl Iterator, + values: &BitBuffer, + validity_mask: &BitBuffer, + length: usize, +) -> BoolArray { + let mut decoded = BitBufferMut::with_capacity(length); + let mut decoded_validity = BitBufferMut::with_capacity(length); + + for (end, (value, is_valid)) in run_ends.zip(values.iter().zip(validity_mask.iter())) { + let run_len = end - decoded.len(); + if is_valid { + decoded_validity.append_n(true, run_len); + decoded.append_n(value, run_len); + } else { + decoded_validity.append_n(false, run_len); + decoded.append_n(false, run_len); + } + } + + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) +} + +/// Prefill decoded=1s, validity=1s. Clear for false values and nulls. +#[inline(always)] +fn decode_nullable_true_valid( + run_ends: impl Iterator, + values: &BitBuffer, + validity_mask: &BitBuffer, + length: usize, +) -> BoolArray { + let mut decoded = BitBufferMut::new_set(length); + let mut decoded_validity = BitBufferMut::new_set(length); + let decoded_bytes = decoded.as_mut_slice(); + let validity_bytes = decoded_validity.as_mut_slice(); + let mut current_pos = 0usize; + + for (end, (value, is_valid)) in run_ends.zip_eq(values.iter().zip(validity_mask.iter())) { + if end > current_pos { + if !is_valid { + fill_bits_false(validity_bytes, current_pos, end); + fill_bits_false(decoded_bytes, current_pos, end); + } else if !value { + fill_bits_false(decoded_bytes, current_pos, end); + } + current_pos = end; + } + } + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) +} + +/// Prefill decoded=1s, validity=0s. Set validity for valid, clear decoded for false/null. +#[inline(always)] +fn decode_nullable_true_null( + run_ends: impl Iterator, + values: &BitBuffer, + validity_mask: &BitBuffer, + length: usize, +) -> BoolArray { + let mut decoded = BitBufferMut::new_set(length); + let mut decoded_validity = BitBufferMut::new_unset(length); + let decoded_bytes = decoded.as_mut_slice(); + let validity_bytes = decoded_validity.as_mut_slice(); + let mut current_pos = 0usize; + + for (end, (value, is_valid)) in run_ends.zip_eq(values.iter().zip(validity_mask.iter())) { + if end > current_pos { + if is_valid { + fill_bits_true(validity_bytes, current_pos, end); + if !value { + fill_bits_false(decoded_bytes, current_pos, end); + } + } else { + fill_bits_false(decoded_bytes, current_pos, end); + } + current_pos = end; + } + } + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) +} + +/// Prefill decoded=0s, validity=1s. Clear validity for nulls, set decoded for true. +#[inline(always)] +fn decode_nullable_false_valid( + run_ends: impl Iterator, + values: &BitBuffer, + validity_mask: &BitBuffer, + length: usize, +) -> BoolArray { + let mut decoded = BitBufferMut::new_unset(length); + let mut decoded_validity = BitBufferMut::new_set(length); + let decoded_bytes = decoded.as_mut_slice(); + let validity_bytes = decoded_validity.as_mut_slice(); + let mut current_pos = 0usize; + + for (end, (value, is_valid)) in run_ends.zip_eq(values.iter().zip(validity_mask.iter())) { + if end > current_pos { + if !is_valid { + fill_bits_false(validity_bytes, current_pos, end); + } else if value { + fill_bits_true(decoded_bytes, current_pos, end); + } + current_pos = end; + } + } + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) +} + +/// Prefill decoded=0s, validity=0s. Set validity and decoded for valid true values. 
+#[inline(always)] +fn decode_nullable_false_null( + run_ends: impl Iterator, + values: &BitBuffer, + validity_mask: &BitBuffer, + length: usize, +) -> BoolArray { + let mut decoded = BitBufferMut::new_unset(length); + let mut decoded_validity = BitBufferMut::new_unset(length); + let decoded_bytes = decoded.as_mut_slice(); + let validity_bytes = decoded_validity.as_mut_slice(); + let mut current_pos = 0usize; + + for (end, (value, is_valid)) in run_ends.zip_eq(values.iter().zip(validity_mask.iter())) { + if end > current_pos { + if is_valid { + fill_bits_true(validity_bytes, current_pos, end); + if value { + fill_bits_true(decoded_bytes, current_pos, end); + } + } + current_pos = end; + } + } + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) +} + +/// Fills bits in range [start, end) to true. +/// Assumes the buffer is pre-initialized to all zeros. +#[inline(always)] +fn fill_bits_true(slice: &mut [u8], start: usize, end: usize) { + if start >= end { + return; + } + + let start_byte = start / 8; + let start_bit = start % 8; + let end_byte = end / 8; + let end_bit = end % 8; + + if start_byte == end_byte { + #[allow(clippy::cast_possible_truncation)] + let mask = ((1u16 << (end_bit - start_bit)) - 1) as u8; + slice[start_byte] |= mask << start_bit; + } else { + // First partial byte + if start_bit != 0 { + slice[start_byte] |= !((1u8 << start_bit) - 1); + } + + // Middle bytes + let fill_start = if start_bit != 0 { + start_byte + 1 + } else { + start_byte + }; + if fill_start < end_byte { + slice[fill_start..end_byte].fill(0xFF); + } + + // Last partial byte + if end_bit != 0 { + slice[end_byte] |= (1u8 << end_bit) - 1; + } + } +} + +/// Clears bits in range [start, end) to false. +/// Assumes the buffer is pre-initialized to all ones. 
+#[inline(always)] +fn fill_bits_false(slice: &mut [u8], start: usize, end: usize) { + if start >= end { + return; + } + + let start_byte = start / 8; + let start_bit = start % 8; + let end_byte = end / 8; + let end_bit = end % 8; + + if start_byte == end_byte { + #[allow(clippy::cast_possible_truncation)] + let mask = ((1u16 << (end_bit - start_bit)) - 1) as u8; + slice[start_byte] &= !(mask << start_bit); + } else { + // First partial byte + if start_bit != 0 { + slice[start_byte] &= (1u8 << start_bit) - 1; + } + + // Middle bytes + let fill_start = if start_bit != 0 { + start_byte + 1 + } else { + start_byte + }; + if fill_start < end_byte { + slice[fill_start..end_byte].fill(0x00); + } + + // Last partial byte + if end_bit != 0 { + slice[end_byte] &= !((1u8 << end_bit) - 1); + } + } +} + +#[cfg(test)] +mod tests { + use vortex_array::arrays::BoolArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::assert_arrays_eq; + use vortex_buffer::BitBuffer; + use vortex_error::VortexResult; + + use super::runend_decode_bools; + + #[test] + fn decode_bools_alternating() -> VortexResult<()> { + // Alternating true/false: [T, T, F, F, F, T, T, T, T, T] + let ends = PrimitiveArray::from_iter([2u32, 5, 10]); + let values = BoolArray::from(BitBuffer::from(vec![true, false, true])); + let decoded = runend_decode_bools(ends, values, 0, 10)?; + + let expected = BoolArray::from(BitBuffer::from(vec![ + true, true, false, false, false, true, true, true, true, true, + ])); + assert_arrays_eq!(decoded, expected); + Ok(()) + } + + #[test] + fn decode_bools_mostly_true() -> VortexResult<()> { + // Mostly true: [T, T, T, T, T, F, T, T, T, T] + let ends = PrimitiveArray::from_iter([5u32, 6, 10]); + let values = BoolArray::from(BitBuffer::from(vec![true, false, true])); + let decoded = runend_decode_bools(ends, values, 0, 10)?; + + let expected = BoolArray::from(BitBuffer::from(vec![ + true, true, true, true, true, false, true, true, true, true, + ])); + assert_arrays_eq!(decoded, expected); + Ok(()) + } + + #[test] + fn decode_bools_mostly_false() -> VortexResult<()> { + // Mostly false: [F, F, F, F, F, T, F, F, F, F] + let ends = PrimitiveArray::from_iter([5u32, 6, 10]); + let values = BoolArray::from(BitBuffer::from(vec![false, true, false])); + let decoded = runend_decode_bools(ends, values, 0, 10)?; + + let expected = BoolArray::from(BitBuffer::from(vec![ + false, false, false, false, false, true, false, false, false, false, + ])); + assert_arrays_eq!(decoded, expected); + Ok(()) + } + + #[test] + fn decode_bools_all_true_single_run() -> VortexResult<()> { + let ends = PrimitiveArray::from_iter([10u32]); + let values = BoolArray::from(BitBuffer::from(vec![true])); + let decoded = runend_decode_bools(ends, values, 0, 10)?; + + let expected = BoolArray::from(BitBuffer::from(vec![ + true, true, true, true, true, true, true, true, true, true, + ])); + assert_arrays_eq!(decoded, expected); + Ok(()) + } + + #[test] + fn decode_bools_all_false_single_run() -> VortexResult<()> { + let ends = PrimitiveArray::from_iter([10u32]); + let values = BoolArray::from(BitBuffer::from(vec![false])); + let decoded = runend_decode_bools(ends, values, 0, 10)?; + + let expected = BoolArray::from(BitBuffer::from(vec![ + false, false, false, false, false, false, false, false, false, false, + ])); + assert_arrays_eq!(decoded, expected); + Ok(()) + } + + #[test] + fn decode_bools_with_offset() -> VortexResult<()> { + // Test with offset: [T, T, F, F, F, T, T, T, T, T] -> slice [2..8] = [F, F, F, T, T, T] + let ends = 
PrimitiveArray::from_iter([2u32, 5, 10]); + let values = BoolArray::from(BitBuffer::from(vec![true, false, true])); + let decoded = runend_decode_bools(ends, values, 2, 6)?; + + let expected = + BoolArray::from(BitBuffer::from(vec![false, false, false, true, true, true])); + assert_arrays_eq!(decoded, expected); + Ok(()) + } + + #[test] + fn decode_bools_nullable() -> VortexResult<()> { + use vortex_array::validity::Validity; + + // 3 runs: T (valid), F (null), T (valid) -> [T, T, null, null, null, T, T, T, T, T] + let ends = PrimitiveArray::from_iter([2u32, 5, 10]); + let values = BoolArray::new( + BitBuffer::from(vec![true, false, true]), + Validity::from(BitBuffer::from(vec![true, false, true])), + ); + let decoded = runend_decode_bools(ends, values, 0, 10)?; + + // Expected: values=[T, T, F, F, F, T, T, T, T, T], validity=[1, 1, 0, 0, 0, 1, 1, 1, 1, 1] + let expected = BoolArray::new( + BitBuffer::from(vec![ + true, true, false, false, false, true, true, true, true, true, + ]), + Validity::from(BitBuffer::from(vec![ + true, true, false, false, false, true, true, true, true, true, + ])), + ); + assert_arrays_eq!(decoded, expected); + Ok(()) + } + + #[test] + fn decode_bools_nullable_few_runs() -> VortexResult<()> { + use vortex_array::validity::Validity; + + // Test few runs (uses fast path): 5 runs of length 2000 each + let ends = PrimitiveArray::from_iter([2000u32, 4000, 6000, 8000, 10000]); + let values = BoolArray::new( + BitBuffer::from(vec![true, false, true, false, true]), + Validity::from(BitBuffer::from(vec![true, false, true, false, true])), + ); + let decoded = runend_decode_bools(ends, values, 0, 10000)?; + + // Check length and a few values + assert_eq!(decoded.len(), 10000); + // First run: valid true + assert!(decoded.validity_mask()?.value(0)); + assert!(decoded.to_bit_buffer().value(0)); + // Second run: null (validity false) + assert!(!decoded.validity_mask()?.value(2000)); + // Third run: valid true + assert!(decoded.validity_mask()?.value(4000)); + assert!(decoded.to_bit_buffer().value(4000)); + Ok(()) + } +} diff --git a/encodings/runend/src/lib.rs b/encodings/runend/src/lib.rs index 589b16e2c65..5be018b036d 100644 --- a/encodings/runend/src/lib.rs +++ b/encodings/runend/src/lib.rs @@ -13,6 +13,7 @@ mod array; mod arrow; pub mod compress; mod compute; +pub mod decompress_bool; mod iter; mod kernel; mod ops; diff --git a/vortex-array/src/compute/filter.rs b/vortex-array/src/compute/filter.rs index f61c5f6fffe..e2394367364 100644 --- a/vortex-array/src/compute/filter.rs +++ b/vortex-array/src/compute/filter.rs @@ -236,6 +236,10 @@ impl dyn Array + '_ { vortex_bail!("mask must be bool array, has dtype {}", self.dtype()); } + if !self.dtype().is_nullable() && self.is_canonical() { + return Ok(Mask::from_buffer(self.to_bool().to_bit_buffer())); + } + // Convert nulls to false first in case this can be done cheaply by the encoding. let array = fill_null(self, &Scalar::bool(false, self.dtype().nullability()))?;