Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 38 additions & 9 deletions vortex-array/src/expr/analysis/immediate_access.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,44 @@ use crate::expr::exprs::select::Select;

pub type FieldAccesses<'a> = Annotations<'a, FieldName>;

/// An [`AnnotationFn`] for annotating scope accesses.
pub fn annotate_scope_access(scope: &StructFields) -> impl AnnotationFn<Annotation = FieldName> {
/// Returns an [`AnnotationFn`] that computes the "free fields" of an expression node.
///
/// A "free field" is a top-level field from the root scope that this expression references—not
/// nested fields within those top-level fields. For example, `root().a.b` has free field `{a}`,
/// not `{b}`, because `a` is the top-level field being accessed from root.
///
/// The term "free" is borrowed from PL theory's "free variables"—variables that reference an
/// outer scope rather than being introduced locally.
///
/// This is useful for column pruning, where we only need to read the top-level fields that an
/// expression actually touches.
///
/// # Annotation Rules
///
/// - **[`Select`]**: Returns the included field names if the child is [`Root`].
/// - **[`GetItem`]**: Returns `[field_name]` if the child is [`Root`].
/// - **[`Root`]**: Returns all field names from `scope` (conservative over-approximation).
/// - **Everything else**: Returns empty (annotations aggregate from children automatically).
///
/// # Example
///
/// Given `scope = {a: {b: .., c: ..}, d: ..}` and `expr = root().a.b + root().d`:
/// - `root().a` has free fields `{a}`.
/// - `root().d` has free fields `{d}`.
/// - The full expression has free fields `{a, d}` (not `b`; only top-level fields are tracked).
pub fn make_free_field_annotator(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can rename this back but this makes more sense to me

scope: &StructFields,
) -> impl AnnotationFn<Annotation = FieldName> {
move |expr: &Expression| {
assert!(
!expr.is::<Select>(),
"cannot analyse select, simplify the expression"
);

if let Some(field_name) = expr.as_opt::<GetItem>() {
if let Some(selection) = expr.as_opt::<Select>() {
if expr.child(0).is::<Root>() {
return selection
.normalize_to_included_fields(scope.names())
.vortex_expect("Select fields must be valid for scope")
.into_iter()
.collect();
}
} else if let Some(field_name) = expr.as_opt::<GetItem>() {
if expr.child(0).is::<Root>() {
return vec![field_name.clone()];
}
Expand All @@ -47,7 +76,7 @@ pub fn immediate_scope_accesses<'a>(
expr: &'a Expression,
scope: &'a StructFields,
) -> FieldAccesses<'a> {
descendent_annotations(expr, annotate_scope_access(scope))
descendent_annotations(expr, make_free_field_annotator(scope))
}

/// This returns the immediate scope_access (as explained in `immediate_scope_accesses`) for `expr`.
Expand Down
103 changes: 63 additions & 40 deletions vortex-array/src/expr/exprs/select.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,18 @@ use vortex_session::VortexSession;

use crate::IntoArray;
use crate::arrays::StructArray;
use crate::expr;
use crate::expr::Arity;
use crate::expr::ChildName;
use crate::expr::ExecutionArgs;
use crate::expr::ExecutionResult;
use crate::expr::ExprId;
use crate::expr::Pack;
use crate::expr::SimplifyCtx;
use crate::expr::VTable;
use crate::expr::VTableExt;
use crate::expr::expression::Expression;
use crate::expr::field::DisplayFieldNames;
use crate::expr::get_item;
use crate::expr::pack;

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum FieldSelection {
Expand All @@ -47,7 +47,7 @@ impl VTable for Select {
ExprId::new_ref("vortex.select")
}

fn serialize(&self, instance: &Self::Options) -> VortexResult<Option<Vec<u8>>> {
fn serialize(&self, instance: &FieldSelection) -> VortexResult<Option<Vec<u8>>> {
let opts = match instance {
FieldSelection::Include(fields) => Opts::Include(ProtoFieldNames {
names: fields.iter().map(|f| f.to_string()).collect(),
Expand All @@ -65,7 +65,7 @@ impl VTable for Select {
&self,
_metadata: &[u8],
_session: &VortexSession,
) -> VortexResult<Self::Options> {
) -> VortexResult<FieldSelection> {
let prost_metadata = SelectOpts::decode(_metadata)?;

let select_opts = prost_metadata
Expand All @@ -84,11 +84,11 @@ impl VTable for Select {
Ok(field_selection)
}

fn arity(&self, _options: &Self::Options) -> Arity {
fn arity(&self, _options: &FieldSelection) -> Arity {
Arity::Exact(1)
}

fn child_name(&self, _instance: &Self::Options, child_idx: usize) -> ChildName {
fn child_name(&self, _instance: &FieldSelection, child_idx: usize) -> ChildName {
match child_idx {
0 => ChildName::new_ref("child"),
_ => unreachable!(),
Expand Down Expand Up @@ -165,44 +165,66 @@ impl VTable for Select {

fn simplify(
&self,
options: &Self::Options,
selection: &FieldSelection,
expr: &Expression,
ctx: &dyn SimplifyCtx,
) -> VortexResult<Option<Expression>> {
let child = expr.child(0);
let child_dtype = ctx.return_dtype(child)?;
let child_nullability = child_dtype.nullability();
let child_struct = expr.child(0);
let struct_dtype = ctx.return_dtype(child_struct)?;
let struct_nullability = struct_dtype.nullability();

let child_dtype = child_dtype.as_struct_fields_opt().ok_or_else(|| {
let struct_fields = struct_dtype.as_struct_fields_opt().ok_or_else(|| {
vortex_err!(
"Select child must return a struct dtype, however it was a {}",
child_dtype
struct_dtype
)
})?;

let expr = pack(
options
.as_include_names(child_dtype.names())
.map_err(|e| {
e.with_context(format!(
"Select fields {:?} must be a subset of child fields {:?}",
options,
child_dtype.names()
))
})?
.iter()
.map(|name| (name.clone(), get_item(name.clone(), child.clone()))),
child_nullability,
);
// "Mask" out the unwanted fields of the child struct `DType`.
let included_fields = selection.normalize_to_included_fields(struct_fields.names())?;
let all_included_fields_are_nullable = included_fields.iter().all(|name| {
struct_fields
.field(name)
.vortex_expect(
"`normalize_to_included_fields` checks that the included fields already exist \
in `struct_fields`",
)
.is_nullable()
});

// We cannot always convert a `select` into a `pack(get_item(f1), get_item(f2), ...)`.
// This is because `get_item` does a validity intersection of the struct validity with its
// fields, which is not the same as just "masking" out the unwanted fields (a selection).
//
// We can, however, make this simplification when the child of the `select` is already a
// `pack` and we know that `get_item` will do no validity intersections.
let child_is_pack = child_struct.is::<Pack>();

// `get_item` only performs validity intersection when the struct is nullable but the field
// is not. This would change the semantics of a `select`, so we can only simplify when this
// won't happen.
let would_intersect_validity =
struct_nullability.is_nullable() && !all_included_fields_are_nullable;

if child_is_pack && !would_intersect_validity {
let pack_expr = expr::pack(
included_fields
.into_iter()
.map(|name| (name.clone(), expr::get_item(name, child_struct.clone()))),
struct_nullability,
);

Ok(Some(expr))
return Ok(Some(pack_expr));
}

Ok(None)
}

fn is_null_sensitive(&self, _instance: &Self::Options) -> bool {
fn is_null_sensitive(&self, _instance: &FieldSelection) -> bool {
true
}

fn is_fallible(&self, _instance: &Self::Options) -> bool {
fn is_fallible(&self, _instance: &FieldSelection) -> bool {
// If this type-checks, it's infallible.
false
}
Expand Down Expand Up @@ -260,21 +282,26 @@ impl FieldSelection {
fields
}

pub fn as_include_names(&self, field_names: &FieldNames) -> VortexResult<FieldNames> {
pub fn normalize_to_included_fields(
&self,
available_fields: &FieldNames,
) -> VortexResult<FieldNames> {
// Check that all of the field names exist in the available fields.
if self
.field_names()
.iter()
.any(|f| !field_names.iter().contains(f))
.any(|f| !available_fields.iter().contains(f))
{
vortex_bail!(
"Field {:?} in select not in field names {:?}",
"Select fields {:?} must be a subset of child fields {:?}",
self,
field_names
available_fields
);
}

match self {
FieldSelection::Include(fields) => Ok(fields.clone()),
FieldSelection::Exclude(exc_fields) => Ok(field_names
FieldSelection::Exclude(exc_fields) => Ok(available_fields
.iter()
.filter(|f| !exc_fields.iter().contains(f))
.cloned()
Expand Down Expand Up @@ -308,7 +335,6 @@ mod tests {
use crate::IntoArray;
use crate::ToCanonical;
use crate::arrays::StructArray;
use crate::expr::exprs::pack::Pack;
use crate::expr::exprs::root::root;
use crate::expr::exprs::select::Select;
use crate::expr::test_harness;
Expand Down Expand Up @@ -393,11 +419,11 @@ mod tests {
assert_eq!(
&include
.as_::<Select>()
.as_include_names(&field_names)
.normalize_to_included_fields(&field_names)
.unwrap(),
&exclude
.as_::<Select>()
.as_include_names(&field_names)
.normalize_to_included_fields(&field_names)
.unwrap()
);
}
Expand All @@ -412,7 +438,6 @@ mod tests {

let result = e.optimize_recursive(&dtype).unwrap();

assert!(result.is::<Pack>());
assert!(result.return_dtype(&dtype).unwrap().is_nullable());
}

Expand All @@ -431,8 +456,6 @@ mod tests {

let result = e.optimize_recursive(&dtype).unwrap();

assert!(result.is::<Pack>());

// Should exclude "c" and include "a" and "b"
let result_dtype = result.return_dtype(&dtype).unwrap();
assert!(result_dtype.is_nullable());
Expand Down
Loading
Loading