move the aggregate operator to its own file.

The code is becoming impossible to reason about with everything in operator.rs
2026-02-11 03:04:22 +01:00 · 2025-09-19 03:58:35 -05:00
parent 7178d8d31c
commit aa8fcdbe54
4 changed files with 1796 additions and 1821 deletions
--- a/core/incremental/aggregate_operator.rs
+++ b/core/incremental/aggregate_operator.rs
--- a/core/incremental/mod.rs
+++ b/core/incremental/mod.rs
@@ -1,3 +1,4 @@
+pub mod aggregate_operator;
 pub mod compiler;
 pub mod cursor;
 pub mod dbsp;
--- a/core/incremental/operator.rs
+++ b/core/incremental/operator.rs
--- a/core/incremental/persistence.rs
+++ b/core/incremental/persistence.rs
@@ -1,12 +1,7 @@
-use crate::incremental::dbsp::HashableRow;
-use crate::incremental::operator::{
-    generate_storage_id, AggColumnInfo, AggregateFunction, AggregateOperator, AggregateState,
-    DbspStateCursors, MinMaxDeltas, AGG_TYPE_MINMAX,
-};
+use crate::incremental::operator::{AggregateFunction, AggregateState, DbspStateCursors};
 use crate::storage::btree::{BTreeCursor, BTreeKey};
-use crate::types::{IOResult, ImmutableRecord, RefValue, SeekKey, SeekOp, SeekResult};
+use crate::types::{IOResult, ImmutableRecord, SeekKey, SeekOp, SeekResult};
 use crate::{return_if_io, LimboError, Result, Value};
-use std::collections::{HashMap, HashSet};

 #[derive(Debug, Default)]
 pub enum ReadRecord {
@@ -290,672 +285,3 @@ impl WriteRow {
        }
    }
 }
-
-/// State machine for recomputing MIN/MAX values after deletion
-#[derive(Debug)]
-pub enum RecomputeMinMax {
-    ProcessElements {
-        /// Current column being processed
-        current_column_idx: usize,
-        /// Columns to process (combined MIN and MAX)
-        columns_to_process: Vec<(String, String, bool)>, // (group_key, column_name, is_min)
-        /// MIN/MAX deltas for checking values and weights
-        min_max_deltas: MinMaxDeltas,
-    },
-    Scan {
-        /// Columns still to process
-        columns_to_process: Vec<(String, String, bool)>,
-        /// Current index in columns_to_process (will resume from here)
-        current_column_idx: usize,
-        /// MIN/MAX deltas for checking values and weights
-        min_max_deltas: MinMaxDeltas,
-        /// Current group key being processed
-        group_key: String,
-        /// Current column name being processed
-        column_name: String,
-        /// Whether we're looking for MIN (true) or MAX (false)
-        is_min: bool,
-        /// The scan state machine for finding the new MIN/MAX
-        scan_state: Box<ScanState>,
-    },
-    Done,
-}
-
-impl RecomputeMinMax {
-    pub fn new(
-        min_max_deltas: MinMaxDeltas,
-        existing_groups: &HashMap<String, AggregateState>,
-        operator: &AggregateOperator,
-    ) -> Self {
-        let mut groups_to_check: HashSet<(String, String, bool)> = HashSet::new();
-
-        // Remember the min_max_deltas are essentially just the only column that is affected by
-        // this min/max, in delta (actually ZSet - consolidated delta) format. This makes it easier
-        // for us to consume it in here.
-        //
-        // The most challenging case is the case where there is a retraction, since we need to go
-        // back to the index.
-        for (group_key_str, values) in &min_max_deltas {
-            for ((col_name, hashable_row), weight) in values {
-                let col_info = operator.column_min_max.get(col_name);
-
-                let value = &hashable_row.values[0];
-
-                if *weight < 0 {
-                    // Deletion detected - check if it's the current MIN/MAX
-                    if let Some(state) = existing_groups.get(group_key_str) {
-                        // Check for MIN
-                        if let Some(current_min) = state.mins.get(col_name) {
-                            if current_min == value {
-                                groups_to_check.insert((
-                                    group_key_str.clone(),
-                                    col_name.clone(),
-                                    true,
-                                ));
-                            }
-                        }
-                        // Check for MAX
-                        if let Some(current_max) = state.maxs.get(col_name) {
-                            if current_max == value {
-                                groups_to_check.insert((
-                                    group_key_str.clone(),
-                                    col_name.clone(),
-                                    false,
-                                ));
-                            }
-                        }
-                    }
-                } else if *weight > 0 {
-                    // If it is not found in the existing groups, then we only need to care
-                    // about this if this is a new record being inserted
-                    if let Some(info) = col_info {
-                        if info.has_min {
-                            groups_to_check.insert((group_key_str.clone(), col_name.clone(), true));
-                        }
-                        if info.has_max {
-                            groups_to_check.insert((
-                                group_key_str.clone(),
-                                col_name.clone(),
-                                false,
-                            ));
-                        }
-                    }
-                }
-            }
-        }
-
-        if groups_to_check.is_empty() {
-            // No recomputation or initialization needed
-            Self::Done
-        } else {
-            // Convert HashSet to Vec for indexed processing
-            let groups_to_check_vec: Vec<_> = groups_to_check.into_iter().collect();
-            Self::ProcessElements {
-                current_column_idx: 0,
-                columns_to_process: groups_to_check_vec,
-                min_max_deltas,
-            }
-        }
-    }
-
-    pub fn process(
-        &mut self,
-        existing_groups: &mut HashMap<String, AggregateState>,
-        operator: &AggregateOperator,
-        cursors: &mut DbspStateCursors,
-    ) -> Result<IOResult<()>> {
-        loop {
-            match self {
-                RecomputeMinMax::ProcessElements {
-                    current_column_idx,
-                    columns_to_process,
-                    min_max_deltas,
-                } => {
-                    if *current_column_idx >= columns_to_process.len() {
-                        *self = RecomputeMinMax::Done;
-                        return Ok(IOResult::Done(()));
-                    }
-
-                    let (group_key, column_name, is_min) =
-                        columns_to_process[*current_column_idx].clone();
-
-                    // Get column index from pre-computed info
-                    let column_index = operator
-                        .column_min_max
-                        .get(&column_name)
-                        .map(|info| info.index)
-                        .unwrap(); // Should always exist since we're processing known columns
-
-                    // Get current value from existing state
-                    let current_value = existing_groups.get(&group_key).and_then(|state| {
-                        if is_min {
-                            state.mins.get(&column_name).cloned()
-                        } else {
-                            state.maxs.get(&column_name).cloned()
-                        }
-                    });
-
-                    // Create storage keys for index lookup
-                    let storage_id =
-                        generate_storage_id(operator.operator_id, column_index, AGG_TYPE_MINMAX);
-                    let zset_id = operator.generate_group_rowid(&group_key);
-
-                    // Get the values for this group from min_max_deltas
-                    let group_values = min_max_deltas.get(&group_key).cloned().unwrap_or_default();
-
-                    let columns_to_process = std::mem::take(columns_to_process);
-                    let min_max_deltas = std::mem::take(min_max_deltas);
-
-                    let scan_state = if is_min {
-                        Box::new(ScanState::new_for_min(
-                            current_value,
-                            group_key.clone(),
-                            column_name.clone(),
-                            storage_id,
-                            zset_id,
-                            group_values,
-                        ))
-                    } else {
-                        Box::new(ScanState::new_for_max(
-                            current_value,
-                            group_key.clone(),
-                            column_name.clone(),
-                            storage_id,
-                            zset_id,
-                            group_values,
-                        ))
-                    };
-
-                    *self = RecomputeMinMax::Scan {
-                        columns_to_process,
-                        current_column_idx: *current_column_idx,
-                        min_max_deltas,
-                        group_key,
-                        column_name,
-                        is_min,
-                        scan_state,
-                    };
-                }
-                RecomputeMinMax::Scan {
-                    columns_to_process,
-                    current_column_idx,
-                    min_max_deltas,
-                    group_key,
-                    column_name,
-                    is_min,
-                    scan_state,
-                } => {
-                    // Find new value using the scan state machine
-                    let new_value = return_if_io!(scan_state.find_new_value(cursors));
-
-                    // Update the state with new value (create if doesn't exist)
-                    let state = existing_groups.entry(group_key.clone()).or_default();
-
-                    if *is_min {
-                        if let Some(min_val) = new_value {
-                            state.mins.insert(column_name.clone(), min_val);
-                        } else {
-                            state.mins.remove(column_name);
-                        }
-                    } else if let Some(max_val) = new_value {
-                        state.maxs.insert(column_name.clone(), max_val);
-                    } else {
-                        state.maxs.remove(column_name);
-                    }
-
-                    // Move to next column
-                    let min_max_deltas = std::mem::take(min_max_deltas);
-                    let columns_to_process = std::mem::take(columns_to_process);
-                    *self = RecomputeMinMax::ProcessElements {
-                        current_column_idx: *current_column_idx + 1,
-                        columns_to_process,
-                        min_max_deltas,
-                    };
-                }
-                RecomputeMinMax::Done => {
-                    return Ok(IOResult::Done(()));
-                }
-            }
-        }
-    }
-}
-
-/// State machine for scanning through the index to find new MIN/MAX values
-#[derive(Debug)]
-pub enum ScanState {
-    CheckCandidate {
-        /// Current candidate value for MIN/MAX
-        candidate: Option<Value>,
-        /// Group key being processed
-        group_key: String,
-        /// Column name being processed
-        column_name: String,
-        /// Storage ID for the index seek
-        storage_id: i64,
-        /// ZSet ID for the group
-        zset_id: i64,
-        /// Group values from MinMaxDeltas: (column_name, HashableRow) -> weight
-        group_values: HashMap<(String, HashableRow), isize>,
-        /// Whether we're looking for MIN (true) or MAX (false)
-        is_min: bool,
-    },
-    FetchNextCandidate {
-        /// Current candidate to seek past
-        current_candidate: Value,
-        /// Group key being processed
-        group_key: String,
-        /// Column name being processed
-        column_name: String,
-        /// Storage ID for the index seek
-        storage_id: i64,
-        /// ZSet ID for the group
-        zset_id: i64,
-        /// Group values from MinMaxDeltas: (column_name, HashableRow) -> weight
-        group_values: HashMap<(String, HashableRow), isize>,
-        /// Whether we're looking for MIN (true) or MAX (false)
-        is_min: bool,
-    },
-    Done {
-        /// The final MIN/MAX value found
-        result: Option<Value>,
-    },
-}
-
-impl ScanState {
-    pub fn new_for_min(
-        current_min: Option<Value>,
-        group_key: String,
-        column_name: String,
-        storage_id: i64,
-        zset_id: i64,
-        group_values: HashMap<(String, HashableRow), isize>,
-    ) -> Self {
-        Self::CheckCandidate {
-            candidate: current_min,
-            group_key,
-            column_name,
-            storage_id,
-            zset_id,
-            group_values,
-            is_min: true,
-        }
-    }
-
-    // Extract a new candidate from the index. It is possible that, when searching,
-    // we end up going into a different operator altogether. That means we have
-    // exhausted this operator (or group) entirely, and no good candidate was found
-    fn extract_new_candidate(
-        cursors: &mut DbspStateCursors,
-        index_record: &ImmutableRecord,
-        seek_op: SeekOp,
-        storage_id: i64,
-        zset_id: i64,
-    ) -> Result<IOResult<Option<Value>>> {
-        let seek_result = return_if_io!(cursors
-            .index_cursor
-            .seek(SeekKey::IndexKey(index_record), seek_op));
-        if !matches!(seek_result, SeekResult::Found) {
-            return Ok(IOResult::Done(None));
-        }
-
-        let record = return_if_io!(cursors.index_cursor.record()).ok_or_else(|| {
-            LimboError::InternalError(
-                "Record found on the cursor, but could not be read".to_string(),
-            )
-        })?;
-
-        let values = record.get_values();
-        if values.len() < 3 {
-            return Ok(IOResult::Done(None));
-        }
-
-        let Some(rec_storage_id) = values.first() else {
-            return Ok(IOResult::Done(None));
-        };
-        let Some(rec_zset_id) = values.get(1) else {
-            return Ok(IOResult::Done(None));
-        };
-
-        // Check if we're still in the same group
-        if let (RefValue::Integer(rec_sid), RefValue::Integer(rec_zid)) =
-            (rec_storage_id, rec_zset_id)
-        {
-            if *rec_sid != storage_id || *rec_zid != zset_id {
-                return Ok(IOResult::Done(None));
-            }
-        } else {
-            return Ok(IOResult::Done(None));
-        }
-
-        // Get the value (3rd element)
-        Ok(IOResult::Done(values.get(2).map(|v| v.to_owned())))
-    }
-
-    pub fn new_for_max(
-        current_max: Option<Value>,
-        group_key: String,
-        column_name: String,
-        storage_id: i64,
-        zset_id: i64,
-        group_values: HashMap<(String, HashableRow), isize>,
-    ) -> Self {
-        Self::CheckCandidate {
-            candidate: current_max,
-            group_key,
-            column_name,
-            storage_id,
-            zset_id,
-            group_values,
-            is_min: false,
-        }
-    }
-
-    pub fn find_new_value(
-        &mut self,
-        cursors: &mut DbspStateCursors,
-    ) -> Result<IOResult<Option<Value>>> {
-        loop {
-            match self {
-                ScanState::CheckCandidate {
-                    candidate,
-                    group_key,
-                    column_name,
-                    storage_id,
-                    zset_id,
-                    group_values,
-                    is_min,
-                } => {
-                    // First, check if we have a candidate
-                    if let Some(cand_val) = candidate {
-                        // Check if the candidate is retracted (weight <= 0)
-                        // Create a HashableRow to look up the weight
-                        let hashable_cand = HashableRow::new(0, vec![cand_val.clone()]);
-                        let key = (column_name.clone(), hashable_cand);
-                        let is_retracted =
-                            group_values.get(&key).is_some_and(|weight| *weight <= 0);
-
-                        if is_retracted {
-                            // Candidate is retracted, need to fetch next from index
-                            *self = ScanState::FetchNextCandidate {
-                                current_candidate: cand_val.clone(),
-                                group_key: std::mem::take(group_key),
-                                column_name: std::mem::take(column_name),
-                                storage_id: *storage_id,
-                                zset_id: *zset_id,
-                                group_values: std::mem::take(group_values),
-                                is_min: *is_min,
-                            };
-                            continue;
-                        }
-                    }
-
-                    // Candidate is valid or we have no candidate
-                    // Now find the best value from insertions in group_values
-                    let mut best_from_zset = None;
-                    for ((col, hashable_val), weight) in group_values.iter() {
-                        if col == column_name && *weight > 0 {
-                            let value = &hashable_val.values[0];
-                            // Skip NULL values - they don't participate in MIN/MAX
-                            if value == &Value::Null {
-                                continue;
-                            }
-                            // This is an insertion for our column
-                            if let Some(ref current_best) = best_from_zset {
-                                if *is_min {
-                                    if value.cmp(current_best) == std::cmp::Ordering::Less {
-                                        best_from_zset = Some(value.clone());
-                                    }
-                                } else if value.cmp(current_best) == std::cmp::Ordering::Greater {
-                                    best_from_zset = Some(value.clone());
-                                }
-                            } else {
-                                best_from_zset = Some(value.clone());
-                            }
-                        }
-                    }
-
-                    // Compare candidate with best from ZSet, filtering out NULLs
-                    let result = match (&candidate, &best_from_zset) {
-                        (Some(cand), Some(zset_val)) if cand != &Value::Null => {
-                            if *is_min {
-                                if zset_val.cmp(cand) == std::cmp::Ordering::Less {
-                                    Some(zset_val.clone())
-                                } else {
-                                    Some(cand.clone())
-                                }
-                            } else if zset_val.cmp(cand) == std::cmp::Ordering::Greater {
-                                Some(zset_val.clone())
-                            } else {
-                                Some(cand.clone())
-                            }
-                        }
-                        (Some(cand), None) if cand != &Value::Null => Some(cand.clone()),
-                        (None, Some(zset_val)) => Some(zset_val.clone()),
-                        (Some(cand), Some(_)) if cand == &Value::Null => best_from_zset,
-                        _ => None,
-                    };
-
-                    *self = ScanState::Done { result };
-                }
-
-                ScanState::FetchNextCandidate {
-                    current_candidate,
-                    group_key,
-                    column_name,
-                    storage_id,
-                    zset_id,
-                    group_values,
-                    is_min,
-                } => {
-                    // Seek to the next value in the index
-                    let index_key = vec![
-                        Value::Integer(*storage_id),
-                        Value::Integer(*zset_id),
-                        current_candidate.clone(),
-                    ];
-                    let index_record = ImmutableRecord::from_values(&index_key, index_key.len());
-
-                    let seek_op = if *is_min {
-                        SeekOp::GT // For MIN, seek greater than current
-                    } else {
-                        SeekOp::LT // For MAX, seek less than current
-                    };
-
-                    let new_candidate = return_if_io!(Self::extract_new_candidate(
-                        cursors,
-                        &index_record,
-                        seek_op,
-                        *storage_id,
-                        *zset_id
-                    ));
-
-                    *self = ScanState::CheckCandidate {
-                        candidate: new_candidate,
-                        group_key: std::mem::take(group_key),
-                        column_name: std::mem::take(column_name),
-                        storage_id: *storage_id,
-                        zset_id: *zset_id,
-                        group_values: std::mem::take(group_values),
-                        is_min: *is_min,
-                    };
-                }
-
-                ScanState::Done { result } => {
-                    return Ok(IOResult::Done(result.clone()));
-                }
-            }
-        }
-    }
-}
-
-/// State machine for persisting Min/Max values to storage
-#[derive(Debug)]
-pub enum MinMaxPersistState {
-    Init {
-        min_max_deltas: MinMaxDeltas,
-        group_keys: Vec<String>,
-    },
-    ProcessGroup {
-        min_max_deltas: MinMaxDeltas,
-        group_keys: Vec<String>,
-        group_idx: usize,
-        value_idx: usize,
-    },
-    WriteValue {
-        min_max_deltas: MinMaxDeltas,
-        group_keys: Vec<String>,
-        group_idx: usize,
-        value_idx: usize,
-        value: Value,
-        column_name: String,
-        weight: isize,
-        write_row: WriteRow,
-    },
-    Done,
-}
-
-impl MinMaxPersistState {
-    pub fn new(min_max_deltas: MinMaxDeltas) -> Self {
-        let group_keys: Vec<String> = min_max_deltas.keys().cloned().collect();
-        Self::Init {
-            min_max_deltas,
-            group_keys,
-        }
-    }
-
-    pub fn persist_min_max(
-        &mut self,
-        operator_id: usize,
-        column_min_max: &HashMap<String, AggColumnInfo>,
-        cursors: &mut DbspStateCursors,
-        generate_group_rowid: impl Fn(&str) -> i64,
-    ) -> Result<IOResult<()>> {
-        loop {
-            match self {
-                MinMaxPersistState::Init {
-                    min_max_deltas,
-                    group_keys,
-                } => {
-                    let min_max_deltas = std::mem::take(min_max_deltas);
-                    let group_keys = std::mem::take(group_keys);
-                    *self = MinMaxPersistState::ProcessGroup {
-                        min_max_deltas,
-                        group_keys,
-                        group_idx: 0,
-                        value_idx: 0,
-                    };
-                }
-                MinMaxPersistState::ProcessGroup {
-                    min_max_deltas,
-                    group_keys,
-                    group_idx,
-                    value_idx,
-                } => {
-                    // Check if we're past all groups
-                    if *group_idx >= group_keys.len() {
-                        *self = MinMaxPersistState::Done;
-                        continue;
-                    }
-
-                    let group_key_str = &group_keys[*group_idx];
-                    let values = &min_max_deltas[group_key_str]; // This should always exist
-
-                    // Convert HashMap to Vec for indexed access
-                    let values_vec: Vec<_> = values.iter().collect();
-
-                    // Check if we have more values in current group
-                    if *value_idx >= values_vec.len() {
-                        *group_idx += 1;
-                        *value_idx = 0;
-                        // Continue to check if we're past all groups now
-                        continue;
-                    }
-
-                    // Process current value and extract what we need before taking ownership
-                    let ((column_name, hashable_row), weight) = values_vec[*value_idx];
-                    let column_name = column_name.clone();
-                    let value = hashable_row.values[0].clone(); // Extract the Value from HashableRow
-                    let weight = *weight;
-
-                    let min_max_deltas = std::mem::take(min_max_deltas);
-                    let group_keys = std::mem::take(group_keys);
-                    *self = MinMaxPersistState::WriteValue {
-                        min_max_deltas,
-                        group_keys,
-                        group_idx: *group_idx,
-                        value_idx: *value_idx,
-                        column_name,
-                        value,
-                        weight,
-                        write_row: WriteRow::new(),
-                    };
-                }
-                MinMaxPersistState::WriteValue {
-                    min_max_deltas,
-                    group_keys,
-                    group_idx,
-                    value_idx,
-                    value,
-                    column_name,
-                    weight,
-                    write_row,
-                } => {
-                    // Should have exited in the previous state
-                    assert!(*group_idx < group_keys.len());
-
-                    let group_key_str = &group_keys[*group_idx];
-
-                    // Get the column index from the pre-computed map
-                    let column_info = column_min_max
-                        .get(&*column_name)
-                        .expect("Column should exist in column_min_max map");
-                    let column_index = column_info.index;
-
-                    // Build the key components for MinMax storage using new encoding
-                    let storage_id =
-                        generate_storage_id(operator_id, column_index, AGG_TYPE_MINMAX);
-                    let zset_id = generate_group_rowid(group_key_str);
-
-                    // element_id is the actual value for Min/Max
-                    let element_id_val = value.clone();
-
-                    // Create index key
-                    let index_key = vec![
-                        Value::Integer(storage_id),
-                        Value::Integer(zset_id),
-                        element_id_val.clone(),
-                    ];
-
-                    // Record values (operator_id, zset_id, element_id, unused_placeholder)
-                    // For MIN/MAX, the element_id IS the value, so we use NULL for the 4th column
-                    let record_values = vec![
-                        Value::Integer(storage_id),
-                        Value::Integer(zset_id),
-                        element_id_val.clone(),
-                        Value::Null, // Placeholder - not used for MIN/MAX
-                    ];
-
-                    return_if_io!(write_row.write_row(
-                        cursors,
-                        index_key.clone(),
-                        record_values,
-                        *weight
-                    ));
-
-                    // Move to next value
-                    let min_max_deltas = std::mem::take(min_max_deltas);
-                    let group_keys = std::mem::take(group_keys);
-                    *self = MinMaxPersistState::ProcessGroup {
-                        min_max_deltas,
-                        group_keys,
-                        group_idx: *group_idx,
-                        value_idx: *value_idx + 1,
-                    };
-                }
-                MinMaxPersistState::Done => {
-                    return Ok(IOResult::Done(()));
-                }
-            }
-        }
-    }
-}