diff --git a/arrow-array/src/array/null_array.rs b/arrow-array/src/array/null_array.rs index 05dd114be71b..d7c005b0a593 100644 --- a/arrow-array/src/array/null_array.rs +++ b/arrow-array/src/array/null_array.rs @@ -230,7 +230,7 @@ mod tests { // Simulate a NULL value in the parent array, for instance, if array being queried by // invalid index - mutable.extend_nulls(1); + mutable.try_extend_nulls(1).unwrap(); let data = mutable.freeze(); let struct_array = Arc::new(StructArray::from(data.clone())); diff --git a/arrow-array/src/ffi.rs b/arrow-array/src/ffi.rs index f50dd3420baa..8e887f447d0a 100644 --- a/arrow-array/src/ffi.rs +++ b/arrow-array/src/ffi.rs @@ -1729,7 +1729,7 @@ mod tests_from_ffi { let data = array.to_data(); let mut mutable = MutableArrayData::new(vec![&data], false, len); - mutable.extend(0, 0, len); + mutable.try_extend(0, 0, len).unwrap(); make_array(mutable.freeze()) } diff --git a/arrow-cast/src/cast/list.rs b/arrow-cast/src/cast/list.rs index 5d7209ee1111..e4762d1e5fb1 100644 --- a/arrow-cast/src/cast/list.rs +++ b/arrow-cast/src/cast/list.rs @@ -190,10 +190,14 @@ where if cast_options.safe || array.is_null(idx) { if last_pos != start_pos { // Extend with valid slices - mutable.extend(0, last_pos, start_pos); + mutable + .try_extend(0, last_pos, start_pos) + .map_err(|e| ArrowError::CastError(e.to_string()))?; } // Pad this slice with nulls - mutable.extend_nulls(size as _); + mutable + .try_extend_nulls(size as _) + .map_err(|e| ArrowError::CastError(e.to_string()))?; null_builder.set_bit(idx, false); // Set last_pos to the end of this slice's values last_pos = end_pos @@ -211,7 +215,9 @@ where if mutable.len() != cap { // Remaining slices were all correct length let remaining = cap - mutable.len(); - mutable.extend(0, last_pos, last_pos + remaining) + mutable + .try_extend(0, last_pos, last_pos + remaining) + .map_err(|e| ArrowError::CastError(e.to_string()))?; } make_array(mutable.freeze()) } @@ -252,7 +258,9 @@ pub(crate) fn cast_list_view_to_fixed_size_list( if len != size as usize { // Nulls in FixedSizeListArray take up space and so we must pad the values if cast_options.safe || array.is_null(idx) { - mutable.extend_nulls(size as _); + mutable + .try_extend_nulls(size as _) + .map_err(|e| ArrowError::CastError(e.to_string()))?; null_builder.set_bit(idx, false); } else { return Err(ArrowError::CastError(format!( @@ -260,7 +268,9 @@ pub(crate) fn cast_list_view_to_fixed_size_list( ))); } } else { - mutable.extend(0, offset, offset + len); + mutable + .try_extend(0, offset, offset + len) + .map_err(|e| ArrowError::CastError(e.to_string()))?; } } diff --git a/arrow-data/src/transform/boolean.rs b/arrow-data/src/transform/boolean.rs index 1f3bd8f885c0..d37d68462994 100644 --- a/arrow-data/src/transform/boolean.rs +++ b/arrow-data/src/transform/boolean.rs @@ -32,11 +32,16 @@ pub(super) fn build_extend(array: &ArrayData) -> Extend<'_> { array.offset() + start, len, ); + Ok(()) }, ) } -pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { +pub(super) fn extend_nulls( + mutable: &mut _MutableArrayData, + len: usize, +) -> Result<(), arrow_schema::ArrowError> { let buffer = &mut mutable.buffer1; resize_for_bits(buffer, mutable.len + len); + Ok(()) } diff --git a/arrow-data/src/transform/fixed_binary.rs b/arrow-data/src/transform/fixed_binary.rs index 626ecbee0261..49a4dbd76b9c 100644 --- a/arrow-data/src/transform/fixed_binary.rs +++ b/arrow-data/src/transform/fixed_binary.rs @@ -30,11 +30,15 @@ pub(super) fn build_extend(array: &ArrayData) -> Extend<'_> { move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| { let buffer = &mut mutable.buffer1; buffer.extend_from_slice(&values[start * size..(start + len) * size]); + Ok(()) }, ) } -pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { +pub(super) fn extend_nulls( + mutable: &mut _MutableArrayData, + len: usize, +) -> Result<(), arrow_schema::ArrowError> { let size = match mutable.data_type { DataType::FixedSizeBinary(i) => i as usize, _ => unreachable!(), @@ -42,4 +46,5 @@ pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { let values_buffer = &mut mutable.buffer1; values_buffer.extend_zeros(len * size); + Ok(()) } diff --git a/arrow-data/src/transform/fixed_size_list.rs b/arrow-data/src/transform/fixed_size_list.rs index ada1a2f763c4..139ee45e51d0 100644 --- a/arrow-data/src/transform/fixed_size_list.rs +++ b/arrow-data/src/transform/fixed_size_list.rs @@ -16,7 +16,7 @@ // under the License. use crate::ArrayData; -use arrow_schema::DataType; +use arrow_schema::{ArrowError, DataType}; use super::{_MutableArrayData, Extend}; @@ -28,22 +28,22 @@ pub(super) fn build_extend(array: &ArrayData) -> Extend<'_> { Box::new( move |mutable: &mut _MutableArrayData, index: usize, start: usize, len: usize| { - mutable - .child_data - .iter_mut() - .for_each(|child| child.extend(index, start * size, (start + len) * size)) + for child in mutable.child_data.iter_mut() { + child.try_extend(index, start * size, (start + len) * size)?; + } + Ok(()) }, ) } -pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { +pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) -> Result<(), ArrowError> { let size = match mutable.data_type { DataType::FixedSizeList(_, i) => i as usize, _ => unreachable!(), }; - mutable - .child_data - .iter_mut() - .for_each(|child| child.extend_nulls(len * size)) + for child in mutable.child_data.iter_mut() { + child.try_extend_nulls(len * size)?; + } + Ok(()) } diff --git a/arrow-data/src/transform/list.rs b/arrow-data/src/transform/list.rs index b7a9ab6da0ed..d9d61a7c0315 100644 --- a/arrow-data/src/transform/list.rs +++ b/arrow-data/src/transform/list.rs @@ -17,10 +17,11 @@ use super::{ _MutableArrayData, Extend, - utils::{extend_offsets, get_last_offset}, + utils::{get_last_offset, try_extend_offsets}, }; use crate::ArrayData; use arrow_buffer::ArrowNativeType; +use arrow_schema::ArrowError; use num_integer::Integer; use num_traits::CheckedAdd; @@ -36,9 +37,9 @@ pub(super) fn build_extend( let last_offset: T = unsafe { get_last_offset(offset_buffer) }; // offsets - extend_offsets::(offset_buffer, last_offset, &offsets[start..start + len + 1]); + try_extend_offsets::(offset_buffer, last_offset, &offsets[start..start + len + 1])?; - mutable.child_data[0].extend( + mutable.child_data[0].try_extend( index, offsets[start].as_usize(), offsets[start + len].as_usize(), @@ -47,11 +48,15 @@ pub(super) fn build_extend( ) } -pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { +pub(super) fn extend_nulls( + mutable: &mut _MutableArrayData, + len: usize, +) -> Result<(), ArrowError> { let offset_buffer = &mut mutable.buffer1; // this is safe due to how offset is built. See details on `get_last_offset` let last_offset: T = unsafe { get_last_offset(offset_buffer) }; - (0..len).for_each(|_| offset_buffer.push(last_offset)) + (0..len).for_each(|_| offset_buffer.push(last_offset)); + Ok(()) } diff --git a/arrow-data/src/transform/list_view.rs b/arrow-data/src/transform/list_view.rs index f01e14b978c9..473b574bf3f4 100644 --- a/arrow-data/src/transform/list_view.rs +++ b/arrow-data/src/transform/list_view.rs @@ -18,6 +18,7 @@ use crate::ArrayData; use crate::transform::_MutableArrayData; use arrow_buffer::ArrowNativeType; +use arrow_schema::ArrowError; use num_integer::Integer; use num_traits::CheckedAdd; @@ -33,23 +34,35 @@ pub(super) fn build_extend( for i in start..start + len { mutable.buffer1.push(new_offset); mutable.buffer2.push(sizes[i]); - new_offset = new_offset.checked_add(&sizes[i]).expect("offset overflow"); + new_offset = new_offset.checked_add(&sizes[i]).ok_or_else(|| { + ArrowError::InvalidArgumentError( + "offset overflow: data exceeds the capacity of the offset type. \ + Try splitting into smaller batches or using a larger type \ + (e.g. LargeListView instead of ListView)" + .to_string(), + ) + })?; let size = sizes[i].as_usize(); if size > 0 { let child_start = offsets[i].as_usize(); - mutable.child_data[0].extend(index, child_start, child_start + size); + mutable.child_data[0].try_extend(index, child_start, child_start + size)?; } } + Ok(()) }, ) } -pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { +pub(super) fn extend_nulls( + mutable: &mut _MutableArrayData, + len: usize, +) -> Result<(), ArrowError> { let offset_buffer = &mut mutable.buffer1; let sizes_buffer = &mut mutable.buffer2; // We push 0 as a placeholder for NULL values in both the offsets and sizes (0..len).for_each(|_| offset_buffer.push(T::default())); (0..len).for_each(|_| sizes_buffer.push(T::default())); + Ok(()) } diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index 66f6603f02fc..60a3245632b6 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -45,9 +45,10 @@ mod variable_size; type ExtendNullBits<'a> = Box; // function that extends `[start..start+len]` to the mutable array. // this is dynamic because different data_types influence how buffers and children are extended. -type Extend<'a> = Box; +type Extend<'a> = + Box Result<(), ArrowError> + 'a>; -type ExtendNulls = Box; +type ExtendNulls = Box Result<(), ArrowError>>; /// A mutable [ArrayData] that knows how to freeze itself into an [ArrayData]. /// This is just a data container. @@ -230,7 +231,8 @@ fn build_extend_view(array: &ArrayData, buffer_offset: u32) -> Extend<'_> { let mut view = ByteView::from(*v); view.buffer_index += buffer_offset; view.into() - })) + })); + Ok(()) }, ) } @@ -628,7 +630,10 @@ impl<'a> MutableArrayData<'a> { let mut mutable = MutableArrayData::new(dictionaries, false, capacity); for (i, len) in lengths.iter().enumerate() { - mutable.extend(i, 0, *len) + mutable.try_extend(i, 0, *len).expect( + "extend failed while building dictionary; \ + this is a bug in MutableArrayData", + ) } (Some(mutable.freeze()), true) @@ -716,36 +721,95 @@ impl<'a> MutableArrayData<'a> { } } - /// Extends the in progress array with a region of the input arrays + /// Extends the in progress array with a region of the input arrays, returning an error on + /// overflow. /// /// # Arguments - /// * `index` - the index of array that you what to copy values from + /// * `index` - the index of array that you want to copy values from /// * `start` - the start index of the chunk (inclusive) /// * `end` - the end index of the chunk (exclusive) /// + /// # Errors + /// Returns an error if offset arithmetic overflows the underlying integer type. + /// /// # Panic /// This function panics if there is an invalid index, /// i.e. `index` >= the number of source arrays /// or `end` > the length of the `index`th array - pub fn extend(&mut self, index: usize, start: usize, end: usize) { + pub fn try_extend(&mut self, index: usize, start: usize, end: usize) -> Result<(), ArrowError> { let len = end - start; (self.extend_null_bits[index])(&mut self.data, start, len); - (self.extend_values[index])(&mut self.data, index, start, len); + // Snapshot buffer lengths before attempting the extend so we can roll + // back to a consistent state if it fails. + let buf1_len = self.data.buffer1.len(); + let buf2_len = self.data.buffer2.len(); + if let Err(e) = (self.extend_values[index])(&mut self.data, index, start, len) { + // Restore buffers to their pre-call lengths so the array remains + // in a valid state for the caller to inspect or retry. + self.data.buffer1.truncate(buf1_len); + self.data.buffer2.truncate(buf2_len); + return Err(e); + } self.data.len += len; + Ok(()) } - /// Extends the in progress array with null elements, ignoring the input arrays. + /// Extends the in progress array with a region of the input arrays. + /// + /// # Deprecated + /// Use [`try_extend`](Self::try_extend) instead, which returns an [`ArrowError`] on overflow + /// rather than panicking. + /// + /// # Panic + /// This function panics if there is an invalid index, + /// i.e. `index` >= the number of source arrays, + /// `end` > the length of the `index`th array, + /// or the offset type overflows (e.g. more than 2 GiB in a `StringArray`). + #[deprecated( + since = "59.0.0", + note = "Use `try_extend` which returns an error on overflow instead of panicking" + )] + pub fn extend(&mut self, index: usize, start: usize, end: usize) { + self.try_extend(index, start, end) + .expect("extend failed due to offset overflow") + } + + /// Extends the in progress array with null elements, ignoring the input arrays, returning an + /// error on overflow. + /// + /// Prefer this over [`extend_nulls`](Self::extend_nulls) to handle cases where the run-end + /// counter overflows (relevant for `RunEndEncoded` arrays). /// /// # Panics /// /// Panics if [`MutableArrayData`] not created with `use_nulls` or nullable source arrays - pub fn extend_nulls(&mut self, len: usize) { + pub fn try_extend_nulls(&mut self, len: usize) -> Result<(), ArrowError> { self.data.len += len; let bit_len = bit_util::ceil(self.data.len, 8); let nulls = self.data.null_buffer(); nulls.resize(bit_len, 0); self.data.null_count += len; - (self.extend_nulls)(&mut self.data, len); + (self.extend_nulls)(&mut self.data, len)?; + Ok(()) + } + + /// Extends the in progress array with null elements, ignoring the input arrays. + /// + /// # Deprecated + /// Use [`try_extend_nulls`](Self::try_extend_nulls) instead, which returns an [`ArrowError`] + /// on overflow rather than panicking. + /// + /// # Panics + /// + /// Panics if [`MutableArrayData`] not created with `use_nulls` or nullable source arrays, + /// or if the run-end counter overflows for `RunEndEncoded` arrays. + #[deprecated( + since = "59.0.0", + note = "Use `try_extend_nulls` which returns an error on overflow instead of panicking" + )] + pub fn extend_nulls(&mut self, len: usize) { + self.try_extend_nulls(len) + .expect("extend_nulls failed due to overflow") } /// Returns the current length diff --git a/arrow-data/src/transform/null.rs b/arrow-data/src/transform/null.rs index 7355a5420b8e..48953b10d9e6 100644 --- a/arrow-data/src/transform/null.rs +++ b/arrow-data/src/transform/null.rs @@ -19,7 +19,12 @@ use super::{_MutableArrayData, Extend}; use crate::ArrayData; pub(super) fn build_extend(_: &ArrayData) -> Extend<'_> { - Box::new(move |_, _, _, _| {}) + Box::new(move |_, _, _, _| Ok(())) } -pub(super) fn extend_nulls(_: &mut _MutableArrayData, _: usize) {} +pub(super) fn extend_nulls( + _: &mut _MutableArrayData, + _: usize, +) -> Result<(), arrow_schema::ArrowError> { + Ok(()) +} diff --git a/arrow-data/src/transform/primitive.rs b/arrow-data/src/transform/primitive.rs index 8f9929c4305d..b4846fa6172a 100644 --- a/arrow-data/src/transform/primitive.rs +++ b/arrow-data/src/transform/primitive.rs @@ -29,6 +29,7 @@ pub(super) fn build_extend(array: &ArrayData) -> Extend<'_> mutable .buffer1 .extend_from_slice(&values[start..start + len]); + Ok(()) }, ) } @@ -43,10 +44,15 @@ where mutable .buffer1 .extend(values[start..start + len].iter().map(|x| *x + offset)); + Ok(()) }, ) } -pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { +pub(super) fn extend_nulls( + mutable: &mut _MutableArrayData, + len: usize, +) -> Result<(), arrow_schema::ArrowError> { mutable.buffer1.extend_zeros(len * size_of::()); + Ok(()) } diff --git a/arrow-data/src/transform/run.rs b/arrow-data/src/transform/run.rs index 6ae3a034f340..3678db16e058 100644 --- a/arrow-data/src/transform/run.rs +++ b/arrow-data/src/transform/run.rs @@ -17,7 +17,7 @@ use super::{_MutableArrayData, ArrayData, Extend}; use arrow_buffer::{ArrowNativeType, Buffer, ToByteSlice}; -use arrow_schema::DataType; +use arrow_schema::{ArrowError, DataType}; use num_traits::CheckedAdd; /// Generic helper to get the last run end value from a run ends array @@ -38,14 +38,14 @@ fn get_last_run_end(run_ends_data: &super::MutableArrayData) /// /// For RunEndEncoded, this adds nulls by extending the run_ends array /// and values array appropriately. -pub fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { +pub fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) -> Result<(), ArrowError> { if len == 0 { - return; + return Ok(()); } // For REE, we always need to add a value entry when adding a new run // The values array should have one entry per run, not per logical element - mutable.child_data[1].extend_nulls(1); + mutable.child_data[1].try_extend_nulls(1)?; // Determine the run end type from the data type let run_end_type = if let DataType::RunEndEncoded(run_ends_field, _) = &mutable.data_type { @@ -60,7 +60,13 @@ pub fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { let last_run_end = get_last_run_end::<$run_end_type>(&mutable.child_data[0]); let new_value = last_run_end .checked_add(<$run_end_type as ArrowNativeType>::usize_as(len)) - .expect("run end overflow"); + .ok_or_else(|| { + ArrowError::InvalidArgumentError( + "run end overflow when extending RunEndEncoded array: \ + use a larger run-end type (e.g. Int64 instead of Int32)" + .to_string(), + ) + })?; mutable.child_data[0] .data .buffer1 @@ -77,8 +83,12 @@ pub fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { }; mutable.child_data[0].data.len += 1; + Ok(()) } +/// The run-ends bytes and optional values index range returned by [`build_extend_arrays`]. +type ExtendArrays = (Vec, Option<(usize, usize)>); + /// Build run ends bytes and values range directly for batch processing fn build_extend_arrays + CheckedAdd>( buffer: &Buffer, @@ -86,7 +96,7 @@ fn build_extend_arrays + CheckedA start: usize, len: usize, dest_last_run_end: T, -) -> (Vec, Option<(usize, usize)>) { +) -> Result { let mut run_ends_bytes = Vec::new(); let mut values_range: Option<(usize, usize)> = None; let end = start + len; @@ -109,7 +119,13 @@ fn build_extend_arrays + CheckedA }; current_run_end = current_run_end .checked_add(&T::usize_as(end_offset - start_offset)) - .expect("run end overflow"); + .ok_or_else(|| { + ArrowError::InvalidArgumentError( + "run end overflow when extending RunEndEncoded array: \ + use a larger run-end type (e.g. Int64 instead of Int32)" + .to_string(), + ) + })?; run_ends_bytes.extend_from_slice(current_run_end.to_byte_slice()); // Start the range @@ -117,7 +133,13 @@ fn build_extend_arrays + CheckedA } else if prev_end >= start && run_end <= end { current_run_end = current_run_end .checked_add(&T::usize_as(run_end - prev_end)) - .expect("run end overflow"); + .ok_or_else(|| { + ArrowError::InvalidArgumentError( + "run end overflow when extending RunEndEncoded array: \ + use a larger run-end type (e.g. Int64 instead of Int32)" + .to_string(), + ) + })?; run_ends_bytes.extend_from_slice(current_run_end.to_byte_slice()); // Extend the range @@ -128,7 +150,13 @@ fn build_extend_arrays + CheckedA } else if prev_end < end && run_end >= end { current_run_end = current_run_end .checked_add(&T::usize_as(end - prev_end)) - .expect("run end overflow"); + .ok_or_else(|| { + ArrowError::InvalidArgumentError( + "run end overflow when extending RunEndEncoded array: \ + use a larger run-end type (e.g. Int64 instead of Int32)" + .to_string(), + ) + })?; run_ends_bytes.extend_from_slice(current_run_end.to_byte_slice()); // Extend the range and break @@ -149,7 +177,7 @@ fn build_extend_arrays + CheckedA break; } } - (run_ends_bytes, values_range) + Ok((run_ends_bytes, values_range)) } /// Process extends using batch operations @@ -158,9 +186,9 @@ fn process_extends_batch( source_array_idx: usize, run_ends_bytes: Vec, values_range: Option<(usize, usize)>, -) { +) -> Result<(), ArrowError> { if run_ends_bytes.is_empty() { - return; + return Ok(()); } // Batch extend the run_ends array with all bytes at once @@ -173,7 +201,7 @@ fn process_extends_batch( // Batch extend the values array using the range let (start_idx, end_idx) = values_range.expect("values_range should be Some if run_ends_bytes is not empty"); - mutable.child_data[1].extend(source_array_idx, start_idx, end_idx); + mutable.child_data[1].try_extend(source_array_idx, start_idx, end_idx) } /// Returns a function that extends the run encoded array. @@ -183,7 +211,7 @@ pub fn build_extend(array: &ArrayData) -> Extend<'_> { Box::new( move |mutable: &mut _MutableArrayData, array_idx: usize, start: usize, len: usize| { if len == 0 { - return; + return Ok(()); } // We need to analyze the source array's run structure @@ -209,13 +237,13 @@ pub fn build_extend(array: &ArrayData) -> Extend<'_> { start + array.offset(), len, dest_last_run_end, - ); + )?; process_extends_batch::<$run_end_type>( mutable, array_idx, run_ends_bytes, values_range, - ); + )?; }}; } @@ -225,6 +253,7 @@ pub fn build_extend(array: &ArrayData) -> Extend<'_> { DataType::Int64 => build_and_process_impl!(i64), _ => panic!("Invalid run end type for RunEndEncoded array: {dest_run_end_type}",), } + Ok(()) }, ) } @@ -375,9 +404,9 @@ mod tests { let mut mutable = MutableArrayData::new(vec![&ree_array], true, 10); - mutable.extend_nulls(3); - mutable.extend(0, 0, 5); - mutable.extend_nulls(3); + mutable.try_extend_nulls(3).unwrap(); + mutable.try_extend(0, 0, 5).unwrap(); + mutable.try_extend_nulls(3).unwrap(); // Verify the run ends were extended correctly let result = mutable.freeze(); @@ -425,10 +454,10 @@ mod tests { let mut mutable = MutableArrayData::new(vec![&ree_array], true, 10); // First, we need to copy the existing data - mutable.extend(0, 0, 5); + mutable.try_extend(0, 0, 5).unwrap(); // Then add nulls - mutable.extend_nulls(3); + mutable.try_extend_nulls(3).unwrap(); // Verify the run ends were extended correctly let result = mutable.freeze(); @@ -454,10 +483,10 @@ mod tests { let mut mutable = MutableArrayData::new(vec![&ree_array], true, 10); // First, we need to copy the existing data - mutable.extend(0, 0, 5); + mutable.try_extend(0, 0, 5).unwrap(); // Then add nulls - mutable.extend_nulls(3); + mutable.try_extend_nulls(3).unwrap(); // Verify the run ends were extended correctly let result = mutable.freeze(); @@ -483,7 +512,7 @@ mod tests { let mut mutable = MutableArrayData::new(vec![&ree_array], false, 10); // Extend the entire array - mutable.extend(0, 0, 5); + mutable.try_extend(0, 0, 5).unwrap(); let result = mutable.freeze(); @@ -501,7 +530,7 @@ mod tests { let ree_array = create_run_array_data(vec![], values); let mut mutable = MutableArrayData::new(vec![&ree_array], false, 10); - mutable.extend(0, 0, 0); + mutable.try_extend(0, 0, 0).unwrap(); let result = mutable.freeze(); assert_eq!(result.len(), 0); @@ -511,7 +540,8 @@ mod tests { #[test] fn test_build_extend_arrays_int16() { let buffer = Buffer::from_vec(vec![3i16, 5i16, 8i16]); - let (run_ends_bytes, values_range) = build_extend_arrays::(&buffer, 3, 2, 4, 0i16); + let (run_ends_bytes, values_range) = + build_extend_arrays::(&buffer, 3, 2, 4, 0i16).unwrap(); // Logical array: [A, A, A, B, B, C, C, C] // Requesting indices 2-6 should give us: @@ -533,7 +563,8 @@ mod tests { #[test] fn test_build_extend_arrays_int64() { let buffer = Buffer::from_vec(vec![3i64, 5i64, 8i64]); - let (run_ends_bytes, values_range) = build_extend_arrays::(&buffer, 3, 2, 4, 0i64); + let (run_ends_bytes, values_range) = + build_extend_arrays::(&buffer, 3, 2, 4, 0i64).unwrap(); // Same logic as above but with i64 assert_eq!(run_ends_bytes.len(), 3 * std::mem::size_of::()); @@ -559,7 +590,7 @@ mod tests { let mut mutable = MutableArrayData::new(vec![&ree_array], false, 10); // Extend the entire array - mutable.extend(0, 0, 5); + mutable.try_extend(0, 0, 5).unwrap(); let result = mutable.freeze(); @@ -576,7 +607,6 @@ mod tests { } #[test] - #[should_panic(expected = "run end overflow")] fn test_extend_nulls_overflow_i16() { let values = create_int32_array_data(vec![42]); // Start with run end close to max to set up overflow condition @@ -584,14 +614,17 @@ mod tests { let mut mutable = MutableArrayData::new(vec![&ree_array], true, 10); // Extend the original data first to initialize state - mutable.extend(0, 0, 5_usize); - - // This should cause overflow: i16::MAX + 5 > i16::MAX - mutable.extend_nulls(i16::MAX as usize); + mutable.try_extend(0, 0, 5_usize).unwrap(); + + // This should return an error: i16::MAX + 5 > i16::MAX + let err = mutable.try_extend_nulls(i16::MAX as usize).unwrap_err(); + assert!( + err.to_string().contains("run end overflow"), + "unexpected error: {err}" + ); } #[test] - #[should_panic(expected = "run end overflow")] fn test_extend_nulls_overflow_i32() { let values = create_int32_array_data(vec![42]); // Start with run end close to max to set up overflow condition @@ -599,14 +632,17 @@ mod tests { let mut mutable = MutableArrayData::new(vec![&ree_array], true, 10); // Extend the original data first to initialize state - mutable.extend(0, 0, 10_usize); - - // This should cause overflow: (i32::MAX - 10) + 20 > i32::MAX - mutable.extend_nulls(i32::MAX as usize); + mutable.try_extend(0, 0, 10_usize).unwrap(); + + // This should return an error: (i32::MAX - 10) + 20 > i32::MAX + let err = mutable.try_extend_nulls(i32::MAX as usize).unwrap_err(); + assert!( + err.to_string().contains("run end overflow"), + "unexpected error: {err}" + ); } #[test] - #[should_panic(expected = "run end overflow")] fn test_build_extend_overflow_i16() { // Create a source array with small run that will cause overflow when added let values = create_int32_array_data(vec![10]); @@ -619,14 +655,17 @@ mod tests { let mut mutable = MutableArrayData::new(vec![&source_array, &dest_array], false, 10); // First extend the destination array to set up state - mutable.extend(1, 0, (i16::MAX - 5) as usize); - - // This should cause overflow: (i16::MAX - 5) + 20 > i16::MAX - mutable.extend(0, 0, 20); + mutable.try_extend(1, 0, (i16::MAX - 5) as usize).unwrap(); + + // This should return an error: (i16::MAX - 5) + 20 > i16::MAX + let err = mutable.try_extend(0, 0, 20).unwrap_err(); + assert!( + err.to_string().contains("run end overflow"), + "unexpected error: {err}" + ); } #[test] - #[should_panic(expected = "run end overflow")] fn test_build_extend_overflow_i32() { // Create a source array with small run that will cause overflow when added let values = create_int32_array_data(vec![10]); @@ -639,9 +678,13 @@ mod tests { let mut mutable = MutableArrayData::new(vec![&source_array, &dest_array], false, 10); // First extend the destination array to set up state - mutable.extend(1, 0, (i32::MAX - 50) as usize); - - // This should cause overflow: (i32::MAX - 50) + 100 > i32::MAX - mutable.extend(0, 0, 100); + mutable.try_extend(1, 0, (i32::MAX - 50) as usize).unwrap(); + + // This should return an error: (i32::MAX - 50) + 100 > i32::MAX + let err = mutable.try_extend(0, 0, 100).unwrap_err(); + assert!( + err.to_string().contains("run end overflow"), + "unexpected error: {err}" + ); } } diff --git a/arrow-data/src/transform/structure.rs b/arrow-data/src/transform/structure.rs index 588cc00f446b..5e7acb7c2437 100644 --- a/arrow-data/src/transform/structure.rs +++ b/arrow-data/src/transform/structure.rs @@ -17,21 +17,45 @@ use super::{_MutableArrayData, Extend}; use crate::ArrayData; +use arrow_schema::{ArrowError, DataType}; pub(super) fn build_extend(_: &ArrayData) -> Extend<'_> { Box::new( move |mutable: &mut _MutableArrayData, index: usize, start: usize, len: usize| { - mutable - .child_data - .iter_mut() - .for_each(|child| child.extend(index, start, start + len)) + // Collect field names before the mutable borrow of child_data. + let field_names = struct_field_names(&mutable.data_type); + for (col_idx, child) in mutable.child_data.iter_mut().enumerate() { + child + .try_extend(index, start, start + len) + .map_err(|e| wrap_column_error(e, col_idx, &field_names))?; + } + Ok(()) }, ) } -pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { - mutable - .child_data - .iter_mut() - .for_each(|child| child.extend_nulls(len)) +pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) -> Result<(), ArrowError> { + let field_names = struct_field_names(&mutable.data_type); + for (col_idx, child) in mutable.child_data.iter_mut().enumerate() { + child + .try_extend_nulls(len) + .map_err(|e| wrap_column_error(e, col_idx, &field_names))?; + } + Ok(()) +} + +fn struct_field_names(data_type: &DataType) -> Vec { + if let DataType::Struct(fields) = data_type { + fields.iter().map(|f| f.name().to_string()).collect() + } else { + vec![] + } +} + +fn wrap_column_error(e: ArrowError, col_idx: usize, field_names: &[String]) -> ArrowError { + let name_ctx = field_names + .get(col_idx) + .map(|n| format!(" (\"{n}\")")) + .unwrap_or_default(); + ArrowError::InvalidArgumentError(format!("struct column {col_idx}{name_ctx} failed: {e}")) } diff --git a/arrow-data/src/transform/union.rs b/arrow-data/src/transform/union.rs index d1301249d326..ba7015f7a984 100644 --- a/arrow-data/src/transform/union.rs +++ b/arrow-data/src/transform/union.rs @@ -17,7 +17,7 @@ use super::{_MutableArrayData, Extend}; use crate::ArrayData; -use arrow_schema::DataType; +use arrow_schema::{ArrowError, DataType}; pub(super) fn build_extend_sparse(array: &ArrayData) -> Extend<'_> { let type_ids = array.buffer::(0); @@ -29,10 +29,10 @@ pub(super) fn build_extend_sparse(array: &ArrayData) -> Extend<'_> { .buffer1 .extend_from_slice(&type_ids[start..start + len]); - mutable - .child_data - .iter_mut() - .for_each(|child| child.extend(index, start, start + len)) + for child in mutable.child_data.iter_mut() { + child.try_extend(index, start, start + len)?; + } + Ok(()) }, ) } @@ -51,7 +51,7 @@ pub(super) fn build_extend_dense(array: &ArrayData) -> Extend<'_> { .buffer1 .extend_from_slice(&type_ids[start..start + len]); - (start..start + len).for_each(|i| { + for i in start..start + len { let type_id = type_ids[i]; let child_index = src_fields .iter() @@ -63,13 +63,17 @@ pub(super) fn build_extend_dense(array: &ArrayData) -> Extend<'_> { // Extend offsets mutable.buffer2.push(dst_offset as i32); - mutable.child_data[child_index].extend(index, src_offset, src_offset + 1) - }) + mutable.child_data[child_index].try_extend(index, src_offset, src_offset + 1)?; + } + Ok(()) }, ) } -pub(super) fn extend_nulls_dense(mutable: &mut _MutableArrayData, len: usize) { +pub(super) fn extend_nulls_dense( + mutable: &mut _MutableArrayData, + len: usize, +) -> Result<(), ArrowError> { let DataType::Union(fields, _) = &mutable.data_type else { unreachable!() }; @@ -86,10 +90,14 @@ pub(super) fn extend_nulls_dense(mutable: &mut _MutableArrayData, len: usize) { let child_offset = mutable.child_data[0].len(); let (start, end) = (child_offset as i32, (child_offset + len) as i32); mutable.buffer2.extend(start..end); - mutable.child_data[0].extend_nulls(len); + mutable.child_data[0].try_extend_nulls(len)?; + Ok(()) } -pub(super) fn extend_nulls_sparse(mutable: &mut _MutableArrayData, len: usize) { +pub(super) fn extend_nulls_sparse( + mutable: &mut _MutableArrayData, + len: usize, +) -> Result<(), ArrowError> { let DataType::Union(fields, _) = &mutable.data_type else { unreachable!() }; @@ -103,8 +111,8 @@ pub(super) fn extend_nulls_sparse(mutable: &mut _MutableArrayData, len: usize) { mutable.buffer1.extend_from_slice(&vec![first_type_id; len]); // Sparse: extend nulls in ALL children - mutable - .child_data - .iter_mut() - .for_each(|child| child.extend_nulls(len)); + for child in mutable.child_data.iter_mut() { + child.try_extend_nulls(len)?; + } + Ok(()) } diff --git a/arrow-data/src/transform/utils.rs b/arrow-data/src/transform/utils.rs index 979738d057fd..294ed94eb7e2 100644 --- a/arrow-data/src/transform/utils.rs +++ b/arrow-data/src/transform/utils.rs @@ -16,6 +16,7 @@ // under the License. use arrow_buffer::{ArrowNativeType, MutableBuffer, bit_util}; +use arrow_schema::ArrowError; use num_integer::Integer; use num_traits::CheckedAdd; @@ -28,21 +29,36 @@ pub(super) fn resize_for_bits(buffer: &mut MutableBuffer, len: usize) { } } -pub(super) fn extend_offsets( +/// Extends `buffer` with the re-based offsets from `offsets`, returning an error on overflow. +pub(super) fn try_extend_offsets( buffer: &mut MutableBuffer, mut last_offset: T, offsets: &[T], -) { +) -> Result<(), ArrowError> { buffer.reserve(std::mem::size_of_val(offsets)); - offsets.windows(2).for_each(|offsets| { - // compute the new offset - let length = offsets[1] - offsets[0]; - // if you hit this appending to a StringArray / BinaryArray it is because you - // are trying to add more data than can fit into that type. Try breaking your data into - // smaller batches or using LargeStringArray / LargeBinaryArray - last_offset = last_offset.checked_add(&length).expect("offset overflow"); - buffer.push(last_offset); - }); + // Snapshot the length so we can roll back partial writes on overflow. + let original_len = buffer.len(); + for window in offsets.windows(2) { + let length = window[1] - window[0]; + match last_offset.checked_add(&length) { + Some(new_offset) => { + last_offset = new_offset; + buffer.push(last_offset); + } + None => { + // Restore the buffer to its state before this call so the + // caller is not left with a partially-written offset sequence. + buffer.resize(original_len, 0); + return Err(ArrowError::InvalidArgumentError( + "offset overflow: data exceeds the capacity of the offset type. \ + Try splitting into smaller batches or using a larger type \ + (e.g. LargeStringArray / LargeBinaryArray instead of StringArray / BinaryArray)" + .to_string(), + )); + } + } + } + Ok(()) } #[inline] @@ -60,13 +76,41 @@ pub(super) unsafe fn get_last_offset(offset_buffer: &Mutable #[cfg(test)] mod tests { - use crate::transform::utils::extend_offsets; + use crate::transform::utils::try_extend_offsets; use arrow_buffer::MutableBuffer; #[test] - #[should_panic(expected = "offset overflow")] - fn test_overflow() { + fn test_overflow_returns_error() { let mut buffer = MutableBuffer::new(10); - extend_offsets(&mut buffer, i32::MAX - 4, &[0, 5]); + let err = try_extend_offsets(&mut buffer, i32::MAX - 4, &[0i32, 5]).unwrap_err(); + assert!( + err.to_string().contains("offset overflow"), + "unexpected error: {err}" + ); + } + + #[test] + fn test_overflow_restores_buffer() { + // Pre-populate the buffer with a known-good offset so we can verify + // it is unchanged after a failed extend. + let mut buffer = MutableBuffer::new(16); + buffer.push(0i32); + buffer.push(10i32); + let len_before = buffer.len(); + + // Offsets [0, 3, i32::MAX]: the second window (3 → i32::MAX) will overflow + // because last_offset (i32::MAX - 4 + 3 = i32::MAX - 1) + (i32::MAX - 3) overflows. + // Use a simpler case: start near MAX so the very first window overflows. + let err = try_extend_offsets(&mut buffer, i32::MAX - 2, &[0i32, 5]).unwrap_err(); + assert!( + err.to_string().contains("offset overflow"), + "unexpected error: {err}" + ); + // Buffer must be exactly as it was before the failed call. + assert_eq!( + buffer.len(), + len_before, + "buffer length changed after overflow rollback" + ); } } diff --git a/arrow-data/src/transform/variable_size.rs b/arrow-data/src/transform/variable_size.rs index ec9dcf1fd1c2..e8fc3f766c9d 100644 --- a/arrow-data/src/transform/variable_size.rs +++ b/arrow-data/src/transform/variable_size.rs @@ -22,7 +22,7 @@ use num_traits::{AsPrimitive, CheckedAdd}; use super::{ _MutableArrayData, Extend, - utils::{extend_offsets, get_last_offset}, + utils::{get_last_offset, try_extend_offsets}, }; #[inline] @@ -52,18 +52,23 @@ pub(super) fn build_extend(offset_buffer, last_offset, &offsets[start..start + len + 1]); + try_extend_offsets::(offset_buffer, last_offset, &offsets[start..start + len + 1])?; // values extend_offset_values::(values_buffer, offsets, values, start, len); + Ok(()) }, ) } -pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { +pub(super) fn extend_nulls( + mutable: &mut _MutableArrayData, + len: usize, +) -> Result<(), arrow_schema::ArrowError> { let offset_buffer = &mut mutable.buffer1; // this is safe due to how offset is built. See details on `get_last_offset` let last_offset: T = unsafe { get_last_offset(offset_buffer) }; - (0..len).for_each(|_| offset_buffer.push(last_offset)) + (0..len).for_each(|_| offset_buffer.push(last_offset)); + Ok(()) } diff --git a/arrow-json/src/reader/run_end_array.rs b/arrow-json/src/reader/run_end_array.rs index 1a007ccbb633..5592d3201c2f 100644 --- a/arrow-json/src/reader/run_end_array.rs +++ b/arrow-json/src/reader/run_end_array.rs @@ -78,7 +78,9 @@ impl ArrayDecoder for RunEndEncodedArrayDecoder { )) })?; run_ends.push(run_end); - mutable.extend(0, run_start, run_start + 1); + mutable + .try_extend(0, run_start, run_start + 1) + .map_err(|e| ArrowError::JsonError(e.to_string()))?; run_start = i; } } @@ -89,7 +91,9 @@ impl ArrayDecoder for RunEndEncodedArrayDecoder { )) })?; run_ends.push(run_end); - mutable.extend(0, run_start, run_start + 1); + mutable + .try_extend(0, run_start, run_start + 1) + .map_err(|e| ArrowError::JsonError(e.to_string()))?; let values_data = mutable.freeze(); let run_ends_data = diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index 19eeeacbfb7b..71b905d9baa4 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -515,7 +515,7 @@ fn concat_fallback(arrays: &[&dyn Array], capacity: Capacities) -> Result Result { - slices - .iter() - .for_each(|(start, end)| mutable.extend(0, *start, *end)); + for (start, end) in slices { + mutable.try_extend(0, *start, *end)?; + } } _ => { let iter = SlicesIterator::new(&predicate.filter); - iter.for_each(|(start, end)| mutable.extend(0, start, end)); + for (start, end) in iter { + mutable.try_extend(0, start, end)?; + } } } diff --git a/arrow-select/src/interleave.rs b/arrow-select/src/interleave.rs index f5904bc171ee..79fada038bd4 100644 --- a/arrow-select/src/interleave.rs +++ b/arrow-select/src/interleave.rs @@ -432,7 +432,7 @@ fn interleave_fallback( } // emit current batch of rows for current buffer - array_data.extend(cur_array, start_row_idx, end_row_idx); + array_data.try_extend(cur_array, start_row_idx, end_row_idx)?; // start new batch of rows cur_array = array; @@ -441,7 +441,7 @@ fn interleave_fallback( } // emit final batch of rows - array_data.extend(cur_array, start_row_idx, end_row_idx); + array_data.try_extend(cur_array, start_row_idx, end_row_idx)?; Ok(make_array(array_data.freeze())) } diff --git a/arrow-select/src/merge.rs b/arrow-select/src/merge.rs index eff3db50ee7c..76958e658f5c 100644 --- a/arrow-select/src/merge.rs +++ b/arrow-select/src/merge.rs @@ -163,11 +163,11 @@ pub fn merge_n(values: &[&dyn Array], indices: &[impl MergeIndex]) -> Result mutable.extend_nulls(slice_length), + None => mutable.try_extend_nulls(slice_length)?, Some(index) => { let start_offset = take_offsets[index]; let end_offset = start_offset + slice_length; - mutable.extend(index, start_offset, end_offset); + mutable.try_extend(index, start_offset, end_offset)?; take_offsets[index] = end_offset; } } @@ -261,18 +261,18 @@ pub fn merge( _ => prep_null_mask_filter(mask).into_parts().0, }; - SlicesIterator::from(&mask_buffer).for_each(|(start, end)| { + for (start, end) in SlicesIterator::from(&mask_buffer) { // the gap needs to be filled with falsy values if start > filled { if falsy_is_scalar { for _ in filled..start { // Copy the first item from the 'falsy' array into the output buffer. - mutable.extend(1, 0, 1); + mutable.try_extend(1, 0, 1)?; } } else { let falsy_length = start - filled; let falsy_end = falsy_offset + falsy_length; - mutable.extend(1, falsy_offset, falsy_end); + mutable.try_extend(1, falsy_offset, falsy_end)?; falsy_offset = falsy_end; } } @@ -280,27 +280,27 @@ pub fn merge( if truthy_is_scalar { for _ in start..end { // Copy the first item from the 'truthy' array into the output buffer. - mutable.extend(0, 0, 1); + mutable.try_extend(0, 0, 1)?; } } else { let truthy_length = end - start; let truthy_end = truthy_offset + truthy_length; - mutable.extend(0, truthy_offset, truthy_end); + mutable.try_extend(0, truthy_offset, truthy_end)?; truthy_offset = truthy_end; } filled = end; - }); + } // the remaining part is falsy if filled < mask.len() { if falsy_is_scalar { for _ in filled..mask.len() { // Copy the first item from the 'falsy' array into the output buffer. - mutable.extend(1, 0, 1); + mutable.try_extend(1, 0, 1)?; } } else { let falsy_length = mask.len() - filled; let falsy_end = falsy_offset + falsy_length; - mutable.extend(1, falsy_offset, falsy_end); + mutable.try_extend(1, falsy_offset, falsy_end)?; } } diff --git a/arrow-select/src/zip.rs b/arrow-select/src/zip.rs index 8702b558d01f..97ecab5a8113 100644 --- a/arrow-select/src/zip.rs +++ b/arrow-select/src/zip.rs @@ -161,38 +161,38 @@ fn zip_impl( let mut filled = 0; let mask_buffer = maybe_prep_null_mask_filter(mask); - SlicesIterator::from(&mask_buffer).for_each(|(start, end)| { + for (start, end) in SlicesIterator::from(&mask_buffer) { // the gap needs to be filled with falsy values if start > filled { if falsy_is_scalar { for _ in filled..start { // Copy the first item from the 'falsy' array into the output buffer. - mutable.extend(1, 0, 1); + mutable.try_extend(1, 0, 1)?; } } else { - mutable.extend(1, filled, start); + mutable.try_extend(1, filled, start)?; } } // fill with truthy values if truthy_is_scalar { for _ in start..end { // Copy the first item from the 'truthy' array into the output buffer. - mutable.extend(0, 0, 1); + mutable.try_extend(0, 0, 1)?; } } else { - mutable.extend(0, start, end); + mutable.try_extend(0, start, end)?; } filled = end; - }); + } // the remaining part is falsy if filled < mask.len() { if falsy_is_scalar { for _ in filled..mask.len() { // Copy the first item from the 'falsy' array into the output buffer. - mutable.extend(1, 0, 1); + mutable.try_extend(1, 0, 1)?; } } else { - mutable.extend(1, filled, mask.len()); + mutable.try_extend(1, filled, mask.len())?; } } diff --git a/arrow/benches/mutable_array.rs b/arrow/benches/mutable_array.rs index 67591194ae6d..9b76f74b6ccf 100644 --- a/arrow/benches/mutable_array.rs +++ b/arrow/benches/mutable_array.rs @@ -42,7 +42,7 @@ fn bench(v1: &T, slices: &[(usize, usize)]) { let data = v1.to_data(); let mut mutable = MutableArrayData::new(vec![&data], false, 5); for (start, end) in slices { - mutable.extend(0, *start, *end) + mutable.try_extend(0, *start, *end).unwrap(); } mutable.freeze(); } diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs index c24d0992a473..f5eeffd1aab7 100644 --- a/arrow/tests/array_transform.rs +++ b/arrow/tests/array_transform.rs @@ -45,8 +45,8 @@ fn test_decimal() { create_decimal_array(vec![Some(1), Some(2), None, Some(3)], 10, 3).into_data(); let arrays = vec![&decimal_array]; let mut a = MutableArrayData::new(arrays, true, 3); - a.extend(0, 0, 3); - a.extend(0, 2, 3); + a.try_extend(0, 0, 3).unwrap(); + a.try_extend(0, 2, 3).unwrap(); let result = a.freeze(); let array = Decimal128Array::from(result); let expected = create_decimal_array(vec![Some(1), Some(2), None, None], 10, 3); @@ -59,7 +59,7 @@ fn test_decimal_offset() { let decimal_array = decimal_array.slice(1, 3).into_data(); // 2, null, 3 let arrays = vec![&decimal_array]; let mut a = MutableArrayData::new(arrays, true, 2); - a.extend(0, 0, 2); // 2, null + a.try_extend(0, 0, 2).unwrap(); // 2, null let result = a.freeze(); let array = Decimal128Array::from(result); let expected = create_decimal_array(vec![Some(2), None], 10, 3); @@ -73,9 +73,9 @@ fn test_decimal_null_offset_nulls() { let decimal_array = decimal_array.slice(1, 3).into_data(); // 2, null, 3 let arrays = vec![&decimal_array]; let mut a = MutableArrayData::new(arrays, true, 2); - a.extend(0, 0, 2); // 2, null - a.extend_nulls(3); // 2, null, null, null, null - a.extend(0, 1, 3); //2, null, null, null, null, null, 3 + a.try_extend(0, 0, 2).unwrap(); // 2, null + a.try_extend_nulls(3).unwrap(); // 2, null, null, null, null + a.try_extend(0, 1, 3).unwrap(); //2, null, null, null, null, null, 3 let result = a.freeze(); let array = Decimal128Array::from(result); let expected = @@ -89,7 +89,7 @@ fn test_primitive() { let b = UInt8Array::from(vec![Some(1), Some(2), Some(3)]).into_data(); let arrays = vec![&b]; let mut a = MutableArrayData::new(arrays, false, 3); - a.extend(0, 0, 2); + a.try_extend(0, 0, 2).unwrap(); let result = a.freeze(); let array = UInt8Array::from(result); let expected = UInt8Array::from(vec![Some(1), Some(2)]); @@ -103,7 +103,7 @@ fn test_primitive_offset() { let b = b.slice(1, 2); let arrays = vec![&b]; let mut a = MutableArrayData::new(arrays, false, 2); - a.extend(0, 0, 2); + a.try_extend(0, 0, 2).unwrap(); let result = a.freeze(); let array = UInt8Array::from(result); let expected = UInt8Array::from(vec![Some(2), Some(3)]); @@ -117,7 +117,7 @@ fn test_primitive_null_offset() { let b = b.slice(1, 2).into_data(); let arrays = vec![&b]; let mut a = MutableArrayData::new(arrays, false, 2); - a.extend(0, 0, 2); + a.try_extend(0, 0, 2).unwrap(); let result = a.freeze(); let array = UInt8Array::from(result); let expected = UInt8Array::from(vec![None, Some(3)]); @@ -130,9 +130,9 @@ fn test_primitive_null_offset_nulls() { let b = b.slice(1, 2); let arrays = vec![&b]; let mut a = MutableArrayData::new(arrays, true, 2); - a.extend(0, 0, 2); - a.extend_nulls(3); - a.extend(0, 1, 2); + a.try_extend(0, 0, 2).unwrap(); + a.try_extend_nulls(3).unwrap(); + a.try_extend(0, 1, 2).unwrap(); let result = a.freeze(); let array = UInt8Array::from(result); let expected = UInt8Array::from(vec![Some(2), Some(3), None, None, None, Some(3)]); @@ -153,7 +153,7 @@ fn test_list_null_offset() { let arrays = vec![&array]; let mut mutable = MutableArrayData::new(arrays, false, 0); - mutable.extend(0, 0, 1); + mutable.try_extend(0, 0, 1).unwrap(); let result = mutable.freeze(); let array = ListArray::from(result); @@ -175,7 +175,7 @@ fn test_variable_sized_nulls() { let mut mutable = MutableArrayData::new(arrays, false, 0); - mutable.extend(0, 1, 3); + mutable.try_extend(0, 1, 3).unwrap(); let result = mutable.freeze(); let result = StringArray::from(result); @@ -195,7 +195,7 @@ fn test_variable_sized_offsets() { let mut mutable = MutableArrayData::new(arrays, false, 0); - mutable.extend(0, 0, 3); + mutable.try_extend(0, 0, 3).unwrap(); let result = mutable.freeze(); let result = StringArray::from(result); @@ -213,7 +213,7 @@ fn test_string_offsets() { let mut mutable = MutableArrayData::new(arrays, false, 0); - mutable.extend(0, 0, 3); + mutable.try_extend(0, 0, 3).unwrap(); let result = mutable.freeze(); let result = StringArray::from(result); @@ -231,8 +231,8 @@ fn test_multiple_with_nulls() { let mut mutable = MutableArrayData::new(arrays, false, 5); - mutable.extend(0, 0, 2); - mutable.extend(1, 0, 2); + mutable.try_extend(0, 0, 2).unwrap(); + mutable.try_extend(1, 0, 2).unwrap(); let result = mutable.freeze(); let result = StringArray::from(result); @@ -250,8 +250,8 @@ fn test_string_null_offset_nulls() { let mut mutable = MutableArrayData::new(arrays, true, 0); - mutable.extend(0, 1, 3); - mutable.extend_nulls(1); + mutable.try_extend(0, 1, 3).unwrap(); + mutable.try_extend_nulls(1).unwrap(); let result = mutable.freeze(); let result = StringArray::from(result); @@ -267,7 +267,7 @@ fn test_bool() { let mut mutable = MutableArrayData::new(arrays, false, 0); - mutable.extend(0, 1, 3); + mutable.try_extend(0, 1, 3).unwrap(); let result = mutable.freeze(); let result = BooleanArray::from(result); @@ -284,8 +284,8 @@ fn test_null() { let mut mutable = MutableArrayData::new(arrays, false, 0); - mutable.extend(0, 1, 3); - mutable.extend(1, 0, 1); + mutable.try_extend(0, 1, 3).unwrap(); + mutable.try_extend(1, 0, 1).unwrap(); let result = mutable.freeze(); let result = NullArray::from(result); @@ -316,7 +316,7 @@ fn test_dictionary() { let mut mutable = MutableArrayData::new(arrays, false, 0); - mutable.extend(0, 1, 3); + mutable.try_extend(0, 1, 3).unwrap(); let result = mutable.freeze(); let result = DictionaryArray::from(result); @@ -348,7 +348,7 @@ fn test_struct() { let arrays = vec![&array]; let mut mutable = MutableArrayData::new(arrays, false, 0); - mutable.extend(0, 1, 3); + mutable.try_extend(0, 1, 3).unwrap(); let data = mutable.freeze(); let array = StructArray::from(data); @@ -381,7 +381,7 @@ fn test_struct_offset() { let arrays = vec![&array]; let mut mutable = MutableArrayData::new(arrays, false, 0); - mutable.extend(0, 1, 3); + mutable.try_extend(0, 1, 3).unwrap(); let data = mutable.freeze(); let array = StructArray::from(data); @@ -416,7 +416,7 @@ fn test_struct_nulls() { let mut mutable = MutableArrayData::new(arrays, false, 0); - mutable.extend(0, 1, 3); + mutable.try_extend(0, 1, 3).unwrap(); let data = mutable.freeze(); let array = StructArray::from(data); @@ -452,8 +452,8 @@ fn test_struct_many() { let arrays = vec![&array, &array]; let mut mutable = MutableArrayData::new(arrays, false, 0); - mutable.extend(0, 1, 3); - mutable.extend(1, 0, 2); + mutable.try_extend(0, 1, 3).unwrap(); + mutable.try_extend(1, 0, 2).unwrap(); let data = mutable.freeze(); let array = StructArray::from(data); @@ -508,7 +508,7 @@ fn test_union_dense() { let mut mutable = MutableArrayData::new(arrays, false, 0); // Slice it by `MutableArrayData` - mutable.extend(0, 4, 7); + mutable.try_extend(0, 4, 7).unwrap(); let data = mutable.freeze(); let array = UnionArray::from(data); @@ -536,8 +536,8 @@ fn test_binary_fixed_sized_offsets() { let mut mutable = MutableArrayData::new(arrays, false, 0); - mutable.extend(0, 1, 2); - mutable.extend(0, 0, 1); + mutable.try_extend(0, 1, 2).unwrap(); + mutable.try_extend(0, 0, 1).unwrap(); let result = mutable.freeze(); let result = FixedSizeBinaryArray::from(result); @@ -571,9 +571,9 @@ fn test_list_append() { let c = b.slice(1, 2); let mut mutable = MutableArrayData::new(vec![&a, &b, &c], false, 1); - mutable.extend(0, 0, a.len()); - mutable.extend(1, 0, b.len()); - mutable.extend(2, 0, c.len()); + mutable.try_extend(0, 0, a.len()).unwrap(); + mutable.try_extend(1, 0, b.len()).unwrap(); + mutable.try_extend(2, 0, c.len()).unwrap(); let finished = mutable.freeze(); @@ -641,10 +641,10 @@ fn test_list_nulls_append() { let mut mutable = MutableArrayData::new(vec![&a, &b, &c, &d], false, 10); - mutable.extend(0, 0, a.len()); - mutable.extend(1, 0, b.len()); - mutable.extend(2, 0, c.len()); - mutable.extend(3, 0, d.len()); + mutable.try_extend(0, 0, a.len()).unwrap(); + mutable.try_extend(1, 0, b.len()).unwrap(); + mutable.try_extend(2, 0, c.len()).unwrap(); + mutable.try_extend(3, 0, d.len()).unwrap(); let result = mutable.freeze(); let expected_int_array = Int64Array::from(vec![ @@ -734,10 +734,10 @@ fn test_map_nulls_append() { let mut mutable = MutableArrayData::new(vec![&a, &b, &c, &d], false, 10); - mutable.extend(0, 0, a.len()); - mutable.extend(1, 0, b.len()); - mutable.extend(2, 0, c.len()); - mutable.extend(3, 0, d.len()); + mutable.try_extend(0, 0, a.len()).unwrap(); + mutable.try_extend(1, 0, b.len()).unwrap(); + mutable.try_extend(2, 0, c.len()).unwrap(); + mutable.try_extend(3, 0, d.len()).unwrap(); let result = mutable.freeze(); let expected_key_array = Int64Array::from(vec![ @@ -907,10 +907,10 @@ fn test_list_of_strings_append() { let mut mutable = MutableArrayData::new(vec![&a, &b], false, 10); - mutable.extend(0, 0, a.len()); - mutable.extend(1, 0, b.len()); - mutable.extend(1, 1, 3); - mutable.extend(1, 0, 0); + mutable.try_extend(0, 0, a.len()).unwrap(); + mutable.try_extend(1, 0, b.len()).unwrap(); + mutable.try_extend(1, 1, 3).unwrap(); + mutable.try_extend(1, 0, 0).unwrap(); let result = mutable.freeze(); let expected_string_array = StringArray::from(vec![ @@ -972,11 +972,11 @@ fn test_fixed_size_binary_append() { let mut mutable = MutableArrayData::new(vec![&a, &b], false, 10); - mutable.extend(0, 0, a.len()); - mutable.extend(1, 0, b.len()); - mutable.extend(1, 1, 4); - mutable.extend(1, 2, 3); - mutable.extend(1, 5, 5); + mutable.try_extend(0, 0, a.len()).unwrap(); + mutable.try_extend(1, 0, b.len()).unwrap(); + mutable.try_extend(1, 1, 4).unwrap(); + mutable.try_extend(1, 2, 3).unwrap(); + mutable.try_extend(1, 5, 5).unwrap(); let result = mutable.freeze(); let expected = vec![ @@ -1009,8 +1009,8 @@ fn test_fixed_size_binary_append() { fn test_extend_nulls() { let int = Int32Array::from(vec![1, 2, 3, 4]).into_data(); let mut mutable = MutableArrayData::new(vec![&int], true, 4); - mutable.extend(0, 2, 3); - mutable.extend_nulls(2); + mutable.try_extend(0, 2, 3).unwrap(); + mutable.try_extend_nulls(2).unwrap(); let data = mutable.freeze(); data.validate_full().unwrap(); @@ -1025,7 +1025,7 @@ fn test_extend_nulls() { fn test_extend_nulls_panic() { let int = Int32Array::from(vec![1, 2, 3, 4]).into_data(); let mut mutable = MutableArrayData::new(vec![&int], false, 4); - mutable.extend_nulls(2); + mutable.try_extend_nulls(2).unwrap(); } #[test] @@ -1043,10 +1043,10 @@ fn test_string_view() { a2.validate_full().unwrap(); let mut mutable = MutableArrayData::new(vec![&a1, &a2], false, 4); - mutable.extend(1, 0, 1); - mutable.extend(0, 1, 2); - mutable.extend(0, 0, 1); - mutable.extend(1, 2, 3); + mutable.try_extend(1, 0, 1).unwrap(); + mutable.try_extend(0, 1, 2).unwrap(); + mutable.try_extend(0, 0, 1).unwrap(); + mutable.try_extend(1, 2, 3).unwrap(); let array = StringViewArray::from(mutable.freeze()); assert_eq!(array.data_buffers().len(), 2); @@ -1102,12 +1102,12 @@ fn test_fixed_size_list_append() { let b = a_builder.finish().into_data(); let mut mutable = MutableArrayData::new(vec![&a, &b], false, 10); - mutable.extend(0, 0, a.len()); - mutable.extend(1, 0, b.len()); + mutable.try_extend(0, 0, a.len()).unwrap(); + mutable.try_extend(1, 0, b.len()).unwrap(); // append array - mutable.extend(1, 1, 4); - mutable.extend(1, 2, 3); + mutable.try_extend(1, 1, 4).unwrap(); + mutable.try_extend(1, 2, 3).unwrap(); let finished = mutable.freeze(); @@ -1176,8 +1176,8 @@ fn test_extend_nulls_sparse_union() { let data = union_array.to_data(); let mut mutable = MutableArrayData::new(vec![&data], true, 4); - mutable.extend(0, 0, 1); // copy the first element - mutable.extend_nulls(2); // add two nulls + mutable.try_extend(0, 0, 1).unwrap(); // copy the first element + mutable.try_extend_nulls(2).unwrap(); // add two nulls let result = mutable.freeze(); // Union arrays must not have a null bitmap per Arrow spec @@ -1220,8 +1220,8 @@ fn test_extend_nulls_dense_union() { let data = union_array.to_data(); let mut mutable = MutableArrayData::new(vec![&data], true, 4); - mutable.extend(0, 0, 1); // copy the first element - mutable.extend_nulls(2); // add two nulls + mutable.try_extend(0, 0, 1).unwrap(); // copy the first element + mutable.try_extend_nulls(2).unwrap(); // add two nulls let result = mutable.freeze(); // Union arrays must not have a null bitmap per Arrow spec diff --git a/parquet/src/arrow/array_reader/fixed_size_list_array.rs b/parquet/src/arrow/array_reader/fixed_size_list_array.rs index 8ef3bd6c2a4b..85a2f0a8f3f5 100644 --- a/parquet/src/arrow/array_reader/fixed_size_list_array.rs +++ b/parquet/src/arrow/array_reader/fixed_size_list_array.rs @@ -156,10 +156,14 @@ impl ArrayReader for FixedSizeListArrayReader { if let Some(start) = start_idx.take() { // Flush pending child items - child_data_builder.extend(0, start, child_idx); + child_data_builder + .try_extend(0, start, child_idx) + .map_err(|e| general_err!("{}", e))?; } // Pad list with nulls - child_data_builder.extend_nulls(self.fixed_size); + child_data_builder + .try_extend_nulls(self.fixed_size) + .map_err(|e| general_err!("{}", e))?; if let Some(validity) = validity.as_mut() { // Valid if empty list @@ -179,7 +183,9 @@ impl ArrayReader for FixedSizeListArrayReader { } Some(start) => { // Flush pending child items - child_data_builder.extend(0, start, child_idx); + child_data_builder + .try_extend(0, start, child_idx) + .map_err(|e| general_err!("{}", e))?; child_data_builder.freeze() } None => child_data_builder.freeze(), diff --git a/parquet/src/arrow/array_reader/list_array.rs b/parquet/src/arrow/array_reader/list_array.rs index 1d5c68c22e11..11ab50d6b6ee 100644 --- a/parquet/src/arrow/array_reader/list_array.rs +++ b/parquet/src/arrow/array_reader/list_array.rs @@ -179,7 +179,9 @@ impl ArrayReader for ListArrayReader { } else { // Flush the current slice of child values if any if let Some(start) = filter_start.take() { - child_data_builder.extend(0, start, cur_offset + skipped); + child_data_builder + .try_extend(0, start, cur_offset + skipped) + .map_err(|e| general_err!("{}", e))?; } if let Some(validity) = validity.as_mut() { @@ -202,7 +204,9 @@ impl ArrayReader for ListArrayReader { } else { // One or more filtered values - must build new array if let Some(start) = filter_start.take() { - child_data_builder.extend(0, start, cur_offset + skipped) + child_data_builder + .try_extend(0, start, cur_offset + skipped) + .map_err(|e| general_err!("{}", e))?; } child_data_builder.freeze()