Skip to main content

parquet/file/page_index/
column_index.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! [`ColumnIndexMetaData`] structures holding decoded [`ColumnIndex`] information
//!
//! [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
//!
22
23use crate::{
24    data_type::{ByteArray, FixedLenByteArray},
25    errors::{ParquetError, Result},
26    parquet_thrift::{
27        ElementType, FieldType, ThriftCompactOutputProtocol, WriteThrift, WriteThriftField,
28    },
29};
30use std::ops::Deref;
31
32use crate::{
33    basic::BoundaryOrder,
34    data_type::{Int96, private::ParquetValueType},
35    file::page_index::index_reader::ThriftColumnIndex,
36};
37
38/// Common bits of the column index
39#[derive(Debug, Clone, PartialEq)]
40pub struct ColumnIndex {
41    pub(crate) null_pages: Vec<bool>,
42    pub(crate) boundary_order: BoundaryOrder,
43    pub(crate) null_counts: Option<Vec<i64>>,
44    pub(crate) repetition_level_histograms: Option<Vec<i64>>,
45    pub(crate) definition_level_histograms: Option<Vec<i64>>,
46}
47
48impl ColumnIndex {
49    /// Returns the number of pages
50    pub fn num_pages(&self) -> u64 {
51        self.null_pages.len() as u64
52    }
53
54    /// Returns the number of null values in the page indexed by `idx`
55    ///
56    /// Returns `None` if no null counts have been set in the index
57    pub fn null_count(&self, idx: usize) -> Option<i64> {
58        self.null_counts.as_ref().map(|nc| nc[idx])
59    }
60
61    /// Returns the repetition level histogram for the page indexed by `idx`
62    pub fn repetition_level_histogram(&self, idx: usize) -> Option<&[i64]> {
63        if let Some(rep_hists) = self.repetition_level_histograms.as_ref() {
64            let num_lvls = rep_hists.len() / self.num_pages() as usize;
65            let start = num_lvls * idx;
66            Some(&rep_hists[start..start + num_lvls])
67        } else {
68            None
69        }
70    }
71
72    /// Returns the definition level histogram for the page indexed by `idx`
73    pub fn definition_level_histogram(&self, idx: usize) -> Option<&[i64]> {
74        if let Some(def_hists) = self.definition_level_histograms.as_ref() {
75            let num_lvls = def_hists.len() / self.num_pages() as usize;
76            let start = num_lvls * idx;
77            Some(&def_hists[start..start + num_lvls])
78        } else {
79            None
80        }
81    }
82
83    /// Returns whether the page indexed by `idx` consists of all null values
84    pub fn is_null_page(&self, idx: usize) -> bool {
85        self.null_pages[idx]
86    }
87}
88
89/// Column index for primitive types
90#[derive(Debug, Clone, PartialEq)]
91pub struct PrimitiveColumnIndex<T> {
92    pub(crate) column_index: ColumnIndex,
93    pub(crate) min_values: Vec<T>,
94    pub(crate) max_values: Vec<T>,
95}
96
97impl<T: ParquetValueType> PrimitiveColumnIndex<T> {
98    pub(crate) fn try_new(
99        null_pages: Vec<bool>,
100        boundary_order: BoundaryOrder,
101        null_counts: Option<Vec<i64>>,
102        repetition_level_histograms: Option<Vec<i64>>,
103        definition_level_histograms: Option<Vec<i64>>,
104        min_bytes: Vec<&[u8]>,
105        max_bytes: Vec<&[u8]>,
106    ) -> Result<Self> {
107        let len = null_pages.len();
108
109        if min_bytes.len() != len || max_bytes.len() != len {
110            return Err(ParquetError::General(format!(
111                "ColumnIndex min/max length mismatch: expected {len}, got min={} max={}",
112                min_bytes.len(),
113                max_bytes.len()
114            )));
115        }
116        if let Some(ref nc) = null_counts {
117            if nc.len() != len {
118                return Err(ParquetError::General(format!(
119                    "ColumnIndex null_counts length mismatch: expected {len}, got {}",
120                    nc.len()
121                )));
122            }
123        }
124        if let Some(ref rep) = repetition_level_histograms {
125            if len != 0 && rep.len() % len != 0 {
126                return Err(ParquetError::General(
127                    "Invalid repetition_level_histograms length".to_string(),
128                ));
129            }
130        }
131        if let Some(ref def) = definition_level_histograms {
132            if len != 0 && def.len() % len != 0 {
133                return Err(ParquetError::General(
134                    "Invalid definition_level_histograms length".to_string(),
135                ));
136            }
137        }
138
139        let mut min_values = Vec::with_capacity(len);
140        let mut max_values = Vec::with_capacity(len);
141
142        for (i, is_null) in null_pages.iter().enumerate().take(len) {
143            if !is_null {
144                let min = min_bytes[i];
145                min_values.push(T::try_from_le_slice(min)?);
146
147                let max = max_bytes[i];
148                max_values.push(T::try_from_le_slice(max)?);
149            } else {
150                // need placeholders
151                min_values.push(Default::default());
152                max_values.push(Default::default());
153            }
154        }
155
156        Ok(Self {
157            column_index: ColumnIndex {
158                null_pages,
159                boundary_order,
160                null_counts,
161                repetition_level_histograms,
162                definition_level_histograms,
163            },
164            min_values,
165            max_values,
166        })
167    }
168
169    pub(super) fn try_from_thrift(index: ThriftColumnIndex) -> Result<Self> {
170        Self::try_new(
171            index.null_pages,
172            index.boundary_order,
173            index.null_counts,
174            index.repetition_level_histograms,
175            index.definition_level_histograms,
176            index.min_values,
177            index.max_values,
178        )
179    }
180}
181
182impl<T> PrimitiveColumnIndex<T> {
183    /// Returns an array containing the min values for each page.
184    ///
185    /// Values in the returned slice are only valid if [`ColumnIndex::is_null_page()`]
186    /// is `false` for the same index.
187    pub fn min_values(&self) -> &[T] {
188        &self.min_values
189    }
190
191    /// Returns an array containing the max values for each page.
192    ///
193    /// Values in the returned slice are only valid if [`ColumnIndex::is_null_page()`]
194    /// is `false` for the same index.
195    pub fn max_values(&self) -> &[T] {
196        &self.max_values
197    }
198
199    /// Returns an iterator over the min values.
200    ///
201    /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`.
202    pub fn min_values_iter(&self) -> impl Iterator<Item = Option<&T>> {
203        self.min_values.iter().enumerate().map(|(i, min)| {
204            if self.is_null_page(i) {
205                None
206            } else {
207                Some(min)
208            }
209        })
210    }
211
212    /// Returns an iterator over the max values.
213    ///
214    /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`.
215    pub fn max_values_iter(&self) -> impl Iterator<Item = Option<&T>> {
216        self.max_values.iter().enumerate().map(|(i, min)| {
217            if self.is_null_page(i) {
218                None
219            } else {
220                Some(min)
221            }
222        })
223    }
224
225    /// Returns the min value for the page indexed by `idx`
226    ///
227    /// It is `None` when all values are null
228    #[inline]
229    pub fn min_value(&self, idx: usize) -> Option<&T> {
230        if self.null_pages[idx] {
231            None
232        } else {
233            Some(&self.min_values[idx])
234        }
235    }
236
237    /// Returns the max value for the page indexed by `idx`
238    ///
239    /// It is `None` when all values are null
240    #[inline]
241    pub fn max_value(&self, idx: usize) -> Option<&T> {
242        if self.null_pages[idx] {
243            None
244        } else {
245            Some(&self.max_values[idx])
246        }
247    }
248}
249
250impl<T> Deref for PrimitiveColumnIndex<T> {
251    type Target = ColumnIndex;
252
253    fn deref(&self) -> &Self::Target {
254        &self.column_index
255    }
256}
257
258impl<T: ParquetValueType> WriteThrift for PrimitiveColumnIndex<T> {
259    const ELEMENT_TYPE: ElementType = ElementType::Struct;
260    fn write_thrift<W: std::io::Write>(
261        &self,
262        writer: &mut ThriftCompactOutputProtocol<W>,
263    ) -> Result<()> {
264        self.null_pages.write_thrift_field(writer, 1, 0)?;
265
266        // need to handle min/max manually
267        let len = self.null_pages.len();
268        writer.write_field_begin(FieldType::List, 2, 1)?;
269        writer.write_list_begin(ElementType::Binary, len)?;
270        for i in 0..len {
271            let min = self.min_value(i).map(|m| m.as_bytes()).unwrap_or(&[]);
272            min.write_thrift(writer)?;
273        }
274        writer.write_field_begin(FieldType::List, 3, 2)?;
275        writer.write_list_begin(ElementType::Binary, len)?;
276        for i in 0..len {
277            let max = self.max_value(i).map(|m| m.as_bytes()).unwrap_or(&[]);
278            max.write_thrift(writer)?;
279        }
280        let mut last_field_id = self.boundary_order.write_thrift_field(writer, 4, 3)?;
281        if self.null_counts.is_some() {
282            last_field_id =
283                self.null_counts
284                    .as_ref()
285                    .unwrap()
286                    .write_thrift_field(writer, 5, last_field_id)?;
287        }
288        if self.repetition_level_histograms.is_some() {
289            last_field_id = self
290                .repetition_level_histograms
291                .as_ref()
292                .unwrap()
293                .write_thrift_field(writer, 6, last_field_id)?;
294        }
295        if self.definition_level_histograms.is_some() {
296            self.definition_level_histograms
297                .as_ref()
298                .unwrap()
299                .write_thrift_field(writer, 7, last_field_id)?;
300        }
301        writer.write_struct_end()
302    }
303}
304
305/// Column index for byte arrays (fixed length and variable)
306#[derive(Debug, Clone, PartialEq)]
307pub struct ByteArrayColumnIndex {
308    pub(crate) column_index: ColumnIndex,
309    // raw bytes for min and max values
310    pub(crate) min_bytes: Vec<u8>,
311    pub(crate) min_offsets: Vec<usize>,
312    pub(crate) max_bytes: Vec<u8>,
313    pub(crate) max_offsets: Vec<usize>,
314}
315
316impl ByteArrayColumnIndex {
317    pub(crate) fn try_new(
318        null_pages: Vec<bool>,
319        boundary_order: BoundaryOrder,
320        null_counts: Option<Vec<i64>>,
321        repetition_level_histograms: Option<Vec<i64>>,
322        definition_level_histograms: Option<Vec<i64>>,
323        min_values: Vec<&[u8]>,
324        max_values: Vec<&[u8]>,
325    ) -> Result<Self> {
326        let len = null_pages.len();
327
328        if min_values.len() != len || max_values.len() != len {
329            return Err(ParquetError::General(format!(
330                "ColumnIndex min/max length mismatch: expected {len}, got min={} max={}",
331                min_values.len(),
332                max_values.len()
333            )));
334        }
335        if let Some(ref nc) = null_counts {
336            if nc.len() != len {
337                return Err(ParquetError::General(format!(
338                    "ColumnIndex null_counts length mismatch: expected {len}, got {}",
339                    nc.len()
340                )));
341            }
342        }
343        if let Some(ref rep) = repetition_level_histograms {
344            if len != 0 && rep.len() % len != 0 {
345                return Err(ParquetError::General(
346                    "Invalid repetition_level_histograms length".to_string(),
347                ));
348            }
349        }
350        if let Some(ref def) = definition_level_histograms {
351            if len != 0 && def.len() % len != 0 {
352                return Err(ParquetError::General(
353                    "Invalid definition_level_histograms length".to_string(),
354                ));
355            }
356        }
357
358        let min_len = min_values.iter().map(|&v| v.len()).sum();
359        let max_len = max_values.iter().map(|&v| v.len()).sum();
360        let mut min_bytes = vec![0u8; min_len];
361        let mut max_bytes = vec![0u8; max_len];
362
363        let mut min_offsets = vec![0usize; len + 1];
364        let mut max_offsets = vec![0usize; len + 1];
365
366        let mut min_pos = 0;
367        let mut max_pos = 0;
368
369        for (i, is_null) in null_pages.iter().enumerate().take(len) {
370            if !is_null {
371                let min = min_values[i];
372                let dst = &mut min_bytes[min_pos..min_pos + min.len()];
373                dst.copy_from_slice(min);
374                min_offsets[i] = min_pos;
375                min_pos += min.len();
376
377                let max = max_values[i];
378                let dst = &mut max_bytes[max_pos..max_pos + max.len()];
379                dst.copy_from_slice(max);
380                max_offsets[i] = max_pos;
381                max_pos += max.len();
382            } else {
383                min_offsets[i] = min_pos;
384                max_offsets[i] = max_pos;
385            }
386        }
387
388        min_offsets[len] = min_pos;
389        max_offsets[len] = max_pos;
390
391        Ok(Self {
392            column_index: ColumnIndex {
393                null_pages,
394                boundary_order,
395                null_counts,
396                repetition_level_histograms,
397                definition_level_histograms,
398            },
399            min_bytes,
400            min_offsets,
401            max_bytes,
402            max_offsets,
403        })
404    }
405
406    pub(super) fn try_from_thrift(index: ThriftColumnIndex) -> Result<Self> {
407        Self::try_new(
408            index.null_pages,
409            index.boundary_order,
410            index.null_counts,
411            index.repetition_level_histograms,
412            index.definition_level_histograms,
413            index.min_values,
414            index.max_values,
415        )
416    }
417
418    /// Returns the min value for the page indexed by `idx`
419    ///
420    /// It is `None` when all values are null
421    pub fn min_value(&self, idx: usize) -> Option<&[u8]> {
422        if self.null_pages[idx] {
423            None
424        } else {
425            let start = self.min_offsets[idx];
426            let end = self.min_offsets[idx + 1];
427            Some(&self.min_bytes[start..end])
428        }
429    }
430
431    /// Returns the max value for the page indexed by `idx`
432    ///
433    /// It is `None` when all values are null
434    pub fn max_value(&self, idx: usize) -> Option<&[u8]> {
435        if self.null_pages[idx] {
436            None
437        } else {
438            let start = self.max_offsets[idx];
439            let end = self.max_offsets[idx + 1];
440            Some(&self.max_bytes[start..end])
441        }
442    }
443
444    /// Returns an iterator over the min values.
445    ///
446    /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`.
447    pub fn min_values_iter(&self) -> impl Iterator<Item = Option<&[u8]>> {
448        (0..self.num_pages() as usize).map(|i| self.min_value(i))
449    }
450
451    /// Returns an iterator over the max values.
452    ///
453    /// Values may be `None` when [`ColumnIndex::is_null_page()`] is `true`.
454    pub fn max_values_iter(&self) -> impl Iterator<Item = Option<&[u8]>> {
455        (0..self.num_pages() as usize).map(|i| self.max_value(i))
456    }
457}
458
459impl Deref for ByteArrayColumnIndex {
460    type Target = ColumnIndex;
461
462    fn deref(&self) -> &Self::Target {
463        &self.column_index
464    }
465}
466
467impl WriteThrift for ByteArrayColumnIndex {
468    const ELEMENT_TYPE: ElementType = ElementType::Struct;
469    fn write_thrift<W: std::io::Write>(
470        &self,
471        writer: &mut ThriftCompactOutputProtocol<W>,
472    ) -> Result<()> {
473        self.null_pages.write_thrift_field(writer, 1, 0)?;
474
475        // need to handle min/max manually
476        let len = self.null_pages.len();
477        writer.write_field_begin(FieldType::List, 2, 1)?;
478        writer.write_list_begin(ElementType::Binary, len)?;
479        for i in 0..len {
480            let min = self.min_value(i).unwrap_or(&[]);
481            min.write_thrift(writer)?;
482        }
483        writer.write_field_begin(FieldType::List, 3, 2)?;
484        writer.write_list_begin(ElementType::Binary, len)?;
485        for i in 0..len {
486            let max = self.max_value(i).unwrap_or(&[]);
487            max.write_thrift(writer)?;
488        }
489        let mut last_field_id = self.boundary_order.write_thrift_field(writer, 4, 3)?;
490        if self.null_counts.is_some() {
491            last_field_id =
492                self.null_counts
493                    .as_ref()
494                    .unwrap()
495                    .write_thrift_field(writer, 5, last_field_id)?;
496        }
497        if self.repetition_level_histograms.is_some() {
498            last_field_id = self
499                .repetition_level_histograms
500                .as_ref()
501                .unwrap()
502                .write_thrift_field(writer, 6, last_field_id)?;
503        }
504        if self.definition_level_histograms.is_some() {
505            self.definition_level_histograms
506                .as_ref()
507                .unwrap()
508                .write_thrift_field(writer, 7, last_field_id)?;
509        }
510        writer.write_struct_end()
511    }
512}
513
// Forwards a getter call to whichever typed index a ColumnIndexMetaData holds.
//
// Accepts `(self, method)` or `(self, method, arg)` and expands to a `match`
// over every variant. Must be used inside an `impl ColumnIndexMetaData` block
// because the expansion refers to `Self`. Panics when invoked on `NONE`.
macro_rules! colidx_enum_func {
    ($self:ident, $func:ident $(, $arg:ident)?) => {{
        match $self {
            Self::BOOLEAN(typed) => typed.$func($($arg)?),
            Self::INT32(typed) => typed.$func($($arg)?),
            Self::INT64(typed) => typed.$func($($arg)?),
            Self::INT96(typed) => typed.$func($($arg)?),
            Self::FLOAT(typed) => typed.$func($($arg)?),
            Self::DOUBLE(typed) => typed.$func($($arg)?),
            Self::BYTE_ARRAY(typed) => typed.$func($($arg)?),
            Self::FIXED_LEN_BYTE_ARRAY(typed) => typed.$func($($arg)?),
            Self::NONE => panic!(concat!(
                "Cannot call ",
                stringify!($func),
                " on ColumnIndexMetaData::NONE"
            )),
        }
    }};
}
551
552/// Parsed [`ColumnIndex`] information for a Parquet file.
553///
554/// See [`ParquetColumnIndex`] for more information.
555///
556/// [`ParquetColumnIndex`]: crate::file::metadata::ParquetColumnIndex
557/// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
558#[derive(Debug, Clone, PartialEq)]
559#[allow(non_camel_case_types)]
560pub enum ColumnIndexMetaData {
561    /// Sometimes reading page index from parquet file
562    /// will only return pageLocations without min_max index,
563    /// `NONE` represents this lack of index information
564    NONE,
565    /// Boolean type index
566    BOOLEAN(PrimitiveColumnIndex<bool>),
567    /// 32-bit integer type index
568    INT32(PrimitiveColumnIndex<i32>),
569    /// 64-bit integer type index
570    INT64(PrimitiveColumnIndex<i64>),
571    /// 96-bit integer type (timestamp) index
572    INT96(PrimitiveColumnIndex<Int96>),
573    /// 32-bit floating point type index
574    FLOAT(PrimitiveColumnIndex<f32>),
575    /// 64-bit floating point type index
576    DOUBLE(PrimitiveColumnIndex<f64>),
577    /// Byte array type index
578    BYTE_ARRAY(ByteArrayColumnIndex),
579    /// Fixed length byte array type index
580    FIXED_LEN_BYTE_ARRAY(ByteArrayColumnIndex),
581}
582
583impl ColumnIndexMetaData {
584    /// Return min/max elements inside ColumnIndex are ordered or not.
585    pub fn is_sorted(&self) -> bool {
586        // 0:UNORDERED, 1:ASCENDING ,2:DESCENDING,
587        if let Some(order) = self.get_boundary_order() {
588            order != BoundaryOrder::UNORDERED
589        } else {
590            false
591        }
592    }
593
594    /// Get boundary_order of this page index.
595    pub fn get_boundary_order(&self) -> Option<BoundaryOrder> {
596        match self {
597            Self::NONE => None,
598            Self::BOOLEAN(index) => Some(index.boundary_order),
599            Self::INT32(index) => Some(index.boundary_order),
600            Self::INT64(index) => Some(index.boundary_order),
601            Self::INT96(index) => Some(index.boundary_order),
602            Self::FLOAT(index) => Some(index.boundary_order),
603            Self::DOUBLE(index) => Some(index.boundary_order),
604            Self::BYTE_ARRAY(index) => Some(index.boundary_order),
605            Self::FIXED_LEN_BYTE_ARRAY(index) => Some(index.boundary_order),
606        }
607    }
608
609    /// Returns array of null counts, one per page.
610    ///
611    /// Returns `None` if now null counts have been set in the index
612    pub fn null_counts(&self) -> Option<&Vec<i64>> {
613        match self {
614            Self::NONE => None,
615            Self::BOOLEAN(index) => index.null_counts.as_ref(),
616            Self::INT32(index) => index.null_counts.as_ref(),
617            Self::INT64(index) => index.null_counts.as_ref(),
618            Self::INT96(index) => index.null_counts.as_ref(),
619            Self::FLOAT(index) => index.null_counts.as_ref(),
620            Self::DOUBLE(index) => index.null_counts.as_ref(),
621            Self::BYTE_ARRAY(index) => index.null_counts.as_ref(),
622            Self::FIXED_LEN_BYTE_ARRAY(index) => index.null_counts.as_ref(),
623        }
624    }
625
626    /// Returns the number of pages
627    pub fn num_pages(&self) -> u64 {
628        colidx_enum_func!(self, num_pages)
629    }
630
631    /// Returns the number of null values in the page indexed by `idx`
632    ///
633    /// Returns `None` if no null counts have been set in the index
634    pub fn null_count(&self, idx: usize) -> Option<i64> {
635        colidx_enum_func!(self, null_count, idx)
636    }
637
638    /// Returns the repetition level histogram for the page indexed by `idx`
639    pub fn repetition_level_histogram(&self, idx: usize) -> Option<&[i64]> {
640        colidx_enum_func!(self, repetition_level_histogram, idx)
641    }
642
643    /// Returns the definition level histogram for the page indexed by `idx`
644    pub fn definition_level_histogram(&self, idx: usize) -> Option<&[i64]> {
645        colidx_enum_func!(self, definition_level_histogram, idx)
646    }
647
648    /// Returns whether the page indexed by `idx` consists of all null values
649    #[inline]
650    pub fn is_null_page(&self, idx: usize) -> bool {
651        colidx_enum_func!(self, is_null_page, idx)
652    }
653}
654
655/// Provides iterators over min and max values of a [`ColumnIndexMetaData`]
656pub trait ColumnIndexIterators {
657    /// Can be one of `bool`, `i32`, `i64`, `Int96`, `f32`, `f64`, [`ByteArray`],
658    /// or [`FixedLenByteArray`]
659    type Item;
660
661    /// Return iterator over the min values for the index
662    fn min_values_iter(colidx: &ColumnIndexMetaData) -> impl Iterator<Item = Option<Self::Item>>;
663
664    /// Return iterator over the max values for the index
665    fn max_values_iter(colidx: &ColumnIndexMetaData) -> impl Iterator<Item = Option<Self::Item>>;
666}
667
// Implements `ColumnIndexIterators` for the value type `$item`.
//
// `$variant` names the `ColumnIndexMetaData` variant holding that type and
// `$conv` converts each optional borrowed value into an owned `$item`.
// The generated functions panic when handed any other variant.
macro_rules! column_index_iters {
    ($item: ident, $variant: ident, $conv:expr) => {
        impl ColumnIndexIterators for $item {
            type Item = $item;

            fn min_values_iter(
                colidx: &ColumnIndexMetaData,
            ) -> impl Iterator<Item = Option<Self::Item>> {
                let ColumnIndexMetaData::$variant(index) = colidx else {
                    panic!(concat!("Wrong type for ", stringify!($item), " iterator"))
                };
                index.min_values_iter().map($conv)
            }

            fn max_values_iter(
                colidx: &ColumnIndexMetaData,
            ) -> impl Iterator<Item = Option<Self::Item>> {
                let ColumnIndexMetaData::$variant(index) = colidx else {
                    panic!(concat!("Wrong type for ", stringify!($item), " iterator"))
                };
                index.max_values_iter().map($conv)
            }
        }
    };
}
695
696column_index_iters!(bool, BOOLEAN, |v| v.copied());
697column_index_iters!(i32, INT32, |v| v.copied());
698column_index_iters!(i64, INT64, |v| v.copied());
699column_index_iters!(Int96, INT96, |v| v.copied());
700column_index_iters!(f32, FLOAT, |v| v.copied());
701column_index_iters!(f64, DOUBLE, |v| v.copied());
702column_index_iters!(ByteArray, BYTE_ARRAY, |v| v
703    .map(|v| ByteArray::from(v.to_owned())));
704column_index_iters!(FixedLenByteArray, FIXED_LEN_BYTE_ARRAY, |v| v
705    .map(|v| FixedLenByteArray::from(v.to_owned())));
706
707impl WriteThrift for ColumnIndexMetaData {
708    const ELEMENT_TYPE: ElementType = ElementType::Struct;
709
710    fn write_thrift<W: std::io::Write>(
711        &self,
712        writer: &mut ThriftCompactOutputProtocol<W>,
713    ) -> Result<()> {
714        match self {
715            ColumnIndexMetaData::BOOLEAN(index) => index.write_thrift(writer),
716            ColumnIndexMetaData::INT32(index) => index.write_thrift(writer),
717            ColumnIndexMetaData::INT64(index) => index.write_thrift(writer),
718            ColumnIndexMetaData::INT96(index) => index.write_thrift(writer),
719            ColumnIndexMetaData::FLOAT(index) => index.write_thrift(writer),
720            ColumnIndexMetaData::DOUBLE(index) => index.write_thrift(writer),
721            ColumnIndexMetaData::BYTE_ARRAY(index) => index.write_thrift(writer),
722            ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index) => index.write_thrift(writer),
723            _ => Err(general_err!("Cannot serialize NONE index")),
724        }
725    }
726}
727
#[cfg(test)]
mod tests {
    use super::*;

    /// A single non-null page exposes its min/max, null count and histograms.
    #[test]
    fn test_page_index_min_max_null() {
        let common = ColumnIndex {
            null_pages: vec![false],
            boundary_order: BoundaryOrder::ASCENDING,
            null_counts: Some(vec![0]),
            repetition_level_histograms: Some(vec![1, 2]),
            definition_level_histograms: Some(vec![1, 2, 3]),
        };
        let index = PrimitiveColumnIndex {
            column_index: common,
            min_values: vec![-123],
            max_values: vec![234],
        };

        assert_eq!(index.min_value(0), Some(&-123));
        assert_eq!(index.max_value(0), Some(&234));
        assert_eq!(index.null_count(0), Some(0));
        assert_eq!(index.repetition_level_histogram(0).unwrap(), &[1, 2]);
        assert_eq!(index.definition_level_histogram(0).unwrap(), &[1, 2, 3]);
    }

    /// A null page reports no min/max, and an absent histogram yields `None`.
    #[test]
    fn test_page_index_min_max_null_none() {
        let common = ColumnIndex {
            null_pages: vec![true],
            boundary_order: BoundaryOrder::ASCENDING,
            null_counts: Some(vec![1]),
            repetition_level_histograms: None,
            definition_level_histograms: Some(vec![1, 0]),
        };
        let index = PrimitiveColumnIndex::<i32> {
            column_index: common,
            min_values: vec![Default::default()],
            max_values: vec![Default::default()],
        };

        assert_eq!(index.min_value(0), None);
        assert_eq!(index.max_value(0), None);
        assert_eq!(index.null_count(0), Some(1));
        assert_eq!(index.repetition_level_histogram(0), None);
        assert_eq!(index.definition_level_histogram(0).unwrap(), &[1, 0]);
    }

    /// Empty min/max bytes for a non-null page are rejected during conversion.
    #[test]
    fn test_invalid_column_index() {
        let thrift_index = ThriftColumnIndex {
            null_pages: vec![true, false],
            min_values: vec![
                &[],
                &[], // this shouldn't be empty as null_pages[1] is false
            ],
            max_values: vec![
                &[],
                &[], // this shouldn't be empty as null_pages[1] is false
            ],
            null_counts: None,
            repetition_level_histograms: None,
            definition_level_histograms: None,
            boundary_order: BoundaryOrder::UNORDERED,
        };

        let result = PrimitiveColumnIndex::<i32>::try_from_thrift(thrift_index);
        let message = result.unwrap_err().to_string();
        assert_eq!(
            message,
            "Parquet error: error converting value, expected 4 bytes got 0"
        );
    }

    /// The min/max lists must have exactly one entry per page.
    #[test]
    fn test_column_index_rejects_mismatched_min_max_lengths() {
        // Two pages, but only one min/max entry. The entry itself is valid i32 bytes,
        // so this specifically checks that lengths must match the number of pages.
        let thrift_index = ThriftColumnIndex {
            null_pages: vec![false, false],
            min_values: vec![&[1u8, 0, 0, 0]],
            max_values: vec![&[10u8, 0, 0, 0]],
            null_counts: None,
            repetition_level_histograms: None,
            definition_level_histograms: None,
            boundary_order: BoundaryOrder::UNORDERED,
        };

        // ColumnIndex arrays must align with the number of pages (null_pages.len()),
        // so the conversion has to fail with a length mismatch.
        let result = PrimitiveColumnIndex::<i32>::try_from_thrift(thrift_index);
        let message = result.unwrap_err().to_string();
        assert!(message.contains("length mismatch"));
    }
}