arrow_schema/fields.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::ops::Deref;
19use std::sync::Arc;
20
21use crate::{ArrowError, DataType, Field, FieldRef};
22
23/// A cheaply cloneable, owned slice of [`FieldRef`]
24///
25/// Similar to `Arc<Vec<FieldRef>>` or `Arc<[FieldRef]>`
26///
27/// Can be constructed in a number of ways
28///
29/// ```
30/// # use std::sync::Arc;
31/// # use arrow_schema::{DataType, Field, Fields, SchemaBuilder};
32/// // Can be constructed from Vec<Field>
33/// Fields::from(vec![Field::new("a", DataType::Boolean, false)]);
34/// // Can be constructed from Vec<FieldRef>
35/// Fields::from(vec![Arc::new(Field::new("a", DataType::Boolean, false))]);
36/// // Can be constructed from an iterator of Field
37/// std::iter::once(Field::new("a", DataType::Boolean, false)).collect::<Fields>();
38/// // Can be constructed from an iterator of FieldRef
39/// std::iter::once(Arc::new(Field::new("a", DataType::Boolean, false))).collect::<Fields>();
40/// ```
41///
42/// See [`SchemaBuilder`] for mutating or updating [`Fields`]
43///
44/// ```
45/// # use arrow_schema::{DataType, Field, SchemaBuilder};
46/// let mut builder = SchemaBuilder::new();
47/// builder.push(Field::new("a", DataType::Boolean, false));
48/// builder.push(Field::new("b", DataType::Boolean, false));
49/// let fields = builder.finish().fields;
50///
51/// let mut builder = SchemaBuilder::from(&fields);
52/// builder.remove(0);
53/// let new = builder.finish().fields;
54/// ```
55///
56/// [`SchemaBuilder`]: crate::SchemaBuilder
57#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
58#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
59#[cfg_attr(feature = "serde", serde(transparent))]
60pub struct Fields(Arc<[FieldRef]>);
61
62impl std::fmt::Debug for Fields {
63 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
64 self.0.as_ref().fmt(f)
65 }
66}
67
68impl Fields {
69 /// Returns a new empty [`Fields`]
70 pub fn empty() -> Self {
71 Self(Arc::new([]))
72 }
73
74 /// Return size of this instance in bytes.
75 pub fn size(&self) -> usize {
76 self.iter()
77 .map(|field| field.size() + std::mem::size_of::<FieldRef>())
78 .sum()
79 }
80
81 /// Searches for a field by name, returning it along with its index if found
82 pub fn find(&self, name: &str) -> Option<(usize, &FieldRef)> {
83 self.0.iter().enumerate().find(|(_, b)| b.name() == name)
84 }
85
86 /// Check to see if `self` is a superset of `other`
87 ///
88 /// In particular returns true if both have the same number of fields, and [`Field::contains`]
89 /// for each field across self and other
90 ///
91 /// In other words, any record that conforms to `other` should also conform to `self`
92 pub fn contains(&self, other: &Fields) -> bool {
93 if Arc::ptr_eq(&self.0, &other.0) {
94 return true;
95 }
96 self.len() == other.len()
97 && self
98 .iter()
99 .zip(other.iter())
100 .all(|(a, b)| Arc::ptr_eq(a, b) || a.contains(b))
101 }
102
103 /// Returns a copy of this [`Fields`] containing only those [`FieldRef`] passing a predicate
104 ///
105 /// Performs a depth-first scan of [`Fields`] invoking `filter` for each [`FieldRef`]
106 /// containing no child [`FieldRef`], a leaf field, along with a count of the number
107 /// of such leaves encountered so far. Only [`FieldRef`] for which `filter`
108 /// returned `true` will be included in the result.
109 ///
110 /// This can therefore be used to select a subset of fields from nested types
111 /// such as [`DataType::Struct`] or [`DataType::List`].
112 ///
113 /// ```
114 /// # use arrow_schema::{DataType, Field, Fields};
115 /// let fields = Fields::from(vec![
116 /// Field::new("a", DataType::Int32, true), // Leaf 0
117 /// Field::new("b", DataType::Struct(Fields::from(vec![
118 /// Field::new("c", DataType::Float32, false), // Leaf 1
119 /// Field::new("d", DataType::Float64, false), // Leaf 2
120 /// Field::new("e", DataType::Struct(Fields::from(vec![
121 /// Field::new("f", DataType::Int32, false), // Leaf 3
122 /// Field::new("g", DataType::Float16, false), // Leaf 4
123 /// ])), true),
124 /// ])), false)
125 /// ]);
126 /// let filtered = fields.filter_leaves(|idx, _| [0, 2, 3, 4].contains(&idx));
127 /// let expected = Fields::from(vec![
128 /// Field::new("a", DataType::Int32, true),
129 /// Field::new("b", DataType::Struct(Fields::from(vec![
130 /// Field::new("d", DataType::Float64, false),
131 /// Field::new("e", DataType::Struct(Fields::from(vec![
132 /// Field::new("f", DataType::Int32, false),
133 /// Field::new("g", DataType::Float16, false),
134 /// ])), true),
135 /// ])), false)
136 /// ]);
137 /// assert_eq!(filtered, expected);
138 /// ```
139 pub fn filter_leaves<F: FnMut(usize, &FieldRef) -> bool>(&self, mut filter: F) -> Self {
140 self.try_filter_leaves(|idx, field| Ok(filter(idx, field)))
141 .unwrap()
142 }
143
144 /// Returns a copy of this [`Fields`] containing only those [`FieldRef`] passing a predicate
145 /// or an error if the predicate fails.
146 ///
147 /// See [`Fields::filter_leaves`] for more information.
148 pub fn try_filter_leaves<F: FnMut(usize, &FieldRef) -> Result<bool, ArrowError>>(
149 &self,
150 mut filter: F,
151 ) -> Result<Self, ArrowError> {
152 fn filter_field<F: FnMut(&FieldRef) -> Result<bool, ArrowError>>(
153 f: &FieldRef,
154 filter: &mut F,
155 ) -> Result<Option<FieldRef>, ArrowError> {
156 use DataType::*;
157
158 let v = match f.data_type() {
159 Dictionary(_, v) => v.as_ref(), // Key must be integer
160 RunEndEncoded(_, v) => v.data_type(), // Run-ends must be integer
161 d => d,
162 };
163 let d = match v {
164 List(child) => {
165 let fields = filter_field(child, filter)?;
166 if let Some(fields) = fields {
167 List(fields)
168 } else {
169 return Ok(None);
170 }
171 }
172 LargeList(child) => {
173 let fields = filter_field(child, filter)?;
174 if let Some(fields) = fields {
175 LargeList(fields)
176 } else {
177 return Ok(None);
178 }
179 }
180 Map(child, ordered) => {
181 let fields = filter_field(child, filter)?;
182 if let Some(fields) = fields {
183 Map(fields, *ordered)
184 } else {
185 return Ok(None);
186 }
187 }
188 FixedSizeList(child, size) => {
189 let fields = filter_field(child, filter)?;
190 if let Some(fields) = fields {
191 FixedSizeList(fields, *size)
192 } else {
193 return Ok(None);
194 }
195 }
196 Struct(fields) => {
197 let filtered: Result<Vec<_>, _> =
198 fields.iter().map(|f| filter_field(f, filter)).collect();
199 let filtered: Fields = filtered?
200 .iter()
201 .filter_map(|f| f.as_ref().cloned())
202 .collect();
203
204 if filtered.is_empty() {
205 return Ok(None);
206 }
207
208 Struct(filtered)
209 }
210 Union(fields, mode) => {
211 let filtered: Result<Vec<_>, _> = fields
212 .iter()
213 .map(|(id, f)| filter_field(f, filter).map(|f| f.map(|f| (id, f))))
214 .collect();
215 let filtered: UnionFields = filtered?
216 .iter()
217 .filter_map(|f| f.as_ref().cloned())
218 .collect();
219
220 if filtered.is_empty() {
221 return Ok(None);
222 }
223
224 Union(filtered, *mode)
225 }
226 _ => {
227 let filtered = filter(f)?;
228 return Ok(filtered.then(|| f.clone()));
229 }
230 };
231 let d = match f.data_type() {
232 Dictionary(k, _) => Dictionary(k.clone(), Box::new(d)),
233 RunEndEncoded(v, f) => {
234 RunEndEncoded(v.clone(), Arc::new(f.as_ref().clone().with_data_type(d)))
235 }
236 _ => d,
237 };
238 Ok(Some(Arc::new(f.as_ref().clone().with_data_type(d))))
239 }
240
241 let mut leaf_idx = 0;
242 let mut filter = |f: &FieldRef| {
243 let t = filter(leaf_idx, f)?;
244 leaf_idx += 1;
245 Ok(t)
246 };
247
248 let filtered: Result<Vec<_>, _> = self
249 .0
250 .iter()
251 .map(|f| filter_field(f, &mut filter))
252 .collect();
253 let filtered = filtered?
254 .iter()
255 .filter_map(|f| f.as_ref().cloned())
256 .collect();
257 Ok(filtered)
258 }
259}
260
261impl Default for Fields {
262 fn default() -> Self {
263 Self::empty()
264 }
265}
266
267impl FromIterator<Field> for Fields {
268 fn from_iter<T: IntoIterator<Item = Field>>(iter: T) -> Self {
269 iter.into_iter().map(Arc::new).collect()
270 }
271}
272
273impl FromIterator<FieldRef> for Fields {
274 fn from_iter<T: IntoIterator<Item = FieldRef>>(iter: T) -> Self {
275 Self(iter.into_iter().collect())
276 }
277}
278
279impl From<Vec<Field>> for Fields {
280 fn from(value: Vec<Field>) -> Self {
281 value.into_iter().collect()
282 }
283}
284
285impl From<Vec<FieldRef>> for Fields {
286 fn from(value: Vec<FieldRef>) -> Self {
287 Self(value.into())
288 }
289}
290
291impl From<&[FieldRef]> for Fields {
292 fn from(value: &[FieldRef]) -> Self {
293 Self(value.into())
294 }
295}
296
297impl<const N: usize> From<[FieldRef; N]> for Fields {
298 fn from(value: [FieldRef; N]) -> Self {
299 Self(Arc::new(value))
300 }
301}
302
303impl Deref for Fields {
304 type Target = [FieldRef];
305
306 fn deref(&self) -> &Self::Target {
307 self.0.as_ref()
308 }
309}
310
311impl<'a> IntoIterator for &'a Fields {
312 type Item = &'a FieldRef;
313 type IntoIter = std::slice::Iter<'a, FieldRef>;
314
315 fn into_iter(self) -> Self::IntoIter {
316 self.0.iter()
317 }
318}
319
320/// A cheaply cloneable, owned collection of [`FieldRef`] and their corresponding type ids
321#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
322#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
323#[cfg_attr(feature = "serde", serde(transparent))]
324pub struct UnionFields(Arc<[(i8, FieldRef)]>);
325
326impl std::fmt::Debug for UnionFields {
327 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
328 self.0.as_ref().fmt(f)
329 }
330}
331
332/// Allows direct indexing into [`UnionFields`] to access fields by position.
333///
334/// # Panics
335///
336/// Panics if the index is out of bounds. Note that [`UnionFields`] supports
337/// a maximum of 128 fields, as type IDs are represented as `i8` values.
338///
339/// For a non-panicking alternative, use [`UnionFields::get`].
340impl std::ops::Index<usize> for UnionFields {
341 type Output = (i8, FieldRef);
342
343 fn index(&self, index: usize) -> &Self::Output {
344 &self.0[index]
345 }
346}
347
348impl UnionFields {
349 /// Create a new [`UnionFields`] with no fields
350 pub fn empty() -> Self {
351 Self(Arc::from([]))
352 }
353
354 /// Create a new [`UnionFields`] from a [`Fields`] and array of type_ids
355 ///
356 /// See <https://arrow.apache.org/docs/format/Columnar.html#union-layout>
357 ///
358 /// ```
359 /// use arrow_schema::{DataType, Field, UnionFields};
360 /// // Create a new UnionFields with type id mapping
361 /// // 1 -> DataType::UInt8
362 /// // 3 -> DataType::Utf8
363 /// UnionFields::new(
364 /// vec![1, 3],
365 /// vec![
366 /// Field::new("field1", DataType::UInt8, false),
367 /// Field::new("field3", DataType::Utf8, false),
368 /// ],
369 /// );
370 /// ```
371 pub fn new<F, T>(type_ids: T, fields: F) -> Self
372 where
373 F: IntoIterator,
374 F::Item: Into<FieldRef>,
375 T: IntoIterator<Item = i8>,
376 {
377 let fields = fields.into_iter().map(Into::into);
378 let mut set = 0_u128;
379 type_ids
380 .into_iter()
381 .inspect(|&idx| {
382 let mask = 1_u128 << idx;
383 if (set & mask) != 0 {
384 panic!("duplicate type id: {idx}");
385 } else {
386 set |= mask;
387 }
388 })
389 .zip(fields)
390 .collect()
391 }
392
393 /// Return size of this instance in bytes.
394 pub fn size(&self) -> usize {
395 self.iter()
396 .map(|(_, field)| field.size() + std::mem::size_of::<(i8, FieldRef)>())
397 .sum()
398 }
399
400 /// Returns the number of fields in this [`UnionFields`]
401 pub fn len(&self) -> usize {
402 self.0.len()
403 }
404
405 /// Returns `true` if this is empty
406 pub fn is_empty(&self) -> bool {
407 self.0.is_empty()
408 }
409
410 /// Returns an iterator over the fields and type ids in this [`UnionFields`]
411 pub fn iter(&self) -> impl Iterator<Item = (i8, &FieldRef)> + '_ {
412 self.0.iter().map(|(id, f)| (*id, f))
413 }
414
415 /// Returns a reference to the field at the given index, or `None` if out of bounds.
416 ///
417 /// This is a safe alternative to direct indexing via `[]`.
418 ///
419 /// # Example
420 ///
421 /// ```
422 /// use arrow_schema::{DataType, Field, UnionFields};
423 ///
424 /// let fields = UnionFields::new(
425 /// vec![1, 3],
426 /// vec![
427 /// Field::new("field1", DataType::UInt8, false),
428 /// Field::new("field3", DataType::Utf8, false),
429 /// ],
430 /// );
431 ///
432 /// assert!(fields.get(0).is_some());
433 /// assert!(fields.get(1).is_some());
434 /// assert!(fields.get(2).is_none());
435 /// ```
436 pub fn get(&self, index: usize) -> Option<&(i8, FieldRef)> {
437 self.0.get(index)
438 }
439
440 /// Searches for a field by its type id, returning the type id and field reference if found.
441 /// Returns `None` if no field with the given type id exists.
442 pub fn find_by_type_id(&self, type_id: i8) -> Option<(i8, &FieldRef)> {
443 self.iter().find(|&(i, _)| i == type_id)
444 }
445
446 /// Searches for a field by value equality, returning its type id and reference if found.
447 /// Returns `None` if no matching field exists in this [`UnionFields`].
448 pub fn find_by_field(&self, field: &Field) -> Option<(i8, &FieldRef)> {
449 self.iter().find(|&(_, f)| f.as_ref() == field)
450 }
451
452 /// Merge this field into self if it is compatible.
453 ///
454 /// See [`Field::try_merge`]
455 pub(crate) fn try_merge(&mut self, other: &Self) -> Result<(), ArrowError> {
456 // TODO: This currently may produce duplicate type IDs (#3982)
457 let mut output: Vec<_> = self.iter().map(|(id, f)| (id, f.clone())).collect();
458 for (field_type_id, from_field) in other.iter() {
459 let mut is_new_field = true;
460 for (self_type_id, self_field) in output.iter_mut() {
461 if from_field == self_field {
462 // If the nested fields in two unions are the same, they must have same
463 // type id.
464 if *self_type_id != field_type_id {
465 return Err(ArrowError::SchemaError(format!(
466 "Fail to merge schema field '{}' because the self_type_id = {} does not equal field_type_id = {}",
467 self_field.name(),
468 self_type_id,
469 field_type_id
470 )));
471 }
472
473 is_new_field = false;
474 break;
475 }
476 }
477
478 if is_new_field {
479 output.push((field_type_id, from_field.clone()))
480 }
481 }
482 *self = output.into_iter().collect();
483 Ok(())
484 }
485}
486
487impl FromIterator<(i8, FieldRef)> for UnionFields {
488 fn from_iter<T: IntoIterator<Item = (i8, FieldRef)>>(iter: T) -> Self {
489 // TODO: Should this validate type IDs are unique (#3982)
490 Self(iter.into_iter().collect())
491 }
492}
493
#[cfg(test)]
mod tests {
    use super::*;
    use crate::UnionMode;

    /// Exercises leaf filtering across every nested type handled by
    /// `try_filter_leaves`; the comments give the depth-first leaf indices
    #[test]
    fn test_filter() {
        let floats = Fields::from(vec![
            Field::new("a", DataType::Float32, false),
            Field::new("b", DataType::Float32, false),
        ]);
        let fields = Fields::from(vec![
            // Leaf 0
            Field::new("a", DataType::Int32, true),
            // Leaves 1 and 2
            Field::new("floats", DataType::Struct(floats.clone()), true),
            // Leaf 3
            Field::new("b", DataType::Int16, true),
            // Leaf 4
            Field::new(
                "c",
                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
                false,
            ),
            // Leaves 5 and 6
            Field::new(
                "d",
                DataType::Dictionary(
                    Box::new(DataType::Int32),
                    Box::new(DataType::Struct(floats.clone())),
                ),
                false,
            ),
            // Leaves 7 and 8
            Field::new_list(
                "e",
                Field::new("floats", DataType::Struct(floats.clone()), true),
                true,
            ),
            // Leaf 9
            Field::new_fixed_size_list(
                "f",
                Field::new_list_field(DataType::Int32, false),
                3,
                false,
            ),
            // Leaves 10 and 11
            Field::new_map(
                "g",
                "entries",
                Field::new("keys", DataType::LargeUtf8, false),
                Field::new("values", DataType::Int32, true),
                false,
                false,
            ),
            // Leaves 12 and 13
            Field::new(
                "h",
                DataType::Union(
                    UnionFields::new(
                        vec![1, 3],
                        vec![
                            Field::new("field1", DataType::UInt8, false),
                            Field::new("field3", DataType::Utf8, false),
                        ],
                    ),
                    UnionMode::Dense,
                ),
                true,
            ),
            // Leaves 14 and 15
            Field::new(
                "i",
                DataType::RunEndEncoded(
                    Arc::new(Field::new("run_ends", DataType::Int32, false)),
                    Arc::new(Field::new("values", DataType::Struct(floats.clone()), true)),
                ),
                false,
            ),
        ]);

        // A struct holding only the first of the two float fields
        let floats_a = DataType::Struct(vec![floats[0].clone()].into());

        // Select by leaf index: a top-level leaf plus one struct child
        let first_two = fields.filter_leaves(|idx, _| idx == 0 || idx == 1);
        assert_eq!(first_two.len(), 2);
        assert_eq!(first_two[0], fields[0]);
        assert_eq!(first_two[1].data_type(), &floats_a);

        // Select by leaf name, wherever the leaf is nested
        let named_a = fields.filter_leaves(|_, f| f.name() == "a");
        assert_eq!(named_a.len(), 5);
        assert_eq!(named_a[0], fields[0]);
        assert_eq!(named_a[1].data_type(), &floats_a);
        assert_eq!(
            named_a[2].data_type(),
            &DataType::Dictionary(Box::new(DataType::Int32), Box::new(floats_a.clone()))
        );
        assert_eq!(
            named_a[3].as_ref(),
            &Field::new_list("e", Field::new("floats", floats_a.clone(), true), true)
        );
        assert_eq!(
            named_a[4].as_ref(),
            &Field::new(
                "i",
                DataType::RunEndEncoded(
                    Arc::new(Field::new("run_ends", DataType::Int32, false)),
                    Arc::new(Field::new("values", floats_a.clone(), true)),
                ),
                false,
            )
        );

        // "floats" only names non-leaf fields, which the predicate never sees
        let named_floats = fields.filter_leaves(|_, f| f.name() == "floats");
        assert_eq!(named_floats.len(), 0);

        // Fixed size list
        let fixed_size_list = fields.filter_leaves(|idx, _| idx == 9);
        assert_eq!(fixed_size_list.len(), 1);
        assert_eq!(fixed_size_list[0], fields[6]);

        // Map with both keys and values retained
        let map = fields.filter_leaves(|idx, _| idx == 10 || idx == 11);
        assert_eq!(map.len(), 1);
        assert_eq!(map[0], fields[7]);

        // Union reduced to its first variant
        let union = DataType::Union(
            UnionFields::new(vec![1], vec![Field::new("field1", DataType::UInt8, false)]),
            UnionMode::Dense,
        );

        let first_variant = fields.filter_leaves(|idx, _| idx == 12);
        assert_eq!(first_variant.len(), 1);
        assert_eq!(first_variant[0].data_type(), &union);

        // Run-end encoded struct with both values retained
        let run_end_encoded = fields.filter_leaves(|idx, _| idx == 14 || idx == 15);
        assert_eq!(run_end_encoded.len(), 1);
        assert_eq!(run_end_encoded[0], fields[9]);

        // Propagate error
        let failed =
            fields.try_filter_leaves(|_, _| Err(ArrowError::SchemaError("error".to_string())));
        assert!(failed.is_err());
    }
}