1use crate::shred_variant::{
19 NullValue, VariantToShreddedVariantRowBuilder,
20 make_variant_to_shredded_variant_arrow_row_builder,
21};
22use crate::type_conversion::{
23 PrimitiveFromVariant, TimestampFromVariant, variant_cast_with_options,
24 variant_to_unscaled_decimal,
25};
26use crate::variant_array::ShreddedVariantFieldArray;
27use crate::{VariantArray, VariantValueArrayBuilder};
28use arrow::array::{
29 ArrayRef, ArrowNativeTypeOp, BinaryBuilder, BinaryLikeArrayBuilder, BinaryViewBuilder,
30 BooleanBuilder, FixedSizeBinaryBuilder, GenericListArray, GenericListViewArray,
31 LargeBinaryBuilder, LargeStringBuilder, NullArray, NullBufferBuilder, OffsetSizeTrait,
32 PrimitiveBuilder, StringBuilder, StringLikeArrayBuilder, StringViewBuilder, StructArray,
33};
34use arrow::buffer::{OffsetBuffer, ScalarBuffer};
35use arrow::compute::{CastOptions, DecimalCast};
36use arrow::datatypes::{self, DataType, DecimalType};
37use arrow::error::{ArrowError, Result};
38use arrow_schema::{FieldRef, Fields, TimeUnit};
39use parquet_variant::{Variant, VariantPath};
40use std::sync::Arc;
41
42pub(crate) enum VariantToArrowRowBuilder<'a> {
47 Primitive(PrimitiveVariantToArrowRowBuilder<'a>),
48 Array(ArrayVariantToArrowRowBuilder<'a>),
49 Struct(StructVariantToArrowRowBuilder<'a>),
50 BinaryVariant(VariantToBinaryVariantArrowRowBuilder),
51
52 WithPath(VariantPathRowBuilder<'a>),
54}
55
56impl<'a> VariantToArrowRowBuilder<'a> {
57 pub fn append_null(&mut self) -> Result<()> {
58 use VariantToArrowRowBuilder::*;
59 match self {
60 Primitive(b) => b.append_null(),
61 Array(b) => b.append_null(),
62 Struct(b) => b.append_null(),
63 BinaryVariant(b) => b.append_null(),
64 WithPath(path_builder) => path_builder.append_null(),
65 }
66 }
67
68 pub fn append_value(&mut self, value: Variant<'_, '_>) -> Result<bool> {
69 use VariantToArrowRowBuilder::*;
70 match self {
71 Primitive(b) => b.append_value(&value),
72 Array(b) => b.append_value(&value),
73 Struct(b) => b.append_value(&value),
74 BinaryVariant(b) => b.append_value(value),
75 WithPath(path_builder) => path_builder.append_value(value),
76 }
77 }
78
79 pub fn finish(self) -> Result<ArrayRef> {
80 use VariantToArrowRowBuilder::*;
81 match self {
82 Primitive(b) => b.finish(),
83 Array(b) => b.finish(),
84 Struct(b) => b.finish(),
85 BinaryVariant(b) => b.finish(),
86 WithPath(path_builder) => path_builder.finish(),
87 }
88 }
89}
90
91fn make_typed_variant_to_arrow_row_builder<'a>(
92 data_type: &'a DataType,
93 cast_options: &'a CastOptions,
94 capacity: usize,
95) -> Result<VariantToArrowRowBuilder<'a>> {
96 use VariantToArrowRowBuilder::*;
97
98 match data_type {
99 DataType::Struct(fields) => {
100 let builder = StructVariantToArrowRowBuilder::try_new(fields, cast_options, capacity)?;
101 Ok(Struct(builder))
102 }
103 data_type @ (DataType::List(_)
104 | DataType::LargeList(_)
105 | DataType::ListView(_)
106 | DataType::LargeListView(_)
107 | DataType::FixedSizeList(..)) => {
108 let builder =
109 ArrayVariantToArrowRowBuilder::try_new(data_type, cast_options, capacity, false)?;
110 Ok(Array(builder))
111 }
112 data_type => {
113 let builder =
114 make_primitive_variant_to_arrow_row_builder(data_type, cast_options, capacity)?;
115 Ok(Primitive(builder))
116 }
117 }
118}
119
120pub(crate) fn make_variant_to_arrow_row_builder<'a>(
121 metadata: &ArrayRef,
122 path: VariantPath<'a>,
123 data_type: Option<&'a DataType>,
124 cast_options: &'a CastOptions,
125 capacity: usize,
126) -> Result<VariantToArrowRowBuilder<'a>> {
127 use VariantToArrowRowBuilder::*;
128
129 let mut builder = match data_type {
130 None => BinaryVariant(VariantToBinaryVariantArrowRowBuilder::new(
132 metadata.clone(),
133 capacity,
134 )),
135 Some(data_type) => {
136 make_typed_variant_to_arrow_row_builder(data_type, cast_options, capacity)?
137 }
138 };
139
140 if !path.is_empty() {
142 builder = WithPath(VariantPathRowBuilder {
143 builder: Box::new(builder),
144 path,
145 })
146 };
147
148 Ok(builder)
149}
150
151pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> {
155 Null(VariantToNullArrowRowBuilder<'a>),
156 Boolean(VariantToBooleanArrowRowBuilder<'a>),
157 Int8(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Int8Type>),
158 Int16(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Int16Type>),
159 Int32(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Int32Type>),
160 Int64(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Int64Type>),
161 UInt8(VariantToPrimitiveArrowRowBuilder<'a, datatypes::UInt8Type>),
162 UInt16(VariantToPrimitiveArrowRowBuilder<'a, datatypes::UInt16Type>),
163 UInt32(VariantToPrimitiveArrowRowBuilder<'a, datatypes::UInt32Type>),
164 UInt64(VariantToPrimitiveArrowRowBuilder<'a, datatypes::UInt64Type>),
165 Float16(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float16Type>),
166 Float32(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float32Type>),
167 Float64(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float64Type>),
168 Decimal32(VariantToDecimalArrowRowBuilder<'a, datatypes::Decimal32Type>),
169 Decimal64(VariantToDecimalArrowRowBuilder<'a, datatypes::Decimal64Type>),
170 Decimal128(VariantToDecimalArrowRowBuilder<'a, datatypes::Decimal128Type>),
171 Decimal256(VariantToDecimalArrowRowBuilder<'a, datatypes::Decimal256Type>),
172 TimestampSecond(VariantToTimestampArrowRowBuilder<'a, datatypes::TimestampSecondType>),
173 TimestampSecondNtz(VariantToTimestampNtzArrowRowBuilder<'a, datatypes::TimestampSecondType>),
174 TimestampMilli(VariantToTimestampArrowRowBuilder<'a, datatypes::TimestampMillisecondType>),
175 TimestampMilliNtz(
176 VariantToTimestampNtzArrowRowBuilder<'a, datatypes::TimestampMillisecondType>,
177 ),
178 TimestampMicro(VariantToTimestampArrowRowBuilder<'a, datatypes::TimestampMicrosecondType>),
179 TimestampMicroNtz(
180 VariantToTimestampNtzArrowRowBuilder<'a, datatypes::TimestampMicrosecondType>,
181 ),
182 TimestampNano(VariantToTimestampArrowRowBuilder<'a, datatypes::TimestampNanosecondType>),
183 TimestampNanoNtz(VariantToTimestampNtzArrowRowBuilder<'a, datatypes::TimestampNanosecondType>),
184 Time32Second(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Time32SecondType>),
185 Time32Milli(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Time32MillisecondType>),
186 Time64Micro(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Time64MicrosecondType>),
187 Time64Nano(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Time64NanosecondType>),
188 Date32(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Date32Type>),
189 Date64(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Date64Type>),
190 Uuid(VariantToUuidArrowRowBuilder<'a>),
191 String(VariantToStringArrowBuilder<'a, StringBuilder>),
192 LargeString(VariantToStringArrowBuilder<'a, LargeStringBuilder>),
193 StringView(VariantToStringArrowBuilder<'a, StringViewBuilder>),
194 Binary(VariantToBinaryArrowRowBuilder<'a, BinaryBuilder>),
195 LargeBinary(VariantToBinaryArrowRowBuilder<'a, LargeBinaryBuilder>),
196 BinaryView(VariantToBinaryArrowRowBuilder<'a, BinaryViewBuilder>),
197}
198
199impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
200 pub fn append_null(&mut self) -> Result<()> {
201 use PrimitiveVariantToArrowRowBuilder::*;
202 match self {
203 Null(b) => b.append_null(),
204 Boolean(b) => b.append_null(),
205 Int8(b) => b.append_null(),
206 Int16(b) => b.append_null(),
207 Int32(b) => b.append_null(),
208 Int64(b) => b.append_null(),
209 UInt8(b) => b.append_null(),
210 UInt16(b) => b.append_null(),
211 UInt32(b) => b.append_null(),
212 UInt64(b) => b.append_null(),
213 Float16(b) => b.append_null(),
214 Float32(b) => b.append_null(),
215 Float64(b) => b.append_null(),
216 Decimal32(b) => b.append_null(),
217 Decimal64(b) => b.append_null(),
218 Decimal128(b) => b.append_null(),
219 Decimal256(b) => b.append_null(),
220 TimestampSecond(b) => b.append_null(),
221 TimestampSecondNtz(b) => b.append_null(),
222 TimestampMilli(b) => b.append_null(),
223 TimestampMilliNtz(b) => b.append_null(),
224 TimestampMicro(b) => b.append_null(),
225 TimestampMicroNtz(b) => b.append_null(),
226 TimestampNano(b) => b.append_null(),
227 TimestampNanoNtz(b) => b.append_null(),
228 Time32Second(b) => b.append_null(),
229 Time32Milli(b) => b.append_null(),
230 Time64Micro(b) => b.append_null(),
231 Time64Nano(b) => b.append_null(),
232 Date32(b) => b.append_null(),
233 Date64(b) => b.append_null(),
234 Uuid(b) => b.append_null(),
235 String(b) => b.append_null(),
236 LargeString(b) => b.append_null(),
237 StringView(b) => b.append_null(),
238 Binary(b) => b.append_null(),
239 LargeBinary(b) => b.append_null(),
240 BinaryView(b) => b.append_null(),
241 }
242 }
243
244 pub fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
245 use PrimitiveVariantToArrowRowBuilder::*;
246 match self {
247 Null(b) => b.append_value(value),
248 Boolean(b) => b.append_value(value),
249 Int8(b) => b.append_value(value),
250 Int16(b) => b.append_value(value),
251 Int32(b) => b.append_value(value),
252 Int64(b) => b.append_value(value),
253 UInt8(b) => b.append_value(value),
254 UInt16(b) => b.append_value(value),
255 UInt32(b) => b.append_value(value),
256 UInt64(b) => b.append_value(value),
257 Float16(b) => b.append_value(value),
258 Float32(b) => b.append_value(value),
259 Float64(b) => b.append_value(value),
260 Decimal32(b) => b.append_value(value),
261 Decimal64(b) => b.append_value(value),
262 Decimal128(b) => b.append_value(value),
263 Decimal256(b) => b.append_value(value),
264 TimestampSecond(b) => b.append_value(value),
265 TimestampSecondNtz(b) => b.append_value(value),
266 TimestampMilli(b) => b.append_value(value),
267 TimestampMilliNtz(b) => b.append_value(value),
268 TimestampMicro(b) => b.append_value(value),
269 TimestampMicroNtz(b) => b.append_value(value),
270 TimestampNano(b) => b.append_value(value),
271 TimestampNanoNtz(b) => b.append_value(value),
272 Time32Second(b) => b.append_value(value),
273 Time32Milli(b) => b.append_value(value),
274 Time64Micro(b) => b.append_value(value),
275 Time64Nano(b) => b.append_value(value),
276 Date32(b) => b.append_value(value),
277 Date64(b) => b.append_value(value),
278 Uuid(b) => b.append_value(value),
279 String(b) => b.append_value(value),
280 LargeString(b) => b.append_value(value),
281 StringView(b) => b.append_value(value),
282 Binary(b) => b.append_value(value),
283 LargeBinary(b) => b.append_value(value),
284 BinaryView(b) => b.append_value(value),
285 }
286 }
287
288 pub fn finish(self) -> Result<ArrayRef> {
289 use PrimitiveVariantToArrowRowBuilder::*;
290 match self {
291 Null(b) => b.finish(),
292 Boolean(b) => b.finish(),
293 Int8(b) => b.finish(),
294 Int16(b) => b.finish(),
295 Int32(b) => b.finish(),
296 Int64(b) => b.finish(),
297 UInt8(b) => b.finish(),
298 UInt16(b) => b.finish(),
299 UInt32(b) => b.finish(),
300 UInt64(b) => b.finish(),
301 Float16(b) => b.finish(),
302 Float32(b) => b.finish(),
303 Float64(b) => b.finish(),
304 Decimal32(b) => b.finish(),
305 Decimal64(b) => b.finish(),
306 Decimal128(b) => b.finish(),
307 Decimal256(b) => b.finish(),
308 TimestampSecond(b) => b.finish(),
309 TimestampSecondNtz(b) => b.finish(),
310 TimestampMilli(b) => b.finish(),
311 TimestampMilliNtz(b) => b.finish(),
312 TimestampMicro(b) => b.finish(),
313 TimestampMicroNtz(b) => b.finish(),
314 TimestampNano(b) => b.finish(),
315 TimestampNanoNtz(b) => b.finish(),
316 Time32Second(b) => b.finish(),
317 Time32Milli(b) => b.finish(),
318 Time64Micro(b) => b.finish(),
319 Time64Nano(b) => b.finish(),
320 Date32(b) => b.finish(),
321 Date64(b) => b.finish(),
322 Uuid(b) => b.finish(),
323 String(b) => b.finish(),
324 LargeString(b) => b.finish(),
325 StringView(b) => b.finish(),
326 Binary(b) => b.finish(),
327 LargeBinary(b) => b.finish(),
328 BinaryView(b) => b.finish(),
329 }
330 }
331}
332
333pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>(
335 data_type: &'a DataType,
336 cast_options: &'a CastOptions,
337 capacity: usize,
338) -> Result<PrimitiveVariantToArrowRowBuilder<'a>> {
339 use PrimitiveVariantToArrowRowBuilder::*;
340
341 let builder =
342 match data_type {
343 DataType::Null => Null(VariantToNullArrowRowBuilder::new(cast_options, capacity)),
344 DataType::Boolean => {
345 Boolean(VariantToBooleanArrowRowBuilder::new(cast_options, capacity))
346 }
347 DataType::Int8 => Int8(VariantToPrimitiveArrowRowBuilder::new(
348 cast_options,
349 capacity,
350 )),
351 DataType::Int16 => Int16(VariantToPrimitiveArrowRowBuilder::new(
352 cast_options,
353 capacity,
354 )),
355 DataType::Int32 => Int32(VariantToPrimitiveArrowRowBuilder::new(
356 cast_options,
357 capacity,
358 )),
359 DataType::Int64 => Int64(VariantToPrimitiveArrowRowBuilder::new(
360 cast_options,
361 capacity,
362 )),
363 DataType::UInt8 => UInt8(VariantToPrimitiveArrowRowBuilder::new(
364 cast_options,
365 capacity,
366 )),
367 DataType::UInt16 => UInt16(VariantToPrimitiveArrowRowBuilder::new(
368 cast_options,
369 capacity,
370 )),
371 DataType::UInt32 => UInt32(VariantToPrimitiveArrowRowBuilder::new(
372 cast_options,
373 capacity,
374 )),
375 DataType::UInt64 => UInt64(VariantToPrimitiveArrowRowBuilder::new(
376 cast_options,
377 capacity,
378 )),
379 DataType::Float16 => Float16(VariantToPrimitiveArrowRowBuilder::new(
380 cast_options,
381 capacity,
382 )),
383 DataType::Float32 => Float32(VariantToPrimitiveArrowRowBuilder::new(
384 cast_options,
385 capacity,
386 )),
387 DataType::Float64 => Float64(VariantToPrimitiveArrowRowBuilder::new(
388 cast_options,
389 capacity,
390 )),
391 DataType::Decimal32(precision, scale) => Decimal32(
392 VariantToDecimalArrowRowBuilder::new(cast_options, capacity, *precision, *scale)?,
393 ),
394 DataType::Decimal64(precision, scale) => Decimal64(
395 VariantToDecimalArrowRowBuilder::new(cast_options, capacity, *precision, *scale)?,
396 ),
397 DataType::Decimal128(precision, scale) => Decimal128(
398 VariantToDecimalArrowRowBuilder::new(cast_options, capacity, *precision, *scale)?,
399 ),
400 DataType::Decimal256(precision, scale) => Decimal256(
401 VariantToDecimalArrowRowBuilder::new(cast_options, capacity, *precision, *scale)?,
402 ),
403 DataType::Date32 => Date32(VariantToPrimitiveArrowRowBuilder::new(
404 cast_options,
405 capacity,
406 )),
407 DataType::Date64 => Date64(VariantToPrimitiveArrowRowBuilder::new(
408 cast_options,
409 capacity,
410 )),
411 DataType::Time32(TimeUnit::Second) => Time32Second(
412 VariantToPrimitiveArrowRowBuilder::new(cast_options, capacity),
413 ),
414 DataType::Time32(TimeUnit::Millisecond) => Time32Milli(
415 VariantToPrimitiveArrowRowBuilder::new(cast_options, capacity),
416 ),
417 DataType::Time32(t) => {
418 return Err(ArrowError::InvalidArgumentError(format!(
419 "The unit for Time32 must be second/millisecond, received {t:?}"
420 )));
421 }
422 DataType::Time64(TimeUnit::Microsecond) => Time64Micro(
423 VariantToPrimitiveArrowRowBuilder::new(cast_options, capacity),
424 ),
425 DataType::Time64(TimeUnit::Nanosecond) => Time64Nano(
426 VariantToPrimitiveArrowRowBuilder::new(cast_options, capacity),
427 ),
428 DataType::Time64(t) => {
429 return Err(ArrowError::InvalidArgumentError(format!(
430 "The unit for Time64 must be micro/nano seconds, received {t:?}"
431 )));
432 }
433 DataType::Timestamp(TimeUnit::Second, None) => TimestampSecondNtz(
434 VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity),
435 ),
436 DataType::Timestamp(TimeUnit::Second, tz) => TimestampSecond(
437 VariantToTimestampArrowRowBuilder::new(cast_options, capacity, tz.clone()),
438 ),
439 DataType::Timestamp(TimeUnit::Millisecond, None) => TimestampMilliNtz(
440 VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity),
441 ),
442 DataType::Timestamp(TimeUnit::Millisecond, tz) => TimestampMilli(
443 VariantToTimestampArrowRowBuilder::new(cast_options, capacity, tz.clone()),
444 ),
445 DataType::Timestamp(TimeUnit::Microsecond, None) => TimestampMicroNtz(
446 VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity),
447 ),
448 DataType::Timestamp(TimeUnit::Microsecond, tz) => TimestampMicro(
449 VariantToTimestampArrowRowBuilder::new(cast_options, capacity, tz.clone()),
450 ),
451 DataType::Timestamp(TimeUnit::Nanosecond, None) => TimestampNanoNtz(
452 VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity),
453 ),
454 DataType::Timestamp(TimeUnit::Nanosecond, tz) => TimestampNano(
455 VariantToTimestampArrowRowBuilder::new(cast_options, capacity, tz.clone()),
456 ),
457 DataType::Duration(_) | DataType::Interval(_) => {
458 return Err(ArrowError::InvalidArgumentError(
459 "Casting Variant to duration/interval types is not supported. \
460 The Variant format does not define duration/interval types."
461 .to_string(),
462 ));
463 }
464 DataType::Binary => Binary(VariantToBinaryArrowRowBuilder::new(cast_options, capacity)),
465 DataType::LargeBinary => {
466 LargeBinary(VariantToBinaryArrowRowBuilder::new(cast_options, capacity))
467 }
468 DataType::BinaryView => {
469 BinaryView(VariantToBinaryArrowRowBuilder::new(cast_options, capacity))
470 }
471 DataType::FixedSizeBinary(16) => {
472 Uuid(VariantToUuidArrowRowBuilder::new(cast_options, capacity))
473 }
474 DataType::FixedSizeBinary(_) => {
475 return Err(ArrowError::NotYetImplemented(format!(
476 "DataType {data_type:?} not yet implemented"
477 )));
478 }
479 DataType::Utf8 => String(VariantToStringArrowBuilder::new(cast_options, capacity)),
480 DataType::LargeUtf8 => {
481 LargeString(VariantToStringArrowBuilder::new(cast_options, capacity))
482 }
483 DataType::Utf8View => {
484 StringView(VariantToStringArrowBuilder::new(cast_options, capacity))
485 }
486 DataType::List(_)
487 | DataType::LargeList(_)
488 | DataType::ListView(_)
489 | DataType::LargeListView(_)
490 | DataType::FixedSizeList(..)
491 | DataType::Struct(_)
492 | DataType::Map(..)
493 | DataType::Union(..)
494 | DataType::Dictionary(..)
495 | DataType::RunEndEncoded(..) => {
496 return Err(ArrowError::InvalidArgumentError(format!(
497 "Casting to {data_type:?} is not applicable for primitive Variant types"
498 )));
499 }
500 };
501 Ok(builder)
502}
503
504pub(crate) enum ArrayVariantToArrowRowBuilder<'a> {
505 List(VariantToListArrowRowBuilder<'a, i32, false>),
506 LargeList(VariantToListArrowRowBuilder<'a, i64, false>),
507 ListView(VariantToListArrowRowBuilder<'a, i32, true>),
508 LargeListView(VariantToListArrowRowBuilder<'a, i64, true>),
509}
510
511pub(crate) struct StructVariantToArrowRowBuilder<'a> {
512 fields: &'a Fields,
513 field_builders: Vec<VariantToArrowRowBuilder<'a>>,
514 nulls: NullBufferBuilder,
515 cast_options: &'a CastOptions<'a>,
516}
517
518impl<'a> StructVariantToArrowRowBuilder<'a> {
519 fn try_new(
520 fields: &'a Fields,
521 cast_options: &'a CastOptions<'a>,
522 capacity: usize,
523 ) -> Result<Self> {
524 let mut field_builders = Vec::with_capacity(fields.len());
525 for field in fields.iter() {
526 field_builders.push(make_typed_variant_to_arrow_row_builder(
527 field.data_type(),
528 cast_options,
529 capacity,
530 )?);
531 }
532 Ok(Self {
533 fields,
534 field_builders,
535 nulls: NullBufferBuilder::new(capacity),
536 cast_options,
537 })
538 }
539
540 fn append_null(&mut self) -> Result<()> {
541 for builder in &mut self.field_builders {
542 builder.append_null()?;
543 }
544 self.nulls.append_null();
545 Ok(())
546 }
547
548 fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
549 match variant_cast_with_options(value, self.cast_options, Variant::as_object) {
550 Ok(Some(obj)) => {
551 for (index, field) in self.fields.iter().enumerate() {
552 match obj.get(field.name()) {
553 Some(field_value) => {
554 self.field_builders[index].append_value(field_value)?;
555 }
556 None => {
557 self.field_builders[index].append_null()?;
558 }
559 }
560 }
561
562 self.nulls.append_non_null();
563 Ok(true)
564 }
565 Ok(None) => {
566 self.append_null()?;
567 Ok(false)
568 }
569 Err(_) => Err(ArrowError::CastError(format!(
570 "Failed to extract struct from variant {value:?}"
571 ))),
572 }
573 }
574
575 fn finish(mut self) -> Result<ArrayRef> {
576 let mut children = Vec::with_capacity(self.field_builders.len());
577 for builder in self.field_builders {
578 children.push(builder.finish()?);
579 }
580 Ok(Arc::new(StructArray::try_new(
581 self.fields.clone(),
582 children,
583 self.nulls.finish(),
584 )?))
585 }
586}
587
588impl<'a> ArrayVariantToArrowRowBuilder<'a> {
589 pub(crate) fn try_new(
596 data_type: &'a DataType,
597 cast_options: &'a CastOptions,
598 capacity: usize,
599 shredded: bool,
600 ) -> Result<Self> {
601 use ArrayVariantToArrowRowBuilder::*;
602
603 macro_rules! make_list_builder {
605 ($variant:ident, $offset:ty, $is_view:expr, $field:ident) => {
606 $variant(VariantToListArrowRowBuilder::<$offset, $is_view>::try_new(
607 $field.clone(),
608 $field.data_type(),
609 cast_options,
610 capacity,
611 shredded,
612 )?)
613 };
614 }
615
616 let builder = match data_type {
617 DataType::List(field) => make_list_builder!(List, i32, false, field),
618 DataType::LargeList(field) => make_list_builder!(LargeList, i64, false, field),
619 DataType::ListView(field) => make_list_builder!(ListView, i32, true, field),
620 DataType::LargeListView(field) => make_list_builder!(LargeListView, i64, true, field),
621 DataType::FixedSizeList(..) => {
622 return Err(ArrowError::NotYetImplemented(
623 "Converting unshredded variant arrays to arrow fixed-size lists".to_string(),
624 ));
625 }
626 other => {
627 return Err(ArrowError::InvalidArgumentError(format!(
628 "Casting to {other:?} is not applicable for array Variant types"
629 )));
630 }
631 };
632 Ok(builder)
633 }
634
635 pub(crate) fn append_null(&mut self) -> Result<()> {
636 match self {
637 Self::List(builder) => builder.append_null(),
638 Self::LargeList(builder) => builder.append_null(),
639 Self::ListView(builder) => builder.append_null(),
640 Self::LargeListView(builder) => builder.append_null(),
641 }
642 }
643
644 pub(crate) fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
645 match self {
646 Self::List(builder) => builder.append_value(value),
647 Self::LargeList(builder) => builder.append_value(value),
648 Self::ListView(builder) => builder.append_value(value),
649 Self::LargeListView(builder) => builder.append_value(value),
650 }
651 }
652
653 pub(crate) fn finish(self) -> Result<ArrayRef> {
654 match self {
655 Self::List(builder) => builder.finish(),
656 Self::LargeList(builder) => builder.finish(),
657 Self::ListView(builder) => builder.finish(),
658 Self::LargeListView(builder) => builder.finish(),
659 }
660 }
661}
662
663pub(crate) struct VariantPathRowBuilder<'a> {
666 builder: Box<VariantToArrowRowBuilder<'a>>,
667 path: VariantPath<'a>,
668}
669
670impl<'a> VariantPathRowBuilder<'a> {
671 fn append_null(&mut self) -> Result<()> {
672 self.builder.append_null()
673 }
674
675 fn append_value(&mut self, value: Variant<'_, '_>) -> Result<bool> {
676 if let Some(v) = value.get_path(&self.path) {
677 self.builder.append_value(v)
678 } else {
679 self.builder.append_null()?;
680 Ok(false)
681 }
682 }
683
684 fn finish(self) -> Result<ArrayRef> {
685 self.builder.finish()
686 }
687}
688
/// Generates a row builder struct that wraps a single Arrow array builder.
///
/// Invocation shape:
///
/// ```text
/// struct Name<'a [, G: Bound]>
/// |capacity [, extra: Type]| -> Builder[<Arg>] { init expression },
/// |value| extraction expression returning an Option,
/// type_name: expression used in the error message
/// ```
///
/// The generated type has `new`, `append_null`, `append_value` and `finish`.
/// `append_value` runs the extraction through `variant_cast_with_options`: it appends the
/// extracted value and returns `Ok(true)`, appends a null and returns `Ok(false)`, or
/// returns a `CastError` when the helper reports a failure.
macro_rules! define_variant_to_primitive_builder {
    (struct $name:ident<$lifetime:lifetime $(, $generic:ident: $bound:path )?>
    |$capacity:ident $(, $extra:ident: $extra_ty:ty)?| -> $builder:ident $(< $builder_arg:ty >)? { $init:expr },
    |$value:ident| $extract:expr,
    type_name: $type_name:expr) => {
        pub(crate) struct $name<$lifetime $(, $generic : $bound )?>
        {
            builder: $builder $(<$builder_arg>)?,
            cast_options: &$lifetime CastOptions<$lifetime>,
        }

        impl<$lifetime $(, $generic: $bound+ )?> $name<$lifetime $(, $generic )?> {
            fn new(
                cast_options: &$lifetime CastOptions<$lifetime>,
                $capacity: usize,
                // Optional extra constructor argument (e.g. a time zone).
                $( $extra: $extra_ty, )?
            ) -> Self {
                Self {
                    builder: $init,
                    cast_options,
                }
            }

            fn append_null(&mut self) -> Result<()> {
                self.builder.append_null();
                Ok(())
            }

            fn append_value(&mut self, $value: &Variant<'_, '_>) -> Result<bool> {
                match variant_cast_with_options(
                    $value,
                    self.cast_options,
                    |$value| $extract,
                ) {
                    Err(_) => Err(ArrowError::CastError(format!(
                        "Failed to extract primitive of type {type_name} from variant {value:?} at path VariantPath([])",
                        type_name = $type_name,
                        value = $value
                    ))),
                    Ok(None) => {
                        self.builder.append_null();
                        Ok(false)
                    }
                    Ok(Some(extracted)) => {
                        self.builder.append_value(extracted);
                        Ok(true)
                    }
                }
            }

            // NOTE(review): presumably some wrapped builder's `finish` does not need a
            // mutable receiver, which would make `mut` unused there; confirm.
            #[allow(unused_mut)]
            fn finish(mut self) -> Result<ArrayRef> {
                // `Arc::from` is kept on purpose: it is used for every wrapped builder,
                // whatever type that builder's `finish` returns.
                Ok(Arc::from(self.builder.finish()))
            }
        }
    }
}
752
753define_variant_to_primitive_builder!(
754 struct VariantToStringArrowBuilder<'a, B: StringLikeArrayBuilder>
755 |capacity| -> B { B::with_capacity(capacity) },
756 |value| value.as_string(),
757 type_name: B::type_name()
758);
759
760define_variant_to_primitive_builder!(
761 struct VariantToBooleanArrowRowBuilder<'a>
762 |capacity| -> BooleanBuilder { BooleanBuilder::with_capacity(capacity) },
763 |value| value.as_boolean(),
764 type_name: datatypes::BooleanType::DATA_TYPE
765);
766
767define_variant_to_primitive_builder!(
768 struct VariantToPrimitiveArrowRowBuilder<'a, T:PrimitiveFromVariant>
769 |capacity| -> PrimitiveBuilder<T> { PrimitiveBuilder::<T>::with_capacity(capacity) },
770 |value| T::from_variant(value),
771 type_name: T::DATA_TYPE
772);
773
774define_variant_to_primitive_builder!(
775 struct VariantToTimestampNtzArrowRowBuilder<'a, T:TimestampFromVariant<true>>
776 |capacity| -> PrimitiveBuilder<T> { PrimitiveBuilder::<T>::with_capacity(capacity) },
777 |value| T::from_variant(value),
778 type_name: T::DATA_TYPE
779);
780
781define_variant_to_primitive_builder!(
782 struct VariantToTimestampArrowRowBuilder<'a, T:TimestampFromVariant<false>>
783 |capacity, tz: Option<Arc<str>> | -> PrimitiveBuilder<T> {
784 PrimitiveBuilder::<T>::with_capacity(capacity).with_timezone_opt(tz)
785 },
786 |value| T::from_variant(value),
787 type_name: T::DATA_TYPE
788);
789
790define_variant_to_primitive_builder!(
791 struct VariantToBinaryArrowRowBuilder<'a, B: BinaryLikeArrayBuilder>
792 |capacity| -> B { B::with_capacity(capacity) },
793 |value| value.as_u8_slice(),
794 type_name: B::type_name()
795);
796
797pub(crate) struct VariantToDecimalArrowRowBuilder<'a, T>
799where
800 T: DecimalType,
801 T::Native: DecimalCast,
802{
803 builder: PrimitiveBuilder<T>,
804 cast_options: &'a CastOptions<'a>,
805 precision: u8,
806 scale: i8,
807}
808
809impl<'a, T> VariantToDecimalArrowRowBuilder<'a, T>
810where
811 T: DecimalType,
812 T::Native: DecimalCast,
813{
814 fn new(
815 cast_options: &'a CastOptions<'a>,
816 capacity: usize,
817 precision: u8,
818 scale: i8,
819 ) -> Result<Self> {
820 let builder = PrimitiveBuilder::<T>::with_capacity(capacity)
821 .with_precision_and_scale(precision, scale)?;
822 Ok(Self {
823 builder,
824 cast_options,
825 precision,
826 scale,
827 })
828 }
829
830 fn append_null(&mut self) -> Result<()> {
831 self.builder.append_null();
832 Ok(())
833 }
834
835 fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
836 match variant_cast_with_options(value, self.cast_options, |value| {
837 variant_to_unscaled_decimal::<T>(value, self.precision, self.scale)
838 }) {
839 Ok(Some(scaled)) => {
840 self.builder.append_value(scaled);
841 Ok(true)
842 }
843 Ok(None) => {
844 self.builder.append_null();
845 Ok(false)
846 }
847 Err(_) => Err(ArrowError::CastError(format!(
848 "Failed to cast to {prefix}(precision={precision}, scale={scale}) from variant {value:?}",
849 prefix = T::PREFIX,
850 precision = self.precision,
851 scale = self.scale
852 ))),
853 }
854 }
855
856 fn finish(mut self) -> Result<ArrayRef> {
857 Ok(Arc::new(self.builder.finish()))
858 }
859}
860
861pub(crate) struct VariantToUuidArrowRowBuilder<'a> {
863 builder: FixedSizeBinaryBuilder,
864 cast_options: &'a CastOptions<'a>,
865}
866
867impl<'a> VariantToUuidArrowRowBuilder<'a> {
868 fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
869 Self {
870 builder: FixedSizeBinaryBuilder::with_capacity(capacity, 16),
871 cast_options,
872 }
873 }
874
875 fn append_null(&mut self) -> Result<()> {
876 self.builder.append_null();
877 Ok(())
878 }
879
880 fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
881 match variant_cast_with_options(value, self.cast_options, Variant::as_uuid) {
882 Ok(Some(uuid)) => {
883 self.builder
884 .append_value(uuid.as_bytes())
885 .map_err(|e| ArrowError::ExternalError(Box::new(e)))?;
886 Ok(true)
887 }
888 Ok(None) => {
889 self.builder.append_null();
890 Ok(false)
891 }
892 Err(_) => Err(ArrowError::CastError(format!(
893 "Failed to extract UUID from variant {value:?}"
894 ))),
895 }
896 }
897
898 fn finish(mut self) -> Result<ArrayRef> {
899 Ok(Arc::new(self.builder.finish()))
900 }
901}
902
903enum ListElementBuilder<'a> {
906 Typed(Box<VariantToArrowRowBuilder<'a>>),
908 Shredded(Box<VariantToShreddedVariantRowBuilder<'a>>),
910}
911
912impl<'a> ListElementBuilder<'a> {
913 fn append_value(&mut self, value: Variant<'_, '_>) -> Result<bool> {
914 match self {
915 Self::Typed(b) => b.append_value(value),
916 Self::Shredded(b) => b.append_value(value),
917 }
918 }
919
920 fn finish(self) -> Result<ArrayRef> {
921 match self {
922 Self::Typed(b) => b.finish(),
923 Self::Shredded(b) => {
924 let (value, typed_value, nulls) = b.finish()?;
925 Ok(ArrayRef::from(ShreddedVariantFieldArray::from_parts(
926 Some(Arc::new(value)),
927 Some(typed_value),
928 nulls,
929 )))
930 }
931 }
932 }
933}
934
935pub(crate) struct VariantToListArrowRowBuilder<'a, O, const IS_VIEW: bool>
936where
937 O: OffsetSizeTrait + ArrowNativeTypeOp,
938{
939 field: FieldRef,
940 offsets: Vec<O>,
941 element_builder: ListElementBuilder<'a>,
942 nulls: NullBufferBuilder,
943 current_offset: O,
944 cast_options: &'a CastOptions<'a>,
945}
946
947impl<'a, O, const IS_VIEW: bool> VariantToListArrowRowBuilder<'a, O, IS_VIEW>
948where
949 O: OffsetSizeTrait + ArrowNativeTypeOp,
950{
951 fn try_new(
952 field: FieldRef,
953 element_data_type: &'a DataType,
954 cast_options: &'a CastOptions,
955 capacity: usize,
956 shredded: bool,
957 ) -> Result<Self> {
958 if capacity >= isize::MAX as usize {
959 return Err(ArrowError::ComputeError(
960 "Capacity exceeds isize::MAX when reserving list offsets".to_string(),
961 ));
962 }
963 let mut offsets = Vec::with_capacity(capacity + 1);
964 offsets.push(O::ZERO);
965 let element_builder = if shredded {
966 let builder = make_variant_to_shredded_variant_arrow_row_builder(
967 element_data_type,
968 cast_options,
969 capacity,
970 NullValue::ArrayElement,
971 )?;
972 ListElementBuilder::Shredded(Box::new(builder))
973 } else {
974 let builder =
975 make_typed_variant_to_arrow_row_builder(element_data_type, cast_options, capacity)?;
976 ListElementBuilder::Typed(Box::new(builder))
977 };
978
979 Ok(Self {
980 field,
981 offsets,
982 element_builder,
983 nulls: NullBufferBuilder::new(capacity),
984 current_offset: O::ZERO,
985 cast_options,
986 })
987 }
988
989 fn append_null(&mut self) -> Result<()> {
990 self.offsets.push(self.current_offset);
991 self.nulls.append_null();
992 Ok(())
993 }
994
995 fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
996 match variant_cast_with_options(value, self.cast_options, Variant::as_list) {
997 Ok(Some(list)) => {
998 for element in list.iter() {
999 self.element_builder.append_value(element)?;
1000 self.current_offset = self.current_offset.add_checked(O::ONE)?;
1001 }
1002 self.offsets.push(self.current_offset);
1003 self.nulls.append_non_null();
1004 Ok(true)
1005 }
1006 Ok(None) => {
1007 self.append_null()?;
1008 Ok(false)
1009 }
1010 Err(_) => Err(ArrowError::CastError(format!(
1011 "Failed to extract list from variant {value:?}"
1012 ))),
1013 }
1014 }
1015
1016 fn finish(mut self) -> Result<ArrayRef> {
1017 let element_array: ArrayRef = self.element_builder.finish()?;
1018 let field = Arc::new(
1019 self.field
1020 .as_ref()
1021 .clone()
1022 .with_data_type(element_array.data_type().clone()),
1023 );
1024
1025 if IS_VIEW {
1026 let mut sizes = Vec::with_capacity(self.offsets.len() - 1);
1028 for i in 1..self.offsets.len() {
1029 sizes.push(self.offsets[i] - self.offsets[i - 1]);
1030 }
1031 self.offsets.pop();
1032 let list_view_array = GenericListViewArray::<O>::new(
1033 field,
1034 ScalarBuffer::from(self.offsets),
1035 ScalarBuffer::from(sizes),
1036 element_array,
1037 self.nulls.finish(),
1038 );
1039 Ok(Arc::new(list_view_array))
1040 } else {
1041 let list_array = GenericListArray::<O>::new(
1042 field,
1043 OffsetBuffer::<O>::new(ScalarBuffer::from(self.offsets)),
1044 element_array,
1045 self.nulls.finish(),
1046 );
1047 Ok(Arc::new(list_array))
1048 }
1049 }
1050}
1051
1052pub(crate) struct VariantToBinaryVariantArrowRowBuilder {
1054 metadata: ArrayRef,
1055 builder: VariantValueArrayBuilder,
1056 nulls: NullBufferBuilder,
1057}
1058
1059impl VariantToBinaryVariantArrowRowBuilder {
1060 fn new(metadata: ArrayRef, capacity: usize) -> Self {
1061 Self {
1062 metadata,
1063 builder: VariantValueArrayBuilder::new(capacity),
1064 nulls: NullBufferBuilder::new(capacity),
1065 }
1066 }
1067}
1068
1069impl VariantToBinaryVariantArrowRowBuilder {
1070 fn append_null(&mut self) -> Result<()> {
1071 self.builder.append_null();
1072 self.nulls.append_null();
1073 Ok(())
1074 }
1075
1076 fn append_value(&mut self, value: Variant<'_, '_>) -> Result<bool> {
1077 self.builder.append_value(value);
1078 self.nulls.append_non_null();
1079 Ok(true)
1080 }
1081
1082 fn finish(mut self) -> Result<ArrayRef> {
1083 let variant_array = VariantArray::from_parts(
1084 self.metadata,
1085 Some(Arc::new(self.builder.build()?)),
1086 None, self.nulls.finish(),
1088 );
1089
1090 Ok(ArrayRef::from(variant_array))
1091 }
1092}
1093
1094#[derive(Default)]
1095struct FakeNullBuilder {
1096 item_count: usize,
1097}
1098
1099impl FakeNullBuilder {
1100 fn append_value(&mut self, _: ()) {
1101 self.item_count += 1;
1102 }
1103
1104 fn append_null(&mut self) {
1105 self.item_count += 1;
1106 }
1107
1108 fn finish(self) -> NullArray {
1109 NullArray::new(self.item_count)
1110 }
1111}
1112
// Declares `VariantToNullArrowRowBuilder` through the shared primitive-builder macro.
// - The backing builder is `FakeNullBuilder`; the capacity hint is ignored because
//   that builder only counts rows.
// - Each incoming value is converted with `as_null()`.
// - "Null" is the type name handed to the macro.
// NOTE(review): how the macro uses the closures and the type name is defined at the
// macro declaration, outside this section; confirm there.
define_variant_to_primitive_builder!(
    struct VariantToNullArrowRowBuilder<'a>
    |_capacity| -> FakeNullBuilder { FakeNullBuilder::default() },
    |value| value.as_null(),
    type_name: "Null"
);
1119
#[cfg(test)]
mod tests {
    use super::{
        make_primitive_variant_to_arrow_row_builder, make_typed_variant_to_arrow_row_builder,
    };
    use arrow::array::{
        Array, Decimal32Array, FixedSizeBinaryArray, Int32Array, ListArray, StructArray,
    };
    use arrow::compute::CastOptions;
    use arrow::datatypes::{DataType, Field, Fields, UnionFields, UnionMode};
    use arrow::error::ArrowError;
    use parquet_variant::{Variant, VariantDecimal4};
    use std::sync::Arc;
    use uuid::Uuid;

    /// Cast options with `safe` switched off, i.e. the strict mode the
    /// `strict_cast_*` tests below exercise.
    fn strict_cast_options() -> CastOptions<'static> {
        CastOptions {
            safe: false,
            ..Default::default()
        }
    }

    /// The primitive builder factory must refuse every nested / non-primitive
    /// Arrow type with an `InvalidArgumentError` that names the offending type.
    #[test]
    fn make_primitive_builder_rejects_non_primitive_types() {
        let cast_options = CastOptions::default();

        let item_field = Arc::new(Field::new("item", DataType::Int32, true));
        let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int32, false));
        let ree_values_field = Arc::new(Field::new("values", DataType::Utf8, true));
        let struct_fields = Fields::from(vec![Field::new("child", DataType::Int32, true)]);
        let map_entry_type = DataType::Struct(Fields::from(vec![
            Field::new("key", DataType::Utf8, false),
            Field::new("value", DataType::Float64, true),
        ]));
        let map_entries_field = Arc::new(Field::new("entries", map_entry_type, true));
        let union_fields =
            UnionFields::try_new(vec![1], vec![Field::new("child", DataType::Int32, true)])
                .unwrap();

        let non_primitive_types = [
            DataType::List(item_field.clone()),
            DataType::LargeList(item_field.clone()),
            DataType::ListView(item_field.clone()),
            DataType::LargeListView(item_field.clone()),
            DataType::FixedSizeList(item_field, 2),
            DataType::Struct(struct_fields),
            DataType::Map(map_entries_field, false),
            DataType::Union(union_fields, UnionMode::Dense),
            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
            DataType::RunEndEncoded(run_ends_field, ree_values_field),
        ];

        for data_type in &non_primitive_types {
            let Err(err) = make_primitive_variant_to_arrow_row_builder(data_type, &cast_options, 1)
            else {
                panic!("non-primitive type {data_type:?} should be rejected");
            };

            match err {
                ArrowError::InvalidArgumentError(msg) => {
                    assert!(msg.contains(&format!("{data_type:?}")));
                }
                other => panic!("expected InvalidArgumentError, got {other:?}"),
            }
        }
    }

    /// In strict mode, `Variant::Null` becomes a null Int32 slot rather than an error.
    #[test]
    fn strict_cast_allows_variant_null_for_primitive_builder() {
        let cast_options = strict_cast_options();
        let target_type = DataType::Int32;
        let mut builder =
            make_primitive_variant_to_arrow_row_builder(&target_type, &cast_options, 2).unwrap();

        let null_was_value = builder.append_value(&Variant::Null).unwrap();
        assert!(!null_was_value);
        let int_was_value = builder.append_value(&Variant::Int32(42)).unwrap();
        assert!(int_was_value);

        let array = builder.finish().unwrap();
        let ints = array.as_any().downcast_ref::<Int32Array>().unwrap();
        assert!(ints.is_null(0));
        assert_eq!(ints.value(1), 42);
    }

    /// In strict mode, `Variant::Null` becomes a null Decimal32 slot rather than an error.
    #[test]
    fn strict_cast_allows_variant_null_for_decimal_builder() {
        let cast_options = strict_cast_options();
        let target_type = DataType::Decimal32(9, 2);
        let mut builder =
            make_primitive_variant_to_arrow_row_builder(&target_type, &cast_options, 2).unwrap();
        let decimal_variant: Variant<'_, '_> = VariantDecimal4::try_new(1234, 2).unwrap().into();

        let null_was_value = builder.append_value(&Variant::Null).unwrap();
        assert!(!null_was_value);
        let decimal_was_value = builder.append_value(&decimal_variant).unwrap();
        assert!(decimal_was_value);

        let array = builder.finish().unwrap();
        let decimals = array.as_any().downcast_ref::<Decimal32Array>().unwrap();
        assert!(decimals.is_null(0));
        assert_eq!(decimals.value(1), 1234);
    }

    /// In strict mode, `Variant::Null` becomes a null FixedSizeBinary(16) slot
    /// rather than an error.
    #[test]
    fn strict_cast_allows_variant_null_for_uuid_builder() {
        let cast_options = strict_cast_options();
        let target_type = DataType::FixedSizeBinary(16);
        let mut builder =
            make_primitive_variant_to_arrow_row_builder(&target_type, &cast_options, 2).unwrap();
        let uuid = Uuid::nil();

        let null_was_value = builder.append_value(&Variant::Null).unwrap();
        assert!(!null_was_value);
        let uuid_was_value = builder.append_value(&Variant::Uuid(uuid)).unwrap();
        assert!(uuid_was_value);

        let array = builder.finish().unwrap();
        let uuids = array
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap();
        assert!(uuids.is_null(0));
        assert_eq!(uuids.value(1), uuid.as_bytes());
    }

    /// In strict mode, `Variant::Null` becomes a null row for both list and
    /// struct targets rather than an error.
    #[test]
    fn strict_cast_allows_variant_null_for_list_and_struct_builders() {
        let cast_options = strict_cast_options();

        // List target.
        let item_field = Arc::new(Field::new("item", DataType::Int64, true));
        let list_type = DataType::List(item_field);
        let mut list_builder =
            make_typed_variant_to_arrow_row_builder(&list_type, &cast_options, 1).unwrap();
        let list_was_value = list_builder.append_value(Variant::Null).unwrap();
        assert!(!list_was_value);
        let list_ref = list_builder.finish().unwrap();
        let lists = list_ref.as_any().downcast_ref::<ListArray>().unwrap();
        assert!(lists.is_null(0));

        // Struct target.
        let struct_type =
            DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, true)]));
        let mut struct_builder =
            make_typed_variant_to_arrow_row_builder(&struct_type, &cast_options, 1).unwrap();
        let struct_was_value = struct_builder.append_value(Variant::Null).unwrap();
        assert!(!struct_was_value);
        let struct_ref = struct_builder.finish().unwrap();
        let structs = struct_ref.as_any().downcast_ref::<StructArray>().unwrap();
        assert!(structs.is_null(0));
    }
}