1use crate::utils::{
18 array_from_slice, overflow_error, slice_from_slice_at_offset, string_from_slice,
19};
20use crate::ShortString;
21
22use arrow_schema::ArrowError;
23use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, Utc};
24
/// The coarse category of a variant value.
///
/// The numeric value of each variant is the 2-bit tag that
/// `get_basic_type` extracts from the low bits of a value header.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VariantBasicType {
    /// Tag `0`.
    Primitive = 0,
    /// Tag `1`.
    ShortString = 1,
    /// Tag `2`.
    Object = 2,
    /// Tag `3`.
    Array = 3,
}
39
/// Numeric identifiers of the primitive variant types.
///
/// `get_primitive_type` obtains the identifier from the header bits above the
/// two basic-type bits, and the `TryFrom<u8>` implementation turns an
/// identifier into the variant carrying the same discriminant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VariantPrimitiveType {
    Null = 0,
    BooleanTrue = 1,
    BooleanFalse = 2,
    Int8 = 3,
    Int16 = 4,
    Int32 = 5,
    Int64 = 6,
    Double = 7,
    Decimal4 = 8,
    Decimal8 = 9,
    Decimal16 = 10,
    Date = 11,
    TimestampMicros = 12,
    TimestampNtzMicros = 13,
    Float = 14,
    Binary = 15,
    String = 16,
}
67
68pub(crate) fn get_basic_type(header: u8) -> VariantBasicType {
70 let basic_type = header & 0x03; match basic_type {
73 0 => VariantBasicType::Primitive,
74 1 => VariantBasicType::ShortString,
75 2 => VariantBasicType::Object,
76 3 => VariantBasicType::Array,
77 _ => {
78 unreachable!();
81 }
82 }
83}
84
85impl TryFrom<u8> for VariantPrimitiveType {
86 type Error = ArrowError;
87
88 fn try_from(value: u8) -> Result<Self, Self::Error> {
89 match value {
90 0 => Ok(VariantPrimitiveType::Null),
91 1 => Ok(VariantPrimitiveType::BooleanTrue),
92 2 => Ok(VariantPrimitiveType::BooleanFalse),
93 3 => Ok(VariantPrimitiveType::Int8),
94 4 => Ok(VariantPrimitiveType::Int16),
95 5 => Ok(VariantPrimitiveType::Int32),
96 6 => Ok(VariantPrimitiveType::Int64),
97 7 => Ok(VariantPrimitiveType::Double),
98 8 => Ok(VariantPrimitiveType::Decimal4),
99 9 => Ok(VariantPrimitiveType::Decimal8),
100 10 => Ok(VariantPrimitiveType::Decimal16),
101 11 => Ok(VariantPrimitiveType::Date),
102 12 => Ok(VariantPrimitiveType::TimestampMicros),
103 13 => Ok(VariantPrimitiveType::TimestampNtzMicros),
104 14 => Ok(VariantPrimitiveType::Float),
105 15 => Ok(VariantPrimitiveType::Binary),
106 16 => Ok(VariantPrimitiveType::String),
107 _ => Err(ArrowError::InvalidArgumentError(format!(
108 "unknown primitive type: {value}",
109 ))),
110 }
111 }
112}
113
/// Width, in bytes, of one entry of an offset array.
///
/// The discriminant of each variant equals the number of bytes it stands
/// for, so `width as usize` yields the byte count directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum OffsetSizeBytes {
    One = 1,
    Two = 2,
    Three = 3,
    Four = 4,
}
124
125impl OffsetSizeBytes {
126 pub(crate) fn try_new(offset_size_minus_one: u8) -> Result<Self, ArrowError> {
128 use OffsetSizeBytes::*;
129 let result = match offset_size_minus_one {
130 0 => One,
131 1 => Two,
132 2 => Three,
133 3 => Four,
134 _ => {
135 return Err(ArrowError::InvalidArgumentError(
136 "offset_size_minus_one must be 0–3".to_string(),
137 ))
138 }
139 };
140 Ok(result)
141 }
142
143 pub(crate) fn unpack_u32(&self, bytes: &[u8], index: usize) -> Result<u32, ArrowError> {
150 self.unpack_u32_at_offset(bytes, 0, index)
151 }
152
153 pub(crate) fn unpack_u32_at_offset(
163 &self,
164 bytes: &[u8],
165 byte_offset: usize, offset_index: usize, ) -> Result<u32, ArrowError> {
168 use OffsetSizeBytes::*;
169
170 let offset = offset_index
173 .checked_mul(*self as usize)
174 .and_then(|n| n.checked_add(byte_offset))
175 .ok_or_else(|| overflow_error("unpacking offset array value"))?;
176 let value = match self {
177 One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
178 Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
179 Three => {
180 let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?;
182 let mut buf = [0u8; 4];
184 buf[..3].copy_from_slice(&b3_chunks);
185 u32::from_le_bytes(buf)
186 }
187 Four => u32::from_le_bytes(array_from_slice(bytes, offset)?),
188 };
189 Ok(value)
190 }
191}
192
193pub(crate) fn map_bytes_to_offsets(
195 buffer: &[u8],
196 offset_size: OffsetSizeBytes,
197) -> impl Iterator<Item = usize> + use<'_> {
198 buffer
199 .chunks_exact(offset_size as usize)
200 .map(move |chunk| match offset_size {
201 OffsetSizeBytes::One => chunk[0] as usize,
202 OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], chunk[1]]) as usize,
203 OffsetSizeBytes::Three => {
204 u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) as usize
205 }
206 OffsetSizeBytes::Four => {
207 u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]) as usize
208 }
209 })
210}
211
212pub(crate) fn get_primitive_type(metadata: u8) -> Result<VariantPrimitiveType, ArrowError> {
214 VariantPrimitiveType::try_from(metadata >> 2)
216}
217
218pub(crate) fn decode_int8(data: &[u8]) -> Result<i8, ArrowError> {
220 Ok(i8::from_le_bytes(array_from_slice(data, 0)?))
221}
222
223pub(crate) fn decode_int16(data: &[u8]) -> Result<i16, ArrowError> {
225 Ok(i16::from_le_bytes(array_from_slice(data, 0)?))
226}
227
228pub(crate) fn decode_int32(data: &[u8]) -> Result<i32, ArrowError> {
230 Ok(i32::from_le_bytes(array_from_slice(data, 0)?))
231}
232
233pub(crate) fn decode_int64(data: &[u8]) -> Result<i64, ArrowError> {
235 Ok(i64::from_le_bytes(array_from_slice(data, 0)?))
236}
237
238pub(crate) fn decode_decimal4(data: &[u8]) -> Result<(i32, u8), ArrowError> {
240 let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
241 let integer = i32::from_le_bytes(array_from_slice(data, 1)?);
242 Ok((integer, scale))
243}
244
245pub(crate) fn decode_decimal8(data: &[u8]) -> Result<(i64, u8), ArrowError> {
247 let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
248 let integer = i64::from_le_bytes(array_from_slice(data, 1)?);
249 Ok((integer, scale))
250}
251
252pub(crate) fn decode_decimal16(data: &[u8]) -> Result<(i128, u8), ArrowError> {
254 let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
255 let integer = i128::from_le_bytes(array_from_slice(data, 1)?);
256 Ok((integer, scale))
257}
258
259pub(crate) fn decode_float(data: &[u8]) -> Result<f32, ArrowError> {
261 Ok(f32::from_le_bytes(array_from_slice(data, 0)?))
262}
263
264pub(crate) fn decode_double(data: &[u8]) -> Result<f64, ArrowError> {
266 Ok(f64::from_le_bytes(array_from_slice(data, 0)?))
267}
268
269pub(crate) fn decode_date(data: &[u8]) -> Result<NaiveDate, ArrowError> {
271 let days_since_epoch = i32::from_le_bytes(array_from_slice(data, 0)?);
272 let value = DateTime::UNIX_EPOCH + Duration::days(i64::from(days_since_epoch));
273 Ok(value.date_naive())
274}
275
276pub(crate) fn decode_timestamp_micros(data: &[u8]) -> Result<DateTime<Utc>, ArrowError> {
278 let micros_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?);
279 DateTime::from_timestamp_micros(micros_since_epoch).ok_or_else(|| {
280 ArrowError::CastError(format!(
281 "Could not cast `{micros_since_epoch}` microseconds into a DateTime<Utc>"
282 ))
283 })
284}
285
286pub(crate) fn decode_timestampntz_micros(data: &[u8]) -> Result<NaiveDateTime, ArrowError> {
288 let micros_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?);
289 DateTime::from_timestamp_micros(micros_since_epoch)
290 .ok_or_else(|| {
291 ArrowError::CastError(format!(
292 "Could not cast `{micros_since_epoch}` microseconds into a NaiveDateTime"
293 ))
294 })
295 .map(|v| v.naive_utc())
296}
297
298pub(crate) fn decode_binary(data: &[u8]) -> Result<&[u8], ArrowError> {
300 let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize;
301 slice_from_slice_at_offset(data, 4, 0..len)
302}
303
304pub(crate) fn decode_long_string(data: &[u8]) -> Result<&str, ArrowError> {
306 let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize;
307 string_from_slice(data, 4, 0..len)
308}
309
310pub(crate) fn decode_short_string(
312 metadata: u8,
313 data: &[u8],
314) -> Result<ShortString<'_>, ArrowError> {
315 let len = (metadata >> 2) as usize;
316 let string = string_from_slice(data, 0, 0..len)?;
317 ShortString::try_new(string)
318}
319
#[cfg(test)]
mod tests {
    use super::*;
    use paste::paste;

    /// Generates two tests for a fixed-width decoder:
    ///
    /// * `<name>_exact_length` decodes `$data` and compares with `$expected`;
    /// * `<name>_truncated_length` removes the last byte of `$data` and
    ///   expects an `InvalidArgumentError`.
    macro_rules! test_decoder_bounds {
        ($test_name:ident, $data:expr, $decode_fn:ident, $expected:expr) => {
            paste! {
                #[test]
                fn [<$test_name _exact_length>]() {
                    let decoded = $decode_fn(&$data).unwrap();
                    assert_eq!(decoded, $expected);
                }

                #[test]
                fn [<$test_name _truncated_length>]() {
                    // One byte short of what the decoder needs.
                    let shortened = &$data[..$data.len() - 1];
                    let outcome = $decode_fn(shortened);
                    assert!(matches!(outcome, Err(ArrowError::InvalidArgumentError(_))));
                }
            }
        };
    }

    mod integer {
        use super::*;

        test_decoder_bounds!(test_i8, [0x2a], decode_int8, 42);
        test_decoder_bounds!(test_i16, [0xd2, 0x04], decode_int16, 1234);
        test_decoder_bounds!(test_i32, [0x40, 0xe2, 0x01, 0x00], decode_int32, 123456);
        test_decoder_bounds!(
            test_i64,
            [0x15, 0x81, 0xe9, 0x7d, 0xf4, 0x10, 0x22, 0x11],
            decode_int64,
            1234567890123456789
        );
    }

    mod decimal {
        use super::*;

        // Layout for every decimal: scale byte first, then the unscaled value.
        test_decoder_bounds!(
            test_decimal4,
            [0x02, 0xd2, 0x04, 0x00, 0x00],
            decode_decimal4,
            (1234, 2)
        );

        test_decoder_bounds!(
            test_decimal8,
            [0x02, 0xd2, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00],
            decode_decimal8,
            (1234567890, 2)
        );

        test_decoder_bounds!(
            test_decimal16,
            [
                0x02, 0xd2, 0xb6, 0x23, 0xc0, 0xf4, 0x10, 0x22, 0x11, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00,
            ],
            decode_decimal16,
            (1234567891234567890, 2)
        );
    }

    mod float {
        use super::*;

        test_decoder_bounds!(
            test_float,
            [0x06, 0x2c, 0x93, 0x4e],
            decode_float,
            1234567890.1234
        );

        test_decoder_bounds!(
            test_double,
            [0xc9, 0xe5, 0x87, 0xb4, 0x80, 0x65, 0xd2, 0x41],
            decode_double,
            1234567890.1234
        );
    }

    mod datetime {
        use super::*;

        test_decoder_bounds!(
            test_date,
            [0xe2, 0x4e, 0x0, 0x0],
            decode_date,
            NaiveDate::from_ymd_opt(2025, 4, 16).unwrap()
        );

        test_decoder_bounds!(
            test_timestamp_micros,
            [0xe0, 0x52, 0x97, 0xdd, 0xe7, 0x32, 0x06, 0x00],
            decode_timestamp_micros,
            NaiveDate::from_ymd_opt(2025, 4, 16)
                .unwrap()
                .and_hms_milli_opt(16, 34, 56, 780)
                .unwrap()
                .and_utc()
        );

        test_decoder_bounds!(
            test_timestampntz_micros,
            [0xe0, 0x52, 0x97, 0xdd, 0xe7, 0x32, 0x06, 0x00],
            decode_timestampntz_micros,
            NaiveDate::from_ymd_opt(2025, 4, 16)
                .unwrap()
                .and_hms_milli_opt(16, 34, 56, 780)
                .unwrap()
        );
    }

    #[test]
    fn test_binary_exact_length() {
        // 4-byte length prefix (9) followed by exactly nine payload bytes.
        let encoded = [
            0x09, 0, 0, 0, 0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe,
        ];
        let payload = decode_binary(&encoded).unwrap();
        assert_eq!(
            payload,
            [0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe]
        );
    }

    #[test]
    fn test_binary_truncated_length() {
        // The prefix announces nine bytes but only eight follow.
        let encoded = [
            0x09, 0, 0, 0, 0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca,
        ];
        let outcome = decode_binary(&encoded);
        assert!(matches!(outcome, Err(ArrowError::InvalidArgumentError(_))));
    }

    #[test]
    fn test_short_string_exact_length() {
        // Header: basic type 1 in the low bits, length 5 in the upper bits.
        let header = 1 | 5 << 2;
        let bytes = [b'H', b'e', b'l', b'l', b'o', b'o'];
        let decoded = decode_short_string(header, &bytes).unwrap();
        assert_eq!(decoded.0, "Hello");
    }

    #[test]
    fn test_short_string_truncated_length() {
        // The header announces five bytes but only three are present.
        let header = 1 | 5 << 2;
        let bytes = [b'H', b'e', b'l'];
        let outcome = decode_short_string(header, &bytes);
        assert!(matches!(outcome, Err(ArrowError::InvalidArgumentError(_))));
    }

    #[test]
    fn test_string_exact_length() {
        // Length prefix of 5; the trailing extra byte must be ignored.
        let encoded = [0x05, 0, 0, 0, b'H', b'e', b'l', b'l', b'o', b'o'];
        let decoded = decode_long_string(&encoded).unwrap();
        assert_eq!(decoded, "Hello");
    }

    #[test]
    fn test_string_truncated_length() {
        // Length prefix of 5 but only three string bytes follow.
        let encoded = [0x05, 0, 0, 0, b'H', b'e', b'l'];
        let outcome = decode_long_string(&encoded);
        assert!(matches!(outcome, Err(ArrowError::InvalidArgumentError(_))));
    }

    #[test]
    fn test_offset() {
        let valid = [
            (0, OffsetSizeBytes::One),
            (1, OffsetSizeBytes::Two),
            (2, OffsetSizeBytes::Three),
            (3, OffsetSizeBytes::Four),
        ];
        for (encoded, width) in valid {
            assert_eq!(OffsetSizeBytes::try_new(encoded).unwrap(), width);
        }

        // Anything above 3 is rejected.
        for encoded in [4, 255] {
            assert!(OffsetSizeBytes::try_new(encoded).is_err());
        }
    }

    #[test]
    fn unpack_u32_all_widths() {
        // One-byte offsets.
        let one_byte = [0x01u8, 0xAB, 0xCD];
        let width = OffsetSizeBytes::One;
        assert_eq!(width.unpack_u32(&one_byte, 0).unwrap(), 0x01);
        assert_eq!(width.unpack_u32(&one_byte, 2).unwrap(), 0xCD);

        // Two-byte little-endian offsets.
        let two_byte = [0x34, 0x12, 0x78, 0x56];
        let width = OffsetSizeBytes::Two;
        assert_eq!(width.unpack_u32(&two_byte, 0).unwrap(), 0x1234);
        assert_eq!(width.unpack_u32(&two_byte, 1).unwrap(), 0x5678);

        // Three-byte little-endian offsets, zero-extended to 32 bits.
        let three_byte = [0x01, 0x02, 0x03, 0xFF, 0x00, 0x00];
        let width = OffsetSizeBytes::Three;
        assert_eq!(width.unpack_u32(&three_byte, 0).unwrap(), 0x030201);
        assert_eq!(width.unpack_u32(&three_byte, 1).unwrap(), 0x0000FF);

        // Four-byte little-endian offsets.
        let four_byte = [0x78, 0x56, 0x34, 0x12, 0xEF, 0xCD, 0xAB, 0x90];
        let width = OffsetSizeBytes::Four;
        assert_eq!(width.unpack_u32(&four_byte, 0).unwrap(), 0x1234_5678);
        assert_eq!(width.unpack_u32(&four_byte, 1).unwrap(), 0x90AB_CDEF);
    }

    #[test]
    fn unpack_u32_out_of_bounds() {
        // A single byte cannot hold a two- or three-byte offset.
        let tiny = [0x00u8];
        for width in [OffsetSizeBytes::Two, OffsetSizeBytes::Three] {
            assert!(width.unpack_u32(&tiny, 0).is_err());
        }
    }

    #[test]
    fn unpack_simple() {
        // One leading byte, then four two-byte little-endian entries:
        // 2, 0, 5 and 9.
        let buf = [0x41, 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x09, 0x00];
        let width = OffsetSizeBytes::Two;

        // Entry 0 is the dictionary size; entries 1..=3 follow it.
        let expected = [2, 0, 5, 9];
        for (index, want) in expected.into_iter().enumerate() {
            let got = width.unpack_u32_at_offset(&buf, 1, index).unwrap();
            assert_eq!(got, want);
        }

        // A fifth entry would read past the end of the buffer.
        let past_end = width.unpack_u32_at_offset(&buf, 1, expected.len());
        assert!(past_end.is_err())
    }
}