1use crate::utils::{
18 array_from_slice, overflow_error, slice_from_slice_at_offset, string_from_slice,
19};
20use crate::ShortString;
21
22use arrow_schema::ArrowError;
23use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, Utc};
24
/// Basic type tag of a variant value.
///
/// The discriminants are the values that `get_basic_type` reads from the two
/// least-significant bits of a value header byte.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum VariantBasicType {
    /// Tag `0`.
    Primitive = 0,
    /// Tag `1`.
    ShortString = 1,
    /// Tag `2`.
    Object = 2,
    /// Tag `3`.
    Array = 3,
}
39
/// Type id of a primitive variant value.
///
/// The discriminants are exactly the ids accepted by the `TryFrom<u8>`
/// implementation for this type; any other id is rejected there.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum VariantPrimitiveType {
    Null = 0,
    BooleanTrue = 1,
    BooleanFalse = 2,
    Int8 = 3,
    Int16 = 4,
    Int32 = 5,
    Int64 = 6,
    Double = 7,
    Decimal4 = 8,
    Decimal8 = 9,
    Decimal16 = 10,
    Date = 11,
    TimestampMicros = 12,
    TimestampNtzMicros = 13,
    Float = 14,
    Binary = 15,
    String = 16,
}
67
68pub(crate) fn get_basic_type(header: u8) -> VariantBasicType {
70 let basic_type = header & 0x03; match basic_type {
73 0 => VariantBasicType::Primitive,
74 1 => VariantBasicType::ShortString,
75 2 => VariantBasicType::Object,
76 3 => VariantBasicType::Array,
77 _ => {
78 unreachable!();
81 }
82 }
83}
84
85impl TryFrom<u8> for VariantPrimitiveType {
86 type Error = ArrowError;
87
88 fn try_from(value: u8) -> Result<Self, Self::Error> {
89 match value {
90 0 => Ok(VariantPrimitiveType::Null),
91 1 => Ok(VariantPrimitiveType::BooleanTrue),
92 2 => Ok(VariantPrimitiveType::BooleanFalse),
93 3 => Ok(VariantPrimitiveType::Int8),
94 4 => Ok(VariantPrimitiveType::Int16),
95 5 => Ok(VariantPrimitiveType::Int32),
96 6 => Ok(VariantPrimitiveType::Int64),
97 7 => Ok(VariantPrimitiveType::Double),
98 8 => Ok(VariantPrimitiveType::Decimal4),
99 9 => Ok(VariantPrimitiveType::Decimal8),
100 10 => Ok(VariantPrimitiveType::Decimal16),
101 11 => Ok(VariantPrimitiveType::Date),
102 12 => Ok(VariantPrimitiveType::TimestampMicros),
103 13 => Ok(VariantPrimitiveType::TimestampNtzMicros),
104 14 => Ok(VariantPrimitiveType::Float),
105 15 => Ok(VariantPrimitiveType::Binary),
106 16 => Ok(VariantPrimitiveType::String),
107 _ => Err(ArrowError::InvalidArgumentError(format!(
108 "unknown primitive type: {value}",
109 ))),
110 }
111 }
112}
113
/// Width, in bytes, of one encoded offset value.
///
/// Each discriminant equals the byte width it names, so `as usize` yields
/// the number of bytes one value occupies.
#[derive(Debug, Clone, Copy, PartialEq)]
pub(crate) enum OffsetSizeBytes {
    /// One byte per value.
    One = 1,
    /// Two bytes per value.
    Two = 2,
    /// Three bytes per value.
    Three = 3,
    /// Four bytes per value.
    Four = 4,
}
124
125impl OffsetSizeBytes {
126 pub(crate) fn try_new(offset_size_minus_one: u8) -> Result<Self, ArrowError> {
128 use OffsetSizeBytes::*;
129 let result = match offset_size_minus_one {
130 0 => One,
131 1 => Two,
132 2 => Three,
133 3 => Four,
134 _ => {
135 return Err(ArrowError::InvalidArgumentError(
136 "offset_size_minus_one must be 0–3".to_string(),
137 ))
138 }
139 };
140 Ok(result)
141 }
142
143 pub(crate) fn unpack_u32(&self, bytes: &[u8], index: usize) -> Result<u32, ArrowError> {
150 self.unpack_u32_at_offset(bytes, 0, index)
151 }
152
153 pub(crate) fn unpack_u32_at_offset(
163 &self,
164 bytes: &[u8],
165 byte_offset: usize, offset_index: usize, ) -> Result<u32, ArrowError> {
168 use OffsetSizeBytes::*;
169
170 let offset = offset_index
173 .checked_mul(*self as usize)
174 .and_then(|n| n.checked_add(byte_offset))
175 .ok_or_else(|| overflow_error("unpacking offset array value"))?;
176 let value = match self {
177 One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
178 Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
179 Three => {
180 let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?;
182 let mut buf = [0u8; 4];
184 buf[..3].copy_from_slice(&b3_chunks);
185 u32::from_le_bytes(buf)
186 }
187 Four => u32::from_le_bytes(array_from_slice(bytes, offset)?),
188 };
189 Ok(value)
190 }
191}
192
193pub(crate) fn map_bytes_to_offsets(
195 buffer: &[u8],
196 offset_size: OffsetSizeBytes,
197) -> impl Iterator<Item = usize> + use<'_> {
198 buffer
199 .chunks_exact(offset_size as usize)
200 .map(move |chunk| match offset_size {
201 OffsetSizeBytes::One => chunk[0] as usize,
202 OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], chunk[1]]) as usize,
203 OffsetSizeBytes::Three => {
204 u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) as usize
205 }
206 OffsetSizeBytes::Four => {
207 u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]) as usize
208 }
209 })
210}
211
212pub(crate) fn get_primitive_type(metadata: u8) -> Result<VariantPrimitiveType, ArrowError> {
214 VariantPrimitiveType::try_from(metadata >> 2)
216}
217
218pub(crate) fn decode_int8(data: &[u8]) -> Result<i8, ArrowError> {
220 Ok(i8::from_le_bytes(array_from_slice(data, 0)?))
221}
222
223pub(crate) fn decode_int16(data: &[u8]) -> Result<i16, ArrowError> {
225 Ok(i16::from_le_bytes(array_from_slice(data, 0)?))
226}
227
228pub(crate) fn decode_int32(data: &[u8]) -> Result<i32, ArrowError> {
230 Ok(i32::from_le_bytes(array_from_slice(data, 0)?))
231}
232
233pub(crate) fn decode_int64(data: &[u8]) -> Result<i64, ArrowError> {
235 Ok(i64::from_le_bytes(array_from_slice(data, 0)?))
236}
237
238pub(crate) fn decode_decimal4(data: &[u8]) -> Result<(i32, u8), ArrowError> {
240 let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
241 let integer = i32::from_le_bytes(array_from_slice(data, 1)?);
242 Ok((integer, scale))
243}
244
245pub(crate) fn decode_decimal8(data: &[u8]) -> Result<(i64, u8), ArrowError> {
247 let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
248 let integer = i64::from_le_bytes(array_from_slice(data, 1)?);
249 Ok((integer, scale))
250}
251
252pub(crate) fn decode_decimal16(data: &[u8]) -> Result<(i128, u8), ArrowError> {
254 let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
255 let integer = i128::from_le_bytes(array_from_slice(data, 1)?);
256 Ok((integer, scale))
257}
258
259pub(crate) fn decode_float(data: &[u8]) -> Result<f32, ArrowError> {
261 Ok(f32::from_le_bytes(array_from_slice(data, 0)?))
262}
263
264pub(crate) fn decode_double(data: &[u8]) -> Result<f64, ArrowError> {
266 Ok(f64::from_le_bytes(array_from_slice(data, 0)?))
267}
268
269pub(crate) fn decode_date(data: &[u8]) -> Result<NaiveDate, ArrowError> {
271 let days_since_epoch = i32::from_le_bytes(array_from_slice(data, 0)?);
272 let value = DateTime::UNIX_EPOCH + Duration::days(i64::from(days_since_epoch));
273 Ok(value.date_naive())
274}
275
276pub(crate) fn decode_timestamp_micros(data: &[u8]) -> Result<DateTime<Utc>, ArrowError> {
278 let micros_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?);
279 DateTime::from_timestamp_micros(micros_since_epoch).ok_or_else(|| {
280 ArrowError::CastError(format!(
281 "Could not cast `{micros_since_epoch}` microseconds into a DateTime<Utc>"
282 ))
283 })
284}
285
286pub(crate) fn decode_timestampntz_micros(data: &[u8]) -> Result<NaiveDateTime, ArrowError> {
288 let micros_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?);
289 DateTime::from_timestamp_micros(micros_since_epoch)
290 .ok_or_else(|| {
291 ArrowError::CastError(format!(
292 "Could not cast `{micros_since_epoch}` microseconds into a NaiveDateTime"
293 ))
294 })
295 .map(|v| v.naive_utc())
296}
297
298pub(crate) fn decode_binary(data: &[u8]) -> Result<&[u8], ArrowError> {
300 let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize;
301 slice_from_slice_at_offset(data, 4, 0..len)
302}
303
304pub(crate) fn decode_long_string(data: &[u8]) -> Result<&str, ArrowError> {
306 let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize;
307 string_from_slice(data, 4, 0..len)
308}
309
310pub(crate) fn decode_short_string(metadata: u8, data: &[u8]) -> Result<ShortString, ArrowError> {
312 let len = (metadata >> 2) as usize;
313 let string = string_from_slice(data, 0, 0..len)?;
314 ShortString::try_new(string)
315}
316
#[cfg(test)]
mod tests {
    use super::*;
    use paste::paste;

    // Expands to a pair of tests for one decoder:
    // `<name>_exact_length` decodes the full fixture and compares the result
    // with the expected value; `<name>_truncated_length` drops the last byte
    // of the fixture and expects `ArrowError::InvalidArgumentError`.
    macro_rules! test_decoder_bounds {
        ($test_name:ident, $data:expr, $decode_fn:ident, $expected:expr) => {
            paste! {
                #[test]
                fn [<$test_name _exact_length>]() {
                    let decoded = $decode_fn(&$data).unwrap();
                    assert_eq!(decoded, $expected);
                }

                #[test]
                fn [<$test_name _truncated_length>]() {
                    let one_byte_short = &$data[..$data.len() - 1];
                    let outcome = $decode_fn(one_byte_short);
                    assert!(matches!(outcome, Err(ArrowError::InvalidArgumentError(_))));
                }
            }
        };
    }

    mod integer {
        use super::*;

        test_decoder_bounds!(test_i8, [0x2a], decode_int8, 42);
        test_decoder_bounds!(test_i16, [0xd2, 0x04], decode_int16, 1234);
        test_decoder_bounds!(test_i32, [0x40, 0xe2, 0x01, 0x00], decode_int32, 123456);
        test_decoder_bounds!(
            test_i64,
            [0x15, 0x81, 0xe9, 0x7d, 0xf4, 0x10, 0x22, 0x11],
            decode_int64,
            1234567890123456789
        );
    }

    mod decimal {
        use super::*;

        // Each fixture is a scale byte followed by the little-endian
        // unscaled value.
        test_decoder_bounds!(
            test_decimal4,
            [
                0x02, // scale
                0xd2, 0x04, 0x00, 0x00, // unscaled value
            ],
            decode_decimal4,
            (1234, 2)
        );

        test_decoder_bounds!(
            test_decimal8,
            [
                0x02, // scale
                0xd2, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, // unscaled value
            ],
            decode_decimal8,
            (1234567890, 2)
        );

        test_decoder_bounds!(
            test_decimal16,
            [
                0x02, // scale
                0xd2, 0xb6, 0x23, 0xc0, 0xf4, 0x10, 0x22, 0x11, // unscaled value, low half
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // unscaled value, high half
            ],
            decode_decimal16,
            (1234567891234567890, 2)
        );
    }

    mod float {
        use super::*;

        test_decoder_bounds!(
            test_float,
            [0x06, 0x2c, 0x93, 0x4e],
            decode_float,
            1234567890.1234
        );

        test_decoder_bounds!(
            test_double,
            [0xc9, 0xe5, 0x87, 0xb4, 0x80, 0x65, 0xd2, 0x41],
            decode_double,
            1234567890.1234
        );
    }

    mod datetime {
        use super::*;

        test_decoder_bounds!(
            test_date,
            [0xe2, 0x4e, 0x0, 0x0],
            decode_date,
            NaiveDate::from_ymd_opt(2025, 4, 16).unwrap()
        );

        test_decoder_bounds!(
            test_timestamp_micros,
            [0xe0, 0x52, 0x97, 0xdd, 0xe7, 0x32, 0x06, 0x00],
            decode_timestamp_micros,
            NaiveDate::from_ymd_opt(2025, 4, 16)
                .unwrap()
                .and_hms_milli_opt(16, 34, 56, 780)
                .unwrap()
                .and_utc()
        );

        test_decoder_bounds!(
            test_timestampntz_micros,
            [0xe0, 0x52, 0x97, 0xdd, 0xe7, 0x32, 0x06, 0x00],
            decode_timestampntz_micros,
            NaiveDate::from_ymd_opt(2025, 4, 16)
                .unwrap()
                .and_hms_milli_opt(16, 34, 56, 780)
                .unwrap()
        );
    }

    #[test]
    fn test_binary_exact_length() {
        let payload = [0x03u8, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe];
        // 4-byte little-endian length (9) followed by the payload.
        let mut encoded = vec![0x09, 0, 0, 0];
        encoded.extend_from_slice(&payload);

        let decoded = decode_binary(&encoded).unwrap();
        assert_eq!(decoded, payload);
    }

    #[test]
    fn test_binary_truncated_length() {
        // The length prefix announces 9 bytes but only 8 follow.
        let encoded = [
            0x09, 0, 0, 0, // length
            0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca,
        ];
        let outcome = decode_binary(&encoded);
        assert!(matches!(outcome, Err(ArrowError::InvalidArgumentError(_))));
    }

    #[test]
    fn test_short_string_exact_length() {
        // Length 5 in the upper six bits, low bits set to 1; the sixth data
        // byte lies beyond the announced length.
        let header: u8 = (5 << 2) | 1;
        let bytes = [b'H', b'e', b'l', b'l', b'o', b'o'];
        let decoded = decode_short_string(header, &bytes).unwrap();
        assert_eq!(decoded.0, "Hello");
    }

    #[test]
    fn test_short_string_truncated_length() {
        // The header announces 5 bytes but only 3 are present.
        let header: u8 = (5 << 2) | 1;
        let bytes = [b'H', b'e', b'l'];
        let outcome = decode_short_string(header, &bytes);
        assert!(matches!(outcome, Err(ArrowError::InvalidArgumentError(_))));
    }

    #[test]
    fn test_string_exact_length() {
        // 4-byte little-endian length (5), then the text and one extra byte.
        let encoded = [
            0x05, 0, 0, 0, // length
            b'H', b'e', b'l', b'l', b'o', b'o',
        ];
        let decoded = decode_long_string(&encoded).unwrap();
        assert_eq!(decoded, "Hello");
    }

    #[test]
    fn test_string_truncated_length() {
        // The length prefix announces 5 bytes but only 3 follow.
        let encoded = [
            0x05, 0, 0, 0, // length
            b'H', b'e', b'l',
        ];
        let outcome = decode_long_string(&encoded);
        assert!(matches!(outcome, Err(ArrowError::InvalidArgumentError(_))));
    }

    #[test]
    fn test_offset() {
        let accepted = [
            (0, OffsetSizeBytes::One),
            (1, OffsetSizeBytes::Two),
            (2, OffsetSizeBytes::Three),
            (3, OffsetSizeBytes::Four),
        ];
        for (raw, expected) in accepted {
            assert_eq!(OffsetSizeBytes::try_new(raw).unwrap(), expected);
        }

        // Everything above 3 is rejected.
        for raw in [4, 255] {
            assert!(OffsetSizeBytes::try_new(raw).is_err());
        }
    }

    #[test]
    fn unpack_u32_all_widths() {
        // One-byte values.
        let one_byte = [0x01u8, 0xAB, 0xCD];
        for (index, expected) in [(0, 0x01), (2, 0xCD)] {
            assert_eq!(
                OffsetSizeBytes::One.unpack_u32(&one_byte, index).unwrap(),
                expected
            );
        }

        // Two-byte little-endian values.
        let two_byte = [0x34, 0x12, 0x78, 0x56];
        for (index, expected) in [(0, 0x1234), (1, 0x5678)] {
            assert_eq!(
                OffsetSizeBytes::Two.unpack_u32(&two_byte, index).unwrap(),
                expected
            );
        }

        // Three-byte little-endian values.
        let three_byte = [0x01, 0x02, 0x03, 0xFF, 0x00, 0x00];
        for (index, expected) in [(0, 0x030201), (1, 0x0000FF)] {
            assert_eq!(
                OffsetSizeBytes::Three
                    .unpack_u32(&three_byte, index)
                    .unwrap(),
                expected
            );
        }

        // Four-byte little-endian values.
        let four_byte = [0x78, 0x56, 0x34, 0x12, 0xEF, 0xCD, 0xAB, 0x90];
        for (index, expected) in [(0, 0x1234_5678), (1, 0x90AB_CDEF)] {
            assert_eq!(
                OffsetSizeBytes::Four.unpack_u32(&four_byte, index).unwrap(),
                expected
            );
        }
    }

    #[test]
    fn unpack_u32_out_of_bounds() {
        // A single byte cannot hold a two- or three-byte value.
        let single_byte = [0x00u8];
        for width in [OffsetSizeBytes::Two, OffsetSizeBytes::Three] {
            assert!(width.unpack_u32(&single_byte, 0).is_err());
        }
    }

    #[test]
    fn unpack_simple() {
        // One leading byte, then four 2-byte little-endian values.
        let encoded = [
            0x41, // leading byte, skipped via `byte_offset = 1`
            0x02, 0x00, // value 0: 2
            0x00, 0x00, // value 1: 0
            0x05, 0x00, // value 2: 5
            0x09, 0x00, // value 3: 9
        ];

        let width = OffsetSizeBytes::Two;

        for (index, expected) in [2u32, 0, 5, 9].into_iter().enumerate() {
            let unpacked = width.unpack_u32_at_offset(&encoded, 1, index).unwrap();
            assert_eq!(unpacked, expected);
        }

        // A fifth value would start past the end of the buffer.
        let past_end = width.unpack_u32_at_offset(&encoded, 1, 4);
        assert!(past_end.is_err())
    }
}