1#![doc(
157 html_logo_url = "https://arrow.apache.org/img/arrow-logo_chevrons_black-txt_white-bg.svg",
158 html_favicon_url = "https://arrow.apache.org/img/arrow-logo_chevrons_black-txt_transparent-bg.svg"
159)]
160#![cfg_attr(docsrs, feature(doc_auto_cfg))]
161#![warn(missing_docs)]
162use std::cmp::Ordering;
163use std::hash::{Hash, Hasher};
164use std::sync::Arc;
165
166use arrow_array::cast::*;
167use arrow_array::types::ArrowDictionaryKeyType;
168use arrow_array::*;
169use arrow_buffer::{ArrowNativeType, Buffer, OffsetBuffer, ScalarBuffer};
170use arrow_data::{ArrayData, ArrayDataBuilder};
171use arrow_schema::*;
172use variable::{decode_binary_view, decode_string_view};
173
174use crate::fixed::{decode_bool, decode_fixed_size_binary, decode_primitive};
175use crate::list::{compute_lengths_fixed_size_list, encode_fixed_size_list};
176use crate::variable::{decode_binary, decode_string};
177use arrow_array::types::{Int16Type, Int32Type, Int64Type};
178
179mod fixed;
180mod list;
181mod run;
182mod variable;
183
/// Converts columns of arrow arrays into a row-oriented format that preserves
/// the lexicographic sort order of the input columns when rows are compared
/// byte-wise (see [`Row`]).
#[derive(Debug)]
pub struct RowConverter {
    /// The sort configuration (data type + options), one entry per column
    fields: Arc<[SortField]>,
    /// Encode/decode state for each column, parallel to `fields`
    codecs: Vec<Codec>,
}
446
/// Per-column state used by [`RowConverter`] to encode and decode a column
#[derive(Debug)]
enum Codec {
    /// No additional state is necessary for this column's type
    Stateless,
    /// A converter for the dictionary values, plus the pre-computed encoding
    /// of a null value
    Dictionary(RowConverter, OwnedRow),
    /// A converter for the struct's children, plus the pre-computed encoding
    /// of a row of all-null children
    Struct(RowConverter, OwnedRow),
    /// A converter for the list's element type
    List(RowConverter),
    /// A converter for the run values
    RunEndEncoded(RowConverter),
}
462
impl Codec {
    /// Builds the encode/decode state required for a column of `sort_field`'s
    /// data type, recursing into nested types as needed.
    fn new(sort_field: &SortField) -> Result<Self, ArrowError> {
        match &sort_field.data_type {
            DataType::Dictionary(_, values) => {
                // Dictionaries are encoded by their logical values, so build a
                // converter for the value type and pre-compute the encoding of
                // a single null value for use when a key is null
                let sort_field =
                    SortField::new_with_options(values.as_ref().clone(), sort_field.options);

                let converter = RowConverter::new(vec![sort_field])?;
                let null_array = new_null_array(values.as_ref(), 1);
                let nulls = converter.convert_columns(&[null_array])?;

                let owned = OwnedRow {
                    data: nulls.buffer.into(),
                    config: nulls.config,
                };
                Ok(Self::Dictionary(converter, owned))
            }
            DataType::RunEndEncoded(_, values) => {
                // The values are always encoded ascending; the outer encoding
                // applies `descending` by inverting the bytes. `nulls_first`
                // is flipped when descending so that the inversion restores
                // the requested null placement.
                let options = SortOptions {
                    descending: false,
                    nulls_first: sort_field.options.nulls_first != sort_field.options.descending,
                };

                let field = SortField::new_with_options(values.data_type().clone(), options);
                let converter = RowConverter::new(vec![field])?;
                Ok(Self::RunEndEncoded(converter))
            }
            // All non-nested types are encoded directly with no extra state
            d if !d.is_nested() => Ok(Self::Stateless),
            DataType::List(f) | DataType::LargeList(f) => {
                // As for RunEndEncoded above: child values are encoded
                // ascending, with `nulls_first` flipped when descending so the
                // outer byte inversion restores the requested null placement
                let options = SortOptions {
                    descending: false,
                    nulls_first: sort_field.options.nulls_first != sort_field.options.descending,
                };

                let field = SortField::new_with_options(f.data_type().clone(), options);
                let converter = RowConverter::new(vec![field])?;
                Ok(Self::List(converter))
            }
            DataType::FixedSizeList(f, _) => {
                // Fixed-size lists reuse the parent's options directly
                let field = SortField::new_with_options(f.data_type().clone(), sort_field.options);
                let converter = RowConverter::new(vec![field])?;
                Ok(Self::List(converter))
            }
            DataType::Struct(f) => {
                // Each child is encoded with the parent's sort options; the
                // encoding of a row of all-null children is pre-computed for
                // use when the struct itself is null
                let sort_fields = f
                    .iter()
                    .map(|x| SortField::new_with_options(x.data_type().clone(), sort_field.options))
                    .collect();

                let converter = RowConverter::new(sort_fields)?;
                let nulls: Vec<_> = f.iter().map(|x| new_null_array(x.data_type(), 1)).collect();

                let nulls = converter.convert_columns(&nulls)?;
                let owned = OwnedRow {
                    data: nulls.buffer.into(),
                    config: nulls.config,
                };

                Ok(Self::Struct(converter, owned))
            }
            _ => Err(ArrowError::NotYetImplemented(format!(
                "not yet implemented: {:?}",
                sort_field.data_type
            ))),
        }
    }

    /// Prepares a per-array [`Encoder`] for `array`, eagerly converting any
    /// child/value arrays to rows.
    fn encoder(&self, array: &dyn Array) -> Result<Encoder<'_>, ArrowError> {
        match self {
            Codec::Stateless => Ok(Encoder::Stateless),
            Codec::Dictionary(converter, nulls) => {
                let values = array.as_any_dictionary().values().clone();
                let rows = converter.convert_columns(&[values])?;
                Ok(Encoder::Dictionary(rows, nulls.row()))
            }
            Codec::Struct(converter, null) => {
                let v = as_struct_array(array);
                let rows = converter.convert_columns(v.columns())?;
                Ok(Encoder::Struct(rows, null.row()))
            }
            Codec::List(converter) => {
                let values = match array.data_type() {
                    DataType::List(_) => {
                        // Only convert the child values actually referenced by
                        // the offsets — a sliced list array may reference only
                        // a window of its child array
                        let list_array = as_list_array(array);
                        let first_offset = list_array.offsets()[0] as usize;
                        let last_offset =
                            list_array.offsets()[list_array.offsets().len() - 1] as usize;

                        list_array
                            .values()
                            .slice(first_offset, last_offset - first_offset)
                    }
                    DataType::LargeList(_) => {
                        let list_array = as_large_list_array(array);

                        let first_offset = list_array.offsets()[0] as usize;
                        let last_offset =
                            list_array.offsets()[list_array.offsets().len() - 1] as usize;

                        list_array
                            .values()
                            .slice(first_offset, last_offset - first_offset)
                    }
                    DataType::FixedSizeList(_, _) => {
                        as_fixed_size_list_array(array).values().clone()
                    }
                    _ => unreachable!(),
                };
                let rows = converter.convert_columns(&[values])?;
                Ok(Encoder::List(rows))
            }
            Codec::RunEndEncoded(converter) => {
                let values = match array.data_type() {
                    DataType::RunEndEncoded(r, _) => match r.data_type() {
                        DataType::Int16 => array.as_run::<Int16Type>().values(),
                        DataType::Int32 => array.as_run::<Int32Type>().values(),
                        DataType::Int64 => array.as_run::<Int64Type>().values(),
                        _ => unreachable!("Unsupported run end index type: {r:?}"),
                    },
                    _ => unreachable!(),
                };
                let rows = converter.convert_columns(std::slice::from_ref(values))?;
                Ok(Encoder::RunEndEncoded(rows))
            }
        }
    }

    /// Returns the memory retained by this codec in bytes (excluding `Self`)
    fn size(&self) -> usize {
        match self {
            Codec::Stateless => 0,
            Codec::Dictionary(converter, nulls) => converter.size() + nulls.data.len(),
            Codec::Struct(converter, nulls) => converter.size() + nulls.data.len(),
            Codec::List(converter) => converter.size(),
            Codec::RunEndEncoded(converter) => converter.size(),
        }
    }
}
608
/// Per-array encode state produced by [`Codec::encoder`]
#[derive(Debug)]
enum Encoder<'a> {
    /// The array can be encoded directly from its values
    Stateless,
    /// The dictionary values converted to rows, plus the encoding of a null
    /// value borrowed from the codec
    Dictionary(Rows, Row<'a>),
    /// The struct children converted to rows, plus the encoding of an
    /// all-null child row borrowed from the codec
    Struct(Rows, Row<'a>),
    /// The list's child values converted to rows
    List(Rows),
    /// The run values converted to rows
    RunEndEncoded(Rows),
}
626
/// Configures the data type and sort order for a single column handled by a
/// [`RowConverter`]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SortField {
    /// Sort options: ordering direction and null placement
    options: SortOptions,
    /// The column's data type
    data_type: DataType,
}
635
636impl SortField {
637 pub fn new(data_type: DataType) -> Self {
639 Self::new_with_options(data_type, Default::default())
640 }
641
642 pub fn new_with_options(data_type: DataType, options: SortOptions) -> Self {
644 Self { options, data_type }
645 }
646
647 pub fn size(&self) -> usize {
651 self.data_type.size() + std::mem::size_of::<Self>() - std::mem::size_of::<DataType>()
652 }
653}
654
impl RowConverter {
    /// Creates a new [`RowConverter`] for the provided `fields`
    ///
    /// # Errors
    ///
    /// Returns [`ArrowError::NotYetImplemented`] if any field's data type is
    /// not supported by the row format (see [`Self::supports_fields`]).
    pub fn new(fields: Vec<SortField>) -> Result<Self, ArrowError> {
        if !Self::supports_fields(&fields) {
            return Err(ArrowError::NotYetImplemented(format!(
                "Row format support not yet implemented for: {fields:?}"
            )));
        }

        let codecs = fields.iter().map(Codec::new).collect::<Result<_, _>>()?;
        Ok(Self {
            fields: fields.into(),
            codecs,
        })
    }

    /// Returns `true` if this [`RowConverter`] supports all the given fields
    pub fn supports_fields(fields: &[SortField]) -> bool {
        fields.iter().all(|x| Self::supports_datatype(&x.data_type))
    }

    /// Recursively checks whether `d` (and all of its children) can be
    /// encoded by the row format
    fn supports_datatype(d: &DataType) -> bool {
        match d {
            _ if !d.is_nested() => true,
            DataType::List(f) | DataType::LargeList(f) | DataType::FixedSizeList(f, _) => {
                Self::supports_datatype(f.data_type())
            }
            DataType::Struct(f) => f.iter().all(|x| Self::supports_datatype(x.data_type())),
            DataType::RunEndEncoded(_, values) => Self::supports_datatype(values.data_type()),
            _ => false,
        }
    }

    /// Converts `columns` into their row format, returning a fresh [`Rows`]
    ///
    /// Equivalent to [`Self::empty_rows`] followed by [`Self::append`].
    pub fn convert_columns(&self, columns: &[ArrayRef]) -> Result<Rows, ArrowError> {
        let num_rows = columns.first().map(|x| x.len()).unwrap_or(0);
        let mut rows = self.empty_rows(num_rows, 0);
        self.append(&mut rows, columns)?;
        Ok(rows)
    }

    /// Appends the row encoding of `columns` onto `rows`
    ///
    /// # Panics
    ///
    /// Panics if `rows` was not created by this exact converter (compared by
    /// `Arc` pointer identity of the fields).
    ///
    /// # Errors
    ///
    /// Returns an error if the number of columns, their lengths, or their
    /// data types do not match this converter's fields.
    pub fn append(&self, rows: &mut Rows, columns: &[ArrayRef]) -> Result<(), ArrowError> {
        assert!(
            Arc::ptr_eq(&rows.config.fields, &self.fields),
            "rows were not produced by this RowConverter"
        );

        if columns.len() != self.fields.len() {
            return Err(ArrowError::InvalidArgumentError(format!(
                "Incorrect number of arrays provided to RowConverter, expected {} got {}",
                self.fields.len(),
                columns.len()
            )));
        }
        for colum in columns.iter().skip(1) {
            if colum.len() != columns[0].len() {
                return Err(ArrowError::InvalidArgumentError(format!(
                    "RowConverter columns must all have the same length, expected {} got {}",
                    columns[0].len(),
                    colum.len()
                )));
            }
        }

        // Validate each column's type and build its per-array encoder,
        // converting any nested children to rows up front
        let encoders = columns
            .iter()
            .zip(&self.codecs)
            .zip(self.fields.iter())
            .map(|((column, codec), field)| {
                if !column.data_type().equals_datatype(&field.data_type) {
                    return Err(ArrowError::InvalidArgumentError(format!(
                        "RowConverter column schema mismatch, expected {} got {}",
                        field.data_type,
                        column.data_type()
                    )));
                }
                codec.encoder(column.as_ref())
            })
            .collect::<Result<Vec<_>, _>>()?;

        // Compute the length of each new row, extend `offsets` with each new
        // row's *start* position, and size the buffer for the total length
        let write_offset = rows.num_rows();
        let lengths = row_lengths(columns, &encoders);
        let total = lengths.extend_offsets(rows.offsets[write_offset], &mut rows.offsets);
        rows.buffer.resize(total, 0);

        // `encode_column` advances each offset past the bytes it writes, so
        // after all columns are encoded, offsets[i + 1] is the end of row i
        for ((column, field), encoder) in columns.iter().zip(self.fields.iter()).zip(encoders) {
            encode_column(
                &mut rows.buffer,
                &mut rows.offsets[write_offset..],
                column.as_ref(),
                field.options,
                &encoder,
            )
        }

        if cfg!(debug_assertions) {
            assert_eq!(*rows.offsets.last().unwrap(), rows.buffer.len());
            rows.offsets
                .windows(2)
                .for_each(|w| assert!(w[0] <= w[1], "offsets should be monotonic"));
        }

        Ok(())
    }

    /// Converts rows back into their original arrow arrays
    ///
    /// # Panics
    ///
    /// Panics if any row was not produced by this exact converter.
    pub fn convert_rows<'a, I>(&self, rows: I) -> Result<Vec<ArrayRef>, ArrowError>
    where
        I: IntoIterator<Item = Row<'a>>,
    {
        // UTF-8 validation is only required if any row may have come from an
        // untrusted source (e.g. parsed from raw bytes)
        let mut validate_utf8 = false;
        let mut rows: Vec<_> = rows
            .into_iter()
            .map(|row| {
                assert!(
                    Arc::ptr_eq(&row.config.fields, &self.fields),
                    "rows were not produced by this RowConverter"
                );
                validate_utf8 |= row.config.validate_utf8;
                row.data
            })
            .collect();

        // SAFETY: rows were produced by this converter (asserted above), so
        // they are valid encodings for these fields
        let result = unsafe { self.convert_raw(&mut rows, validate_utf8) }?;

        // In test builds, verify that decoding consumed every byte of each row
        if cfg!(test) {
            for (i, row) in rows.iter().enumerate() {
                if !row.is_empty() {
                    return Err(ArrowError::InvalidArgumentError(format!(
                        "Codecs {codecs:?} did not consume all bytes for row {i}, remaining bytes: {row:?}",
                        codecs = &self.codecs
                    )));
                }
            }
        }

        Ok(result)
    }

    /// Returns an empty [`Rows`] with capacity for `row_capacity` rows and
    /// `data_capacity` bytes, tied to this converter's fields
    pub fn empty_rows(&self, row_capacity: usize, data_capacity: usize) -> Rows {
        let mut offsets = Vec::with_capacity(row_capacity.saturating_add(1));
        offsets.push(0);

        Rows {
            offsets,
            buffer: Vec::with_capacity(data_capacity),
            config: RowConfig {
                fields: self.fields.clone(),
                validate_utf8: false,
            },
        }
    }

    /// Creates a [`Rows`] from a [`BinaryArray`] of encoded row bytes
    ///
    /// `validate_utf8` is set so string columns are checked when decoding,
    /// since the bytes may not originate from this process.
    ///
    /// # Panics
    ///
    /// Panics if the array contains nulls.
    pub fn from_binary(&self, array: BinaryArray) -> Rows {
        assert_eq!(
            array.null_count(),
            0,
            "can't construct Rows instance from array with nulls"
        );
        Rows {
            buffer: array.values().to_vec(),
            offsets: array.offsets().iter().map(|&i| i.as_usize()).collect(),
            config: RowConfig {
                fields: Arc::clone(&self.fields),
                validate_utf8: true,
            },
        }
    }

    /// Decodes raw row byte slices into arrays, advancing each slice past the
    /// bytes consumed for each column
    ///
    /// # Safety
    ///
    /// Each slice in `rows` must be a valid encoding produced for these
    /// fields; `validate_utf8` must be true if the bytes are untrusted.
    unsafe fn convert_raw(
        &self,
        rows: &mut [&[u8]],
        validate_utf8: bool,
    ) -> Result<Vec<ArrayRef>, ArrowError> {
        self.fields
            .iter()
            .zip(&self.codecs)
            .map(|(field, codec)| decode_column(field, rows, codec, validate_utf8))
            .collect()
    }

    /// Returns a [`RowParser`] that can parse [`Row`]s from raw bytes for
    /// this converter's fields
    pub fn parser(&self) -> RowParser {
        RowParser::new(Arc::clone(&self.fields))
    }

    /// Returns the memory usage of this converter in bytes
    pub fn size(&self) -> usize {
        std::mem::size_of::<Self>()
            + self.fields.iter().map(|x| x.size()).sum::<usize>()
            + self.codecs.capacity() * std::mem::size_of::<Codec>()
            + self.codecs.iter().map(Codec::size).sum::<usize>()
    }
}
958
/// Parses [`Row`]s from raw bytes produced by a [`RowConverter`], obtained
/// via [`RowConverter::parser`]
#[derive(Debug)]
pub struct RowParser {
    /// Configuration stamped onto every parsed row; `validate_utf8` is always
    /// true because parsed bytes may come from an untrusted source
    config: RowConfig,
}
964
965impl RowParser {
966 fn new(fields: Arc<[SortField]>) -> Self {
967 Self {
968 config: RowConfig {
969 fields,
970 validate_utf8: true,
971 },
972 }
973 }
974
975 pub fn parse<'a>(&'a self, bytes: &'a [u8]) -> Row<'a> {
980 Row {
981 data: bytes,
982 config: &self.config,
983 }
984 }
985}
986
/// The configuration shared by a set of [`Row`]s
#[derive(Debug, Clone)]
struct RowConfig {
    /// The schema for these rows
    fields: Arc<[SortField]>,
    /// Whether UTF-8 validation must be performed when decoding string
    /// columns back into arrays
    validate_utf8: bool,
}
995
/// A row-oriented representation of arrow data, produced by a [`RowConverter`]
#[derive(Debug)]
pub struct Rows {
    /// The encoded bytes of all rows, laid out contiguously
    buffer: Vec<u8>,
    /// Row `i` occupies `buffer[offsets[i]..offsets[i + 1]]`
    offsets: Vec<usize>,
    /// The configuration these rows were produced with
    config: RowConfig,
}
1008
impl Rows {
    /// Appends a single [`Row`] to this collection
    ///
    /// # Panics
    ///
    /// Panics if `row` was not produced by the same converter (compared by
    /// `Arc` pointer identity of the fields).
    pub fn push(&mut self, row: Row<'_>) {
        assert!(
            Arc::ptr_eq(&row.config.fields, &self.config.fields),
            "row was not produced by this RowConverter"
        );
        // A pushed row may originate from a parser, so propagate its
        // validation requirement
        self.config.validate_utf8 |= row.config.validate_utf8;
        self.buffer.extend_from_slice(row.data);
        self.offsets.push(self.buffer.len())
    }

    /// Returns the row at index `row`, panicking if out of bounds
    pub fn row(&self, row: usize) -> Row<'_> {
        // offsets has num_rows + 1 entries, so valid rows satisfy
        // row + 1 < offsets.len()
        assert!(row + 1 < self.offsets.len());
        unsafe { self.row_unchecked(row) }
    }

    /// Returns the row at `index` without bounds checking
    ///
    /// # Safety
    ///
    /// Caller must ensure `index < self.num_rows()`
    pub unsafe fn row_unchecked(&self, index: usize) -> Row<'_> {
        // SAFETY: index + 1 <= num_rows < offsets.len() per the contract above
        let end = unsafe { self.offsets.get_unchecked(index + 1) };
        let start = unsafe { self.offsets.get_unchecked(index) };
        // SAFETY: offsets are maintained monotonic and bounded by buffer.len()
        let data = unsafe { self.buffer.get_unchecked(*start..*end) };
        Row {
            data,
            config: &self.config,
        }
    }

    /// Removes all rows, retaining the allocated capacity
    pub fn clear(&mut self) {
        // Keep the leading sentinel 0 offset
        self.offsets.truncate(1);
        self.buffer.clear();
    }

    /// Returns the number of rows
    pub fn num_rows(&self) -> usize {
        self.offsets.len() - 1
    }

    /// Returns an iterator over the rows
    pub fn iter(&self) -> RowsIter<'_> {
        self.into_iter()
    }

    /// Returns the number of bytes of memory used by these rows
    pub fn size(&self) -> usize {
        std::mem::size_of::<Self>()
            + self.buffer.len()
            + self.offsets.len() * std::mem::size_of::<usize>()
    }

    /// Converts these rows into a [`BinaryArray`] of their encoded bytes,
    /// suitable for round-tripping via [`RowConverter::from_binary`]
    ///
    /// # Errors
    ///
    /// Errors if the buffer exceeds `i32::MAX` bytes, the limit of an
    /// i32-indexed [`BinaryArray`].
    pub fn try_into_binary(self) -> Result<BinaryArray, ArrowError> {
        if self.buffer.len() > i32::MAX as usize {
            return Err(ArrowError::InvalidArgumentError(format!(
                "{}-byte rows buffer too long to convert into a i32-indexed BinaryArray",
                self.buffer.len()
            )));
        }
        // All offsets are <= buffer.len() <= i32::MAX, so the cast is lossless
        let offsets_scalar = ScalarBuffer::from_iter(self.offsets.into_iter().map(i32::usize_as));
        // SAFETY: offsets are monotonic, non-null, and bounded by the buffer
        let array = unsafe {
            BinaryArray::new_unchecked(
                OffsetBuffer::new_unchecked(offsets_scalar),
                Buffer::from_vec(self.buffer),
                None,
            )
        };
        Ok(array)
    }
}
1116
1117impl<'a> IntoIterator for &'a Rows {
1118 type Item = Row<'a>;
1119 type IntoIter = RowsIter<'a>;
1120
1121 fn into_iter(self) -> Self::IntoIter {
1122 RowsIter {
1123 rows: self,
1124 start: 0,
1125 end: self.num_rows(),
1126 }
1127 }
1128}
1129
/// A double-ended iterator over [`Rows`]
#[derive(Debug)]
pub struct RowsIter<'a> {
    /// The rows being iterated
    rows: &'a Rows,
    /// Index of the next row yielded from the front (inclusive)
    start: usize,
    /// One past the next row yielded from the back (exclusive)
    end: usize,
}
1137
1138impl<'a> Iterator for RowsIter<'a> {
1139 type Item = Row<'a>;
1140
1141 fn next(&mut self) -> Option<Self::Item> {
1142 if self.end == self.start {
1143 return None;
1144 }
1145
1146 let row = unsafe { self.rows.row_unchecked(self.start) };
1148 self.start += 1;
1149 Some(row)
1150 }
1151
1152 fn size_hint(&self) -> (usize, Option<usize>) {
1153 let len = self.len();
1154 (len, Some(len))
1155 }
1156}
1157
impl ExactSizeIterator for RowsIter<'_> {
    /// Number of rows not yet yielded from either end
    fn len(&self) -> usize {
        self.end - self.start
    }
}
1163
1164impl DoubleEndedIterator for RowsIter<'_> {
1165 fn next_back(&mut self) -> Option<Self::Item> {
1166 if self.end == self.start {
1167 return None;
1168 }
1169 let row = unsafe { self.rows.row_unchecked(self.end) };
1171 self.end -= 1;
1172 Some(row)
1173 }
1174}
1175
/// A comparable representation of a single row
///
/// Rows compare byte-wise; the encoding guarantees this matches the
/// lexicographic order of the source columns under their [`SortOptions`].
#[derive(Debug, Copy, Clone)]
pub struct Row<'a> {
    /// The encoded bytes of this row
    data: &'a [u8],
    /// The configuration this row was produced with
    config: &'a RowConfig,
}
1189
1190impl<'a> Row<'a> {
1191 pub fn owned(&self) -> OwnedRow {
1193 OwnedRow {
1194 data: self.data.into(),
1195 config: self.config.clone(),
1196 }
1197 }
1198
1199 pub fn data(&self) -> &'a [u8] {
1201 self.data
1202 }
1203}
1204
// Equality, ordering, and hashing all delegate to the raw encoded bytes: the
// row encoding is constructed so that byte-wise comparison matches the
// lexicographic order of the original columns
impl PartialEq for Row<'_> {
    #[inline]
    fn eq(&self, other: &Self) -> bool {
        self.data.eq(other.data)
    }
}

impl Eq for Row<'_> {}

impl PartialOrd for Row<'_> {
    #[inline]
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for Row<'_> {
    #[inline]
    fn cmp(&self, other: &Self) -> Ordering {
        self.data.cmp(other.data)
    }
}

impl Hash for Row<'_> {
    #[inline]
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.data.hash(state)
    }
}

impl AsRef<[u8]> for Row<'_> {
    #[inline]
    fn as_ref(&self) -> &[u8] {
        self.data
    }
}
1243
/// An owned version of [`Row`], holding its own copy of the encoded bytes
#[derive(Debug, Clone)]
pub struct OwnedRow {
    /// The encoded bytes of this row
    data: Box<[u8]>,
    /// The configuration this row was produced with
    config: RowConfig,
}
1252
1253impl OwnedRow {
1254 pub fn row(&self) -> Row<'_> {
1258 Row {
1259 data: &self.data,
1260 config: &self.config,
1261 }
1262 }
1263}
1264
// All comparisons and hashing delegate to the borrowed [`Row`] view, which in
// turn compares the raw encoded bytes
impl PartialEq for OwnedRow {
    #[inline]
    fn eq(&self, other: &Self) -> bool {
        self.row().eq(&other.row())
    }
}

impl Eq for OwnedRow {}

impl PartialOrd for OwnedRow {
    #[inline]
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for OwnedRow {
    #[inline]
    fn cmp(&self, other: &Self) -> Ordering {
        self.row().cmp(&other.row())
    }
}

impl Hash for OwnedRow {
    #[inline]
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.row().hash(state)
    }
}

impl AsRef<[u8]> for OwnedRow {
    #[inline]
    fn as_ref(&self) -> &[u8] {
        &self.data
    }
}
1303
1304#[inline]
1306fn null_sentinel(options: SortOptions) -> u8 {
1307 match options.nulls_first {
1308 true => 0,
1309 false => 0xFF,
1310 }
1311}
1312
/// Tracks the accumulated encoded length of every row being appended
///
/// Starts in the cheap [`LengthTracker::Fixed`] representation (a single
/// shared length) and only materializes per-row bookkeeping once a
/// variable-length column is encountered.
enum LengthTracker {
    /// Every row has the same length
    Fixed { length: usize, num_rows: usize },
    /// Rows may differ in length; `lengths[i]` excludes `fixed_length`,
    /// which is shared by all rows
    Variable {
        fixed_length: usize,
        lengths: Vec<usize>,
    },
}

impl LengthTracker {
    /// Creates a tracker for `num_rows` rows, all initially of length 0
    fn new(num_rows: usize) -> Self {
        Self::Fixed {
            length: 0,
            num_rows,
        }
    }

    /// Adds `new_length` bytes to every row, preserving the representation
    fn push_fixed(&mut self, new_length: usize) {
        let shared = match self {
            LengthTracker::Fixed { length, .. } => length,
            LengthTracker::Variable { fixed_length, .. } => fixed_length,
        };
        *shared += new_length;
    }

    /// Adds a per-row number of bytes, switching to the variable
    /// representation on first use
    fn push_variable(&mut self, new_lengths: impl ExactSizeIterator<Item = usize>) {
        if let LengthTracker::Fixed { length, .. } = self {
            *self = LengthTracker::Variable {
                fixed_length: *length,
                lengths: new_lengths.collect(),
            };
            return;
        }
        if let LengthTracker::Variable { lengths, .. } = self {
            assert_eq!(lengths.len(), new_lengths.len());
            for (length, new_length) in lengths.iter_mut().zip(new_lengths) {
                *length += new_length;
            }
        }
    }

    /// Forces the variable representation and returns a mutable view of the
    /// per-row variable lengths
    fn materialized(&mut self) -> &mut [usize] {
        if let LengthTracker::Fixed { length, num_rows } = *self {
            *self = LengthTracker::Variable {
                fixed_length: length,
                lengths: vec![0; num_rows],
            };
        }

        match self {
            LengthTracker::Variable { lengths, .. } => lengths,
            LengthTracker::Fixed { .. } => unreachable!(),
        }
    }

    /// Appends each row's *start* offset to `offsets`, beginning at
    /// `initial_offset`, and returns the offset one past the final row.
    /// Encoding later advances each offset to the row's end.
    fn extend_offsets(&self, initial_offset: usize, offsets: &mut Vec<usize>) -> usize {
        match self {
            LengthTracker::Fixed { length, num_rows } => {
                offsets.extend((0..*num_rows).map(|i| initial_offset + i * length));

                initial_offset + num_rows * length
            }
            LengthTracker::Variable {
                fixed_length,
                lengths,
            } => {
                let mut next = initial_offset;
                for row_length in lengths {
                    offsets.push(next);
                    next += row_length + fixed_length;
                }
                next
            }
        }
    }
}
1415
/// Computes the encoded length contributed by every column to every row,
/// returning a [`LengthTracker`] with the per-row totals
fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> LengthTracker {
    use fixed::FixedLengthEncoding;

    let num_rows = cols.first().map(|x| x.len()).unwrap_or(0);
    let mut tracker = LengthTracker::new(num_rows);

    for (array, encoder) in cols.iter().zip(encoders) {
        match encoder {
            Encoder::Stateless => {
                // Primitive types contribute a fixed length per row;
                // variable-length types contribute a per-value length
                downcast_primitive_array! {
                    array => tracker.push_fixed(fixed::encoded_len(array)),
                    DataType::Null => {},
                    DataType::Boolean => tracker.push_fixed(bool::ENCODED_LEN),
                    DataType::Binary => tracker.push_variable(
                        as_generic_binary_array::<i32>(array)
                            .iter()
                            .map(|slice| variable::encoded_len(slice))
                    ),
                    DataType::LargeBinary => tracker.push_variable(
                        as_generic_binary_array::<i64>(array)
                            .iter()
                            .map(|slice| variable::encoded_len(slice))
                    ),
                    DataType::BinaryView => tracker.push_variable(
                        array.as_binary_view()
                            .iter()
                            .map(|slice| variable::encoded_len(slice))
                    ),
                    DataType::Utf8 => tracker.push_variable(
                        array.as_string::<i32>()
                            .iter()
                            .map(|slice| variable::encoded_len(slice.map(|x| x.as_bytes())))
                    ),
                    DataType::LargeUtf8 => tracker.push_variable(
                        array.as_string::<i64>()
                            .iter()
                            .map(|slice| variable::encoded_len(slice.map(|x| x.as_bytes())))
                    ),
                    DataType::Utf8View => tracker.push_variable(
                        array.as_string_view()
                            .iter()
                            .map(|slice| variable::encoded_len(slice.map(|x| x.as_bytes())))
                    ),
                    DataType::FixedSizeBinary(len) => {
                        // 1 extra byte for the null/valid sentinel
                        let len = len.to_usize().unwrap();
                        tracker.push_fixed(1 + len)
                    }
                    _ => unimplemented!("unsupported data type: {}", array.data_type()),
                }
            }
            Encoder::Dictionary(values, null) => {
                // Each key contributes the length of its encoded value row,
                // or the pre-computed null row when the key is null
                downcast_dictionary_array! {
                    array => {
                        tracker.push_variable(
                            array.keys().iter().map(|v| match v {
                                Some(k) => values.row(k.as_usize()).data.len(),
                                None => null.data.len(),
                            })
                        )
                    }
                    _ => unreachable!(),
                }
            }
            Encoder::Struct(rows, null) => {
                // 1 byte sentinel plus the child-row encoding (or the
                // pre-computed all-null child row)
                let array = as_struct_array(array);
                tracker.push_variable((0..array.len()).map(|idx| match array.is_valid(idx) {
                    true => 1 + rows.row(idx).as_ref().len(),
                    false => 1 + null.data.len(),
                }));
            }
            Encoder::List(rows) => match array.data_type() {
                // List length computation writes directly into the
                // materialized per-row lengths
                DataType::List(_) => {
                    list::compute_lengths(tracker.materialized(), rows, as_list_array(array))
                }
                DataType::LargeList(_) => {
                    list::compute_lengths(tracker.materialized(), rows, as_large_list_array(array))
                }
                DataType::FixedSizeList(_, _) => compute_lengths_fixed_size_list(
                    &mut tracker,
                    rows,
                    as_fixed_size_list_array(array),
                ),
                _ => unreachable!(),
            },
            Encoder::RunEndEncoded(rows) => match array.data_type() {
                DataType::RunEndEncoded(r, _) => match r.data_type() {
                    DataType::Int16 => run::compute_lengths(
                        tracker.materialized(),
                        rows,
                        array.as_run::<Int16Type>(),
                    ),
                    DataType::Int32 => run::compute_lengths(
                        tracker.materialized(),
                        rows,
                        array.as_run::<Int32Type>(),
                    ),
                    DataType::Int64 => run::compute_lengths(
                        tracker.materialized(),
                        rows,
                        array.as_run::<Int64Type>(),
                    ),
                    _ => unreachable!("Unsupported run end index type: {r:?}"),
                },
                _ => unreachable!(),
            },
        }
    }

    tracker
}
1527
/// Encodes one column into `data`, advancing each entry of `offsets` (beyond
/// the first, which is the overall start) past the bytes written for the
/// corresponding row. `offsets` must have been pre-populated with each row's
/// start position and `data` sized for the total encoded length.
fn encode_column(
    data: &mut [u8],
    offsets: &mut [usize],
    column: &dyn Array,
    opts: SortOptions,
    encoder: &Encoder<'_>,
) {
    match encoder {
        Encoder::Stateless => {
            downcast_primitive_array! {
                column => {
                    // Fast path when the column contains no nulls
                    if let Some(nulls) = column.nulls().filter(|n| n.null_count() > 0){
                        fixed::encode(data, offsets, column.values(), nulls, opts)
                    } else {
                        fixed::encode_not_null(data, offsets, column.values(), opts)
                    }
                }
                DataType::Null => {}
                DataType::Boolean => {
                    if let Some(nulls) = column.nulls().filter(|n| n.null_count() > 0){
                        fixed::encode_boolean(data, offsets, column.as_boolean().values(), nulls, opts)
                    } else {
                        fixed::encode_boolean_not_null(data, offsets, column.as_boolean().values(), opts)
                    }
                }
                DataType::Binary => {
                    variable::encode(data, offsets, as_generic_binary_array::<i32>(column).iter(), opts)
                }
                DataType::BinaryView => {
                    variable::encode(data, offsets, column.as_binary_view().iter(), opts)
                }
                DataType::LargeBinary => {
                    variable::encode(data, offsets, as_generic_binary_array::<i64>(column).iter(), opts)
                }
                DataType::Utf8 => variable::encode(
                    data, offsets,
                    column.as_string::<i32>().iter().map(|x| x.map(|x| x.as_bytes())),
                    opts,
                ),
                DataType::LargeUtf8 => variable::encode(
                    data, offsets,
                    column.as_string::<i64>()
                        .iter()
                        .map(|x| x.map(|x| x.as_bytes())),
                    opts,
                ),
                DataType::Utf8View => variable::encode(
                    data, offsets,
                    column.as_string_view().iter().map(|x| x.map(|x| x.as_bytes())),
                    opts,
                ),
                DataType::FixedSizeBinary(_) => {
                    let array = column.as_any().downcast_ref().unwrap();
                    fixed::encode_fixed_size_binary(data, offsets, array, opts)
                }
                _ => unimplemented!("unsupported data type: {}", column.data_type()),
            }
        }
        Encoder::Dictionary(values, nulls) => {
            downcast_dictionary_array! {
                column => encode_dictionary_values(data, offsets, column, values, nulls),
                _ => unreachable!()
            }
        }
        Encoder::Struct(rows, null) => {
            // Each struct row is written as a 1-byte sentinel (0x01 for
            // valid, the null sentinel otherwise) followed by the encoding of
            // its children (or the pre-computed all-null child row)
            let array = as_struct_array(column);
            let null_sentinel = null_sentinel(opts);
            offsets
                .iter_mut()
                .skip(1)
                .enumerate()
                .for_each(|(idx, offset)| {
                    let (row, sentinel) = match array.is_valid(idx) {
                        true => (rows.row(idx), 0x01),
                        false => (*null, null_sentinel),
                    };
                    let end_offset = *offset + 1 + row.as_ref().len();
                    data[*offset] = sentinel;
                    data[*offset + 1..end_offset].copy_from_slice(row.as_ref());
                    *offset = end_offset;
                })
        }
        Encoder::List(rows) => match column.data_type() {
            DataType::List(_) => list::encode(data, offsets, rows, opts, as_list_array(column)),
            DataType::LargeList(_) => {
                list::encode(data, offsets, rows, opts, as_large_list_array(column))
            }
            DataType::FixedSizeList(_, _) => {
                encode_fixed_size_list(data, offsets, rows, opts, as_fixed_size_list_array(column))
            }
            _ => unreachable!(),
        },
        Encoder::RunEndEncoded(rows) => match column.data_type() {
            DataType::RunEndEncoded(r, _) => match r.data_type() {
                DataType::Int16 => {
                    run::encode(data, offsets, rows, opts, column.as_run::<Int16Type>())
                }
                DataType::Int32 => {
                    run::encode(data, offsets, rows, opts, column.as_run::<Int32Type>())
                }
                DataType::Int64 => {
                    run::encode(data, offsets, rows, opts, column.as_run::<Int64Type>())
                }
                _ => unreachable!("Unsupported run end index type: {r:?}"),
            },
            _ => unreachable!(),
        },
    }
}
1638
1639pub fn encode_dictionary_values<K: ArrowDictionaryKeyType>(
1641 data: &mut [u8],
1642 offsets: &mut [usize],
1643 column: &DictionaryArray<K>,
1644 values: &Rows,
1645 null: &Row<'_>,
1646) {
1647 for (offset, k) in offsets.iter_mut().skip(1).zip(column.keys()) {
1648 let row = match k {
1649 Some(k) => values.row(k.as_usize()).data,
1650 None => null.data,
1651 };
1652 let end_offset = *offset + row.len();
1653 data[*offset..end_offset].copy_from_slice(row);
1654 *offset = end_offset;
1655 }
1656}
1657
/// Dispatch helper used with `downcast_primitive!` in [`decode_column`]:
/// decodes a primitive column of concrete type `$t` and erases the array
/// type behind an `Arc`
macro_rules! decode_primitive_helper {
    ($t:ty, $rows:ident, $data_type:ident, $options:ident) => {
        Arc::new(decode_primitive::<$t>($rows, $data_type, $options))
    };
}
1663
/// Decodes one column from the row byte slices, advancing each slice past the
/// bytes consumed
///
/// # Safety
///
/// Each slice in `rows` must be a valid row encoding for `field`'s data type;
/// `validate_utf8` must be true unless the bytes are known valid UTF-8.
unsafe fn decode_column(
    field: &SortField,
    rows: &mut [&[u8]],
    codec: &Codec,
    validate_utf8: bool,
) -> Result<ArrayRef, ArrowError> {
    let options = field.options;

    let array: ArrayRef = match codec {
        Codec::Stateless => {
            let data_type = field.data_type.clone();
            downcast_primitive! {
                data_type => (decode_primitive_helper, rows, data_type, options),
                DataType::Null => Arc::new(NullArray::new(rows.len())),
                DataType::Boolean => Arc::new(decode_bool(rows, options)),
                DataType::Binary => Arc::new(decode_binary::<i32>(rows, options)),
                DataType::LargeBinary => Arc::new(decode_binary::<i64>(rows, options)),
                DataType::BinaryView => Arc::new(decode_binary_view(rows, options)),
                DataType::FixedSizeBinary(size) => Arc::new(decode_fixed_size_binary(rows, size, options)),
                DataType::Utf8 => Arc::new(decode_string::<i32>(rows, options, validate_utf8)),
                DataType::LargeUtf8 => Arc::new(decode_string::<i64>(rows, options, validate_utf8)),
                DataType::Utf8View => Arc::new(decode_string_view(rows, options, validate_utf8)),
                _ => return Err(ArrowError::NotYetImplemented(format!("unsupported data type: {data_type}" )))
            }
        }
        Codec::Dictionary(converter, _) => {
            // Dictionary columns were encoded by value, so they decode to a
            // plain (non-dictionary) array of the value type
            let cols = converter.convert_raw(rows, validate_utf8)?;
            cols.into_iter().next().unwrap()
        }
        Codec::Struct(converter, _) => {
            // First byte of each row is the struct's validity sentinel; strip
            // it before decoding the children
            let (null_count, nulls) = fixed::decode_nulls(rows);
            rows.iter_mut().for_each(|row| *row = &row[1..]);
            let children = converter.convert_raw(rows, validate_utf8)?;

            let child_data: Vec<ArrayData> = children.iter().map(|c| c.to_data()).collect();
            // Decoding may change child types (e.g. dictionary children decode
            // to their value type), so rebuild the struct fields to match the
            // actual decoded child types
            let corrected_fields: Vec<Field> = match &field.data_type {
                DataType::Struct(struct_fields) => struct_fields
                    .iter()
                    .zip(child_data.iter())
                    .map(|(orig_field, child_array)| {
                        orig_field
                            .as_ref()
                            .clone()
                            .with_data_type(child_array.data_type().clone())
                    })
                    .collect(),
                _ => unreachable!("Only Struct types should be corrected here"),
            };
            let corrected_struct_type = DataType::Struct(corrected_fields.into());
            let builder = ArrayDataBuilder::new(corrected_struct_type)
                .len(rows.len())
                .null_count(null_count)
                .null_bit_buffer(Some(nulls))
                .child_data(child_data);

            // SAFETY: caller guarantees rows are valid encodings for `field`
            Arc::new(StructArray::from(builder.build_unchecked()))
        }
        Codec::List(converter) => match &field.data_type {
            DataType::List(_) => {
                Arc::new(list::decode::<i32>(converter, rows, field, validate_utf8)?)
            }
            DataType::LargeList(_) => {
                Arc::new(list::decode::<i64>(converter, rows, field, validate_utf8)?)
            }
            DataType::FixedSizeList(_, value_length) => Arc::new(list::decode_fixed_size_list(
                converter,
                rows,
                field,
                validate_utf8,
                value_length.as_usize(),
            )?),
            _ => unreachable!(),
        },
        Codec::RunEndEncoded(converter) => match &field.data_type {
            DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() {
                DataType::Int16 => Arc::new(run::decode::<Int16Type>(
                    converter,
                    rows,
                    field,
                    validate_utf8,
                )?),
                DataType::Int32 => Arc::new(run::decode::<Int32Type>(
                    converter,
                    rows,
                    field,
                    validate_utf8,
                )?),
                DataType::Int64 => Arc::new(run::decode::<Int64Type>(
                    converter,
                    rows,
                    field,
                    validate_utf8,
                )?),
                _ => unreachable!(),
            },
            _ => unreachable!(),
        },
    };
    Ok(array)
}
1771
1772#[cfg(test)]
1773mod tests {
1774 use rand::distr::uniform::SampleUniform;
1775 use rand::distr::{Distribution, StandardUniform};
1776 use rand::{rng, Rng};
1777
1778 use arrow_array::builder::*;
1779 use arrow_array::types::*;
1780 use arrow_array::*;
1781 use arrow_buffer::{i256, NullBuffer};
1782 use arrow_buffer::{Buffer, OffsetBuffer};
1783 use arrow_cast::display::{ArrayFormatter, FormatOptions};
1784 use arrow_ord::sort::{LexicographicalComparator, SortColumn};
1785
1786 use super::*;
1787
    #[test]
    fn test_fixed_width() {
        // Two fixed-width columns (Int16 + Float32) covering nulls, negatives,
        // and signed zero.
        let cols = [
            Arc::new(Int16Array::from_iter([
                Some(1),
                Some(2),
                None,
                Some(-5),
                Some(2),
                Some(2),
                Some(0),
            ])) as ArrayRef,
            Arc::new(Float32Array::from_iter([
                Some(1.3),
                Some(2.5),
                None,
                Some(4.),
                Some(0.1),
                Some(-4.),
                Some(-0.),
            ])) as ArrayRef,
        ];

        let converter = RowConverter::new(vec![
            SortField::new(DataType::Int16),
            SortField::new(DataType::Float32),
        ])
        .unwrap();
        let rows = converter.convert_columns(&cols).unwrap();

        // Every encoded row occupies exactly 8 bytes (offsets advance by 8).
        assert_eq!(rows.offsets, &[0, 8, 16, 24, 32, 40, 48, 56]);
        // Pin the exact encoded bytes to catch accidental format changes.
        assert_eq!(
            rows.buffer,
            &[
                1, 128, 1, 1, 191, 166, 102, 102, 1, 128, 2, 1, 192, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 127, 251, 1, 192, 128, 0, 0, 1, 128, 2, 1, 189, 204, 204, 205, 1, 128, 2, 1, 63, 127, 255, 255, 1, 128, 0, 1, 127, 255, 255, 255 ]
        );

        // Byte-wise comparison of rows must match the columns' sort order.
        assert!(rows.row(3) < rows.row(6));
        assert!(rows.row(0) < rows.row(1));
        assert!(rows.row(3) < rows.row(0));
        assert!(rows.row(4) < rows.row(1));
        assert!(rows.row(5) < rows.row(4));

        // Round-trip: decoding must reproduce the original columns exactly.
        let back = converter.convert_rows(&rows).unwrap();
        for (expected, actual) in cols.iter().zip(&back) {
            assert_eq!(expected, actual);
        }
    }
1850
1851 #[test]
1852 fn test_decimal32() {
1853 let converter = RowConverter::new(vec![SortField::new(DataType::Decimal32(
1854 DECIMAL32_MAX_PRECISION,
1855 7,
1856 ))])
1857 .unwrap();
1858 let col = Arc::new(
1859 Decimal32Array::from_iter([
1860 None,
1861 Some(i32::MIN),
1862 Some(-13),
1863 Some(46_i32),
1864 Some(5456_i32),
1865 Some(i32::MAX),
1866 ])
1867 .with_precision_and_scale(9, 7)
1868 .unwrap(),
1869 ) as ArrayRef;
1870
1871 let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap();
1872 for i in 0..rows.num_rows() - 1 {
1873 assert!(rows.row(i) < rows.row(i + 1));
1874 }
1875
1876 let back = converter.convert_rows(&rows).unwrap();
1877 assert_eq!(back.len(), 1);
1878 assert_eq!(col.as_ref(), back[0].as_ref())
1879 }
1880
1881 #[test]
1882 fn test_decimal64() {
1883 let converter = RowConverter::new(vec![SortField::new(DataType::Decimal64(
1884 DECIMAL64_MAX_PRECISION,
1885 7,
1886 ))])
1887 .unwrap();
1888 let col = Arc::new(
1889 Decimal64Array::from_iter([
1890 None,
1891 Some(i64::MIN),
1892 Some(-13),
1893 Some(46_i64),
1894 Some(5456_i64),
1895 Some(i64::MAX),
1896 ])
1897 .with_precision_and_scale(18, 7)
1898 .unwrap(),
1899 ) as ArrayRef;
1900
1901 let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap();
1902 for i in 0..rows.num_rows() - 1 {
1903 assert!(rows.row(i) < rows.row(i + 1));
1904 }
1905
1906 let back = converter.convert_rows(&rows).unwrap();
1907 assert_eq!(back.len(), 1);
1908 assert_eq!(col.as_ref(), back[0].as_ref())
1909 }
1910
1911 #[test]
1912 fn test_decimal128() {
1913 let converter = RowConverter::new(vec![SortField::new(DataType::Decimal128(
1914 DECIMAL128_MAX_PRECISION,
1915 7,
1916 ))])
1917 .unwrap();
1918 let col = Arc::new(
1919 Decimal128Array::from_iter([
1920 None,
1921 Some(i128::MIN),
1922 Some(-13),
1923 Some(46_i128),
1924 Some(5456_i128),
1925 Some(i128::MAX),
1926 ])
1927 .with_precision_and_scale(38, 7)
1928 .unwrap(),
1929 ) as ArrayRef;
1930
1931 let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap();
1932 for i in 0..rows.num_rows() - 1 {
1933 assert!(rows.row(i) < rows.row(i + 1));
1934 }
1935
1936 let back = converter.convert_rows(&rows).unwrap();
1937 assert_eq!(back.len(), 1);
1938 assert_eq!(col.as_ref(), back[0].as_ref())
1939 }
1940
1941 #[test]
1942 fn test_decimal256() {
1943 let converter = RowConverter::new(vec![SortField::new(DataType::Decimal256(
1944 DECIMAL256_MAX_PRECISION,
1945 7,
1946 ))])
1947 .unwrap();
1948 let col = Arc::new(
1949 Decimal256Array::from_iter([
1950 None,
1951 Some(i256::MIN),
1952 Some(i256::from_parts(0, -1)),
1953 Some(i256::from_parts(u128::MAX, -1)),
1954 Some(i256::from_parts(u128::MAX, 0)),
1955 Some(i256::from_parts(0, 46_i128)),
1956 Some(i256::from_parts(5, 46_i128)),
1957 Some(i256::MAX),
1958 ])
1959 .with_precision_and_scale(DECIMAL256_MAX_PRECISION, 7)
1960 .unwrap(),
1961 ) as ArrayRef;
1962
1963 let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap();
1964 for i in 0..rows.num_rows() - 1 {
1965 assert!(rows.row(i) < rows.row(i + 1));
1966 }
1967
1968 let back = converter.convert_rows(&rows).unwrap();
1969 assert_eq!(back.len(), 1);
1970 assert_eq!(col.as_ref(), back[0].as_ref())
1971 }
1972
1973 #[test]
1974 fn test_bool() {
1975 let converter = RowConverter::new(vec![SortField::new(DataType::Boolean)]).unwrap();
1976
1977 let col = Arc::new(BooleanArray::from_iter([None, Some(false), Some(true)])) as ArrayRef;
1978
1979 let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap();
1980 assert!(rows.row(2) > rows.row(1));
1981 assert!(rows.row(2) > rows.row(0));
1982 assert!(rows.row(1) > rows.row(0));
1983
1984 let cols = converter.convert_rows(&rows).unwrap();
1985 assert_eq!(&cols[0], &col);
1986
1987 let converter = RowConverter::new(vec![SortField::new_with_options(
1988 DataType::Boolean,
1989 SortOptions::default().desc().with_nulls_first(false),
1990 )])
1991 .unwrap();
1992
1993 let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap();
1994 assert!(rows.row(2) < rows.row(1));
1995 assert!(rows.row(2) < rows.row(0));
1996 assert!(rows.row(1) < rows.row(0));
1997 let cols = converter.convert_rows(&rows).unwrap();
1998 assert_eq!(&cols[0], &col);
1999 }
2000
    #[test]
    fn test_timezone() {
        // Timestamps with a timezone must round-trip with the timezone intact.
        let a =
            TimestampNanosecondArray::from(vec![1, 2, 3, 4, 5]).with_timezone("+01:00".to_string());
        let d = a.data_type().clone();

        let converter = RowConverter::new(vec![SortField::new(a.data_type().clone())]).unwrap();
        let rows = converter.convert_columns(&[Arc::new(a) as _]).unwrap();
        let back = converter.convert_rows(&rows).unwrap();
        assert_eq!(back.len(), 1);
        // Decoded type must still carry the "+01:00" timezone.
        assert_eq!(back[0].data_type(), &d);

        // Same check for a dictionary-encoded timestamp column.
        let mut a = PrimitiveDictionaryBuilder::<Int32Type, TimestampNanosecondType>::new();
        a.append(34).unwrap();
        a.append_null();
        a.append(345).unwrap();

        // Rebuild the dictionary with "+02:00" attached to its values array.
        let dict = a.finish();
        let values = TimestampNanosecondArray::from(dict.values().to_data());
        let dict_with_tz = dict.with_values(Arc::new(values.with_timezone("+02:00")));
        let v = DataType::Timestamp(TimeUnit::Nanosecond, Some("+02:00".into()));
        let d = DataType::Dictionary(Box::new(DataType::Int32), Box::new(v.clone()));

        assert_eq!(dict_with_tz.data_type(), &d);
        let converter = RowConverter::new(vec![SortField::new(d.clone())]).unwrap();
        let rows = converter
            .convert_columns(&[Arc::new(dict_with_tz) as _])
            .unwrap();
        let back = converter.convert_rows(&rows).unwrap();
        assert_eq!(back.len(), 1);
        // Dictionary encoding is dropped on decode, but the timezone survives.
        assert_eq!(back[0].data_type(), &v);
    }
2035
2036 #[test]
2037 fn test_null_encoding() {
2038 let col = Arc::new(NullArray::new(10));
2039 let converter = RowConverter::new(vec![SortField::new(DataType::Null)]).unwrap();
2040 let rows = converter.convert_columns(&[col]).unwrap();
2041 assert_eq!(rows.num_rows(), 10);
2042 assert_eq!(rows.row(1).data.len(), 0);
2043 }
2044
    #[test]
    fn test_variable_width() {
        // Utf8: rows must order lexicographically by value, nulls first.
        let col = Arc::new(StringArray::from_iter([
            Some("hello"),
            Some("he"),
            None,
            Some("foo"),
            Some(""),
        ])) as ArrayRef;

        let converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]).unwrap();
        let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap();

        // "he" < "hello" (prefix sorts first); null < ""; "foo" < "he…".
        assert!(rows.row(1) < rows.row(0));
        assert!(rows.row(2) < rows.row(4));
        assert!(rows.row(3) < rows.row(0));
        assert!(rows.row(3) < rows.row(1));

        let cols = converter.convert_rows(&rows).unwrap();
        assert_eq!(&cols[0], &col);

        // Binary: lengths picked around MINI_BLOCK_SIZE / BLOCK_SIZE to hit
        // every branch of the block-based variable-length encoding.
        let col = Arc::new(BinaryArray::from_iter([
            None,
            Some(vec![0_u8; 0]),
            Some(vec![0_u8; 6]),
            Some(vec![0_u8; variable::MINI_BLOCK_SIZE]),
            Some(vec![0_u8; variable::MINI_BLOCK_SIZE + 1]),
            Some(vec![0_u8; variable::BLOCK_SIZE]),
            Some(vec![0_u8; variable::BLOCK_SIZE + 1]),
            Some(vec![1_u8; 6]),
            Some(vec![1_u8; variable::MINI_BLOCK_SIZE]),
            Some(vec![1_u8; variable::MINI_BLOCK_SIZE + 1]),
            Some(vec![1_u8; variable::BLOCK_SIZE]),
            Some(vec![1_u8; variable::BLOCK_SIZE + 1]),
            Some(vec![0xFF_u8; 6]),
            Some(vec![0xFF_u8; variable::MINI_BLOCK_SIZE]),
            Some(vec![0xFF_u8; variable::MINI_BLOCK_SIZE + 1]),
            Some(vec![0xFF_u8; variable::BLOCK_SIZE]),
            Some(vec![0xFF_u8; variable::BLOCK_SIZE + 1]),
        ])) as ArrayRef;

        let converter = RowConverter::new(vec![SortField::new(DataType::Binary)]).unwrap();
        let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap();

        // The input above is strictly ascending, so every earlier row must
        // compare less than every later row.
        for i in 0..rows.num_rows() {
            for j in i + 1..rows.num_rows() {
                assert!(
                    rows.row(i) < rows.row(j),
                    "{} < {} - {:?} < {:?}",
                    i,
                    j,
                    rows.row(i),
                    rows.row(j)
                );
            }
        }

        let cols = converter.convert_rows(&rows).unwrap();
        assert_eq!(&cols[0], &col);

        // Descending with nulls last: the pairwise ordering fully reverses.
        let converter = RowConverter::new(vec![SortField::new_with_options(
            DataType::Binary,
            SortOptions::default().desc().with_nulls_first(false),
        )])
        .unwrap();
        let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap();

        for i in 0..rows.num_rows() {
            for j in i + 1..rows.num_rows() {
                assert!(
                    rows.row(i) > rows.row(j),
                    "{} > {} - {:?} > {:?}",
                    i,
                    j,
                    rows.row(i),
                    rows.row(j)
                );
            }
        }

        let cols = converter.convert_rows(&rows).unwrap();
        assert_eq!(&cols[0], &col);
    }
2128
2129 fn dictionary_eq(a: &dyn Array, b: &dyn Array) {
2131 match b.data_type() {
2132 DataType::Dictionary(_, v) => {
2133 assert_eq!(a.data_type(), v.as_ref());
2134 let b = arrow_cast::cast(b, v).unwrap();
2135 assert_eq!(a, b.as_ref())
2136 }
2137 _ => assert_eq!(a, b),
2138 }
2139 }
2140
    #[test]
    fn test_string_dictionary() {
        // Dictionary strings with duplicates, an empty string, and a null.
        let a = Arc::new(DictionaryArray::<Int32Type>::from_iter([
            Some("foo"),
            Some("hello"),
            Some("he"),
            None,
            Some("hello"),
            Some(""),
            Some("hello"),
            Some("hello"),
        ])) as ArrayRef;

        let field = SortField::new(a.data_type().clone());
        let converter = RowConverter::new(vec![field]).unwrap();
        let rows_a = converter.convert_columns(&[Arc::clone(&a)]).unwrap();

        // Rows compare by logical value: null < "", prefixes sort first.
        assert!(rows_a.row(3) < rows_a.row(5));
        assert!(rows_a.row(2) < rows_a.row(1));
        assert!(rows_a.row(0) < rows_a.row(1));
        assert!(rows_a.row(3) < rows_a.row(0));

        // Equal logical values must encode identically regardless of key.
        assert_eq!(rows_a.row(1), rows_a.row(4));
        assert_eq!(rows_a.row(1), rows_a.row(6));
        assert_eq!(rows_a.row(1), rows_a.row(7));

        let cols = converter.convert_rows(&rows_a).unwrap();
        dictionary_eq(&cols[0], &a);

        // A second array with a different dictionary must still compare
        // consistently against rows produced from the first.
        let b = Arc::new(DictionaryArray::<Int32Type>::from_iter([
            Some("hello"),
            None,
            Some("cupcakes"),
        ])) as ArrayRef;

        let rows_b = converter.convert_columns(&[Arc::clone(&b)]).unwrap();
        assert_eq!(rows_a.row(1), rows_b.row(0));
        assert_eq!(rows_a.row(3), rows_b.row(1));
        assert!(rows_b.row(2) < rows_a.row(0));

        let cols = converter.convert_rows(&rows_b).unwrap();
        dictionary_eq(&cols[0], &b);

        // Descending with nulls last: value comparisons invert.
        let converter = RowConverter::new(vec![SortField::new_with_options(
            a.data_type().clone(),
            SortOptions::default().desc().with_nulls_first(false),
        )])
        .unwrap();

        let rows_c = converter.convert_columns(&[Arc::clone(&a)]).unwrap();
        assert!(rows_c.row(3) > rows_c.row(5));
        assert!(rows_c.row(2) > rows_c.row(1));
        assert!(rows_c.row(0) > rows_c.row(1));
        assert!(rows_c.row(3) > rows_c.row(0));

        let cols = converter.convert_rows(&rows_c).unwrap();
        dictionary_eq(&cols[0], &a);

        // Descending with nulls first: values invert but nulls stay smallest.
        let converter = RowConverter::new(vec![SortField::new_with_options(
            a.data_type().clone(),
            SortOptions::default().desc().with_nulls_first(true),
        )])
        .unwrap();

        let rows_c = converter.convert_columns(&[Arc::clone(&a)]).unwrap();
        assert!(rows_c.row(3) < rows_c.row(5));
        assert!(rows_c.row(2) > rows_c.row(1));
        assert!(rows_c.row(0) > rows_c.row(1));
        assert!(rows_c.row(3) < rows_c.row(0));

        let cols = converter.convert_rows(&rows_c).unwrap();
        dictionary_eq(&cols[0], &a);
    }
2214
2215 #[test]
2216 fn test_struct() {
2217 let a = Arc::new(Int32Array::from(vec![1, 1, 2, 2])) as ArrayRef;
2219 let a_f = Arc::new(Field::new("int", DataType::Int32, false));
2220 let u = Arc::new(StringArray::from(vec!["a", "b", "c", "d"])) as ArrayRef;
2221 let u_f = Arc::new(Field::new("s", DataType::Utf8, false));
2222 let s1 = Arc::new(StructArray::from(vec![(a_f, a), (u_f, u)])) as ArrayRef;
2223
2224 let sort_fields = vec![SortField::new(s1.data_type().clone())];
2225 let converter = RowConverter::new(sort_fields).unwrap();
2226 let r1 = converter.convert_columns(&[Arc::clone(&s1)]).unwrap();
2227
2228 for (a, b) in r1.iter().zip(r1.iter().skip(1)) {
2229 assert!(a < b);
2230 }
2231
2232 let back = converter.convert_rows(&r1).unwrap();
2233 assert_eq!(back.len(), 1);
2234 assert_eq!(&back[0], &s1);
2235
2236 let data = s1
2238 .to_data()
2239 .into_builder()
2240 .null_bit_buffer(Some(Buffer::from_slice_ref([0b00001010])))
2241 .null_count(2)
2242 .build()
2243 .unwrap();
2244
2245 let s2 = Arc::new(StructArray::from(data)) as ArrayRef;
2246 let r2 = converter.convert_columns(&[Arc::clone(&s2)]).unwrap();
2247 assert_eq!(r2.row(0), r2.row(2)); assert!(r2.row(0) < r2.row(1)); assert_ne!(r1.row(0), r2.row(0)); assert_eq!(r1.row(1), r2.row(1)); let back = converter.convert_rows(&r2).unwrap();
2253 assert_eq!(back.len(), 1);
2254 assert_eq!(&back[0], &s2);
2255
2256 back[0].to_data().validate_full().unwrap();
2257 }
2258
    #[test]
    fn test_dictionary_in_struct() {
        // A struct with a single dictionary-encoded Utf8 field.
        let builder = StringDictionaryBuilder::<Int32Type>::new();
        let mut struct_builder = StructBuilder::new(
            vec![Field::new_dictionary(
                "foo",
                DataType::Int32,
                DataType::Utf8,
                true,
            )],
            vec![Box::new(builder)],
        );

        let dict_builder = struct_builder
            .field_builder::<StringDictionaryBuilder<Int32Type>>(0)
            .unwrap();

        // Field values: ["a", null, "a", "b"]; all struct slots are valid.
        dict_builder.append_value("a");
        dict_builder.append_null();
        dict_builder.append_value("a");
        dict_builder.append_value("b");

        for _ in 0..4 {
            struct_builder.append(true);
        }

        let s = Arc::new(struct_builder.finish()) as ArrayRef;
        let sort_fields = vec![SortField::new(s.data_type().clone())];
        let converter = RowConverter::new(sort_fields).unwrap();
        let r = converter.convert_columns(&[Arc::clone(&s)]).unwrap();

        let back = converter.convert_rows(&r).unwrap();
        let [s2] = back.try_into().unwrap();

        // The round-trip drops dictionary encoding, so the types differ...
        assert_ne!(&s.data_type(), &s2.data_type());
        s2.to_data().validate_full().unwrap();

        // ...but the logical values must match: compare each dictionary entry
        // of the input against the decoded plain string column.
        let s1_struct = s.as_struct();
        let s1_0 = s1_struct.column(0);
        let s1_idx_0 = s1_0.as_dictionary::<Int32Type>();
        let keys = s1_idx_0.keys();
        let values = s1_idx_0.values().as_string::<i32>();
        let s2_struct = s2.as_struct();
        let s2_0 = s2_struct.column(0);
        let s2_idx_0 = s2_0.as_string::<i32>();

        for i in 0..keys.len() {
            if keys.is_null(i) {
                assert!(s2_idx_0.is_null(i));
            } else {
                let dict_index = keys.value(i) as usize;
                assert_eq!(values.value(dict_index), s2_idx_0.value(i));
            }
        }
    }
2321
2322 #[test]
2323 fn test_dictionary_in_struct_empty() {
2324 let ty = DataType::Struct(
2325 vec![Field::new_dictionary(
2326 "foo",
2327 DataType::Int32,
2328 DataType::Int32,
2329 false,
2330 )]
2331 .into(),
2332 );
2333 let s = arrow_array::new_empty_array(&ty);
2334
2335 let sort_fields = vec![SortField::new(s.data_type().clone())];
2336 let converter = RowConverter::new(sort_fields).unwrap();
2337 let r = converter.convert_columns(&[Arc::clone(&s)]).unwrap();
2338
2339 let back = converter.convert_rows(&r).unwrap();
2340 let [s2] = back.try_into().unwrap();
2341
2342 assert_ne!(&s.data_type(), &s2.data_type());
2345 s2.to_data().validate_full().unwrap();
2346 assert_eq!(s.len(), 0);
2347 assert_eq!(s2.len(), 0);
2348 }
2349
    #[test]
    fn test_list_of_string_dictionary() {
        // List<Dictionary<Int32, Utf8>> with three rows:
        //   0: ["a", "b", "zero", null, "c", "b", "d"]
        //   1: null
        //   2: ["e", "zero", "a"]
        let mut builder = ListBuilder::<StringDictionaryBuilder<Int32Type>>::default();
        builder.values().append("a").unwrap();
        builder.values().append("b").unwrap();
        builder.values().append("zero").unwrap();
        builder.values().append_null();
        builder.values().append("c").unwrap();
        builder.values().append("b").unwrap();
        builder.values().append("d").unwrap();
        builder.append(true);
        builder.append(false);
        builder.values().append("e").unwrap();
        builder.values().append("zero").unwrap();
        builder.values().append("a").unwrap();
        builder.append(true);

        let a = Arc::new(builder.finish()) as ArrayRef;
        let data_type = a.data_type().clone();

        let field = SortField::new(data_type.clone());
        let converter = RowConverter::new(vec![field]).unwrap();
        let rows = converter.convert_columns(&[Arc::clone(&a)]).unwrap();

        let back = converter.convert_rows(&rows).unwrap();
        assert_eq!(back.len(), 1);
        let [a2] = back.try_into().unwrap();

        // The round-trip drops dictionary encoding, so the types differ...
        assert_ne!(&a.data_type(), &a2.data_type());

        a2.to_data().validate_full().unwrap();

        // ...so compare logical values element by element instead.
        let a2_list = a2.as_list::<i32>();
        let a1_list = a.as_list::<i32>();

        // Row 0: each dictionary entry must equal the decoded plain string.
        let a1_0 = a1_list.value(0);
        let a1_idx_0 = a1_0.as_dictionary::<Int32Type>();
        let keys = a1_idx_0.keys();
        let values = a1_idx_0.values().as_string::<i32>();
        let a2_0 = a2_list.value(0);
        let a2_idx_0 = a2_0.as_string::<i32>();

        for i in 0..keys.len() {
            if keys.is_null(i) {
                assert!(a2_idx_0.is_null(i));
            } else {
                let dict_index = keys.value(i) as usize;
                assert_eq!(values.value(dict_index), a2_idx_0.value(i));
            }
        }

        // Row 1: the null list must remain null after decoding.
        assert!(a1_list.is_null(1));
        assert!(a2_list.is_null(1));

        // Row 2: same element-wise comparison as row 0.
        let a1_2 = a1_list.value(2);
        let a1_idx_2 = a1_2.as_dictionary::<Int32Type>();
        let keys = a1_idx_2.keys();
        let values = a1_idx_2.values().as_string::<i32>();
        let a2_2 = a2_list.value(2);
        let a2_idx_2 = a2_2.as_string::<i32>();

        for i in 0..keys.len() {
            if keys.is_null(i) {
                assert!(a2_idx_2.is_null(i));
            } else {
                let dict_index = keys.value(i) as usize;
                assert_eq!(values.value(dict_index), a2_idx_2.value(i));
            }
        }
    }
2429
2430 #[test]
2431 fn test_primitive_dictionary() {
2432 let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int32Type>::new();
2433 builder.append(2).unwrap();
2434 builder.append(3).unwrap();
2435 builder.append(0).unwrap();
2436 builder.append_null();
2437 builder.append(5).unwrap();
2438 builder.append(3).unwrap();
2439 builder.append(-1).unwrap();
2440
2441 let a = builder.finish();
2442 let data_type = a.data_type().clone();
2443 let columns = [Arc::new(a) as ArrayRef];
2444
2445 let field = SortField::new(data_type.clone());
2446 let converter = RowConverter::new(vec![field]).unwrap();
2447 let rows = converter.convert_columns(&columns).unwrap();
2448 assert!(rows.row(0) < rows.row(1));
2449 assert!(rows.row(2) < rows.row(0));
2450 assert!(rows.row(3) < rows.row(2));
2451 assert!(rows.row(6) < rows.row(2));
2452 assert!(rows.row(3) < rows.row(6));
2453
2454 let back = converter.convert_rows(&rows).unwrap();
2455 assert_eq!(back.len(), 1);
2456 back[0].to_data().validate_full().unwrap();
2457 }
2458
2459 #[test]
2460 fn test_dictionary_nulls() {
2461 let values = Int32Array::from_iter([Some(1), Some(-1), None, Some(4), None]).into_data();
2462 let keys =
2463 Int32Array::from_iter([Some(0), Some(0), Some(1), Some(2), Some(4), None]).into_data();
2464
2465 let data_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Int32));
2466 let data = keys
2467 .into_builder()
2468 .data_type(data_type.clone())
2469 .child_data(vec![values])
2470 .build()
2471 .unwrap();
2472
2473 let columns = [Arc::new(DictionaryArray::<Int32Type>::from(data)) as ArrayRef];
2474 let field = SortField::new(data_type.clone());
2475 let converter = RowConverter::new(vec![field]).unwrap();
2476 let rows = converter.convert_columns(&columns).unwrap();
2477
2478 assert_eq!(rows.row(0), rows.row(1));
2479 assert_eq!(rows.row(3), rows.row(4));
2480 assert_eq!(rows.row(4), rows.row(5));
2481 assert!(rows.row(3) < rows.row(0));
2482 }
2483
2484 #[test]
2485 #[should_panic(expected = "Encountered non UTF-8 data")]
2486 fn test_invalid_utf8() {
2487 let converter = RowConverter::new(vec![SortField::new(DataType::Binary)]).unwrap();
2488 let array = Arc::new(BinaryArray::from_iter_values([&[0xFF]])) as _;
2489 let rows = converter.convert_columns(&[array]).unwrap();
2490 let binary_row = rows.row(0);
2491
2492 let converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]).unwrap();
2493 let parser = converter.parser();
2494 let utf8_row = parser.parse(binary_row.as_ref());
2495
2496 converter.convert_rows(std::iter::once(utf8_row)).unwrap();
2497 }
2498
2499 #[test]
2500 #[should_panic(expected = "Encountered non UTF-8 data")]
2501 fn test_invalid_utf8_array() {
2502 let converter = RowConverter::new(vec![SortField::new(DataType::Binary)]).unwrap();
2503 let array = Arc::new(BinaryArray::from_iter_values([&[0xFF]])) as _;
2504 let rows = converter.convert_columns(&[array]).unwrap();
2505 let binary_rows = rows.try_into_binary().expect("known-small rows");
2506
2507 let converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]).unwrap();
2508 let parsed = converter.from_binary(binary_rows);
2509
2510 converter.convert_rows(parsed.iter()).unwrap();
2511 }
2512
2513 #[test]
2514 #[should_panic(expected = "index out of bounds")]
2515 fn test_invalid_empty() {
2516 let binary_row: &[u8] = &[];
2517
2518 let converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]).unwrap();
2519 let parser = converter.parser();
2520 let utf8_row = parser.parse(binary_row.as_ref());
2521
2522 converter.convert_rows(std::iter::once(utf8_row)).unwrap();
2523 }
2524
2525 #[test]
2526 #[should_panic(expected = "index out of bounds")]
2527 fn test_invalid_empty_array() {
2528 let row: &[u8] = &[];
2529 let binary_rows = BinaryArray::from(vec![row]);
2530
2531 let converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]).unwrap();
2532 let parsed = converter.from_binary(binary_rows);
2533
2534 converter.convert_rows(parsed.iter()).unwrap();
2535 }
2536
2537 #[test]
2538 #[should_panic(expected = "index out of bounds")]
2539 fn test_invalid_truncated() {
2540 let binary_row: &[u8] = &[0x02];
2541
2542 let converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]).unwrap();
2543 let parser = converter.parser();
2544 let utf8_row = parser.parse(binary_row.as_ref());
2545
2546 converter.convert_rows(std::iter::once(utf8_row)).unwrap();
2547 }
2548
2549 #[test]
2550 #[should_panic(expected = "index out of bounds")]
2551 fn test_invalid_truncated_array() {
2552 let row: &[u8] = &[0x02];
2553 let binary_rows = BinaryArray::from(vec![row]);
2554
2555 let converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]).unwrap();
2556 let parsed = converter.from_binary(binary_rows);
2557
2558 converter.convert_rows(parsed.iter()).unwrap();
2559 }
2560
2561 #[test]
2562 #[should_panic(expected = "rows were not produced by this RowConverter")]
2563 fn test_different_converter() {
2564 let values = Arc::new(Int32Array::from_iter([Some(1), Some(-1)]));
2565 let converter = RowConverter::new(vec![SortField::new(DataType::Int32)]).unwrap();
2566 let rows = converter.convert_columns(&[values]).unwrap();
2567
2568 let converter = RowConverter::new(vec![SortField::new(DataType::Int32)]).unwrap();
2569 let _ = converter.convert_rows(&rows);
2570 }
2571
    /// Round-trips a flat list column under all four SortOptions combinations.
    ///
    /// Rows built below: 0=[32,52,32] 1=[32,52,12] 2=[32,52] 3=null
    /// 4=[32,null] 5=[] (empty) 6=null
    fn test_single_list<O: OffsetSizeTrait>() {
        let mut builder = GenericListBuilder::<O, _>::new(Int32Builder::new());
        builder.values().append_value(32);
        builder.values().append_value(52);
        builder.values().append_value(32);
        builder.append(true);
        builder.values().append_value(32);
        builder.values().append_value(52);
        builder.values().append_value(12);
        builder.append(true);
        builder.values().append_value(32);
        builder.values().append_value(52);
        builder.append(true);
        // Row 3: child values appended but the list slot is marked null.
        builder.values().append_value(32); builder.values().append_value(52); builder.append(false);
        builder.values().append_value(32);
        builder.values().append_null();
        builder.append(true);
        // Row 5: an empty but valid list.
        builder.append(true);
        // Row 6: another null slot with ignored child values.
        builder.values().append_value(17); builder.values().append_null(); builder.append(false);

        let list = Arc::new(builder.finish()) as ArrayRef;
        let d = list.data_type().clone();

        // Default options: ascending, nulls first.
        let converter = RowConverter::new(vec![SortField::new(d.clone())]).unwrap();

        let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();
        // Expected order: null < [] < [32,null] < [32,52] < [32,52,12] < [32,52,32];
        // the two null rows (3 and 6) encode identically.
        assert!(rows.row(0) > rows.row(1)); assert!(rows.row(2) < rows.row(1)); assert!(rows.row(3) < rows.row(2)); assert!(rows.row(4) < rows.row(2)); assert!(rows.row(5) < rows.row(2)); assert!(rows.row(3) < rows.row(5)); assert_eq!(rows.row(3), rows.row(6)); let back = converter.convert_rows(&rows).unwrap();
        assert_eq!(back.len(), 1);
        back[0].to_data().validate_full().unwrap();
        assert_eq!(&back[0], &list);

        // Ascending, nulls last: null rows/elements move above the values.
        let options = SortOptions::default().asc().with_nulls_first(false);
        let field = SortField::new_with_options(d.clone(), options);
        let converter = RowConverter::new(vec![field]).unwrap();
        let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();

        assert!(rows.row(0) > rows.row(1)); assert!(rows.row(2) < rows.row(1)); assert!(rows.row(3) > rows.row(2)); assert!(rows.row(4) > rows.row(2)); assert!(rows.row(5) < rows.row(2)); assert!(rows.row(3) > rows.row(5)); assert_eq!(rows.row(3), rows.row(6)); let back = converter.convert_rows(&rows).unwrap();
        assert_eq!(back.len(), 1);
        back[0].to_data().validate_full().unwrap();
        assert_eq!(&back[0], &list);

        // Descending, nulls last.
        let options = SortOptions::default().desc().with_nulls_first(false);
        let field = SortField::new_with_options(d.clone(), options);
        let converter = RowConverter::new(vec![field]).unwrap();
        let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();

        assert!(rows.row(0) < rows.row(1)); assert!(rows.row(2) > rows.row(1)); assert!(rows.row(3) > rows.row(2)); assert!(rows.row(4) > rows.row(2)); assert!(rows.row(5) > rows.row(2)); assert!(rows.row(3) > rows.row(5)); assert_eq!(rows.row(3), rows.row(6)); let back = converter.convert_rows(&rows).unwrap();
        assert_eq!(back.len(), 1);
        back[0].to_data().validate_full().unwrap();
        assert_eq!(&back[0], &list);

        // Descending, nulls first.
        let options = SortOptions::default().desc().with_nulls_first(true);
        let field = SortField::new_with_options(d, options);
        let converter = RowConverter::new(vec![field]).unwrap();
        let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();

        assert!(rows.row(0) < rows.row(1)); assert!(rows.row(2) > rows.row(1)); assert!(rows.row(3) < rows.row(2)); assert!(rows.row(4) < rows.row(2)); assert!(rows.row(5) > rows.row(2)); assert!(rows.row(3) < rows.row(5)); assert_eq!(rows.row(3), rows.row(6)); let back = converter.convert_rows(&rows).unwrap();
        assert_eq!(back.len(), 1);
        back[0].to_data().validate_full().unwrap();
        assert_eq!(&back[0], &list);

        // Slices must encode by logical position, independent of the offset
        // base (still using the last converter: descending, nulls first).
        let sliced_list = list.slice(1, 5);
        let rows_on_sliced_list = converter
            .convert_columns(&[Arc::clone(&sliced_list)])
            .unwrap();

        assert!(rows_on_sliced_list.row(1) > rows_on_sliced_list.row(0)); assert!(rows_on_sliced_list.row(2) < rows_on_sliced_list.row(1)); assert!(rows_on_sliced_list.row(3) < rows_on_sliced_list.row(1)); assert!(rows_on_sliced_list.row(4) > rows_on_sliced_list.row(1)); assert!(rows_on_sliced_list.row(2) < rows_on_sliced_list.row(4)); let back = converter.convert_rows(&rows_on_sliced_list).unwrap();
        assert_eq!(back.len(), 1);
        back[0].to_data().validate_full().unwrap();
        assert_eq!(&back[0], &sliced_list);
    }
2685
    /// Round-trips a nested List<List<Int32>> column under several
    /// SortOptions combinations.
    ///
    /// Rows built below:
    ///   0: [[1, 2], [1, null]]
    ///   1: [[1, null], [1, null]]
    ///   2: [[1, null], null]
    ///   3: null
    ///   4: [[1, 2]]
    fn test_nested_list<O: OffsetSizeTrait>() {
        let mut builder =
            GenericListBuilder::<O, _>::new(GenericListBuilder::<O, _>::new(Int32Builder::new()));

        builder.values().values().append_value(1);
        builder.values().values().append_value(2);
        builder.values().append(true);
        builder.values().values().append_value(1);
        builder.values().values().append_null();
        builder.values().append(true);
        builder.append(true);

        builder.values().values().append_value(1);
        builder.values().values().append_null();
        builder.values().append(true);
        builder.values().values().append_value(1);
        builder.values().values().append_null();
        builder.values().append(true);
        builder.append(true);

        builder.values().values().append_value(1);
        builder.values().values().append_null();
        builder.values().append(true);
        // Inner null list for row 2, then the null outer list for row 3.
        builder.values().append(false);
        builder.append(true);
        builder.append(false);

        builder.values().values().append_value(1);
        builder.values().values().append_value(2);
        builder.values().append(true);
        builder.append(true);

        let list = Arc::new(builder.finish()) as ArrayRef;
        let d = list.data_type().clone();

        // Ascending, nulls first.
        let options = SortOptions::default().asc().with_nulls_first(true);
        let field = SortField::new_with_options(d.clone(), options);
        let converter = RowConverter::new(vec![field]).unwrap();
        let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();

        assert!(rows.row(0) > rows.row(1));
        assert!(rows.row(1) > rows.row(2));
        assert!(rows.row(2) > rows.row(3));
        assert!(rows.row(4) < rows.row(0));
        assert!(rows.row(4) > rows.row(1));

        let back = converter.convert_rows(&rows).unwrap();
        assert_eq!(back.len(), 1);
        back[0].to_data().validate_full().unwrap();
        assert_eq!(&back[0], &list);

        // Descending, nulls first.
        let options = SortOptions::default().desc().with_nulls_first(true);
        let field = SortField::new_with_options(d.clone(), options);
        let converter = RowConverter::new(vec![field]).unwrap();
        let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();

        assert!(rows.row(0) > rows.row(1));
        assert!(rows.row(1) > rows.row(2));
        assert!(rows.row(2) > rows.row(3));
        assert!(rows.row(4) > rows.row(0));
        assert!(rows.row(4) > rows.row(1));

        let back = converter.convert_rows(&rows).unwrap();
        assert_eq!(back.len(), 1);
        back[0].to_data().validate_full().unwrap();
        assert_eq!(&back[0], &list);

        // Descending, nulls last.
        let options = SortOptions::default().desc().with_nulls_first(false);
        let field = SortField::new_with_options(d, options);
        let converter = RowConverter::new(vec![field]).unwrap();
        let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();

        assert!(rows.row(0) < rows.row(1));
        assert!(rows.row(1) < rows.row(2));
        assert!(rows.row(2) < rows.row(3));
        assert!(rows.row(4) > rows.row(0));
        assert!(rows.row(4) < rows.row(1));

        let back = converter.convert_rows(&rows).unwrap();
        assert_eq!(back.len(), 1);
        back[0].to_data().validate_full().unwrap();
        assert_eq!(&back[0], &list);

        // Slices must encode by logical position, independent of offset base
        // (still using the last converter: descending, nulls last).
        let sliced_list = list.slice(1, 3);
        let rows = converter
            .convert_columns(&[Arc::clone(&sliced_list)])
            .unwrap();

        assert!(rows.row(0) < rows.row(1));
        assert!(rows.row(1) < rows.row(2));

        let back = converter.convert_rows(&rows).unwrap();
        assert_eq!(back.len(), 1);
        back[0].to_data().validate_full().unwrap();
        assert_eq!(&back[0], &sliced_list);
    }
2789
    #[test]
    fn test_list() {
        // Exercise flat and nested `List` round-trips with i32 offsets.
        test_single_list::<i32>();
        test_nested_list::<i32>();
    }
2795
    #[test]
    fn test_large_list() {
        // Exercise flat and nested `LargeList` round-trips with i64 offsets.
        test_single_list::<i64>();
        test_nested_list::<i64>();
    }
2801
2802 #[test]
2803 fn test_fixed_size_list() {
2804 let mut builder = FixedSizeListBuilder::new(Int32Builder::new(), 3);
2805 builder.values().append_value(32);
2806 builder.values().append_value(52);
2807 builder.values().append_value(32);
2808 builder.append(true);
2809 builder.values().append_value(32);
2810 builder.values().append_value(52);
2811 builder.values().append_value(12);
2812 builder.append(true);
2813 builder.values().append_value(32);
2814 builder.values().append_value(52);
2815 builder.values().append_null();
2816 builder.append(true);
2817 builder.values().append_value(32); builder.values().append_value(52); builder.values().append_value(13); builder.append(false);
2821 builder.values().append_value(32);
2822 builder.values().append_null();
2823 builder.values().append_null();
2824 builder.append(true);
2825 builder.values().append_null();
2826 builder.values().append_null();
2827 builder.values().append_null();
2828 builder.append(true);
2829 builder.values().append_value(17); builder.values().append_null(); builder.values().append_value(77); builder.append(false);
2833
2834 let list = Arc::new(builder.finish()) as ArrayRef;
2835 let d = list.data_type().clone();
2836
2837 let converter = RowConverter::new(vec![SortField::new(d.clone())]).unwrap();
2839
2840 let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();
2841 assert!(rows.row(0) > rows.row(1)); assert!(rows.row(2) < rows.row(1)); assert!(rows.row(3) < rows.row(2)); assert!(rows.row(4) < rows.row(2)); assert!(rows.row(5) < rows.row(2)); assert!(rows.row(3) < rows.row(5)); assert_eq!(rows.row(3), rows.row(6)); let back = converter.convert_rows(&rows).unwrap();
2850 assert_eq!(back.len(), 1);
2851 back[0].to_data().validate_full().unwrap();
2852 assert_eq!(&back[0], &list);
2853
2854 let options = SortOptions::default().asc().with_nulls_first(false);
2856 let field = SortField::new_with_options(d.clone(), options);
2857 let converter = RowConverter::new(vec![field]).unwrap();
2858 let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();
2859 assert!(rows.row(0) > rows.row(1)); assert!(rows.row(2) > rows.row(1)); assert!(rows.row(3) > rows.row(2)); assert!(rows.row(4) > rows.row(2)); assert!(rows.row(5) > rows.row(2)); assert!(rows.row(3) > rows.row(5)); assert_eq!(rows.row(3), rows.row(6)); let back = converter.convert_rows(&rows).unwrap();
2868 assert_eq!(back.len(), 1);
2869 back[0].to_data().validate_full().unwrap();
2870 assert_eq!(&back[0], &list);
2871
2872 let options = SortOptions::default().desc().with_nulls_first(false);
2874 let field = SortField::new_with_options(d.clone(), options);
2875 let converter = RowConverter::new(vec![field]).unwrap();
2876 let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();
2877 assert!(rows.row(0) < rows.row(1)); assert!(rows.row(2) > rows.row(1)); assert!(rows.row(3) > rows.row(2)); assert!(rows.row(4) > rows.row(2)); assert!(rows.row(5) > rows.row(2)); assert!(rows.row(3) > rows.row(5)); assert_eq!(rows.row(3), rows.row(6)); let back = converter.convert_rows(&rows).unwrap();
2886 assert_eq!(back.len(), 1);
2887 back[0].to_data().validate_full().unwrap();
2888 assert_eq!(&back[0], &list);
2889
2890 let options = SortOptions::default().desc().with_nulls_first(true);
2892 let field = SortField::new_with_options(d, options);
2893 let converter = RowConverter::new(vec![field]).unwrap();
2894 let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();
2895
2896 assert!(rows.row(0) < rows.row(1)); assert!(rows.row(2) < rows.row(1)); assert!(rows.row(3) < rows.row(2)); assert!(rows.row(4) < rows.row(2)); assert!(rows.row(5) < rows.row(2)); assert!(rows.row(3) < rows.row(5)); assert_eq!(rows.row(3), rows.row(6)); let back = converter.convert_rows(&rows).unwrap();
2905 assert_eq!(back.len(), 1);
2906 back[0].to_data().validate_full().unwrap();
2907 assert_eq!(&back[0], &list);
2908
2909 let sliced_list = list.slice(1, 5);
2910 let rows_on_sliced_list = converter
2911 .convert_columns(&[Arc::clone(&sliced_list)])
2912 .unwrap();
2913
2914 assert!(rows_on_sliced_list.row(2) < rows_on_sliced_list.row(1)); assert!(rows_on_sliced_list.row(3) < rows_on_sliced_list.row(1)); assert!(rows_on_sliced_list.row(4) < rows_on_sliced_list.row(1)); assert!(rows_on_sliced_list.row(2) < rows_on_sliced_list.row(4)); let back = converter.convert_rows(&rows_on_sliced_list).unwrap();
2920 assert_eq!(back.len(), 1);
2921 back[0].to_data().validate_full().unwrap();
2922 assert_eq!(&back[0], &sliced_list);
2923 }
2924
2925 #[test]
2926 fn test_two_fixed_size_lists() {
2927 let mut first = FixedSizeListBuilder::new(UInt8Builder::new(), 1);
2928 first.values().append_value(100);
2930 first.append(true);
2931 first.values().append_value(101);
2933 first.append(true);
2934 first.values().append_value(102);
2936 first.append(true);
2937 first.values().append_null();
2939 first.append(true);
2940 first.values().append_null(); first.append(false);
2943 let first = Arc::new(first.finish()) as ArrayRef;
2944 let first_type = first.data_type().clone();
2945
2946 let mut second = FixedSizeListBuilder::new(UInt8Builder::new(), 1);
2947 second.values().append_value(200);
2949 second.append(true);
2950 second.values().append_value(201);
2952 second.append(true);
2953 second.values().append_value(202);
2955 second.append(true);
2956 second.values().append_null();
2958 second.append(true);
2959 second.values().append_null(); second.append(false);
2962 let second = Arc::new(second.finish()) as ArrayRef;
2963 let second_type = second.data_type().clone();
2964
2965 let converter = RowConverter::new(vec![
2966 SortField::new(first_type.clone()),
2967 SortField::new(second_type.clone()),
2968 ])
2969 .unwrap();
2970
2971 let rows = converter
2972 .convert_columns(&[Arc::clone(&first), Arc::clone(&second)])
2973 .unwrap();
2974
2975 let back = converter.convert_rows(&rows).unwrap();
2976 assert_eq!(back.len(), 2);
2977 back[0].to_data().validate_full().unwrap();
2978 assert_eq!(&back[0], &first);
2979 back[1].to_data().validate_full().unwrap();
2980 assert_eq!(&back[1], &second);
2981 }
2982
2983 #[test]
2984 fn test_fixed_size_list_with_variable_width_content() {
2985 let mut first = FixedSizeListBuilder::new(
2986 StructBuilder::from_fields(
2987 vec![
2988 Field::new(
2989 "timestamp",
2990 DataType::Timestamp(TimeUnit::Microsecond, Some(Arc::from("UTC"))),
2991 false,
2992 ),
2993 Field::new("offset_minutes", DataType::Int16, false),
2994 Field::new("time_zone", DataType::Utf8, false),
2995 ],
2996 1,
2997 ),
2998 1,
2999 );
3000 first
3002 .values()
3003 .field_builder::<TimestampMicrosecondBuilder>(0)
3004 .unwrap()
3005 .append_null();
3006 first
3007 .values()
3008 .field_builder::<Int16Builder>(1)
3009 .unwrap()
3010 .append_null();
3011 first
3012 .values()
3013 .field_builder::<StringBuilder>(2)
3014 .unwrap()
3015 .append_null();
3016 first.values().append(false);
3017 first.append(false);
3018 first
3020 .values()
3021 .field_builder::<TimestampMicrosecondBuilder>(0)
3022 .unwrap()
3023 .append_null();
3024 first
3025 .values()
3026 .field_builder::<Int16Builder>(1)
3027 .unwrap()
3028 .append_null();
3029 first
3030 .values()
3031 .field_builder::<StringBuilder>(2)
3032 .unwrap()
3033 .append_null();
3034 first.values().append(false);
3035 first.append(true);
3036 first
3038 .values()
3039 .field_builder::<TimestampMicrosecondBuilder>(0)
3040 .unwrap()
3041 .append_value(0);
3042 first
3043 .values()
3044 .field_builder::<Int16Builder>(1)
3045 .unwrap()
3046 .append_value(0);
3047 first
3048 .values()
3049 .field_builder::<StringBuilder>(2)
3050 .unwrap()
3051 .append_value("UTC");
3052 first.values().append(true);
3053 first.append(true);
3054 first
3056 .values()
3057 .field_builder::<TimestampMicrosecondBuilder>(0)
3058 .unwrap()
3059 .append_value(1126351800123456);
3060 first
3061 .values()
3062 .field_builder::<Int16Builder>(1)
3063 .unwrap()
3064 .append_value(120);
3065 first
3066 .values()
3067 .field_builder::<StringBuilder>(2)
3068 .unwrap()
3069 .append_value("Europe/Warsaw");
3070 first.values().append(true);
3071 first.append(true);
3072 let first = Arc::new(first.finish()) as ArrayRef;
3073 let first_type = first.data_type().clone();
3074
3075 let mut second = StringBuilder::new();
3076 second.append_value("somewhere near");
3077 second.append_null();
3078 second.append_value("Greenwich");
3079 second.append_value("Warsaw");
3080 let second = Arc::new(second.finish()) as ArrayRef;
3081 let second_type = second.data_type().clone();
3082
3083 let converter = RowConverter::new(vec![
3084 SortField::new(first_type.clone()),
3085 SortField::new(second_type.clone()),
3086 ])
3087 .unwrap();
3088
3089 let rows = converter
3090 .convert_columns(&[Arc::clone(&first), Arc::clone(&second)])
3091 .unwrap();
3092
3093 let back = converter.convert_rows(&rows).unwrap();
3094 assert_eq!(back.len(), 2);
3095 back[0].to_data().validate_full().unwrap();
3096 assert_eq!(&back[0], &first);
3097 back[1].to_data().validate_full().unwrap();
3098 assert_eq!(&back[1], &second);
3099 }
3100
3101 fn generate_primitive_array<K>(len: usize, valid_percent: f64) -> PrimitiveArray<K>
3102 where
3103 K: ArrowPrimitiveType,
3104 StandardUniform: Distribution<K::Native>,
3105 {
3106 let mut rng = rng();
3107 (0..len)
3108 .map(|_| rng.random_bool(valid_percent).then(|| rng.random()))
3109 .collect()
3110 }
3111
3112 fn generate_strings<O: OffsetSizeTrait>(
3113 len: usize,
3114 valid_percent: f64,
3115 ) -> GenericStringArray<O> {
3116 let mut rng = rng();
3117 (0..len)
3118 .map(|_| {
3119 rng.random_bool(valid_percent).then(|| {
3120 let len = rng.random_range(0..100);
3121 let bytes = (0..len).map(|_| rng.random_range(0..128)).collect();
3122 String::from_utf8(bytes).unwrap()
3123 })
3124 })
3125 .collect()
3126 }
3127
3128 fn generate_string_view(len: usize, valid_percent: f64) -> StringViewArray {
3129 let mut rng = rng();
3130 (0..len)
3131 .map(|_| {
3132 rng.random_bool(valid_percent).then(|| {
3133 let len = rng.random_range(0..100);
3134 let bytes = (0..len).map(|_| rng.random_range(0..128)).collect();
3135 String::from_utf8(bytes).unwrap()
3136 })
3137 })
3138 .collect()
3139 }
3140
3141 fn generate_byte_view(len: usize, valid_percent: f64) -> BinaryViewArray {
3142 let mut rng = rng();
3143 (0..len)
3144 .map(|_| {
3145 rng.random_bool(valid_percent).then(|| {
3146 let len = rng.random_range(0..100);
3147 let bytes: Vec<_> = (0..len).map(|_| rng.random_range(0..128)).collect();
3148 bytes
3149 })
3150 })
3151 .collect()
3152 }
3153
3154 fn generate_fixed_stringview_column(len: usize) -> StringViewArray {
3155 let edge_cases = vec![
3156 Some("bar".to_string()),
3157 Some("bar\0".to_string()),
3158 Some("LongerThan12Bytes".to_string()),
3159 Some("LongerThan12Bytez".to_string()),
3160 Some("LongerThan12Bytes\0".to_string()),
3161 Some("LongerThan12Byt".to_string()),
3162 Some("backend one".to_string()),
3163 Some("backend two".to_string()),
3164 Some("a".repeat(257)),
3165 Some("a".repeat(300)),
3166 ];
3167
3168 let mut values = Vec::with_capacity(len);
3170 for i in 0..len {
3171 values.push(
3172 edge_cases
3173 .get(i % edge_cases.len())
3174 .cloned()
3175 .unwrap_or(None),
3176 );
3177 }
3178
3179 StringViewArray::from(values)
3180 }
3181
3182 fn generate_dictionary<K>(
3183 values: ArrayRef,
3184 len: usize,
3185 valid_percent: f64,
3186 ) -> DictionaryArray<K>
3187 where
3188 K: ArrowDictionaryKeyType,
3189 K::Native: SampleUniform,
3190 {
3191 let mut rng = rng();
3192 let min_key = K::Native::from_usize(0).unwrap();
3193 let max_key = K::Native::from_usize(values.len()).unwrap();
3194 let keys: PrimitiveArray<K> = (0..len)
3195 .map(|_| {
3196 rng.random_bool(valid_percent)
3197 .then(|| rng.random_range(min_key..max_key))
3198 })
3199 .collect();
3200
3201 let data_type =
3202 DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone()));
3203
3204 let data = keys
3205 .into_data()
3206 .into_builder()
3207 .data_type(data_type)
3208 .add_child_data(values.to_data())
3209 .build()
3210 .unwrap();
3211
3212 DictionaryArray::from(data)
3213 }
3214
3215 fn generate_fixed_size_binary(len: usize, valid_percent: f64) -> FixedSizeBinaryArray {
3216 let mut rng = rng();
3217 let width = rng.random_range(0..20);
3218 let mut builder = FixedSizeBinaryBuilder::new(width);
3219
3220 let mut b = vec![0; width as usize];
3221 for _ in 0..len {
3222 match rng.random_bool(valid_percent) {
3223 true => {
3224 b.iter_mut().for_each(|x| *x = rng.random());
3225 builder.append_value(&b).unwrap();
3226 }
3227 false => builder.append_null(),
3228 }
3229 }
3230
3231 builder.finish()
3232 }
3233
3234 fn generate_struct(len: usize, valid_percent: f64) -> StructArray {
3235 let mut rng = rng();
3236 let nulls = NullBuffer::from_iter((0..len).map(|_| rng.random_bool(valid_percent)));
3237 let a = generate_primitive_array::<Int32Type>(len, valid_percent);
3238 let b = generate_strings::<i32>(len, valid_percent);
3239 let fields = Fields::from(vec![
3240 Field::new("a", DataType::Int32, true),
3241 Field::new("b", DataType::Utf8, true),
3242 ]);
3243 let values = vec![Arc::new(a) as _, Arc::new(b) as _];
3244 StructArray::new(fields, values, Some(nulls))
3245 }
3246
3247 fn generate_list<F>(len: usize, valid_percent: f64, values: F) -> ListArray
3248 where
3249 F: FnOnce(usize) -> ArrayRef,
3250 {
3251 let mut rng = rng();
3252 let offsets = OffsetBuffer::<i32>::from_lengths((0..len).map(|_| rng.random_range(0..10)));
3253 let values_len = offsets.last().unwrap().to_usize().unwrap();
3254 let values = values(values_len);
3255 let nulls = NullBuffer::from_iter((0..len).map(|_| rng.random_bool(valid_percent)));
3256 let field = Arc::new(Field::new_list_field(values.data_type().clone(), true));
3257 ListArray::new(field, offsets, values, Some(nulls))
3258 }
3259
    /// Generates a random column of `len` values for the fuzz test, picking
    /// uniformly among 18 type/layout combinations: primitives, strings,
    /// dictionaries, fixed-size binary, structs, lists (including nested
    /// structs), view arrays, and a list sliced at a non-zero offset.
    fn generate_column(len: usize) -> ArrayRef {
        let mut rng = rng();
        match rng.random_range(0..18) {
            0 => Arc::new(generate_primitive_array::<Int32Type>(len, 0.8)),
            1 => Arc::new(generate_primitive_array::<UInt32Type>(len, 0.8)),
            2 => Arc::new(generate_primitive_array::<Int64Type>(len, 0.8)),
            3 => Arc::new(generate_primitive_array::<UInt64Type>(len, 0.8)),
            4 => Arc::new(generate_primitive_array::<Float32Type>(len, 0.8)),
            5 => Arc::new(generate_primitive_array::<Float64Type>(len, 0.8)),
            6 => Arc::new(generate_strings::<i32>(len, 0.8)),
            // Dictionary arms draw a dictionary size from 1..len, so callers
            // must pass len >= 2 (fuzz_test uses 5..100)
            7 => Arc::new(generate_dictionary::<Int64Type>(
                // Cannot test dictionaries containing null values because of
                // https://github.com/apache/arrow-rs/issues/2687
                Arc::new(generate_strings::<i32>(rng.random_range(1..len), 1.0)),
                len,
                0.8,
            )),
            8 => Arc::new(generate_dictionary::<Int64Type>(
                Arc::new(generate_primitive_array::<Int64Type>(
                    rng.random_range(1..len),
                    1.0,
                )),
                len,
                0.8,
            )),
            9 => Arc::new(generate_fixed_size_binary(len, 0.8)),
            10 => Arc::new(generate_struct(len, 0.8)),
            11 => Arc::new(generate_list(len, 0.8, |values_len| {
                Arc::new(generate_primitive_array::<Int64Type>(values_len, 0.8))
            })),
            12 => Arc::new(generate_list(len, 0.8, |values_len| {
                Arc::new(generate_strings::<i32>(values_len, 0.8))
            })),
            13 => Arc::new(generate_list(len, 0.8, |values_len| {
                Arc::new(generate_struct(values_len, 0.8))
            })),
            14 => Arc::new(generate_string_view(len, 0.8)),
            15 => Arc::new(generate_byte_view(len, 0.8)),
            16 => Arc::new(generate_fixed_stringview_column(len)),
            // A sliced list: exercises non-zero array offsets through the
            // row encoder
            17 => Arc::new(
                generate_list(len + 1000, 0.8, |values_len| {
                    Arc::new(generate_primitive_array::<Int64Type>(values_len, 0.8))
                })
                .slice(500, len),
            ),
            _ => unreachable!(),
        }
    }
3308
3309 fn print_row(cols: &[SortColumn], row: usize) -> String {
3310 let t: Vec<_> = cols
3311 .iter()
3312 .map(|x| match x.values.is_valid(row) {
3313 true => {
3314 let opts = FormatOptions::default().with_null("NULL");
3315 let formatter = ArrayFormatter::try_new(x.values.as_ref(), &opts).unwrap();
3316 formatter.value(row).to_string()
3317 }
3318 false => "NULL".to_string(),
3319 })
3320 .collect();
3321 t.join(",")
3322 }
3323
3324 fn print_col_types(cols: &[SortColumn]) -> String {
3325 let t: Vec<_> = cols
3326 .iter()
3327 .map(|x| x.values.data_type().to_string())
3328 .collect();
3329 t.join(",")
3330 }
3331
    #[test]
    #[cfg_attr(miri, ignore)]
    fn fuzz_test() {
        // Property test: for random columns and sort options, the byte-wise
        // ordering of encoded rows must equal lexicographical comparison of
        // the original arrays, and decoding must round-trip the data through
        // three paths (Rows, raw binary + parser, and from_binary).
        for _ in 0..100 {
            let mut rng = rng();
            let num_columns = rng.random_range(1..5);
            let len = rng.random_range(5..100);
            let arrays: Vec<_> = (0..num_columns).map(|_| generate_column(len)).collect();

            // Independent random sort direction / null placement per column
            let options: Vec<_> = (0..num_columns)
                .map(|_| SortOptions {
                    descending: rng.random_bool(0.5),
                    nulls_first: rng.random_bool(0.5),
                })
                .collect();

            let sort_columns: Vec<_> = options
                .iter()
                .zip(&arrays)
                .map(|(o, c)| SortColumn {
                    values: Arc::clone(c),
                    options: Some(*o),
                })
                .collect();

            // Reference implementation to compare the row encoding against
            let comparator = LexicographicalComparator::try_new(&sort_columns).unwrap();

            let columns: Vec<SortField> = options
                .into_iter()
                .zip(&arrays)
                .map(|(o, a)| SortField::new_with_options(a.data_type().clone(), o))
                .collect();

            let converter = RowConverter::new(columns).unwrap();
            let rows = converter.convert_columns(&arrays).unwrap();

            // Every pair of rows must compare identically to the reference
            // comparator (including i == j, which must be Equal)
            for i in 0..len {
                for j in 0..len {
                    let row_i = rows.row(i);
                    let row_j = rows.row(j);
                    let row_cmp = row_i.cmp(&row_j);
                    let lex_cmp = comparator.compare(i, j);
                    assert_eq!(
                        row_cmp,
                        lex_cmp,
                        "({:?} vs {:?}) vs ({:?} vs {:?}) for types {}",
                        print_row(&sort_columns, i),
                        print_row(&sort_columns, j),
                        row_i,
                        row_j,
                        print_col_types(&sort_columns)
                    );
                }
            }

            // Round-trip 1: decode directly from the Rows
            let back = converter.convert_rows(&rows).unwrap();
            for (actual, expected) in back.iter().zip(&arrays) {
                actual.to_data().validate_full().unwrap();
                dictionary_eq(actual, expected)
            }

            // Round-trip 2: serialize to binary and decode via the parser
            // (this path performs validation of the raw bytes)
            let rows = rows.try_into_binary().expect("reasonable size");
            let parser = converter.parser();
            let back = converter
                .convert_rows(rows.iter().map(|b| parser.parse(b.expect("valid bytes"))))
                .unwrap();
            for (actual, expected) in back.iter().zip(&arrays) {
                actual.to_data().validate_full().unwrap();
                dictionary_eq(actual, expected)
            }

            // Round-trip 3: reconstruct Rows from the binary representation
            let rows = converter.from_binary(rows);
            let back = converter.convert_rows(&rows).unwrap();
            for (actual, expected) in back.iter().zip(&arrays) {
                actual.to_data().validate_full().unwrap();
                dictionary_eq(actual, expected)
            }
        }
    }
3415
3416 #[test]
3417 fn test_clear() {
3418 let converter = RowConverter::new(vec![SortField::new(DataType::Int32)]).unwrap();
3419 let mut rows = converter.empty_rows(3, 128);
3420
3421 let first = Int32Array::from(vec![None, Some(2), Some(4)]);
3422 let second = Int32Array::from(vec![Some(2), None, Some(4)]);
3423 let arrays = [Arc::new(first) as ArrayRef, Arc::new(second) as ArrayRef];
3424
3425 for array in arrays.iter() {
3426 rows.clear();
3427 converter
3428 .append(&mut rows, std::slice::from_ref(array))
3429 .unwrap();
3430 let back = converter.convert_rows(&rows).unwrap();
3431 assert_eq!(&back[0], array);
3432 }
3433
3434 let mut rows_expected = converter.empty_rows(3, 128);
3435 converter.append(&mut rows_expected, &arrays[1..]).unwrap();
3436
3437 for (i, (actual, expected)) in rows.iter().zip(rows_expected.iter()).enumerate() {
3438 assert_eq!(
3439 actual, expected,
3440 "For row {i}: expected {expected:?}, actual: {actual:?}",
3441 );
3442 }
3443 }
3444
3445 #[test]
3446 fn test_append_codec_dictionary_binary() {
3447 use DataType::*;
3448 let converter = RowConverter::new(vec![SortField::new(Dictionary(
3450 Box::new(Int32),
3451 Box::new(Binary),
3452 ))])
3453 .unwrap();
3454 let mut rows = converter.empty_rows(4, 128);
3455
3456 let keys = Int32Array::from_iter_values([0, 1, 2, 3]);
3457 let values = BinaryArray::from(vec![
3458 Some("a".as_bytes()),
3459 Some(b"b"),
3460 Some(b"c"),
3461 Some(b"d"),
3462 ]);
3463 let dict_array = DictionaryArray::new(keys, Arc::new(values));
3464
3465 rows.clear();
3466 let array = Arc::new(dict_array) as ArrayRef;
3467 converter
3468 .append(&mut rows, std::slice::from_ref(&array))
3469 .unwrap();
3470 let back = converter.convert_rows(&rows).unwrap();
3471
3472 dictionary_eq(&back[0], &array);
3473 }
3474
3475 #[test]
3476 fn test_list_prefix() {
3477 let mut a = ListBuilder::new(Int8Builder::new());
3478 a.append_value([None]);
3479 a.append_value([None, None]);
3480 let a = a.finish();
3481
3482 let converter = RowConverter::new(vec![SortField::new(a.data_type().clone())]).unwrap();
3483 let rows = converter.convert_columns(&[Arc::new(a) as _]).unwrap();
3484 assert_eq!(rows.row(0).cmp(&rows.row(1)), Ordering::Less);
3485 }
3486
3487 #[test]
3488 fn map_should_be_marked_as_unsupported() {
3489 let map_data_type = Field::new_map(
3490 "map",
3491 "entries",
3492 Field::new("key", DataType::Utf8, false),
3493 Field::new("value", DataType::Utf8, true),
3494 false,
3495 true,
3496 )
3497 .data_type()
3498 .clone();
3499
3500 let is_supported = RowConverter::supports_fields(&[SortField::new(map_data_type)]);
3501
3502 assert!(!is_supported, "Map should not be supported");
3503 }
3504
3505 #[test]
3506 fn should_fail_to_create_row_converter_for_unsupported_map_type() {
3507 let map_data_type = Field::new_map(
3508 "map",
3509 "entries",
3510 Field::new("key", DataType::Utf8, false),
3511 Field::new("value", DataType::Utf8, true),
3512 false,
3513 true,
3514 )
3515 .data_type()
3516 .clone();
3517
3518 let converter = RowConverter::new(vec![SortField::new(map_data_type)]);
3519
3520 match converter {
3521 Err(ArrowError::NotYetImplemented(message)) => {
3522 assert!(
3523 message.contains("Row format support not yet implemented for"),
3524 "Expected NotYetImplemented error for map data type, got: {message}",
3525 );
3526 }
3527 Err(e) => panic!("Expected NotYetImplemented error, got: {e}"),
3528 Ok(_) => panic!("Expected NotYetImplemented error for map data type"),
3529 }
3530 }
3531
3532 #[test]
3533 fn test_values_buffer_smaller_when_utf8_validation_disabled() {
3534 fn get_values_buffer_len(col: ArrayRef) -> (usize, usize) {
3535 let converter = RowConverter::new(vec![SortField::new(DataType::Utf8View)]).unwrap();
3537
3538 let rows = converter.convert_columns(&[col]).unwrap();
3540 let converted = converter.convert_rows(&rows).unwrap();
3541 let unchecked_values_len = converted[0].as_string_view().data_buffers()[0].len();
3542
3543 let rows = rows.try_into_binary().expect("reasonable size");
3545 let parser = converter.parser();
3546 let converted = converter
3547 .convert_rows(rows.iter().map(|b| parser.parse(b.expect("valid bytes"))))
3548 .unwrap();
3549 let checked_values_len = converted[0].as_string_view().data_buffers()[0].len();
3550 (unchecked_values_len, checked_values_len)
3551 }
3552
3553 let col = Arc::new(StringViewArray::from_iter([
3555 Some("hello"), None, Some("short"), Some("tiny"), ])) as ArrayRef;
3560
3561 let (unchecked_values_len, checked_values_len) = get_values_buffer_len(col);
3562 assert_eq!(unchecked_values_len, 0);
3564 assert_eq!(checked_values_len, 14);
3566
3567 let col = Arc::new(StringViewArray::from_iter([
3569 Some("this is a very long string over 12 bytes"),
3570 Some("another long string to test the buffer"),
3571 ])) as ArrayRef;
3572
3573 let (unchecked_values_len, checked_values_len) = get_values_buffer_len(col);
3574 assert!(unchecked_values_len > 0);
3576 assert_eq!(unchecked_values_len, checked_values_len);
3577
3578 let col = Arc::new(StringViewArray::from_iter([
3580 Some("tiny"), Some("thisisexact13"), None,
3583 Some("short"), ])) as ArrayRef;
3585
3586 let (unchecked_values_len, checked_values_len) = get_values_buffer_len(col);
3587 assert_eq!(unchecked_values_len, 13);
3589 assert!(checked_values_len > unchecked_values_len);
3590 }
3591}