Skip to main content

arrow_array/array/
string_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::types::GenericStringType;
19use crate::{GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait};
20use arrow_schema::ArrowError;
21
/// A [`GenericByteArray`] for storing `str`
///
/// The `OffsetSize` parameter is the integer type used for the value offsets:
/// [`StringArray`] is this alias with `i32` offsets and [`LargeStringArray`]
/// is this alias with `i64` offsets.
pub type GenericStringArray<OffsetSize> = GenericByteArray<GenericStringType<OffsetSize>>;
24
25impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
26    /// Returns the number of `Unicode Scalar Value` in the string at index `i`.
27    /// # Performance
28    /// This function has `O(n)` time complexity where `n` is the string length.
29    /// If you can make sure that all chars in the string are in the range `U+0x0000` ~ `U+0x007F`,
30    /// please use the function [`value_length`](#method.value_length) which has O(1) time complexity.
31    pub fn num_chars(&self, i: usize) -> usize {
32        self.value(i).chars().count()
33    }
34
35    /// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
36    pub fn take_iter<'a>(
37        &'a self,
38        indexes: impl Iterator<Item = Option<usize>> + 'a,
39    ) -> impl Iterator<Item = Option<&'a str>> {
40        indexes.map(|opt_index| opt_index.map(|index| self.value(index)))
41    }
42
43    /// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
44    /// # Safety
45    ///
46    /// caller must ensure that the indexes in the iterator are less than the `array.len()`
47    pub unsafe fn take_iter_unchecked<'a>(
48        &'a self,
49        indexes: impl Iterator<Item = Option<usize>> + 'a,
50    ) -> impl Iterator<Item = Option<&'a str>> {
51        indexes.map(|opt_index| opt_index.map(|index| unsafe { self.value_unchecked(index) }))
52    }
53
54    /// Fallibly creates a [`GenericStringArray`] from a [`GenericBinaryArray`] returning
55    /// an error if [`GenericBinaryArray`] contains invalid UTF-8 data
56    pub fn try_from_binary(v: GenericBinaryArray<OffsetSize>) -> Result<Self, ArrowError> {
57        let (offsets, values, nulls) = v.into_parts();
58        Self::try_new(offsets, values, nulls)
59    }
60}
61
62impl<OffsetSize: OffsetSizeTrait> From<GenericListArray<OffsetSize>>
63    for GenericStringArray<OffsetSize>
64{
65    fn from(v: GenericListArray<OffsetSize>) -> Self {
66        GenericBinaryArray::<OffsetSize>::from(v).into()
67    }
68}
69
70impl<OffsetSize: OffsetSizeTrait> From<GenericBinaryArray<OffsetSize>>
71    for GenericStringArray<OffsetSize>
72{
73    fn from(v: GenericBinaryArray<OffsetSize>) -> Self {
74        Self::try_from_binary(v).unwrap()
75    }
76}
77
78impl<OffsetSize: OffsetSizeTrait> From<Vec<Option<&str>>> for GenericStringArray<OffsetSize> {
79    fn from(v: Vec<Option<&str>>) -> Self {
80        v.into_iter().collect()
81    }
82}
83
84impl<OffsetSize: OffsetSizeTrait> From<Vec<&str>> for GenericStringArray<OffsetSize> {
85    fn from(v: Vec<&str>) -> Self {
86        Self::from_iter_values(v)
87    }
88}
89
90impl<OffsetSize: OffsetSizeTrait> From<Vec<Option<String>>> for GenericStringArray<OffsetSize> {
91    fn from(v: Vec<Option<String>>) -> Self {
92        v.into_iter().collect()
93    }
94}
95
96impl<OffsetSize: OffsetSizeTrait> From<Vec<String>> for GenericStringArray<OffsetSize> {
97    fn from(v: Vec<String>) -> Self {
98        Self::from_iter_values(v)
99    }
100}
101
/// A [`GenericStringArray`] of `str` using `i32` offsets
///
/// The same array type with `i64` offsets is available as [`LargeStringArray`].
///
/// # Examples
///
/// Construction
///
/// ```
/// # use arrow_array::StringArray;
/// // Create from Vec<Option<&str>>
/// let arr = StringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]);
/// // Create from Vec<&str>
/// let arr = StringArray::from(vec!["foo", "bar", "baz"]);
/// // Create from iter/collect (requires Option<&str>)
/// let arr: StringArray = std::iter::repeat(Some("foo")).take(10).collect();
/// ```
///
/// Construction and Access
///
/// ```
/// # use arrow_array::StringArray;
/// let array = StringArray::from(vec![Some("foo"), None, Some("bar")]);
/// assert_eq!(array.value(0), "foo");
/// ```
///
/// See [`GenericByteArray`] for more information and examples
pub type StringArray = GenericStringArray<i32>;
128
/// A [`GenericStringArray`] of `str` using `i64` offsets
///
/// The same array type with `i32` offsets is available as [`StringArray`].
///
/// # Examples
///
/// Construction
///
/// ```
/// # use arrow_array::LargeStringArray;
/// // Create from Vec<Option<&str>>
/// let arr = LargeStringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]);
/// // Create from Vec<&str>
/// let arr = LargeStringArray::from(vec!["foo", "bar", "baz"]);
/// // Create from iter/collect (requires Option<&str>)
/// let arr: LargeStringArray = std::iter::repeat(Some("foo")).take(10).collect();
/// ```
///
/// Construction and Access
///
/// ```
/// # use arrow_array::LargeStringArray;
/// let array = LargeStringArray::from(vec![Some("foo"), None, Some("bar")]);
/// assert_eq!(array.value(2), "bar");
/// ```
///
/// See [`GenericByteArray`] for more information and examples
pub type LargeStringArray = GenericStringArray<i64>;
155
156#[cfg(test)]
157mod tests {
158    use super::*;
159    use crate::Array;
160    use crate::builder::{ListBuilder, PrimitiveBuilder, StringBuilder};
161    use crate::types::UInt8Type;
162    use arrow_buffer::Buffer;
163    use arrow_data::ArrayData;
164    use arrow_schema::{DataType, Field};
165    use std::sync::Arc;
166
167    #[test]
168    fn test_string_array_from_u8_slice() {
169        let values: Vec<&str> = vec!["hello", "", "A£ऀ𖼚𝌆৩ƐZ"];
170
171        // Array data: ["hello", "", "A£ऀ𖼚𝌆৩ƐZ"]
172        let string_array = StringArray::from(values);
173
174        assert_eq!(3, string_array.len());
175        assert_eq!(0, string_array.null_count());
176        assert_eq!("hello", string_array.value(0));
177        assert_eq!("hello", unsafe { string_array.value_unchecked(0) });
178        assert_eq!("", string_array.value(1));
179        assert_eq!("", unsafe { string_array.value_unchecked(1) });
180        assert_eq!("A£ऀ𖼚𝌆৩ƐZ", string_array.value(2));
181        assert_eq!("A£ऀ𖼚𝌆৩ƐZ", unsafe {
182            string_array.value_unchecked(2)
183        });
184        assert_eq!(20, string_array.value_length(2)); // 1 + 2 + 3 + 4 + 4 + 3 + 2 + 1
185        assert_eq!(8, string_array.num_chars(2));
186        for i in 0..3 {
187            assert!(string_array.is_valid(i));
188            assert!(!string_array.is_null(i));
189        }
190    }
191
192    #[test]
193    #[should_panic(expected = "StringArray expects DataType::Utf8")]
194    fn test_string_array_from_int() {
195        let array = LargeStringArray::from(vec!["a", "b"]);
196        drop(StringArray::from(array.into_data()));
197    }
198
199    #[test]
200    fn test_large_string_array_from_u8_slice() {
201        let values: Vec<&str> = vec!["hello", "", "A£ऀ𖼚𝌆৩ƐZ"];
202
203        // Array data: ["hello", "", "A£ऀ𖼚𝌆৩ƐZ"]
204        let string_array = LargeStringArray::from(values);
205
206        assert_eq!(3, string_array.len());
207        assert_eq!(0, string_array.null_count());
208        assert_eq!("hello", string_array.value(0));
209        assert_eq!("hello", unsafe { string_array.value_unchecked(0) });
210        assert_eq!("", string_array.value(1));
211        assert_eq!("", unsafe { string_array.value_unchecked(1) });
212        assert_eq!("A£ऀ𖼚𝌆৩ƐZ", string_array.value(2));
213        assert_eq!("A£ऀ𖼚𝌆৩ƐZ", unsafe {
214            string_array.value_unchecked(2)
215        });
216        assert_eq!(5, string_array.value_offsets()[2]);
217        assert_eq!(20, string_array.value_length(2)); // 1 + 2 + 3 + 4 + 4 + 3 + 2 + 1
218        assert_eq!(8, string_array.num_chars(2));
219        for i in 0..3 {
220            assert!(string_array.is_valid(i));
221            assert!(!string_array.is_null(i));
222        }
223    }
224
225    #[test]
226    fn test_nested_string_array() {
227        let string_builder = StringBuilder::with_capacity(3, 10);
228        let mut list_of_string_builder = ListBuilder::new(string_builder);
229
230        list_of_string_builder.values().append_value("foo");
231        list_of_string_builder.values().append_value("bar");
232        list_of_string_builder.append(true);
233
234        list_of_string_builder.values().append_value("foobar");
235        list_of_string_builder.append(true);
236        let list_of_strings = list_of_string_builder.finish();
237
238        assert_eq!(list_of_strings.len(), 2);
239
240        let first_slot = list_of_strings.value(0);
241        let first_list = first_slot.as_any().downcast_ref::<StringArray>().unwrap();
242        assert_eq!(first_list.len(), 2);
243        assert_eq!(first_list.value(0), "foo");
244        assert_eq!(unsafe { first_list.value_unchecked(0) }, "foo");
245        assert_eq!(first_list.value(1), "bar");
246        assert_eq!(unsafe { first_list.value_unchecked(1) }, "bar");
247
248        let second_slot = list_of_strings.value(1);
249        let second_list = second_slot.as_any().downcast_ref::<StringArray>().unwrap();
250        assert_eq!(second_list.len(), 1);
251        assert_eq!(second_list.value(0), "foobar");
252        assert_eq!(unsafe { second_list.value_unchecked(0) }, "foobar");
253    }
254
255    #[test]
256    #[should_panic(
257        expected = "Trying to access an element at index 4 from a StringArray of length 3"
258    )]
259    fn test_string_array_get_value_index_out_of_bound() {
260        let values = b"helloparquet";
261        let offsets: [i32; 4] = [0, 5, 5, 12];
262        let array_data = ArrayData::builder(DataType::Utf8)
263            .len(3)
264            .add_buffer(Buffer::from_slice_ref(offsets))
265            .add_buffer(Buffer::from_slice_ref(values))
266            .build()
267            .unwrap();
268        let string_array = StringArray::from(array_data);
269        string_array.value(4);
270    }
271
272    #[test]
273    fn test_string_array_fmt_debug() {
274        let arr: StringArray = vec!["hello", "arrow"].into();
275        assert_eq!(
276            "StringArray\n[\n  \"hello\",\n  \"arrow\",\n]",
277            format!("{arr:?}")
278        );
279    }
280
281    #[test]
282    fn test_large_string_array_fmt_debug() {
283        let arr: LargeStringArray = vec!["hello", "arrow"].into();
284        assert_eq!(
285            "LargeStringArray\n[\n  \"hello\",\n  \"arrow\",\n]",
286            format!("{arr:?}")
287        );
288    }
289
290    #[test]
291    fn test_string_array_from_iter() {
292        let data = [Some("hello"), None, Some("arrow")];
293        let data_vec = data.to_vec();
294        // from Vec<Option<&str>>
295        let array1 = StringArray::from(data_vec.clone());
296        // from Iterator<Option<&str>>
297        let array2: StringArray = data_vec.clone().into_iter().collect();
298        // from Iterator<Option<String>>
299        let array3: StringArray = data_vec
300            .into_iter()
301            .map(|x| x.map(|s| s.to_string()))
302            .collect();
303        // from Iterator<&Option<&str>>
304        let array4: StringArray = data.iter().collect::<StringArray>();
305
306        assert_eq!(array1, array2);
307        assert_eq!(array2, array3);
308        assert_eq!(array3, array4);
309    }
310
311    #[test]
312    fn test_string_array_from_iter_values() {
313        let data = ["hello", "hello2"];
314        let array1 = StringArray::from_iter_values(data.iter());
315
316        assert_eq!(array1.value(0), "hello");
317        assert_eq!(array1.value(1), "hello2");
318
319        // Also works with String types.
320        let data2 = ["goodbye".to_string(), "goodbye2".to_string()];
321        let array2 = StringArray::from_iter_values(data2.iter());
322
323        assert_eq!(array2.value(0), "goodbye");
324        assert_eq!(array2.value(1), "goodbye2");
325    }
326
327    #[test]
328    fn test_string_array_from_unbound_iter() {
329        // iterator that doesn't declare (upper) size bound
330        let string_iter = (0..)
331            .scan(0usize, |pos, i| {
332                if *pos < 10 {
333                    *pos += 1;
334                    Some(Some(format!("value {i}")))
335                } else {
336                    // actually returns up to 10 values
337                    None
338                }
339            })
340            // limited using take()
341            .take(100);
342
343        let (_, upper_size_bound) = string_iter.size_hint();
344        // the upper bound, defined by take above, is 100
345        assert_eq!(upper_size_bound, Some(100));
346        let string_array: StringArray = string_iter.collect();
347        // but the actual number of items in the array should be 10
348        assert_eq!(string_array.len(), 10);
349    }
350
351    #[test]
352    fn test_string_array_all_null() {
353        let data: Vec<Option<&str>> = vec![None];
354        let array = StringArray::from(data);
355        array
356            .into_data()
357            .validate_full()
358            .expect("All null array has valid array data");
359    }
360
361    #[test]
362    fn test_large_string_array_all_null() {
363        let data: Vec<Option<&str>> = vec![None];
364        let array = LargeStringArray::from(data);
365        array
366            .into_data()
367            .validate_full()
368            .expect("All null array has valid array data");
369    }
370
371    fn _test_generic_string_array_from_list_array<O: OffsetSizeTrait>() {
372        let values = b"HelloArrowAndParquet";
373        // "ArrowAndParquet"
374        let child_data = ArrayData::builder(DataType::UInt8)
375            .len(15)
376            .offset(5)
377            .add_buffer(Buffer::from(values))
378            .build()
379            .unwrap();
380
381        let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap());
382        let null_buffer = Buffer::from_slice_ref([0b101]);
383        let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(
384            Field::new_list_field(DataType::UInt8, false),
385        ));
386
387        // [None, Some("Parquet")]
388        let array_data = ArrayData::builder(data_type)
389            .len(2)
390            .offset(1)
391            .add_buffer(Buffer::from_slice_ref(offsets))
392            .null_bit_buffer(Some(null_buffer))
393            .add_child_data(child_data)
394            .build()
395            .unwrap();
396        let list_array = GenericListArray::<O>::from(array_data);
397        let string_array = GenericStringArray::<O>::from(list_array);
398
399        assert_eq!(2, string_array.len());
400        assert_eq!(1, string_array.null_count());
401        assert!(string_array.is_null(0));
402        assert!(string_array.is_valid(1));
403        assert_eq!("Parquet", string_array.value(1));
404    }
405
406    #[test]
407    fn test_string_array_from_list_array() {
408        _test_generic_string_array_from_list_array::<i32>();
409    }
410
411    #[test]
412    fn test_large_string_array_from_list_array() {
413        _test_generic_string_array_from_list_array::<i64>();
414    }
415
416    fn _test_generic_string_array_from_list_array_with_child_nulls_failed<O: OffsetSizeTrait>() {
417        let values = b"HelloArrow";
418        let child_data = ArrayData::builder(DataType::UInt8)
419            .len(10)
420            .add_buffer(Buffer::from(values))
421            .null_bit_buffer(Some(Buffer::from_slice_ref([0b1010101010])))
422            .build()
423            .unwrap();
424
425        let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap());
426
427        // It is possible to create a null struct containing a non-nullable child
428        // see https://github.com/apache/arrow-rs/pull/3244 for details
429        let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(
430            Field::new_list_field(DataType::UInt8, true),
431        ));
432
433        // [None, Some(b"Parquet")]
434        let array_data = ArrayData::builder(data_type)
435            .len(2)
436            .add_buffer(Buffer::from_slice_ref(offsets))
437            .add_child_data(child_data)
438            .build()
439            .unwrap();
440        let list_array = GenericListArray::<O>::from(array_data);
441        drop(GenericStringArray::<O>::from(list_array));
442    }
443
444    #[test]
445    #[should_panic(expected = "The child array cannot contain null values.")]
446    fn test_string_array_from_list_array_with_child_nulls_failed() {
447        _test_generic_string_array_from_list_array_with_child_nulls_failed::<i32>();
448    }
449
450    #[test]
451    #[should_panic(expected = "The child array cannot contain null values.")]
452    fn test_large_string_array_from_list_array_with_child_nulls_failed() {
453        _test_generic_string_array_from_list_array_with_child_nulls_failed::<i64>();
454    }
455
456    fn _test_generic_string_array_from_list_array_wrong_type<O: OffsetSizeTrait>() {
457        let values = b"HelloArrow";
458        let child_data = ArrayData::builder(DataType::UInt16)
459            .len(5)
460            .add_buffer(Buffer::from(values))
461            .build()
462            .unwrap();
463
464        let offsets = [0, 2, 3].map(|n| O::from_usize(n).unwrap());
465        let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(
466            Field::new_list_field(DataType::UInt16, false),
467        ));
468
469        let array_data = ArrayData::builder(data_type)
470            .len(2)
471            .add_buffer(Buffer::from_slice_ref(offsets))
472            .add_child_data(child_data)
473            .build()
474            .unwrap();
475        let list_array = GenericListArray::<O>::from(array_data);
476        drop(GenericStringArray::<O>::from(list_array));
477    }
478
479    #[test]
480    #[should_panic(
481        expected = "BinaryArray can only be created from List<u8> arrays, mismatched data types."
482    )]
483    fn test_string_array_from_list_array_wrong_type() {
484        _test_generic_string_array_from_list_array_wrong_type::<i32>();
485    }
486
487    #[test]
488    #[should_panic(
489        expected = "BinaryArray can only be created from List<u8> arrays, mismatched data types."
490    )]
491    fn test_large_string_array_from_list_array_wrong_type() {
492        _test_generic_string_array_from_list_array_wrong_type::<i64>();
493    }
494
495    #[test]
496    #[should_panic(
497        expected = "Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 0"
498    )]
499    fn test_list_array_utf8_validation() {
500        let mut builder = ListBuilder::new(PrimitiveBuilder::<UInt8Type>::new());
501        builder.values().append_value(0xFF);
502        builder.append(true);
503        let list = builder.finish();
504        let _ = StringArray::from(list);
505    }
506
507    #[test]
508    fn test_empty_offsets() {
509        let string = StringArray::from(
510            ArrayData::builder(DataType::Utf8)
511                .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
512                .build()
513                .unwrap(),
514        );
515        assert_eq!(string.len(), 0);
516        assert_eq!(string.value_offsets(), &[0]);
517
518        let string = LargeStringArray::from(
519            ArrayData::builder(DataType::LargeUtf8)
520                .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
521                .build()
522                .unwrap(),
523        );
524        assert_eq!(string.len(), 0);
525        assert_eq!(string.value_offsets(), &[0]);
526    }
527
528    #[test]
529    fn test_into_builder() {
530        let array: StringArray = vec!["hello", "arrow"].into();
531
532        // Append values
533        let mut builder = array.into_builder().unwrap();
534
535        builder.append_value("rust");
536
537        let expected: StringArray = vec!["hello", "arrow", "rust"].into();
538        let array = builder.finish();
539        assert_eq!(expected, array);
540    }
541
542    #[test]
543    fn test_into_builder_err() {
544        let array: StringArray = vec!["hello", "arrow"].into();
545
546        // Clone it, so we cannot get a mutable builder back
547        let shared_array = array.clone();
548
549        let err_return = array.into_builder().unwrap_err();
550        assert_eq!(&err_return, &shared_array);
551    }
552
553    #[test]
554    fn test_non_null_string_array_equal() {
555        let a = StringArray::from(vec![Some("ab"), Some("c")]);
556        let b = StringArray::from(vec![Some("a"), Some("bc")]);
557
558        assert_ne!(a, b);
559    }
560}