pub struct ArrowWriter<W: Write> {
writer: SerializedFileWriter<W>,
in_progress: Option<ArrowRowGroupWriter>,
arrow_schema: SchemaRef,
row_group_writer_factory: ArrowRowGroupWriterFactory,
max_row_group_size: usize,
}
Encodes [RecordBatch] to Parquet.
Writes Arrow [RecordBatch]es to a Parquet writer. Multiple [RecordBatch]es will be encoded
into the same row group, up to max_row_group_size rows. Any remaining rows will be
flushed on close, so the final row group in the output file may
contain fewer than max_row_group_size rows.
§Example: Writing RecordBatches
use std::sync::Arc;
use arrow_array::{ArrayRef, Int64Array, RecordBatch};
use bytes::Bytes;
use parquet::arrow::ArrowWriter;
use parquet::arrow::arrow_reader::ParquetRecordBatchReader;

let col = Arc::new(Int64Array::from_iter_values([1, 2, 3])) as ArrayRef;
let to_write = RecordBatch::try_from_iter([("col", col)]).unwrap();
let mut buffer = Vec::new();
let mut writer = ArrowWriter::try_new(&mut buffer, to_write.schema(), None).unwrap();
writer.write(&to_write).unwrap();
writer.close().unwrap();
let mut reader = ParquetRecordBatchReader::try_new(Bytes::from(buffer), 1024).unwrap();
let read = reader.next().unwrap().unwrap();
assert_eq!(to_write, read);
§Memory Usage and Limiting
The nature of Parquet requires buffering of an entire row group before it can be flushed to the underlying writer. Data is mostly buffered in its encoded form, reducing memory usage. However, some data such as dictionary keys, large strings or very nested data may still result in non-trivial memory usage.
See Also:
- ArrowWriter::memory_size: the current memory usage of the writer
- ArrowWriter::in_progress_size: the estimated encoded size of the buffered row group

Call Self::flush to trigger an early flush of a row group based on a
memory threshold and/or global memory pressure. However, smaller row groups
result in higher metadata overheads, and thus may worsen compression ratios
and query performance.
writer.write(&batch).unwrap();
// Trigger an early flush if anticipated size exceeds 1_000_000
if writer.in_progress_size() > 1_000_000 {
writer.flush().unwrap();
}
§Type Support
The writer supports writing all Arrow DataTypes that have a direct mapping to
Parquet types including StructArray and ListArray.
The following are not supported:
- IntervalMonthDayNanoArray: Parquet does not support nanosecond intervals.
§Type Compatibility
The writer can write Arrow [RecordBatch]es whose schemas are logically equivalent to its
own. This means that for a given column, the writer can accept multiple Arrow DataTypes
that contain the same value type.
For example, the following DataTypes are all logically equivalent and can be written
to the same column:
- String, LargeString, StringView
- Binary, LargeBinary, BinaryView
The writer will also accept both native and dictionary-encoded arrays, provided the dictionaries contain compatible values.
use std::sync::Arc;
use arrow_array::{DictionaryArray, LargeStringArray, RecordBatch, StringArray, UInt8Array};
use arrow_schema::{DataType, Field, Schema};
use parquet::arrow::ArrowWriter;

let record_batch1 = RecordBatch::try_new(
Arc::new(Schema::new(vec![Field::new("col", DataType::LargeUtf8, false)])),
vec![Arc::new(LargeStringArray::from_iter_values(vec!["a", "b"]))]
)
.unwrap();
let mut buffer = Vec::new();
let mut writer = ArrowWriter::try_new(&mut buffer, record_batch1.schema(), None).unwrap();
writer.write(&record_batch1).unwrap();
let record_batch2 = RecordBatch::try_new(
Arc::new(Schema::new(vec![Field::new(
"col",
DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)),
false,
)])),
vec![Arc::new(DictionaryArray::new(
UInt8Array::from_iter_values(vec![0, 1]),
Arc::new(StringArray::from_iter_values(vec!["b", "c"])),
))],
)
.unwrap();
writer.write(&record_batch2).unwrap();
writer.close().unwrap();
Fields§
writer: SerializedFileWriter<W>
Underlying Parquet writer
in_progress: Option<ArrowRowGroupWriter>
The in-progress row group, if any
arrow_schema: SchemaRef
A copy of the Arrow schema.
The schema is used to verify that each record batch written has the correct schema.
row_group_writer_factory: ArrowRowGroupWriterFactory
Creates new ArrowRowGroupWriter instances as required
max_row_group_size: usize
The maximum number of rows to write to each row group
Implementations§
impl<W: Write + Send> ArrowWriter<W>
pub fn try_new(
    writer: W,
    arrow_schema: SchemaRef,
    props: Option<WriterProperties>,
) -> Result<Self>
Try to create a new Arrow writer
The writer will fail if:
- a SerializedFileWriter cannot be created from the ParquetWriter
- the Arrow schema contains unsupported datatypes, such as Unions
pub fn try_new_with_options(
    writer: W,
    arrow_schema: SchemaRef,
    options: ArrowWriterOptions,
) -> Result<Self>
Try to create a new Arrow writer with ArrowWriterOptions.
The writer will fail if:
- a SerializedFileWriter cannot be created from the ParquetWriter
- the Arrow schema contains unsupported datatypes, such as Unions
pub fn flushed_row_groups(&self) -> &[RowGroupMetaData]
Returns metadata for any flushed row groups
pub fn memory_size(&self) -> usize
Estimated memory usage, in bytes, of this ArrowWriter
This estimate is formed by summing the values of
ArrowColumnWriter::memory_size for all in-progress columns.
pub fn in_progress_size(&self) -> usize
Anticipated encoded size of the in-progress row group.
This estimates the size of the row group once completely encoded, and is
formed by summing the values of
ArrowColumnWriter::get_estimated_total_bytes for all in-progress
columns.
pub fn in_progress_rows(&self) -> usize
Returns the number of rows buffered in the in progress row group
pub fn bytes_written(&self) -> usize
Returns the number of bytes written by this instance
pub fn write(&mut self, batch: &RecordBatch) -> Result<()>
Encodes the provided [RecordBatch]
If this would cause the current row group to exceed WriterProperties::max_row_group_size
rows, the contents of batch will be written to one or more row groups such that all but
the final row group in the file contain WriterProperties::max_row_group_size rows.
This will fail if the batch’s schema does not match the writer’s schema.
pub fn write_all(&mut self, buf: &[u8]) -> Result<()>
Writes the given buf bytes to the internal buffer.
It’s safe to use this method to write data to the underlying writer, because it ensures that the buffering and byte-counting layers are used.
pub fn flush(&mut self) -> Result<()>
Flushes all buffered rows into a new row group
Note that the underlying writer is not flushed by this call.
If that behavior is desired, call ArrowWriter::sync.
pub fn append_key_value_metadata(&mut self, kv_metadata: KeyValue)
Appends KeyValue metadata to be written in addition to that from WriterProperties
This method provides a way to append kv_metadata after writing RecordBatches.
pub fn inner_mut(&mut self) -> &mut W
Returns a mutable reference to the underlying writer.
Warning: writing directly to this writer skips
the TrackedWrite buffering and byte-counting layers, causing
the file footer’s recorded offsets and sizes to diverge from reality
and resulting in an unreadable or corrupted Parquet file.
If you want to write safely to the underlying writer, use Self::write_all.
pub fn into_inner(self) -> Result<W>
Flushes any outstanding data and returns the underlying writer.
pub fn finish(&mut self) -> Result<ParquetMetaData>
Close and finalize the underlying Parquet writer
Unlike Self::close this does not consume self
Attempting to write after calling finish will result in an error
pub fn close(self) -> Result<ParquetMetaData>
Close and finalize the underlying Parquet writer
pub fn get_column_writers(&mut self) -> Result<Vec<ArrowColumnWriter>>
👎Deprecated since 56.2.0: Use ArrowRowGroupWriterFactory instead, see ArrowColumnWriter for an example
Create a new row group writer and return its column writers.
pub fn append_row_group(&mut self, chunks: Vec<ArrowColumnChunk>) -> Result<()>
👎Deprecated since 56.2.0: Use SerializedFileWriter directly instead, see ArrowColumnWriter for an example
Append the given column chunks to the file as a new row group.
pub fn into_serialized_writer(
    self,
) -> Result<(SerializedFileWriter<W>, ArrowRowGroupWriterFactory)>
Converts this writer into a lower-level SerializedFileWriter and ArrowRowGroupWriterFactory.
Flushes any outstanding data before returning.
This can be useful to provide more control over how files are written, for example
to write columns in parallel. See the example on ArrowColumnWriter.
Trait Implementations§
Auto Trait Implementations§
impl<W> Freeze for ArrowWriter<W> where W: Freeze
impl<W> !RefUnwindSafe for ArrowWriter<W>
impl<W> Send for ArrowWriter<W> where W: Send
impl<W> !Sync for ArrowWriter<W>
impl<W> Unpin for ArrowWriter<W> where W: Unpin
impl<W> !UnwindSafe for ArrowWriter<W>