pub struct ArrowColumnWriter {
writer: ArrowColumnWriterImpl,
chunk: Arc<Mutex<ArrowColumnChunkData>>,
}
Expand description
Encodes ArrowLeafColumn to ArrowColumnChunk.
Note: This is a low-level interface for applications that require
fine-grained control of encoding (e.g. encoding using multiple threads).
See ArrowWriter for a higher-level interface.
§Example: Encoding two Arrow Arrays in Parallel
// The arrow schema
let schema = Arc::new(Schema::new(vec![
Field::new("i32", DataType::Int32, false),
Field::new("f32", DataType::Float32, false),
]));
// Compute the parquet schema
let props = Arc::new(WriterProperties::default());
let parquet_schema = ArrowSchemaConverter::new()
.with_coerce_types(props.coerce_types())
.convert(&schema)
.unwrap();
// Create writers for each of the leaf columns
let col_writers = get_column_writers(&parquet_schema, &props, &schema).unwrap();
// Spawn a worker thread for each column
//
// Note: This is for demonstration purposes, a thread-pool e.g. rayon or tokio, would be better.
// The `map` produces an iterator of type `tuple of (thread handle, send channel)`.
let mut workers: Vec<_> = col_writers
.into_iter()
.map(|mut col_writer| {
let (send, recv) = std::sync::mpsc::channel::<ArrowLeafColumn>();
let handle = std::thread::spawn(move || {
// receive Arrays to encode via the channel
for col in recv {
col_writer.write(&col)?;
}
// once the input is complete, close the writer
// to return the newly created ArrowColumnChunk
col_writer.close()
});
(handle, send)
})
.collect();
// Create parquet writer
let root_schema = parquet_schema.root_schema_ptr();
// write to memory in the example, but this could be a File
let mut out = Vec::with_capacity(1024);
let mut writer = SerializedFileWriter::new(&mut out, root_schema, props.clone())
.unwrap();
// Start row group
let mut row_group_writer: SerializedRowGroupWriter<'_, _> = writer
.next_row_group()
.unwrap();
// Create some example input columns to encode
let to_write = vec![
Arc::new(Int32Array::from_iter_values([1, 2, 3])) as _,
Arc::new(Float32Array::from_iter_values([1., 45., -1.])) as _,
];
// Send the input columns to the workers
let mut worker_iter = workers.iter_mut();
for (arr, field) in to_write.iter().zip(&schema.fields) {
for leaves in compute_leaves(field, arr).unwrap() {
worker_iter.next().unwrap().1.send(leaves).unwrap();
}
}
// Wait for the workers to complete encoding, and append
// the resulting column chunks to the row group (and the file)
for (handle, send) in workers {
drop(send); // Drop send side to signal termination
// wait for the worker to send the completed chunk
let chunk: ArrowColumnChunk = handle.join().unwrap().unwrap();
chunk.append_to_row_group(&mut row_group_writer).unwrap();
}
// Close the row group which writes to the underlying file
row_group_writer.close().unwrap();
let metadata = writer.close().unwrap();
assert_eq!(metadata.num_rows, 3);
Fields§
§writer: ArrowColumnWriterImpl
§chunk: Arc<Mutex<ArrowColumnChunkData>>
Implementations§
Source§impl ArrowColumnWriter
impl ArrowColumnWriter
Sourcepub fn write(&mut self, col: &ArrowLeafColumn) -> Result<()>
pub fn write(&mut self, col: &ArrowLeafColumn) -> Result<()>
Write an ArrowLeafColumn
Sourcepub fn close(self) -> Result<ArrowColumnChunk>
pub fn close(self) -> Result<ArrowColumnChunk>
Close this column returning the written ArrowColumnChunk
Sourcepub fn memory_size(&self) -> usize
pub fn memory_size(&self) -> usize
Returns the estimated total memory usage by the writer.
Unlike Self::get_estimated_total_bytes, this is an estimate
of the current memory usage and not its anticipated encoded size.
This includes:
- Data buffered in encoded form
- Data buffered in un-encoded form (e.g. usize dictionary keys)
This value should be greater than or equal to Self::get_estimated_total_bytes
Sourcepub fn get_estimated_total_bytes(&self) -> usize
pub fn get_estimated_total_bytes(&self) -> usize
Returns the estimated total encoded bytes for this column writer.
This includes:
- Data buffered in encoded form
- An estimate of how large the data buffered in un-encoded form would be once encoded
This value should be less than or equal to Self::memory_size
Trait Implementations§
Auto Trait Implementations§
impl !Freeze for ArrowColumnWriter
impl !RefUnwindSafe for ArrowColumnWriter
impl Send for ArrowColumnWriter
impl !Sync for ArrowColumnWriter
impl Unpin for ArrowColumnWriter
impl !UnwindSafe for ArrowColumnWriter
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left is true.
Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self> otherwise. Read more