/// Arrow-specific configuration settings for writing parquet files.
///
/// Start from [`ArrowWriterOptions::new`] (default settings) and adjust individual
/// settings with the `with_*` builder methods.
pub struct ArrowWriterOptions {
/// The `WriterProperties` used for writing parquet files
/// (set via `with_properties`).
properties: WriterProperties,
/// When `true`, the embedded arrow schema metadata is not encoded into the file.
/// Defaults to `false` (set via `with_skip_arrow_metadata`).
skip_arrow_metadata: bool,
/// Name of the root parquet schema element; when unset the name defaults to
/// `"arrow_schema"` (set via `with_schema_root`).
schema_root: Option<String>,
/// Explicit parquet schema to use. When omitted, the `ArrowSchemaConverter` is used
/// to compute the `SchemaDescriptor` (set via `with_parquet_schema`).
schema_descr: Option<SchemaDescriptor>,
/// Factory for the `PageStore` that buffers completed pages while a row group is
/// being written. By default an `InMemoryPageStore` per column chunk is used
/// (set via `with_page_store_factory`).
page_store_factory: Option<Arc<dyn PageStoreFactory>>,
}Expand description
Arrow-specific configuration settings for writing parquet files.
See ArrowWriter for how to configure the writer.
Fields§
§properties: WriterProperties
§skip_arrow_metadata: bool
§schema_root: Option<String>
§schema_descr: Option<SchemaDescriptor>
§page_store_factory: Option<Arc<dyn PageStoreFactory>>
Implementations§
Source§impl ArrowWriterOptions
impl ArrowWriterOptions
Sourcepub fn new() -> Self
pub fn new() -> Self
Creates a new ArrowWriterOptions with the default settings.
Sourcepub fn with_properties(self, properties: WriterProperties) -> Self
pub fn with_properties(self, properties: WriterProperties) -> Self
Sets the WriterProperties for writing parquet files.
Sourcepub fn with_page_store_factory(
self,
page_store_factory: Arc<dyn PageStoreFactory>,
) -> Self
pub fn with_page_store_factory( self, page_store_factory: Arc<dyn PageStoreFactory>, ) -> Self
Sets the PageStoreFactory used to buffer completed pages while a row
group is being written.
By default (an InMemoryPageStore per column chunk) completed pages
are buffered on the heap until the row group is flushed, so peak memory
grows with the row group size. Supplying a factory that spills to a temp
file or object storage instead bounds peak write memory, decoupling it
from the row group size while keeping large, read-optimal row groups.
§Example: a custom PageStore
A store only has to map an opaque, store-allocated PageKey to a blob
and hand the blob back once. The keys need not be dense or sequential —
here a HashMap-backed store mints sparse handles, proving the writer
relies only on the opaque-handle contract. A real spilling backend would
write the bytes to a temp file in put and read them back in take.
/// A `HashMap`-backed page store that hands out sparse, opaque handles.
#[derive(Default)]
struct MapPageStore {
    /// Blobs currently held by the store, indexed by the raw handle value.
    blobs: HashMap<u64, Bytes>,
    /// Raw value of the next handle to mint.
    next: u64,
}

impl PageStore for MapPageStore {
    /// Stores `value` and returns the freshly minted handle for it.
    fn put(&mut self, value: Bytes) -> Result<PageKey> {
        // Handles advance by two, so they are deliberately sparse: the
        // writer must treat the key as opaque and never rely on its value.
        let handle = PageKey::new(self.next);
        self.blobs.insert(handle.get(), value);
        self.next += 2;
        Ok(handle)
    }

    /// Removes and returns the blob stored under `key`; a key that is not
    /// present (unknown or already taken) is reported as an error.
    fn take(&mut self, key: PageKey) -> Result<Bytes> {
        let id = key.get();
        match self.blobs.remove(&id) {
            Some(blob) => Ok(blob),
            None => Err(ParquetError::General(format!("invalid key {}", id))),
        }
    }
}
/// Factory that creates a fresh `MapPageStore` on every `create` call.
#[derive(Debug)]
struct MapPageStoreFactory;

impl PageStoreFactory for MapPageStoreFactory {
    fn create(&self, args: &PageStoreArgs<'_>) -> Result<Box<dyn PageStore>> {
        // The column index and descriptor (physical/logical type, path) are
        // available through `args`; a real backend could use them to spill
        // only large columns. This example reads them and ignores them.
        let _column_index = args.column_index();
        let _descriptor = args.column_descriptor();

        let store: Box<dyn PageStore> = Box::new(MapPageStore::default());
        Ok(store)
    }
}
// Build a single-column batch holding the values 0..1000.
let values: ArrayRef = Arc::new(Int64Array::from_iter_values(0..1000));
let to_write = RecordBatch::try_from_iter([("col", values)]).unwrap();

// Route completed pages through the custom store.
let factory = Arc::new(MapPageStoreFactory);
let options = ArrowWriterOptions::new().with_page_store_factory(factory);

// Write the batch into an in-memory buffer and finish the file.
let mut buffer = Vec::new();
let mut writer =
    ArrowWriter::try_new_with_options(&mut buffer, to_write.schema(), options).unwrap();
writer.write(&to_write).unwrap();
writer.close().unwrap();

// The file is byte-identical to one written with the default store.
let mut reader = ParquetRecordBatchReader::try_new(Bytes::from(buffer), 1024).unwrap();
assert_eq!(to_write, reader.next().unwrap().unwrap());
Sourcepub fn with_skip_arrow_metadata(self, skip_arrow_metadata: bool) -> Self
pub fn with_skip_arrow_metadata(self, skip_arrow_metadata: bool) -> Self
Skip encoding the embedded arrow metadata (defaults to false)
Parquet files generated by the ArrowWriter contain embedded arrow schema
by default.
Set skip_arrow_metadata to true to skip encoding the embedded metadata.
Sourcepub fn with_schema_root(self, schema_root: String) -> Self
pub fn with_schema_root(self, schema_root: String) -> Self
Set the name of the root parquet schema element (defaults to "arrow_schema")
Sourcepub fn with_parquet_schema(self, schema_descr: SchemaDescriptor) -> Self
pub fn with_parquet_schema(self, schema_descr: SchemaDescriptor) -> Self
Explicitly specify the Parquet schema to be used
If omitted (the default), the ArrowSchemaConverter is used to compute the
Parquet SchemaDescriptor. This may be used when the SchemaDescriptor is
already known or must be calculated using custom logic.
Trait Implementations§
Source§impl Clone for ArrowWriterOptions
impl Clone for ArrowWriterOptions
Source§fn clone(&self) -> ArrowWriterOptions
fn clone(&self) -> ArrowWriterOptions
1.0.0 (const: unstable) · Source§fn clone_from(&mut self, source: &Self)
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source. Read more