/// Arrow-specific configuration settings for writing parquet files.
///
/// Values are supplied through the `with_*` builder methods.
pub struct ArrowWriterOptions {
/// The `WriterProperties` used for writing parquet files.
properties: WriterProperties,
/// When true, the embedded arrow metadata is not encoded (defaults to false).
skip_arrow_metadata: bool,
/// Name of the root parquet schema element (defaults to "arrow_schema").
schema_root: Option<String>,
/// Explicit Parquet schema; if omitted, the `ArrowSchemaConverter` computes it.
schema_descr: Option<SchemaDescriptor>,
/// Factory for the page store that buffers completed pages while a row
/// group is being written.
page_store_factory: Option<Arc<dyn PageStoreFactory>>,
}Expand description
Arrow-specific configuration settings for writing parquet files.
See ArrowWriter for how to configure the writer.
Fields§
§properties: WriterProperties
§skip_arrow_metadata: bool
§schema_root: Option<String>
§schema_descr: Option<SchemaDescriptor>
§page_store_factory: Option<Arc<dyn PageStoreFactory>>
Implementations§
Source§impl ArrowWriterOptions
impl ArrowWriterOptions
Sourcepub fn new() -> Self
pub fn new() -> Self
Creates a new ArrowWriterOptions with the default settings.
Sourcepub fn with_properties(self, properties: WriterProperties) -> Self
pub fn with_properties(self, properties: WriterProperties) -> Self
Sets the WriterProperties for writing parquet files.
Sourcepub fn with_page_store_factory(
self,
page_store_factory: Arc<dyn PageStoreFactory>,
) -> Self
pub fn with_page_store_factory( self, page_store_factory: Arc<dyn PageStoreFactory>, ) -> Self
Sets the PageStoreFactory used to buffer completed pages while a row
group is being written.
The default implementation (InMemoryPageStore) buffers all completed
pages on the heap until the row group is flushed, so peak write memory
grows with the row group size. Using this API, pages can be spilled to a
file or object storage instead, reducing peak write memory substantially
at the expense of an extra write to and read from secondary storage.
§Example: spilling pages to a temp file
A simple spilling backend uses one temp file per column chunk; put
appends the page and take reads it back.
/// Page store that keeps its pages in a single file.
struct TempFilePageStore {
    /// Backing file that holds the bytes of every stored page
    file: File,
    /// Number of bytes written so far, i.e. the offset of the next page
    end: u64,
    /// One `(offset, len)` entry per stored page, in the order pages were added
    locs: Vec<(u64, usize)>,
}
/// Stores pages by appending them to the backing file and reads them back by key.
impl PageStore for TempFilePageStore {
    /// Appends `value` to the end of the file and returns the key identifying it.
    ///
    /// The key wraps the index of the page's `(offset, len)` entry in `locs`.
    fn put(&mut self, value: Bytes) -> Result<PageKey> {
        // `take` moves the file cursor, so reposition at the end before appending
        self.file.seek(SeekFrom::Start(self.end))?;
        self.file.write_all(&value)?;
        let key = PageKey::new(self.locs.len() as u64);
        // Record the page only after the write succeeded, so a failed write
        // leaves the bookkeeping untouched
        self.locs.push((self.end, value.len()));
        self.end += value.len() as u64;
        Ok(key)
    }

    /// Reads back the page stored under `key`.
    ///
    /// Returns an error (rather than panicking) when `key` does not refer to
    /// a page previously stored with `put`, or when reading the file fails.
    fn take(&mut self, key: PageKey) -> Result<Bytes> {
        // Keys are created in `put` from a `usize` index, so this cast round-trips
        let index = key.get() as usize;
        // Checked lookup: an unknown key becomes an I/O error, converted by `?`
        // exactly like the file errors below
        let (offset, len) = self.locs.get(index).copied().ok_or_else(|| {
            std::io::Error::new(
                std::io::ErrorKind::NotFound,
                format!("no page stored under key {index}"),
            )
        })?;
        let mut buf = vec![0u8; len];
        self.file.seek(SeekFrom::Start(offset))?;
        self.file.read_exact(&mut buf)?;
        Ok(Bytes::from(buf))
    }
}
/// Builds a fresh [`TempFilePageStore`] on every call to `create`.
#[derive(Debug)]
struct TempFilePageStoreFactory;

impl PageStoreFactory for TempFilePageStoreFactory {
    /// Creates a page store backed by a new, empty temp file.
    fn create(&self, args: &PageStoreArgs<'_>) -> Result<Box<dyn PageStore>> {
        // The column index and descriptor (physical/logical type, path) are
        // available on `args`; a real backend might use them to spill only
        // large columns. This example ignores them.
        let _ = (args.column_index(), args.column_descriptor());

        // The temp file is cleaned up when the store is dropped
        let backing_file = tempfile::tempfile()?;
        let store: Box<dyn PageStore> = Box::new(TempFilePageStore {
            file: backing_file,
            end: 0,
            locs: Vec::new(),
        });
        Ok(store)
    }
}
// Build a single-column batch holding the integers 0..1000
let values: ArrayRef = Arc::new(Int64Array::from_iter_values(0..1000));
let to_write = RecordBatch::try_from_iter([("col", values)]).unwrap();

// Route completed pages through the temp-file backed store
let spill_factory = Arc::new(TempFilePageStoreFactory);
let options = ArrowWriterOptions::new().with_page_store_factory(spill_factory);

// Write the batch into an in-memory sink
let mut parquet_bytes = Vec::new();
let mut writer =
    ArrowWriter::try_new_with_options(&mut parquet_bytes, to_write.schema(), options).unwrap();
writer.write(&to_write).unwrap();
writer.close().unwrap();

// The sink now holds valid Parquet data, which can be read as normal:
let mut reader =
    ParquetRecordBatchReader::try_new(Bytes::from(parquet_bytes), 1024).unwrap();
assert_eq!(to_write, reader.next().unwrap().unwrap());
Sourcepub fn with_skip_arrow_metadata(self, skip_arrow_metadata: bool) -> Self
pub fn with_skip_arrow_metadata(self, skip_arrow_metadata: bool) -> Self
Skip encoding the embedded arrow metadata (defaults to false)
Parquet files generated by the ArrowWriter contain embedded arrow schema
by default.
Set skip_arrow_metadata to true to skip encoding the embedded metadata.
Sourcepub fn with_schema_root(self, schema_root: String) -> Self
pub fn with_schema_root(self, schema_root: String) -> Self
Set the name of the root parquet schema element (defaults to "arrow_schema")
Sourcepub fn with_parquet_schema(self, schema_descr: SchemaDescriptor) -> Self
pub fn with_parquet_schema(self, schema_descr: SchemaDescriptor) -> Self
Explicitly specify the Parquet schema to be used
If omitted (the default), the ArrowSchemaConverter is used to compute the
Parquet SchemaDescriptor. This may be used when the SchemaDescriptor is
already known or must be calculated using custom logic.
Trait Implementations§
Source§impl Clone for ArrowWriterOptions
impl Clone for ArrowWriterOptions
Source§fn clone(&self) -> ArrowWriterOptions
fn clone(&self) -> ArrowWriterOptions
1.0.0 (const: unstable) · Source§fn clone_from(&mut self, source: &Self)
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source. Read more