parquet/lib.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//!
19//! This crate contains the official Native Rust implementation of
20//! [Apache Parquet](https://parquet.apache.org/), part of
21//! the [Apache Arrow](https://arrow.apache.org/) project.
22//! The crate provides a number of APIs to read and write Parquet files,
23//! covering a range of use cases.
24//!
25//! Please see the [parquet crates.io](https://crates.io/crates/parquet)
26//! page for feature flags and tips to improve performance.
27//!
28//! # Format Overview
29//!
30//! Parquet is a columnar format, which means that unlike row formats like [CSV], values are
31//! iterated along columns instead of rows. Parquet is similar in spirit to [Arrow], but
32//! focuses on storage efficiency whereas Arrow prioritizes compute efficiency.
33//!
34//! Parquet files are partitioned for scalability. Each file contains metadata,
35//! along with zero or more "row groups", each row group containing one or
36//! more columns. The APIs in this crate reflect this structure.
37//!
//! Data in Parquet files is strongly typed and differentiates between logical
//! and physical types (see [`schema`]). In addition, Parquet files may contain
//! other metadata, such as statistics, which can be used to optimize reading
//! (see [`file::metadata`]).
//! For more details about the Parquet format itself, see the [Parquet spec].
43//!
44//! [Parquet spec]: https://github.com/apache/parquet-format/blob/master/README.md#file-format
45//!
46//! # APIs
47//!
48//! This crate exposes a number of APIs for different use-cases.
49//!
50//! ## Metadata and Schema
51//!
52//! The [`schema`] module provides APIs to work with Parquet schemas. The
53//! [`file::metadata`] module provides APIs to work with Parquet metadata.
54//!
55//! ## Read/Write Arrow
56//!
57//! The [`arrow`] module allows reading and writing Parquet data to/from Arrow `RecordBatch`.
58//! This makes for a simple and performant interface to parquet data, whilst allowing workloads
59//! to leverage the wide range of data transforms provided by the [arrow] crate, and by the
60//! ecosystem of libraries and services using [Arrow] as an interop format.
61//!
62//! ## Read/Write Arrow Async
63//!
//! When the `async` feature is enabled, [`arrow::async_reader`] and [`arrow::async_writer`]
//! provide the ability to read and write [`arrow`] data asynchronously. Additionally, when the
//! `object_store` feature is enabled, [`ParquetObjectReader`](arrow::async_reader::ParquetObjectReader)
//! provides efficient integration with object storage services such as S3 via the [object_store]
//! crate, automatically optimizing IO based on any predicates or projections provided.
69//!
70//! ## Read/Write Parquet
71//!
//! Workloads needing finer-grained control, or wishing to avoid a dependence on arrow,
//! can use the lower-level APIs in [`mod@file`]. These APIs expose the underlying parquet
//! data model, and therefore require knowledge of the underlying parquet format,
//! including the details of [Dremel] record shredding and [Logical Types]. Most workloads
//! should prefer the arrow interfaces.
77//!
78//! [arrow]: https://docs.rs/arrow/latest/arrow/index.html
79//! [Arrow]: https://arrow.apache.org/
80//! [CSV]: https://en.wikipedia.org/wiki/Comma-separated_values
81//! [Dremel]: https://research.google/pubs/pub36632/
82//! [Logical Types]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
83//! [object_store]: https://docs.rs/object_store/latest/object_store/
84
85#![doc(
86 html_logo_url = "https://raw.githubusercontent.com/apache/parquet-format/25f05e73d8cd7f5c83532ce51cb4f4de8ba5f2a2/logo/parquet-logos_1.svg",
87 html_favicon_url = "https://raw.githubusercontent.com/apache/parquet-format/25f05e73d8cd7f5c83532ce51cb4f4de8ba5f2a2/logo/parquet-logos_1.svg"
88)]
89#![cfg_attr(docsrs, feature(doc_auto_cfg))]
90#![warn(missing_docs)]
/// Declares a module whose public API is experimental.
///
/// This is the definition used when the `experimental` feature flag is
/// enabled: the module is always declared `pub`, whatever visibility was
/// written at the call site, and is marked `#[doc(hidden)]` so that it is
/// left out of the rendered documentation.
///
/// Experimental components carry no stability guarantees.
#[cfg(feature = "experimental")]
macro_rules! experimental {
    ($(#[$attr:meta])* $visibility:vis mod $name:ident) => {
        #[doc(hidden)]
        $(#[$attr])*
        pub mod $name;
    };
}
105
/// Definition of `experimental!` used when the `experimental` feature flag
/// is disabled: the module is declared with the attributes supplied by the
/// caller and exactly the visibility written at the call site.
#[cfg(not(feature = "experimental"))]
macro_rules! experimental {
    ($(#[$attr:meta])* $visibility:vis mod $name:ident) => {
        $(#[$attr])*
        $visibility mod $name;
    };
}
113
114#[macro_use]
115pub mod errors;
116pub mod basic;
117
/// Automatically generated code from the Parquet thrift definition.
///
/// This module is code generated from [parquet.thrift]. See [crate::file] for
/// more information on reading Parquet encoded data.
///
/// [parquet.thrift]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
// see parquet/CONTRIBUTING.md for instructions on regenerating
// Don't run clippy or rustfmt on auto-generated code
126#[allow(clippy::all, missing_docs)]
127#[rustfmt::skip]
128pub mod format;
129
130#[macro_use]
131pub mod data_type;
132
133// Exported for external use, such as benchmarks
134#[cfg(feature = "experimental")]
135#[doc(hidden)]
136pub use self::encodings::{decoding, encoding};
137
138experimental!(#[macro_use] mod util);
139
140pub use util::utf8;
141
142#[cfg(feature = "arrow")]
143pub mod arrow;
144pub mod column;
145experimental!(mod compression);
146experimental!(mod encodings);
147pub mod bloom_filter;
148pub mod file;
149pub mod record;
150pub mod schema;
151
152pub mod thrift;