# Source code for pyarrow.orc

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.


from numbers import Integral
import warnings

from pyarrow.lib import Table
import pyarrow._orc as _orc
from pyarrow.fs import _resolve_filesystem_and_path


[docs]class ORCFile: """ Reader interface for a single ORC file Parameters ---------- source : str or pyarrow.NativeFile Readable source. For passing Python file objects or byte buffers, see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader. """
[docs] def __init__(self, source): self.reader = _orc.ORCReader() self.reader.open(source)
@property def metadata(self): """The file metadata, as an arrow KeyValueMetadata""" return self.reader.metadata() @property def schema(self): """The file schema, as an arrow schema""" return self.reader.schema() @property def nrows(self): """The number of rows in the file""" return self.reader.nrows() @property def nstripes(self): """The number of stripes in the file""" return self.reader.nstripes() @property def file_version(self): """Format version of the ORC file, must be 0.11 or 0.12""" return self.reader.file_version() @property def software_version(self): """Software instance and version that wrote this file""" return self.reader.software_version() @property def compression(self): """Compression codec of the file""" return self.reader.compression() @property def compression_size(self): """Number of bytes to buffer for the compression codec in the file""" return self.reader.compression_size() @property def writer(self): """Name of the writer that wrote this file. If the writer is unknown then its Writer ID (a number) is returned""" return self.reader.writer() @property def writer_version(self): """Version of the writer""" return self.reader.writer_version() @property def row_index_stride(self): """Number of rows per an entry in the row index or 0 if there is no row index""" return self.reader.row_index_stride() @property def nstripe_statistics(self): """Number of stripe statistics""" return self.reader.nstripe_statistics() @property def content_length(self): """Length of the data stripes in the file in bytes""" return self.reader.content_length() @property def stripe_statistics_length(self): """The number of compressed bytes in the file stripe statistics""" return self.reader.stripe_statistics_length() @property def file_footer_length(self): """The number of compressed bytes in the file footer""" return self.reader.file_footer_length() @property def file_postscript_length(self): """The number of bytes in the file postscript""" return 
self.reader.file_postscript_length() @property def file_length(self): """The number of bytes in the file""" return self.reader.file_length() def _select_names(self, columns=None): if columns is None: return None schema = self.schema names = [] for col in columns: if isinstance(col, Integral): col = int(col) if 0 <= col < len(schema): col = schema[col].name names.append(col) else: raise ValueError("Column indices must be in 0 <= ind < %d," " got %d" % (len(schema), col)) else: return columns return names
[docs] def read_stripe(self, n, columns=None): """Read a single stripe from the file. Parameters ---------- n : int The stripe index columns : list If not None, only these columns will be read from the stripe. A column name may be a prefix of a nested field, e.g. 'a' will select 'a.b', 'a.c', and 'a.d.e' Returns ------- pyarrow.RecordBatch Content of the stripe as a RecordBatch. """ columns = self._select_names(columns) return self.reader.read_stripe(n, columns=columns)
[docs] def read(self, columns=None): """Read the whole file. Parameters ---------- columns : list If not None, only these columns will be read from the file. A column name may be a prefix of a nested field, e.g. 'a' will select 'a.b', 'a.c', and 'a.d.e' Returns ------- pyarrow.Table Content of the file as a Table. """ columns = self._select_names(columns) return self.reader.read(columns=columns)
_orc_writer_args_docs = """file_version : {"0.11", "0.12"}, default "0.12" Determine which ORC file version to use. `Hive 0.11 / ORC v0 <https://orc.apache.org/specification/ORCv0/>`_ is the older version while `Hive 0.12 / ORC v1 <https://orc.apache.org/specification/ORCv1/>`_ is the newer one. batch_size : int, default 1024 Number of rows the ORC writer writes at a time. stripe_size : int, default 64 * 1024 * 1024 Size of each ORC stripe in bytes. compression : string, default 'uncompressed' The compression codec. Valid values: {'UNCOMPRESSED', 'SNAPPY', 'ZLIB', 'LZ4', 'ZSTD'} Note that LZ0 is currently not supported. compression_block_size : int, default 64 * 1024 Size of each compression block in bytes. compression_strategy : string, default 'speed' The compression strategy i.e. speed vs size reduction. Valid values: {'SPEED', 'COMPRESSION'} row_index_stride : int, default 10000 The row index stride i.e. the number of rows per an entry in the row index. padding_tolerance : double, default 0.0 The padding tolerance. dictionary_key_size_threshold : double, default 0.0 The dictionary key size threshold. 0 to disable dictionary encoding. 1 to always enable dictionary encoding. bloom_filter_columns : None, set-like or list-like, default None Columns that use the bloom filter. bloom_filter_fpp : double, default 0.05 Upper limit of the false-positive rate of the bloom filter. """
class ORCWriter:
    __doc__ = """
Writer interface for a single ORC file

Parameters
----------
where : str or pyarrow.io.NativeFile
    Writable target. For passing Python file objects or byte buffers,
    see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream
    or pyarrow.io.FixedSizeBufferWriter.
{}
""".format(_orc_writer_args_docs)

    # Class-level default means close() is a safe no-op even when
    # __init__ never ran to completion (e.g. open() raised).
    is_open = False

    def __init__(self, where, *,
                 file_version='0.12',
                 batch_size=1024,
                 stripe_size=64 * 1024 * 1024,
                 compression='uncompressed',
                 compression_block_size=65536,
                 compression_strategy='speed',
                 row_index_stride=10000,
                 padding_tolerance=0.0,
                 dictionary_key_size_threshold=0.0,
                 bloom_filter_columns=None,
                 bloom_filter_fpp=0.05,
                 ):
        self.writer = _orc.ORCWriter()
        # Collect the keyword options once, then forward them wholesale
        # to the Cython-level writer.
        options = dict(
            file_version=file_version,
            batch_size=batch_size,
            stripe_size=stripe_size,
            compression=compression,
            compression_block_size=compression_block_size,
            compression_strategy=compression_strategy,
            row_index_stride=row_index_stride,
            padding_tolerance=padding_tolerance,
            dictionary_key_size_threshold=dictionary_key_size_threshold,
            bloom_filter_columns=bloom_filter_columns,
            bloom_filter_fpp=bloom_filter_fpp,
        )
        self.writer.open(where, **options)
        self.is_open = True

    def __del__(self):
        # Best-effort finalization; close() guards on is_open.
        self.close()

    def __enter__(self):
        return self

    def __exit__(self, *args, **kwargs):
        self.close()

    def write(self, table):
        """
        Write the table into an ORC file. The schema of the table must
        be equal to the schema used when opening the ORC file.

        Parameters
        ----------
        table : pyarrow.Table
            The table to be written into the ORC file
        """
        assert self.is_open
        self.writer.write(table)

    def close(self):
        """
        Close the ORC file
        """
        if self.is_open:
            self.writer.close()
            self.is_open = False
def read_table(source, columns=None, filesystem=None):
    # Resolve a filesystem/path pair; when one is found, re-open the
    # source through it so plain paths and URIs both work.
    filesystem, path = _resolve_filesystem_and_path(source, filesystem)
    if filesystem is not None:
        source = filesystem.open_input_file(path)

    orc_file = ORCFile(source)
    if columns is not None and len(columns) == 0:
        # Empty selection: read everything then select no columns, so
        # the resulting Table still carries the correct num_rows.
        return orc_file.read().select(columns)
    return orc_file.read(columns=columns)
read_table.__doc__ = """ Read a Table from an ORC file. Parameters ---------- source : str, pyarrow.NativeFile, or file-like object If a string passed, can be a single file name. For file-like objects, only read a single file. Use pyarrow.BufferReader to read a file contained in a bytes or buffer-like object. columns : list If not None, only these columns will be read from the file. A column name may be a prefix of a nested field, e.g. 'a' will select 'a.b', 'a.c', and 'a.d.e'. If empty, no columns will be read. Note that the table will still have the correct num_rows set despite having no columns. filesystem : FileSystem, default None If nothing passed, will be inferred based on path. Path will try to be found in the local on-disk filesystem otherwise it will be parsed as an URI to determine the filesystem. """
def write_table(table, where, *,
                file_version='0.12',
                batch_size=1024,
                stripe_size=64 * 1024 * 1024,
                compression='uncompressed',
                compression_block_size=65536,
                compression_strategy='speed',
                row_index_stride=10000,
                padding_tolerance=0.0,
                dictionary_key_size_threshold=0.0,
                bloom_filter_columns=None,
                bloom_filter_fpp=0.05):
    # Legacy compatibility: the original signature was (where, table).
    # Detect the swapped order, warn, and fix it up.
    if isinstance(where, Table):
        warnings.warn(
            "The order of the arguments has changed. Pass as "
            "'write_table(table, where)' instead. The old order will raise "
            "an error in the future.", FutureWarning, stacklevel=2
        )
        table, where = where, table

    writer_options = dict(
        file_version=file_version,
        batch_size=batch_size,
        stripe_size=stripe_size,
        compression=compression,
        compression_block_size=compression_block_size,
        compression_strategy=compression_strategy,
        row_index_stride=row_index_stride,
        padding_tolerance=padding_tolerance,
        dictionary_key_size_threshold=dictionary_key_size_threshold,
        bloom_filter_columns=bloom_filter_columns,
        bloom_filter_fpp=bloom_filter_fpp,
    )
    with ORCWriter(where, **writer_options) as sink:
        sink.write(table)
write_table.__doc__ = """ Write a table into an ORC file. Parameters ---------- table : pyarrow.lib.Table The table to be written into the ORC file where : str or pyarrow.io.NativeFile Writable target. For passing Python file objects or byte buffers, see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream or pyarrow.io.FixedSizeBufferWriter. {} """.format(_orc_writer_args_docs)