# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from numbers import Integral
import warnings

from pyarrow.lib import Table
import pyarrow._orc as _orc
from pyarrow.fs import _resolve_filesystem_and_path
class ORCFile:
    """
    Reader interface for a single ORC file

    Parameters
    ----------
    source : str or pyarrow.NativeFile
        Readable source. For passing Python file objects or byte
        buffers, see pyarrow.io.PythonFileInterface or
        pyarrow.io.BufferReader.
    """
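    # A minimal constructor sketch: every accessor below goes through
    # ``self.reader``, so this follows upstream pyarrow in wrapping the
    # Cython pyarrow._orc.ORCReader and opening it against ``source``.
    def __init__(self, source):
        self.reader = _orc.ORCReader()
        self.reader.open(source)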
    @property
    def metadata(self):
        """The file metadata, as an arrow KeyValueMetadata"""
        return self.reader.metadata()

    @property
    def schema(self):
        """The file schema, as an arrow schema"""
        return self.reader.schema()

    @property
    def nrows(self):
        """The number of rows in the file"""
        return self.reader.nrows()

    @property
    def nstripes(self):
        """The number of stripes in the file"""
        return self.reader.nstripes()

    @property
    def file_version(self):
        """Format version of the ORC file, must be 0.11 or 0.12"""
        return self.reader.file_version()

    @property
    def software_version(self):
        """Software instance and version that wrote this file"""
        return self.reader.software_version()

    @property
    def compression(self):
        """Compression codec of the file"""
        return self.reader.compression()

    @property
    def compression_size(self):
        """Number of bytes to buffer for the compression codec in the file"""
        return self.reader.compression_size()

    @property
    def writer(self):
        """Name of the writer that wrote this file.
        If the writer is unknown then its Writer ID
        (a number) is returned"""
        return self.reader.writer()

    @property
    def writer_version(self):
        """Version of the writer"""
        return self.reader.writer_version()

    @property
    def row_index_stride(self):
        """Number of rows per entry in the row index
        or 0 if there is no row index"""
        return self.reader.row_index_stride()

    @property
    def nstripe_statistics(self):
        """Number of stripe statistics"""
        return self.reader.nstripe_statistics()

    @property
    def content_length(self):
        """Length of the data stripes in the file in bytes"""
        return self.reader.content_length()

    @property
    def stripe_statistics_length(self):
        """The number of compressed bytes in the file stripe statistics"""
        return self.reader.stripe_statistics_length()

    @property
    def file_footer_length(self):
        """The number of compressed bytes in the file footer"""
        return self.reader.file_footer_length()

    @property
    def file_postscript_length(self):
        """The number of bytes in the file postscript"""
        return self.reader.file_postscript_length()

    @property
    def file_length(self):
        """The number of bytes in the file"""
        return self.reader.file_length()

    def _select_names(self, columns=None):
        if columns is None:
            return None

        schema = self.schema
        names = []
        for col in columns:
            if isinstance(col, Integral):
                col = int(col)
                if 0 <= col < len(schema):
                    col = schema[col].name
                    names.append(col)
                else:
                    raise ValueError("Column indices must be in 0 <= ind < %d,"
                                     " got %d" % (len(schema), col))
            else:
                return columns

        return names
    def read_stripe(self, n, columns=None):
        """Read a single stripe from the file.

        Parameters
        ----------
        n : int
            The stripe index
        columns : list
            If not None, only these columns will be read
            from the stripe. A column name may be a prefix of a
            nested field, e.g. 'a' will select 'a.b', 'a.c', and 'a.d.e'

        Returns
        -------
        pyarrow.RecordBatch
            Content of the stripe as a RecordBatch.
        """
        columns = self._select_names(columns)
        return self.reader.read_stripe(n, columns=columns)
    def read(self, columns=None):
        """Read the whole file.

        Parameters
        ----------
        columns : list
            If not None, only these columns will be read from the file.
            A column name may be a prefix of a nested field, e.g. 'a' will
            select 'a.b', 'a.c', and 'a.d.e'. Output always follows the
            ordering of the file and not the `columns` list.

        Returns
        -------
        pyarrow.Table
            Content of the file as a Table.
        """
        columns = self._select_names(columns)
        return self.reader.read(columns=columns)
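# A short usage sketch for ORCFile (the file name 'example.orc' is
# illustrative, not part of the library):
#
#     from pyarrow import orc
#
#     f = orc.ORCFile('example.orc')
#     f.nrows, f.nstripes            # file-level metadata
#     table = f.read(columns=['a'])  # column subset as a pyarrow.Table
#     batch = f.read_stripe(0)       # one stripe as a pyarrow.RecordBatch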
_orc_writer_args_docs = """file_version : {"0.11", "0.12"}, default "0.12"
    Determine which ORC file version to use.
    `Hive 0.11 / ORC v0 <https://orc.apache.org/specification/ORCv0/>`_
    is the older version while
    `Hive 0.12 / ORC v1 <https://orc.apache.org/specification/ORCv1/>`_
    is the newer one.
batch_size : int, default 1024
    Number of rows the ORC writer writes at a time.
stripe_size : int, default 64 * 1024 * 1024
    Size of each ORC stripe in bytes.
compression : string, default 'uncompressed'
    The compression codec.
    Valid values: {'UNCOMPRESSED', 'SNAPPY', 'ZLIB', 'LZ4', 'ZSTD'}
    Note that LZO is currently not supported.
compression_block_size : int, default 64 * 1024
    Size of each compression block in bytes.
compression_strategy : string, default 'speed'
    The compression strategy, i.e. speed vs size reduction.
    Valid values: {'SPEED', 'COMPRESSION'}
row_index_stride : int, default 10000
    The row index stride, i.e. the number of rows per entry in the
    row index.
padding_tolerance : double, default 0.0
    The padding tolerance.
dictionary_key_size_threshold : double, default 0.0
    The dictionary key size threshold. 0 to disable dictionary encoding.
    1 to always enable dictionary encoding.
bloom_filter_columns : None, set-like or list-like, default None
    Columns that use the bloom filter.
bloom_filter_fpp : double, default 0.05
    Upper limit of the false-positive rate of the bloom filter.
"""
class ORCWriter:
    __doc__ = """
Writer interface for a single ORC file

Parameters
----------
where : str or pyarrow.io.NativeFile
    Writable target. For passing Python file objects or byte
    buffers, see pyarrow.io.PythonFileInterface,
    pyarrow.io.BufferOutputStream or pyarrow.io.FixedSizeBufferWriter.
{}
""".format(_orc_writer_args_docs)

    is_open = False
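    # Minimal constructor and context-manager sketch: write_table() below
    # drives this class via ``with ORCWriter(where, **opts) as writer``, and
    # write()/close() expect an open pyarrow._orc.ORCWriter behind
    # ``self.writer``. The keyword arguments mirror the ones write_table()
    # forwards here (documented in _orc_writer_args_docs above).
    def __init__(self, where, *, file_version='0.12', batch_size=1024,
                 stripe_size=64 * 1024 * 1024, compression='uncompressed',
                 compression_block_size=65536, compression_strategy='speed',
                 row_index_stride=10000, padding_tolerance=0.0,
                 dictionary_key_size_threshold=0.0,
                 bloom_filter_columns=None, bloom_filter_fpp=0.05):
        self.writer = _orc.ORCWriter()
        self.writer.open(
            where, file_version=file_version, batch_size=batch_size,
            stripe_size=stripe_size, compression=compression,
            compression_block_size=compression_block_size,
            compression_strategy=compression_strategy,
            row_index_stride=row_index_stride,
            padding_tolerance=padding_tolerance,
            dictionary_key_size_threshold=dictionary_key_size_threshold,
            bloom_filter_columns=bloom_filter_columns,
            bloom_filter_fpp=bloom_filter_fpp
        )
        self.is_open = True

    def __del__(self):
        self.close()

    def __enter__(self):
        return self

    def __exit__(self, *args, **kwargs):
        self.close()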
    def write(self, table):
        """
        Write the table into an ORC file. The schema of the table must
        be equal to the schema used when opening the ORC file.

        Parameters
        ----------
        table : pyarrow.Table
            The table to be written into the ORC file
        """
        assert self.is_open
        self.writer.write(table)
    def close(self):
        """
        Close the ORC file
        """
        if self.is_open:
            self.writer.close()
            self.is_open = False
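# A minimal read_table() sketch consistent with the docstring assigned
# below and with how ORCFile is used above: resolve a filesystem from the
# path when one applies, open the input file, and delegate to
# ORCFile.read(). An empty ``columns`` list goes through Table.select()
# so the result keeps the correct num_rows while carrying no columns.
def read_table(source, columns=None, filesystem=None):
    filesystem, path = _resolve_filesystem_and_path(source, filesystem)
    if filesystem is not None:
        source = filesystem.open_input_file(path)
    if columns is not None and len(columns) == 0:
        result = ORCFile(source).read().select(columns)
    else:
        result = ORCFile(source).read(columns=columns)
    return result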
read_table.__doc__ = """
Read a Table from an ORC file.

Parameters
----------
source : str, pyarrow.NativeFile, or file-like object
    If a string is passed, it must be a single file name. For file-like
    objects, only read a single file. Use pyarrow.BufferReader to read
    a file contained in a bytes or buffer-like object.
columns : list
    If not None, only these columns will be read from the file. A column
    name may be a prefix of a nested field, e.g. 'a' will select 'a.b',
    'a.c', and 'a.d.e'. Output always follows the ordering of the file
    and not the `columns` list. If empty, no columns will be read. Note
    that the table will still have the correct num_rows set despite
    having no columns.
filesystem : FileSystem, default None
    If nothing passed, will be inferred based on path.
    Path will try to be found in the local on-disk filesystem; otherwise
    it will be parsed as a URI to determine the filesystem.
"""
def write_table(table, where, *, file_version='0.12',
                batch_size=1024,
                stripe_size=64 * 1024 * 1024,
                compression='uncompressed',
                compression_block_size=65536,
                compression_strategy='speed',
                row_index_stride=10000,
                padding_tolerance=0.0,
                dictionary_key_size_threshold=0.0,
                bloom_filter_columns=None,
                bloom_filter_fpp=0.05):
    if isinstance(where, Table):
        warnings.warn(
            "The order of the arguments has changed. Pass as "
            "'write_table(table, where)' instead. The old order will raise "
            "an error in the future.", FutureWarning, stacklevel=2
        )
        table, where = where, table
    with ORCWriter(
        where,
        file_version=file_version,
        batch_size=batch_size,
        stripe_size=stripe_size,
        compression=compression,
        compression_block_size=compression_block_size,
        compression_strategy=compression_strategy,
        row_index_stride=row_index_stride,
        padding_tolerance=padding_tolerance,
        dictionary_key_size_threshold=dictionary_key_size_threshold,
        bloom_filter_columns=bloom_filter_columns,
        bloom_filter_fpp=bloom_filter_fpp
    ) as writer:
        writer.write(table)
write_table.__doc__ = """
Write a table into an ORC file.

Parameters
----------
table : pyarrow.lib.Table
    The table to be written into the ORC file
where : str or pyarrow.io.NativeFile
    Writable target. For passing Python file objects or byte buffers,
    see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream
    or pyarrow.io.FixedSizeBufferWriter.
{}
""".format(_orc_writer_args_docs)
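# An end-to-end usage sketch for the module-level helpers (the file name
# and data are illustrative):
#
#     import pyarrow as pa
#     from pyarrow import orc
#
#     table = pa.table({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
#     orc.write_table(table, 'example.orc', compression='zstd')
#     roundtripped = orc.read_table('example.orc', columns=['a'])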