Input / output

Interfaces

class FileInterface

Subclassed by arrow::io::InputStream, arrow::io::OutputStream

Public Functions

virtual Status Close() = 0

Close the stream cleanly.

For writable streams, this will attempt to flush any pending data before releasing the underlying resource.

After Close() is called, closed() returns true and the stream is not available for further operations.

virtual Status Abort()

Close the stream abruptly.

This method does not guarantee that any pending data is flushed. It merely releases any underlying resource used by the stream for its operation.

After Abort() is called, closed() returns true and the stream is not available for further operations.

virtual Status Tell(int64_t *position) const = 0

Return the position in this stream.

virtual bool closed() const = 0

Return whether the stream is closed.

class Readable

Subclassed by arrow::io::InputStream

class Seekable

Subclassed by arrow::io::RandomAccessFile, arrow::io::WritableFile

class Writable

Subclassed by arrow::io::OutputStream

Public Functions

virtual Status Write(const void *data, int64_t nbytes) = 0

Write the given data to the stream.

This method always processes the bytes in full. Depending on the semantics of the stream, the data may be written out immediately, held in a buffer, or written asynchronously. In the case where the stream buffers the data, it will be copied. To avoid potentially large copies, use the Write variant that takes an owned Buffer.

virtual Status Write(const std::shared_ptr<Buffer> &data)

Write the given data to the stream.

Since the Buffer owns its memory, this method can avoid a copy if buffering is required. See Write(const void*, int64_t) for details.

virtual Status Flush()

Flush buffered bytes, if any.

class InputStream : public virtual arrow::io::FileInterface, public virtual arrow::io::Readable

Subclassed by arrow::io::internal::InputStreamConcurrencyWrapper< Derived >, arrow::io::RandomAccessFile, arrow::io::StdinStream, arrow::io::internal::InputStreamConcurrencyWrapper< BufferedInputStream >, arrow::io::internal::InputStreamConcurrencyWrapper< CompressedInputStream >, arrow::io::SlowInputStreamBase< InputStream >

Public Functions

Status Advance(int64_t nbytes)

Advance or skip the stream by the indicated number of bytes.

Return

Status

Parameters
  • [in] nbytes: the number of bytes to move forward

virtual Status Peek(int64_t nbytes, util::string_view *out)

Return zero-copy string_view to upcoming bytes.

Do not modify the stream position. The view becomes invalid after any operation on the stream. May trigger buffering if the requested size is larger than the number of buffered bytes.

May return NotImplemented on streams that don’t support it.

Return

Status

Parameters
  • [in] nbytes: the maximum number of bytes to see

  • [out] out: the returned arrow::util::string_view

virtual bool supports_zero_copy() const

Return true if InputStream is capable of zero copy Buffer reads.

class RandomAccessFile : public arrow::io::InputStream, public arrow::io::Seekable

Subclassed by arrow::io::HdfsReadableFile, arrow::io::internal::RandomAccessFileConcurrencyWrapper< Derived >, arrow::io::ReadWriteFileInterface, arrow::py::PyReadableFile, parquet::ParquetInputWrapper, arrow::io::internal::RandomAccessFileConcurrencyWrapper< BufferReader >, arrow::io::internal::RandomAccessFileConcurrencyWrapper< ReadableFile >, arrow::io::SlowInputStreamBase< RandomAccessFile >

Public Functions

~RandomAccessFile()

Necessary because we hold a std::unique_ptr.

virtual Status ReadAt(int64_t position, int64_t nbytes, int64_t *bytes_read, void *out)

Read nbytes at position. A default implementation using Read(…) is provided, but it can be overridden.

The default implementation is thread-safe. It is unspecified whether this method updates the file position or not.

Return

Status

Parameters
  • [in] position: Where to read bytes from

  • [in] nbytes: The number of bytes to read

  • [out] bytes_read: The number of bytes read

  • [out] out: The buffer to read bytes into

virtual Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr<Buffer> *out)

Read nbytes at position. A default implementation using Read(…) is provided, but it can be overridden.

The default implementation is thread-safe. It is unspecified whether this method updates the file position or not.

Parameters
  • [in] position: Where to read bytes from

  • [in] nbytes: The number of bytes to read

  • [out] out: The buffer to read bytes into. The number of bytes read can be retrieved by calling Buffer::size().

Public Static Functions

static std::shared_ptr<InputStream> GetStream(std::shared_ptr<RandomAccessFile> file, int64_t file_offset, int64_t nbytes)

Create an isolated InputStream that reads a segment of a RandomAccessFile.

Multiple such streams can be created and used independently without interference.

Parameters
  • [in] file: a file instance

  • [in] file_offset: the starting position in the file

  • [in] nbytes: the extent of bytes to read. The file should have sufficient bytes available

class OutputStream : public virtual arrow::io::FileInterface, public arrow::io::Writable

Subclassed by arrow::io::BufferedOutputStream, arrow::io::BufferOutputStream, arrow::io::CompressedOutputStream, arrow::io::FileOutputStream, arrow::io::HdfsOutputStream, arrow::io::MockOutputStream, arrow::io::StderrStream, arrow::io::StdoutStream, arrow::io::WritableFile, arrow::py::PyOutputStream, parquet::ParquetOutputWrapper

class ReadWriteFileInterface : public arrow::io::RandomAccessFile, public arrow::io::WritableFile

Subclassed by arrow::io::MemoryMappedFile

Concrete implementations

In-memory streams

class BufferReader : public arrow::io::internal::RandomAccessFileConcurrencyWrapper<BufferReader>

Random access zero-copy reads on an arrow::Buffer.

Subclassed by arrow::cuda::CudaBufferReader

Public Functions

BufferReader(const util::string_view &data)

Instantiate from std::string or arrow::util::string_view.

Does not own data

bool closed() const

Return whether the stream is closed.

bool supports_zero_copy() const

Return true if InputStream is capable of zero copy Buffer reads.

class MockOutputStream : public arrow::io::OutputStream

A helper class to track the size of allocations.

Writes to this stream do not copy or retain any data, they just bump a size counter that can be later used to know exactly which data size needs to be allocated for actual writing.

Public Functions

Status Close()

Close the stream cleanly.

For writable streams, this will attempt to flush any pending data before releasing the underlying resource.

After Close() is called, closed() returns true and the stream is not available for further operations.

bool closed() const

Return whether the stream is closed.

Status Tell(int64_t *position) const

Return the position in this stream.

Status Write(const void *data, int64_t nbytes)

Write the given data to the stream.

This method always processes the bytes in full. Depending on the semantics of the stream, the data may be written out immediately, held in a buffer, or written asynchronously. In the case where the stream buffers the data, it will be copied. To avoid potentially large copies, use the Write variant that takes an owned Buffer.

class BufferOutputStream : public arrow::io::OutputStream

An output stream that writes to a resizable buffer.

Public Functions

Status Close()

Close the stream, preserving the buffer (retrieve it with Finish()).

bool closed() const

Return whether the stream is closed.

Status Tell(int64_t *position) const

Return the position in this stream.

Status Write(const void *data, int64_t nbytes)

Write the given data to the stream.

This method always processes the bytes in full. Depending on the semantics of the stream, the data may be written out immediately, held in a buffer, or written asynchronously. In the case where the stream buffers the data, it will be copied. To avoid potentially large copies, use the Write variant that takes an owned Buffer.

Status Finish(std::shared_ptr<Buffer> *result)

Close the stream and return the buffer.

Status Reset(int64_t initial_capacity = 1024, MemoryPool *pool = default_memory_pool())

Initialize state of OutputStream with newly allocated memory and set position to 0.

Return

Status

Parameters
  • [in] initial_capacity: the starting allocated capacity

  • [inout] pool: the memory pool to use for allocations

Public Static Functions

static Status Create(int64_t initial_capacity, MemoryPool *pool, std::shared_ptr<BufferOutputStream> *out)

Create in-memory output stream with indicated capacity using a memory pool.

Parameters
  • [in] initial_capacity: the initial allocated internal capacity of the OutputStream

  • [inout] pool: a MemoryPool to use for allocations

  • [out] out: the created stream

class FixedSizeBufferWriter : public arrow::io::WritableFile

An output stream that writes into a fixed-size mutable buffer.

Public Functions

FixedSizeBufferWriter(const std::shared_ptr<Buffer> &buffer)

Input buffer must be mutable, will abort if not.

Status Close()

Close the stream cleanly.

For writable streams, this will attempt to flush any pending data before releasing the underlying resource.

After Close() is called, closed() returns true and the stream is not available for further operations.

bool closed() const

Return whether the stream is closed.

Status Tell(int64_t *position) const

Return the position in this stream.

Status Write(const void *data, int64_t nbytes)

Write the given data to the stream.

This method always processes the bytes in full. Depending on the semantics of the stream, the data may be written out immediately, held in a buffer, or written asynchronously. In the case where the stream buffers the data, it will be copied. To avoid potentially large copies, use the Write variant that takes an owned Buffer.

Local files

class ReadableFile : public arrow::io::internal::RandomAccessFileConcurrencyWrapper<ReadableFile>

An operating system file open in read-only mode.

Reads through this implementation are unbuffered. If many small reads need to be issued, it is recommended to use a buffering layer for good performance.

Public Functions

bool closed() const

Return whether the stream is closed.

Public Static Functions

static Status Open(const std::string &path, std::shared_ptr<ReadableFile> *file)

Open a local file for reading.

Parameters
  • [in] path: with UTF8 encoding

  • [out] file: ReadableFile instance. Opens the file, allocating memory (if needed) from the default memory pool.

static Status Open(const std::string &path, MemoryPool *pool, std::shared_ptr<ReadableFile> *file)

Open a local file for reading.

Parameters
  • [in] path: with UTF8 encoding

  • [in] pool: a MemoryPool for memory allocations

  • [out] file: ReadableFile instance. Opens the file using the caller-supplied memory pool for memory allocations.

static Status Open(int fd, std::shared_ptr<ReadableFile> *file)

Open a local file for reading.

The file descriptor becomes owned by the ReadableFile, and will be closed on Close() or destruction.
Parameters
  • [in] fd: file descriptor

  • [out] file: ReadableFile instance. Opens the file, allocating memory (if needed) from the default memory pool.

static Status Open(int fd, MemoryPool *pool, std::shared_ptr<ReadableFile> *file)

Open a local file for reading.

The file descriptor becomes owned by the ReadableFile, and will be closed on Close() or destruction.
Parameters
  • [in] fd: file descriptor

  • [in] pool: a MemoryPool for memory allocations

  • [out] file: ReadableFile instance. Opens the file using the caller-supplied memory pool for memory allocations.

class FileOutputStream : public arrow::io::OutputStream

An operating system file open in write-only mode.

Public Functions

Status Close()

Close the stream cleanly.

For writable streams, this will attempt to flush any pending data before releasing the underlying resource.

After Close() is called, closed() returns true and the stream is not available for further operations.

bool closed() const

Return whether the stream is closed.

Status Tell(int64_t *position) const

Return the position in this stream.

Status Write(const void *data, int64_t nbytes)

Write the given data to the stream.

This method always processes the bytes in full. Depending on the semantics of the stream, the data may be written out immediately, held in a buffer, or written asynchronously. In the case where the stream buffers the data, it will be copied. To avoid potentially large copies, use the Write variant that takes an owned Buffer.

Public Static Functions

static Status Open(const std::string &path, std::shared_ptr<OutputStream> *out)

Open a local file for writing, truncating any existing file.

When opening a new file, any existing file with the indicated path is truncated to 0 bytes, deleting any existing data

Parameters
  • [in] path: with UTF8 encoding

  • [out] out: a base interface OutputStream instance

static Status Open(const std::string &path, bool append, std::shared_ptr<OutputStream> *out)

Open a local file for writing.

Parameters
  • [in] path: with UTF8 encoding

  • [in] append: append to existing file, otherwise truncate to 0 bytes

  • [out] out: a base interface OutputStream instance

static Status Open(int fd, std::shared_ptr<OutputStream> *out)

Open a file descriptor for writing.

The underlying file isn’t truncated.

The file descriptor becomes owned by the OutputStream, and will be closed on Close() or destruction.
Parameters
  • [in] fd: file descriptor

  • [out] out: a base interface OutputStream instance

static Status Open(const std::string &path, std::shared_ptr<FileOutputStream> *file)

Open a local file for writing, truncating any existing file.

When opening a new file, any existing file with the indicated path is truncated to 0 bytes, deleting any existing data

Parameters
  • [in] path: with UTF8 encoding

  • [out] file: a FileOutputStream instance

static Status Open(const std::string &path, bool append, std::shared_ptr<FileOutputStream> *file)

Open a local file for writing.

Parameters
  • [in] path: with UTF8 encoding

  • [in] append: append to existing file, otherwise truncate to 0 bytes

  • [out] file: a FileOutputStream instance

static Status Open(int fd, std::shared_ptr<FileOutputStream> *out)

Open a file descriptor for writing.

The underlying file isn’t truncated.

The file descriptor becomes owned by the OutputStream, and will be closed on Close() or destruction.
Parameters
  • [in] fd: file descriptor

  • [out] out: a FileOutputStream instance

class MemoryMappedFile : public arrow::io::ReadWriteFileInterface

A file interface that uses memory-mapped files for memory interactions.

This implementation supports zero-copy reads. The same class is used for both reading and writing.

If opening a file in a writable mode, it is not truncated first as with FileOutputStream.

Public Functions

Status Close()

Close the stream cleanly.

For writable streams, this will attempt to flush any pending data before releasing the underlying resource.

After Close() is called, closed() returns true and the stream is not available for further operations.

bool closed() const

Return whether the stream is closed.

Status Tell(int64_t *position) const

Return the position in this stream.

Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr<Buffer> *out)

Read nbytes at position. A default implementation using Read(…) is provided, but it can be overridden.

The default implementation is thread-safe. It is unspecified whether this method updates the file position or not.

Parameters
  • [in] position: Where to read bytes from

  • [in] nbytes: The number of bytes to read

  • [out] out: The buffer to read bytes into. The number of bytes read can be retrieved by calling Buffer::size().

Status ReadAt(int64_t position, int64_t nbytes, int64_t *bytes_read, void *out)

Read nbytes at position. A default implementation using Read(…) is provided, but it can be overridden.

The default implementation is thread-safe. It is unspecified whether this method updates the file position or not.

Return

Status

Parameters
  • [in] position: Where to read bytes from

  • [in] nbytes: The number of bytes to read

  • [out] bytes_read: The number of bytes read

  • [out] out: The buffer to read bytes into

bool supports_zero_copy() const

Return true if InputStream is capable of zero copy Buffer reads.

Status Write(const void *data, int64_t nbytes)

Write data at the current position in the file. Thread-safe.

Status Resize(int64_t new_size)

Set the size of the map to new_size.

Status WriteAt(int64_t position, const void *data, int64_t nbytes)

Write data at a particular position in the file. Thread-safe.

Public Static Functions

static Status Create(const std::string &path, int64_t size, std::shared_ptr<MemoryMappedFile> *out)

Create new file with indicated size, return in read/write mode.

Buffering input / output wrappers

class BufferedInputStream : public arrow::io::internal::InputStreamConcurrencyWrapper<BufferedInputStream>

An InputStream that performs buffered reads from an unbuffered InputStream, which can mitigate the overhead of many small reads in some cases.

Public Functions

Status SetBufferSize(int64_t new_buffer_size)

Resize the internal read buffer; calls to Read(…) will read at least new_buffer_size bytes from the raw InputStream.

Return

Status

Parameters
  • [in] new_buffer_size: the new read buffer size

int64_t bytes_buffered() const

Return the number of remaining bytes in the read buffer.

int64_t buffer_size() const

Return the current size of the internal buffer.

std::shared_ptr<InputStream> Detach()

Release the raw InputStream.

Any data buffered will be discarded. Further operations on this object are invalid.

Return

the underlying raw InputStream

std::shared_ptr<InputStream> raw() const

Return the unbuffered InputStream.

bool closed() const

Return whether the stream is closed.

Public Static Functions

static Status Create(int64_t buffer_size, MemoryPool *pool, std::shared_ptr<InputStream> raw, std::shared_ptr<BufferedInputStream> *out, int64_t raw_read_bound = -1)

Create a BufferedInputStream from a raw InputStream.

Parameters
  • [in] buffer_size: the size of the temporary read buffer

  • [in] pool: a MemoryPool to use for allocations

  • [in] raw: a raw InputStream

  • [out] out: the created BufferedInputStream

  • [in] raw_read_bound: a bound on the maximum number of bytes to read from the raw input stream. The default -1 indicates that it is unbounded

class BufferedOutputStream : public arrow::io::OutputStream

Public Functions

Status SetBufferSize(int64_t new_buffer_size)

Resize internal buffer.

Return

Status

Parameters
  • [in] new_buffer_size: the new buffer size

int64_t buffer_size() const

Return the current size of the internal buffer.

int64_t bytes_buffered() const

Return the number of remaining bytes that have not been flushed to the raw OutputStream.

Status Detach(std::shared_ptr<OutputStream> *raw)

Flush any buffered writes and release the raw OutputStream.

Further operations on this object are invalid

Return

Status

Parameters

Status Close()

Close the buffered output stream.

This implicitly closes the underlying raw output stream.

Status Abort()

Close the stream abruptly.

This method does not guarantee that any pending data is flushed. It merely releases any underlying resource used by the stream for its operation.

After Abort() is called, closed() returns true and the stream is not available for further operations.

bool closed() const

Return whether the stream is closed.

Status Tell(int64_t *position) const

Return the position in this stream.

Status Write(const void *data, int64_t nbytes)

Write the given data to the stream.

This method always processes the bytes in full. Depending on the semantics of the stream, the data may be written out immediately, held in a buffer, or written asynchronously. In the case where the stream buffers the data, it will be copied. To avoid potentially large copies, use the Write variant that takes an owned Buffer.

Status Write(const std::shared_ptr<Buffer> &data)

Write the given data to the stream.

Since the Buffer owns its memory, this method can avoid a copy if buffering is required. See Write(const void*, int64_t) for details.

Status Flush()

Flush buffered bytes, if any.

std::shared_ptr<OutputStream> raw() const

Return the underlying raw output stream.

Public Static Functions

static Status Create(int64_t buffer_size, MemoryPool *pool, std::shared_ptr<OutputStream> raw, std::shared_ptr<BufferedOutputStream> *out)

Create a buffered output stream wrapping the given output stream.

Return

Status

Parameters

Compressed input / output wrappers

class CompressedInputStream : public arrow::io::internal::InputStreamConcurrencyWrapper<CompressedInputStream>

Public Functions

bool closed() const

Return whether the stream is closed.

std::shared_ptr<InputStream> raw() const

Return the underlying raw input stream.

Public Static Functions

static Status Make(util::Codec *codec, const std::shared_ptr<InputStream> &raw, std::shared_ptr<CompressedInputStream> *out)

Create a compressed input stream wrapping the given input stream.

class CompressedOutputStream : public arrow::io::OutputStream

Public Functions

Status Close()

Close the compressed output stream.

This implicitly closes the underlying raw output stream.

Status Abort()

Close the stream abruptly.

This method does not guarantee that any pending data is flushed. It merely releases any underlying resource used by the stream for its operation.

After Abort() is called, closed() returns true and the stream is not available for further operations.

bool closed() const

Return whether the stream is closed.

Status Tell(int64_t *position) const

Return the position in this stream.

Status Write(const void *data, int64_t nbytes)

Write the given data to the stream.

This method always processes the bytes in full. Depending on the semantics of the stream, the data may be written out immediately, held in a buffer, or written asynchronously. In the case where the stream buffers the data, it will be copied. To avoid potentially large copies, use the Write variant that takes an owned Buffer.

Status Flush()

Flush buffered bytes, if any.

std::shared_ptr<OutputStream> raw() const

Return the underlying raw output stream.

Public Static Functions

static Status Make(util::Codec *codec, const std::shared_ptr<OutputStream> &raw, std::shared_ptr<CompressedOutputStream> *out)

Create a compressed output stream wrapping the given output stream.