Apache Arrow (C++)
A columnar in-memory analytics layer designed to accelerate big data.
writer.h
Go to the documentation of this file.
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 // Declares writers for the Arrow IPC streaming and file binary formats
19 
20 #ifndef ARROW_IPC_WRITER_H
21 #define ARROW_IPC_WRITER_H
22 
23 #include <cstdint>
24 #include <functional>
25 #include <memory>
26 #include <vector>
27 
28 #include "arrow/ipc/message.h"
29 #include "arrow/util/visibility.h"
30 
31 namespace arrow {
32 
// Forward declarations, so the declarations in this header can name these
// types without this header including their definitions directly.
33 class Array;
34 class Buffer;
35 class Field;
36 class MemoryPool;
37 class RecordBatch;
38 class Schema;
39 class Status;
40 class Table;
41 class Tensor;
42 
43 namespace io {
44 
// Output sink type taken by pointer by the writer factories and free functions
// declared in this header.
45 class OutputStream;
46 
47 } // namespace io
48 
49 namespace ipc {
50 
/// \brief Abstract interface for writing a stream of record batches.
///
/// WriteRecordBatch, Close and set_memory_pool are pure virtual and must be
/// provided by concrete writers; WriteTable is a non-virtual member.
53 class ARROW_EXPORT RecordBatchWriter {
54  public:
55  virtual ~RecordBatchWriter();
56 
    /// \brief Write one record batch.
    ///
    /// Note: default arguments are selected from the static type of the call
    /// expression, so overrides should repeat the same default (false).
    ///
    /// \param[in] batch the record batch to write
    /// \param[in] allow_64bit NOTE(review): presumably permits lengths that do
    /// not fit in 32 bits -- confirm against the implementation
    /// \return Status
61  virtual Status WriteRecordBatch(const RecordBatch& batch, bool allow_64bit = false) = 0;
62 
    /// \brief Write a Table.
    ///
    /// Declared non-virtual, so every writer shares one implementation.
    /// NOTE(review): presumably implemented on top of WriteRecordBatch --
    /// confirm in the corresponding source file.
    ///
    /// \param[in] table the table to write
    /// \return Status
66  Status WriteTable(const Table& table);
67 
    /// \brief Close the writer; the work performed is defined by the subclass.
    /// \return Status
71  virtual Status Close() = 0;
72 
    /// \brief Set the MemoryPool associated with this writer.
    ///
    /// NOTE(review): presumably used for allocations made while writing --
    /// confirm against the implementation.
    ///
    /// \param[in] pool the memory pool
77  virtual void set_memory_pool(MemoryPool* pool) = 0;
78 };
79 
/// \brief Synchronous batch stream writer that writes the Arrow streaming
/// format.
83 class ARROW_EXPORT RecordBatchStreamWriter : public RecordBatchWriter {
84  public:
85  virtual ~RecordBatchStreamWriter();
86 
    /// \brief Factory: create a writer for the given sink and schema.
    ///
    /// \param[in] sink the output stream to write to. Passed as a raw pointer;
    /// NOTE(review): lifetime/ownership expectations are not visible in this
    /// header -- confirm against the implementation
    /// \param[in] schema NOTE(review): presumably the schema of the record
    /// batches that will be written -- confirm
    /// \param[out] out receives the created writer, typed as the base
    /// RecordBatchWriter
    /// \return Status
94  static Status Open(io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
95  std::shared_ptr<RecordBatchWriter>* out);
96 
    /// \brief Write one record batch (overrides RecordBatchWriter).
    ///
    /// \param[in] batch the record batch to write
    /// \param[in] allow_64bit same default (false) as the base declaration;
    /// NOTE(review): exact semantics not visible here -- confirm
    /// \return Status
102  Status WriteRecordBatch(const RecordBatch& batch, bool allow_64bit = false) override;
103 
    /// \brief Close the writer (overrides RecordBatchWriter).
    /// \return Status
106  Status Close() override;
107 
    /// \brief Set the MemoryPool associated with this writer (overrides
    /// RecordBatchWriter).
    /// \param[in] pool the memory pool
108  void set_memory_pool(MemoryPool* pool) override;
109 
110  protected:
    // Implementation class: only declared here (and marked ARROW_NO_EXPORT);
    // the writer holds it through the unique_ptr below.
112  class ARROW_NO_EXPORT RecordBatchStreamWriterImpl;
113  std::unique_ptr<RecordBatchStreamWriterImpl> impl_;
114 };
115 
/// \brief Creates the Arrow record batch file format.
///
/// Derives from RecordBatchStreamWriter and overrides WriteRecordBatch and
/// Close; set_memory_pool is inherited unchanged.
121 class ARROW_EXPORT RecordBatchFileWriter : public RecordBatchStreamWriter {
122  public:
123  virtual ~RecordBatchFileWriter();
124 
    /// \brief Factory: create a file writer for the given sink and schema.
    ///
    /// Has the same signature as RecordBatchStreamWriter::Open, and as a
    /// static member hides it in this class.
    ///
    /// \param[in] sink the output stream to write to. Passed as a raw pointer;
    /// NOTE(review): lifetime/ownership expectations are not visible in this
    /// header -- confirm against the implementation
    /// \param[in] schema NOTE(review): presumably the schema of the record
    /// batches that will be written -- confirm
    /// \param[out] out receives the created writer, typed as the base
    /// RecordBatchWriter
    /// \return Status
131  static Status Open(io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
132  std::shared_ptr<RecordBatchWriter>* out);
133 
    /// \brief Write one record batch (overrides RecordBatchStreamWriter).
    ///
    /// \param[in] batch the record batch to write
    /// \param[in] allow_64bit same default (false) as the base declarations;
    /// NOTE(review): exact semantics not visible here -- confirm
    /// \return Status
139  Status WriteRecordBatch(const RecordBatch& batch, bool allow_64bit = false) override;
140 
    /// \brief Close the writer (overrides RecordBatchStreamWriter).
    ///
    /// NOTE(review): presumably finalizes the file layout -- confirm against
    /// the implementation.
    /// \return Status
143  Status Close() override;
144 
145  private:
    // Implementation class, only declared here. NOTE(review): this private
    // impl_ has the same name as the protected impl_ inherited from
    // RecordBatchStreamWriter and hides it within this class; the two members
    // have different types.
147  class ARROW_NO_EXPORT RecordBatchFileWriterImpl;
148  std::unique_ptr<RecordBatchFileWriterImpl> impl_;
149 };
150 
/// \brief Low-level API for writing a record batch (without schema) to an
/// OutputStream.
///
/// \param[in] batch the record batch to write
/// \param[in] buffer_start_offset NOTE(review): presumably the position in dst
/// at which this message starts -- confirm against the implementation
/// \param[in] dst the output stream to write to
/// \param[out] metadata_length NOTE(review): presumably the size in bytes of
/// the metadata that was written -- confirm
/// \param[out] body_length NOTE(review): presumably the size in bytes of the
/// message body that was written -- confirm
/// \param[in] pool a MemoryPool; NOTE(review): presumably for allocations made
/// while writing -- confirm
/// \param[in] max_recursion_depth defaults to kMaxNestingDepth, which comes
/// from arrow/ipc/message.h
/// \param[in] allow_64bit defaults to false; NOTE(review): exact semantics not
/// visible here -- confirm
/// \return Status
177 ARROW_EXPORT
178 Status WriteRecordBatch(const RecordBatch& batch, int64_t buffer_start_offset,
179  io::OutputStream* dst, int32_t* metadata_length,
180  int64_t* body_length, MemoryPool* pool,
181  int max_recursion_depth = kMaxNestingDepth,
182  bool allow_64bit = false);
183 
/// \brief Serialize a record batch into a Buffer returned through out.
///
/// NOTE(review): the declarator line (191) was absent from this listing, which
/// left the declaration without a return type, name or leading parameters. It
/// is restored here as the Buffer-returning overload of SerializeRecordBatch
/// -- confirm against the upstream arrow/ipc/writer.h.
///
/// \param[in] batch the record batch to serialize
/// \param[in] pool a MemoryPool; NOTE(review): presumably used to allocate the
/// output buffer -- confirm
/// \param[out] out receives the resulting Buffer
/// \return Status
190 ARROW_EXPORT
191 Status SerializeRecordBatch(const RecordBatch& batch, MemoryPool* pool,
192  std::shared_ptr<Buffer>* out);
193 
/// \brief Write record batch to OutputStream.
///
/// \param[in] batch the record batch to write
/// \param[in] pool a MemoryPool; NOTE(review): presumably for allocations made
/// while writing -- confirm
/// \param[in] out the OutputStream to write to
/// \return Status
203 ARROW_EXPORT
204 Status SerializeRecordBatch(const RecordBatch& batch, MemoryPool* pool,
205  io::OutputStream* out);
206 
/// \brief Serialize schema using stream writer as a sequence of one or more
/// IPC messages.
///
/// \param[in] schema the schema to serialize
/// \param[in] pool a MemoryPool; NOTE(review): presumably used to allocate the
/// output buffer -- confirm
/// \param[out] out receives the Buffer holding the serialized schema
/// \return Status
214 ARROW_EXPORT
215 Status SerializeSchema(const Schema& schema, MemoryPool* pool,
216  std::shared_ptr<Buffer>* out);
217 
/// \brief Write multiple record batches to OutputStream, including schema.
///
/// \param[in] batches the record batches to write. NOTE(review): behavior for
/// an empty vector is not visible in this header -- confirm
/// \param[in] dst the OutputStream to write to
/// \return Status
222 ARROW_EXPORT
223 Status WriteRecordBatchStream(const std::vector<std::shared_ptr<RecordBatch>>& batches,
224  io::OutputStream* dst);
225 
/// \brief Compute the number of bytes needed to write a record batch including
/// metadata.
///
/// \param[in] batch the record batch to measure
/// \param[out] size receives the computed number of bytes
/// \return Status
231 ARROW_EXPORT
232 Status GetRecordBatchSize(const RecordBatch& batch, int64_t* size);
233 
/// \brief Compute the number of bytes needed to write a tensor including
/// metadata.
///
/// \param[in] tensor the tensor to measure
/// \param[out] size receives the computed number of bytes
/// \return Status
239 ARROW_EXPORT
240 Status GetTensorSize(const Tensor& tensor, int64_t* size);
241 
/// \brief EXPERIMENTAL: Write arrow::Tensor as a contiguous message.
///
/// \param[in] tensor the tensor to write
/// \param[in] dst the OutputStream to write to
/// \param[out] metadata_length NOTE(review): presumably the size in bytes of
/// the metadata that was written -- confirm
/// \param[out] body_length NOTE(review): presumably the size in bytes of the
/// tensor body that was written -- confirm
/// \return Status
251 ARROW_EXPORT
252 Status WriteTensor(const Tensor& tensor, io::OutputStream* dst, int32_t* metadata_length,
253  int64_t* body_length);
254 
255 } // namespace ipc
256 } // namespace arrow
257 
258 #endif // ARROW_IPC_WRITER_H
Definition: interfaces.h:111
Abstract interface for writing a stream of record batches.
Definition: writer.h:53
Status WriteRecordBatch(const RecordBatch &batch, int64_t buffer_start_offset, io::OutputStream *dst, int32_t *metadata_length, int64_t *body_length, MemoryPool *pool, int max_recursion_depth=kMaxNestingDepth, bool allow_64bit=false)
Low-level API for writing a record batch (without schema) to an OutputStream.
#define ARROW_NO_EXPORT
Definition: visibility.h:40
Collection of equal-length arrays matching a particular Schema.
Definition: table.h:118
Definition: tensor.h:53
Definition: status.h:106
Status SerializeRecordBatch(const RecordBatch &batch, MemoryPool *pool, io::OutputStream *out)
Write record batch to OutputStream.
Status WriteRecordBatchStream(const std::vector< std::shared_ptr< RecordBatch >> &batches, io::OutputStream *dst)
Write multiple record batches to OutputStream, including schema.
std::unique_ptr< RecordBatchStreamWriterImpl > impl_
Definition: writer.h:113
Synchronous batch stream writer that writes the Arrow streaming format.
Definition: writer.h:83
Status GetRecordBatchSize(const RecordBatch &batch, int64_t *size)
Compute the number of bytes needed to write a record batch including metadata.
Creates the Arrow record batch file format.
Definition: writer.h:121
Sequence of arrow::Field objects describing the columns of a record batch or table data structure...
Definition: type.h:741
Top-level namespace for Apache Arrow C++ API.
Definition: allocator.h:29
Status GetTensorSize(const Tensor &tensor, int64_t *size)
Compute the number of bytes needed to write a tensor including metadata.
Status WriteTensor(const Tensor &tensor, io::OutputStream *dst, int32_t *metadata_length, int64_t *body_length)
EXPERIMENTAL: Write arrow::Tensor as a contiguous message.
constexpr int kMaxNestingDepth
Definition: message.h:50
Logical table as sequence of chunked arrays.
Definition: table.h:214
std::shared_ptr< Schema > schema(const std::vector< std::shared_ptr< Field >> &fields, const std::shared_ptr< const KeyValueMetadata > &metadata=NULLPTR)
Create a Schema instance.
Base class for memory allocation.
Definition: memory_pool.h:34
Status SerializeSchema(const Schema &schema, MemoryPool *pool, std::shared_ptr< Buffer > *out)
Serialize schema using stream writer as a sequence of one or more IPC messages.