Apache Arrow (C++)
A columnar in-memory analytics layer designed to accelerate big data.
writer.h
Go to the documentation of this file.
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 // Implement Arrow streaming binary format
19 
20 #ifndef ARROW_IPC_WRITER_H
21 #define ARROW_IPC_WRITER_H
22 
23 #include <cstdint>
24 #include <functional>
25 #include <memory>
26 #include <vector>
27 
28 #include "arrow/ipc/message.h"
29 #include "arrow/util/visibility.h"
30 
31 namespace arrow {
32 
// Forward declarations of arrow types named in this header. Only the class
// names are introduced here; the full definitions live in other headers.
33 class Array;
34 class Buffer;
35 class Field;
36 class MemoryPool;
37 class RecordBatch;
38 class Schema;
39 class Status;
40 class Table;
41 class Tensor;
42 
// Forward declaration of arrow::io::OutputStream, which the writer APIs in
// this header take by raw pointer as their destination.
43 namespace io {
44 
45 class OutputStream;
46 
47 } // namespace io
48 
49 namespace ipc {
50 
/// \brief Abstract interface for writing a stream of record batches.
///
/// WriteRecordBatch, Close and set_memory_pool are pure virtual and must be
/// provided by a concrete writer. The two WriteTable overloads are
/// non-virtual members declared on this base class.
53 class ARROW_EXPORT RecordBatchWriter {
54  public:
  /// Virtual so that a concrete writer can be destroyed through a pointer to
  /// this base class.
55  virtual ~RecordBatchWriter();
56 
  /// \brief Write a record batch to the stream.
  ///
  /// \param[in] batch the record batch to write
  /// \param[in] allow_64bit defaults to false; presumably permits field
  ///   lengths that do not fit in a signed 32-bit integer -- TODO confirm
  /// \return Status
  ///
  /// NOTE(review): a default argument on a virtual function is selected by
  /// the static type of the call expression, so every override has to repeat
  /// the same default (`false`) to keep calls consistent.
61  virtual Status WriteRecordBatch(const RecordBatch& batch, bool allow_64bit = false) = 0;
62 
  /// \brief Write a Table.
  ///
  /// \param[in] table the table to write
  /// \return Status
  ///
  /// Presumably emits the table as a sequence of record batches through
  /// WriteRecordBatch -- TODO confirm against the implementation.
66  Status WriteTable(const Table& table);
67 
  /// \brief Write a Table with a caller-supplied chunk size limit.
  ///
  /// \param[in] table the table to write
  /// \param[in] max_chunksize presumably an upper bound on the number of rows
  ///   per emitted record batch -- TODO confirm
  /// \return Status
72  Status WriteTable(const Table& table, int64_t max_chunksize);
73 
  /// \brief Finish writing.
  ///
  /// \return Status
  ///
  /// Presumably must be called once after the last batch so the output is
  /// complete -- TODO confirm against the implementation.
77  virtual Status Close() = 0;
78 
  /// \brief Set the MemoryPool used by this writer.
  ///
  /// \param[in] pool the pool to use; presumably for allocations made while
  ///   writing -- TODO confirm
83  virtual void set_memory_pool(MemoryPool* pool) = 0;
84 };
85 
/// \brief Synchronous batch stream writer that writes the Arrow streaming
/// format.
///
/// Overrides all three pure virtual members of RecordBatchWriter. State is
/// held behind a forward-declared implementation class owned through
/// std::unique_ptr.
89 class ARROW_EXPORT RecordBatchStreamWriter : public RecordBatchWriter {
90  public:
  /// Declared here rather than defaulted inline: impl_ is a unique_ptr to a
  /// type that is incomplete in this header, so the destructor has to be
  /// defined where RecordBatchStreamWriterImpl is complete.
91  ~RecordBatchStreamWriter() override;
92 
  /// \brief Create a stream writer.
  ///
  /// \param[in] sink the output stream to write to; passed as a raw pointer,
  ///   so presumably the caller keeps ownership and must keep it alive while
  ///   the writer is in use -- TODO confirm
  /// \param[in] schema the schema of the record batches to be written
  /// \param[out] out receives the created writer, typed as the base
  ///   RecordBatchWriter
  /// \return Status
100  static Status Open(io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
101  std::shared_ptr<RecordBatchWriter>* out);
102 
  /// \brief Write a record batch to the stream.
  ///
  /// \param[in] batch the record batch to write
  /// \param[in] allow_64bit defaults to false, matching the base declaration;
  ///   presumably permits field lengths that do not fit in a signed 32-bit
  ///   integer -- TODO confirm
  /// \return Status
108  Status WriteRecordBatch(const RecordBatch& batch, bool allow_64bit = false) override;
109 
  /// \brief Finish writing; see RecordBatchWriter::Close.
  ///
  /// \return Status
112  Status Close() override;
113 
  /// \brief Set the MemoryPool used by this writer.
  ///
  /// \param[in] pool the pool to use
114  void set_memory_pool(MemoryPool* pool) override;
115 
116  protected:
  // NOTE(review): listing line 117 is absent from this extract; upstream
  // presumably declares the protected default constructor there -- confirm
  // against the original header.
  //
  // Implementation class is only forward-declared here and is marked
  // ARROW_NO_EXPORT; it is owned exclusively through impl_. Both members are
  // protected, so derived classes can reach them.
118  class ARROW_NO_EXPORT RecordBatchStreamWriterImpl;
119  std::unique_ptr<RecordBatchStreamWriterImpl> impl_;
120 };
121 
/// \brief Creates the Arrow record batch file format.
///
/// Derives from RecordBatchStreamWriter, overriding WriteRecordBatch and
/// Close and inheriting set_memory_pool. It declares its own static Open,
/// which hides the base class's Open, and owns a separate implementation
/// object (file_impl_) in addition to the inherited impl_.
127 class ARROW_EXPORT RecordBatchFileWriter : public RecordBatchStreamWriter {
128  public:
  /// Declared here rather than defaulted inline: file_impl_ is a unique_ptr
  /// to a type that is incomplete in this header.
129  ~RecordBatchFileWriter() override;
130 
  /// \brief Create a file writer.
  ///
  /// \param[in] sink the output stream to write to; passed as a raw pointer,
  ///   so presumably the caller keeps ownership and must keep it alive while
  ///   the writer is in use -- TODO confirm
  /// \param[in] schema the schema of the record batches to be written
  /// \param[out] out receives the created writer, typed as the base
  ///   RecordBatchWriter
  /// \return Status
137  static Status Open(io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
138  std::shared_ptr<RecordBatchWriter>* out);
139 
  /// \brief Write a record batch to the file.
  ///
  /// \param[in] batch the record batch to write
  /// \param[in] allow_64bit defaults to false, matching the base declaration;
  ///   presumably permits field lengths that do not fit in a signed 32-bit
  ///   integer -- TODO confirm
  /// \return Status
145  Status WriteRecordBatch(const RecordBatch& batch, bool allow_64bit = false) override;
146 
  /// \brief Finish writing.
  ///
  /// \return Status
  ///
  /// Presumably also writes whatever trailing data the file format requires
  /// -- TODO confirm against the implementation.
149  Status Close() override;
150 
151  private:
  // NOTE(review): listing line 152 is absent from this extract; upstream
  // presumably declares the private default constructor there -- confirm
  // against the original header.
  //
  // Implementation class is only forward-declared here and is marked
  // ARROW_NO_EXPORT; it is owned exclusively through file_impl_.
153  class ARROW_NO_EXPORT RecordBatchFileWriterImpl;
154  std::unique_ptr<RecordBatchFileWriterImpl> file_impl_;
155 };
156 
/// \brief Low-level API for writing a record batch (without schema) to an
/// OutputStream.
///
/// \param[in] batch the record batch to write
/// \param[in] buffer_start_offset presumably the position in the destination
///   that buffer offsets are computed relative to -- TODO confirm
/// \param[in] dst the output stream to write to
/// \param[out] metadata_length receives a 32-bit length; presumably the size
///   of the written metadata -- TODO confirm
/// \param[out] body_length receives a 64-bit length; presumably the size of
///   the written body -- TODO confirm
/// \param[in] pool the MemoryPool to use
/// \param[in] max_recursion_depth defaults to kMaxNestingDepth (declared in
///   arrow/ipc/message.h); presumably bounds nesting depth -- TODO confirm
/// \param[in] allow_64bit defaults to false; presumably permits field lengths
///   that do not fit in a signed 32-bit integer -- TODO confirm
/// \return Status
183 ARROW_EXPORT
184 Status WriteRecordBatch(const RecordBatch& batch, int64_t buffer_start_offset,
185  io::OutputStream* dst, int32_t* metadata_length,
186  int64_t* body_length, MemoryPool* pool,
187  int max_recursion_depth = kMaxNestingDepth,
188  bool allow_64bit = false);
189 
/// \brief Serialize record batch as encapsulated IPC message in a new buffer.
///
/// \param[in] batch the record batch to serialize
/// \param[in] pool the MemoryPool to use; presumably the new buffer is
///   allocated from it -- TODO confirm
/// \param[out] out receives the buffer holding the serialized message
/// \return Status
///
/// NOTE(review): listing line 197 (return type, function name and leading
/// parameters) was missing from this extract and has been restored from the
/// page's own symbol summary for SerializeRecordBatch -- confirm against the
/// original header.
196 ARROW_EXPORT
197 Status SerializeRecordBatch(const RecordBatch& batch, MemoryPool* pool,
198  std::shared_ptr<Buffer>* out);
199 
/// \brief Serialize a record batch to an OutputStream.
///
/// \param[in] batch the record batch to write
/// \param[in] pool the MemoryPool to use
/// \param[in] out the output stream to write to
/// \return Status
///
/// NOTE(review): listing line 210 (return type, function name and leading
/// parameters) was missing from this extract. It has been restored as the
/// OutputStream overload of SerializeRecordBatch, by analogy with the Buffer
/// overload; only the trailing `io::OutputStream* out` parameter was visible
/// -- confirm against the original header.
209 ARROW_EXPORT
210 Status SerializeRecordBatch(const RecordBatch& batch, MemoryPool* pool,
211  io::OutputStream* out);
212 
/// \brief Serialize schema using stream writer as a sequence of one or more
/// IPC messages.
///
/// \param[in] schema the schema to serialize
/// \param[in] pool the MemoryPool to use; presumably the output buffer is
///   allocated from it -- TODO confirm
/// \param[out] out receives the buffer holding the serialized schema
/// \return Status
///
/// NOTE(review): listing line 221 (return type, function name and leading
/// parameters) was missing from this extract and has been restored from the
/// page's own symbol summary for SerializeSchema -- confirm against the
/// original header.
220 ARROW_EXPORT
221 Status SerializeSchema(const Schema& schema, MemoryPool* pool,
222  std::shared_ptr<Buffer>* out);
223 
/// \brief Write multiple record batches to OutputStream, including schema.
///
/// \param[in] batches the record batches to write
/// \param[in] dst the output stream to write to
/// \return Status
228 ARROW_EXPORT
229 Status WriteRecordBatchStream(const std::vector<std::shared_ptr<RecordBatch>>& batches,
230  io::OutputStream* dst);
231 
/// \brief Compute the number of bytes needed to write a record batch
/// including metadata.
///
/// \param[in] batch the record batch to measure
/// \param[out] size receives the computed byte count
/// \return Status
237 ARROW_EXPORT
238 Status GetRecordBatchSize(const RecordBatch& batch, int64_t* size);
239 
/// \brief Compute the number of bytes needed to write a tensor including
/// metadata.
///
/// \param[in] tensor the tensor to measure
/// \param[out] size receives the computed byte count
/// \return Status
245 ARROW_EXPORT
246 Status GetTensorSize(const Tensor& tensor, int64_t* size);
247 
/// \brief EXPERIMENTAL: Convert arrow::Tensor to a Message with minimal
/// memory allocation.
///
/// \param[in] tensor the tensor to convert
/// \param[in] pool the MemoryPool to use
/// \param[out] out receives the resulting Message, owned by the caller
///   through std::unique_ptr
/// \return Status
255 ARROW_EXPORT
256 Status GetTensorMessage(const Tensor& tensor, MemoryPool* pool,
257  std::unique_ptr<Message>* out);
258 
/// \brief EXPERIMENTAL: Write arrow::Tensor as a contiguous message.
///
/// \param[in] tensor the tensor to write
/// \param[in] dst the output stream to write to
/// \param[out] metadata_length receives a 32-bit length; presumably the size
///   of the written metadata -- TODO confirm
/// \param[out] body_length receives a 64-bit length; presumably the size of
///   the written body -- TODO confirm
/// \return Status
268 ARROW_EXPORT
269 Status WriteTensor(const Tensor& tensor, io::OutputStream* dst, int32_t* metadata_length,
270  int64_t* body_length);
271 
272 } // namespace ipc
273 } // namespace arrow
274 
275 #endif // ARROW_IPC_WRITER_H
Definition: interfaces.h:111
Status SerializeRecordBatch(const RecordBatch &batch, MemoryPool *pool, std::shared_ptr< Buffer > *out)
Serialize record batch as encapsulated IPC message in a new buffer.
Abstract interface for writing a stream of record batches.
Definition: writer.h:53
Status WriteRecordBatch(const RecordBatch &batch, int64_t buffer_start_offset, io::OutputStream *dst, int32_t *metadata_length, int64_t *body_length, MemoryPool *pool, int max_recursion_depth=kMaxNestingDepth, bool allow_64bit=false)
Low-level API for writing a record batch (without schema) to an OutputStream.
#define ARROW_NO_EXPORT
Definition: visibility.h:42
Collection of equal-length arrays matching a particular Schema.
Definition: record_batch.h:41
Status GetTensorMessage(const Tensor &tensor, MemoryPool *pool, std::unique_ptr< Message > *out)
EXPERIMENTAL: Convert arrow::Tensor to a Message with minimal memory allocation.
Definition: tensor.h:53
Definition: status.h:93
Status WriteRecordBatchStream(const std::vector< std::shared_ptr< RecordBatch >> &batches, io::OutputStream *dst)
Write multiple record batches to OutputStream, including schema.
std::unique_ptr< RecordBatchStreamWriterImpl > impl_
Definition: writer.h:119
Synchronous batch stream writer that writes the Arrow streaming format.
Definition: writer.h:89
Status GetRecordBatchSize(const RecordBatch &batch, int64_t *size)
Compute the number of bytes needed to write a record batch including metadata.
Creates the Arrow record batch file format.
Definition: writer.h:127
Sequence of arrow::Field objects describing the columns of a record batch or table data structure...
Definition: type.h:742
Top-level namespace for Apache Arrow C++ API.
Definition: adapter.h:32
Status GetTensorSize(const Tensor &tensor, int64_t *size)
Compute the number of bytes needed to write a tensor including metadata.
Status WriteTensor(const Tensor &tensor, io::OutputStream *dst, int32_t *metadata_length, int64_t *body_length)
EXPERIMENTAL: Write arrow::Tensor as a contiguous message.
constexpr int kMaxNestingDepth
Definition: message.h:62
Logical table as sequence of chunked arrays.
Definition: table.h:166
std::shared_ptr< Schema > schema(const std::vector< std::shared_ptr< Field >> &fields, const std::shared_ptr< const KeyValueMetadata > &metadata=NULLPTR)
Create a Schema instance.
Base class for memory allocation.
Definition: memory_pool.h:34
Status SerializeSchema(const Schema &schema, MemoryPool *pool, std::shared_ptr< Buffer > *out)
Serialize schema using stream writer as a sequence of one or more IPC messages.