Apache Arrow (C++)
A columnar in-memory analytics layer designed to accelerate big data.
table.h
Go to the documentation of this file.
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #ifndef ARROW_TABLE_H
19 #define ARROW_TABLE_H
20 
21 #include <cstdint>
22 #include <memory>
23 #include <string>
24 #include <vector>
25 
26 #include "arrow/array.h"
27 #include "arrow/type.h"
28 #include "arrow/util/macros.h"
29 #include "arrow/util/visibility.h"
30 
31 namespace arrow {
32 
33 class KeyValueMetadata;
34 class Status;
35 
36 using ArrayVector = std::vector<std::shared_ptr<Array>>;
37 
// A data structure managing a list of primitive Arrow arrays logically as
// one large array.
//
// NOTE(review): this listing skips source line 63; per the cross-reference
// index that line declares the `ArrayVector chunks_` member used below.
41 class ARROW_EXPORT ChunkedArray {
42  public:
    // Construct from a vector of array chunks. Explicit, so an ArrayVector
    // is never implicitly converted to a ChunkedArray.
43  explicit ChunkedArray(const ArrayVector& chunks);
44 
    // \return the stored length_ member
46  int64_t length() const { return length_; }
47 
    // \return the stored null_count_ member
48  int64_t null_count() const { return null_count_; }
49 
    // \return the number of chunks; the container size is narrowed to int
50  int num_chunks() const { return static_cast<int>(chunks_.size()); }
51 
    // \return the i-th chunk. Uses unchecked indexing, so the caller must
    // keep i within [0, num_chunks()).
53  std::shared_ptr<Array> chunk(int i) const { return chunks_[i]; }
54 
    // \return a const reference to the underlying chunk vector (no copy)
55  const ArrayVector& chunks() const { return chunks_; }
56 
    // \return the data type; defined out of line, not visible in this listing
57  std::shared_ptr<DataType> type() const;
58 
    // Equality comparison against another chunked array, by reference or by
    // shared pointer. Defined out of line.
59  bool Equals(const ChunkedArray& other) const;
60  bool Equals(const std::shared_ptr<ChunkedArray>& other) const;
61 
62  protected:
    // Values handed back by length() and null_count() respectively.
64  int64_t length_;
65  int64_t null_count_;
66 };
67 
// An immutable column data structure consisting of a field (type metadata)
// and logical chunked data.
//
// NOTE(review): this listing skips source line 110 (inside the private
// section); presumably an ARROW_DISALLOW_COPY_AND_ASSIGN(Column) line, since
// that macro appears in the cross-reference index. Confirm against the header.
70 class ARROW_EXPORT Column {
71  public:
    // Construct from a field plus the column data, supplied either as a
    // vector of chunks, an existing ChunkedArray, or a single Array.
72  Column(const std::shared_ptr<Field>& field, const ArrayVector& chunks);
73  Column(const std::shared_ptr<Field>& field, const std::shared_ptr<ChunkedArray>& data);
74 
75  Column(const std::shared_ptr<Field>& field, const std::shared_ptr<Array>& data);
76 
77  // Construct from name and array
78  Column(const std::string& name, const std::shared_ptr<Array>& data);
79 
    // \return the length reported by the underlying chunked data
80  int64_t length() const { return data_->length(); }
81 
    // \return the null count reported by the underlying chunked data
82  int64_t null_count() const { return data_->null_count(); }
83 
    // \return the field held by this column
84  std::shared_ptr<Field> field() const { return field_; }
85 
    // The column name.
    // \return the name of the held field
88  const std::string& name() const { return field_->name(); }
89 
    // The column type.
    // \return the type of the held field
92  std::shared_ptr<DataType> type() const { return field_->type(); }
93 
    // The column data as a chunked array.
96  std::shared_ptr<ChunkedArray> data() const { return data_; }
97 
    // Equality comparison against another column, by reference or by shared
    // pointer. Defined out of line.
98  bool Equals(const Column& other) const;
99  bool Equals(const std::shared_ptr<Column>& other) const;
100 
    // Defined out of line; reports its outcome as a Status. The checks it
    // performs are not visible in this listing.
103  Status ValidateData();
104 
105  protected:
106  std::shared_ptr<Field> field_;
107  std::shared_ptr<ChunkedArray> data_;
108 
109  private:
111 };
112 
// Collection of equal-length arrays matching a particular Schema.
//
// NOTE(review): this listing skips source line 200 (first line of the private
// section); presumably an ARROW_DISALLOW_COPY_AND_ASSIGN(RecordBatch) line.
// Confirm against the header.
118 class ARROW_EXPORT RecordBatch {
119  public:
    // Construct from a schema, a row count, and one Array per column. The
    // second overload takes the column vector by rvalue reference.
124  RecordBatch(const std::shared_ptr<Schema>& schema, int64_t num_rows,
125  const std::vector<std::shared_ptr<Array>>& columns);
126 
128  RecordBatch(const std::shared_ptr<Schema>& schema, int64_t num_rows,
129  std::vector<std::shared_ptr<Array>>&& columns);
130 
    // Same as above, but the columns are supplied as ArrayData, which is the
    // representation stored in columns_.
141  RecordBatch(const std::shared_ptr<Schema>& schema, int64_t num_rows,
142  std::vector<std::shared_ptr<ArrayData>>&& columns);
143 
146  RecordBatch(const std::shared_ptr<Schema>& schema, int64_t num_rows,
147  const std::vector<std::shared_ptr<ArrayData>>& columns);
148 
    // Equality comparison with another record batch. Defined out of line.
151  bool Equals(const RecordBatch& other) const;
152 
    // Approximate-equality comparison with another record batch. Defined out
    // of line; the tolerance used is not visible in this listing.
154  bool ApproxEquals(const RecordBatch& other) const;
155 
156  // \return the record batch's schema
158  std::shared_ptr<Schema> schema() const { return schema_; }
159 
    // \return the i-th column as an Array. Defined out of line; presumably
    // backed by the boxed_columns_ cache below (confirm in the .cc file).
163  std::shared_ptr<Array> column(int i) const;
164 
    // \return the i-th column's ArrayData. Uses unchecked indexing, so the
    // caller must keep i within [0, num_columns()).
165  std::shared_ptr<ArrayData> column_data(int i) const { return columns_[i]; }
166 
    // \return the name of the i-th column. Defined out of line.
168  const std::string& column_name(int i) const;
169 
    // \return the number of columns; the container size is narrowed to int
171  int num_columns() const { return static_cast<int>(columns_.size()); }
172 
    // \return the stored row count
174  int64_t num_rows() const { return num_rows_; }
175 
    // \return a new record batch (this method is const) built with the given
    // key-value metadata. Defined out of line.
181  std::shared_ptr<RecordBatch> ReplaceSchemaMetadata(
182  const std::shared_ptr<const KeyValueMetadata>& metadata) const;
183 
    // \return a new record batch starting at the given row offset. Defined
    // out of line.
187  std::shared_ptr<RecordBatch> Slice(int64_t offset) const;
188 
    // \return a new record batch starting at the given row offset and
    // covering `length` rows. Defined out of line; handling of out-of-range
    // values is not visible in this listing.
193  std::shared_ptr<RecordBatch> Slice(int64_t offset, int64_t length) const;
194 
    // Defined out of line; reports its outcome as a Status. The checks it
    // performs are not visible in this listing.
197  Status Validate() const;
198 
199  private:
201 
    // Private constructor taking only the schema and row count (no columns).
202  RecordBatch(const std::shared_ptr<Schema>& schema, int64_t num_rows);
203 
204  std::shared_ptr<Schema> schema_;
205  int64_t num_rows_;
206  std::vector<std::shared_ptr<ArrayData>> columns_;
207 
208  // Caching boxed array data
    // Declared mutable so that const member functions can update it.
209  mutable std::vector<std::shared_ptr<Array>> boxed_columns_;
210 };
211 
// Logical table as sequence of chunked arrays.
//
// NOTE(review): this listing skips source line 275 (first line of the private
// section); presumably an ARROW_DISALLOW_COPY_AND_ASSIGN(Table) line.
// Confirm against the header.
214 class ARROW_EXPORT Table {
215  public:
    // Construct from a schema and one Column per field. num_rows defaults to
    // -1; presumably a sentinel meaning "derive from the columns" (confirm in
    // the .cc file).
221  Table(const std::shared_ptr<Schema>& schema,
222  const std::vector<std::shared_ptr<Column>>& columns, int64_t num_rows = -1);
223 
    // Construct from a schema and one Array per field. Same num_rows default
    // as the overload above.
228  Table(const std::shared_ptr<Schema>& schema,
229  const std::vector<std::shared_ptr<Array>>& arrays, int64_t num_rows = -1);
230 
231  // Construct table from RecordBatches, but only if all of the batch schemas are
232  // equal. Returns Status::Invalid if there is some problem.
233  static Status FromRecordBatches(
234  const std::vector<std::shared_ptr<RecordBatch>>& batches,
235  std::shared_ptr<Table>* table);
236 
    // \return the table's schema
238  std::shared_ptr<Schema> schema() const { return schema_; }
239 
    // \return the i-th column. Uses unchecked indexing, so the caller must
    // keep i within [0, num_columns()).
242  std::shared_ptr<Column> column(int i) const { return columns_[i]; }
243 
    // Writes a table without column i to *out. This method is const, so the
    // receiver itself is not modified. Defined out of line.
245  Status RemoveColumn(int i, std::shared_ptr<Table>* out) const;
246 
    // Writes a table with `column` added at position i to *out. This method
    // is const, so the receiver itself is not modified. Defined out of line.
248  Status AddColumn(int i, const std::shared_ptr<Column>& column,
249  std::shared_ptr<Table>* out) const;
250 
    // \return a new table (this method is const) built with the given
    // key-value metadata. Defined out of line.
256  std::shared_ptr<Table> ReplaceSchemaMetadata(
257  const std::shared_ptr<const KeyValueMetadata>& metadata) const;
258 
    // \return the number of columns; the container size is narrowed to int
260  int num_columns() const { return static_cast<int>(columns_.size()); }
261 
    // \return the stored row count
263  int64_t num_rows() const { return num_rows_; }
264 
    // Equality comparison with another table. Defined out of line.
266  bool Equals(const Table& other) const;
267 
    // Defined out of line; reports its outcome as a Status. The checks it
    // performs are not visible in this listing.
269  Status ValidateColumns() const;
270 
    // Defined out of line; presumably true when any column has more than one
    // chunk (confirm in the .cc file).
272  bool IsChunked() const;
273 
274  private:
276 
277  std::shared_ptr<Schema> schema_;
278  std::vector<std::shared_ptr<Column>> columns_;
279 
280  int64_t num_rows_;
281 };
282 
// Abstract interface for reading stream of record batches.
284 class ARROW_EXPORT RecordBatchReader {
285  public:
    // Virtual destructor so implementations can be destroyed through a
    // RecordBatchReader pointer.
286  virtual ~RecordBatchReader();
287 
    // \return the schema reported by the implementation (pure virtual)
289  virtual std::shared_ptr<Schema> schema() const = 0;
290 
    // Pure virtual: read a record batch into *batch. The end-of-stream
    // convention is not visible in this listing; presumably *batch is left
    // null when the stream is exhausted (confirm with an implementation).
296  virtual Status ReadNext(std::shared_ptr<RecordBatch>* batch) = 0;
297 };
298 
// Compute a sequence of record batches from a (possibly chunked) Table.
300 class ARROW_EXPORT TableBatchReader : public RecordBatchReader {
301  public:
302  ~TableBatchReader();
303 
    // Construct a reader over `table`, which is taken by const reference.
    // NOTE(review): whether the reader keeps a reference to `table` (and so
    // requires it to outlive the reader) is not visible here; confirm in the
    // implementation.
305  explicit TableBatchReader(const Table& table);
306 
    // Overrides RecordBatchReader::schema().
307  std::shared_ptr<Schema> schema() const override;
308 
    // Overrides RecordBatchReader::ReadNext().
309  Status ReadNext(std::shared_ptr<RecordBatch>* out) override;
310 
311  private:
    // The reader's state lives in a nested class that is only forward-declared
    // here and is owned through a unique_ptr.
312  class TableBatchReaderImpl;
313  std::unique_ptr<TableBatchReaderImpl> impl_;
314 };
315 
// Construct table from multiple input tables.
// The result is written to *table and the outcome is reported as a Status.
// Defined out of line; the failure conditions are not visible in this listing.
318 ARROW_EXPORT
319 Status ConcatenateTables(const std::vector<std::shared_ptr<Table>>& tables,
320  std::shared_ptr<Table>* table);
321 
// Construct a table from a schema and a vector of arrays.
// The result is written to *table and the outcome is reported as a Status.
// Defined out of line; the failure conditions are not visible in this listing.
324 ARROW_EXPORT
325 Status MakeTable(const std::shared_ptr<Schema>& schema,
326  const std::vector<std::shared_ptr<Array>>& arrays,
327  std::shared_ptr<Table>* table);
328 
329 } // namespace arrow
330 
331 #endif // ARROW_TABLE_H
int64_t length() const
Definition: table.h:46
std::shared_ptr< ChunkedArray > data_
Definition: table.h:107
int64_t length_
Definition: table.h:64
std::shared_ptr< Field > field() const
Definition: table.h:84
int num_chunks() const
Definition: table.h:50
ArrayVector chunks_
Definition: table.h:63
std::shared_ptr< DataType > type() const
The column type.
Definition: table.h:92
std::shared_ptr< ChunkedArray > data() const
The column data as a chunked array.
Definition: table.h:96
Abstract interface for reading stream of record batches.
Definition: table.h:284
Status ConcatenateTables(const std::vector< std::shared_ptr< Table >> &tables, std::shared_ptr< Table > *table)
Construct table from multiple input tables.
Status MakeTable(const std::shared_ptr< Schema > &schema, const std::vector< std::shared_ptr< Array >> &arrays, std::shared_ptr< Table > *table)
Construct table from a schema and a list of arrays.
int64_t num_rows() const
Definition: table.h:174
Collection of equal-length arrays matching a particular Schema.
Definition: table.h:118
An immutable column data structure consisting of a field (type metadata) and a logical chunked data array.
Definition: table.h:70
std::vector< std::shared_ptr< Array > > ArrayVector
Definition: table.h:36
Definition: status.h:106
std::shared_ptr< Field > field_
Definition: table.h:106
int64_t null_count() const
Definition: table.h:82
std::shared_ptr< Column > column(int i) const
Definition: table.h:242
const ArrayVector & chunks() const
Definition: table.h:55
const std::string & name() const
The column name.
Definition: table.h:88
std::shared_ptr< Field > field(const std::string &name, const std::shared_ptr< DataType > &type, bool nullable=true, const std::shared_ptr< const KeyValueMetadata > &metadata=NULLPTR)
Create a Field instance.
Compute a sequence of record batches from a (possibly chunked) Table.
Definition: table.h:300
Top-level namespace for Apache Arrow C++ API.
Definition: allocator.h:29
std::shared_ptr< Array > chunk(int i) const
Definition: table.h:53
int64_t num_rows() const
Definition: table.h:263
std::shared_ptr< Schema > schema() const
Definition: table.h:158
Logical table as sequence of chunked arrays.
Definition: table.h:214
std::shared_ptr< ArrayData > column_data(int i) const
Definition: table.h:165
int64_t null_count_
Definition: table.h:65
int num_columns() const
Definition: table.h:260
std::shared_ptr< Schema > schema() const
Definition: table.h:238
int64_t length() const
Definition: table.h:80
std::shared_ptr< Schema > schema(const std::vector< std::shared_ptr< Field >> &fields, const std::shared_ptr< const KeyValueMetadata > &metadata=NULLPTR)
Create a Schema instance.
int64_t null_count() const
Definition: table.h:48
int num_columns() const
Definition: table.h:171
A data structure managing a list of primitive Arrow arrays logically as one large array...
Definition: table.h:41
#define ARROW_DISALLOW_COPY_AND_ASSIGN(TypeName)
Definition: macros.h:23