Apache Arrow (C++)
A columnar in-memory analytics layer designed to accelerate big data.
table.h
Go to the documentation of this file.
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #ifndef ARROW_TABLE_H
19 #define ARROW_TABLE_H
20 
21 #include <cstdint>
22 #include <memory>
23 #include <string>
24 #include <vector>
25 
26 #include "arrow/array.h"
27 #include "arrow/record_batch.h"
28 #include "arrow/type.h"
29 #include "arrow/util/macros.h"
30 #include "arrow/util/visibility.h"
31 
32 namespace arrow {
33 
34 class KeyValueMetadata;
35 class Status;
36 
/// \brief A data structure managing a list of primitive Arrow arrays logically
/// as one large array.
///
/// NOTE(review): this is a line-numbered listing; gaps in the embedded
/// numbering are source lines that the listing does not show.
40 class ARROW_EXPORT ChunkedArray {
41  public:
    /// Construct from a vector of chunks; no type argument is taken.
    /// NOTE(review): presumably the type is taken from the chunks themselves —
    /// confirm against the implementation.
42  explicit ChunkedArray(const ArrayVector& chunks);
    /// Construct from a vector of chunks together with an explicit data type.
43  ChunkedArray(const ArrayVector& chunks, const std::shared_ptr<DataType>& type);
44 
    /// \return the value of the length_ member.
    /// NOTE(review): presumably the total length across all chunks — confirm.
46  int64_t length() const { return length_; }
47 
    /// \return the value of the null_count_ member.
49  int64_t null_count() const { return null_count_; }
50 
    /// \return the number of chunks, i.e. chunks_.size() narrowed to int.
51  int num_chunks() const { return static_cast<int>(chunks_.size()); }
52 
    /// \return chunk number i. Indexes chunks_ directly with operator[]; no
    /// range check on i is performed here.
54  std::shared_ptr<Array> chunk(int i) const { return chunks_[i]; }
55 
    /// \return a const reference to the underlying vector of chunks.
56  const ArrayVector& chunks() const { return chunks_; }
57 
    /// Slice with an offset and a length. Declared only; defined elsewhere.
    /// NOTE(review): Column::Slice(offset, length) forwards to this and is
    /// indexed as a zero-copy slice; presumably the same holds here — confirm.
67  std::shared_ptr<ChunkedArray> Slice(int64_t offset, int64_t length) const;
68 
    /// Slice with an offset only. Declared only; defined elsewhere.
    /// NOTE(review): presumably runs from offset to the end, as the index
    /// says for Column::Slice(offset) — confirm.
70  std::shared_ptr<ChunkedArray> Slice(int64_t offset) const;
71 
    /// Declared only; defined elsewhere. Takes a MemoryPool and an output
    /// vector of chunked arrays, and returns a Status.
    /// NOTE(review): the exact flattening semantics are not visible here.
77  Status Flatten(MemoryPool* pool, std::vector<std::shared_ptr<ChunkedArray>>* out) const;
78 
    /// \return the value of the type_ member.
79  std::shared_ptr<DataType> type() const { return type_; }
80 
    /// Equality comparisons against another chunked array, taken by reference
    /// or by shared_ptr. Declared only; defined elsewhere.
81  bool Equals(const ChunkedArray& other) const;
82  bool Equals(const std::shared_ptr<ChunkedArray>& other) const;
83 
84  protected:
    // NOTE(review): source line 85 is missing from this listing. The
    // cross-reference index places `ArrayVector chunks_` at table.h:85, which
    // is the member read by num_chunks(), chunk() and chunks() above.
86  int64_t length_;
87  int64_t null_count_;
88  std::shared_ptr<DataType> type_;
89 
90  private:
    // NOTE(review): source line 91 is missing from this listing. The index
    // references ARROW_DISALLOW_COPY_AND_ASSIGN (macros.h:23), so presumably
    // that macro is invoked here for this class — confirm against the header.
92 };
93 
/// \brief An immutable column data structure consisting of a field (type
/// metadata) and a chunked data array.
///
/// NOTE(review): this is a line-numbered listing; gaps in the embedded
/// numbering are source lines that the listing does not show.
97 class ARROW_EXPORT Column {
98  public:
    /// Construct from a field and a vector of chunks.
99  Column(const std::shared_ptr<Field>& field, const ArrayVector& chunks);
    /// Construct from a field and an existing chunked array.
100  Column(const std::shared_ptr<Field>& field, const std::shared_ptr<ChunkedArray>& data);
101 
    /// Construct from a field and a single array.
102  Column(const std::shared_ptr<Field>& field, const std::shared_ptr<Array>& data);
103 
104  // Construct from name and array
105  Column(const std::string& name, const std::shared_ptr<Array>& data);
106 
    /// \return the length reported by the underlying chunked array.
107  int64_t length() const { return data_->length(); }
108 
    /// \return the null count reported by the underlying chunked array.
109  int64_t null_count() const { return data_->null_count(); }
110 
    /// \return the field held by this column.
111  std::shared_ptr<Field> field() const { return field_; }
112 
    /// \brief The column name.
    /// \return the name of the field, by const reference.
115  const std::string& name() const { return field_->name(); }
116 
    /// \brief The column type.
    /// \return the type of the field.
119  std::shared_ptr<DataType> type() const { return field_->type(); }
120 
    /// \brief The column data as a chunked array.
123  std::shared_ptr<ChunkedArray> data() const { return data_; }
124 
    /// \brief Construct a zero-copy slice of the column with the indicated
    /// offset and length.
    ///
    /// Builds a new Column that reuses field_ and wraps data_->Slice(offset,
    /// length).
134  std::shared_ptr<Column> Slice(int64_t offset, int64_t length) const {
135  return std::make_shared<Column>(field_, data_->Slice(offset, length));
136  }
137 
    /// \brief Slice from offset until end of the column.
    ///
    /// Builds a new Column that reuses field_ and wraps data_->Slice(offset).
139  std::shared_ptr<Column> Slice(int64_t offset) const {
140  return std::make_shared<Column>(field_, data_->Slice(offset));
141  }
142 
    /// Declared only; defined elsewhere. Takes a MemoryPool and an output
    /// vector of columns, and returns a Status.
    /// NOTE(review): the exact flattening semantics are not visible here.
147  Status Flatten(MemoryPool* pool, std::vector<std::shared_ptr<Column>>* out) const;
148 
    /// Equality comparisons against another column, taken by reference or by
    /// shared_ptr. Declared only; defined elsewhere.
149  bool Equals(const Column& other) const;
150  bool Equals(const std::shared_ptr<Column>& other) const;
151 
    /// Declared only; defined elsewhere. Returns a Status. Unlike the other
    /// member functions of this class it is not const-qualified.
    /// NOTE(review): what is validated is not visible here.
154  Status ValidateData();
155 
156  protected:
157  std::shared_ptr<Field> field_;
158  std::shared_ptr<ChunkedArray> data_;
159 
160  private:
    // NOTE(review): source line 161 is missing from this listing. The index
    // references ARROW_DISALLOW_COPY_AND_ASSIGN (macros.h:23), so presumably
    // that macro is invoked here for this class — confirm against the header.
162 };
163 
/// \brief Logical table as sequence of chunked arrays.
///
/// Abstract interface: column access, column add/remove, metadata
/// replacement, flattening and validation are pure virtual, and the
/// constructor is protected.
///
/// NOTE(review): this is a line-numbered listing; gaps in the embedded
/// numbering are source lines that the listing does not show.
166 class ARROW_EXPORT Table {
167  public:
168  virtual ~Table() = default;
169 
    /// Factory taking a schema and a vector of columns.
    /// NOTE(review): num_rows defaults to -1; presumably that means "infer the
    /// row count from the columns" — confirm against the implementation.
175  static std::shared_ptr<Table> Make(const std::shared_ptr<Schema>& schema,
176  const std::vector<std::shared_ptr<Column>>& columns,
177  int64_t num_rows = -1);
178 
    /// Factory taking a schema and a vector of arrays.
    /// NOTE(review): num_rows defaults to -1; presumably that means "infer the
    /// row count from the arrays" — confirm against the implementation.
183  static std::shared_ptr<Table> Make(const std::shared_ptr<Schema>& schema,
184  const std::vector<std::shared_ptr<Array>>& arrays,
185  int64_t num_rows = -1);
186 
    /// Takes a vector of record batches and an output table pointer; returns
    /// a Status. No schema argument is taken by this overload.
    /// NOTE(review): presumably the schema comes from the batches — confirm.
193  static Status FromRecordBatches(
194  const std::vector<std::shared_ptr<RecordBatch>>& batches,
195  std::shared_ptr<Table>* table);
196 
    /// Same as above but with an explicitly supplied schema.
204  static Status FromRecordBatches(
205  const std::shared_ptr<Schema>& schema,
206  const std::vector<std::shared_ptr<RecordBatch>>& batches,
207  std::shared_ptr<Table>* table);
208 
    /// \return the value of the schema_ member.
210  std::shared_ptr<Schema> schema() const { return schema_; }
211 
    /// \return the column at index i. Pure virtual.
214  virtual std::shared_ptr<Column> column(int i) const = 0;
215 
    /// Pure virtual, const-qualified. Takes a column index and an output
    /// table pointer; returns a Status.
    /// NOTE(review): presumably `out` receives a new table without column i.
217  virtual Status RemoveColumn(int i, std::shared_ptr<Table>* out) const = 0;
218 
    /// Pure virtual, const-qualified. Takes a column index, a column and an
    /// output table pointer; returns a Status.
    /// NOTE(review): presumably `out` receives a new table with the column
    /// inserted at position i.
220  virtual Status AddColumn(int i, const std::shared_ptr<Column>& column,
221  std::shared_ptr<Table>* out) const = 0;
222 
    /// Pure virtual, const-qualified. Takes key/value metadata and returns a
    /// table.
    /// NOTE(review): presumably the returned table carries the new schema
    /// metadata — confirm.
228  virtual std::shared_ptr<Table> ReplaceSchemaMetadata(
229  const std::shared_ptr<const KeyValueMetadata>& metadata) const = 0;
230 
    /// Pure virtual. Takes a MemoryPool and an output table pointer; returns
    /// a Status.
    /// NOTE(review): the exact flattening semantics are not visible here.
236  virtual Status Flatten(MemoryPool* pool, std::shared_ptr<Table>* out) const = 0;
237 
    /// Pure virtual. Returns a Status.
    /// NOTE(review): what is validated is not visible here.
239  virtual Status Validate() const = 0;
240 
    /// \return the number of columns, taken from schema_->num_fields().
242  int num_columns() const { return schema_->num_fields(); }
243 
    /// \return the value of the num_rows_ member.
245  int64_t num_rows() const { return num_rows_; }
246 
    /// Equality comparison against another table. Declared only; defined
    /// elsewhere.
248  bool Equals(const Table& other) const;
249 
250  protected:
    // Protected constructor: not callable from outside the class hierarchy.
251  Table();
252 
253  std::shared_ptr<Schema> schema_;
254  int64_t num_rows_;
255 
256  private:
    // NOTE(review): source line 257 is missing from this listing. The index
    // references ARROW_DISALLOW_COPY_AND_ASSIGN (macros.h:23), so presumably
    // that macro is invoked here for this class — confirm against the header.
258 };
259 
/// \brief Compute a sequence of record batches from a (possibly chunked) Table.
///
/// Derives from RecordBatchReader and overrides its destructor, schema() and
/// ReadNext(). All state is held in the nested TableBatchReaderImpl, which is
/// only forward-declared in this header.
261 class ARROW_EXPORT TableBatchReader : public RecordBatchReader {
262  public:
    // Declared here rather than defaulted inline; TableBatchReaderImpl is an
    // incomplete type in this header.
263  ~TableBatchReader() override;
264 
    /// Construct a reader over the given table, taken by const reference.
    /// NOTE(review): whether the table must outlive the reader is not visible
    /// here — confirm against the implementation.
266  explicit TableBatchReader(const Table& table);
267 
    /// Override of RecordBatchReader::schema(). Declared only.
268  std::shared_ptr<Schema> schema() const override;
269 
    /// Override of RecordBatchReader::ReadNext(). Takes an output record batch
    /// pointer and returns a Status. Declared only.
    /// NOTE(review): how the end of the stream is signalled is not visible
    /// here.
270  Status ReadNext(std::shared_ptr<RecordBatch>* out) override;
271 
    /// Setter taking a chunk size. Declared only.
    /// NOTE(review): presumably bounds the number of rows per emitted batch —
    /// confirm against the implementation.
272  void set_chunksize(int64_t chunksize);
273 
274  private:
    // Nested implementation class, forward-declared here, owned via unique_ptr.
275  class TableBatchReaderImpl;
276  std::unique_ptr<TableBatchReaderImpl> impl_;
277 };
278 
/// \brief Construct table from multiple input tables.
///
/// Takes the input tables and an output table pointer; returns a Status.
/// Declared only; defined elsewhere.
/// NOTE(review): requirements on the inputs (e.g. matching schemas) are not
/// visible here — confirm against the implementation.
281 ARROW_EXPORT
282 Status ConcatenateTables(const std::vector<std::shared_ptr<Table>>& tables,
283  std::shared_ptr<Table>* table);
284 
285 } // namespace arrow
286 
287 #endif // ARROW_TABLE_H
int64_t length() const
Definition: table.h:46
std::shared_ptr< ChunkedArray > data_
Definition: table.h:158
int64_t length_
Definition: table.h:86
std::shared_ptr< Field > field() const
Definition: table.h:111
int num_chunks() const
Definition: table.h:51
std::shared_ptr< Column > Slice(int64_t offset, int64_t length) const
Construct a zero-copy slice of the column with the indicated offset and length.
Definition: table.h:134
ArrayVector chunks_
Definition: table.h:85
std::shared_ptr< DataType > type() const
The column type.
Definition: table.h:119
std::shared_ptr< ChunkedArray > data() const
The column data as a chunked array.
Definition: table.h:123
arrow::RecordBatchReader
Abstract interface for reading stream of record batches.
Definition: record_batch.h:166
Status ConcatenateTables(const std::vector< std::shared_ptr< Table >> &tables, std::shared_ptr< Table > *table)
Construct table from multiple input tables.
arrow::Column
An immutable column data structure consisting of a field (type metadata) and a chunked data array...
Definition: table.h:97
std::vector< std::shared_ptr< Array > > ArrayVector
Definition: array.h:302
int64_t num_rows_
Definition: table.h:254
arrow::Status
Definition: status.h:93
std::shared_ptr< Field > field_
Definition: table.h:157
int64_t null_count() const
Definition: table.h:109
const ArrayVector & chunks() const
Definition: table.h:56
std::shared_ptr< DataType > type() const
Definition: table.h:79
const std::string & name() const
The column name.
Definition: table.h:115
std::shared_ptr< Field > field(const std::string &name, const std::shared_ptr< DataType > &type, bool nullable=true, const std::shared_ptr< const KeyValueMetadata > &metadata=NULLPTR)
Create a Field instance.
arrow::TableBatchReader
Compute a sequence of record batches from a (possibly chunked) Table.
Definition: table.h:261
arrow
Top-level namespace for Apache Arrow C++ API.
Definition: adapter.h:32
std::shared_ptr< Array > chunk(int i) const
Definition: table.h:54
int64_t num_rows() const
Definition: table.h:245
arrow::Table
Logical table as sequence of chunked arrays.
Definition: table.h:166
int64_t null_count_
Definition: table.h:87
std::shared_ptr< Column > Slice(int64_t offset) const
Slice from offset until end of the column.
Definition: table.h:139
int num_columns() const
Definition: table.h:242
std::shared_ptr< Schema > schema() const
Definition: table.h:210
std::shared_ptr< Schema > schema_
Definition: table.h:253
std::shared_ptr< DataType > type_
Definition: table.h:88
int64_t length() const
Definition: table.h:107
std::shared_ptr< Schema > schema(const std::vector< std::shared_ptr< Field >> &fields, const std::shared_ptr< const KeyValueMetadata > &metadata=NULLPTR)
Create a Schema instance.
int64_t null_count() const
Definition: table.h:49
arrow::MemoryPool
Base class for memory allocation.
Definition: memory_pool.h:34
arrow::ChunkedArray
A data structure managing a list of primitive Arrow arrays logically as one large array...
Definition: table.h:40
#define ARROW_DISALLOW_COPY_AND_ASSIGN(TypeName)
Definition: macros.h:23