Apache Arrow (C++)
A columnar in-memory analytics layer designed to accelerate big data.
feather-internal.h
Go to the documentation of this file.
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 // Internal metadata helpers for the "Feather" file format, originally created at
19 // http://github.com/wesm/feather
20 
21 #ifndef ARROW_IPC_FEATHER_INTERNAL_H
22 #define ARROW_IPC_FEATHER_INTERNAL_H
23 
24 #include <cstdint>
25 #include <iostream>
26 #include <memory>
27 #include <string>
28 #include <vector>
29 
30 #include "flatbuffers/flatbuffers.h"
31 
32 #include "arrow/buffer.h"
33 #include "arrow/ipc/feather.h"
34 #include "arrow/ipc/feather_generated.h"
35 #include "arrow/type.h"
36 
37 namespace arrow {
38 namespace ipc {
39 namespace feather {
40 
41 typedef std::vector<flatbuffers::Offset<fbs::Column>> ColumnVector;
42 typedef flatbuffers::FlatBufferBuilder FBB;
43 typedef flatbuffers::Offset<flatbuffers::String> FBString;
44 
45 struct ARROW_EXPORT ColumnType {
46  enum type { PRIMITIVE, CATEGORY, TIMESTAMP, DATE, TIME };
47 };
48 
49 struct ARROW_EXPORT ArrayMetadata {
51 
52  ArrayMetadata(fbs::Type type, int64_t offset, int64_t length, int64_t null_count,
53  int64_t total_bytes)
54  : type(type),
55  offset(offset),
56  length(length),
57  null_count(null_count),
58  total_bytes(total_bytes) {}
59 
60  bool Equals(const ArrayMetadata& other) const {
61  return this->type == other.type && this->offset == other.offset &&
62  this->length == other.length && this->null_count == other.null_count &&
63  this->total_bytes == other.total_bytes;
64  }
65 
66  fbs::Type type;
67  int64_t offset;
68  int64_t length;
69  int64_t null_count;
70  int64_t total_bytes;
71 };
72 
73 struct ARROW_EXPORT CategoryMetadata {
75  bool ordered;
76 };
77 
78 struct ARROW_EXPORT TimestampMetadata {
80 
81  // A timezone name known to the Olson timezone database. For display purposes
82  // because the actual data is all UTC
83  std::string timezone;
84 };
85 
86 struct ARROW_EXPORT TimeMetadata {
88 };
89 
// Four-character marker string for Feather files.
static constexpr char const* kFeatherMagicBytes = "FEA1";

// Default alignment value used by the Feather code (presumably in bytes;
// confirm against the writer implementation).
static constexpr int kFeatherDefaultAlignment = 8;
92 
93 class ColumnBuilder;
94 
95 class ARROW_EXPORT TableBuilder {
96  public:
97  explicit TableBuilder(int64_t num_rows);
98  ~TableBuilder() = default;
99 
100  FBB& fbb();
101  Status Finish();
102  std::shared_ptr<Buffer> GetBuffer() const;
103 
104  std::unique_ptr<ColumnBuilder> AddColumn(const std::string& name);
105  void SetDescription(const std::string& description);
106  void SetNumRows(int64_t num_rows);
107  void add_column(const flatbuffers::Offset<fbs::Column>& col);
108 
109  private:
110  flatbuffers::FlatBufferBuilder fbb_;
111  ColumnVector columns_;
112 
113  friend class ColumnBuilder;
114 
115  bool finished_;
116  std::string description_;
117  int64_t num_rows_;
118 };
119 
120 class ARROW_EXPORT TableMetadata {
121  public:
123  ~TableMetadata() = default;
124 
125  Status Open(const std::shared_ptr<Buffer>& buffer) {
126  metadata_buffer_ = buffer;
127  table_ = fbs::GetCTable(buffer->data());
128 
129  if (table_->version() < kFeatherVersion) {
130  std::cout << "This Feather file is old"
131  << " and will not be readable beyond the 0.3.0 release" << std::endl;
132  }
133  return Status::OK();
134  }
135 
136  bool HasDescription() const { return table_->description() != 0; }
137 
138  std::string GetDescription() const {
139  if (!HasDescription()) {
140  return std::string("");
141  }
142  return table_->description()->str();
143  }
144 
145  int version() const { return table_->version(); }
146  int64_t num_rows() const { return table_->num_rows(); }
147  int64_t num_columns() const { return table_->columns()->size(); }
148 
149  const fbs::Column* column(int i) { return table_->columns()->Get(i); }
150 
151  private:
152  std::shared_ptr<Buffer> metadata_buffer_;
153  const fbs::CTable* table_;
154 };
155 
156 static inline flatbuffers::Offset<fbs::PrimitiveArray> GetPrimitiveArray(
157  FBB& fbb, const ArrayMetadata& array) {
158  return fbs::CreatePrimitiveArray(fbb, array.type, fbs::Encoding_PLAIN, array.offset,
159  array.length, array.null_count, array.total_bytes);
160 }
161 
162 static inline fbs::TimeUnit ToFlatbufferEnum(TimeUnit::type unit) {
163  return static_cast<fbs::TimeUnit>(static_cast<int>(unit));
164 }
165 
166 static inline TimeUnit::type FromFlatbufferEnum(fbs::TimeUnit unit) {
167  return static_cast<TimeUnit::type>(static_cast<int>(unit));
168 }
169 
170 // Convert Feather enums to Flatbuffer enums
171 
172 const fbs::TypeMetadata COLUMN_TYPE_ENUM_MAPPING[] = {
173  fbs::TypeMetadata_NONE, // PRIMITIVE
174  fbs::TypeMetadata_CategoryMetadata, // CATEGORY
175  fbs::TypeMetadata_TimestampMetadata, // TIMESTAMP
176  fbs::TypeMetadata_DateMetadata, // DATE
177  fbs::TypeMetadata_TimeMetadata // TIME
178 };
179 
180 static inline fbs::TypeMetadata ToFlatbufferEnum(ColumnType::type column_type) {
181  return COLUMN_TYPE_ENUM_MAPPING[column_type];
182 }
183 
184 static inline void FromFlatbuffer(const fbs::PrimitiveArray* values, ArrayMetadata* out) {
185  out->type = values->type();
186  out->offset = values->offset();
187  out->length = values->length();
188  out->null_count = values->null_count();
189  out->total_bytes = values->total_bytes();
190 }
191 
192 class ARROW_EXPORT ColumnBuilder {
193  public:
194  ColumnBuilder(TableBuilder* parent, const std::string& name);
195  ~ColumnBuilder() = default;
196 
197  flatbuffers::Offset<void> CreateColumnMetadata();
198 
199  Status Finish();
200  void SetValues(const ArrayMetadata& values);
201  void SetUserMetadata(const std::string& data);
202  void SetCategory(const ArrayMetadata& levels, bool ordered = false);
203  void SetTimestamp(TimeUnit::type unit);
204  void SetTimestamp(TimeUnit::type unit, const std::string& timezone);
205  void SetDate();
206  void SetTime(TimeUnit::type unit);
207  FBB& fbb();
208 
209  private:
210  TableBuilder* parent_;
211 
212  std::string name_;
213  ArrayMetadata values_;
214  std::string user_metadata_;
215 
216  // Column metadata
217 
218  // Is this a primitive type, or one of the types having metadata? Default is
219  // primitive
220  ColumnType::type type_;
221 
222  // Type-specific metadata union
223  CategoryMetadata meta_category_;
224  TimeMetadata meta_time_;
225 
226  TimestampMetadata meta_timestamp_;
227 
228  FBB* fbb_;
229 };
230 
231 } // namespace feather
232 } // namespace ipc
233 } // namespace arrow
234 
235 #endif // ARROW_IPC_FEATHER_INTERNAL_H
flatbuffers::Offset< flatbuffers::String > FBString
Definition: feather-internal.h:43
bool ordered
Definition: feather-internal.h:75
int64_t num_columns() const
Definition: feather-internal.h:147
bool Equals(const ArrayMetadata &other) const
Definition: feather-internal.h:60
int64_t length
Definition: feather-internal.h:68
bool HasDescription() const
Definition: feather-internal.h:136
Status Open(const std::shared_ptr< Buffer > &buffer)
Definition: feather-internal.h:125
std::string timezone
Definition: feather-internal.h:83
Definition: feather-internal.h:95
Definition: feather-internal.h:49
Definition: status.h:106
type
Definition: type.h:597
flatbuffers::FlatBufferBuilder FBB
Definition: feather-internal.h:42
Definition: feather-internal.h:120
TimeUnit::type unit
Definition: feather-internal.h:79
Definition: feather-internal.h:192
Definition: feather-internal.h:78
static Status OK()
Definition: status.h:119
std::vector< flatbuffers::Offset< fbs::Column > > ColumnVector
Definition: feather-internal.h:41
Definition: feather-internal.h:73
ArrayMetadata(fbs::Type type, int64_t offset, int64_t length, int64_t null_count, int64_t total_bytes)
Definition: feather-internal.h:52
std::string GetDescription() const
Definition: feather-internal.h:138
int64_t null_count
Definition: feather-internal.h:69
Top-level namespace for Apache Arrow C++ API.
Definition: allocator.h:29
ArrayMetadata levels
Definition: feather-internal.h:74
TableMetadata()
Definition: feather-internal.h:122
fbs::Type type
Definition: feather-internal.h:66
const fbs::Column * column(int i)
Definition: feather-internal.h:149
ArrayMetadata()
Definition: feather-internal.h:50
int64_t num_rows() const
Definition: feather-internal.h:146
Definition: feather-internal.h:45
int64_t offset
Definition: feather-internal.h:67
const fbs::TypeMetadata COLUMN_TYPE_ENUM_MAPPING[]
Definition: feather-internal.h:172
type
Definition: feather-internal.h:46
int version() const
Definition: feather-internal.h:145
TimeUnit::type unit
Definition: feather-internal.h:87
Definition: feather-internal.h:86
int64_t total_bytes
Definition: feather-internal.h:70