Apache Arrow (C++)
A columnar in-memory analytics layer designed to accelerate big data.
metadata-internal.h
Go to the documentation of this file.
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 // Internal metadata serialization matters
19 
20 #ifndef ARROW_IPC_METADATA_INTERNAL_H
21 #define ARROW_IPC_METADATA_INTERNAL_H
22 
23 #include <cstdint>
24 #include <cstring>
25 #include <memory>
26 #include <string>
27 #include <vector>
28 
29 #include <flatbuffers/flatbuffers.h>
30 
31 #include "arrow/buffer.h"
32 #include "arrow/ipc/Schema_generated.h"
33 #include "arrow/ipc/dictionary.h" // IWYU pragma: keep
34 #include "arrow/ipc/message.h"
35 #include "arrow/memory_pool.h"
36 #include "arrow/status.h"
37 
38 namespace arrow {
39 
// Forward declarations. This header only refers to these types through
// references and std::shared_ptr, so their full definitions are not needed here.
40 class DataType;
41 class Schema;
42 class Tensor;
43 
// Short alias used throughout this header for the org::apache::arrow::flatbuf
// namespace. NOTE(review): presumably the namespace of the types generated in
// arrow/ipc/Schema_generated.h -- confirm.
44 namespace flatbuf = org::apache::arrow::flatbuf;
45 
// Forward declaration of io::OutputStream, which WriteMessage and
// WriteFileFooter below accept by pointer.
46 namespace io {
47 
48 class OutputStream;
49 
50 } // namespace io
51 
52 namespace ipc {
53 
// Forward declaration; the functions below take DictionaryMemo only by
// const reference or by pointer.
54 class DictionaryMemo;
55 
56 namespace internal {
57 
// Flatbuffer-level metadata version constants. Both are currently V4.
// NOTE(review): presumably kCurrentMetadataVersion is what writers stamp on new
// messages and kMinMetadataVersion is the oldest version readers accept --
// confirm against the implementation file.
58 static constexpr flatbuf::MetadataVersion kCurrentMetadataVersion =
59  flatbuf::MetadataVersion_V4;
60 
61 static constexpr flatbuf::MetadataVersion kMinMetadataVersion =
62  flatbuf::MetadataVersion_V4;
63 
// Map a Flatbuffers-level flatbuf::MetadataVersion value to the MetadataVersion
// type used by the rest of the IPC API.
//
// \param[in] version the flatbuf enum value to translate
// \return the corresponding MetadataVersion
64 MetadataVersion GetMetadataVersion(flatbuf::MetadataVersion version);
65 
// Six-character magic string. NOTE(review): presumably written at the start
// and/or end of Arrow files to identify the format -- confirm in the file
// reader/writer.
66 static constexpr const char* kArrowMagicBytes = "ARROW1";
67 
// Plain aggregate describing one field node. Vectors of these are passed as the
// `nodes` argument of WriteRecordBatchMessage and WriteDictionaryMessage.
// NOTE(review): field meanings below are inferred from their names -- confirm.
68 struct FieldMetadata {
 // Presumably the number of values in the field.
69  int64_t length;
 // Presumably the number of null values among them.
70  int64_t null_count;
 // Presumably the logical starting offset of the field's values.
71  int64_t offset;
72 };
73 
// Plain aggregate locating one buffer by position and size. Vectors of these
// are passed as the `buffers` argument of WriteRecordBatchMessage and
// WriteDictionaryMessage.
// NOTE(review): the listing skips numbers 75 and 78, so the original per-field
// doc comments appear to be missing here; the notes below are assumptions.
74 struct BufferMetadata {
 // Presumably the byte offset at which the buffer starts -- confirm what it is
 // relative to.
76  int64_t offset;
77 
 // Presumably the buffer size in bytes.
79  int64_t length;
80 };
81 
// Plain aggregate describing one block referenced from the file footer.
// WriteFileFooter takes one vector of these for dictionaries and one for
// record batches.
// NOTE(review): field meanings below are inferred from their names -- confirm.
82 struct FileBlock {
 // Presumably the position of the block within the file.
83  int64_t offset;
 // Presumably the size of the block's metadata part (note: 32-bit, unlike the
 // other two fields).
84  int32_t metadata_length;
 // Presumably the size of the block's body part.
85  int64_t body_length;
86 };
87 
88 // Read interface classes. We do not fully deserialize the flatbuffers so that
89 // individual fields' metadata can be retrieved from very large schemas without
90 // deserializing the entire schema.
91 
92 // Retrieve a list of all the dictionary ids and types required by the schema for
93 // reconstruction. The presumption is that these will be loaded either from
94 // the stream or file (or they may already be somewhere else in memory).
//
// \param[in] opaque_schema untyped pointer to the serialized schema;
// NOTE(review): presumably a flatbuf::Schema -- confirm in the implementation
// \param[out] id_to_field map from dictionary id to Field that receives the result
// \return Status outcome
95 Status GetDictionaryTypes(const void* opaque_schema, DictionaryTypeMap* id_to_field);
96 
97 // Construct a complete Schema from the message. May be expensive for very
98 // large schemas if you are only interested in a few fields.
//
// \param[in] opaque_schema untyped pointer to the serialized schema;
// NOTE(review): presumably a flatbuf::Schema -- confirm in the implementation
// \param[in] dictionary_memo read-only dictionary tracking state consulted
// while building the Schema
// \param[out] out receives the reconstructed Schema
// \return Status outcome
99 Status GetSchema(const void* opaque_schema, const DictionaryMemo& dictionary_memo,
100  std::shared_ptr<Schema>* out);
101 
// Read the description of a tensor out of a metadata buffer.
//
// \param[in] metadata buffer holding the tensor metadata; NOTE(review):
// presumably a serialized Tensor message -- confirm in the implementation
// \param[out] type receives the tensor's value type
// \param[out] shape receives the tensor's shape
// \param[out] strides receives the tensor's strides
// \param[out] dim_names receives the tensor's dimension names
// \return Status outcome
102 Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type,
103  std::vector<int64_t>* shape, std::vector<int64_t>* strides,
104  std::vector<std::string>* dim_names);
105 
// Write a message buffer to an output stream.
// NOTE(review): the original doc comment (listing lines 106-118) is missing
// here, so the parameter notes below are assumptions to be confirmed against
// the implementation.
//
// \param[in] message the message bytes to write
// \param[in] alignment presumably the byte boundary the written message is
// padded to
// \param[in,out] file the stream written to
// \param[out] message_length presumably the number of bytes actually written
// \return Status outcome
119 Status WriteMessage(const Buffer& message, int32_t alignment, io::OutputStream* file,
120  int32_t* message_length);
121 
122 // Serialize an arrow::Schema as a Flatbuffer into a newly created Buffer.
123 //
124 // \param[in] schema a Schema instance
125 // \param[in,out] dictionary_memo class for tracking dictionaries and assigning
126 // dictionary ids
127 // \param[out] out the serialized arrow::Buffer
128 // \return Status outcome
129 Status WriteSchemaMessage(const Schema& schema, DictionaryMemo* dictionary_memo,
130  std::shared_ptr<Buffer>* out);
131 
// Serialize the metadata of a record batch into a newly created Buffer.
// NOTE(review): parameter meanings are inferred from names and types -- confirm.
//
// \param[in] length presumably the number of rows in the batch
// \param[in] body_length presumably the size in bytes of the batch body
// \param[in] nodes one FieldMetadata per field node
// \param[in] buffers one BufferMetadata per buffer
// \param[out] out receives the serialized message
// \return Status outcome
132 Status WriteRecordBatchMessage(const int64_t length, const int64_t body_length,
133  const std::vector<FieldMetadata>& nodes,
134  const std::vector<BufferMetadata>& buffers,
135  std::shared_ptr<Buffer>* out);
136 
// Serialize the metadata of a Tensor into a newly created Buffer.
//
// \param[in] tensor the tensor to describe
// \param[in] buffer_start_offset NOTE(review): presumably the position at which
// the tensor's data buffer starts in the output -- confirm
// \param[out] out receives the serialized message
// \return Status outcome
137 Status WriteTensorMessage(const Tensor& tensor, const int64_t buffer_start_offset,
138  std::shared_ptr<Buffer>* out);
139 
// Write the file footer to an output stream.
//
// \param[in] schema the schema recorded in the footer
// \param[in] dictionaries FileBlock entries for the dictionaries
// \param[in] record_batches FileBlock entries for the record batches
// \param[in,out] dictionary_memo dictionary tracking state; taken by non-const
// pointer, so it may be updated
// \param[in,out] out the stream the footer is written to
// \return Status outcome
140 Status WriteFileFooter(const Schema& schema, const std::vector<FileBlock>& dictionaries,
141  const std::vector<FileBlock>& record_batches,
142  DictionaryMemo* dictionary_memo, io::OutputStream* out);
143 
// Serialize the metadata of a dictionary into a newly created Buffer. Takes the
// same arguments as WriteRecordBatchMessage plus the dictionary id.
// NOTE(review): parameter meanings are inferred from names and types -- confirm.
//
// \param[in] id the dictionary id
// \param[in] length presumably the number of values in the dictionary
// \param[in] body_length presumably the size in bytes of the message body
// \param[in] nodes one FieldMetadata per field node
// \param[in] buffers one BufferMetadata per buffer
// \param[out] out receives the serialized message
// \return Status outcome
144 Status WriteDictionaryMessage(const int64_t id, const int64_t length,
145  const int64_t body_length,
146  const std::vector<FieldMetadata>& nodes,
147  const std::vector<BufferMetadata>& buffers,
148  std::shared_ptr<Buffer>* out);
149 
// Copy the bytes currently held by a FlatBufferBuilder into a newly allocated
// Buffer obtained from the default memory pool.
//
// \param[in] fbb builder whose contents (GetSize() bytes starting at
// GetBufferPointer()) are copied; NOTE(review): presumably the caller has
// already finished the builder -- confirm at call sites
// \param[out] out receives the newly allocated Buffer holding the copy
// \return the error reported by AllocateBuffer if allocation fails, otherwise
// Status::OK()
150 static inline Status WriteFlatbufferBuilder(flatbuffers::FlatBufferBuilder& fbb,
151  std::shared_ptr<Buffer>* out) {
152  int32_t size = fbb.GetSize();
153 
154  std::shared_ptr<Buffer> result;
 // Allocate the destination before using it: without this statement `result`
 // is still a null pointer when mutable_data() is called below.
155  RETURN_NOT_OK(AllocateBuffer(default_memory_pool(), size, &result));
156 
157  uint8_t* dst = result->mutable_data();
158  memcpy(dst, fbb.GetBufferPointer(), size);
159  *out = result;
160  return Status::OK();
161 }
162 
163 } // namespace internal
164 } // namespace ipc
165 } // namespace arrow
166 
167 #endif // ARROW_IPC_METADATA_INTERNAL_H
std::unordered_map< int64_t, std::shared_ptr< Field > > DictionaryTypeMap
Definition: dictionary.h:39
#define RETURN_NOT_OK(s)
Definition: status.h:65
MemoryPool * default_memory_pool()
Status AllocateBuffer(MemoryPool *pool, const int64_t size, std::shared_ptr< Buffer > *out)
Allocate a fixed size mutable buffer from a memory pool, zero its padding.
static Status OK()
Definition: status.h:124
Top-level namespace for Apache Arrow C++ API.
Definition: adapter.h:32
MetadataVersion
Definition: message.h:46
std::shared_ptr< Schema > schema(const std::vector< std::shared_ptr< Field >> &fields, const std::shared_ptr< const KeyValueMetadata > &metadata=NULLPTR)
Create a Schema instance.
::arrow::Buffer Buffer
Definition: memory.h:54