Apache Arrow (C++)
A columnar in-memory analytics layer designed to accelerate big data.
metadata-internal.h
Go to the documentation of this file.
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 // Internal metadata serialization matters
19 
20 #ifndef ARROW_IPC_METADATA_INTERNAL_H
21 #define ARROW_IPC_METADATA_INTERNAL_H
22 
23 #include <cstdint>
24 #include <memory>
25 #include <string>
26 #include <vector>
27 
28 #include "arrow/ipc/Schema_generated.h"
29 #include "arrow/ipc/dictionary.h"
30 
31 namespace arrow {
32 
33 class Buffer;
34 class DataType;
35 class Schema;
36 class Status;
37 class Tensor;
38 
39 namespace flatbuf = org::apache::arrow::flatbuf;
40 
41 namespace io {
42 
43 class OutputStream;
44 
45 } // namespace io
46 
47 namespace ipc {
48 namespace internal {
49 
// Flatbuffer metadata version constants for the IPC layer. Both are set to
// MetadataVersion_V3 here.
// NOTE(review): presumably kCurrentMetadataVersion is the version emitted by
// the writers in this file -- confirm in the implementation.
50 static constexpr flatbuf::MetadataVersion kCurrentMetadataVersion =
51  flatbuf::MetadataVersion_V3;
52 
// NOTE(review): presumably the oldest metadata version accepted when reading;
// currently identical to kCurrentMetadataVersion -- confirm against the readers.
53 static constexpr flatbuf::MetadataVersion kMinMetadataVersion =
54  flatbuf::MetadataVersion_V3;
55 
// Magic string constant "ARROW1".
// NOTE(review): presumably the marker delimiting the Arrow file format --
// confirm where it is written/checked.
56 static constexpr const char* kArrowMagicBytes = "ARROW1";
57 
// Plain aggregate of three 64-bit values describing one field node. Vectors of
// these are passed as the `nodes` argument of WriteRecordBatchMessage and
// WriteDictionaryMessage.
// NOTE(review): units are not visible in this header; presumably `length` and
// `null_count` are element counts and `offset` is an element offset -- confirm.
58 struct FieldMetadata {
59  int64_t length;
60  int64_t null_count;
61  int64_t offset;
62 };
63 
// Plain aggregate describing one buffer: a 32-bit page value plus a 64-bit
// offset and length. Vectors of these are passed as the `buffers` argument of
// WriteRecordBatchMessage and WriteDictionaryMessage.
// NOTE(review): the listing's line numbers skip 65, 68 and 71, so per-member
// doc comments were likely dropped from this rendering; consult the original
// header for the exact meaning of each member.
64 struct BufferMetadata {
66  int32_t page;  // NOTE(review): presumably a memory page id -- confirm
67 
69  int64_t offset;  // NOTE(review): presumably a byte offset -- confirm
70 
72  int64_t length;  // NOTE(review): presumably a length in bytes -- confirm
73 };
74 
// Plain aggregate locating one block: an offset, a 32-bit metadata length and
// a 64-bit body length. WriteFileFooter takes two vectors of these, one for
// dictionaries and one for record batches.
// NOTE(review): presumably `offset` is a byte position within the file and the
// two lengths are byte counts -- confirm against the file writer.
75 struct FileBlock {
76  int64_t offset;
77  int32_t metadata_length;
78  int64_t body_length;
79 };
80 
81 // Read interface classes. We do not fully deserialize the flatbuffers so that
82 // individual field metadata can be retrieved from a very large schema without
83 // deserializing the whole schema.
84 
85 // Retrieve all the dictionary ids and types that the schema requires for
86 // reconstruction. The presumption is that the dictionaries themselves will be
87 // loaded either from the stream or file (or may already be elsewhere in memory).
//
// \param[in] opaque_schema untyped pointer to the schema metadata
//   (NOTE(review): presumably a flatbuf::Schema -- confirm in the implementation)
// \param[out] id_to_field map from dictionary id to field, populated by this call
// \return Status outcome
88 Status GetDictionaryTypes(const void* opaque_schema, DictionaryTypeMap* id_to_field);
89 
90 // Construct a complete Schema from the message. This may be expensive for very
91 // large schemas if you are only interested in a few fields.
//
// \param[in] opaque_schema untyped pointer to the schema metadata
//   (NOTE(review): presumably a flatbuf::Schema -- confirm in the implementation)
// \param[in] dictionary_memo dictionary tracker consulted during construction
// \param[out] out receives the constructed Schema
// \return Status outcome
92 Status GetSchema(const void* opaque_schema, const DictionaryMemo& dictionary_memo,
93  std::shared_ptr<Schema>* out);
94 
// Extract the description of a tensor from a metadata buffer.
//
// \param[in] metadata buffer holding the tensor metadata
//   (NOTE(review): presumably a serialized Flatbuffer message -- confirm)
// \param[out] type receives the tensor's value type
// \param[out] shape receives the dimension sizes
// \param[out] strides receives the per-dimension strides
// \param[out] dim_names receives the dimension names
// \return Status outcome
95 Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type,
96  std::vector<int64_t>* shape, std::vector<int64_t>* strides,
97  std::vector<std::string>* dim_names);
98 
// Write a message buffer to an output stream.
// NOTE(review): the listing's line numbers skip 99-102, so the original doc
// comment (which presumably specifies framing/padding) is missing from this
// rendering; consult the original header before relying on the layout.
//
// \param[in] message the message to write
// \param[in] file destination output stream
// \param[out] message_length receives the message length as a 32-bit value
//   (NOTE(review): whether this includes any prefix/padding is not visible here)
// \return Status outcome
103 Status WriteMessage(const Buffer& message, io::OutputStream* file,
104  int32_t* message_length);
105 
106 // Serialize an arrow::Schema as a Flatbuffer and return it in a Buffer.
107 //
108 // \param[in] schema the Schema instance to serialize
109 // \param[inout] dictionary_memo class for tracking dictionaries and assigning
110 // dictionary ids
111 // \param[out] out receives the serialized arrow::Buffer
112 // \return Status outcome
113 Status WriteSchemaMessage(const Schema& schema, DictionaryMemo* dictionary_memo,
114  std::shared_ptr<Buffer>* out);
115 
// Build the metadata message for a record batch and return it in a Buffer.
//
// \param[in] length length of the record batch
//   (NOTE(review): presumably the number of rows -- confirm)
// \param[in] body_length length of the message body
//   (NOTE(review): presumably in bytes -- confirm)
// \param[in] nodes one FieldMetadata entry per field node
// \param[in] buffers one BufferMetadata entry per buffer
// \param[out] out receives the resulting Buffer
// \return Status outcome
116 Status WriteRecordBatchMessage(const int64_t length, const int64_t body_length,
117  const std::vector<FieldMetadata>& nodes,
118  const std::vector<BufferMetadata>& buffers,
119  std::shared_ptr<Buffer>* out);
120 
// Build the metadata message for a tensor and return it in a Buffer.
//
// \param[in] tensor the Tensor to describe
// \param[in] buffer_start_offset offset at which the tensor's data buffer starts
//   (NOTE(review): presumably a byte offset within the message body -- confirm)
// \param[out] out receives the resulting Buffer
// \return Status outcome
121 Status WriteTensorMessage(const Tensor& tensor, const int64_t buffer_start_offset,
122  std::shared_ptr<Buffer>* out);
123 
// Write a file footer to an output stream.
//
// \param[in] schema the Schema of the file
// \param[in] dictionaries FileBlock entries for the dictionaries
// \param[in] record_batches FileBlock entries for the record batches
// \param[inout] dictionary_memo dictionary tracker (passed as a mutable pointer)
// \param[in] out destination output stream
// \return Status outcome
124 Status WriteFileFooter(const Schema& schema, const std::vector<FileBlock>& dictionaries,
125  const std::vector<FileBlock>& record_batches,
126  DictionaryMemo* dictionary_memo, io::OutputStream* out);
127 
// Build the metadata message for a dictionary and return it in a Buffer. Takes
// the same arguments as WriteRecordBatchMessage plus a dictionary id.
//
// \param[in] id the dictionary id
// \param[in] length length of the dictionary
//   (NOTE(review): presumably the number of entries -- confirm)
// \param[in] body_length length of the message body
//   (NOTE(review): presumably in bytes -- confirm)
// \param[in] nodes one FieldMetadata entry per field node
// \param[in] buffers one BufferMetadata entry per buffer
// \param[out] out receives the resulting Buffer
// \return Status outcome
128 Status WriteDictionaryMessage(const int64_t id, const int64_t length,
129  const int64_t body_length,
130  const std::vector<FieldMetadata>& nodes,
131  const std::vector<BufferMetadata>& buffers,
132  std::shared_ptr<Buffer>* out);
133 
134 } // namespace internal
135 } // namespace ipc
136 } // namespace arrow
137 
138 #endif // ARROW_IPC_METADATA_INTERNAL_H
std::unordered_map< int64_t, std::shared_ptr< Field > > DictionaryTypeMap
Definition: dictionary.h:39
Top-level namespace for Apache Arrow C++ API.
Definition: allocator.h:29
MetadataVersion
Definition: message.h:45
std::shared_ptr< Schema > schema(const std::vector< std::shared_ptr< Field >> &fields, const std::shared_ptr< const KeyValueMetadata > &metadata=NULLPTR)
Create a Schema instance.