Row to columnar conversion¶
Fixed Schemas¶
The following example converts an array of structs to an arrow::Table
instance, and then converts it back to the original array of structs.
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <arrow/api.h>
#include <arrow/result.h>
#include <cstdint>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <memory>
#include <utility>
#include <vector>
using arrow::DoubleBuilder;
using arrow::Int64Builder;
using arrow::ListBuilder;
// While we want to use columnar data structures to build efficient operations, we
// often receive data in a row-wise fashion from other systems. In the following,
// we want to give a brief introduction to the classes provided by Apache Arrow by
// showing how to transform row-wise data into a columnar table.
//
// The table contains an id for a product, the number of components in the product
// and the cost of each component.
//
// The data in this example is stored in the following struct:
// One row of the row-wise input: a product together with the cost of each of
// its components. Field order matters because rows are created elsewhere in
// this file with positional aggregate initialization ({id, components, costs}).
struct data_row {
// Identifier of the product.
int64_t id;
// Number of components in the product.
int64_t components;
// Cost of each component of the product.
std::vector<double> component_cost;
};
// Transforming a vector of structs into a columnar Table.
//
// The final representation should be an `arrow::Table` which in turn
// is made up of an `arrow::Schema` and a list of
// `arrow::ChunkedArray` instances. As the first step, we will iterate
// over the data and build up the arrays incrementally. For this
// task, we provide `arrow::ArrayBuilder` classes that help in the
// construction of the final `arrow::Array` instances.
//
// For each type, Arrow has a specially typed builder class. For the primitive
// values `id` and `components` we can use the `arrow::Int64Builder`. For the
// `component_cost` vector, we need to have two builders, a top-level
// `arrow::ListBuilder` that builds the array of offsets and a nested
// `arrow::DoubleBuilder` that constructs the underlying values array that
// is referenced by the offsets in the former array.
// Converts row-wise `data_row` records into a three-column `arrow::Table`
// with the columns `id` (int64), `components` (int64) and `component_cost`
// (list<double>).
//
// Returns the first non-OK status reported by a builder, or the finished table.
arrow::Result<std::shared_ptr<arrow::Table>> VectorToColumnarTable(
    const std::vector<struct data_row>& rows) {
  // The builders are more efficient using
  // arrow::jemalloc::MemoryPool::default_pool() as this can increase the size of
  // the underlying memory regions in-place. At the moment, arrow::jemalloc is only
  // supported on Unix systems, not Windows.
  arrow::MemoryPool* memory_pool = arrow::default_memory_pool();

  Int64Builder ids(memory_pool);
  Int64Builder component_counts(memory_pool);

  // The list builder shares ownership of the nested values builder; we keep a
  // typed raw pointer so that doubles can be appended to it directly.
  auto cost_values_owner = std::make_shared<DoubleBuilder>(memory_pool);
  DoubleBuilder* cost_values = cost_values_owner.get();
  ListBuilder cost_lists(memory_pool, cost_values_owner);

  // Every append may fail (e.g. when memory cannot be allocated), so each
  // returned `arrow::Status` is checked.
  for (const data_row& row : rows) {
    ARROW_RETURN_NOT_OK(ids.Append(row.id));
    ARROW_RETURN_NOT_OK(component_counts.Append(row.components));

    // Open a new list slot first; this records the current offset into the
    // values builder. Then copy this row's costs into the values builder.
    ARROW_RETURN_NOT_OK(cost_lists.Append());
    const std::vector<double>& costs = row.component_cost;
    ARROW_RETURN_NOT_OK(cost_values->AppendValues(costs.data(), costs.size()));
  }

  // Finalise the arrays. Finishing the list builder also finishes its nested
  // values builder, so the latter is not finished separately.
  std::shared_ptr<arrow::Array> id_column;
  ARROW_RETURN_NOT_OK(ids.Finish(&id_column));

  std::shared_ptr<arrow::Array> components_column;
  ARROW_RETURN_NOT_OK(component_counts.Finish(&components_column));

  std::shared_ptr<arrow::Array> cost_column;
  ARROW_RETURN_NOT_OK(cost_lists.Finish(&cost_column));

  // Declare the (type) schema matching the three arrays.
  std::vector<std::shared_ptr<arrow::Field>> fields;
  fields.push_back(arrow::field("id", arrow::int64()));
  fields.push_back(arrow::field("components", arrow::int64()));
  fields.push_back(arrow::field("component_cost", arrow::list(arrow::float64())));
  auto table_schema = std::make_shared<arrow::Schema>(fields);

  // The table shares ownership of all referenced data, so it stays valid after
  // the builders and local array handles go out of scope.
  std::vector<std::shared_ptr<arrow::Array>> columns = {id_column, components_column,
                                                        cost_column};
  std::shared_ptr<arrow::Table> result = arrow::Table::Make(table_schema, columns);
  return result;
}
// Converts a table with the columns `id` (int64), `components` (int64) and
// `component_cost` (list<double>) back into row-wise `data_row` records.
//
// Returns `Status::Invalid` when `table` is null or its schema differs from the
// expected one. Tables whose columns consist of any number of chunks (including
// none) are supported.
//
// Simplification kept from the original example: null entries are not handled;
// every row is assumed to be filled with valid values.
arrow::Result<std::vector<data_row>> ColumnarTableToVector(
    const std::shared_ptr<arrow::Table>& table) {
  if (table == nullptr) {
    return arrow::Status::Invalid("Table must not be null");
  }

  // The schema alone is enough to check that the table has the expected layout.
  std::vector<std::shared_ptr<arrow::Field>> schema_vector = {
      arrow::field("id", arrow::int64()), arrow::field("components", arrow::int64()),
      arrow::field("component_cost", arrow::list(arrow::float64()))};
  auto expected_schema = std::make_shared<arrow::Schema>(schema_vector);
  if (!expected_schema->Equals(*table->schema())) {
    // The table doesn't have the expected schema thus we cannot directly
    // convert it to our target representation.
    return arrow::Status::Invalid("Schemas are not matching!");
  }

  std::vector<data_row> rows;
  rows.reserve(static_cast<size_t>(table->num_rows()));

  // A table column is a ChunkedArray and may hold zero, one or many chunks.
  // TableBatchReader yields zero-copy record batches whose columns are plain
  // arrays, so every row of the table is visited regardless of the chunk layout.
  arrow::TableBatchReader reader(*table);
  std::shared_ptr<arrow::RecordBatch> batch;
  while (true) {
    ARROW_RETURN_NOT_OK(reader.ReadNext(&batch));
    if (batch == nullptr) {
      break;  // end of table
    }

    // The schema check above guarantees the concrete array types.
    auto ids = std::static_pointer_cast<arrow::Int64Array>(batch->column(0));
    auto components = std::static_pointer_cast<arrow::Int64Array>(batch->column(1));
    auto component_cost = std::static_pointer_cast<arrow::ListArray>(batch->column(2));
    auto component_cost_values =
        std::static_pointer_cast<arrow::DoubleArray>(component_cost->values());

    // raw_values() already accounts for the slicing offset of the values array,
    // and value_offset() for the slicing offset of the list array, so list `i`
    // occupies [value_offset(i), value_offset(i + 1)) relative to this pointer.
    const double* ccv_ptr = component_cost_values->raw_values();

    for (int64_t i = 0; i < batch->num_rows(); i++) {
      const double* first = ccv_ptr + component_cost->value_offset(i);
      const double* last = ccv_ptr + component_cost->value_offset(i + 1);
      std::vector<double> components_vec(first, last);
      rows.push_back({ids->Value(i), components->Value(i), std::move(components_vec)});
    }
  }
  return rows;
}
// Converts a small fixed set of rows to an Arrow table and back again, then
// prints the converted rows as a table on stdout.
//
// Returns the first error reported by either conversion, or `Status::Invalid`
// when the round trip changed the number of rows. The check is a returned
// status rather than an `assert` so that it is also active in builds that
// define NDEBUG.
arrow::Status RunRowConversion() {
  std::vector<data_row> original_rows = {
      {1, 1, {10.0}}, {2, 3, {11.0, 12.0, 13.0}}, {3, 2, {15.0, 25.0}}};
  std::shared_ptr<arrow::Table> table;
  std::vector<data_row> converted_rows;
  ARROW_ASSIGN_OR_RAISE(table, VectorToColumnarTable(original_rows));
  ARROW_ASSIGN_OR_RAISE(converted_rows, ColumnarTableToVector(table));

  if (original_rows.size() != converted_rows.size()) {
    return arrow::Status::Invalid("Round trip changed the number of rows: expected ",
                                  original_rows.size(), ", got ",
                                  converted_rows.size());
  }

  // Print out contents of table, should get
  // ID Components Component prices
  // 1  1          10
  // 2  3          11  12  13
  // 3  2          15  25
  std::cout << std::left << std::setw(3) << "ID " << std::left << std::setw(11)
            << "Components " << std::left << std::setw(15) << "Component prices "
            << std::endl;
  for (const auto& row : converted_rows) {
    std::cout << std::left << std::setw(3) << row.id << std::left << std::setw(11)
              << row.components;
    for (const auto& cost : row.component_cost) {
      std::cout << std::left << std::setw(4) << cost;
    }
    std::cout << std::endl;
  }
  return arrow::Status::OK();
}
int main(int argc, char** argv) {
auto status = RunRowConversion();
if (!status.ok()) {
std::cerr << status.ToString() << std::endl;
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
Dynamic Schemas¶
In many cases, we need to convert to and from row data that does not have a schema known at compile time. To help implement these conversions, this library provides several utilities:
- arrow::RecordBatchBuilder: creates and manages array builders for a full record batch.
- arrow::VisitTypeInline(): dispatches to functions specialized for the given array type.
- Type Traits (such as arrow::enable_if_primitive_ctype): narrow template functions to specific Arrow types, useful in conjunction with the Visitor Pattern.
- arrow::TableBatchReader: reads a table one batch at a time, with each batch being a zero-copy slice.
The following example shows how to implement conversion between rapidjson::Document
and Arrow objects. You can read the full code example at
https://github.com/apache/arrow/blob/main/cpp/examples/arrow/rapidjson_row_converter.cc
Writing conversions to Arrow¶
To convert rows to Arrow record batches, we’ll set up array builders for all the columns and then, for each field, iterate through the row values and append them to the builders. We assume that we already know the target schema, which may have been provided by another system or was inferred in another function. Inferring the schema during conversion is a challenging proposition; many systems will check the first N rows to infer a schema if there is none already available.
At the top level, we define a function ConvertToRecordBatch:
495arrow::Result<std::shared_ptr<arrow::RecordBatch>> ConvertToRecordBatch(
496 const std::vector<rapidjson::Document>& rows, std::shared_ptr<arrow::Schema> schema) {
497 // RecordBatchBuilder will create array builders for us for each field in our
498 // schema. By passing the number of output rows (`rows.size()`) we can
499 // pre-allocate the correct size of arrays, except of course in the case of
500 // string, byte, and list arrays, which have dynamic lengths.
501 std::unique_ptr<arrow::RecordBatchBuilder> batch_builder;
502 ARROW_ASSIGN_OR_RAISE(
503 batch_builder,
504 arrow::RecordBatchBuilder::Make(schema, arrow::default_memory_pool(), rows.size()));
505
506 // Inner converter will take rows and be responsible for appending values
507 // to provided array builders.
508 JsonValueConverter converter(rows);
509 for (int i = 0; i < batch_builder->num_fields(); ++i) {
510 std::shared_ptr<arrow::Field> field = schema->field(i);
511 arrow::ArrayBuilder* builder = batch_builder->GetField(i);
512 ARROW_RETURN_NOT_OK(converter.Convert(*field.get(), builder));
513 }
514
515 std::shared_ptr<arrow::RecordBatch> batch;
516 ARROW_ASSIGN_OR_RAISE(batch, batch_builder->Flush());
517
518 // Use RecordBatch::ValidateFull() to make sure arrays were correctly constructed.
519 DCHECK_OK(batch->ValidateFull());
520 return batch;
521} // ConvertToRecordBatch
First we use arrow::RecordBatchBuilder, which conveniently creates builders
for each field in the schema. Then we iterate over the fields of the schema, get
the builder, and call Convert() on our JsonValueConverter (to be discussed
next). At the end, we call batch->ValidateFull(), which checks the integrity
of our arrays to make sure the conversion was performed correctly; this is useful
for debugging new conversion implementations.
One level down, the JsonValueConverter is responsible for appending row values
for the provided field to a provided array builder. In order to specialize logic
for each data type, it implements Visit methods and calls arrow::VisitTypeInline().
(See more about type visitors in Visitor Pattern.)
At the end of that class is the private method FieldValues(), which returns
an iterator of the column values for the current field across the rows. In
row-based structures that are flat (such as a vector of values) this may be
trivial to implement. But if the schema is nested, as in the case of JSON documents,
a special iterator is needed to navigate the levels of nesting. See the
full example for the implementation details of DocValuesIterator.
323class JsonValueConverter {
324 public:
325 explicit JsonValueConverter(const std::vector<rapidjson::Document>& rows)
326 : rows_(rows), array_levels_(0) {}
327
328 JsonValueConverter(const std::vector<rapidjson::Document>& rows,
329 const std::vector<std::string>& root_path, int64_t array_levels)
330 : rows_(rows), root_path_(root_path), array_levels_(array_levels) {}
331
332 /// \brief For field passed in, append corresponding values to builder
333 arrow::Status Convert(const arrow::Field& field, arrow::ArrayBuilder* builder) {
334 return Convert(field, field.name(), builder);
335 }
336
337 /// \brief For field passed in, append corresponding values to builder
338 arrow::Status Convert(const arrow::Field& field, const std::string& field_name,
339 arrow::ArrayBuilder* builder) {
340 field_name_ = field_name;
341 builder_ = builder;
342 ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field.type().get(), this));
343 return arrow::Status::OK();
344 }
345
346 // Default implementation
347 arrow::Status Visit(const arrow::DataType& type) {
348 return arrow::Status::NotImplemented(
349 "Can not convert json value to Arrow array of type ", type.ToString());
350 }
351
352 arrow::Status Visit(const arrow::Int64Type& type) {
353 arrow::Int64Builder* builder = static_cast<arrow::Int64Builder*>(builder_);
354 for (const auto& maybe_value : FieldValues()) {
355 ARROW_ASSIGN_OR_RAISE(auto value, maybe_value);
356 if (value->IsNull()) {
357 ARROW_RETURN_NOT_OK(builder->AppendNull());
358 } else {
359 if (value->IsUint()) {
360 ARROW_RETURN_NOT_OK(builder->Append(value->GetUint()));
361 } else if (value->IsInt()) {
362 ARROW_RETURN_NOT_OK(builder->Append(value->GetInt()));
363 } else if (value->IsUint64()) {
364 ARROW_RETURN_NOT_OK(builder->Append(value->GetUint64()));
365 } else if (value->IsInt64()) {
366 ARROW_RETURN_NOT_OK(builder->Append(value->GetInt64()));
367 } else {
368 return arrow::Status::Invalid("Value is not an integer");
369 }
370 }
371 }
372 return arrow::Status::OK();
373 }
374
375 arrow::Status Visit(const arrow::DoubleType& type) {
376 arrow::DoubleBuilder* builder = static_cast<arrow::DoubleBuilder*>(builder_);
377 for (const auto& maybe_value : FieldValues()) {
378 ARROW_ASSIGN_OR_RAISE(auto value, maybe_value);
379 if (value->IsNull()) {
380 ARROW_RETURN_NOT_OK(builder->AppendNull());
381 } else {
382 ARROW_RETURN_NOT_OK(builder->Append(value->GetDouble()));
383 }
384 }
385 return arrow::Status::OK();
386 }
387
388 arrow::Status Visit(const arrow::StringType& type) {
389 arrow::StringBuilder* builder = static_cast<arrow::StringBuilder*>(builder_);
390 for (const auto& maybe_value : FieldValues()) {
391 ARROW_ASSIGN_OR_RAISE(auto value, maybe_value);
392 if (value->IsNull()) {
393 ARROW_RETURN_NOT_OK(builder->AppendNull());
394 } else {
395 ARROW_RETURN_NOT_OK(builder->Append(value->GetString()));
396 }
397 }
398 return arrow::Status::OK();
399 }
400
401 arrow::Status Visit(const arrow::BooleanType& type) {
402 arrow::BooleanBuilder* builder = static_cast<arrow::BooleanBuilder*>(builder_);
403 for (const auto& maybe_value : FieldValues()) {
404 ARROW_ASSIGN_OR_RAISE(auto value, maybe_value);
405 if (value->IsNull()) {
406 ARROW_RETURN_NOT_OK(builder->AppendNull());
407 } else {
408 ARROW_RETURN_NOT_OK(builder->Append(value->GetBool()));
409 }
410 }
411 return arrow::Status::OK();
412 }
413
414 arrow::Status Visit(const arrow::StructType& type) {
415 arrow::StructBuilder* builder = static_cast<arrow::StructBuilder*>(builder_);
416
417 std::vector<std::string> child_path(root_path_);
418 if (field_name_.size() > 0) {
419 child_path.push_back(field_name_);
420 }
421 auto child_converter = JsonValueConverter(rows_, child_path, array_levels_);
422
423 for (int i = 0; i < type.num_fields(); ++i) {
424 std::shared_ptr<arrow::Field> child_field = type.field(i);
425 std::shared_ptr<arrow::ArrayBuilder> child_builder = builder->child_builder(i);
426
427 ARROW_RETURN_NOT_OK(
428 child_converter.Convert(*child_field.get(), child_builder.get()));
429 }
430
431 // Make null bitmap
432 for (const auto& maybe_value : FieldValues()) {
433 ARROW_ASSIGN_OR_RAISE(auto value, maybe_value);
434 ARROW_RETURN_NOT_OK(builder->Append(!value->IsNull()));
435 }
436
437 return arrow::Status::OK();
438 }
439
440 arrow::Status Visit(const arrow::ListType& type) {
441 arrow::ListBuilder* builder = static_cast<arrow::ListBuilder*>(builder_);
442
443 // Values and offsets needs to be interleaved in ListBuilder, so first collect the
444 // values
445 std::unique_ptr<arrow::ArrayBuilder> tmp_value_builder;
446 ARROW_ASSIGN_OR_RAISE(tmp_value_builder,
447 arrow::MakeBuilder(builder->value_builder()->type()));
448 std::vector<std::string> child_path(root_path_);
449 child_path.push_back(field_name_);
450 auto child_converter = JsonValueConverter(rows_, child_path, array_levels_ + 1);
451 ARROW_RETURN_NOT_OK(
452 child_converter.Convert(*type.value_field().get(), "", tmp_value_builder.get()));
453
454 std::shared_ptr<arrow::Array> values_array;
455 ARROW_RETURN_NOT_OK(tmp_value_builder->Finish(&values_array));
456 std::shared_ptr<arrow::ArrayData> values_data = values_array->data();
457
458 arrow::ArrayBuilder* value_builder = builder->value_builder();
459 int64_t offset = 0;
460 for (const auto& maybe_value : FieldValues()) {
461 ARROW_ASSIGN_OR_RAISE(auto value, maybe_value);
462 ARROW_RETURN_NOT_OK(builder->Append(!value->IsNull()));
463 if (!value->IsNull() && value->Size() > 0) {
464 ARROW_RETURN_NOT_OK(
465 value_builder->AppendArraySlice(*values_data.get(), offset, value->Size()));
466 offset += value->Size();
467 }
468 }
469
470 return arrow::Status::OK();
471 }
472
473 private:
474 std::string field_name_;
475 arrow::ArrayBuilder* builder_;
476 const std::vector<rapidjson::Document>& rows_;
477 std::vector<std::string> root_path_;
478 int64_t array_levels_;
479
480 /// Return a flattened iterator over values at nested location
481 arrow::Iterator<const rapidjson::Value*> FieldValues() {
482 std::vector<std::string> path(root_path_);
483 if (field_name_.size() > 0) {
484 path.push_back(field_name_);
485 }
486 auto iter = DocValuesIterator(rows_, std::move(path), array_levels_);
487 auto fn = [iter]() mutable -> arrow::Result<const rapidjson::Value*> {
488 return iter.Next();
489 };
490
491 return arrow::MakeFunctionIterator(fn);
492 }
493}; // JsonValueConverter
Writing conversions from Arrow¶
To convert into rows from Arrow record batches, we’ll process the table in smaller batches, visiting each field of the batch and filling the output rows column-by-column.
At the top level, we define ArrowToDocumentConverter, which provides the API
for converting Arrow batches and tables to rows. In many cases, it’s more efficient
to perform conversions to rows in smaller batches, rather than doing the entire
table at once. So we define one ConvertToVector method to convert a single
batch; then, in the other conversion method, we use arrow::TableBatchReader
to iterate over slices of a table. This returns Arrow’s iterator type
(arrow::Iterator), so rows can then be processed either one at a time
or be collected into a container.
179class ArrowToDocumentConverter {
180 public:
181 /// Convert a single batch of Arrow data into Documents
182 arrow::Result<std::vector<rapidjson::Document>> ConvertToVector(
183 std::shared_ptr<arrow::RecordBatch> batch) {
184 RowBatchBuilder builder{batch->num_rows()};
185
186 for (int i = 0; i < batch->num_columns(); ++i) {
187 builder.SetField(batch->schema()->field(i).get());
188 ARROW_RETURN_NOT_OK(arrow::VisitArrayInline(*batch->column(i).get(), &builder));
189 }
190
191 return std::move(builder).Rows();
192 }
193
194 /// Convert an Arrow table into an iterator of Documents
195 arrow::Iterator<rapidjson::Document> ConvertToIterator(
196 std::shared_ptr<arrow::Table> table, size_t batch_size) {
197 // Use TableBatchReader to divide table into smaller batches. The batches
198 // created are zero-copy slices with *at most* `batch_size` rows.
199 auto batch_reader = std::make_shared<arrow::TableBatchReader>(*table);
200 batch_reader->set_chunksize(batch_size);
201
202 auto read_batch = [this](const std::shared_ptr<arrow::RecordBatch>& batch)
203 -> arrow::Result<arrow::Iterator<rapidjson::Document>> {
204 ARROW_ASSIGN_OR_RAISE(auto rows, ConvertToVector(batch));
205 return arrow::MakeVectorIterator(std::move(rows));
206 };
207
208 auto nested_iter = arrow::MakeMaybeMapIterator(
209 read_batch, arrow::MakeIteratorFromReader(std::move(batch_reader)));
210
211 return arrow::MakeFlattenIterator(std::move(nested_iter));
212 }
213}; // ArrowToDocumentConverter
One level down, the output rows are filled in by RowBatchBuilder.
The RowBatchBuilder implements Visit() methods, but to save on code we
write a template method for array types that have primitive C equivalents
(booleans, integers, and floats) using arrow::enable_if_primitive_ctype.
See Type Traits for other type predicates.
57class RowBatchBuilder {
58 public:
59 explicit RowBatchBuilder(int64_t num_rows) : field_(nullptr) {
60 // Reserve all of the space required up-front to avoid unnecessary resizing
61 rows_.reserve(num_rows);
62
63 for (int64_t i = 0; i < num_rows; ++i) {
64 rows_.push_back(rapidjson::Document());
65 rows_[i].SetObject();
66 }
67 }
68
69 /// \brief Set which field to convert.
70 void SetField(const arrow::Field* field) { field_ = field; }
71
72 /// \brief Retrieve converted rows from builder.
73 std::vector<rapidjson::Document> Rows() && { return std::move(rows_); }
74
75 // Default implementation
76 arrow::Status Visit(const arrow::Array& array) {
77 return arrow::Status::NotImplemented(
78 "Can not convert to json document for array of type ", array.type()->ToString());
79 }
80
81 // Handles booleans, integers, floats
82 template <typename ArrayType, typename DataClass = typename ArrayType::TypeClass>
83 arrow::enable_if_primitive_ctype<DataClass, arrow::Status> Visit(
84 const ArrayType& array) {
85 assert(static_cast<int64_t>(rows_.size()) == array.length());
86 for (int64_t i = 0; i < array.length(); ++i) {
87 if (!array.IsNull(i)) {
88 rapidjson::Value str_key(field_->name(), rows_[i].GetAllocator());
89 rows_[i].AddMember(str_key, array.Value(i), rows_[i].GetAllocator());
90 }
91 }
92 return arrow::Status::OK();
93 }
94
95 arrow::Status Visit(const arrow::StringArray& array) {
96 assert(static_cast<int64_t>(rows_.size()) == array.length());
97 for (int64_t i = 0; i < array.length(); ++i) {
98 if (!array.IsNull(i)) {
99 rapidjson::Value str_key(field_->name(), rows_[i].GetAllocator());
100 std::string_view value_view = array.Value(i);
101 rapidjson::Value value;
102 value.SetString(value_view.data(),
103 static_cast<rapidjson::SizeType>(value_view.size()),
104 rows_[i].GetAllocator());
105 rows_[i].AddMember(str_key, value, rows_[i].GetAllocator());
106 }
107 }
108 return arrow::Status::OK();
109 }
110
111 arrow::Status Visit(const arrow::StructArray& array) {
112 const arrow::StructType* type = array.struct_type();
113
114 assert(static_cast<int64_t>(rows_.size()) == array.length());
115
116 RowBatchBuilder child_builder(rows_.size());
117 for (int i = 0; i < type->num_fields(); ++i) {
118 const arrow::Field* child_field = type->field(i).get();
119 child_builder.SetField(child_field);
120 ARROW_RETURN_NOT_OK(arrow::VisitArrayInline(*array.field(i).get(), &child_builder));
121 }
122 std::vector<rapidjson::Document> rows = std::move(child_builder).Rows();
123
124 for (int64_t i = 0; i < array.length(); ++i) {
125 if (!array.IsNull(i)) {
126 rapidjson::Value str_key(field_->name(), rows_[i].GetAllocator());
127 // Must copy value to new allocator
128 rapidjson::Value row_val;
129 row_val.CopyFrom(rows[i], rows_[i].GetAllocator());
130 rows_[i].AddMember(str_key, row_val, rows_[i].GetAllocator());
131 }
132 }
133 return arrow::Status::OK();
134 }
135
136 arrow::Status Visit(const arrow::ListArray& array) {
137 assert(static_cast<int64_t>(rows_.size()) == array.length());
138 // First create rows from values
139 std::shared_ptr<arrow::Array> values = array.values();
140 RowBatchBuilder child_builder(values->length());
141 const arrow::Field* value_field = array.list_type()->value_field().get();
142 std::string value_field_name = value_field->name();
143 child_builder.SetField(value_field);
144 ARROW_RETURN_NOT_OK(arrow::VisitArrayInline(*values.get(), &child_builder));
145
146 std::vector<rapidjson::Document> rows = std::move(child_builder).Rows();
147
148 int64_t values_i = 0;
149 for (int64_t i = 0; i < array.length(); ++i) {
150 if (array.IsNull(i)) continue;
151
152 rapidjson::Document::AllocatorType& allocator = rows_[i].GetAllocator();
153 auto array_len = array.value_length(i);
154
155 rapidjson::Value value;
156 value.SetArray();
157 value.Reserve(array_len, allocator);
158
159 for (int64_t j = 0; j < array_len; ++j) {
160 rapidjson::Value row_val;
161 // Must copy value to new allocator
162 row_val.CopyFrom(rows[values_i][value_field_name], allocator);
163 value.PushBack(row_val, allocator);
164 ++values_i;
165 }
166
167 rapidjson::Value str_key(field_->name(), allocator);
168 rows_[i].AddMember(str_key, value, allocator);
169 }
170
171 return arrow::Status::OK();
172 }
173
174 private:
175 const arrow::Field* field_;
176 std::vector<rapidjson::Document> rows_;
177}; // RowBatchBuilder