Row to columnar conversion#

Fixed Schemas#

The following example converts an array of structs to an arrow::Table instance, and then converts it back to the original array of structs.

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <arrow/api.h>
#include <arrow/result.h>

#include <cstdint>
#include <iomanip>
#include <iostream>
#include <vector>

using arrow::DoubleBuilder;
using arrow::Int64Builder;
using arrow::ListBuilder;

// While we want to use columnar data structures to build efficient operations, we
// often receive data in a row-wise fashion from other systems. In the following,
// we want to give a brief introduction into the classes provided by Apache Arrow by
// showing how to transform row-wise data into a columnar table.
//
// The table contains an id for a product, the number of components in the product
// and the cost of each component.
//
// The data in this example is stored in the following struct:
struct data_row {
  // Identifier of the product.
  int64_t id = 0;
  // Number of components in the product.
  int64_t components = 0;
  // Cost of each component of the product.
  std::vector<double> component_cost;
};

// Transforming a vector of structs into a columnar Table.
//
// The final representation should be an `arrow::Table` which in turn
// is made up of an `arrow::Schema` and a list of
// `arrow::ChunkedArray` instances. As the first step, we will iterate
// over the data and build up the arrays incrementally.  For this
// task, we provide `arrow::ArrayBuilder` classes that help in the
// construction of the final `arrow::Array` instances.
//
// For each type, Arrow has a specially typed builder class. For the primitive
// values `id` and `components` we can use the `arrow::Int64Builder`. For the
// `component_cost` vector, we need to have two builders, a top-level
// `arrow::ListBuilder` that builds the array of offsets and a nested
// `arrow::DoubleBuilder` that constructs the underlying values array that
// is referenced by the offsets in the former array.
arrow::Result<std::shared_ptr<arrow::Table>> VectorToColumnarTable(
    const std::vector<struct data_row>& rows) {
  // All builders allocate from Arrow's default memory pool.
  arrow::MemoryPool* memory_pool = arrow::default_memory_pool();

  Int64Builder ids(memory_pool);
  Int64Builder component_counts(memory_pool);

  // The list column needs two cooperating builders: `cost_lists` records where
  // each row's list starts, while `cost_values` accumulates the doubles of all
  // rows back to back. We keep a typed handle to the values builder so that no
  // downcast is needed when appending to it.
  auto cost_values = std::make_shared<DoubleBuilder>(memory_pool);
  ListBuilder cost_lists(memory_pool, cost_values);

  // Feed every row into the builders. Each append can fail (for instance when
  // memory cannot be allocated), so every returned `arrow::Status` is checked
  // and propagated to the caller.
  for (const data_row& entry : rows) {
    ARROW_RETURN_NOT_OK(ids.Append(entry.id));
    ARROW_RETURN_NOT_OK(component_counts.Append(entry.components));

    // Open a new list slot first; this records the current position in the
    // values builder as the start of this row's list.
    ARROW_RETURN_NOT_OK(cost_lists.Append());
    // Then copy the row's costs in one call: the vector's contiguous doubles
    // are appended directly to the values builder.
    const std::vector<double>& costs = entry.component_cost;
    ARROW_RETURN_NOT_OK(cost_values->AppendValues(costs.data(), costs.size()));
  }

  // Turn the builders into immutable arrays. Finishing the list builder also
  // finishes its values builder, so `cost_values` needs no separate call.
  std::shared_ptr<arrow::Array> id_column;
  ARROW_RETURN_NOT_OK(ids.Finish(&id_column));
  std::shared_ptr<arrow::Array> components_column;
  ARROW_RETURN_NOT_OK(component_counts.Finish(&components_column));
  std::shared_ptr<arrow::Array> cost_column;
  ARROW_RETURN_NOT_OK(cost_lists.Finish(&cost_column));

  // Describe the columns (name and type) in the same order as the arrays.
  std::vector<std::shared_ptr<arrow::Field>> fields;
  fields.push_back(arrow::field("id", arrow::int64()));
  fields.push_back(arrow::field("components", arrow::int64()));
  fields.push_back(arrow::field("component_cost", arrow::list(arrow::float64())));
  std::shared_ptr<arrow::Schema> table_schema = arrow::schema(fields);

  // The resulting table shares ownership of the schema and the arrays, so it
  // stays valid after the builders and locals of this function are gone.
  std::vector<std::shared_ptr<arrow::Array>> columns = {id_column, components_column,
                                                        cost_column};
  return arrow::Table::Make(table_schema, columns);
}

// Converts an Arrow table with the columns (id: int64, components: int64,
// component_cost: list<double>) back into row-wise `data_row` values.
//
// Returns `arrow::Status::Invalid` when `table` is null or when its schema is
// not exactly the expected one. Tables whose columns consist of zero, one or
// several chunks are all supported.
//
// Simplification kept from the example: null entries are not treated
// specially, i.e. every cell is assumed to hold a valid value.
arrow::Result<std::vector<data_row>> ColumnarTableToVector(
    const std::shared_ptr<arrow::Table>& table) {
  if (table == nullptr) {
    return arrow::Status::Invalid("Table must not be null!");
  }

  // To check that the table is laid out as expected we only need its schema.
  std::vector<std::shared_ptr<arrow::Field>> schema_vector = {
      arrow::field("id", arrow::int64()), arrow::field("components", arrow::int64()),
      arrow::field("component_cost", arrow::list(arrow::float64()))};
  auto expected_schema = std::make_shared<arrow::Schema>(schema_vector);

  if (!expected_schema->Equals(*table->schema())) {
    // The table doesn't have the expected schema thus we cannot directly
    // convert it to our target representation.
    return arrow::Status::Invalid("Schemas are not matching!");
  }

  std::vector<data_row> rows;
  rows.reserve(static_cast<size_t>(table->num_rows()));

  // A table column is a ChunkedArray: it may hold any number of chunks, and the
  // chunk boundaries of different columns need not line up. TableBatchReader
  // hands out zero-copy record batches in which all columns are aligned, so we
  // never have to assume that a column consists of exactly one chunk.
  arrow::TableBatchReader reader(*table);
  std::shared_ptr<arrow::RecordBatch> batch;
  while (true) {
    ARROW_RETURN_NOT_OK(reader.ReadNext(&batch));
    if (batch == nullptr) {
      break;  // End of table.
    }

    // The schema check above guarantees the concrete array types, so the
    // static casts are safe.
    auto ids = std::static_pointer_cast<arrow::Int64Array>(batch->column(0));
    auto components = std::static_pointer_cast<arrow::Int64Array>(batch->column(1));
    auto component_cost = std::static_pointer_cast<arrow::ListArray>(batch->column(2));
    auto component_cost_values =
        std::static_pointer_cast<arrow::DoubleArray>(component_cost->values());

    // For the primitive columns the high-level Value(i) accessor is used. For
    // the nested column we copy straight from the raw values buffer:
    // raw_values() already accounts for the offset of the values array, and
    // value_offset(i) accounts for the offset of the (possibly sliced) list
    // array, so both zero-copy slicing offsets are respected.
    const double* ccv_ptr = component_cost_values->raw_values();
    for (int64_t i = 0; i < batch->num_rows(); ++i) {
      const double* first = ccv_ptr + component_cost->value_offset(i);
      const double* last = ccv_ptr + component_cost->value_offset(i + 1);
      rows.push_back(
          {ids->Value(i), components->Value(i), std::vector<double>(first, last)});
    }
  }

  return rows;
}

// Runs the round trip rows -> arrow::Table -> rows on a small sample and prints
// the converted rows to stdout.
//
// Returns the first error reported by either conversion, or
// `arrow::Status::Invalid` if the round trip changed the number of rows.
arrow::Status RunRowConversion() {
  std::vector<data_row> original_rows = {
      {1, 1, {10.0}}, {2, 3, {11.0, 12.0, 13.0}}, {3, 2, {15.0, 25.0}}};

  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Table> table,
                        VectorToColumnarTable(original_rows));
  ARROW_ASSIGN_OR_RAISE(std::vector<data_row> converted_rows,
                        ColumnarTableToVector(table));

  // Report a mismatch through the Status instead of an assert(), so that the
  // check is also performed in builds that define NDEBUG.
  if (original_rows.size() != converted_rows.size()) {
    return arrow::Status::Invalid("Round trip changed the number of rows: expected ",
                                  original_rows.size(), ", got ",
                                  converted_rows.size());
  }

  // Print out contents of table, should get
  // ID Components Component prices
  // 1  1          10
  // 2  3          11  12  13
  // 3  2          15  25
  std::cout << std::left << std::setw(3) << "ID " << std::setw(11) << "Components "
            << std::setw(15) << "Component prices " << '\n';
  for (const auto& row : converted_rows) {
    std::cout << std::setw(3) << row.id << std::setw(11) << row.components;
    for (const auto& cost : row.component_cost) {
      std::cout << std::setw(4) << cost;
    }
    std::cout << '\n';
  }
  return arrow::Status::OK();
}

int main(int argc, char** argv) {
  auto status = RunRowConversion();
  if (!status.ok()) {
    std::cerr << status.ToString() << std::endl;
    return EXIT_FAILURE;
  }
  return EXIT_SUCCESS;
}

Dynamic Schemas#

In many cases, we need to convert to and from row data that does not have a schema known at compile time. To help implement these conversions, this library provides several utilities:

arrow::RecordBatchBuilder: creates and manages array builders for a full record batch.
arrow::VisitTypeInline(): dispatch to functions specialized for the given array type.
Type Traits (such as arrow::enable_if_primitive_ctype): narrow template functions to specific Arrow types, useful in conjunction with the Visitor Pattern.
arrow::TableBatchReader: read a table in a batch at a time, with each batch being a zero-copy slice.

The following example shows how to implement conversion between rapidjson::Document and Arrow objects. You can read the full code example at apache/arrow.

Writing conversions to Arrow#

To convert rows to Arrow record batches, we’ll set up array builders for all the columns and then, for each field, iterate through the row values and append them to the builders. We assume that we already know the target schema, which may have been provided by another system or was inferred in another function. Inferring the schema during conversion is a challenging proposition; many systems will check the first N rows to infer a schema if there is none already available.

At the top level, we define a function ConvertToRecordBatch:

/// \brief Convert JSON documents into a single record batch with the given schema
///
/// \param rows the documents to convert, one per output row
/// \param schema the target schema; one column is produced per field
/// \return the converted batch, or the first error reported while building it
arrow::Result<std::shared_ptr<arrow::RecordBatch>> ConvertToRecordBatch(
    const std::vector<rapidjson::Document>& rows, std::shared_ptr<arrow::Schema> schema) {
  // Let RecordBatchBuilder create one array builder per schema field. The row
  // count (`rows.size()`) is passed as the initial capacity so fixed-width
  // arrays can be sized up front; string, byte and list arrays have dynamic
  // lengths and cannot be fully pre-allocated this way.
  ARROW_ASSIGN_OR_RAISE(
      std::unique_ptr<arrow::RecordBatchBuilder> batch_builder,
      arrow::RecordBatchBuilder::Make(schema, arrow::default_memory_pool(), rows.size()));

  // The converter walks the rows once per field and appends that field's
  // values to the builder it is handed.
  JsonValueConverter converter(rows);
  const int field_count = batch_builder->num_fields();
  for (int field_index = 0; field_index < field_count; ++field_index) {
    const arrow::Field& field = *schema->field(field_index);
    ARROW_RETURN_NOT_OK(converter.Convert(field, batch_builder->GetField(field_index)));
  }

  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::RecordBatch> batch,
                        batch_builder->Flush());

  // Use RecordBatch::ValidateFull() to make sure arrays were correctly constructed.
  DCHECK_OK(batch->ValidateFull());
  return batch;
}  // ConvertToRecordBatch

First we use arrow::RecordBatchBuilder, which conveniently creates builders for each field in the schema. Then we iterate over the fields of the schema, get the builder, and call Convert() on our JsonValueConverter (to be discussed next). At the end, we call batch->ValidateFull(), which checks the integrity of our arrays to make sure the conversion was performed correctly, which is useful for debugging new conversion implementations.

One level down, the JsonValueConverter is responsible for appending row values for the provided field to a provided array builder. In order to specialize logic for each data type, it implements Visit methods and calls arrow::VisitTypeInline(). (See more about type visitors in Visitor Pattern.)

At the end of that class is the private method FieldValues(), which returns an iterator of the column values for the current field across the rows. In row-based structures that are flat (such as a vector of values) this may be trivial to implement. But if the schema is nested, as in the case of JSON documents, a special iterator is needed to navigate the levels of nesting. See the full example for the implementation details of DocValuesIterator.

/// \brief Appends the values of one field, taken from a set of JSON documents,
/// to an Arrow array builder.
///
/// The converter is a type visitor: Convert() dispatches on the Arrow type of
/// the field via arrow::VisitTypeInline(), and the matching Visit() overload
/// appends one entry per value produced by FieldValues(). JSON nulls become
/// Arrow nulls; a JSON value whose type does not match the Arrow type yields
/// arrow::Status::Invalid.
///
/// The converter keeps a reference to `rows`, which must outlive it.
class JsonValueConverter {
 public:
  /// \brief Create a converter reading top-level fields of the documents
  explicit JsonValueConverter(const std::vector<rapidjson::Document>& rows)
      : rows_(rows), array_levels_(0) {}

  /// \brief Create a converter reading fields nested below `root_path`
  ///
  /// \param rows the documents to read from
  /// \param root_path names of the enclosing fields, outermost first
  /// \param array_levels number of enclosing list levels
  JsonValueConverter(const std::vector<rapidjson::Document>& rows,
                     const std::vector<std::string>& root_path, int64_t array_levels)
      : rows_(rows), root_path_(root_path), array_levels_(array_levels) {}

  /// \brief For field passed in, append corresponding values to builder
  arrow::Status Convert(const arrow::Field& field, arrow::ArrayBuilder* builder) {
    return Convert(field, field.name(), builder);
  }

  /// \brief For field passed in, append corresponding values to builder
  ///
  /// \param field the field whose type selects the conversion
  /// \param field_name the name used to look the values up in the documents;
  ///        empty for the unnamed items of a list
  /// \param builder the builder to append to; it must be the builder type
  ///        matching `field.type()`, since the Visit() overloads downcast it
  arrow::Status Convert(const arrow::Field& field, const std::string& field_name,
                        arrow::ArrayBuilder* builder) {
    field_name_ = field_name;
    builder_ = builder;
    ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field.type(), this));
    return arrow::Status::OK();
  }

  // Default implementation
  arrow::Status Visit(const arrow::DataType& type) {
    return arrow::Status::NotImplemented(
        "Can not convert json value to Arrow array of type ", type.ToString());
  }

  arrow::Status Visit(const arrow::Int64Type& /*type*/) {
    arrow::Int64Builder* builder = static_cast<arrow::Int64Builder*>(builder_);
    for (const auto& maybe_value : FieldValues()) {
      ARROW_ASSIGN_OR_RAISE(auto value, maybe_value);
      if (value->IsNull()) {
        ARROW_RETURN_NOT_OK(builder->AppendNull());
      } else if (value->IsInt64()) {
        // IsInt64() holds for every JSON integer that fits into an int64,
        // whether it was parsed as signed or unsigned, 32 or 64 bit.
        ARROW_RETURN_NOT_OK(builder->Append(value->GetInt64()));
      } else if (value->IsUint64()) {
        // An unsigned value above INT64_MAX would silently wrap around to a
        // negative number if it were appended.
        return arrow::Status::Invalid("Integer value ", value->GetUint64(),
                                      " does not fit into an int64");
      } else {
        return arrow::Status::Invalid("Value is not an integer");
      }
    }
    return arrow::Status::OK();
  }

  arrow::Status Visit(const arrow::DoubleType& /*type*/) {
    arrow::DoubleBuilder* builder = static_cast<arrow::DoubleBuilder*>(builder_);
    for (const auto& maybe_value : FieldValues()) {
      ARROW_ASSIGN_OR_RAISE(auto value, maybe_value);
      if (value->IsNull()) {
        ARROW_RETURN_NOT_OK(builder->AppendNull());
      } else if (value->IsNumber()) {
        ARROW_RETURN_NOT_OK(builder->Append(value->GetDouble()));
      } else {
        // GetDouble() on a non-number trips a RapidJSON assertion.
        return arrow::Status::Invalid("Value is not a number");
      }
    }
    return arrow::Status::OK();
  }

  arrow::Status Visit(const arrow::StringType& /*type*/) {
    arrow::StringBuilder* builder = static_cast<arrow::StringBuilder*>(builder_);
    for (const auto& maybe_value : FieldValues()) {
      ARROW_ASSIGN_OR_RAISE(auto value, maybe_value);
      if (value->IsNull()) {
        ARROW_RETURN_NOT_OK(builder->AppendNull());
      } else if (value->IsString()) {
        // Pass the length explicitly: JSON strings may contain embedded NUL
        // characters, which a plain C-string append would truncate at.
        ARROW_RETURN_NOT_OK(builder->Append(
            value->GetString(), static_cast<int32_t>(value->GetStringLength())));
      } else {
        return arrow::Status::Invalid("Value is not a string");
      }
    }
    return arrow::Status::OK();
  }

  arrow::Status Visit(const arrow::BooleanType& /*type*/) {
    arrow::BooleanBuilder* builder = static_cast<arrow::BooleanBuilder*>(builder_);
    for (const auto& maybe_value : FieldValues()) {
      ARROW_ASSIGN_OR_RAISE(auto value, maybe_value);
      if (value->IsNull()) {
        ARROW_RETURN_NOT_OK(builder->AppendNull());
      } else if (value->IsBool()) {
        ARROW_RETURN_NOT_OK(builder->Append(value->GetBool()));
      } else {
        return arrow::Status::Invalid("Value is not a boolean");
      }
    }
    return arrow::Status::OK();
  }

  arrow::Status Visit(const arrow::StructType& type) {
    arrow::StructBuilder* builder = static_cast<arrow::StructBuilder*>(builder_);

    // Children are looked up below this struct, so extend the path by its name
    // (unless it is an unnamed list item).
    std::vector<std::string> child_path(root_path_);
    if (!field_name_.empty()) {
      child_path.push_back(field_name_);
    }
    auto child_converter = JsonValueConverter(rows_, child_path, array_levels_);

    for (int i = 0; i < type.num_fields(); ++i) {
      std::shared_ptr<arrow::Field> child_field = type.field(i);
      std::shared_ptr<arrow::ArrayBuilder> child_builder = builder->child_builder(i);

      ARROW_RETURN_NOT_OK(child_converter.Convert(*child_field, child_builder.get()));
    }

    // Make null bitmap
    for (const auto& maybe_value : FieldValues()) {
      ARROW_ASSIGN_OR_RAISE(auto value, maybe_value);
      ARROW_RETURN_NOT_OK(builder->Append(!value->IsNull()));
    }

    return arrow::Status::OK();
  }

  arrow::Status Visit(const arrow::ListType& type) {
    arrow::ListBuilder* builder = static_cast<arrow::ListBuilder*>(builder_);

    // Values and offsets needs to be interleaved in ListBuilder, so first collect the
    // values
    std::unique_ptr<arrow::ArrayBuilder> tmp_value_builder;
    ARROW_ASSIGN_OR_RAISE(tmp_value_builder,
                          arrow::MakeBuilder(builder->value_builder()->type()));
    // Extend the path exactly like Visit(StructType) and FieldValues() do: an
    // empty name (the unnamed items of an enclosing list) adds no path segment.
    std::vector<std::string> child_path(root_path_);
    if (!field_name_.empty()) {
      child_path.push_back(field_name_);
    }
    auto child_converter = JsonValueConverter(rows_, child_path, array_levels_ + 1);
    ARROW_RETURN_NOT_OK(
        child_converter.Convert(*type.value_field(), "", tmp_value_builder.get()));

    std::shared_ptr<arrow::Array> values_array;
    ARROW_RETURN_NOT_OK(tmp_value_builder->Finish(&values_array));
    std::shared_ptr<arrow::ArrayData> values_data = values_array->data();

    // Now replay the flattened values list by list, opening a list slot before
    // copying that list's slice of values.
    arrow::ArrayBuilder* value_builder = builder->value_builder();
    int64_t offset = 0;
    for (const auto& maybe_value : FieldValues()) {
      ARROW_ASSIGN_OR_RAISE(auto value, maybe_value);
      const bool is_null = value->IsNull();
      if (!is_null && !value->IsArray()) {
        // Size() on a non-array trips a RapidJSON assertion.
        return arrow::Status::Invalid("Value is not an array");
      }
      ARROW_RETURN_NOT_OK(builder->Append(!is_null));
      if (!is_null && value->Size() > 0) {
        ARROW_RETURN_NOT_OK(
            value_builder->AppendArraySlice(*values_data, offset, value->Size()));
        offset += value->Size();
      }
    }

    return arrow::Status::OK();
  }

 private:
  // Name of the field currently being converted; empty for list items.
  std::string field_name_;
  // Builder receiving the values of the current field; set by Convert().
  arrow::ArrayBuilder* builder_ = nullptr;
  // Documents to read from (not owned).
  const std::vector<rapidjson::Document>& rows_;
  // Names of the enclosing fields, outermost first.
  std::vector<std::string> root_path_;
  // Number of enclosing list levels.
  int64_t array_levels_;

  /// Return a flattened iterator over values at nested location
  arrow::Iterator<const rapidjson::Value*> FieldValues() {
    std::vector<std::string> path(root_path_);
    if (!field_name_.empty()) {
      path.push_back(field_name_);
    }
    auto iter = DocValuesIterator(rows_, std::move(path), array_levels_);
    // The iterator is captured by value, so the returned arrow::Iterator owns
    // its own traversal state.
    auto fn = [iter]() mutable -> arrow::Result<const rapidjson::Value*> {
      return iter.Next();
    };

    return arrow::MakeFunctionIterator(fn);
  }
};  // JsonValueConverter

Writing conversions from Arrow#

To convert into rows from Arrow record batches, we’ll process the table in smaller batches, visiting each field of the batch and filling the output rows column-by-column.

At the top level, we define ArrowToDocumentConverter, which provides the API for converting Arrow batches and tables to rows. In many cases, it’s more efficient to perform conversions to rows in smaller batches, rather than doing the entire table at once. So we define one ConvertToVector method to convert a single batch, then in the other conversion method we use arrow::TableBatchReader to iterate over slices of a table. This returns Arrow’s iterator type (arrow::Iterator) so rows could then be processed either one-at-a-time or be collected into a container.

/// \brief Converts Arrow record batches and tables into JSON documents, one
/// document per row.
class ArrowToDocumentConverter {
 public:
  /// Convert a single batch of Arrow data into Documents
  ///
  /// \param batch the batch to convert; must not be null
  /// \return one document per row of `batch`, or the first conversion error
  arrow::Result<std::vector<rapidjson::Document>> ConvertToVector(
      std::shared_ptr<arrow::RecordBatch> batch) const {
    RowBatchBuilder builder{batch->num_rows()};

    // Fill the output rows column by column.
    for (int i = 0; i < batch->num_columns(); ++i) {
      builder.SetField(batch->schema()->field(i).get());
      ARROW_RETURN_NOT_OK(arrow::VisitArrayInline(*batch->column(i), &builder));
    }

    return std::move(builder).Rows();
  }

  /// Convert an Arrow table into an iterator of Documents
  ///
  /// The returned iterator is self-contained: it shares ownership of `table`
  /// and holds its own copy of this converter, so it may outlive both the
  /// caller's table handle and the converter it was obtained from.
  ///
  /// \param table the table to convert; must not be null
  /// \param batch_size maximum number of rows converted at a time
  arrow::Iterator<rapidjson::Document> ConvertToIterator(
      std::shared_ptr<arrow::Table> table, size_t batch_size) {
    // Use TableBatchReader to divide table into smaller batches. The batches
    // created are zero-copy slices with *at most* `batch_size` rows. Handing
    // over the shared_ptr (rather than a reference to the table) makes the
    // reader keep the table alive for as long as the iterator exists.
    auto batch_reader = std::make_shared<arrow::TableBatchReader>(std::move(table));
    batch_reader->set_chunksize(static_cast<int64_t>(batch_size));

    // Capture a copy of the converter instead of `this`: the iterator is
    // returned to the caller and may be consumed after this object is gone.
    auto read_batch = [converter = *this](
                          const std::shared_ptr<arrow::RecordBatch>& batch)
        -> arrow::Result<arrow::Iterator<rapidjson::Document>> {
      ARROW_ASSIGN_OR_RAISE(auto rows, converter.ConvertToVector(batch));
      return arrow::MakeVectorIterator(std::move(rows));
    };

    auto nested_iter = arrow::MakeMaybeMapIterator(
        read_batch, arrow::MakeIteratorFromReader(std::move(batch_reader)));

    return arrow::MakeFlattenIterator(std::move(nested_iter));
  }
};  // ArrowToDocumentConverter

One level down, the output rows are filled in by RowBatchBuilder. The RowBatchBuilder implements Visit() methods, but to save on code we write a template method for array types that have primitive C equivalents (booleans, integers, and floats) using arrow::enable_if_primitive_ctype. See Type Traits for other type predicates.

/// \brief Builds JSON documents (one per row) from Arrow arrays, one column at
/// a time.
///
/// Usage: call SetField() with the field describing the next column, then
/// visit that column's array with arrow::VisitArrayInline(). Each Visit()
/// overload adds a member named after the field to every row whose value is
/// not null; null values add no member. Finally, move the rows out with Rows().
class RowBatchBuilder {
 public:
  explicit RowBatchBuilder(int64_t num_rows) : field_(nullptr) {
    // Reserve all of the space required up-front to avoid unnecessary resizing
    rows_.reserve(num_rows);

    // Every row starts out as an empty JSON object.
    for (int64_t i = 0; i < num_rows; ++i) {
      rows_.push_back(rapidjson::Document());
      rows_[i].SetObject();
    }
  }

  /// \brief Set which field to convert.
  ///
  /// The pointer is not owned and must stay valid while arrays are visited.
  void SetField(const arrow::Field* field) { field_ = field; }

  /// \brief Retrieve converted rows from builder.
  std::vector<rapidjson::Document> Rows() && { return std::move(rows_); }

  // Default implementation
  arrow::Status Visit(const arrow::Array& array) {
    return arrow::Status::NotImplemented(
        "Can not convert to json document for array of type ", array.type()->ToString());
  }

  // Handles booleans, integers, floats
  template <typename ArrayType, typename DataClass = typename ArrayType::TypeClass>
  arrow::enable_if_primitive_ctype<DataClass, arrow::Status> Visit(
      const ArrayType& array) {
    assert(static_cast<int64_t>(rows_.size()) == array.length());
    for (int64_t i = 0; i < array.length(); ++i) {
      if (!array.IsNull(i)) {
        rapidjson::Value str_key(field_->name(), rows_[i].GetAllocator());
        rows_[i].AddMember(str_key, array.Value(i), rows_[i].GetAllocator());
      }
    }
    return arrow::Status::OK();
  }

  arrow::Status Visit(const arrow::StringArray& array) {
    assert(static_cast<int64_t>(rows_.size()) == array.length());
    for (int64_t i = 0; i < array.length(); ++i) {
      if (!array.IsNull(i)) {
        rapidjson::Value str_key(field_->name(), rows_[i].GetAllocator());
        std::string_view value_view = array.Value(i);
        // Copy the characters into the document's allocator; the view only
        // borrows the Arrow buffer.
        rapidjson::Value value;
        value.SetString(value_view.data(),
                        static_cast<rapidjson::SizeType>(value_view.size()),
                        rows_[i].GetAllocator());
        rows_[i].AddMember(str_key, value, rows_[i].GetAllocator());
      }
    }
    return arrow::Status::OK();
  }

  arrow::Status Visit(const arrow::StructArray& array) {
    const arrow::StructType* type = array.struct_type();

    assert(static_cast<int64_t>(rows_.size()) == array.length());

    // Convert the children into a separate set of documents, one per row.
    RowBatchBuilder child_builder(rows_.size());
    for (int i = 0; i < type->num_fields(); ++i) {
      const arrow::Field* child_field = type->field(i).get();
      child_builder.SetField(child_field);
      ARROW_RETURN_NOT_OK(arrow::VisitArrayInline(*array.field(i), &child_builder));
    }
    std::vector<rapidjson::Document> rows = std::move(child_builder).Rows();

    for (int64_t i = 0; i < array.length(); ++i) {
      if (!array.IsNull(i)) {
        rapidjson::Value str_key(field_->name(), rows_[i].GetAllocator());
        // Must copy value to new allocator
        rapidjson::Value row_val;
        row_val.CopyFrom(rows[i], rows_[i].GetAllocator());
        rows_[i].AddMember(str_key, row_val, rows_[i].GetAllocator());
      }
    }
    return arrow::Status::OK();
  }

  arrow::Status Visit(const arrow::ListArray& array) {
    assert(static_cast<int64_t>(rows_.size()) == array.length());
    if (array.length() == 0) {
      // Nothing to add; also avoids reading offsets of an empty array.
      return arrow::Status::OK();
    }

    // `array.values()` is the complete child array and ignores the offset of a
    // sliced list array (such as the zero-copy slices TableBatchReader hands
    // out). Restrict it to the range this array references; the positions of
    // the individual lists are then taken from value_offset().
    const int64_t values_begin = array.value_offset(0);
    const int64_t values_end = array.value_offset(array.length());
    std::shared_ptr<arrow::Array> values =
        array.values()->Slice(values_begin, values_end - values_begin);

    // First create rows from values
    RowBatchBuilder child_builder(values->length());
    const arrow::Field* value_field = array.list_type()->value_field().get();
    const std::string& value_field_name = value_field->name();
    child_builder.SetField(value_field);
    ARROW_RETURN_NOT_OK(arrow::VisitArrayInline(*values, &child_builder));

    std::vector<rapidjson::Document> rows = std::move(child_builder).Rows();

    for (int64_t i = 0; i < array.length(); ++i) {
      if (array.IsNull(i)) continue;

      rapidjson::Document::AllocatorType& allocator = rows_[i].GetAllocator();
      // Position of this list's first element within the sliced values.
      const int64_t first = array.value_offset(i) - values_begin;
      const int64_t array_len = array.value_length(i);

      rapidjson::Value value;
      value.SetArray();
      value.Reserve(static_cast<rapidjson::SizeType>(array_len), allocator);

      for (int64_t j = 0; j < array_len; ++j) {
        rapidjson::Document& element = rows[static_cast<size_t>(first + j)];
        // A null element produced no member in the child document; it is
        // emitted as a JSON null, which is what a default Value holds.
        rapidjson::Value row_val;
        auto member = element.FindMember(value_field_name.c_str());
        if (member != element.MemberEnd()) {
          // Must copy value to new allocator
          row_val.CopyFrom(member->value, allocator);
        }
        value.PushBack(row_val, allocator);
      }

      rapidjson::Value str_key(field_->name(), allocator);
      rows_[i].AddMember(str_key, value, allocator);
    }

    return arrow::Status::OK();
  }

 private:
  // Field describing the column currently being converted (not owned).
  const arrow::Field* field_;
  // Output documents, one per row.
  std::vector<rapidjson::Document> rows_;
};  // RowBatchBuilder