Row to columnar conversion
The following example converts an array of structs to an arrow::Table
instance, and then converts it back to the original array of structs.
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <vector>
#include <arrow/api.h>
using arrow::DoubleBuilder;
using arrow::Int64Builder;
using arrow::ListBuilder;
// While we want to use columnar data structures to build efficient operations, we
// often receive data in a row-wise fashion from other systems. In the following,
// we want to give a brief introduction to the classes provided by Apache Arrow by
// showing how to transform row-wise data into a columnar table.
//
// The data in this example is stored in the following struct:
// One row-wise input record. Each member maps to the table column of the same
// name in the conversion functions of this file.
struct data_row {
// Row identifier; stored in the int64 column `id`.
int64_t id;
// Cost value; stored in the float64 column `cost`.
double cost;
// Variable-length list of values; stored in the list<float64> column
// `cost_components`.
std::vector<double> cost_components;
};
// Transforming a vector of structs into a columnar Table.
//
// The final representation should be an `arrow::Table` which in turn
// is made up of an `arrow::Schema` and a list of
// `arrow::ChunkedArray` instances. As the first step, we will iterate
// over the data and build up the arrays incrementally. For this
// task, we provide `arrow::ArrayBuilder` classes that help in the
// construction of the final `arrow::Array` instances.
//
// For each type, Arrow has a specially typed builder class. For the primitive
// values `id` and `cost` we can use the respective `arrow::Int64Builder` and
// `arrow::DoubleBuilder`. For the `cost_components` vector, we need to have two
// builders, a top-level `arrow::ListBuilder` that builds the array of offsets and
// a nested `arrow::DoubleBuilder` that constructs the underlying values array that
// is referenced by the offsets in the former array.
// Builds an `arrow::Table` with the columns (id: int64, cost: float64,
// cost_components: list<float64>) from row-wise records.
//
// rows:  the records to convert; one table row is produced per element.
// table: output parameter that receives the newly built table.
//
// Returns the first non-OK status reported by a builder, otherwise OK.
arrow::Status VectorToColumnarTable(const std::vector<struct data_row>& rows,
                                    std::shared_ptr<arrow::Table>* table) {
  // All builders allocate from the same memory pool.
  arrow::MemoryPool* const memory_pool = arrow::default_memory_pool();

  Int64Builder ids(memory_pool);
  DoubleBuilder costs(memory_pool);
  ListBuilder component_lists(memory_pool,
                              std::make_shared<DoubleBuilder>(memory_pool));
  // The child builder is owned by `component_lists`; we only borrow it to
  // append the flattened list values.
  DoubleBuilder* const component_values =
      static_cast<DoubleBuilder*>(component_lists.value_builder());

  // Feed every record into the builders. Each append can fail (for example
  // when memory cannot be allocated), so every status is checked.
  for (const data_row& record : rows) {
    ARROW_RETURN_NOT_OK(ids.Append(record.id));
    ARROW_RETURN_NOT_OK(costs.Append(record.cost));

    // Open a new list slot first, then push the values that belong to it into
    // the child builder. No validity bitmap is passed, so all values are
    // appended as non-null.
    ARROW_RETURN_NOT_OK(component_lists.Append());
    const std::vector<double>& parts = record.cost_components;
    ARROW_RETURN_NOT_OK(component_values->AppendValues(parts.data(), parts.size()));
  }

  // Turn the builders into immutable arrays. Only the list builder is
  // finished for the nested column; the child builder is not finished
  // separately.
  std::shared_ptr<arrow::Array> id_column;
  ARROW_RETURN_NOT_OK(ids.Finish(&id_column));

  std::shared_ptr<arrow::Array> cost_column;
  ARROW_RETURN_NOT_OK(costs.Finish(&cost_column));

  std::shared_ptr<arrow::Array> components_column;
  ARROW_RETURN_NOT_OK(component_lists.Finish(&components_column));

  // Describe the column names and types, then bundle schema and arrays into
  // the resulting table, which is handed to the caller through `table`.
  std::vector<std::shared_ptr<arrow::Field>> fields;
  fields.push_back(arrow::field("id", arrow::int64()));
  fields.push_back(arrow::field("cost", arrow::float64()));
  fields.push_back(arrow::field("cost_components", arrow::list(arrow::float64())));
  auto table_schema = std::make_shared<arrow::Schema>(fields);

  *table = arrow::Table::Make(table_schema, {id_column, cost_column, components_column});
  return arrow::Status::OK();
}
// Converts an Arrow table with the columns (id: int64, cost: float64,
// cost_components: list<float64>) back into row-wise `data_row` records.
//
// table: the table to convert; every column may consist of any number of
//        chunks, and chunk boundaries need not line up across columns.
// rows:  output vector; one record per table row is appended, existing
//        elements are kept.
//
// Returns Status::Invalid, leaving `*rows` untouched, when an argument is
// null, the schema differs from the expected one, or a column's length
// disagrees with the table's row count.
//
// Simplification: null entries are not handled; every slot is read as if it
// were valid.
arrow::Status ColumnarTableToVector(const std::shared_ptr<arrow::Table>& table,
                                    std::vector<struct data_row>* rows) {
  if (table == nullptr || rows == nullptr) {
    return arrow::Status::Invalid("Table and output vector must not be null!");
  }

  // The schema alone tells us whether the table has the layout we expect; the
  // unchecked casts below rely on this check.
  std::vector<std::shared_ptr<arrow::Field>> schema_vector = {
      arrow::field("id", arrow::int64()), arrow::field("cost", arrow::float64()),
      arrow::field("cost_components", arrow::list(arrow::float64()))};
  auto expected_schema = std::make_shared<arrow::Schema>(schema_vector);
  if (!expected_schema->Equals(*table->schema())) {
    // The table doesn't have the expected schema thus we cannot directly
    // convert it to our target representation.
    return arrow::Status::Invalid("Schemas are not matching!");
  }

  const int64_t num_rows = table->num_rows();
  const auto id_column = table->column(0);
  const auto cost_column = table->column(1);
  const auto components_column = table->column(2);

  // Every column is written into the same pre-sized range of `*rows`, so the
  // column lengths must agree with the row count before anything is modified.
  if (num_rows < 0 || id_column->length() != num_rows ||
      cost_column->length() != num_rows || components_column->length() != num_rows) {
    return arrow::Status::Invalid("Column lengths do not match the row count!");
  }

  // Reserve the output slots once; each column is then filled independently so
  // that differing chunk layouts between columns are handled naturally.
  const std::size_t first_new = rows->size();
  rows->resize(first_new + static_cast<std::size_t>(num_rows));

  std::size_t next = first_new;
  for (int c = 0; c < id_column->num_chunks(); ++c) {
    const auto ids = std::static_pointer_cast<arrow::Int64Array>(id_column->chunk(c));
    for (int64_t i = 0; i < ids->length(); ++i) {
      (*rows)[next++].id = ids->Value(i);
    }
  }

  next = first_new;
  for (int c = 0; c < cost_column->num_chunks(); ++c) {
    const auto costs =
        std::static_pointer_cast<arrow::DoubleArray>(cost_column->chunk(c));
    for (int64_t i = 0; i < costs->length(); ++i) {
      (*rows)[next++].cost = costs->Value(i);
    }
  }

  next = first_new;
  for (int c = 0; c < components_column->num_chunks(); ++c) {
    const auto lists =
        std::static_pointer_cast<arrow::ListArray>(components_column->chunk(c));
    const auto values = std::static_pointer_cast<arrow::DoubleArray>(lists->values());
    // Raw pointer to the child values. GetValues applies the child array's own
    // offset, which matters when the array is a zero-copy slice; the high level
    // accessors such as Value(i) and value_offset(i) handle offsets themselves.
    const double* values_ptr = values->data()->GetValues<double>(1);
    for (int64_t i = 0; i < lists->length(); ++i) {
      // List i spans the child values [value_offset(i), value_offset(i + 1)).
      const double* first = values_ptr + lists->value_offset(i);
      const double* last = values_ptr + lists->value_offset(i + 1);
      (*rows)[next++].cost_components.assign(first, last);
    }
  }

  return arrow::Status::OK();
}
// Evaluates `expr`, which must yield an arrow::Status. If the status is not
// OK, prints its message to stderr and returns EXIT_FAILURE from the enclosing
// function (so it is only usable in functions returning int, such as main).
//
// The do/while(0) wrapper deliberately has no trailing semicolon: the caller
// supplies it, which keeps the macro usable as a single statement, e.g. in an
// unbraced if/else.
#define EXIT_ON_FAILURE(expr)                      \
  do {                                             \
    arrow::Status status_ = (expr);                \
    if (!status_.ok()) {                           \
      std::cerr << status_.message() << std::endl; \
      return EXIT_FAILURE;                         \
    }                                              \
  } while (0)
int main(int argc, char** argv) {
std::vector<data_row> rows = {
{1, 1.0, {1.0}}, {2, 2.0, {1.0, 2.0}}, {3, 3.0, {1.0, 2.0, 3.0}}};
std::shared_ptr<arrow::Table> table;
EXIT_ON_FAILURE(VectorToColumnarTable(rows, &table));
std::vector<data_row> expected_rows;
EXIT_ON_FAILURE(ColumnarTableToVector(table, &expected_rows));
assert(rows.size() == expected_rows.size());
return EXIT_SUCCESS;
}