Creating Arrow Objects
Recipes related to the creation of Arrays, Tables, Tensors and all other Arrow entities.
Create Arrays from Standard C++
Typed subclasses of arrow::ArrayBuilder make it easy to efficiently create Arrow arrays from existing C++ data:
arrow::Int32Builder builder;
ARROW_RETURN_NOT_OK(builder.Append(1));
ARROW_RETURN_NOT_OK(builder.Append(2));
ARROW_RETURN_NOT_OK(builder.Append(3));
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> arr, builder.Finish())
rout << arr->ToString() << std::endl;
[
1,
2,
3
]
Note
Builders will allocate data as needed and insertion should have constant amortized time.
Builders can also consume standard C++ containers:
// From a raw pointer plus an element count
std::array<int64_t, 4> values = {1, 2, 3, 4};
arrow::Int64Builder long_builder;
ARROW_RETURN_NOT_OK(long_builder.AppendValues(values.data(), values.size()));
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> arr, long_builder.Finish());
rout << arr->ToString() << std::endl;

// From a whole std::vector
std::vector<std::string> strvals = {"x", "y", "z"};
arrow::StringBuilder str_builder;
ARROW_RETURN_NOT_OK(str_builder.AppendValues(strvals));
ARROW_ASSIGN_OR_RAISE(arr, str_builder.Finish());
rout << arr->ToString() << std::endl;

// From an iterator range; std::set keeps each distinct value once, so the
// duplicate 1.1 is appended a single time
std::set<double> dblvals = {1.1, 1.1, 2.3};
arrow::DoubleBuilder dbl_builder;
ARROW_RETURN_NOT_OK(dbl_builder.AppendValues(dblvals.begin(), dblvals.end()));
ARROW_ASSIGN_OR_RAISE(arr, dbl_builder.Finish());
rout << arr->ToString() << std::endl;
[
1,
2,
3,
4
]
[
"x",
"y",
"z"
]
[
1.1,
2.3
]
Note
Builders will not take ownership of data in containers and will make a copy of the underlying data.
Generate Random Data for a Given Schema
A convenient way to generate random data for a given schema is to implement a type visitor. The following example supports only double and list types, but it can easily be extended to other types.
1class RandomBatchGenerator {
2 public:
3 std::shared_ptr<arrow::Schema> schema;
4
5 RandomBatchGenerator(std::shared_ptr<arrow::Schema> schema) : schema(schema){};
6
7 arrow::Result<std::shared_ptr<arrow::RecordBatch>> Generate(int32_t num_rows) {
8 num_rows_ = num_rows;
9 for (std::shared_ptr<arrow::Field> field : schema->fields()) {
10 ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field->type(), this));
11 }
12
13 return arrow::RecordBatch::Make(schema, num_rows, arrays_);
14 }
15
16 // Default implementation
17 arrow::Status Visit(const arrow::DataType& type) {
18 return arrow::Status::NotImplemented("Generating data for", type.ToString());
19 }
20
21 arrow::Status Visit(const arrow::DoubleType&) {
22 auto builder = arrow::DoubleBuilder();
23 std::normal_distribution<> d{/*mean=*/5.0, /*stddev=*/2.0};
24 for (int32_t i = 0; i < num_rows_; ++i) {
25 ARROW_RETURN_NOT_OK(builder.Append(d(gen_)));
26 }
27 ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
28 arrays_.push_back(array);
29 return arrow::Status::OK();
30 }
31
32 arrow::Status Visit(const arrow::ListType& type) {
33 // Generate offsets first, which determines number of values in sub-array
34 std::poisson_distribution<> d{/*mean=*/4};
35 auto builder = arrow::Int32Builder();
36 ARROW_RETURN_NOT_OK(builder.Append(0));
37 int32_t last_val = 0;
38 for (int32_t i = 0; i < num_rows_; ++i) {
39 last_val += d(gen_);
40 ARROW_RETURN_NOT_OK(builder.Append(last_val));
41 }
42 ARROW_ASSIGN_OR_RAISE(auto offsets, builder.Finish());
43
44 // Since children of list has a new length, will use a new generator
45 RandomBatchGenerator value_gen(arrow::schema({arrow::field("x", type.value_type())}));
46 // Last index from the offsets array becomes the length of the sub-array
47 ARROW_ASSIGN_OR_RAISE(auto inner_batch, value_gen.Generate(last_val));
48 std::shared_ptr<arrow::Array> values = inner_batch->column(0);
49
50 ARROW_ASSIGN_OR_RAISE(auto array,
51 arrow::ListArray::FromArrays(*offsets.get(), *values.get()));
52 arrays_.push_back(array);
53
54 return arrow::Status::OK();
55 }
56
57 protected:
58 std::random_device rd_{};
59 std::mt19937 gen_{rd_()};
60 std::vector<std::shared_ptr<arrow::Array>> arrays_;
61 int32_t num_rows_;
62}; // RandomBatchGenerator
Given such a generator, you can create random test data for any supported schema:
std::shared_ptr<arrow::Schema> schema =
arrow::schema({arrow::field("x", arrow::float64()),
arrow::field("y", arrow::list(arrow::float64()))});
RandomBatchGenerator generator(schema);
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::RecordBatch> batch, generator.Generate(5));
rout << "Created batch: \n" << batch->ToString();
// Consider using ValidateFull to check correctness
ARROW_RETURN_NOT_OK(batch->ValidateFull());
Created batch:
x: [
4.546911589795752,
6.984533198458078,
7.617112892424505,
7.071039704261608,
5.333380507036075
]
y: [
[
6.162093180569001,
4.264271666435832,
4.453379826203139
],
[
5.550493157228391,
2.2790346108514914,
6.320687795635024,
5.790474643286342
],
[
6.1749549303569,
1.2247191609769907,
10.309335708651332,
2.7148579213976567,
0.7332353370369562,
7.925025202564361,
4.011131470597689
],
[
3.051431659823732,
6.459224633329098,
6.545469562979236,
4.2098221381083905,
4.227733269678735,
5.916080551640544
],
[
5.996692460353367,
3.8667241669428876,
1.3804329308731353,
5.711691758211411,
3.4554154047425714,
3.102919934591531
]
]