Apache Arrow (C++)
A columnar in-memory analytics layer designed to accelerate big data.
builder.h
Go to the documentation of this file.
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #ifndef ARROW_BUILDER_H
19 #define ARROW_BUILDER_H
20 
21 #include <algorithm> // IWYU pragma: keep
22 #include <array>
23 #include <cstdint>
24 #include <cstring>
25 #include <iterator>
26 #include <limits>
27 #include <memory>
28 #include <string>
29 #include <type_traits>
30 #include <vector>
31 
32 #include "arrow/buffer.h"
33 #include "arrow/memory_pool.h"
34 #include "arrow/status.h"
35 #include "arrow/type.h"
36 #include "arrow/type_traits.h"
37 #include "arrow/util/bit-util.h"
38 #include "arrow/util/hash.h"
39 #include "arrow/util/macros.h"
40 #include "arrow/util/type_traits.h"
41 #include "arrow/util/visibility.h"
42 
43 namespace arrow {
44 
45 class Array;
46 struct ArrayData;
47 class Decimal128;
48 
// Both limits equal the largest int32_t value minus one (2147483646).
// NOTE(review): presumably the cap on bytes held by a binary builder and on
// elements held by a list builder; their uses are not visible in this listing.
constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max() - 1;
constexpr int64_t kListMaximumElements = std::numeric_limits<int32_t>::max() - 1;

// 1 << 5 == 32.
// NOTE(review): presumably the smallest capacity a builder allocates; confirm
// against the Resize/Reserve definitions, which are not part of this listing.
constexpr int64_t kMinBuilderCapacity = 1 << 5;
53 
55 //
// Base class for all data array builders: holds the logical type, the validity
// (null) bitmap with its null count, the current length/capacity and any child
// builders. FinishInternal() is pure virtual, so this class is abstract.
// NOTE(review): listing lines 181, 186 and 205 are missing from this extract, so
// the declarations of pool_ and null_bitmap_data_ (both initialized by the
// constructor below) are not visible here.
59 class ARROW_EXPORT ArrayBuilder {
60  public:
    // Stores the type and memory pool; the bitmap starts unallocated (NULLPTR) and
    // null count, length and capacity all start at zero.
61  explicit ArrayBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
62  : type_(type),
63  pool_(pool),
64  null_bitmap_(NULLPTR),
65  null_count_(0),
66  null_bitmap_data_(NULLPTR),
67  length_(0),
68  capacity_(0) {}
69 
70  virtual ~ArrayBuilder() = default;
71 
    // Returns a non-owning pointer to the i-th child builder. The index is not
    // range-checked here.
74  ArrayBuilder* child(int i) { return children_[i].get(); }
75 
    // Number of child builders, narrowed from size_t to int.
76  int num_children() const { return static_cast<int>(children_.size()); }
77 
    // Plain accessors for the counters maintained by the append operations.
78  int64_t length() const { return length_; }
79  int64_t null_count() const { return null_count_; }
80  int64_t capacity() const { return capacity_; }
81 
    // The following operations are only declared here; their definitions are
    // not part of this listing.
83  Status AppendToBitmap(bool is_valid);
84 
87  Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length);
88 
90  Status SetNotNull(int64_t length);
91 
99  virtual Status Resize(int64_t capacity);
100 
107  Status Reserve(int64_t additional_capacity);
108 
    // Reset the builder.
110  virtual void Reset();
111 
115  Status Advance(int64_t elements);
116 
    // Pure virtual: every concrete builder must provide it.
122  virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0;
123 
129  Status Finish(std::shared_ptr<Array>* out);
130 
    // Returns a copy of the shared_ptr to the builder's type.
131  std::shared_ptr<DataType> type() const { return type_; }
132 
133  // Unsafe operations (don't check capacity/don't resize)
134 
135  // Append to null bitmap, update the length
    // A valid entry sets bit length_ of the bitmap; a null entry only bumps
    // null_count_ and leaves the bit untouched. length_ advances either way.
136  void UnsafeAppendToBitmap(bool is_valid) {
137  if (is_valid) {
138  BitUtil::SetBit(null_bitmap_data_, length_);
139  } else {
140  ++null_count_;
141  }
142  ++length_;
143  }
144 
    // Appends one validity bit per element of [begin, end); a truthy element
    // marks the slot valid, a falsy one marks it null and bumps null_count_.
    // Bits are accumulated in a local byte and written back one byte at a time.
145  template <typename IterType>
146  void UnsafeAppendToBitmap(const IterType& begin, const IterType& end) {
147  int64_t byte_offset = length_ / 8;
148  int64_t bit_offset = length_ % 8;
    // Start from the existing byte so bits already appended before length_ survive.
149  uint8_t bitset = null_bitmap_data_[byte_offset];
150 
151  for (auto iter = begin; iter != end; ++iter) {
    // Crossed a byte boundary: flush the finished byte and load the next one.
152  if (bit_offset == 8) {
153  bit_offset = 0;
154  null_bitmap_data_[byte_offset] = bitset;
155  byte_offset++;
156  // TODO: Except for the last byte, this shouldn't be needed
157  bitset = null_bitmap_data_[byte_offset];
158  }
159 
160  if (*iter) {
161  bitset |= BitUtil::kBitmask[bit_offset];
162  } else {
163  bitset &= BitUtil::kFlippedBitmask[bit_offset];
164  ++null_count_;
165  }
166 
167  bit_offset++;
168  }
169 
    // Write back the byte still being built, unless the bit position is 0.
170  if (bit_offset != 0) {
171  null_bitmap_data_[byte_offset] = bitset;
172  }
173 
174  length_ += std::distance(begin, end);
175  }
176 
177  protected:
179 
180  std::shared_ptr<DataType> type_;
    // NOTE(review): listing line 181 is missing here.
182 
183  // The null bitmap is allocated when validity bits are first appended to the builder
184  std::shared_ptr<ResizableBuffer> null_bitmap_;
185  int64_t null_count_;
    // NOTE(review): listing line 186 is missing here.
187 
188  // Array length, so far. Also, the index of the next element to be added
189  int64_t length_;
190  int64_t capacity_;
191 
192  // Child value array builders. These are owned by this class
193  std::vector<std::unique_ptr<ArrayBuilder>> children_;
194 
195  // Vector append. Treat each zero byte as a null. If valid_bytes is null
196  // assume all of length bits are valid.
197  void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length);
198 
199  void UnsafeAppendToBitmap(const std::vector<bool>& is_valid);
200 
201  // Set the next length bits to not null (i.e. valid).
202  void UnsafeSetNotNull(int64_t length);
203 
204  private:
    // NOTE(review): listing line 205 is missing here.
206 };
207 
// Builder whose base class is constructed with the type returned by null().
// NOTE(review): listing lines 210 and 213 are missing from this extract, so the
// constructor signature and the header of the member function whose body
// follows are not visible. The tooltip residue at the end of this page lists
// line 210 as `NullBuilder(MemoryPool *pool ARROW_MEMORY_POOL_DEFAULT)`.
208 class ARROW_EXPORT NullBuilder : public ArrayBuilder {
209  public:
211  : ArrayBuilder(null(), pool) {}
212 
    // Body of the function declared on the missing line 213: counts one more
    // null and one more element, then reports success. No buffer is touched.
214  ++null_count_;
215  ++length_;
216  return Status::OK();
217  }
218 
219  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
220 };
221 
// Builder template for types that expose a C value type through Type::c_type.
// Values are written through raw_data_ and validity through the inherited bitmap.
// NOTE(review): listing lines 243 and 355 are missing from this extract. The
// tooltip residue at the end of this page lists line 355 as
// `value_type * raw_data_`; line 243 is the header of the second function below.
222 template <typename Type>
223 class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder {
224  public:
225  using value_type = typename Type::c_type;
226 
    // Both data_ and raw_data_ start as NULLPTR.
227  explicit PrimitiveBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
228  : ArrayBuilder(type, pool), data_(NULLPTR), raw_data_(NULLPTR) {}
229 
230  using ArrayBuilder::Advance;
231 
    // Reserves room for `length` more slots, zero-fills the value storage for
    // them, then appends their validity from valid_bytes (which also advances
    // length_).
235  Status AppendNulls(const uint8_t* valid_bytes, int64_t length) {
236  ARROW_RETURN_NOT_OK(Reserve(length));
237  memset(raw_data_ + length_, 0,
238  static_cast<size_t>(TypeTraits<Type>::bytes_required(length)));
239  UnsafeAppendToBitmap(valid_bytes, length);
240  return Status::OK();
241  }
242 
    // Body of the function declared on the missing line 243: reserves one slot,
    // zeroes its value and appends a single null entry.
244  ARROW_RETURN_NOT_OK(Reserve(1));
245  memset(raw_data_ + length_, 0, sizeof(value_type));
246  UnsafeAppendToBitmap(false);
247  return Status::OK();
248  }
249 
    // Reads the value at `index` straight from the data buffer; the index is not
    // range-checked and validity is not consulted.
250  value_type GetValue(int64_t index) const {
251  return reinterpret_cast<const value_type*>(data_->data())[index];
252  }
253 
    // The next four AppendValues overloads are only declared here; their
    // definitions are not part of this listing.
260  Status AppendValues(const value_type* values, int64_t length,
261  const uint8_t* valid_bytes = NULLPTR);
262 
269  Status AppendValues(const value_type* values, int64_t length,
270  const std::vector<bool>& is_valid);
271 
277  Status AppendValues(const std::vector<value_type>& values,
278  const std::vector<bool>& is_valid);
279 
283  Status AppendValues(const std::vector<value_type>& values);
284 
289 
    // Appends every element of [values_begin, values_end) and marks all of them
    // valid.
290  template <typename ValuesIter>
291  Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
292  int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
293  ARROW_RETURN_NOT_OK(Reserve(length));
294 
295  std::copy(values_begin, values_end, raw_data_ + length_);
296 
297  // this updates the length_
298  UnsafeSetNotNull(length);
299  return Status::OK();
300  }
301 
    // Append a sequence of elements in one shot, with a specified nullmap.
    // Selected when ValidIter is not a pointer type; validity is read from the
    // `length` elements starting at valid_begin.
308  template <typename ValuesIter, typename ValidIter>
309  typename std::enable_if<!std::is_pointer<ValidIter>::value, Status>::type AppendValues(
310  ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
311  static_assert(!internal::is_null_pointer<ValidIter>::value,
312  "Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
313  "version instead");
314  int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
315  ARROW_RETURN_NOT_OK(Reserve(length));
316 
317  std::copy(values_begin, values_end, raw_data_ + length_);
318 
319  // this updates the length_
320  UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length));
321  return Status::OK();
322  }
323 
    // Pointer flavour of the overload above: a null valid_begin means every
    // appended element is valid.
330  template <typename ValuesIter, typename ValidIter>
331  typename std::enable_if<std::is_pointer<ValidIter>::value, Status>::type AppendValues(
332  ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
333  int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
334  ARROW_RETURN_NOT_OK(Reserve(length));
335 
336  std::copy(values_begin, values_end, raw_data_ + length_);
337 
338  // this updates the length_
339  if (valid_begin == NULLPTR) {
340  UnsafeSetNotNull(length);
341  } else {
342  UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length));
343  }
344 
345  return Status::OK();
346  }
347 
348  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
349  void Reset() override;
350 
351  Status Resize(int64_t capacity) override;
352 
353  protected:
    // Buffer holding the values.
354  std::shared_ptr<ResizableBuffer> data_;
    // NOTE(review): listing line 355 (`value_type * raw_data_` per the tooltip
    // residue) is missing here.
356 };
357 
// Base class for all Builders that emit an Array of a scalar numerical type.
// NOTE(review): this extract is missing listing lines 363, 368, 371-373, 377 and
// 393-395, so the constructor's parameter list is cut short, Append() lacks at
// least one statement and the protected section appears empty. Read the code
// below as a partial view only.
359 template <typename T>
360 class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder<T> {
361  public:
362  using typename PrimitiveBuilder<T>::value_type;
364 
    // Only participates in overload resolution when TypeTraits<T1>::is_parameter_free
    // is true; the type is then taken from TypeTraits<T1>::type_singleton().
365  template <typename T1 = T>
366  explicit NumericBuilder(
367  typename std::enable_if<TypeTraits<T1>::is_parameter_free, MemoryPool*>::type pool
369  : PrimitiveBuilder<T1>(TypeTraits<T1>::type_singleton(), pool) {}
370 
374 
    // Appends one valid value. NOTE(review): listing line 377 is missing, so no
    // capacity reservation is visible here before the unsafe append.
376  Status Append(const value_type val) {
378  UnsafeAppend(val);
379  return Status::OK();
380  }
381 
    // Marks slot length_ valid, stores val there and advances length_. No
    // capacity check is performed.
387  void UnsafeAppend(const value_type val) {
388  BitUtil::SetBit(null_bitmap_data_, length_);
389  raw_data_[length_++] = val;
390  }
391 
392  protected:
396 };
397 
398 // Builders
399 
404 
414 
418 
419 namespace internal {
420 
421 class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder {
422  public:
423  explicit AdaptiveIntBuilderBase(MemoryPool* pool);
424 
426  Status AppendNulls(const uint8_t* valid_bytes, int64_t length) {
427  ARROW_RETURN_NOT_OK(Reserve(length));
428  memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
429  UnsafeAppendToBitmap(valid_bytes, length);
430  return Status::OK();
431  }
432 
433  Status AppendNull() {
434  ARROW_RETURN_NOT_OK(Reserve(1));
435  memset(data_->mutable_data() + length_ * int_size_, 0, int_size_);
436  UnsafeAppendToBitmap(false);
437  return Status::OK();
438  }
439 
440  void Reset() override;
441  Status Resize(int64_t capacity) override;
442 
443  protected:
444  std::shared_ptr<ResizableBuffer> data_;
445  uint8_t* raw_data_;
446 
447  uint8_t int_size_;
448 };
449 
// Check if we would need to expand the underlying storage type.
//
// Returns the byte width (1, 2, 4 or 8) needed to store the signed value `val`
// in a builder whose slots are currently `current_int_size` bytes wide. For
// current sizes of 1, 2, 4 or 8 the result is never smaller than
// `current_int_size`: the width only grows, and only as far as `val` requires.
inline uint8_t ExpandedIntSize(int64_t val, uint8_t current_int_size) {
  const bool exceeds_int32 =
      val > static_cast<int64_t>(std::numeric_limits<int32_t>::max()) ||
      val < static_cast<int64_t>(std::numeric_limits<int32_t>::min());
  const bool exceeds_int16 =
      val > static_cast<int64_t>(std::numeric_limits<int16_t>::max()) ||
      val < static_cast<int64_t>(std::numeric_limits<int16_t>::min());
  const bool exceeds_int8 =
      val > static_cast<int64_t>(std::numeric_limits<int8_t>::max()) ||
      val < static_cast<int64_t>(std::numeric_limits<int8_t>::min());

  // Checked from widest to narrowest so the first match is the required width.
  if (current_int_size == 8 || (current_int_size < 8 && exceeds_int32)) {
    return 8;
  }
  if (current_int_size == 4 || (current_int_size < 4 && exceeds_int16)) {
    return 4;
  }
  if (current_int_size == 2 || (current_int_size == 1 && exceeds_int8)) {
    return 2;
  }
  return 1;
}
471 
// Check if we would need to expand the underlying storage type.
//
// Unsigned counterpart of the signed check: returns the byte width (1, 2, 4 or
// 8) needed to store `val` in a builder whose slots are currently
// `current_int_size` bytes wide. For current sizes of 1, 2, 4 or 8 the result
// is never smaller than `current_int_size`.
inline uint8_t ExpandedUIntSize(uint64_t val, uint8_t current_int_size) {
  const bool exceeds_uint32 =
      val > static_cast<uint64_t>(std::numeric_limits<uint32_t>::max());
  const bool exceeds_uint16 =
      val > static_cast<uint64_t>(std::numeric_limits<uint16_t>::max());
  const bool exceeds_uint8 =
      val > static_cast<uint64_t>(std::numeric_limits<uint8_t>::max());

  // Checked from widest to narrowest so the first match is the required width.
  if (current_int_size == 8 || (current_int_size < 8 && exceeds_uint32)) {
    return 8;
  }
  if (current_int_size == 4 || (current_int_size < 4 && exceeds_uint16)) {
    return 4;
  }
  if (current_int_size == 2 || (current_int_size == 1 && exceeds_uint8)) {
    return 2;
  }
  return 1;
}
490 
491 } // namespace internal
492 
// Unsigned adaptive integer builder: each appended value may widen the slot
// size (1, 2, 4 or 8 bytes) before it is stored.
// NOTE(review): listing line 495 is missing from this extract, so no
// constructor is visible here.
493 class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase {
494  public:
496 
497  using ArrayBuilder::Advance;
498  using internal::AdaptiveIntBuilderBase::Reset;
499 
    // Scalar append.
    // Reserves one slot, marks it valid, widens the storage if val does not fit
    // the current int_size_, then stores val and advances length_.
501  Status Append(const uint64_t val) {
502  ARROW_RETURN_NOT_OK(Reserve(1));
503  BitUtil::SetBit(null_bitmap_data_, length_);
504 
505  uint8_t new_int_size = internal::ExpandedUIntSize(val, int_size_);
506  if (new_int_size != int_size_) {
507  ARROW_RETURN_NOT_OK(ExpandIntSize(new_int_size));
508  }
509 
    // Write through a pointer of the current slot width; each case post-increments
    // length_.
510  switch (int_size_) {
511  case 1:
512  reinterpret_cast<uint8_t*>(raw_data_)[length_++] = static_cast<uint8_t>(val);
513  break;
514  case 2:
515  reinterpret_cast<uint16_t*>(raw_data_)[length_++] = static_cast<uint16_t>(val);
516  break;
517  case 4:
518  reinterpret_cast<uint32_t*>(raw_data_)[length_++] = static_cast<uint32_t>(val);
519  break;
520  case 8:
521  reinterpret_cast<uint64_t*>(raw_data_)[length_++] = val;
522  break;
523  default:
524  return Status::NotImplemented("This code shall never be reached");
525  }
526  return Status::OK();
527  }
528 
    // Bulk append; only declared here.
535  Status AppendValues(const uint64_t* values, int64_t length,
536  const uint8_t* valid_bytes = NULLPTR);
537 
538  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
539 
540  protected:
    // Widening helpers; only declared here.
541  Status ExpandIntSize(uint8_t new_int_size);
542 
    // Two overloads selected by enable_if: one for sizeof(old_type) >=
    // sizeof(new_type), one for sizeof(old_type) < sizeof(new_type). The __LESS
    // macro wraps the `<` comparison in parentheses so it can be used inside the
    // template argument list.
543  template <typename new_type, typename old_type>
544  typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
545  ExpandIntSizeInternal();
546 #define __LESS(a, b) (a) < (b)
547  template <typename new_type, typename old_type>
548  typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type
549  ExpandIntSizeInternal();
550 #undef __LESS
551 
552  template <typename new_type>
553  Status ExpandIntSizeN();
554 };
555 
// Signed adaptive integer builder: each appended value may widen the slot size
// (1, 2, 4 or 8 bytes) before it is stored.
// NOTE(review): listing line 558 is missing from this extract, so no
// constructor is visible here.
556 class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase {
557  public:
559 
560  using ArrayBuilder::Advance;
561  using internal::AdaptiveIntBuilderBase::Reset;
562 
    // Reserves one slot, marks it valid, widens the storage if val does not fit
    // the current int_size_, then stores val and advances length_.
564  Status Append(const int64_t val) {
565  ARROW_RETURN_NOT_OK(Reserve(1));
566  BitUtil::SetBit(null_bitmap_data_, length_);
567 
568  uint8_t new_int_size = internal::ExpandedIntSize(val, int_size_);
569  if (new_int_size != int_size_) {
570  ARROW_RETURN_NOT_OK(ExpandIntSize(new_int_size));
571  }
572 
    // Write through a pointer of the current slot width; each case post-increments
    // length_.
573  switch (int_size_) {
574  case 1:
575  reinterpret_cast<int8_t*>(raw_data_)[length_++] = static_cast<int8_t>(val);
576  break;
577  case 2:
578  reinterpret_cast<int16_t*>(raw_data_)[length_++] = static_cast<int16_t>(val);
579  break;
580  case 4:
581  reinterpret_cast<int32_t*>(raw_data_)[length_++] = static_cast<int32_t>(val);
582  break;
583  case 8:
584  reinterpret_cast<int64_t*>(raw_data_)[length_++] = val;
585  break;
586  default:
587  return Status::NotImplemented("This code shall never be reached");
588  }
589  return Status::OK();
590  }
591 
    // Bulk append; only declared here.
598  Status AppendValues(const int64_t* values, int64_t length,
599  const uint8_t* valid_bytes = NULLPTR);
600 
601  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
602 
603  protected:
    // Widening helpers; only declared here.
604  Status ExpandIntSize(uint8_t new_int_size);
605 
    // Two overloads selected by enable_if: one for sizeof(old_type) >=
    // sizeof(new_type), one for sizeof(old_type) < sizeof(new_type). The __LESS
    // macro wraps the `<` comparison in parentheses so it can be used inside the
    // template argument list.
606  template <typename new_type, typename old_type>
607  typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
608  ExpandIntSizeInternal();
609 #define __LESS(a, b) (a) < (b)
610  template <typename new_type, typename old_type>
611  typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type
612  ExpandIntSizeInternal();
613 #undef __LESS
614 
615  template <typename new_type>
616  Status ExpandIntSizeN();
617 };
618 
// Builder for boolean arrays. Values are stored one bit per element in
// raw_data_, alongside the inherited validity bitmap.
// NOTE(review): listing lines 622 and 636 are missing from this extract. The
// tooltip residue at the end of this page lists line 636 as
// `Status AppendNull()`.
619 class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
620  public:
621  using value_type = bool;
623 
624  explicit BooleanBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
625 
626  using ArrayBuilder::Advance;
627 
    // Reserves `length` slots and appends their validity from valid_bytes. The
    // value bits are not written here.
629  Status AppendNulls(const uint8_t* valid_bytes, int64_t length) {
630  ARROW_RETURN_NOT_OK(Reserve(length));
631  UnsafeAppendToBitmap(valid_bytes, length);
632 
633  return Status::OK();
634  }
635 
    // Body of the function declared on the missing line 636: reserves one slot
    // and appends a single null entry.
637  ARROW_RETURN_NOT_OK(Reserve(1));
638  UnsafeAppendToBitmap(false);
639 
640  return Status::OK();
641  }
642 
    // Appends one valid value: sets the validity bit, then sets or clears the
    // value bit at length_ and advances length_.
644  Status Append(const bool val) {
645  ARROW_RETURN_NOT_OK(Reserve(1));
646  BitUtil::SetBit(null_bitmap_data_, length_);
647  if (val) {
648  BitUtil::SetBit(raw_data_, length_);
649  } else {
650  BitUtil::ClearBit(raw_data_, length_);
651  }
652  ++length_;
653  return Status::OK();
654  }
655 
    // Byte overload: any non-zero byte is appended as true.
656  Status Append(const uint8_t val) { return Append(val != 0); }
657 
    // The next six AppendValues overloads are only declared here; their
    // definitions are not part of this listing.
664  Status AppendValues(const uint8_t* values, int64_t length,
665  const uint8_t* valid_bytes = NULLPTR);
666 
673  Status AppendValues(const uint8_t* values, int64_t length,
674  const std::vector<bool>& is_valid);
675 
681  Status AppendValues(const std::vector<uint8_t>& values,
682  const std::vector<bool>& is_valid);
683 
687  Status AppendValues(const std::vector<uint8_t>& values);
688 
694  Status AppendValues(const std::vector<bool>& values, const std::vector<bool>& is_valid);
695 
699  Status AppendValues(const std::vector<bool>& values);
700 
    // Appends every element of [values_begin, values_end), converted to bool,
    // and marks all of them valid.
706  template <typename ValuesIter>
707  Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
708  int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
709  ARROW_RETURN_NOT_OK(Reserve(length));
710  auto iter = values_begin;
    // The generator lambda hands out the next input element on each call.
711  internal::GenerateBitsUnrolled(raw_data_, length_, length,
712  [&iter]() -> bool { return *(iter++); });
713 
714  // this updates length_
715  UnsafeSetNotNull(length);
716  return Status::OK();
717  }
718 
    // Same as above with a validity sequence. Selected when ValidIter is not a
    // pointer type; validity is read from the `length` elements starting at
    // valid_begin.
725  template <typename ValuesIter, typename ValidIter>
726  typename std::enable_if<!std::is_pointer<ValidIter>::value, Status>::type AppendValues(
727  ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
728  static_assert(!internal::is_null_pointer<ValidIter>::value,
729  "Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
730  "version instead");
731  int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
732  ARROW_RETURN_NOT_OK(Reserve(length));
733 
734  auto iter = values_begin;
735  internal::GenerateBitsUnrolled(raw_data_, length_, length,
736  [&iter]() -> bool { return *(iter++); });
737 
738  // this updates length_
739  ArrayBuilder::UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length));
740  return Status::OK();
741  }
742 
    // Append a sequence of elements in one shot, with a specified nullmap.
    // Pointer flavour: a null valid_begin means every appended element is valid.
749  template <typename ValuesIter, typename ValidIter>
750  typename std::enable_if<std::is_pointer<ValidIter>::value, Status>::type AppendValues(
751  ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
752  int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
753  ARROW_RETURN_NOT_OK(Reserve(length));
754 
755  auto iter = values_begin;
756  internal::GenerateBitsUnrolled(raw_data_, length_, length,
757  [&iter]() -> bool { return *(iter++); });
758 
759  // this updates the length_
760  if (valid_begin == NULLPTR) {
761  UnsafeSetNotNull(length);
762  } else {
763  UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length));
764  }
765 
766  return Status::OK();
767  }
768 
769  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
770  void Reset() override;
771  Status Resize(int64_t capacity) override;
772 
773  protected:
    // Bit-packed value buffer and the raw byte pointer used by the append code.
774  std::shared_ptr<ResizableBuffer> data_;
775  uint8_t* raw_data_;
776 };
777 
778 // ----------------------------------------------------------------------
779 // List builder
780 
// Builder for list arrays. Element values are appended through a separate
// value builder that this class shares ownership of.
// NOTE(review): listing line 823 is missing from this extract, so at least one
// protected member is not visible here.
794 class ARROW_EXPORT ListBuilder : public ArrayBuilder {
795  public:
    // `type` defaults to NULLPTR; how a null type is resolved is decided in the
    // out-of-line definition, which is not part of this listing.
798  ListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> const& value_builder,
799  const std::shared_ptr<DataType>& type = NULLPTR);
800 
801  Status Resize(int64_t capacity) override;
802  void Reset() override;
803  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
804 
    // Bulk append of int32 offsets with optional validity bytes; only declared here.
809  Status AppendValues(const int32_t* offsets, int64_t length,
810  const uint8_t* valid_bytes = NULLPTR);
811 
    // Appends one list slot; only declared here.
816  Status Append(bool is_valid = true);
817 
    // Shorthand for Append(false).
818  Status AppendNull() { return Append(false); }
819 
820  ArrayBuilder* value_builder() const;
821 
822  protected:
824  std::shared_ptr<ArrayBuilder> value_builder_;
825  std::shared_ptr<Array> values_;
826 
827  Status AppendNextOffset();
828 };
829 
830 // ----------------------------------------------------------------------
831 // Binary and String
832 
// Builder class for variable-length binary data.
// NOTE(review): listing lines 837 and 873-874 are missing from this extract.
// The tooltip residue at the end of this page lists line 874 as
// `TypedBufferBuilder< uint8_t > value_data_builder_`, which the two accessors
// below read.
835 class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {
836  public:
838 
839  BinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
840 
    // Appends `length` bytes starting at `value`; only declared here.
841  Status Append(const uint8_t* value, int32_t length);
842 
    // char* convenience overload; forwards to the uint8_t* version.
843  Status Append(const char* value, int32_t length) {
844  return Append(reinterpret_cast<const uint8_t*>(value), length);
845  }
846 
    // std::string convenience overload; the size is narrowed to int32_t.
847  Status Append(const std::string& value) {
848  return Append(value.c_str(), static_cast<int32_t>(value.size()));
849  }
850 
851  Status AppendNull();
852 
853  void Reset() override;
854  Status Resize(int64_t capacity) override;
855 
858  Status ReserveData(int64_t elements);
859 
860  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
861 
    // Current length and capacity reported by value_data_builder_.
863  int64_t value_data_length() const { return value_data_builder_.length(); }
865  int64_t value_data_capacity() const { return value_data_builder_.capacity(); }
866 
    // Returns a pointer to the i-th value and writes its length to *out_length;
    // only declared here.
870  const uint8_t* GetValue(int64_t i, int32_t* out_length) const;
871 
872  protected:
875 
876  Status AppendNextOffset();
877 };
878 
// Builder class for UTF8 strings.
// NOTE(review): listing lines 883-884 are missing from this extract, so no
// constructor is visible here.
881 class ARROW_EXPORT StringBuilder : public BinaryBuilder {
882  public:
885 
    // Re-export the base-class overloads, which the AppendValues declarations
    // below do not replace.
886  using BinaryBuilder::Append;
887  using BinaryBuilder::Reset;
888 
    // Bulk append from a vector of strings with optional validity bytes; only
    // declared here.
895  Status AppendValues(const std::vector<std::string>& values,
896  const uint8_t* valid_bytes = NULLPTR);
897 
    // Bulk append from an array of `length` C strings with optional validity
    // bytes; only declared here.
907  Status AppendValues(const char** values, int64_t length,
908  const uint8_t* valid_bytes = NULLPTR);
909 };
910 
911 // ----------------------------------------------------------------------
912 // FixedSizeBinaryBuilder
913 
// Builder for binary values that all have the same width, byte_width_.
// NOTE(review): listing lines 917 and 956 are missing from this extract, so the
// constructor's parameter list is cut short and the declaration of
// byte_builder_ (used throughout) is not visible here.
914 class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
915  public:
916  FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
918 
    // Appends one valid value of byte_width_ bytes read from `value`.
    // NOTE(review): the bitmap and length are updated before the bytes are
    // appended, so a failing byte_builder_.Append leaves length_ already advanced.
919  Status Append(const uint8_t* value) {
920  ARROW_RETURN_NOT_OK(Reserve(1));
921  UnsafeAppendToBitmap(true);
922  return byte_builder_.Append(value, byte_width_);
923  }
    // char* convenience overload; forwards to the uint8_t* version.
924  Status Append(const char* value) {
925  return Append(reinterpret_cast<const uint8_t*>(value));
926  }
927 
    // std::array overload. NBYTES is not compared against byte_width_ here.
928  template <size_t NBYTES>
929  Status Append(const std::array<uint8_t, NBYTES>& value) {
930  ARROW_RETURN_NOT_OK(Reserve(1));
931  UnsafeAppendToBitmap(true);
932  return byte_builder_.Append(value);
933  }
934 
    // The following are only declared here; their definitions are not part of
    // this listing.
935  Status AppendValues(const uint8_t* data, int64_t length,
936  const uint8_t* valid_bytes = NULLPTR);
937  Status Append(const std::string& value);
938  Status AppendNull();
939 
940  void Reset() override;
941  Status Resize(int64_t capacity) override;
942  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
943 
    // Current length reported by byte_builder_.
945  int64_t value_data_length() const { return byte_builder_.length(); }
946 
947  int32_t byte_width() const { return byte_width_; }
948 
    // Returns a pointer to the i-th value; only declared here.
952  const uint8_t* GetValue(int64_t i) const;
953 
954  protected:
    // Width in bytes of every value.
955  int32_t byte_width_;
957 };
958 
// Builder for Decimal128 values, layered on FixedSizeBinaryBuilder.
// NOTE(review): listing line 962 is missing from this extract, so the
// constructor's parameter list is cut short.
959 class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
960  public:
961  explicit Decimal128Builder(const std::shared_ptr<DataType>& type,
963 
967 
    // Appends one decimal value; only declared here.
968  Status Append(const Decimal128& val);
969 
970  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
971 };
972 
974 
975 // ----------------------------------------------------------------------
976 // Struct
977 
978 // ---------------------------------------------------------------------------------
979 // StructArray builder
983 class ARROW_EXPORT StructBuilder : public ArrayBuilder {
984  public:
985  StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
986  std::vector<std::shared_ptr<ArrayBuilder>>&& field_builders);
987 
988  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
989 
994  Status AppendValues(int64_t length, const uint8_t* valid_bytes) {
995  ARROW_RETURN_NOT_OK(Reserve(length));
996  UnsafeAppendToBitmap(valid_bytes, length);
997  return Status::OK();
998  }
999 
1002  Status Append(bool is_valid = true) {
1003  ARROW_RETURN_NOT_OK(Reserve(1));
1004  UnsafeAppendToBitmap(is_valid);
1005  return Status::OK();
1006  }
1007 
1008  Status AppendNull() { return Append(false); }
1009 
1010  void Reset() override;
1011 
1012  ArrayBuilder* field_builder(int i) const { return field_builders_[i].get(); }
1013 
1014  int num_fields() const { return static_cast<int>(field_builders_.size()); }
1015 
1016  protected:
1017  std::vector<std::shared_ptr<ArrayBuilder>> field_builders_;
1018 };
1019 
1020 // ----------------------------------------------------------------------
1021 // Dictionary builder
1022 
1023 namespace internal {
1024 
// TODO(ARROW-1176): Use Tensorflow's StringPiece instead of this here.
//
// Plain (pointer, length) pair describing a run of bytes. The struct stores the
// pointer as given; it neither copies nor frees the bytes it points to.
struct WrappedBinary {
  WrappedBinary(const uint8_t* ptr, int32_t length) : ptr_(ptr), length_(length) {}

  const uint8_t* ptr_;  // first byte of the run
  int32_t length_;      // number of bytes in the run
};
1032 
// Maps a type T to the scalar type `type`. This primary template uses
// T::c_type.
template <typename T>
struct DictionaryScalar {
  using type = typename T::c_type;
};
1037 
1038 template <>
1039 struct DictionaryScalar<BinaryType> {
1040  using type = WrappedBinary;
1041 };
1042 
1043 template <>
1044 struct DictionaryScalar<StringType> {
1045  using type = WrappedBinary;
1046 };
1047 
1048 template <>
1049 struct DictionaryScalar<FixedSizeBinaryType> {
1050  using type = const uint8_t*;
1051 };
1052 
1053 } // namespace internal
1054 
// Array builder for creating an encoded DictionaryArray from a dense array.
// NOTE(review): this extract is missing listing lines 1071, 1104-1105, 1110,
// 1119, 1121, 1123 and 1126-1127, so several members are not visible and the
// second constructor lacks its first line. The tooltip residue at the end of
// this page lists line 1123 as `AdaptiveIntBuilder values_builder_`.
1063 template <typename T>
1064 class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {
1065  public:
    // Scalar type accepted by Append(), selected by internal::DictionaryScalar<T>.
1066  using Scalar = typename internal::DictionaryScalar<T>::type;
1067 
1068  DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
1069 
    // Pool-only constructor, available when TypeTraits<T1>::is_parameter_free is
    // true; it delegates with TypeTraits<T1>::type_singleton() as the type.
1070  template <typename T1 = T>
1072  typename std::enable_if<TypeTraits<T1>::is_parameter_free, MemoryPool*>::type pool)
1073  : DictionaryBuilder<T1>(TypeTraits<T1>::type_singleton(), pool) {}
1074 
    // The append operations are only declared here; their definitions are not
    // part of this listing.
1076  Status Append(const Scalar& value);
1077 
1079  Status AppendNull();
1080 
1082  Status AppendArray(const Array& array);
1083 
1084  void Reset() override;
1085  Status Resize(int64_t capacity) override;
1086  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
1087 
    // True once entry_id_offset_ is positive. That member's declaration is on a
    // missing listing line.
1089  bool is_building_delta() { return entry_id_offset_ > 0; }
1090 
1091  protected:
1092  // Hash table implementation helpers
1093  Status DoubleTableSize();
1094  Scalar GetDictionaryValue(typename TypeTraits<T>::BuilderType& dictionary_builder,
1095  int64_t index);
1096  int64_t HashValue(const Scalar& value);
1097  // Compare the dictionary entry in *slot* with the given *value*.
    // NOTE(review): the name suggests it returns true when they differ; confirm
    // against the definition, which is not part of this listing.
1098  bool SlotDifferent(hash_slot_t slot, const Scalar& value);
1099  Status AppendDictionary(const Scalar& value);
1100 
1101  std::shared_ptr<Buffer> hash_table_;
1102  int32_t* hash_slots_;
1103 
1106 
1107  // Offset for the dictionary entries in dict_builder_.
1108  // Increased on every Finish call by the number of current entries
1109  // in the dictionary.
    // NOTE(review): the member this comment describes (listing line 1110) is missing.
1111 
1112  // Store hash_table_size_ - 1, so that j & mod_bitmask_ is equivalent to j %
1113  // hash_table_size_, but uses far fewer CPU cycles
1114  int64_t mod_bitmask_;
1115 
1116  // This builder accumulates new dictionary entries since the last Finish call
1117  // (or since the beginning if Finish hasn't been called).
1118  // In other words, it contains the current delta dictionary.
    // NOTE(review): the member this comment describes (listing line 1119) is missing.
1120  // This builder stores dictionary entries encountered before the last Finish call.
    // NOTE(review): the member this comment describes (listing line 1121) is missing.
1122 
1124  int32_t byte_width_;
1125 
1128 };
1129 
// Specialization of DictionaryBuilder for NullType. Unlike the primary
// template, it declares no Append(const Scalar&) and no hash-table helpers.
// NOTE(review): listing line 1146 is missing from this extract, so the
// protected section appears empty.
1130 template <>
1131 class ARROW_EXPORT DictionaryBuilder<NullType> : public ArrayBuilder {
1132  public:
1133  DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
1134  explicit DictionaryBuilder(MemoryPool* pool);
1135 
    // The append operations are only declared here.
1137  Status AppendNull();
1138 
1140  Status AppendArray(const Array& array);
1141 
1142  Status Resize(int64_t capacity) override;
1143  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
1144 
1145  protected:
1147 };
1148 
// Dictionary builder for BinaryType with convenience Append overloads. Each
// overload wraps its argument in an internal::WrappedBinary and forwards it to
// an Append that accepts that type.
// NOTE(review): listing lines 1151-1152 are missing from this extract;
// presumably they bring the base-class Append and constructors into scope.
1149 class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder<BinaryType> {
1150  public:
1153 
    // Appends the `length` bytes starting at `value`.
1154  Status Append(const uint8_t* value, int32_t length) {
1155  return Append(internal::WrappedBinary(value, length));
1156  }
1157 
    // char* convenience overload.
1158  Status Append(const char* value, int32_t length) {
1159  return Append(
1160  internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value), length));
1161  }
1162 
    // std::string convenience overload; the size is narrowed to int32_t.
1163  Status Append(const std::string& value) {
1164  return Append(internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value.c_str()),
1165  static_cast<int32_t>(value.size())));
1166  }
1167 };
1168 
// Dictionary array builder with convenience methods for strings. Each overload
// wraps its argument in an internal::WrappedBinary and forwards it to an Append
// that accepts that type.
// NOTE(review): listing lines 1172-1173 are missing from this extract;
// presumably they bring the base-class Append and constructors into scope.
1170 class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder<StringType> {
1171  public:
1174 
    // Appends the `length` bytes starting at `value`.
1175  Status Append(const uint8_t* value, int32_t length) {
1176  return Append(internal::WrappedBinary(value, length));
1177  }
1178 
    // char* convenience overload.
1179  Status Append(const char* value, int32_t length) {
1180  return Append(
1181  internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value), length));
1182  }
1183 
    // std::string convenience overload; the size is narrowed to int32_t.
1184  Status Append(const std::string& value) {
1185  return Append(internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value.c_str()),
1186  static_cast<int32_t>(value.size())));
1187  }
1188 };
1189 
1190 // ----------------------------------------------------------------------
1191 // Helper functions
1192 
1193 ARROW_EXPORT
1194 Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
1195  std::unique_ptr<ArrayBuilder>* out);
1196 
1197 } // namespace arrow
1198 
1199 #endif // ARROW_BUILDER_H
typename T ::c_type value_type
Definition: builder.h:225
std::vector< std::unique_ptr< ArrayBuilder > > children_
Definition: builder.h:193
Builder class for UTF8 strings.
Definition: builder.h:881
void UnsafeAppendToBitmap(bool is_valid)
Definition: builder.h:136
std::shared_ptr< ResizableBuffer > data_
Definition: builder.h:774
Array builder for creating an encoded DictionaryArray from a dense array.
Definition: builder.h:1064
NullBuilder(MemoryPool *pool ARROW_MEMORY_POOL_DEFAULT)
Definition: builder.h:210
int64_t mod_bitmask_
Definition: builder.h:1114
Builder class for variable-length binary data.
Definition: builder.h:835
#define NULLPTR
Definition: macros.h:69
AdaptiveIntBuilder values_builder_
Definition: builder.h:1123
Dictionary array builder with convenience methods for strings.
Definition: builder.h:1170
void Reset() override
Reset the builder.
#define ARROW_MEMORY_POOL_DEFAULT
Definition: memory_pool.h:117
Status Append(const uint8_t *value, int32_t length)
Definition: builder.h:1154
Status Append(const char *value)
Definition: builder.h:924
Status Append(const uint8_t *value, int32_t length)
TypedBufferBuilder< uint8_t > value_data_builder_
Definition: builder.h:874
int num_children() const
Definition: builder.h:76
DictionaryBuilder(typename std::enable_if< TypeTraits< T1 >::is_parameter_free, MemoryPool *>::type pool)
Definition: builder.h:1071
std::shared_ptr< ResizableBuffer > data_
Definition: builder.h:354
DictionaryBuilder(const std::shared_ptr< DataType > &type, MemoryPool *pool)
std::enable_if< std::is_pointer< ValidIter >::value, Status >::type AppendValues(ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin)
Append a sequence of elements in one shot, with a specified nullmap.
Definition: builder.h:750
#define ARROW_DISALLOW_COPY_AND_ASSIGN(TypeName)
Definition: macros.h:23
Status MakeBuilder(MemoryPool *pool, const std::shared_ptr< DataType > &type, std::unique_ptr< ArrayBuilder > *out)
Status AppendNull()
Definition: builder.h:636
Definition: builder.h:493
Definition: builder.h:223
int64_t null_count_
Definition: builder.h:185
value_type * raw_data_
Definition: builder.h:355
Status Append(bool is_valid=true)
Append an element to the Struct.
Definition: builder.h:1002
Status Append(const uint64_t val)
Scalar append.
Definition: builder.h:501
constexpr int64_t kListMaximumElements
Definition: builder.h:50
Status AppendNulls(const uint8_t *valid_bytes, int64_t length)
Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory. The memory at the correspo...
Definition: builder.h:235
int64_t length() const
Definition: builder.h:78
int64_t length_
Definition: builder.h:189
int32_t byte_width() const
Definition: builder.h:947
std::shared_ptr< Array > values_
Definition: builder.h:825
Status Append(const std::string &value)
Definition: builder.h:1184
std::enable_if<!std::is_pointer< ValidIter >::value, Status >::type AppendValues(ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin)
Append a sequence of elements in one shot, with a specified nullmap.
Definition: builder.h:309
int32_t byte_width_
Definition: builder.h:955
ArrayBuilder(const std::shared_ptr< DataType > &type, MemoryPool *pool)
Definition: builder.h:61
constexpr int64_t kMinBuilderCapacity
Definition: builder.h:52
Base class for all Builders that emit an Array of a scalar numerical type.
Definition: builder.h:360
int64_t null_count() const
Definition: builder.h:79
Base class for all data array builders.
Definition: builder.h:59
Status AppendNull()
Definition: builder.h:1008
Definition: status.h:95
int32_t * hash_slots_
Definition: builder.h:1102
Status Append(const char *value, int32_t length)
Definition: builder.h:1179
Definition: builder.h:619
Definition: type_traits.h:34
PrimitiveBuilder(const std::shared_ptr< DataType > &type, MemoryPool *pool)
Definition: builder.h:227
Definition: builder.h:1149
ArrayBuilder()
Definition: builder.h:178
int num_fields() const
Definition: builder.h:1014
Status Append(const std::array< uint8_t, NBYTES > &value)
Definition: builder.h:929
Status AppendValues(int64_t length, const uint8_t *valid_bytes)
Null bitmap is of equal length to every child field, and any zero byte will be considered as a null f...
Definition: builder.h:994
Concrete type class for always-null data.
Definition: type.h:311
static Status OK()
Definition: status.h:124
ArrayBuilder * child(int i)
For nested types.
Definition: builder.h:74
int32_t byte_width_
Definition: builder.h:1124
std::shared_ptr< DataType > null()
Return a NullType instance.
Status Append(const Scalar &value)
Append a scalar value.
void Reset() override
Reset the builder.
int32_t hash_slot_t
Definition: hash.h:29
Status Append(const std::string &value)
Definition: builder.h:847
Status AppendNull()
Definition: builder.h:818
Definition: builder.h:556
int64_t entry_id_offset_
Definition: builder.h:1110
value_type GetValue(int64_t index) const
Definition: builder.h:250
Status Append(const std::string &value)
Definition: builder.h:1163
int64_t capacity() const
Definition: builder.h:80
Status AppendValues(const uint8_t *data, int64_t length, const uint8_t *valid_bytes=NULLPTR)
uint8_t * null_bitmap_data_
Definition: builder.h:186
ArrayBuilder * field_builder(int i) const
Definition: builder.h:1012
Status AppendValues(ValuesIter values_begin, ValuesIter values_end)
Append a sequence of elements in one shot.
Definition: builder.h:707
Definition: builder.h:914
bool value_type
Definition: builder.h:621
TypedBufferBuilder< int32_t > offsets_builder_
Definition: builder.h:873
AdaptiveIntBuilder values_builder_
Definition: builder.h:1146
std::vector< std::shared_ptr< ArrayBuilder > > field_builders_
Definition: builder.h:1017
std::shared_ptr< DataType > type_
Definition: builder.h:180
Status Append(const int64_t val)
Scalar append.
Definition: builder.h:564
BufferBuilder byte_builder_
Definition: builder.h:956
void UnsafeAppendToBitmap(const IterType &begin, const IterType &end)
Definition: builder.h:146
Status AppendNull()
Definition: builder.h:213
Top-level namespace for Apache Arrow C++ API.
Definition: adapter.h:32
Array base type. Immutable data array with some logical type and some length.
Definition: array.h:200
TypedBufferBuilder< int32_t > offsets_builder_
Definition: builder.h:823
TypeTraits< T >::BuilderType overflow_dict_builder_
Definition: builder.h:1121
int64_t hash_table_size_
Size of the table. Must be a power of 2.
Definition: builder.h:1105
int64_t value_data_length() const
Definition: builder.h:863
Append, Resize and Reserve methods act on the StructBuilder.
Definition: builder.h:983
int64_t capacity_
Definition: builder.h:190
Status Append(const uint8_t val)
Definition: builder.h:656
BinaryBuilder(MemoryPool *pool ARROW_MEMORY_POOL_DEFAULT)
NumericBuilder(typename std::enable_if< TypeTraits< T1 >::is_parameter_free, MemoryPool *>::type pool ARROW_MEMORY_POOL_DEFAULT)
Definition: builder.h:366
Status Append(const char *value, int32_t length)
Definition: builder.h:1158
Status AppendNulls(const uint8_t *valid_bytes, int64_t length)
Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory.
Definition: builder.h:629
Status Append(const value_type val)
Append a single scalar and increase the size if necessary.
Definition: builder.h:376
Status Append(const char *value, int32_t length)
Definition: builder.h:843
Definition: builder.h:959
Status Append(const uint8_t *value)
Definition: builder.h:919
Status Advance(int64_t elements)
For cases where raw data was memcpy'd into the internal buffers, allows us to advance the length of t...
bool is_building_delta()
Whether the dictionary builder is in delta building mode.
Definition: builder.h:1089
std::shared_ptr< DataType > type() const
Definition: builder.h:131
MemoryPool * pool_
Definition: builder.h:181
std::shared_ptr< Buffer > hash_table_
Definition: builder.h:1101
static Status NotImplemented(const std::string &msg)
Definition: status.h:146
#define ARROW_RETURN_NOT_OK(s)
Definition: status.h:44
uint8_t * raw_data_
Definition: builder.h:775
void UnsafeAppend(const value_type val)
Append a single scalar under the assumption that the underlying Buffer is large enough.
Definition: builder.h:387
std::shared_ptr< ArrayBuilder > value_builder_
Definition: builder.h:824
int64_t hash_table_load_threshold_
Size at which we decide to resize.
Definition: builder.h:1127
int64_t value_data_capacity() const
Definition: builder.h:865
A class for incrementally building a contiguous chunk of in-memory data.
Definition: buffer.h:372
Base class for memory allocation.
Definition: memory_pool.h:34
Status Append(const uint8_t *value, int32_t length)
Definition: builder.h:1175
std::enable_if< std::is_pointer< ValidIter >::value, Status >::type AppendValues(ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin)
Append a sequence of elements in one shot, with a specified nullmap.
Definition: builder.h:331
typename internal::DictionaryScalar< BinaryType >::type Scalar
Definition: builder.h:1066
Status Reserve(int64_t additional_capacity)
Ensure that there is enough space allocated to add the indicated number of elements without any furth...
constexpr int64_t kBinaryMemoryLimit
Definition: builder.h:49
int64_t value_data_length() const
Definition: builder.h:945
Definition: builder.h:208
std::enable_if<!std::is_pointer< ValidIter >::value, Status >::type AppendValues(ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin)
Append a sequence of elements in one shot, with a specified nullmap.
Definition: builder.h:726
Represents a signed 128-bit integer in two's complement.
Definition: decimal.h:42
std::shared_ptr< ResizableBuffer > null_bitmap_
Definition: builder.h:184
Status Append(const bool val)
Scalar append.
Definition: builder.h:644
Status AppendNull()
Definition: builder.h:243
TypeTraits< T >::BuilderType dict_builder_
Definition: builder.h:1119
Builder class for variable-length list array value types.
Definition: builder.h:794
Status AppendValues(ValuesIter values_begin, ValuesIter values_end)
Append a sequence of elements in one shot.
Definition: builder.h:291