Apache Arrow (C++)
A columnar in-memory analytics layer designed to accelerate big data.
builder.h
Go to the documentation of this file.
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #ifndef ARROW_BUILDER_H
19 #define ARROW_BUILDER_H
20 
21 #include <array>
22 #include <cstdint>
23 #include <functional>
24 #include <limits>
25 #include <memory>
26 #include <string>
27 #include <vector>
28 
29 #include "arrow/buffer.h"
30 #include "arrow/memory_pool.h"
31 #include "arrow/status.h"
32 #include "arrow/table.h"
33 #include "arrow/type.h"
34 #include "arrow/type_traits.h"
35 #include "arrow/util/bit-util.h"
36 #include "arrow/util/macros.h"
37 #include "arrow/util/visibility.h"
38 
39 namespace arrow {
40 
41 class Array;
42 class Decimal128;
43 
44 namespace internal {
45 
46 struct ArrayData;
47 
48 } // namespace internal
49 
// Minimum builder capacity constant: 2^5 = 32.
static constexpr int64_t kMinBuilderCapacity = int64_t{1} << 5;
51 
53 //
// Base class for all data array builders. Holds the logical type, the validity
// (null) bitmap with its null count, the current length/capacity, and any child
// builders for nested types.
// NOTE(review): this listing skips several source lines (127, 130, 135, 168). The
// constructor below initializes pool_ and null_bitmap_data_, whose declarations are
// among the skipped lines — consult the real header before editing.
57 class ARROW_EXPORT ArrayBuilder {
58  public:
  // Stores the type and pool; starts with no bitmap, zero nulls, zero length and
  // zero capacity.
59  explicit ArrayBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
60  : type_(type),
61  pool_(pool),
62  null_bitmap_(NULLPTR),
63  null_count_(0),
64  null_bitmap_data_(NULLPTR),
65  length_(0),
66  capacity_(0) {}
67 
68  virtual ~ArrayBuilder() = default;
69 
  // For nested types: returns the i-th child builder (non-owning). No bounds check
  // is performed on i.
72  ArrayBuilder* child(int i) { return children_[i].get(); }
73 
  // Number of child builders, narrowed to int.
74  int num_children() const { return static_cast<int>(children_.size()); }
75 
  // Plain accessors for the counters maintained by the append operations.
76  int64_t length() const { return length_; }
77  int64_t null_count() const { return null_count_; }
78  int64_t capacity() const { return capacity_; }
79 
  // Status-returning counterparts of the Unsafe* bitmap helpers declared further
  // down; defined outside this header.
81  Status AppendToBitmap(bool is_valid);
82 
85  Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length);
86 
88  Status SetNotNull(int64_t length);
89 
93  virtual Status Init(int64_t capacity);
94 
98  virtual Status Resize(int64_t new_bits);
99 
  // Ensures there is enough space for adding `elements` more values by checking
  // capacity (and, per the page index, resizing when needed).
102  Status Reserve(int64_t elements);
103 
107  Status Advance(int64_t elements);
108 
  // Returns a shared handle to the validity bitmap buffer (may be null; the
  // constructor initializes it to NULLPTR).
109  std::shared_ptr<PoolBuffer> null_bitmap() const { return null_bitmap_; }
110 
  // Pure virtual: every concrete builder supplies its own ArrayData production.
116  virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0;
117 
122  Status Finish(std::shared_ptr<Array>* out);
123 
124  std::shared_ptr<DataType> type() const { return type_; }
125 
126  protected:
128 
129  std::shared_ptr<DataType> type_;
131 
132  // The null bitmap is allocated when validity bits are first appended to the builder
133  std::shared_ptr<PoolBuffer> null_bitmap_;
134  int64_t null_count_;
136 
137  // Array length, so far. Also, the index of the next element to be added
138  int64_t length_;
139  int64_t capacity_;
140 
141  // Child value array builders. These are owned by this class
142  std::vector<std::unique_ptr<ArrayBuilder>> children_;
143 
144  void Reset();
145 
146  // Unsafe operations (don't check capacity/don't resize)
147 
148  // Append to null bitmap. A valid entry sets bit `length_`; a null entry only bumps
  // null_count_ (the bit is left as it was). length_ advances in both cases.
149  void UnsafeAppendToBitmap(bool is_valid) {
150  if (is_valid) {
151  BitUtil::SetBit(null_bitmap_data_, length_);
152  } else {
153  ++null_count_;
154  }
155  ++length_;
156  }
157 
158  // Vector append. Treat each zero byte as a null. If valid_bytes is null
159  // assume all of length bits are valid.
160  void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length);
161 
162  void UnsafeAppendToBitmap(const std::vector<bool>& is_valid);
163 
164  // Set the next length bits to not null (i.e. valid).
165  void UnsafeSetNotNull(int64_t length);
166 
167  private:
169 };
170 
// Builder whose ArrayBuilder base is constructed with the null() type.
// NOTE(review): listing lines 173 and 176 are absent here. The index at the end of
// this page gives them as the `NullBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT)`
// constructor head and the `Status AppendNull()` signature.
171 class ARROW_EXPORT NullBuilder : public ArrayBuilder {
172  public:
174  : ArrayBuilder(null(), pool) {}
175 
  // Body of AppendNull(): counts one more null and one more element; touches no
  // bitmap or buffer and always returns OK.
177  ++null_count_;
178  ++length_;
179  return Status::OK();
180  }
181 
182  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
183 };
184 
// Builder template for arrays whose elements are stored as `Type::c_type` values in
// a single data buffer (data_ / raw_data_) alongside the inherited validity bitmap.
// NOTE(review): listing line 202 (the signature that opens the second inline body
// below, presumably `Status AppendNull() {`) is absent from this page.
185 template <typename Type>
186 class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder {
187  public:
188  using value_type = typename Type::c_type;
189 
  // Starts with no data buffer allocated.
190  explicit PrimitiveBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
191  : ArrayBuilder(type, pool), data_(NULLPTR), raw_data_(NULLPTR) {}
192 
193  using ArrayBuilder::Advance;
194 
  // Write nulls as uint8_t* (0 value indicates null). Reserves `length` slots and
  // appends only validity information; no value slot is written here.
196  Status AppendNulls(const uint8_t* valid_bytes, int64_t length) {
197  RETURN_NOT_OK(Reserve(length));
198  UnsafeAppendToBitmap(valid_bytes, length);
199  return Status::OK();
200  }
201 
  // Reserves one slot and records it as null in the bitmap.
203  RETURN_NOT_OK(Reserve(1));
204  UnsafeAppendToBitmap(false);
205  return Status::OK();
206  }
207 
  // Shared handle to the value buffer, exposed as the base Buffer type.
208  std::shared_ptr<Buffer> data() const { return data_; }
209 
  // Bulk appends. valid_bytes / is_valid carry per-element validity; the pointer
  // overload defaults valid_bytes to NULLPTR. Definitions live outside this header.
216  Status Append(const value_type* values, int64_t length,
217  const uint8_t* valid_bytes = NULLPTR);
218 
225  Status Append(const value_type* values, int64_t length,
226  const std::vector<bool>& is_valid);
227 
233  Status Append(const std::vector<value_type>& values, const std::vector<bool>& is_valid);
234 
238  Status Append(const std::vector<value_type>& values);
239 
240  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
241  Status Init(int64_t capacity) override;
242 
245  Status Resize(int64_t capacity) override;
246 
247  protected:
  // Value storage and a raw typed pointer used for element writes.
248  std::shared_ptr<PoolBuffer> data_;
249  value_type* raw_data_;
250 };
251 
// Base class for all Builders that emit an Array of a scalar numerical type.
// NOTE(review): this listing omits lines 257, 262, 265-268, 270, 272, 277-281 and
// 288-290. In particular the constructor's closing parenthesis (line 262) and the
// statement on line 272 inside Append() are not visible — do not treat the bodies
// below as complete.
253 template <typename T>
254 class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder<T> {
255  public:
256  using typename PrimitiveBuilder<T>::value_type;
258 
  // Pool-only constructor, enabled via enable_if only when
  // TypeTraits<T1>::is_parameter_free holds; it forwards the type singleton.
259  template <typename T1 = T>
260  explicit NumericBuilder(
261  typename std::enable_if<TypeTraits<T1>::is_parameter_free, MemoryPool*>::type pool
263  : PrimitiveBuilder<T1>(TypeTraits<T1>::type_singleton(), pool) {}
264 
269 
  // Appends one value; the visible part delegates to UnsafeAppend and returns OK.
271  Status Append(const value_type val) {
273  UnsafeAppend(val);
274  return Status::OK();
275  }
276 
  // No capacity check: marks slot `length_` valid, stores val there, then advances
  // length_.
282  void UnsafeAppend(const value_type val) {
283  BitUtil::SetBit(null_bitmap_data_, length_);
284  raw_data_[length_++] = val;
285  }
286 
287  protected:
291 };
292 
293 // Builders
294 
299 
309 
313 
314 namespace internal {
315 
316 class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder {
317  public:
318  explicit AdaptiveIntBuilderBase(MemoryPool* pool);
319 
321  Status AppendNulls(const uint8_t* valid_bytes, int64_t length) {
322  RETURN_NOT_OK(Reserve(length));
323  UnsafeAppendToBitmap(valid_bytes, length);
324  return Status::OK();
325  }
326 
327  Status AppendNull() {
328  RETURN_NOT_OK(Reserve(1));
329  UnsafeAppendToBitmap(false);
330  return Status::OK();
331  }
332 
333  std::shared_ptr<Buffer> data() const { return data_; }
334 
335  Status Init(int64_t capacity) override;
336 
339  Status Resize(int64_t capacity) override;
340 
341  protected:
342  std::shared_ptr<PoolBuffer> data_;
343  uint8_t* raw_data_;
344 
345  uint8_t int_size_;
346 };
347 
// Returns the element width in bytes (1, 2, 4 or 8) required to store `val` in a
// signed buffer whose elements are currently `current_int_size` bytes wide.
// A current width of 2, 4 or 8 is never reduced; the width grows when `val` lies
// outside the signed range of the next narrower type.
inline uint8_t ExpandedIntSize(int64_t val, uint8_t current_int_size) {
  // True when val falls outside the closed interval [lowest, highest].
  const auto outside = [val](int64_t lowest, int64_t highest) {
    return val > highest || val < lowest;
  };

  if (current_int_size == 8) {
    return 8;
  }
  if (current_int_size < 8 &&
      outside(static_cast<int64_t>(std::numeric_limits<int32_t>::min()),
              static_cast<int64_t>(std::numeric_limits<int32_t>::max()))) {
    return 8;
  }

  if (current_int_size == 4) {
    return 4;
  }
  if (current_int_size < 4 &&
      outside(static_cast<int64_t>(std::numeric_limits<int16_t>::min()),
              static_cast<int64_t>(std::numeric_limits<int16_t>::max()))) {
    return 4;
  }

  if (current_int_size == 2) {
    return 2;
  }
  if (current_int_size == 1 &&
      outside(static_cast<int64_t>(std::numeric_limits<int8_t>::min()),
              static_cast<int64_t>(std::numeric_limits<int8_t>::max()))) {
    return 2;
  }

  return 1;
}
369 
// Returns the element width in bytes (1, 2, 4 or 8) required to store `val` in an
// unsigned buffer whose elements are currently `current_int_size` bytes wide.
// A current width of 2, 4 or 8 is never reduced; the width grows when `val` exceeds
// the maximum of the next narrower unsigned type.
inline uint8_t ExpandedUIntSize(uint64_t val, uint8_t current_int_size) {
  // True when val is larger than `highest`.
  const auto above = [val](uint64_t highest) { return val > highest; };

  if (current_int_size == 8) {
    return 8;
  }
  if (current_int_size < 8 &&
      above(static_cast<uint64_t>(std::numeric_limits<uint32_t>::max()))) {
    return 8;
  }

  if (current_int_size == 4) {
    return 4;
  }
  if (current_int_size < 4 &&
      above(static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()))) {
    return 4;
  }

  if (current_int_size == 2) {
    return 2;
  }
  if (current_int_size == 1 &&
      above(static_cast<uint64_t>(std::numeric_limits<uint8_t>::max()))) {
    return 2;
  }

  return 1;
}
388 
389 } // namespace internal
390 
// Unsigned builder that stores values at the narrowest width (1, 2, 4 or 8 bytes)
// seen to be sufficient so far, widening the buffer when a larger value arrives.
// NOTE(review): listing line 393 (presumably the constructor) is absent from this page.
391 class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase {
392  public:
394 
395  using ArrayBuilder::Advance;
396 
  // Scalar append. Reserves one slot, marks it valid, widens the storage if `val`
  // needs more bytes than int_size_, then writes `val` at the current width and
  // advances length_.
  // NOTE(review): the validity bit is set before ExpandIntSize(), which can return
  // an error; in that case the bit stays set although length_ is not advanced.
398  Status Append(const uint64_t val) {
399  RETURN_NOT_OK(Reserve(1));
400  BitUtil::SetBit(null_bitmap_data_, length_);
401 
402  uint8_t new_int_size = internal::ExpandedUIntSize(val, int_size_);
403  if (new_int_size != int_size_) {
404  RETURN_NOT_OK(ExpandIntSize(new_int_size));
405  }
406 
  // Store through a pointer of the current element width.
407  switch (int_size_) {
408  case 1:
409  reinterpret_cast<uint8_t*>(raw_data_)[length_++] = static_cast<uint8_t>(val);
410  break;
411  case 2:
412  reinterpret_cast<uint16_t*>(raw_data_)[length_++] = static_cast<uint16_t>(val);
413  break;
414  case 4:
415  reinterpret_cast<uint32_t*>(raw_data_)[length_++] = static_cast<uint32_t>(val);
416  break;
417  case 8:
418  reinterpret_cast<uint64_t*>(raw_data_)[length_++] = val;
419  break;
420  default:
  // Only reached if int_size_ holds something other than 1, 2, 4 or 8.
421  return Status::NotImplemented("This code shall never be reached");
422  }
423  return Status::OK();
424  }
425 
  // Bulk append; valid_bytes defaults to NULLPTR. Defined outside this header.
432  Status Append(const uint64_t* values, int64_t length,
433  const uint8_t* valid_bytes = NULLPTR);
434 
435  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
436 
437  protected:
438  Status ExpandIntSize(uint8_t new_int_size);
439 
  // Two overloads selected by enable_if on whether old_type is at least as wide as
  // new_type (first) or strictly narrower (second).
440  template <typename new_type, typename old_type>
441  typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
442  ExpandIntSizeInternal();
  // NOTE(review): __LESS wraps the `<` comparison used in the template argument
  // below (presumably to keep a bare `<` out of the argument list — confirm).
  // Identifiers containing a double underscore are reserved to the implementation
  // in C++; a differently named macro would be safer.
443 #define __LESS(a, b) (a) < (b)
444  template <typename new_type, typename old_type>
445  typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type
446  ExpandIntSizeInternal();
447 #undef __LESS
448 
449  template <typename new_type>
450  Status ExpandIntSizeN();
451 };
452 
// Signed builder that stores values at the narrowest width (1, 2, 4 or 8 bytes)
// seen to be sufficient so far, widening the buffer when a value no longer fits.
// NOTE(review): listing line 455 (presumably the constructor) is absent from this page.
453 class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase {
454  public:
456 
457  using ArrayBuilder::Advance;
458 
  // Scalar append. Reserves one slot, marks it valid, widens the storage if `val`
  // needs more bytes than int_size_, then writes `val` at the current width and
  // advances length_.
  // NOTE(review): the validity bit is set before ExpandIntSize(), which can return
  // an error; in that case the bit stays set although length_ is not advanced.
460  Status Append(const int64_t val) {
461  RETURN_NOT_OK(Reserve(1));
462  BitUtil::SetBit(null_bitmap_data_, length_);
463 
464  uint8_t new_int_size = internal::ExpandedIntSize(val, int_size_);
465  if (new_int_size != int_size_) {
466  RETURN_NOT_OK(ExpandIntSize(new_int_size));
467  }
468 
  // Store through a pointer of the current element width.
469  switch (int_size_) {
470  case 1:
471  reinterpret_cast<int8_t*>(raw_data_)[length_++] = static_cast<int8_t>(val);
472  break;
473  case 2:
474  reinterpret_cast<int16_t*>(raw_data_)[length_++] = static_cast<int16_t>(val);
475  break;
476  case 4:
477  reinterpret_cast<int32_t*>(raw_data_)[length_++] = static_cast<int32_t>(val);
478  break;
479  case 8:
480  reinterpret_cast<int64_t*>(raw_data_)[length_++] = val;
481  break;
482  default:
  // Only reached if int_size_ holds something other than 1, 2, 4 or 8.
483  return Status::NotImplemented("This code shall never be reached");
484  }
485  return Status::OK();
486  }
487 
  // Bulk append; valid_bytes defaults to NULLPTR. Defined outside this header.
494  Status Append(const int64_t* values, int64_t length,
495  const uint8_t* valid_bytes = NULLPTR);
496 
497  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
498 
499  protected:
500  Status ExpandIntSize(uint8_t new_int_size);
501 
  // Two overloads selected by enable_if on whether old_type is at least as wide as
  // new_type (first) or strictly narrower (second).
502  template <typename new_type, typename old_type>
503  typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
504  ExpandIntSizeInternal();
  // NOTE(review): __LESS wraps the `<` comparison used in the template argument
  // below (presumably to keep a bare `<` out of the argument list — confirm).
  // Identifiers containing a double underscore are reserved to the implementation
  // in C++; a differently named macro would be safer.
505 #define __LESS(a, b) (a) < (b)
506  template <typename new_type, typename old_type>
507  typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type
508  ExpandIntSizeInternal();
509 #undef __LESS
510 
511  template <typename new_type>
512  Status ExpandIntSizeN();
513 };
514 
// Builder for boolean arrays. Values are written bit-wise into raw_data_ (one bit
// per element, via BitUtil::SetBit/ClearBit) next to the inherited validity bitmap.
// NOTE(review): listing lines 517 and 530 are absent. The page index gives line 530
// as `Status AppendNull()`; line 517 is presumably the pool-only constructor.
515 class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
516  public:
518 
519  explicit BooleanBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
520 
521  using ArrayBuilder::Advance;
522 
  // Write nulls as uint8_t* (0 value indicates null): reserves `length` slots and
  // appends validity information only; no value bit is written here.
524  Status AppendNulls(const uint8_t* valid_bytes, int64_t length) {
525  RETURN_NOT_OK(Reserve(length));
526  UnsafeAppendToBitmap(valid_bytes, length);
527  return Status::OK();
528  }
529 
  // Body of AppendNull(): reserves one slot and records it as null.
531  RETURN_NOT_OK(Reserve(1));
532  UnsafeAppendToBitmap(false);
533  return Status::OK();
534  }
535 
536  std::shared_ptr<Buffer> data() const { return data_; }
537 
  // Scalar append: marks slot `length_` valid, then explicitly sets or clears the
  // value bit (the bit is cleared for false rather than assumed to be zero).
539  Status Append(const bool val) {
540  RETURN_NOT_OK(Reserve(1));
541  BitUtil::SetBit(null_bitmap_data_, length_);
542  if (val) {
543  BitUtil::SetBit(raw_data_, length_);
544  } else {
545  BitUtil::ClearBit(raw_data_, length_);
546  }
547  ++length_;
548  return Status::OK();
549  }
550 
  // Byte convenience overload: any non-zero byte is appended as true.
551  Status Append(const uint8_t val) { return Append(val != 0); }
552 
  // Bulk appends from byte arrays / vectors; valid_bytes / is_valid carry
  // per-element validity. Definitions live outside this header.
559  Status Append(const uint8_t* values, int64_t length,
560  const uint8_t* valid_bytes = NULLPTR);
561 
568  Status Append(const uint8_t* values, int64_t length, const std::vector<bool>& is_valid);
569 
575  Status Append(const std::vector<uint8_t>& values, const std::vector<bool>& is_valid);
576 
580  Status Append(const std::vector<uint8_t>& values);
581 
587  Status Append(const std::vector<bool>& values, const std::vector<bool>& is_valid);
588 
592  Status Append(const std::vector<bool>& values);
593 
594  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
595  Status Init(int64_t capacity) override;
596 
599  Status Resize(int64_t capacity) override;
600 
601  protected:
602  std::shared_ptr<PoolBuffer> data_;
603  uint8_t* raw_data_;
604 };
605 
606 // ----------------------------------------------------------------------
607 // List builder
608 
// Builder for list arrays. It takes ownership of a child builder for the list
// values (passed as unique_ptr) and tracks list boundaries as int32 offsets.
// NOTE(review): listing line 651 is absent; the page index gives it as
// `TypedBufferBuilder<int32_t> offsets_builder_`.
622 class ARROW_EXPORT ListBuilder : public ArrayBuilder {
623  public:
  // `type` is optional and defaults to NULLPTR.
626  ListBuilder(MemoryPool* pool, std::unique_ptr<ArrayBuilder> value_builder,
627  const std::shared_ptr<DataType>& type = NULLPTR);
628 
629  Status Init(int64_t elements) override;
630  Status Resize(int64_t capacity) override;
631  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
632 
  // Bulk append of `length` offsets with optional per-element validity bytes.
637  Status Append(const int32_t* offsets, int64_t length,
638  const uint8_t* valid_bytes = NULLPTR);
639 
  // Appends a single list slot; is_valid defaults to true.
644  Status Append(bool is_valid = true);
645 
  // Shorthand for Append(false).
646  Status AppendNull() { return Append(false); }
647 
648  ArrayBuilder* value_builder() const;
649 
650  protected:
652  std::unique_ptr<ArrayBuilder> value_builder_;
653  std::shared_ptr<Array> values_;
654 
655  Status AppendNextOffset();
656 
657  void Reset();
658 };
659 
660 // ----------------------------------------------------------------------
661 // Binary and String
662 
// Builder class for variable-length binary data.
// NOTE(review): listing lines 667, 696 and 697 are absent. The page index gives 696
// and 697 as `TypedBufferBuilder<int32_t> offsets_builder_` and
// `TypedBufferBuilder<uint8_t> value_data_builder_`; 667 is presumably the
// pool-only constructor.
665 class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {
666  public:
668 
669  BinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
670 
  // Appends `length` bytes starting at `value`. Defined outside this header.
671  Status Append(const uint8_t* value, int32_t length);
672 
  // char* convenience overload: reinterprets the bytes and forwards.
673  Status Append(const char* value, int32_t length) {
674  return Append(reinterpret_cast<const uint8_t*>(value), length);
675  }
676 
  // std::string convenience overload.
  // NOTE(review): value.size() is narrowed to int32_t without a range check.
677  Status Append(const std::string& value) {
678  return Append(value.c_str(), static_cast<int32_t>(value.size()));
679  }
680 
681  Status AppendNull();
682 
683  Status Init(int64_t elements) override;
684  Status Resize(int64_t capacity) override;
685  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
686 
  // Length reported by the value-data builder.
688  int64_t value_data_length() const { return value_data_builder_.length(); }
689 
  // Returns a pointer to the i-th value and writes its length to *out_length.
693  const uint8_t* GetValue(int64_t i, int32_t* out_length) const;
694 
695  protected:
698 
  // Capacity ceiling: INT32_MAX - 1.
699  static constexpr int64_t kMaximumCapacity = std::numeric_limits<int32_t>::max() - 1;
700 
701  Status AppendNextOffset();
702  void Reset();
703 };
704 
// Builder class for UTF8 strings. Reuses every BinaryBuilder::Append overload and
// adds a bulk append for a vector of strings.
// NOTE(review): listing lines 709-710 (presumably the constructors) are absent from
// this page.
707 class ARROW_EXPORT StringBuilder : public BinaryBuilder {
708  public:
711 
  // Keep the inherited overloads visible next to the one declared below.
712  using BinaryBuilder::Append;
713 
  // Bulk append; the meaning of null_bytes is defined by the out-of-line
  // implementation (presumably one flag byte per value — confirm).
714  Status Append(const std::vector<std::string>& values, uint8_t* null_bytes);
715 };
716 
717 // ----------------------------------------------------------------------
718 // FixedSizeBinaryBuilder
719 
// Builder for binary values that all share one byte width (byte_width_).
// NOTE(review): listing lines 723 and 753 are absent. Line 723 is the rest of the
// constructor's parameter list; the page index gives 753 as
// `BufferBuilder byte_builder_`.
720 class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
721  public:
722  FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
724 
  // Appends one value read from `value`. Defined outside this header.
725  Status Append(const uint8_t* value);
726 
  // Appends one value held in a std::array: reserves a slot, marks it valid and
  // hands the array to byte_builder_.
  // NOTE(review): nothing visible here compares NBYTES with byte_width_.
727  template <size_t NBYTES>
728  Status Append(const std::array<uint8_t, NBYTES>& value) {
729  RETURN_NOT_OK(Reserve(1));
730  UnsafeAppendToBitmap(true);
731  return byte_builder_.Append(value);
732  }
733 
  // Bulk append of `length` values with optional validity bytes.
734  Status Append(const uint8_t* data, int64_t length,
735  const uint8_t* valid_bytes = NULLPTR);
736  Status Append(const std::string& value);
737  Status AppendNull();
738 
739  Status Init(int64_t elements) override;
740  Status Resize(int64_t capacity) override;
741  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
742 
  // Length reported by byte_builder_.
744  int64_t value_data_length() const { return byte_builder_.length(); }
745 
  // Returns a pointer to the i-th value.
749  const uint8_t* GetValue(int64_t i) const;
750 
751  protected:
752  int32_t byte_width_;
754 };
755 
// Builder for Decimal128 values, layered on FixedSizeBinaryBuilder.
// NOTE(review): listing lines 759 and 761 are absent; 759 is the rest of the
// constructor's parameter list, so the declaration below is incomplete as shown.
756 class ARROW_EXPORT DecimalBuilder : public FixedSizeBinaryBuilder {
757  public:
758  explicit DecimalBuilder(const std::shared_ptr<DataType>& type,
760 
762 
  // Appends one decimal value. Defined outside this header.
763  Status Append(const Decimal128& val);
764 
765  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
766 };
767 
768 // ----------------------------------------------------------------------
769 // Struct
770 
771 // ---------------------------------------------------------------------------------
772 // StructArray builder
776 class ARROW_EXPORT StructBuilder : public ArrayBuilder {
777  public:
778  StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
779  std::vector<std::unique_ptr<ArrayBuilder>>&& field_builders);
780 
781  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
782 
787  Status Append(int64_t length, const uint8_t* valid_bytes) {
788  RETURN_NOT_OK(Reserve(length));
789  UnsafeAppendToBitmap(valid_bytes, length);
790  return Status::OK();
791  }
792 
795  Status Append(bool is_valid = true) {
796  RETURN_NOT_OK(Reserve(1));
797  UnsafeAppendToBitmap(is_valid);
798  return Status::OK();
799  }
800 
801  Status AppendNull() { return Append(false); }
802 
803  ArrayBuilder* field_builder(int i) const { return field_builders_[i].get(); }
804 
805  int num_fields() const { return static_cast<int>(field_builders_.size()); }
806 
807  protected:
808  std::vector<std::unique_ptr<ArrayBuilder>> field_builders_;
809 };
810 
811 // ----------------------------------------------------------------------
812 // Dictionary builder
813 
814 // Based on Apache Parquet-cpp's DictEncoder
815 
// Initial hash table size: 2^10 = 1024 elements.
static constexpr int kInitialHashTableSize = 1024;

// Integer type of a hash slot, and the marker value for an empty slot (the largest
// int32_t).
using hash_slot_t = int32_t;
static constexpr hash_slot_t kHashSlotEmpty = std::numeric_limits<hash_slot_t>::max();

// The maximum load factor for the hash table before resizing.
static constexpr double kMaxHashTableLoad = 0.7;
824 
825 namespace internal {
826 
// TODO(ARROW-1176): Use Tensorflow's StringPiece instead of this here.
// Non-owning (pointer, length) pair describing a run of bytes. The struct only
// stores the pointer; the caller's buffer must outlive it.
struct WrappedBinary {
  const uint8_t* ptr_;
  int32_t length_;

  WrappedBinary(const uint8_t* ptr, int32_t length) : ptr_{ptr}, length_{length} {}
};
834 
835 template <typename T>
836 struct DictionaryScalar {
837  using type = typename T::c_type;
838 };
839 
840 template <>
841 struct DictionaryScalar<BinaryType> {
842  using type = WrappedBinary;
843 };
844 
845 template <>
846 struct DictionaryScalar<StringType> {
847  using type = WrappedBinary;
848 };
849 
850 template <>
851 struct DictionaryScalar<FixedSizeBinaryType> {
852  using type = uint8_t const*;
853 };
854 
855 } // namespace internal
856 
// Hash-table based builder; the page index describes it as "Array builder that only
// adds elements if they already exist" (NOTE(review): that wording looks inverted
// for a uniqueness builder — confirm against the implementation).
// NOTE(review): listing lines 890-891, 895 and 897 are absent. The page index gives
// 891 as `int hash_table_size_` ("Size of the table. Must be a power of 2."); the
// comment kept below refers to a mod_bitmask_ member declared on a missing line.
858 template <typename T>
859 class ARROW_EXPORT UniqueBuilder : public ArrayBuilder {
860  public:
  // Scalar representation accepted by Append for this T.
861  using Scalar = typename internal::DictionaryScalar<T>::type;
862 
863  UniqueBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
  // Pool-only constructor, enabled via enable_if only when
  // TypeTraits<T1>::is_parameter_free holds; delegates with the type singleton.
864  template <typename T1 = T>
865  explicit UniqueBuilder(
866  typename std::enable_if<TypeTraits<T1>::is_parameter_free, MemoryPool*>::type pool)
867  : UniqueBuilder<T1>(TypeTraits<T1>::type_singleton(), pool) {}
868 
  // Append a scalar value; the second overload additionally takes an int32 `index`
  // out-parameter. Definitions live outside this header.
870  Status Append(const Scalar& value);
872  Status Append(const Scalar& value, int32_t* index);
874  Status AppendArray(const Array& array);
875 
876  Status Init(int64_t elements) override;
877  Status Resize(int64_t capacity) override;
878  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
879 
880  protected:
  // Hash-table helpers; all defined outside this header.
881  Status DoubleTableSize();
882  Scalar GetDictionaryValue(int64_t index);
883  int HashValue(const Scalar& value);
884  bool SlotDifferent(hash_slot_t slot, const Scalar& value);
885  Status AppendDictionary(const Scalar& value);
886 
  // Hash table storage and a raw int32 pointer for slot access.
887  std::shared_ptr<PoolBuffer> hash_table_;
888  int32_t* hash_slots_;
889 
892 
893  // Store hash_table_size_ - 1, so that j & mod_bitmask_ is equivalent to j %
894  // hash_table_size_, but uses far fewer CPU cycles
896 
898  int32_t byte_width_;
899 };
900 
// UniqueBuilder<BinaryType> with convenience Append overloads for raw byte
// pointers, char pointers and std::string. Each overload wraps its argument in an
// internal::WrappedBinary view (no copy is made here) and forwards to the inherited
// Append; the variants taking `index` pass it through unchanged.
// NOTE(review): listing line 904 is absent from this page.
901 class ARROW_EXPORT BinaryUniqueBuilder : public UniqueBuilder<BinaryType> {
902  public:
  // Keep the inherited Scalar overloads visible alongside the ones declared here.
903  using UniqueBuilder::Append;
905 
906  Status Append(const uint8_t* value, int32_t length) {
907  return Append(internal::WrappedBinary(value, length));
908  }
909 
910  Status Append(const uint8_t* value, int32_t length, int32_t* index) {
911  return Append(internal::WrappedBinary(value, length), index);
912  }
913 
914  Status Append(const char* value, int32_t length) {
915  return Append(
916  internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value), length));
917  }
918 
919  Status Append(const char* value, int32_t length, int32_t* index) {
920  return Append(
921  internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value), length), index);
922  }
923 
  // NOTE(review): value.size() is narrowed to int32_t without a range check in both
  // std::string overloads.
924  Status Append(const std::string& value) {
925  return Append(internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value.c_str()),
926  static_cast<int32_t>(value.size())));
927  }
928 
929  Status Append(const std::string& value, int32_t* index) {
930  return Append(internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value.c_str()),
931  static_cast<int32_t>(value.size())),
932  index);
933  }
934 };
935 
// UniqueBuilder<StringType> with convenience Append overloads for raw byte
// pointers, char pointers and std::string. Each overload wraps its argument in an
// internal::WrappedBinary view (no copy is made here) and forwards to the inherited
// Append; the variants taking `index` pass it through unchanged.
// NOTE(review): listing line 940 is absent from this page.
937 class ARROW_EXPORT StringUniqueBuilder : public UniqueBuilder<StringType> {
938  public:
  // Keep the inherited Scalar overloads visible alongside the ones declared here.
939  using UniqueBuilder::Append;
941 
942  Status Append(const uint8_t* value, int32_t length) {
943  return Append(internal::WrappedBinary(value, length));
944  }
945 
946  Status Append(const uint8_t* value, int32_t length, int32_t* index) {
947  return Append(internal::WrappedBinary(value, length), index);
948  }
949 
950  Status Append(const char* value, int32_t length) {
951  return Append(
952  internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value), length));
953  }
954 
955  Status Append(const char* value, int32_t length, int32_t* index) {
956  return Append(
957  internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value), length), index);
958  }
959 
  // NOTE(review): value.size() is narrowed to int32_t without a range check in both
  // std::string overloads.
960  Status Append(const std::string& value) {
961  return Append(internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value.c_str()),
962  static_cast<int32_t>(value.size())));
963  }
964 
965  Status Append(const std::string& value, int32_t* index) {
966  return Append(internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value.c_str()),
967  static_cast<int32_t>(value.size())),
968  index);
969  }
970 };
971 
// Array builder for creating an encoded DictionaryArray from dense array data.
// NOTE(review): listing lines 979, 984, 988, 991, 994 and 1002-1003 are absent.
// Line 984 is the head of the pool-only constructor (the page index shows it as
// `DictionaryBuilder(typename std::enable_if<...>::type pool)`), and the index gives
// 1003 as `AdaptiveIntBuilder values_builder_`.
974 template <typename T>
975 class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {
976  public:
  // Scalar representation accepted by Append for this T.
977  using Scalar = typename internal::DictionaryScalar<T>::type;
978 
980 
981  DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
982 
  // Remainder of the pool-only constructor: enabled via enable_if only when
  // TypeTraits<T1>::is_parameter_free holds; delegates with the type singleton.
983  template <typename T1 = T>
985  typename std::enable_if<TypeTraits<T1>::is_parameter_free, MemoryPool*>::type pool)
986  : DictionaryBuilder<T1>(TypeTraits<T1>::type_singleton(), pool) {}
987 
  // Append a scalar value.
989  Status Append(const Scalar& value);
990 
992  Status AppendNull();
993 
995  Status AppendArray(const Array& array);
996 
997  Status Init(int64_t elements) override;
998  Status Resize(int64_t capacity) override;
999  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
1000 
1001  protected:
1004 };
1005 
// Full specialization of DictionaryBuilder for NullType. Unlike the primary
// template it declares no Append(const Scalar&) and has a user-declared destructor
// plus a non-template pool-only constructor.
// NOTE(review): listing line 1025 is absent; the page index gives it as
// `AdaptiveIntBuilder values_builder_`.
1006 template <>
1007 class ARROW_EXPORT DictionaryBuilder<NullType> : public ArrayBuilder {
1008  public:
1009  ~DictionaryBuilder();
1010 
1011  DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
1012  explicit DictionaryBuilder(MemoryPool* pool);
1013 
1015  Status AppendNull();
1016 
1018  Status AppendArray(const Array& array);
1019 
1020  Status Init(int64_t elements) override;
1021  Status Resize(int64_t capacity) override;
1022  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
1023 
1024  protected:
1026 };
1027 
// DictionaryBuilder<BinaryType> with convenience Append overloads for raw byte
// pointers, char pointers and std::string. Each overload wraps its argument in an
// internal::WrappedBinary view and forwards to Append(const Scalar&).
// NOTE(review): listing lines 1030-1031 are absent. The unqualified Append(...)
// calls below need the base-class overload to be visible, so those lines presumably
// hold the corresponding using-declarations — confirm against the real header.
1028 class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder<BinaryType> {
1029  public:
1032 
1033  Status Append(const uint8_t* value, int32_t length) {
1034  return Append(internal::WrappedBinary(value, length));
1035  }
1036 
1037  Status Append(const char* value, int32_t length) {
1038  return Append(
1039  internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value), length));
1040  }
1041 
  // NOTE(review): value.size() is narrowed to int32_t without a range check.
1042  Status Append(const std::string& value) {
1043  return Append(internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value.c_str()),
1044  static_cast<int32_t>(value.size())));
1045  }
1046 };
1047 
// Dictionary array builder with convenience methods for strings. Each overload
// wraps its argument in an internal::WrappedBinary view and forwards to
// Append(const Scalar&).
// NOTE(review): listing lines 1051-1052 are absent. The unqualified Append(...)
// calls below need the base-class overload to be visible, so those lines presumably
// hold the corresponding using-declarations — confirm against the real header.
1049 class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder<StringType> {
1050  public:
1053 
1054  Status Append(const uint8_t* value, int32_t length) {
1055  return Append(internal::WrappedBinary(value, length));
1056  }
1057 
1058  Status Append(const char* value, int32_t length) {
1059  return Append(
1060  internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value), length));
1061  }
1062 
  // NOTE(review): value.size() is narrowed to int32_t without a range check.
1063  Status Append(const std::string& value) {
1064  return Append(internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value.c_str()),
1065  static_cast<int32_t>(value.size())));
1066  }
1067 };
1068 
1069 // ----------------------------------------------------------------------
1070 // Helper functions
1071 
// Creates a builder for `type` using `pool` and returns it through `out` as a
// unique_ptr (the caller becomes the owner).
1072 Status ARROW_EXPORT MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
1073  std::unique_ptr<ArrayBuilder>* out);
1074 
// Dictionary-builder counterpart of MakeBuilder; note that `out` is a shared_ptr
// here rather than a unique_ptr.
1075 Status ARROW_EXPORT MakeDictionaryBuilder(MemoryPool* pool,
1076  const std::shared_ptr<DataType>& type,
1077  std::shared_ptr<ArrayBuilder>* out);
1078 
// Convert Array to encoded DictionaryArray form. The result is written to `out`.
1084 Status ARROW_EXPORT EncodeArrayToDictionary(const Array& input, MemoryPool* pool,
1085  std::shared_ptr<Array>* out);
1086 
// Convert a Column's data internally to DictionaryArray. The result is written to
// `out`.
1092 Status ARROW_EXPORT EncodeColumnToDictionary(const Column& input, MemoryPool* pool,
1093  std::shared_ptr<Column>* out);
1094 } // namespace arrow
1095 
1096 #endif // ARROW_BUILDER_H
std::shared_ptr< PoolBuffer > null_bitmap_
Definition: builder.h:133
typename T ::c_type value_type
Definition: builder.h:188
std::vector< std::unique_ptr< ArrayBuilder > > children_
Definition: builder.h:142
Builder class for UTF8 strings.
Definition: builder.h:707
void UnsafeAppendToBitmap(bool is_valid)
Definition: builder.h:149
Status Append(const uint8_t *value, int32_t length, int32_t *index)
Definition: builder.h:946
Array builder for creating an encoded DictionaryArray from dense array data.
Definition: builder.h:975
NullBuilder(MemoryPool *pool ARROW_MEMORY_POOL_DEFAULT)
Definition: builder.h:173
Status Append(const uint8_t *value, int32_t length)
Definition: builder.h:906
Status Append(const Scalar &value)
Append a scalar value.
Builder class for variable-length binary data.
Definition: builder.h:665
AdaptiveIntBuilder values_builder_
Definition: builder.h:1003
Dictionary array builder with convenience methods for strings.
Definition: builder.h:1049
int32_t * hash_slots_
Definition: builder.h:888
Array builder that only adds elements if they already exist.
Definition: builder.h:859
#define ARROW_MEMORY_POOL_DEFAULT
Definition: memory_pool.h:94
Status Append(const uint8_t *value, int32_t length)
Definition: builder.h:1033
Status Append(const uint8_t *value, int32_t length)
Status Append(const char *value, int32_t length)
Definition: builder.h:914
TypedBufferBuilder< uint8_t > value_data_builder_
Definition: builder.h:697
UniqueBuilder(const std::shared_ptr< DataType > &type, MemoryPool *pool)
int num_children() const
Definition: builder.h:74
DictionaryBuilder(typename std::enable_if< TypeTraits< T1 >::is_parameter_free, MemoryPool *>::type pool)
Definition: builder.h:984
DictionaryBuilder(const std::shared_ptr< DataType > &type, MemoryPool *pool)
Status MakeBuilder(MemoryPool *pool, const std::shared_ptr< DataType > &type, std::unique_ptr< ArrayBuilder > *out)
Status AppendNull()
Definition: builder.h:530
Definition: builder.h:391
Definition: builder.h:186
int64_t null_count_
Definition: builder.h:134
value_type * raw_data_
Definition: builder.h:249
Status Append(bool is_valid=true)
Append an element to the Struct.
Definition: builder.h:795
Status Append(const uint64_t val)
Scalar append.
Definition: builder.h:398
#define NULLPTR
Definition: macros.h:69
Status Append(const uint8_t *value, int32_t length, int32_t *index)
Definition: builder.h:910
Status Append(const uint8_t *value, int32_t length)
Definition: builder.h:942
Status AppendNulls(const uint8_t *valid_bytes, int64_t length)
Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory.
Definition: builder.h:196
std::vector< std::unique_ptr< ArrayBuilder > > field_builders_
Definition: builder.h:808
int64_t length() const
Definition: builder.h:76
int64_t length_
Definition: builder.h:138
An immutable column data structure consisting of a field (type metadata) and a logical chunked data array.
Definition: table.h:70
std::shared_ptr< Array > values_
Definition: builder.h:653
Status Append(const std::string &value)
Definition: builder.h:1063
int32_t byte_width_
Definition: builder.h:752
ArrayBuilder(const std::shared_ptr< DataType > &type, MemoryPool *pool)
Definition: builder.h:59
Base class for all Builders that emit an Array of a scalar numerical type.
Definition: builder.h:254
int64_t null_count() const
Definition: builder.h:77
Definition: builder.h:756
Base class for all data array builders.
Definition: builder.h:57
Status AppendNull()
Definition: builder.h:801
Definition: status.h:106
Status Append(const char *value, int32_t length)
Definition: builder.h:1058
std::shared_ptr< Buffer > data() const
Definition: builder.h:536
Definition: builder.h:515
Definition: type_traits.h:29
PrimitiveBuilder(const std::shared_ptr< DataType > &type, MemoryPool *pool)
Definition: builder.h:190
Status Reserve(int64_t elements)
Ensures there is enough space for adding the number of elements by checking capacity and calling Resize.
std::shared_ptr< PoolBuffer > data_
Definition: builder.h:602
Definition: builder.h:1028
ArrayBuilder()
Definition: builder.h:127
Status Append(const std::string &value)
Definition: builder.h:960
int num_fields() const
Definition: builder.h:805
Status Append(const std::array< uint8_t, NBYTES > &value)
Definition: builder.h:728
Status EncodeColumnToDictionary(const Column &input, MemoryPool *pool, std::shared_ptr< Column > *out)
Convert a Column's data internally to DictionaryArray.
std::shared_ptr< PoolBuffer > data_
Definition: builder.h:248
Status Append(const char *value, int32_t length, int32_t *index)
Definition: builder.h:955
Definition: type.h:312
#define RETURN_NOT_OK(s)
Definition: status.h:66
static Status OK()
Definition: status.h:119
ArrayBuilder * child(int i)
For nested types.
Definition: builder.h:72
std::shared_ptr< DataType > null()
Status Append(const Scalar &value)
Append a scalar value.
int32_t hash_slot_t
Definition: builder.h:819
Definition: type.h:433
Status Append(const std::string &value)
Definition: builder.h:677
Status AppendNull()
Definition: builder.h:646
Definition: builder.h:453
Status EncodeArrayToDictionary(const Array &input, MemoryPool *pool, std::shared_ptr< Array > *out)
Convert Array to encoded DictionaryArray form.
Definition: type.h:474
Status Append(const std::string &value)
Definition: builder.h:1042
int64_t capacity() const
Definition: builder.h:78
uint8_t * null_bitmap_data_
Definition: builder.h:135
ArrayBuilder * field_builder(int i) const
Definition: builder.h:803
Status Append(const std::string &value)
Definition: builder.h:924
Definition: builder.h:720
TypedBufferBuilder< int32_t > offsets_builder_
Definition: builder.h:696
std::shared_ptr< PoolBuffer > hash_table_
Definition: builder.h:887
Status Append(const std::string &value, int32_t *index)
Definition: builder.h:965
AdaptiveIntBuilder values_builder_
Definition: builder.h:1025
int hash_table_size_
Size of the table. Must be a power of 2.
Definition: builder.h:891
std::shared_ptr< DataType > type_
Definition: builder.h:129
Status Append(const int64_t val)
Scalar append.
Definition: builder.h:460
BufferBuilder byte_builder_
Definition: builder.h:753
Status AppendNull()
Definition: builder.h:176
Top-level namespace for Apache Arrow C++ API.
Definition: allocator.h:29
Array base type. Immutable data array with some logical type and some length.
Definition: array.h:180
TypedBufferBuilder< int32_t > offsets_builder_
Definition: builder.h:651
Status Append(const std::string &value, int32_t *index)
Definition: builder.h:929
std::shared_ptr< Buffer > data() const
Definition: builder.h:208
int64_t value_data_length() const
Definition: builder.h:688
Append, Resize and Reserve methods are acting on StructBuilder.
Definition: builder.h:776
int64_t capacity_
Definition: builder.h:139
Status Append(const uint8_t val)
Definition: builder.h:551
BinaryBuilder(MemoryPool *pool ARROW_MEMORY_POOL_DEFAULT)
NumericBuilder(typename std::enable_if< TypeTraits< T1 >::is_parameter_free, MemoryPool *>::type pool ARROW_MEMORY_POOL_DEFAULT)
Definition: builder.h:260
Status Append(const char *value, int32_t length)
Definition: builder.h:1037
Status AppendNulls(const uint8_t *valid_bytes, int64_t length)
Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory.
Definition: builder.h:524
Status Append(const value_type val)
Append a single scalar and increase the size if necessary.
Definition: builder.h:271
Status Append(const char *value, int32_t length)
Definition: builder.h:673
Status Append(const uint8_t *value)
Status Advance(int64_t elements)
For cases where raw data was memcpy'd into the internal buffers, allows us to advance the length of t...
int mod_bitmask_
Definition: builder.h:895
std::shared_ptr< DataType > type() const
Definition: builder.h:124
MemoryPool * pool_
Definition: builder.h:130
Status Append(const char *value, int32_t length, int32_t *index)
Definition: builder.h:919
static Status NotImplemented(const std::string &msg)
Definition: status.h:138
UniqueBuilder< T > unique_builder_
Definition: builder.h:1002
uint8_t * raw_data_
Definition: builder.h:603
void UnsafeAppend(const value_type val)
Append a single scalar under the assumption that the underlying Buffer is large enough.
Definition: builder.h:282
Status MakeDictionaryBuilder(MemoryPool *pool, const std::shared_ptr< DataType > &type, std::shared_ptr< ArrayBuilder > *out)
std::unique_ptr< ArrayBuilder > value_builder_
Definition: builder.h:652
Definition: buffer.h:193
Base class for memory allocation.
Definition: memory_pool.h:34
Status Append(const uint8_t *value, int32_t length)
Definition: builder.h:1054
typename internal::DictionaryScalar< BinaryType >::type Scalar
Definition: builder.h:977
UniqueBuilder(typename std::enable_if< TypeTraits< T1 >::is_parameter_free, MemoryPool *>::type pool)
Definition: builder.h:865
std::shared_ptr< PoolBuffer > null_bitmap() const
Definition: builder.h:109
int32_t byte_width_
Definition: builder.h:898
int64_t value_data_length() const
Definition: builder.h:744
Definition: builder.h:171
#define ARROW_DISALLOW_COPY_AND_ASSIGN(TypeName)
Definition: macros.h:23
TypeTraits< T >::BuilderType dict_builder_
Definition: builder.h:897
~DictionaryBuilder()
Definition: builder.h:979
Unique array builder with convenience methods for strings.
Definition: builder.h:937
Status Append(int64_t length, const uint8_t *valid_bytes)
Null bitmap is of equal length to every child field, and any zero byte will be considered as a null f...
Definition: builder.h:787
Represents a signed 128-bit integer in two's complement.
Definition: decimal.h:39
typename internal::DictionaryScalar< BinaryType >::type Scalar
Definition: builder.h:861
Definition: builder.h:901
Status Append(const bool val)
Scalar append.
Definition: builder.h:539
Status AppendNull()
Definition: builder.h:202
Status Append(const char *value, int32_t length)
Definition: builder.h:950
Builder class for variable-length list array value types.
Definition: builder.h:622
Definition: type.h:451