Apache Arrow (C++)
A columnar in-memory analytics layer designed to accelerate big data.
builder.h
Go to the documentation of this file.
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #ifndef ARROW_BUILDER_H
19 #define ARROW_BUILDER_H
20 
21 #include <array>
22 #include <cstdint>
23 #include <functional>
24 #include <limits>
25 #include <memory>
26 #include <string>
27 #include <vector>
28 
29 #include "arrow/buffer.h"
30 #include "arrow/memory_pool.h"
31 #include "arrow/status.h"
32 #include "arrow/type.h"
33 #include "arrow/type_traits.h"
34 #include "arrow/util/bit-util.h"
35 #include "arrow/util/hash.h"
36 #include "arrow/util/macros.h"
37 #include "arrow/util/visibility.h"
38 
39 namespace arrow {
40 
41 class Array;
42 class Decimal128;
43 
// Both limits equal INT32_MAX - 1.
// NOTE(review): going by the names, the first caps the number of value bytes a
// binary builder may hold and the second the number of list elements -- confirm
// against the builder implementations.
constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max() - 1;
constexpr int64_t kListMaximumElements = std::numeric_limits<int32_t>::max() - 1;
46 
47 namespace internal {
48 
49 struct ArrayData;
50 
51 } // namespace internal
52 
// 1 << 5 == 32.
// NOTE(review): presumably the smallest capacity, in elements, that a builder
// allocates -- confirm against the Reserve/Resize implementations.
constexpr int64_t kMinBuilderCapacity = 1 << 5;
54 
56 //
60 class ARROW_EXPORT ArrayBuilder {
61  public:
62  explicit ArrayBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
63  : type_(type),
64  pool_(pool),
65  null_bitmap_(NULLPTR),
66  null_count_(0),
67  null_bitmap_data_(NULLPTR),
68  length_(0),
69  capacity_(0) {}
70 
71  virtual ~ArrayBuilder() = default;
72 
75  ArrayBuilder* child(int i) { return children_[i].get(); }
76 
77  int num_children() const { return static_cast<int>(children_.size()); }
78 
79  int64_t length() const { return length_; }
80  int64_t null_count() const { return null_count_; }
81  int64_t capacity() const { return capacity_; }
82 
84  Status AppendToBitmap(bool is_valid);
85 
88  Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length);
89 
91  Status SetNotNull(int64_t length);
92 
96  virtual Status Init(int64_t capacity);
97 
101  virtual Status Resize(int64_t new_bits);
102 
105  Status Reserve(int64_t elements);
106 
110  Status Advance(int64_t elements);
111 
112  std::shared_ptr<PoolBuffer> null_bitmap() const { return null_bitmap_; }
113 
119  virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0;
120 
126  Status Finish(std::shared_ptr<Array>* out);
127 
128  std::shared_ptr<DataType> type() const { return type_; }
129 
130  // Unsafe operations (don't check capacity/don't resize)
131 
132  // Append to null bitmap.
133  void UnsafeAppendToBitmap(bool is_valid) {
134  if (is_valid) {
135  BitUtil::SetBit(null_bitmap_data_, length_);
136  } else {
137  ++null_count_;
138  }
139  ++length_;
140  }
141 
142  protected:
144 
145  std::shared_ptr<DataType> type_;
147 
148  // When null_bitmap are first appended to the builder, the null bitmap is allocated
149  std::shared_ptr<PoolBuffer> null_bitmap_;
150  int64_t null_count_;
152 
153  // Array length, so far. Also, the index of the next element to be added
154  int64_t length_;
155  int64_t capacity_;
156 
157  // Child value array builders. These are owned by this class
158  std::vector<std::unique_ptr<ArrayBuilder>> children_;
159 
160  void Reset();
161 
162  // Vector append. Treat each zero byte as a nullzero. If valid_bytes is null
163  // assume all of length bits are valid.
164  void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length);
165 
166  void UnsafeAppendToBitmap(const std::vector<bool>& is_valid);
167 
168  // Set the next length bits to not null (i.e. valid).
169  void UnsafeSetNotNull(int64_t length);
170 
171  private:
173 };
174 
175 class ARROW_EXPORT NullBuilder : public ArrayBuilder {
176  public:
178  : ArrayBuilder(null(), pool) {}
179 
181  ++null_count_;
182  ++length_;
183  return Status::OK();
184  }
185 
186  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
187 };
188 
189 template <typename Type>
190 class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder {
191  public:
192  using value_type = typename Type::c_type;
193 
194  explicit PrimitiveBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
195  : ArrayBuilder(type, pool), data_(NULLPTR), raw_data_(NULLPTR) {}
196 
197  using ArrayBuilder::Advance;
198 
200  Status AppendNulls(const uint8_t* valid_bytes, int64_t length) {
201  RETURN_NOT_OK(Reserve(length));
202  UnsafeAppendToBitmap(valid_bytes, length);
203  return Status::OK();
204  }
205 
207  RETURN_NOT_OK(Reserve(1));
208  UnsafeAppendToBitmap(false);
209  return Status::OK();
210  }
211 
212  std::shared_ptr<Buffer> data() const { return data_; }
213 
220  Status AppendValues(const value_type* values, int64_t length,
221  const uint8_t* valid_bytes = NULLPTR);
223  ARROW_DEPRECATED("Use AppendValues instead")
224  Status Append(const value_type* values, int64_t length,
225  const uint8_t* valid_bytes = NULLPTR);
226 
233  Status AppendValues(const value_type* values, int64_t length,
234  const std::vector<bool>& is_valid);
236  ARROW_DEPRECATED("Use AppendValues instead")
237  Status Append(const value_type* values, int64_t length,
238  const std::vector<bool>& is_valid);
239 
245  Status AppendValues(const std::vector<value_type>& values,
246  const std::vector<bool>& is_valid);
248  ARROW_DEPRECATED("Use AppendValues instead")
249  Status Append(const std::vector<value_type>& values, const std::vector<bool>& is_valid);
250 
254  Status AppendValues(const std::vector<value_type>& values);
256  ARROW_DEPRECATED("Use AppendValues instead")
257  Status Append(const std::vector<value_type>& values);
258 
259  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
260  Status Init(int64_t capacity) override;
261 
264  Status Resize(int64_t capacity) override;
265 
266  protected:
267  std::shared_ptr<PoolBuffer> data_;
268  value_type* raw_data_;
269 };
270 
272 template <typename T>
273 class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder<T> {
274  public:
275  using typename PrimitiveBuilder<T>::value_type;
277 
278  template <typename T1 = T>
279  explicit NumericBuilder(
280  typename std::enable_if<TypeTraits<T1>::is_parameter_free, MemoryPool*>::type pool
282  : PrimitiveBuilder<T1>(TypeTraits<T1>::type_singleton(), pool) {}
283 
289 
291  Status Append(const value_type val) {
293  UnsafeAppend(val);
294  return Status::OK();
295  }
296 
302  void UnsafeAppend(const value_type val) {
303  BitUtil::SetBit(null_bitmap_data_, length_);
304  raw_data_[length_++] = val;
305  }
306 
307  protected:
311 };
312 
313 // Builders
314 
319 
329 
333 
334 namespace internal {
335 
336 class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder {
337  public:
338  explicit AdaptiveIntBuilderBase(MemoryPool* pool);
339 
341  Status AppendNulls(const uint8_t* valid_bytes, int64_t length) {
342  RETURN_NOT_OK(Reserve(length));
343  UnsafeAppendToBitmap(valid_bytes, length);
344  return Status::OK();
345  }
346 
347  Status AppendNull() {
348  RETURN_NOT_OK(Reserve(1));
349  UnsafeAppendToBitmap(false);
350  return Status::OK();
351  }
352 
353  std::shared_ptr<Buffer> data() const { return data_; }
354 
355  Status Init(int64_t capacity) override;
356 
359  Status Resize(int64_t capacity) override;
360 
361  protected:
362  std::shared_ptr<PoolBuffer> data_;
363  uint8_t* raw_data_;
364 
365  uint8_t int_size_;
366 };
367 
368 // Check if we would need to expand the underlying storage type
/// \brief Byte width a signed adaptive builder needs once `val` is stored.
///
/// For the supported widths (1, 2, 4, 8) the result is the larger of
/// current_int_size and the narrowest width whose signed range contains val,
/// so the width never shrinks.
///
/// \param[in] val value about to be appended
/// \param[in] current_int_size width in bytes currently in use
/// \return 1, 2, 4 or 8
inline uint8_t ExpandedIntSize(int64_t val, uint8_t current_int_size) {
  // Range checks for each narrower storage type, evaluated once up front.
  const bool outside_int32 =
      val > static_cast<int64_t>(std::numeric_limits<int32_t>::max()) ||
      val < static_cast<int64_t>(std::numeric_limits<int32_t>::min());
  const bool outside_int16 =
      val > static_cast<int64_t>(std::numeric_limits<int16_t>::max()) ||
      val < static_cast<int64_t>(std::numeric_limits<int16_t>::min());
  const bool outside_int8 =
      val > static_cast<int64_t>(std::numeric_limits<int8_t>::max()) ||
      val < static_cast<int64_t>(std::numeric_limits<int8_t>::min());

  // Widest first: the first test that fires decides the answer.
  if (current_int_size == 8 || (current_int_size < 8 && outside_int32)) {
    return 8;
  }
  if (current_int_size == 4 || (current_int_size < 4 && outside_int16)) {
    return 4;
  }
  if (current_int_size == 2 || (current_int_size == 1 && outside_int8)) {
    return 2;
  }
  return 1;
}
389 
390 // Check if we would need to expand the underlying storage type
/// \brief Byte width an unsigned adaptive builder needs once `val` is stored.
///
/// For the supported widths (1, 2, 4, 8) the result is the larger of
/// current_int_size and the narrowest width whose unsigned range contains val,
/// so the width never shrinks.
///
/// \param[in] val value about to be appended
/// \param[in] current_int_size width in bytes currently in use
/// \return 1, 2, 4 or 8
inline uint8_t ExpandedUIntSize(uint64_t val, uint8_t current_int_size) {
  // Range checks for each narrower storage type, evaluated once up front.
  const bool above_uint32 =
      val > static_cast<uint64_t>(std::numeric_limits<uint32_t>::max());
  const bool above_uint16 =
      val > static_cast<uint64_t>(std::numeric_limits<uint16_t>::max());
  const bool above_uint8 =
      val > static_cast<uint64_t>(std::numeric_limits<uint8_t>::max());

  // Widest first: the first test that fires decides the answer.
  if (current_int_size == 8 || (current_int_size < 8 && above_uint32)) {
    return 8;
  }
  if (current_int_size == 4 || (current_int_size < 4 && above_uint16)) {
    return 4;
  }
  if (current_int_size == 2 || (current_int_size == 1 && above_uint8)) {
    return 2;
  }
  return 1;
}
408 
409 } // namespace internal
410 
411 class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase {
412  public:
414 
415  using ArrayBuilder::Advance;
416 
418  Status Append(const uint64_t val) {
419  RETURN_NOT_OK(Reserve(1));
420  BitUtil::SetBit(null_bitmap_data_, length_);
421 
422  uint8_t new_int_size = internal::ExpandedUIntSize(val, int_size_);
423  if (new_int_size != int_size_) {
424  RETURN_NOT_OK(ExpandIntSize(new_int_size));
425  }
426 
427  switch (int_size_) {
428  case 1:
429  reinterpret_cast<uint8_t*>(raw_data_)[length_++] = static_cast<uint8_t>(val);
430  break;
431  case 2:
432  reinterpret_cast<uint16_t*>(raw_data_)[length_++] = static_cast<uint16_t>(val);
433  break;
434  case 4:
435  reinterpret_cast<uint32_t*>(raw_data_)[length_++] = static_cast<uint32_t>(val);
436  break;
437  case 8:
438  reinterpret_cast<uint64_t*>(raw_data_)[length_++] = val;
439  break;
440  default:
441  return Status::NotImplemented("This code shall never be reached");
442  }
443  return Status::OK();
444  }
445 
452  Status AppendValues(const uint64_t* values, int64_t length,
453  const uint8_t* valid_bytes = NULLPTR);
455  ARROW_DEPRECATED("Use AppendValues instead")
456  Status Append(const uint64_t* values, int64_t length,
457  const uint8_t* valid_bytes = NULLPTR);
458 
459  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
460 
461  protected:
462  Status ExpandIntSize(uint8_t new_int_size);
463 
464  template <typename new_type, typename old_type>
465  typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
466  ExpandIntSizeInternal();
467 #define __LESS(a, b) (a) < (b)
468  template <typename new_type, typename old_type>
469  typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type
470  ExpandIntSizeInternal();
471 #undef __LESS
472 
473  template <typename new_type>
474  Status ExpandIntSizeN();
475 };
476 
477 class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase {
478  public:
480 
481  using ArrayBuilder::Advance;
482 
484  Status Append(const int64_t val) {
485  RETURN_NOT_OK(Reserve(1));
486  BitUtil::SetBit(null_bitmap_data_, length_);
487 
488  uint8_t new_int_size = internal::ExpandedIntSize(val, int_size_);
489  if (new_int_size != int_size_) {
490  RETURN_NOT_OK(ExpandIntSize(new_int_size));
491  }
492 
493  switch (int_size_) {
494  case 1:
495  reinterpret_cast<int8_t*>(raw_data_)[length_++] = static_cast<int8_t>(val);
496  break;
497  case 2:
498  reinterpret_cast<int16_t*>(raw_data_)[length_++] = static_cast<int16_t>(val);
499  break;
500  case 4:
501  reinterpret_cast<int32_t*>(raw_data_)[length_++] = static_cast<int32_t>(val);
502  break;
503  case 8:
504  reinterpret_cast<int64_t*>(raw_data_)[length_++] = val;
505  break;
506  default:
507  return Status::NotImplemented("This code shall never be reached");
508  }
509  return Status::OK();
510  }
511 
518  Status AppendValues(const int64_t* values, int64_t length,
519  const uint8_t* valid_bytes = NULLPTR);
521  ARROW_DEPRECATED("Use AppendValues instead")
522  Status Append(const int64_t* values, int64_t length,
523  const uint8_t* valid_bytes = NULLPTR);
524 
525  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
526 
527  protected:
528  Status ExpandIntSize(uint8_t new_int_size);
529 
530  template <typename new_type, typename old_type>
531  typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
532  ExpandIntSizeInternal();
533 #define __LESS(a, b) (a) < (b)
534  template <typename new_type, typename old_type>
535  typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type
536  ExpandIntSizeInternal();
537 #undef __LESS
538 
539  template <typename new_type>
540  Status ExpandIntSizeN();
541 };
542 
543 class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
544  public:
546 
547  explicit BooleanBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
548 
549  using ArrayBuilder::Advance;
550 
552  Status AppendNulls(const uint8_t* valid_bytes, int64_t length) {
553  RETURN_NOT_OK(Reserve(length));
554  UnsafeAppendToBitmap(valid_bytes, length);
555  return Status::OK();
556  }
557 
559  RETURN_NOT_OK(Reserve(1));
560  UnsafeAppendToBitmap(false);
561  return Status::OK();
562  }
563 
564  std::shared_ptr<Buffer> data() const { return data_; }
565 
567  Status Append(const bool val) {
568  RETURN_NOT_OK(Reserve(1));
569  BitUtil::SetBit(null_bitmap_data_, length_);
570  if (val) {
571  BitUtil::SetBit(raw_data_, length_);
572  } else {
573  BitUtil::ClearBit(raw_data_, length_);
574  }
575  ++length_;
576  return Status::OK();
577  }
578 
579  Status Append(const uint8_t val) { return Append(val != 0); }
580 
587  Status AppendValues(const uint8_t* values, int64_t length,
588  const uint8_t* valid_bytes = NULLPTR);
590  ARROW_DEPRECATED("Use AppendValues instead")
591  Status Append(const uint8_t* values, int64_t length,
592  const uint8_t* valid_bytes = NULLPTR);
593 
600  Status AppendValues(const uint8_t* values, int64_t length,
601  const std::vector<bool>& is_valid);
603  ARROW_DEPRECATED("Use AppendValues instead")
604  Status Append(const uint8_t* values, int64_t length, const std::vector<bool>& is_valid);
605 
611  Status AppendValues(const std::vector<uint8_t>& values,
612  const std::vector<bool>& is_valid);
614  ARROW_DEPRECATED("Use AppendValues instead")
615  Status Append(const std::vector<uint8_t>& values, const std::vector<bool>& is_valid);
616 
620  Status AppendValues(const std::vector<uint8_t>& values);
622  ARROW_DEPRECATED("Use AppendValues instead")
623  Status Append(const std::vector<uint8_t>& values);
624 
630  Status AppendValues(const std::vector<bool>& values, const std::vector<bool>& is_valid);
632  ARROW_DEPRECATED("Use AppendValues instead")
633  Status Append(const std::vector<bool>& values, const std::vector<bool>& is_valid);
634 
638  Status AppendValues(const std::vector<bool>& values);
640  ARROW_DEPRECATED("Use AppendValues instead")
641  Status Append(const std::vector<bool>& values);
642 
643  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
644  Status Init(int64_t capacity) override;
645 
648  Status Resize(int64_t capacity) override;
649 
650  protected:
651  std::shared_ptr<PoolBuffer> data_;
652  uint8_t* raw_data_;
653 };
654 
655 // ----------------------------------------------------------------------
656 // List builder
657 
671 class ARROW_EXPORT ListBuilder : public ArrayBuilder {
672  public:
675  ListBuilder(MemoryPool* pool, std::unique_ptr<ArrayBuilder> value_builder,
676  const std::shared_ptr<DataType>& type = NULLPTR);
677 
678  Status Init(int64_t elements) override;
679  Status Resize(int64_t capacity) override;
680  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
681 
686  Status AppendValues(const int32_t* offsets, int64_t length,
687  const uint8_t* valid_bytes = NULLPTR);
689  ARROW_DEPRECATED("Use AppendValues instead")
690  Status Append(const int32_t* offsets, int64_t length,
691  const uint8_t* valid_bytes = NULLPTR);
692 
697  Status Append(bool is_valid = true);
698 
699  Status AppendNull() { return Append(false); }
700 
701  ArrayBuilder* value_builder() const;
702 
703  protected:
705  std::unique_ptr<ArrayBuilder> value_builder_;
706  std::shared_ptr<Array> values_;
707 
708  Status AppendNextOffset();
709 
710  void Reset();
711 };
712 
713 // ----------------------------------------------------------------------
714 // Binary and String
715 
718 class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {
719  public:
721 
722  BinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
723 
724  Status Append(const uint8_t* value, int32_t length);
725 
726  Status Append(const char* value, int32_t length) {
727  return Append(reinterpret_cast<const uint8_t*>(value), length);
728  }
729 
730  Status Append(const std::string& value) {
731  return Append(value.c_str(), static_cast<int32_t>(value.size()));
732  }
733 
734  Status AppendNull();
735 
736  Status Init(int64_t elements) override;
737  Status Resize(int64_t capacity) override;
740  Status ReserveData(int64_t elements);
741  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
742 
744  int64_t value_data_length() const { return value_data_builder_.length(); }
746  int64_t value_data_capacity() const { return value_data_builder_.capacity(); }
747 
751  const uint8_t* GetValue(int64_t i, int32_t* out_length) const;
752 
753  protected:
756 
757  Status AppendNextOffset();
758  void Reset();
759 };
760 
763 class ARROW_EXPORT StringBuilder : public BinaryBuilder {
764  public:
767 
768  using BinaryBuilder::Append;
769 
776  Status AppendValues(const std::vector<std::string>& values,
777  const uint8_t* valid_bytes = NULLPTR);
779  ARROW_DEPRECATED("Use AppendValues instead")
780  Status Append(const std::vector<std::string>& values,
781  const uint8_t* valid_bytes = NULLPTR);
782 
792  Status AppendValues(const char** values, int64_t length,
793  const uint8_t* valid_bytes = NULLPTR);
795  ARROW_DEPRECATED("Use AppendValues instead")
796  Status Append(const char** values, int64_t length,
797  const uint8_t* valid_bytes = NULLPTR);
798 };
799 
800 // ----------------------------------------------------------------------
801 // FixedSizeBinaryBuilder
802 
803 class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
804  public:
805  FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
807 
808  Status Append(const uint8_t* value) {
809  RETURN_NOT_OK(Reserve(1));
810  UnsafeAppendToBitmap(true);
811  return byte_builder_.Append(value, byte_width_);
812  }
813  Status Append(const char* value) {
814  return Append(reinterpret_cast<const uint8_t*>(value));
815  }
816 
817  template <size_t NBYTES>
818  Status Append(const std::array<uint8_t, NBYTES>& value) {
819  RETURN_NOT_OK(Reserve(1));
820  UnsafeAppendToBitmap(true);
821  return byte_builder_.Append(value);
822  }
823 
824  Status AppendValues(const uint8_t* data, int64_t length,
825  const uint8_t* valid_bytes = NULLPTR);
827  ARROW_DEPRECATED("Use AppendValues instead")
828  Status Append(const uint8_t* data, int64_t length,
829  const uint8_t* valid_bytes = NULLPTR);
830  Status Append(const std::string& value);
831  Status AppendNull();
832 
833  Status Init(int64_t elements) override;
834  Status Resize(int64_t capacity) override;
835  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
836 
838  int64_t value_data_length() const { return byte_builder_.length(); }
839 
843  const uint8_t* GetValue(int64_t i) const;
844 
845  protected:
846  int32_t byte_width_;
848 };
849 
850 class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
851  public:
852  explicit Decimal128Builder(const std::shared_ptr<DataType>& type,
854 
857 
858  Status Append(const Decimal128& val);
859 
860  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
861 };
862 
864 
865 // ----------------------------------------------------------------------
866 // Struct
867 
868 // ---------------------------------------------------------------------------------
869 // StructArray builder
873 class ARROW_EXPORT StructBuilder : public ArrayBuilder {
874  public:
875  StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
876  std::vector<std::unique_ptr<ArrayBuilder>>&& field_builders);
877 
878  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
879 
884  Status AppendValues(int64_t length, const uint8_t* valid_bytes) {
885  RETURN_NOT_OK(Reserve(length));
886  UnsafeAppendToBitmap(valid_bytes, length);
887  return Status::OK();
888  }
890  ARROW_DEPRECATED("Use AppendValues instead")
891  Status Append(int64_t length, const uint8_t* valid_bytes) {
892  return AppendValues(length, valid_bytes);
893  }
894 
897  Status Append(bool is_valid = true) {
898  RETURN_NOT_OK(Reserve(1));
899  UnsafeAppendToBitmap(is_valid);
900  return Status::OK();
901  }
902 
903  Status AppendNull() { return Append(false); }
904 
905  ArrayBuilder* field_builder(int i) const { return field_builders_[i].get(); }
906 
907  int num_fields() const { return static_cast<int>(field_builders_.size()); }
908 
909  protected:
910  std::vector<std::unique_ptr<ArrayBuilder>> field_builders_;
911 };
912 
913 // ----------------------------------------------------------------------
914 // Dictionary builder
915 
916 namespace internal {
917 
918 // TODO(ARROW-1176): Use Tensorflow's StringPiece instead of this here.
/// \brief Non-owning (pointer, length) view of a run of bytes.
///
/// The struct only stores the two values it is given; the caller must keep the
/// bytes alive for as long as the view is used.
struct WrappedBinary {
  WrappedBinary(const uint8_t* ptr, int32_t length) : ptr_(ptr), length_(length) {}

  const uint8_t* ptr_;  // first byte of the viewed data
  int32_t length_;      // number of bytes in the view
};
925 
/// \brief Maps a type T to the scalar type handled by its dictionary builder.
///
/// Primary template: the scalar is simply T::c_type.
template <typename T>
struct DictionaryScalar {
  typedef typename T::c_type type;
};
930 
931 template <>
932 struct DictionaryScalar<BinaryType> {
933  using type = WrappedBinary;
934 };
935 
936 template <>
937 struct DictionaryScalar<StringType> {
938  using type = WrappedBinary;
939 };
940 
941 template <>
942 struct DictionaryScalar<FixedSizeBinaryType> {
943  using type = uint8_t const*;
944 };
945 
946 } // namespace internal
947 
956 template <typename T>
957 class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {
958  public:
959  using Scalar = typename internal::DictionaryScalar<T>::type;
960 
961  ~DictionaryBuilder() override {}
962 
963  DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
964 
965  template <typename T1 = T>
967  typename std::enable_if<TypeTraits<T1>::is_parameter_free, MemoryPool*>::type pool)
968  : DictionaryBuilder<T1>(TypeTraits<T1>::type_singleton(), pool) {}
969 
971  Status Append(const Scalar& value);
972 
974  Status AppendNull();
975 
977  Status AppendArray(const Array& array);
978 
979  Status Init(int64_t elements) override;
980  Status Resize(int64_t capacity) override;
981  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
982 
984  bool is_building_delta() { return entry_id_offset_ > 0; }
985 
986  protected:
987  Status DoubleTableSize();
988  Scalar GetDictionaryValue(typename TypeTraits<T>::BuilderType& dictionary_builder,
989  int64_t index);
990  int64_t HashValue(const Scalar& value);
991  bool SlotDifferent(hash_slot_t slot, const Scalar& value);
992  Status AppendDictionary(const Scalar& value);
993 
994  std::shared_ptr<Buffer> hash_table_;
995  int32_t* hash_slots_;
996 
999 
1000  // offset for the entry ids. Used to build delta dictionaries,
1001  // increased on every InternalFinish by the number of current entries
1002  // in the dictionary
1004 
1005  // Store hash_table_size_ - 1, so that j & mod_bitmask_ is equivalent to j %
1006  // hash_table_size_, but uses far fewer CPU cycles
1007  int64_t mod_bitmask_;
1008 
1011 
1013  int32_t byte_width_;
1014 
1017 };
1018 
1019 template <>
1020 class ARROW_EXPORT DictionaryBuilder<NullType> : public ArrayBuilder {
1021  public:
1022  ~DictionaryBuilder() override;
1023 
1024  DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
1025  explicit DictionaryBuilder(MemoryPool* pool);
1026 
1028  Status AppendNull();
1029 
1031  Status AppendArray(const Array& array);
1032 
1033  Status Init(int64_t elements) override;
1034  Status Resize(int64_t capacity) override;
1035  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
1036 
1037  protected:
1039 };
1040 
1041 class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder<BinaryType> {
1042  public:
1045 
1046  Status Append(const uint8_t* value, int32_t length) {
1047  return Append(internal::WrappedBinary(value, length));
1048  }
1049 
1050  Status Append(const char* value, int32_t length) {
1051  return Append(
1052  internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value), length));
1053  }
1054 
1055  Status Append(const std::string& value) {
1056  return Append(internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value.c_str()),
1057  static_cast<int32_t>(value.size())));
1058  }
1059 };
1060 
1062 class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder<StringType> {
1063  public:
1066 
1067  Status Append(const uint8_t* value, int32_t length) {
1068  return Append(internal::WrappedBinary(value, length));
1069  }
1070 
1071  Status Append(const char* value, int32_t length) {
1072  return Append(
1073  internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value), length));
1074  }
1075 
1076  Status Append(const std::string& value) {
1077  return Append(internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value.c_str()),
1078  static_cast<int32_t>(value.size())));
1079  }
1080 };
1081 
1082 // ----------------------------------------------------------------------
1083 // Helper functions
1084 
1085 Status ARROW_EXPORT MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
1086  std::unique_ptr<ArrayBuilder>* out);
1087 
1088 } // namespace arrow
1089 
1090 #endif // ARROW_BUILDER_H
std::shared_ptr< PoolBuffer > null_bitmap_
Definition: builder.h:149
typename T ::c_type value_type
Definition: builder.h:192
std::vector< std::unique_ptr< ArrayBuilder > > children_
Definition: builder.h:158
Builder class for UTF8 strings.
Definition: builder.h:763
void UnsafeAppendToBitmap(bool is_valid)
Definition: builder.h:133
Array builder for creating an encoded DictionaryArray from a dense array.
Definition: builder.h:957
NullBuilder(MemoryPool *pool ARROW_MEMORY_POOL_DEFAULT)
Definition: builder.h:177
int64_t mod_bitmask_
Definition: builder.h:1007
Builder class for variable-length binary data.
Definition: builder.h:718
AdaptiveIntBuilder values_builder_
Definition: builder.h:1012
Dictionary array builder with convenience methods for strings.
Definition: builder.h:1062
#define ARROW_MEMORY_POOL_DEFAULT
Definition: memory_pool.h:94
Status Append(const uint8_t *value, int32_t length)
Definition: builder.h:1046
Status Append(const char *value)
Definition: builder.h:813
Status Append(const uint8_t *value, int32_t length)
TypedBufferBuilder< uint8_t > value_data_builder_
Definition: builder.h:755
int num_children() const
Definition: builder.h:77
DictionaryBuilder(typename std::enable_if< TypeTraits< T1 >::is_parameter_free, MemoryPool *>::type pool)
Definition: builder.h:966
DictionaryBuilder(const std::shared_ptr< DataType > &type, MemoryPool *pool)
Status MakeBuilder(MemoryPool *pool, const std::shared_ptr< DataType > &type, std::unique_ptr< ArrayBuilder > *out)
Status AppendNull()
Definition: builder.h:558
Definition: builder.h:411
Definition: builder.h:190
int64_t null_count_
Definition: builder.h:150
Status Append(bool is_valid=true)
Append an element to the Struct.
Definition: builder.h:897
Status Append(const uint64_t val)
Scalar append.
Definition: builder.h:418
A Buffer whose lifetime is tied to a particular MemoryPool.
Definition: buffer.h:210
#define NULLPTR
Definition: macros.h:69
constexpr int64_t kListMaximumElements
Definition: builder.h:45
Status AppendNulls(const uint8_t *valid_bytes, int64_t length)
Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory.
Definition: builder.h:200
std::vector< std::unique_ptr< ArrayBuilder > > field_builders_
Definition: builder.h:910
int64_t length() const
Definition: builder.h:79
int64_t length_
Definition: builder.h:154
std::shared_ptr< Array > values_
Definition: builder.h:706
Status Append(const std::string &value)
Definition: builder.h:1076
int32_t byte_width_
Definition: builder.h:846
ArrayBuilder(const std::shared_ptr< DataType > &type, MemoryPool *pool)
Definition: builder.h:62
constexpr int64_t kMinBuilderCapacity
Definition: builder.h:53
Base class for all Builders that emit an Array of a scalar numerical type.
Definition: builder.h:273
int64_t null_count() const
Definition: builder.h:80
Base class for all data array builders.
Definition: builder.h:60
Status AppendNull()
Definition: builder.h:903
Definition: status.h:93
int32_t * hash_slots_
Definition: builder.h:995
Status Append(const char *value, int32_t length)
Definition: builder.h:1071
std::shared_ptr< Buffer > data() const
Definition: builder.h:564
Definition: builder.h:543
Definition: type_traits.h:30
PrimitiveBuilder(const std::shared_ptr< DataType > &type, MemoryPool *pool)
Definition: builder.h:194
Status Reserve(int64_t elements)
Ensures there is enough space for adding the number of elements by checking capacity and calling Resi...
Definition: builder.h:1041
ArrayBuilder()
Definition: builder.h:143
int num_fields() const
Definition: builder.h:907
Status Append(const std::array< uint8_t, NBYTES > &value)
Definition: builder.h:818
Status AppendValues(int64_t length, const uint8_t *valid_bytes)
Null bitmap is of equal length to every child field, and any zero byte will be considered as a null f...
Definition: builder.h:884
Definition: type.h:291
#define RETURN_NOT_OK(s)
Definition: status.h:43
static Status OK()
Definition: status.h:106
ArrayBuilder * child(int i)
For nested types.
Definition: builder.h:75
int32_t byte_width_
Definition: builder.h:1013
std::shared_ptr< DataType > null()
Status Append(const Scalar &value)
Append a scalar value.
int32_t hash_slot_t
Definition: hash.h:29
Status Append(const std::string &value)
Definition: builder.h:730
Status AppendNull()
Definition: builder.h:699
Definition: builder.h:477
int64_t entry_id_offset_
Definition: builder.h:1003
Status Append(const std::string &value)
Definition: builder.h:1055
int64_t capacity() const
Definition: builder.h:81
Status AppendValues(const uint8_t *data, int64_t length, const uint8_t *valid_bytes=NULLPTR)
uint8_t * null_bitmap_data_
Definition: builder.h:151
ArrayBuilder * field_builder(int i) const
Definition: builder.h:905
Definition: builder.h:803
TypedBufferBuilder< int32_t > offsets_builder_
Definition: builder.h:754
AdaptiveIntBuilder values_builder_
Definition: builder.h:1038
std::shared_ptr< DataType > type_
Definition: builder.h:145
Status Append(const int64_t val)
Scalar append.
Definition: builder.h:484
BufferBuilder byte_builder_
Definition: builder.h:847
Status AppendNull()
Definition: builder.h:180
Top-level namespace for Apache Arrow C++ API.
Definition: adapter.h:32
Array base type Immutable data array with some logical type and some length.
Definition: array.h:200
TypedBufferBuilder< int32_t > offsets_builder_
Definition: builder.h:704
TypeTraits< T >::BuilderType overflow_dict_builder_
Definition: builder.h:1010
std::shared_ptr< Buffer > data() const
Definition: builder.h:212
int64_t hash_table_size_
Size of the table. Must be a power of 2.
Definition: builder.h:998
int64_t value_data_length() const
Definition: builder.h:744
Append, Resize and Reserve methods are acting on StructBuilder.
Definition: builder.h:873
int64_t capacity_
Definition: builder.h:155
Status Append(const uint8_t val)
Definition: builder.h:579
#define ARROW_DEPRECATED(...)
Definition: macros.h:85
BinaryBuilder(MemoryPool *pool ARROW_MEMORY_POOL_DEFAULT)
NumericBuilder(typename std::enable_if< TypeTraits< T1 >::is_parameter_free, MemoryPool *>::type pool ARROW_MEMORY_POOL_DEFAULT)
Definition: builder.h:279
Status Append(const char *value, int32_t length)
Definition: builder.h:1050
Status AppendNulls(const uint8_t *valid_bytes, int64_t length)
Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory.
Definition: builder.h:552
Status Append(const value_type val)
Append a single scalar and increase the size if necessary.
Definition: builder.h:291
Status Append(const char *value, int32_t length)
Definition: builder.h:726
Definition: builder.h:850
Status Append(const uint8_t *value)
Definition: builder.h:808
Status Advance(int64_t elements)
For cases where raw data was memcpy'd into the internal buffers, allows us to advance the length of t...
bool is_building_delta()
is the dictionary builder in the delta building mode
Definition: builder.h:984
std::shared_ptr< DataType > type() const
Definition: builder.h:128
MemoryPool * pool_
Definition: builder.h:146
std::shared_ptr< Buffer > hash_table_
Definition: builder.h:994
static Status NotImplemented(const std::string &msg)
Definition: status.h:125
Mutable container for generic Arrow array data.
Definition: array.h:88
void UnsafeAppend(const value_type val)
Append a single scalar under the assumption that the underlying Buffer is large enough.
Definition: builder.h:302
int64_t hash_table_load_threshold_
Size at which we decide to resize.
Definition: builder.h:1016
int64_t value_data_capacity() const
Definition: builder.h:746
std::unique_ptr< ArrayBuilder > value_builder_
Definition: builder.h:705
A class for incrementally building a contiguous chunk of in-memory data.
Definition: buffer.h:224
Base class for memory allocation.
Definition: memory_pool.h:34
Status Append(const uint8_t *value, int32_t length)
Definition: builder.h:1067
typename internal::DictionaryScalar< BinaryType >::type Scalar
Definition: builder.h:959
std::shared_ptr< PoolBuffer > null_bitmap() const
Definition: builder.h:112
constexpr int64_t kBinaryMemoryLimit
Definition: builder.h:44
Definition: builder.h:175
#define ARROW_DISALLOW_COPY_AND_ASSIGN(TypeName)
Definition: macros.h:23
~DictionaryBuilder() override
Definition: builder.h:961
Represents a signed 128-bit integer in two's complement.
Definition: decimal.h:39
Status Append(const bool val)
Scalar append.
Definition: builder.h:567
Status AppendNull()
Definition: builder.h:206
TypeTraits< T >::BuilderType dict_builder_
Definition: builder.h:1009
Builder class for variable-length list array value types.
Definition: builder.h:671