Data manipulation¶
Recipes related to comparing, filtering, or transforming data.
Concatenate VectorSchemaRoots¶
In some cases, VectorSchemaRoot needs to be modeled as a container. To accomplish
this, you can use VectorSchemaRootAppender.append. The following code
creates two roots, then concatenates them together:
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.arrow.vector.util.VectorSchemaRootAppender;
import static java.util.Arrays.asList;
Field column_one = new Field("column-one", FieldType.nullable(new ArrowType.Int(32, true)), null);
Schema schema = new Schema(asList(column_one));
try (
BufferAllocator allocator = new RootAllocator();
VectorSchemaRoot rootOne = VectorSchemaRoot.create(schema, allocator);
VectorSchemaRoot rootTwo = VectorSchemaRoot.create(schema, allocator);
VectorSchemaRoot result = VectorSchemaRoot.create(schema, allocator);
) {
IntVector appenderOne = (IntVector) rootOne.getVector(0);
rootOne.allocateNew();
appenderOne.set(0, 100);
appenderOne.set(1, 20);
rootOne.setRowCount(2);
IntVector appenderTwo = (IntVector) rootTwo.getVector(0);
rootTwo.allocateNew();
appenderTwo.set(0, 34);
appenderTwo.set(1, 75);
rootTwo.setRowCount(2);
result.allocateNew();
VectorSchemaRootAppender.append(result, rootOne, rootTwo);
System.out.print(result.contentToTSVString());
}
column-one
100
20
34
75
Concatenate Value Vectors¶
In some cases, we need to concatenate two value vectors into one. To accomplish this, we can use VectorAppender. This mutates the initial ValueVector.
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.ValueVector;
import org.apache.arrow.vector.util.VectorAppender;
try (
BufferAllocator allocator = new RootAllocator();
IntVector initialValues = new IntVector("initialValues", allocator);
IntVector toAppend = new IntVector("toAppend", allocator);
) {
initialValues.allocateNew(2);
initialValues.set(0, 1);
initialValues.set(1, 2);
initialValues.setValueCount(2);
System.out.println("Initial IntVector: " + initialValues);
toAppend.allocateNew(4);
toAppend.set(1, 4);
toAppend.set(3, 6);
toAppend.setValueCount(4);
System.out.println("IntVector to Append: " + toAppend);
VectorAppender appenderUtil = new VectorAppender(initialValues);
toAppend.accept(appenderUtil, null);
System.out.println("IntVector Result: " + initialValues);
}
Initial IntVector: [1, 2]
IntVector to Append: [null, 4, null, 6]
IntVector Result: [1, 2, null, 4, null, 6]
Compare Vectors for Field Equality¶
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.compare.TypeEqualsVisitor;
import org.apache.arrow.memory.RootAllocator;
try(
BufferAllocator allocator = new RootAllocator();
IntVector right = new IntVector("int", allocator);
) {
right.allocateNew(3);
right.set(0, 10);
right.set(1, 20);
right.set(2, 30);
right.setValueCount(3);
IntVector left1 = new IntVector("int", allocator);
IntVector left2 = new IntVector("int2", allocator);
TypeEqualsVisitor visitor = new TypeEqualsVisitor(right);
System.out.println(visitor.equals(left1));
System.out.println(visitor.equals(left2));
}
true
false
Compare Vectors Equality¶
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.compare.VectorEqualsVisitor;
// Compares vectors by content: vector1 and vector2 hold the same value, vector3 does not.
// NOTE(review): all three vectors share the field name "vector1"; presumably the
// default comparison also takes the field into account — confirm against the
// VectorEqualsVisitor Javadoc before renaming them.
try (
    BufferAllocator allocator = new RootAllocator();
    IntVector vector1 = new IntVector("vector1", allocator);
    IntVector vector2 = new IntVector("vector1", allocator);
    IntVector vector3 = new IntVector("vector1", allocator)
) {
    vector1.allocateNew(1);
    vector1.set(0, 10);
    vector1.setValueCount(1);

    vector2.allocateNew(1);
    vector2.set(0, 10);
    vector2.setValueCount(1);

    vector3.allocateNew(1);
    vector3.set(0, 20);
    vector3.setValueCount(1);

    // vectorEquals is a static method, so call it on the class instead of
    // constructing a throwaway visitor instance.
    System.out.println(VectorEqualsVisitor.vectorEquals(vector1, vector2));
    System.out.println(VectorEqualsVisitor.vectorEquals(vector1, vector3));
}
true
false
Compare Values on the Array¶
Comparing two values at the given indices in the vectors:
import java.nio.charset.StandardCharsets;
import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VarCharVector;
try(
BufferAllocator allocator = new RootAllocator();
VarCharVector vec = new VarCharVector("valueindexcomparator", allocator);
) {
vec.allocateNew(3);
vec.setValueCount(3);
vec.set(0, "ba".getBytes());
vec.set(1, "abc".getBytes());
vec.set(2, "aa".getBytes());
VectorValueComparator<VarCharVector> valueComparator = DefaultVectorComparators.createDefaultComparator(vec);
valueComparator.attachVector(vec);
System.out.println(valueComparator.compare(0, 1) > 0);
System.out.println(valueComparator.compare(1, 2) < 0);
}
true
false
If you need a custom comparator, you can extend VectorValueComparator and override its compareNotNull method as needed.
Search Values on the Array¶
Linear Search - O(n)¶
Algorithm: org.apache.arrow.algorithm.search.VectorSearcher#linearSearch - O(n)
import org.apache.arrow.algorithm.search.VectorSearcher;
import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;
try(
BufferAllocator allocator = new RootAllocator();
IntVector linearSearchVector = new IntVector("linearSearchVector", allocator);
) {
linearSearchVector.allocateNew(10);
linearSearchVector.setValueCount(10);
for (int i = 0; i < 10; i++) {
linearSearchVector.set(i, i);
}
VectorValueComparator<IntVector> comparatorInt = DefaultVectorComparators.createDefaultComparator(linearSearchVector);
int result = VectorSearcher.linearSearch(linearSearchVector, comparatorInt, linearSearchVector, 3);
System.out.println(result);
}
3
Binary Search - O(log(n))¶
Algorithm: org.apache.arrow.algorithm.search.VectorSearcher#binarySearch - O(log(n))
import org.apache.arrow.algorithm.search.VectorSearcher;
import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;
try(
BufferAllocator allocator = new RootAllocator();
IntVector binarySearchVector = new IntVector("", allocator);
) {
binarySearchVector.allocateNew(10);
binarySearchVector.setValueCount(10);
for (int i = 0; i < 10; i++) {
binarySearchVector.set(i, i);
}
VectorValueComparator<IntVector> comparatorInt = DefaultVectorComparators.createDefaultComparator(binarySearchVector);
int result = VectorSearcher.binarySearch(binarySearchVector, comparatorInt, binarySearchVector, 3);
System.out.println(result);
}
3
Sort Values on the Array¶
In-place Sorter - O(nlog(n))¶
Sorting by manipulating the original vector. Algorithm: org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter - O(nlog(n))
import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;
try(
BufferAllocator allocator = new RootAllocator();
IntVector intVectorNotSorted = new IntVector("intvectornotsorted", allocator);
) {
intVectorNotSorted.allocateNew(3);
intVectorNotSorted.setValueCount(3);
intVectorNotSorted.set(0, 10);
intVectorNotSorted.set(1, 8);
intVectorNotSorted.setNull(2);
FixedWidthInPlaceVectorSorter<IntVector> sorter = new FixedWidthInPlaceVectorSorter<IntVector>();
VectorValueComparator<IntVector> comparator = DefaultVectorComparators.createDefaultComparator(intVectorNotSorted);
sorter.sortInPlace(intVectorNotSorted, comparator);
System.out.println(intVectorNotSorted);
}
[null, 8, 10]
Out-place Sorter - O(nlog(n))¶
Sorting by copying vector elements to a new vector in sorted order - O(nlog(n)). Algorithm: org.apache.arrow.algorithm.sort.FixedWidthOutOfPlaceVectorSorter (fixed-width vectors) and org.apache.arrow.algorithm.sort.VariableWidthOutOfPlaceVectorSorter (variable-width vectors).
import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.FixedWidthOutOfPlaceVectorSorter;
import org.apache.arrow.algorithm.sort.OutOfPlaceVectorSorter;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;
try(
BufferAllocator allocator = new RootAllocator();
IntVector intVectorNotSorted = new IntVector("intvectornotsorted", allocator);
IntVector intVectorSorted = (IntVector) intVectorNotSorted.getField()
.getFieldType().createNewSingleVector("new-out-of-place-sorter",
allocator, null);
) {
intVectorNotSorted.allocateNew(3);
intVectorNotSorted.setValueCount(3);
intVectorNotSorted.set(0, 10);
intVectorNotSorted.set(1, 8);
intVectorNotSorted.setNull(2);
OutOfPlaceVectorSorter<IntVector> sorterOutOfPlaceSorter = new FixedWidthOutOfPlaceVectorSorter<>();
VectorValueComparator<IntVector> comparatorOutOfPlaceSorter = DefaultVectorComparators.createDefaultComparator(intVectorNotSorted);
intVectorSorted.allocateNew(intVectorNotSorted.getValueCount());
intVectorSorted.setValueCount(intVectorNotSorted.getValueCount());
sorterOutOfPlaceSorter.sortOutOfPlace(intVectorNotSorted, intVectorSorted, comparatorOutOfPlaceSorter);
System.out.println(intVectorSorted);
}
[null, 8, 10]