Data manipulation

Recipes related to comparing, filtering, or transforming data.

Concatenate VectorSchemaRoots

In some cases, VectorSchemaRoot needs to be modeled as a container. To accomplish this, you can use VectorSchemaRootAppender.append. The following code creates two roots, then concatenates them together:

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.arrow.vector.util.VectorSchemaRootAppender;

import static java.util.Arrays.asList;

Field column_one = new Field("column-one", FieldType.nullable(new ArrowType.Int(32, true)), null);
Schema schema = new Schema(asList(column_one));
try (
    BufferAllocator allocator = new RootAllocator();
    VectorSchemaRoot rootOne = VectorSchemaRoot.create(schema, allocator);
    VectorSchemaRoot rootTwo = VectorSchemaRoot.create(schema, allocator);
    VectorSchemaRoot result = VectorSchemaRoot.create(schema, allocator);
) {
    IntVector appenderOne = (IntVector) rootOne.getVector(0);
    rootOne.allocateNew();
    appenderOne.set(0, 100);
    appenderOne.set(1, 20);
    rootOne.setRowCount(2);
    IntVector appenderTwo = (IntVector) rootTwo.getVector(0);
    rootTwo.allocateNew();
    appenderTwo.set(0, 34);
    appenderTwo.set(1, 75);
    rootTwo.setRowCount(2);
    result.allocateNew();
    VectorSchemaRootAppender.append(result, rootOne, rootTwo);
    System.out.print(result.contentToTSVString());
}
column-one
100
20
34
75

Concatenate Value Vectors

In some cases, we need to concatenate two value vectors into one. To accomplish this, we can use VectorAppender. This mutates the initial ValueVector.

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.ValueVector;
import org.apache.arrow.vector.util.VectorAppender;

try (
    BufferAllocator allocator = new RootAllocator();
    IntVector initialValues = new IntVector("initialValues", allocator);
    IntVector toAppend = new IntVector("toAppend", allocator);
) {
    initialValues.allocateNew(2);
    initialValues.set(0, 1);
    initialValues.set(1, 2);
    initialValues.setValueCount(2);
    System.out.println("Initial IntVector: " + initialValues);
    toAppend.allocateNew(4);
    toAppend.set(1, 4);
    toAppend.set(3, 6);
    toAppend.setValueCount(4);
    System.out.println("IntVector to Append: " + toAppend);
    VectorAppender appenderUtil = new VectorAppender(initialValues);
    toAppend.accept(appenderUtil, null);
    System.out.println("IntVector Result: " + initialValues);
}
Initial IntVector: [1, 2]
IntVector to Append: [null, 4, null, 6]
IntVector Result: [1, 2, null, 4, null, 6]

Compare Vectors for Field Equality

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.compare.TypeEqualsVisitor;
import org.apache.arrow.memory.RootAllocator;

try(
    BufferAllocator allocator = new RootAllocator();
    IntVector right = new IntVector("int", allocator);
) {
    right.allocateNew(3);
    right.set(0, 10);
    right.set(1, 20);
    right.set(2, 30);
    right.setValueCount(3);
    IntVector left1 = new IntVector("int", allocator);
    IntVector left2 = new IntVector("int2", allocator);
    TypeEqualsVisitor visitor = new TypeEqualsVisitor(right);

    System.out.println(visitor.equals(left1));
    System.out.println(visitor.equals(left2));
}
true
false

Compare Vectors Equality

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.compare.VectorEqualsVisitor;

try (
    BufferAllocator allocator = new RootAllocator();
    // NOTE(review): all three vectors deliberately share one field name, presumably because
    // the default comparison also takes the field into account - confirm.
    IntVector vector1 = new IntVector("vector1", allocator);
    IntVector vector2 = new IntVector("vector1", allocator);
    IntVector vector3 = new IntVector("vector1", allocator)
) {
    vector1.allocateNew(1);
    vector1.set(0, 10);
    vector1.setValueCount(1);

    // Same content as vector1.
    vector2.allocateNew(1);
    vector2.set(0, 10);
    vector2.setValueCount(1);

    // Different content from vector1.
    vector3.allocateNew(1);
    vector3.set(0, 20);
    vector3.setValueCount(1);

    // vectorEquals is a static helper, so it is called on the class rather than on an instance.
    System.out.println(VectorEqualsVisitor.vectorEquals(vector1, vector2));
    System.out.println(VectorEqualsVisitor.vectorEquals(vector1, vector3));
}
true
false

Compare Values on the Array

Comparing two values at the given indices in the vectors:

import java.nio.charset.StandardCharsets;
import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VarCharVector;

try(
    BufferAllocator allocator = new RootAllocator();
    VarCharVector vec = new VarCharVector("valueindexcomparator", allocator);
) {
    vec.allocateNew(3);
    vec.setValueCount(3);
    vec.set(0, "ba".getBytes());
    vec.set(1, "abc".getBytes());
    vec.set(2, "aa".getBytes());
    VectorValueComparator<VarCharVector> valueComparator = DefaultVectorComparators.createDefaultComparator(vec);
    valueComparator.attachVector(vec);

    System.out.println(valueComparator.compare(0, 1) > 0);
    System.out.println(valueComparator.compare(1, 2) < 0);
}
true
false

If we need a custom comparator, we can extend VectorValueComparator and override the compareNotNull method as needed.

Search Values on the Array

Linear Search - O(n)

Algorithm: org.apache.arrow.algorithm.search.VectorSearcher#linearSearch - O(n)

import org.apache.arrow.algorithm.search.VectorSearcher;
import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;

try(
    BufferAllocator allocator = new RootAllocator();
    IntVector linearSearchVector = new IntVector("linearSearchVector", allocator);
) {
    linearSearchVector.allocateNew(10);
    linearSearchVector.setValueCount(10);
    for (int i = 0; i < 10; i++) {
        linearSearchVector.set(i, i);
    }
    VectorValueComparator<IntVector> comparatorInt = DefaultVectorComparators.createDefaultComparator(linearSearchVector);
    int result = VectorSearcher.linearSearch(linearSearchVector, comparatorInt, linearSearchVector, 3);

    System.out.println(result);
}
3

Binary Search - O(log(n))

Algorithm: org.apache.arrow.algorithm.search.VectorSearcher#binarySearch - O(log(n))

import org.apache.arrow.algorithm.search.VectorSearcher;
import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;

try(
    BufferAllocator allocator = new RootAllocator();
    IntVector binarySearchVector = new IntVector("", allocator);
) {
    binarySearchVector.allocateNew(10);
    binarySearchVector.setValueCount(10);
    for (int i = 0; i < 10; i++) {
        binarySearchVector.set(i, i);
    }
    VectorValueComparator<IntVector> comparatorInt = DefaultVectorComparators.createDefaultComparator(binarySearchVector);
    int result = VectorSearcher.binarySearch(binarySearchVector, comparatorInt, binarySearchVector, 3);

    System.out.println(result);
}
3

Sort Values on the Array

In-place Sorter - O(nlog(n))

Sorts by rearranging the values of the original vector in place. Algorithm: org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter - O(n log(n))

import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;

try(
    BufferAllocator allocator = new RootAllocator();
    IntVector intVectorNotSorted = new IntVector("intvectornotsorted", allocator);
) {
    intVectorNotSorted.allocateNew(3);
    intVectorNotSorted.setValueCount(3);
    intVectorNotSorted.set(0, 10);
    intVectorNotSorted.set(1, 8);
    intVectorNotSorted.setNull(2);
    FixedWidthInPlaceVectorSorter<IntVector> sorter = new FixedWidthInPlaceVectorSorter<IntVector>();
    VectorValueComparator<IntVector> comparator = DefaultVectorComparators.createDefaultComparator(intVectorNotSorted);
    sorter.sortInPlace(intVectorNotSorted, comparator);

    System.out.println(intVectorNotSorted);
}
[null, 8, 10]

Out-place Sorter - O(nlog(n))

Sorts by copying the vector elements to a new vector in sorted order - O(n log(n)). Algorithms: org.apache.arrow.algorithm.sort.FixedWidthOutOfPlaceVectorSorter (fixed-width vectors) and org.apache.arrow.algorithm.sort.VariableWidthOutOfPlaceVectorSorter (variable-width vectors).

import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.FixedWidthOutOfPlaceVectorSorter;
import org.apache.arrow.algorithm.sort.OutOfPlaceVectorSorter;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;

try(
    BufferAllocator allocator = new RootAllocator();
    IntVector intVectorNotSorted = new IntVector("intvectornotsorted", allocator);
    IntVector intVectorSorted = (IntVector) intVectorNotSorted.getField()
            .getFieldType().createNewSingleVector("new-out-of-place-sorter",
                    allocator, null);

) {
    intVectorNotSorted.allocateNew(3);
    intVectorNotSorted.setValueCount(3);
    intVectorNotSorted.set(0, 10);
    intVectorNotSorted.set(1, 8);
    intVectorNotSorted.setNull(2);
    OutOfPlaceVectorSorter<IntVector> sorterOutOfPlaceSorter = new FixedWidthOutOfPlaceVectorSorter<>();
    VectorValueComparator<IntVector> comparatorOutOfPlaceSorter = DefaultVectorComparators.createDefaultComparator(intVectorNotSorted);
    intVectorSorted.allocateNew(intVectorNotSorted.getValueCount());
    intVectorSorted.setValueCount(intVectorNotSorted.getValueCount());
    sorterOutOfPlaceSorter.sortOutOfPlace(intVectorNotSorted, intVectorSorted, comparatorOutOfPlaceSorter);

    System.out.println(intVectorSorted);
}
[null, 8, 10]