Creating Arrow Objects¶
A vector is the basic unit in the Arrow Java library. Data types describe the types of values; ValueVectors are sequences of typed values. Vectors represent a one-dimensional sequence of values of the same type. They are mutable containers.
Vectors implement the interface ValueVector. The Arrow libraries provide implementations of vectors for various data types.
Creating Vectors (arrays)¶
Array of Int¶
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;
try (BufferAllocator allocator = new RootAllocator();
     IntVector intVector = new IntVector("intVector", allocator)) {
    // Number of slots to allocate and populate.
    final int count = 3;
    intVector.allocateNew(count);
    // Fill slots 0..2 with the values 1..3.
    for (int slot = 0; slot < count; slot++) {
        intVector.set(slot, slot + 1);
    }
    // Record how many slots were written before printing the vector.
    intVector.setValueCount(count);
    System.out.print(intVector);
}
[1, 2, 3]
Array of Varchar¶
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VarCharVector;
import java.nio.charset.StandardCharsets;
try(
BufferAllocator allocator = new RootAllocator();
VarCharVector varCharVector = new VarCharVector("varCharVector", allocator);
) {
varCharVector.allocateNew(3);
varCharVector.set(0, "one".getBytes());
varCharVector.set(1, "two".getBytes());
varCharVector.set(2, "three".getBytes());
varCharVector.setValueCount(3);
System.out.print(varCharVector);
}
[one, two, three]
Dictionary-Encoded Array of Varchar¶
In some scenarios dictionary-encoding a column is useful to save memory.
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.vector.dictionary.Dictionary;
import org.apache.arrow.vector.dictionary.DictionaryEncoder;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
import java.nio.charset.StandardCharsets;
try (BufferAllocator root = new RootAllocator();
     VarCharVector countries = new VarCharVector("country-dict", root);
     VarCharVector appUserCountriesUnencoded = new VarCharVector("app-use-country-dict", root)) {
    // Values that make up the dictionary; the array index is the dictionary position.
    String[] dictionaryValues = {
        "Andorra", "Cuba", "Grecia", "Guinea", "Islandia",
        "Malta", "Tailandia", "Uganda", "Yemen", "Zambia"
    };
    countries.allocateNew(dictionaryValues.length);
    for (int i = 0; i < dictionaryValues.length; i++) {
        countries.set(i, dictionaryValues[i].getBytes(StandardCharsets.UTF_8));
    }
    countries.setValueCount(dictionaryValues.length);

    // Wrap the value vector in a Dictionary with id 1, unordered, signed 8-bit indices.
    DictionaryEncoding encoding =
        new DictionaryEncoding(/*id=*/1L, /*ordered=*/false, /*indexType=*/new ArrowType.Int(8, true));
    Dictionary countriesDictionary = new Dictionary(countries, encoding);
    System.out.println("Dictionary: " + countriesDictionary);

    // Plain (unencoded) column whose values all appear in the dictionary above.
    String[] userCountries = {"Andorra", "Guinea", "Islandia", "Malta", "Uganda"};
    appUserCountriesUnencoded.allocateNew(userCountries.length);
    for (int i = 0; i < userCountries.length; i++) {
        appUserCountriesUnencoded.set(i, userCountries[i].getBytes(StandardCharsets.UTF_8));
    }
    appUserCountriesUnencoded.setValueCount(userCountries.length);
    System.out.println("Unencoded data: " + appUserCountriesUnencoded);

    // Encode the column against the dictionary; the result is closed by its own try block.
    try (FieldVector appUserCountriesDictionaryEncoded =
             (FieldVector) DictionaryEncoder.encode(appUserCountriesUnencoded, countriesDictionary)) {
        System.out.println("Dictionary-encoded data: " + appUserCountriesDictionaryEncoded);
    }
}
Dictionary: Dictionary DictionaryEncoding[id=1,ordered=false,indexType=Int(8, true)] [Andorra, Cuba, Grecia, Guinea, Islandia, Malta, Tailandia, Uganda, Yemen, Zambia]
Unencoded data: [Andorra, Guinea, Islandia, Malta, Uganda]
Dictionary-encoded data: [0, 3, 4, 5, 7]
Array of List¶
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.complex.impl.UnionListWriter;
import org.apache.arrow.vector.complex.ListVector;
// Builds a ListVector of four lists with three ints each and prints it.
// The allocator, the vector and its writer are all try-with-resources resources.
try(
BufferAllocator allocator = new RootAllocator();
ListVector listVector = ListVector.empty("listVector", allocator);
UnionListWriter listWriter = listVector.getWriter()
) {
// Flat source data: 12 ints, consumed three at a time (one group per list).
int[] data = new int[] { 1, 2, 3, 10, 20, 30, 100, 200, 300, 1000, 2000, 3000 };
// Cursor into data; advanced once per written element.
int tmp_index = 0;
for(int i = 0; i < 4; i++) {
// Position the writer at list slot i, then open that list.
listWriter.setPosition(i);
listWriter.startList();
for(int j = 0; j < 3; j++) {
listWriter.writeInt(data[tmp_index]);
tmp_index = tmp_index + 1;
}
// NOTE(review): called on the writer with 3 on every iteration, before endList();
// the overall count of 4 is set on the vector after the loop. Confirm against the
// UnionListWriter docs what this per-iteration call is required for.
listWriter.setValueCount(3);
listWriter.endList();
}
// Four lists were written in total.
listVector.setValueCount(4);
System.out.print(listVector);
} catch (Exception e) {
// Presumably needed because closing one of the resources can throw a checked
// exception - TODO confirm. The failure is only printed, not rethrown.
e.printStackTrace();
}
[[1,2,3], [10,20,30], [100,200,300], [1000,2000,3000]]
Slicing¶
Slicing provides a way of copying a range of rows between two vectors of the same type.
Slicing IntVector¶
In this example, we copy a portion of the input IntVector to a new IntVector.
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.util.TransferPair;
try (BufferAllocator allocator = new RootAllocator();
     IntVector vector = new IntVector("intVector", allocator)) {
    // Populate the source vector with the values 0..9.
    final int total = 10;
    for (int row = 0; row < total; row++) {
        vector.setSafe(row, row);
    }
    vector.setValueCount(total);

    // First slice: 5 elements from index 0.
    TransferPair headPair = vector.getTransferPair(allocator);
    headPair.splitAndTransfer(0, 5);
    try (IntVector headSlice = (IntVector) headPair.getTo()) {
        System.out.println(headSlice);
    }

    // Second slice: 6 elements from index 2, taken through a fresh transfer pair.
    TransferPair middlePair = vector.getTransferPair(allocator);
    middlePair.splitAndTransfer(2, 6);
    try (IntVector middleSlice = (IntVector) middlePair.getTo()) {
        System.out.print(middleSlice);
    }
}
[0, 1, 2, 3, 4]
[2, 3, 4, 5, 6, 7]