Arrow JDBC Adapter

The Arrow Java JDBC module converts JDBC ResultSets into Arrow VectorSchemaRoots.

ResultSet to VectorSchemaRoot Conversion

The main class for converting a ResultSet to a VectorSchemaRoot is JdbcToArrow.

import org.apache.arrow.adapter.jdbc.ArrowVectorIterator;
import org.apache.arrow.adapter.jdbc.JdbcToArrow;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.ibatis.jdbc.ScriptRunner;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;

// Convert a JDBC ResultSet into Arrow data, one VectorSchemaRoot batch at a time.
try (BufferAllocator allocator = new RootAllocator();
     Connection connection = DriverManager.getConnection(
             "jdbc:h2:mem:h2-jdbc-adapter")) {
    ScriptRunner runnerDDLDML = new ScriptRunner(connection);
    runnerDDLDML.setLogWriter(null);
    // Close each script reader after use (they were previously leaked) and
    // read the SQL files as UTF-8 instead of the platform-default charset.
    try (BufferedReader ddl = new BufferedReader(new FileReader(
            "./thirdpartydeps/jdbc/h2-ddl.sql", StandardCharsets.UTF_8))) {
        runnerDDLDML.runScript(ddl);
    }
    try (BufferedReader dml = new BufferedReader(new FileReader(
            "./thirdpartydeps/jdbc/h2-dml.sql", StandardCharsets.UTF_8))) {
        runnerDDLDML.runScript(dml);
    }
    try (ResultSet resultSet = connection.createStatement().executeQuery(
            "SELECT int_field1, bool_field2, bigint_field5 FROM TABLE1");
         // The iterator lazily converts ResultSet rows into Arrow batches.
         ArrowVectorIterator iterator = JdbcToArrow.sqlToArrowVectorIterator(
                 resultSet, allocator)) {
        while (iterator.hasNext()) {
            // Each VectorSchemaRoot owns Arrow buffers; close it to release them.
            try (VectorSchemaRoot root = iterator.next()) {
                System.out.print(root.contentToTSVString());
            }
        }
    }
} catch (SQLException | IOException e) {
    e.printStackTrace();
}
INT_FIELD1    BOOL_FIELD2    BIGINT_FIELD5
101    true    1000000000300
102    true    100000000030
103    true    10000000003

Configuring Array subtypes

JdbcToArrow accepts configuration through JdbcToArrowConfig. For example, the element type of array columns can be specified via setArraySubTypeByColumnNameMap.

import org.apache.arrow.adapter.jdbc.ArrowVectorIterator;
import org.apache.arrow.adapter.jdbc.JdbcFieldInfo;
import org.apache.arrow.adapter.jdbc.JdbcToArrow;
import org.apache.arrow.adapter.jdbc.JdbcToArrowConfig;
import org.apache.arrow.adapter.jdbc.JdbcToArrowConfigBuilder;
import org.apache.arrow.adapter.jdbc.JdbcToArrowUtils;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.ibatis.jdbc.ScriptRunner;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Types;
import java.util.HashMap;
import java.util.Map;

// Convert a ResultSet that includes an array column, telling the adapter the
// element type of the array via JdbcToArrowConfig.
try (BufferAllocator allocator = new RootAllocator();
     Connection connection = DriverManager.getConnection(
             "jdbc:h2:mem:h2-jdbc-adapter")) {
    ScriptRunner runnerDDLDML = new ScriptRunner(connection);
    runnerDDLDML.setLogWriter(null);
    // Close each script reader after use (they were previously leaked) and
    // read the SQL files as UTF-8 instead of the platform-default charset.
    try (BufferedReader ddl = new BufferedReader(new FileReader(
            "./thirdpartydeps/jdbc/h2-ddl.sql", StandardCharsets.UTF_8))) {
        runnerDDLDML.runScript(ddl);
    }
    try (BufferedReader dml = new BufferedReader(new FileReader(
            "./thirdpartydeps/jdbc/h2-dml.sql", StandardCharsets.UTF_8))) {
        runnerDDLDML.runScript(dml);
    }
    // Plain map instead of double-brace initialization, which creates an
    // anonymous subclass holding a reference to the enclosing scope.
    Map<String, JdbcFieldInfo> arraySubTypes = new HashMap<>();
    arraySubTypes.put("LIST_FIELD19", new JdbcFieldInfo(Types.INTEGER));
    JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator,
            JdbcToArrowUtils.getUtcCalendar())
            .setArraySubTypeByColumnNameMap(arraySubTypes)
            .build();
    try (ResultSet resultSet = connection.createStatement().executeQuery(
            "SELECT int_field1, bool_field2, bigint_field5, char_field16, list_field19 FROM TABLE1");
         ArrowVectorIterator iterator = JdbcToArrow.sqlToArrowVectorIterator(
                 resultSet, config)) {
        while (iterator.hasNext()) {
            // Each VectorSchemaRoot owns Arrow buffers; close it to release them.
            try (VectorSchemaRoot root = iterator.next()) {
                System.out.print(root.contentToTSVString());
            }
        }
    }
} catch (SQLException | IOException e) {
    e.printStackTrace();
}
INT_FIELD1    BOOL_FIELD2    BIGINT_FIELD5    CHAR_FIELD16    LIST_FIELD19
101    true    1000000000300    some char text      [1,2,3]
102    true    100000000030    some char text      [1,2]
103    true    10000000003    some char text      [1]

Configuring batch size

By default, the adapter will read up to 1024 rows in a batch. This can be customized via setTargetBatchSize.

import org.apache.arrow.adapter.jdbc.ArrowVectorIterator;
import org.apache.arrow.adapter.jdbc.JdbcFieldInfo;
import org.apache.arrow.adapter.jdbc.JdbcToArrow;
import org.apache.arrow.adapter.jdbc.JdbcToArrowConfig;
import org.apache.arrow.adapter.jdbc.JdbcToArrowConfigBuilder;
import org.apache.arrow.adapter.jdbc.JdbcToArrowUtils;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.ibatis.jdbc.ScriptRunner;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Types;
import java.util.HashMap;
import java.util.Map;

// Limit each Arrow batch to 2 rows via setTargetBatchSize, so the iterator
// yields multiple VectorSchemaRoots for the 3-row result.
try (BufferAllocator allocator = new RootAllocator();
     Connection connection = DriverManager.getConnection(
             "jdbc:h2:mem:h2-jdbc-adapter")) {
    ScriptRunner runnerDDLDML = new ScriptRunner(connection);
    runnerDDLDML.setLogWriter(null);
    // Close each script reader after use (they were previously leaked) and
    // read the SQL files as UTF-8 instead of the platform-default charset.
    try (BufferedReader ddl = new BufferedReader(new FileReader(
            "./thirdpartydeps/jdbc/h2-ddl.sql", StandardCharsets.UTF_8))) {
        runnerDDLDML.runScript(ddl);
    }
    try (BufferedReader dml = new BufferedReader(new FileReader(
            "./thirdpartydeps/jdbc/h2-dml.sql", StandardCharsets.UTF_8))) {
        runnerDDLDML.runScript(dml);
    }
    // Plain map instead of double-brace initialization, which creates an
    // anonymous subclass holding a reference to the enclosing scope.
    Map<String, JdbcFieldInfo> arraySubTypes = new HashMap<>();
    arraySubTypes.put("LIST_FIELD19", new JdbcFieldInfo(Types.INTEGER));
    JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator,
            JdbcToArrowUtils.getUtcCalendar())
            .setTargetBatchSize(2) // default is 1024 rows per batch
            .setArraySubTypeByColumnNameMap(arraySubTypes)
            .build();
    try (ResultSet resultSet = connection.createStatement().executeQuery(
            "SELECT int_field1, bool_field2, bigint_field5, char_field16, list_field19 FROM TABLE1");
         ArrowVectorIterator iterator = JdbcToArrow.sqlToArrowVectorIterator(
                 resultSet, config)) {
        while (iterator.hasNext()) {
            // Each VectorSchemaRoot owns Arrow buffers; close it to release them.
            try (VectorSchemaRoot root = iterator.next()) {
                System.out.print(root.contentToTSVString());
            }
        }
    }
} catch (SQLException | IOException e) {
    e.printStackTrace();
}
INT_FIELD1    BOOL_FIELD2    BIGINT_FIELD5    CHAR_FIELD16    LIST_FIELD19
101    true    1000000000300    some char text      [1,2,3]
102    true    100000000030    some char text      [1,2]
INT_FIELD1    BOOL_FIELD2    BIGINT_FIELD5    CHAR_FIELD16    LIST_FIELD19
103    true    10000000003    some char text      [1]

Configuring numeric (decimal) precision and scale

By default, the scale of any decimal values must exactly match the defined scale of the Arrow type of the column, or else an UnsupportedOperationException will be thrown with a message like BigDecimal scale must equal that in the Arrow vector.

This can happen because Arrow infers the type from the ResultSet metadata, which is not accurate for all database drivers. The JDBC adapter lets you avoid this by either overriding the decimal scale, or by providing a RoundingMode via setBigDecimalRoundingMode to convert values to the expected scale.

In this example, we have a BigInt column. By default, the inferred scale is 0. We override the scale to 7 and then set a RoundingMode to convert values to the given scale.

import org.apache.arrow.adapter.jdbc.ArrowVectorIterator;
import org.apache.arrow.adapter.jdbc.JdbcFieldInfo;
import org.apache.arrow.adapter.jdbc.JdbcToArrow;
import org.apache.arrow.adapter.jdbc.JdbcToArrowConfig;
import org.apache.arrow.adapter.jdbc.JdbcToArrowConfigBuilder;
import org.apache.arrow.adapter.jdbc.JdbcToArrowUtils;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.ibatis.jdbc.ScriptRunner;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.math.RoundingMode;
import java.nio.charset.StandardCharsets;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Types;
import java.util.HashMap;
import java.util.Map;

// Override the inferred type of the BIGINT column to DECIMAL(20, 7) and supply
// a RoundingMode so values are rescaled to the Arrow vector's scale.
try (BufferAllocator allocator = new RootAllocator();
     Connection connection = DriverManager.getConnection(
             "jdbc:h2:mem:h2-jdbc-adapter")) {
    ScriptRunner runnerDDLDML = new ScriptRunner(connection);
    runnerDDLDML.setLogWriter(null);
    // Close each script reader after use (they were previously leaked) and
    // read the SQL files as UTF-8 instead of the platform-default charset.
    try (BufferedReader ddl = new BufferedReader(new FileReader(
            "./thirdpartydeps/jdbc/h2-ddl.sql", StandardCharsets.UTF_8))) {
        runnerDDLDML.runScript(ddl);
    }
    try (BufferedReader dml = new BufferedReader(new FileReader(
            "./thirdpartydeps/jdbc/h2-dml.sql", StandardCharsets.UTF_8))) {
        runnerDDLDML.runScript(dml);
    }
    // Plain maps instead of double-brace initialization, which creates an
    // anonymous subclass holding a reference to the enclosing scope.
    Map<String, JdbcFieldInfo> arraySubTypes = new HashMap<>();
    arraySubTypes.put("LIST_FIELD19", new JdbcFieldInfo(Types.INTEGER));
    Map<String, JdbcFieldInfo> explicitTypes = new HashMap<>();
    explicitTypes.put("BIGINT_FIELD5", new JdbcFieldInfo(Types.DECIMAL, 20, 7));
    JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator,
            JdbcToArrowUtils.getUtcCalendar())
            .setTargetBatchSize(2)
            .setArraySubTypeByColumnNameMap(arraySubTypes)
            .setExplicitTypesByColumnName(explicitTypes)
            // Widening scale 0 -> 7 never discards digits, so UNNECESSARY is safe.
            .setBigDecimalRoundingMode(RoundingMode.UNNECESSARY)
            .build();
    try (ResultSet resultSet = connection.createStatement().executeQuery(
            "SELECT int_field1, bool_field2, bigint_field5, char_field16, list_field19 FROM TABLE1");
         ArrowVectorIterator iterator = JdbcToArrow.sqlToArrowVectorIterator(
                 resultSet, config)) {
        while (iterator.hasNext()) {
            // Each VectorSchemaRoot owns Arrow buffers; close it to release them.
            try (VectorSchemaRoot root = iterator.next()) {
                System.out.print(root.contentToTSVString());
            }
        }
    }
} catch (SQLException | IOException e) {
    e.printStackTrace();
}
INT_FIELD1    BOOL_FIELD2    BIGINT_FIELD5    CHAR_FIELD16    LIST_FIELD19
101    true    1000000000300.0000000    some char text      [1,2,3]
102    true    100000000030.0000000    some char text      [1,2]
INT_FIELD1    BOOL_FIELD2    BIGINT_FIELD5    CHAR_FIELD16    LIST_FIELD19
103    true    10000000003.0000000    some char text      [1]