PostgreSQL Recipes
Authenticate with a username and password
Recipe source: postgresql_authenticate.py
To connect to a PostgreSQL database, the username and password must be provided in the URI. For example:
postgresql://username:password@hostname:port/dbname
See the PostgreSQL documentation for full details.
import os

import adbc_driver_postgresql.dbapi

uri = os.environ["ADBC_POSTGRESQL_TEST_URI"]
conn = adbc_driver_postgresql.dbapi.connect(uri)

with conn.cursor() as cur:
    cur.execute("SELECT 1")
    print(cur.fetchone())
    # Output: (1,)

conn.close()
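If the username or password contains reserved characters such as @, :, or /, each component must be percent-encoded for the URI to parse correctly. A minimal sketch using the standard library (the credentials and host here are placeholders):
import urllib.parse

# Percent-encode each component separately; quote_plus escapes
# characters like "@" and "/" that would otherwise break the URI.
username = urllib.parse.quote_plus("alice")
password = urllib.parse.quote_plus("p@ss/word")
uri = f"postgresql://{username}:{password}@localhost:5432/postgres"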
Create/append to a table from an Arrow dataset
Recipe source: postgresql_create_dataset_table.py
ADBC makes it easy to load PyArrow datasets into your datastore.
import os
import tempfile
from pathlib import Path

import pyarrow
import pyarrow.csv
import pyarrow.dataset
import pyarrow.feather
import pyarrow.parquet

import adbc_driver_postgresql.dbapi

uri = os.environ["ADBC_POSTGRESQL_TEST_URI"]
conn = adbc_driver_postgresql.dbapi.connect(uri)
For the purposes of testing, we’ll first make sure the tables we’re about to use don’t exist.
with conn.cursor() as cur:
    cur.execute("DROP TABLE IF EXISTS csvtable")
    cur.execute("DROP TABLE IF EXISTS ipctable")
    cur.execute("DROP TABLE IF EXISTS pqtable")
    cur.execute("DROP TABLE IF EXISTS csvdataset")
    cur.execute("DROP TABLE IF EXISTS ipcdataset")
    cur.execute("DROP TABLE IF EXISTS pqdataset")

conn.commit()
Generating sample data
tempdir = tempfile.TemporaryDirectory(
    prefix="adbc-docs-",
    ignore_cleanup_errors=True,
)
root = Path(tempdir.name)
table = pyarrow.table(
    [
        [1, 1, 2],
        ["foo", "bar", "baz"],
    ],
    names=["ints", "strs"],
)
First we’ll write single files.
csv_file = root / "example.csv"
pyarrow.csv.write_csv(table, csv_file)

ipc_file = root / "example.arrow"
pyarrow.feather.write_feather(table, ipc_file)

parquet_file = root / "example.parquet"
pyarrow.parquet.write_table(table, parquet_file)
We’ll also generate some partitioned datasets.
csv_dataset = root / "csv_dataset"
pyarrow.dataset.write_dataset(
    table,
    csv_dataset,
    format="csv",
    partitioning=["ints"],
)

ipc_dataset = root / "ipc_dataset"
pyarrow.dataset.write_dataset(
    table,
    ipc_dataset,
    format="feather",
    partitioning=["ints"],
)

parquet_dataset = root / "parquet_dataset"
pyarrow.dataset.write_dataset(
    table,
    parquet_dataset,
    format="parquet",
    partitioning=["ints"],
)
Loading CSV Files into PostgreSQL
We can directly pass a pyarrow.RecordBatchReader (from open_csv) to adbc_ingest. We can also pass a pyarrow.dataset.Dataset, or a pyarrow.dataset.Scanner (a Scanner sketch follows the CSV example below).
with conn.cursor() as cur:
    reader = pyarrow.csv.open_csv(csv_file)
    cur.adbc_ingest("csvtable", reader, mode="create")

    reader = pyarrow.dataset.dataset(
        csv_dataset,
        format="csv",
        partitioning=["ints"],
    )
    cur.adbc_ingest("csvdataset", reader, mode="create")

conn.commit()

with conn.cursor() as cur:
    cur.execute("SELECT ints, strs FROM csvtable ORDER BY ints, strs ASC")
    assert cur.fetchall() == [(1, "bar"), (1, "foo"), (2, "baz")]

    cur.execute("SELECT ints, strs FROM csvdataset ORDER BY ints, strs ASC")
    assert cur.fetchall() == [(1, "bar"), (1, "foo"), (2, "baz")]
Loading Arrow IPC (Feather) Files into PostgreSQL
with conn.cursor() as cur:
    reader = pyarrow.ipc.RecordBatchFileReader(ipc_file)
Because of quirks in the PyArrow API, we have to read the file into memory.
    cur.adbc_ingest("ipctable", reader.read_all(), mode="create")
The Dataset API, though, will stream the data into PostgreSQL batch by batch instead of materializing the whole file in memory.
    reader = pyarrow.dataset.dataset(
        ipc_dataset,
        format="feather",
        partitioning=["ints"],
    )
    cur.adbc_ingest("ipcdataset", reader, mode="create")

conn.commit()

with conn.cursor() as cur:
    cur.execute("SELECT ints, strs FROM ipctable ORDER BY ints, strs ASC")
    assert cur.fetchall() == [(1, "bar"), (1, "foo"), (2, "baz")]

    cur.execute("SELECT ints, strs FROM ipcdataset ORDER BY ints, strs ASC")
    assert cur.fetchall() == [(1, "bar"), (1, "foo"), (2, "baz")]
Loading Parquet Files into PostgreSQL
with conn.cursor() as cur:
    reader = pyarrow.parquet.ParquetFile(parquet_file)
    cur.adbc_ingest("pqtable", reader.iter_batches(), mode="create")

    reader = pyarrow.dataset.dataset(
        parquet_dataset,
        format="parquet",
        partitioning=["ints"],
    )
    cur.adbc_ingest("pqdataset", reader, mode="create")

conn.commit()

with conn.cursor() as cur:
    cur.execute("SELECT ints, strs FROM pqtable ORDER BY ints, strs ASC")
    assert cur.fetchall() == [(1, "bar"), (1, "foo"), (2, "baz")]

    cur.execute("SELECT ints, strs FROM pqdataset ORDER BY ints, strs ASC")
    assert cur.fetchall() == [(1, "bar"), (1, "foo"), (2, "baz")]
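Data round-trips as Arrow, too: in addition to the standard DBAPI fetch methods, ADBC cursors provide fetch_arrow_table(). A short sketch (our own addition) reusing the pqtable from above:
with conn.cursor() as cur:
    cur.execute("SELECT ints, strs FROM pqtable ORDER BY ints, strs ASC")
    # fetch_arrow_table() returns the full result set as a pyarrow.Table.
    result = cur.fetch_arrow_table()
    assert result.column_names == ["ints", "strs"]
    assert result.num_rows == 3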
Cleanup
conn.close()
tempdir.cleanup()
Create/append to a table from an Arrow table
Recipe source: postgresql_create_append_table.py
ADBC allows creating and appending to database tables using Arrow tables.
import os

import pyarrow

import adbc_driver_postgresql.dbapi

uri = os.environ["ADBC_POSTGRESQL_TEST_URI"]
conn = adbc_driver_postgresql.dbapi.connect(uri)
For the purposes of testing, we’ll first make sure the tables we’re about to use don’t exist.
with conn.cursor() as cur:
    cur.execute("DROP TABLE IF EXISTS example")
    cur.execute("DROP TABLE IF EXISTS example2")
Now we can create the table.
with conn.cursor() as cur:
    data = pyarrow.table(
        [
            [1, 2, None, 4],
        ],
        schema=pyarrow.schema(
            [
                ("ints", "int32"),
            ]
        ),
    )
    cur.adbc_ingest("example", data, mode="create")

conn.commit()
After ingestion, we can fetch the result.
with conn.cursor() as cur:
    cur.execute("SELECT * FROM example")
    assert cur.fetchone() == (1,)
    assert cur.fetchone() == (2,)

    cur.execute("SELECT COUNT(*) FROM example")
    assert cur.fetchone() == (4,)
If we try to ingest again, it’ll fail, because the table already exists.
with conn.cursor() as cur:
    try:
        cur.adbc_ingest("example", data, mode="create")
    except conn.ProgrammingError:
        pass
    else:
        raise RuntimeError("Should have failed!")

conn.rollback()
Instead, we can append to the table.
with conn.cursor() as cur:
    cur.adbc_ingest("example", data, mode="append")

    cur.execute("SELECT COUNT(*) FROM example")
    assert cur.fetchone() == (8,)
We can also choose to create the table if it doesn’t exist, and otherwise append.
with conn.cursor() as cur:
    cur.adbc_ingest("example2", data, mode="create_append")

    cur.execute("SELECT COUNT(*) FROM example2")
    assert cur.fetchone() == (4,)

    cur.adbc_ingest("example2", data, mode="create_append")

    cur.execute("SELECT COUNT(*) FROM example2")
    assert cur.fetchone() == (8,)
Finally, we can replace the table.
with conn.cursor() as cur:
    cur.adbc_ingest("example", data.slice(0, 2), mode="replace")

    cur.execute("SELECT COUNT(*) FROM example")
    assert cur.fetchone() == (2,)

conn.close()
Create/append to a temporary table
Recipe source: postgresql_create_temp_table.py
ADBC allows creating and appending to temporary tables as well.
import os

import pyarrow

import adbc_driver_postgresql.dbapi

uri = os.environ["ADBC_POSTGRESQL_TEST_URI"]
conn = adbc_driver_postgresql.dbapi.connect(uri)
For the purposes of testing, we’ll first make sure the tables we’re about to use don’t exist.
with conn.cursor() as cur:
    cur.execute("DROP TABLE IF EXISTS example")
To create a temporary table, just specify the option “temporary”.
data = pyarrow.table(
    [
        [1, 2, None, 4],
    ],
    schema=pyarrow.schema(
        [
            ("ints", "int32"),
        ]
    ),
)

with conn.cursor() as cur:
    cur.adbc_ingest("example", data, mode="create", temporary=True)

conn.commit()
After ingestion, we can fetch the result.
with conn.cursor() as cur:
    cur.execute("SELECT * FROM example")
    assert cur.fetchone() == (1,)
    assert cur.fetchone() == (2,)

    cur.execute("SELECT COUNT(*) FROM example")
    assert cur.fetchone() == (4,)
Temporary tables are separate from regular tables, even if they have the same name.
with conn.cursor() as cur:
    cur.adbc_ingest("example", data.slice(0, 2), mode="create", temporary=False)

conn.commit()

with conn.cursor() as cur:
Because we have two tables with the same name, we have to explicitly reference the normal (non-temporary) table here.
    cur.execute("SELECT COUNT(*) FROM public.example")
    assert cur.fetchone() == (2,)

    cur.execute("SELECT COUNT(*) FROM example")
    assert cur.fetchone() == (4,)

conn.close()
After closing the connection, the temporary table is implicitly dropped. If we reconnect, the table won’t exist; we’ll see only the ‘normal’ table.
with adbc_driver_postgresql.dbapi.connect(uri) as conn:
    with conn.cursor() as cur:
        cur.execute("SELECT COUNT(*) FROM example")
        assert cur.fetchone() == (2,)
All the regular ingestion options apply to temporary tables, too. See Create/append to a table from an Arrow dataset for more examples.
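For instance, append mode works with temporary tables too. A minimal sketch (our own addition; the table name example_tmp is hypothetical, and the data table from above is reused):
with adbc_driver_postgresql.dbapi.connect(uri) as conn:
    with conn.cursor() as cur:
        # Create a temporary table, then append to it in a second call.
        cur.adbc_ingest("example_tmp", data, mode="create", temporary=True)
        cur.adbc_ingest("example_tmp", data, mode="append", temporary=True)

        cur.execute("SELECT COUNT(*) FROM example_tmp")
        assert cur.fetchone() == (8,)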
Execute a statement with bind parameters
Recipe source: postgresql_execute_bind.py
ADBC allows using Python and Arrow values as bind parameters. Right now, the PostgreSQL driver only supports bind parameters for queries that don’t generate result sets.
import os

import pyarrow

import adbc_driver_postgresql.dbapi

uri = os.environ["ADBC_POSTGRESQL_TEST_URI"]
conn = adbc_driver_postgresql.dbapi.connect(uri)
We’ll create an example table to test.
with conn.cursor() as cur:
    cur.execute("DROP TABLE IF EXISTS example")
    cur.execute("CREATE TABLE example (ints INT, bigints BIGINT)")

conn.commit()
We can bind Python values:
with conn.cursor() as cur:
    cur.executemany("INSERT INTO example VALUES ($1, $2)", [(1, 2), (3, 4)])

    cur.execute("SELECT SUM(ints) FROM example")
    assert cur.fetchone() == (4,)
Note
If you're used to the format-string style %s syntax that libraries like psycopg use for bind parameters, note that this is not supported; only the PostgreSQL-native $1 syntax is.
We can also bind Arrow values:
with conn.cursor() as cur:
    data = pyarrow.record_batch(
        [
            [5, 6],
            [7, 8],
        ],
        names=["$1", "$2"],
    )
    cur.executemany("INSERT INTO example VALUES ($1, $2)", data)

    cur.execute("SELECT SUM(ints) FROM example")
    assert cur.fetchone() == (15,)

conn.close()
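Single statements accept parameters through the standard DBAPI execute() as well, not just executemany(). A small sketch (our own addition; the values are arbitrary, and a fresh connection is opened since the one above is closed):
with adbc_driver_postgresql.dbapi.connect(uri) as conn:
    with conn.cursor() as cur:
        # Bind one row of Python values to $1 and $2.
        cur.execute("INSERT INTO example VALUES ($1, $2)", (9, 10))
    conn.commit()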
Get the Arrow schema of a table
Recipe source: postgresql_get_table_schema.py
ADBC lets you get the schema of a table as an Arrow schema.
import os

import pyarrow

import adbc_driver_postgresql.dbapi

uri = os.environ["ADBC_POSTGRESQL_TEST_URI"]
conn = adbc_driver_postgresql.dbapi.connect(uri)
We’ll create some example tables to test.
with conn.cursor() as cur:
    cur.execute("DROP TABLE IF EXISTS example")
    cur.execute("CREATE TABLE example (ints INT, bigints BIGINT)")

    cur.execute("CREATE SCHEMA IF NOT EXISTS other_schema")
    cur.execute("DROP TABLE IF EXISTS other_schema.example")
    # "values" is a reserved word in PostgreSQL, so it must be quoted.
    cur.execute('CREATE TABLE other_schema.example (strings TEXT, "values" NUMERIC)')

conn.commit()
By default the “active” catalog/schema are assumed.
assert conn.adbc_get_table_schema("example") == pyarrow.schema(
    [
        ("ints", "int32"),
        ("bigints", "int64"),
    ]
)
We can explicitly specify the PostgreSQL schema to get the Arrow schema of a table in a different namespace.
Note
In PostgreSQL, you can only query the database (catalog) that you are connected to. So we cannot specify the catalog here (or rather, there is no point in doing so).
Note that the NUMERIC column is read as a string, because PostgreSQL decimals do not map onto Arrow decimals.
assert conn.adbc_get_table_schema(
    "example",
    db_schema_filter="other_schema",
) == pyarrow.schema(
    [
        ("strings", "string"),
        ("values", "string"),
    ]
)

conn.close()
Get the Arrow schema of a query
Recipe source: postgresql_get_query_schema.py
ADBC lets you get the schema of a result set, without executing the query.
import os

import pyarrow

import adbc_driver_postgresql.dbapi

uri = os.environ["ADBC_POSTGRESQL_TEST_URI"]
conn = adbc_driver_postgresql.dbapi.connect(uri)
We’ll create an example table to test.
with conn.cursor() as cur:
    cur.execute("DROP TABLE IF EXISTS example")
    cur.execute("CREATE TABLE example (ints INT, bigints BIGINT)")

conn.commit()

expected = pyarrow.schema(
    [
        ("ints", "int32"),
        ("bigints", "int64"),
    ]
)

with conn.cursor() as cur:
    assert cur.adbc_execute_schema("SELECT * FROM example") == expected
PostgreSQL doesn’t know the type here, so it just returns a guess.
    assert cur.adbc_execute_schema("SELECT $1 AS res") == pyarrow.schema(
        [
            ("res", "string"),
        ]
    )

conn.close()
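If the guess isn't what you want, an explicit cast in the SQL pins down the parameter's type, and we'd expect the reported schema to follow. A sketch (our own addition; it reopens the connection, and the int64 result is an assumption about how the driver maps PostgreSQL's bigint):
with adbc_driver_postgresql.dbapi.connect(uri) as conn:
    with conn.cursor() as cur:
        # The ::bigint cast should make the result column int64.
        assert cur.adbc_execute_schema("SELECT $1::bigint AS res") == pyarrow.schema(
            [
                ("res", "int64"),
            ]
        )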
List catalogs, schemas, and tables
Recipe source: postgresql_list_catalogs.py
ADBC allows listing tables, catalogs, and schemas in the database.
import os

import adbc_driver_postgresql.dbapi

uri = os.environ["ADBC_POSTGRESQL_TEST_URI"]
conn = adbc_driver_postgresql.dbapi.connect(uri)
We’ll create an example table to look for.
with conn.cursor() as cur:
    cur.execute("DROP TABLE IF EXISTS example")
    cur.execute("CREATE TABLE example (ints INT, bigints BIGINT)")

conn.commit()
The data is given as a PyArrow RecordBatchReader.
objects = conn.adbc_get_objects(depth="all").read_all()
We’ll convert it to plain Python data for convenience.
objects = objects.to_pylist()
catalog = objects[0]
assert catalog["catalog_name"] == "postgres"

db_schema = catalog["catalog_db_schemas"][0]
assert db_schema["db_schema_name"] == "public"

tables = db_schema["db_schema_tables"]
example = [table for table in tables if table["table_name"] == "example"]
assert len(example) == 1
example = example[0]

assert example["table_columns"][0]["column_name"] == "ints"
assert example["table_columns"][1]["column_name"] == "bigints"

conn.close()
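adbc_get_objects also accepts filter arguments, so the driver can narrow the listing instead of filtering in Python. A minimal sketch (our own addition; it reopens the connection and filters on the table name):
with adbc_driver_postgresql.dbapi.connect(uri) as conn:
    # Restrict the listing to tables named "example".
    objects = conn.adbc_get_objects(
        depth="all",
        table_name_filter="example",
    ).read_all()
    catalogs = objects.to_pylist()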
Connection pooling with SQLAlchemy
Recipe source: postgresql_pool.py
ADBC does not implement connection pooling, as this is not generally a feature of DBAPI drivers. Instead, use a third party connection pool like the one built into SQLAlchemy.
import os

import sqlalchemy.pool

import adbc_driver_postgresql.dbapi

uri = os.environ["ADBC_POSTGRESQL_TEST_URI"]

source = adbc_driver_postgresql.dbapi.connect(uri)
adbc_driver_manager.dbapi.Connection.adbc_clone() opens a new connection from an existing connection, sharing internal resources where possible. For example, the PostgreSQL driver will share the internal OID cache, saving some overhead on connection.
pool = sqlalchemy.pool.QueuePool(source.adbc_clone, max_overflow=1, pool_size=2)
We can now get connections out of the pool; SQLAlchemy overrides close() to return the connection to the pool.
Note
SQLAlchemy's wrapper does not support the context manager protocol, unlike the underlying ADBC connection (see the sketch at the end of this recipe for a workaround).
conn = pool.connect()

assert pool.checkedin() == 0
assert pool.checkedout() == 1

with conn.cursor() as cur:
    cur.execute("SELECT 1")
    assert cur.fetchone() == (1,)

conn.close()

assert pool.checkedin() == 1
assert pool.checkedout() == 0

source.close()
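Because the pooled wrapper is not a context manager, contextlib.closing from the standard library gives similar ergonomics: closing() calls close() on exit, which hands the connection back to the pool. A sketch (our own addition; it assumes the pool from above has not yet been closed):
import contextlib

with contextlib.closing(pool.connect()) as conn:
    with conn.cursor() as cur:
        cur.execute("SELECT 1")
        assert cur.fetchone() == (1,)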
Using Pandas and ADBC
Recipe source: postgresql_pandas.py
ADBC is integrated into pandas, a popular dataframe library. Pandas can use ADBC to exchange data with PostgreSQL and other databases. Compared to using SQLAlchemy or other options, using ADBC with pandas can have better performance, such as by avoiding excess conversions to and from Python objects.
import os

import pandas as pd

import adbc_driver_postgresql.dbapi

uri = os.environ["ADBC_POSTGRESQL_TEST_URI"]
conn = adbc_driver_postgresql.dbapi.connect(uri)
We'll use pd.DataFrame.to_sql to create a sample table.
data = pd.DataFrame(
    {
        "ints": [1, 2, None, 4],
        "strs": ["a", "b", "c", "d"],
    }
)
data.to_sql("example", conn, if_exists="replace")
conn.commit()
After creating the table, we can pass an ADBC connection and a SQL query to pd.read_sql to get the result set as a pandas DataFrame.
df = pd.read_sql("SELECT * FROM example WHERE ints > 1", conn)

assert len(df) == 2

conn.close()
Compared to the ADBC interface, pandas offers a more convenient and higher level API, especially for those already using pandas.
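With pandas 2.0 or newer, read_sql can also keep the result Arrow-backed end to end instead of converting to NumPy, which pairs well with an ADBC driver. A sketch (our own addition; assumes pandas >= 2.0 and reuses the example table):
with adbc_driver_postgresql.dbapi.connect(uri) as conn:
    # dtype_backend="pyarrow" yields ArrowDtype columns instead of NumPy.
    df = pd.read_sql("SELECT * FROM example", conn, dtype_backend="pyarrow")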
Using Polars and ADBC
Recipe source: postgresql_polars.py
ADBC can be used with Polars, a dataframe library written in Rust. As per its documentation:
If the backend supports returning Arrow data directly then this facility will be used to efficiently instantiate the DataFrame; otherwise, the DataFrame is initialised from row-wise data.
Obviously, ADBC returns Arrow data directly, making ADBC and Polars a natural fit for each other.
import os

import polars as pl

uri = os.environ["ADBC_POSTGRESQL_TEST_URI"]
We'll use Polars to create a sample table with polars.DataFrame.write_database(). We don't need to open an ADBC connection ourselves with Polars.
data = pl.DataFrame(
    {
        "ints": [1, 2, None, 4],
        "strs": ["a", "b", "c", "d"],
    }
)
data.write_database("example", uri, engine="adbc", if_table_exists="replace")
After creating the table, we can use polars.read_database_uri() to fetch the result. Again, we can just pass the URI and tell Polars to manage ADBC for us.
df = pl.read_database_uri("SELECT * FROM example WHERE ints > 1", uri, engine="adbc")

assert len(df) == 2
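Polars can also use a connection you manage yourself, via polars.read_database(). A sketch (our own addition; it opens the ADBC connection explicitly rather than handing Polars the URI):
import adbc_driver_postgresql.dbapi

with adbc_driver_postgresql.dbapi.connect(uri) as conn:
    df = pl.read_database("SELECT * FROM example WHERE ints > 1", connection=conn)

assert len(df) == 2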