rand[om]

rand[om]

med ∩ ml

Query a list of dictionaries with DuckDB

Table of contents

How to query a list of Python dictionaries using DuckDB, without previously converting to another format.

We can pass the list of dictionaries to the query directly and use unnest to convert to a list of rows. We can also expand the struct to multiple columns.

Note: all the dictionaries must have the same keys.

# /// script
# requires-python = ">=3.12"
# dependencies = ["duckdb"]
# ///

import duckdb


DATA = [
    {"foo": 1, "bar": "some string", "baz": 1.23},
    {"foo": 2, "bar": "some other string", "baz": 2.34},
    {"foo": 3, "bar": "yet another string", "baz": 3.45},
]

# Select as struct
rel = duckdb.query("SELECT unnest($data) as x", params={"data": DATA})

# Expand struct to rows
rel = duckdb.query("SELECT x.* FROM (SELECT unnest($data) as x)", params={"data": DATA})

result = duckdb.query("SELECT * FROM rel WHERE foo > 1")

print(result.fetchall())
# [(2, 'some other string', 2.34), (3, 'yet another string', 3.45)]

Here’s a longer script, including timings, and different approaches. I’m using .fetchdf().to_markdown() to pretty-print the results:

# /// script
# requires-python = ">=3.12"
# dependencies = ["pandas", "numpy", "duckdb", "tabulate"]
# ///


import time

import duckdb
import pandas as pd

DATA = [
    {"foo": 1, "bar": "some string", "baz": 1.23},
    {"foo": 2, "bar": "some other string", "baz": 2.34},
    {"foo": 3, "bar": "yet another string", "baz": 3.45},
]


print("=" * 20, "Fetching as struct\n")
start = time.perf_counter()
rel = duckdb.query("SELECT unnest($data) as x", params={"data": DATA})
end = time.perf_counter()

print(rel.fetchdf().to_markdown(index=False))
print(f"\n\n\tTime taken: {end - start} seconds\n\n")


print("=" * 20, "Fetching as expanded columns\n")
start = time.perf_counter()
rel = duckdb.query(
    "SELECT x.* FROM (SELECT unnest($data) as x)", params={"data": DATA}
)
end = time.perf_counter()
print(rel.fetchdf().to_markdown(index=False))
print(f"\n\n\tTime taken: {end - start} seconds\n\n")


print("=" * 20, "Pre-converting to DF\n")
df = pd.DataFrame(DATA)

start = time.perf_counter()
rel = duckdb.query("SELECT * FROM df")
end = time.perf_counter()
print(rel.fetchdf().to_markdown(index=False))
print(f"\n\n\tTime taken: {end - start} seconds\n\n")


print("=" * 20, "Adding data with different types will fail\n")
DATA.append({"foo": 4, "new_key": 4.56})


x = (
    duckdb.query("SELECT unnest($data) as x", params={"data": DATA})
    .fetchdf()
    .to_markdown(index=False)
)
print(x)

Output:

==================== Fetching as struct

| x                                                    |
|:-----------------------------------------------------|
| {'foo': 1, 'bar': 'some string', 'baz': 1.23}        |
| {'foo': 2, 'bar': 'some other string', 'baz': 2.34}  |
| {'foo': 3, 'bar': 'yet another string', 'baz': 3.45} |


        Time taken: 0.0007178339874371886 seconds


==================== Fetching as expanded columns

|   foo | bar                |   baz |
|------:|:-------------------|------:|
|     1 | some string        |  1.23 |
|     2 | some other string  |  2.34 |
|     3 | yet another string |  3.45 |


        Time taken: 0.0004723750171251595 seconds


==================== Pre-converting to DF

|   foo | bar                |   baz |
|------:|:-------------------|------:|
|     1 | some string        |  1.23 |
|     2 | some other string  |  2.34 |
|     3 | yet another string |  3.45 |


        Time taken: 0.00047075003385543823 seconds


==================== Adding data with different types will fail

Traceback (most recent call last):
  File "/Users/r/projects/testing/duck_pydata.py", line 53, in <module>
    duckdb.query("SELECT unnest($data) as x", params={"data": DATA})
duckdb.duckdb.TypeMismatchException: Mismatch Type Error: Type STRUCT(foo INTEGER, new_key DOUBLE) does not match with STRUCT(foo INTEGER, bar VARCHAR, baz DOUBLE). Cannot cast STRUCTs of
 different size