Query a list of dictionaries with DuckDB
Table of contents
How to query a list of Python dictionaries using DuckDB, without previously converting to another format.
We can pass the list of dictionaries to the query directly and use unnest to convert to a list of rows. We can also expand the struct to multiple columns.
Note: all the dictionaries must have the same keys.
# /// script
# requires-python = ">=3.12"
# dependencies = ["duckdb"]
# ///
import duckdb
DATA = [
{"foo": 1, "bar": "some string", "baz": 1.23},
{"foo": 2, "bar": "some other string", "baz": 2.34},
{"foo": 3, "bar": "yet another string", "baz": 3.45},
]
# Select as struct
rel = duckdb.query("SELECT unnest($data) as x", params={"data": DATA})
# Expand struct to rows
rel = duckdb.query("SELECT x.* FROM (SELECT unnest($data) as x)", params={"data": DATA})
result = duckdb.query("SELECT * FROM rel WHERE foo > 1")
print(result.fetchall())
# [(2, 'some other string', 2.34), (3, 'yet another string', 3.45)]
Here’s a longer script, including timings, and different approaches. I’m using .fetchdf().to_markdown()
to pretty-print the results:
# /// script
# requires-python = ">=3.12"
# dependencies = ["pandas", "numpy", "duckdb", "tabulate"]
# ///
import time
import duckdb
import pandas as pd
DATA = [
{"foo": 1, "bar": "some string", "baz": 1.23},
{"foo": 2, "bar": "some other string", "baz": 2.34},
{"foo": 3, "bar": "yet another string", "baz": 3.45},
]
print("=" * 20, "Fetching as struct\n")
start = time.perf_counter()
rel = duckdb.query("SELECT unnest($data) as x", params={"data": DATA})
end = time.perf_counter()
print(rel.fetchdf().to_markdown(index=False))
print(f"\n\n\tTime taken: {end - start} seconds\n\n")
print("=" * 20, "Fetching as expanded columns\n")
start = time.perf_counter()
rel = duckdb.query(
"SELECT x.* FROM (SELECT unnest($data) as x)", params={"data": DATA}
)
end = time.perf_counter()
print(rel.fetchdf().to_markdown(index=False))
print(f"\n\n\tTime taken: {end - start} seconds\n\n")
print("=" * 20, "Pre-converting to DF\n")
df = pd.DataFrame(DATA)
start = time.perf_counter()
rel = duckdb.query("SELECT * FROM df")
end = time.perf_counter()
print(rel.fetchdf().to_markdown(index=False))
print(f"\n\n\tTime taken: {end - start} seconds\n\n")
print("=" * 20, "Adding data with different types will fail\n")
DATA.append({"foo": 4, "new_key": 4.56})
x = (
duckdb.query("SELECT unnest($data) as x", params={"data": DATA})
.fetchdf()
.to_markdown(index=False)
)
print(x)
Output:
==================== Fetching as struct
| x |
|:-----------------------------------------------------|
| {'foo': 1, 'bar': 'some string', 'baz': 1.23} |
| {'foo': 2, 'bar': 'some other string', 'baz': 2.34} |
| {'foo': 3, 'bar': 'yet another string', 'baz': 3.45} |
Time taken: 0.0007178339874371886 seconds
==================== Fetching as expanded columns
| foo | bar | baz |
|------:|:-------------------|------:|
| 1 | some string | 1.23 |
| 2 | some other string | 2.34 |
| 3 | yet another string | 3.45 |
Time taken: 0.0004723750171251595 seconds
==================== Pre-converting to DF
| foo | bar | baz |
|------:|:-------------------|------:|
| 1 | some string | 1.23 |
| 2 | some other string | 2.34 |
| 3 | yet another string | 3.45 |
Time taken: 0.00047075003385543823 seconds
==================== Adding data with different types will fail
Traceback (most recent call last):
File "/Users/r/projects/testing/duck_pydata.py", line 53, in <module>
duckdb.query("SELECT unnest($data) as x", params={"data": DATA})
duckdb.duckdb.TypeMismatchException: Mismatch Type Error: Type STRUCT(foo INTEGER, new_key DOUBLE) does not match with STRUCT(foo INTEGER, bar VARCHAR, baz DOUBLE). Cannot cast STRUCTs of
different size