Easy GraphQL Querying (SQL)
This uses the http_client and json extensions to create a (relatively) flexible, easy-to-use GraphQL reader. It only supports POST, but it would be easy to change to GET. Parameters control what is extracted from the response and how it is shaped into a DuckDB table.
Execute this SQL
INSTALL http_client FROM community;
LOAD json;
LOAD http_client;

-- Run a GraphQL query and get back fields with various diagnostics and intermediate values
CREATE OR REPLACE MACRO query_graphql(
    endpoint, query,            -- GraphQL endpoint and query
    params:=NULL,               -- GraphQL parameters
    headers:=MAP{},             -- HTTP headers (e.g., auth)
    result_path:='$',           -- json_path to use as results
    result_structure:=NULL,     -- json_structure for results, or auto detect
    strict:=true                -- Error on failure to transform results
) AS TABLE
SELECT
    response: http_post(endpoint, headers,
        IF(params IS NULL, {'query': query}, {'query': query, 'params': params})),
    body: (response->>'body')::JSON,
    json: body->result_path,
    structure: IF(result_structure IS NULL, json_structure(json), result_structure),
    result: IF(strict, from_json_strict(json, structure), from_json(json, structure));

-- Simple helper to extract GraphQL results to a "normal" table
CREATE OR REPLACE MACRO read_graphql(
    endpoint, query,            -- Same as above ...
    params:=NULL,
    headers:=MAP{},
    result_path:='$',
    result_structure:=NULL,
    strict:=true,               -- ... same as above
    unnest_levels:=0            -- Apply unnest this many times to create the table
) AS TABLE
WITH nested AS (
    SELECT [result] AS result
    FROM query_graphql(endpoint, query,
        params:=params, headers:=headers, result_path:=result_path,
        result_structure:=result_structure, strict:=strict)
)
SELECT unnest(result, max_depth:=unnest_levels+1) AS result
FROM nested;

------------------- Example usage ---------------------

FROM read_graphql('https://rickandmortyapi.com/graphql', $$
    query Query($name: String) {
        characters(page: 2, filter: {name: $name}) {
            info { count }
            results { id name gender }
        }
    }
$$,
    params:={'name': 'Morty'},
    result_path:='$.data.characters.results',
    result_structure:=[{id:'int', name:'string', gender:'string'}],
    unnest_levels:=2
);

-- Example Results
------------------
-- ┌─────────┬────────────────────┬─────────┐
-- │   id    │        name        │ gender  │
-- │  int32  │      varchar       │ varchar │
-- ├─────────┼────────────────────┼─────────┤
-- │      21 │ Aqua Morty         │ Male    │
-- ...
-- ├─────────┴────────────────────┴─────────┤
-- │ 20 rows                      3 columns │
-- └────────────────────────────────────────┘
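Because query_graphql also exposes the intermediate body, json, and structure columns, it can be used to debug a result_path or to discover a structure before pinning down result_structure. A minimal sketch against the same public endpoint (the simplified query here is mine, not part of the original snippet):

-- Inspect the auto-detected json_structure before hard-coding result_structure
SELECT structure, json
FROM query_graphql(
    'https://rickandmortyapi.com/graphql',
    $$ query { characters(page: 1) { results { id name } } } $$,
    result_path:='$.data.characters.results'
);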
DuckDB to GeoJSON (SQL)
This snippet writes a correctly formatted GeoJSON file from any table or query, using the spatial and json extensions.
Execute this SQL
-- ST_GeomFromText and the GDAL copy driver come from the spatial extension
INSTALL spatial;
LOAD spatial;

COPY (
    SELECT
        *,  -- all these fields will become properties in the resulting GeoJSON (doesn't work well with map-like columns)
        ST_GeomFromText(wkt) AS geometry  -- this is the main geometry entry
    FROM temp_table
) TO 'test.geojson'
WITH (FORMAT GDAL, DRIVER 'GeoJSON', LAYER_NAME 'test');
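To sanity-check the export, the file can be read straight back with the spatial extension's ST_Read (a quick sketch reusing the file name from above):

-- Read the exported GeoJSON back and eyeball the geometry plus the property columns
SELECT *
FROM ST_Read('test.geojson')
LIMIT 5;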
Export/import/share DuckDB UI Notebooks
The DuckDB UI stores notebook content in an internal database called _duckdb_ui. You can query and export notebook content, as well as insert new definitions into the database. Warning: Modifying the internal database may lead to corruption and data loss. Be cautious and use it at your own risk!
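To find the title of the notebook you want to export, you can first list what is stored. This is a sketch based on the columns used in the snippets below; the internal schema is undocumented and may change between UI versions:

-- list current (non-expired) notebook versions stored by the DuckDB UI
select notebook_id, version, title, created
from _duckdb_ui.notebook_versions
where expires is null
order by created;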
Export a notebook definition to JSON (SQL)
copy (
    select "json"
    from _duckdb_ui.notebook_versions
    where 1=1
        and title = 'MySingleNotebook'
        and expires is null
) to 'exported-notebook.json';
Import a notebook definition (SQL)
set variable notebook_content = (select json from 'exported-notebook.json');
set variable notebook_id = uuid();
set variable current_timestamp = now();

begin transaction;

insert into _duckdb_ui.notebooks (id, name, created)
select
    getvariable('notebook_id'),
    'notebook_' || getvariable('notebook_id'),
    getvariable('current_timestamp')
;

insert into _duckdb_ui.notebook_versions (notebook_id, version, title, json, created, expires)
select
    getvariable('notebook_id'),
    1,
    'imported-notebook-' || getvariable('current_timestamp'),
    getvariable('notebook_content'),
    getvariable('current_timestamp'),
    null
;

commit;
Detect Schema Changes Across Datasets (Python)
Compare the schemas of two datasets and identify any differences.
Execute this Python
import duckdb

def compare_schemas(file1, file2):
    """
    Compare the schemas of two datasets and return them for inspection.

    Args:
        file1 (str): Path to the first CSV dataset.
        file2 (str): Path to the second CSV dataset.

    Returns:
        dict: The schema of each file, as returned by DuckDB's DESCRIBE.
    """
    con = duckdb.connect()
    schema1 = con.execute(f"DESCRIBE SELECT * FROM read_csv_auto('{file1}')").fetchall()
    schema2 = con.execute(f"DESCRIBE SELECT * FROM read_csv_auto('{file2}')").fetchall()
    return {"file1_schema": schema1, "file2_schema": schema2}

# Example Usage
differences = compare_schemas("data1.csv", "data2.csv")
print(differences)
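The function above returns both schemas for inspection; if you only want the columns that differ, the comparison can also be done in plain SQL with EXCEPT, since DuckDB lets you query a DESCRIBE as a subquery (a sketch reusing the example file names):

-- Columns (name, type) present in data1.csv but missing or typed differently in data2.csv;
-- swap the operands for the other direction
SELECT column_name, column_type
FROM (DESCRIBE SELECT * FROM read_csv_auto('data1.csv'))
EXCEPT
SELECT column_name, column_type
FROM (DESCRIBE SELECT * FROM read_csv_auto('data2.csv'));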
Remove Duplicate Records from a CSV File (Bash)
This function helps clean up a dataset by identifying and removing duplicate records. It’s especially useful for ensuring data integrity before analysis.
Execute this Bash
#!/bin/bash

function remove_duplicates() {
    input_file="$1"   # Input CSV file with duplicates
    output_file="$2"  # Deduplicated output CSV file

    # Use DuckDB to remove duplicate rows and write the cleaned data to a new CSV file.
    duckdb -c "COPY (SELECT DISTINCT * FROM read_csv_auto('$input_file')) TO '$output_file' (FORMAT CSV, HEADER TRUE);"
}

# Usage
remove_duplicates "input_data.csv" "cleaned_data.csv"
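Before deduplicating, it can be useful to see which rows are duplicated and how often. A sketch in plain SQL, using the same input file name:

-- Show duplicated rows and how many times each occurs
SELECT *, count(*) AS occurrences
FROM read_csv_auto('input_data.csv')
GROUP BY ALL
HAVING count(*) > 1;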
Query JSON Files Using SQL in Python (Python)
DuckDB supports querying JSON files directly, enabling seamless analysis of semi-structured data. This script lets you apply SQL queries to JSON files within a Python environment, ideal for preprocessing or exploring JSON datasets.
Execute this Python
import duckdb

def query_json(file_path, query):
    """
    Query JSON data directly using DuckDB.

    Args:
        file_path (str): Path to the JSON file.
        query (str): SQL WHERE-clause predicate used to filter the JSON records.

    Returns:
        pandas.DataFrame: Query results as a Pandas DataFrame.
    """
    con = duckdb.connect()
    # Read the JSON file, apply the predicate, and fetch the results as a Pandas DataFrame.
    df = con.execute(f"SELECT * FROM read_json_auto('{file_path}') WHERE {query}").df()
    return df

# Example Usage
result = query_json("./json/query_20min.json", "scheduled = true")
print(result)
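The same query can also be run directly in the DuckDB CLI without the Python wrapper (a sketch using the file and filter from the example above):

-- Query the JSON file directly; read_json_auto infers the schema
SELECT *
FROM read_json_auto('./json/query_20min.json')
WHERE scheduled = true;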
read_dsv() -> Parse properly separated CSV files
I tend to prefer using the ASCII unit separator (\x1f) and group separator (\x1e) as column and line delimiters, respectively, in CSVs (which technically no longer makes them CSVs). The read_csv function doesn't seem to play nice with these, so here's my attempt at a workaround.
Macro definition (SQL)
-- For more info on DSVs (I'm not the author): https://matthodges.com/posts/2024-08-12-csv-bad-dsv-good/
CREATE OR REPLACE MACRO read_dsv(path_spec) AS TABLE (
    with _lines as (
        select
            filename
            ,regexp_split_to_array(content, '\x1e') as content
        from read_text(path_spec)
    )
    , _cleanup as (
        select
            filename
            ,regexp_split_to_array(content[1], '\x1f') as header
            ,generate_subscripts(content[2:], 1) as linenum
            ,unnest(
                (content[2:])
                    .list_filter(x -> trim(x) != '')
                    .list_transform(x -> x.regexp_split_to_array('\x1f'))
            ) as line
        from _lines
    )
    select
        filename
        ,linenum
        ,unnest(map_entries(map(header, line)), recursive := true) as kv
    from _cleanup
);
Usage (SQL)
-- You can use the same path specification as you would with read_text or read_csv; this includes globbing.
-- Including the pivot statement in the macro isn't possible, as you would then have to explicitly define
-- the column values (which defeats the purpose of this implementation).
pivot read_dsv('C:\Temp\csv\*.csv')
on key
using first(value)
group by filename, linenum
order by filename, linenum;
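If the pivoted output looks off, it can help to inspect the long key/value form that read_dsv produces before pivoting (a sketch with the same path specification):

-- one row per (file, line, column): useful for spotting ragged rows or header mismatches
select filename, linenum, key, value
from read_dsv('C:\Temp\csv\*.csv')
order by filename, linenum
limit 20;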