Query Parquet data in S3 (Bash)
Editor's note: DuckDB users often work with files in Parquet format, which has become a standard for representing data in data lakes. While DuckDB lets you work with local Parquet files, it can also query files stored in blob storage such as Amazon S3, Azure Blob Storage, and Google Cloud Storage.
Execute this Bash
# Assuming you have the following environment variables defined:
# AWS_ACCESS_KEY_ID
# AWS_SECRET_ACCESS_KEY
# AWS_DEFAULT_REGION
# (run INSTALL httpfs; once beforehand if the extension is not yet installed)
duckdb -c 'LOAD httpfs; SELECT count(*) FROM read_parquet("s3://<bucket>/<prefix>/*.parquet");'
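If you prefer to configure credentials inside DuckDB rather than via environment variables, a minimal sketch run in the duckdb shell looks like the following; the region and credential values are placeholders to replace with your own:

-- one-time install, then load the extension
INSTALL httpfs;
LOAD httpfs;
SET s3_region = 'us-east-1';            -- assumed region; use your bucket's region
SET s3_access_key_id = '<key id>';      -- placeholder credential
SET s3_secret_access_key = '<secret>';  -- placeholder credential
SELECT count(*) FROM read_parquet('s3://<bucket>/<prefix>/*.parquet');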
SUMMARIZE (SQL)
Editor's note: the SUMMARIZE command allows you to quickly understand your data. If you want to understand a little more about how it works under the hood, see Hamilton's other snippet on building your own SUMMARIZE capabilities using built-in analytics functions.
Execute this SQL
-- summary information about a given table or file
SUMMARIZE some_tbl;
SUMMARIZE FROM 'some_file.csv';
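As a taste of what SUMMARIZE automates, a minimal hand-rolled sketch of the same idea for a single column might look like this; some_tbl and col are stand-in names for your own table and column:

-- a few of the statistics SUMMARIZE computes per column
SELECT
    count(*)            AS count,
    min(col)            AS min,
    max(col)            AS max,
    avg(col)            AS mean,
    count(DISTINCT col) AS unique_values
FROM some_tbl;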
Compute a metric for each numeric column and return the values in a long table (requires 0.7.2+) (SQL)
Editor's note: if your data originates as mixed types or in a format like CSV, you might want to cast every column to a numeric type without risking an error for oddly-typed values. You can do so with TRY_CAST(), which attempts a CAST but returns NULL where the conversion is not possible.
Execute this SQL
with computed as (
    select sum(try_cast(columns(*) as double))
    from read_csv_auto('aapl.csv')
)
select
    -- restore original column names
    trim(list_element(regexp_extract_all(name, '\.(.*?) AS', 1), 1), '"') as name,
    value
from (pivot_longer computed on *)
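To see the TRY_CAST() behavior the note describes in isolation, a quick sketch (the literal values here are made up for illustration):

SELECT
    try_cast('3.14' AS DOUBLE) AS parses_fine,   -- 3.14
    try_cast('n/a'  AS DOUBLE) AS returns_null;  -- NULL instead of a cast error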
Convert CSV to Parquet and provide schema to use (Bash)
Editor's note: while there are other snippets showing file conversion, Parth's snippet shows you how to convert from CSV to Parquet files using DuckDB, with specification of the entire schema (columns) and the compression codec.
Execute this Bash
duckdb -c "COPY (SELECT * FROM read_csv('pageviews-sanitized-20230101-000000.csv', delim=' ', header=False, columns={'domain_code': 'VARCHAR', 'page_title': 'VARCHAR', 'count_views': 'UINTEGER', 'total_response_size': 'UINTEGER'})) TO 'pageviews-sanitized-20230101-000000.parquet' (FORMAT 'PARQUET', CODEC 'zstd')"
Combine several Parquet files into one and compress with zstd (Bash)
Editor's note: another great example of using DuckDB's broad data format support to merge multiple Parquet files into one. Parth also kindly shows you how to compress the resulting Parquet file with the zstd codec. DuckDB also supports the gzip and snappy compression codecs.
Execute this Bash
duckdb -c "COPY (SELECT * FROM '~/Binance_Spot_Data/*.parquet') TO 'binance.parquet' (FORMAT 'PARQUET', CODEC 'zstd')"
Boxplot quantiles all at once (SQL)
Editor's note: with a variety of supported statistical functions, including things like sampling and producing quantiles, DuckDB provides great analytical capabilities not always present in other databases.
Execute this SQL
select
    unnest([.1, .25, .5, .75, .9]) as quantile,
    unnest(quantile_cont(i, [.1, .25, .5, .75, .9])) as value
from range(101) t(i);

/* This returns:
| quantile | value |
|----------|-------|
| 0.10     | 10    |
| 0.25     | 25    |
| 0.50     | 50    |
| 0.75     | 75    |
| 0.90     | 90    |
*/
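The note also mentions sampling; as a quick sketch of that capability over the same range (DuckDB's USING SAMPLE clause, which for a fixed row count defaults to reservoir sampling):

-- draw a random sample of 10 rows
SELECT i FROM range(101) t(i) USING SAMPLE 10 ROWS;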
Convert epoch integer field to Timestamp (SQL)
Editor's note: date/time conversion can be painful in SQL and in many programming languages. Simon shows you how to handle it in DuckDB here, with timezone-aware values. There are also timestamp-without-timezone functions in the DuckDB docs.
Execute this SQL
select ('EPOCH'::TIMESTAMP + INTERVAL (epoc_field::INT) seconds)::TIMESTAMPTZ as epoc_tz
from (
    -- epoch column: number of seconds since 1 Jan 1970
    select 1682481422 as epoc_field
);
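DuckDB also ships a built-in shortcut for this particular conversion; a minimal sketch using the same example value:

-- to_timestamp() interprets a number as seconds since the Unix epoch
-- and returns a TIMESTAMP WITH TIME ZONE
select to_timestamp(1682481422) as epoc_tz;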
Show the DuckDB version (SQL)
Execute this SQL
CALL pragma_version();
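pragma_version() reports the library version and source id; the version string alone is also available via a scalar function:

-- equivalent one-liner returning just the version string
SELECT version();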