Query Parquet files stored on an S3-compatible storage provider

Execute this SQL

-- Using Scaleway as an example, where the first 75 GB is free
-- https://www.scaleway.com/en/object-storage/
-- This also works for Google Storage, Digital Ocean Spaces, 
-- Wasabi and so on
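
-- the httpfs extension is needed for s3:// URLs; recent DuckDB versions
-- autoload it, otherwise run INSTALL httpfs; LOAD httpfs; first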

-- set these like you do for AWS
SET s3_access_key_id='MY_KEY';
SET s3_secret_access_key='MY_SECRET';

-- set the endpoint and region to override the default of 
-- using AWS endpoints
SET s3_endpoint='s3.nl-ams.scw.cloud';
SET s3_region='nl-ams';

SELECT
  -- format a large number with helpful commas 
  -- for readability
  format('{:,}', count(*)) as "total rows in file" 
FROM "s3://<BUCKET_NAME>/<FILE_NAME>.zst.parquet"

Chris Adams


Compute the length of a GPX track in meters

Execute this SQL

LOAD spatial;

-- Pick, for example, any Garmin Connect or Strava export as a GPX file
-- https://en.wikipedia.org/wiki/GPS_Exchange_Format

SELECT round(
         -- Compute the length of the geometry in the units of the
         -- reference system being used
         ST_Length(
           -- Transform from standard GPS WGS-84 into a reference system 
           -- that uses meters. EPSG:25832 stands for ETRS89 / UTM zone 32N,
           -- Europe between 6°E and 12°E
           ST_transform(
             -- Fix the order of coordinates 
             -- (GPX is read as long/lat, we need lat/long)
             ST_FlipCoordinates(
               -- Turn the WKB into a DuckDB Geometry
               ST_GeomFromWKB(wkb_geometry)
             ),
             'EPSG:4326',
             'EPSG:25832'
           )
         ) / 1000, 2) AS "Distance (km)"
FROM st_read('activity_11501782487.gpx', layer='tracks');

Michael Simons


Compute the altitude gain and loss of a GPX track

Execute this SQL

-- This requires the spatial extension
LOAD spatial;

-- Pick, for example, any Garmin Connect or Strava export as a GPX file
-- https://en.wikipedia.org/wiki/GPS_Exchange_Format

WITH elevation_gains AS (
  SELECT ele - lag(ele) OVER (ORDER BY track_seg_point_id ASC) AS value
  FROM st_read('activity_11501782487.gpx', layer='track_points')
)
SELECT 
  sum(e.value) FILTER (WHERE e.value >= 0) AS ascent,
  sum(e.value) FILTER (WHERE e.value < 0) AS descent
FROM elevation_gains e;
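
Other per-point statistics come from the same track_points layer in the same way; a small sketch (the ele column is the one used above) for the lowest and highest points of the track:

SELECT
  min(ele) AS lowest_point,
  max(ele) AS highest_point
FROM st_read('activity_11501782487.gpx', layer='track_points');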

Michael Simons


No more errors on end-of-line commas

Editor's note: this query demonstrates default DuckDB behavior, but boy does it make it easier to comment out SQL lines without breaking the query.

Execute this SQL

-- 🤓 Courtesy of Michael Simons (aka @rotnroll666)
-- 🐦 https://twitter.com/rotnroll666/status/1671066790368010241

SELECT foo,
    bar,
    -- hello,
    world,
    -- dummy,
FROM bazbar;

SALES


Query CloudTrail logs in S3

Execute this SQL

-- Background: CloudTrail logs contain a significant amount of data about the different actions that occur in your AWS account, but they can be challenging to deal with for a couple of reasons:
--   1. Each file is a single JSON object with a list of records in a "Records" key
--   2. The schema varies wildly depending on the type of AWS service and actions being used.

-- CloudTrail logs typically go to: s3://<bucket>/AWSLogs/<accountid>/CloudTrail/<region>/<year>/<month>/<day>/<accountid-region-some-unique-id>.json.gz

-- To use this query, you need to load the httpfs extension and either set the following s3_ variables or ensure your standard AWS secret key environment variables are set

-- INSTALL httpfs;
-- LOAD httpfs;
-- SET s3_region='us-west-2';
-- SET s3_access_key_id='';
-- SET s3_secret_access_key='';

-- The following query returns the first 10 logs from today.
WITH parsed_logs AS (
    SELECT UNNEST(Records, recursive := true)
    FROM read_json_auto(
        's3://<bucket>/AWSLogs/<accountid>/CloudTrail/<region>/' || strftime(now(), '%Y/%m/%d') || '/*.json.gz',
        union_by_name=true
    )
)
SELECT * FROM parsed_logs LIMIT 10;
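
Once the records are flattened, ordinary aggregations work on top of them. A sketch of a follow-up (eventSource and eventName are standard CloudTrail record fields, but what your logs actually contain depends on your account's activity):

WITH parsed_logs AS (
    SELECT UNNEST(Records, recursive := true)
    FROM read_json_auto(
        's3://<bucket>/AWSLogs/<accountid>/CloudTrail/<region>/' || strftime(now(), '%Y/%m/%d') || '/*.json.gz',
        union_by_name=true
    )
)
SELECT eventSource, eventName, count(*) AS calls
FROM parsed_logs
GROUP BY ALL
ORDER BY calls DESC
LIMIT 10;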

Damon


Python Scalar UDF using Arrow

Execute this Python

# Creates a lambda that adds X+Y using pyarrow.compute
# This lambda will be called once for every 2048 rows
import duckdb
import pyarrow as pa
import pyarrow.compute as pc
from duckdb.typing import *

con = duckdb.connect()
con.create_function('plus_lambda', lambda x, y: pc.add(x, y), [BIGINT, BIGINT], BIGINT, type='arrow')
res = con.execute('select plus_lambda(i, 5) from range(11) tbl(i)').df()
print(res)
con.remove_function('plus_lambda')

paul-1


Read an MS Excel file with the spatial extension

Editor's note: It might seem a bit odd, but the DuckDB spatial extension includes a function for reading Microsoft Excel XLSX files into DuckDB. This is because a lot of geospatial files are shared this way, but you can take advantage of this capability even if you have no spatial data!

Execute this SQL

INSTALL spatial;
LOAD spatial;
FROM st_read('file.xlsx', layer='sheet_name');
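
To keep working with the spreadsheet as an ordinary table, the same call can feed a CREATE TABLE ... AS statement; a minimal sketch using the placeholder file and sheet names from above (the table name sheet_data is arbitrary):

CREATE TABLE sheet_data AS
    SELECT * FROM st_read('file.xlsx', layer='sheet_name');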

alexboks-1


Query S3 Access Logs

Editor's note: Want to read log files with DuckDB? You can use the read_csv function and custom date/time + regex parsing to do it. To make the data more useful, you can specifically CAST some of the values as numerical types. This snippet also shows CASE WHEN ELSE statements in action.

Execute this SQL


/* 
Background: If you have S3 Access Logging enabled on one of your S3 buckets, you'll have some useful information about requests to your bucket. Unfortunately, it's in a semistructured format that can be difficult to parse. This SQL query can help, both pulling out individual fields and coercing them to native data types.

Usage: search for the strings <bucket> and <prefix>, and insert the S3 bucket where your access logs are being delivered. Use (or delete) <prefix> to filter to a subset of your logs.

Also, you can either run the commented-out configuration settings below yourself in the REPL and then source this file using `.read parse_s3_access_logs.sql`, or uncomment them and supply your own values.
*/

-- INSTALL httpfs;
-- LOAD httpfs;
-- SET s3_region='us-west-2';
-- SET s3_access_key_id='';
-- SET s3_secret_access_key='';

WITH parsed_logs AS (
    SELECT
        regexp_extract(col1, '^([0-9a-zA-Z]+)\s+([a-z0-9.\-]+)\s+\[([0-9/A-Za-z: +]+)\] ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) ("[^"]*"|-) ([^ ]+) ([^ ]+) (\d+|-) (\d+|-) (\d+|-) (\d+|-) ("[^"]*"|-) ("[^"]*"|-) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+)(.*)$',
        ['bucket_owner', 'bucket', 'timestamp', 'remote_ip', 'request', 'request_id', 'operation', 's3_key', 'request_uri', 'http_status', 's3_errorcode', 'bytes_sent','object_size', 'total_time', 'turn_around_time', 'referer', 'user_agent', 'version_id', 'host_id', 'sigver', 'cyphersuite', 'auth_type', 'host_header', 'tls_version', 'access_point_arn', 'acl_required', 'extra']
    ) AS log_struct
    FROM
        -- Trick the CSV reader into reading as a single column
        read_csv(
            's3://<bucket>/<prefix>/*',
            columns={'col1': 'VARCHAR'},
            -- Use a *hopefully* nonsensical delimiter, so no ',' chars screw us up
            delim='\0'
        )
)
SELECT
        -- Grab everything from the struct that we want as strings; exclude the fields we'll coerce to different types
        log_struct.* exclude (timestamp, bytes_sent, object_size, total_time, turn_around_time),
        strptime(log_struct.timestamp, '%d/%b/%Y:%H:%M:%S %z') AS timestamp,
        CASE
                WHEN log_struct.bytes_sent = '-' THEN NULL
                ELSE CAST(log_struct.bytes_sent AS INTEGER)
        END AS bytes_sent,
        CASE
                WHEN log_struct.object_size = '-' THEN NULL
                ELSE CAST(log_struct.object_size AS INTEGER)
        END AS object_size,
        CASE
                WHEN log_struct.total_time = '-' THEN NULL
                ELSE CAST(log_struct.total_time AS INTEGER)
        END AS total_time,
        CASE
                WHEN log_struct.turn_around_time = '-' THEN NULL
                ELSE CAST(log_struct.turn_around_time AS INTEGER)
        END AS turn_around_time
FROM parsed_logs;
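
Once the parsing is in place, follow-up questions become plain aggregations. A sketch, assuming you wrap the query above in a view (the view name s3_access_logs is arbitrary):

-- assumes: CREATE VIEW s3_access_logs AS <the query above>
SELECT s3_key, count(*) AS requests, sum(bytes_sent) AS total_bytes
FROM s3_access_logs
GROUP BY s3_key
ORDER BY total_bytes DESC
LIMIT 20;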

Mark Roddy
