SQL with Pipes
Pipes in SQL via the psql extension created by Yannick Welsch.
install psql from community;
load psql;

from 'https://sampledata.sidequery.ai/earthquakes.parquet' |>
  limit 10000 |>
  where status = 'Reviewed' |>
  select data_type, avg(depth), avg(magnitudo) group by all;
Nico Ritschel
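For comparison, the pipe chain above can be written as plain DuckDB SQL. This is a rough sketch using the same dataset URL and columns; a subquery keeps the LIMIT applied before the filter, as in the piped version:

select data_type, avg(depth), avg(magnitudo)
from (
    from 'https://sampledata.sidequery.ai/earthquakes.parquet'
    limit 10000
)
where status = 'Reviewed'
group by all;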
-- This selects only specific fields from the CSV
-- It also renames one of the fields in the target schema
COPY (
    SELECT col1, col2, col3 AS new_col_name
    FROM read_csv('~/data/source.csv', AUTO_DETECT=TRUE)
) TO '~/data/target.parquet' (FORMAT 'PARQUET', CODEC 'ZSTD');
Robin Moffatt
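To sanity-check the export, the new Parquet file can be read straight back. A small sketch, reusing the hypothetical paths and column names from the snippet above:

-- confirm the schema contains col1, col2 and new_col_name
DESCRIBE SELECT * FROM '~/data/target.parquet';

-- peek at a few rows
SELECT * FROM '~/data/target.parquet' LIMIT 5;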
LOAD spatial;

-- Pick for example any Garmin Connect or Strava export as GPX file
-- https://en.wikipedia.org/wiki/GPS_Exchange_Format
SELECT round(
    -- Compute the length of the geometry in the units of the
    -- reference system being used
    ST_Length(
        -- Transform from standard GPS WGS-84 into a reference system
        -- that uses meters. EPSG:25832 stands for ETRS89 / UTM zone 32N,
        -- Europe between 6°E and 12°E
        ST_Transform(
            -- Fix the order of coordinates
            -- (GPX is read as long/lat, we need lat/long)
            ST_FlipCoordinates(
                -- Turn the WKB into a DuckDB Geometry
                ST_GeomFromWKB(wkb_geometry)
            ),
            'EPSG:4326', 'EPSG:25832'
        )
    ) / 1000, 2) AS 'Distance (km)'
FROM st_read('activity_11501782487.gpx', layer='tracks');
Michael Simons
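LOAD spatial assumes the extension is already installed; if it is not, install it once first:

INSTALL spatial;
LOAD spatial;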
-- listing files
FROM glob('dataset/*');

-- reading from files
FROM 'dataset/*.parquet' LIMIT 100;

-- reading parquet files metadata
-- (min_value, max_value, null_count for each field in each file)
FROM parquet_metadata('dataset/*.parquet');

-- convert files or export tables to parquet
COPY (FROM tbl) TO 'file.parquet' (FORMAT 'parquet');
COPY 'data.csv' TO 'data.parquet' (FORMAT 'parquet');
Octavian Zarzu
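COPY can also write a Hive-partitioned directory of Parquet files instead of a single file. A minimal sketch, assuming a table tbl with a year column to partition on (both names are placeholders):

COPY (FROM tbl)
TO 'dataset' (FORMAT 'parquet', PARTITION_BY (year));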
-- Here we make use of list_transform and
-- the fact that DuckDB knows lambda expressions
SELECT list_transform(split('01:08:22', ':'), x -> CAST(x AS INTEGER));
Michael Simons
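Building on the same idea, the resulting list can be folded into a total number of seconds. A sketch (DuckDB lists are 1-indexed, so l[1] holds the hours):

select l[1] * 3600 + l[2] * 60 + l[3] as total_seconds  -- 4102 for '01:08:22'
from (
    select list_transform(split('01:08:22', ':'), x -> cast(x as integer)) as l
);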
with bits as (
    select
        -- add 8 bits to the end to account for the delimiter
        bit_length(columns(*)) + 8
    from <TABLE>
),
-- aggregate all columns
bits_agg as (
    select sum(columns(*)) from bits
),
-- unpivot a wide single row of aggs to single column
bits_col as (
    unpivot bits_agg on columns(*)
)
-- add them all up & convert to mb
select sum(value) / (8 * 1024 ** 2) as mb from bits_col
Hamilton Ulmer
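To try the estimate end to end, substitute any table name for <TABLE>. A sketch using a small generated table called demo (a hypothetical name):

-- hypothetical table with 1,000 rows
create table demo as
select range as id, 'row_' || range::varchar as label
from range(1000);
-- then replace <TABLE> with demo in the snippet above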
-- Using Scaleway as an example, where first 75GB is free
-- https://www.scaleway.com/en/object-storage/
-- This also works for Google Storage, Digital Ocean Spaces,
-- Wasabi and so on

-- set these like you do for AWS
SET s3_access_key_id='MY_KEY';
SET s3_secret_access_key='MY_SECRET';

-- set the endpoint and region to override the default of
-- using AWS endpoints
SET s3_endpoint='s3.nl-ams.scw.cloud';
SET s3_region='nl-ams';

SELECT
    -- format a large number with helpful commas
    -- for readability
    format('{:,}', count(*)) AS "total rows in file"
FROM "s3://<BUCKET_NAME>/<FILE_NAME>.zst.parquet";
Chris Adams
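Reading from S3-compatible storage relies on the httpfs extension; if it is not already available, install and load it before the SET statements above:

INSTALL httpfs;
LOAD httpfs;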
Convert EPOC integer field to Timestamp
Editor's note: date/time conversion can be painful in SQL, as in many programming languages. Simon shows how to handle it in DuckDB here, producing a timezone-aware timestamp. The DuckDB docs also cover functions for timestamps without time zones.
select ('EPOCH'::TIMESTAMP + INTERVAL (epoc_field::INT) seconds)::TIMESTAMPTZ as epoc_tz
from (
    -- epoc column - number of seconds since 1 Jan 1970
    select 1682481422 as epoc_field
);
Simon Aubury
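For plain epoch seconds, DuckDB also offers to_timestamp(), which returns a TIMESTAMP WITH TIME ZONE directly; a shorter equivalent of the expression above:

select to_timestamp(1682481422) as epoc_tz;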