Back to Code Snippets

Éric Mauvière

@eric.mauviere


Discretize numeric column following specified thresholds

discretize() converts a numeric column into discrete ordered ids, taking into account a list of thresholds.

Execute this SQL

CREATE OR REPLACE MACRO discretize(v, l) AS (
	WITH t1 AS (
		SELECT unnest(list_distinct(l)) as j
	), t2 AS (
		SELECT COUNT(*) + 1 c FROM t1 
	  WHERE try_cast(j AS float) <= v
	) FROM t2 
	SELECT IF(v IS NULL, NULL, c) 
) ;

--Usage
FROM 'https://raw.githubusercontent.com/thewiremonkey/factbook.csv/master/data/c2127.csv'
SELECT name, value, discretize(value, [2,3,4,5]) AS class ;

Copy code

Éric Mauvière

Expand

Share link


KMeans on one dimensional data with recursive CTE

Compute kmeans thresholds from a table with 2 columns : id (unique) and numeric. Outputs a list. Easy to extend to 2 dimensions data (x,y).

Execute this SQL

CREATE FUNCTION kmeans(tname, idcol_name, numcol_name, bins:=5, maxiter:=100) AS (
WITH RECURSIVE clusters(iter, cid, x) AS ( 
	WITH t1 AS (FROM query_table(tname) SELECT idcol_name AS id, numcol_name AS x)
	(SELECT 0, id, x FROM t1 LIMIT bins-1) 
	UNION ALL 
	SELECT iter + 1, cid, avg(px) FROM ( 
		SELECT iter, cid, p.x as px, 
		rank() OVER (PARTITION BY p.id ORDER BY (p.x-c.x)^2, c.x^2) r
		FROM t1 p, clusters c
	) x 
	WHERE x.r = 1 and iter < maxiter 
	GROUP BY ALL
)
SELECT list(x) FROM 
(FROM clusters WHERE iter = maxiter ORDER BY x)
) ;

Copy code

Éric Mauvière

Expand

Share link