Back to Code Snippets

Éric Mauvière

@eric.mauviere


Kernel Density Estimation - Epanechnikov KernelSQL

KDE estimates the probability distribution of a random variable. The bandwidth parameter controls the width of the kernel, influencing how smooth or detailed the estimated density curve is. A smaller bandwidth results in a more detailed estimation, while a larger bandwidth produces a smoother curve.

Execute this SQL

CREATE OR REPLACE MACRO KDE_EPANECH(data, varnum, bandwidth, bin_count := 30) AS TABLE 
WITH hist AS (
	FROM histogram_values(data, varnum, bin_count := bin_count)
)
SELECT hist.bin, k.kernel_value
FROM hist, LATERAL (
    SELECT 100 * AVG(
    IF(abs(hist.bin - varnum) / bandwidth  < 1,
    0.75 * (1 - POW(abs(hist.bin - varnum) / bandwidth, 2)) / bandwidth,
    0)) AS kernel_value
	FROM query_table(data)
) k
ORDER BY hist.bin ;

-- Following David Scott's rule, here is an estimate for bandwidth:
CREATE OR REPLACE MACRO KDE_BANDWIDTH(data, varnum) AS (
	FROM query_table(data)
	SELECT 1.06 * stddev(varnum) * pow(count(*), -1/5)
);

-- Usage
SET VARIABLE bandwidth = (SELECT KDE_BANDWIDTH(mydata, myvarnum)) ;

FROM KDE_EPANECH(mydata, myvarnum, getvariable('bandwidth')) ;

-- Inspiration and illustration: https://observablehq.com/@d3/kernel-density-estimation

Copy code

Éric Mauvière

Expand

Share link


Discretize numeric column following specified thresholdsSQL

discretize() converts a numeric column into discrete ordered ids, taking into account a list of thresholds.

Execute this SQL

CREATE OR REPLACE MACRO discretize(v, l) AS (
	WITH t1 AS (
		SELECT unnest(list_distinct(l)) as j
	), t2 AS (
		SELECT COUNT(*) + 1 c FROM t1 
	  WHERE try_cast(j AS float) <= v
	) FROM t2 
	SELECT IF(v IS NULL, NULL, c) 
) ;

--Usage
FROM 'https://raw.githubusercontent.com/thewiremonkey/factbook.csv/master/data/c2127.csv'
SELECT name, value, discretize(value, [2,3,4,5]) AS class ;

Copy code

Éric Mauvière

Expand

Share link


KMeans on one dimensional data with recursive CTESQL

Compute kmeans thresholds from a table with 2 columns : id (unique) and numeric. Outputs a list. Easy to extend to 2 dimensions data (x,y).

Execute this SQL

CREATE FUNCTION kmeans(tname, idcol_name, numcol_name, bins:=5, maxiter:=100) AS (
WITH RECURSIVE clusters(iter, cid, x) AS ( 
	WITH t1 AS (FROM query_table(tname) SELECT idcol_name AS id, numcol_name AS x)
	(SELECT 0, id, x FROM t1 LIMIT bins-1) 
	UNION ALL 
	SELECT iter + 1, cid, avg(px) FROM ( 
		SELECT iter, cid, p.x as px, 
		rank() OVER (PARTITION BY p.id ORDER BY (p.x-c.x)^2, c.x^2) r
		FROM t1 p, clusters c
	) x 
	WHERE x.r = 1 and iter < maxiter 
	GROUP BY ALL
)
SELECT list(x) FROM 
(FROM clusters WHERE iter = maxiter ORDER BY x)
) ;

Copy code

Éric Mauvière

Expand

Share link