diff --git a/scripts-available/CDB_DistinctMeasure.sql b/scripts-available/CDB_DistinctMeasure.sql new file mode 100644 index 0000000..63ec647 --- /dev/null +++ b/scripts-available/CDB_DistinctMeasure.sql @@ -0,0 +1,46 @@ +-- +-- CDB_DistinctMeasure +-- calculates the fraction of rows in the 10 most common distinct categories +-- returns true if the number of rows in these 10 categories is >= 0.9 * total number of rows +-- +-- + +CREATE OR REPLACE FUNCTION CDB_DistinctMeasure ( in_array text[], threshold numeric DEFAULT null ) RETURNS numeric as $$ +DECLARE + element_count INT4; + maxval numeric; + passes numeric; +BEGIN + SELECT count(e) INTO element_count FROM ( SELECT unnest(in_array) e ) x; + + -- count number of occurrences per bin + -- calculate the normalized cumulative sum + -- return the max value: which corresponds nth entry + -- for n <= 10 depending on # of distinct values + EXECUTE 'WITH a As ( + SELECT + count(*) cnt + FROM + (SELECT * FROM unnest($2) e ) x + WHERE e is not null + GROUP BY e + ORDER BY cnt DESC + ), + b As ( + SELECT + sum(cnt) OVER (ORDER BY cnt DESC) / $1 As cumsum + FROM a + LIMIT 10 + ) + SELECT max(cumsum) maxval FROM b' + INTO maxval + USING element_count, in_array; + IF threshold is null THEN + passes = maxval; + ELSE + passes = CASE WHEN (maxval >= threshold) THEN 1 ELSE 0 END; + END IF; + + RETURN passes; +END; +$$ language plpgsql IMMUTABLE; diff --git a/scripts-enabled/CDB_DistinctMeasure.sql b/scripts-enabled/CDB_DistinctMeasure.sql new file mode 120000 index 0000000..e78b435 --- /dev/null +++ b/scripts-enabled/CDB_DistinctMeasure.sql @@ -0,0 +1 @@ +../scripts-available/CDB_DistinctMeasure.sql \ No newline at end of file diff --git a/test/CDB_DistinctMeasureTest.sql b/test/CDB_DistinctMeasureTest.sql new file mode 100644 index 0000000..1eb3a74 --- /dev/null +++ b/test/CDB_DistinctMeasureTest.sql @@ -0,0 +1,20 @@ +-- a - j add up to 89%, k-m add up to 11% +WITH a As ( + SELECT ( + repeat('a',12) || + repeat('b',11) || + repeat('c',11) || + repeat('d',10) || + repeat('e',10) || + repeat('f',9) || + repeat('g',8) || + repeat('h',7) || + repeat('i',6) || + repeat('j',5) || + repeat('k',4) || + repeat('l',4) || + repeat('m',3) + )::text AS x + ) + +SELECT CDB_DistinctMeasure(string_to_array(x,null),0.90) from a diff --git a/test/CDB_DistinctMeasureTest_expect b/test/CDB_DistinctMeasureTest_expect new file mode 100644 index 0000000..573541a --- /dev/null +++ b/test/CDB_DistinctMeasureTest_expect @@ -0,0 +1 @@ +0