diff --git a/scripts-available/CDB_DistinctMeasure.sql b/scripts-available/CDB_DistinctMeasure.sql new file mode 100644 index 0000000..803ab69 --- /dev/null +++ b/scripts-available/CDB_DistinctMeasure.sql @@ -0,0 +1,46 @@ +-- +-- CDB_DistinctMeasure +-- calculates the fraction of rows in the 10 most common distinct categories +-- returns true if the number of rows in these 10 categories is >= 0.9 * total number of rows +-- +-- + +CREATE OR REPLACE FUNCTION CDB_DistinctMeasure ( in_array text[], threshold numeric DEFAULT 0.90 ) RETURNS boolean as $$ +DECLARE + element_count INT4; + maxval numeric; + passes boolean; +BEGIN + SELECT count(e) INTO element_count FROM ( SELECT unnest(in_array) e ) x; + + -- count number of occurrences per bin + -- calculate the normalized cumulative sum + -- return the max value: which corresponds nth entry + -- for n <= 10 depending on # of distinct values + EXECUTE 'WITH a As ( + SELECT + count(*) cnt, + e + FROM + (SELECT * FROM unnest($2) e ) x + GROUP BY e + ORDER BY cnt DESC + ), + b As ( + SELECT + sum(cnt) OVER (ORDER BY cnt DESC) / $1 As cumsum + FROM a + LIMIT 10 + ) + SELECT max(cumsum) maxval FROM b' + INTO maxval + USING element_count, in_array; + + passes = CASE WHEN maxval >= threshold + THEN TRUE + ELSE FALSE + END; + RAISE NOTICE 'maxval: %', maxval; + RETURN passes; +END; +$$ language plpgsql IMMUTABLE;