From 79bd3193666893ceb449579220015ac8e14938e7 Mon Sep 17 00:00:00 2001 From: Stuart Lynn Date: Mon, 7 Mar 2016 11:49:57 -0500 Subject: [PATCH] bayesian_blocks function --- pg/crankshaft--0.0.1.sql | 47 +++++++++++ pg/sql/0.0.1/06_basyian_bins.sql | 20 +++++ python/crankshaft/crankshaft/__init__.py | 1 + .../crankshaft/bayesian_blocks/__init__.py | 1 + .../bayesian_blocks/bayesian_blocks.py | 84 +++++++++++++++++++ python/crankshaft/setup.py | 2 +- 6 files changed, 154 insertions(+), 1 deletion(-) create mode 100644 pg/sql/0.0.1/06_basyian_bins.sql create mode 100644 python/crankshaft/crankshaft/bayesian_blocks/__init__.py create mode 100644 python/crankshaft/crankshaft/bayesian_blocks/bayesian_blocks.py diff --git a/pg/crankshaft--0.0.1.sql b/pg/crankshaft--0.0.1.sql index 436beea..c29f3f4 100644 --- a/pg/crankshaft--0.0.1.sql +++ b/pg/crankshaft--0.0.1.sql @@ -137,6 +137,53 @@ BEGIN END; $$ LANGUAGE plpgsql VOLATILE; +CREATE OR REPLACE FUNCTION + cdb_create_segment ( + segment_name TEXT, + table_name TEXT, + column_name TEXT, + geoid_column TEXT DEFAULT 'geoid', + census_table TEXT DEFAULT 'block_groups' + ) +RETURNS NUMERIC +AS $$ + from crankshaft.segmentation import create_segemnt + # TODO: use named parameters or a dictionary + return create_segment('table') +$$ LANGUAGE plpythonu; + +CREATE OR REPLACE FUNCTION + cdb_predict_segment ( + segment_name TEXT, + geoid_column TEXT DEFAULT 'geoid', + census_table TEXT DEFAULT 'block_groups' + ) +RETURNS TABLE(geoid TEXT, prediction NUMERIC) +AS $$ + from crankshaft.segmentation import create_segemnt + # TODO: use named parameters or a dictionary + return create_segment('table') +$$ LANGUAGE plpythonu; +CREATE OR REPLACE FUNCTION + cdb_adaptive_histogram ( + table_name TEXT, + column_name TEXT + ) +RETURNS TABLE (bin_start numeric,bin_end numeric,value numeric) + +AS $$ + from crankshaft.bayesian_blocks import adaptive_histogram + return adaptive_histogram(table_name,column_name) +$$ LANGUAGE plpythonu; + +CREATE OR REPLACE FUNCTION + cdb_simple_test ( + ) +RETURNS NUMERIC + +AS $$ + return 5 +$$ LANGUAGE plpythonu; -- Make sure by default there are no permissions for publicuser -- NOTE: this happens at extension creation time, as part of an implicit transaction. -- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE; diff --git a/pg/sql/0.0.1/06_basyian_bins.sql b/pg/sql/0.0.1/06_basyian_bins.sql new file mode 100644 index 0000000..9ffbfdc --- /dev/null +++ b/pg/sql/0.0.1/06_basyian_bins.sql @@ -0,0 +1,20 @@ +CREATE OR REPLACE FUNCTION + cdb_adaptive_histogram ( + table_name TEXT, + column_name TEXT + ) +RETURNS TABLE (bin_start numeric,bin_end numeric,value numeric) + +AS $$ + from crankshaft.bayesian_blocks import adaptive_histogram + return adaptive_histogram(table_name,column_name) +$$ LANGUAGE plpythonu; + +CREATE OR REPLACE FUNCTION + cdb_simple_test ( + ) +RETURNS NUMERIC + +AS $$ + return 5 +$$ LANGUAGE plpythonu; diff --git a/python/crankshaft/crankshaft/__init__.py b/python/crankshaft/crankshaft/__init__.py index d07e330..e492236 100644 --- a/python/crankshaft/crankshaft/__init__.py +++ b/python/crankshaft/crankshaft/__init__.py @@ -1,2 +1,3 @@ import random_seeds import clustering +import bayesian_blocks diff --git a/python/crankshaft/crankshaft/bayesian_blocks/__init__.py b/python/crankshaft/crankshaft/bayesian_blocks/__init__.py new file mode 100644 index 0000000..ed29f0e --- /dev/null +++ b/python/crankshaft/crankshaft/bayesian_blocks/__init__.py @@ -0,0 +1 @@ +from bayesian_blocks import * diff --git a/python/crankshaft/crankshaft/bayesian_blocks/bayesian_blocks.py b/python/crankshaft/crankshaft/bayesian_blocks/bayesian_blocks.py new file mode 100644 index 0000000..a11da0b --- /dev/null +++ b/python/crankshaft/crankshaft/bayesian_blocks/bayesian_blocks.py @@ -0,0 +1,84 @@ +import plpy +import numpy as np + + +def adaptive_histogram(table_name,column_name): + data = plpy.execute("select {column_name} from {table_name}".format(**locals())) + + data = [float(d['count']) for d in data] + plpy.notice(data) + vals, bins = np.histogram( data, bins=_bayesian_blocks(data)) + return zip(vals,bins, bins[1:]) + + +def _bayesian_blocks(t): + """Bayesian Blocks Implementation + + By Jake Vanderplas. License: BSD + Based on algorithm outlined in http://adsabs.harvard.edu/abs/2012arXiv1207.5578S + + Parameters + ---------- + t : ndarray, length N + data to be histogrammed + + Returns + ------- + bins : ndarray + array containing the (N+1) bin edges + + Notes + ----- + This is an incomplete implementation: it may fail for some + datasets. Alternate fitness functions and prior forms can + be found in the paper listed above. + """ + # copy and sort the array + t = np.sort(t) + N = t.size + + # create length-(N + 1) array of cell edges + edges = np.concatenate([t[:1], + 0.5 * (t[1:] + t[:-1]), + t[-1:]]) + block_length = t[-1] - edges + + # arrays needed for the iteration + nn_vec = np.ones(N) + best = np.zeros(N, dtype=float) + last = np.zeros(N, dtype=int) + + #----------------------------------------------------------------- + # Start with first data cell; add one cell at each iteration + #----------------------------------------------------------------- + for K in range(N): + # Compute the width and count of the final bin for all possible + # locations of the K^th changepoint + width = block_length[:K + 1] - block_length[K + 1] + count_vec = np.cumsum(nn_vec[:K + 1][::-1])[::-1] + + # evaluate fitness function for these possibilities + fit_vec = count_vec * (np.log(count_vec) - np.log(width)) + fit_vec -= 4 # 4 comes from the prior on the number of changepoints + fit_vec[1:] += best[:K] + + # find the max of the fitness: this is the K^th changepoint + i_max = np.argmax(fit_vec) + last[K] = i_max + best[K] = fit_vec[i_max] + + #----------------------------------------------------------------- + # Recover changepoints by iteratively peeling off the last block + #----------------------------------------------------------------- + change_points = np.zeros(N, dtype=int) + i_cp = N + ind = N + while True: + i_cp -= 1 + change_points[i_cp] = ind + if ind == 0: + break + ind = last[ind - 1] + change_points = change_points[i_cp:] + + return edges[change_points] diff --git a/python/crankshaft/setup.py b/python/crankshaft/setup.py index c0f8c50..bcd6a16 100644 --- a/python/crankshaft/setup.py +++ b/python/crankshaft/setup.py @@ -40,7 +40,7 @@ setup( # The choice of component versions is dictated by what's # provisioned in the production servers. - install_requires=['pysal==1.11.0','numpy==1.6.1','scipy==0.17.0'], + install_requires=['pysal==1.11.0','numpy==1.10.1','scipy==0.17.0'], requires=['pysal', 'numpy'],