Compare commits
2 Commits
develop
...
bayesian_b
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0e24d542b3 | ||
|
|
79bd319366 |
@@ -137,6 +137,53 @@ BEGIN
|
|||||||
END;
|
END;
|
||||||
$$
|
$$
|
||||||
LANGUAGE plpgsql VOLATILE;
|
LANGUAGE plpgsql VOLATILE;
|
||||||
|
CREATE OR REPLACE FUNCTION
|
||||||
|
cdb_create_segment (
|
||||||
|
segment_name TEXT,
|
||||||
|
table_name TEXT,
|
||||||
|
column_name TEXT,
|
||||||
|
geoid_column TEXT DEFAULT 'geoid',
|
||||||
|
census_table TEXT DEFAULT 'block_groups'
|
||||||
|
)
|
||||||
|
RETURNS NUMERIC
|
||||||
|
AS $$
|
||||||
|
from crankshaft.segmentation import create_segemnt
|
||||||
|
# TODO: use named parameters or a dictionary
|
||||||
|
return create_segment('table')
|
||||||
|
$$ LANGUAGE plpythonu;
|
||||||
|
|
||||||
|
CREATE OR REPLACE FUNCTION
|
||||||
|
cdb_predict_segment (
|
||||||
|
segment_name TEXT,
|
||||||
|
geoid_column TEXT DEFAULT 'geoid',
|
||||||
|
census_table TEXT DEFAULT 'block_groups'
|
||||||
|
)
|
||||||
|
RETURNS TABLE(geoid TEXT, prediction NUMERIC)
|
||||||
|
AS $$
|
||||||
|
from crankshaft.segmentation import create_segemnt
|
||||||
|
# TODO: use named parameters or a dictionary
|
||||||
|
return create_segment('table')
|
||||||
|
$$ LANGUAGE plpythonu;
|
||||||
|
CREATE OR REPLACE FUNCTION
|
||||||
|
cdb_adaptive_histogram (
|
||||||
|
table_name TEXT,
|
||||||
|
column_name TEXT
|
||||||
|
)
|
||||||
|
RETURNS TABLE (bin_start numeric,bin_end numeric,value numeric)
|
||||||
|
|
||||||
|
AS $$
|
||||||
|
from crankshaft.bayesian_blocks import adaptive_histogram
|
||||||
|
return adaptive_histogram(table_name,column_name)
|
||||||
|
$$ LANGUAGE plpythonu;
|
||||||
|
|
||||||
|
CREATE OR REPLACE FUNCTION
|
||||||
|
cdb_simple_test (
|
||||||
|
)
|
||||||
|
RETURNS NUMERIC
|
||||||
|
|
||||||
|
AS $$
|
||||||
|
return 5
|
||||||
|
$$ LANGUAGE plpythonu;
|
||||||
-- Make sure by default there are no permissions for publicuser
|
-- Make sure by default there are no permissions for publicuser
|
||||||
-- NOTE: this happens at extension creation time, as part of an implicit transaction.
|
-- NOTE: this happens at extension creation time, as part of an implicit transaction.
|
||||||
-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;
|
-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;
|
||||||
|
|||||||
11
pg/sql/0.0.1/06_bayesian_blocks.sql
Normal file
11
pg/sql/0.0.1/06_bayesian_blocks.sql
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
CREATE OR REPLACE FUNCTION
|
||||||
|
cdb_adaptive_histogram (
|
||||||
|
table_name TEXT,
|
||||||
|
column_name TEXT
|
||||||
|
)
|
||||||
|
RETURNS TABLE (bin_start numeric,bin_end numeric,value numeric)
|
||||||
|
|
||||||
|
AS $$
|
||||||
|
from crankshaft.bayesian_blocks import adaptive_histogram
|
||||||
|
return adaptive_histogram(table_name,column_name)
|
||||||
|
$$ LANGUAGE plpythonu;
|
||||||
@@ -1,2 +1,3 @@
|
|||||||
import random_seeds
|
import random_seeds
|
||||||
import clustering
|
import clustering
|
||||||
|
import bayesian_blocks
|
||||||
|
|||||||
1
python/crankshaft/crankshaft/bayesian_blocks/__init__.py
Normal file
1
python/crankshaft/crankshaft/bayesian_blocks/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
from bayesian_blocks import *
|
||||||
@@ -0,0 +1,84 @@
|
|||||||
|
import plpy
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def adaptive_histogram(table_name,column_name):
|
||||||
|
data = plpy.execute("select {column_name} from {table_name}".format(**locals()))
|
||||||
|
|
||||||
|
data = [float(d['count']) for d in data]
|
||||||
|
plpy.notice(data)
|
||||||
|
vals, bins = np.histogram( data, bins=_bayesian_blocks(data))
|
||||||
|
return zip(vals,bins, bins[1:])
|
||||||
|
|
||||||
|
|
||||||
|
def _bayesian_blocks(t):
|
||||||
|
"""Bayesian Blocks Implementation
|
||||||
|
|
||||||
|
By Jake Vanderplas. License: BSD
|
||||||
|
Based on algorithm outlined in http://adsabs.harvard.edu/abs/2012arXiv1207.5578S
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
t : ndarray, length N
|
||||||
|
data to be histogrammed
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
bins : ndarray
|
||||||
|
array containing the (N+1) bin edges
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
This is an incomplete implementation: it may fail for some
|
||||||
|
datasets. Alternate fitness functions and prior forms can
|
||||||
|
be found in the paper listed above.
|
||||||
|
"""
|
||||||
|
# copy and sort the array
|
||||||
|
t = np.sort(t)
|
||||||
|
N = t.size
|
||||||
|
|
||||||
|
# create length-(N + 1) array of cell edges
|
||||||
|
edges = np.concatenate([t[:1],
|
||||||
|
0.5 * (t[1:] + t[:-1]),
|
||||||
|
t[-1:]])
|
||||||
|
block_length = t[-1] - edges
|
||||||
|
|
||||||
|
# arrays needed for the iteration
|
||||||
|
nn_vec = np.ones(N)
|
||||||
|
best = np.zeros(N, dtype=float)
|
||||||
|
last = np.zeros(N, dtype=int)
|
||||||
|
|
||||||
|
#-----------------------------------------------------------------
|
||||||
|
# Start with first data cell; add one cell at each iteration
|
||||||
|
#-----------------------------------------------------------------
|
||||||
|
for K in range(N):
|
||||||
|
# Compute the width and count of the final bin for all possible
|
||||||
|
# locations of the K^th changepoint
|
||||||
|
width = block_length[:K + 1] - block_length[K + 1]
|
||||||
|
count_vec = np.cumsum(nn_vec[:K + 1][::-1])[::-1]
|
||||||
|
|
||||||
|
# evaluate fitness function for these possibilities
|
||||||
|
fit_vec = count_vec * (np.log(count_vec) - np.log(width))
|
||||||
|
fit_vec -= 4 # 4 comes from the prior on the number of changepoints
|
||||||
|
fit_vec[1:] += best[:K]
|
||||||
|
|
||||||
|
# find the max of the fitness: this is the K^th changepoint
|
||||||
|
i_max = np.argmax(fit_vec)
|
||||||
|
last[K] = i_max
|
||||||
|
best[K] = fit_vec[i_max]
|
||||||
|
|
||||||
|
#-----------------------------------------------------------------
|
||||||
|
# Recover changepoints by iteratively peeling off the last block
|
||||||
|
#-----------------------------------------------------------------
|
||||||
|
change_points = np.zeros(N, dtype=int)
|
||||||
|
i_cp = N
|
||||||
|
ind = N
|
||||||
|
while True:
|
||||||
|
i_cp -= 1
|
||||||
|
change_points[i_cp] = ind
|
||||||
|
if ind == 0:
|
||||||
|
break
|
||||||
|
ind = last[ind - 1]
|
||||||
|
change_points = change_points[i_cp:]
|
||||||
|
|
||||||
|
return edges[change_points]
|
||||||
@@ -40,7 +40,7 @@ setup(
|
|||||||
|
|
||||||
# The choice of component versions is dictated by what's
|
# The choice of component versions is dictated by what's
|
||||||
# provisioned in the production servers.
|
# provisioned in the production servers.
|
||||||
install_requires=['pysal==1.11.0','numpy==1.6.1','scipy==0.17.0'],
|
install_requires=['pysal==1.11.0','numpy==1.10.1','scipy==0.17.0'],
|
||||||
|
|
||||||
requires=['pysal', 'numpy'],
|
requires=['pysal', 'numpy'],
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user