renaming to fix typo

bayesian_blocks function
2016-03-07 12:55:25 -05:00 · 2016-03-07 11:49:57 -05:00
6 changed files with 145 additions and 1 deletions
--- a/pg/crankshaft--0.0.1.sql
+++ b/pg/crankshaft--0.0.1.sql
@@ -137,6 +137,53 @@ BEGIN
 END;
 $$
 LANGUAGE plpgsql VOLATILE;
 -- cdb_create_segment: PL/Python wrapper around
 -- crankshaft.segmentation.create_segment.
 -- The SQL arguments are accepted but not forwarded yet: the body currently
 -- calls create_segment with the literal 'table' (see the TODO below).
 CREATE OR REPLACE FUNCTION
  cdb_create_segment (
      segment_name TEXT,
      table_name TEXT,
      column_name TEXT,
      geoid_column TEXT DEFAULT 'geoid',
      census_table TEXT DEFAULT 'block_groups'
  )
 RETURNS NUMERIC
 AS $$
  # The imported name must be the same name that is called below; the
  # previous spelling ("create_segemnt") left create_segment undefined.
  # NOTE(review): confirm crankshaft.segmentation exports create_segment.
  from crankshaft.segmentation import create_segment
  # TODO: use named parameters or a dictionary
  return create_segment('table')
 $$ LANGUAGE plpythonu;
 -- cdb_predict_segment: PL/Python wrapper declared to return one
 -- (geoid, prediction) row per result.
 -- NOTE(review): the body still calls create_segment('table') and ignores
 -- its SQL arguments; this looks like a placeholder - confirm which
 -- crankshaft.segmentation function should back this wrapper.
 CREATE OR REPLACE FUNCTION
  cdb_predict_segment (
      segment_name TEXT,
      geoid_column TEXT DEFAULT 'geoid',
      census_table TEXT DEFAULT 'block_groups'
  )
 RETURNS TABLE(geoid TEXT, prediction NUMERIC)
 AS $$
  # The imported name must be the same name that is called below; the
  # previous spelling ("create_segemnt") left create_segment undefined.
  # NOTE(review): confirm crankshaft.segmentation exports create_segment.
  from crankshaft.segmentation import create_segment
  # TODO: use named parameters or a dictionary
  return create_segment('table')
 $$ LANGUAGE plpythonu;
 -- cdb_adaptive_histogram: PL/Python wrapper that hands table_name and
 -- column_name to crankshaft.bayesian_blocks.adaptive_histogram and returns
 -- its result as (bin_start, bin_end, value) rows.
 -- NOTE(review): PL/Python maps returned tuples to these columns by
 -- position; confirm adaptive_histogram yields them in this order.
 CREATE OR REPLACE FUNCTION cdb_adaptive_histogram(
     table_name  TEXT,
     column_name TEXT
 )
 RETURNS TABLE (bin_start NUMERIC, bin_end NUMERIC, value NUMERIC)
 AS $$
  import crankshaft.bayesian_blocks as bb
  return bb.adaptive_histogram(table_name, column_name)
 $$ LANGUAGE plpythonu;
 -- cdb_simple_test: takes no arguments and returns the constant 5 from a
 -- PL/Python body.
 CREATE OR REPLACE FUNCTION cdb_simple_test()
 RETURNS NUMERIC
 AS $$
  return 5
 $$ LANGUAGE plpythonu;
 -- Make sure by default there are no permissions for publicuser
 -- NOTE: this happens at extension creation time, as part of an implicit transaction.
 -- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;
--- a/pg/sql/0.0.1/06_bayesian_blocks.sql
+++ b/pg/sql/0.0.1/06_bayesian_blocks.sql
@@ -0,0 +1,11 @@
 -- cdb_adaptive_histogram: PL/Python wrapper that hands table_name and
 -- column_name to crankshaft.bayesian_blocks.adaptive_histogram and returns
 -- its result as (bin_start, bin_end, value) rows.
 -- NOTE(review): PL/Python maps returned tuples to these columns by
 -- position; confirm adaptive_histogram yields them in this order.
 CREATE OR REPLACE FUNCTION cdb_adaptive_histogram(
     table_name  TEXT,
     column_name TEXT
 )
 RETURNS TABLE (bin_start NUMERIC, bin_end NUMERIC, value NUMERIC)
 AS $$
  import crankshaft.bayesian_blocks as bb
  return bb.adaptive_histogram(table_name, column_name)
 $$ LANGUAGE plpythonu;
--- a/python/crankshaft/crankshaft/init.py
+++ b/python/crankshaft/crankshaft/init.py
@@ -1,2 +1,3 @@
 import random_seeds
 import clustering
 import bayesian_blocks
--- a/python/crankshaft/crankshaft/bayesian_blocks/init.py
+++ b/python/crankshaft/crankshaft/bayesian_blocks/init.py
@@ -0,0 +1 @@
 from bayesian_blocks import *
--- a/python/crankshaft/crankshaft/bayesian_blocks/bayesian_blocks.py
+++ b/python/crankshaft/crankshaft/bayesian_blocks/bayesian_blocks.py
@@ -0,0 +1,84 @@
 import plpy
 import numpy as np
 def adaptive_histogram(table_name, column_name):
     """Histogram one numeric column of a table using Bayesian Blocks bins.

     Parameters
     ----------
     table_name : str
         Table (or any FROM-clause expression) to read from.
     column_name : str
         Column (or expression) whose values are histogrammed.

     Returns
     -------
     list of (bin_start, bin_end, value) tuples
         One tuple per bin, where ``value`` is the number of rows falling in
         ``[bin_start, bin_end)`` (the last bin is closed on both sides, as
         in ``numpy.histogram``).  The tuple order matches the column order
         declared by the ``cdb_adaptive_histogram`` SQL wrapper, which maps
         returned tuples positionally.  Empty when the column has no
         non-NULL values.
     """
     # NOTE(review): both identifiers are interpolated into the SQL text
     # unescaped, so callers must not pass untrusted input here.  Quoting
     # them (e.g. plpy.quote_ident) would break schema-qualified table names
     # and expressions, so this is flagged rather than changed.
     #
     # The value is aliased so the row key is fixed regardless of how the
     # column is named or case-folded by PostgreSQL.
     query = "select {column_name} as bb_value from {table_name}".format(
         column_name=column_name, table_name=table_name)
     rows = plpy.execute(query)

     # NULLs carry no position on the axis; skip them instead of crashing
     # on float(None).
     data = [float(row['bb_value']) for row in rows
             if row['bb_value'] is not None]
     if not data:
         return []

     counts, edges = np.histogram(data, bins=_bayesian_blocks(data))

     # Convert numpy scalars to plain Python numbers before handing the rows
     # back to PL/Python.
     return [(float(start), float(end), int(count))
             for start, end, count in zip(edges, edges[1:], counts)]
 def _bayesian_blocks(t):
     """Bayesian Blocks Implementation

     By Jake Vanderplas.  License: BSD
     Based on algorithm outlined in
     http://adsabs.harvard.edu/abs/2012arXiv1207.5578S

     Parameters
     ----------
     t : array-like, length N
         data to be histogrammed

     Returns
     -------
     bins : ndarray
         Strictly increasing edges of the optimal blocks.  The first edge is
         the minimum of the data and the last edge is the maximum; at most
         (M + 1) edges are returned, where M is the number of distinct
         values.  An empty array is returned for empty input, and
         ``[v, v]`` when every value equals ``v``.

     Notes
     -----
     Repeated values are merged into one cell weighted by its multiplicity,
     so no cell has zero width.  Alternate fitness functions and prior forms
     can be found in the paper listed above.
     """
     t = np.asarray(t, dtype=float).ravel()
     if t.size == 0:
         return np.array([], dtype=float)

     # Sort and collapse duplicates; nn_vec[i] is how many input values fall
     # in cell i.  For duplicate-free data this is all ones.
     t, inverse = np.unique(t, return_inverse=True)
     nn_vec = np.bincount(inverse).astype(float)
     N = t.size

     # create length-(N + 1) array of cell edges
     edges = np.concatenate([t[:1],
                             0.5 * (t[1:] + t[:-1]),
                             t[-1:]])
     if N == 1:
         # Only one (zero-width) block is possible; skip the iteration so
         # log(0) is never evaluated.
         return edges

     block_length = t[-1] - edges

     # arrays needed for the iteration
     best = np.zeros(N, dtype=float)
     last = np.zeros(N, dtype=int)

     #-----------------------------------------------------------------
     # Start with first data cell; add one cell at each iteration
     #-----------------------------------------------------------------
     for K in range(N):
         # Compute the width and count of the final bin for all possible
         # locations of the K^th changepoint
         width = block_length[:K + 1] - block_length[K + 1]
         count_vec = np.cumsum(nn_vec[:K + 1][::-1])[::-1]

         # evaluate fitness function for these possibilities
         fit_vec = count_vec * (np.log(count_vec) - np.log(width))
         fit_vec -= 4  # 4 comes from the prior on the number of changepoints
         fit_vec[1:] += best[:K]

         # find the max of the fitness: this is the K^th changepoint
         i_max = np.argmax(fit_vec)
         last[K] = i_max
         best[K] = fit_vec[i_max]

     #-----------------------------------------------------------------
     # Recover changepoints by iteratively peeling off the last block
     #-----------------------------------------------------------------
     # Up to N + 1 change points exist (every cell its own block), so the
     # buffer needs N + 1 slots; with only N the write index could reach -1
     # and silently wrap around.
     change_points = np.zeros(N + 1, dtype=int)
     i_cp = N + 1
     ind = N
     while True:
         i_cp -= 1
         change_points[i_cp] = ind
         if ind == 0:
             break
         ind = last[ind - 1]
     change_points = change_points[i_cp:]

     return edges[change_points]
--- a/python/crankshaft/setup.py
+++ b/python/crankshaft/setup.py
@@ -40,7 +40,7 @@ setup(
    # The choice of component versions is dictated by what's
    # provisioned in the production servers.
-    install_requires=['pysal==1.11.0','numpy==1.6.1','scipy==0.17.0'],
+    install_requires=['pysal==1.11.0','numpy==1.10.1','scipy==0.17.0'],
    requires=['pysal', 'numpy'],
Author	SHA1	Message	Date
Stuart Lynn	0e24d542b3	renaming to fix typo	2016-03-07 12:55:25 -05:00
Stuart Lynn	79bd319366	bayesian_blocks function	2016-03-07 11:49:57 -05:00