From 79bd3193666893ceb449579220015ac8e14938e7 Mon Sep 17 00:00:00 2001
From: Stuart Lynn <stuart.lynn@gmail.com>
Date: Mon, 7 Mar 2016 11:49:57 -0500
Subject: [PATCH] bayesian_blocks function

---
 pg/crankshaft--0.0.1.sql                      | 47 +++++++++++
 pg/sql/0.0.1/06_basyian_bins.sql              | 20 +++++
 python/crankshaft/crankshaft/__init__.py      |  1 +
 .../crankshaft/bayesian_blocks/__init__.py    |  1 +
 .../bayesian_blocks/bayesian_blocks.py        | 84 +++++++++++++++++++
 python/crankshaft/setup.py                    |  2 +-
 6 files changed, 154 insertions(+), 1 deletion(-)
 create mode 100644 pg/sql/0.0.1/06_basyian_bins.sql
 create mode 100644 python/crankshaft/crankshaft/bayesian_blocks/__init__.py
 create mode 100644 python/crankshaft/crankshaft/bayesian_blocks/bayesian_blocks.py

diff --git a/pg/crankshaft--0.0.1.sql b/pg/crankshaft--0.0.1.sql
index 436beea..c29f3f4 100644
--- a/pg/crankshaft--0.0.1.sql
+++ b/pg/crankshaft--0.0.1.sql
@@ -137,6 +137,53 @@ BEGIN
 END;
 $$
 LANGUAGE plpgsql VOLATILE;
+CREATE OR REPLACE FUNCTION
+  cdb_create_segment (
+      segment_name TEXT,
+      table_name TEXT,
+  	  column_name TEXT,
+      geoid_column TEXT DEFAULT 'geoid',
+      census_table TEXT DEFAULT 'block_groups'
+  )
+RETURNS NUMERIC
+AS $$
+  from crankshaft.segmentation import create_segemnt
+  # TODO: use named parameters or a dictionary
+  return create_segment('table')
+$$ LANGUAGE plpythonu;
+
+CREATE OR REPLACE FUNCTION
+  cdb_predict_segment (
+      segment_name TEXT,
+      geoid_column TEXT DEFAULT 'geoid',
+      census_table TEXT DEFAULT 'block_groups'
+  )
+RETURNS TABLE(geoid TEXT, prediction NUMERIC)
+AS $$
+  from crankshaft.segmentation import create_segemnt
+  # TODO: use named parameters or a dictionary
+  return create_segment('table')
+$$ LANGUAGE plpythonu;
+CREATE OR REPLACE FUNCTION
+  cdb_adaptive_histogram (
+      table_name  TEXT,
+      column_name TEXT
+  )
+RETURNS TABLE (bin_start numeric,bin_end numeric,value numeric)
+
+AS $$
+  from crankshaft.bayesian_blocks import adaptive_histogram
+  return adaptive_histogram(table_name,column_name)
+$$ LANGUAGE plpythonu;
+
+CREATE OR REPLACE FUNCTION
+  cdb_simple_test (
+  )
+RETURNS NUMERIC
+
+AS $$
+  return 5
+$$ LANGUAGE plpythonu;
 -- Make sure by default there are no permissions for publicuser
 -- NOTE: this happens at extension creation time, as part of an implicit transaction.
 -- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;
diff --git a/pg/sql/0.0.1/06_basyian_bins.sql b/pg/sql/0.0.1/06_basyian_bins.sql
new file mode 100644
index 0000000..9ffbfdc
--- /dev/null
+++ b/pg/sql/0.0.1/06_basyian_bins.sql
@@ -0,0 +1,20 @@
+CREATE OR REPLACE FUNCTION
+  cdb_adaptive_histogram (
+      table_name  TEXT,
+      column_name TEXT
+  )
+RETURNS TABLE (bin_start numeric,bin_end numeric,value numeric)
+
+AS $$
+  from crankshaft.bayesian_blocks import adaptive_histogram
+  return adaptive_histogram(table_name,column_name)
+$$ LANGUAGE plpythonu;
+
+CREATE OR REPLACE FUNCTION
+  cdb_simple_test (
+  )
+RETURNS NUMERIC
+
+AS $$
+  return 5
+$$ LANGUAGE plpythonu;
diff --git a/python/crankshaft/crankshaft/__init__.py b/python/crankshaft/crankshaft/__init__.py
index d07e330..e492236 100644
--- a/python/crankshaft/crankshaft/__init__.py
+++ b/python/crankshaft/crankshaft/__init__.py
@@ -1,2 +1,3 @@
 import random_seeds
 import clustering
+import bayesian_blocks
diff --git a/python/crankshaft/crankshaft/bayesian_blocks/__init__.py b/python/crankshaft/crankshaft/bayesian_blocks/__init__.py
new file mode 100644
index 0000000..ed29f0e
--- /dev/null
+++ b/python/crankshaft/crankshaft/bayesian_blocks/__init__.py
@@ -0,0 +1 @@
+from bayesian_blocks import *
diff --git a/python/crankshaft/crankshaft/bayesian_blocks/bayesian_blocks.py b/python/crankshaft/crankshaft/bayesian_blocks/bayesian_blocks.py
new file mode 100644
index 0000000..a11da0b
--- /dev/null
+++ b/python/crankshaft/crankshaft/bayesian_blocks/bayesian_blocks.py
@@ -0,0 +1,84 @@
+import plpy
+import numpy as np
+
+
+def adaptive_histogram(table_name,column_name):
+    data = plpy.execute("select {column_name} from {table_name}".format(**locals()))
+
+    data = [float(d['count']) for d in data]
+    plpy.notice(data)
+    vals, bins = np.histogram( data, bins=_bayesian_blocks(data))
+    return zip(vals,bins, bins[1:])
+
+
+def _bayesian_blocks(t):
+    """Bayesian Blocks Implementation
+
+    By Jake Vanderplas.  License: BSD
+    Based on algorithm outlined in http://adsabs.harvard.edu/abs/2012arXiv1207.5578S
+
+    Parameters
+    ----------
+    t : ndarray, length N
+        data to be histogrammed
+
+    Returns
+    -------
+    bins : ndarray
+        array containing the (N+1) bin edges
+
+    Notes
+    -----
+    This is an incomplete implementation: it may fail for some
+    datasets.  Alternate fitness functions and prior forms can
+    be found in the paper listed above.
+    """
+    # copy and sort the array
+    t = np.sort(t)
+    N = t.size
+
+    # create length-(N + 1) array of cell edges
+    edges = np.concatenate([t[:1],
+                            0.5 * (t[1:] + t[:-1]),
+                            t[-1:]])
+    block_length = t[-1] - edges
+
+    # arrays needed for the iteration
+    nn_vec = np.ones(N)
+    best = np.zeros(N, dtype=float)
+    last = np.zeros(N, dtype=int)
+
+    #-----------------------------------------------------------------
+    # Start with first data cell; add one cell at each iteration
+    #-----------------------------------------------------------------
+    for K in range(N):
+        # Compute the width and count of the final bin for all possible
+        # locations of the K^th changepoint
+        width = block_length[:K + 1] - block_length[K + 1]
+        count_vec = np.cumsum(nn_vec[:K + 1][::-1])[::-1]
+
+        # evaluate fitness function for these possibilities
+        fit_vec = count_vec * (np.log(count_vec) - np.log(width))
+        fit_vec -= 4  # 4 comes from the prior on the number of changepoints
+        fit_vec[1:] += best[:K]
+
+        # find the max of the fitness: this is the K^th changepoint
+        i_max = np.argmax(fit_vec)
+        last[K] = i_max
+        best[K] = fit_vec[i_max]
+
+    #-----------------------------------------------------------------
+    # Recover changepoints by iteratively peeling off the last block
+    #-----------------------------------------------------------------
+    change_points =  np.zeros(N, dtype=int)
+    i_cp = N
+    ind = N
+    while True:
+        i_cp -= 1
+        change_points[i_cp] = ind
+        if ind == 0:
+            break
+        ind = last[ind - 1]
+    change_points = change_points[i_cp:]
+
+    return edges[change_points]
diff --git a/python/crankshaft/setup.py b/python/crankshaft/setup.py
index c0f8c50..bcd6a16 100644
--- a/python/crankshaft/setup.py
+++ b/python/crankshaft/setup.py
@@ -40,7 +40,7 @@ setup(
 
     # The choice of component versions is dictated by what's
     # provisioned in the production servers.
-    install_requires=['pysal==1.11.0','numpy==1.6.1','scipy==0.17.0'],
+    install_requires=['pysal==1.11.0','numpy==1.10.1','scipy==0.17.0'],
 
     requires=['pysal', 'numpy'],