renaming to fix typo

bayesian_blocks function
Merge pull request #5 from CartoDB/4-pgxs-fix
2016-03-07 12:55:25 -05:00 · 2016-03-07 11:49:57 -05:00 · 2016-02-29 16:35:04 +01:00 · 2016-02-26 19:09:17 +01:00 · 2016-02-26 19:02:18 +01:00
12 changed files with 149 additions and 109 deletions
--- a/pg/Makefile
+++ b/pg/Makefile
@@ -28,3 +28,6 @@ REGRESS_OPTS = --inputdir='$(TEST_DIR)' --outputdir='$(TEST_DIR)'
 PG_CONFIG = pg_config
 PGXS := $(shell $(PG_CONFIG) --pgxs)
 include $(PGXS)
+
+# This seems to be needed at least for PG 9.3.11
+all: $(DATA)
--- a/pg/crankshaft--0.0.1.sql
+++ b/pg/crankshaft--0.0.1.sql
@@ -137,6 +137,53 @@ BEGIN
 END;
 $$
 LANGUAGE plpgsql VOLATILE;
+-- cdb_create_segment: PL/Python entry point that delegates to
+-- crankshaft.segmentation.create_segment and returns its result as NUMERIC.
+-- NOTE(review): none of the declared arguments are forwarded yet; the call
+-- below still passes the placeholder string 'table' (see the TODO).
+CREATE OR REPLACE FUNCTION
+  cdb_create_segment (
+      segment_name TEXT,
+      table_name TEXT,
+      column_name TEXT,
+      geoid_column TEXT DEFAULT 'geoid',
+      census_table TEXT DEFAULT 'block_groups'
+  )
+RETURNS NUMERIC
+AS $$
+  # The imported name has to match the name called below; with the earlier
+  # misspelling, create_segment was never bound in this scope.
+  # NOTE(review): assumes crankshaft.segmentation exports create_segment -- confirm.
+  from crankshaft.segmentation import create_segment
+  # TODO: use named parameters or a dictionary
+  return create_segment('table')
+$$ LANGUAGE plpythonu;
+
+-- cdb_predict_segment: PL/Python entry point declared to return
+-- (geoid, prediction) rows.
+-- NOTE(review): the body still calls create_segment with the placeholder
+-- string 'table' and forwards none of the declared arguments; this looks
+-- like a stub copied from cdb_create_segment -- confirm the intended
+-- prediction routine.
+CREATE OR REPLACE FUNCTION
+  cdb_predict_segment (
+      segment_name TEXT,
+      geoid_column TEXT DEFAULT 'geoid',
+      census_table TEXT DEFAULT 'block_groups'
+  )
+RETURNS TABLE(geoid TEXT, prediction NUMERIC)
+AS $$
+  # The imported name has to match the name called below; with the earlier
+  # misspelling, create_segment was never bound in this scope.
+  # NOTE(review): assumes crankshaft.segmentation exports create_segment -- confirm.
+  from crankshaft.segmentation import create_segment
+  # TODO: use named parameters or a dictionary
+  return create_segment('table')
+$$ LANGUAGE plpythonu;
+-- cdb_adaptive_histogram: hands table_name and column_name to
+-- crankshaft.bayesian_blocks.adaptive_histogram and returns its rows as
+-- (bin_start, bin_end, value).
+CREATE OR REPLACE FUNCTION cdb_adaptive_histogram(table_name TEXT, column_name TEXT)
+RETURNS TABLE (
+    bin_start NUMERIC,
+    bin_end   NUMERIC,
+    value     NUMERIC
+)
+AS $$
+  import crankshaft.bayesian_blocks as bayesian_blocks
+  return bayesian_blocks.adaptive_histogram(table_name, column_name)
+$$ LANGUAGE plpythonu;
+
+-- cdb_simple_test: argument-less PL/Python function that always returns 5.
+CREATE OR REPLACE FUNCTION cdb_simple_test()
+RETURNS NUMERIC
+AS $$
+  answer = 5
+  return answer
+$$ LANGUAGE plpythonu;
 -- Make sure by default there are no permissions for publicuser
 -- NOTE: this happens at extension creation time, as part of an implicit transaction.
 -- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;
--- a/pg/doc/05_cdb_union_adjacent.md
+++ b/pg/doc/05_cdb_union_adjacent.md
@@ -1,25 +0,0 @@
-### Union Adjacent
-
-This is an aggregate function that will take a set of polygons and return a geometry array
-of regions where the polygons are continuous. Basically it combines polygons
-which are touching in to single polygons.
-
-It takes a single value:
-
-* `geometry` a list of geometries to be clustered and joined
-
-and returns
-
-* `geometry[]` an array of the joined geometries.
-
-An example usage would be something like:
-
-```postgresql
-  with joined_polygons as (
-    select cdb_union_adjacent(the_geom) regions from some_table
-  )
-  select unnest(region) the_geom from joined_polygons
-```
-
-which will produce a table with regions of continuous polygons from the original
-table.
--- a/pg/sql/0.0.1/05_cdb_union_adjacent.sql
+++ b/pg/sql/0.0.1/05_cdb_union_adjacent.sql
@@ -1,43 +0,0 @@
-CREATE OR REPLACE FUNCTION _cdb_final_union_adjacent( joined_geoms geometry[] )
-RETURNS geometry[] AS $$
-BEGIN
-    RETURN joined_geoms;
-END
-$$ LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION _cdb_state_update_union_adjacent(clusters geometry[], new_geom  geometry)
-RETURNS geometry[] AS $$
-DECLARE
-  joins  geometry[] :='{}';
-  unjoined geometry[] :='{}';
-  i integer;
-  combined geometry;
-BEGIN
-  joins := (select array_agg(g)
-            from unnest(clusters) a(g)
-            where ST_TOUCHES(g, new_geom));
-
-  unjoined := (select array_agg(g)
-               from unnest(clusters) a(g)
-               where ST_TOUCHES(g, new_geom) = false);
-
-  IF array_length(joins, 1) > 0 THEN
-    joins := array_append(joins, new_geom);
-    combined := ST_UNION(joins);
-  ELSE
-    combined := new_geom;
-  END IF;
-
-  unjoined := array_append(unjoined, combined);
-  RETURN unjoined;
-END
-$$
-LANGUAGE plpgsql;
-
-CREATE AGGREGATE cdb_union_adjacent(geometry)(
-  SFUNC=_cdb_state_update_union_adjacent,
-  STYPE=geometry[],
-  FINALFUNC=_cdb_final_union_adjacent,
-  INITCOND='{}'
-);
--- a/pg/sql/0.0.1/06_bayesian_blocks.sql
+++ b/pg/sql/0.0.1/06_bayesian_blocks.sql
@@ -0,0 +1,11 @@
+-- cdb_adaptive_histogram: hands table_name and column_name to
+-- crankshaft.bayesian_blocks.adaptive_histogram and returns its rows as
+-- (bin_start, bin_end, value).
+CREATE OR REPLACE FUNCTION cdb_adaptive_histogram(table_name TEXT, column_name TEXT)
+RETURNS TABLE (
+    bin_start NUMERIC,
+    bin_end   NUMERIC,
+    value     NUMERIC
+)
+AS $$
+  import crankshaft.bayesian_blocks as bayesian_blocks
+  return bayesian_blocks.adaptive_histogram(table_name, column_name)
+$$ LANGUAGE plpythonu;
--- a/pg/test/0.0.1/expected/05_cdb_union_adjacent_test.out
+++ b/pg/test/0.0.1/expected/05_cdb_union_adjacent_test.out
@@ -1,22 +0,0 @@
-\i test/fixtures/touching_polygons.sql
-- test table (polygons, some of which touch and some which dont)
-CREATE TABLE touching_polygons(cartodb_id integer, the_geom geometry);
-INSERT INTO  touching_polygons VALUES
-(1, ST_GeomFromText('POLYGON ((0 0, 1 0,1 1, 0 1, 0 0 ))')),
-(2, ST_GeomFromText('POLYGON ((1 0, 2 0, 2 1, 1 1, 1 0))')),
-(1, ST_GeomFromText('POLYGON ((0 1, 1 1,1 2, 0 2, 0 1 ))')),
-(4, ST_GeomFromText('POLYGON ((3 0, 4 0, 4 1, 3 1, 3 0))')),
-(5, ST_GeomFromText('POLYGON ((3 1, 4 1, 4 2, 3 2, 3 1))'));
-WITH joined_polygons AS (
-  SELECT cdb_crankshaft.cdb_union_adjacent(the_geom) the_geom FROM touching_polygons
-),
-unnested_polygons as (
-  select unnest(joined_polygons.the_geom) the_geom from joined_polygons
-)
-select ST_ASTEXT(unnested_polygons.the_geom) from unnested_polygons;
-                   st_astext                    
------------------------------------------------
- POLYGON((1 0,0 0,0 1,0 2,1 2,1 1,2 1,2 0,1 0))
- POLYGON((4 1,4 0,3 0,3 1,3 2,4 2,4 1))
-(2 rows)
-
--- a/pg/test/0.0.1/sql/05_cdb_union_adjacent_test.sql
+++ b/pg/test/0.0.1/sql/05_cdb_union_adjacent_test.sql
@@ -1,9 +0,0 @@
-\i test/fixtures/touching_polygons.sql
-
-WITH joined_polygons AS (
-  SELECT cdb_crankshaft.cdb_union_adjacent(the_geom) the_geom FROM touching_polygons
-),
-unnested_polygons as (
-  select unnest(joined_polygons.the_geom) the_geom from joined_polygons
-)
-select ST_ASTEXT(unnested_polygons.the_geom) from unnested_polygons;
--- a/pg/test/fixtures/touching_polygons.sql
+++ b/pg/test/fixtures/touching_polygons.sql
@@ -1,8 +0,0 @@
-- test table (polygons, some of which touch and some which dont)
-CREATE TABLE touching_polygons(cartodb_id integer, the_geom geometry);
-INSERT INTO  touching_polygons VALUES
-(1, ST_GeomFromText('POLYGON ((0 0, 1 0,1 1, 0 1, 0 0 ))')),
-(2, ST_GeomFromText('POLYGON ((1 0, 2 0, 2 1, 1 1, 1 0))')),
-(1, ST_GeomFromText('POLYGON ((0 1, 1 1,1 2, 0 2, 0 1 ))')),
-(4, ST_GeomFromText('POLYGON ((3 0, 4 0, 4 1, 3 1, 3 0))')),
-(5, ST_GeomFromText('POLYGON ((3 1, 4 1, 4 2, 3 2, 3 1))'));
--- a/python/crankshaft/crankshaft/init.py
+++ b/python/crankshaft/crankshaft/init.py
@@ -1,2 +1,3 @@
 import random_seeds
 import clustering
+import bayesian_blocks
--- a/python/crankshaft/crankshaft/bayesian_blocks/init.py
+++ b/python/crankshaft/crankshaft/bayesian_blocks/init.py
@@ -0,0 +1 @@
+from bayesian_blocks import *
--- a/python/crankshaft/crankshaft/bayesian_blocks/bayesian_blocks.py
+++ b/python/crankshaft/crankshaft/bayesian_blocks/bayesian_blocks.py
@@ -0,0 +1,84 @@
+import plpy
+import numpy as np
+
+
+def adaptive_histogram(table_name, column_name):
+    """Histogram one column of a table using Bayesian Blocks bin edges.
+
+    Parameters
+    ----------
+    table_name : str
+        Table to read from.  Interpolated into the query exactly as given
+        (so schema-qualified names keep working); it must come from a
+        trusted caller.
+    column_name : str
+        Column to histogram.  Quoted as an identifier before use.
+
+    Returns
+    -------
+    A sequence of (bin_start, bin_end, value) tuples, matching the column
+    order declared by the cdb_adaptive_histogram SQL wrapper.  Empty when
+    the column holds no non-NULL values.
+    """
+    query = "select {column} as val from {table}".format(
+        column=plpy.quote_ident(column_name),
+        table=table_name)
+    rows = plpy.execute(query)
+
+    # Read the aliased column (the previous code always looked up a 'count'
+    # key, whatever column was requested) and skip NULLs, which float()
+    # cannot convert.
+    data = [float(row['val']) for row in rows if row['val'] is not None]
+    if not data:
+        return []
+
+    vals, bins = np.histogram(data, bins=_bayesian_blocks(data))
+    # Emit native Python numbers, lower edge / upper edge / count per bin.
+    return zip(bins[:-1].tolist(), bins[1:].tolist(), vals.tolist())
+
+
+def _bayesian_blocks(t):
+    """Compute Bayesian Blocks bin edges for a 1-D sample.
+
+    By Jake Vanderplas.  License: BSD
+    Based on algorithm outlined in http://adsabs.harvard.edu/abs/2012arXiv1207.5578S
+
+    Parameters
+    ----------
+    t : array-like, length N
+        data to be histogrammed
+
+    Returns
+    -------
+    bins : ndarray
+        the cell edges selected as change points: a subset of the (N + 1)
+        candidate edges, or an empty array when ``t`` is empty
+
+    Notes
+    -----
+    This is an incomplete implementation: it may fail for some
+    datasets.  Alternate fitness functions and prior forms can
+    be found in the paper listed above.
+    NOTE(review): repeated values produce zero-width candidate blocks, so
+    np.log(width) is -inf for them -- presumably one of the failure cases
+    mentioned above; confirm before relying on such input.
+    """
+    # copy and sort the array
+    t = np.sort(t)
+    N = t.size
+
+    # Nothing to partition: without this guard t[-1] below raises IndexError.
+    if N == 0:
+        return t
+
+    # create length-(N + 1) array of cell edges: the two extremes plus the
+    # midpoints between consecutive sorted values
+    edges = np.concatenate([t[:1],
+                            0.5 * (t[1:] + t[:-1]),
+                            t[-1:]])
+    # distance from each edge to the largest value
+    block_length = t[-1] - edges
+
+    # arrays needed for the iteration
+    nn_vec = np.ones(N)
+    best = np.zeros(N, dtype=float)
+    last = np.zeros(N, dtype=int)
+
+    #-----------------------------------------------------------------
+    # Start with first data cell; add one cell at each iteration
+    #-----------------------------------------------------------------
+    for K in range(N):
+        # Compute the width and count of the final bin for all possible
+        # locations of the K^th changepoint
+        width = block_length[:K + 1] - block_length[K + 1]
+        count_vec = np.cumsum(nn_vec[:K + 1][::-1])[::-1]
+
+        # evaluate fitness function for these possibilities
+        fit_vec = count_vec * (np.log(count_vec) - np.log(width))
+        fit_vec -= 4  # 4 comes from the prior on the number of changepoints
+        # add the best fitness of everything before each candidate start
+        fit_vec[1:] += best[:K]
+
+        # find the max of the fitness: this is the K^th changepoint
+        i_max = np.argmax(fit_vec)
+        last[K] = i_max
+        best[K] = fit_vec[i_max]
+
+    #-----------------------------------------------------------------
+    # Recover changepoints by iteratively peeling off the last block
+    #-----------------------------------------------------------------
+    change_points = np.zeros(N, dtype=int)
+    i_cp = N
+    ind = N
+    while True:
+        # fill change_points from the back; stop once edge 0 is recorded
+        i_cp -= 1
+        change_points[i_cp] = ind
+        if ind == 0:
+            break
+        ind = last[ind - 1]
+    # keep only the slots that were actually filled
+    change_points = change_points[i_cp:]
+
+    return edges[change_points]
--- a/python/crankshaft/setup.py
+++ b/python/crankshaft/setup.py
@@ -10,7 +10,7 @@ from setuptools import setup, find_packages
 setup(
    name='crankshaft',

-    version='0.0.01',
+    version='0.0.1',

    description='CartoDB Spatial Analysis Python Library',

@@ -40,7 +40,7 @@ setup(

    # The choice of component versions is dictated by what's
    # provisioned in the production servers.
-    install_requires=['pysal==1.11.0','numpy==1.6.1','scipy==0.17.0'],
+    install_requires=['pysal==1.11.0','numpy==1.10.1','scipy==0.17.0'],

    requires=['pysal', 'numpy'],
Author	SHA1	Message	Date
Stuart Lynn	0e24d542b3	renaming to fix typo	2016-03-07 12:55:25 -05:00
Stuart Lynn	79bd319366	bayesian_blocks function	2016-03-07 11:49:57 -05:00
Javier Goizueta	46c66476b5	Merge pull request #5 from CartoDB/4-pgxs-fix Adapt Makefile of the extension for some PGXS versions	2016-02-29 16:35:04 +01:00
Javier Goizueta	e03aac4d8f	Fix typo	2016-02-26 19:09:17 +01:00
Javier Goizueta	d885c16db2	Adapt Makefile of the extension for some PGXS versions Postgresql 9.3.11 doesn't generates $DATA by default. fixes #4	2016-02-26 19:02:18 +01:00