Add comments

Sketch of the cdb_population function
2016-02-22 15:36:02 +01:00 · 2016-02-22 13:27:31 +01:00
14 changed files with 145 additions and 194 deletions
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -12,7 +12,7 @@ name must be created.
 ### Version numbers
 The version of both the SQL extension and the Python package shall
-follow the [Semantic Versioning 2.0](http://semver.org/) guidelines:
+follow the[Semantic Versioning 2.0](http://semver.org/) guidelines:
 * When backwards incompatibility is introduced the major number is incremented
 * When functionally is added (in a backwards-compatible manner) the minor number
--- a/README.md
+++ b/README.md
@@ -7,6 +7,8 @@ CartoDB Spatial Analysis extension for PostgreSQL.
 * *pg* contains the PostgreSQL extension source code
 * *python* Python module
 FIXME: should it be `./extension` and `./lib/python' ?
 ## Requirements
 * pip
--- a/pg/Makefile
+++ b/pg/Makefile
@@ -28,6 +28,3 @@ REGRESS_OPTS = --inputdir='$(TEST_DIR)' --outputdir='$(TEST_DIR)'
 PG_CONFIG = pg_config
 PGXS := $(shell $(PG_CONFIG) --pgxs)
 include $(PGXS)
 # This seems to be needed at least for PG 9.3.11
 all: $(DATA)
--- a/pg/crankshaft--0.0.1.sql
+++ b/pg/crankshaft--0.0.1.sql
@@ -1,6 +1,3 @@
 --DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES
 -- Complain if script is sourced in psql, rather than via CREATE EXTENSION
 \echo Use "CREATE EXTENSION crankshaft" to load this file. \quit
 -- Internal function.
 -- Set the seeds of the RNGs (Random Number Generators)
 -- used internally.
@@ -136,60 +133,4 @@ BEGIN
  RETURN ST_Collect(points);
 END;
 $$
-LANGUAGE plpgsql VOLATILE;
+LANGUAGE plpgsql VOLATILE
 CREATE OR REPLACE FUNCTION
  cdb_create_segment (
      segment_name TEXT,
      table_name TEXT,
  	  column_name TEXT,
      geoid_column TEXT DEFAULT 'geoid',
      census_table TEXT DEFAULT 'block_groups'
  )
 RETURNS NUMERIC
 AS $$
  from crankshaft.segmentation import create_segemnt
  # TODO: use named parameters or a dictionary
  return create_segment('table')
 $$ LANGUAGE plpythonu;
 CREATE OR REPLACE FUNCTION
  cdb_predict_segment (
      segment_name TEXT,
      geoid_column TEXT DEFAULT 'geoid',
      census_table TEXT DEFAULT 'block_groups'
  )
 RETURNS TABLE(geoid TEXT, prediction NUMERIC)
 AS $$
  from crankshaft.segmentation import create_segemnt
  # TODO: use named parameters or a dictionary
  return create_segment('table')
 $$ LANGUAGE plpythonu;
 CREATE OR REPLACE FUNCTION
  cdb_adaptive_histogram (
      table_name  TEXT,
      column_name TEXT
  )
 RETURNS TABLE (bin_start numeric,bin_end numeric,value numeric)
 AS $$
  from crankshaft.bayesian_blocks import adaptive_histogram
  return adaptive_histogram(table_name,column_name)
 $$ LANGUAGE plpythonu;
 CREATE OR REPLACE FUNCTION
  cdb_simple_test (
  )
 RETURNS NUMERIC
 AS $$
  return 5
 $$ LANGUAGE plpythonu;
 -- Make sure by default there are no permissions for publicuser
 -- NOTE: this happens at extension creation time, as part of an implicit transaction.
 -- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;
 -- Grant permissions on the schema to publicuser (but just the schema)
 GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser;
 -- Revoke execute permissions on all functions in the schema by default
 -- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser;
--- a/pg/sql/0.0.1/00_header.sql
+++ b/pg/sql/0.0.1/00_header.sql
@@ -1,3 +0,0 @@
 --DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES
 -- Complain if script is sourced in psql, rather than via CREATE EXTENSION
 \echo Use "CREATE EXTENSION crankshaft" to load this file. \quit
--- a/pg/sql/0.0.1/04_dot_density.sql
+++ b/pg/sql/0.0.1/04_dot_density.sql
@@ -51,4 +51,4 @@ BEGIN
  RETURN ST_Collect(points);
 END;
 $$
-LANGUAGE plpgsql VOLATILE;
+LANGUAGE plpgsql VOLATILE
--- a/pg/sql/0.0.1/06_bayesian_blocks.sql
+++ b/pg/sql/0.0.1/06_bayesian_blocks.sql
@@ -1,11 +0,0 @@
 CREATE OR REPLACE FUNCTION
  cdb_adaptive_histogram (
      table_name  TEXT,
      column_name TEXT
  )
 RETURNS TABLE (bin_start numeric,bin_end numeric,value numeric)
 AS $$
  from crankshaft.bayesian_blocks import adaptive_histogram
  return adaptive_histogram(table_name,column_name)
 $$ LANGUAGE plpythonu;
--- a/pg/sql/0.0.1/90_permissions.sql
+++ b/pg/sql/0.0.1/90_permissions.sql
@@ -1,9 +0,0 @@
 -- Make sure by default there are no permissions for publicuser
 -- NOTE: this happens at extension creation time, as part of an implicit transaction.
 -- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;
 -- Grant permissions on the schema to publicuser (but just the schema)
 GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser;
 -- Revoke execute permissions on all functions in the schema by default
 -- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser;
--- a/pg/sql/0.0.1/population.sql
+++ b/pg/sql/0.0.1/population.sql
@@ -0,0 +1,138 @@
 -- Function to obtain an estimate of the population living inside
 -- an area (polygon) from the CartoDB Data Observatory
 CREATE OR REPLACE FUNCTION cdb_population(area geometry)
 RETURNS NUMERIC AS $$
 DECLARE
  georef_column TEXT;
  table_id TEXT;
  tag_value TEXT;
  table_name TEXT;
  column_name TEXT;
  population NUMERIC;
 BEGIN
  -- Note: comments contain pseudo-code that should be implemented
  -- Register metadata tables:
  -- This would require super-user privileges
  /*
  SELECT cdb_add_remote_table('observatory', 'bmd_column_table');
  SELECT cdb_add_remote_table('observatory', 'bmd_column_2_column');
  SELECT cdb_add_remote_table('observatory', 'bmd_table');
  SELECT cdb_add_remote_table('observatory', 'bmd_column_table');
  SELECT cdb_add_remote_table('observatory', 'bmd_column_tag');
  SELECT cdb_add_remote_table('observatory', 'bmd_tag');
  */
  tag_value := 'population';
  -- Determine the georef column id to be used: it must have type 'geometry',
  -- the maximum weight.
  -- TODO: in general, multiple columns with maximal weight could be found;
  -- we should use the timespan of the table to disambiguate (choose the
  -- most recent). Also a rank of geometry columns should be introduced to
  -- find select the greatest resolution available.
  /*
  WITH selected_tables AS (
    -- Find tables that have population columns and cover the input area
    SELECT tab.id AS id
    FROM observatory.bmd_column col,
         observatory.bmd_column_table coltab,
         observatory.bmd_table tab,
         observatory.bmd_tag tag,
         observatory.bmd_column_tag coltag
    WHERE coltab.column_id = col.id
      AND coltab.table_id = tab.id
      AND coltag.tag_id = tag.id
      AND coltag.column_id = col.id
      AND tag.name ILIKE tag_value
      AND tab.id = table_id
      AND tab.bounds && area;
  )
  SELECT
    FROM bmd_column col
    JOIN bmd_table tab ON col.table_id = tab.id
    WHERE type = 'geometry'
      AND tab.id IN (selected_tables)
    ORDER BY weight DESC LIMIT 1;
  */
  georef_column := '"us.census.tiger".block_group_2013';
  -- Now we will query the metadata to find which actual tables correspond
  -- to this datasource and resolution/timespan
  -- and choose the 'parent' or more general of them.
  /*
  SELECT from_table_geoid.id data_table_id
  FROM observatory.bmd_column_table from_column_table_geoid,
       observatory.bmd_column_table to_column_table_geoid,
       observatory.bmd_column_2_column rel,
       observatory.bmd_column_table to_column_table_geom,
       observatory.bmd_table from_table_geoid,
       observatory.bmd_table to_table_geoid,
       observatory.bmd_table to_table_geom
  WHERE from_column_table_geoid.column_id = to_column_table_geoid.column_id
    AND to_column_table_geoid.column_id = rel.from_id
    AND rel.reltype = 'geom_ref'
    AND rel.to_id = to_column_table_geom.column_id
    AND to_column_table_geom.column_id = georef_column
    AND from_table_geoid.id = from_column_table_geoid.table_id
    AND to_table_geoid.id = to_column_table_geoid.table_id
    AND to_table_geom.id = to_column_table_geom.table_id
    AND from_table_geoid.bounds && area
  ORDER by from_table_geoid.timespan desc
  INTO table_id;
  */
  table_id := '"us.census.acs".extract_2013_5yr_block_group';
  -- Next will fetch the columns of that table that are tagged as population:
  -- and get the more general one (not having a parent or denominator)
  /*
  WITH column_ids AS (
    SELECT col.id AS id
    FROM observatory.bmd_column col,
         observatory.bmd_column_table coltab,
         observatory.bmd_table tab,
         observatory.bmd_tag tag,
         observatory.bmd_column_tag coltag
    WHERE coltab.column_id = col.id
      AND coltab.table_id = tab.id
      AND coltag.tag_id = tag.id
      AND coltag.column_id = col.id
      AND tag.name ILIKE tag_value
      AND tab.id = table_id;
  ),
  excluded_column_ids AS (
    SELECT from_id AS id
    FROM observatory.bmd_column_2_column
    WHERE from_id in (column_ids)
      AND reltype in ('parent', 'denominator')
      AND to_id in (column_ids)
  ),
  SELECT bmd_table.tablename, bmd_column_table.colname
  FROM observatory.bmd_column_table,
       observatory.bmd_table
  WHERE bmd_column_table.table_id = bmd_table.id
    AND bmd_column_table.column_id IN (column_ids)
    AND NOT bmd_column_table.column_id IN (exclude_column_ids)
  INTO (table_name, column_name);
  */
  table_name := 'us_census_acs2013_5yr_block_group';
  column_name := 'total_pop';
  -- Register the foreign table
  -- This would require super-user privileges
  -- SELECT cdb_add_remote_table('observatory', table_name);
  -- Perform the query
  SELECT cdb_crankshaft.cdb_overlap_sum(
    area,
    table_name,
    column_name,
    schema_name := 'observatory')
  INTO population;
  RETURN population;
 END;
 $$
 LANGUAGE plpgsql VOLATILE
--- a/pg/test/0.0.1/sql/90_permissions.sql
+++ b/pg/test/0.0.1/sql/90_permissions.sql
@@ -1,18 +0,0 @@
 SELECT cdb_crankshaft._cdb_random_seeds(1234);
 -- Use regular user role
 SET ROLE test_regular_user;
 -- Add to the search path the schema
 SET search_path TO public,cartodb,cdb_crankshaft;
 -- Exercise public functions
 SELECT ppoints.code, m.quads
  FROM ppoints
  JOIN cdb_moran_local('ppoints', 'value') m
    ON ppoints.cartodb_id = m.ids
  ORDER BY ppoints.code;
 SELECT round(cdb_overlap_sum(
  '0106000020E61000000100000001030000000100000004000000FFFFFFFFFF3604C09A0B9ECEC42E444000000000C060FBBF30C7FD70E01D44400000000040AD02C06481F1C8CD034440FFFFFFFFFF3604C09A0B9ECEC42E4440'::geometry,
  'values', 'value'
 ), 2);
--- a/python/crankshaft/crankshaft/init.py
+++ b/python/crankshaft/crankshaft/init.py
@@ -1,3 +1,2 @@
 import random_seeds
 import clustering
 import bayesian_blocks
--- a/python/crankshaft/crankshaft/bayesian_blocks/init.py
+++ b/python/crankshaft/crankshaft/bayesian_blocks/init.py
@@ -1 +0,0 @@
 from bayesian_blocks import *
--- a/python/crankshaft/crankshaft/bayesian_blocks/bayesian_blocks.py
+++ b/python/crankshaft/crankshaft/bayesian_blocks/bayesian_blocks.py
@@ -1,84 +0,0 @@
 import plpy
 import numpy as np
 def adaptive_histogram(table_name,column_name):
    data = plpy.execute("select {column_name} from {table_name}".format(**locals()))
    data = [float(d['count']) for d in data]
    plpy.notice(data)
    vals, bins = np.histogram( data, bins=_bayesian_blocks(data))
    return zip(vals,bins, bins[1:])
 def _bayesian_blocks(t):
    """Bayesian Blocks Implementation
    By Jake Vanderplas.  License: BSD
    Based on algorithm outlined in http://adsabs.harvard.edu/abs/2012arXiv1207.5578S
    Parameters
    ----------
    t : ndarray, length N
        data to be histogrammed
    Returns
    -------
    bins : ndarray
        array containing the (N+1) bin edges
    Notes
    -----
    This is an incomplete implementation: it may fail for some
    datasets.  Alternate fitness functions and prior forms can
    be found in the paper listed above.
    """
    # copy and sort the array
    t = np.sort(t)
    N = t.size
    # create length-(N + 1) array of cell edges
    edges = np.concatenate([t[:1],
                            0.5 * (t[1:] + t[:-1]),
                            t[-1:]])
    block_length = t[-1] - edges
    # arrays needed for the iteration
    nn_vec = np.ones(N)
    best = np.zeros(N, dtype=float)
    last = np.zeros(N, dtype=int)
    #-----------------------------------------------------------------
    # Start with first data cell; add one cell at each iteration
    #-----------------------------------------------------------------
    for K in range(N):
        # Compute the width and count of the final bin for all possible
        # locations of the K^th changepoint
        width = block_length[:K + 1] - block_length[K + 1]
        count_vec = np.cumsum(nn_vec[:K + 1][::-1])[::-1]
        # evaluate fitness function for these possibilities
        fit_vec = count_vec * (np.log(count_vec) - np.log(width))
        fit_vec -= 4  # 4 comes from the prior on the number of changepoints
        fit_vec[1:] += best[:K]
        # find the max of the fitness: this is the K^th changepoint
        i_max = np.argmax(fit_vec)
        last[K] = i_max
        best[K] = fit_vec[i_max]
    #-----------------------------------------------------------------
    # Recover changepoints by iteratively peeling off the last block
    #-----------------------------------------------------------------
    change_points =  np.zeros(N, dtype=int)
    i_cp = N
    ind = N
    while True:
        i_cp -= 1
        change_points[i_cp] = ind
        if ind == 0:
            break
        ind = last[ind - 1]
    change_points = change_points[i_cp:]
    return edges[change_points]
--- a/python/crankshaft/setup.py
+++ b/python/crankshaft/setup.py
@@ -10,7 +10,7 @@ from setuptools import setup, find_packages
 setup(
    name='crankshaft',
-    version='0.0.1',
+    version='0.0.01',
    description='CartoDB Spatial Analysis Python Library',
@@ -40,7 +40,7 @@ setup(
    # The choice of component versions is dictated by what's
    # provisioned in the production servers.
-    install_requires=['pysal==1.11.0','numpy==1.10.1','scipy==0.17.0'],
+    install_requires=['pysal==1.11.0','numpy==1.6.1','scipy==0.17.0'],
    requires=['pysal', 'numpy'],
Author	SHA1	Message	Date
Javier Goizueta	c293781624	Add comments	2016-02-22 15:36:02 +01:00
Javier Goizueta	6fa726bcce	Sketch of the cdb_population function	2016-02-22 13:27:31 +01:00