Compare commits
2 Commits
bayesian_b
...
population
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c293781624 | ||
|
|
6fa726bcce |
@@ -12,7 +12,7 @@ name must be created.
|
|||||||
### Version numbers
|
### Version numbers
|
||||||
|
|
||||||
The version of both the SQL extension and the Python package shall
|
The version of both the SQL extension and the Python package shall
|
||||||
follow the [Semantic Versioning 2.0](http://semver.org/) guidelines:
|
follow the[Semantic Versioning 2.0](http://semver.org/) guidelines:
|
||||||
|
|
||||||
* When backwards incompatibility is introduced the major number is incremented
|
* When backwards incompatibility is introduced the major number is incremented
|
||||||
* When functionally is added (in a backwards-compatible manner) the minor number
|
* When functionally is added (in a backwards-compatible manner) the minor number
|
||||||
|
|||||||
@@ -7,6 +7,8 @@ CartoDB Spatial Analysis extension for PostgreSQL.
|
|||||||
* *pg* contains the PostgreSQL extension source code
|
* *pg* contains the PostgreSQL extension source code
|
||||||
* *python* Python module
|
* *python* Python module
|
||||||
|
|
||||||
|
FIXME: should it be `./extension` and `./lib/python' ?
|
||||||
|
|
||||||
## Requirements
|
## Requirements
|
||||||
|
|
||||||
* pip
|
* pip
|
||||||
|
|||||||
@@ -28,6 +28,3 @@ REGRESS_OPTS = --inputdir='$(TEST_DIR)' --outputdir='$(TEST_DIR)'
|
|||||||
PG_CONFIG = pg_config
|
PG_CONFIG = pg_config
|
||||||
PGXS := $(shell $(PG_CONFIG) --pgxs)
|
PGXS := $(shell $(PG_CONFIG) --pgxs)
|
||||||
include $(PGXS)
|
include $(PGXS)
|
||||||
|
|
||||||
# This seems to be needed at least for PG 9.3.11
|
|
||||||
all: $(DATA)
|
|
||||||
|
|||||||
@@ -1,6 +1,3 @@
|
|||||||
--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES
|
|
||||||
-- Complain if script is sourced in psql, rather than via CREATE EXTENSION
|
|
||||||
\echo Use "CREATE EXTENSION crankshaft" to load this file. \quit
|
|
||||||
-- Internal function.
|
-- Internal function.
|
||||||
-- Set the seeds of the RNGs (Random Number Generators)
|
-- Set the seeds of the RNGs (Random Number Generators)
|
||||||
-- used internally.
|
-- used internally.
|
||||||
@@ -136,60 +133,4 @@ BEGIN
|
|||||||
RETURN ST_Collect(points);
|
RETURN ST_Collect(points);
|
||||||
END;
|
END;
|
||||||
$$
|
$$
|
||||||
LANGUAGE plpgsql VOLATILE;
|
LANGUAGE plpgsql VOLATILE
|
||||||
CREATE OR REPLACE FUNCTION
|
|
||||||
cdb_create_segment (
|
|
||||||
segment_name TEXT,
|
|
||||||
table_name TEXT,
|
|
||||||
column_name TEXT,
|
|
||||||
geoid_column TEXT DEFAULT 'geoid',
|
|
||||||
census_table TEXT DEFAULT 'block_groups'
|
|
||||||
)
|
|
||||||
RETURNS NUMERIC
|
|
||||||
AS $$
|
|
||||||
from crankshaft.segmentation import create_segemnt
|
|
||||||
# TODO: use named parameters or a dictionary
|
|
||||||
return create_segment('table')
|
|
||||||
$$ LANGUAGE plpythonu;
|
|
||||||
|
|
||||||
CREATE OR REPLACE FUNCTION
|
|
||||||
cdb_predict_segment (
|
|
||||||
segment_name TEXT,
|
|
||||||
geoid_column TEXT DEFAULT 'geoid',
|
|
||||||
census_table TEXT DEFAULT 'block_groups'
|
|
||||||
)
|
|
||||||
RETURNS TABLE(geoid TEXT, prediction NUMERIC)
|
|
||||||
AS $$
|
|
||||||
from crankshaft.segmentation import create_segemnt
|
|
||||||
# TODO: use named parameters or a dictionary
|
|
||||||
return create_segment('table')
|
|
||||||
$$ LANGUAGE plpythonu;
|
|
||||||
CREATE OR REPLACE FUNCTION
|
|
||||||
cdb_adaptive_histogram (
|
|
||||||
table_name TEXT,
|
|
||||||
column_name TEXT
|
|
||||||
)
|
|
||||||
RETURNS TABLE (bin_start numeric,bin_end numeric,value numeric)
|
|
||||||
|
|
||||||
AS $$
|
|
||||||
from crankshaft.bayesian_blocks import adaptive_histogram
|
|
||||||
return adaptive_histogram(table_name,column_name)
|
|
||||||
$$ LANGUAGE plpythonu;
|
|
||||||
|
|
||||||
CREATE OR REPLACE FUNCTION
|
|
||||||
cdb_simple_test (
|
|
||||||
)
|
|
||||||
RETURNS NUMERIC
|
|
||||||
|
|
||||||
AS $$
|
|
||||||
return 5
|
|
||||||
$$ LANGUAGE plpythonu;
|
|
||||||
-- Make sure by default there are no permissions for publicuser
|
|
||||||
-- NOTE: this happens at extension creation time, as part of an implicit transaction.
|
|
||||||
-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;
|
|
||||||
|
|
||||||
-- Grant permissions on the schema to publicuser (but just the schema)
|
|
||||||
GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser;
|
|
||||||
|
|
||||||
-- Revoke execute permissions on all functions in the schema by default
|
|
||||||
-- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser;
|
|
||||||
|
|||||||
@@ -1,3 +0,0 @@
|
|||||||
--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES
|
|
||||||
-- Complain if script is sourced in psql, rather than via CREATE EXTENSION
|
|
||||||
\echo Use "CREATE EXTENSION crankshaft" to load this file. \quit
|
|
||||||
@@ -51,4 +51,4 @@ BEGIN
|
|||||||
RETURN ST_Collect(points);
|
RETURN ST_Collect(points);
|
||||||
END;
|
END;
|
||||||
$$
|
$$
|
||||||
LANGUAGE plpgsql VOLATILE;
|
LANGUAGE plpgsql VOLATILE
|
||||||
|
|||||||
@@ -1,11 +0,0 @@
|
|||||||
CREATE OR REPLACE FUNCTION
|
|
||||||
cdb_adaptive_histogram (
|
|
||||||
table_name TEXT,
|
|
||||||
column_name TEXT
|
|
||||||
)
|
|
||||||
RETURNS TABLE (bin_start numeric,bin_end numeric,value numeric)
|
|
||||||
|
|
||||||
AS $$
|
|
||||||
from crankshaft.bayesian_blocks import adaptive_histogram
|
|
||||||
return adaptive_histogram(table_name,column_name)
|
|
||||||
$$ LANGUAGE plpythonu;
|
|
||||||
@@ -1,9 +0,0 @@
|
|||||||
-- Make sure by default there are no permissions for publicuser
|
|
||||||
-- NOTE: this happens at extension creation time, as part of an implicit transaction.
|
|
||||||
-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;
|
|
||||||
|
|
||||||
-- Grant permissions on the schema to publicuser (but just the schema)
|
|
||||||
GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser;
|
|
||||||
|
|
||||||
-- Revoke execute permissions on all functions in the schema by default
|
|
||||||
-- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser;
|
|
||||||
138
pg/sql/0.0.1/population.sql
Normal file
138
pg/sql/0.0.1/population.sql
Normal file
@@ -0,0 +1,138 @@
|
|||||||
|
-- Function to obtain an estimate of the population living inside
|
||||||
|
-- an area (polygon) from the CartoDB Data Observatory
|
||||||
|
CREATE OR REPLACE FUNCTION cdb_population(area geometry)
|
||||||
|
RETURNS NUMERIC AS $$
|
||||||
|
DECLARE
|
||||||
|
georef_column TEXT;
|
||||||
|
table_id TEXT;
|
||||||
|
tag_value TEXT;
|
||||||
|
table_name TEXT;
|
||||||
|
column_name TEXT;
|
||||||
|
population NUMERIC;
|
||||||
|
BEGIN
|
||||||
|
|
||||||
|
-- Note: comments contain pseudo-code that should be implemented
|
||||||
|
|
||||||
|
-- Register metadata tables:
|
||||||
|
-- This would require super-user privileges
|
||||||
|
/*
|
||||||
|
SELECT cdb_add_remote_table('observatory', 'bmd_column_table');
|
||||||
|
SELECT cdb_add_remote_table('observatory', 'bmd_column_2_column');
|
||||||
|
SELECT cdb_add_remote_table('observatory', 'bmd_table');
|
||||||
|
SELECT cdb_add_remote_table('observatory', 'bmd_column_table');
|
||||||
|
SELECT cdb_add_remote_table('observatory', 'bmd_column_tag');
|
||||||
|
SELECT cdb_add_remote_table('observatory', 'bmd_tag');
|
||||||
|
*/
|
||||||
|
|
||||||
|
tag_value := 'population';
|
||||||
|
|
||||||
|
|
||||||
|
-- Determine the georef column id to be used: it must have type 'geometry',
|
||||||
|
-- the maximum weight.
|
||||||
|
-- TODO: in general, multiple columns with maximal weight could be found;
|
||||||
|
-- we should use the timespan of the table to disambiguate (choose the
|
||||||
|
-- most recent). Also a rank of geometry columns should be introduced to
|
||||||
|
-- find select the greatest resolution available.
|
||||||
|
/*
|
||||||
|
WITH selected_tables AS (
|
||||||
|
-- Find tables that have population columns and cover the input area
|
||||||
|
SELECT tab.id AS id
|
||||||
|
FROM observatory.bmd_column col,
|
||||||
|
observatory.bmd_column_table coltab,
|
||||||
|
observatory.bmd_table tab,
|
||||||
|
observatory.bmd_tag tag,
|
||||||
|
observatory.bmd_column_tag coltag
|
||||||
|
WHERE coltab.column_id = col.id
|
||||||
|
AND coltab.table_id = tab.id
|
||||||
|
AND coltag.tag_id = tag.id
|
||||||
|
AND coltag.column_id = col.id
|
||||||
|
AND tag.name ILIKE tag_value
|
||||||
|
AND tab.id = table_id
|
||||||
|
AND tab.bounds && area;
|
||||||
|
)
|
||||||
|
SELECT
|
||||||
|
FROM bmd_column col
|
||||||
|
JOIN bmd_table tab ON col.table_id = tab.id
|
||||||
|
WHERE type = 'geometry'
|
||||||
|
AND tab.id IN (selected_tables)
|
||||||
|
ORDER BY weight DESC LIMIT 1;
|
||||||
|
*/
|
||||||
|
georef_column := '"us.census.tiger".block_group_2013';
|
||||||
|
|
||||||
|
-- Now we will query the metadata to find which actual tables correspond
|
||||||
|
-- to this datasource and resolution/timespan
|
||||||
|
-- and choose the 'parent' or more general of them.
|
||||||
|
/*
|
||||||
|
SELECT from_table_geoid.id data_table_id
|
||||||
|
FROM observatory.bmd_column_table from_column_table_geoid,
|
||||||
|
observatory.bmd_column_table to_column_table_geoid,
|
||||||
|
observatory.bmd_column_2_column rel,
|
||||||
|
observatory.bmd_column_table to_column_table_geom,
|
||||||
|
observatory.bmd_table from_table_geoid,
|
||||||
|
observatory.bmd_table to_table_geoid,
|
||||||
|
observatory.bmd_table to_table_geom
|
||||||
|
WHERE from_column_table_geoid.column_id = to_column_table_geoid.column_id
|
||||||
|
AND to_column_table_geoid.column_id = rel.from_id
|
||||||
|
AND rel.reltype = 'geom_ref'
|
||||||
|
AND rel.to_id = to_column_table_geom.column_id
|
||||||
|
AND to_column_table_geom.column_id = georef_column
|
||||||
|
AND from_table_geoid.id = from_column_table_geoid.table_id
|
||||||
|
AND to_table_geoid.id = to_column_table_geoid.table_id
|
||||||
|
AND to_table_geom.id = to_column_table_geom.table_id
|
||||||
|
AND from_table_geoid.bounds && area
|
||||||
|
ORDER by from_table_geoid.timespan desc
|
||||||
|
INTO table_id;
|
||||||
|
*/
|
||||||
|
table_id := '"us.census.acs".extract_2013_5yr_block_group';
|
||||||
|
|
||||||
|
-- Next will fetch the columns of that table that are tagged as population:
|
||||||
|
-- and get the more general one (not having a parent or denominator)
|
||||||
|
/*
|
||||||
|
WITH column_ids AS (
|
||||||
|
SELECT col.id AS id
|
||||||
|
FROM observatory.bmd_column col,
|
||||||
|
observatory.bmd_column_table coltab,
|
||||||
|
observatory.bmd_table tab,
|
||||||
|
observatory.bmd_tag tag,
|
||||||
|
observatory.bmd_column_tag coltag
|
||||||
|
WHERE coltab.column_id = col.id
|
||||||
|
AND coltab.table_id = tab.id
|
||||||
|
AND coltag.tag_id = tag.id
|
||||||
|
AND coltag.column_id = col.id
|
||||||
|
AND tag.name ILIKE tag_value
|
||||||
|
AND tab.id = table_id;
|
||||||
|
),
|
||||||
|
excluded_column_ids AS (
|
||||||
|
SELECT from_id AS id
|
||||||
|
FROM observatory.bmd_column_2_column
|
||||||
|
WHERE from_id in (column_ids)
|
||||||
|
AND reltype in ('parent', 'denominator')
|
||||||
|
AND to_id in (column_ids)
|
||||||
|
),
|
||||||
|
SELECT bmd_table.tablename, bmd_column_table.colname
|
||||||
|
FROM observatory.bmd_column_table,
|
||||||
|
observatory.bmd_table
|
||||||
|
WHERE bmd_column_table.table_id = bmd_table.id
|
||||||
|
AND bmd_column_table.column_id IN (column_ids)
|
||||||
|
AND NOT bmd_column_table.column_id IN (exclude_column_ids)
|
||||||
|
INTO (table_name, column_name);
|
||||||
|
*/
|
||||||
|
table_name := 'us_census_acs2013_5yr_block_group';
|
||||||
|
column_name := 'total_pop';
|
||||||
|
|
||||||
|
-- Register the foreign table
|
||||||
|
-- This would require super-user privileges
|
||||||
|
-- SELECT cdb_add_remote_table('observatory', table_name);
|
||||||
|
|
||||||
|
-- Perform the query
|
||||||
|
SELECT cdb_crankshaft.cdb_overlap_sum(
|
||||||
|
area,
|
||||||
|
table_name,
|
||||||
|
column_name,
|
||||||
|
schema_name := 'observatory')
|
||||||
|
INTO population;
|
||||||
|
|
||||||
|
RETURN population;
|
||||||
|
END;
|
||||||
|
$$
|
||||||
|
LANGUAGE plpgsql VOLATILE
|
||||||
@@ -1,18 +0,0 @@
|
|||||||
SELECT cdb_crankshaft._cdb_random_seeds(1234);
|
|
||||||
|
|
||||||
-- Use regular user role
|
|
||||||
SET ROLE test_regular_user;
|
|
||||||
|
|
||||||
-- Add to the search path the schema
|
|
||||||
SET search_path TO public,cartodb,cdb_crankshaft;
|
|
||||||
|
|
||||||
-- Exercise public functions
|
|
||||||
SELECT ppoints.code, m.quads
|
|
||||||
FROM ppoints
|
|
||||||
JOIN cdb_moran_local('ppoints', 'value') m
|
|
||||||
ON ppoints.cartodb_id = m.ids
|
|
||||||
ORDER BY ppoints.code;
|
|
||||||
SELECT round(cdb_overlap_sum(
|
|
||||||
'0106000020E61000000100000001030000000100000004000000FFFFFFFFFF3604C09A0B9ECEC42E444000000000C060FBBF30C7FD70E01D44400000000040AD02C06481F1C8CD034440FFFFFFFFFF3604C09A0B9ECEC42E4440'::geometry,
|
|
||||||
'values', 'value'
|
|
||||||
), 2);
|
|
||||||
@@ -1,3 +1,2 @@
|
|||||||
import random_seeds
|
import random_seeds
|
||||||
import clustering
|
import clustering
|
||||||
import bayesian_blocks
|
|
||||||
|
|||||||
@@ -1 +0,0 @@
|
|||||||
from bayesian_blocks import *
|
|
||||||
@@ -1,84 +0,0 @@
|
|||||||
import plpy
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
def adaptive_histogram(table_name,column_name):
|
|
||||||
data = plpy.execute("select {column_name} from {table_name}".format(**locals()))
|
|
||||||
|
|
||||||
data = [float(d['count']) for d in data]
|
|
||||||
plpy.notice(data)
|
|
||||||
vals, bins = np.histogram( data, bins=_bayesian_blocks(data))
|
|
||||||
return zip(vals,bins, bins[1:])
|
|
||||||
|
|
||||||
|
|
||||||
def _bayesian_blocks(t):
|
|
||||||
"""Bayesian Blocks Implementation
|
|
||||||
|
|
||||||
By Jake Vanderplas. License: BSD
|
|
||||||
Based on algorithm outlined in http://adsabs.harvard.edu/abs/2012arXiv1207.5578S
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
t : ndarray, length N
|
|
||||||
data to be histogrammed
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
bins : ndarray
|
|
||||||
array containing the (N+1) bin edges
|
|
||||||
|
|
||||||
Notes
|
|
||||||
-----
|
|
||||||
This is an incomplete implementation: it may fail for some
|
|
||||||
datasets. Alternate fitness functions and prior forms can
|
|
||||||
be found in the paper listed above.
|
|
||||||
"""
|
|
||||||
# copy and sort the array
|
|
||||||
t = np.sort(t)
|
|
||||||
N = t.size
|
|
||||||
|
|
||||||
# create length-(N + 1) array of cell edges
|
|
||||||
edges = np.concatenate([t[:1],
|
|
||||||
0.5 * (t[1:] + t[:-1]),
|
|
||||||
t[-1:]])
|
|
||||||
block_length = t[-1] - edges
|
|
||||||
|
|
||||||
# arrays needed for the iteration
|
|
||||||
nn_vec = np.ones(N)
|
|
||||||
best = np.zeros(N, dtype=float)
|
|
||||||
last = np.zeros(N, dtype=int)
|
|
||||||
|
|
||||||
#-----------------------------------------------------------------
|
|
||||||
# Start with first data cell; add one cell at each iteration
|
|
||||||
#-----------------------------------------------------------------
|
|
||||||
for K in range(N):
|
|
||||||
# Compute the width and count of the final bin for all possible
|
|
||||||
# locations of the K^th changepoint
|
|
||||||
width = block_length[:K + 1] - block_length[K + 1]
|
|
||||||
count_vec = np.cumsum(nn_vec[:K + 1][::-1])[::-1]
|
|
||||||
|
|
||||||
# evaluate fitness function for these possibilities
|
|
||||||
fit_vec = count_vec * (np.log(count_vec) - np.log(width))
|
|
||||||
fit_vec -= 4 # 4 comes from the prior on the number of changepoints
|
|
||||||
fit_vec[1:] += best[:K]
|
|
||||||
|
|
||||||
# find the max of the fitness: this is the K^th changepoint
|
|
||||||
i_max = np.argmax(fit_vec)
|
|
||||||
last[K] = i_max
|
|
||||||
best[K] = fit_vec[i_max]
|
|
||||||
|
|
||||||
#-----------------------------------------------------------------
|
|
||||||
# Recover changepoints by iteratively peeling off the last block
|
|
||||||
#-----------------------------------------------------------------
|
|
||||||
change_points = np.zeros(N, dtype=int)
|
|
||||||
i_cp = N
|
|
||||||
ind = N
|
|
||||||
while True:
|
|
||||||
i_cp -= 1
|
|
||||||
change_points[i_cp] = ind
|
|
||||||
if ind == 0:
|
|
||||||
break
|
|
||||||
ind = last[ind - 1]
|
|
||||||
change_points = change_points[i_cp:]
|
|
||||||
|
|
||||||
return edges[change_points]
|
|
||||||
@@ -10,7 +10,7 @@ from setuptools import setup, find_packages
|
|||||||
setup(
|
setup(
|
||||||
name='crankshaft',
|
name='crankshaft',
|
||||||
|
|
||||||
version='0.0.1',
|
version='0.0.01',
|
||||||
|
|
||||||
description='CartoDB Spatial Analysis Python Library',
|
description='CartoDB Spatial Analysis Python Library',
|
||||||
|
|
||||||
@@ -40,7 +40,7 @@ setup(
|
|||||||
|
|
||||||
# The choice of component versions is dictated by what's
|
# The choice of component versions is dictated by what's
|
||||||
# provisioned in the production servers.
|
# provisioned in the production servers.
|
||||||
install_requires=['pysal==1.11.0','numpy==1.10.1','scipy==0.17.0'],
|
install_requires=['pysal==1.11.0','numpy==1.6.1','scipy==0.17.0'],
|
||||||
|
|
||||||
requires=['pysal', 'numpy'],
|
requires=['pysal', 'numpy'],
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user