Compare commits

..

10 Commits

Author SHA1 Message Date
Javier Goizueta
8e972128eb Modify sql code to user the python virtualenv 2016-03-09 15:00:50 +01:00
Javier Goizueta
cdd2d9e722 Directory reorganization and sketch of new versioning procedure 2016-03-08 19:35:02 +01:00
Javier Goizueta
46c66476b5 Merge pull request #5 from CartoDB/4-pgxs-fix
Adapt Makefile of the extension for some PGXS versions
2016-02-29 16:35:04 +01:00
Javier Goizueta
e03aac4d8f Fix typo 2016-02-26 19:09:17 +01:00
Javier Goizueta
d885c16db2 Adapt Makefile of the extension for some PGXS versions
Postgresql 9.3.11 doesn't generates $DATA by default.
fixes #4
2016-02-26 19:02:18 +01:00
Rafa de la Torre
abfda1c75e Update CONTRIBUTING.md
minor change (just a space)
2016-02-23 17:23:33 +01:00
Rafa de la Torre
8f478ef22c Update README.md
Remove FIXME that should be already fixed.
2016-02-23 17:18:19 +01:00
Javier Goizueta
c7bb50be5a Fix: Make extension publicly available 2016-02-22 17:39:58 +01:00
Javier Goizueta
ef17e2fe4c Add header 2016-02-22 16:14:52 +01:00
Javier Goizueta
f3b8546063 Fix syntax 2016-02-22 16:14:28 +01:00
47 changed files with 196 additions and 336 deletions

View File

@@ -12,7 +12,7 @@ name must be created.
### Version numbers
The version of both the SQL extension and the Python package shall
follow the[Semantic Versioning 2.0](http://semver.org/) guidelines:
follow the [Semantic Versioning 2.0](http://semver.org/) guidelines:
* When backwards incompatibility is introduced the major number is incremented
* When functionality is added (in a backwards-compatible manner) the minor number

View File

@@ -1,5 +1,5 @@
EXT_DIR = pg
PYP_DIR = python
EXT_DIR = src/pg
PYP_DIR = src/py
.PHONY: install
.PHONY: run_tests

View File

@@ -4,11 +4,87 @@ CartoDB Spatial Analysis extension for PostgreSQL.
## Code organization
* *pg* contains the PostgreSQL extension source code
* *python* Python module
FIXME: should it be `./extension` and `./lib/python' ?
* *doc* documentation
* *src* source code
* - *src/pg* contains the PostgreSQL extension source code
* - *src/py* Python module source code
* *release* released versions
## Requirements
* pip
* pip, virtualenv, PostgreSQL
# Working Process
## Development
Work in `src/pg/sql`, `src/py/crankshaft`;
use topic branch.
Update local installation with `sudo make install`
(this will update the 'dev' version of the extension in 'src/pg/')
Run the tests with `PGUSER=postgres make test`
Update extension in working database with
* `ALTER EXTENSION crankshaft VERSION TO 'current';`
`ALTER EXTENSION crankshaft VERSION TO 'dev';`
Note: we keep the current development version install as 'dev' always;
we update through the 'current' alias to allow changing the extension
contents but not the version identifier. This will fail if the
changes involve incompatible function changes such as a different
return type; in that case the offending function (or the whole extension)
should be dropped manually before the update.
If the extension has not previously been installed in a database
we can:
Add tests...
* `CREATE EXTENSION crankshaft WITH VERSION 'dev';`
Test
Commit, push, create PR, wait for CI tests, CR, ...
## Release
To release current development version
(working directory should be clean in dev branch)
(process to be gradually automated)
For backwards compatible changes (no return value, num of arguments, etc. changes...)
new version number increasing either patch level (no new functionality)
or minor level (new functionality) => 'X.Y.Z'.
Update version in src/pg/crankshaft.control
Copy release/crankshaft--current.sql to release/crankshaft--X.Y.Z.sql
Prepare incremental downgrade, upgrade scripts....
Python: ...
Install the new release
`make install-release`
Test the new release
`make test-release`
Push the release
Wait for CI tests
Merge into master
Deploy: install extension and python to production hosts,
update extension in databases (limited to team users, data observatory, ...)
Release manager role: ...
.sql release scripts
commit
tests: staging....
merge, tag, deploy...

3
pg/.gitignore vendored
View File

@@ -1,3 +0,0 @@
regression.diffs
regression.out
results/

View File

@@ -1,30 +0,0 @@
# Makefile to generate the extension out of separate sql source files.
# Once a version is released, it is not meant to be changed. E.g: once version 0.0.1 is out, it SHALL NOT be changed.
# NOTE(review): recipe lines below must be tab-indented in the real Makefile;
# leading tabs were lost in this capture — confirm against the repository.
EXTENSION = crankshaft
# Extract the default_version value from the .control file, e.g. "0.0.1"
EXTVERSION = $(shell grep default_version $(EXTENSION).control | sed -e "s/default_version[[:space:]]*=[[:space:]]*'\([^']*\)'/\1/")
# The new version to be generated from templates
NEW_EXTENSION_ARTIFACT = $(EXTENSION)--$(EXTVERSION).sql
# DATA is a special variable used by postgres build infrastructure
# These are the files to be installed in the server shared dir,
# for installation from scratch, upgrades and downgrades.
# @see http://www.postgresql.org/docs/current/static/extend-pgxs.html
DATA = $(NEW_EXTENSION_ARTIFACT)
SOURCES_DATA_DIR = sql/$(EXTVERSION)
SOURCES_DATA = $(wildcard sql/$(EXTVERSION)/*.sql)
# The extension installation artifacts are stored in the base subdirectory
# Build the single installable .sql by concatenating the per-version sources
$(NEW_EXTENSION_ARTIFACT): $(SOURCES_DATA)
rm -f $@
cat $(SOURCES_DATA_DIR)/*.sql >> $@
# Regression tests: every test/<version>/sql/*test.sql becomes a pg_regress case
REGRESS = $(notdir $(basename $(wildcard test/$(EXTVERSION)/sql/*test.sql)))
TEST_DIR = test/$(EXTVERSION)
REGRESS_OPTS = --inputdir='$(TEST_DIR)' --outputdir='$(TEST_DIR)'
# Delegate install/installcheck machinery to PostgreSQL's PGXS
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)

View File

@@ -1,136 +0,0 @@
-- Internal function.
-- Set the seeds of the RNGs (Random Number Generators)
-- used internally.
-- Delegates to the crankshaft Python package's random_seeds module.
-- Parameter:
--   * seed_value  integer seed applied to the internal RNGs
-- NOTE(review): presumably used to make stochastic analyses reproducible
-- in tests — confirm against the crankshaft package.
CREATE OR REPLACE FUNCTION
_cdb_random_seeds (seed_value INTEGER) RETURNS VOID
AS $$
from crankshaft import random_seeds
random_seeds.set_random_seeds(seed_value)
$$ LANGUAGE plpythonu;
-- Moran's I
-- Local spatial autocorrelation (Local Moran's I) over a table column,
-- computed by the crankshaft Python package.
-- Parameters:
--   * t             name of the table holding the data
--   * attr          name of the numeric column to analyze
--   * significance  significance threshold (default 0.05)
--   * num_ngbrs     number of neighbors used (default 5)
--   * permutations  permutations for the significance test (default 99)
--   * geom_column   geometry column name (default 'the_geom')
--   * id_col        row-identifier column name (default 'cartodb_id')
--   * w_type        spatial-weights type (default 'knn')
-- Returns one row per analyzed row: (moran, quads, significance, ids).
-- NOTE(review): 'quads' presumably encodes the cluster quadrant
-- classification — confirm in crankshaft.clustering.
CREATE OR REPLACE FUNCTION
cdb_moran_local (
t TEXT,
attr TEXT,
significance float DEFAULT 0.05,
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_column TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id',
w_type TEXT DEFAULT 'knn')
RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT)
AS $$
from crankshaft.clustering import moran_local
# TODO: use named parameters or a dictionary
return moran_local(t, attr, significance, num_ngbrs, permutations, geom_column, id_col, w_type)
$$ LANGUAGE plpythonu;
-- Moran's I Local Rate
-- Local Moran's I over a rate (numerator/denominator), computed by the
-- crankshaft Python package.
-- Parameters:
--   * t             name of the table holding the data
--   * numerator     column providing the rate numerator
--   * denominator   column providing the rate denominator
--   * significance  significance threshold (default 0.05)
--   * num_ngbrs     number of neighbors used (default 5)
--   * permutations  permutations for the significance test (default 99)
--   * geom_column   geometry column name (default 'the_geom')
--   * id_col        row-identifier column name (default 'cartodb_id')
--   * w_type        spatial-weights type (default 'knn')
-- Returns one row per analyzed row: (moran, quads, significance, ids, y).
CREATE OR REPLACE FUNCTION
cdb_moran_local_rate(t TEXT,
numerator TEXT,
denominator TEXT,
significance FLOAT DEFAULT 0.05,
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_column TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id',
w_type TEXT DEFAULT 'knn')
RETURNS TABLE(moran FLOAT, quads TEXT, significance FLOAT, ids INT, y numeric)
AS $$
from crankshaft.clustering import moran_local_rate
# TODO: use named parameters or a dictionary
return moran_local_rate(t, numerator, denominator, significance, num_ngbrs, permutations, geom_column, id_col, w_type)
$$ LANGUAGE plpythonu;
-- Function by Stuart Lynn for a simple interpolation of a value
-- from a polygon table over an arbitrary polygon
-- (weighted by the area proportion overlapped).
-- Areal weighting is a very simple form of areal interpolation.
--
-- Parameters:
--   * geom a Polygon geometry which defines the area where a value will be
--     estimated as the area-weighted sum of a given table/column
--   * target_table_name table name of the table that provides the values
--   * target_column column name of the column that provides the values
--   * schema_name optional parameter to define the schema the target table
--     belongs to, which is necessary if it's not in the search_path.
--     Note that target_table_name should never include the schema in it.
-- Return value:
--   Areal-weighted interpolation of the column values over the geometry
CREATE OR REPLACE
FUNCTION cdb_overlap_sum(geom geometry, target_table_name text, target_column text, schema_name text DEFAULT NULL)
RETURNS numeric AS
$$
DECLARE
    result numeric;
    qualified_name text;
BEGIN
    -- Build an identifier-quoted table reference with format('%I', ...),
    -- which also guards against SQL injection through the name arguments.
    IF schema_name IS NULL THEN
        qualified_name := Format('%I', target_table_name);
    ELSE
        -- Fix: the table name was previously formatted with %s (unquoted and
        -- injectable), inconsistently with the schema-less branch above;
        -- %I.%I quotes both schema and table name.
        qualified_name := Format('%I.%I', schema_name, target_table_name);
    END IF;
    -- Sum the target column over intersecting rows, each contribution
    -- weighted by the fraction of the row's geometry covered by geom.
    EXECUTE Format('
SELECT sum(%I*ST_Area(St_Intersection($1, a.the_geom))/ST_Area(a.the_geom))
FROM %s AS a
WHERE $1 && a.the_geom
', target_column, qualified_name)
    USING geom
    INTO result;
    RETURN result;
END;
$$ LANGUAGE plpgsql;
--
-- Creates N points randomly distributed around the polygon
--
-- @param g - the geometry to be turned in to points
--
-- @param no_points - the number of points to generate
--
-- @params max_iter_per_point - the function generates points in the polygon's bounding box
-- and discards points which don't lie in the polygon. max_iter_per_point specifies how many
-- misses per point the function accepts before giving up.
-- NOTE(review): max_iter_per_point is currently accepted but never used by
-- the implementation below — confirm whether retry logic is still intended.
--
-- Returns: Multipoint with the requested points
CREATE OR REPLACE FUNCTION cdb_dot_density(geom geometry , no_points Integer, max_iter_per_point Integer DEFAULT 1000)
RETURNS GEOMETRY AS $$
DECLARE
    extent GEOMETRY;
    test_point Geometry;
    width NUMERIC;
    height NUMERIC;
    x0 NUMERIC;
    y0 NUMERIC;
    yp NUMERIC;
    no_left INTEGER;
    points GEOMETRY[];
    bbox_line GEOMETRY;
    intersection_line GEOMETRY;
BEGIN
    extent := ST_Envelope(geom);
    width := ST_XMax(extent) - ST_XMIN(extent);
    height := ST_YMax(extent) - ST_YMIN(extent);
    x0 := ST_XMin(extent);
    y0 := ST_YMin(extent);
    no_left := no_points;
    LOOP
        EXIT WHEN no_left = 0;
        -- Pick a random horizontal scanline through the bounding box and
        -- keep only its portion inside the polygon.
        yp := y0 + height * random();
        -- Fix: ST_MakePoint takes (x, y); the original passed (yp, x0) /
        -- (yp, x0+width), which placed the y-coordinate on the x axis and
        -- produced a line outside the polygon's x-range.
        bbox_line := ST_MakeLine(
            ST_SetSRID(ST_MakePoint(x0, yp), 4326),
            ST_SetSRID(ST_MakePoint(x0 + width, yp), 4326)
        );
        intersection_line := ST_Intersection(bbox_line, geom);
        -- Choose a random point along the clipped scanline.
        test_point := ST_LineInterpolatePoint(st_makeline(st_linemerge(intersection_line)), random());
        points := points || test_point;
        no_left := no_left - 1;
    END LOOP;
    RETURN ST_Collect(points);
END;
$$
LANGUAGE plpgsql VOLATILE;

View File

@@ -1,138 +0,0 @@
-- Function to obtain an estimate of the population living inside
-- an area (polygon) from the CartoDB Data Observatory
-- NOTE(review): most of the metadata lookup below is pseudo-code kept in
-- /* ... */ block comments; the function currently short-circuits to
-- hard-coded table/column identifiers and only the final
-- cdb_overlap_sum call actually runs.
CREATE OR REPLACE FUNCTION cdb_population(area geometry)
RETURNS NUMERIC AS $$
DECLARE
georef_column TEXT;
table_id TEXT;
tag_value TEXT;
table_name TEXT;
column_name TEXT;
population NUMERIC;
BEGIN
-- Note: comments contain pseudo-code that should be implemented
-- Register metadata tables:
-- This would require super-user privileges
/*
SELECT cdb_add_remote_table('observatory', 'bmd_column_table');
SELECT cdb_add_remote_table('observatory', 'bmd_column_2_column');
SELECT cdb_add_remote_table('observatory', 'bmd_table');
SELECT cdb_add_remote_table('observatory', 'bmd_column_table');
SELECT cdb_add_remote_table('observatory', 'bmd_column_tag');
SELECT cdb_add_remote_table('observatory', 'bmd_tag');
*/
tag_value := 'population';
-- Determine the georef column id to be used: it must have type 'geometry',
-- the maximum weight.
-- TODO: in general, multiple columns with maximal weight could be found;
-- we should use the timespan of the table to disambiguate (choose the
-- most recent). Also a rank of geometry columns should be introduced to
-- find select the greatest resolution available.
/*
WITH selected_tables AS (
-- Find tables that have population columns and cover the input area
SELECT tab.id AS id
FROM observatory.bmd_column col,
observatory.bmd_column_table coltab,
observatory.bmd_table tab,
observatory.bmd_tag tag,
observatory.bmd_column_tag coltag
WHERE coltab.column_id = col.id
AND coltab.table_id = tab.id
AND coltag.tag_id = tag.id
AND coltag.column_id = col.id
AND tag.name ILIKE tag_value
AND tab.id = table_id
AND tab.bounds && area;
)
SELECT
FROM bmd_column col
JOIN bmd_table tab ON col.table_id = tab.id
WHERE type = 'geometry'
AND tab.id IN (selected_tables)
ORDER BY weight DESC LIMIT 1;
*/
-- Hard-coded placeholder until the query above is implemented.
georef_column := '"us.census.tiger".block_group_2013';
-- Now we will query the metadata to find which actual tables correspond
-- to this datasource and resolution/timespan
-- and choose the 'parent' or more general of them.
/*
SELECT from_table_geoid.id data_table_id
FROM observatory.bmd_column_table from_column_table_geoid,
observatory.bmd_column_table to_column_table_geoid,
observatory.bmd_column_2_column rel,
observatory.bmd_column_table to_column_table_geom,
observatory.bmd_table from_table_geoid,
observatory.bmd_table to_table_geoid,
observatory.bmd_table to_table_geom
WHERE from_column_table_geoid.column_id = to_column_table_geoid.column_id
AND to_column_table_geoid.column_id = rel.from_id
AND rel.reltype = 'geom_ref'
AND rel.to_id = to_column_table_geom.column_id
AND to_column_table_geom.column_id = georef_column
AND from_table_geoid.id = from_column_table_geoid.table_id
AND to_table_geoid.id = to_column_table_geoid.table_id
AND to_table_geom.id = to_column_table_geom.table_id
AND from_table_geoid.bounds && area
ORDER by from_table_geoid.timespan desc
INTO table_id;
*/
-- Hard-coded placeholder until the query above is implemented.
table_id := '"us.census.acs".extract_2013_5yr_block_group';
-- Next will fetch the columns of that table that are tagged as population:
-- and get the more general one (not having a parent or denominator)
/*
WITH column_ids AS (
SELECT col.id AS id
FROM observatory.bmd_column col,
observatory.bmd_column_table coltab,
observatory.bmd_table tab,
observatory.bmd_tag tag,
observatory.bmd_column_tag coltag
WHERE coltab.column_id = col.id
AND coltab.table_id = tab.id
AND coltag.tag_id = tag.id
AND coltag.column_id = col.id
AND tag.name ILIKE tag_value
AND tab.id = table_id;
),
excluded_column_ids AS (
SELECT from_id AS id
FROM observatory.bmd_column_2_column
WHERE from_id in (column_ids)
AND reltype in ('parent', 'denominator')
AND to_id in (column_ids)
),
SELECT bmd_table.tablename, bmd_column_table.colname
FROM observatory.bmd_column_table,
observatory.bmd_table
WHERE bmd_column_table.table_id = bmd_table.id
AND bmd_column_table.column_id IN (column_ids)
AND NOT bmd_column_table.column_id IN (exclude_column_ids)
INTO (table_name, column_name);
*/
-- Hard-coded placeholders until the query above is implemented.
table_name := 'us_census_acs2013_5yr_block_group';
column_name := 'total_pop';
-- Register the foreign table
-- This would require super-user privileges
-- SELECT cdb_add_remote_table('observatory', table_name);
-- Perform the query
-- Area-weighted sum of the population column over the input polygon.
SELECT cdb_crankshaft.cdb_overlap_sum(
area,
table_name,
column_name,
schema_name := 'observatory')
INTO population;
RETURN population;
END;
$$
LANGUAGE plpgsql VOLATILE

View File

@@ -1,6 +0,0 @@
-- Install dependencies
CREATE EXTENSION plpythonu;
CREATE EXTENSION postgis;
CREATE EXTENSION cartodb;
-- Install the extension
CREATE EXTENSION crankshaft;

View File

@@ -1,11 +0,0 @@
# Makefile for the crankshaft Python package.
# NOTE(review): recipe lines below must be tab-indented in the real Makefile;
# leading tabs were lost in this capture — confirm against the repository.
# Install the package (needs root privileges)
install:
pip install ./crankshaft --upgrade
# Test from source code
test:
(cd crankshaft && nosetests test/)
# Test currently installed package
testinstalled:
nosetests crankshaft/test/

6
src/pg/.gitignore vendored Normal file
View File

@@ -0,0 +1,6 @@
regression.diffs
regression.out
results/
crankshaft--dev.sql
crankshaft--dev--current.sql
crankshaft--current--dev.sql

41
src/pg/Makefile Normal file
View File

@@ -0,0 +1,41 @@
# Generation of a new development version 'dev' (with an alias 'current' for
# updating easily by upgrading to 'current', then 'dev')
# sudo make install -- generate the 'dev' version from current source
# and make it available to PostgreSQL
# PGUSER=postgres make installcheck -- test the 'dev' extension
EXTENSION = crankshaft
DATA = $(EXTENSION)--dev.sql \
$(EXTENSION)--current--dev.sql \
$(EXTENSION)--dev--current.sql
SOURCES_DATA_DIR = sql
SOURCES_DATA = $(wildcard $(SOURCES_DATA_DIR)/*.sql)
$(DATA): $(SOURCES_DATA)
cat $(SOURCES_DATA_DIR)/*.sql > $@
TEST_DIR = test
REGRESS = $(notdir $(basename $(wildcard $(TEST_DIR)/sql/*test.sql)))
REGRESS_OPTS = --inputdir='$(TEST_DIR)' --outputdir='$(TEST_DIR)'
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
# This seems to be needed at least for PG 9.3.11
all: $(DATA)
# WIP: goals for releasing the extension...
EXTVERSION = $(shell grep default_version $(EXTENSION).control | sed -e "s/default_version[[:space:]]*=[[:space:]]*'\([^']*\)'/\1/")
../release/$(EXTENSION).control: $(EXTENSION).control
cp $< $@
release: ../release/$(EXTENSION).control
cp $(EXTENSION)--dev.sql $(EXTENSION)--$(EXTVERSION).sql
# pending: create upgrade/downgrade scripts,
# commit, push, tag....

3
src/pg/sql/00_header.sql Normal file
View File

@@ -0,0 +1,3 @@
--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES
-- Complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION crankshaft" to load this file. \quit

18
src/pg/sql/01_py.sql Normal file
View File

@@ -0,0 +1,18 @@
-- Use the crankshaft python module
-- Internal helper: activates the Python virtualenv that holds the
-- crankshaft package so subsequent plpythonu functions can import it.
-- NOTE(review): the virtualenv path is a hard-coded absolute path, tying
-- the extension to a specific host layout — confirm deployment expectations.
CREATE OR REPLACE FUNCTION _cdb_crankshaft_activate_py()
RETURNS VOID
AS $$
# activate virtualenv
# TODO: parameterize with environment variables or something
venv_path = '/home/ubuntu/crankshaft/src/py/dev'
activate_path = venv_path + '/bin/activate_this.py'
# Execute the virtualenv's activation script in-process so its
# site-packages are prepended to sys.path for this backend.
exec(open(activate_path).read(),
dict(__file__=activate_path))
# import something from virtualenv
# from crankshaft import random_seeds
# do some stuff
# random_seeds.set_random_seeds(123)
# plpy.notice('here we are')
$$ LANGUAGE plpythonu;

View File

@@ -4,6 +4,7 @@
CREATE OR REPLACE FUNCTION
_cdb_random_seeds (seed_value INTEGER) RETURNS VOID
AS $$
plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
from crankshaft import random_seeds
random_seeds.set_random_seeds(seed_value)
$$ LANGUAGE plpythonu;

View File

@@ -11,6 +11,7 @@ CREATE OR REPLACE FUNCTION
w_type TEXT DEFAULT 'knn')
RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT)
AS $$
plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
from crankshaft.clustering import moran_local
# TODO: use named parameters or a dictionary
return moran_local(t, attr, significance, num_ngbrs, permutations, geom_column, id_col, w_type)
@@ -29,6 +30,7 @@ CREATE OR REPLACE FUNCTION
w_type TEXT DEFAULT 'knn')
RETURNS TABLE(moran FLOAT, quads TEXT, significance FLOAT, ids INT, y numeric)
AS $$
plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
from crankshaft.clustering import moran_local_rate
# TODO: use named parameters or a dictionary
return moran_local_rate(t, numerator, denominator, significance, num_ngbrs, permutations, geom_column, id_col, w_type)

View File

@@ -51,4 +51,4 @@ BEGIN
RETURN ST_Collect(points);
END;
$$
LANGUAGE plpgsql VOLATILE
LANGUAGE plpgsql VOLATILE;

View File

@@ -0,0 +1,9 @@
-- Make sure by default there are no permissions for publicuser
-- NOTE: this happens at extension creation time, as part of an implicit transaction.
-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;
-- Grant permissions on the schema to publicuser (but just the schema)
GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser;
-- Revoke execute permissions on all functions in the schema by default
-- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser;

View File

@@ -3,4 +3,4 @@ CREATE EXTENSION plpythonu;
CREATE EXTENSION postgis;
CREATE EXTENSION cartodb;
-- Install the extension
CREATE EXTENSION crankshaft;
CREATE EXTENSION crankshaft VERSION 'dev';

View File

@@ -4,4 +4,4 @@ CREATE EXTENSION postgis;
CREATE EXTENSION cartodb;
-- Install the extension
CREATE EXTENSION crankshaft;
CREATE EXTENSION crankshaft VERSION 'dev';

View File

@@ -0,0 +1,18 @@
SELECT cdb_crankshaft._cdb_random_seeds(1234);
-- Use regular user role
SET ROLE test_regular_user;
-- Add to the search path the schema
SET search_path TO public,cartodb,cdb_crankshaft;
-- Exercise public functions
SELECT ppoints.code, m.quads
FROM ppoints
JOIN cdb_moran_local('ppoints', 'value') m
ON ppoints.cartodb_id = m.ids
ORDER BY ppoints.code;
SELECT round(cdb_overlap_sum(
'0106000020E61000000100000001030000000100000004000000FFFFFFFFFF3604C09A0B9ECEC42E444000000000C060FBBF30C7FD70E01D44400000000040AD02C06481F1C8CD034440FFFFFFFFFF3604C09A0B9ECEC42E4440'::geometry,
'values', 'value'
), 2);

View File

@@ -1 +1,2 @@
*.pyc
dev/

9
src/py/Makefile Normal file
View File

@@ -0,0 +1,9 @@
# NOTE(review): recipe lines below must be tab-indented in the real Makefile;
# leading tabs were lost in this capture — confirm against the repository.
# Install the package locally for development
install:
virtualenv dev
./dev/bin/pip install ./crankshaft --upgrade
./dev/bin/pip install nose
# Test development install
testinstalled:
./dev/bin/nosetests crankshaft/test/

View File

@@ -10,7 +10,7 @@ from setuptools import setup, find_packages
setup(
name='crankshaft',
version='0.0.01',
version='0.0.1',
description='CartoDB Spatial Analysis Python Library',