Compare commits

...

240 Commits
docker ... gwr

Author SHA1 Message Date
Rafa de la Torre
c21fcdf69a Drop objects in reverse order in downgrade script
Otherwise it fails downgrading, despite it is supposed to run everything
in the context of a transaction:

```
tests=# alter extension crankshaft update to '0.0.4';
ERROR:  cannot drop function cdb_pyaggs(numeric[],numeric[]) because
other objects depend on it
DETAIL:  extension crankshaft depends on function
cdb_pyaggs(numeric[],numeric[])
HINT:  Use DROP ... CASCADE to drop the dependent objects too.
```
2016-06-29 20:01:41 +02:00
Rafa de la Torre
c8871a5547 Remove the DEFAULT values in DROP FUNCTION's 2016-06-29 19:52:04 +02:00
Rafa de la Torre
3d99d1f9bf Add upgrade and downgrade files for 0.1.0 2016-06-29 19:51:14 +02:00
Rafa de la Torre
78ceb02c22 Add release files generated with make release 2016-06-29 19:15:32 +02:00
Rafa de la Torre
408dc6806e Use 0.1.0 instead of 0.0.5 2016-06-29 19:10:28 +02:00
Rafa de la Torre
702a1fb1ed Update NEWS.md 2016-06-29 19:00:53 +02:00
Rafa de la Torre
76ee4cacbc Revert changes in release/ dir 2016-06-29 18:49:14 +02:00
Rafa de la Torre
4192930260 Merge remote-tracking branch 'origin/develop' 2016-06-29 18:47:23 +02:00
Rafa de la Torre
2fe50e1f71 Merge pull request #70 from CartoDB/segmentation
Segmentation pull request
2016-06-29 18:35:01 +02:00
Rafa de la Torre
d97231f604 Revert changes in release/ dir 2016-06-29 18:34:01 +02:00
Rafa de la Torre
cb330ebe6b Make test more robust 2016-06-29 18:12:21 +02:00
Rafa de la Torre
505ae0fb44 Go back to using numpy==1.6.1 2016-06-29 17:09:36 +02:00
Rafa de la Torre
35b5612ded Merge remote-tracking branch 'origin/develop' into segmentation
Conflicts:
	src/py/crankshaft/crankshaft/__init__.py
2016-06-29 17:09:14 +02:00
Stuart Lynn
cfb58f7898 removing full function calls to be compatiable with numpy 1.6.1 2016-06-29 14:46:03 +00:00
Rafa de la Torre
bef5fe2c80 Merge remote-tracking branch 'origin/develop' 2016-06-29 16:35:34 +02:00
Rafa de la Torre
ab349fbdc0 Merge pull request #78 from CartoDB/77-fix-markov-tests
Fix Markov trend_test that expects a bool #77
2016-06-29 16:34:21 +02:00
Andy Eschbacher
9e4d378a08 updates error bounds 2016-06-29 10:27:19 -04:00
Rafa de la Torre
3a2e5ec7f9 Fix Markov trend_test that expects a bool #77 2016-06-29 16:06:38 +02:00
Rafa de la Torre
1e42c23919 Merge pull request #76 from CartoDB/develop
Release prep: merge develop into master
2016-06-29 15:50:18 +02:00
Rafa de la Torre
55d2860a84 Merge tag '0.0.4'
0.0.4 (2016-06-20)
------------------
* Remove cartodb extension dependency from tests
* Declare all correct dependencies with correct versions in setup.py

Conflicts:
	CONTRIBUTING.md
	src/pg/Makefile
2016-06-29 15:22:02 +02:00
Andy Eschbacher
b7860d5a24 Merge pull request #23 from CartoDB/add-spatial-markov
[wip] Add spatial markov
2016-06-28 17:41:23 -04:00
Andy Eschbacher
476ec04386 fill in rest of docs descriptions 2016-06-28 14:44:17 -04:00
Andy Eschbacher
75d97915d6 update tests from production machine 2016-06-28 14:23:36 -04:00
Stuart Lynn
068c80f369 fixing errors in documentatio 2016-06-28 18:03:52 +00:00
Stuart Lynn
5f98735ce9 adding documentation for pyagg helper 2016-06-28 16:04:23 +00:00
Stuart Lynn
7d6148456e adding inline documentation 2016-06-28 16:02:06 +00:00
Stuart Lynn
f6f9d6e9c8 resolves testing issues 2016-06-28 15:49:34 +00:00
Andy Eschbacher
32515f445e update tests and paths 2016-06-28 10:48:10 -04:00
Andy Eschbacher
dd0ca9a24f Merge branch 'develop' into add-spatial-markov 2016-06-28 10:19:36 -04:00
Andy Eschbacher
045fd67f47 formatting 2016-06-28 10:15:31 -04:00
Andy Eschbacher
2bb2e60af8 catching errors 2016-06-28 10:06:54 -04:00
Andy Eschbacher
99dc363c7d format query 2016-06-27 10:12:06 -04:00
Andy Eschbacher
c799f4d73b fix formatting of query 2016-06-27 10:09:13 -04:00
Andy Eschbacher
3e01470aa0 add error catching 2016-06-27 10:07:22 -04:00
Andy Eschbacher
a2b3733a1e fix improper line drop 2016-06-27 09:58:42 -04:00
Andy Eschbacher
a5e4ae99ce update tests 2016-06-27 09:57:23 -04:00
Andy Eschbacher
c80975fe46 updating formatting 2016-06-27 09:56:58 -04:00
Andy Eschbacher
dae406927f formatting updates 2016-06-27 09:56:12 -04:00
Andy Eschbacher
a177bf5620 minor doc updates 2016-06-27 08:54:28 -04:00
Andy Eschbacher
de7d56dc29 doc edits 2016-06-24 17:00:08 -04:00
Stuart Lynn
642935e44a pyAgg documentation 2016-06-23 21:33:17 +00:00
Stuart Lynn
15fc0bb683 segmentation documentation 2016-06-23 21:17:51 +00:00
Stuart Lynn
8cbc29a3e6 adding helper method to be abel to pass 2D arrays from postgresql to python 2016-06-23 20:23:39 +00:00
Stuart Lynn
03aada8758 Adding a segmentation version that doesn't use query string passing 2016-06-23 20:23:04 +00:00
Rafa de la Torre
faa899cf87 Fix installation for development mode 2016-06-23 10:11:59 +02:00
Stuart Lynn
911a0ccccc Merge branch 'develop' into segmentation 2016-06-22 21:46:07 +00:00
Stuart Lynn
76b3a873b8 segmentation pg tests 2016-06-22 21:43:32 +00:00
Stuart Lynn
de274bf628 removing commented code 2016-06-22 21:43:03 +00:00
Andy Eschbacher
95803b4cd4 Merge pull request #66 from CartoDB/fix-moran-query-ordering-problem
fix ordering problem in moran query construction
2016-06-22 17:16:44 -04:00
Andy Eschbacher
609e2aa015 Merge branch 'develop' into fix-moran-query-ordering-problem 2016-06-22 17:15:47 -04:00
Andy Eschbacher
44a6471c69 Merge pull request #69 from CartoDB/spatial_interpolation_test_edits
Spatial interpolation test edits
2016-06-22 17:15:31 -04:00
Andy Eschbacher
2fa087bb62 adding row info :/ 2016-06-22 17:11:51 -04:00
Andy Eschbacher
3f210c2a71 reducing amt of text in outputs 2016-06-22 17:08:50 -04:00
Andy Eschbacher
6a9045ba62 updating test outputs 2016-06-22 16:56:35 -04:00
Andy Eschbacher
6f72075999 altering test outputs for less formatting 2016-06-22 16:50:10 -04:00
Stuart Lynn
89c47dcef6 mocking out cursor 2016-06-22 20:21:52 +00:00
Stuart Lynn
1d7f62fa85 adding python tests 2016-06-22 20:21:37 +00:00
Andy Eschbacher
279912c03f Merge branch 'develop' into fix-moran-query-ordering-problem 2016-06-22 16:12:44 -04:00
Andy Eschbacher
7c9628eef9 Merge pull request #68 from CartoDB/fix-return-type-spatial-interpolation-query
fixes return problem
2016-06-22 15:21:59 -04:00
Andy Eschbacher
81d7af9e9a fixes return problem 2016-06-22 15:21:09 -04:00
Stuart Lynn
4df8257377 updating to support passing model paramters and returning accuracy from the function along with prediction 2016-06-22 15:56:47 +00:00
Andy Eschbacher
b62d7b32ef fix variable name 2016-06-21 17:41:52 -04:00
Andy Eschbacher
7c4314a411 fix tuple colon 2016-06-21 17:38:49 -04:00
Andy Eschbacher
1912d57891 replacing dict with ordered dict 2016-06-21 17:31:17 -04:00
Stuart Lynn
1d13b98d68 Basic version of create and predict segment 2016-06-21 21:00:30 +00:00
Andy Eschbacher
3c066d65fc Merge pull request #61 from CartoDB/add-interpolation
Add interpolation
2016-06-21 11:59:42 -04:00
Stuart Lynn
79699cd5cb changing to new crankshaft format 2016-06-20 17:27:37 -04:00
Stuart Lynn
59d50a1c48 Merge branch develop into segmentation 2016-06-20 17:20:10 -04:00
Luis Bosque
01fc2c1dd1 Release 0.0.4 2016-06-20 10:13:33 +02:00
Luis Bosque
f5fb4499db Set final dependencies versions 2016-06-20 09:44:52 +02:00
Raul Ochoa
ad5cffbf0d Merge pull request #64 from CartoDB/remove-cartodb-extension-dep
Removes cartodb-extension-dep
2016-06-16 19:19:25 +02:00
Raul Ochoa
1db938c450 Removes cartodb-extension-dep 2016-06-16 19:07:42 +02:00
Luis Bosque
3480a0d252 Allow passing options to pip install 2016-06-16 16:56:16 +02:00
Raul Ochoa
237aa1c581 Declare scipy as dep 2016-06-16 16:34:45 +02:00
Raul Ochoa
1e19f468eb Declare numpy dep 2016-06-16 16:23:43 +02:00
Raul Ochoa
8b5e910234 Release 0.0.3 2016-06-16 14:16:32 +02:00
Raul Ochoa
d08a2b6d2d Remove _cdb_crankshaft_activate_py activation call from kmeans function 2016-06-16 14:12:28 +02:00
Raul Ochoa
a222341863 Merge pull request #59 from CartoDB/kmeans
KMeans clustering and weighted centroid analysis
2016-06-16 12:39:13 +02:00
Andy Eschbacher
4834ee2f42 adding higher numpy version 2016-06-15 11:36:19 -04:00
Andy Eschbacher
bbe22d0b4d new bounds for accuracy in tests 2016-06-15 11:36:04 -04:00
Rafa de la Torre
284d8ede44 Merge pull request #62 from CartoDB/60-remove-venv-activation
Remove virtualenv activation #60
2016-06-15 17:30:01 +02:00
Rafa de la Torre
a8943bae98 Remove reference to clean-environments #60 2016-06-14 18:27:35 +02:00
Rafa de la Torre
75531b671e Remove virtualenv references from READMEs #60 2016-06-14 18:24:43 +02:00
Rafa de la Torre
0acae8240f Remove virtualenv stuff from Makefiles #60 2016-06-14 18:23:30 +02:00
Rafa de la Torre
7b98415da3 Remove virtualenv activation #60 2016-06-14 18:06:23 +02:00
abelvm
5a2319db72 remove garbage 2016-06-14 18:01:03 +02:00
abelvm
b0c47663da Merge branch 'add-interpolation' of github.com:CartoDB/crankshaft into add-interpolation 2016-06-14 17:56:25 +02:00
abelvm
9db4b7f519 first commit 2016-06-14 17:55:45 +02:00
Raul Ochoa
fd1862167c Remove trailing space 2016-06-13 13:06:21 +02:00
Raul Ochoa
c870f68c77 Revert "Declare scipy as dep"
This reverts commit 1e8bc12e0a.
2016-06-13 13:05:50 +02:00
Raul Ochoa
1e8bc12e0a Declare scipy as dep 2016-06-13 12:17:46 +02:00
Raul Ochoa
b33ba2d294 Do not use names for the aggregate params 2016-06-10 18:24:43 +02:00
Raul Ochoa
889cd5c579 Fix scikit-learn dep name 2016-06-10 17:47:46 +02:00
Stuart Lynn
1a4944b960 adding sklearn as a dep 2016-06-10 13:16:16 +00:00
Stuart Lynn
9d3de5a8ef adding not null filter for geom on kmeans 2016-06-10 13:12:55 +00:00
Stuart Lynn
7f3b23f67a reworking CDB_WeightedMean to be an aggregate function 2016-06-10 13:06:49 +00:00
Raul Ochoa
cc4579461d Merge branch 'develop' into kmeans 2016-06-09 11:28:39 +02:00
Raul Ochoa
e95c40c2f9 Ignore idea based configurations 2016-06-09 11:27:15 +02:00
Andy Eschbacher
69f08c4b78 updated tests to check for percent diffs 2016-06-08 11:26:32 -04:00
Ubuntu
4e86965f03 KMeans clustering and weighted centroid analysis 2016-06-07 19:58:32 +00:00
Andy Eschbacher
d41e28bc6f pylint corrections 2016-06-06 09:26:52 -04:00
Andy Eschbacher
1f73be2752 rename functions -> trend 2016-06-06 08:46:13 -04:00
abelvm
5183f5ff92 added function overload with subqueries input 2016-06-03 18:22:26 +02:00
abelvm
73d38bbbaa filling the gaps 2016-06-03 17:32:39 +02:00
Andy Eschbacher
e7de471ac8 adds expectation values for tests 2016-06-03 11:06:48 -04:00
Andy Eschbacher
14d50facda adds test that expects all null values returned 2016-06-03 11:06:29 -04:00
Andy Eschbacher
5e8d11fce2 adds fixtures for spatial markov tests 2016-06-03 11:02:51 -04:00
Andy Eschbacher
183a0f9604 comment out unneeded code whch w.transform handles 2016-06-03 11:01:53 -04:00
Andy Eschbacher
fc3a7e3d78 add casting to function call 2016-06-03 11:01:20 -04:00
Stuart Lynn
4782d39849 stubbing out gravity model 2016-06-03 10:36:21 -04:00
Andy Eschbacher
21dd956c15 Merge pull request #56 from CartoDB/add-issue-templates
adding template for code reviews
2016-06-03 08:25:21 -04:00
Andy Eschbacher
c0dfaa8341 adds test for spatial markov sql function 2016-06-02 16:19:15 -04:00
Andy Eschbacher
edf635886b updates column name to rowid 2016-06-02 14:19:13 -04:00
Andy Eschbacher
a5cb857841 adds docs for spatial markov 2016-06-02 14:17:33 -04:00
Andy Eschbacher
971c8f75d8 Merge branch 'develop' into add-spatial-markov 2016-06-02 12:59:49 -04:00
Andy Eschbacher
54d35c614b adds tests passing for spatial_markov 2016-06-02 12:52:34 -04:00
Andy Eschbacher
d1a267febb adding number of classes options 2016-06-02 12:51:01 -04:00
Andy Eschbacher
0eb3db3c1d adding fixtures 2016-06-02 12:12:52 -04:00
Andy Eschbacher
7a7cbcf33f adds test for get_time_data 2016-06-02 09:49:09 -04:00
Andy Eschbacher
a0a026c2a1 Merge pull request #57 from CartoDB/adding-hot-and-cold-spot-function
adding hot/cold/outlier getters
2016-06-01 15:23:19 -04:00
Andy Eschbacher
c04e15ef81 rename some variables 2016-06-01 15:07:16 -04:00
Andy Eschbacher
ae1bb703a7 descriptions for all functions 2016-06-01 15:06:58 -04:00
Andy Eschbacher
0e3970f52c adds docs for areasofinterestlocal 2016-06-01 13:46:12 -04:00
Andy Eschbacher
59c520da16 renaming output value to vals 2016-06-01 13:40:32 -04:00
Andy Eschbacher
90c3e21c0d renaming output id to rowid 2016-06-01 12:43:33 -04:00
Andy Eschbacher
3013998e1b make functions more flexible to case of weight type 2016-06-01 12:19:08 -04:00
Andy Eschbacher
2b8adb744d switched signature to put more common options in the front 2016-06-01 12:09:43 -04:00
Andy Eschbacher
434cfcd1e9 Merge branch 'develop' into adding-hot-and-cold-spot-function 2016-06-01 11:38:46 -04:00
Andy Eschbacher
b5c6b42081 update names to align with CamelCase convention 2016-06-01 11:33:02 -04:00
Andy Eschbacher
408e34cd38 newline 2016-05-26 10:53:33 -04:00
Andy Eschbacher
ca610af4d8 Merge branch 'develop' into add-spatial-markov 2016-05-25 12:05:03 -04:00
Andy Eschbacher
e80fdca7fc removing time-binning options, reorganizes signature 2016-05-25 12:02:47 -04:00
Andy Eschbacher
fd7aa4140a change function name to match requirements 2016-05-25 12:01:55 -04:00
Andy Eschbacher
7a1eb6b9b6 adding debug to mock plpy 2016-05-25 11:03:58 -04:00
Andy Eschbacher
1b0d1cc82c updating function names 2016-05-24 17:51:29 -04:00
Andy Eschbacher
fe22464b75 Merge pull request #22 from CartoDB/update-docs
Update docs format
2016-05-23 09:51:44 -04:00
Andy Eschbacher
ca5175f15b adding reference to subquery argument requirement 2016-05-20 16:26:43 -04:00
Andy Eschbacher
4e870e4393 adapt other test for new settings in fixtures 2016-05-20 15:15:41 -04:00
Andy Eschbacher
b05ad98ed9 adding tests for hot/cold/outlier for normal moran functions 2016-05-20 15:15:19 -04:00
Andy Eschbacher
bc8055a12b adds pset format unaligned to reduce output for tests 2016-05-20 14:55:16 -04:00
Andy Eschbacher
c3913459d9 adding tests for hotspot, coldspot, and outlier functions 2016-05-20 14:54:42 -04:00
Andy Eschbacher
f571e59a95 adding hot/cold/outlier getters 2016-05-20 14:05:59 -04:00
Andy Eschbacher
0400b1a880 adding template for code reviews 2016-05-20 13:23:56 -04:00
Javier Goizueta
c7e4baa4aa Fix instructions to update/install the extension 2016-05-20 11:47:46 +02:00
Javier Goizueta
cc4a35ebd9 Fix instructions to update/install the extension 2016-05-20 11:47:12 +02:00
Andy Eschbacher
c44434ef08 adding fixture for markov testing 2016-04-01 08:24:33 -04:00
Andy Eschbacher
95247f66bb updating test info 2016-04-01 08:24:09 -04:00
Andy Eschbacher
9ba9d07bb5 building tests for markov 2016-04-01 08:23:46 -04:00
Andy Eschbacher
ef475adc26 testing array outputs 2016-04-01 08:23:10 -04:00
Andy Eschbacher
3294eb35ab remove if conditions and relying on pysal.W to build weights norms 2016-04-01 08:22:27 -04:00
Andy Eschbacher
693f6a68db pylint suggested changes 2016-03-31 11:26:33 -04:00
Stuart Lynn
e73862a6e1 adding function to predict the importance of different features to a dataset. 2016-03-31 11:25:30 -04:00
Andy Eschbacher
fd76a509ea Merge branch 'add-spatial-markov' of https://github.com/CartoDB/crankshaft into add-spatial-markov 2016-03-31 10:29:14 -04:00
Andy Eschbacher
c18baf26d8 adding module refs for pysaul utils 2016-03-31 09:35:10 -04:00
Andy Eschbacher
314d1851db fix tests for array-based inputs 2016-03-31 09:29:51 -04:00
Andy Eschbacher
6165d5e61e restructure to accom list of time columns 2016-03-31 09:27:37 -04:00
Andy Eschbacher
7695102500 update error message 2016-03-30 16:10:53 -04:00
Andy Eschbacher
369d1d2f41 pylinting changes 2016-03-30 16:10:53 -04:00
Andy Eschbacher
e32bab3f88 removed pieces for pysal_utils 2016-03-30 16:10:34 -04:00
Andy Eschbacher
cd3790860a add working version 2016-03-30 16:09:37 -04:00
Andy Eschbacher
1de90a7d39 update output signature 2016-03-30 16:09:37 -04:00
Andy Eschbacher
9535506b93 formated markov sql file a bit 2016-03-30 16:09:37 -04:00
Andy Eschbacher
bae2f04955 update signature used in plpython function 2016-03-30 16:09:37 -04:00
Andy Eschbacher
dc0873cd2b add types to arrays 2016-03-30 16:09:37 -04:00
Andy Eschbacher
d4621a6e9c adding passing tests 2016-03-30 16:09:37 -04:00
Andy Eschbacher
9943d4de58 adding markov python code 2016-03-30 16:08:17 -04:00
Andy Eschbacher
8cccb18eed adding markov functionality 2016-03-30 16:08:17 -04:00
Andy Eschbacher
a0cb699b1a updated move to pysal-utils 2016-03-30 16:06:44 -04:00
Andy Eschbacher
633b63bccc Merge pull request #25 from CartoDB/improve-moran-queries-revisited
adding condition to avoid self-comparison in neighbor queries
2016-03-30 15:40:29 -04:00
Andy Eschbacher
ea02f36235 adding condition to avoid self-comparison in neighbor queries 2016-03-30 15:37:51 -04:00
Andy Eschbacher
22b6aed7c1 Merge pull request #16 from CartoDB/proof-read-and-gitignore-update
Proof read and gitignore update
2016-03-30 12:37:29 -04:00
Andy Eschbacher
f6e8524669 Merge pull request #19 from CartoDB/restructure-moran-redux
Restructure moran redux
2016-03-30 12:10:36 -04:00
Andy Eschbacher
02b74813ac add test for global moran 2016-03-30 12:09:49 -04:00
Andy Eschbacher
4c243bf1d3 correct func signatures 2016-03-30 11:44:44 -04:00
Andy Eschbacher
b0150d4fec adding tests for pysal_utils 2016-03-30 08:27:14 -04:00
Andy Eschbacher
6bb4f36df5 extracting util code to new submodule 2016-03-30 08:10:35 -04:00
Andy Eschbacher
5a46f65e59 update tests to remove plpy notices 2016-03-30 08:09:48 -04:00
Andy Eschbacher
e56519f599 removed unneded comments, make outputs more consistent 2016-03-29 23:39:29 -07:00
Andy Eschbacher
8dd8ab37a5 refactored from pylint 2016-03-29 22:49:31 -07:00
Andy Eschbacher
06f5cf9951 standarizing error reporting 2016-03-29 12:34:23 -07:00
Andy Eschbacher
bc67ae8f69 changed name of functions for observatory 2016-03-29 12:18:52 -07:00
Javier Goizueta
b8330dce07 Fix deployment Makefile goal
The virtual environment virtual directory wasn't set properly for
deployment.
Fixes #21
2016-03-29 14:53:51 +02:00
Andy Eschbacher
2e1b598b4f pylinting changes 2016-03-25 22:49:29 -04:00
Stuart Lynn
d140b4249e updating to use iterative query on prediction 2016-03-24 14:05:17 -04:00
Andy Eschbacher
d398494720 add test case for markov use of functions 2016-03-24 11:34:59 -04:00
Andy Eschbacher
fbc30f1224 add working version 2016-03-24 11:34:28 -04:00
Andy Eschbacher
98c2b11935 update output signature 2016-03-24 11:33:47 -04:00
Andy Eschbacher
491577ed62 formated markov sql file a bit 2016-03-24 08:19:11 -04:00
Andy Eschbacher
68e5e0892c update signature used in plpython function 2016-03-23 20:46:19 -04:00
Andy Eschbacher
00579cd838 adding template 2016-03-23 17:10:08 -04:00
Andy Eschbacher
3f20275d3d adopting new format (wip) 2016-03-23 17:09:52 -04:00
Andy Eschbacher
c5a58f97ec add types to arrays 2016-03-23 15:33:54 -04:00
Andy Eschbacher
42e760b5d1 adding passing tests 2016-03-23 14:50:52 -04:00
Andy Eschbacher
cfb40ddecd fixes test for moran 2016-03-23 13:01:56 -04:00
Andy Eschbacher
f3673d6f89 updating test for moran output 2016-03-23 12:41:15 -04:00
Andy Eschbacher
c488900c8c adding markov python code 2016-03-23 12:39:51 -04:00
Andy Eschbacher
b889416947 adding markov functionality 2016-03-23 12:38:44 -04:00
Andy Eschbacher
58cf210e96 fixes test to acomm markov 2016-03-23 12:37:23 -04:00
Andy Eschbacher
1e16b7839b update methods to acomm markov 2016-03-23 12:35:39 -04:00
Andy Eschbacher
eecbe39547 updating tests 2016-03-22 10:42:44 -04:00
Andy Eschbacher
1578b17eb8 updated function flow without significance 2016-03-22 10:42:06 -04:00
Andy Eschbacher
3eda8ecd16 new signatures for moran (w/o significance) 2016-03-22 10:34:22 -04:00
Andy Eschbacher
9b32918746 fixing cluster query tests 2016-03-21 09:50:57 -04:00
Andy Eschbacher
0aa4d0a50e typo fixes, linking, etc. 2016-03-21 08:51:10 -04:00
Andy Eschbacher
3b31da783a adding mac ds_store ignore 2016-03-21 08:40:37 -04:00
Stuart Lynn
fb071215dc changing the function call to use queries 2016-03-17 19:01:19 -04:00
Stuart Lynn
d3e1fca2b3 changing form of function to use query 2016-03-16 15:46:59 -04:00
Javier Goizueta
8762f6ca1c Merge pull request #12 from CartoDB/feat-moran-free-queries
Allow to pass free queries as `select * from table limit 100` in moran
2016-03-16 19:43:15 +01:00
Raul Ochoa
58c141d217 Allow to pass free queries as select * from table limit 100 in moran 2016-03-16 19:40:06 +01:00
Javier Goizueta
5a7d3178dd Release 0.0.2
This version is the first with the new versioning approach
which uses separate per-version Pyhton virtual enironments.
2016-03-16 19:22:21 +01:00
Javier Goizueta
4903af6cdc Add existing release 0.0.1
The existing 0.0.1 files are placed into their location in release/
2016-03-16 18:41:49 +01:00
Javier Goizueta
692014d694 Merge pull request #11 from CartoDB/new-versioning-package-varenv
New versioning process (with multiple virtual environments)
2016-03-16 18:21:52 +01:00
Javier Goizueta
47e0253652 Fixes to the documentation 2016-03-16 18:18:59 +01:00
Javier Goizueta
9f03a9b075 Reorganize the documentation into separate files
Keep a "Quickstart Guide" in the README, add separate
detailed sections for development (CONTRIBUTING) and
release/deployment (RELEASE).
2016-03-16 17:42:28 +01:00
Javier Goizueta
b5281d0681 Documentation clarifications and corrections. 2016-03-16 17:19:21 +01:00
Javier Goizueta
689ec8a925 Change version function from IMMUTABLE to STABLE
These functions' results will change when the extension
is updated.
2016-03-16 17:09:50 +01:00
Javier Goizueta
a7e42e93cc Rename cdb_crankshaft_internal_version as internal function 2016-03-16 16:41:54 +01:00
Javier Goizueta
bad09ffd7b Remove abandoned alternatives from the documentation 2016-03-16 16:30:03 +01:00
Javier Goizueta
4706442a1d Add documentation about useful make targets 2016-03-16 15:56:19 +01:00
Javier Goizueta
935c7f9963 Add missing Makefile comment 2016-03-16 15:54:39 +01:00
Javier Goizueta
ef3bcaeee8 Restore commented-out make target 2016-03-16 15:52:47 +01:00
Javier Goizueta
4ffb2c9664 Review and fix the documentation 2016-03-16 15:45:13 +01:00
Javier Goizueta
dea6e2f1a7 Refactor the Makefile
Separate concerns properly for each subdirectory's Makefile
2016-03-16 15:40:40 +01:00
Javier Goizueta
d13f167d47 Add RELEASE_VERSION option to make deploy
Now make deploy installs by default the current version,
but can be made to install any prior specific version using
a environmnt varialbe RELEASE_VERSION
2016-03-16 14:38:18 +01:00
Javier Goizueta
a518034e65 Fix .pyc files need not only be ignored inside src/py 2016-03-16 11:13:26 +01:00
Javier Goizueta
24e4037995 Fix version number of released extension script 2016-03-16 11:11:16 +01:00
Javier Goizueta
82a738fe40 Fix make clean tasks 2016-03-16 10:18:07 +01:00
Javier Goizueta
e801c9cb60 Release tasks using release-specific virtual environments
Refine the development process and define the procedure for
releasing new versions.
2016-03-15 18:48:46 +01:00
Stuart Lynn
f134a54c24 adding one shot train and predict function 2016-03-15 12:44:33 -04:00
Stuart Lynn
803781e08d one stop segmentation shop 2016-03-11 14:09:53 -05:00
Javier Goizueta
0206cc6c44 Update documentation 2016-03-10 19:13:46 +01:00
Stuart Lynn
fcf57289fc Training section now works 2016-03-10 12:50:50 -05:00
Rafa de la Torre
b754ffe42a Add info about python dependencies 2016-03-10 18:06:21 +01:00
Stuart Lynn
f885cc9f7b more segmentation fleshing out 2016-03-09 20:04:12 -05:00
Javier Goizueta
0056f411b5 Set the path to virtualenvs in the Makefile
Also, version the virtualenv
2016-03-09 19:04:21 +01:00
Javier Goizueta
1810f02242 Use SciPy from system package python-scipy 2016-03-09 15:03:17 +01:00
Javier Goizueta
8e972128eb Modify sql code to user the python virtualenv 2016-03-09 15:00:50 +01:00
Javier Goizueta
cdd2d9e722 Directory reorganization and sketch of new versioning procedure 2016-03-08 19:35:02 +01:00
Stuart Lynn
d96d6b2c48 fleshing out segmentation 2016-03-07 11:41:37 -05:00
Stuart Lynn
746dcc9723 segmentation 2016-03-05 17:55:32 -05:00
184 changed files with 12875 additions and 585 deletions

7
.github/PULL_REQUEST_TEMPLATE.md vendored Normal file
View File

@@ -0,0 +1,7 @@
- [ ] All declared geometries are `geometry(Geometry, 4326)` for general geoms, or `geometry(Point, 4326)`
- [ ] Existing functions in crankshaft python library called from the extension are kept at least from version N to version N+1 (to avoid breakage during upgrades).
- [ ] Docs for public-facing functions are written
- [ ] New functions follow the naming conventions: `CDB_NameOfFunction`. Where internal functions begin with an underscore `_`.
- [ ] If appropriate, new functions accepts an arbitrary query as an input (see [Crankshaft Issue #6](https://github.com/CartoDB/crankshaft/issues/6) for more information)

4
.gitignore vendored Normal file
View File

@@ -0,0 +1,4 @@
envs/
*.pyc
.DS_Store
.idea/

View File

@@ -1,84 +1,93 @@
# Contributing guide
# Development process
## How to add new functions
Please read the Working Process/Quickstart Guide in [README.md](https://github.com/CartoDB/crankshaft/blob/master/README.md) first.
Try to put as little logic in the SQL extension as possible and
just use it as a wrapper to the Python module functionality.
For any modification of crankshaft, such as adding new features,
refactoring or bug-fixing, topic branch must be created out of the `develop`
branch and be used for the development process.
Once a function is defined it should never change its signature in subsequent
versions. To change a function's signature a new function with a different
name must be created.
Modifications are done inside `src/pg/sql` and `src/py/crankshaft`.
### Version numbers
Take into account:
The version of both the SQL extension and the Python package shall
follow the [Semantic Versioning 2.0](http://semver.org/) guidelines:
* Tests must be added for any new functionality
(inside `src/pg/test`, `src/py/crankshaft/test`) as well as to
detect any bugs that are being fixed.
* Add or modify the corresponding documentation files in the `doc` folder.
Since we expect to have highly technical functions here, an extense
background explanation would be of great help to users of this extension.
* Convention: snake case(i.e. `snake_case` and not `CamelCase`)
shall be used for all function names.
Prefix function names intended for public use with `cdb_`
and private functions (to be used only internally inside
the extension) with `_cdb_`.
* When backwards incompatibility is introduced the major number is incremented
* When functionally is added (in a backwards-compatible manner) the minor number
is incremented
* When only fixes are introduced (backwards-compatible) the patch number is
incremented
Once the code is ready to be tested, update the local development installation
with `sudo make install`.
This will update the 'dev' version of the extension in `src/pg/` and
make it available to PostgreSQL.
It will also install the python package (crankshaft) in a virtual
environment `env/dev`.
### Python Package
The version number of the Python package, defined in
`src/pg/crankshaft/setup.py` will be overridden when
the package is released and always match the extension version number,
but for development it shall be kept as '0.0.0'.
...
Run the tests with `make test`.
### SQL Extension
* Generate a **new subfolder version** for `sql` and `test` folders to define
the new functions and tests
- Use symlinks to avoid file duplication between versions that don't update them
- Add new files or modify copies of the old files to add new functions or
modify existing functions (remember to rename a function if the signature
changes)
- Add or modify the corresponding documentation files in the `doc` folder.
Since we expect to have highly technical functions here, an extense
background explanation would be of great help to users of this extension.
- Create tests for the new functions/behaviour
* Generate the **upgrade and downgrade files** for the extension
* Update the control file and the Makefile to generate the complete SQL
file for the new created version. After running `make` a new
file `crankshaft--X.Y.Z.sql` will be created for the current version.
Additional files for migrating to/from the previous version A.B.Z should be
created:
- `crankshaft--X.Y.Z--A.B.C.sql`
- `crankshaft--A.B.C--X.Y.Z.sql`
All these new files must be added to git and pushed.
* Update the public docs! ;-)
## Conventions
# SQL
Use snake case (i.e. `snake_case` and not `CamelCase`) for all
functions. Prefix functions intended for public use with `cdb_`
and private functions (to be used only internally inside
the extension) with `_cdb_`.
# Python
...
## Testing
Running just the Python tests:
To use the python extension for custom tests, activate the virtual
environment with:
```
(cd python && make test)
source envs/dev/bin/activate
```
Installing the Extension and running just the PostgreSQL tests:
Update extension in a working database with:
* `ALTER EXTENSION crankshaft UPDATE TO 'current';`
`ALTER EXTENSION crankshaft UPDATE TO 'dev';`
Note: we keep the current development version install as 'dev' always;
we update through the 'current' alias to allow changing the extension
contents but not the version identifier. This will fail if the
changes involve incompatible function changes such as a different
return type; in that case the offending function (or the whole extension)
should be dropped manually before the update.
If the extension has not previously been installed in a database,
it can be installed directly with:
* `CREATE EXTENSION IF NOT EXISTS plpythonu;`
`CREATE EXTENSION IF NOT EXISTS postgis;`
`CREATE EXTENSION crankshaft WITH VERSION 'dev';`
Note: the development extension uses the development python virtual
environment automatically.
Before proceeding to the release process peer code reviewing of the code is
a must.
Once the feature or bugfix is completed and all the tests are passing
a Pull-Request shall be created on the topic branch, reviewed by a peer
and then merged back into the `develop` branch when all CI tests pass.
When the changes in the `develop` branch are to be released in a new
version of the extension, a PR must be created on the `develop` branch.
The release manage will take hold of the PR at this moment to proceed
to the release process for a new revision of the extension.
## Relevant development tasks available in the Makefile
```
(cd pg && sudo make install && PGUSER=postgres make installcheck)
```
* `make help` show a short description of the available targets
Installing and testing everything:
* `sudo make install` will generate the extension scripts for the development
version ('dev'/'current') and install the python package into the
development virtual environment `envs/dev`.
Intended for use by developers.
```
sudo make install && PGUSER=postgres make testinstalled
* `make test` will run the tests for the installed development extension.
Intended for use by developers.
```

View File

@@ -1,43 +0,0 @@
# Workflow
... (branching/merging flow)
# Deployment
...
Deployment to db servers: the next command will install both the Python
package and the extension.
```
sudo make install
```
Installing only the Python package:
```
sudo pip install python/crankshaft --upgrade
```
Caveat: note that `pip install ./crankshaft` will install
from local files, but `pip install crankshaft` will not.
CI: Install and run the tests on the installed extension and package:
```
(sudo make install && PGUSER=postgres make testinstalled)
```
Installing the extension in user databases:
Once installed in a server, the extension can be added
to a database with the next SQL command:
```
CREATE EXTENSION crankshaft;
```
To upgrade the extension to an specific version X.Y.Z:
```
ALTER EXTENSION crankshaft UPGRADE TO 'X.Y.Z';
```

View File

@@ -1,13 +1,64 @@
EXT_DIR = pg
PYP_DIR = python
include ./Makefile.global
EXT_DIR = src/pg
PYP_DIR = src/py
.PHONY: install
.PHONY: run_tests
.PHONY: release
.PHONY: deploy
install:
# Generate and install developmet versions of the extension
# and python package.
# The extension is named 'dev' with a 'current' alias for easily upgrading.
# Requires sudo.
install: ## Generate and install development version of the extension; requires sudo.
$(MAKE) -C $(PYP_DIR) install
$(MAKE) -C $(EXT_DIR) install
testinstalled:
$(MAKE) -C $(PYP_DIR) testinstalled
$(MAKE) -C $(EXT_DIR) installcheck
# Run the tests for the installed development extension and
# python package
test: ## Run the tests for the development version of the extension
$(MAKE) -C $(PYP_DIR) test
$(MAKE) -C $(EXT_DIR) test
# Generate a new release into release
release: ## Generate a new release of the extension. Only for telease manager
$(MAKE) -C $(EXT_DIR) release
$(MAKE) -C $(PYP_DIR) release
# Install the current release.
# Requires sudo.
# Use the RELEASE_VERSION environment variable to deploy a specific version:
# sudo make deploy RELEASE_VERSION=1.0.0
deploy: ## Deploy a released extension. Only for release manager. Requires sudo.
$(MAKE) -C $(EXT_DIR) deploy
$(MAKE) -C $(PYP_DIR) deploy
# Cleanup development extension script files
clean-dev: ## clean up development extension script files
rm -f src/pg/$(EXTENSION)--*.sql
# Cleanup all releases
clean-releases: ## clean up all releases
rm -rf release/python/*
rm -f release/$(EXTENSION)--*.sql
rm -f release/$(EXTENSION).control
# Cleanup current/specific version
clean-release: ## clean up current release
rm -rf release/python/$(RELEASE_VERSION)
rm -f release/$(RELEASE_VERSION)--*.sql
clean-all: clean-dev clean-release
help:
@IFS=$$'\n' ; \
help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//'`); \
for help_line in $${help_lines[@]}; do \
IFS=$$'#' ; \
help_split=($$help_line) ; \
help_command=`echo $${help_split[0]} | sed -e 's/^ *//' -e 's/ *$$//'` ; \
help_info=`echo $${help_split[2]} | sed -e 's/^ *//' -e 's/ *$$//'` ; \
printf "%-30s %s\n" $$help_command $$help_info ; \
done

6
Makefile.global Normal file
View File

@@ -0,0 +1,6 @@
SELF_DIR := $(dir $(lastword $(MAKEFILE_LIST)))
EXTENSION = crankshaft
PACKAGE = crankshaft
EXTVERSION = $(shell grep default_version $(SELF_DIR)/src/pg/$(EXTENSION).control | sed -e "s/default_version[[:space:]]*=[[:space:]]*'\([^']*\)'/\1/")
RELEASE_VERSION ?= $(EXTVERSION)
SED = sed

24
NEWS.md Normal file
View File

@@ -0,0 +1,24 @@
0.1.0 (2016-06-29)
------------------
* Adds Spatial Markov function
* Adds Spacial interpolation function
* Adds `CDB_pyAgg (columns Numeric[])` helper function
* Adds Segmentation Functions
0.0.4 (2016-06-20)
------------------
* Remove cartodb extension dependency from tests
* Declare all correct dependencies with correct versions in setup.py
0.0.3 (2016-06-16)
------------------
* Adds new functions: kmeans, weighted centroids.
* Replaces moran functions with new areas of interest naming.
0.0.2 (2016-03-16)
------------------
* New versioning approach using per-version Python virtual environments
0.0.1 (2016-02-22)
------------------
* Preliminar release

View File

@@ -4,9 +4,67 @@ CartoDB Spatial Analysis extension for PostgreSQL.
## Code organization
* *pg* contains the PostgreSQL extension source code
* *python* Python module
* *doc* documentation
* *src* source code
* - *src/pg* contains the PostgreSQL extension source code
* - *src/py* Python module source code
* *release* reseleased versions
## Requirements
* pip
* pip, PostgreSQL
* python-scipy system package (see [src/py/README.md](https://github.com/CartoDB/crankshaft/blob/master/src/py/README.md))
# Working Process -- Quickstart Guide
We distinguish two roles regarding the development cycle of crankshaft:
* *developers* will implement new functionality and bugfixes into
the codebase and will request for new releases of the extension.
* A *release manager* will attend these requests and will handle
the release process. The release process is sequential:
no concurrent releases will ever be in the works.
We use the default `develop` branch as the basis for development.
The `master` branch is used to merge and tag releases to be
deployed in production.
Developers shall create a new topic branch from `develop` for any new feature
or bugfix and commit their changes to it and eventually merge back into
the `develop` branch. When a new release is required a Pull Request
will be open against the `develop` branch.
The `develop` pull requests will be handled by the release manage,
who will merge into master where new releases are prepared and tagged.
The `master` branch is the sole responsibility of the release masters
and developers must not commit or merge into it.
## Development Guidelines
For a detailed description of the development process please see
the [CONTRIBUTING.md](https://github.com/CartoDB/crankshaft/blob/master/CONTRIBUTING.md) guide.
Any modification to the source code (`src/pg/sql` for the SQL extension,
`src/py/crankshaft` for the Python package) shall always be done
in a topic branch created from the `develop` branch.
Tests, documentation and peer code reviewing are required for all
modifications.
The tests (both for SQL and Python) are executed by running,
from the top directory:
```
sudo make install
make test
```
To request a new release, which will be handled by them
release manager, a Pull Request must be created in the `develop`
branch.
## Release
The release and deployment process is described in the
[RELEASE.md](https://github.com/CartoDB/crankshaft/blob/master/RELEASE.md) guide and it is the responsibility of the designated
release manager.

93
RELEASE.md Normal file
View File

@@ -0,0 +1,93 @@
# Release & Deployment Process
Please read the Working Process/Quickstart Guide in README.md
and the Development guidelines in CONTRIBUTING.md.
The release process of a new version of the extension
shall be performed by the designated *Release Manager*.
Note that we expect to gradually automate more of this process.
Having checked PR to be released it shall be
merged back into the `master` branch to prepare the new release.
The version number in `pg/cranckshaft.control` must first be updated.
To do so [Semantic Versioning 2.0](http://semver.org/) is in order.
Thew `NEWS.md` will be updated.
We now will explain the process for the case of backwards-compatible
releases (updating the minor or patch version numbers).
TODO: document the complex case of major releases.
The next command must be executed to produce the main installation
script for the new release, `release/cranckshaft--X.Y.Z.sql` and
also to copy the python package to `release/python/X.Y.Z/crankshaft`.
```
make release
```
Then, the release manager shall produce upgrade and downgrade scripts
to migrate to/from the previous release. In the case of minor/patch
releases this simply consist in extracting the functions that have changed
and placing them in the proper `release/cranckshaft--X.Y.Z--A.B.C.sql`
file.
The new release can be deployed for staging/smoke tests with this command:
```
sudo make deploy
```
This will copy the current 'X.Y.Z' released version of the extension to
PostgreSQL. The corresponding Python extension will be installed in a
virtual environment in `envs/X.Y.Z`.
It can be activated with:
```
source envs/X.Y.Z/bin/activate
```
But note that this is needed only for using the package directly;
the 'X.Y.Z' version of the extension will automatically use the
python package from this virtual environment.
The `sudo make deploy` operation can be also used for installing
the new version after it has been released.
To install a specific version 'X.Y.Z' different from the current one
(which must be present in `releases/`) you can:
```
sudo make deploy RELEASE_VERSION=X.Y.Z
```
TODO: testing procedure for the new release.
TODO: procedure for staging deployment.
TODO: procedure for merging to master, tagging and deploying
in production.
## Relevant release & deployment tasks available in the Makefile
```
* `make help` show a short description of the available targets
* `make release` will generate a new release (version number defined in
`src/pg/crankshaft.control`) into `release/`.
Intended for use by the release manager.
* `sudo make deploy` will install the current release X.Y.Z from the
`release/` files into PostgreSQL and a Python virtual environment
`envs/X.Y.Z`.
Intended for use by the release manager and deployment jobs.
* `sudo make deploy RELEASE_VERSION=X.Y.Z` will install specified version
previously generated in `release/`
into PostgreSQL and a Python virtual environment `envs/X.Y.Z`.
Intended for use by the release manager and deployment jobs.
```

View File

@@ -1,9 +0,0 @@
* [x] Support versioning
* [x] Test use of `plpy` from python Package
* [x] Add `pysal` etc. dependencies
* [x] Define documentation practices (general, per extension/package?)
* [x] Add initial function set (WIP)
* Unify style of function comments
* [x] Add integration tests
* Make target to open a new version development (create symlinks, etc.)
* [x] Should add cartodb ext. as a dependency?

185
doc/02_moran.md Normal file
View File

@@ -0,0 +1,185 @@
## Areas of Interest Functions
### CDB_AreasOfInterestLocal(subquery text, column_name text)
This function classifies your data as being part of a cluster, as an outlier, or not part of a pattern based the significance of a classification. The classification happens through an autocorrelation statistic called Local Moran's I.
#### Arguments
| Name | Type | Description |
|------|------|-------------|
| subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM interesting_table`). This query must have the geometry column name `the_geom` and id column name `cartodb_id` unless otherwise specified in the input arguments |
| column_name | TEXT | Name of column (e.g., should be `'interesting_value'` instead of `interesting_value` without single quotes) used for the analysis. |
| weight type (optional) | TEXT | Type of weight to use when finding neighbors. Currently available options are 'knn' (default) and 'queen'. Read more about weight types in [PySAL's weights documentation](https://pysal.readthedocs.io/en/v1.11.0/users/tutorials/weights.html). |
| num_ngbrs (optional) | INT | Number of neighbors if using k-nearest neighbors weight type. Defaults to 5. |
| permutations (optional) | INT | Number of permutations to check against a random arrangement of the values in `column_name`. This influences the accuracy of the output field `significance`. Defaults to 99. |
| geom_col (optional) | TEXT | The column name for the geometries. Defaults to `'the_geom'` |
| id_col (optional) | TEXT | The column name for the unique ID of each geometry/value pair. Defaults to `'cartodb_id'`. |
#### Returns
A table with the following columns.
| Column Name | Type | Description |
|-------------|------|-------------|
| moran | NUMERIC | Value of Moran's I (spatial autocorrelation measure) for the geometry with id of `rowid` |
| quads | TEXT | Classification of geometry. Result is one of 'HH' (a high value with neighbors high on average), 'LL' (opposite of 'HH'), 'HL' (a high value surrounded by lows on average), and 'LH' (opposite of 'HL'). Null values are returned when nulls exist in the original data. |
| significance | NUMERIC | The statistical significance (from 0 to 1) of a cluster or outlier classification. Lower numbers are more significant. |
| rowid | INT | Row id of the values which correspond to the input rows. |
| vals | NUMERIC | Values from `'column_name'`. |
#### Example Usage
```sql
SELECT
c.the_geom,
aoi.quads,
aoi.significance,
c.num_cyclists_per_total_population
FROM CDB_GetAreasOfInterestLocal('SELECT * FROM commute_data'
'num_cyclists_per_total_population') As aoi
JOIN commute_data As c
ON c.cartodb_id = aoi.rowid;
```
### CDB_AreasOfInterestGlobal(subquery text, column_name text)
This function identifies the extent to which geometries cluster (the groupings of geometries with similarly high or low values relative to the mean) or form outliers (areas where geometries have values opposite of their neighbors). The output of this function gives values between -1 and 1 as well as a significance of that classification. Values close to 0 mean that there is little to no distribution of values as compared to what one would see in a randomly distributed collection of geometries and values.
#### Arguments
| Name | Type | Description |
|------|------|-------------|
| subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM interesting_table`). This query must have the geometry column name `the_geom` and id column name `cartodb_id` unless otherwise specified in the input arguments |
| column_name | TEXT | Name of column (e.g., should be `'interesting_value'` instead of `interesting_value` without single quotes) used for the analysis. |
| weight type (optional) | TEXT | Type of weight to use when finding neighbors. Currently available options are 'knn' (default) and 'queen'. Read more about weight types in [PySAL's weights documentation](https://pysal.readthedocs.io/en/v1.11.0/users/tutorials/weights.html). |
| num_ngbrs (optional) | INT | Number of neighbors if using k-nearest neighbors weight type. Defaults to 5. |
| permutations (optional) | INT | Number of permutations to check against a random arrangement of the values in `column_name`. This influences the accuracy of the output field `significance`. Defaults to 99. |
| geom_col (optional) | TEXT | The column name for the geometries. Defaults to `'the_geom'` |
| id_col (optional) | TEXT | The column name for the unique ID of each geometry/value pair. Defaults to `'cartodb_id'`. |
#### Returns
A table with the following columns.
| Column Name | Type | Description |
|-------------|------|-------------|
| moran | NUMERIC | Value of Moran's I (spatial autocorrelation measure) for the entire dataset. Values closer to one indicate cluster, closer to -1 mean more outliers, and near zero indicates a random distribution of data. |
| significance | NUMERIC | The statistical significance of the `moran` measure. |
#### Examples
```sql
SELECT *
FROM CDB_AreasOfInterestGlobal('SELECT * FROM commute_data', 'num_cyclists_per_total_population')
```
### CDB_AreasOfInterestLocalRate(subquery text, numerator_column text, denominator_column text)
Just like `CDB_AreasOfInterestLocal`, this function classifies your data as being part of a cluster, as an outlier, or not part of a pattern based the significance of a classification. This function differs in that it calculates the classifications based on input `numerator` and `denominator` columns for finding the areas where there are clusters and outliers for the resulting rate of those two values.
#### Arguments
| Name | Type | Description |
|------|------|-------------|
| subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM interesting_table`). This query must have the geometry column name `the_geom` and id column name `cartodb_id` unless otherwise specified in the input arguments |
| numerator | TEXT | Name of the numerator for forming a rate to be used in analysis. |
| denominator | TEXT | Name of the denominator for forming a rate to be used in analysis. |
| weight type (optional) | TEXT | Type of weight to use when finding neighbors. Currently available options are 'knn' (default) and 'queen'. Read more about weight types in [PySAL's weights documentation](https://pysal.readthedocs.io/en/v1.11.0/users/tutorials/weights.html). |
| num_ngbrs (optional) | INT | Number of neighbors if using k-nearest neighbors weight type. Defaults to 5. |
| permutations (optional) | INT | Number of permutations to check against a random arrangement of the values in `column_name`. This influences the accuracy of the output field `significance`. Defaults to 99. |
| geom_col (optional) | TEXT | The column name for the geometries. Defaults to `'the_geom'` |
| id_col (optional) | TEXT | The column name for the unique ID of each geometry/value pair. Defaults to `'cartodb_id'`. |
#### Returns
A table with the following columns.
| Column Name | Type | Description |
|-------------|------|-------------|
| moran | NUMERIC | Value of Moran's I (spatial autocorrelation measure) for the geometry with id of `rowid` |
| quads | TEXT | Classification of geometry. Result is one of 'HH' (a high value with neighbors high on average), 'LL' (opposite of 'HH'), 'HL' (a high value surrounded by lows on average), and 'LH' (opposite of 'HL'). Null values are returned when nulls exist in the original data. |
| significance | NUMERIC | The statistical significance (from 0 to 1) of a cluster or outlier classification. Lower numbers are more significant. |
| rowid | INT | Row id of the values which correspond to the input rows. |
| vals | NUMERIC | Values from `'column_name'`. |
#### Example Usage
```sql
SELECT
c.the_geom,
aoi.quads,
aoi.significance,
c.cyclists_per_total_population
FROM CDB_GetAreasOfInterestLocalRate('SELECT * FROM commute_data'
'num_cyclists',
'total_population') As aoi
JOIN commute_data As c
ON c.cartodb_id = aoi.rowid;
```
### CDB_AreasOfInterestGlobalRate(subquery text, column_name text)
This function identifies the extent to which geometries cluster (the groupings of geometries with similarly high or low values relative to the mean) or form outliers (areas where geometries have values opposite of their neighbors). The output of this function gives values between -1 and 1 as well as a significance of that classification. Values close to 0 mean that there is little to no distribution of values as compared to what one would see in a randomly distributed collection of geometries and values.
#### Arguments
| Name | Type | Description |
|------|------|-------------|
| subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM interesting_table`). This query must have the geometry column name `the_geom` and id column name `cartodb_id` unless otherwise specified in the input arguments |
| numerator | TEXT | Name of the numerator for forming a rate to be used in analysis. |
| denominator | TEXT | Name of the denominator for forming a rate to be used in analysis. |
| weight type (optional) | TEXT | Type of weight to use when finding neighbors. Currently available options are 'knn' (default) and 'queen'. Read more about weight types in [PySAL's weights documentation](https://pysal.readthedocs.io/en/v1.11.0/users/tutorials/weights.html). |
| num_ngbrs (optional) | INT | Number of neighbors if using k-nearest neighbors weight type. Defaults to 5. |
| permutations (optional) | INT | Number of permutations to check against a random arrangement of the values in `column_name`. This influences the accuracy of the output field `significance`. Defaults to 99. |
| geom_col (optional) | TEXT | The column name for the geometries. Defaults to `'the_geom'` |
| id_col (optional) | TEXT | The column name for the unique ID of each geometry/value pair. Defaults to `'cartodb_id'`. |
#### Returns
A table with the following columns.
| Column Name | Type | Description |
|-------------|------|-------------|
| moran | NUMERIC | Value of Moran's I (spatial autocorrelation measure) for the entire dataset. Values closer to one indicate cluster, closer to -1 mean more outliers, and near zero indicates a random distribution of data. |
| significance | NUMERIC | The statistical significance of the `moran` measure. |
#### Examples
```sql
SELECT *
FROM CDB_AreasOfInterestGlobalRate('SELECT * FROM commute_data',
'num_cyclists',
'total_population')
```
## Hotspot, Coldspot, and Outlier Functions
These functions are convenience functions for extracting only information that you are interested in exposing based on the outputs of the `CDB_AreasOfInterest` functions. For instance, you can use `CDB_GetSpatialHotspots` to output only the classifications of `HH` and `HL`.
### Non-rate functions
#### CDB_GetSpatialHotspots
This function's inputs and outputs exactly mirror `CDB_AreasOfInterestLocal` except that the outputs are filtered to be only 'HH' and 'HL' (areas of high values). For more information about this function's use, see `CDB_AreasOfInterestLocal`.
#### CDB_GetSpatialColdspots
This function's inputs and outputs exactly mirror `CDB_AreasOfInterestLocal` except that the outputs are filtered to be only 'LL' and 'LH' (areas of low values). For more information about this function's use, see `CDB_AreasOfInterestLocal`.
#### CDB_GetSpatialOutliers
This function's inputs and outputs exactly mirror `CDB_AreasOfInterestLocal` except that the outputs are filtered to be only 'HL' and 'LH' (areas where highs or lows are surrounded by opposite values on average). For more information about this function's use, see `CDB_AreasOfInterestLocal`.
### Rate functions
#### CDB_GetSpatialHotspotsRate
This function's inputs and outputs exactly mirror `CDB_AreasOfInterestLocalRate` except that the outputs are filtered to be only 'HH' and 'HL' (areas of high values). For more information about this function's use, see `CDB_AreasOfInterestLocalRate`.
#### CDB_GetSpatialColdspotsRate
This function's inputs and outputs exactly mirror `CDB_AreasOfInterestLocalRate` except that the outputs are filtered to be only 'LL' and 'LH' (areas of low values). For more information about this function's use, see `CDB_AreasOfInterestLocalRate`.
#### CDB_GetSpatialOutliersRate
This function's inputs and outputs exactly mirror `CDB_AreasOfInterestLocalRate` except that the outputs are filtered to be only 'HL' and 'LH' (areas where highs or lows are surrounded by opposite values on average). For more information about this function's use, see `CDB_AreasOfInterestLocalRate`.

47
doc/04_markov.md Normal file
View File

@@ -0,0 +1,47 @@
## Spatial Markov
### CDB_SpatialMarkovTrend(subquery text, column_names text array)
This function takes time series data associated with geometries and outputs likelihoods that the next value of a geometry will move up, down, or stay static as compared to the most recent measurement. For more information, read about [Spatial Dynamics in PySAL](https://pysal.readthedocs.io/en/v1.11.0/users/tutorials/dynamics.html).
#### Arguments
| Name | Type | Description |
|------|------|-------------|
| subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM real_estate_history`). This query must have the geometry column name `the_geom` and id column name `cartodb_id` unless otherwise specified in the input arguments |
| column_names | TEXT Array | Names of column that form the history of measurements for the geometries (e.g., `Array['y2011', 'y2012', 'y2013', 'y2014', 'y2015', 'y2016']`). |
| num_classes (optional) | INT | Number of quantile classes to separate data into. |
| weight type (optional) | TEXT | Type of weight to use when finding neighbors. Currently available options are 'knn' (default) and 'queen'. Read more about weight types in [PySAL's weights documentation](https://pysal.readthedocs.io/en/v1.11.0/users/tutorials/weights.html). |
| num_ngbrs (optional) | INT | Number of neighbors if using k-nearest neighbors weight type. Defaults to 5. |
| permutations (optional) | INT | Number of permutations to check against a random arrangement of the values in `column_name`. This influences the accuracy of the output field `significance`. Defaults to 99. |
| geom_col (optional) | TEXT | The column name for the geometries. Defaults to `'the_geom'` |
| id_col (optional) | TEXT | The column name for the unique ID of each geometry/value pair. Defaults to `'cartodb_id'`. |
#### Returns
A table with the following columns.
| Column Name | Type | Description |
|-------------|------|-------------|
| trend | NUMERIC | The probability that the measure at this location will move up (a positive number) or down (a negative number) |
| trend_up | NUMERIC | The probability that a measure will move up in subsequent steps of time |
| trend_down | NUMERIC | The probability that a measure will move down in subsequent steps of time |
| volatility | NUMERIC | A measure of the variance of the probabilities returned from the Spatial Markov predictions |
| rowid | NUMERIC | id of the row that corresponds to the `id_col` (by default `cartodb_id` of the input rows) |
#### Example Usage
```sql
SELECT
c.cartodb_id,
c.the_geom,
m.trend,
m.trend_up,
m.trend_down,
m.volatility
FROM CDB_SpatialMarkovTrend('SELECT * FROM nyc_real_estate'
Array['m03y2009','m03y2010','m03y2011','m03y2012','m03y2013','m03y2014','m03y2015','m03y2016']) As m
JOIN nyc_real_estate As c
ON c.cartodb_id = m.rowid;
```

23
doc/04_pyAgg.md Normal file
View File

@@ -0,0 +1,23 @@
## PyAgg Helper Function
### CDB_pyAgg (columns Numeric[])
Currently it's not possible to pass a multidiemensional array between plpsql and plpythonu. This function aims to
help fix that by aggergating the columns provided in the argument across rows in to a rows * columns + 1 length 1D array. The first element of the array is the array\_length of the columns argument so that python can reconstruct
the 2D array.
#### Arguments
| Name | Type | Description |
|------|------|-------------|
| columns | NUMERIC[] | The columns to aggregate across rows|
#### Returns
A table with the following columns.
| Column Name | Type | Description |
|-------------|------|-------------|
| result | NUMERIC[] | An columns * rows + 1 array where the first entry is the no of columns|

51
doc/08_interpolation.md Normal file
View File

@@ -0,0 +1,51 @@
## Spacial interpolation
Function to interpolate a numeric attribute of a point in a scatter dataset of points, using one of three methos:
* [Nearest neighbor](https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation)
* [Barycentric](https://en.wikipedia.org/wiki/Barycentric_coordinate_system)
* [IDW](https://en.wikipedia.org/wiki/Inverse_distance_weighting)
### CDB_SpatialInterpolation (query text, point geometry, method integer DEFAULT 1, p1 integer DEFAULT 0, ps integer DEFAULT 0)
#### Arguments
| Name | Type | Description |
|------|------|-------------|
| query | text | query that returns at least `the_geom` and a numeric value as `attrib` |
| point | geometry | The target point to calc the value |
| method | integer | 0:nearest neighbor, 1: barycentric, 2: IDW|
| p1 | integer | IDW: limit the number of neighbors, 0->no limit|
| p2 | integer | IDW: order of distance decay, 0-> order 1|
### CDB_SpatialInterpolation (geom geometry[], values numeric[], point geometry, method integer DEFAULT 1, p1 integer DEFAULT 0, ps integer DEFAULT 0)
#### Arguments
| Name | Type | Description |
|------|------|-------------|
| geom | geometry[] | Array of points's geometries |
| values | numeric[] | Array of points' values for the param under study|
| point | geometry | The target point to calc the value |
| method | integer | 0:nearest neighbor, 1: barycentric, 2: IDW|
| p1 | integer | IDW: limit the number of neighbors, 0->no limit|
| p2 | integer | IDW: order of distance decay, 0-> order 1|
### Returns
| Column Name | Type | Description |
|-------------|------|-------------|
| value | numeric | Interpolated value at the given point, `-888.888` if the given point is out of the boundaries of the source points set |
#### Example Usage
```sql
with a as (
select
array_agg(the_geom) as geomin,
array_agg(temp::numeric) as colin
from table_4804232032
)
SELECT CDB_SpatialInterpolation(geomin, colin, CDB_latlng(41.38, 2.15),1) FROM a;
```

62
doc/11_kmeans.md Normal file
View File

@@ -0,0 +1,62 @@
## K-Means Functions
### CDB_KMeans(subquery text, no_clusters INTEGER)
This function attempts to find n clusters within the input data. It will return a table to CartoDB ids and
the number of the cluster each point in the input was assigend to.
#### Arguments
| Name | Type | Description |
|------|------|-------------|
| subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM interesting_table`). This query must have the geometry column name `the_geom` and id column name `cartodb_id` unless otherwise specified in the input arguments |
| no\_clusters | INTEGER | The number of clusters to try and find |
#### Returns
A table with the following columns.
| Column Name | Type | Description |
|-------------|------|-------------|
| cartodb\_id | INTEGER | The CartoDB id of the row in the input table.|
| cluster\_no | INTEGER | The cluster that this point belongs to. |
#### Example Usage
```sql
SELECT
customers.*,
km.cluster_no
FROM cdb_crankshaft.CDB_Kmeans('SELECT * from customers' , 6) km, customers_3
WHERE customers.cartodb_id = km.cartodb_id
```
### CDB_WeightedMean(subquery text, weight_column text, category_column text)
Function that computes the weighted centroid of a number of clusters by some weight column.
### Arguments
| Name | Type | Description |
|------|------|-------------|
| subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM interesting_table`). This query must have the geometry column and the columns specified as the weight and category columns|
| weight\_column | TEXT | The name of the column to use as a weight |
| category\_column | TEXT | The name of the column to use as a category |
### Returns
A table with the following columns.
| Column Name | Type | Description |
|-------------|------|-------------|
| the\_geom | GEOMETRY | A point for the weighted cluster center |
| class | INTEGER | The cluster class |
### Example Usage
```sql
SELECT ST_TRANSFORM(the_geom, 3857) as the_geom_webmercator, class
FROM cdb_weighted_mean('SELECT *, customer_value FROM customers','customer_value','cluster_no')
```

83
doc/12_segmentation.md Normal file
View File

@@ -0,0 +1,83 @@
## Segmentation Functions
### CDB_CreateAndPredictSegment(query TEXT, variable_name TEXT, target_query TEXT)
This function trains a [Gradient Boosting](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) model to attempt to predict the target data and then generates predictions for new data.
#### Arguments
| Name | Type | Description |
|------|------|-------------|
| query | TEXT | The input query to train the algorithm, which should have both the variable of interest and the features that will be used to predict it |
| variable\_name| TEXT | Specify the variable in the query to predict, all other columns are assumed to be features |
| target\_table | TEXT | The query which returns the `cartodb_id` and features for the rows your would like to predict the target variable for |
| n\_estimators (optional) | INTEGER DEFAULT 1200 | Number of estimators to be used. Values should be between 1 and x. |
| max\_depth (optional) | INTEGER DEFAULT 3 | Max tree depth. Values should be between 1 and n. |
| subsample (optional) | DOUBLE PRECISION DEFAULT 0.5 | Subsample parameter for GradientBooster. Values should be within the range 0 to 1. |
| learning\_rate (optional) | DOUBLE PRECISION DEFAULT 0.01 | Learning rate for the GradientBooster. Values should be between 0 and 1 (??) |
| min\_samples\_leaf (optional) | INTEGER DEFAULT 1 | Minimum samples to use per leaf. Values should range from x to y |
#### Returns
A table with the following columns.
| Column Name | Type | Description |
|-------------|------|-------------|
| cartodb\_id | INTEGER | The CartoDB id of the row in the target\_query |
| prediction | NUMERIC | The predicted value of the variable of interest |
| accuracy | NUMERIC | The mean squared accuracy of the model. |
#### Example Usage
```sql
SELECT * from cdb_crankshaft.CDB_CreateAndPredictSegment(
'SELECT agg, median_rent::numeric, male_pop::numeric, female_pop::numeric FROM late_night_agg',
'agg',
'SELECT row_number() OVER () As cartodb_id, median_rent, male_pop, female_pop FROM ml_learning_ny');
```
### CDB_CreateAndPredictSegment(target numeric[], train_features numeric[], prediction_features numeric[], prediction_ids numeric[])
This function trains a [Gradient Boosting](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) model to attempt to predict the target data and then generates predictions for new data.
#### Arguments
| Name | Type | Description |
|------|------|-------------|
| target | numeric[] | An array of target values of the variable you want to predict|
| train\_features| numeric[] | 1D array of length n features \* n\_rows + 1 with the first entry in the array being the number of features in each row. These are the features the model will be trained on. CDB\_Crankshaft.CDB_pyAgg(Array[feature1, feature2, feature3]::numeric[]) can be used to construct this. |
| prediction\_features | numeric[] | 1D array of length nfeatures\* n\_rows\_ + 1 with the first entry in the array being the number of features in each row. These are the features that will be used to predict the target variable CDB\_Crankshaft.CDB\_pyAgg(Array[feature1, feature2, feature3]::numeric[]) can be used to construct this. |
| prediction\_ids | numeric[] | 1D array of length n\_rows with the ids that can use used to re-join the data with inputs |
| n\_estimators (optional) | INTEGER DEFAULT 1200 | Number of estimators to be used |
| max\_depth (optional) | INTEGER DEFAULT 3 | Max tree depth |
| subsample (optional) | DOUBLE PRECISION DEFAULT 0.5 | Subsample parameter for GradientBooster|
| learning\_rate (optional) | DOUBLE PRECISION DEFAULT 0.01 | Learning rate for the GradientBooster |
| min\_samples\_leaf (optional) | INTEGER DEFAULT 1 | Minimum samples to use per leaf |
#### Returns
A table with the following columns.
| Column Name | Type | Description |
|-------------|------|-------------|
| cartodb\_id | INTEGER | The CartoDB id of the row in the target\_query |
| prediction | NUMERIC | The predicted value of the variable of interest |
| accuracy | NUMERIC | The mean squared accuracy of the model. |
#### Example Usage
```sql
WITH training As (
SELECT array_agg(agg) As target,
cdb_crankshaft.CDB_PyAgg(Array[median_rent, male_pop, female_pop]::Numeric[]) As features
FROM late_night_agg),
target AS (
SELECT cdb_crankshaft.CDB_PyAgg(Array[median_rent, male_pop, female_pop]::Numeric[]) As features,
array_agg(cartodb_id) As cartodb_ids FROM late_night_agg)
SELECT cdb_crankshaft.CDB_CreateAndPredictSegment(training.target, training.features, target.features, target.cartodb_ids)
FROM training, target;
```

24
doc/docs_template.md Normal file
View File

@@ -0,0 +1,24 @@
## Name
## Synopsis
## Description
Availability: v...
## Examples
```SQL
-- example of the function in use
SELECT cdb_awesome_function(the_geom, 'total_pop')
FROM table_name
```
## API Usage
_asdf_
## See Also
_Other function pages_

3
pg/.gitignore vendored
View File

@@ -1,3 +0,0 @@
regression.diffs
regression.out
results/

View File

@@ -1,33 +0,0 @@
# Makefile to generate the extension out of separate sql source files.
# Once a version is released, it is not meant to be changed. E.g: once version 0.0.1 is out, it SHALL NOT be changed.
EXTENSION = crankshaft
EXTVERSION = $(shell grep default_version $(EXTENSION).control | sed -e "s/default_version[[:space:]]*=[[:space:]]*'\([^']*\)'/\1/")
# The new version to be generated from templates
NEW_EXTENSION_ARTIFACT = $(EXTENSION)--$(EXTVERSION).sql
# DATA is a special variable used by postgres build infrastructure
# These are the files to be installed in the server shared dir,
# for installation from scratch, upgrades and downgrades.
# @see http://www.postgresql.org/docs/current/static/extend-pgxs.html
DATA = $(NEW_EXTENSION_ARTIFACT)
SOURCES_DATA_DIR = sql/$(EXTVERSION)
SOURCES_DATA = $(wildcard sql/$(EXTVERSION)/*.sql)
# The extension installation artifacts are stored in the base subdirectory
$(NEW_EXTENSION_ARTIFACT): $(SOURCES_DATA)
rm -f $@
cat $(SOURCES_DATA_DIR)/*.sql >> $@
REGRESS = $(notdir $(basename $(wildcard test/$(EXTVERSION)/sql/*test.sql)))
TEST_DIR = test/$(EXTVERSION)
REGRESS_OPTS = --inputdir='$(TEST_DIR)' --outputdir='$(TEST_DIR)'
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
# This seems to be needed at least for PG 9.3.11
all: $(DATA)

View File

@@ -1,7 +0,0 @@
# Running the tests:
```
sudo make install
PGUSER=postgres make installcheck
```

View File

@@ -1,71 +0,0 @@
### Moran's I
#### What is Moran's I and why is it significant for CartoDB?
Moran's I is a geostatistical calculation which gives a measure of the global
clustering and presence of outliers within the geographies in a map. Here global
means over all of the geographies in a dataset. Imagine mapping the incidence
rates of cancer in neighborhoods of a city. If there were areas covering several
neighborhoods with abnormally low rates of cancer, those areas are positively
spatially correlated with one another and would be considered a cluster. If
there was a single neighborhood with a high rate but with all neighbors on
average having a low rate, it would be considered a spatial outlier.
While Moran's I gives a global snapshot, there are local indicators for
clustering called Local Indicators of Spatial Autocorrelation. Clustering is a
process related to autocorrelation -- i.e., a process that compares a
geography's attribute to the attribute in neighbor geographies.
For the example of cancer rates in neighborhoods, since these neighborhoods have
a high value for rate of cancer, and all of their neighbors do as well, they are
designated as "High High" or simply **HH**. For areas with multiple neighborhoods
with low rates of cancer, they are designated as "Low Low" or **LL**. HH and LL
naturally fit into the concept of clustering and are in the correlated
variables.
"Anticorrelated" geogs are in **LH** and **HL** regions -- that is, regions
where a geog has a high value and it's neighbors, on average, have a low value
(or vice versa). An example of this is a "gated community" or placement of a
city housing project in a rich region. These deliberate developments have
opposite median income as compared to the neighbors around them. They have a
high (or low) value while their neighbors have a low (or high) value. They exist
typically as islands, and in rare circumstances can extend as chains dividing
**LL** or **HH**.
Strong policies such as rent stabilization (probably) tend to prevent the
clustering of high rent areas as they integrate middle class incomes. Luxury
apartment buildings, which are a kind of gated community, probably tend to skew
an area's median income upwards while housing projects have the opposite effect.
What are the nuggets in the analysis?
Two functions are available to compute Moran I statistics:
* `cdb_moran_local` computes Moran I measures, quad classification and
significance values from numerial values associated to geometry entities
in an input table. The geometries should be contiguous polygons When
then `queen` `w_type` is used.
* `cdb_moran_local_rate` computes the same statistics using a ratio between
numerator and denominator columns of a table.
The parameters for `cdb_moran_local` are:
* `table` name of the table that contains the data values
* `attr` name of the column
* `signficance` significance threshold for the quads values
* `num_ngbrs` number of neighbors to consider (default: 5)
* `permutations` number of random permutations for calculation of
pseudo-p values (default: 99)
* `geom_column` number of the geometry column (default: "the_geom")
* `id_col` PK column of the table (default: "cartodb_id")
* `w_type` Weight types: can be "knn" for k-nearest neighbor weights
or "queen" for contiguity based weights.
The function returns a table with the following columns:
* `moran` Moran's value
* `quads` quad classification ('HH', 'LL', 'HL', 'LH' or 'Not significant')
* `significance` significance value
* `ids` id of the corresponding record in the input table
Function `cdb_moran_local_rate` only differs in that the `attr` input
parameter is substituted by `numerator` and `denominator`.

View File

@@ -1,260 +0,0 @@
\i test/fixtures/ppoints.sql
-- test table (spanish province centroids with some invented values)
CREATE TABLE ppoints (cartodb_id integer, the_geom geometry, the_geom_webmercator geometry, code text, region_code text, value float);
INSERT INTO ppoints VALUES
( 1,'0101000020E6100000A8306DC0CBC305C051D14B6CE56A4540'::geometry,ST_Transform('0101000020E6100000A8306DC0CBC305C051D14B6CE56A4540'::geometry, 3857),'01','16',0.5),
( 4,'0101000020E6100000E220A4362DC202C0FD8AFA5119994240'::geometry,ST_Transform('0101000020E6100000E220A4362DC202C0FD8AFA5119994240'::geometry, 3857),'04','01',0.1),
( 5,'0101000020E610000004377E573AC813C0CB5871BB17494440'::geometry,ST_Transform('0101000020E610000004377E573AC813C0CB5871BB17494440'::geometry, 3857),'05','07',0.3),
( 2,'0101000020E610000000F49BE19BAFFFBF639958FDA6694340'::geometry,ST_Transform('0101000020E610000000F49BE19BAFFFBF639958FDA6694340'::geometry, 3857),'02','08',0.7),
( 3,'0101000020E61000005D0B7E63C832E2BFDB63EB00443D4340'::geometry,ST_Transform('0101000020E61000005D0B7E63C832E2BFDB63EB00443D4340'::geometry, 3857),'03','10',0.2),
( 6,'0101000020E61000006F3742B7FB9018C0DD967DC4D95A4340'::geometry,ST_Transform('0101000020E61000006F3742B7FB9018C0DD967DC4D95A4340'::geometry, 3857),'06','11',0.05),
( 7,'0101000020E6100000E4BB36995F4C0740EAC0E5CA9FC94340'::geometry,ST_Transform('0101000020E6100000E4BB36995F4C0740EAC0E5CA9FC94340'::geometry, 3857),'07','04',0.4),
( 8,'0101000020E61000003D43CC6CAFBEFF3F6B52E66F91DD4440'::geometry,ST_Transform('0101000020E61000003D43CC6CAFBEFF3F6B52E66F91DD4440'::geometry, 3857),'08','09',0.7),
( 9,'0101000020E61000003CC797BD99AF0CC0495A87FA312F4540'::geometry,ST_Transform('0101000020E61000003CC797BD99AF0CC0495A87FA312F4540'::geometry, 3857),'09','07',0.5),
(13,'0101000020E61000001CAA00A9F19F0EC05DF9267B7A764340'::geometry,ST_Transform('0101000020E61000001CAA00A9F19F0EC05DF9267B7A764340'::geometry, 3857),'13','08',0.4),
(16,'0101000020E6100000D8208F3CBC9001C065638DC1B1F24340'::geometry,ST_Transform('0101000020E6100000D8208F3CBC9001C065638DC1B1F24340'::geometry, 3857),'16','08',0.4),
(17,'0101000020E6100000E9E6A94A71630540AD7A0CB062104540'::geometry,ST_Transform('0101000020E6100000E9E6A94A71630540AD7A0CB062104540'::geometry, 3857),'17','09',0.6),
(18,'0101000020E6100000719792D59E240AC098AC548E00A84240'::geometry,ST_Transform('0101000020E6100000719792D59E240AC098AC548E00A84240'::geometry, 3857),'18','01',0.3),
(19,'0101000020E6100000972C878B50FD04C0123C881D1F684440'::geometry,ST_Transform('0101000020E6100000972C878B50FD04C0123C881D1F684440'::geometry, 3857),'19','08',0.7),
(21,'0101000020E6100000F7893E9934511BC0EAA4BF03E1C94240'::geometry,ST_Transform('0101000020E6100000F7893E9934511BC0EAA4BF03E1C94240'::geometry, 3857),'21','01',0.1),
(22,'0101000020E6100000572C2123B2A8B2BF7ED7FABAFD194540'::geometry,ST_Transform('0101000020E6100000572C2123B2A8B2BF7ED7FABAFD194540'::geometry, 3857),'22','02',0.4),
(25,'0101000020E6100000461B67D688C4F03FD990EEC3A0054540'::geometry,ST_Transform('0101000020E6100000461B67D688C4F03FD990EEC3A0054540'::geometry, 3857),'25','09',0.4),
(26,'0101000020E6100000A139FB06E82204C0539D84F62E234540'::geometry,ST_Transform('0101000020E6100000A139FB06E82204C0539D84F62E234540'::geometry, 3857),'26','17',0.6),
(27,'0101000020E6100000A92E54E618C91DC00D3A947B81814540'::geometry,ST_Transform('0101000020E6100000A92E54E618C91DC00D3A947B81814540'::geometry, 3857),'27','12',0.3),
(28,'0101000020E6100000971DC8B682BC0DC016D0E8055F3F4440'::geometry,ST_Transform('0101000020E6100000971DC8B682BC0DC016D0E8055F3F4440'::geometry, 3857),'28','13',0.8),
(30,'0101000020E6100000A2DC1964A8C5F7BF19299C994D004340'::geometry,ST_Transform('0101000020E6100000A2DC1964A8C5F7BF19299C994D004340'::geometry, 3857),'30','14',0.1),
(31,'0101000020E6100000DCA1FCC87B56FABF9B88E9D866554540'::geometry,ST_Transform('0101000020E6100000DCA1FCC87B56FABF9B88E9D866554540'::geometry, 3857),'31','15',0.9),
(32,'0101000020E6100000E1517AFCD15E1EC0A18D8D4825194540'::geometry,ST_Transform('0101000020E6100000E1517AFCD15E1EC0A18D8D4825194540'::geometry, 3857),'32','12',0.3),
(33,'0101000020E6100000A7FF33825AF917C0FABE7DFB6BA54540'::geometry,ST_Transform('0101000020E6100000A7FF33825AF917C0FABE7DFB6BA54540'::geometry, 3857),'33','03',0.4),
(34,'0101000020E6100000FB4E4EBEB72412C0898E7240982F4540'::geometry,ST_Transform('0101000020E6100000FB4E4EBEB72412C0898E7240982F4540'::geometry, 3857),'34','07',0.3),
(35,'0101000020E6100000224682B01B1A2DC011091656CC5C3C40'::geometry,ST_Transform('0101000020E6100000224682B01B1A2DC011091656CC5C3C40'::geometry, 3857),'35','05',0.3),
(36,'0101000020E6100000F7C9447110EC20C04C5D4823C7374540'::geometry,ST_Transform('0101000020E6100000F7C9447110EC20C04C5D4823C7374540'::geometry, 3857),'36','12',0.2),
(37,'0101000020E610000053D6A26DFB4218C09D58FAE209674440'::geometry,ST_Transform('0101000020E610000053D6A26DFB4218C09D58FAE209674440'::geometry, 3857),'37','07',0.5),
(38,'0101000020E6100000B1D1B5FC910431C03C0C89BA03503C40'::geometry,ST_Transform('0101000020E6100000B1D1B5FC910431C03C0C89BA03503C40'::geometry, 3857),'38','05',0.4),
(39,'0101000020E610000086E6FEE1BD1E10C00417096748994540'::geometry,ST_Transform('0101000020E610000086E6FEE1BD1E10C00417096748994540'::geometry, 3857),'39','06',0.6),
(40,'0101000020E6100000FB51C33F733710C038D01729E4954440'::geometry,ST_Transform('0101000020E6100000FB51C33F733710C038D01729E4954440'::geometry, 3857),'40','07',0.5),
(41,'0101000020E6100000912D6FDA28BB16C031321F08C4B74240'::geometry,ST_Transform('0101000020E6100000912D6FDA28BB16C031321F08C4B74240'::geometry, 3857),'41','01',0.4),
(42,'0101000020E6100000554432EABEB504C069ECD78775CF4440'::geometry,ST_Transform('0101000020E6100000554432EABEB504C069ECD78775CF4440'::geometry, 3857),'42','07',0.2),
(43,'0101000020E6100000157F117C1A2EEA3F027CD1F2368B4440'::geometry,ST_Transform('0101000020E6100000157F117C1A2EEA3F027CD1F2368B4440'::geometry, 3857),'43','09',0.3),
(44,'0101000020E610000051AA5B1BD718EABFEE67613BA4544440'::geometry,ST_Transform('0101000020E610000051AA5B1BD718EABFEE67613BA4544440'::geometry, 3857),'44','02',0.2),
(45,'0101000020E610000022C5C01BB69710C08563BC1499E54340'::geometry,ST_Transform('0101000020E610000022C5C01BB69710C08563BC1499E54340'::geometry, 3857),'45','08',0.3),
(46,'0101000020E6100000D5FCF78A11A0E9BFDEA46F8E64AF4340'::geometry,ST_Transform('0101000020E6100000D5FCF78A11A0E9BFDEA46F8E64AF4340'::geometry, 3857),'46','10',0.2),
(47,'0101000020E61000003AE63525866313C02100050B2BD14440'::geometry,ST_Transform('0101000020E61000003AE63525866313C02100050B2BD14440'::geometry, 3857),'47','07',0.3),
(48,'0101000020E610000030F187FD1FD206C0C767E1496C9E4540'::geometry,ST_Transform('0101000020E610000030F187FD1FD206C0C767E1496C9E4540'::geometry, 3857),'48','16',0.5),
(49,'0101000020E61000009C22867B12EC17C006C5F40C14DD4440'::geometry,ST_Transform('0101000020E61000009C22867B12EC17C006C5F40C14DD4440'::geometry, 3857),'49','07',0.2),
(50,'0101000020E6100000F7D5EFC62D08F1BF69D1231D68CF4440'::geometry,ST_Transform('0101000020E6100000F7D5EFC62D08F1BF69D1231D68CF4440'::geometry, 3857),'50','02',0.6),
(51,'0101000020E61000005B0E1F8DAA5F15C0530BFE285BF24140'::geometry,ST_Transform('0101000020E61000005B0E1F8DAA5F15C0530BFE285BF24140'::geometry, 3857),'51','18',0.01),
(10,'0101000020E61000000FD65D82AEA418C06192D1351FDB4340'::geometry,ST_Transform('0101000020E61000000FD65D82AEA418C06192D1351FDB4340'::geometry, 3857),'10','11',0.04),
(11,'0101000020E6100000B305531DAB0A17C0DEAFCD4EE5464240'::geometry,ST_Transform('0101000020E6100000B305531DAB0A17C0DEAFCD4EE5464240'::geometry, 3857),'11','01',0.08),
(12,'0101000020E610000059721A7297C9C2BF9EBE383BE51E4440'::geometry,ST_Transform('0101000020E610000059721A7297C9C2BF9EBE383BE51E4440'::geometry, 3857),'12','10',0.2),
(14,'0101000020E610000000C86313AF3C13C0E530879C10FF4240'::geometry,ST_Transform('0101000020E610000000C86313AF3C13C0E530879C10FF4240'::geometry, 3857),'14','01',0.2),
(15,'0101000020E61000002A475497B6ED20C06643D4131A904540'::geometry,ST_Transform('0101000020E61000002A475497B6ED20C06643D4131A904540'::geometry, 3857),'15','12',0.3),
(20,'0101000020E6100000F975566FAD8D01C0E840C33F67924540'::geometry,ST_Transform('0101000020E6100000F975566FAD8D01C0E840C33F67924540'::geometry, 3857),'20','16',0.8),
(23,'0101000020E610000025FA13E595880BC022BB07131D024340'::geometry,ST_Transform('0101000020E610000025FA13E595880BC022BB07131D024340'::geometry, 3857),'23','01',0.1),
(24,'0101000020E61000009C5F91C5095C17C0C78784B15A4F4540'::geometry,ST_Transform('0101000020E61000009C5F91C5095C17C0C78784B15A4F4540'::geometry, 3857),'24','07',0.3),
(29,'0101000020E6100000C34D4A5B48E712C092E680892C684240'::geometry,ST_Transform('0101000020E6100000C34D4A5B48E712C092E680892C684240'::geometry, 3857),'29','01',0.3),
(52,'0101000020E6100000406A545EB29A07C04E5F0BDA39A54140'::geometry,ST_Transform('0101000020E6100000406A545EB29A07C04E5F0BDA39A54140'::geometry, 3857),'52','19',0.01)
\i test/fixtures/ppoints2.sql
-- test table (spanish province centroids with some invented values)
CREATE TABLE ppoints2 (cartodb_id integer, the_geom geometry, code text, region_code text, numerator float, denominator float);
INSERT INTO ppoints2 VALUES
( 1,'0101000020E6100000A8306DC0CBC305C051D14B6CE56A4540'::geometry,'01','16',0.5, 1.0),
( 4,'0101000020E6100000E220A4362DC202C0FD8AFA5119994240'::geometry,'04','01',0.1, 1.0),
( 5,'0101000020E610000004377E573AC813C0CB5871BB17494440'::geometry,'05','07',0.3, 1.0),
( 2,'0101000020E610000000F49BE19BAFFFBF639958FDA6694340'::geometry,'02','08',0.7, 1.0),
( 3,'0101000020E61000005D0B7E63C832E2BFDB63EB00443D4340'::geometry,'03','10',0.2, 1.0),
( 6,'0101000020E61000006F3742B7FB9018C0DD967DC4D95A4340'::geometry,'06','11',0.05, 1.0),
( 7,'0101000020E6100000E4BB36995F4C0740EAC0E5CA9FC94340'::geometry,'07','04',0.4, 1.0),
( 8,'0101000020E61000003D43CC6CAFBEFF3F6B52E66F91DD4440'::geometry,'08','09',0.7, 1.0),
( 9,'0101000020E61000003CC797BD99AF0CC0495A87FA312F4540'::geometry,'09','07',0.5, 1.0),
(13,'0101000020E61000001CAA00A9F19F0EC05DF9267B7A764340'::geometry,'13','08',0.4, 1.0),
(16,'0101000020E6100000D8208F3CBC9001C065638DC1B1F24340'::geometry,'16','08',0.4, 1.0),
(17,'0101000020E6100000E9E6A94A71630540AD7A0CB062104540'::geometry,'17','09',0.6, 1.0),
(18,'0101000020E6100000719792D59E240AC098AC548E00A84240'::geometry,'18','01',0.3, 1.0),
(19,'0101000020E6100000972C878B50FD04C0123C881D1F684440'::geometry,'19','08',0.7, 1.0),
(21,'0101000020E6100000F7893E9934511BC0EAA4BF03E1C94240'::geometry,'21','01',0.1, 1.0),
(22,'0101000020E6100000572C2123B2A8B2BF7ED7FABAFD194540'::geometry,'22','02',0.4, 1.0),
(25,'0101000020E6100000461B67D688C4F03FD990EEC3A0054540'::geometry,'25','09',0.4, 1.0),
(26,'0101000020E6100000A139FB06E82204C0539D84F62E234540'::geometry,'26','17',0.6, 1.0),
(27,'0101000020E6100000A92E54E618C91DC00D3A947B81814540'::geometry,'27','12',0.3, 1.0),
(28,'0101000020E6100000971DC8B682BC0DC016D0E8055F3F4440'::geometry,'28','13',0.8, 1.0),
(30,'0101000020E6100000A2DC1964A8C5F7BF19299C994D004340'::geometry,'30','14',0.1, 1.0),
(31,'0101000020E6100000DCA1FCC87B56FABF9B88E9D866554540'::geometry,'31','15',0.9, 1.0),
(32,'0101000020E6100000E1517AFCD15E1EC0A18D8D4825194540'::geometry,'32','12',0.3, 1.0),
(33,'0101000020E6100000A7FF33825AF917C0FABE7DFB6BA54540'::geometry,'33','03',0.4, 1.0),
(34,'0101000020E6100000FB4E4EBEB72412C0898E7240982F4540'::geometry,'34','07',0.3, 1.0),
(35,'0101000020E6100000224682B01B1A2DC011091656CC5C3C40'::geometry,'35','05',0.3, 1.0),
(36,'0101000020E6100000F7C9447110EC20C04C5D4823C7374540'::geometry,'36','12',0.2, 1.0),
(37,'0101000020E610000053D6A26DFB4218C09D58FAE209674440'::geometry,'37','07',0.5, 1.0),
(38,'0101000020E6100000B1D1B5FC910431C03C0C89BA03503C40'::geometry,'38','05',0.4, 1.0),
(39,'0101000020E610000086E6FEE1BD1E10C00417096748994540'::geometry,'39','06',0.6, 1.0),
(40,'0101000020E6100000FB51C33F733710C038D01729E4954440'::geometry,'40','07',0.5, 1.0),
(41,'0101000020E6100000912D6FDA28BB16C031321F08C4B74240'::geometry,'41','01',0.4, 1.0),
(42,'0101000020E6100000554432EABEB504C069ECD78775CF4440'::geometry,'42','07',0.2, 1.0),
(43,'0101000020E6100000157F117C1A2EEA3F027CD1F2368B4440'::geometry,'43','09',0.3, 1.0),
(44,'0101000020E610000051AA5B1BD718EABFEE67613BA4544440'::geometry,'44','02',0.2, 1.0),
(45,'0101000020E610000022C5C01BB69710C08563BC1499E54340'::geometry,'45','08',0.3, 1.0),
(46,'0101000020E6100000D5FCF78A11A0E9BFDEA46F8E64AF4340'::geometry,'46','10',0.2, 1.0),
(47,'0101000020E61000003AE63525866313C02100050B2BD14440'::geometry,'47','07',0.3, 1.0),
(48,'0101000020E610000030F187FD1FD206C0C767E1496C9E4540'::geometry,'48','16',0.5, 1.0),
(49,'0101000020E61000009C22867B12EC17C006C5F40C14DD4440'::geometry,'49','07',0.2, 1.0),
(50,'0101000020E6100000F7D5EFC62D08F1BF69D1231D68CF4440'::geometry,'50','02',0.6, 1.0),
(51,'0101000020E61000005B0E1F8DAA5F15C0530BFE285BF24140'::geometry,'51','18',0.01, 1.0),
(10,'0101000020E61000000FD65D82AEA418C06192D1351FDB4340'::geometry,'10','11',0.04, 1.0),
(11,'0101000020E6100000B305531DAB0A17C0DEAFCD4EE5464240'::geometry,'11','01',0.08, 1.0),
(12,'0101000020E610000059721A7297C9C2BF9EBE383BE51E4440'::geometry,'12','10',0.2, 1.0),
(14,'0101000020E610000000C86313AF3C13C0E530879C10FF4240'::geometry,'14','01',0.2, 1.0),
(15,'0101000020E61000002A475497B6ED20C06643D4131A904540'::geometry,'15','12',0.3, 1.0),
(20,'0101000020E6100000F975566FAD8D01C0E840C33F67924540'::geometry,'20','16',0.8, 1.0),
(23,'0101000020E610000025FA13E595880BC022BB07131D024340'::geometry,'23','01',0.1, 1.0),
(24,'0101000020E61000009C5F91C5095C17C0C78784B15A4F4540'::geometry,'24','07',0.3, 1.0),
(29,'0101000020E6100000C34D4A5B48E712C092E680892C684240'::geometry,'29','01',0.3, 1.0),
(52,'0101000020E6100000406A545EB29A07C04E5F0BDA39A54140'::geometry,'52','19',0.0, 1.01)
-- Moral functions perform some nondeterministic computations
-- (to estimate the significance); we will set the seeds for the RNGs
-- that affect those results to have repeateble results
SELECT cdb_crankshaft._cdb_random_seeds(1234);
_cdb_random_seeds
-------------------
(1 row)
SELECT ppoints.code, m.quads
FROM ppoints
JOIN cdb_crankshaft.cdb_moran_local('ppoints', 'value') m
ON ppoints.cartodb_id = m.ids
ORDER BY ppoints.code;
NOTICE: ** Constructing query
CONTEXT: PL/Python function "cdb_moran_local"
NOTICE: ** Query returned with 52 rows
CONTEXT: PL/Python function "cdb_moran_local"
NOTICE: ** Finished calculations
CONTEXT: PL/Python function "cdb_moran_local"
code | quads
------+-----------------
01 | HH
02 | HL
03 | Not significant
04 | Not significant
05 | Not significant
06 | Not significant
07 | Not significant
08 | Not significant
09 | Not significant
10 | Not significant
11 | LL
12 | Not significant
13 | Not significant
14 | Not significant
15 | Not significant
16 | HH
17 | Not significant
18 | Not significant
19 | Not significant
20 | HH
21 | LL
22 | Not significant
23 | Not significant
24 | Not significant
25 | HH
26 | HH
27 | Not significant
28 | Not significant
29 | LL
30 | Not significant
31 | HH
32 | Not significant
33 | Not significant
34 | Not significant
35 | LL
36 | Not significant
37 | Not significant
38 | HL
39 | Not significant
40 | Not significant
41 | HL
42 | LH
43 | Not significant
44 | Not significant
45 | LH
46 | Not significant
47 | Not significant
48 | HH
49 | Not significant
50 | Not significant
51 | LL
52 | LL
(52 rows)
SELECT cdb_crankshaft._cdb_random_seeds(1234);
_cdb_random_seeds
-------------------
(1 row)
SELECT ppoints2.code, m.quads
FROM ppoints2
JOIN cdb_crankshaft.cdb_moran_local_rate('ppoints2', 'numerator', 'denominator') m
ON ppoints2.cartodb_id = m.ids
ORDER BY ppoints2.code;
NOTICE: ** Constructing query
CONTEXT: PL/Python function "cdb_moran_local_rate"
NOTICE: ** Query returned with 51 rows
CONTEXT: PL/Python function "cdb_moran_local_rate"
NOTICE: ** Finished calculations
CONTEXT: PL/Python function "cdb_moran_local_rate"
code | quads
------+-----------------
01 | LL
02 | Not significant
03 | Not significant
04 | Not significant
05 | Not significant
06 | Not significant
07 | Not significant
08 | Not significant
09 | LL
10 | Not significant
11 | HH
12 | Not significant
13 | Not significant
14 | Not significant
15 | Not significant
16 | Not significant
17 | LL
18 | Not significant
19 | Not significant
20 | LL
21 | Not significant
22 | Not significant
23 | Not significant
24 | Not significant
25 | LL
26 | LL
27 | Not significant
28 | Not significant
29 | LH
30 | Not significant
31 | LL
32 | Not significant
33 | Not significant
34 | Not significant
35 | LH
36 | Not significant
37 | Not significant
38 | LH
39 | Not significant
40 | Not significant
41 | LH
42 | HL
43 | Not significant
44 | Not significant
45 | LL
46 | Not significant
47 | Not significant
48 | LL
49 | Not significant
50 | Not significant
51 | Not significant
(51 rows)

View File

@@ -1,23 +0,0 @@
\i test/fixtures/polyg_values.sql
CREATE TABLE values (cartodb_id integer, value float, the_geom geometry);
INSERT INTO values(cartodb_id, value, the_geom) VALUES
(1,10,'0106000020E61000000100000001030000000100000005000000E5AF3500C03608C08068629111374440C7BC0A00C00F02C0AC0551523B414440C7BC0A00C0A700C0CAF23B6E74FB4340A7267FFFFF5206C0FBB7E41B7EE74340E5AF3500C03608C08068629111374440'::geometry),
(2,20,'0106000020E610000001000000010300000001000000050000002439EC00804AF7BF07D6CCB5C3064440C7BC0A00C0A700C0CAF23B6E74FB4340C7BC0A00C00F02C0AC0551523B414440E20CD5FFFF30FABFBE4F76AFEA4B44402439EC00804AF7BF07D6CCB5C3064440'::geometry)
SELECT round(cdb_crankshaft.cdb_overlap_sum(
'0106000020E61000000100000001030000000100000004000000FFFFFFFFFF3604C09A0B9ECEC42E444000000000C060FBBF30C7FD70E01D44400000000040AD02C06481F1C8CD034440FFFFFFFFFF3604C09A0B9ECEC42E4440'::geometry,
'values', 'value'
), 2);
round
-------
4.42
(1 row)
SELECT round(cdb_crankshaft.cdb_overlap_sum(
'0106000020E61000000100000001030000000100000004000000FFFFFFFFFF3604C09A0B9ECEC42E444000000000C060FBBF30C7FD70E01D44400000000040AD02C06481F1C8CD034440FFFFFFFFFF3604C09A0B9ECEC42E4440'::geometry,
'values', 'value', schema_name := 'public'
), 2);
round
-------
4.42
(1 row)

View File

@@ -1,6 +0,0 @@
-- Install dependencies
CREATE EXTENSION plpythonu;
CREATE EXTENSION postgis;
CREATE EXTENSION cartodb;
-- Install the extension
CREATE EXTENSION crankshaft;

View File

@@ -1,21 +0,0 @@
\i test/fixtures/ppoints.sql
\i test/fixtures/ppoints2.sql
-- Moral functions perform some nondeterministic computations
-- (to estimate the significance); we will set the seeds for the RNGs
-- that affect those results to have repeateble results
SELECT cdb_crankshaft._cdb_random_seeds(1234);
SELECT ppoints.code, m.quads
FROM ppoints
JOIN cdb_crankshaft.cdb_moran_local('ppoints', 'value') m
ON ppoints.cartodb_id = m.ids
ORDER BY ppoints.code;
SELECT cdb_crankshaft._cdb_random_seeds(1234);
SELECT ppoints2.code, m.quads
FROM ppoints2
JOIN cdb_crankshaft.cdb_moran_local_rate('ppoints2', 'numerator', 'denominator') m
ON ppoints2.cartodb_id = m.ids
ORDER BY ppoints2.code;

1
python/.gitignore vendored
View File

@@ -1 +0,0 @@
*.pyc

View File

@@ -1,11 +0,0 @@
# Install the package (needs root privileges)
install:
pip install ./crankshaft --upgrade
# Test from source code
test:
(cd crankshaft && nosetests test/)
# Test currently installed package
testinstalled:
nosetests crankshaft/test/

View File

@@ -1,9 +0,0 @@
# Crankshaft Python Package
...
### Run the tests
```bash
cd crankshaft
nosetests test/
```

0
release/.gitignore vendored Normal file
View File

View File

@@ -0,0 +1,74 @@
CREATE OR REPLACE FUNCTION cdb_crankshaft.cdb_crankshaft_version()
RETURNS text AS $$
SELECT '0.0.2'::text;
$$ language 'sql' STABLE STRICT;
CREATE OR REPLACE FUNCTION cdb_crankshaft._cdb_crankshaft_internal_version()
RETURNS text AS $$
SELECT installed_version FROM pg_available_extensions where name='crankshaft' and pg_available_extensions IS NOT NULL;
$$ language 'sql' STABLE STRICT;
CREATE OR REPLACE FUNCTION cdb_crankshaft._cdb_crankshaft_virtualenvs_path()
RETURNS text
AS $$
BEGIN
RETURN '/home/ubuntu/crankshaft/envs';
END;
$$ language plpgsql IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION cdb_crankshaft._cdb_crankshaft_activate_py()
RETURNS VOID
AS $$
import os
# plpy.notice('%',str(os.environ))
# activate virtualenv
crankshaft_version = plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_internal_version()')[0]['_cdb_crankshaft_internal_version']
base_path = plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_virtualenvs_path()')[0]['_cdb_crankshaft_virtualenvs_path']
default_venv_path = os.path.join(base_path, crankshaft_version)
venv_path = os.environ.get('CRANKSHAFT_VENV', default_venv_path)
activate_path = venv_path + '/bin/activate_this.py'
exec(open(activate_path).read(), dict(__file__=activate_path))
$$ LANGUAGE plpythonu;
CREATE OR REPLACE FUNCTION
cdb_crankshaft._cdb_random_seeds (seed_value INTEGER) RETURNS VOID
AS $$
plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
from crankshaft import random_seeds
random_seeds.set_random_seeds(seed_value)
$$ LANGUAGE plpythonu;
-- Moran's I
CREATE OR REPLACE FUNCTION
cdb_crankshaft.cdb_moran_local (
t TEXT,
attr TEXT,
significance float DEFAULT 0.05,
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_column TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id',
w_type TEXT DEFAULT 'knn')
RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT)
AS $$
plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
from crankshaft.clustering import moran_local
# TODO: use named parameters or a dictionary
return moran_local(t, attr, significance, num_ngbrs, permutations, geom_column, id_col, w_type)
$$ LANGUAGE plpythonu;
CREATE OR REPLACE FUNCTION
cdb_crankshaft.cdb_moran_local_rate(t TEXT,
numerator TEXT,
denominator TEXT,
significance FLOAT DEFAULT 0.05,
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_column TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id',
w_type TEXT DEFAULT 'knn')
RETURNS TABLE(moran FLOAT, quads TEXT, significance FLOAT, ids INT, y numeric)
AS $$
plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
from crankshaft.clustering import moran_local_rate
# TODO: use named parameters or a dictionary
return moran_local_rate(t, numerator, denominator, significance, num_ngbrs, permutations, geom_column, id_col, w_type)
$$ LANGUAGE plpythonu;

View File

@@ -1,6 +1,12 @@
-- Moran's I
CREATE OR REPLACE FUNCTION
cdb_moran_local (
cdb_crankshaft._cdb_random_seeds (seed_value INTEGER) RETURNS VOID
AS $$
from crankshaft import random_seeds
random_seeds.set_random_seeds(seed_value)
$$ LANGUAGE plpythonu;
CREATE OR REPLACE FUNCTION
cdb_crankshaft.cdb_moran_local (
t TEXT,
attr TEXT,
significance float DEFAULT 0.05,
@@ -16,9 +22,8 @@ AS $$
return moran_local(t, attr, significance, num_ngbrs, permutations, geom_column, id_col, w_type)
$$ LANGUAGE plpythonu;
-- Moran's I Local Rate
CREATE OR REPLACE FUNCTION
cdb_moran_local_rate(t TEXT,
cdb_crankshaft.cdb_moran_local_rate(t TEXT,
numerator TEXT,
denominator TEXT,
significance FLOAT DEFAULT 0.05,
@@ -33,3 +38,7 @@ AS $$
# TODO: use named parameters or a dictionary
return moran_local_rate(t, numerator, denominator, significance, num_ngbrs, permutations, geom_column, id_col, w_type)
$$ LANGUAGE plpythonu;
DROP FUNCTION IF EXISTS cdb_crankshaft.cdb_crankshaft_version();
DROP FUNCTION IF EXISTS cdb_crankshaft._cdb_crankshaft_internal_version();
DROP FUNCTION IF EXISTS cdb_crankshaft._cdb_crankshaft_activate_py();

View File

@@ -0,0 +1,413 @@
--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES
-- Complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION crankshaft" to load this file. \quit
-- [MANUALLY] DROP FUNCTIONS REMOVED SINCE 0.0.2 version
DROP FUNCTION IF EXISTS cdb_moran_local(TEXT, TEXT, float, INT, INT, TEXT, TEXT, TEXT);
DROP FUNCTION IF EXISTS cdb_moran_local_rate(TEXT, TEXT, TEXT, FLOAT, INT, INT, TEXT, TEXT, TEXT);
DROP FUNCTION IF EXISTS _cdb_crankshaft_virtualenvs_path();
DROP FUNCTION IF EXISTS _cdb_crankshaft_activate_py();
-- [END MANUALLY] DROP FUNCTIONS REMOVED SINCE 0.0.2 version
-- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$
SELECT '0.0.3'::text;
$$ language 'sql' STABLE STRICT;
-- Internal identifier of the installed extension instence
-- e.g. 'dev' for current development version
CREATE OR REPLACE FUNCTION _cdb_crankshaft_internal_version()
RETURNS text AS $$
SELECT installed_version FROM pg_available_extensions where name='crankshaft' and pg_available_extensions IS NOT NULL;
$$ language 'sql' STABLE STRICT;
-- Internal function.
-- Set the seeds of the RNGs (Random Number Generators)
-- used internally.
CREATE OR REPLACE FUNCTION
_cdb_random_seeds (seed_value INTEGER) RETURNS VOID
AS $$
from crankshaft import random_seeds
random_seeds.set_random_seeds(seed_value)
$$ LANGUAGE plpythonu;
-- Moran's I Global Measure (public-facing)
CREATE OR REPLACE FUNCTION
CDB_AreasOfInterestGlobal(
subquery TEXT,
column_name TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, significance NUMERIC)
AS $$
from crankshaft.clustering import moran_local
# TODO: use named parameters or a dictionary
return moran(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;
-- Moran's I Local (internal function)
CREATE OR REPLACE FUNCTION
_CDB_AreasOfInterestLocal(
subquery TEXT,
column_name TEXT,
w_type TEXT,
num_ngbrs INT,
permutations INT,
geom_col TEXT,
id_col TEXT)
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
from crankshaft.clustering import moran_local
# TODO: use named parameters or a dictionary
return moran_local(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;
-- Moran's I Local (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_AreasOfInterestLocal(
subquery TEXT,
column_name TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col);
$$ LANGUAGE SQL;
-- Moran's I only for HH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialHotspots(
subquery TEXT,
column_name TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('HH', 'HL');
$$ LANGUAGE SQL;
-- Moran's I only for LL and LH (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialColdspots(
subquery TEXT,
attr TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('LL', 'LH');
$$ LANGUAGE SQL;
-- Moran's I only for LH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialOutliers(
subquery TEXT,
attr TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('HL', 'LH');
$$ LANGUAGE SQL;
-- Moran's I Global Rate (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_AreasOfInterestGlobalRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran FLOAT, significance FLOAT)
AS $$
from crankshaft.clustering import moran_local
# TODO: use named parameters or a dictionary
return moran_rate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;
-- Moran's I Local Rate (internal function)
CREATE OR REPLACE FUNCTION
_CDB_AreasOfInterestLocalRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT,
num_ngbrs INT,
permutations INT,
geom_col TEXT,
id_col TEXT)
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
from crankshaft.clustering import moran_local_rate
# TODO: use named parameters or a dictionary
return moran_local_rate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;
-- Moran's I Local Rate (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_AreasOfInterestLocalRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col);
$$ LANGUAGE SQL;
-- Moran's I Local Rate only for HH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialHotspotsRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('HH', 'HL');
$$ LANGUAGE SQL;
-- Moran's I Local Rate only for LL and LH (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialColdspotsRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('LL', 'LH');
$$ LANGUAGE SQL;
-- Moran's I Local Rate only for LH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialOutliersRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('HL', 'LH');
$$ LANGUAGE SQL;
CREATE OR REPLACE FUNCTION CDB_KMeans(query text, no_clusters integer,no_init integer default 20)
RETURNS table (cartodb_id integer, cluster_no integer) as $$
from crankshaft.clustering import kmeans
return kmeans(query,no_clusters,no_init)
$$ language plpythonu;
CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(state Numeric[],the_geom GEOMETRY(Point, 4326), weight NUMERIC)
RETURNS Numeric[] AS
$$
DECLARE
newX NUMERIC;
newY NUMERIC;
newW NUMERIC;
BEGIN
IF weight IS NULL OR the_geom IS NULL THEN
newX = state[1];
newY = state[2];
newW = state[3];
ELSE
newX = state[1] + ST_X(the_geom)*weight;
newY = state[2] + ST_Y(the_geom)*weight;
newW = state[3] + weight;
END IF;
RETURN Array[newX,newY,newW];
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION CDB_WeightedMeanF(state Numeric[])
RETURNS GEOMETRY AS
$$
BEGIN
IF state[3] = 0 THEN
RETURN ST_SetSRID(ST_MakePoint(state[1],state[2]), 4326);
ELSE
RETURN ST_SETSRID(ST_MakePoint(state[1]/state[3], state[2]/state[3]),4326);
END IF;
END
$$ LANGUAGE plpgsql;
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC)(
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
INITCOND = "{0.0,0.0,0.0}"
);
-- Function by Stuart Lynn for a simple interpolation of a value
-- from a polygon table over an arbitrary polygon
-- (weighted by the area proportion overlapped)
-- Aereal weighting is a very simple form of aereal interpolation.
--
-- Parameters:
-- * geom a Polygon geometry which defines the area where a value will be
-- estimated as the area-weighted sum of a given table/column
-- * target_table_name table name of the table that provides the values
-- * target_column column name of the column that provides the values
-- * schema_name optional parameter to defina the schema the target table
-- belongs to, which is necessary if its not in the search_path.
-- Note that target_table_name should never include the schema in it.
-- Return value:
-- Aereal-weighted interpolation of the column values over the geometry
CREATE OR REPLACE
FUNCTION cdb_overlap_sum(geom geometry, target_table_name text, target_column text, schema_name text DEFAULT NULL)
RETURNS numeric AS
$$
DECLARE
result numeric;
qualified_name text;
BEGIN
IF schema_name IS NULL THEN
qualified_name := Format('%I', target_table_name);
ELSE
qualified_name := Format('%I.%s', schema_name, target_table_name);
END IF;
EXECUTE Format('
SELECT sum(%I*ST_Area(St_Intersection($1, a.the_geom))/ST_Area(a.the_geom))
FROM %s AS a
WHERE $1 && a.the_geom
', target_column, qualified_name)
USING geom
INTO result;
RETURN result;
END;
$$ LANGUAGE plpgsql;
--
-- Creates N points randomly distributed arround the polygon
--
-- @param g - the geometry to be turned in to points
--
-- @param no_points - the number of points to generate
--
-- @params max_iter_per_point - the function generates points in the polygon's bounding box
-- and discards points which don't lie in the polygon. max_iter_per_point specifies how many
-- misses per point the funciton accepts before giving up.
--
-- Returns: Multipoint with the requested points
CREATE OR REPLACE FUNCTION cdb_dot_density(geom geometry , no_points Integer, max_iter_per_point Integer DEFAULT 1000)
RETURNS GEOMETRY AS $$
DECLARE
extent GEOMETRY;
test_point Geometry;
width NUMERIC;
height NUMERIC;
x0 NUMERIC;
y0 NUMERIC;
xp NUMERIC;
yp NUMERIC;
no_left INTEGER;
remaining_iterations INTEGER;
points GEOMETRY[];
bbox_line GEOMETRY;
intersection_line GEOMETRY;
BEGIN
extent := ST_Envelope(geom);
width := ST_XMax(extent) - ST_XMIN(extent);
height := ST_YMax(extent) - ST_YMIN(extent);
x0 := ST_XMin(extent);
y0 := ST_YMin(extent);
no_left := no_points;
LOOP
if(no_left=0) THEN
EXIT;
END IF;
yp = y0 + height*random();
bbox_line = ST_MakeLine(
ST_SetSRID(ST_MakePoint(yp, x0),4326),
ST_SetSRID(ST_MakePoint(yp, x0+width),4326)
);
intersection_line = ST_Intersection(bbox_line,geom);
test_point = ST_LineInterpolatePoint(st_makeline(st_linemerge(intersection_line)),random());
points := points || test_point;
no_left = no_left - 1 ;
END LOOP;
RETURN ST_Collect(points);
END;
$$
LANGUAGE plpgsql VOLATILE;
-- Make sure by default there are no permissions for publicuser
-- NOTE: this happens at extension creation time, as part of an implicit transaction.
-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;
-- Grant permissions on the schema to publicuser (but just the schema)
GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser;
-- Revoke execute permissions on all functions in the schema by default
-- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser;

View File

@@ -0,0 +1,186 @@
--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES
-- Complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION crankshaft" to load this file. \quit
-- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$
SELECT '0.0.2'::text;
$$ language 'sql' STABLE STRICT;
-- Internal identifier of the installed extension instence
-- e.g. 'dev' for current development version
CREATE OR REPLACE FUNCTION _cdb_crankshaft_internal_version()
RETURNS text AS $$
SELECT installed_version FROM pg_available_extensions where name='crankshaft' and pg_available_extensions IS NOT NULL;
$$ language 'sql' STABLE STRICT;
CREATE OR REPLACE FUNCTION _cdb_crankshaft_virtualenvs_path()
RETURNS text
AS $$
BEGIN
-- RETURN '/opt/virtualenvs/crankshaft';
RETURN '/home/ubuntu/crankshaft/envs';
END;
$$ language plpgsql IMMUTABLE STRICT;
-- Use the crankshaft python module
CREATE OR REPLACE FUNCTION _cdb_crankshaft_activate_py()
RETURNS VOID
AS $$
import os
# plpy.notice('%',str(os.environ))
# activate virtualenv
crankshaft_version = plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_internal_version()')[0]['_cdb_crankshaft_internal_version']
base_path = plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_virtualenvs_path()')[0]['_cdb_crankshaft_virtualenvs_path']
default_venv_path = os.path.join(base_path, crankshaft_version)
venv_path = os.environ.get('CRANKSHAFT_VENV', default_venv_path)
activate_path = venv_path + '/bin/activate_this.py'
exec(open(activate_path).read(), dict(__file__=activate_path))
$$ LANGUAGE plpythonu;
-- Internal function.
-- Set the seeds of the RNGs (Random Number Generators)
-- used internally.
CREATE OR REPLACE FUNCTION
_cdb_random_seeds (seed_value INTEGER) RETURNS VOID
AS $$
plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
from crankshaft import random_seeds
random_seeds.set_random_seeds(seed_value)
$$ LANGUAGE plpythonu;
-- Moran's I
CREATE OR REPLACE FUNCTION
cdb_moran_local (
t TEXT,
attr TEXT,
significance float DEFAULT 0.05,
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_column TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id',
w_type TEXT DEFAULT 'knn')
RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT)
AS $$
plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
from crankshaft.clustering import moran_local
# TODO: use named parameters or a dictionary
return moran_local(t, attr, significance, num_ngbrs, permutations, geom_column, id_col, w_type)
$$ LANGUAGE plpythonu;
-- Moran's I Local Rate
CREATE OR REPLACE FUNCTION
cdb_moran_local_rate(t TEXT,
numerator TEXT,
denominator TEXT,
significance FLOAT DEFAULT 0.05,
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_column TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id',
w_type TEXT DEFAULT 'knn')
RETURNS TABLE(moran FLOAT, quads TEXT, significance FLOAT, ids INT, y numeric)
AS $$
plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
from crankshaft.clustering import moran_local_rate
# TODO: use named parameters or a dictionary
return moran_local_rate(t, numerator, denominator, significance, num_ngbrs, permutations, geom_column, id_col, w_type)
$$ LANGUAGE plpythonu;
-- Function by Stuart Lynn for a simple interpolation of a value
-- from a polygon table over an arbitrary polygon
-- (weighted by the area proportion overlapped)
-- Aereal weighting is a very simple form of aereal interpolation.
--
-- Parameters:
-- * geom a Polygon geometry which defines the area where a value will be
-- estimated as the area-weighted sum of a given table/column
-- * target_table_name table name of the table that provides the values
-- * target_column column name of the column that provides the values
-- * schema_name optional parameter to defina the schema the target table
-- belongs to, which is necessary if its not in the search_path.
-- Note that target_table_name should never include the schema in it.
-- Return value:
-- Aereal-weighted interpolation of the column values over the geometry
CREATE OR REPLACE
FUNCTION cdb_overlap_sum(geom geometry, target_table_name text, target_column text, schema_name text DEFAULT NULL)
RETURNS numeric AS
$$
DECLARE
result numeric;
qualified_name text;
BEGIN
IF schema_name IS NULL THEN
qualified_name := Format('%I', target_table_name);
ELSE
qualified_name := Format('%I.%s', schema_name, target_table_name);
END IF;
EXECUTE Format('
SELECT sum(%I*ST_Area(St_Intersection($1, a.the_geom))/ST_Area(a.the_geom))
FROM %s AS a
WHERE $1 && a.the_geom
', target_column, qualified_name)
USING geom
INTO result;
RETURN result;
END;
$$ LANGUAGE plpgsql;
--
-- Creates N points randomly distributed arround the polygon
--
-- @param g - the geometry to be turned in to points
--
-- @param no_points - the number of points to generate
--
-- @params max_iter_per_point - the function generates points in the polygon's bounding box
-- and discards points which don't lie in the polygon. max_iter_per_point specifies how many
-- misses per point the funciton accepts before giving up.
--
-- Returns: Multipoint with the requested points
CREATE OR REPLACE FUNCTION cdb_dot_density(geom geometry , no_points Integer, max_iter_per_point Integer DEFAULT 1000)
RETURNS GEOMETRY AS $$
DECLARE
extent GEOMETRY;
test_point Geometry;
width NUMERIC;
height NUMERIC;
x0 NUMERIC;
y0 NUMERIC;
xp NUMERIC;
yp NUMERIC;
no_left INTEGER;
remaining_iterations INTEGER;
points GEOMETRY[];
bbox_line GEOMETRY;
intersection_line GEOMETRY;
BEGIN
extent := ST_Envelope(geom);
width := ST_XMax(extent) - ST_XMIN(extent);
height := ST_YMax(extent) - ST_YMIN(extent);
x0 := ST_XMin(extent);
y0 := ST_YMin(extent);
no_left := no_points;
LOOP
if(no_left=0) THEN
EXIT;
END IF;
yp = y0 + height*random();
bbox_line = ST_MakeLine(
ST_SetSRID(ST_MakePoint(yp, x0),4326),
ST_SetSRID(ST_MakePoint(yp, x0+width),4326)
);
intersection_line = ST_Intersection(bbox_line,geom);
test_point = ST_LineInterpolatePoint(st_makeline(st_linemerge(intersection_line)),random());
points := points || test_point;
no_left = no_left - 1 ;
END LOOP;
RETURN ST_Collect(points);
END;
$$
LANGUAGE plpgsql VOLATILE;
-- Make sure by default there are no permissions for publicuser
-- NOTE: this happens at extension creation time, as part of an implicit transaction.
-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;
-- Grant permissions on the schema to publicuser (but just the schema)
GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser;
-- Revoke execute permissions on all functions in the schema by default
-- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser;

View File

@@ -0,0 +1,209 @@
--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES
-- Complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION crankshaft" to load this file. \quit
-- [MANUALLY] DROP FUNCTIONS INTRODUCED IN 0.0.3 version
DROP FUNCTION IF EXISTS CDB_AreasOfInterestGlobal(TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT);
DROP FUNCTION IF EXISTS _CDB_AreasOfInterestLocal(TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT);
DROP FUNCTION IF EXISTS CDB_AreasOfInterestLocal(TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT);
DROP FUNCTION IF EXISTS CDB_GetSpatialHotspots(TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT);
DROP FUNCTION IF EXISTS CDB_GetSpatialColdspots(TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT);
DROP FUNCTION IF EXISTS CDB_GetSpatialOutliers(TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT);
DROP FUNCTION IF EXISTS CDB_AreasOfInterestGlobalRate(TEXT,TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT);
DROP FUNCTION IF EXISTS CDB_AreasOfInterestLocalRate(TEXT,TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT);
DROP FUNCTION IF EXISTS _CDB_AreasOfInterestLocalRate(TEXT,TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT);
DROP FUNCTION IF EXISTS CDB_GetSpatialHotspotsRate(TEXT,TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT);
DROP FUNCTION IF EXISTS CDB_GetSpatialColdspotsRate(TEXT,TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT);
DROP FUNCTION IF EXISTS CDB_GetSpatialOutliersRate(TEXT,TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT);
DROP FUNCTION IF EXISTS CDB_KMeans(text,integer,integer);
DROP AGGREGATE IF EXISTS CDB_WeightedMean(geometry(Point, 4326), NUMERIC);
DROP FUNCTION IF EXISTS CDB_WeightedMeanS(Numeric[], GEOMETRY(Point, 4326), NUMERIC);
DROP FUNCTION IF EXISTS CDB_WeightedMeanF(Numeric[]);
-- [END MANUALLY] DROP FUNCTIONS INTRODUCED IN 0.0.3 version
-- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$
SELECT '0.0.2'::text;
$$ language 'sql' STABLE STRICT;
-- Internal identifier of the installed extension instence
-- e.g. 'dev' for current development version
CREATE OR REPLACE FUNCTION _cdb_crankshaft_internal_version()
RETURNS text AS $$
SELECT installed_version FROM pg_available_extensions where name='crankshaft' and pg_available_extensions IS NOT NULL;
$$ language 'sql' STABLE STRICT;
CREATE OR REPLACE FUNCTION _cdb_crankshaft_virtualenvs_path()
RETURNS text
AS $$
BEGIN
-- RETURN '/opt/virtualenvs/crankshaft';
RETURN '/home/ubuntu/crankshaft/envs';
END;
$$ language plpgsql IMMUTABLE STRICT;
-- Use the crankshaft python module
CREATE OR REPLACE FUNCTION _cdb_crankshaft_activate_py()
RETURNS VOID
AS $$
import os
# plpy.notice('%',str(os.environ))
# activate virtualenv
crankshaft_version = plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_internal_version()')[0]['_cdb_crankshaft_internal_version']
base_path = plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_virtualenvs_path()')[0]['_cdb_crankshaft_virtualenvs_path']
default_venv_path = os.path.join(base_path, crankshaft_version)
venv_path = os.environ.get('CRANKSHAFT_VENV', default_venv_path)
activate_path = venv_path + '/bin/activate_this.py'
exec(open(activate_path).read(), dict(__file__=activate_path))
$$ LANGUAGE plpythonu;
-- Internal function.
-- Set the seeds of the RNGs (Random Number Generators)
-- used internally.
CREATE OR REPLACE FUNCTION
_cdb_random_seeds (seed_value INTEGER) RETURNS VOID
AS $$
plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
from crankshaft import random_seeds
random_seeds.set_random_seeds(seed_value)
$$ LANGUAGE plpythonu;
-- Moran's I
CREATE OR REPLACE FUNCTION
cdb_moran_local (
t TEXT,
attr TEXT,
significance float DEFAULT 0.05,
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_column TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id',
w_type TEXT DEFAULT 'knn')
RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT)
AS $$
plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
from crankshaft.clustering import moran_local
# TODO: use named parameters or a dictionary
return moran_local(t, attr, significance, num_ngbrs, permutations, geom_column, id_col, w_type)
$$ LANGUAGE plpythonu;
-- Moran's I Local Rate
CREATE OR REPLACE FUNCTION
cdb_moran_local_rate(t TEXT,
numerator TEXT,
denominator TEXT,
significance FLOAT DEFAULT 0.05,
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_column TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id',
w_type TEXT DEFAULT 'knn')
RETURNS TABLE(moran FLOAT, quads TEXT, significance FLOAT, ids INT, y numeric)
AS $$
plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
from crankshaft.clustering import moran_local_rate
# TODO: use named parameters or a dictionary
return moran_local_rate(t, numerator, denominator, significance, num_ngbrs, permutations, geom_column, id_col, w_type)
$$ LANGUAGE plpythonu;
-- Function by Stuart Lynn for a simple interpolation of a value
-- from a polygon table over an arbitrary polygon
-- (weighted by the area proportion overlapped)
-- Aereal weighting is a very simple form of aereal interpolation.
--
-- Parameters:
-- * geom a Polygon geometry which defines the area where a value will be
-- estimated as the area-weighted sum of a given table/column
-- * target_table_name table name of the table that provides the values
-- * target_column column name of the column that provides the values
-- * schema_name optional parameter to defina the schema the target table
-- belongs to, which is necessary if its not in the search_path.
-- Note that target_table_name should never include the schema in it.
-- Return value:
-- Aereal-weighted interpolation of the column values over the geometry
CREATE OR REPLACE
FUNCTION cdb_overlap_sum(geom geometry, target_table_name text, target_column text, schema_name text DEFAULT NULL)
RETURNS numeric AS
$$
DECLARE
result numeric;
qualified_name text;
BEGIN
IF schema_name IS NULL THEN
qualified_name := Format('%I', target_table_name);
ELSE
qualified_name := Format('%I.%s', schema_name, target_table_name);
END IF;
EXECUTE Format('
SELECT sum(%I*ST_Area(St_Intersection($1, a.the_geom))/ST_Area(a.the_geom))
FROM %s AS a
WHERE $1 && a.the_geom
', target_column, qualified_name)
USING geom
INTO result;
RETURN result;
END;
$$ LANGUAGE plpgsql;
--
-- Creates N points randomly distributed arround the polygon
--
-- @param g - the geometry to be turned in to points
--
-- @param no_points - the number of points to generate
--
-- @params max_iter_per_point - the function generates points in the polygon's bounding box
-- and discards points which don't lie in the polygon. max_iter_per_point specifies how many
-- misses per point the funciton accepts before giving up.
--
-- Returns: Multipoint with the requested points
CREATE OR REPLACE FUNCTION cdb_dot_density(geom geometry , no_points Integer, max_iter_per_point Integer DEFAULT 1000)
RETURNS GEOMETRY AS $$
DECLARE
extent GEOMETRY;
test_point Geometry;
width NUMERIC;
height NUMERIC;
x0 NUMERIC;
y0 NUMERIC;
xp NUMERIC;
yp NUMERIC;
no_left INTEGER;
remaining_iterations INTEGER;
points GEOMETRY[];
bbox_line GEOMETRY;
intersection_line GEOMETRY;
BEGIN
extent := ST_Envelope(geom);
width := ST_XMax(extent) - ST_XMIN(extent);
height := ST_YMax(extent) - ST_YMIN(extent);
x0 := ST_XMin(extent);
y0 := ST_YMin(extent);
no_left := no_points;
LOOP
if(no_left=0) THEN
EXIT;
END IF;
yp = y0 + height*random();
bbox_line = ST_MakeLine(
ST_SetSRID(ST_MakePoint(yp, x0),4326),
ST_SetSRID(ST_MakePoint(yp, x0+width),4326)
);
intersection_line = ST_Intersection(bbox_line,geom);
test_point = ST_LineInterpolatePoint(st_makeline(st_linemerge(intersection_line)),random());
points := points || test_point;
no_left = no_left - 1 ;
END LOOP;
RETURN ST_Collect(points);
END;
$$
LANGUAGE plpgsql VOLATILE;
-- Make sure by default there are no permissions for publicuser
-- NOTE: this happens at extension creation time, as part of an implicit transaction.
-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;
-- Grant permissions on the schema to publicuser (but just the schema)
GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser;
-- Revoke execute permissions on all functions in the schema by default
-- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser;

View File

@@ -0,0 +1,8 @@
--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES
-- Complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION crankshaft" to load this file. \quit
-- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$
SELECT '0.0.4'::text;
$$ language 'sql' STABLE STRICT;

View File

@@ -0,0 +1,403 @@
--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES
-- Complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION crankshaft" to load this file. \quit
-- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$
SELECT '0.0.3'::text;
$$ language 'sql' STABLE STRICT;
-- Internal identifier of the installed extension instence
-- e.g. 'dev' for current development version
CREATE OR REPLACE FUNCTION _cdb_crankshaft_internal_version()
RETURNS text AS $$
SELECT installed_version FROM pg_available_extensions where name='crankshaft' and pg_available_extensions IS NOT NULL;
$$ language 'sql' STABLE STRICT;
-- Internal function.
-- Set the seeds of the RNGs (Random Number Generators)
-- used internally.
CREATE OR REPLACE FUNCTION
_cdb_random_seeds (seed_value INTEGER) RETURNS VOID
AS $$
from crankshaft import random_seeds
random_seeds.set_random_seeds(seed_value)
$$ LANGUAGE plpythonu;
-- Moran's I Global Measure (public-facing)
CREATE OR REPLACE FUNCTION
CDB_AreasOfInterestGlobal(
subquery TEXT,
column_name TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, significance NUMERIC)
AS $$
from crankshaft.clustering import moran_local
# TODO: use named parameters or a dictionary
return moran(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;
-- Moran's I Local (internal function)
CREATE OR REPLACE FUNCTION
_CDB_AreasOfInterestLocal(
subquery TEXT,
column_name TEXT,
w_type TEXT,
num_ngbrs INT,
permutations INT,
geom_col TEXT,
id_col TEXT)
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
from crankshaft.clustering import moran_local
# TODO: use named parameters or a dictionary
return moran_local(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;
-- Moran's I Local (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_AreasOfInterestLocal(
subquery TEXT,
column_name TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col);
$$ LANGUAGE SQL;
-- Moran's I only for HH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialHotspots(
subquery TEXT,
column_name TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('HH', 'HL');
$$ LANGUAGE SQL;
-- Moran's I only for LL and LH (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialColdspots(
subquery TEXT,
attr TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('LL', 'LH');
$$ LANGUAGE SQL;
-- Moran's I only for LH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialOutliers(
subquery TEXT,
attr TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('HL', 'LH');
$$ LANGUAGE SQL;
-- Moran's I Global Rate (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_AreasOfInterestGlobalRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran FLOAT, significance FLOAT)
AS $$
from crankshaft.clustering import moran_local
# TODO: use named parameters or a dictionary
return moran_rate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;
-- Moran's I Local Rate (internal function)
CREATE OR REPLACE FUNCTION
_CDB_AreasOfInterestLocalRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT,
num_ngbrs INT,
permutations INT,
geom_col TEXT,
id_col TEXT)
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
from crankshaft.clustering import moran_local_rate
# TODO: use named parameters or a dictionary
return moran_local_rate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;
-- Moran's I Local Rate (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_AreasOfInterestLocalRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col);
$$ LANGUAGE SQL;
-- Moran's I Local Rate only for HH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialHotspotsRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('HH', 'HL');
$$ LANGUAGE SQL;
-- Moran's I Local Rate only for LL and LH (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialColdspotsRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('LL', 'LH');
$$ LANGUAGE SQL;
-- Moran's I Local Rate only for LH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialOutliersRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('HL', 'LH');
$$ LANGUAGE SQL;
CREATE OR REPLACE FUNCTION CDB_KMeans(query text, no_clusters integer,no_init integer default 20)
RETURNS table (cartodb_id integer, cluster_no integer) as $$
from crankshaft.clustering import kmeans
return kmeans(query,no_clusters,no_init)
$$ language plpythonu;
CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(state Numeric[],the_geom GEOMETRY(Point, 4326), weight NUMERIC)
RETURNS Numeric[] AS
$$
DECLARE
newX NUMERIC;
newY NUMERIC;
newW NUMERIC;
BEGIN
IF weight IS NULL OR the_geom IS NULL THEN
newX = state[1];
newY = state[2];
newW = state[3];
ELSE
newX = state[1] + ST_X(the_geom)*weight;
newY = state[2] + ST_Y(the_geom)*weight;
newW = state[3] + weight;
END IF;
RETURN Array[newX,newY,newW];
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION CDB_WeightedMeanF(state Numeric[])
RETURNS GEOMETRY AS
$$
BEGIN
IF state[3] = 0 THEN
RETURN ST_SetSRID(ST_MakePoint(state[1],state[2]), 4326);
ELSE
RETURN ST_SETSRID(ST_MakePoint(state[1]/state[3], state[2]/state[3]),4326);
END IF;
END
$$ LANGUAGE plpgsql;
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC)(
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
INITCOND = "{0.0,0.0,0.0}"
);
-- Function by Stuart Lynn for a simple interpolation of a value
-- from a polygon table over an arbitrary polygon
-- (weighted by the area proportion overlapped)
-- Aereal weighting is a very simple form of aereal interpolation.
--
-- Parameters:
-- * geom a Polygon geometry which defines the area where a value will be
-- estimated as the area-weighted sum of a given table/column
-- * target_table_name table name of the table that provides the values
-- * target_column column name of the column that provides the values
-- * schema_name optional parameter to defina the schema the target table
-- belongs to, which is necessary if its not in the search_path.
-- Note that target_table_name should never include the schema in it.
-- Return value:
-- Aereal-weighted interpolation of the column values over the geometry
CREATE OR REPLACE
FUNCTION cdb_overlap_sum(geom geometry, target_table_name text, target_column text, schema_name text DEFAULT NULL)
RETURNS numeric AS
$$
DECLARE
result numeric;
qualified_name text;
BEGIN
IF schema_name IS NULL THEN
qualified_name := Format('%I', target_table_name);
ELSE
qualified_name := Format('%I.%s', schema_name, target_table_name);
END IF;
EXECUTE Format('
SELECT sum(%I*ST_Area(St_Intersection($1, a.the_geom))/ST_Area(a.the_geom))
FROM %s AS a
WHERE $1 && a.the_geom
', target_column, qualified_name)
USING geom
INTO result;
RETURN result;
END;
$$ LANGUAGE plpgsql;
--
-- Creates N points randomly distributed arround the polygon
--
-- @param g - the geometry to be turned in to points
--
-- @param no_points - the number of points to generate
--
-- @params max_iter_per_point - the function generates points in the polygon's bounding box
-- and discards points which don't lie in the polygon. max_iter_per_point specifies how many
-- misses per point the funciton accepts before giving up.
--
-- Returns: Multipoint with the requested points
CREATE OR REPLACE FUNCTION cdb_dot_density(geom geometry , no_points Integer, max_iter_per_point Integer DEFAULT 1000)
RETURNS GEOMETRY AS $$
DECLARE
extent GEOMETRY;
test_point Geometry;
width NUMERIC;
height NUMERIC;
x0 NUMERIC;
y0 NUMERIC;
xp NUMERIC;
yp NUMERIC;
no_left INTEGER;
remaining_iterations INTEGER;
points GEOMETRY[];
bbox_line GEOMETRY;
intersection_line GEOMETRY;
BEGIN
extent := ST_Envelope(geom);
width := ST_XMax(extent) - ST_XMIN(extent);
height := ST_YMax(extent) - ST_YMIN(extent);
x0 := ST_XMin(extent);
y0 := ST_YMin(extent);
no_left := no_points;
LOOP
if(no_left=0) THEN
EXIT;
END IF;
yp = y0 + height*random();
bbox_line = ST_MakeLine(
ST_SetSRID(ST_MakePoint(yp, x0),4326),
ST_SetSRID(ST_MakePoint(yp, x0+width),4326)
);
intersection_line = ST_Intersection(bbox_line,geom);
test_point = ST_LineInterpolatePoint(st_makeline(st_linemerge(intersection_line)),random());
points := points || test_point;
no_left = no_left - 1 ;
END LOOP;
RETURN ST_Collect(points);
END;
$$
LANGUAGE plpgsql VOLATILE;
-- Make sure by default there are no permissions for publicuser
-- NOTE: this happens at extension creation time, as part of an implicit transaction.
-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;
-- Grant permissions on the schema to publicuser (but just the schema)
GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser;
-- Revoke execute permissions on all functions in the schema by default
-- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser;

View File

@@ -0,0 +1,8 @@
--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES
-- Complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION crankshaft" to load this file. \quit
-- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$
SELECT '0.0.3'::text;
$$ language 'sql' STABLE STRICT;

View File

@@ -0,0 +1,258 @@
--DO NOT MODIFY THIS FILE, IT IS GENERATED FROM SOURCES
-- Complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION crankshaft" to load this file. \quit
--------------------------------------------------------------------------------
-- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$
SELECT '0.1.0'::text;
$$ language 'sql' STABLE STRICT;
--------------------------------------------------------------------------------
-- PyAgg stuff
CREATE OR REPLACE FUNCTION
CDB_PyAggS(current_state Numeric[], current_row Numeric[])
returns NUMERIC[] as $$
BEGIN
if array_upper(current_state,1) is null then
RAISE NOTICE 'setting state %',array_upper(current_row,1);
current_state[1] = array_upper(current_row,1);
end if;
return array_cat(current_state,current_row) ;
END
$$ LANGUAGE plpgsql;
CREATE AGGREGATE CDB_PyAgg(NUMERIC[])(
SFUNC = CDB_PyAggS,
STYPE = Numeric[],
INITCOND = "{}"
);
--------------------------------------------------------------------------------
-- Segmentation stuff
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment(
target NUMERIC[],
features NUMERIC[],
target_features NUMERIC[],
target_ids NUMERIC[],
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
learning_rate DOUBLE PRECISION DEFAULT 0.01,
min_samples_leaf INTEGER DEFAULT 1)
RETURNS TABLE(cartodb_id NUMERIC, prediction NUMERIC, accuracy NUMERIC)
AS $$
import numpy as np
import plpy
from crankshaft.segmentation import create_and_predict_segment_agg
model_params = {'n_estimators': n_estimators,
'max_depth': max_depth,
'subsample': subsample,
'learning_rate': learning_rate,
'min_samples_leaf': min_samples_leaf}
def unpack2D(data):
dimension = data.pop(0)
a = np.array(data, dtype=float)
return a.reshape(len(a)/dimension, dimension)
return create_and_predict_segment_agg(np.array(target, dtype=float),
unpack2D(features),
unpack2D(target_features),
target_ids,
model_params)
$$ LANGUAGE plpythonu;
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment (
query TEXT,
variable_name TEXT,
target_table TEXT,
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
learning_rate DOUBLE PRECISION DEFAULT 0.01,
min_samples_leaf INTEGER DEFAULT 1)
RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC)
AS $$
from crankshaft.segmentation import create_and_predict_segment
model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf}
return create_and_predict_segment(query,variable_name,target_table, model_params)
$$ LANGUAGE plpythonu;
--------------------------------------------------------------------------------
-- Spatial interpolation
-- 0: nearest neighbor
-- 1: barymetric
-- 2: IDW
CREATE OR REPLACE FUNCTION CDB_SpatialInterpolation(
IN query text,
IN point geometry,
IN method integer DEFAULT 1,
IN p1 numeric DEFAULT 0,
IN p2 numeric DEFAULT 0
)
RETURNS numeric AS
$$
DECLARE
gs geometry[];
vs numeric[];
output numeric;
BEGIN
EXECUTE 'WITH a AS('||query||') SELECT array_agg(the_geom), array_agg(attrib) FROM a' INTO gs, vs;
SELECT CDB_SpatialInterpolation(gs, vs, point, method, p1,p2) INTO output FROM a;
RETURN output;
END;
$$
language plpgsql IMMUTABLE;
CREATE OR REPLACE FUNCTION CDB_SpatialInterpolation(
IN geomin geometry[],
IN colin numeric[],
IN point geometry,
IN method integer DEFAULT 1,
IN p1 numeric DEFAULT 0,
IN p2 numeric DEFAULT 0
)
RETURNS numeric AS
$$
DECLARE
gs geometry[];
vs numeric[];
gs2 geometry[];
vs2 numeric[];
g geometry;
vertex geometry[];
sg numeric;
sa numeric;
sb numeric;
sc numeric;
va numeric;
vb numeric;
vc numeric;
output numeric;
BEGIN
output := -999.999;
-- nearest
IF method = 0 THEN
WITH a as (SELECT unnest(geomin) as g, unnest(colin) as v)
SELECT a.v INTO output FROM a ORDER BY point<->a.g LIMIT 1;
RETURN output;
-- barymetric
ELSIF method = 1 THEN
WITH a as (SELECT unnest(geomin) AS e),
b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
c as (SELECT (ST_Dump(t)).geom as v FROM b),
d as (SELECT v FROM c WHERE ST_Within(point, v))
SELECT v INTO g FROM d;
IF g is null THEN
-- out of the realm of the input data
RETURN -888.888;
END IF;
-- vertex of the selected cell
WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v)
SELECT array_agg(v) INTO vertex FROM a;
-- retrieve the value of each vertex
WITH a AS(SELECT unnest(vertex) as geo, unnest(colin) as c)
SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);
WITH a AS(SELECT unnest(vertex) as geo, unnest(colin) as c)
SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);
WITH a AS(SELECT unnest(vertex) as geo, unnest(colin) as c)
SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);
SELECT ST_area(g), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[2], vertex[3], point]))), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[1], vertex[3], point]))), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point,vertex[1],vertex[2], point]))) INTO sg, sa, sb, sc;
output := (coalesce(sa,0) * coalesce(va,0) + coalesce(sb,0) * coalesce(vb,0) + coalesce(sc,0) * coalesce(vc,0)) / coalesce(sg);
RETURN output;
-- IDW
-- p1: limit the number of neighbors, 0->no limit
-- p2: order of distance decay, 0-> order 1
ELSIF method = 2 THEN
IF p2 = 0 THEN
p2 := 1;
END IF;
WITH a as (SELECT unnest(geomin) as g, unnest(colin) as v),
b as (SELECT a.g, a.v FROM a ORDER BY point<->a.g)
SELECT array_agg(b.g), array_agg(b.v) INTO gs, vs FROM b;
IF p1::integer>0 THEN
gs2:=gs;
vs2:=vs;
FOR i IN 1..p1
LOOP
gs2 := gs2 || gs[i];
vs2 := vs2 || vs[i];
END LOOP;
ELSE
gs2:=gs;
vs2:=vs;
END IF;
WITH a as (SELECT unnest(gs2) as g, unnest(vs2) as v),
b as (
SELECT
(1/ST_distance(point, a.g)^p2::integer) as k,
(a.v/ST_distance(point, a.g)^p2::integer) as f
FROM a
)
SELECT sum(b.f)/sum(b.k) INTO output FROM b;
RETURN output;
END IF;
RETURN -777.777;
END;
$$
language plpgsql IMMUTABLE;
--------------------------------------------------------------------------------
-- Spatial Markov
-- input table format:
-- id | geom | date_1 | date_2 | date_3
-- 1 | Pt1 | 12.3 | 13.1 | 14.2
-- 2 | Pt2 | 11.0 | 13.2 | 12.5
-- ...
-- Sample Function call:
-- SELECT CDB_SpatialMarkov('SELECT * FROM real_estate',
-- Array['date_1', 'date_2', 'date_3'])
CREATE OR REPLACE FUNCTION
CDB_SpatialMarkovTrend (
subquery TEXT,
time_cols TEXT[],
num_classes INT DEFAULT 7,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (trend NUMERIC, trend_up NUMERIC, trend_down NUMERIC, volatility NUMERIC, rowid INT)
AS $$
from crankshaft.space_time_dynamics import spatial_markov_trend
## TODO: use named parameters or a dictionary
return spatial_markov_trend(subquery, time_cols, num_classes, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;

View File

@@ -0,0 +1,403 @@
--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES
-- Complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION crankshaft" to load this file. \quit
-- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$
SELECT '0.0.4'::text;
$$ language 'sql' STABLE STRICT;
-- Internal identifier of the installed extension instence
-- e.g. 'dev' for current development version
CREATE OR REPLACE FUNCTION _cdb_crankshaft_internal_version()
RETURNS text AS $$
SELECT installed_version FROM pg_available_extensions where name='crankshaft' and pg_available_extensions IS NOT NULL;
$$ language 'sql' STABLE STRICT;
-- Internal function.
-- Set the seeds of the RNGs (Random Number Generators)
-- used internally.
CREATE OR REPLACE FUNCTION
_cdb_random_seeds (seed_value INTEGER) RETURNS VOID
AS $$
from crankshaft import random_seeds
random_seeds.set_random_seeds(seed_value)
$$ LANGUAGE plpythonu;
-- Moran's I Global Measure (public-facing)
CREATE OR REPLACE FUNCTION
CDB_AreasOfInterestGlobal(
subquery TEXT,
column_name TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, significance NUMERIC)
AS $$
from crankshaft.clustering import moran_local
# TODO: use named parameters or a dictionary
return moran(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;
-- Moran's I Local (internal function)
CREATE OR REPLACE FUNCTION
_CDB_AreasOfInterestLocal(
subquery TEXT,
column_name TEXT,
w_type TEXT,
num_ngbrs INT,
permutations INT,
geom_col TEXT,
id_col TEXT)
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
from crankshaft.clustering import moran_local
# TODO: use named parameters or a dictionary
return moran_local(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;
-- Moran's I Local (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_AreasOfInterestLocal(
subquery TEXT,
column_name TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col);
$$ LANGUAGE SQL;
-- Moran's I only for HH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialHotspots(
subquery TEXT,
column_name TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('HH', 'HL');
$$ LANGUAGE SQL;
-- Moran's I only for LL and LH (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialColdspots(
subquery TEXT,
attr TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('LL', 'LH');
$$ LANGUAGE SQL;
-- Moran's I only for LH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialOutliers(
subquery TEXT,
attr TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('HL', 'LH');
$$ LANGUAGE SQL;
-- Moran's I Global Rate (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_AreasOfInterestGlobalRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran FLOAT, significance FLOAT)
AS $$
from crankshaft.clustering import moran_local
# TODO: use named parameters or a dictionary
return moran_rate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;
-- Moran's I Local Rate (internal function)
CREATE OR REPLACE FUNCTION
_CDB_AreasOfInterestLocalRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT,
num_ngbrs INT,
permutations INT,
geom_col TEXT,
id_col TEXT)
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
from crankshaft.clustering import moran_local_rate
# TODO: use named parameters or a dictionary
return moran_local_rate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;
-- Moran's I Local Rate (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_AreasOfInterestLocalRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col);
$$ LANGUAGE SQL;
-- Moran's I Local Rate only for HH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialHotspotsRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('HH', 'HL');
$$ LANGUAGE SQL;
-- Moran's I Local Rate only for LL and LH (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialColdspotsRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('LL', 'LH');
$$ LANGUAGE SQL;
-- Moran's I Local Rate only for LH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialOutliersRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('HL', 'LH');
$$ LANGUAGE SQL;
CREATE OR REPLACE FUNCTION CDB_KMeans(query text, no_clusters integer,no_init integer default 20)
RETURNS table (cartodb_id integer, cluster_no integer) as $$
from crankshaft.clustering import kmeans
return kmeans(query,no_clusters,no_init)
$$ language plpythonu;
CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(state Numeric[],the_geom GEOMETRY(Point, 4326), weight NUMERIC)
RETURNS Numeric[] AS
$$
DECLARE
newX NUMERIC;
newY NUMERIC;
newW NUMERIC;
BEGIN
IF weight IS NULL OR the_geom IS NULL THEN
newX = state[1];
newY = state[2];
newW = state[3];
ELSE
newX = state[1] + ST_X(the_geom)*weight;
newY = state[2] + ST_Y(the_geom)*weight;
newW = state[3] + weight;
END IF;
RETURN Array[newX,newY,newW];
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION CDB_WeightedMeanF(state Numeric[])
RETURNS GEOMETRY AS
$$
BEGIN
IF state[3] = 0 THEN
RETURN ST_SetSRID(ST_MakePoint(state[1],state[2]), 4326);
ELSE
RETURN ST_SETSRID(ST_MakePoint(state[1]/state[3], state[2]/state[3]),4326);
END IF;
END
$$ LANGUAGE plpgsql;
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC)(
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
INITCOND = "{0.0,0.0,0.0}"
);
-- Function by Stuart Lynn for a simple interpolation of a value
-- from a polygon table over an arbitrary polygon
-- (weighted by the area proportion overlapped)
-- Aereal weighting is a very simple form of aereal interpolation.
--
-- Parameters:
-- * geom a Polygon geometry which defines the area where a value will be
-- estimated as the area-weighted sum of a given table/column
-- * target_table_name table name of the table that provides the values
-- * target_column column name of the column that provides the values
-- * schema_name optional parameter to defina the schema the target table
-- belongs to, which is necessary if its not in the search_path.
-- Note that target_table_name should never include the schema in it.
-- Return value:
-- Aereal-weighted interpolation of the column values over the geometry
CREATE OR REPLACE
FUNCTION cdb_overlap_sum(geom geometry, target_table_name text, target_column text, schema_name text DEFAULT NULL)
RETURNS numeric AS
$$
DECLARE
result numeric;
qualified_name text;
BEGIN
IF schema_name IS NULL THEN
qualified_name := Format('%I', target_table_name);
ELSE
qualified_name := Format('%I.%s', schema_name, target_table_name);
END IF;
EXECUTE Format('
SELECT sum(%I*ST_Area(St_Intersection($1, a.the_geom))/ST_Area(a.the_geom))
FROM %s AS a
WHERE $1 && a.the_geom
', target_column, qualified_name)
USING geom
INTO result;
RETURN result;
END;
$$ LANGUAGE plpgsql;
--
-- Creates N points randomly distributed arround the polygon
--
-- @param g - the geometry to be turned in to points
--
-- @param no_points - the number of points to generate
--
-- @params max_iter_per_point - the function generates points in the polygon's bounding box
-- and discards points which don't lie in the polygon. max_iter_per_point specifies how many
-- misses per point the funciton accepts before giving up.
--
-- Returns: Multipoint with the requested points
CREATE OR REPLACE FUNCTION cdb_dot_density(geom geometry , no_points Integer, max_iter_per_point Integer DEFAULT 1000)
RETURNS GEOMETRY AS $$
DECLARE
extent GEOMETRY;
test_point Geometry;
width NUMERIC;
height NUMERIC;
x0 NUMERIC;
y0 NUMERIC;
xp NUMERIC;
yp NUMERIC;
no_left INTEGER;
remaining_iterations INTEGER;
points GEOMETRY[];
bbox_line GEOMETRY;
intersection_line GEOMETRY;
BEGIN
extent := ST_Envelope(geom);
width := ST_XMax(extent) - ST_XMIN(extent);
height := ST_YMax(extent) - ST_YMIN(extent);
x0 := ST_XMin(extent);
y0 := ST_YMin(extent);
no_left := no_points;
LOOP
if(no_left=0) THEN
EXIT;
END IF;
yp = y0 + height*random();
bbox_line = ST_MakeLine(
ST_SetSRID(ST_MakePoint(yp, x0),4326),
ST_SetSRID(ST_MakePoint(yp, x0+width),4326)
);
intersection_line = ST_Intersection(bbox_line,geom);
test_point = ST_LineInterpolatePoint(st_makeline(st_linemerge(intersection_line)),random());
points := points || test_point;
no_left = no_left - 1 ;
END LOOP;
RETURN ST_Collect(points);
END;
$$
LANGUAGE plpgsql VOLATILE;
-- Make sure by default there are no permissions for publicuser
-- NOTE: this happens at extension creation time, as part of an implicit transaction.
-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;
-- Grant permissions on the schema to publicuser (but just the schema)
GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser;
-- Revoke execute permissions on all functions in the schema by default
-- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser;

View File

@@ -0,0 +1,81 @@
--DO NOT MODIFY THIS FILE, IT IS GENERATED FROM SOURCES
-- Complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION crankshaft" to load this file. \quit
-- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$
SELECT '0.0.4'::text;
$$ language 'sql' STABLE STRICT;
--------------------------------------------------------------------------------
-- Spatial Markov
DROP FUNCTION
CDB_SpatialMarkovTrend (
subquery TEXT,
time_cols TEXT[],
num_classes INT,
w_type TEXT,
num_ngbrs INT,
permutations INT,
geom_col TEXT,
id_col TEXT);
--------------------------------------------------------------------------------
-- Spatial interpolation
DROP FUNCTION CDB_SpatialInterpolation(
IN geomin geometry[],
IN colin numeric[],
IN point geometry,
IN method integer,
IN p1 numeric,
IN p2 numeric
);
DROP FUNCTION CDB_SpatialInterpolation(
IN query text,
IN point geometry,
IN method integer,
IN p1 numeric,
IN p2 numeric
);
--------------------------------------------------------------------------------
-- Segmentation stuff
DROP FUNCTION
CDB_CreateAndPredictSegment (
query TEXT,
variable_name TEXT,
target_table TEXT,
n_estimators INTEGER,
max_depth INTEGER,
subsample DOUBLE PRECISION,
learning_rate DOUBLE PRECISION,
min_samples_leaf INTEGER);
DROP FUNCTION
CDB_CreateAndPredictSegment(
target NUMERIC[],
features NUMERIC[],
target_features NUMERIC[],
target_ids NUMERIC[],
n_estimators INTEGER,
max_depth INTEGER,
subsample DOUBLE PRECISION,
learning_rate DOUBLE PRECISION,
min_samples_leaf INTEGER);
--------------------------------------------------------------------------------
-- PyAgg stuff
DROP AGGREGATE CDB_PyAgg(NUMERIC[]);
DROP FUNCTION CDB_PyAggS(Numeric[], Numeric[]);

View File

@@ -0,0 +1,686 @@
--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES
-- Complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION crankshaft" to load this file. \quit
-- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$
SELECT '0.1.0'::text;
$$ language 'sql' STABLE STRICT;
-- Internal identifier of the installed extension instence
-- e.g. 'dev' for current development version
CREATE OR REPLACE FUNCTION _cdb_crankshaft_internal_version()
RETURNS text AS $$
SELECT installed_version FROM pg_available_extensions where name='crankshaft' and pg_available_extensions IS NOT NULL;
$$ language 'sql' STABLE STRICT;
-- Internal function.
-- Set the seeds of the RNGs (Random Number Generators)
-- used internally.
CREATE OR REPLACE FUNCTION
_cdb_random_seeds (seed_value INTEGER) RETURNS VOID
AS $$
from crankshaft import random_seeds
random_seeds.set_random_seeds(seed_value)
$$ LANGUAGE plpythonu;
CREATE OR REPLACE FUNCTION
CDB_PyAggS(current_state Numeric[], current_row Numeric[])
returns NUMERIC[] as $$
BEGIN
if array_upper(current_state,1) is null then
RAISE NOTICE 'setting state %',array_upper(current_row,1);
current_state[1] = array_upper(current_row,1);
end if;
return array_cat(current_state,current_row) ;
END
$$ LANGUAGE plpgsql;
CREATE AGGREGATE CDB_PyAgg(NUMERIC[])(
SFUNC = CDB_PyAggS,
STYPE = Numeric[],
INITCOND = "{}"
);
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment(
target NUMERIC[],
features NUMERIC[],
target_features NUMERIC[],
target_ids NUMERIC[],
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
learning_rate DOUBLE PRECISION DEFAULT 0.01,
min_samples_leaf INTEGER DEFAULT 1)
RETURNS TABLE(cartodb_id NUMERIC, prediction NUMERIC, accuracy NUMERIC)
AS $$
import numpy as np
import plpy
from crankshaft.segmentation import create_and_predict_segment_agg
model_params = {'n_estimators': n_estimators,
'max_depth': max_depth,
'subsample': subsample,
'learning_rate': learning_rate,
'min_samples_leaf': min_samples_leaf}
def unpack2D(data):
dimension = data.pop(0)
a = np.array(data, dtype=float)
return a.reshape(len(a)/dimension, dimension)
return create_and_predict_segment_agg(np.array(target, dtype=float),
unpack2D(features),
unpack2D(target_features),
target_ids,
model_params)
$$ LANGUAGE plpythonu;
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment (
query TEXT,
variable_name TEXT,
target_table TEXT,
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
learning_rate DOUBLE PRECISION DEFAULT 0.01,
min_samples_leaf INTEGER DEFAULT 1)
RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC)
AS $$
from crankshaft.segmentation import create_and_predict_segment
model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf}
return create_and_predict_segment(query,variable_name,target_table, model_params)
$$ LANGUAGE plpythonu;
-- 0: nearest neighbor
-- 1: barymetric
-- 2: IDW
CREATE OR REPLACE FUNCTION CDB_SpatialInterpolation(
IN query text,
IN point geometry,
IN method integer DEFAULT 1,
IN p1 numeric DEFAULT 0,
IN p2 numeric DEFAULT 0
)
RETURNS numeric AS
$$
DECLARE
gs geometry[];
vs numeric[];
output numeric;
BEGIN
EXECUTE 'WITH a AS('||query||') SELECT array_agg(the_geom), array_agg(attrib) FROM a' INTO gs, vs;
SELECT CDB_SpatialInterpolation(gs, vs, point, method, p1,p2) INTO output FROM a;
RETURN output;
END;
$$
language plpgsql IMMUTABLE;
CREATE OR REPLACE FUNCTION CDB_SpatialInterpolation(
IN geomin geometry[],
IN colin numeric[],
IN point geometry,
IN method integer DEFAULT 1,
IN p1 numeric DEFAULT 0,
IN p2 numeric DEFAULT 0
)
RETURNS numeric AS
$$
DECLARE
gs geometry[];
vs numeric[];
gs2 geometry[];
vs2 numeric[];
g geometry;
vertex geometry[];
sg numeric;
sa numeric;
sb numeric;
sc numeric;
va numeric;
vb numeric;
vc numeric;
output numeric;
BEGIN
output := -999.999;
-- nearest
IF method = 0 THEN
WITH a as (SELECT unnest(geomin) as g, unnest(colin) as v)
SELECT a.v INTO output FROM a ORDER BY point<->a.g LIMIT 1;
RETURN output;
-- barymetric
ELSIF method = 1 THEN
WITH a as (SELECT unnest(geomin) AS e),
b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
c as (SELECT (ST_Dump(t)).geom as v FROM b),
d as (SELECT v FROM c WHERE ST_Within(point, v))
SELECT v INTO g FROM d;
IF g is null THEN
-- out of the realm of the input data
RETURN -888.888;
END IF;
-- vertex of the selected cell
WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v)
SELECT array_agg(v) INTO vertex FROM a;
-- retrieve the value of each vertex
WITH a AS(SELECT unnest(vertex) as geo, unnest(colin) as c)
SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);
WITH a AS(SELECT unnest(vertex) as geo, unnest(colin) as c)
SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);
WITH a AS(SELECT unnest(vertex) as geo, unnest(colin) as c)
SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);
SELECT ST_area(g), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[2], vertex[3], point]))), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[1], vertex[3], point]))), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point,vertex[1],vertex[2], point]))) INTO sg, sa, sb, sc;
output := (coalesce(sa,0) * coalesce(va,0) + coalesce(sb,0) * coalesce(vb,0) + coalesce(sc,0) * coalesce(vc,0)) / coalesce(sg);
RETURN output;
-- IDW
-- p1: limit the number of neighbors, 0->no limit
-- p2: order of distance decay, 0-> order 1
ELSIF method = 2 THEN
IF p2 = 0 THEN
p2 := 1;
END IF;
WITH a as (SELECT unnest(geomin) as g, unnest(colin) as v),
b as (SELECT a.g, a.v FROM a ORDER BY point<->a.g)
SELECT array_agg(b.g), array_agg(b.v) INTO gs, vs FROM b;
IF p1::integer>0 THEN
gs2:=gs;
vs2:=vs;
FOR i IN 1..p1
LOOP
gs2 := gs2 || gs[i];
vs2 := vs2 || vs[i];
END LOOP;
ELSE
gs2:=gs;
vs2:=vs;
END IF;
WITH a as (SELECT unnest(gs2) as g, unnest(vs2) as v),
b as (
SELECT
(1/ST_distance(point, a.g)^p2::integer) as k,
(a.v/ST_distance(point, a.g)^p2::integer) as f
FROM a
)
SELECT sum(b.f)/sum(b.k) INTO output FROM b;
RETURN output;
END IF;
RETURN -777.777;
END;
$$
language plpgsql IMMUTABLE;
-- Moran's I Global Measure (public-facing)
CREATE OR REPLACE FUNCTION
CDB_AreasOfInterestGlobal(
subquery TEXT,
column_name TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, significance NUMERIC)
AS $$
from crankshaft.clustering import moran_local
# TODO: use named parameters or a dictionary
return moran(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;
-- Moran's I Local (internal function)
CREATE OR REPLACE FUNCTION
_CDB_AreasOfInterestLocal(
subquery TEXT,
column_name TEXT,
w_type TEXT,
num_ngbrs INT,
permutations INT,
geom_col TEXT,
id_col TEXT)
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
from crankshaft.clustering import moran_local
# TODO: use named parameters or a dictionary
return moran_local(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;
-- Moran's I Local (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_AreasOfInterestLocal(
subquery TEXT,
column_name TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col);
$$ LANGUAGE SQL;
-- Moran's I only for HH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialHotspots(
subquery TEXT,
column_name TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('HH', 'HL');
$$ LANGUAGE SQL;
-- Moran's I only for LL and LH (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialColdspots(
subquery TEXT,
attr TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('LL', 'LH');
$$ LANGUAGE SQL;
-- Moran's I only for LH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialOutliers(
subquery TEXT,
attr TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('HL', 'LH');
$$ LANGUAGE SQL;
-- Moran's I Global Rate (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_AreasOfInterestGlobalRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran FLOAT, significance FLOAT)
AS $$
from crankshaft.clustering import moran_local
# TODO: use named parameters or a dictionary
return moran_rate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;
-- Moran's I Local Rate (internal function)
CREATE OR REPLACE FUNCTION
_CDB_AreasOfInterestLocalRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT,
num_ngbrs INT,
permutations INT,
geom_col TEXT,
id_col TEXT)
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
from crankshaft.clustering import moran_local_rate
# TODO: use named parameters or a dictionary
return moran_local_rate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;
-- Moran's I Local Rate (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_AreasOfInterestLocalRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col);
$$ LANGUAGE SQL;
-- Moran's I Local Rate only for HH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialHotspotsRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('HH', 'HL');
$$ LANGUAGE SQL;
-- Moran's I Local Rate only for LL and LH (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialColdspotsRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('LL', 'LH');
$$ LANGUAGE SQL;
-- Moran's I Local Rate only for LH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialOutliersRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('HL', 'LH');
$$ LANGUAGE SQL;
CREATE OR REPLACE FUNCTION CDB_KMeans(query text, no_clusters integer,no_init integer default 20)
RETURNS table (cartodb_id integer, cluster_no integer) as $$
from crankshaft.clustering import kmeans
return kmeans(query,no_clusters,no_init)
$$ language plpythonu;
CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(state Numeric[],the_geom GEOMETRY(Point, 4326), weight NUMERIC)
RETURNS Numeric[] AS
$$
DECLARE
newX NUMERIC;
newY NUMERIC;
newW NUMERIC;
BEGIN
IF weight IS NULL OR the_geom IS NULL THEN
newX = state[1];
newY = state[2];
newW = state[3];
ELSE
newX = state[1] + ST_X(the_geom)*weight;
newY = state[2] + ST_Y(the_geom)*weight;
newW = state[3] + weight;
END IF;
RETURN Array[newX,newY,newW];
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION CDB_WeightedMeanF(state Numeric[])
RETURNS GEOMETRY AS
$$
BEGIN
IF state[3] = 0 THEN
RETURN ST_SetSRID(ST_MakePoint(state[1],state[2]), 4326);
ELSE
RETURN ST_SETSRID(ST_MakePoint(state[1]/state[3], state[2]/state[3]),4326);
END IF;
END
$$ LANGUAGE plpgsql;
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC)(
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
INITCOND = "{0.0,0.0,0.0}"
);
-- Spatial Markov
-- input table format:
-- id | geom | date_1 | date_2 | date_3
-- 1 | Pt1 | 12.3 | 13.1 | 14.2
-- 2 | Pt2 | 11.0 | 13.2 | 12.5
-- ...
-- Sample Function call:
-- SELECT CDB_SpatialMarkov('SELECT * FROM real_estate',
-- Array['date_1', 'date_2', 'date_3'])
CREATE OR REPLACE FUNCTION
CDB_SpatialMarkovTrend (
subquery TEXT,
time_cols TEXT[],
num_classes INT DEFAULT 7,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (trend NUMERIC, trend_up NUMERIC, trend_down NUMERIC, volatility NUMERIC, rowid INT)
AS $$
from crankshaft.space_time_dynamics import spatial_markov_trend
## TODO: use named parameters or a dictionary
return spatial_markov_trend(subquery, time_cols, num_classes, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;
-- input table format: identical to above but in a predictable format
-- Sample function call:
-- SELECT cdb_spatial_markov('SELECT * FROM real_estate',
-- 'date_1')
-- CREATE OR REPLACE FUNCTION
-- cdb_spatial_markov (
-- subquery TEXT,
-- time_col_min text,
-- time_col_max text,
-- date_format text, -- '_YYYY_MM_DD'
-- num_time_per_bin INT DEFAULT 1,
-- permutations INT DEFAULT 99,
-- geom_column TEXT DEFAULT 'the_geom',
-- id_col TEXT DEFAULT 'cartodb_id',
-- w_type TEXT DEFAULT 'knn',
-- num_ngbrs int DEFAULT 5)
-- RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT)
-- AS $$
-- plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
-- from crankshaft.clustering import moran_local
-- # TODO: use named parameters or a dictionary
-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs)
-- $$ LANGUAGE plpythonu;
--
-- -- input table format:
-- -- id | geom | date | measurement
-- -- 1 | Pt1 | 12/3 | 13.2
-- -- 2 | Pt2 | 11/5 | 11.3
-- -- 3 | Pt1 | 11/13 | 12.9
-- -- 4 | Pt3 | 12/19 | 10.1
-- -- ...
--
-- CREATE OR REPLACE FUNCTION
-- cdb_spatial_markov (
-- subquery TEXT,
-- time_col text,
-- num_time_per_bin INT DEFAULT 1,
-- permutations INT DEFAULT 99,
-- geom_column TEXT DEFAULT 'the_geom',
-- id_col TEXT DEFAULT 'cartodb_id',
-- w_type TEXT DEFAULT 'knn',
-- num_ngbrs int DEFAULT 5)
-- RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT)
-- AS $$
-- plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
-- from crankshaft.clustering import moran_local
-- # TODO: use named parameters or a dictionary
-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs)
-- $$ LANGUAGE plpythonu;
-- Function by Stuart Lynn for a simple interpolation of a value
-- from a polygon table over an arbitrary polygon
-- (weighted by the area proportion overlapped)
-- Aereal weighting is a very simple form of aereal interpolation.
--
-- Parameters:
-- * geom a Polygon geometry which defines the area where a value will be
-- estimated as the area-weighted sum of a given table/column
-- * target_table_name table name of the table that provides the values
-- * target_column column name of the column that provides the values
-- * schema_name optional parameter to defina the schema the target table
-- belongs to, which is necessary if its not in the search_path.
-- Note that target_table_name should never include the schema in it.
-- Return value:
-- Aereal-weighted interpolation of the column values over the geometry
CREATE OR REPLACE
FUNCTION cdb_overlap_sum(geom geometry, target_table_name text, target_column text, schema_name text DEFAULT NULL)
RETURNS numeric AS
$$
DECLARE
result numeric;
qualified_name text;
BEGIN
IF schema_name IS NULL THEN
qualified_name := Format('%I', target_table_name);
ELSE
qualified_name := Format('%I.%s', schema_name, target_table_name);
END IF;
EXECUTE Format('
SELECT sum(%I*ST_Area(St_Intersection($1, a.the_geom))/ST_Area(a.the_geom))
FROM %s AS a
WHERE $1 && a.the_geom
', target_column, qualified_name)
USING geom
INTO result;
RETURN result;
END;
$$ LANGUAGE plpgsql;
--
-- Creates N points randomly distributed arround the polygon
--
-- @param g - the geometry to be turned in to points
--
-- @param no_points - the number of points to generate
--
-- @params max_iter_per_point - the function generates points in the polygon's bounding box
-- and discards points which don't lie in the polygon. max_iter_per_point specifies how many
-- misses per point the funciton accepts before giving up.
--
-- Returns: Multipoint with the requested points
CREATE OR REPLACE FUNCTION cdb_dot_density(geom geometry , no_points Integer, max_iter_per_point Integer DEFAULT 1000)
RETURNS GEOMETRY AS $$
DECLARE
extent GEOMETRY;
test_point Geometry;
width NUMERIC;
height NUMERIC;
x0 NUMERIC;
y0 NUMERIC;
xp NUMERIC;
yp NUMERIC;
no_left INTEGER;
remaining_iterations INTEGER;
points GEOMETRY[];
bbox_line GEOMETRY;
intersection_line GEOMETRY;
BEGIN
extent := ST_Envelope(geom);
width := ST_XMax(extent) - ST_XMIN(extent);
height := ST_YMax(extent) - ST_YMIN(extent);
x0 := ST_XMin(extent);
y0 := ST_YMin(extent);
no_left := no_points;
LOOP
if(no_left=0) THEN
EXIT;
END IF;
yp = y0 + height*random();
bbox_line = ST_MakeLine(
ST_SetSRID(ST_MakePoint(yp, x0),4326),
ST_SetSRID(ST_MakePoint(yp, x0+width),4326)
);
intersection_line = ST_Intersection(bbox_line,geom);
test_point = ST_LineInterpolatePoint(st_makeline(st_linemerge(intersection_line)),random());
points := points || test_point;
no_left = no_left - 1 ;
END LOOP;
RETURN ST_Collect(points);
END;
$$
LANGUAGE plpgsql VOLATILE;
-- Make sure by default there are no permissions for publicuser
-- NOTE: this happens at extension creation time, as part of an implicit transaction.
-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;
-- Grant permissions on the schema to publicuser (but just the schema)
GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser;
-- Revoke execute permissions on all functions in the schema by default
-- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser;

View File

@@ -1,5 +1,5 @@
comment = 'CartoDB Spatial Analysis extension'
default_version = '0.0.1'
requires = 'plpythonu, postgis, cartodb'
default_version = '0.1.0'
requires = 'plpythonu, postgis'
superuser = true
schema = cdb_crankshaft

0
release/python/.gitignore vendored Normal file
View File

View File

@@ -10,7 +10,7 @@ from setuptools import setup, find_packages
setup(
name='crankshaft',
version='0.0.1',
version='0.0.01',
description='CartoDB Spatial Analysis Python Library',

View File

@@ -0,0 +1,2 @@
import random_seeds
import clustering

View File

@@ -0,0 +1 @@
from moran import *

View File

@@ -0,0 +1,321 @@
"""
Moran's I geostatistics (global clustering & outliers presence)
"""
# TODO: Fill in local neighbors which have null/NoneType values with the
# average of the their neighborhood
import numpy as np
import pysal as ps
import plpy
# High level interface ---------------------------------------
def moran_local(t, attr, significance, num_ngbrs, permutations, geom_column, id_col, w_type):
"""
Moran's I implementation for PL/Python
Andy Eschbacher
"""
# TODO: ensure that the significance output can be smaller that 1e-3 (0.001)
# TODO: make a wishlist of output features (zscores, pvalues, raw local lisa, what else?)
plpy.notice('** Constructing query')
# geometries with attributes that are null are ignored
# resulting in a collection of not as near neighbors
qvals = {"id_col": id_col,
"attr1": attr,
"geom_col": geom_column,
"table": t,
"num_ngbrs": num_ngbrs}
q = get_query(w_type, qvals)
try:
r = plpy.execute(q)
plpy.notice('** Query returned with %d rows' % len(r))
except plpy.SPIError:
plpy.notice('** Query failed: "%s"' % q)
plpy.notice('** Exiting function')
return zip([None], [None], [None], [None])
y = get_attributes(r, 1)
w = get_weight(r, w_type)
# calculate LISA values
lisa = ps.Moran_Local(y, w)
# find units of significance
lisa_sig = lisa_sig_vals(lisa.p_sim, lisa.q, significance)
plpy.notice('** Finished calculations')
return zip(lisa.Is, lisa_sig, lisa.p_sim, w.id_order)
def moran_local_rate(t, numerator, denominator, significance, num_ngbrs, permutations, geom_column, id_col, w_type):
"""
Moran's I Local Rate
Andy Eschbacher
"""
plpy.notice('** Constructing query')
# geometries with attributes that are null are ignored
# resulting in a collection of not as near neighbors
qvals = {"id_col": id_col,
"numerator": numerator,
"denominator": denominator,
"geom_col": geom_column,
"table": t,
"num_ngbrs": num_ngbrs}
q = get_query(w_type, qvals)
try:
r = plpy.execute(q)
plpy.notice('** Query returned with %d rows' % len(r))
except plpy.SPIError:
plpy.notice('** Query failed: "%s"' % q)
plpy.notice('** Error: %s' % plpy.SPIError)
plpy.notice('** Exiting function')
return zip([None], [None], [None], [None])
plpy.notice('r.nrows() = %d' % r.nrows())
## collect attributes
numer = get_attributes(r, 1)
denom = get_attributes(r, 2)
w = get_weight(r, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local_Rate(numer, denom, w, permutations=permutations)
# find units of significance
lisa_sig = lisa_sig_vals(lisa.p_sim, lisa.q, significance)
plpy.notice('** Finished calculations')
## TODO: Decide on which return values here
return zip(lisa.Is, lisa_sig, lisa.p_sim, w.id_order, lisa.y)
def moran_local_bv(t, attr1, attr2, significance, num_ngbrs, permutations, geom_column, id_col, w_type):
plpy.notice('** Constructing query')
qvals = {"num_ngbrs": num_ngbrs,
"attr1": attr1,
"attr2": attr2,
"table": t,
"geom_col": geom_column,
"id_col": id_col}
q = get_query(w_type, qvals)
try:
r = plpy.execute(q)
plpy.notice('** Query returned with %d rows' % len(r))
except plpy.SPIError:
plpy.notice('** Query failed: "%s"' % q)
plpy.notice('** Error: %s' % plpy.SPIError)
plpy.notice('** Exiting function')
return zip([None], [None], [None], [None])
## collect attributes
attr1_vals = get_attributes(r, 1)
attr2_vals = get_attributes(r, 2)
# create weights
w = get_weight(r, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local_BV(attr1_vals, attr2_vals, w)
plpy.notice("len of Is: %d" % len(lisa.Is))
# find clustering of significance
lisa_sig = lisa_sig_vals(lisa.p_sim, lisa.q, significance)
plpy.notice('** Finished calculations')
return zip(lisa.Is, lisa_sig, lisa.p_sim, w.id_order)
# Low level functions ----------------------------------------
def map_quads(coord):
"""
Map a quadrant number to Moran's I designation
HH=1, LH=2, LL=3, HL=4
Input:
:param coord (int): quadrant of a specific measurement
"""
if coord == 1:
return 'HH'
elif coord == 2:
return 'LH'
elif coord == 3:
return 'LL'
elif coord == 4:
return 'HL'
else:
return None
def query_attr_select(params):
"""
Create portion of SELECT statement for attributes inolved in query.
:param params: dict of information used in query (column names,
table name, etc.)
"""
attrs = [k for k in params
if k not in ('id_col', 'geom_col', 'table', 'num_ngbrs')]
template = "i.\"{%(col)s}\"::numeric As attr%(alias_num)s, "
attr_string = ""
for idx, val in enumerate(sorted(attrs)):
attr_string += template % {"col": val, "alias_num": idx + 1}
return attr_string
def query_attr_where(params):
"""
Create portion of WHERE clauses for weeding out NULL-valued geometries
"""
attrs = sorted([k for k in params
if k not in ('id_col', 'geom_col', 'table', 'num_ngbrs')])
attr_string = []
for attr in attrs:
attr_string.append("idx_replace.\"{%s}\" IS NOT NULL" % attr)
if len(attrs) == 2:
attr_string.append("idx_replace.\"{%s}\" <> 0" % attrs[1])
out = " AND ".join(attr_string)
return out
def knn(params):
"""SQL query for k-nearest neighbors.
:param vars: dict of values to fill template
"""
attr_select = query_attr_select(params)
attr_where = query_attr_where(params)
replacements = {"attr_select": attr_select,
"attr_where_i": attr_where.replace("idx_replace", "i"),
"attr_where_j": attr_where.replace("idx_replace", "j")}
query = "SELECT " \
"i.\"{id_col}\" As id, " \
"%(attr_select)s" \
"(SELECT ARRAY(SELECT j.\"{id_col}\" " \
"FROM \"{table}\" As j " \
"WHERE %(attr_where_j)s " \
"ORDER BY j.\"{geom_col}\" <-> i.\"{geom_col}\" ASC " \
"LIMIT {num_ngbrs} OFFSET 1 ) " \
") As neighbors " \
"FROM \"{table}\" As i " \
"WHERE " \
"%(attr_where_i)s " \
"ORDER BY i.\"{id_col}\" ASC;" % replacements
return query.format(**params)
## SQL query for finding queens neighbors (all contiguous polygons)
def queen(params):
"""SQL query for queen neighbors.
:param params: dict of information to fill query
"""
attr_select = query_attr_select(params)
attr_where = query_attr_where(params)
replacements = {"attr_select": attr_select,
"attr_where_i": attr_where.replace("idx_replace", "i"),
"attr_where_j": attr_where.replace("idx_replace", "j")}
query = "SELECT " \
"i.\"{id_col}\" As id, " \
"%(attr_select)s" \
"(SELECT ARRAY(SELECT j.\"{id_col}\" " \
"FROM \"{table}\" As j " \
"WHERE ST_Touches(i.\"{geom_col}\", j.\"{geom_col}\") AND " \
"%(attr_where_j)s)" \
") As neighbors " \
"FROM \"{table}\" As i " \
"WHERE " \
"%(attr_where_i)s " \
"ORDER BY i.\"{id_col}\" ASC;" % replacements
return query.format(**params)
## to add more weight methods open a ticket or pull request
def get_query(w_type, query_vals):
"""Return requested query.
:param w_type: type of neighbors to calculate (knn or queen)
:param query_vals: values used to construct the query
"""
if w_type == 'knn':
return knn(query_vals)
else:
return queen(query_vals)
def get_attributes(query_res, attr_num):
"""
:param query_res: query results with attributes and neighbors
:param attr_num: attribute number (1, 2, ...)
"""
return np.array([x['attr' + str(attr_num)] for x in query_res], dtype=np.float)
## Build weight object
def get_weight(query_res, w_type='queen', num_ngbrs=5):
"""
Construct PySAL weight from return value of query
:param query_res: query results with attributes and neighbors
"""
if w_type == 'knn':
row_normed_weights = [1.0 / float(num_ngbrs)] * num_ngbrs
weights = {x['id']: row_normed_weights for x in query_res}
elif w_type == 'queen':
weights = {x['id']: [1.0 / len(x['neighbors'])] * len(x['neighbors'])
if len(x['neighbors']) > 0
else [] for x in query_res}
neighbors = {x['id']: x['neighbors'] for x in query_res}
return ps.W(neighbors, weights)
def quad_position(quads):
"""
Produce Moran's I classification based of n
"""
lisa_sig = np.array([map_quads(q) for q in quads])
return lisa_sig
def lisa_sig_vals(pvals, quads, threshold):
"""
Produce Moran's I classification based of n
"""
sig = (pvals <= threshold)
lisa_sig = np.empty(len(sig), np.chararray)
for idx, val in enumerate(sig):
if val:
lisa_sig[idx] = map_quads(quads[idx])
else:
lisa_sig[idx] = 'Not significant'
return lisa_sig

View File

@@ -0,0 +1,10 @@
import random
import numpy
def set_random_seeds(value):
"""
Set the seeds of the RNGs (Random Number Generators)
used internally.
"""
random.seed(value)
numpy.random.seed(value)

View File

@@ -0,0 +1,48 @@
"""
CartoDB Spatial Analysis Python Library
See:
https://github.com/CartoDB/crankshaft
"""
from setuptools import setup, find_packages
setup(
name='crankshaft',
version='0.0.2',
description='CartoDB Spatial Analysis Python Library',
url='https://github.com/CartoDB/crankshaft',
author='Data Services Team - CartoDB',
author_email='dataservices@cartodb.com',
license='MIT',
classifiers=[
'Development Status :: 3 - Alpha',
'Intended Audience :: Mapping comunity',
'Topic :: Maps :: Mapping Tools',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 2.7',
],
keywords='maps mapping tools spatial analysis geostatistics',
packages=find_packages(exclude=['contrib', 'docs', 'tests']),
extras_require={
'dev': ['unittest'],
'test': ['unittest', 'nose', 'mock'],
},
# The choice of component versions is dictated by what's
# provisioned in the production servers.
install_requires=['pysal==1.9.1'],
requires=['pysal', 'numpy' ],
test_suite='test'
)

View File

@@ -0,0 +1,52 @@
[[0.9319096128346788, "HH"],
[-1.135787401862846, "HL"],
[0.11732030672508517, "Not significant"],
[0.6152779669180425, "Not significant"],
[-0.14657336660125297, "Not significant"],
[0.6967858120189607, "Not significant"],
[0.07949310115714454, "Not significant"],
[0.4703198759258987, "Not significant"],
[0.4421125200498064, "Not significant"],
[0.5724288737143592, "Not significant"],
[0.8970743435692062, "LL"],
[0.18327334401918674, "Not significant"],
[-0.01466729201304962, "Not significant"],
[0.3481559372544409, "Not significant"],
[0.06547094736902978, "Not significant"],
[0.15482141569329988, "HH"],
[0.4373841193538136, "Not significant"],
[0.15971286468915544, "Not significant"],
[1.0543588860308968, "Not significant"],
[1.7372866900020818, "HH"],
[1.091998586053999, "LL"],
[0.1171572584252222, "Not significant"],
[0.08438455015300014, "Not significant"],
[0.06547094736902978, "Not significant"],
[0.15482141569329985, "HH"],
[1.1627044812890683, "HH"],
[0.06547094736902978, "Not significant"],
[0.795275137550483, "Not significant"],
[0.18562939195219, "LL"],
[0.3010757406693439, "Not significant"],
[2.8205795942839376, "HH"],
[0.11259190602909264, "Not significant"],
[-0.07116352791516614, "Not significant"],
[-0.09945240794119009, "Not significant"],
[0.18562939195219, "LL"],
[0.1832733440191868, "Not significant"],
[-0.39054253768447705, "Not significant"],
[-0.1672071289487642, "HL"],
[0.3337669247916343, "Not significant"],
[0.2584386102554792, "Not significant"],
[-0.19733845476322634, "HL"],
[-0.9379282899805409, "LH"],
[-0.028770969951095866, "Not significant"],
[0.051367269430983485, "Not significant"],
[-0.2172548045913472, "LH"],
[0.05136726943098351, "Not significant"],
[0.04191046803899837, "Not significant"],
[0.7482357030403517, "HH"],
[-0.014585767863118111, "Not significant"],
[0.5410013139159929, "Not significant"],
[1.0223932668429925, "LL"],
[1.4179402898927476, "LL"]]

View File

@@ -0,0 +1,54 @@
[
{"neighbors": [48, 26, 20, 9, 31], "id": 1, "value": 0.5},
{"neighbors": [30, 16, 46, 3, 4], "id": 2, "value": 0.7},
{"neighbors": [46, 30, 2, 12, 16], "id": 3, "value": 0.2},
{"neighbors": [18, 30, 23, 2, 52], "id": 4, "value": 0.1},
{"neighbors": [47, 40, 45, 37, 28], "id": 5, "value": 0.3},
{"neighbors": [10, 21, 41, 14, 37], "id": 6, "value": 0.05},
{"neighbors": [8, 17, 43, 25, 12], "id": 7, "value": 0.4},
{"neighbors": [17, 25, 43, 22, 7], "id": 8, "value": 0.7},
{"neighbors": [39, 34, 1, 26, 48], "id": 9, "value": 0.5},
{"neighbors": [6, 37, 5, 45, 49], "id": 10, "value": 0.04},
{"neighbors": [51, 41, 29, 21, 14], "id": 11, "value": 0.08},
{"neighbors": [44, 46, 43, 50, 3], "id": 12, "value": 0.2},
{"neighbors": [45, 23, 14, 28, 18], "id": 13, "value": 0.4},
{"neighbors": [41, 29, 13, 23, 6], "id": 14, "value": 0.2},
{"neighbors": [36, 27, 32, 33, 24], "id": 15, "value": 0.3},
{"neighbors": [19, 2, 46, 44, 28], "id": 16, "value": 0.4},
{"neighbors": [8, 25, 43, 7, 22], "id": 17, "value": 0.6},
{"neighbors": [23, 4, 29, 14, 13], "id": 18, "value": 0.3},
{"neighbors": [42, 16, 28, 26, 40], "id": 19, "value": 0.7},
{"neighbors": [1, 48, 31, 26, 42], "id": 20, "value": 0.8},
{"neighbors": [41, 6, 11, 14, 10], "id": 21, "value": 0.1},
{"neighbors": [25, 50, 43, 31, 44], "id": 22, "value": 0.4},
{"neighbors": [18, 13, 14, 4, 2], "id": 23, "value": 0.1},
{"neighbors": [33, 49, 34, 47, 27], "id": 24, "value": 0.3},
{"neighbors": [43, 8, 22, 17, 50], "id": 25, "value": 0.4},
{"neighbors": [1, 42, 20, 31, 48], "id": 26, "value": 0.6},
{"neighbors": [32, 15, 36, 33, 24], "id": 27, "value": 0.3},
{"neighbors": [40, 45, 19, 5, 13], "id": 28, "value": 0.8},
{"neighbors": [11, 51, 41, 14, 18], "id": 29, "value": 0.3},
{"neighbors": [2, 3, 4, 46, 18], "id": 30, "value": 0.1},
{"neighbors": [20, 26, 1, 50, 48], "id": 31, "value": 0.9},
{"neighbors": [27, 36, 15, 49, 24], "id": 32, "value": 0.3},
{"neighbors": [24, 27, 49, 34, 32], "id": 33, "value": 0.4},
{"neighbors": [47, 9, 39, 40, 24], "id": 34, "value": 0.3},
{"neighbors": [38, 51, 11, 21, 41], "id": 35, "value": 0.3},
{"neighbors": [15, 32, 27, 49, 33], "id": 36, "value": 0.2},
{"neighbors": [49, 10, 5, 47, 24], "id": 37, "value": 0.5},
{"neighbors": [35, 21, 51, 11, 41], "id": 38, "value": 0.4},
{"neighbors": [9, 34, 48, 1, 47], "id": 39, "value": 0.6},
{"neighbors": [28, 47, 5, 9, 34], "id": 40, "value": 0.5},
{"neighbors": [11, 14, 29, 21, 6], "id": 41, "value": 0.4},
{"neighbors": [26, 19, 1, 9, 31], "id": 42, "value": 0.2},
{"neighbors": [25, 12, 8, 22, 44], "id": 43, "value": 0.3},
{"neighbors": [12, 50, 46, 16, 43], "id": 44, "value": 0.2},
{"neighbors": [28, 13, 5, 40, 19], "id": 45, "value": 0.3},
{"neighbors": [3, 12, 44, 2, 16], "id": 46, "value": 0.2},
{"neighbors": [34, 40, 5, 49, 24], "id": 47, "value": 0.3},
{"neighbors": [1, 20, 26, 9, 39], "id": 48, "value": 0.5},
{"neighbors": [24, 37, 47, 5, 33], "id": 49, "value": 0.2},
{"neighbors": [44, 22, 31, 42, 26], "id": 50, "value": 0.6},
{"neighbors": [11, 29, 41, 14, 21], "id": 51, "value": 0.01},
{"neighbors": [4, 18, 29, 51, 23], "id": 52, "value": 0.01}
]

View File

@@ -0,0 +1,13 @@
import unittest
from mock_plpy import MockPlPy
plpy = MockPlPy()
import sys
sys.modules['plpy'] = plpy
import os
def fixture_file(name):
dir = os.path.dirname(os.path.realpath(__file__))
return os.path.join(dir, 'fixtures', name)

View File

@@ -0,0 +1,34 @@
import re
class MockPlPy:
def __init__(self):
self._reset()
def _reset(self):
self.infos = []
self.notices = []
self.debugs = []
self.logs = []
self.warnings = []
self.errors = []
self.fatals = []
self.executes = []
self.results = []
self.prepares = []
self.results = []
def _define_result(self, query, result):
pattern = re.compile(query, re.IGNORECASE | re.MULTILINE)
self.results.append([pattern, result])
def notice(self, msg):
self.notices.append(msg)
def info(self, msg):
self.infos.append(msg)
def execute(self, query): # TODO: additional arguments
for result in self.results:
if result[0].match(query):
return result[1]
return []

View File

@@ -0,0 +1,144 @@
import unittest
import numpy as np
import unittest
# from mock_plpy import MockPlPy
# plpy = MockPlPy()
#
# import sys
# sys.modules['plpy'] = plpy
from helper import plpy, fixture_file
import crankshaft.clustering as cc
from crankshaft import random_seeds
import json
class MoranTest(unittest.TestCase):
"""Testing class for Moran's I functions."""
def setUp(self):
plpy._reset()
self.params = {"id_col": "cartodb_id",
"attr1": "andy",
"attr2": "jay_z",
"table": "a_list",
"geom_col": "the_geom",
"num_ngbrs": 321}
self.neighbors_data = json.loads(open(fixture_file('neighbors.json')).read())
self.moran_data = json.loads(open(fixture_file('moran.json')).read())
def test_map_quads(self):
"""Test map_quads."""
self.assertEqual(cc.map_quads(1), 'HH')
self.assertEqual(cc.map_quads(2), 'LH')
self.assertEqual(cc.map_quads(3), 'LL')
self.assertEqual(cc.map_quads(4), 'HL')
self.assertEqual(cc.map_quads(33), None)
self.assertEqual(cc.map_quads('andy'), None)
def test_query_attr_select(self):
"""Test query_attr_select."""
ans = "i.\"{attr1}\"::numeric As attr1, " \
"i.\"{attr2}\"::numeric As attr2, "
self.assertEqual(cc.query_attr_select(self.params), ans)
def test_query_attr_where(self):
"""Test query_attr_where."""
ans = "idx_replace.\"{attr1}\" IS NOT NULL AND "\
"idx_replace.\"{attr2}\" IS NOT NULL AND "\
"idx_replace.\"{attr2}\" <> 0"
self.assertEqual(cc.query_attr_where(self.params), ans)
def test_knn(self):
"""Test knn function."""
ans = "SELECT i.\"cartodb_id\" As id, i.\"andy\"::numeric As attr1, " \
"i.\"jay_z\"::numeric As attr2, (SELECT ARRAY(SELECT j.\"cartodb_id\" " \
"FROM \"a_list\" As j WHERE j.\"andy\" IS NOT NULL AND " \
"j.\"jay_z\" IS NOT NULL AND j.\"jay_z\" <> 0 ORDER BY " \
"j.\"the_geom\" <-> i.\"the_geom\" ASC LIMIT 321 OFFSET 1 ) ) " \
"As neighbors FROM \"a_list\" As i WHERE i.\"andy\" IS NOT " \
"NULL AND i.\"jay_z\" IS NOT NULL AND i.\"jay_z\" <> 0 ORDER " \
"BY i.\"cartodb_id\" ASC;"
self.assertEqual(cc.knn(self.params), ans)
def test_queen(self):
"""Test queen neighbors function."""
ans = "SELECT i.\"cartodb_id\" As id, i.\"andy\"::numeric As attr1, " \
"i.\"jay_z\"::numeric As attr2, (SELECT ARRAY(SELECT " \
"j.\"cartodb_id\" FROM \"a_list\" As j WHERE ST_Touches(" \
"i.\"the_geom\", j.\"the_geom\") AND j.\"andy\" IS NOT NULL " \
"AND j.\"jay_z\" IS NOT NULL AND j.\"jay_z\" <> 0)) As " \
"neighbors FROM \"a_list\" As i WHERE i.\"andy\" IS NOT NULL " \
"AND i.\"jay_z\" IS NOT NULL AND i.\"jay_z\" <> 0 ORDER BY " \
"i.\"cartodb_id\" ASC;"
self.assertEqual(cc.queen(self.params), ans)
def test_get_query(self):
"""Test get_query."""
ans = "SELECT i.\"cartodb_id\" As id, i.\"andy\"::numeric As attr1, " \
"i.\"jay_z\"::numeric As attr2, (SELECT ARRAY(SELECT " \
"j.\"cartodb_id\" FROM \"a_list\" As j WHERE j.\"andy\" IS " \
"NOT NULL AND j.\"jay_z\" IS NOT NULL AND j.\"jay_z\" <> 0 " \
"ORDER BY j.\"the_geom\" <-> i.\"the_geom\" ASC LIMIT 321 " \
"OFFSET 1 ) ) As neighbors FROM \"a_list\" As i WHERE " \
"i.\"andy\" IS NOT NULL AND i.\"jay_z\" IS NOT NULL AND " \
"i.\"jay_z\" <> 0 ORDER BY i.\"cartodb_id\" ASC;"
self.assertEqual(cc.get_query('knn', self.params), ans)
def test_get_attributes(self):
"""Test get_attributes."""
## need to add tests
self.assertEqual(True, True)
def test_get_weight(self):
"""Test get_weight."""
self.assertEqual(True, True)
def test_quad_position(self):
"""Test lisa_sig_vals."""
quads = np.array([1, 2, 3, 4], np.int)
ans = np.array(['HH', 'LH', 'LL', 'HL'])
test_ans = cc.quad_position(quads)
self.assertTrue((test_ans == ans).all())
def test_moran_local(self):
"""Test Moran's I local"""
data = [ { 'id': d['id'], 'attr1': d['value'], 'neighbors': d['neighbors'] } for d in self.neighbors_data]
plpy._define_result('select', data)
random_seeds.set_random_seeds(1234)
result = cc.moran_local('table', 'value', 0.05, 5, 99, 'the_geom', 'cartodb_id', 'knn')
result = [(row[0], row[1]) for row in result]
expected = self.moran_data
for ([res_val, res_quad], [exp_val, exp_quad]) in zip(result, expected):
self.assertAlmostEqual(res_val, exp_val)
self.assertEqual(res_quad, exp_quad)
def test_moran_local_rate(self):
"""Test Moran's I rate"""
data = [ { 'id': d['id'], 'attr1': d['value'], 'attr2': 1, 'neighbors': d['neighbors'] } for d in self.neighbors_data]
plpy._define_result('select', data)
random_seeds.set_random_seeds(1234)
result = cc.moran_local_rate('table', 'numerator', 'denominator', 0.05, 5, 99, 'the_geom', 'cartodb_id', 'knn')
result = [(row[0], row[1]) for row in result]
expected = self.moran_data
for ([res_val, res_quad], [exp_val, exp_quad]) in zip(result, expected):
self.assertAlmostEqual(res_val, exp_val)

View File

@@ -0,0 +1,2 @@
import random_seeds
import clustering

View File

@@ -0,0 +1,2 @@
from moran import *
from kmeans import *

View File

@@ -0,0 +1,18 @@
from sklearn.cluster import KMeans
import plpy
def kmeans(query, no_clusters, no_init=20):
data = plpy.execute('''select array_agg(cartodb_id order by cartodb_id) as ids,
array_agg(ST_X(the_geom) order by cartodb_id) xs,
array_agg(ST_Y(the_geom) order by cartodb_id) ys from ({query}) a
where the_geom is not null
'''.format(query=query))
xs = data[0]['xs']
ys = data[0]['ys']
ids = data[0]['ids']
km = KMeans(n_clusters= no_clusters, n_init=no_init)
labels = km.fit_predict(zip(xs,ys))
return zip(ids,labels)

View File

@@ -0,0 +1,260 @@
"""
Moran's I geostatistics (global clustering & outliers presence)
"""
# TODO: Fill in local neighbors which have null/NoneType values with the
# average of the their neighborhood
import pysal as ps
import plpy
# crankshaft module
import crankshaft.pysal_utils as pu
# High level interface ---------------------------------------
def moran(subquery, attr_name,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I (global)
Implementation building neighbors with a PostGIS database and Moran's I
core clusters with PySAL.
Andy Eschbacher
"""
qvals = {"id_col": id_col,
"attr1": attr_name,
"geom_col": geom_col,
"subquery": subquery,
"num_ngbrs": num_ngbrs}
query = pu.construct_neighbor_query(w_type, qvals)
plpy.notice('** Query: %s' % query)
try:
result = plpy.execute(query)
# if there are no neighbors, exit
if len(result) == 0:
return pu.empty_zipped_array(2)
plpy.notice('** Query returned with %d rows' % len(result))
except plpy.SPIError:
plpy.error('Error: areas of interest query failed, check input parameters')
plpy.notice('** Query failed: "%s"' % query)
plpy.notice('** Error: %s' % plpy.SPIError)
return pu.empty_zipped_array(2)
## collect attributes
attr_vals = pu.get_attributes(result)
## calculate weights
weight = pu.get_weight(result, w_type, num_ngbrs)
## calculate moran global
moran_global = ps.esda.moran.Moran(attr_vals, weight,
permutations=permutations)
return zip([moran_global.I], [moran_global.EI])
def moran_local(subquery, attr,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I implementation for PL/Python
Andy Eschbacher
"""
# geometries with attributes that are null are ignored
# resulting in a collection of not as near neighbors
qvals = {"id_col": id_col,
"attr1": attr,
"geom_col": geom_col,
"subquery": subquery,
"num_ngbrs": num_ngbrs}
query = pu.construct_neighbor_query(w_type, qvals)
try:
result = plpy.execute(query)
# if there are no neighbors, exit
if len(result) == 0:
return pu.empty_zipped_array(5)
except plpy.SPIError:
plpy.error('Error: areas of interest query failed, check input parameters')
plpy.notice('** Query failed: "%s"' % query)
return pu.empty_zipped_array(5)
attr_vals = pu.get_attributes(result)
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local(attr_vals, weight,
permutations=permutations)
# find quadrants for each geometry
quads = quad_position(lisa.q)
return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)
def moran_rate(subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I Rate (global)
Andy Eschbacher
"""
qvals = {"id_col": id_col,
"attr1": numerator,
"attr2": denominator,
"geom_col": geom_col,
"subquery": subquery,
"num_ngbrs": num_ngbrs}
query = pu.construct_neighbor_query(w_type, qvals)
plpy.notice('** Query: %s' % query)
try:
result = plpy.execute(query)
# if there are no neighbors, exit
if len(result) == 0:
return pu.empty_zipped_array(2)
plpy.notice('** Query returned with %d rows' % len(result))
except plpy.SPIError:
plpy.error('Error: areas of interest query failed, check input parameters')
plpy.notice('** Query failed: "%s"' % query)
plpy.notice('** Error: %s' % plpy.SPIError)
return pu.empty_zipped_array(2)
## collect attributes
numer = pu.get_attributes(result, 1)
denom = pu.get_attributes(result, 2)
weight = pu.get_weight(result, w_type, num_ngbrs)
## calculate moran global rate
lisa_rate = ps.esda.moran.Moran_Rate(numer, denom, weight,
permutations=permutations)
return zip([lisa_rate.I], [lisa_rate.EI])
def moran_local_rate(subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I Local Rate
Andy Eschbacher
"""
# geometries with values that are null are ignored
# resulting in a collection of not as near neighbors
query = pu.construct_neighbor_query(w_type,
{"id_col": id_col,
"numerator": numerator,
"denominator": denominator,
"geom_col": geom_col,
"subquery": subquery,
"num_ngbrs": num_ngbrs})
try:
result = plpy.execute(query)
# if there are no neighbors, exit
if len(result) == 0:
return pu.empty_zipped_array(5)
except plpy.SPIError:
plpy.error('Error: areas of interest query failed, check input parameters')
plpy.notice('** Query failed: "%s"' % query)
plpy.notice('** Error: %s' % plpy.SPIError)
return pu.empty_zipped_array(5)
## collect attributes
numer = pu.get_attributes(result, 1)
denom = pu.get_attributes(result, 2)
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local_Rate(numer, denom, weight,
permutations=permutations)
# find units of significance
quads = quad_position(lisa.q)
return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)
def moran_local_bv(subquery, attr1, attr2,
permutations, geom_col, id_col, w_type, num_ngbrs):
"""
Moran's I (local) Bivariate (untested)
"""
plpy.notice('** Constructing query')
qvals = {"num_ngbrs": num_ngbrs,
"attr1": attr1,
"attr2": attr2,
"subquery": subquery,
"geom_col": geom_col,
"id_col": id_col}
query = pu.construct_neighbor_query(w_type, qvals)
try:
result = plpy.execute(query)
# if there are no neighbors, exit
if len(result) == 0:
return pu.empty_zipped_array(4)
except plpy.SPIError:
plpy.error("Error: areas of interest query failed, " \
"check input parameters")
plpy.notice('** Query failed: "%s"' % query)
return pu.empty_zipped_array(4)
## collect attributes
attr1_vals = pu.get_attributes(result, 1)
attr2_vals = pu.get_attributes(result, 2)
# create weights
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local_BV(attr1_vals, attr2_vals, weight,
permutations=permutations)
plpy.notice("len of Is: %d" % len(lisa.Is))
# find clustering of significance
lisa_sig = quad_position(lisa.q)
plpy.notice('** Finished calculations')
return zip(lisa.Is, lisa_sig, lisa.p_sim, weight.id_order)
# Low level functions ----------------------------------------
def map_quads(coord):
"""
Map a quadrant number to Moran's I designation
HH=1, LH=2, LL=3, HL=4
Input:
@param coord (int): quadrant of a specific measurement
Output:
classification (one of 'HH', 'LH', 'LL', or 'HL')
"""
if coord == 1:
return 'HH'
elif coord == 2:
return 'LH'
elif coord == 3:
return 'LL'
elif coord == 4:
return 'HL'
else:
return None
def quad_position(quads):
"""
Produce Moran's I classification based of n
Input:
@param quads ndarray: an array of quads classified by
1-4 (PySAL default)
Output:
@param list: an array of quads classied by 'HH', 'LL', etc.
"""
return [map_quads(q) for q in quads]

View File

@@ -0,0 +1 @@
from pysal_utils import *

View File

@@ -0,0 +1,152 @@
"""
Utilities module for generic PySAL functionality, mainly centered on translating queries into numpy arrays or PySAL weights objects
"""
import numpy as np
import pysal as ps
def construct_neighbor_query(w_type, query_vals):
"""Return query (a string) used for finding neighbors
@param w_type text: type of neighbors to calculate ('knn' or 'queen')
@param query_vals dict: values used to construct the query
"""
if w_type.lower() == 'knn':
return knn(query_vals)
else:
return queen(query_vals)
## Build weight object
def get_weight(query_res, w_type='knn', num_ngbrs=5):
"""
Construct PySAL weight from return value of query
@param query_res: query results with attributes and neighbors
"""
if w_type.lower() == 'knn':
row_normed_weights = [1.0 / float(num_ngbrs)] * num_ngbrs
weights = {x['id']: row_normed_weights for x in query_res}
else:
weights = {x['id']: [1.0 / len(x['neighbors'])] * len(x['neighbors'])
if len(x['neighbors']) > 0
else [] for x in query_res}
neighbors = {x['id']: x['neighbors'] for x in query_res}
return ps.W(neighbors, weights)
def query_attr_select(params):
"""
Create portion of SELECT statement for attributes inolved in query.
@param params: dict of information used in query (column names,
table name, etc.)
"""
attrs = [k for k in params
if k not in ('id_col', 'geom_col', 'subquery', 'num_ngbrs')]
template = "i.\"{%(col)s}\"::numeric As attr%(alias_num)s, "
attr_string = ""
for idx, val in enumerate(sorted(attrs)):
attr_string += template % {"col": val, "alias_num": idx + 1}
return attr_string
def query_attr_where(params):
"""
Create portion of WHERE clauses for weeding out NULL-valued geometries
"""
attrs = sorted([k for k in params
if k not in ('id_col', 'geom_col', 'subquery', 'num_ngbrs')])
attr_string = []
for attr in attrs:
attr_string.append("idx_replace.\"{%s}\" IS NOT NULL" % attr)
if len(attrs) == 2:
attr_string.append("idx_replace.\"{%s}\" <> 0" % attrs[1])
out = " AND ".join(attr_string)
return out
def knn(params):
"""SQL query for k-nearest neighbors.
@param vars: dict of values to fill template
"""
attr_select = query_attr_select(params)
attr_where = query_attr_where(params)
replacements = {"attr_select": attr_select,
"attr_where_i": attr_where.replace("idx_replace", "i"),
"attr_where_j": attr_where.replace("idx_replace", "j")}
query = "SELECT " \
"i.\"{id_col}\" As id, " \
"%(attr_select)s" \
"(SELECT ARRAY(SELECT j.\"{id_col}\" " \
"FROM ({subquery}) As j " \
"WHERE " \
"i.\"{id_col}\" <> j.\"{id_col}\" AND " \
"%(attr_where_j)s " \
"ORDER BY " \
"j.\"{geom_col}\" <-> i.\"{geom_col}\" ASC " \
"LIMIT {num_ngbrs})" \
") As neighbors " \
"FROM ({subquery}) As i " \
"WHERE " \
"%(attr_where_i)s " \
"ORDER BY i.\"{id_col}\" ASC;" % replacements
return query.format(**params)
## SQL query for finding queens neighbors (all contiguous polygons)
def queen(params):
"""SQL query for queen neighbors.
@param params dict: information to fill query
"""
attr_select = query_attr_select(params)
attr_where = query_attr_where(params)
replacements = {"attr_select": attr_select,
"attr_where_i": attr_where.replace("idx_replace", "i"),
"attr_where_j": attr_where.replace("idx_replace", "j")}
query = "SELECT " \
"i.\"{id_col}\" As id, " \
"%(attr_select)s" \
"(SELECT ARRAY(SELECT j.\"{id_col}\" " \
"FROM ({subquery}) As j " \
"WHERE i.\"{id_col}\" <> j.\"{id_col}\" AND " \
"ST_Touches(i.\"{geom_col}\", j.\"{geom_col}\") AND " \
"%(attr_where_j)s)" \
") As neighbors " \
"FROM ({subquery}) As i " \
"WHERE " \
"%(attr_where_i)s " \
"ORDER BY i.\"{id_col}\" ASC;" % replacements
return query.format(**params)
## to add more weight methods open a ticket or pull request
def get_attributes(query_res, attr_num=1):
"""
@param query_res: query results with attributes and neighbors
@param attr_num: attribute number (1, 2, ...)
"""
return np.array([x['attr' + str(attr_num)] for x in query_res], dtype=np.float)
def empty_zipped_array(num_nones):
"""
prepare return values for cases of empty weights objects (no neighbors)
Input:
@param num_nones int: number of columns (e.g., 4)
Output:
[(None, None, None, None)]
"""
return [tuple([None] * num_nones)]

View File

@@ -0,0 +1,10 @@
import random
import numpy
def set_random_seeds(value):
"""
Set the seeds of the RNGs (Random Number Generators)
used internally.
"""
random.seed(value)
numpy.random.seed(value)

View File

@@ -0,0 +1,48 @@
"""
CartoDB Spatial Analysis Python Library
See:
https://github.com/CartoDB/crankshaft
"""
from setuptools import setup, find_packages
setup(
name='crankshaft',
version='0.0.3',
description='CartoDB Spatial Analysis Python Library',
url='https://github.com/CartoDB/crankshaft',
author='Data Services Team - CartoDB',
author_email='dataservices@cartodb.com',
license='MIT',
classifiers=[
'Development Status :: 3 - Alpha',
'Intended Audience :: Mapping comunity',
'Topic :: Maps :: Mapping Tools',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 2.7',
],
keywords='maps mapping tools spatial analysis geostatistics',
packages=find_packages(exclude=['contrib', 'docs', 'tests']),
extras_require={
'dev': ['unittest'],
'test': ['unittest', 'nose', 'mock'],
},
# The choice of component versions is dictated by what's
# provisioned in the production servers.
install_requires=['pysal==1.9.1', 'scikit-learn==0.17.1'],
requires=['pysal', 'numpy', 'sklearn'],
test_suite='test'
)

View File

@@ -0,0 +1 @@
[{"xs": [9.917239463463458, 9.042767302696836, 10.798929825304187, 8.763751051762995, 11.383882954810852, 11.018206993460897, 8.939526075734316, 9.636159342565252, 10.136336896960058, 11.480610059427342, 12.115011910725082, 9.173267848893428, 10.239300931201738, 8.00012512174072, 8.979962292282131, 9.318376124429575, 10.82259513754284, 10.391747171927115, 10.04904588886165, 9.96007160443463, -0.78825626804569, -0.3511819898577426, -1.2796410003764271, -0.3977049391203402, 2.4792311265774667, 1.3670311632092624, 1.2963504112955613, 2.0404844103073025, -1.6439708506073223, 0.39122885445645805, 1.026031821452462, -0.04044477160482201, -0.7442346929085072, -0.34687120826243034, -0.23420359971379054, -0.5919629143336708, -0.202903054395391, -0.1893399644841902, 1.9331834251176807, -0.12321054392851609], "ys": [8.735627063679981, 9.857615954045011, 10.81439096759407, 10.586727233537191, 9.232919976568622, 11.54281262696508, 8.392787912674466, 9.355119689665944, 9.22380703532752, 10.542142541823122, 10.111980619367035, 10.760836265570738, 8.819773453269804, 10.25325722424816, 9.802077905695608, 8.955420161552611, 9.833801181904477, 10.491684241001613, 12.076108669877556, 11.74289693140474, -0.5685725015474191, -0.5715728344759778, -0.20180907868635137, 0.38431336480089595, -0.3402202083684184, -2.4652736827783586, 0.08295159401756182, 0.8503818775816505, 0.6488691600321166, 0.5794762568230527, -0.6770063922144103, -0.6557616416449478, -1.2834289177624947, 0.1096318195532717, -0.38986922166834853, -1.6224497706950238, 0.09429787743230483, 0.4005097316394031, -0.508002811195673, -1.2473463371366507], "ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]}]

View File

@@ -0,0 +1,52 @@
[[0.9319096128346788, "HH"],
[-1.135787401862846, "HL"],
[0.11732030672508517, "LL"],
[0.6152779669180425, "LL"],
[-0.14657336660125297, "LH"],
[0.6967858120189607, "LL"],
[0.07949310115714454, "HH"],
[0.4703198759258987, "HH"],
[0.4421125200498064, "HH"],
[0.5724288737143592, "LL"],
[0.8970743435692062, "LL"],
[0.18327334401918674, "LL"],
[-0.01466729201304962, "HL"],
[0.3481559372544409, "LL"],
[0.06547094736902978, "LL"],
[0.15482141569329988, "HH"],
[0.4373841193538136, "HH"],
[0.15971286468915544, "LL"],
[1.0543588860308968, "HH"],
[1.7372866900020818, "HH"],
[1.091998586053999, "LL"],
[0.1171572584252222, "HH"],
[0.08438455015300014, "LL"],
[0.06547094736902978, "LL"],
[0.15482141569329985, "HH"],
[1.1627044812890683, "HH"],
[0.06547094736902978, "LL"],
[0.795275137550483, "HH"],
[0.18562939195219, "LL"],
[0.3010757406693439, "LL"],
[2.8205795942839376, "HH"],
[0.11259190602909264, "LL"],
[-0.07116352791516614, "HL"],
[-0.09945240794119009, "LH"],
[0.18562939195219, "LL"],
[0.1832733440191868, "LL"],
[-0.39054253768447705, "HL"],
[-0.1672071289487642, "HL"],
[0.3337669247916343, "HH"],
[0.2584386102554792, "HH"],
[-0.19733845476322634, "HL"],
[-0.9379282899805409, "LH"],
[-0.028770969951095866, "LH"],
[0.051367269430983485, "LL"],
[-0.2172548045913472, "LH"],
[0.05136726943098351, "LL"],
[0.04191046803899837, "LL"],
[0.7482357030403517, "HH"],
[-0.014585767863118111, "LH"],
[0.5410013139159929, "HH"],
[1.0223932668429925, "LL"],
[1.4179402898927476, "LL"]]

View File

@@ -0,0 +1,54 @@
[
{"neighbors": [48, 26, 20, 9, 31], "id": 1, "value": 0.5},
{"neighbors": [30, 16, 46, 3, 4], "id": 2, "value": 0.7},
{"neighbors": [46, 30, 2, 12, 16], "id": 3, "value": 0.2},
{"neighbors": [18, 30, 23, 2, 52], "id": 4, "value": 0.1},
{"neighbors": [47, 40, 45, 37, 28], "id": 5, "value": 0.3},
{"neighbors": [10, 21, 41, 14, 37], "id": 6, "value": 0.05},
{"neighbors": [8, 17, 43, 25, 12], "id": 7, "value": 0.4},
{"neighbors": [17, 25, 43, 22, 7], "id": 8, "value": 0.7},
{"neighbors": [39, 34, 1, 26, 48], "id": 9, "value": 0.5},
{"neighbors": [6, 37, 5, 45, 49], "id": 10, "value": 0.04},
{"neighbors": [51, 41, 29, 21, 14], "id": 11, "value": 0.08},
{"neighbors": [44, 46, 43, 50, 3], "id": 12, "value": 0.2},
{"neighbors": [45, 23, 14, 28, 18], "id": 13, "value": 0.4},
{"neighbors": [41, 29, 13, 23, 6], "id": 14, "value": 0.2},
{"neighbors": [36, 27, 32, 33, 24], "id": 15, "value": 0.3},
{"neighbors": [19, 2, 46, 44, 28], "id": 16, "value": 0.4},
{"neighbors": [8, 25, 43, 7, 22], "id": 17, "value": 0.6},
{"neighbors": [23, 4, 29, 14, 13], "id": 18, "value": 0.3},
{"neighbors": [42, 16, 28, 26, 40], "id": 19, "value": 0.7},
{"neighbors": [1, 48, 31, 26, 42], "id": 20, "value": 0.8},
{"neighbors": [41, 6, 11, 14, 10], "id": 21, "value": 0.1},
{"neighbors": [25, 50, 43, 31, 44], "id": 22, "value": 0.4},
{"neighbors": [18, 13, 14, 4, 2], "id": 23, "value": 0.1},
{"neighbors": [33, 49, 34, 47, 27], "id": 24, "value": 0.3},
{"neighbors": [43, 8, 22, 17, 50], "id": 25, "value": 0.4},
{"neighbors": [1, 42, 20, 31, 48], "id": 26, "value": 0.6},
{"neighbors": [32, 15, 36, 33, 24], "id": 27, "value": 0.3},
{"neighbors": [40, 45, 19, 5, 13], "id": 28, "value": 0.8},
{"neighbors": [11, 51, 41, 14, 18], "id": 29, "value": 0.3},
{"neighbors": [2, 3, 4, 46, 18], "id": 30, "value": 0.1},
{"neighbors": [20, 26, 1, 50, 48], "id": 31, "value": 0.9},
{"neighbors": [27, 36, 15, 49, 24], "id": 32, "value": 0.3},
{"neighbors": [24, 27, 49, 34, 32], "id": 33, "value": 0.4},
{"neighbors": [47, 9, 39, 40, 24], "id": 34, "value": 0.3},
{"neighbors": [38, 51, 11, 21, 41], "id": 35, "value": 0.3},
{"neighbors": [15, 32, 27, 49, 33], "id": 36, "value": 0.2},
{"neighbors": [49, 10, 5, 47, 24], "id": 37, "value": 0.5},
{"neighbors": [35, 21, 51, 11, 41], "id": 38, "value": 0.4},
{"neighbors": [9, 34, 48, 1, 47], "id": 39, "value": 0.6},
{"neighbors": [28, 47, 5, 9, 34], "id": 40, "value": 0.5},
{"neighbors": [11, 14, 29, 21, 6], "id": 41, "value": 0.4},
{"neighbors": [26, 19, 1, 9, 31], "id": 42, "value": 0.2},
{"neighbors": [25, 12, 8, 22, 44], "id": 43, "value": 0.3},
{"neighbors": [12, 50, 46, 16, 43], "id": 44, "value": 0.2},
{"neighbors": [28, 13, 5, 40, 19], "id": 45, "value": 0.3},
{"neighbors": [3, 12, 44, 2, 16], "id": 46, "value": 0.2},
{"neighbors": [34, 40, 5, 49, 24], "id": 47, "value": 0.3},
{"neighbors": [1, 20, 26, 9, 39], "id": 48, "value": 0.5},
{"neighbors": [24, 37, 47, 5, 33], "id": 49, "value": 0.2},
{"neighbors": [44, 22, 31, 42, 26], "id": 50, "value": 0.6},
{"neighbors": [11, 29, 41, 14, 21], "id": 51, "value": 0.01},
{"neighbors": [4, 18, 29, 51, 23], "id": 52, "value": 0.01}
]

View File

@@ -0,0 +1,13 @@
import unittest
from mock_plpy import MockPlPy
plpy = MockPlPy()
import sys
sys.modules['plpy'] = plpy
import os
def fixture_file(name):
dir = os.path.dirname(os.path.realpath(__file__))
return os.path.join(dir, 'fixtures', name)

View File

@@ -0,0 +1,34 @@
import re
class MockPlPy:
def __init__(self):
self._reset()
def _reset(self):
self.infos = []
self.notices = []
self.debugs = []
self.logs = []
self.warnings = []
self.errors = []
self.fatals = []
self.executes = []
self.results = []
self.prepares = []
self.results = []
def _define_result(self, query, result):
pattern = re.compile(query, re.IGNORECASE | re.MULTILINE)
self.results.append([pattern, result])
def notice(self, msg):
self.notices.append(msg)
def info(self, msg):
self.infos.append(msg)
def execute(self, query): # TODO: additional arguments
for result in self.results:
if result[0].match(query):
return result[1]
return []

View File

@@ -0,0 +1,38 @@
import unittest
import numpy as np
# from mock_plpy import MockPlPy
# plpy = MockPlPy()
#
# import sys
# sys.modules['plpy'] = plpy
from helper import plpy, fixture_file
import numpy as np
import crankshaft.clustering as cc
import crankshaft.pysal_utils as pu
from crankshaft import random_seeds
import json
class KMeansTest(unittest.TestCase):
"""Testing class for Moran's I functions"""
def setUp(self):
plpy._reset()
self.cluster_data = json.loads(open(fixture_file('kmeans.json')).read())
self.params = {"subquery": "select * from table",
"no_clusters": "10"
}
def test_kmeans(self):
data = self.cluster_data
plpy._define_result('select' ,data)
clusters = cc.kmeans('subquery', 2)
labels = [a[1] for a in clusters]
c1 = [a for a in clusters if a[1]==0]
c2 = [a for a in clusters if a[1]==1]
self.assertEqual(len(np.unique(labels)),2)
self.assertEqual(len(c1),20)
self.assertEqual(len(c2),20)

View File

@@ -0,0 +1,83 @@
import unittest
import numpy as np
# from mock_plpy import MockPlPy
# plpy = MockPlPy()
#
# import sys
# sys.modules['plpy'] = plpy
from helper import plpy, fixture_file
import crankshaft.clustering as cc
import crankshaft.pysal_utils as pu
from crankshaft import random_seeds
import json
class MoranTest(unittest.TestCase):
"""Testing class for Moran's I functions"""
def setUp(self):
plpy._reset()
self.params = {"id_col": "cartodb_id",
"attr1": "andy",
"attr2": "jay_z",
"subquery": "SELECT * FROM a_list",
"geom_col": "the_geom",
"num_ngbrs": 321}
self.neighbors_data = json.loads(open(fixture_file('neighbors.json')).read())
self.moran_data = json.loads(open(fixture_file('moran.json')).read())
def test_map_quads(self):
"""Test map_quads"""
self.assertEqual(cc.map_quads(1), 'HH')
self.assertEqual(cc.map_quads(2), 'LH')
self.assertEqual(cc.map_quads(3), 'LL')
self.assertEqual(cc.map_quads(4), 'HL')
self.assertEqual(cc.map_quads(33), None)
self.assertEqual(cc.map_quads('andy'), None)
def test_quad_position(self):
"""Test lisa_sig_vals"""
quads = np.array([1, 2, 3, 4], np.int)
ans = np.array(['HH', 'LH', 'LL', 'HL'])
test_ans = cc.quad_position(quads)
self.assertTrue((test_ans == ans).all())
def test_moran_local(self):
"""Test Moran's I local"""
data = [ { 'id': d['id'], 'attr1': d['value'], 'neighbors': d['neighbors'] } for d in self.neighbors_data]
plpy._define_result('select', data)
random_seeds.set_random_seeds(1234)
result = cc.moran_local('subquery', 'value', 'knn', 5, 99, 'the_geom', 'cartodb_id')
result = [(row[0], row[1]) for row in result]
expected = self.moran_data
for ([res_val, res_quad], [exp_val, exp_quad]) in zip(result, expected):
self.assertAlmostEqual(res_val, exp_val)
self.assertEqual(res_quad, exp_quad)
def test_moran_local_rate(self):
"""Test Moran's I rate"""
data = [ { 'id': d['id'], 'attr1': d['value'], 'attr2': 1, 'neighbors': d['neighbors'] } for d in self.neighbors_data]
plpy._define_result('select', data)
random_seeds.set_random_seeds(1234)
result = cc.moran_local_rate('subquery', 'numerator', 'denominator', 'knn', 5, 99, 'the_geom', 'cartodb_id')
print 'result == None? ', result == None
result = [(row[0], row[1]) for row in result]
expected = self.moran_data
for ([res_val, res_quad], [exp_val, exp_quad]) in zip(result, expected):
self.assertAlmostEqual(res_val, exp_val)
def test_moran(self):
"""Test Moran's I global"""
data = [{ 'id': d['id'], 'attr1': d['value'], 'neighbors': d['neighbors'] } for d in self.neighbors_data]
plpy._define_result('select', data)
random_seeds.set_random_seeds(1235)
result = cc.moran('table', 'value', 'knn', 5, 99, 'the_geom', 'cartodb_id')
print 'result == None?', result == None
result_moran = result[0][0]
expected_moran = np.array([row[0] for row in self.moran_data]).mean()
self.assertAlmostEqual(expected_moran, result_moran, delta=10e-2)

View File

@@ -0,0 +1,107 @@
import unittest
import crankshaft.pysal_utils as pu
from crankshaft import random_seeds
class PysalUtilsTest(unittest.TestCase):
"""Testing class for utility functions related to PySAL integrations"""
def setUp(self):
self.params = {"id_col": "cartodb_id",
"attr1": "andy",
"attr2": "jay_z",
"subquery": "SELECT * FROM a_list",
"geom_col": "the_geom",
"num_ngbrs": 321}
def test_query_attr_select(self):
"""Test query_attr_select"""
ans = "i.\"{attr1}\"::numeric As attr1, " \
"i.\"{attr2}\"::numeric As attr2, "
self.assertEqual(pu.query_attr_select(self.params), ans)
def test_query_attr_where(self):
"""Test pu.query_attr_where"""
ans = "idx_replace.\"{attr1}\" IS NOT NULL AND " \
"idx_replace.\"{attr2}\" IS NOT NULL AND " \
"idx_replace.\"{attr2}\" <> 0"
self.assertEqual(pu.query_attr_where(self.params), ans)
def test_knn(self):
"""Test knn neighbors constructor"""
ans = "SELECT i.\"cartodb_id\" As id, " \
"i.\"andy\"::numeric As attr1, " \
"i.\"jay_z\"::numeric As attr2, " \
"(SELECT ARRAY(SELECT j.\"cartodb_id\" " \
"FROM (SELECT * FROM a_list) As j " \
"WHERE " \
"i.\"cartodb_id\" <> j.\"cartodb_id\" AND " \
"j.\"andy\" IS NOT NULL AND " \
"j.\"jay_z\" IS NOT NULL AND " \
"j.\"jay_z\" <> 0 " \
"ORDER BY " \
"j.\"the_geom\" <-> i.\"the_geom\" ASC " \
"LIMIT 321)) As neighbors " \
"FROM (SELECT * FROM a_list) As i " \
"WHERE i.\"andy\" IS NOT NULL AND " \
"i.\"jay_z\" IS NOT NULL AND " \
"i.\"jay_z\" <> 0 " \
"ORDER BY i.\"cartodb_id\" ASC;"
self.assertEqual(pu.knn(self.params), ans)
def test_queen(self):
"""Test queen neighbors constructor"""
ans = "SELECT i.\"cartodb_id\" As id, " \
"i.\"andy\"::numeric As attr1, " \
"i.\"jay_z\"::numeric As attr2, " \
"(SELECT ARRAY(SELECT j.\"cartodb_id\" " \
"FROM (SELECT * FROM a_list) As j " \
"WHERE " \
"i.\"cartodb_id\" <> j.\"cartodb_id\" AND " \
"ST_Touches(i.\"the_geom\", " \
"j.\"the_geom\") AND " \
"j.\"andy\" IS NOT NULL AND " \
"j.\"jay_z\" IS NOT NULL AND " \
"j.\"jay_z\" <> 0)" \
") As neighbors " \
"FROM (SELECT * FROM a_list) As i " \
"WHERE i.\"andy\" IS NOT NULL AND " \
"i.\"jay_z\" IS NOT NULL AND " \
"i.\"jay_z\" <> 0 " \
"ORDER BY i.\"cartodb_id\" ASC;"
self.assertEqual(pu.queen(self.params), ans)
def test_construct_neighbor_query(self):
"""Test construct_neighbor_query"""
# Compare to raw knn query
self.assertEqual(pu.construct_neighbor_query('knn', self.params),
pu.knn(self.params))
def test_get_attributes(self):
"""Test get_attributes"""
## need to add tests
self.assertEqual(True, True)
def test_get_weight(self):
"""Test get_weight"""
self.assertEqual(True, True)
def test_empty_zipped_array(self):
"""Test empty_zipped_array"""
ans2 = [(None, None)]
ans4 = [(None, None, None, None)]
self.assertEqual(pu.empty_zipped_array(2), ans2)
self.assertEqual(pu.empty_zipped_array(4), ans4)

View File

@@ -0,0 +1,2 @@
import random_seeds
import clustering

View File

@@ -0,0 +1,2 @@
from moran import *
from kmeans import *

View File

@@ -0,0 +1,18 @@
from sklearn.cluster import KMeans
import plpy
def kmeans(query, no_clusters, no_init=20):
data = plpy.execute('''select array_agg(cartodb_id order by cartodb_id) as ids,
array_agg(ST_X(the_geom) order by cartodb_id) xs,
array_agg(ST_Y(the_geom) order by cartodb_id) ys from ({query}) a
where the_geom is not null
'''.format(query=query))
xs = data[0]['xs']
ys = data[0]['ys']
ids = data[0]['ids']
km = KMeans(n_clusters= no_clusters, n_init=no_init)
labels = km.fit_predict(zip(xs,ys))
return zip(ids,labels)

View File

@@ -0,0 +1,260 @@
"""
Moran's I geostatistics (global clustering & outliers presence)
"""
# TODO: Fill in local neighbors which have null/NoneType values with the
# average of the their neighborhood
import pysal as ps
import plpy
# crankshaft module
import crankshaft.pysal_utils as pu
# High level interface ---------------------------------------
def moran(subquery, attr_name,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I (global)
Implementation building neighbors with a PostGIS database and Moran's I
core clusters with PySAL.
Andy Eschbacher
"""
qvals = {"id_col": id_col,
"attr1": attr_name,
"geom_col": geom_col,
"subquery": subquery,
"num_ngbrs": num_ngbrs}
query = pu.construct_neighbor_query(w_type, qvals)
plpy.notice('** Query: %s' % query)
try:
result = plpy.execute(query)
# if there are no neighbors, exit
if len(result) == 0:
return pu.empty_zipped_array(2)
plpy.notice('** Query returned with %d rows' % len(result))
except plpy.SPIError:
plpy.error('Error: areas of interest query failed, check input parameters')
plpy.notice('** Query failed: "%s"' % query)
plpy.notice('** Error: %s' % plpy.SPIError)
return pu.empty_zipped_array(2)
## collect attributes
attr_vals = pu.get_attributes(result)
## calculate weights
weight = pu.get_weight(result, w_type, num_ngbrs)
## calculate moran global
moran_global = ps.esda.moran.Moran(attr_vals, weight,
permutations=permutations)
return zip([moran_global.I], [moran_global.EI])
def moran_local(subquery, attr,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I implementation for PL/Python
Andy Eschbacher
"""
# geometries with attributes that are null are ignored
# resulting in a collection of not as near neighbors
qvals = {"id_col": id_col,
"attr1": attr,
"geom_col": geom_col,
"subquery": subquery,
"num_ngbrs": num_ngbrs}
query = pu.construct_neighbor_query(w_type, qvals)
try:
result = plpy.execute(query)
# if there are no neighbors, exit
if len(result) == 0:
return pu.empty_zipped_array(5)
except plpy.SPIError:
plpy.error('Error: areas of interest query failed, check input parameters')
plpy.notice('** Query failed: "%s"' % query)
return pu.empty_zipped_array(5)
attr_vals = pu.get_attributes(result)
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local(attr_vals, weight,
permutations=permutations)
# find quadrants for each geometry
quads = quad_position(lisa.q)
return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)
def moran_rate(subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I Rate (global)
Andy Eschbacher
"""
qvals = {"id_col": id_col,
"attr1": numerator,
"attr2": denominator,
"geom_col": geom_col,
"subquery": subquery,
"num_ngbrs": num_ngbrs}
query = pu.construct_neighbor_query(w_type, qvals)
plpy.notice('** Query: %s' % query)
try:
result = plpy.execute(query)
# if there are no neighbors, exit
if len(result) == 0:
return pu.empty_zipped_array(2)
plpy.notice('** Query returned with %d rows' % len(result))
except plpy.SPIError:
plpy.error('Error: areas of interest query failed, check input parameters')
plpy.notice('** Query failed: "%s"' % query)
plpy.notice('** Error: %s' % plpy.SPIError)
return pu.empty_zipped_array(2)
## collect attributes
numer = pu.get_attributes(result, 1)
denom = pu.get_attributes(result, 2)
weight = pu.get_weight(result, w_type, num_ngbrs)
## calculate moran global rate
lisa_rate = ps.esda.moran.Moran_Rate(numer, denom, weight,
permutations=permutations)
return zip([lisa_rate.I], [lisa_rate.EI])
def moran_local_rate(subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I Local Rate
Andy Eschbacher
"""
# geometries with values that are null are ignored
# resulting in a collection of not as near neighbors
query = pu.construct_neighbor_query(w_type,
{"id_col": id_col,
"numerator": numerator,
"denominator": denominator,
"geom_col": geom_col,
"subquery": subquery,
"num_ngbrs": num_ngbrs})
try:
result = plpy.execute(query)
# if there are no neighbors, exit
if len(result) == 0:
return pu.empty_zipped_array(5)
except plpy.SPIError:
plpy.error('Error: areas of interest query failed, check input parameters')
plpy.notice('** Query failed: "%s"' % query)
plpy.notice('** Error: %s' % plpy.SPIError)
return pu.empty_zipped_array(5)
## collect attributes
numer = pu.get_attributes(result, 1)
denom = pu.get_attributes(result, 2)
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local_Rate(numer, denom, weight,
permutations=permutations)
# find units of significance
quads = quad_position(lisa.q)
return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)
def moran_local_bv(subquery, attr1, attr2,
permutations, geom_col, id_col, w_type, num_ngbrs):
"""
Moran's I (local) Bivariate (untested)
"""
plpy.notice('** Constructing query')
qvals = {"num_ngbrs": num_ngbrs,
"attr1": attr1,
"attr2": attr2,
"subquery": subquery,
"geom_col": geom_col,
"id_col": id_col}
query = pu.construct_neighbor_query(w_type, qvals)
try:
result = plpy.execute(query)
# if there are no neighbors, exit
if len(result) == 0:
return pu.empty_zipped_array(4)
except plpy.SPIError:
plpy.error("Error: areas of interest query failed, " \
"check input parameters")
plpy.notice('** Query failed: "%s"' % query)
return pu.empty_zipped_array(4)
## collect attributes
attr1_vals = pu.get_attributes(result, 1)
attr2_vals = pu.get_attributes(result, 2)
# create weights
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local_BV(attr1_vals, attr2_vals, weight,
permutations=permutations)
plpy.notice("len of Is: %d" % len(lisa.Is))
# find clustering of significance
lisa_sig = quad_position(lisa.q)
plpy.notice('** Finished calculations')
return zip(lisa.Is, lisa_sig, lisa.p_sim, weight.id_order)
# Low level functions ----------------------------------------
def map_quads(coord):
"""
Map a quadrant number to Moran's I designation
HH=1, LH=2, LL=3, HL=4
Input:
@param coord (int): quadrant of a specific measurement
Output:
classification (one of 'HH', 'LH', 'LL', or 'HL')
"""
if coord == 1:
return 'HH'
elif coord == 2:
return 'LH'
elif coord == 3:
return 'LL'
elif coord == 4:
return 'HL'
else:
return None
def quad_position(quads):
"""
Produce Moran's I classification based of n
Input:
@param quads ndarray: an array of quads classified by
1-4 (PySAL default)
Output:
@param list: an array of quads classied by 'HH', 'LL', etc.
"""
return [map_quads(q) for q in quads]

View File

@@ -0,0 +1 @@
from pysal_utils import *

View File

@@ -0,0 +1,152 @@
"""
Utilities module for generic PySAL functionality, mainly centered on translating queries into numpy arrays or PySAL weights objects
"""
import numpy as np
import pysal as ps
def construct_neighbor_query(w_type, query_vals):
"""Return query (a string) used for finding neighbors
@param w_type text: type of neighbors to calculate ('knn' or 'queen')
@param query_vals dict: values used to construct the query
"""
if w_type.lower() == 'knn':
return knn(query_vals)
else:
return queen(query_vals)
## Build weight object
def get_weight(query_res, w_type='knn', num_ngbrs=5):
"""
Construct PySAL weight from return value of query
@param query_res: query results with attributes and neighbors
"""
if w_type.lower() == 'knn':
row_normed_weights = [1.0 / float(num_ngbrs)] * num_ngbrs
weights = {x['id']: row_normed_weights for x in query_res}
else:
weights = {x['id']: [1.0 / len(x['neighbors'])] * len(x['neighbors'])
if len(x['neighbors']) > 0
else [] for x in query_res}
neighbors = {x['id']: x['neighbors'] for x in query_res}
return ps.W(neighbors, weights)
def query_attr_select(params):
"""
Create portion of SELECT statement for attributes inolved in query.
@param params: dict of information used in query (column names,
table name, etc.)
"""
attrs = [k for k in params
if k not in ('id_col', 'geom_col', 'subquery', 'num_ngbrs')]
template = "i.\"{%(col)s}\"::numeric As attr%(alias_num)s, "
attr_string = ""
for idx, val in enumerate(sorted(attrs)):
attr_string += template % {"col": val, "alias_num": idx + 1}
return attr_string
def query_attr_where(params):
"""
Create portion of WHERE clauses for weeding out NULL-valued geometries
"""
attrs = sorted([k for k in params
if k not in ('id_col', 'geom_col', 'subquery', 'num_ngbrs')])
attr_string = []
for attr in attrs:
attr_string.append("idx_replace.\"{%s}\" IS NOT NULL" % attr)
if len(attrs) == 2:
attr_string.append("idx_replace.\"{%s}\" <> 0" % attrs[1])
out = " AND ".join(attr_string)
return out
def knn(params):
"""SQL query for k-nearest neighbors.
@param vars: dict of values to fill template
"""
attr_select = query_attr_select(params)
attr_where = query_attr_where(params)
replacements = {"attr_select": attr_select,
"attr_where_i": attr_where.replace("idx_replace", "i"),
"attr_where_j": attr_where.replace("idx_replace", "j")}
query = "SELECT " \
"i.\"{id_col}\" As id, " \
"%(attr_select)s" \
"(SELECT ARRAY(SELECT j.\"{id_col}\" " \
"FROM ({subquery}) As j " \
"WHERE " \
"i.\"{id_col}\" <> j.\"{id_col}\" AND " \
"%(attr_where_j)s " \
"ORDER BY " \
"j.\"{geom_col}\" <-> i.\"{geom_col}\" ASC " \
"LIMIT {num_ngbrs})" \
") As neighbors " \
"FROM ({subquery}) As i " \
"WHERE " \
"%(attr_where_i)s " \
"ORDER BY i.\"{id_col}\" ASC;" % replacements
return query.format(**params)
## SQL query for finding queens neighbors (all contiguous polygons)
def queen(params):
"""SQL query for queen neighbors.
@param params dict: information to fill query
"""
attr_select = query_attr_select(params)
attr_where = query_attr_where(params)
replacements = {"attr_select": attr_select,
"attr_where_i": attr_where.replace("idx_replace", "i"),
"attr_where_j": attr_where.replace("idx_replace", "j")}
query = "SELECT " \
"i.\"{id_col}\" As id, " \
"%(attr_select)s" \
"(SELECT ARRAY(SELECT j.\"{id_col}\" " \
"FROM ({subquery}) As j " \
"WHERE i.\"{id_col}\" <> j.\"{id_col}\" AND " \
"ST_Touches(i.\"{geom_col}\", j.\"{geom_col}\") AND " \
"%(attr_where_j)s)" \
") As neighbors " \
"FROM ({subquery}) As i " \
"WHERE " \
"%(attr_where_i)s " \
"ORDER BY i.\"{id_col}\" ASC;" % replacements
return query.format(**params)
## to add more weight methods open a ticket or pull request
def get_attributes(query_res, attr_num=1):
"""
@param query_res: query results with attributes and neighbors
@param attr_num: attribute number (1, 2, ...)
"""
return np.array([x['attr' + str(attr_num)] for x in query_res], dtype=np.float)
def empty_zipped_array(num_nones):
"""
prepare return values for cases of empty weights objects (no neighbors)
Input:
@param num_nones int: number of columns (e.g., 4)
Output:
[(None, None, None, None)]
"""
return [tuple([None] * num_nones)]

View File

@@ -0,0 +1,10 @@
import random
import numpy
def set_random_seeds(value):
"""
Set the seeds of the RNGs (Random Number Generators)
used internally.
"""
random.seed(value)
numpy.random.seed(value)

View File

@@ -0,0 +1,48 @@
"""
CartoDB Spatial Analysis Python Library
See:
https://github.com/CartoDB/crankshaft
"""
from setuptools import setup, find_packages
setup(
name='crankshaft',
version='0.0.4',
description='CartoDB Spatial Analysis Python Library',
url='https://github.com/CartoDB/crankshaft',
author='Data Services Team - CartoDB',
author_email='dataservices@cartodb.com',
license='MIT',
classifiers=[
'Development Status :: 3 - Alpha',
'Intended Audience :: Mapping comunity',
'Topic :: Maps :: Mapping Tools',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 2.7',
],
keywords='maps mapping tools spatial analysis geostatistics',
packages=find_packages(exclude=['contrib', 'docs', 'tests']),
extras_require={
'dev': ['unittest'],
'test': ['unittest', 'nose', 'mock'],
},
# The choice of component versions is dictated by what's
# provisioned in the production servers.
install_requires=['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.11.2', 'scikit-learn==0.14.1'],
requires=['pysal', 'numpy', 'sklearn'],
test_suite='test'
)

View File

@@ -0,0 +1 @@
[{"xs": [9.917239463463458, 9.042767302696836, 10.798929825304187, 8.763751051762995, 11.383882954810852, 11.018206993460897, 8.939526075734316, 9.636159342565252, 10.136336896960058, 11.480610059427342, 12.115011910725082, 9.173267848893428, 10.239300931201738, 8.00012512174072, 8.979962292282131, 9.318376124429575, 10.82259513754284, 10.391747171927115, 10.04904588886165, 9.96007160443463, -0.78825626804569, -0.3511819898577426, -1.2796410003764271, -0.3977049391203402, 2.4792311265774667, 1.3670311632092624, 1.2963504112955613, 2.0404844103073025, -1.6439708506073223, 0.39122885445645805, 1.026031821452462, -0.04044477160482201, -0.7442346929085072, -0.34687120826243034, -0.23420359971379054, -0.5919629143336708, -0.202903054395391, -0.1893399644841902, 1.9331834251176807, -0.12321054392851609], "ys": [8.735627063679981, 9.857615954045011, 10.81439096759407, 10.586727233537191, 9.232919976568622, 11.54281262696508, 8.392787912674466, 9.355119689665944, 9.22380703532752, 10.542142541823122, 10.111980619367035, 10.760836265570738, 8.819773453269804, 10.25325722424816, 9.802077905695608, 8.955420161552611, 9.833801181904477, 10.491684241001613, 12.076108669877556, 11.74289693140474, -0.5685725015474191, -0.5715728344759778, -0.20180907868635137, 0.38431336480089595, -0.3402202083684184, -2.4652736827783586, 0.08295159401756182, 0.8503818775816505, 0.6488691600321166, 0.5794762568230527, -0.6770063922144103, -0.6557616416449478, -1.2834289177624947, 0.1096318195532717, -0.38986922166834853, -1.6224497706950238, 0.09429787743230483, 0.4005097316394031, -0.508002811195673, -1.2473463371366507], "ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]}]

View File

@@ -0,0 +1,52 @@
[[0.9319096128346788, "HH"],
[-1.135787401862846, "HL"],
[0.11732030672508517, "LL"],
[0.6152779669180425, "LL"],
[-0.14657336660125297, "LH"],
[0.6967858120189607, "LL"],
[0.07949310115714454, "HH"],
[0.4703198759258987, "HH"],
[0.4421125200498064, "HH"],
[0.5724288737143592, "LL"],
[0.8970743435692062, "LL"],
[0.18327334401918674, "LL"],
[-0.01466729201304962, "HL"],
[0.3481559372544409, "LL"],
[0.06547094736902978, "LL"],
[0.15482141569329988, "HH"],
[0.4373841193538136, "HH"],
[0.15971286468915544, "LL"],
[1.0543588860308968, "HH"],
[1.7372866900020818, "HH"],
[1.091998586053999, "LL"],
[0.1171572584252222, "HH"],
[0.08438455015300014, "LL"],
[0.06547094736902978, "LL"],
[0.15482141569329985, "HH"],
[1.1627044812890683, "HH"],
[0.06547094736902978, "LL"],
[0.795275137550483, "HH"],
[0.18562939195219, "LL"],
[0.3010757406693439, "LL"],
[2.8205795942839376, "HH"],
[0.11259190602909264, "LL"],
[-0.07116352791516614, "HL"],
[-0.09945240794119009, "LH"],
[0.18562939195219, "LL"],
[0.1832733440191868, "LL"],
[-0.39054253768447705, "HL"],
[-0.1672071289487642, "HL"],
[0.3337669247916343, "HH"],
[0.2584386102554792, "HH"],
[-0.19733845476322634, "HL"],
[-0.9379282899805409, "LH"],
[-0.028770969951095866, "LH"],
[0.051367269430983485, "LL"],
[-0.2172548045913472, "LH"],
[0.05136726943098351, "LL"],
[0.04191046803899837, "LL"],
[0.7482357030403517, "HH"],
[-0.014585767863118111, "LH"],
[0.5410013139159929, "HH"],
[1.0223932668429925, "LL"],
[1.4179402898927476, "LL"]]

View File

@@ -0,0 +1,54 @@
[
{"neighbors": [48, 26, 20, 9, 31], "id": 1, "value": 0.5},
{"neighbors": [30, 16, 46, 3, 4], "id": 2, "value": 0.7},
{"neighbors": [46, 30, 2, 12, 16], "id": 3, "value": 0.2},
{"neighbors": [18, 30, 23, 2, 52], "id": 4, "value": 0.1},
{"neighbors": [47, 40, 45, 37, 28], "id": 5, "value": 0.3},
{"neighbors": [10, 21, 41, 14, 37], "id": 6, "value": 0.05},
{"neighbors": [8, 17, 43, 25, 12], "id": 7, "value": 0.4},
{"neighbors": [17, 25, 43, 22, 7], "id": 8, "value": 0.7},
{"neighbors": [39, 34, 1, 26, 48], "id": 9, "value": 0.5},
{"neighbors": [6, 37, 5, 45, 49], "id": 10, "value": 0.04},
{"neighbors": [51, 41, 29, 21, 14], "id": 11, "value": 0.08},
{"neighbors": [44, 46, 43, 50, 3], "id": 12, "value": 0.2},
{"neighbors": [45, 23, 14, 28, 18], "id": 13, "value": 0.4},
{"neighbors": [41, 29, 13, 23, 6], "id": 14, "value": 0.2},
{"neighbors": [36, 27, 32, 33, 24], "id": 15, "value": 0.3},
{"neighbors": [19, 2, 46, 44, 28], "id": 16, "value": 0.4},
{"neighbors": [8, 25, 43, 7, 22], "id": 17, "value": 0.6},
{"neighbors": [23, 4, 29, 14, 13], "id": 18, "value": 0.3},
{"neighbors": [42, 16, 28, 26, 40], "id": 19, "value": 0.7},
{"neighbors": [1, 48, 31, 26, 42], "id": 20, "value": 0.8},
{"neighbors": [41, 6, 11, 14, 10], "id": 21, "value": 0.1},
{"neighbors": [25, 50, 43, 31, 44], "id": 22, "value": 0.4},
{"neighbors": [18, 13, 14, 4, 2], "id": 23, "value": 0.1},
{"neighbors": [33, 49, 34, 47, 27], "id": 24, "value": 0.3},
{"neighbors": [43, 8, 22, 17, 50], "id": 25, "value": 0.4},
{"neighbors": [1, 42, 20, 31, 48], "id": 26, "value": 0.6},
{"neighbors": [32, 15, 36, 33, 24], "id": 27, "value": 0.3},
{"neighbors": [40, 45, 19, 5, 13], "id": 28, "value": 0.8},
{"neighbors": [11, 51, 41, 14, 18], "id": 29, "value": 0.3},
{"neighbors": [2, 3, 4, 46, 18], "id": 30, "value": 0.1},
{"neighbors": [20, 26, 1, 50, 48], "id": 31, "value": 0.9},
{"neighbors": [27, 36, 15, 49, 24], "id": 32, "value": 0.3},
{"neighbors": [24, 27, 49, 34, 32], "id": 33, "value": 0.4},
{"neighbors": [47, 9, 39, 40, 24], "id": 34, "value": 0.3},
{"neighbors": [38, 51, 11, 21, 41], "id": 35, "value": 0.3},
{"neighbors": [15, 32, 27, 49, 33], "id": 36, "value": 0.2},
{"neighbors": [49, 10, 5, 47, 24], "id": 37, "value": 0.5},
{"neighbors": [35, 21, 51, 11, 41], "id": 38, "value": 0.4},
{"neighbors": [9, 34, 48, 1, 47], "id": 39, "value": 0.6},
{"neighbors": [28, 47, 5, 9, 34], "id": 40, "value": 0.5},
{"neighbors": [11, 14, 29, 21, 6], "id": 41, "value": 0.4},
{"neighbors": [26, 19, 1, 9, 31], "id": 42, "value": 0.2},
{"neighbors": [25, 12, 8, 22, 44], "id": 43, "value": 0.3},
{"neighbors": [12, 50, 46, 16, 43], "id": 44, "value": 0.2},
{"neighbors": [28, 13, 5, 40, 19], "id": 45, "value": 0.3},
{"neighbors": [3, 12, 44, 2, 16], "id": 46, "value": 0.2},
{"neighbors": [34, 40, 5, 49, 24], "id": 47, "value": 0.3},
{"neighbors": [1, 20, 26, 9, 39], "id": 48, "value": 0.5},
{"neighbors": [24, 37, 47, 5, 33], "id": 49, "value": 0.2},
{"neighbors": [44, 22, 31, 42, 26], "id": 50, "value": 0.6},
{"neighbors": [11, 29, 41, 14, 21], "id": 51, "value": 0.01},
{"neighbors": [4, 18, 29, 51, 23], "id": 52, "value": 0.01}
]

View File

@@ -0,0 +1,13 @@
import unittest
from mock_plpy import MockPlPy
plpy = MockPlPy()
import sys
sys.modules['plpy'] = plpy
import os
def fixture_file(name):
dir = os.path.dirname(os.path.realpath(__file__))
return os.path.join(dir, 'fixtures', name)

View File

@@ -0,0 +1,34 @@
import re
class MockPlPy:
def __init__(self):
self._reset()
def _reset(self):
self.infos = []
self.notices = []
self.debugs = []
self.logs = []
self.warnings = []
self.errors = []
self.fatals = []
self.executes = []
self.results = []
self.prepares = []
self.results = []
def _define_result(self, query, result):
pattern = re.compile(query, re.IGNORECASE | re.MULTILINE)
self.results.append([pattern, result])
def notice(self, msg):
self.notices.append(msg)
def info(self, msg):
self.infos.append(msg)
def execute(self, query): # TODO: additional arguments
for result in self.results:
if result[0].match(query):
return result[1]
return []

View File

@@ -0,0 +1,38 @@
import unittest
import numpy as np
# from mock_plpy import MockPlPy
# plpy = MockPlPy()
#
# import sys
# sys.modules['plpy'] = plpy
from helper import plpy, fixture_file
import numpy as np
import crankshaft.clustering as cc
import crankshaft.pysal_utils as pu
from crankshaft import random_seeds
import json
class KMeansTest(unittest.TestCase):
"""Testing class for Moran's I functions"""
def setUp(self):
plpy._reset()
self.cluster_data = json.loads(open(fixture_file('kmeans.json')).read())
self.params = {"subquery": "select * from table",
"no_clusters": "10"
}
def test_kmeans(self):
data = self.cluster_data
plpy._define_result('select' ,data)
clusters = cc.kmeans('subquery', 2)
labels = [a[1] for a in clusters]
c1 = [a for a in clusters if a[1]==0]
c2 = [a for a in clusters if a[1]==1]
self.assertEqual(len(np.unique(labels)),2)
self.assertEqual(len(c1),20)
self.assertEqual(len(c2),20)

View File

@@ -0,0 +1,83 @@
import unittest
import numpy as np
# from mock_plpy import MockPlPy
# plpy = MockPlPy()
#
# import sys
# sys.modules['plpy'] = plpy
from helper import plpy, fixture_file
import crankshaft.clustering as cc
import crankshaft.pysal_utils as pu
from crankshaft import random_seeds
import json
class MoranTest(unittest.TestCase):
"""Testing class for Moran's I functions"""
def setUp(self):
plpy._reset()
self.params = {"id_col": "cartodb_id",
"attr1": "andy",
"attr2": "jay_z",
"subquery": "SELECT * FROM a_list",
"geom_col": "the_geom",
"num_ngbrs": 321}
self.neighbors_data = json.loads(open(fixture_file('neighbors.json')).read())
self.moran_data = json.loads(open(fixture_file('moran.json')).read())
def test_map_quads(self):
"""Test map_quads"""
self.assertEqual(cc.map_quads(1), 'HH')
self.assertEqual(cc.map_quads(2), 'LH')
self.assertEqual(cc.map_quads(3), 'LL')
self.assertEqual(cc.map_quads(4), 'HL')
self.assertEqual(cc.map_quads(33), None)
self.assertEqual(cc.map_quads('andy'), None)
def test_quad_position(self):
"""Test lisa_sig_vals"""
quads = np.array([1, 2, 3, 4], np.int)
ans = np.array(['HH', 'LH', 'LL', 'HL'])
test_ans = cc.quad_position(quads)
self.assertTrue((test_ans == ans).all())
def test_moran_local(self):
"""Test Moran's I local"""
data = [ { 'id': d['id'], 'attr1': d['value'], 'neighbors': d['neighbors'] } for d in self.neighbors_data]
plpy._define_result('select', data)
random_seeds.set_random_seeds(1234)
result = cc.moran_local('subquery', 'value', 'knn', 5, 99, 'the_geom', 'cartodb_id')
result = [(row[0], row[1]) for row in result]
expected = self.moran_data
for ([res_val, res_quad], [exp_val, exp_quad]) in zip(result, expected):
self.assertAlmostEqual(res_val, exp_val)
self.assertEqual(res_quad, exp_quad)
def test_moran_local_rate(self):
"""Test Moran's I rate"""
data = [ { 'id': d['id'], 'attr1': d['value'], 'attr2': 1, 'neighbors': d['neighbors'] } for d in self.neighbors_data]
plpy._define_result('select', data)
random_seeds.set_random_seeds(1234)
result = cc.moran_local_rate('subquery', 'numerator', 'denominator', 'knn', 5, 99, 'the_geom', 'cartodb_id')
print 'result == None? ', result == None
result = [(row[0], row[1]) for row in result]
expected = self.moran_data
for ([res_val, res_quad], [exp_val, exp_quad]) in zip(result, expected):
self.assertAlmostEqual(res_val, exp_val)
def test_moran(self):
"""Test Moran's I global"""
data = [{ 'id': d['id'], 'attr1': d['value'], 'neighbors': d['neighbors'] } for d in self.neighbors_data]
plpy._define_result('select', data)
random_seeds.set_random_seeds(1235)
result = cc.moran('table', 'value', 'knn', 5, 99, 'the_geom', 'cartodb_id')
print 'result == None?', result == None
result_moran = result[0][0]
expected_moran = np.array([row[0] for row in self.moran_data]).mean()
self.assertAlmostEqual(expected_moran, result_moran, delta=10e-2)

View File

@@ -0,0 +1,107 @@
import unittest
import crankshaft.pysal_utils as pu
from crankshaft import random_seeds
class PysalUtilsTest(unittest.TestCase):
"""Testing class for utility functions related to PySAL integrations"""
def setUp(self):
self.params = {"id_col": "cartodb_id",
"attr1": "andy",
"attr2": "jay_z",
"subquery": "SELECT * FROM a_list",
"geom_col": "the_geom",
"num_ngbrs": 321}
def test_query_attr_select(self):
"""Test query_attr_select"""
ans = "i.\"{attr1}\"::numeric As attr1, " \
"i.\"{attr2}\"::numeric As attr2, "
self.assertEqual(pu.query_attr_select(self.params), ans)
def test_query_attr_where(self):
"""Test pu.query_attr_where"""
ans = "idx_replace.\"{attr1}\" IS NOT NULL AND " \
"idx_replace.\"{attr2}\" IS NOT NULL AND " \
"idx_replace.\"{attr2}\" <> 0"
self.assertEqual(pu.query_attr_where(self.params), ans)
def test_knn(self):
"""Test knn neighbors constructor"""
ans = "SELECT i.\"cartodb_id\" As id, " \
"i.\"andy\"::numeric As attr1, " \
"i.\"jay_z\"::numeric As attr2, " \
"(SELECT ARRAY(SELECT j.\"cartodb_id\" " \
"FROM (SELECT * FROM a_list) As j " \
"WHERE " \
"i.\"cartodb_id\" <> j.\"cartodb_id\" AND " \
"j.\"andy\" IS NOT NULL AND " \
"j.\"jay_z\" IS NOT NULL AND " \
"j.\"jay_z\" <> 0 " \
"ORDER BY " \
"j.\"the_geom\" <-> i.\"the_geom\" ASC " \
"LIMIT 321)) As neighbors " \
"FROM (SELECT * FROM a_list) As i " \
"WHERE i.\"andy\" IS NOT NULL AND " \
"i.\"jay_z\" IS NOT NULL AND " \
"i.\"jay_z\" <> 0 " \
"ORDER BY i.\"cartodb_id\" ASC;"
self.assertEqual(pu.knn(self.params), ans)
def test_queen(self):
"""Test queen neighbors constructor"""
ans = "SELECT i.\"cartodb_id\" As id, " \
"i.\"andy\"::numeric As attr1, " \
"i.\"jay_z\"::numeric As attr2, " \
"(SELECT ARRAY(SELECT j.\"cartodb_id\" " \
"FROM (SELECT * FROM a_list) As j " \
"WHERE " \
"i.\"cartodb_id\" <> j.\"cartodb_id\" AND " \
"ST_Touches(i.\"the_geom\", " \
"j.\"the_geom\") AND " \
"j.\"andy\" IS NOT NULL AND " \
"j.\"jay_z\" IS NOT NULL AND " \
"j.\"jay_z\" <> 0)" \
") As neighbors " \
"FROM (SELECT * FROM a_list) As i " \
"WHERE i.\"andy\" IS NOT NULL AND " \
"i.\"jay_z\" IS NOT NULL AND " \
"i.\"jay_z\" <> 0 " \
"ORDER BY i.\"cartodb_id\" ASC;"
self.assertEqual(pu.queen(self.params), ans)
def test_construct_neighbor_query(self):
"""Test construct_neighbor_query"""
# Compare to raw knn query
self.assertEqual(pu.construct_neighbor_query('knn', self.params),
pu.knn(self.params))
def test_get_attributes(self):
"""Test get_attributes"""
## need to add tests
self.assertEqual(True, True)
def test_get_weight(self):
"""Test get_weight"""
self.assertEqual(True, True)
def test_empty_zipped_array(self):
"""Test empty_zipped_array"""
ans2 = [(None, None)]
ans4 = [(None, None, None, None)]
self.assertEqual(pu.empty_zipped_array(2), ans2)
self.assertEqual(pu.empty_zipped_array(4), ans4)

View File

@@ -0,0 +1,5 @@
"""Import all modules"""
import crankshaft.random_seeds
import crankshaft.clustering
import crankshaft.space_time_dynamics
import crankshaft.segmentation

View File

@@ -0,0 +1,3 @@
"""Import all functions from for clustering"""
from moran import *
from kmeans import *

View File

@@ -0,0 +1,18 @@
from sklearn.cluster import KMeans
import plpy
def kmeans(query, no_clusters, no_init=20):
data = plpy.execute('''select array_agg(cartodb_id order by cartodb_id) as ids,
array_agg(ST_X(the_geom) order by cartodb_id) xs,
array_agg(ST_Y(the_geom) order by cartodb_id) ys from ({query}) a
where the_geom is not null
'''.format(query=query))
xs = data[0]['xs']
ys = data[0]['ys']
ids = data[0]['ids']
km = KMeans(n_clusters= no_clusters, n_init=no_init)
labels = km.fit_predict(zip(xs,ys))
return zip(ids,labels)

Some files were not shown because too many files have changed in this diff Show More