Compare commits

...

287 Commits

Author SHA1 Message Date
Andy Eschbacher
2cd6cdd4a6 fixes variable name error 2018-02-16 13:31:17 -05:00
Andy Eschbacher
c48ae50ade small syntax fixes 2018-02-09 16:22:10 -05:00
Stuart Lynn
d2574c20ef update to the kmeans - balanced code 2018-02-09 15:55:04 -05:00
Stuart Lynn
7b42beb82f Merge branch 'develop' into balanced_kmeans 2018-01-29 16:41:58 -05:00
Stuart Lynn
9b7eda798c initial commit for balanced k means implementation 2018-01-29 10:51:13 -05:00
Andy Eschbacher
eefb0d7990 Merge pull request #165 from CartoDB/add-PIA
Add multipolygons and geometry collections support to PIA analysis
2018-01-10 17:10:02 -05:00
Andy Eschbacher
628fd2b839 adds test on multipolygon 2018-01-10 16:45:36 -05:00
Andy Eschbacher
807a5373e8 adds simple test 2018-01-10 16:35:23 -05:00
Andy Eschbacher
2e0a6f7482 Merge branch 'develop' into add-PIA 2018-01-10 15:30:51 -05:00
Andy Eschbacher
edb1cb5049 Merge pull request #150 from CartoDB/add-nonspatial-kmeans-w-class-framework
Add nonspatial kmeans w class framework
2018-01-10 13:15:13 -05:00
Andy Eschbacher
e5285a2700 adds parallel marker for plpgsql function 2018-01-10 10:57:51 -05:00
Andy Eschbacher
068f43de10 adds test for exception if no data is present 2018-01-09 15:45:03 -05:00
Andy Eschbacher
604f20bb21 grrr adds comments to test expectation 2018-01-09 15:22:14 -05:00
Andy Eschbacher
cfd988c338 uses exact same query text for expectation 2018-01-09 15:14:38 -05:00
Andy Eschbacher
04f290cbad finalizes test query 2018-01-09 15:06:44 -05:00
Andy Eschbacher
18fbc2fa9e updates sql query / fixes error 2018-01-09 14:39:24 -05:00
Andy Eschbacher
b0e3f38f1e correctly finds the number of columns 2018-01-09 14:28:20 -05:00
Andy Eschbacher
b16c73f7d0 Merge branch 'develop' into add-nonspatial-kmeans-w-class-framework 2018-01-09 13:47:29 -05:00
Andy Eschbacher
20104c2df9 adds sql tests for nonspatial kmeans 2018-01-09 13:35:00 -05:00
Andy Eschbacher
49a317ae8e syntax updates / consistency 2018-01-09 13:29:36 -05:00
Andy Eschbacher
001062f660 adds inertia as an output column 2018-01-09 13:02:55 -05:00
Andy Eschbacher
5e0fbf0f6f syntax updates 2018-01-09 13:02:41 -05:00
Andy Eschbacher
e5a03fce82 Merge pull request #157 from CartoDB/add-errors-on-null-only
catch empty return values and error on them
2018-01-09 11:46:41 -05:00
Andy Eschbacher
32bb3b1276 adds missing decorator for gwr_predict 2018-01-09 11:37:27 -05:00
Andy Eschbacher
72260d02aa Merge branch 'develop' into add-errors-on-null-only 2018-01-09 11:34:23 -05:00
Andy Eschbacher
65c7841a7a Merge pull request #179 from CartoDB/update-moran-docs
Update moran docs
2018-01-09 11:30:45 -05:00
Andy Eschbacher
d89e07328e Merge branch 'develop' into update-moran-docs 2018-01-09 11:24:36 -05:00
Andy Eschbacher
4ece712cae Merge pull request #155 from CartoDB/update-markov-docs-null
adds caveats about usage
2018-01-09 11:22:10 -05:00
Andy Eschbacher
8bbfac0dbc removes redundant notes section 2018-01-09 11:17:09 -05:00
Andy Eschbacher
200c3da3cb adds better intro and placement for notes 2018-01-09 11:08:35 -05:00
Andy Eschbacher
83048cdf72 Merge branch 'develop' into update-markov-docs-null 2018-01-09 10:54:09 -05:00
Andy Eschbacher
f1428a3e36 Merge pull request #193 from CartoDB/add-schema-to-docs-examples
updates examples to include schema
2018-01-09 10:51:49 -05:00
Andy Eschbacher
b4ddfa1c5b Merge branch 'develop' into add-errors-on-null-only 2018-01-09 10:30:50 -05:00
Andy Eschbacher
77e73dbc75 updates error syntax 2018-01-09 10:23:38 -05:00
Andy Eschbacher
0e99a14653 Merge branch 'develop' into update-moran-docs 2018-01-08 16:41:19 -05:00
Andy Eschbacher
92becac280 syntax fixes / function name fix 2018-01-08 16:30:03 -05:00
Andy Eschbacher
e28f00d98b updates other examples to use correct schema and consistent syntax 2018-01-02 10:36:10 -05:00
Andy Eschbacher
7effd39f16 updates examples to include schema 2018-01-02 10:20:59 -05:00
Raúl Marín
0194c036f6 Merge branch 'develop' into master 2017-11-27 13:17:02 +01:00
Raul Marin
0d840b65f0 Release 0.6.1 2017-11-23 10:07:52 +01:00
Raul Marin
4ab4eadec9 Update RELEASE process 2017-11-23 10:07:52 +01:00
Raul Marin
cf46633f24 Makefile: Strip PARALLEL tags on deploy
Since releases are committed into the project instead of stripping the labels
when generating the release file, we need to do it before deploying (depending
on the PG release)
2017-11-23 10:07:52 +01:00
Raul Marin
67dbdc6a59 Categorize GWR sql functions 2017-11-23 10:07:52 +01:00
Raul Marin
847a32340d Update CONTRIBUTING with information about PG function labels 2017-11-23 10:07:52 +01:00
Raul Marin
5472bc71c6 Avoid regress error with exceptions under some verbosity levels 2017-11-23 10:07:52 +01:00
Raul Marin
e1fd016ef1 Add PARALLEL and VOLATILE categories to PG functions 2017-11-23 10:07:52 +01:00
Raul Marin
3273afeb76 Makefile: Add support for PARALLEL categories 2017-11-23 10:07:52 +01:00
Raul Marin
835f80b1b6 Update RELEASE process 2017-11-23 09:49:25 +01:00
Raul Marin
192f89bf4d Makefile: Strip PARALLEL tags on deploy
Since releases are committed into the project instead of stripping the labels
when generating the release file, we need to do it before deploying (depending
on the PG release)
2017-11-23 09:49:25 +01:00
Raul Marin
de0bbb0bd3 Categorize GWR sql functions 2017-11-23 09:49:25 +01:00
Raul Marin
1508e5090c Update CONTRIBUTING with information about PG function labels 2017-11-23 09:49:25 +01:00
Raul Marin
e71e9b10b6 Avoid regress error with exceptions under some verbosity levels 2017-11-23 09:49:25 +01:00
Raul Marin
0a15857a54 Add PARALLEL and VOLATILE categories to PG functions 2017-11-23 09:49:25 +01:00
Raul Marin
ddd2b4e3bf Makefile: Add support for PARALLEL categories 2017-11-23 09:49:25 +01:00
Andy Eschbacher
18109e5a28 Merge pull request #186 from CartoDB/develop
Release for 0.6.0
2017-11-08 14:17:08 -05:00
Andy Eschbacher
d651a401d7 Merge pull request #185 from CartoDB/release-0.6.0
Release 0.6.0
2017-11-08 14:06:55 -05:00
Andy Eschbacher
a3c30c5f48 release files for 0.6.0 2017-11-08 13:57:21 -05:00
Andy Eschbacher
d8372b5c24 Merge pull request #184 from CartoDB/taylor_oshan-pysal_gwr
Geographically-weighted regression
2017-11-08 11:46:34 -05:00
Javier Goizueta
44511682bc Merge branch 'develop' into test-travis-fix 2017-11-03 15:12:29 +01:00
Andy Eschbacher
2dafc0f80e Merge branch 'master' into taylor_oshan-pysal_gwr 2017-11-02 15:38:54 -04:00
Andy Eschbacher
79b6ef59de Merge pull request #146 from TaylorOshan/pysal_gwr
geographically weighted regression (gwr)
2017-11-02 14:21:07 -04:00
Javier Goizueta
7a2a20acef Merge pull request #182 from Algunenano/pass_tests
Pass tests
2017-10-31 12:34:34 +01:00
Raul Marin
a146848a79 Travis: Use postgresql 9.5.2-3 2017-10-31 11:38:20 +01:00
Raul Marin
1dc93284f8 PG test: Make the extension tests version agnostic 2017-10-31 11:38:20 +01:00
Raul Marin
5b30783c04 PG regress: Order the tests
Avoids an issue where some tests were run before the setup
2017-10-31 11:38:20 +01:00
Raul Marin
bd2b190643 Add PIP and NOSETESTS as variables to Makefile.global
This makes it easier to change between pip/pip2 depending on the local environment
2017-10-31 11:38:20 +01:00
Raul Marin
e6c98e83db Travis: Set Ubuntu precise as distribution 2017-10-31 11:38:20 +01:00
Andy Eschbacher
10a1d03287 updates description 2017-09-05 08:48:38 -04:00
Andy Eschbacher
b9d739327f add description of 'standardized' output 2017-09-05 08:42:00 -04:00
Rafa de la Torre
ce5d1f9e86 Release 0.5.2 2017-05-12 17:26:41 +02:00
abelvm
7f5edb26b0 fixed corner case 2017-03-28 14:13:49 +02:00
abelvm
00327e6de2 fixed indexing 2017-03-28 13:38:21 +02:00
abelvm
c252c18adc fixed indexing 2017-03-28 13:32:10 +02:00
abelvm
3bfdb8d3cc Merge branch 'develop' of github.com:CartoDB/crankshaft into add-PIA 2017-03-28 13:03:01 +02:00
abelvm
47251daa5f fixed corner case centroid=PIA 2017-03-28 13:02:37 +02:00
Raul Ochoa
69713ecb0a Merge pull request #172 from CartoDB/fix-global-rate-stat
Fix missing comma for dict creation
2017-03-02 14:39:47 +01:00
Raul Ochoa
d07822c7a0 Fix missing comma for dict creation 2017-02-27 12:07:41 +01:00
Mario de Frutos
154d1a674d Added CLA part in the contributing document 2017-01-25 10:45:30 +01:00
Andy Eschbacher
06746b4c65 corrected code snippet on weighted mean 2017-01-19 08:40:10 -05:00
abelvm
d8604f3c9b agg set error fix 2017-01-18 21:40:36 +01:00
abelvm
e03c3eece2 semi colon fix 2017-01-18 21:32:42 +01:00
abelvm
32117c7480 Merge branch 'develop' of github.com:CartoDB/crankshaft into add-PIA 2017-01-18 17:28:14 +01:00
abelvm
9ab51027fc support multi 2017-01-18 17:28:06 +01:00
Andy Eschbacher
8e4bbb8a90 add default return value on verify_data wrapper 2017-01-13 14:07:31 -05:00
Andy Eschbacher
be2bf19c0a removes print line 2017-01-12 17:14:32 -05:00
Andy Eschbacher
ddd69bb457 adds mock error function 2017-01-12 17:12:40 -05:00
Andy Eschbacher
04bd067045 standardizing naming conventions in code 2017-01-12 17:12:09 -05:00
Andy Eschbacher
4b3481b1a6 adds decorators to reduce boilerplate code 2017-01-12 17:03:01 -05:00
Andy Eschbacher
76f5eae928 quote col names 2017-01-12 16:02:27 -05:00
Andy Eschbacher
7322931ca1 classes to inherit from objects 2017-01-12 12:00:36 -05:00
Andy Eschbacher
c2bfcc8516 adding sql tests 2017-01-11 10:36:46 -05:00
Andy Eschbacher
7725cada13 tests for gwr predict 2017-01-10 15:53:09 -05:00
Andy Eschbacher
e456158cbf removes unneeded function / multilines some queries 2017-01-10 15:12:24 -05:00
Andy Eschbacher
50f6ef0fcc remove unnecessary code / tests 2017-01-10 15:01:44 -05:00
Andy Eschbacher
ca7a2d6e36 update verify_data to get full data reference 2017-01-10 15:00:59 -05:00
Andy Eschbacher
c114ccea33 add condition on null-valued geometries, ref: #143 2017-01-10 14:38:16 -05:00
Andy Eschbacher
10ce109d09 fix typo on error return 2017-01-10 14:37:07 -05:00
Andy Eschbacher
d679975f72 catch empty return values and error on them 2017-01-10 13:53:37 -05:00
Andy Eschbacher
ee5e7d81ae standardize id_col naming convention 2017-01-10 10:52:10 -05:00
Andy Eschbacher
a32b212412 finish docs for kmeans nonspatial 2017-01-10 10:43:42 -05:00
Andy Eschbacher
69f38dd52e change parameter name to align with kmeans.spatial 2017-01-10 09:52:46 -05:00
Andy Eschbacher
c6f64ad2f4 bug fixes and adding of internal docs 2017-01-10 09:49:16 -05:00
Andy Eschbacher
0815db4661 move floats to numpy floats 2017-01-09 16:28:00 -05:00
Andy Eschbacher
0f24a4d35a adds framework for testing predict 2017-01-09 16:04:38 -05:00
Andy Eschbacher
6d59061a00 add optional params to predict function 2017-01-09 15:49:32 -05:00
Andy Eschbacher
25f1fc2b28 fix bug on output of predicted values with wrong id 2017-01-09 15:48:43 -05:00
Andy Eschbacher
1bf64e0d4c updates descriptions 2017-01-09 10:09:25 -05:00
Andy Eschbacher
0007082efe finishes python tests for regression 2017-01-09 10:00:44 -05:00
Andy Eschbacher
87890926ad adding back filter t-vals lost in commit squash 2017-01-09 14:12:22 +00:00
Andy Eschbacher
00567e5272 update date, correct col information and order 2017-01-06 13:50:23 -05:00
Andy Eschbacher
e39051e958 renaming for consistency 2017-01-06 12:04:41 -05:00
Andy Eschbacher
c782b812d5 stubs out more testing efforts 2017-01-06 12:03:30 -05:00
Andy Eschbacher
521a79ad7f packed with orig coords and deps in correct order 2017-01-06 12:02:50 -05:00
Andy Eschbacher
8c71820d97 adds descriptions 2017-01-06 10:44:25 -05:00
Andy Eschbacher
cdb81ea896 refactor 2017-01-06 10:37:12 -05:00
Andy Eschbacher
e2cf12aaba predict -> class framework 2017-01-06 10:36:40 -05:00
Andy Eschbacher
92fc25f6b5 placeholders for descriptions, fix references 2017-01-05 16:18:08 -05:00
Andy Eschbacher
46b2b80008 adds references to docs 2017-01-05 15:26:16 -05:00
Andy Eschbacher
ba54e9b42d consolidate gwr sql functions 2017-01-05 14:24:10 -05:00
Andy Eschbacher
d95ca54cdc updating prediction to data provider 2017-01-05 14:22:40 -05:00
Andy Eschbacher
fa3eecb233 test updates 2017-01-05 14:21:46 -05:00
Andy Eschbacher
daf4d5984c first pass at tests 2017-01-05 13:02:36 -05:00
Andy Eschbacher
5735831142 adding fixture files 2017-01-05 13:01:56 -05:00
Andy Eschbacher
1be0aa8330 adding docs for gwr 2017-01-04 16:37:09 -05:00
Andy Eschbacher
b049da4155 switch order of rowid to be last in output signature 2017-01-04 16:36:46 -05:00
Andy Eschbacher
dbee19723e clean up code and loose ends on data provider changes 2017-01-04 14:40:37 -05:00
Andy Eschbacher
fd9d08dfbc Merge branch 'pysal_gwr' of github.com:TaylorOshan/crankshaft into pysal_gwr 2017-01-04 12:09:00 -05:00
Andy Eschbacher
4d75d2cf74 Merge branch 'develop' into pysal_gwr 2017-01-04 12:03:28 -05:00
Andy Eschbacher
e52dd64dad moves gwr to data analysis provider framework 2017-01-04 11:52:44 -05:00
Andy Eschbacher
d404a66d99 Merge branch 'develop' into pysal_gwr 2017-01-04 11:23:02 -05:00
Andy Eschbacher
7afb6948a4 adds caveats about usage 2017-01-03 10:34:06 -05:00
Taylor Oshan
b06b783c67 update GWR output to also provide filtered (corrected) tvals 2016-12-21 15:00:45 -07:00
Taylor Oshan
f511afbbf0 fix bug in filter tvals; doesn't affect any current crankshaft functions 2016-12-21 08:50:38 -07:00
Taylor Oshan
39c2e01827 add crankshaft gwr_prediction infrastructure 2016-12-17 18:25:38 -07:00
Taylor Oshan
8fcb6a6553 Add GWR prediction base files 2016-12-16 07:30:31 -07:00
Mario de Frutos
34161fd8a4 Merge pull request #152 from CartoDB/develop
Version 0.5.1
2016-12-12 14:18:05 +01:00
Mario de Frutos
850f3f6a31 Merge pull request #151 from CartoDB/fixes_050_deploy
Correct upgrade for 0.5.1 version
2016-12-12 13:59:05 +01:00
Mario de Frutos
021738d9f8 Correct upgrade for 0.5.1 version 2016-12-12 13:51:38 +01:00
Mario de Frutos
161bb14c08 Merge pull request #149 from CartoDB/develop
Release 0.5.0
2016-12-12 11:25:01 +01:00
Andy Eschbacher
3dad9c6044 update key name in test 2016-12-06 13:45:04 -05:00
Andy Eschbacher
bb5de09d6d update order of query gen 2016-12-06 12:49:27 -05:00
Andy Eschbacher
cc0a683a26 fix query templating / response access 2016-12-06 12:26:25 -05:00
Andy Eschbacher
c884eae90e fix data provider ref 2016-12-06 10:33:51 -05:00
Andy Eschbacher
b65fa0c613 remove erroneous queryrunner class 2016-12-06 10:27:24 -05:00
Andy Eschbacher
e98f1cbce5 fix query formatting with dict 2016-12-06 10:19:13 -05:00
Andy Eschbacher
9a80244e76 adds tests and pgsql file 2016-12-06 10:14:37 -05:00
Andy Eschbacher
798e754dfb stubs in kmeans non-spatial 2016-12-05 17:14:36 -05:00
Mario de Frutos
f8739b6a68 Version 0.5.0 release artifacts 2016-12-02 13:35:43 +01:00
Mario de Frutos
5df846fe66 Merge pull request #145 from CartoDB/adds-nonspatial-kmeans
updates internal framework for python functions
2016-12-02 13:23:18 +01:00
Mario de Frutos
b9c4e6e8ef Merge branch 'develop' into adds-nonspatial-kmeans 2016-12-02 13:09:59 +01:00
Mario de Frutos
5c34e08c7d Remove old configuration for postgresql 9.5 in travis 2016-12-02 13:09:09 +01:00
Mario de Frutos
3c8ac7d45d Remove default postgres-9.5 from travis 2016-12-02 12:36:11 +01:00
Andy Eschbacher
59dc9434f7 moves getis to class-based framework 2016-12-01 17:06:21 -05:00
Mario de Frutos
2c6fcfc294 Merge branch 'develop' into adds-nonspatial-kmeans 2016-12-01 16:26:52 +01:00
Mario de Frutos
15b460eeb9 Merge pull request #142 from CartoDB/markov-error-fix
fix error variable name bug, pep8 updates
2016-12-01 10:44:20 +01:00
Mario de Frutos
b0dcd7f572 Merge pull request #137 from CartoDB/adds-outlier-functions
Adds (nonspatial) outlier functions
2016-12-01 10:43:51 +01:00
Mario de Frutos
2547318f59 Merge pull request #127 from CartoDB/adds-getis-analysis
Adds getis analysis
2016-12-01 10:42:13 +01:00
Mario de Frutos
25e453a882 Merge pull request #122 from CartoDB/moran-query-ordering-fix
Moran query ordering fix
2016-12-01 10:41:19 +01:00
Mario de Frutos
62076bb48c Merge pull request #126 from CartoDB/stuartlynn-patch-1
Update PULL_REQUEST_TEMPLATE.md
2016-12-01 10:39:29 +01:00
Andy Eschbacher
6ab1c285d9 places query gen in kmeans data provider 2016-11-30 10:08:36 -05:00
Andy Eschbacher
7efb064fd9 Merge branch 'pysal_gwr' of github.com:TaylorOshan/crankshaft into pysal_gwr 2016-11-29 15:52:11 -05:00
Andy Eschbacher
77b7217368 force array data types 2016-11-29 15:51:59 -05:00
Taylor Oshan
413e6aa5c7 add bw flag that accepts user input 2016-11-29 13:44:36 -07:00
Taylor Oshan
22ce970062 add bandwidth column 2016-11-29 13:22:42 -07:00
Taylor Oshan
f4ccfe712b flatten results 2016-11-29 11:49:19 -07:00
Andy Eschbacher
e6a9397373 pep8 updates 2016-11-29 13:30:07 -05:00
Andy Eschbacher
a9b7d2a9cc Merge branch 'pysal_gwr' of github.com:TaylorOshan/crankshaft into pysal_gwr 2016-11-29 11:58:12 -05:00
Andy Eschbacher
a84806e820 update signature 2016-11-29 11:57:11 -05:00
Taylor Oshan
4a1efc4e3c simplify loops to idx; add json.dumps 2016-11-29 09:16:01 -07:00
Andy Eschbacher
b22f79b0cc Merge branch 'develop' into adds-nonspatial-kmeans 2016-11-29 10:17:18 -05:00
Andy Eschbacher
8aca98433b Merge branch 'pysal_gwr' of github.com:TaylorOshan/crankshaft into pysal_gwr 2016-11-29 10:14:21 -05:00
Andy Eschbacher
b8d08d5a96 Merge branch 'develop' into pysal_gwr 2016-11-29 10:14:07 -05:00
Mario de Frutos
7c63b66fdd Update travis yml to the new postgres-9.5 package 2016-11-29 15:52:29 +01:00
Taylor Oshan
8beb7220b2 gwr output 2016-11-28 15:53:59 -07:00
Andy Eschbacher
b39d0150c7 adds array agg to query 2016-11-28 15:49:10 -05:00
Andy Eschbacher
b399e883ad fix paths; fix query 2016-11-28 20:10:51 +00:00
Andy Eschbacher
6f400ee2b7 Merge branch 'pysal_gwr' of github.com:TaylorOshan/crankshaft into pysal_gwr 2016-11-28 14:54:47 -05:00
Taylor Oshan
c3e99cda30 pysal.contrib -> crankshaft.regression 2016-11-28 12:49:38 -07:00
Andy Eschbacher
5be5a48894 adds psql connector 2016-11-28 19:49:21 +00:00
Taylor Oshan
1c0e4fae47 merge in pg work 2016-11-28 12:36:37 -07:00
Andy Eschbacher
8b061bac72 adds basic pgsql file 2016-11-28 14:33:23 -05:00
Andy Eschbacher
76bd7ff783 update dependency paths 2016-11-28 14:33:02 -05:00
Andy Eschbacher
d7b4eaf110 remove ipython notebook checkpoints 2016-11-28 11:42:40 -05:00
Andy Eschbacher
1b969f6735 more robust table_refs defaults 2016-11-28 11:31:56 -05:00
Andy Eschbacher
6e50e43e1c add query for gwr 2016-11-28 11:07:23 -05:00
Taylor Oshan
cfa9111052 reformat output 2016-11-28 08:24:43 -07:00
Taylor Oshan
01c9195ea5 add glm/gwr base code; start gwr crankshaft func 2016-11-22 16:10:13 -07:00
Andy Eschbacher
db501a2f02 move query generation to inside analysis data provider class 2016-11-22 15:20:14 +00:00
Andy Eschbacher
6fe4fc9668 rename queryrunner in tests 2016-11-22 09:58:06 -05:00
Andy Eschbacher
280a5193ef rename queryrunner to analysisdataprovider 2016-11-22 09:32:39 -05:00
Andy Eschbacher
c27ec58948 Merge branch 'adds-nonspatial-kmeans' of https://github.com/CartoDB/crankshaft into adds-nonspatial-kmeans 2016-11-21 16:26:37 +00:00
Mario de Frutos
bb3ff43f0f Update .travis.yml 2016-11-21 17:25:08 +01:00
Andy Eschbacher
2f27622a6d strips out kmeans non spatial 2016-11-21 16:19:54 +00:00
Andy Eschbacher
c5a2746a53 Merge branch 'develop' into adds-nonspatial-kmeans 2016-11-21 15:46:44 +00:00
Mario de Frutos
538ab9a071 Changed to the last postgresql-9.5 package 2016-11-21 16:14:48 +01:00
Andy Eschbacher
c8f5448b7c separates out query runner 2016-11-19 14:20:06 +00:00
Andy Eschbacher
224fbc2fc5 move to class based markov 2016-11-19 09:05:35 +00:00
Andy Eschbacher
2738c1f29c move to class-based module 2016-11-18 17:46:55 +00:00
Andy Eschbacher
a8bd122762 remove mock plpy dependencies 2016-11-18 17:46:29 +00:00
Andy Eschbacher
a9add4b49c rename results file 2016-11-18 17:40:57 +00:00
Andy Eschbacher
83f1900512 creates class-based approach to analysis methods 2016-11-18 17:26:24 +00:00
Andy Eschbacher
7eee4faac1 rename to match numbering elsewhere 2016-11-18 17:22:02 +00:00
Andy Eschbacher
84d33d841f tests for new class 2016-11-15 12:03:54 +01:00
Andy Eschbacher
ded26dc46b adding class for database response 2016-11-15 12:03:24 +01:00
Andy Eschbacher
0d40080f6c move back to colnames 2016-11-15 12:02:42 +01:00
Andy Eschbacher
0867e69d1f replace plpy method colnames 2016-11-15 11:19:15 +01:00
Andy Eschbacher
cbe8571546 fixes argument in not-standardize 2016-11-15 10:10:07 +01:00
Andy Eschbacher
af536757fe adds silhouettes to output 2016-11-14 23:29:38 +00:00
Andy Eschbacher
b6dae5e380 adding silhouette 2016-11-15 00:15:23 +01:00
Andy Eschbacher
64c4b6611c changes cluster centers to json 2016-11-10 16:56:04 +00:00
Andy Eschbacher
a188b2e104 adds missing arguments 2016-10-21 15:51:54 -06:00
Andy Eschbacher
4389c9538d small updates for readability 2016-10-21 10:13:21 -06:00
Javier Villar
2bc6b0782a Adding requirements.txt to master branch 2016-10-21 16:28:17 +02:00
Andy Eschbacher
3c6d73b7e2 Merge branch 'adds-nonspatial-kmeans' of https://github.com/CartoDB/crankshaft into adds-nonspatial-kmeans 2016-10-18 21:14:09 -06:00
Andy Eschbacher
3e0dba3522 update comments 2016-10-18 21:13:34 -06:00
Andy Eschbacher
5d8641732f change string formatting 2016-10-18 19:30:09 +00:00
Andy Eschbacher
f0c6cca766 fix key name 2016-10-18 13:05:56 -06:00
Andy Eschbacher
f800a35fd1 new format for input data 2016-10-18 13:01:31 -06:00
Andy Eschbacher
54bbd18b02 remove unneeded modules from test script 2016-10-18 12:12:38 -06:00
Andy Eschbacher
da23b002cf rename to match submodule name 2016-10-18 11:51:53 -06:00
Andy Eschbacher
a370a2da52 pep8 updates of test file 2016-10-18 11:50:59 -06:00
Andy Eschbacher
5404589058 Merge branch 'adds-nonspatial-kmeans' of https://github.com/CartoDB/crankshaft into adds-nonspatial-kmeans 2016-10-13 12:52:07 -04:00
Andy Eschbacher
b255fd3e06 make private functions more explicitly private 2016-10-13 12:50:46 -04:00
Andy Eschbacher
0feaf36cf6 outputting consistent labels and centers 2016-10-13 15:52:00 +00:00
Andy Eschbacher
5d2a1881b1 make numpy with global scope in module 2016-10-13 15:00:28 +00:00
Andy Eschbacher
a95423174c adds back alias for kmeans removed by accident 2016-10-13 10:50:48 -04:00
Andy Eschbacher
4314f0f066 adds more robust data processing 2016-10-13 10:28:29 -04:00
Andy Eschbacher
c2e2359e65 adds minmax scaling for variables 2016-10-12 17:16:52 -04:00
Andy Eschbacher
361505fca9 fixes syntax errors 2016-10-12 21:13:51 +00:00
Andy Eschbacher
c47116571f properly close plpgsql function 2016-10-12 14:19:19 -04:00
Andy Eschbacher
3e1cef9958 fix output signature 2016-10-11 16:48:22 -04:00
Andy Eschbacher
947d6ba798 first add 2016-10-11 16:38:18 -04:00
Rafa de la Torre
ffd651b91a Merge remote-tracking branch 'origin/develop' into stuartlynn-patch-1 2016-10-11 15:26:03 +02:00
Andy Eschbacher
c7e690980f update column names in tests 2016-10-06 10:29:52 -04:00
Andy Eschbacher
da1449331c update signature variable names 2016-10-06 09:53:38 -04:00
Andy Eschbacher
c7f5c24510 update signature names 2016-10-06 09:53:22 -04:00
Andy Eschbacher
11c33ce3fa adds pep8 check item 2016-10-06 08:56:14 -04:00
Andy Eschbacher
0a53a6e71d fix error variable name bug, pep8 updates 2016-10-06 08:19:57 -04:00
Andy Eschbacher
fa4e5ae686 Merge branch 'develop' into adds-outlier-functions 2016-09-30 09:48:02 -04:00
Andy Eschbacher
6846014a4f adding docs 2016-09-29 11:42:11 -04:00
Andy Eschbacher
23b2ad57c5 test updates 2016-09-29 11:37:42 -04:00
Andy Eschbacher
99856ce956 flip inequality 2016-09-29 11:37:22 -04:00
Andy Eschbacher
f11982f531 Merge branch 'adds-outlier-functions' of https://github.com/CartoDB/crankshaft into adds-outlier-functions 2016-09-29 11:11:36 -04:00
Andy Eschbacher
bd05e7739d add test to produce error 2016-09-29 11:10:54 -04:00
Andy Eschbacher
5754087140 adds symmetric option for stddev outlier 2016-09-29 11:09:10 -04:00
Andy Eschbacher
8bc6f69a1b adding exceptions to improve robustness 2016-09-29 10:12:32 -04:00
Andy Eschbacher
b54c62890f adds hand-off doc line 2016-09-29 08:48:22 -04:00
Andy Eschbacher
acde384157 update tests 2016-09-28 16:27:41 -04:00
Andy Eschbacher
b8accb48fc adds tests 2016-09-28 15:55:56 -04:00
Andy Eschbacher
f2bb0b496b small fixes 2016-09-26 16:51:22 -04:00
Andy Eschbacher
aaa36569de first add 2016-09-26 16:26:34 -04:00
Andy Eschbacher
803816f5c9 Merge branch 'moran-query-ordering-fix' of https://github.com/CartoDB/crankshaft into moran-query-ordering-fix 2016-09-26 10:15:49 -04:00
Andy Eschbacher
1ef3f86474 small updates after ordering fix 2016-09-26 10:13:27 -04:00
Andy Eschbacher
f1d420a6f7 ordering fixes 2016-09-26 10:11:16 -04:00
Andy Eschbacher
06452562b9 fix ordering problems in input columns 2016-09-26 10:10:52 -04:00
Andy Eschbacher
07e4062237 Merge branch 'develop' into moran-query-ordering-fix 2016-09-23 13:25:36 -04:00
Andy Eschbacher
5443b67470 adding docs for getis ord's g 2016-09-22 08:58:22 -04:00
Andy Eschbacher
795413e46d cleaning test files 2016-09-21 12:01:42 -04:00
Andy Eschbacher
e5ea836493 fix json format 2016-09-21 11:53:17 -04:00
Andy Eschbacher
258322fcca update tests to queen weights from knn 2016-09-21 11:46:07 -04:00
Andy Eschbacher
166e9e223f minor formatting changes 2016-09-20 09:55:33 -04:00
Andy Eschbacher
29de72de33 output column renaming 2016-09-20 09:55:13 -04:00
Andy Eschbacher
eff548dec9 aligning parameters for fixtures and tests 2016-09-19 17:17:51 -04:00
Andy Eschbacher
dcb364c3ee up default number of permutations 2016-09-19 17:16:53 -04:00
Andy Eschbacher
1d09eac3e7 adding pgsql tests 2016-09-19 16:10:29 -04:00
Andy Eschbacher
5127845100 Merge branch 'adds-getis-analysis' of https://github.com/cartodb/crankshaft into adds-getis-analysis 2016-09-19 19:24:43 +00:00
Andy Eschbacher
ee4eb795b7 adding getis fixture file 2016-09-19 19:24:23 +00:00
Andy Eschbacher
2ede55d165 pep8 updates 2016-09-19 12:17:01 -04:00
Andy Eschbacher
df5faa6745 Merge branch 'adds-getis-analysis' of https://github.com/cartodb/crankshaft into adds-getis-analysis
Conflicts:
	src/py/crankshaft/test/test_clustering_getis.py
2016-09-19 15:54:46 +00:00
Andy Eschbacher
06f0cb0dc4 updating how p values are tested 2016-09-19 15:45:10 +00:00
Stuart Lynn
11176b71b3 Update PULL_REQUEST_TEMPLATE.md 2016-09-19 10:47:59 -04:00
Andy Eschbacher
b5445da303 remove kinks in test 2016-09-14 12:45:43 +00:00
Andy Eschbacher
5d109acd8d remove debug messages 2016-09-13 17:59:15 -04:00
Andy Eschbacher
2937c97fea including correct fixtures 2016-09-13 17:47:30 -04:00
Andy Eschbacher
c392aec98a re-ordered columns 2016-09-13 15:32:32 -04:00
Andy Eschbacher
4e42625d79 fix indexing of fixture 2016-09-13 14:29:06 -04:00
Andy Eschbacher
b71152a884 adds fixtures and tests 2016-09-13 09:06:09 -04:00
Andy Eschbacher
ce4cc637ae adding permutations to interface 2016-09-13 09:05:24 -04:00
Andy Eschbacher
ccccf68066 fix module call 2016-09-12 11:39:21 -04:00
Andy Eschbacher
60f52633fa adds hotspot/coldspot function 2016-09-09 11:11:32 -04:00
Andy Eschbacher
1148aa417a additional test on alphabetical ordering 2016-09-06 09:23:59 -04:00
Andy Eschbacher
e29f6f2861 add more comments 2016-09-06 09:23:39 -04:00
Andy Eschbacher
44dc5811b5 updating tests for query ordering error 2016-09-01 16:47:57 -04:00
Andy Eschbacher
40481f1286 adding more tests 2016-08-29 17:10:58 -04:00
Andy Eschbacher
622235d787 :P adding commas 2016-08-29 16:52:40 -04:00
Andy Eschbacher
623613aa5c adding ordered dict to tests 2016-08-29 16:46:49 -04:00
Andy Eschbacher
a451fb5b6a minor ordering changes 2016-08-29 15:50:19 -04:00
316 changed files with 62988 additions and 1045 deletions


@@ -2,6 +2,9 @@
- [ ] All declared geometries are `geometry(Geometry, 4326)` for general geoms, or `geometry(Point, 4326)`
- [ ] Existing functions in the crankshaft python library called from the extension are kept at least from version N to version N+1 (to avoid breakage during upgrades).
- [ ] Docs for public-facing functions are written
- [ ] New functions follow the naming conventions: `CDB_NameOfFunction`, where internal functions begin with an underscore `_` (see the naming sketch after this list).
- [ ] If appropriate, new functions accept an arbitrary query as an input (see [Crankshaft Issue #6](https://github.com/CartoDB/crankshaft/issues/6) for more information)
- [ ] New functions follow the naming conventions: `CDB_NameOfFunction`, where internal functions begin with an underscore
- [ ] Video explaining the analysis and showing examples
- [ ] Analysis Documentation written [template](https://docs.google.com/a/cartodb.com/document/d/1X2KOtaiEBKWNMp8UjwcLB-kE9aIOw09aOjX3oaCjeME/edit?usp=sharing)
- [ ] Smoke test written
- [ ] Hand-off document for camshaft node written
- [ ] If function is in Python, code conforms to [PEP8 Style Guide](https://www.python.org/dev/peps/pep-0008/)
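As a rough sketch of the naming and query-input conventions above (hypothetical function names, not part of the extension):

```sql
-- Hypothetical sketch only: a public function named with the CDB_ prefix
-- that accepts an arbitrary query as its input...
CREATE OR REPLACE FUNCTION CDB_ExampleAnalysis(subquery text)
RETURNS numeric AS $$
BEGIN
    RETURN _CDB_ExampleAnalysis(subquery);
END;
$$ LANGUAGE plpgsql;

-- ...delegating to an internal helper whose name begins with an underscore.
CREATE OR REPLACE FUNCTION _CDB_ExampleAnalysis(subquery text)
RETURNS numeric AS $$
DECLARE
    result numeric;
BEGIN
    EXECUTE format('SELECT count(*)::numeric FROM (%s) AS _wrap', subquery)
        INTO result;
    RETURN result;
END;
$$ LANGUAGE plpgsql;
```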


@@ -1,4 +1,6 @@
language: c
dist: precise
sudo: required
env:
global:
@@ -35,12 +37,16 @@ before_install:
- sudo apt-get -y remove --purge postgresql-9.2
- sudo apt-get -y remove --purge postgresql-9.3
- sudo apt-get -y remove --purge postgresql-9.4
- sudo apt-get -y remove --purge postgis
- sudo apt-get -y remove --purge postgresql-9.5
- sudo rm -rf /var/lib/postgresql/
- sudo rm -rf /var/log/postgresql/
- sudo rm -rf /etc/postgresql/
- sudo apt-get -y remove --purge postgis-2.2
- sudo apt-get -y autoremove
- sudo apt-get -y install postgresql-9.5=9.5.2-2ubuntu1
- sudo apt-get -y install postgresql-server-dev-9.5=9.5.2-2ubuntu1
- sudo apt-get -y install postgresql-plpython-9.5=9.5.2-2ubuntu1
- sudo apt-get -y install postgresql-9.5=9.5.2-3cdb3
- sudo apt-get -y install postgresql-server-dev-9.5=9.5.2-3cdb3
- sudo apt-get -y install postgresql-plpython-9.5=9.5.2-3cdb3
- sudo apt-get -y install postgresql-9.5-postgis-scripts=2.2.2.0-cdb2
- sudo apt-get -y install postgresql-9.5-postgis-2.2=2.2.2.0-cdb2


@@ -5,6 +5,12 @@ refactoring or bugfixing, a topic branch must be created out of the `develop`.
Modifications are done inside `src/pg/sql` and `src/py/crankshaft`.
When adding a new PostgreSQL function or modifying an existing one make sure that the
[VOLATILITY](https://www.postgresql.org/docs/current/static/xfunc-volatility.html) and [PARALLEL](https://www.postgresql.org/docs/9.6/static/parallel-safety.html) categories are updated accordingly.
Since PARALLEL labels need to be stripped for incompatible PostgreSQL versions,
please use _PARALLEL SAFE/RESTRICTED/UNSAFE_ in uppercase so they are handled
automatically.
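For instance, a function carrying both labels might be declared like this (a minimal sketch with a hypothetical function name):

```sql
-- Hypothetical sketch: STABLE is the VOLATILITY category; the uppercase
-- PARALLEL SAFE label is what gets stripped automatically on PostgreSQL < 9.6.
CREATE OR REPLACE FUNCTION CDB_ExampleLabeled(subquery text)
RETURNS numeric AS $$
    SELECT 1::numeric;
$$ LANGUAGE sql STABLE PARALLEL SAFE;
```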
Take into account:
* Tests must be added for any new functionality
@@ -55,3 +61,7 @@ sudo make install
# Run the tests against the installed extension.
make test
```
## Submitting contributions
Before opening a pull request (or submitting a contribution) you will need to sign a Contributor License Agreement (CLA); [learn more here](https://carto.com/contributions).


@@ -4,3 +4,8 @@ PACKAGE = crankshaft
EXTVERSION = $(shell grep default_version $(SELF_DIR)/src/pg/$(EXTENSION).control | sed -e "s/default_version[[:space:]]*=[[:space:]]*'\([^']*\)'/\1/")
RELEASE_VERSION ?= $(EXTVERSION)
SED = sed
PIP = pip
NOSETESTS = nosetests
AWK = awk
PG_CONFIG = pg_config
PG_PARALLEL := $(shell $(PG_CONFIG) --version | ($(AWK) '{$$2*=1000; if ($$2 >= 9600) print 1; else print 0;}' 2> /dev/null || echo 0))
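The `PG_PARALLEL` line above turns the `pg_config` version into a comparable number (9.5 becomes 9500, 9.6 becomes 9600). A sketch of the two moving parts, with an illustrative release file name and a simplified `sed` pattern rather than the exact Makefile rule:

```shell
# "PostgreSQL 9.5.2" -> awk reads $2 as 9.5, multiplies by 1000 -> 9500,
# which is below 9600, so PG_PARALLEL evaluates to 0 on this server.
pg_config --version | awk '{$2*=1000; if ($2 >= 9600) print 1; else print 0;}'

# On pre-9.6 servers the uppercase PARALLEL labels can then be stripped
# from the release file before deploying:
sed -E 's/PARALLEL (SAFE|RESTRICTED|UNSAFE)//g' release/crankshaft--0.6.1.sql
```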

NEWS.md

@@ -1,3 +1,26 @@
0.6.1 (2017-11-23)
------------------
* Add VOLATILITY and PARALLEL categories to PostgreSQL functions
0.6.0 (2017-11-08)
------------------
* Adds new functions: `CDB_GWR` and `CDB_GWR_Predict`
0.5.2 (2017-05-12)
------------------
* Fixes missing comma for dict creation #172
0.5.1 (2016-12-12)
------------------
* Fixed a problem with the upgrade file from 0.4.2 to 0.5.0 that was missing changes that should have been there (as per ethervoid).
0.5.0 (2016-12-12)
------------------
* Updated PULL_REQUEST_TEMPLATE
* Fixed a bug that flipped the order of the numerator and denominator when calculating Moran Local Rate, because the code previously sorted the keys alphabetically.
* Add new CDB_GetisOrdsG functions. Getis-Ord's G\* is a geo-statistical measurement of the intensity of clustering of high or low values
* Add new outlier detection functions: CDB_StaticOutlier, CDB_PercentOutlier and CDB_StdDevOutlier
* Updates in the framework for accessing the Python functions.
0.4.2 (2016-09-22)
------------------
* Bugfix for cdb_areasofinterestglobal: import correct modules


@@ -4,33 +4,35 @@ The release process of a new version of the extension
shall be performed by the designated *Release Manager*.
## Release steps
1. Make sure `develop` branch passes all the tests.
1. Merge `develop` into `master`
1. Update the version number in `src/pg/crankshaft.control`.
1. Generate the next release files with this command:
* Make sure `develop` branch passes all the tests.
* Merge `develop` into `master`
* Update the version number in `src/pg/crankshaft.control`.
* Generate the next release files with this command:
```shell
make release
```
1. Generate an upgrade path from the previous to the next release by copying the generated release file. E.g:
* Generate an upgrade path from the previous to the next release by copying the generated release file. E.g:
```shell
cp release/crankshaft--X.Y.Z.sql release/crankshaft--A.B.C--X.Y.Z.sql
```
NOTE: you can rely on this thanks to the compatibility checks.
TODO: automate this step [#94](https://github.com/CartoDB/crankshaft/issues/94)
2. Update the [NEWS.md](https://github.com/CartoDB/crankshaft/blob/master/NEWS.md) file
1. Commit and push the generated files.
1. Tag the release:
* Update the [NEWS.md](https://github.com/CartoDB/crankshaft/blob/master/NEWS.md) file
* Commit and push the generated files.
* Tag the release:
```
git tag -a X.Y.Z -m "Release X.Y.Z"
git push origin X.Y.Z
```
1. Deploy and test in staging
1. Deploy and test in production
1. Merge back into develop
* Deploy and test in staging
* Merge `master` into **`stable`**
* Deploy and test in production
* Merge `master` into **`develop`**
## Some remarks


@@ -1,5 +1,14 @@
## Areas of Interest Functions
A family of analyses to uncover groupings of areas with consistently high or low values (clusters) and smaller areas with values unlike those around them (outliers). A cluster is labeled by an 'HH' (high value compared to the entire dataset in an area with other high values), or its opposite 'LL'. An outlier is labeled by an 'LH' (low value surrounded by high values) or an 'HL' (the opposite). Each cluster and outlier classification has an associated p-value, a measure of how significant the pattern of highs and lows is compared to a random distribution.
These functions have two forms: local and global. The local versions classify every input geometry while the global function gives a rating of the overall clustering characteristics of the dataset. Both forms accept an optional denominator (see the rate versions) if, for example, you are working with count data and a denominator is needed.
### Notes
* Rows with null values will be omitted from this analysis. To ensure they are included, fill the null-valued cells with an appropriate value, such as the mean of the column or the mean of the most recent two time steps (see the sketch after these notes); alternatively, use a `LEFT JOIN` to get null outputs from the analysis.
* Input query can only accept tables (datasets) in the user's database account. Common table expressions (CTEs) do not work as an input unless specified within the `subquery` argument.
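For example, nulls can be filled with the column mean inside the `subquery` itself (a sketch reusing the `commute_data` example below):

```sql
-- Fill null rates with the column mean so those rows are kept in the analysis.
SELECT
  aoi.*
FROM
  cdb_crankshaft.CDB_AreasOfInterestLocal(
    'SELECT
       cartodb_id,
       the_geom,
       coalesce(num_cyclists_per_total_population,
                avg(num_cyclists_per_total_population) OVER ()
       ) As num_cyclists_per_total_population
     FROM commute_data',
    'num_cyclists_per_total_population') As aoi;
```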
### CDB_AreasOfInterestLocal(subquery text, column_name text)
This function classifies your data as being part of a cluster, as an outlier, or not part of a pattern based on the significance of a classification. The classification happens through an autocorrelation statistic called Local Moran's I.
@@ -29,6 +38,7 @@ A table with the following columns.
| vals | NUMERIC | Values from `'column_name'`. |
#### Example Usage
```sql
@@ -37,8 +47,10 @@ SELECT
aoi.quads,
aoi.significance,
c.num_cyclists_per_total_population
FROM CDB_AreasOfInterestLocal('SELECT * FROM commute_data',
'num_cyclists_per_total_population') As aoi
FROM
cdb_crankshaft.CDB_AreasOfInterestLocal(
'SELECT * FROM commute_data',
'num_cyclists_per_total_population') As aoi
JOIN commute_data As c
ON c.cartodb_id = aoi.rowid;
```
@@ -71,8 +83,12 @@ A table with the following columns.
#### Examples
```sql
SELECT *
FROM CDB_AreasOfInterestGlobal('SELECT * FROM commute_data', 'num_cyclists_per_total_population')
SELECT
*
FROM
cdb_crankshaft.CDB_AreasOfInterestGlobal(
'SELECT * FROM commute_data',
'num_cyclists_per_total_population')
```
### CDB_AreasOfInterestLocalRate(subquery text, numerator_column text, denominator_column text)
@@ -102,7 +118,7 @@ A table with the following columns.
| quads | TEXT | Classification of geometry. Result is one of 'HH' (a high value with neighbors high on average), 'LL' (opposite of 'HH'), 'HL' (a high value surrounded by lows on average), and 'LH' (opposite of 'HL'). Null values are returned when nulls exist in the original data. |
| significance | NUMERIC | The statistical significance (from 0 to 1) of a cluster or outlier classification. Lower numbers are more significant. |
| rowid | INT | Row id of the values which correspond to the input rows. |
| vals | NUMERIC | Values from `'column_name'`. |
| vals | NUMERIC | Standardized rate (centered on the mean and normalized by the standard deviation) calculated from `numerator` and `denominator`. This is calculated by [Assuncao Rate](http://pysal.readthedocs.io/en/latest/library/esda/smoothing.html?highlight=assuncao#pysal.esda.smoothing.assuncao_rate) in the PySAL library. |
#### Example Usage
@@ -113,9 +129,11 @@ SELECT
aoi.quads,
aoi.significance,
c.cyclists_per_total_population
FROM CDB_AreasOfInterestLocalRate('SELECT * FROM commute_data',
'num_cyclists',
'total_population') As aoi
FROM
cdb_crankshaft.CDB_AreasOfInterestLocalRate(
'SELECT * FROM commute_data',
'num_cyclists',
'total_population') As aoi
JOIN commute_data As c
ON c.cartodb_id = aoi.rowid;
```
@@ -149,10 +167,13 @@ A table with the following columns.
#### Examples
```sql
SELECT *
FROM CDB_AreasOfInterestGlobalRate('SELECT * FROM commute_data',
'num_cyclists',
'total_population')
SELECT
*
FROM
cdb_crankshaft.CDB_AreasOfInterestGlobalRate(
'SELECT * FROM commute_data',
'num_cyclists',
'total_population')
```
## Hotspot, Coldspot, and Outlier Functions


@@ -8,7 +8,7 @@ This function takes time series data associated with geometries and outputs like
| Name | Type | Description |
|------|------|-------------|
| subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM real_estate_history`). This query must have the geometry column name `the_geom` and id column name `cartodb_id` unless otherwise specified in the input arguments |
| subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM real_estate_history`). This query must have the geometry column name `the_geom` and id column name `cartodb_id` unless otherwise specified in the input arguments. Tables in queries must exist in the user's database (i.e., no CTEs at present) |
| column_names | TEXT Array | Names of the columns that form the history of measurements for the geometries (e.g., `Array['y2011', 'y2012', 'y2013', 'y2014', 'y2015', 'y2016']`). |
| num_classes (optional) | INT | Number of quantile classes to separate data into. |
| weight type (optional) | TEXT | Type of weight to use when finding neighbors. Currently available options are 'knn' (default) and 'queen'. Read more about weight types in [PySAL's weights documentation](https://pysal.readthedocs.io/en/v1.11.0/users/tutorials/weights.html). |
@@ -30,18 +30,29 @@ A table with the following columns.
| rowid | NUMERIC | id of the row that corresponds to the `id_col` (by default `cartodb_id` of the input rows) |
#### Notes
* Rows with null values will be omitted from this analysis. To ensure they are included, fill the null-valued cells with an appropriate value, such as the mean of the column or the mean of the most recent two time steps (see the sketch after these notes).
* Input query can only accept tables (datasets) in the user's database account. Common table expressions (CTEs) do not work as an input unless specified in the `subquery` parameter.
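For example, a null in the most recent time step can be filled with the mean of the two prior steps inside the subquery (a sketch with hypothetical year columns on `real_estate_history`):

```sql
-- Fill a null 2016 value with the mean of the two most recent time steps
-- so the row is not dropped from the analysis.
SELECT
  m.*
FROM
  cdb_crankshaft.CDB_SpatialMarkovTrend(
    'SELECT
       cartodb_id,
       the_geom,
       y2014,
       y2015,
       coalesce(y2016, (y2014 + y2015) / 2.0) As y2016
     FROM real_estate_history',
    Array['y2014', 'y2015', 'y2016']) As m;
```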
#### Example Usage
```sql
SELECT
c.cartodb_id,
c.the_geom,
c.the_geom_webmercator,
m.trend,
m.trend_up,
m.trend_down,
m.volatility
FROM CDB_SpatialMarkovTrend('SELECT * FROM nyc_real_estate',
Array['m03y2009','m03y2010','m03y2011','m03y2012','m03y2013','m03y2014','m03y2015','m03y2016']) As m
FROM
cdb_crankshaft.CDB_SpatialMarkovTrend(
'SELECT * FROM nyc_real_estate',
Array['m03y2009', 'm03y2010', 'm03y2011',
'm03y2012', 'm03y2013', 'm03y2014',
'm03y2015', 'm03y2016']) As m
JOIN nyc_real_estate As c
ON c.cartodb_id = m.rowid;
```


@@ -54,9 +54,9 @@ with t as (
SELECT
array_agg(cartodb_id::bigint) as id,
array_agg(the_geom) as g,
array_agg(coalesce(gla,0)::numeric) as w
array_agg(coalesce(gla, 0)::numeric) as w
FROM
abel.centros_comerciales_de_madrid
centros_comerciales_de_madrid
WHERE not no_cc
),
s as (
@@ -67,12 +67,15 @@ SELECT
FROM
sscc_madrid
)
select
SELECT
g.the_geom,
trunc(g.h,2) as h,
trunc(g.h, 2) as h,
round(g.hpop) as hpop,
trunc(g.dist/1000,2) as dist_km
FROM t, s, CDB_Gravity1(t.id, t.g, t.w, s.id, s.g, s.p, newmall_ID, 100000, 5000) g
trunc(g.dist/1000, 2) as dist_km
FROM
t,
s,
cdb_crankshaft.CDB_Gravity(t.id, t.g, t.w, s.id, s.g, s.p, newmall_ID, 100000, 5000) as g
```


@@ -44,11 +44,18 @@ Default values:
#### Example Usage
```sql
with a as (
select
WITH a as (
SELECT
array_agg(the_geom) as geomin,
array_agg(temp::numeric) as colin
from table_4804232032
FROM table_4804232032
)
SELECT CDB_SpatialInterpolation(geomin, colin, CDB_latlng(41.38, 2.15),1) FROM a;
SELECT
cdb_crankshaft.CDB_SpatialInterpolation(
geomin,
colin,
CDB_latlng(41.38, 2.15),
1)
FROM
a
```


@@ -27,12 +27,20 @@ PostGIS will include this in future versions ([doc for dev branch](http://postgis
```sql
WITH a AS (
SELECT
ARRAY[ST_GeomFromText('POINT(2.1744 41.403)', 4326),ST_GeomFromText('POINT(2.1228 41.380)', 4326),ST_GeomFromText('POINT(2.1511 41.374)', 4326),ST_GeomFromText('POINT(2.1528 41.413)', 4326),ST_GeomFromText('POINT(2.165 41.391)', 4326),ST_GeomFromText('POINT(2.1498 41.371)', 4326),ST_GeomFromText('POINT(2.1533 41.368)', 4326),ST_GeomFromText('POINT(2.131386 41.41399)', 4326)] AS geomin
ARRAY[
ST_GeomFromText('POINT(2.1744 41.403)', 4326),
ST_GeomFromText('POINT(2.1228 41.380)', 4326),
ST_GeomFromText('POINT(2.1511 41.374)', 4326),
ST_GeomFromText('POINT(2.1528 41.413)', 4326),
ST_GeomFromText('POINT(2.165 41.391)', 4326),
ST_GeomFromText('POINT(2.1498 41.371)', 4326),
ST_GeomFromText('POINT(2.1533 41.368)', 4326),
ST_GeomFromText('POINT(2.131386 41.41399)', 4326)
] AS geomin
)
SELECT
st_transform(
(st_dump(CDB_voronoi(geomin, 0.2, 1e-9)
)).geom
, 3857) as the_geom_webmercator
ST_TRANSFORM(
(ST_Dump(cdb_crankshaft.CDB_Voronoi(geomin, 0.2, 1e-9))).geom,
3857) as the_geom_webmercator
FROM a;
```


@@ -1,17 +1,17 @@
## K-Means Functions
### CDB_KMeans(subquery text, no_clusters INTEGER)
k-means clustering is a popular technique for finding clusters in data by minimizing the intra-cluster 'distance' and maximizing the inter-cluster 'distance'. The distance is defined in the parameter space of the variables entered.
This function attempts to find n clusters within the input data. It will return a table of CartoDB ids and
the number of the cluster each point in the input was assigned to.
### CDB_KMeans(subquery text, no_clusters integer)
This function attempts to find `no_clusters` clusters within the input data based on the geographic distribution. It will return a table with ids and the cluster classification of each input point, assuming `the_geom` is not null-valued. If `the_geom` is null-valued, the point will not be considered in the analysis.
#### Arguments
| Name | Type | Description |
|------|------|-------------|
| subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM interesting_table`). This query must have the geometry column name `the_geom` and id column name `cartodb_id` unless otherwise specified in the input arguments |
| no\_clusters | INTEGER | The number of clusters to try and find |
| no\_clusters | INTEGER | The number of clusters to find |
#### Returns
@@ -19,25 +19,28 @@ A table with the following columns.
| Column Name | Type | Description |
|-------------|------|-------------|
| cartodb\_id | INTEGER | The CartoDB id of the row in the input table.|
| cluster\_no | INTEGER | The cluster that this point belongs to. |
| cartodb\_id | INTEGER | The row id of the row from the input table |
| cluster\_no | INTEGER | The cluster that this point belongs to |
#### Example Usage
```sql
SELECT
customers.*,
km.cluster_no
FROM cdb_crankshaft.CDB_Kmeans('SELECT * from customers' , 6) km, customers_3
WHERE customers.cartodb_id = km.cartodb_id
SELECT
customers.*,
km.cluster_no
FROM
cdb_crankshaft.CDB_KMeans('SELECT * from customers' , 6) As km,
customers
WHERE
customers.cartodb_id = km.cartodb_id
```
### CDB_WeightedMean(subquery text, weight_column text, category_column text)
Function that computes the weighted centroid of a number of clusters by some weight column.
### Arguments
| Name | Type | Description |
|------|------|-------------|
@@ -45,18 +48,75 @@ Function that computes the weighted centroid of a number of clusters by some wei
| weight\_column | TEXT | The name of the column to use as a weight |
| category\_column | TEXT | The name of the column to use as a category |
### Returns
A table with the following columns.
| Column Name | Type | Description |
|-------------|------|-------------|
| the\_geom | GEOMETRY | A point for the weighted cluster center |
| class | INTEGER | The cluster class |
### Example Usage
```sql
SELECT ST_TRANSFORM(the_geom, 3857) as the_geom_webmercator, class
FROM cdb_weighted_mean('SELECT *, customer_value FROM customers','customer_value','cluster_no')
```sql
SELECT
ST_Transform(km.the_geom, 3857) As the_geom_webmercator,
km.class
FROM
cdb_crankshaft.CDB_WeightedMean(
'SELECT *, customer_value FROM customers',
'customer_value',
'cluster_no') As km
```
### CDB_KMeansNonspatial(subquery text, colnames text[], no_clusters int)
K-means clustering classifies the rows of your dataset into `no_clusters` clusters by finding the centers (means) of the variables in `colnames` and classifying each row by its proximity to the nearest center. This method partitions space into distinct Voronoi cells.
As a standard machine learning method, k-means clustering is an unsupervised learning technique that finds the natural clustering of values. For instance, it is useful for finding subgroups in census data leading to demographic segmentation.
### Arguments
| Name | Type | Description |
|------|------|-------------|
| query | TEXT | SQL query to expose the data to be used in the analysis (e.g., `SELECT * FROM iris_data`). It should contain at least the columns specified in `colnames` and the id column (`id_col`). |
| colnames | TEXT[] | Array of columns to be used in the analysis (e.g., `Array['petal_width', 'sepal_length', 'petal_length']`). |
| no\_clusters | INTEGER | Number of clusters for the classification of the data |
| id\_col (optional) | TEXT | The id column (default: 'cartodb_id') for identifying rows |
| standarize (optional) | BOOLEAN | Setting this to true (default) standardizes the data to have a mean at zero and a standard deviation of 1 |
### Returns
A table with the following columns.
| Column | Type | Description |
|--------|------|-------------|
| cluster_label | TEXT | Label of the cluster that a row belongs to, a number from 0 to `no_clusters - 1`. |
| cluster_center | JSON | Center of the cluster that a row belongs to. The keys of the JSON object are the `colnames`, with values that are the center of the respective cluster |
| silhouettes | NUMERIC | [Silhouette score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html#sklearn.metrics.silhouette_score) of the cluster label |
| inertia | NUMERIC | Sum of squared distances of samples to their closest cluster center |
| rowid | BIGINT | id of the original row for associating back with the original data |
### Example Usage
```sql
SELECT
customers.*,
km.cluster_label,
km.cluster_center,
km.silhouettes
FROM
cdb_crankshaft.CDB_KMeansNonspatial(
'SELECT * FROM customers',
Array['customer_value', 'avg_amt_spent', 'home_median_income'],
7) As km,
customers
WHERE
customers.cartodb_id = km.rowid
```
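The optional arguments are passed positionally after `no_clusters`; a sketch assuming a hypothetical `customer_id` id column, with standardization turned off:

```sql
SELECT
  km.cluster_label,
  km.rowid
FROM
  cdb_crankshaft.CDB_KMeansNonspatial(
    'SELECT * FROM customers',
    Array['customer_value', 'avg_amt_spent'],
    7,
    'customer_id', -- id_col (hypothetical id column)
    false          -- standarize: cluster on raw, unscaled values
  ) As km
```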
### Resources
- Read more in [scikit-learn's documentation](http://scikit-learn.org/stable/modules/clustering.html#k-means)
- [K-means basics](https://www.datascience.com/blog/introduction-to-k-means-clustering-algorithm-learn-data-science-tutorials)


@@ -3,7 +3,7 @@
### CDB_CreateAndPredictSegment(query TEXT, variable_name TEXT, target_query TEXT)
This function trains a [Gradient Boosting](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) model to attempt to predict the target data and then generates predictions for new data.
#### Arguments
@@ -34,12 +34,12 @@ A table with the following columns.
SELECT * from cdb_crankshaft.CDB_CreateAndPredictSegment(
'SELECT agg, median_rent::numeric, male_pop::numeric, female_pop::numeric FROM late_night_agg',
'agg',
'SELECT row_number() OVER () As cartodb_id, median_rent, male_pop, female_pop FROM ml_learning_ny');
```
### CDB_CreateAndPredictSegment(target numeric[], train_features numeric[], prediction_features numeric[], prediction_ids numeric[])
This function trains a [Gradient Boosting](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) model to attempt to predict the target data and then generates predictions for new data.
#### Arguments
@@ -76,7 +76,7 @@ WITH training As (
FROM late_night_agg),
target AS (
SELECT cdb_crankshaft.CDB_PyAgg(Array[median_rent, male_pop, female_pop]::Numeric[]) As features,
array_agg(cartodb_id) As cartodb_ids FROM late_night_agg)
SELECT cdb_crankshaft.CDB_CreateAndPredictSegment(training.target, training.features, target.features, target.cartodb_ids)
FROM training, target;


@@ -23,11 +23,17 @@ Function to find the [PIA](https://en.wikipedia.org/wiki/Pole_of_inaccessibility
#### Example Usage
```sql
with a as(
select st_geomfromtext('POLYGON((-432540.453078056 4949775.20452642,-432329.947920966 4951361.232584,-431245.028163694 4952223.31516671,-429131.071033529 4951768.00415574,-424622.07505895 4952843.13503987,-423688.327170174 4953499.20752423,-424086.294349759 4954968.38274191,-423068.388925945 4954378.63345336,-423387.653225542 4953355.67417084,-420594.869840519 4953781.00230592,-416026.095299382 4951484.06849063,-412483.018546414 4951024.5410983,-410490.399661215 4954502.24032205,-408186.197521284 4956398.91417441,-407627.262358013 4959300.94633864,-406948.770061627 4959874.85407739,-404949.583326472 4959047.74518163,-402570.908447199 4953743.46829807,-400971.358683991 4952193.11680804,-403533.488084088 4949649.89857885,-406335.177028373 4950193.19571096,-407790.456731515 4952391.46015616,-412060.672398345 4950381.2389307,-410716.93482498 4949156.7509561,-408464.162289794 4943912.8940387,-409350.599394983 4942819.84896006,-408087.791091424 4942451.6711778,-407274.045613725 4940572.4807777,-404446.196589102 4939976.71501489,-402422.964843936 4940450.3670813,-401010.654464241 4939054.8061663,-397647.247369412 4940679.80737878,-395658.413346901 4940528.84765185,-395536.852462953 4938829.79565997,-394268.923462818 4938003.7277717,-393388.720249116 4934757.80596815,-392393.301362444 4934326.71675815,-392573.527618037 4932323.40974412,-393464.640141837 4931903.10653605,-393085.597275686 4931094.7353605,-398426.261165985 4929156.87541607,-398261.174361137 4926238.00816416,-394045.059966834 4925765.18668498,-392982.960705174 4926391.81893628,-393090.272694301 4927176.84692181,-391648.240010564 4924626.06386961,-391889.914625075 4923086.14787613,-394345.177314013 4923235.086036,-395550.878718795 4917812.79243978,-399009.463978251 4912927.7157945,-398948.794855767 4911941.91010796,-398092.636652078 4911806.57392519,-401991.601817112 4911722.9204501,-406225.972607907 4914505.47286319,-411104.994569885 4912569.26941163,-412925.513522316 4913030.3608866,-414630.148884835 4914436.69169949,-414207.691417276 4919205.78028405,-418306.141109809 4917994.9580478,-424184.700779621 4918938.12432889,-426816.961458921 4923664.37379373,-420956.324227126 4923381.98014807,-420186.661267781 4924286.48693378,-420943.411166194 4926812.76394433,-419779.45457046 4928527.43466337,-419768.767899344 4930681.94459216,-421911.668097113 4930432.40620397,-423482.386112205 4933451.28047252,-427272.814773717 4934151.56473242,-427144.908678797 4939731.77191996,-428982.125554848 4940522.84445172,-428986.133056516 4942437.17281266,-431237.792396792 4947309.68284815,-432476.889648814 4947791.74800037,-432540.453078056 4949775.20452642))', 3857) as g
WITH a as (
SELECT
ST_GeomFromText(
'POLYGON((-432540.453078056 4949775.20452642,-432329.947920966 4951361.232584,-431245.028163694 4952223.31516671,-429131.071033529 4951768.00415574,-424622.07505895 4952843.13503987,-423688.327170174 4953499.20752423,-424086.294349759 4954968.38274191,-423068.388925945 4954378.63345336,-423387.653225542 4953355.67417084,-420594.869840519 4953781.00230592,-416026.095299382 4951484.06849063,-412483.018546414 4951024.5410983,-410490.399661215 4954502.24032205,-408186.197521284 4956398.91417441,-407627.262358013 4959300.94633864,-406948.770061627 4959874.85407739,-404949.583326472 4959047.74518163,-402570.908447199 4953743.46829807,-400971.358683991 4952193.11680804,-403533.488084088 4949649.89857885,-406335.177028373 4950193.19571096,-407790.456731515 4952391.46015616,-412060.672398345 4950381.2389307,-410716.93482498 4949156.7509561,-408464.162289794 4943912.8940387,-409350.599394983 4942819.84896006,-408087.791091424 4942451.6711778,-407274.045613725 4940572.4807777,-404446.196589102 4939976.71501489,-402422.964843936 4940450.3670813,-401010.654464241 4939054.8061663,-397647.247369412 4940679.80737878,-395658.413346901 4940528.84765185,-395536.852462953 4938829.79565997,-394268.923462818 4938003.7277717,-393388.720249116 4934757.80596815,-392393.301362444 4934326.71675815,-392573.527618037 4932323.40974412,-393464.640141837 4931903.10653605,-393085.597275686 4931094.7353605,-398426.261165985 4929156.87541607,-398261.174361137 4926238.00816416,-394045.059966834 4925765.18668498,-392982.960705174 4926391.81893628,-393090.272694301 4927176.84692181,-391648.240010564 4924626.06386961,-391889.914625075 4923086.14787613,-394345.177314013 4923235.086036,-395550.878718795 4917812.79243978,-399009.463978251 4912927.7157945,-398948.794855767 4911941.91010796,-398092.636652078 4911806.57392519,-401991.601817112 4911722.9204501,-406225.972607907 4914505.47286319,-411104.994569885 4912569.26941163,-412925.513522316 4913030.3608866,-414630.148884835 4914436.69169949,-414207.691417276 4919205.78028405,-418306.141109809 4917994.9580478,-424184.700779621 4918938.12432889,-426816.961458921 4923664.37379373,-420956.324227126 4923381.98014807,-420186.661267781 4924286.48693378,-420943.411166194 4926812.76394433,-419779.45457046 4928527.43466337,-419768.767899344 4930681.94459216,-421911.668097113 4930432.40620397,-423482.386112205 4933451.28047252,-427272.814773717 4934151.56473242,-427144.908678797 4939731.77191996,-428982.125554848 4940522.84445172,-428986.133056516 4942437.17281266,-431237.792396792 4947309.68284815,-432476.889648814 4947791.74800037,-432540.453078056 4949775.20452642))',
3857) as g
),
b as (
select ST_Transform(g, 4326) as g from a
SELECT ST_Transform(g, 4326) as g
FROM a
)
SELECT st_astext(CDB_PIA(g)) from b;
SELECT
ST_AsText(cdb_crankshaft.CDB_PIA(g))
FROM b
```


@@ -24,12 +24,22 @@ Returns a table object
#### Example Usage
```sql
with data as (
select
ARRAY[7.0,8.0,1.0,2.0,3.0,5.0,6.0,4.0] as colin,
ARRAY[ST_GeomFromText('POINT(2.1744 41.4036)'),ST_GeomFromText('POINT(2.1228 41.3809)'),ST_GeomFromText('POINT(2.1511 41.3742)'),ST_GeomFromText('POINT(2.1528 41.4136)'),ST_GeomFromText('POINT(2.165 41.3917)'),ST_GeomFromText('POINT(2.1498 41.3713)'),ST_GeomFromText('POINT(2.1533 41.3683)'),ST_GeomFromText('POINT(2.131386 41.413998)')] as geomin
WITH data as (
SELECT
ARRAY[7.0,8.0,1.0,2.0,3.0,5.0,6.0,4.0] as colin,
ARRAY[
ST_GeomFromText('POINT(2.1744 41.4036)'),
ST_GeomFromText('POINT(2.1228 41.3809)'),
ST_GeomFromText('POINT(2.1511 41.3742)'),
ST_GeomFromText('POINT(2.1528 41.4136)'),
ST_GeomFromText('POINT(2.165 41.3917)'),
ST_GeomFromText('POINT(2.1498 41.3713)'),
ST_GeomFromText('POINT(2.1533 41.3683)'),
ST_GeomFromText('POINT(2.131386 41.413998)')
] as geomin
)
select CDB_Densify(geomin, colin, 2) from data;
SELECT cdb_crankshaft.CDB_Densify(geomin, colin, 2)
FROM data
```


@@ -26,11 +26,19 @@ Returns a table object
#### Example Usage
```sql
with data as (
select
ARRAY[7.0,8.0,1.0,2.0,3.0,5.0,6.0,4.0] as colin,
ARRAY[ST_GeomFromText('POINT(2.1744 41.4036)'),ST_GeomFromText('POINT(2.1228 41.3809)'),ST_GeomFromText('POINT(2.1511 41.3742)'),ST_GeomFromText('POINT(2.1528 41.4136)'),ST_GeomFromText('POINT(2.165 41.3917)'),ST_GeomFromText('POINT(2.1498 41.3713)'),ST_GeomFromText('POINT(2.1533 41.3683)'),ST_GeomFromText('POINT(2.131386 41.413998)')] as geomin
WITH data as (
SELECT
ARRAY[7.0,8.0,1.0,2.0,3.0,5.0,6.0,4.0] as colin,
ARRAY[ST_GeomFromText('POINT(2.1744 41.4036)'),
ST_GeomFromText('POINT(2.1228 41.3809)'),
ST_GeomFromText('POINT(2.1511 41.3742)'),
ST_GeomFromText('POINT(2.1528 41.4136)'),
ST_GeomFromText('POINT(2.165 41.3917)'),
ST_GeomFromText('POINT(2.1498 41.3713)'),
ST_GeomFromText('POINT(2.1533 41.3683)'),
ST_GeomFromText('POINT(2.131386 41.413998)')] as geomin
)
select CDB_TINmap(geomin, colin, 2) from data;
SELECT cdb_crankshaft.CDB_TINmap(geomin, colin, 2)
FROM data
```

doc/16_getis_ord_gstar.md Normal file

@@ -0,0 +1,40 @@
## Getis-Ord's G\*
Getis-Ord's G\* is a geo-statistical measurement of the intensity of clustering of high or low values. The clustering of high values can be referred to as "hotspots" because these are areas of high activity or large (relative to the global mean) measurement values. Coldspots are clustered areas with low activity or small measurement values.
### CDB_GetisOrdsG(subquery text, column_name text)
#### Arguments
| Name | Type | Description |
|------|------|-------------|
| subquery | text | A query of the data you want to pass to the function. It must include `column_name`, a geometry column (usually `the_geom`) and an id column (usually `cartodb_id`) |
| column_name | text | This is the column of interest for performing this analysis on. This column should be a numeric type. |
| w_type (optional) | text | Type of weight to use when finding neighbors. Currently available options are 'knn' (default) and 'queen'. Read more about weight types in [PySAL's weights documentation](https://pysal.readthedocs.io/en/v1.11.0/users/tutorials/weights.html). |
| num_ngbrs (optional) | integer | Default: 5. If `knn` is chosen, this will set the number of neighbors. If `knn` is not chosen, any entered value will be ignored. Use `NULL` if not choosing `knn`. |
| permutations (optional) | integer | The number of permutations for calculating p-values. Default: 999 |
| geom_col (optional) | text | The column where the geometry information is stored. The format must be PostGIS Geometry type (SRID 4326). Default: `the_geom`. |
| id_col (optional) | text | The column that has the unique row identifier. |
### Returns
Returns a table with the following columns.
| Name | Type | Description |
|------|------|-------------|
| z_score | numeric | z-score, a measure of the intensity of clustering of high values (hotspots) or low values (coldspots). Positive values represent 'hotspots', while negative values represent 'coldspots'. |
| p_value | numeric | p-value, a measure of the significance of the intensity of clustering |
| p_z_sim | numeric | p-value based on standard normal approximation from permutations |
| rowid | integer | The original `id_col` that can be used to associate the outputs with the original geometry and inputs |
#### Example Usage
The following query returns the original table augmented with the values calculated from the Getis-Ord G\* analysis.
```sql
SELECT i.*, m.z_score, m.p_value
FROM cdb_crankshaft.CDB_GetisOrdsG('SELECT * FROM incident_reports_clustered',
'num_incidents') As m
JOIN incident_reports_clustered As i
ON i.cartodb_id = m.rowid;
```
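Under the hood, crankshaft builds a PySAL weights object from the subquery and hands it to PySAL's `G_Local` (the same call the package source uses). A minimal Python sketch of the statistic, assuming a PySAL 1.x install and an invented neighbor structure:
```python
# Sketch of the underlying statistic; values and neighbor ids are invented.
import numpy as np
import pysal as ps

values = np.array([1.0, 3.0, 5.0, 1.0, 32.0, 3.0])
neighbors = {0: [1, 2], 1: [0, 2], 2: [0, 1, 3], 3: [2, 4], 4: [3, 5], 5: [4]}

weight = ps.W(neighbors)
weight.transform = 'r'  # row-standardize, as crankshaft does

# star=True includes each observation in its own neighborhood (G* rather than G)
gstar = ps.esda.getisord.G_Local(values, weight, star=True, permutations=999)
print(list(zip(gstar.z_sim, gstar.p_sim)))  # (z_score, p_value) per geometry
```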

doc/18_outliers.md Normal file

@@ -0,0 +1,163 @@
## Outlier Detection
This set of functions detects the presence of outliers. There are three functions for finding outliers from non-spatial data:
1. Static Outliers
1. Percentage Outliers
1. Standard Deviation Outliers
### CDB_StaticOutlier(column_value numeric, threshold numeric)
#### Arguments
| Name | Type | Description |
|------|------|-------------|
| column_value | numeric | The column of values on which to apply the threshold |
| threshold | numeric | The static threshold which is used to indicate whether a `column_value` is an outlier or not |
### Returns
Returns a boolean (`true`/`false`): `true` if the value is above the threshold, `false` if it is below or equal to it
| Name | Type | Description |
|------|------|-------------|
| outlier | boolean | classification of whether a row is an outlier or not |
#### Example Usage
With a table `website_visits` and a column of the number of website visits in units of 10,000 visits:
```
| id | visits_10k |
|----|------------|
| 1 | 1 |
| 2 | 3 |
| 3 | 5 |
| 4 | 1 |
| 5 | 32 |
| 6 | 3 |
| 7 | 57 |
| 8 | 2 |
```
```sql
SELECT
id,
cdb_crankshaft.CDB_StaticOutlier(visits_10k, 11.0) As outlier,
visits_10k
FROM website_visits
```
```
| id | outlier | visits_10k |
|----|---------|------------|
| 1 | f | 1 |
| 2 | f | 3 |
| 3 | f | 5 |
| 4 | f | 1 |
| 5 | t | 32 |
| 6 | f | 3 |
| 7 | t | 57 |
| 8 | f | 2 |
```
### CDB_PercentOutlier(column_values numeric[], outlier_fraction numeric, ids int[])
`CDB_PercentOutlier` flags a value as an outlier when it exceeds a threshold fraction (`outlier_fraction`) of the mean of all input values (see the sketch after the example below).
#### Arguments
| Name | Type | Description |
|------|------|-------------|
| column_values | numeric[] | An array of the values to calculate the outlier classification on |
| outlier_fraction | numeric | The threshold above which a column value divided by the mean of all values is considered an outlier |
| ids | int[] | An array of the unique row ids of the input data (usually `cartodb_id`) |
### Returns
Returns a table of the outlier classification with the following columns
| Name | Type | Description |
|------|------|-------------|
| is_outlier | boolean | classification of whether a row is an outlier or not |
| rowid | int | original row id (e.g., input `cartodb_id`) of the row which has the outlier classification |
#### Example Usage
This example finds outliers that are more than 2.0 times the average (that is, more than 100% above the mean).
```sql
WITH cte As (
SELECT
unnest(Array[1,2,3,4,5,6,7,8]) As id,
unnest(Array[1,3,5,1,32,3,57,2]) As visits_10k
)
SELECT
(cdb_crankshaft.CDB_PercentOutlier(array_agg(visits_10k), 2.0, array_agg(id))).*
FROM cte;
```
Output
```
| outlier | rowid |
|---------+-------|
| f | 1 |
| f | 2 |
| f | 3 |
| f | 4 |
| t | 5 |
| f | 6 |
| t | 7 |
| f | 8 |
```
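The classification rule itself is just a comparison against a fraction of the mean. A plain Python sketch of the same rule (names are illustrative; the function's internals may differ):
```python
# Sketch of the percent-of-mean rule; names are illustrative.
def percent_outliers(values, outlier_fraction):
    """Flag values v where v / mean(values) > outlier_fraction."""
    mean = float(sum(values)) / len(values)
    return [v / mean > outlier_fraction for v in values]

print(percent_outliers([1, 3, 5, 1, 32, 3, 57, 2], 2.0))
# [False, False, False, False, True, False, True, False]
```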
### CDB_StdDevOutlier(column_values numeric[], num_deviations numeric, ids int[], is_symmetric boolean DEFAULT true)
`CDB_StdDevOutlier` calculates whether or not a value falls above or below a given threshold based on the number of standard deviations from the mean.
#### Arguments
| Name | Type | Description |
|------|------|-------------|
| column_values | numeric[] | An array of the values to calculate the outlier classification on |
| num_deviations | numeric | The threshold in units of standard deviation |
| ids | int[] | An array of the unique row ids of the input data (usually `cartodb_id`) |
| is_symmetric (optional) | boolean | Consider outliers that are symmetric about the mean (default: true) |
### Returns
Returns a table of the outlier classification with the following columns
| Name | Type | Description |
|------|------|-------------|
| is_outlier | boolean | classification of whether a row is an outlier or not |
| rowid | int | original row id (e.g., input `cartodb_id`) of the row which has the outlier classification |
#### Example Usage
This example finds outliers that are more than two standard deviations from the mean (`num_deviations = 2.0`).
```sql
WITH cte As (
SELECT
unnest(Array[1,2,3,4,5,6,7,8]) As id,
unnest(Array[1,3,5,1,32,3,57,2]) As visits_10k
)
SELECT
(cdb_crankshaft.CDB_StdDevOutlier(array_agg(visits_10k), 2.0, array_agg(id))).*
FROM cte;
```
Output
```
| outlier | rowid |
|---------+-------|
| f | 1 |
| f | 2 |
| f | 3 |
| f | 4 |
| f | 5 |
| f | 6 |
| t | 7 |
| f | 8 |
```
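Equivalently, the rule flags values whose distance from the mean exceeds `num_deviations` standard deviations. A NumPy sketch of that rule (illustrative names; the function's internals may differ):
```python
# Sketch of the standard-deviation rule; names are illustrative.
import numpy as np

def stddev_outliers(values, num_deviations, is_symmetric=True):
    """Flag values more than num_deviations standard deviations from the mean."""
    vals = np.array(values, dtype=float)
    diffs = vals - vals.mean()
    if is_symmetric:
        diffs = np.abs(diffs)  # flag large deviations in either direction
    return diffs > num_deviations * vals.std()

print(stddev_outliers([1, 3, 5, 1, 32, 3, 57, 2], 2.0))
# [False False False False False False  True False]
```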

doc/21_gwr.md Normal file

@@ -0,0 +1,128 @@
## Regression
### Predictive geographically weighted regression (GWR)
Predictive GWR generates estimates of the dependent variable at locations where it has not been observed. It first fits a geographically weighted, spatially varying regression model using nearby observations where both the dependent and independent variables are known, and then applies that model to the known independent variables at the prediction locations to estimate the dependent variable where it is otherwise unknown.
For predictive GWR to work, a dataset needs known independent variables, some known dependent variables, and some unknown dependent variables. The dataset also needs geometry data (e.g., points, lines, or polygons).
#### Arguments
| Name | Type | Description |
|------|------|-------------|
| subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM regression_inputs`). This query must have the geometry column name (see the optional `geom_col` for default), the id column name (see `id_col`), and the dependent (`dep_var`) and independent (`ind_vars`) column names. |
| dep_var | TEXT | Name of the dependent variable in the regression model |
| ind_vars | TEXT[] | Text array of independent variable column names used in the model to describe the dependent variable. |
| bw (optional) | NUMERIC | Value of bandwidth. If `NULL` then select optimal (default). |
| fixed (optional) | BOOLEAN | `True` for a distance-based kernel function, `False` (default) for an adaptive (nearest neighbor) kernel function. |
| kernel (optional)| TEXT | Type of kernel function used to weight observations. One of `gaussian`, `bisquare` (default), or `exponential`. |
#### Returns
| Column Name | Type | Description |
|-------------|------|-------------|
| coeffs | JSON | JSON object with parameter estimates for each of the independent variables. The keys of the JSON object are the independent variable names, with values corresponding to the parameter estimate. |
| stand_errs | JSON | Standard errors for each of the independent variables. The keys of the JSON object are the independent variable names, with values corresponding to the respective standard errors. |
| t_vals | JSON | T-values for each of the independent variables. The keys of the JSON object are the independent variable names, with values corresponding to the respective t-value. |
| predicted | NUMERIC | predicted value of y |
| residuals | NUMERIC | residuals of the response |
| r_squared | NUMERIC | R-squared for the parameter fit |
| bandwidth | NUMERIC | bandwidth value consisting of either a distance or N nearest neighbors |
| rowid | INTEGER | row id of the original row |
#### Example Usage
```sql
SELECT
g.cartodb_id,
g.the_geom,
g.the_geom_webmercator,
(gwr.coeffs->>'pctblack')::numeric as coeff_pctblack,
(gwr.coeffs->>'pctrural')::numeric as coeff_pctrural,
(gwr.coeffs->>'pcteld')::numeric as coeff_pcteld,
(gwr.coeffs->>'pctpov')::numeric as coeff_pctpov,
gwr.residuals
FROM cdb_crankshaft.CDB_GWR_Predict('select * from g_utm'::text,
'pctbach'::text,
Array['pctblack', 'pctrural', 'pcteld', 'pctpov']) As gwr
JOIN g_utm as g
on g.cartodb_id = gwr.rowid
```
Note: See [PostgreSQL syntax for parsing JSON objects](https://www.postgresql.org/docs/9.5/static/functions-json.html).
### Geographically weighted regression model estimation
This analysis generates the model coefficients for a geographically weighted, spatially-varying regression. The model coefficients, along with their respective statistics, allow one to make inferences or describe a dependent variable based on a set of independent variables. Similar to traditional linear regression, GWR takes a linear combination of independent variables and a known dependent variable to estimate an optimal set of coefficients. The model coefficients are spatially varying (controlled by the `bandwidth` and `fixed` parameters), so that the model output is allowed to vary from geometry to geometry. This allows GWR to capture non-stationarity -- that is, how local processes vary over space. In contrast, coefficients obtained from estimating a traditional linear regression model assume that processes are constant over space.
#### Arguments
| Name | Type | Description |
|------|------|-------------|
| subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM regression_inputs`). This query must have the geometry column name (see the optional `geom_col` for default), the id column name (see `id_col`), and the dependent and independent column names. |
| dep_var | TEXT | name of the dependent variable in the regression model |
| ind_vars | TEXT[] | Text array of independent variables used in the model to describe the dependent variable |
| bw (optional) | NUMERIC | Value of bandwidth. If `NULL` then select optimal (default). |
| fixed (optional) | BOOLEAN | `True` for a distance-based kernel function, `False` (default) for an adaptive (nearest neighbor) kernel function. |
| kernel | TEXT | Type of kernel function used to weight observations. One of `gaussian`, `bisquare` (default), or `exponential`. |
#### Returns
| Column Name | Type | Description |
|-------------|------|-------------|
| coeffs | JSON | JSON object with parameter estimates for each of the independent variables. The keys of the JSON object are the independent variable names, with values corresponding to the parameter estimate. |
| stand_errs | JSON | Standard errors for each of the independent variables. The keys of the JSON object are the independent variable names, with values corresponding to the respective standard errors. |
| t_vals | JSON | T-values for each of the independent variables. The keys of the JSON object are the independent variable names, with values corresponding to the respective t-value. |
| predicted | NUMERIC | predicted value of y |
| residuals | NUMERIC | residuals of the response |
| r_squared | NUMERIC | R-squared for the parameter fit |
| bandwidth | NUMERIC | bandwidth value consisting of either a distance or N nearest neighbors |
| rowid | INTEGER | row id of the original row |
#### Example Usage
```sql
SELECT
g.cartodb_id,
g.the_geom,
g.the_geom_webmercator,
(gwr.coeffs->>'pctblack')::numeric as coeff_pctblack,
(gwr.coeffs->>'pctrural')::numeric as coeff_pctrural,
(gwr.coeffs->>'pcteld')::numeric as coeff_pcteld,
(gwr.coeffs->>'pctpov')::numeric as coeff_pctpov,
gwr.residuals
FROM cdb_crankshaft.CDB_GWR('select * from g_utm'::text, 'pctbach'::text, Array['pctblack', 'pctrural', 'pcteld', 'pctpov']) As gwr
JOIN g_utm as g
on g.cartodb_id = gwr.rowid
```
Note: See [PostgreSQL syntax for parsing JSON objects](https://www.postgresql.org/docs/9.5/static/functions-json.html).
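At each location, GWR solves an ordinary weighted least squares problem whose weights decay with distance from that location: beta(i) = (X'W(i)X)^-1 X'W(i)y. A minimal NumPy sketch of that per-location step, assuming a Gaussian kernel; this is the textbook computation, not the extension's actual code path, and all names and toy data here are invented:
```python
# Sketch of the per-location weighted least squares at the heart of GWR.
import numpy as np

def local_coeffs(i, coords, X, y, bandwidth):
    """Estimate spatially varying coefficients at observation i."""
    dists = np.sqrt(((coords - coords[i]) ** 2).sum(axis=1))
    w = np.exp(-0.5 * (dists / bandwidth) ** 2)  # Gaussian kernel weights
    XtW = X.T * w                                # X^T W with W diagonal
    return np.linalg.solve(XtW.dot(X), XtW.dot(y))

# toy data: 20 points, an intercept column plus one predictor
rng = np.random.RandomState(0)
coords = rng.rand(20, 2)
X = np.column_stack([np.ones(20), rng.rand(20)])
y = X.dot([1.0, 2.0]) + rng.normal(scale=0.1, size=20)
print(local_coeffs(0, coords, X, y, bandwidth=0.5))  # local [intercept, slope]
```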
## Advanced reading
* Fotheringham, A. Stewart, Chris Brunsdon, and Martin Charlton. 2002. Geographically Weighted Regression: The Analysis of Spatially Varying Relationships. John Wiley & Sons. <http://www.wiley.com/WileyCDA/WileyTitle/productCd-0471496162.html>
* Brunsdon, Chris, A. Stewart Fotheringham, and Martin E. Charlton. 1996. "Geographically Weighted Regression: A Method for Exploring Spatial Nonstationarity." Geographical Analysis 28 (4): 281–98. <http://onlinelibrary.wiley.com/doi/10.1111/j.1538-4632.1996.tb00936.x/abstract>
* Brunsdon, Chris, Stewart Fotheringham, and Martin Charlton. 1998. "Geographically Weighted Regression." Journal of the Royal Statistical Society: Series D (The Statistician) 47 (3): 431–43. <http://onlinelibrary.wiley.com/doi/10.1111/1467-9884.00145/abstract>
* Fotheringham, A. S., M. E. Charlton, and C. Brunsdon. 1998. "Geographically Weighted Regression: A Natural Evolution of the Expansion Method for Spatial Data Analysis." Environment and Planning A 30 (11): 1905–27. doi:10.1068/a301905. <https://www.researchgate.net/publication/23538637_Geographically_Weighted_Regression_A_Natural_Evolution_Of_The_Expansion_Method_for_Spatial_Data_Analysis>
### GWR for prediction
* Harris, P., A. S. Fotheringham, R. Crespo, and M. Charlton. 2010. "The Use of Geographically Weighted Regression for Spatial Prediction: An Evaluation of Models Using Simulated Data Sets." Mathematical Geosciences 42 (6): 657–80. doi:10.1007/s11004-010-9284-7. <https://www.researchgate.net/publication/225757830_The_Use_of_Geographically_Weighted_Regression_for_Spatial_Prediction_An_Evaluation_of_Models_Using_Simulated_Data_Sets>
### GWR in application
* Cahill, Meagan, and Gordon Mulligan. 2007. "Using Geographically Weighted Regression to Explore Local Crime Patterns." Social Science Computer Review 25 (2): 174–93. doi:10.1177/0894439307298925. <http://isites.harvard.edu/fs/docs/icb.topic923297.files/174.pdf>
* Gilbert, Angela, and Jayajit Chakraborty. 2011. "Using Geographically Weighted Regression for Environmental Justice Analysis: Cumulative Cancer Risks from Air Toxics in Florida." Social Science Research 40 (1): 273–86. doi:10.1016/j.ssresearch.2010.08.006. <http://scholarcommons.usf.edu/cgi/viewcontent.cgi?article=2985&context=etd>
* Ali, Kamar, Mark D. Partridge, and M. Rose Olfert. 2007. "Can Geographically Weighted Regressions Improve Regional Analysis and Policy Making?" International Regional Science Review 30 (3): 300–329. doi:10.1177/0160017607301609. <https://www.researchgate.net/publication/249682503_Can_Geographically_Weighted_Regressions_Improve_Regional_Analysis_and_Policy_Making>
* Lu, Binbin, Martin Charlton, and A. Stewart Fotheringham. 2011. "Geographically Weighted Regression Using a Non-Euclidean Distance Metric with a Study on London House Price Data." Procedia Environmental Sciences, Spatial Statistics 2011: Mapping Global Change, 7: 92–97. doi:10.1016/j.proenv.2011.07.017. <https://www.researchgate.net/publication/261960122_Geographically_weighted_regression_with_a_non-Euclidean_distance_metric_A_case_study_using_hedonic_house_price_data>

10 file diffs suppressed because they are too large


@@ -1,5 +1,5 @@
comment = 'CartoDB Spatial Analysis extension'
default_version = '0.4.2'
default_version = '0.6.1'
requires = 'plpythonu, postgis'
superuser = true
schema = cdb_crankshaft


@@ -0,0 +1,6 @@
"""Import all modules"""
import crankshaft.random_seeds
import crankshaft.clustering
import crankshaft.space_time_dynamics
import crankshaft.segmentation
import analysis_data_provider


@@ -0,0 +1,67 @@
"""class for fetching data"""
import plpy
import pysal_utils as pu
class AnalysisDataProvider:
def get_getis(self, w_type, params):
"""fetch data for getis ord's g"""
try:
query = pu.construct_neighbor_query(w_type, params)
result = plpy.execute(query)
# if there are no neighbors, exit
if len(result) == 0:
return pu.empty_zipped_array(4)
else:
return result
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
def get_markov(self, w_type, params):
"""fetch data for spatial markov"""
try:
query = pu.construct_neighbor_query(w_type, params)
data = plpy.execute(query)
if len(data) == 0:
return pu.empty_zipped_array(4)
return data
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
def get_moran(self, w_type, params):
"""fetch data for moran's i analyses"""
try:
query = pu.construct_neighbor_query(w_type, params)
data = plpy.execute(query)
# if there are no neighbors, exit
if len(data) == 0:
return pu.empty_zipped_array(2)
return data
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
return pu.empty_zipped_array(2)
def get_nonspatial_kmeans(self, query):
"""fetch data for non-spatial kmeans"""
try:
data = plpy.execute(query)
return data
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
def get_spatial_kmeans(self, params):
"""fetch data for spatial kmeans"""
query = ("SELECT "
"array_agg({id_col} ORDER BY {id_col}) as ids,"
"array_agg(ST_X({geom_col}) ORDER BY {id_col}) As xs,"
"array_agg(ST_Y({geom_col}) ORDER BY {id_col}) As ys "
"FROM ({subquery}) As a "
"WHERE {geom_col} IS NOT NULL").format(**params)
try:
data = plpy.execute(query)
return data
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
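Because every analysis class below accepts a `data_provider` in its constructor, tests can swap in a canned provider instead of hitting the database through `plpy`. A hedged sketch of that pattern (the class name and rows are invented):
```python
# Sketch: a canned provider for tests; rows are invented.
class FakeDataProvider(object):
    """Mirrors the AnalysisDataProvider interface without touching plpy."""
    def __init__(self, rows):
        self.rows = rows

    def get_nonspatial_kmeans(self, query):
        return self.rows

provider = FakeDataProvider([{'ids': [1, 2, 3], 'attr1': [0.5, 0.7, 0.2]}])
print(provider.get_nonspatial_kmeans('SELECT ...'))
```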


@@ -0,0 +1,4 @@
"""Import all functions from for clustering"""
from moran import *
from kmeans import *
from getis import *


@@ -0,0 +1,50 @@
"""
Getis-Ord's G geostatistics (hotspot/coldspot analysis)
"""
import pysal as ps
from collections import OrderedDict
# crankshaft modules
import crankshaft.pysal_utils as pu
from crankshaft.analysis_data_provider import AnalysisDataProvider
# High level interface ---------------------------------------
class Getis:
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()
else:
self.data_provider = data_provider
def getis_ord(self, subquery, attr,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Getis-Ord's G*
Implementation building neighbors with a PostGIS database and PySAL's
Getis-Ord's G* hotspot/coldspot module.
Andy Eschbacher
"""
# geometries with attributes that are null are ignored
# resulting in a collection of not as near neighbors if kNN is chosen
qvals = OrderedDict([("id_col", id_col),
("attr1", attr),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_getis(w_type, qvals)
attr_vals = pu.get_attributes(result)
# build PySAL weight object
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate Getis-Ord's G* z- and p-values
getis = ps.esda.getisord.G_Local(attr_vals, weight,
star=True, permutations=permutations)
return zip(getis.z_sim, getis.p_sim, getis.p_z_sim, weight.id_order)


@@ -0,0 +1,32 @@
from sklearn.cluster import KMeans
import numpy as np
from crankshaft.analysis_data_provider import AnalysisDataProvider
class Kmeans:
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()
else:
self.data_provider = data_provider
def spatial(self, query, no_clusters, no_init=20):
"""
find centers based on clusters of latitude/longitude pairs
query: SQL query that has a WGS84 geometry (the_geom)
"""
params = {"subquery": query,
"geom_col": "the_geom",
"id_col": "cartodb_id"}
data = self.data_provider.get_spatial_kmeans(params)
# Unpack query response
xs = data[0]['xs']
ys = data[0]['ys']
ids = data[0]['ids']
km = KMeans(n_clusters=no_clusters, n_init=no_init)
labels = km.fit_predict(zip(xs, ys))
return zip(ids, labels)
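Once the coordinates are fetched, the clustering itself is plain scikit-learn. A self-contained sketch with synthetic coordinates (crankshaft pulls `xs`/`ys`/`ids` from the subquery instead):
```python
# Sketch of the clustering step with synthetic coordinates.
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
xs = np.concatenate([rng.normal(0.0, 0.1, 20), rng.normal(5.0, 0.1, 20)])
ys = np.concatenate([rng.normal(0.0, 0.1, 20), rng.normal(5.0, 0.1, 20)])
ids = list(range(40))  # stand-ins for cartodb_id values

km = KMeans(n_clusters=2, n_init=20)
labels = km.fit_predict(np.column_stack([xs, ys]))
print(list(zip(ids, labels))[:3])  # (id, cluster label) pairs
```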


@@ -0,0 +1,208 @@
"""
Moran's I geostatistics (global clustering & outliers presence)
"""
# TODO: Fill in local neighbors which have null/NoneType values with the
# average of the their neighborhood
import pysal as ps
from collections import OrderedDict
from crankshaft.analysis_data_provider import AnalysisDataProvider
# crankshaft module
import crankshaft.pysal_utils as pu
# High level interface ---------------------------------------
class Moran:
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()
else:
self.data_provider = data_provider
def global_stat(self, subquery, attr_name,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I (global)
Implementation building neighbors with a PostGIS database and Moran's I
core clusters with PySAL.
Andy Eschbacher
"""
params = OrderedDict([("id_col", id_col),
("attr1", attr_name),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
# collect attributes
attr_vals = pu.get_attributes(result)
# calculate weights
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate moran global
moran_global = ps.esda.moran.Moran(attr_vals, weight,
permutations=permutations)
return zip([moran_global.I], [moran_global.EI])
def local_stat(self, subquery, attr,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I implementation for PL/Python
Andy Eschbacher
"""
# geometries with attributes that are null are ignored
# resulting in a collection of not as near neighbors
params = OrderedDict([("id_col", id_col),
("attr1", attr),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
attr_vals = pu.get_attributes(result)
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local(attr_vals, weight,
permutations=permutations)
# find quadrants for each geometry
quads = quad_position(lisa.q)
return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)
def global_rate_stat(self, subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I Rate (global)
Andy Eschbacher
"""
params = OrderedDict([("id_col", id_col),
("attr1", numerator),
("attr2", denominator)
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
# collect attributes
numer = pu.get_attributes(result, 1)
denom = pu.get_attributes(result, 2)
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate moran global rate
lisa_rate = ps.esda.moran.Moran_Rate(numer, denom, weight,
permutations=permutations)
return zip([lisa_rate.I], [lisa_rate.EI])
def local_rate_stat(self, subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I Local Rate
Andy Eschbacher
"""
# geometries with values that are null are ignored
# resulting in a collection of not as near neighbors
params = OrderedDict([("id_col", id_col),
("numerator", numerator),
("denominator", denominator),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
# collect attributes
numer = pu.get_attributes(result, 1)
denom = pu.get_attributes(result, 2)
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local_Rate(numer, denom, weight,
permutations=permutations)
# find quadrants for each geometry
quads = quad_position(lisa.q)
return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)
def local_bivariate_stat(self, subquery, attr1, attr2,
permutations, geom_col, id_col,
w_type, num_ngbrs):
"""
Moran's I (local) Bivariate (untested)
"""
params = OrderedDict([("id_col", id_col),
("attr1", attr1),
("attr2", attr2),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
# collect attributes
attr1_vals = pu.get_attributes(result, 1)
attr2_vals = pu.get_attributes(result, 2)
# create weights
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local_BV(attr1_vals, attr2_vals, weight,
permutations=permutations)
# find clustering of significance
lisa_sig = quad_position(lisa.q)
return zip(lisa.Is, lisa_sig, lisa.p_sim, weight.id_order)
# Low level functions ----------------------------------------
def map_quads(coord):
"""
Map a quadrant number to Moran's I designation
HH=1, LH=2, LL=3, HL=4
Input:
@param coord (int): quadrant of a specific measurement
Output:
classification (one of 'HH', 'LH', 'LL', or 'HL')
"""
if coord == 1:
return 'HH'
elif coord == 2:
return 'LH'
elif coord == 3:
return 'LL'
elif coord == 4:
return 'HL'
else:
return None
def quad_position(quads):
"""
Produce Moran's I classification based on quadrant numbers
Input:
@param quads ndarray: an array of quads classified by
1-4 (PySAL default)
Output:
@param list: an array of quads classified by 'HH', 'LL', etc.
"""
return [map_quads(q) for q in quads]
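For example, with the module's `quad_position` in scope (importing crankshaft outside the database requires stubbing `plpy`), PySAL's default 1-4 quadrant codes map as:
```python
# Uses quad_position/map_quads defined above.
print(quad_position([1, 3, 2, 4, 1]))  # ['HH', 'LL', 'LH', 'HL', 'HH']
```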


@@ -0,0 +1,2 @@
"""Import all functions for pysal_utils"""
from crankshaft.pysal_utils.pysal_utils import *


@@ -0,0 +1,211 @@
"""
Utilities module for generic PySAL functionality, mainly centered on
translating queries into numpy arrays or PySAL weights objects
"""
import numpy as np
import pysal as ps
def construct_neighbor_query(w_type, query_vals):
"""Return query (a string) used for finding neighbors
@param w_type text: type of neighbors to calculate ('knn' or 'queen')
@param query_vals dict: values used to construct the query
"""
if w_type.lower() == 'knn':
return knn(query_vals)
else:
return queen(query_vals)
# Build weight object
def get_weight(query_res, w_type='knn', num_ngbrs=5):
"""
Construct PySAL weight from return value of query
@param query_res dict-like: query results with attributes and neighbors
"""
# if w_type.lower() == 'knn':
# row_normed_weights = [1.0 / float(num_ngbrs)] * num_ngbrs
# weights = {x['id']: row_normed_weights for x in query_res}
# else:
# weights = {x['id']: [1.0 / len(x['neighbors'])] * len(x['neighbors'])
# if len(x['neighbors']) > 0
# else [] for x in query_res}
neighbors = {x['id']: x['neighbors'] for x in query_res}
print 'len of neighbors: %d' % len(neighbors)
built_weight = ps.W(neighbors)
built_weight.transform = 'r'
return built_weight
def query_attr_select(params):
"""
Create portion of SELECT statement for attributes involved in query.
Defaults to order in the params
@param params: dict of information used in query (column names,
table name, etc.)
Example:
OrderedDict([('numerator', 'price'),
('denominator', 'sq_meters'),
('subquery', 'SELECT * FROM interesting_data')])
Output:
"i.\"price\"::numeric As attr1, " \
"i.\"sq_meters\"::numeric As attr2, "
"""
attr_string = ""
template = "i.\"%(col)s\"::numeric As attr%(alias_num)s, "
if 'time_cols' in params:
# if markov analysis
attrs = params['time_cols']
for idx, val in enumerate(attrs):
attr_string += template % {"col": val, "alias_num": idx + 1}
else:
# if moran's analysis
attrs = [k for k in params
if k not in ('id_col', 'geom_col', 'subquery',
'num_ngbrs', 'subquery')]
for idx, val in enumerate(attrs):
attr_string += template % {"col": params[val],
"alias_num": idx + 1}
return attr_string
def query_attr_where(params):
"""
Construct where conditions when building neighbors query
Create portion of WHERE clauses for weeding out NULL-valued geometries
Input: dict of params:
{'subquery': ...,
'numerator': 'data1',
'denominator': 'data2',
'': ...}
Output:
'idx_replace."data1" IS NOT NULL AND idx_replace."data2" IS NOT NULL'
Input:
{'subquery': ...,
'time_cols': ['time1', 'time2', 'time3'],
'etc': ...}
Output: 'idx_replace."time1" IS NOT NULL AND idx_replace."time2" IS NOT
NULL AND idx_replace."time3" IS NOT NULL'
"""
attr_string = []
template = "idx_replace.\"%s\" IS NOT NULL"
if 'time_cols' in params:
# markov where clauses
attrs = params['time_cols']
# add values to template
for attr in attrs:
attr_string.append(template % attr)
else:
# moran where clauses
# get keys
attrs = [k for k in params
if k not in ('id_col', 'geom_col', 'subquery',
'num_ngbrs', 'subquery')]
# add values to template
for attr in attrs:
attr_string.append(template % params[attr])
if 'denominator' in attrs:
attr_string.append(
"idx_replace.\"%s\" <> 0" % params['denominator'])
out = " AND ".join(attr_string)
return out
def knn(params):
"""SQL query for k-nearest neighbors.
@param vars: dict of values to fill template
"""
attr_select = query_attr_select(params)
attr_where = query_attr_where(params)
replacements = {"attr_select": attr_select,
"attr_where_i": attr_where.replace("idx_replace", "i"),
"attr_where_j": attr_where.replace("idx_replace", "j")}
query = "SELECT " \
"i.\"{id_col}\" As id, " \
"%(attr_select)s" \
"(SELECT ARRAY(SELECT j.\"{id_col}\" " \
"FROM ({subquery}) As j " \
"WHERE " \
"i.\"{id_col}\" <> j.\"{id_col}\" AND " \
"%(attr_where_j)s " \
"ORDER BY " \
"j.\"{geom_col}\" <-> i.\"{geom_col}\" ASC " \
"LIMIT {num_ngbrs})" \
") As neighbors " \
"FROM ({subquery}) As i " \
"WHERE " \
"%(attr_where_i)s " \
"ORDER BY i.\"{id_col}\" ASC;" % replacements
return query.format(**params)
# SQL query for finding queens neighbors (all contiguous polygons)
def queen(params):
"""SQL query for queen neighbors.
@param params dict: information to fill query
"""
attr_select = query_attr_select(params)
attr_where = query_attr_where(params)
replacements = {"attr_select": attr_select,
"attr_where_i": attr_where.replace("idx_replace", "i"),
"attr_where_j": attr_where.replace("idx_replace", "j")}
query = "SELECT " \
"i.\"{id_col}\" As id, " \
"%(attr_select)s" \
"(SELECT ARRAY(SELECT j.\"{id_col}\" " \
"FROM ({subquery}) As j " \
"WHERE i.\"{id_col}\" <> j.\"{id_col}\" AND " \
"ST_Touches(i.\"{geom_col}\", j.\"{geom_col}\") AND " \
"%(attr_where_j)s)" \
") As neighbors " \
"FROM ({subquery}) As i " \
"WHERE " \
"%(attr_where_i)s " \
"ORDER BY i.\"{id_col}\" ASC;" % replacements
return query.format(**params)
# to add more weight methods open a ticket or pull request
def get_attributes(query_res, attr_num=1):
"""
@param query_res: query results with attributes and neighbors
@param attr_num: attribute number (1, 2, ...)
"""
return np.array([x['attr' + str(attr_num)] for x in query_res],
dtype=np.float)
def empty_zipped_array(num_nones):
"""
prepare return values for cases of empty weights objects (no neighbors)
Input:
@param num_nones int: number of columns (e.g., 4)
Output:
[(None, None, None, None)]
"""
return [tuple([None] * num_nones)]
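Putting the builders together: for a Moran-style parameter dict, `construct_neighbor_query` assembles one SQL statement returning each row's id, its numeric attributes aliased `attr1`, `attr2`, ..., and an array of its neighbors' ids. A sketch, assuming `crankshaft` is importable (inside PL/Python it is; outside, the package's `plpy` imports need a stub):
```python
# Sketch: building the kNN neighbor query for a one-attribute analysis.
from collections import OrderedDict
import crankshaft.pysal_utils as pu

# Ordering matters: non-reserved keys become attr1, attr2, ... in order.
params = OrderedDict([('id_col', 'cartodb_id'),
                      ('attr1', 'price'),
                      ('geom_col', 'the_geom'),
                      ('subquery', 'SELECT * FROM homes'),
                      ('num_ngbrs', 5)])
print(pu.construct_neighbor_query('knn', params))
```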


@@ -0,0 +1,11 @@
"""Random seed generator used for non-deterministic functions in crankshaft"""
import random
import numpy
def set_random_seeds(value):
"""
Set the seeds of the RNGs (Random Number Generators)
used internally.
"""
random.seed(value)
numpy.random.seed(value)
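A quick check of what this buys you: seeding both RNG streams makes otherwise non-deterministic runs repeat exactly. The function body is restated here so the sketch is self-contained:
```python
# Self-contained restatement of set_random_seeds for illustration.
import random
import numpy

def set_random_seeds(value):
    random.seed(value)
    numpy.random.seed(value)

set_random_seeds(42)
first = (random.random(), numpy.random.rand())
set_random_seeds(42)
second = (random.random(), numpy.random.rand())
print(first == second)  # True: both streams restart identically
```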


@@ -0,0 +1 @@
from segmentation import *


@@ -0,0 +1,176 @@
"""
Segmentation creation and prediction
"""
import sklearn
import numpy as np
import plpy
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics
from sklearn.cross_validation import train_test_split
# Lower level functions
#----------------------
def replace_nan_with_mean(array):
"""
Input:
@param array: an array of floats which may have null-valued entries
Output:
array with nans filled in with the mean of the dataset
"""
# returns an array of rows and column indices
indices = np.where(np.isnan(array))
# iterate through entries which have nan values
for row, col in zip(*indices):
array[row, col] = np.mean(array[~np.isnan(array[:, col]), col])
return array
def get_data(variable, feature_columns, query):
"""
Fetch data from the database, clean, and package into
numpy arrays
Input:
@param variable: name of the target variable
@param feature_columns: list of column names
@param query: subquery that data is pulled from for the packaging
Output:
prepared data, packaged into NumPy arrays
"""
columns = ','.join(['array_agg("{col}") As "{col}"'.format(col=col) for col in feature_columns])
try:
data = plpy.execute('''SELECT array_agg("{variable}") As target, {columns} FROM ({query}) As a'''.format(
variable=variable,
columns=columns,
query=query))
except Exception, e:
plpy.error('Failed to access data to build segmentation model: %s' % e)
# extract target data from plpy object
target = np.array(data[0]['target'])
# put n feature data arrays into an n x m array of arrays
features = np.column_stack([np.array(data[0][col], dtype=float) for col in feature_columns])
return replace_nan_with_mean(target), replace_nan_with_mean(features)
# High level interface
# --------------------
def create_and_predict_segment_agg(target, features, target_features, target_ids, model_parameters):
"""
Version of create_and_predict_segment that works on arrays that come straight
from the SQL calling the function.
Input:
@param target: The 1D array of length NSamples containing the target variable we want the model to predict
@param features: The 2D array of size NSamples x NFeatures that forms the input to the model
@param target_ids: A 1D array of target_ids that will be used to associate the results of the prediction with the rows which they come from
@param model_parameters: A dictionary containing parameters for the model.
"""
clean_target = replace_nan_with_mean(target)
clean_features = replace_nan_with_mean(features)
target_features = replace_nan_with_mean(target_features)
model, accuracy = train_model(clean_target, clean_features, model_parameters, 0.2)
prediction = model.predict(target_features)
accuracy_array = [accuracy]*prediction.shape[0]
return zip(target_ids, prediction, np.full(prediction.shape, accuracy_array))
def create_and_predict_segment(query, variable, target_query, model_params):
"""
generate a segment with machine learning
Stuart Lynn
"""
## fetch column names
try:
columns = plpy.execute('SELECT * FROM ({query}) As a LIMIT 1 '.format(query=query))[0].keys()
except Exception, e:
plpy.error('Failed to build segmentation model: %s' % e)
## extract column names to be used in building the segmentation model
feature_columns = set(columns) - set([variable, 'cartodb_id', 'the_geom', 'the_geom_webmercator'])
## get data from database
target, features = get_data(variable, feature_columns, query)
model, accuracy = train_model(target, features, model_params, 0.2)
cartodb_ids, result = predict_segment(model, feature_columns, target_query)
accuracy_array = [accuracy]*result.shape[0]
return zip(cartodb_ids, result, accuracy_array)
def train_model(target, features, model_params, test_split):
"""
Train the Gradient Boosting model on the provided data and calculate the accuracy of the model
Input:
@param target: 1D Array of the variable that the model is to be trained to predict
@param features: 2D Array NSamples x NFeatures to use in training the model
@param model_params: A dictionary of model parameters, the full specification can be found on the
scikit learn page for [GradientBoostingRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
@param test_split: The fraction of the data to be withheld for testing the model / calculating the accuracy
"""
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=test_split)
model = GradientBoostingRegressor(**model_params)
model.fit(features_train, target_train)
accuracy = calculate_model_accuracy(model, features, target)
return model, accuracy
def calculate_model_accuracy(model, features, target):
"""
Calculate the mean squared error of the model prediction
Input:
@param model: model trained from input features
@param features: features to make a prediction from
@param target: target to compare prediction to
Output:
mean squared error of the model prediction compared to the target
"""
prediction = model.predict(features)
return metrics.mean_squared_error(prediction, target)
def predict_segment(model, features, target_query):
"""
Use the provided model to predict the values for the new feature set
Input:
@param model: The pretrained model
@param features: A list of features to use in the model prediction (list of column names)
@param target_query: The query to run to obtain the data to predict on and the cartodb_ids associated with it.
"""
batch_size = 1000
joined_features = ','.join(['"{0}"::numeric'.format(a) for a in features])
try:
cursor = plpy.cursor('SELECT Array[{joined_features}] As features FROM ({target_query}) As a'.format(
joined_features=joined_features,
target_query=target_query))
except Exception, e:
plpy.error('Failed to build segmentation model: %s' % e)
results = []
while True:
rows = cursor.fetch(batch_size)
if not rows:
break
batch = np.row_stack([np.array(row['features'], dtype=float) for row in rows])
#Need to fix this. Should be global mean. This will cause weird effects
batch = replace_nan_with_mean(batch)
prediction = model.predict(batch)
results.append(prediction)
try:
cartodb_ids = plpy.execute('''SELECT array_agg(cartodb_id ORDER BY cartodb_id) As cartodb_ids FROM ({0}) As a'''.format(target_query))[0]['cartodb_ids']
except Exception, e:
plpy.error('Failed to build segmentation model: %s' % e)
return cartodb_ids, np.concatenate(results)
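`replace_nan_with_mean` fills each missing entry with the mean of its own column, which is easiest to see on a tiny array (values invented; same logic as the function above):
```python
# Column-wise NaN imputation, mirroring replace_nan_with_mean above.
import numpy as np

arr = np.array([[1.0, np.nan],
                [3.0, 4.0],
                [np.nan, 8.0]])
for row, col in zip(*np.where(np.isnan(arr))):
    arr[row, col] = np.mean(arr[~np.isnan(arr[:, col]), col])
print(arr)  # [[1. 6.] [3. 4.] [2. 8.]] -- NaNs replaced by column means
```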


@@ -0,0 +1,2 @@
"""Import all functions from clustering libraries."""
from markov import *


@@ -0,0 +1,194 @@
"""
Spatial dynamics measurements using Spatial Markov
"""
# TODO: remove all plpy dependencies
import numpy as np
import pysal as ps
import plpy
import crankshaft.pysal_utils as pu
from crankshaft.analysis_data_provider import AnalysisDataProvider
class Markov:
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()
else:
self.data_provider = data_provider
def spatial_trend(self, subquery, time_cols, num_classes=7,
w_type='knn', num_ngbrs=5, permutations=0,
geom_col='the_geom', id_col='cartodb_id'):
"""
Predict the trends of a unit based on:
1. history of its transitions to different classes (e.g., 1st
quantile -> 2nd quantile)
2. average class of its neighbors
Inputs:
@param subquery string: e.g., SELECT the_geom, cartodb_id,
interesting_time_column FROM table_name
@param time_cols list of strings: list of strings of column names
@param num_classes (optional): number of classes to break
distribution of values into. Currently uses quantile bins.
@param w_type string (optional): weight type ('knn' or 'queen')
@param num_ngbrs int (optional): number of neighbors (if knn type)
@param permutations int (optional): number of permutations for test
stats
@param geom_col string (optional): name of column which contains
the geometries
@param id_col string (optional): name of column which has the ids
of the table
Outputs:
@param trend_up float: probability that a geom will move to a higher
class
@param trend_down float: probability that a geom will move to a
lower class
@param trend float: (trend_up - trend_down) / trend_static
@param volatility float: a measure of the volatility based on
probability stddev(prob array)
"""
if len(time_cols) < 2:
plpy.error('More than one time column needs to be passed')
params = {"id_col": id_col,
"time_cols": time_cols,
"geom_col": geom_col,
"subquery": subquery,
"num_ngbrs": num_ngbrs}
query_result = self.data_provider.get_markov(w_type, params)
# build weight
weights = pu.get_weight(query_result, w_type)
weights.transform = 'r'
# prep time data
t_data = get_time_data(query_result, time_cols)
sp_markov_result = ps.Spatial_Markov(t_data,
weights,
k=num_classes,
fixed=False,
permutations=permutations)
# get lag classes
lag_classes = ps.Quantiles(
ps.lag_spatial(weights, t_data[:, -1]),
k=num_classes).yb
# look up probability distribution for each unit according to class and
# lag class
prob_dist = get_prob_dist(sp_markov_result.P,
lag_classes,
sp_markov_result.classes[:, -1])
# find the ups and down and overall distribution of each cell
trend_up, trend_down, trend, volatility = get_prob_stats(prob_dist, sp_markov_result.classes[:, -1])
# output the results
return zip(trend, trend_up, trend_down, volatility, weights.id_order)
def get_time_data(markov_data, time_cols):
"""
Extract the time columns and bin appropriately
"""
num_attrs = len(time_cols)
return np.array([[x['attr' + str(i)] for x in markov_data]
for i in range(1, num_attrs+1)], dtype=float).transpose()
# not currently used
def rebin_data(time_data, num_time_per_bin):
"""
Convert an n x l matrix into an n x ceil(l/m) matrix, where each group of
m consecutive time columns is reduced (averaged) into one column:
1 2 3 4 1.5 3.5
5 6 7 8 -> 5.5 7.5
9 8 7 6 8.5 6.5
5 4 3 2 4.5 2.5
if m = 2, the 4 x 4 matrix is transformed to a 4 x 2 matrix.
This process effectively resamples the data at a time span m times
longer than the input data.
For cases when there is a remainder (remainder(5/3) = 2), the remaining
two columns are binned together as the last time period, while the
first three are binned together for the first period.
Input:
@param time_data n x l ndarray: measurements of an attribute at
different time intervals
@param num_time_per_bin int: number of columns to average into a new
column
Output:
n x ceil(l / m) ndarray of resampled time series
"""
if time_data.shape[1] % num_time_per_bin == 0:
# if fit is perfect, then use it
n_max = time_data.shape[1] / num_time_per_bin
else:
# fit remainders into an additional column
n_max = time_data.shape[1] / num_time_per_bin + 1
return np.array(
[time_data[:, num_time_per_bin * i:num_time_per_bin * (i+1)].mean(axis=1)
for i in range(n_max)]).T
def get_prob_dist(transition_matrix, lag_indices, unit_indices):
"""
Given an array of transition matrices, look up the probability
associated with the arrangements passed
Input:
@param transition_matrix ndarray[k,k,k]:
@param lag_indices ndarray:
@param unit_indices ndarray:
Output:
Array of probability distributions
"""
return np.array([transition_matrix[(lag_indices[i], unit_indices[i])]
for i in range(len(lag_indices))])
def get_prob_stats(prob_dist, unit_indices):
"""
get the statistics of the probability distributions
Outputs:
@param trend_up ndarray(float): sum of probabilities for upward
movement (relative to the unit index of that prob)
@param trend_down ndarray(float): sum of probabilities for downward
movement (relative to the unit index of that prob)
@param trend ndarray(float): difference of upward and downward
movements
"""
num_elements = len(unit_indices)
trend_up = np.empty(num_elements, dtype=float)
trend_down = np.empty(num_elements, dtype=float)
trend = np.empty(num_elements, dtype=float)
for i in range(num_elements):
trend_up[i] = prob_dist[i, (unit_indices[i]+1):].sum()
trend_down[i] = prob_dist[i, :unit_indices[i]].sum()
if prob_dist[i, unit_indices[i]] > 0.0:
trend[i] = (trend_up[i] - trend_down[i]) / (
prob_dist[i, unit_indices[i]])
else:
trend[i] = None
# calculate volatility of distribution
volatility = prob_dist.std(axis=1)
return trend_up, trend_down, trend, volatility
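With the module's `rebin_data` in scope (importing crankshaft outside the database requires stubbing `plpy`), the docstring's worked example checks out:
```python
# Uses rebin_data defined above: average pairs of time columns (m = 2).
import numpy as np

time_data = np.array([[1., 2., 3., 4.],
                      [5., 6., 7., 8.],
                      [9., 8., 7., 6.],
                      [5., 4., 3., 2.]])
print(rebin_data(time_data, 2))
# [[1.5 3.5]
#  [5.5 7.5]
#  [8.5 6.5]
#  [4.5 2.5]]
```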


@@ -0,0 +1,5 @@
joblib==0.8.3
numpy==1.6.1
scipy==0.14.0
pysal==1.11.2
scikit-learn==0.14.1


@@ -0,0 +1,49 @@
"""
CartoDB Spatial Analysis Python Library
See:
https://github.com/CartoDB/crankshaft
"""
from setuptools import setup, find_packages
setup(
name='crankshaft',
version='0.5.0',
description='CartoDB Spatial Analysis Python Library',
url='https://github.com/CartoDB/crankshaft',
author='Data Services Team - CartoDB',
author_email='dataservices@cartodb.com',
license='MIT',
classifiers=[
'Development Status :: 3 - Alpha',
'Intended Audience :: Mapping community',
'Topic :: Maps :: Mapping Tools',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 2.7',
],
keywords='maps mapping tools spatial analysis geostatistics',
packages=find_packages(exclude=['contrib', 'docs', 'tests']),
extras_require={
'dev': ['unittest'],
'test': ['unittest', 'nose', 'mock'],
},
# The choice of component versions is dictated by what's
# provisioned in the production servers.
# IMPORTANT NOTE: please don't change this line. Instead issue a ticket to systems for evaluation.
install_requires=['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.11.2', 'scikit-learn==0.14.1'],
requires=['pysal', 'numpy', 'sklearn'],
test_suite='test'
)


@@ -0,0 +1 @@
[[0.004793783909323601, 0.17999999999999999, 0.49808756424021061], [-1.0701189472090842, 0.079000000000000001, 0.14228288580832316], [-0.67867750971877305, 0.42099999999999999, 0.24867110969448558], [-0.67407386707620487, 0.246, 0.25013217644612995], [-0.79495689068870035, 0.33200000000000002, 0.21331928959090596], [-0.49279481022182703, 0.058999999999999997, 0.31107878905057329], [-0.38075627530057132, 0.28399999999999997, 0.35169205342069643], [-0.86710921611314895, 0.23699999999999999, 0.19294108571294855], [-0.78618647240956485, 0.050000000000000003, 0.2158791250244505], [-0.76108527223116984, 0.064000000000000001, 0.22330306830813684], [-0.13340753531942209, 0.247, 0.44693554317763651], [-0.57584545722033043, 0.48999999999999999, 0.28235982246156488], [-0.78882694661192831, 0.433, 0.2151065788731219], [-0.38769767950046219, 0.375, 0.34911988661484239], [-0.56057819488052207, 0.41399999999999998, 0.28754255985169652], [-0.41354017495644935, 0.45500000000000002, 0.339605447117173], [-0.23993577722243081, 0.49099999999999999, 0.40519002230969337], [-0.1389080156677496, 0.40400000000000003, 0.44476141839645233], [-0.25485737510500855, 0.376, 0.39941662953554224], [-0.71218610582902353, 0.17399999999999999, 0.23817476979886087], [-0.54533105995872144, 0.13700000000000001, 0.2927629228714812], [-0.39547917847510977, 0.033000000000000002, 0.34624464252424236], [-0.43052658996257548, 0.35399999999999998, 0.33340631435564982], [-0.37296719193774736, 0.40300000000000002, 0.35458643102865428], [-0.66482612169465694, 0.31900000000000001, 0.25308085650392698], [-0.13772133540823422, 0.34699999999999998, 0.44523032843016275], [-0.6765304487868502, 0.20999999999999999, 0.24935196033890672], [-0.64518763494323472, 0.32200000000000001, 0.25940279912025543], [-0.5078622084312413, 0.41099999999999998, 0.30577498972600159], [-0.12652006733772059, 0.42899999999999999, 0.44966013262301163], [-0.32691133022814595, 0.498, 0.37186747562269029], [0.25533848511500978, 0.42399999999999999, 0.39923083899077472], [2.7045138116476508, 0.0050000000000000001, 0.0034202212972238577], [-0.1551614486076057, 0.44400000000000001, 0.43834701985429037], [1.9524487722567723, 0.012999999999999999, 0.025442473674991528], [-1.2055816465306763, 0.017000000000000001, 0.11398941970467646], [3.478472976017831, 0.002, 0.00025213964072468009], [-1.4621715757903719, 0.002, 0.071847099325659136], [-0.84010307600180256, 0.085000000000000006, 0.20042529779230778], [5.7097646237318243, 0.0030000000000000001, 5.6566262784940591e-09], [1.5082367956567375, 0.065000000000000002, 0.065746966514827365], [-0.58337270103430816, 0.44, 0.27982121546450034], [-0.083271860457022437, 0.45100000000000001, 0.46681768733385554], [-0.46872337815000953, 0.34599999999999997, 0.31963368715684204], [0.18490279849545319, 0.23799999999999999, 0.42665263797981101], [3.470424529947997, 0.012, 0.00025981817437825683], [-0.99942612137154796, 0.032000000000000001, 0.15879415560388499], [-1.3650387953594485, 0.034000000000000002, 0.08612042845912049], [1.8617160516432014, 0.081000000000000003, 0.03132156240215267], [1.1321188945775384, 0.11600000000000001, 0.12879222611766061], [0.064116686050580601, 0.27300000000000002, 0.4744386578180424], [-0.42032194540259099, 0.29999999999999999, 0.33712514016213468], [-0.79581215423980922, 0.123, 0.21307061309098785], [-0.42792753720906046, 0.45600000000000002, 0.33435193892883741], [-1.0629378527428395, 0.051999999999999998, 0.14390506780140866], [-0.54164761752225477, 0.33700000000000002, 0.29403064095211839], 
[1.0934778886820793, 0.13700000000000001, 0.13709201601893539], [-0.094068785378413719, 0.38200000000000001, 0.46252725802998929], [0.13482026574801856, 0.36799999999999999, 0.44637699118865737], [-0.13976995315653129, 0.34699999999999998, 0.44442087706276601], [-0.051047663924746682, 0.32000000000000001, 0.47964376985626245], [-0.21468297736730158, 0.41699999999999998, 0.41500724761906527], [-0.20873154637330626, 0.38800000000000001, 0.41732890604390893], [-0.32427876152583485, 0.49199999999999999, 0.37286349875557478], [-0.65254842943280977, 0.374, 0.25702372075306734], [-0.48611858196118796, 0.23300000000000001, 0.31344154643990074], [-0.14482354344529477, 0.32600000000000001, 0.44242509660469886], [-0.51052030974200002, 0.439, 0.30484349480873729], [0.56814382285283538, 0.14999999999999999, 0.28496865660103166], [0.58680919931668207, 0.161, 0.27866592887231878], [0.013390357044409013, 0.25800000000000001, 0.49465818005865647], [-0.19050728887961568, 0.41399999999999998, 0.4244558160399462], [-0.60531777422216049, 0.35199999999999998, 0.2724839368239631], [1.0899331115425805, 0.127, 0.13787130480311838], [0.17015055382651084, 0.36899999999999999, 0.43244586845546418], [-0.21738337124409801, 0.40600000000000003, 0.41395479459421991], [1.0329303331079593, 0.079000000000000001, 0.15081825117169467], [1.0218317101096221, 0.104, 0.15343027913308094]]


@@ -0,0 +1 @@
[{"xs": [9.917239463463458, 9.042767302696836, 10.798929825304187, 8.763751051762995, 11.383882954810852, 11.018206993460897, 8.939526075734316, 9.636159342565252, 10.136336896960058, 11.480610059427342, 12.115011910725082, 9.173267848893428, 10.239300931201738, 8.00012512174072, 8.979962292282131, 9.318376124429575, 10.82259513754284, 10.391747171927115, 10.04904588886165, 9.96007160443463, -0.78825626804569, -0.3511819898577426, -1.2796410003764271, -0.3977049391203402, 2.4792311265774667, 1.3670311632092624, 1.2963504112955613, 2.0404844103073025, -1.6439708506073223, 0.39122885445645805, 1.026031821452462, -0.04044477160482201, -0.7442346929085072, -0.34687120826243034, -0.23420359971379054, -0.5919629143336708, -0.202903054395391, -0.1893399644841902, 1.9331834251176807, -0.12321054392851609], "ys": [8.735627063679981, 9.857615954045011, 10.81439096759407, 10.586727233537191, 9.232919976568622, 11.54281262696508, 8.392787912674466, 9.355119689665944, 9.22380703532752, 10.542142541823122, 10.111980619367035, 10.760836265570738, 8.819773453269804, 10.25325722424816, 9.802077905695608, 8.955420161552611, 9.833801181904477, 10.491684241001613, 12.076108669877556, 11.74289693140474, -0.5685725015474191, -0.5715728344759778, -0.20180907868635137, 0.38431336480089595, -0.3402202083684184, -2.4652736827783586, 0.08295159401756182, 0.8503818775816505, 0.6488691600321166, 0.5794762568230527, -0.6770063922144103, -0.6557616416449478, -1.2834289177624947, 0.1096318195532717, -0.38986922166834853, -1.6224497706950238, 0.09429787743230483, 0.4005097316394031, -0.508002811195673, -1.2473463371366507], "ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]}]


@@ -0,0 +1 @@
[[0.11111111111111112, 0.10000000000000001, 0.0, 0.35213633723318016, 0], [0.03125, 0.030303030303030304, 0.0, 0.3850273981640871, 1], [0.03125, 0.030303030303030304, 0.0, 0.3850273981640871, 2], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 3], [0.0, 0.065217391304347824, 0.065217391304347824, 0.33605067580764519, 4], [-0.054054054054054057, 0.0, 0.05128205128205128, 0.37488547451276033, 5], [0.1875, 0.23999999999999999, 0.12, 0.23731835158706122, 6], [0.034482758620689655, 0.0625, 0.03125, 0.35388469167230169, 7], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 8], [0.19047619047619049, 0.16, 0.0, 0.32594478059941379, 9], [-0.23529411764705882, 0.0, 0.19047619047619047, 0.31356338348865387, 10], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 11], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 12], [0.027777777777777783, 0.11111111111111112, 0.088888888888888892, 0.30339641183779581, 13], [0.03125, 0.030303030303030304, 0.0, 0.3850273981640871, 14], [0.052631578947368425, 0.090909090909090912, 0.045454545454545456, 0.33352611505171165, 15], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 16], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 17], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 18], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 19], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 20], [0.078947368421052641, 0.073170731707317083, 0.0, 0.36451788667842738, 21], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 22], [-0.16666666666666663, 0.18181818181818182, 0.27272727272727271, 0.20246415864836445, 23], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 24], [0.1875, 0.23999999999999999, 0.12, 0.23731835158706122, 25], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 26], [-0.043478260869565216, 0.0, 0.041666666666666664, 0.37950991789118999, 27], [0.22222222222222221, 0.18181818181818182, 0.0, 0.31701083225750354, 28], [-0.054054054054054057, 0.0, 0.05128205128205128, 0.37488547451276033, 29], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 30], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 31], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 32], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 33], [0.034482758620689655, 0.0625, 0.03125, 0.35388469167230169, 34], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 35], [-0.054054054054054057, 0.0, 0.05128205128205128, 0.37488547451276033, 36], [0.11111111111111112, 0.10000000000000001, 0.0, 0.35213633723318016, 37], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 38], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 39], [0.034482758620689655, 0.0625, 0.03125, 0.35388469167230169, 40], [0.11111111111111112, 0.10000000000000001, 0.0, 0.35213633723318016, 41], [0.052631578947368425, 0.090909090909090912, 0.045454545454545456, 0.33352611505171165, 42], [0.0, 0.0, 0.0, 0.40000000000000002, 43], [0.0, 0.065217391304347824, 0.065217391304347824, 0.33605067580764519, 44], [0.078947368421052641, 0.073170731707317083, 0.0, 0.36451788667842738, 45], [0.052631578947368425, 
0.090909090909090912, 0.045454545454545456, 0.33352611505171165, 46], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 47]]

View File

@@ -0,0 +1,52 @@
[[0.9319096128346788, "HH"],
[-1.135787401862846, "HL"],
[0.11732030672508517, "LL"],
[0.6152779669180425, "LL"],
[-0.14657336660125297, "LH"],
[0.6967858120189607, "LL"],
[0.07949310115714454, "HH"],
[0.4703198759258987, "HH"],
[0.4421125200498064, "HH"],
[0.5724288737143592, "LL"],
[0.8970743435692062, "LL"],
[0.18327334401918674, "LL"],
[-0.01466729201304962, "HL"],
[0.3481559372544409, "LL"],
[0.06547094736902978, "LL"],
[0.15482141569329988, "HH"],
[0.4373841193538136, "HH"],
[0.15971286468915544, "LL"],
[1.0543588860308968, "HH"],
[1.7372866900020818, "HH"],
[1.091998586053999, "LL"],
[0.1171572584252222, "HH"],
[0.08438455015300014, "LL"],
[0.06547094736902978, "LL"],
[0.15482141569329985, "HH"],
[1.1627044812890683, "HH"],
[0.06547094736902978, "LL"],
[0.795275137550483, "HH"],
[0.18562939195219, "LL"],
[0.3010757406693439, "LL"],
[2.8205795942839376, "HH"],
[0.11259190602909264, "LL"],
[-0.07116352791516614, "HL"],
[-0.09945240794119009, "LH"],
[0.18562939195219, "LL"],
[0.1832733440191868, "LL"],
[-0.39054253768447705, "HL"],
[-0.1672071289487642, "HL"],
[0.3337669247916343, "HH"],
[0.2584386102554792, "HH"],
[-0.19733845476322634, "HL"],
[-0.9379282899805409, "LH"],
[-0.028770969951095866, "LH"],
[0.051367269430983485, "LL"],
[-0.2172548045913472, "LH"],
[0.05136726943098351, "LL"],
[0.04191046803899837, "LL"],
[0.7482357030403517, "HH"],
[-0.014585767863118111, "LH"],
[0.5410013139159929, "HH"],
[1.0223932668429925, "LL"],
[1.4179402898927476, "LL"]]

View File

@@ -0,0 +1,54 @@
[
{"neighbors": [48, 26, 20, 9, 31], "id": 1, "value": 0.5},
{"neighbors": [30, 16, 46, 3, 4], "id": 2, "value": 0.7},
{"neighbors": [46, 30, 2, 12, 16], "id": 3, "value": 0.2},
{"neighbors": [18, 30, 23, 2, 52], "id": 4, "value": 0.1},
{"neighbors": [47, 40, 45, 37, 28], "id": 5, "value": 0.3},
{"neighbors": [10, 21, 41, 14, 37], "id": 6, "value": 0.05},
{"neighbors": [8, 17, 43, 25, 12], "id": 7, "value": 0.4},
{"neighbors": [17, 25, 43, 22, 7], "id": 8, "value": 0.7},
{"neighbors": [39, 34, 1, 26, 48], "id": 9, "value": 0.5},
{"neighbors": [6, 37, 5, 45, 49], "id": 10, "value": 0.04},
{"neighbors": [51, 41, 29, 21, 14], "id": 11, "value": 0.08},
{"neighbors": [44, 46, 43, 50, 3], "id": 12, "value": 0.2},
{"neighbors": [45, 23, 14, 28, 18], "id": 13, "value": 0.4},
{"neighbors": [41, 29, 13, 23, 6], "id": 14, "value": 0.2},
{"neighbors": [36, 27, 32, 33, 24], "id": 15, "value": 0.3},
{"neighbors": [19, 2, 46, 44, 28], "id": 16, "value": 0.4},
{"neighbors": [8, 25, 43, 7, 22], "id": 17, "value": 0.6},
{"neighbors": [23, 4, 29, 14, 13], "id": 18, "value": 0.3},
{"neighbors": [42, 16, 28, 26, 40], "id": 19, "value": 0.7},
{"neighbors": [1, 48, 31, 26, 42], "id": 20, "value": 0.8},
{"neighbors": [41, 6, 11, 14, 10], "id": 21, "value": 0.1},
{"neighbors": [25, 50, 43, 31, 44], "id": 22, "value": 0.4},
{"neighbors": [18, 13, 14, 4, 2], "id": 23, "value": 0.1},
{"neighbors": [33, 49, 34, 47, 27], "id": 24, "value": 0.3},
{"neighbors": [43, 8, 22, 17, 50], "id": 25, "value": 0.4},
{"neighbors": [1, 42, 20, 31, 48], "id": 26, "value": 0.6},
{"neighbors": [32, 15, 36, 33, 24], "id": 27, "value": 0.3},
{"neighbors": [40, 45, 19, 5, 13], "id": 28, "value": 0.8},
{"neighbors": [11, 51, 41, 14, 18], "id": 29, "value": 0.3},
{"neighbors": [2, 3, 4, 46, 18], "id": 30, "value": 0.1},
{"neighbors": [20, 26, 1, 50, 48], "id": 31, "value": 0.9},
{"neighbors": [27, 36, 15, 49, 24], "id": 32, "value": 0.3},
{"neighbors": [24, 27, 49, 34, 32], "id": 33, "value": 0.4},
{"neighbors": [47, 9, 39, 40, 24], "id": 34, "value": 0.3},
{"neighbors": [38, 51, 11, 21, 41], "id": 35, "value": 0.3},
{"neighbors": [15, 32, 27, 49, 33], "id": 36, "value": 0.2},
{"neighbors": [49, 10, 5, 47, 24], "id": 37, "value": 0.5},
{"neighbors": [35, 21, 51, 11, 41], "id": 38, "value": 0.4},
{"neighbors": [9, 34, 48, 1, 47], "id": 39, "value": 0.6},
{"neighbors": [28, 47, 5, 9, 34], "id": 40, "value": 0.5},
{"neighbors": [11, 14, 29, 21, 6], "id": 41, "value": 0.4},
{"neighbors": [26, 19, 1, 9, 31], "id": 42, "value": 0.2},
{"neighbors": [25, 12, 8, 22, 44], "id": 43, "value": 0.3},
{"neighbors": [12, 50, 46, 16, 43], "id": 44, "value": 0.2},
{"neighbors": [28, 13, 5, 40, 19], "id": 45, "value": 0.3},
{"neighbors": [3, 12, 44, 2, 16], "id": 46, "value": 0.2},
{"neighbors": [34, 40, 5, 49, 24], "id": 47, "value": 0.3},
{"neighbors": [1, 20, 26, 9, 39], "id": 48, "value": 0.5},
{"neighbors": [24, 37, 47, 5, 33], "id": 49, "value": 0.2},
{"neighbors": [44, 22, 31, 42, 26], "id": 50, "value": 0.6},
{"neighbors": [11, 29, 41, 14, 21], "id": 51, "value": 0.01},
{"neighbors": [4, 18, 29, 51, 23], "id": 52, "value": 0.01}
]

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,13 @@
import unittest
from mock_plpy import MockPlPy
plpy = MockPlPy()
import sys
sys.modules['plpy'] = plpy
import os
def fixture_file(name):
dir = os.path.dirname(os.path.realpath(__file__))
return os.path.join(dir, 'fixtures', name)
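# Usage sketch (illustrative): fixture names resolve against the fixtures/
# directory that sits alongside this helper module, e.g.
#   fixture_file('moran.json')  # -> <test dir>/fixtures/moran.json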

View File

@@ -0,0 +1,54 @@
import re
class MockCursor:
def __init__(self, data):
self.cursor_pos = 0
self.data = data
def fetch(self, batch_size):
batch = self.data[self.cursor_pos:self.cursor_pos + batch_size]
self.cursor_pos += batch_size
return batch
class MockPlPy:
def __init__(self):
self._reset()
def _reset(self):
self.infos = []
self.notices = []
self.debugs = []
self.logs = []
self.warnings = []
self.errors = []
self.fatals = []
self.executes = []
self.results = []
self.prepares = []
def _define_result(self, query, result):
pattern = re.compile(query, re.IGNORECASE | re.MULTILINE)
self.results.append([pattern, result])
def notice(self, msg):
self.notices.append(msg)
def debug(self, msg):
self.debugs.append(msg)
def info(self, msg):
self.infos.append(msg)
def cursor(self, query):
data = self.execute(query)
return MockCursor(data)
# TODO: additional arguments
def execute(self, query):
for result in self.results:
if result[0].match(query):
return result[1]
return []
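# Usage sketch (illustrative): tests register a regex with _define_result;
# execute() returns the canned rows for the first matching pattern, else [].
#   plpy = MockPlPy()
#   plpy._define_result('select \* from points', [{'x': 1}])
#   plpy.execute('SELECT * FROM points')  # -> [{'x': 1}]
#   plpy.execute('SELECT * FROM other')   # -> []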

View File

@@ -0,0 +1,78 @@
import unittest
import numpy as np
from helper import fixture_file
from crankshaft.clustering import Getis
import crankshaft.pysal_utils as pu
from crankshaft import random_seeds
import json
from crankshaft.analysis_data_provider import AnalysisDataProvider
# Fixture files produced as follows
#
# import pysal as ps
# import numpy as np
# import random
#
# # setup variables
# f = ps.open(ps.examples.get_path("stl_hom.dbf"))
# y = np.array(f.by_col['HR8893'])
# w_queen = ps.queen_from_shapefile(ps.examples.get_path("stl_hom.shp"))
#
# out_queen = [{"id": index + 1,
# "neighbors": [x+1 for x in w_queen.neighbors[index]],
# "value": val} for index, val in enumerate(y)]
#
# with open('neighbors_queen_getis.json', 'w') as f:
# f.write(str(out_queen))
#
# random.seed(1234)
# np.random.seed(1234)
# lgstar_queen = ps.esda.getisord.G_Local(y, w_queen, star=True,
# permutations=999)
#
# with open('getis_queen.json', 'w') as f:
# f.write(str(zip(lgstar_queen.z_sim,
# lgstar_queen.p_sim, lgstar_queen.p_z_sim)))
class FakeDataProvider(AnalysisDataProvider):
def __init__(self, mock_data):
self.mock_result = mock_data
def get_getis(self, w_type, param):
return self.mock_result
class GetisTest(unittest.TestCase):
"""Testing class for Getis-Ord's G* funtion
This test replicates the work done in PySAL documentation:
https://pysal.readthedocs.io/en/v1.11.0/users/tutorials/autocorrelation.html#local-g-and-g
"""
def setUp(self):
# load raw data for analysis
self.neighbors_data = json.loads(
open(fixture_file('neighbors_getis.json')).read())
# load pre-computed/known values
self.getis_data = json.loads(
open(fixture_file('getis.json')).read())
def test_getis_ord(self):
"""Test Getis-Ord's G*"""
data = [{'id': d['id'],
'attr1': d['value'],
'neighbors': d['neighbors']} for d in self.neighbors_data]
random_seeds.set_random_seeds(1234)
getis = Getis(FakeDataProvider(data))
result = getis.getis_ord('subquery', 'value',
'queen', None, 999, 'the_geom',
'cartodb_id')
result = [(row[0], row[1]) for row in result]
expected = np.array(self.getis_data)[:, 0:2]
for ([res_z, res_p], [exp_z, exp_p]) in zip(result, expected):
self.assertAlmostEqual(res_z, exp_z, delta=1e-2)

View File

@@ -0,0 +1,56 @@
import unittest
import numpy as np
# from mock_plpy import MockPlPy
# plpy = MockPlPy()
#
# import sys
# sys.modules['plpy'] = plpy
from helper import fixture_file
from crankshaft.clustering import Kmeans
from crankshaft.analysis_data_provider import AnalysisDataProvider
import crankshaft.clustering as cc
from crankshaft import random_seeds
import json
from collections import OrderedDict
class FakeDataProvider(AnalysisDataProvider):
def __init__(self, mocked_result):
self.mocked_result = mocked_result
def get_spatial_kmeans(self, query):
return self.mocked_result
def get_nonspatial_kmeans(self, query, standarize):
return self.mocked_result
class KMeansTest(unittest.TestCase):
"""Testing class for k-means spatial"""
def setUp(self):
self.cluster_data = json.loads(
open(fixture_file('kmeans.json')).read())
self.params = {"subquery": "select * from table",
"no_clusters": "10"}
def test_kmeans(self):
"""
"""
data = [{'xs': d['xs'],
'ys': d['ys'],
'ids': d['ids']} for d in self.cluster_data]
random_seeds.set_random_seeds(1234)
kmeans = Kmeans(FakeDataProvider(data))
clusters = kmeans.spatial('subquery', 2)
labels = [a[1] for a in clusters]
c1 = [a for a in clusters if a[1] == 0]
c2 = [a for a in clusters if a[1] == 1]
self.assertEqual(len(np.unique(labels)), 2)
self.assertEqual(len(c1), 20)
self.assertEqual(len(c2), 20)

View File

@@ -0,0 +1,112 @@
import unittest
import numpy as np
from helper import fixture_file
from crankshaft.clustering import Moran
from crankshaft.analysis_data_provider import AnalysisDataProvider
import crankshaft.pysal_utils as pu
from crankshaft import random_seeds
import json
from collections import OrderedDict
class FakeDataProvider(AnalysisDataProvider):
def __init__(self, mock_data):
self.mock_result = mock_data
def get_moran(self, w_type, params):
return self.mock_result
class MoranTest(unittest.TestCase):
"""Testing class for Moran's I functions"""
def setUp(self):
self.params = {"id_col": "cartodb_id",
"attr1": "andy",
"attr2": "jay_z",
"subquery": "SELECT * FROM a_list",
"geom_col": "the_geom",
"num_ngbrs": 321}
self.params_markov = {"id_col": "cartodb_id",
"time_cols": ["_2013_dec", "_2014_jan",
"_2014_feb"],
"subquery": "SELECT * FROM a_list",
"geom_col": "the_geom",
"num_ngbrs": 321}
self.neighbors_data = json.loads(
open(fixture_file('neighbors.json')).read())
self.moran_data = json.loads(
open(fixture_file('moran.json')).read())
def test_map_quads(self):
"""Test map_quads"""
from crankshaft.clustering import map_quads
self.assertEqual(map_quads(1), 'HH')
self.assertEqual(map_quads(2), 'LH')
self.assertEqual(map_quads(3), 'LL')
self.assertEqual(map_quads(4), 'HL')
self.assertEqual(map_quads(33), None)
self.assertEqual(map_quads('andy'), None)
def test_quad_position(self):
"""Test lisa_sig_vals"""
from crankshaft.clustering import quad_position
quads = np.array([1, 2, 3, 4], np.int)
ans = np.array(['HH', 'LH', 'LL', 'HL'])
test_ans = quad_position(quads)
self.assertTrue((test_ans == ans).all())
def test_local_stat(self):
"""Test Moran's I local"""
data = [OrderedDict([('id', d['id']),
('attr1', d['value']),
('neighbors', d['neighbors'])])
for d in self.neighbors_data]
moran = Moran(FakeDataProvider(data))
random_seeds.set_random_seeds(1234)
result = moran.local_stat('subquery', 'value',
'knn', 5, 99, 'the_geom', 'cartodb_id')
result = [(row[0], row[1]) for row in result]
zipped_values = zip(result, self.moran_data)
for ([res_val, res_quad], [exp_val, exp_quad]) in zipped_values:
self.assertAlmostEqual(res_val, exp_val)
self.assertEqual(res_quad, exp_quad)
def test_moran_local_rate(self):
"""Test Moran's I rate"""
data = [{'id': d['id'],
'attr1': d['value'],
'attr2': 1,
'neighbors': d['neighbors']} for d in self.neighbors_data]
random_seeds.set_random_seeds(1234)
moran = Moran(FakeDataProvider(data))
result = moran.local_rate_stat('subquery', 'numerator', 'denominator',
'knn', 5, 99, 'the_geom', 'cartodb_id')
result = [(row[0], row[1]) for row in result]
zipped_values = zip(result, self.moran_data)
for ([res_val, res_quad], [exp_val, exp_quad]) in zipped_values:
self.assertAlmostEqual(res_val, exp_val)
def test_moran(self):
"""Test Moran's I global"""
data = [{'id': d['id'],
'attr1': d['value'],
'neighbors': d['neighbors']} for d in self.neighbors_data]
random_seeds.set_random_seeds(1235)
moran = Moran(FakeDataProvider(data))
result = moran.global_stat('table', 'value',
'knn', 5, 99, 'the_geom',
'cartodb_id')
result_moran = result[0][0]
expected_moran = np.array([row[0] for row in self.moran_data]).mean()
self.assertAlmostEqual(expected_moran, result_moran, delta=10e-2)

View File

@@ -0,0 +1,160 @@
import unittest
import crankshaft.pysal_utils as pu
from crankshaft import random_seeds
from collections import OrderedDict
class PysalUtilsTest(unittest.TestCase):
"""Testing class for utility functions related to PySAL integrations"""
def setUp(self):
self.params1 = OrderedDict([("id_col", "cartodb_id"),
("attr1", "andy"),
("attr2", "jay_z"),
("subquery", "SELECT * FROM a_list"),
("geom_col", "the_geom"),
("num_ngbrs", 321)])
self.params2 = OrderedDict([("id_col", "cartodb_id"),
("numerator", "price"),
("denominator", "sq_meters"),
("subquery", "SELECT * FROM pecan"),
("geom_col", "the_geom"),
("num_ngbrs", 321)])
self.params3 = OrderedDict([("id_col", "cartodb_id"),
("numerator", "sq_meters"),
("denominator", "price"),
("subquery", "SELECT * FROM pecan"),
("geom_col", "the_geom"),
("num_ngbrs", 321)])
self.params_array = {"id_col": "cartodb_id",
"time_cols": ["_2013_dec", "_2014_jan", "_2014_feb"],
"subquery": "SELECT * FROM a_list",
"geom_col": "the_geom",
"num_ngbrs": 321}
def test_query_attr_select(self):
"""Test query_attr_select"""
ans1 = ("i.\"andy\"::numeric As attr1, "
"i.\"jay_z\"::numeric As attr2, ")
ans2 = ("i.\"price\"::numeric As attr1, "
"i.\"sq_meters\"::numeric As attr2, ")
ans3 = ("i.\"sq_meters\"::numeric As attr1, "
"i.\"price\"::numeric As attr2, ")
ans_array = ("i.\"_2013_dec\"::numeric As attr1, "
"i.\"_2014_jan\"::numeric As attr2, "
"i.\"_2014_feb\"::numeric As attr3, ")
self.assertEqual(pu.query_attr_select(self.params1), ans1)
self.assertEqual(pu.query_attr_select(self.params2), ans2)
self.assertEqual(pu.query_attr_select(self.params3), ans3)
self.assertEqual(pu.query_attr_select(self.params_array), ans_array)
def test_query_attr_where(self):
"""Test pu.query_attr_where"""
ans1 = ("idx_replace.\"andy\" IS NOT NULL AND "
"idx_replace.\"jay_z\" IS NOT NULL")
ans_array = ("idx_replace.\"_2013_dec\" IS NOT NULL AND "
"idx_replace.\"_2014_jan\" IS NOT NULL AND "
"idx_replace.\"_2014_feb\" IS NOT NULL")
self.assertEqual(pu.query_attr_where(self.params1), ans1)
self.assertEqual(pu.query_attr_where(self.params_array), ans_array)
def test_knn(self):
"""Test knn neighbors constructor"""
ans1 = "SELECT i.\"cartodb_id\" As id, " \
"i.\"andy\"::numeric As attr1, " \
"i.\"jay_z\"::numeric As attr2, " \
"(SELECT ARRAY(SELECT j.\"cartodb_id\" " \
"FROM (SELECT * FROM a_list) As j " \
"WHERE " \
"i.\"cartodb_id\" <> j.\"cartodb_id\" AND " \
"j.\"andy\" IS NOT NULL AND " \
"j.\"jay_z\" IS NOT NULL " \
"ORDER BY " \
"j.\"the_geom\" <-> i.\"the_geom\" ASC " \
"LIMIT 321)) As neighbors " \
"FROM (SELECT * FROM a_list) As i " \
"WHERE i.\"andy\" IS NOT NULL AND " \
"i.\"jay_z\" IS NOT NULL " \
"ORDER BY i.\"cartodb_id\" ASC;"
ans_array = "SELECT i.\"cartodb_id\" As id, " \
"i.\"_2013_dec\"::numeric As attr1, " \
"i.\"_2014_jan\"::numeric As attr2, " \
"i.\"_2014_feb\"::numeric As attr3, " \
"(SELECT ARRAY(SELECT j.\"cartodb_id\" " \
"FROM (SELECT * FROM a_list) As j " \
"WHERE i.\"cartodb_id\" <> j.\"cartodb_id\" AND " \
"j.\"_2013_dec\" IS NOT NULL AND " \
"j.\"_2014_jan\" IS NOT NULL AND " \
"j.\"_2014_feb\" IS NOT NULL " \
"ORDER BY j.\"the_geom\" <-> i.\"the_geom\" ASC " \
"LIMIT 321)) As neighbors " \
"FROM (SELECT * FROM a_list) As i " \
"WHERE i.\"_2013_dec\" IS NOT NULL AND " \
"i.\"_2014_jan\" IS NOT NULL AND " \
"i.\"_2014_feb\" IS NOT NULL "\
"ORDER BY i.\"cartodb_id\" ASC;"
self.assertEqual(pu.knn(self.params1), ans1)
self.assertEqual(pu.knn(self.params_array), ans_array)
def test_queen(self):
"""Test queen neighbors constructor"""
ans1 = "SELECT i.\"cartodb_id\" As id, " \
"i.\"andy\"::numeric As attr1, " \
"i.\"jay_z\"::numeric As attr2, " \
"(SELECT ARRAY(SELECT j.\"cartodb_id\" " \
"FROM (SELECT * FROM a_list) As j " \
"WHERE " \
"i.\"cartodb_id\" <> j.\"cartodb_id\" AND " \
"ST_Touches(i.\"the_geom\", " \
"j.\"the_geom\") AND " \
"j.\"andy\" IS NOT NULL AND " \
"j.\"jay_z\" IS NOT NULL)" \
") As neighbors " \
"FROM (SELECT * FROM a_list) As i " \
"WHERE i.\"andy\" IS NOT NULL AND " \
"i.\"jay_z\" IS NOT NULL " \
"ORDER BY i.\"cartodb_id\" ASC;"
self.assertEqual(pu.queen(self.params1), ans1)
def test_construct_neighbor_query(self):
"""Test construct_neighbor_query"""
# Compare to raw knn query
self.assertEqual(pu.construct_neighbor_query('knn', self.params1),
pu.knn(self.params1))
def test_get_attributes(self):
"""Test get_attributes"""
## need to add tests
self.assertEqual(True, True)
def test_get_weight(self):
"""Test get_weight"""
self.assertEqual(True, True)
def test_empty_zipped_array(self):
"""Test empty_zipped_array"""
ans2 = [(None, None)]
ans4 = [(None, None, None, None)]
self.assertEqual(pu.empty_zipped_array(2), ans2)
self.assertEqual(pu.empty_zipped_array(4), ans4)

View File

@@ -0,0 +1,64 @@
import unittest
import numpy as np
from helper import plpy, fixture_file
import crankshaft.segmentation as segmentation
import json
class SegmentationTest(unittest.TestCase):
"""Testing class for Moran's I functions"""
def setUp(self):
plpy._reset()
def generate_random_data(self, n_samples, random_state, row_type=False):
x1 = random_state.uniform(size=n_samples)
x2 = random_state.uniform(size=n_samples)
x3 = random_state.randint(0, 4, size=n_samples)
y = x1 + x2 * x2 + x3
cartodb_id = range(len(x1))
if row_type:
return [{'features': vals} for vals in zip(x1, x2, x3)], y
else:
return [dict(zip(['x1', 'x2', 'x3', 'target', 'cartodb_id'],
[x1, x2, x3, y, cartodb_id]))]
def test_replace_nan_with_mean(self):
test_array = np.array([[1.2, np.nan], [3.2, 2.0]])
result = segmentation.replace_nan_with_mean(test_array)
self.assertTrue(np.array_equal(result, np.array([[1.2, 2.0], [3.2, 2.0]])))
def test_create_and_predict_segment(self):
n_samples = 1000
random_state_train = np.random.RandomState(13)
random_state_test = np.random.RandomState(134)
training_data = self.generate_random_data(n_samples, random_state_train)
test_data, test_y = self.generate_random_data(n_samples, random_state_test, row_type=True)
ids = [{'cartodb_ids': range(len(test_data))}]
rows = [{'x1': 0, 'x2': 0, 'x3': 0, 'y': 0, 'cartodb_id': 0}]
plpy._define_result('select \* from \(select \* from training\) a limit 1', rows)
plpy._define_result('.*from \(select \* from training\) as a', training_data)
plpy._define_result('select array_agg\(cartodb\_id order by cartodb\_id\) as cartodb_ids from \(.*\) a', ids)
plpy._define_result('.*select \* from test.*', test_data)
model_parameters = {'n_estimators': 1200,
'max_depth': 3,
'subsample': 0.5,
'learning_rate': 0.01,
'min_samples_leaf': 1}
result = segmentation.create_and_predict_segment(
'select * from training',
'target',
'select * from test',
model_parameters)
prediction = [r[1] for r in result]
accuracy = np.sqrt(np.mean(np.square(np.array(prediction) - np.array(test_y))))
self.assertEqual(len(result), len(test_data))
self.assertTrue(result[0][2] < 0.01)
self.assertTrue(accuracy < 0.5 * np.mean(test_y))

View File

@@ -0,0 +1,349 @@
import unittest
import numpy as np
from helper import fixture_file
from crankshaft.space_time_dynamics import Markov
import crankshaft.space_time_dynamics as std
from crankshaft import random_seeds
from crankshaft.analysis_data_provider import AnalysisDataProvider
import json
class FakeDataProvider(AnalysisDataProvider):
def __init__(self, data):
self.mock_result = data
def get_markov(self, w_type, params):
return self.mock_result
class SpaceTimeTests(unittest.TestCase):
"""Testing class for Markov Functions."""
def setUp(self):
self.params = {"id_col": "cartodb_id",
"time_cols": ['dec_2013', 'jan_2014', 'feb_2014'],
"subquery": "SELECT * FROM a_list",
"geom_col": "the_geom",
"num_ngbrs": 321}
self.neighbors_data = json.loads(
open(fixture_file('neighbors_markov.json')).read())
self.markov_data = json.loads(open(fixture_file('markov.json')).read())
self.time_data = np.array([i * np.ones(10, dtype=float)
for i in range(10)]).T
self.transition_matrix = np.array([
[[0.96341463, 0.0304878, 0.00609756, 0., 0.],
[0.06040268, 0.83221477, 0.10738255, 0., 0.],
[0., 0.14, 0.74, 0.12, 0.],
[0., 0.03571429, 0.32142857, 0.57142857, 0.07142857],
[0., 0., 0., 0.16666667, 0.83333333]],
[[0.79831933, 0.16806723, 0.03361345, 0., 0.],
[0.0754717, 0.88207547, 0.04245283, 0., 0.],
[0.00537634, 0.06989247, 0.8655914, 0.05913978, 0.],
[0., 0., 0.06372549, 0.90196078, 0.03431373],
[0., 0., 0., 0.19444444, 0.80555556]],
[[0.84693878, 0.15306122, 0., 0., 0.],
[0.08133971, 0.78947368, 0.1291866, 0., 0.],
[0.00518135, 0.0984456, 0.79274611, 0.0984456, 0.00518135],
[0., 0., 0.09411765, 0.87058824, 0.03529412],
[0., 0., 0., 0.10204082, 0.89795918]],
[[0.8852459, 0.09836066, 0., 0.01639344, 0.],
[0.03875969, 0.81395349, 0.13953488, 0., 0.00775194],
[0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505],
[0., 0.02339181, 0.12865497, 0.75438596, 0.09356725],
[0., 0., 0., 0.09661836, 0.90338164]],
[[0.33333333, 0.66666667, 0., 0., 0.],
[0.0483871, 0.77419355, 0.16129032, 0.01612903, 0.],
[0.01149425, 0.16091954, 0.74712644, 0.08045977, 0.],
[0., 0.01036269, 0.06217617, 0.89637306, 0.03108808],
[0., 0., 0., 0.02352941, 0.97647059]]]
)
def test_spatial_markov(self):
"""Test Spatial Markov."""
data = [{'id': d['id'],
'attr1': d['y1995'],
'attr2': d['y1996'],
'attr3': d['y1997'],
'attr4': d['y1998'],
'attr5': d['y1999'],
'attr6': d['y2000'],
'attr7': d['y2001'],
'attr8': d['y2002'],
'attr9': d['y2003'],
'attr10': d['y2004'],
'attr11': d['y2005'],
'attr12': d['y2006'],
'attr13': d['y2007'],
'attr14': d['y2008'],
'attr15': d['y2009'],
'neighbors': d['neighbors']} for d in self.neighbors_data]
# print(str(data[0]))
markov = Markov(FakeDataProvider(data))
random_seeds.set_random_seeds(1234)
result = markov.spatial_trend('subquery',
['y1995', 'y1996', 'y1997', 'y1998',
'y1999', 'y2000', 'y2001', 'y2002',
'y2003', 'y2004', 'y2005', 'y2006',
'y2007', 'y2008', 'y2009'],
5, 'knn', 5, 0, 'the_geom',
'cartodb_id')
self.assertTrue(result is not None)
result = [(row[0], row[1], row[2], row[3], row[4]) for row in result]
# print result[0]
expected = self.markov_data
for ([res_trend, res_up, res_down, res_vol, res_id],
[exp_trend, exp_up, exp_down, exp_vol, exp_id]
) in zip(result, expected):
self.assertAlmostEqual(res_trend, exp_trend)
def test_get_time_data(self):
"""Test get_time_data"""
data = [{'attr1': d['y1995'],
'attr2': d['y1996'],
'attr3': d['y1997'],
'attr4': d['y1998'],
'attr5': d['y1999'],
'attr6': d['y2000'],
'attr7': d['y2001'],
'attr8': d['y2002'],
'attr9': d['y2003'],
'attr10': d['y2004'],
'attr11': d['y2005'],
'attr12': d['y2006'],
'attr13': d['y2007'],
'attr14': d['y2008'],
'attr15': d['y2009']} for d in self.neighbors_data]
result = std.get_time_data(data, ['y1995', 'y1996', 'y1997', 'y1998',
'y1999', 'y2000', 'y2001', 'y2002',
'y2003', 'y2004', 'y2005', 'y2006',
'y2007', 'y2008', 'y2009'])
# expected was prepared from PySAL example:
# f = ps.open(ps.examples.get_path("usjoin.csv"))
# pci = np.array([f.by_col[str(y)]
# for y in range(1995, 2010)]).transpose()
# rpci = pci / (pci.mean(axis = 0))
expected = np.array(
[[0.87654416, 0.863147, 0.85637567, 0.84811668, 0.8446154,
0.83271652, 0.83786314, 0.85012593, 0.85509656, 0.86416612,
0.87119375, 0.86302631, 0.86148267, 0.86252252, 0.86746356],
[0.9188951, 0.91757931, 0.92333258, 0.92517289, 0.92552388,
0.90746978, 0.89830489, 0.89431991, 0.88924794, 0.89815176,
0.91832091, 0.91706054, 0.90139505, 0.87897455, 0.86216858],
[0.82591007, 0.82548596, 0.81989793, 0.81503235, 0.81731522,
0.78964559, 0.80584442, 0.8084998, 0.82258551, 0.82668196,
0.82373724, 0.81814804, 0.83675961, 0.83574199, 0.84647177],
[1.09088176, 1.08537689, 1.08456418, 1.08415404, 1.09898841,
1.14506948, 1.12151133, 1.11160697, 1.10888621, 1.11399806,
1.12168029, 1.13164797, 1.12958508, 1.11371818, 1.09936775],
[1.10731446, 1.11373944, 1.13283638, 1.14472559, 1.15910025,
1.16898201, 1.17212488, 1.14752303, 1.11843284, 1.11024964,
1.11943471, 1.11736468, 1.10863242, 1.09642516, 1.07762337],
[1.42269757, 1.42118434, 1.44273502, 1.43577571, 1.44400684,
1.44184737, 1.44782832, 1.41978227, 1.39092208, 1.4059372,
1.40788646, 1.44052766, 1.45241216, 1.43306098, 1.4174431],
[1.13073885, 1.13110513, 1.11074708, 1.13364636, 1.13088149,
1.10888138, 1.11856629, 1.13062931, 1.11944984, 1.12446239,
1.11671008, 1.10880034, 1.08401709, 1.06959206, 1.07875225],
[1.04706124, 1.04516831, 1.04253372, 1.03239987, 1.02072545,
0.99854316, 0.9880258, 0.99669587, 0.99327676, 1.01400905,
1.03176742, 1.040511, 1.01749645, 0.9936394, 0.98279746],
[0.98996986, 1.00143564, 0.99491, 1.00188408, 1.00455845,
0.99127006, 0.97925917, 0.9683482, 0.95335147, 0.93694787,
0.94308213, 0.92232874, 0.91284091, 0.89689833, 0.88928858],
[0.87418391, 0.86416601, 0.84425695, 0.8404494, 0.83903044,
0.8578708, 0.86036185, 0.86107306, 0.8500772, 0.86981998,
0.86837929, 0.87204141, 0.86633032, 0.84946077, 0.83287146],
[1.14196118, 1.14660262, 1.14892712, 1.14909594, 1.14436624,
1.14450183, 1.12349752, 1.12596664, 1.12213996, 1.1119989,
1.10257792, 1.10491258, 1.11059842, 1.10509795, 1.10020097],
[0.97282463, 0.96700147, 0.96252588, 0.9653878, 0.96057687,
0.95831051, 0.94480909, 0.94804195, 0.95430286, 0.94103989,
0.92122519, 0.91010201, 0.89280392, 0.89298243, 0.89165385],
[0.94325468, 0.96436902, 0.96455242, 0.95243009, 0.94117647,
0.9480927, 0.93539182, 0.95388718, 0.94597005, 0.96918424,
0.94781281, 0.93466815, 0.94281559, 0.96520315, 0.96715441],
[0.97478408, 0.98169225, 0.98712809, 0.98474769, 0.98559897,
0.98687073, 0.99237486, 0.98209969, 0.9877653, 0.97399471,
0.96910087, 0.98416665, 0.98423613, 0.99823861, 0.99545704],
[0.85570269, 0.85575915, 0.85986132, 0.85693406, 0.8538012,
0.86191535, 0.84981451, 0.85472102, 0.84564835, 0.83998883,
0.83478547, 0.82803648, 0.8198736, 0.82265395, 0.8399404],
[0.87022047, 0.85996258, 0.85961813, 0.85689572, 0.83947136,
0.82785597, 0.86008789, 0.86776298, 0.86720209, 0.8676334,
0.89179317, 0.94202108, 0.9422231, 0.93902708, 0.94479184],
[0.90134907, 0.90407738, 0.90403991, 0.90201769, 0.90399238,
0.90906632, 0.92693339, 0.93695966, 0.94242697, 0.94338265,
0.91981796, 0.91108804, 0.90543476, 0.91737138, 0.94793657],
[1.1977611, 1.18222564, 1.18439158, 1.18267865, 1.19286723,
1.20172869, 1.21328691, 1.22624778, 1.22397075, 1.23857042,
1.24419893, 1.23929384, 1.23418676, 1.23626739, 1.26754398],
[1.24919678, 1.25754773, 1.26991161, 1.28020651, 1.30625667,
1.34790023, 1.34399863, 1.32575181, 1.30795492, 1.30544841,
1.30303302, 1.32107766, 1.32936244, 1.33001241, 1.33288462],
[1.06768004, 1.03799276, 1.03637303, 1.02768449, 1.03296093,
1.05059016, 1.03405057, 1.02747623, 1.03162734, 0.9961416,
0.97356208, 0.94241549, 0.92754547, 0.92549227, 0.92138102],
[1.09475614, 1.11526796, 1.11654299, 1.13103948, 1.13143264,
1.13889622, 1.12442212, 1.13367018, 1.13982256, 1.14029944,
1.11979401, 1.10905389, 1.10577769, 1.11166825, 1.09985155],
[0.76530058, 0.76612841, 0.76542451, 0.76722683, 0.76014284,
0.74480073, 0.76098396, 0.76156903, 0.76651952, 0.76533288,
0.78205934, 0.76842416, 0.77487118, 0.77768683, 0.78801192],
[0.98391336, 0.98075816, 0.98295341, 0.97386015, 0.96913803,
0.97370819, 0.96419154, 0.97209861, 0.97441313, 0.96356162,
0.94745352, 0.93965462, 0.93069645, 0.94020973, 0.94358232],
[0.83561828, 0.82298088, 0.81738502, 0.81748588, 0.80904801,
0.80071489, 0.83358256, 0.83451613, 0.85175032, 0.85954307,
0.86790024, 0.87170334, 0.87863799, 0.87497981, 0.87888675],
[0.98845573, 1.02092428, 0.99665283, 0.99141823, 0.99386619,
0.98733195, 0.99644997, 0.99669587, 1.02559097, 1.01116651,
0.99988024, 0.97906749, 0.99323123, 1.00204939, 0.99602148],
[1.14930913, 1.15241949, 1.14300962, 1.14265542, 1.13984683,
1.08312397, 1.05192626, 1.04230892, 1.05577278, 1.08569751,
1.12443486, 1.08891079, 1.08603695, 1.05997314, 1.02160943],
[1.11368269, 1.1057147, 1.11893431, 1.13778669, 1.1432272,
1.18257029, 1.16226243, 1.16009196, 1.14467789, 1.14820235,
1.12386598, 1.12680236, 1.12357937, 1.1159258, 1.12570828],
[1.30379431, 1.30752186, 1.31206366, 1.31532267, 1.30625667,
1.31210239, 1.29989156, 1.29203193, 1.27183516, 1.26830786,
1.2617743, 1.28656675, 1.29734097, 1.29390205, 1.29345446],
[0.83953719, 0.82701448, 0.82006005, 0.81188876, 0.80294864,
0.78772975, 0.82848011, 0.8259679, 0.82435705, 0.83108634,
0.84373784, 0.83891093, 0.84349247, 0.85637272, 0.86539395],
[1.23450087, 1.2426022, 1.23537935, 1.23581293, 1.24522626,
1.2256767, 1.21126648, 1.19377804, 1.18355337, 1.19674434,
1.21536573, 1.23653297, 1.27962009, 1.27968392, 1.25907738],
[0.9769662, 0.97400719, 0.98035944, 0.97581531, 0.95543282,
0.96480308, 0.94686376, 0.93679073, 0.92540049, 0.92988835,
0.93442917, 0.92100464, 0.91475304, 0.90249622, 0.9021363],
[0.84986886, 0.8986851, 0.84295997, 0.87280534, 0.85659368,
0.88937573, 0.894401, 0.90448993, 0.95495898, 0.92698333,
0.94745352, 0.92562488, 0.96635366, 1.02520312, 1.0394296],
[1.01922808, 1.00258203, 1.00974428, 1.00303417, 0.99765073,
1.00759019, 0.99192968, 0.99747298, 0.99550759, 0.97583768,
0.9610168, 0.94779638, 0.93759089, 0.93353431, 0.94121705],
[0.86367411, 0.85558932, 0.85544346, 0.85103025, 0.84336613,
0.83434854, 0.85813595, 0.84667961, 0.84374558, 0.85951183,
0.87194227, 0.89455097, 0.88283929, 0.90349491, 0.90600675],
[1.00947534, 1.00411055, 1.00698819, 0.99513687, 0.99291086,
1.00581626, 0.98850522, 0.99291168, 0.98983209, 0.97511924,
0.96134615, 0.96382634, 0.95011401, 0.9434686, 0.94637765],
[1.05712571, 1.05459419, 1.05753012, 1.04880786, 1.05103857,
1.04800023, 1.03024941, 1.04200483, 1.0402554, 1.03296979,
1.02191682, 1.02476275, 1.02347523, 1.02517684, 1.04359571],
[1.07084189, 1.06669497, 1.07937623, 1.07387988, 1.0794043,
1.0531801, 1.07452771, 1.09383478, 1.1052447, 1.10322136,
1.09167939, 1.08772756, 1.08859544, 1.09177338, 1.1096083],
[0.86719222, 0.86628896, 0.86675156, 0.86425632, 0.86511809,
0.86287327, 0.85169796, 0.85411285, 0.84886336, 0.84517414,
0.84843858, 0.84488343, 0.83374329, 0.82812044, 0.82878599],
[0.88389211, 0.92288667, 0.90282398, 0.91229186, 0.92023286,
0.92652175, 0.94278865, 0.93682452, 0.98655146, 0.992237,
0.9798497, 0.93869677, 0.96947771, 1.00362626, 0.98102351],
[0.97082064, 0.95320233, 0.94534081, 0.94215593, 0.93967,
0.93092109, 0.92662519, 0.93412152, 0.93501274, 0.92879506,
0.92110542, 0.91035556, 0.90430364, 0.89994694, 0.90073864],
[0.95861858, 0.95774543, 0.98254811, 0.98919472, 0.98684824,
0.98882205, 0.97662234, 0.95601578, 0.94905385, 0.94934888,
0.97152609, 0.97163004, 0.9700702, 0.97158948, 0.95884908],
[0.83980439, 0.84726737, 0.85747, 0.85467221, 0.8556751,
0.84818516, 0.85265681, 0.84502402, 0.82645665, 0.81743586,
0.83550406, 0.83338919, 0.83511679, 0.82136617, 0.80921874],
[0.95118156, 0.9466212, 0.94688098, 0.9508583, 0.9512441,
0.95440787, 0.96364363, 0.96804412, 0.97136214, 0.97583768,
0.95571724, 0.96895368, 0.97001634, 0.97082733, 0.98782366],
[1.08910044, 1.08248968, 1.08492895, 1.08656923, 1.09454249,
1.10558188, 1.1214086, 1.12292577, 1.13021031, 1.13342735,
1.14686068, 1.14502975, 1.14474747, 1.14084037, 1.16142926],
[1.06336033, 1.07365823, 1.08691496, 1.09764846, 1.11669863,
1.11856702, 1.09764283, 1.08815849, 1.08044313, 1.09278827,
1.07003204, 1.08398066, 1.09831768, 1.09298232, 1.09176125],
[0.79772065, 0.78829196, 0.78581151, 0.77615922, 0.77035744,
0.77751194, 0.79902974, 0.81437881, 0.80788828, 0.79603865,
0.78966436, 0.79949807, 0.80172182, 0.82168155, 0.85587911],
[1.0052447, 1.00007696, 1.00475899, 1.00613942, 1.00639561,
1.00162979, 0.99860739, 1.00814981, 1.00574316, 0.99030032,
0.97682565, 0.97292596, 0.96519561, 0.96173403, 0.95890284],
[0.95808419, 0.9382568, 0.9654441, 0.95561201, 0.96987289,
0.96608031, 0.99727185, 1.00781194, 1.03484236, 1.05333619,
1.0983263, 1.1704974, 1.17025154, 1.18730553, 1.14242645]])
self.assertTrue(np.allclose(result, expected))
self.assertTrue(type(result) == type(expected))
self.assertTrue(result.shape == expected.shape)
def test_rebin_data(self):
"""Test rebin_data"""
# sample in double the time (even case since 10 % 2 = 0):
# (0+1)/2, (2+3)/2, (4+5)/2, (6+7)/2, (8+9)/2
# = 0.5, 2.5, 4.5, 6.5, 8.5
ans_even = np.array([(i + 0.5) * np.ones(10, dtype=float)
for i in range(0, 10, 2)]).T
self.assertTrue(
np.array_equal(std.rebin_data(self.time_data, 2), ans_even))
# sample in triple the time (uneven since 10 % 3 = 1):
# (0+1+2)/3, (3+4+5)/3, (6+7+8)/3, (9)/1
# = 1, 4, 7, 9
ans_odd = np.array([i * np.ones(10, dtype=float)
for i in (1, 4, 7, 9)]).T
self.assertTrue(
np.array_equal(std.rebin_data(self.time_data, 3), ans_odd))
def test_get_prob_dist(self):
"""Test get_prob_dist"""
lag_indices = np.array([1, 2, 3, 4])
unit_indices = np.array([1, 3, 2, 4])
answer = np.array([
[0.0754717, 0.88207547, 0.04245283, 0., 0.],
[0., 0., 0.09411765, 0.87058824, 0.03529412],
[0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505],
[0., 0., 0., 0.02352941, 0.97647059]
])
result = std.get_prob_dist(self.transition_matrix,
lag_indices, unit_indices)
self.assertTrue(np.array_equal(result, answer))
def test_get_prob_stats(self):
"""Test get_prob_stats"""
probs = np.array([
[0.0754717, 0.88207547, 0.04245283, 0., 0.],
[0., 0., 0.09411765, 0.87058824, 0.03529412],
[0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505],
[0., 0., 0., 0.02352941, 0.97647059]
])
unit_indices = np.array([1, 3, 2, 4])
answer_up = np.array([0.04245283, 0.03529412, 0.12376238, 0.])
answer_down = np.array([0.0754717, 0.09411765, 0.0990099, 0.02352941])
answer_trend = np.array([-0.03301887 / 0.88207547,
-0.05882353 / 0.87058824,
0.02475248 / 0.77722772,
-0.02352941 / 0.97647059])
answer_volatility = np.array([0.34221495, 0.33705421,
0.29226542, 0.38834223])
result = std.get_prob_stats(probs, unit_indices)
result_up = result[0]
result_down = result[1]
result_trend = result[2]
result_volatility = result[3]
self.assertTrue(np.allclose(result_up, answer_up))
self.assertTrue(np.allclose(result_down, answer_down))
self.assertTrue(np.allclose(result_trend, answer_trend))
self.assertTrue(np.allclose(result_volatility, answer_volatility))

View File

@@ -0,0 +1,6 @@
"""Import all modules"""
import crankshaft.random_seeds
import crankshaft.clustering
import crankshaft.space_time_dynamics
import crankshaft.segmentation
import crankshaft.analysis_data_provider

View File

@@ -0,0 +1,67 @@
"""class for fetching data"""
import plpy
import pysal_utils as pu
class AnalysisDataProvider:
def get_getis(self, w_type, params):
"""fetch data for getis ord's g"""
try:
query = pu.construct_neighbor_query(w_type, params)
result = plpy.execute(query)
# if there are no neighbors, exit
if len(result) == 0:
return pu.empty_zipped_array(4)
else:
return result
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
def get_markov(self, w_type, params):
"""fetch data for spatial markov"""
try:
query = pu.construct_neighbor_query(w_type, params)
data = plpy.execute(query)
if len(data) == 0:
return pu.empty_zipped_array(4)
return data
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
def get_moran(self, w_type, params):
"""fetch data for moran's i analyses"""
try:
query = pu.construct_neighbor_query(w_type, params)
data = plpy.execute(query)
# if there are no neighbors, exit
if len(data) == 0:
return pu.empty_zipped_array(2)
return data
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
return pu.empty_zipped_array(2)
def get_nonspatial_kmeans(self, query):
"""fetch data for non-spatial kmeans"""
try:
data = plpy.execute(query)
return data
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
def get_spatial_kmeans(self, params):
"""fetch data for spatial kmeans"""
query = ("SELECT "
"array_agg({id_col} ORDER BY {id_col}) as ids,"
"array_agg(ST_X({geom_col}) ORDER BY {id_col}) As xs,"
"array_agg(ST_Y({geom_col}) ORDER BY {id_col}) As ys "
"FROM ({subquery}) As a "
"WHERE {geom_col} IS NOT NULL").format(**params)
try:
data = plpy.execute(query)
return data
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)

View File

@@ -0,0 +1,4 @@
"""Import all functions from for clustering"""
from moran import *
from kmeans import *
from getis import *

View File

@@ -0,0 +1,50 @@
"""
Getis-Ord's G geostatistics (hotspot/coldspot analysis)
"""
import pysal as ps
from collections import OrderedDict
# crankshaft modules
import crankshaft.pysal_utils as pu
from crankshaft.analysis_data_provider import AnalysisDataProvider
# High level interface ---------------------------------------
class Getis:
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()
else:
self.data_provider = data_provider
def getis_ord(self, subquery, attr,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Getis-Ord's G*
Implementation building neighbors with a PostGIS database and PySAL's
Getis-Ord's G* hotspot/coldspot module.
Andy Eschbacher
"""
# geometries with null-valued attributes are ignored, which can produce
# more distant neighbors than expected if kNN weights are chosen
qvals = OrderedDict([("id_col", id_col),
("attr1", attr),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_getis(w_type, qvals)
attr_vals = pu.get_attributes(result)
# build PySAL weight object
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate Getis-Ord's G* z- and p-values
getis = ps.esda.getisord.G_Local(attr_vals, weight,
star=True, permutations=permutations)
return zip(getis.z_sim, getis.p_sim, getis.p_z_sim, weight.id_order)
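# Usage sketch (hedged; mirrors the unit tests, where the data provider is
# mocked instead of querying PostGIS):
#   getis = Getis(FakeDataProvider(rows))
#   results = getis.getis_ord('SELECT * FROM tbl', 'attr',
#                             'knn', 5, 999, 'the_geom', 'cartodb_id')
#   # each tuple: (z_sim, p_sim, p_z_sim, id)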

View File

@@ -0,0 +1,32 @@
from sklearn.cluster import KMeans
import numpy as np
from crankshaft.analysis_data_provider import AnalysisDataProvider
class Kmeans:
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()
else:
self.data_provider = data_provider
def spatial(self, query, no_clusters, no_init=20):
"""
find centers based on clusters of latitude/longitude pairs
query: SQL query that has a WGS84 geometry (the_geom)
"""
params = {"subquery": query,
"geom_col": "the_geom",
"id_col": "cartodb_id"}
data = self.data_provider.get_spatial_kmeans(params)
# Unpack query response
xs = data[0]['xs']
ys = data[0]['ys']
ids = data[0]['ids']
km = KMeans(n_clusters=no_clusters, n_init=no_init)
labels = km.fit_predict(zip(xs, ys))
return zip(ids, labels)
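# Usage sketch (hedged; the tests feed a mocked provider whose single row
# carries 'xs', 'ys', and 'ids' arrays):
#   km = Kmeans(FakeDataProvider(data))
#   km.spatial('SELECT * FROM tbl', 2)  # -> [(id, cluster_label), ...]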

View File

@@ -0,0 +1,208 @@
"""
Moran's I geostatistics (global clustering & outliers presence)
"""
# TODO: Fill in local neighbors which have null/NoneType values with the
# average of their neighborhood
import pysal as ps
from collections import OrderedDict
from crankshaft.analysis_data_provider import AnalysisDataProvider
# crankshaft module
import crankshaft.pysal_utils as pu
# High level interface ---------------------------------------
class Moran:
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()
else:
self.data_provider = data_provider
def global_stat(self, subquery, attr_name,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I (global)
Implementation building neighbors with a PostGIS database and Moran's I
core clusters with PySAL.
Andy Eschbacher
"""
params = OrderedDict([("id_col", id_col),
("attr1", attr_name),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
# collect attributes
attr_vals = pu.get_attributes(result)
# calculate weights
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate moran global
moran_global = ps.esda.moran.Moran(attr_vals, weight,
permutations=permutations)
return zip([moran_global.I], [moran_global.EI])
def local_stat(self, subquery, attr,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I implementation for PL/Python
Andy Eschbacher
"""
# geometries with null-valued attributes are ignored, which can produce
# more distant neighbors than expected
params = OrderedDict([("id_col", id_col),
("attr1", attr),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
attr_vals = pu.get_attributes(result)
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local(attr_vals, weight,
permutations=permutations)
# find quadrants for each geometry
quads = quad_position(lisa.q)
return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)
def global_rate_stat(self, subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I Rate (global)
Andy Eschbacher
"""
params = OrderedDict([("id_col", id_col),
("attr1", numerator),
("attr2", denominator)
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
# collect attributes
numer = pu.get_attributes(result, 1)
denom = pu.get_attributes(result, 2)
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate moran global rate
lisa_rate = ps.esda.moran.Moran_Rate(numer, denom, weight,
permutations=permutations)
return zip([lisa_rate.I], [lisa_rate.EI])
def local_rate_stat(self, subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I Local Rate
Andy Eschbacher
"""
# geometries with null values are ignored, which can produce
# more distant neighbors than expected
params = OrderedDict([("id_col", id_col),
("numerator", numerator),
("denominator", denominator),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
# collect attributes
numer = pu.get_attributes(result, 1)
denom = pu.get_attributes(result, 2)
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local_Rate(numer, denom, weight,
permutations=permutations)
# find quadrants for each geometry
quads = quad_position(lisa.q)
return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)
def local_bivariate_stat(self, subquery, attr1, attr2,
permutations, geom_col, id_col,
w_type, num_ngbrs):
"""
Moran's I (local) Bivariate (untested)
"""
params = OrderedDict([("id_col", id_col),
("attr1", attr1),
("attr2", attr2),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
# collect attributes
attr1_vals = pu.get_attributes(result, 1)
attr2_vals = pu.get_attributes(result, 2)
# create weights
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local_BV(attr1_vals, attr2_vals, weight,
permutations=permutations)
# find clustering of significance
lisa_sig = quad_position(lisa.q)
return zip(lisa.Is, lisa_sig, lisa.p_sim, weight.id_order)
# Low level functions ----------------------------------------
def map_quads(coord):
"""
Map a quadrant number to Moran's I designation
HH=1, LH=2, LL=3, HL=4
Input:
@param coord (int): quadrant of a specific measurement
Output:
classification (one of 'HH', 'LH', 'LL', or 'HL')
"""
if coord == 1:
return 'HH'
elif coord == 2:
return 'LH'
elif coord == 3:
return 'LL'
elif coord == 4:
return 'HL'
else:
return None
def quad_position(quads):
"""
Produce Moran's I classification based on quadrant position
Input:
@param quads ndarray: an array of quads classified by
1-4 (PySAL default)
Output:
@param list: an array of quads classified as 'HH', 'LL', etc.
"""
return [map_quads(q) for q in quads]
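# Example (taken from the unit tests):
#   quad_position(np.array([1, 2, 3, 4]))  # -> ['HH', 'LH', 'LL', 'HL']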

View File

@@ -0,0 +1,2 @@
"""Import all functions for pysal_utils"""
from crankshaft.pysal_utils.pysal_utils import *

View File

@@ -0,0 +1,211 @@
"""
Utilities module for generic PySAL functionality, mainly centered on
translating queries into numpy arrays or PySAL weights objects
"""
import numpy as np
import pysal as ps
def construct_neighbor_query(w_type, query_vals):
"""Return query (a string) used for finding neighbors
@param w_type text: type of neighbors to calculate ('knn' or 'queen')
@param query_vals dict: values used to construct the query
"""
if w_type.lower() == 'knn':
return knn(query_vals)
else:
return queen(query_vals)
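# Example (illustrative): the dispatch is on the weight-type string only:
#   construct_neighbor_query('knn', params)    # -> knn(params)
#   construct_neighbor_query('queen', params)  # -> queen(params)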
# Build weight object
def get_weight(query_res, w_type='knn', num_ngbrs=5):
"""
Construct PySAL weight from return value of query
@param query_res dict-like: query results with attributes and neighbors
"""
# if w_type.lower() == 'knn':
# row_normed_weights = [1.0 / float(num_ngbrs)] * num_ngbrs
# weights = {x['id']: row_normed_weights for x in query_res}
# else:
# weights = {x['id']: [1.0 / len(x['neighbors'])] * len(x['neighbors'])
# if len(x['neighbors']) > 0
# else [] for x in query_res}
neighbors = {x['id']: x['neighbors'] for x in query_res}
# print 'len of neighbors: %d' % len(neighbors)
built_weight = ps.W(neighbors)
built_weight.transform = 'r'
return built_weight
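# Sketch of the structure this consumes and produces (values illustrative):
#   query_res = [{'id': 1, 'neighbors': [2]}, {'id': 2, 'neighbors': [1]}]
#   w = get_weight(query_res)  # row-standardized PySAL W object
#   w.neighbors               # -> {1: [2], 2: [1]}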
def query_attr_select(params):
"""
Create portion of SELECT statement for attributes involved in query.
Defaults to order in the params
@param params: dict of information used in query (column names,
table name, etc.)
Example:
OrderedDict([('numerator', 'price'),
('denominator', 'sq_meters'),
('subquery', 'SELECT * FROM interesting_data')])
Output:
"i.\"price\"::numeric As attr1, " \
"i.\"sq_meters\"::numeric As attr2, "
"""
attr_string = ""
template = "i.\"%(col)s\"::numeric As attr%(alias_num)s, "
if 'time_cols' in params:
# if markov analysis
attrs = params['time_cols']
for idx, val in enumerate(attrs):
attr_string += template % {"col": val, "alias_num": idx + 1}
else:
# if moran's analysis
attrs = [k for k in params
if k not in ('id_col', 'geom_col', 'subquery',
'num_ngbrs')]
for idx, val in enumerate(attrs):
attr_string += template % {"col": params[val],
"alias_num": idx + 1}
return attr_string
def query_attr_where(params):
"""
Construct where conditions when building neighbors query
Create portion of WHERE clauses for weeding out NULL-valued geometries
Input: dict of params:
{'subquery': ...,
'numerator': 'data1',
'denominator': 'data2',
'': ...}
Output:
'idx_replace."data1" IS NOT NULL AND idx_replace."data2" IS NOT NULL'
Input:
{'subquery': ...,
'time_cols': ['time1', 'time2', 'time3'],
'etc': ...}
Output: 'idx_replace."time1" IS NOT NULL AND idx_replace."time2" IS NOT
NULL AND idx_replace."time3" IS NOT NULL'
"""
attr_string = []
template = "idx_replace.\"%s\" IS NOT NULL"
if 'time_cols' in params:
# markov where clauses
attrs = params['time_cols']
# add values to template
for attr in attrs:
attr_string.append(template % attr)
else:
# moran where clauses
# get keys
attrs = [k for k in params
if k not in ('id_col', 'geom_col', 'subquery',
'num_ngbrs')]
# add values to template
for attr in attrs:
attr_string.append(template % params[attr])
if 'denominator' in attrs:
attr_string.append(
"idx_replace.\"%s\" <> 0" % params['denominator'])
out = " AND ".join(attr_string)
return out
def knn(params):
"""SQL query for k-nearest neighbors.
@param params: dict of values to fill template
"""
attr_select = query_attr_select(params)
attr_where = query_attr_where(params)
replacements = {"attr_select": attr_select,
"attr_where_i": attr_where.replace("idx_replace", "i"),
"attr_where_j": attr_where.replace("idx_replace", "j")}
query = "SELECT " \
"i.\"{id_col}\" As id, " \
"%(attr_select)s" \
"(SELECT ARRAY(SELECT j.\"{id_col}\" " \
"FROM ({subquery}) As j " \
"WHERE " \
"i.\"{id_col}\" <> j.\"{id_col}\" AND " \
"%(attr_where_j)s " \
"ORDER BY " \
"j.\"{geom_col}\" <-> i.\"{geom_col}\" ASC " \
"LIMIT {num_ngbrs})" \
") As neighbors " \
"FROM ({subquery}) As i " \
"WHERE " \
"%(attr_where_i)s " \
"ORDER BY i.\"{id_col}\" ASC;" % replacements
return query.format(**params)
# SQL query for finding queens neighbors (all contiguous polygons)
def queen(params):
"""SQL query for queen neighbors.
@param params dict: information to fill query
"""
attr_select = query_attr_select(params)
attr_where = query_attr_where(params)
replacements = {"attr_select": attr_select,
"attr_where_i": attr_where.replace("idx_replace", "i"),
"attr_where_j": attr_where.replace("idx_replace", "j")}
query = "SELECT " \
"i.\"{id_col}\" As id, " \
"%(attr_select)s" \
"(SELECT ARRAY(SELECT j.\"{id_col}\" " \
"FROM ({subquery}) As j " \
"WHERE i.\"{id_col}\" <> j.\"{id_col}\" AND " \
"ST_Touches(i.\"{geom_col}\", j.\"{geom_col}\") AND " \
"%(attr_where_j)s)" \
") As neighbors " \
"FROM ({subquery}) As i " \
"WHERE " \
"%(attr_where_i)s " \
"ORDER BY i.\"{id_col}\" ASC;" % replacements
return query.format(**params)
# to add more weight methods open a ticket or pull request
def get_attributes(query_res, attr_num=1):
"""
@param query_res: query results with attributes and neighbors
@param attr_num: attribute number (1, 2, ...)
"""
return np.array([x['attr' + str(attr_num)] for x in query_res],
dtype=np.float)
def empty_zipped_array(num_nones):
"""
prepare return values for cases of empty weights objects (no neighbors)
Input:
@param num_nones int: number of columns (e.g., 4)
Output:
[(None, None, None, None)]
"""
return [tuple([None] * num_nones)]

View File

@@ -0,0 +1,11 @@
"""Random seed generator used for non-deterministic functions in crankshaft"""
import random
import numpy
def set_random_seeds(value):
"""
Set the seeds of the RNGs (Random Number Generators)
used internally.
"""
random.seed(value)
numpy.random.seed(value)

View File

@@ -0,0 +1 @@
from segmentation import *

View File

@@ -0,0 +1,176 @@
"""
Segmentation creation and prediction
"""
import sklearn
import numpy as np
import plpy
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics
from sklearn.cross_validation import train_test_split
# Lower level functions
#----------------------
def replace_nan_with_mean(array):
"""
Input:
@param array: an array of floats which may have null-valued entries
Output:
array with nans filled in with the mean of the dataset
"""
# returns an array of rows and column indices
indices = np.where(np.isnan(array))
# iterate through entries which have nan values
for row, col in zip(*indices):
array[row, col] = np.mean(array[~np.isnan(array[:, col]), col])
return array
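# Example (illustrative): NaNs are filled column-wise with the mean of the
# non-NaN entries in the same column:
#   replace_nan_with_mean(np.array([[1., np.nan], [3., 2.]]))
#   # -> array([[1., 2.], [3., 2.]])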
def get_data(variable, feature_columns, query):
"""
Fetch data from the database, clean, and package into
numpy arrays
Input:
@param variable: name of the target variable
@param feature_columns: list of column names
@param query: subquery that data is pulled from for the packaging
Output:
prepared data, packaged into NumPy arrays
"""
columns = ','.join(['array_agg("{col}") As "{col}"'.format(col=col) for col in feature_columns])
try:
data = plpy.execute('''SELECT array_agg("{variable}") As target, {columns} FROM ({query}) As a'''.format(
variable=variable,
columns=columns,
query=query))
except Exception, e:
plpy.error('Failed to access data to build segmentation model: %s' % e)
# extract target data from plpy object
target = np.array(data[0]['target'])
# put n feature data arrays into an n x m array of arrays
features = np.column_stack([np.array(data[0][col], dtype=float) for col in feature_columns])
return replace_nan_with_mean(target), replace_nan_with_mean(features)
# High level interface
# --------------------
def create_and_predict_segment_agg(target, features, target_features, target_ids, model_parameters):
"""
Version of create_and_predict_segment that works on arrays that come straight
from the SQL function call.
Input:
@param target: The 1D array of length NSamples containing the target variable we want the model to predict
@param features: The 2D array of size NSamples * NFeatures that forms the input to the model
@param target_features: The 2D array of features for the rows we want predictions for
@param target_ids: A 1D array of target_ids that will be used to associate the results of the prediction with the rows which they come from
@param model_parameters: A dictionary containing parameters for the model.
"""
clean_target = replace_nan_with_mean(target)
clean_features = replace_nan_with_mean(features)
target_features = replace_nan_with_mean(target_features)
model, accuracy = train_model(clean_target, clean_features, model_parameters, 0.2)
prediction = model.predict(target_features)
accuracy_array = [accuracy]*prediction.shape[0]
return zip(target_ids, prediction, np.full(prediction.shape, accuracy_array))
def create_and_predict_segment(query, variable, target_query, model_params):
"""
generate a segment with machine learning
Stuart Lynn
"""
## fetch column names
try:
columns = plpy.execute('SELECT * FROM ({query}) As a LIMIT 1 '.format(query=query))[0].keys()
except Exception, e:
plpy.error('Failed to build segmentation model: %s' % e)
## extract column names to be used in building the segmentation model
feature_columns = set(columns) - set([variable, 'cartodb_id', 'the_geom', 'the_geom_webmercator'])
## get data from database
target, features = get_data(variable, feature_columns, query)
model, accuracy = train_model(target, features, model_params, 0.2)
cartodb_ids, result = predict_segment(model, feature_columns, target_query)
accuracy_array = [accuracy]*result.shape[0]
return zip(cartodb_ids, result, accuracy_array)
def train_model(target, features, model_params, test_split):
"""
Train the Gradient Boosting model on the provided data and calculate the accuracy of the model
Input:
@param target: 1D array of the variable that the model is to be trained to predict
@param features: 2D array (NSamples x NFeatures) to use in training the model
@param model_params: A dictionary of model parameters, the full specification can be found on the
scikit-learn page for [GradientBoostingRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
@param test_split: The fraction of the data to be withheld for testing the model / calculating the accuracy
"""
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=test_split)
model = GradientBoostingRegressor(**model_params)
model.fit(features_train, target_train)
accuracy = calculate_model_accuracy(model, features, target)
return model, accuracy
def calculate_model_accuracy(model, features, target):
"""
Calculate the mean squared error of the model prediction
Input:
@param model: model trained from input features
@param features: features to make a prediction from
@param target: target to compare prediction to
Output:
mean squared error of the model prediction compared to the target
"""
prediction = model.predict(features)
return metrics.mean_squared_error(prediction, target)
def predict_segment(model, features, target_query):
"""
Use the provided model to predict the values for the new feature set
Input:
@param model: The pretrained model
@param features: A list of features to use in the model prediction (list of column names)
@param target_query: The query to run to obtain the data to predict on and the cartodb_ids associated with it.
"""
batch_size = 1000
joined_features = ','.join(['"{0}"::numeric'.format(a) for a in features])
try:
cursor = plpy.cursor('SELECT Array[{joined_features}] As features FROM ({target_query}) As a'.format(
joined_features=joined_features,
target_query=target_query))
except Exception, e:
plpy.error('Failed to build segmentation model: %s' % e)
results = []
while True:
rows = cursor.fetch(batch_size)
if not rows:
break
batch = np.row_stack([np.array(row['features'], dtype=float) for row in rows])
# TODO: use the global mean here; per-batch means can cause artifacts at batch boundaries
batch = replace_nan_with_mean(batch)
prediction = model.predict(batch)
results.append(prediction)
try:
cartodb_ids = plpy.execute('''SELECT array_agg(cartodb_id ORDER BY cartodb_id) As cartodb_ids FROM ({0}) As a'''.format(target_query))[0]['cartodb_ids']
except Exception, e:
plpy.error('Failed to build segmentation model: %s' % e)
return cartodb_ids, np.concatenate(results)

View File

@@ -0,0 +1,2 @@
"""Import all functions from clustering libraries."""
from markov import *

View File

@@ -0,0 +1,194 @@
"""
Spatial dynamics measurements using Spatial Markov
"""
# TODO: remove all plpy dependencies
import numpy as np
import pysal as ps
import plpy
import crankshaft.pysal_utils as pu
from crankshaft.analysis_data_provider import AnalysisDataProvider
class Markov:
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()
else:
self.data_provider = data_provider
def spatial_trend(self, subquery, time_cols, num_classes=7,
w_type='knn', num_ngbrs=5, permutations=0,
geom_col='the_geom', id_col='cartodb_id'):
"""
Predict the trends of a unit based on:
1. history of its transitions to different classes (e.g., 1st
quantile -> 2nd quantile)
2. average class of its neighbors
Inputs:
@param subquery string: e.g., SELECT the_geom, cartodb_id,
interesting_time_column FROM table_name
@param time_cols list of strings: list of strings of column names
@param num_classes (optional): number of classes to break
distribution of values into. Currently uses quantile bins.
@param w_type string (optional): weight type ('knn' or 'queen')
@param num_ngbrs int (optional): number of neighbors (if knn type)
@param permutations int (optional): number of permutations for test
stats
@param geom_col string (optional): name of column which contains
the geometries
@param id_col string (optional): name of column which has the ids
of the table
Outputs:
@param trend_up float: probability that a geom will move to a higher
class
@param trend_down float: probability that a geom will move to a
lower class
@param trend float: (trend_up - trend_down) / probability of staying
in the same class
@param volatility float: a measure of the volatility based on
probability stddev(prob array)
"""
if len(time_cols) < 2:
plpy.error('More than one time column needs to be passed')
params = {"id_col": id_col,
"time_cols": time_cols,
"geom_col": geom_col,
"subquery": subquery,
"num_ngbrs": num_ngbrs}
query_result = self.data_provider.get_markov(w_type, params)
# build weight
weights = pu.get_weight(query_result, w_type)
weights.transform = 'r'
# prep time data
t_data = get_time_data(query_result, time_cols)
sp_markov_result = ps.Spatial_Markov(t_data,
weights,
k=num_classes,
fixed=False,
permutations=permutations)
# get lag classes
lag_classes = ps.Quantiles(
ps.lag_spatial(weights, t_data[:, -1]),
k=num_classes).yb
# look up probability distribution for each unit according to class and
# lag class
prob_dist = get_prob_dist(sp_markov_result.P,
lag_classes,
sp_markov_result.classes[:, -1])
# find the ups and downs and overall distribution of each cell
trend_up, trend_down, trend, volatility = get_prob_stats(prob_dist, sp_markov_result.classes[:, -1])
# output the results
return zip(trend, trend_up, trend_down, volatility, weights.id_order)
def get_time_data(markov_data, time_cols):
"""
Extract the time columns and bin appropriately
"""
num_attrs = len(time_cols)
return np.array([[x['attr' + str(i)] for x in markov_data]
for i in range(1, num_attrs+1)], dtype=float).transpose()
# not currently used
def rebin_data(time_data, num_time_per_bin):
"""
Convert an n x l matrix into an n x ceil(l / m) matrix (for m =
num_time_per_bin), where the values of the binned columns are
reduced (averaged) for the intervening states:
1 2 3 4 1.5 3.5
5 6 7 8 -> 5.5 7.5
9 8 7 6 8.5 6.5
5 4 3 2 4.5 2.5
if m = 2, the 4 x 4 matrix above is transformed to a 4 x 2 matrix.
This process effectively resamples the data at a time span m times
longer than the input data.
For cases when there is a remainder (remainder(5/3) = 2), the remaining
two columns are binned together as the last time period, while the
first three are binned together for the first period.
Input:
@param time_data n x l ndarray: measurements of an attribute at
different time intervals
@param num_time_per_bin int: number of columns to average into a new
column
Output:
n x ceil(l / m) ndarray of resampled time series
"""
if time_data.shape[1] % num_time_per_bin == 0:
# if fit is perfect, then use it
n_max = time_data.shape[1] / num_time_per_bin
else:
# fit remainders into an additional column
n_max = time_data.shape[1] / num_time_per_bin + 1
return np.array(
[time_data[:, num_time_per_bin * i:num_time_per_bin * (i+1)].mean(axis=1)
for i in range(n_max)]).T
def get_prob_dist(transition_matrix, lag_indices, unit_indices):
"""
Given an array of transition matrices, look up the probability
associated with the arrangements passed
Input:
@param transition_matrix ndarray[k,k,k]:
@param lag_indices ndarray:
@param unit_indices ndarray:
Output:
Array of probability distributions
"""
return np.array([transition_matrix[(lag_indices[i], unit_indices[i])]
for i in range(len(lag_indices))])
def get_prob_stats(prob_dist, unit_indices):
"""
get the statistics of the probability distributions
Outputs:
@param trend_up ndarray(float): sum of probabilities for upward
movement (relative to the unit index of that prob)
@param trend_down ndarray(float): sum of probabilities for downward
movement (relative to the unit index of that prob)
@param trend ndarray(float): difference of upward and downward
movements
@param volatility ndarray(float): standard deviation of each unit's
probability distribution
"""
num_elements = len(unit_indices)
trend_up = np.empty(num_elements, dtype=float)
trend_down = np.empty(num_elements, dtype=float)
trend = np.empty(num_elements, dtype=float)
for i in range(num_elements):
trend_up[i] = prob_dist[i, (unit_indices[i]+1):].sum()
trend_down[i] = prob_dist[i, :unit_indices[i]].sum()
if prob_dist[i, unit_indices[i]] > 0.0:
trend[i] = (trend_up[i] - trend_down[i]) / (
prob_dist[i, unit_indices[i]])
else:
trend[i] = None
# calculate volatility of distribution
volatility = prob_dist.std(axis=1)
return trend_up, trend_down, trend, volatility
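# --- Hedged worked example of the logic in get_prob_stats() above; the
# probabilities below are invented for illustration, computed inline with
# plain numpy so the snippet runs standalone.
import numpy as np
prob_dist = np.array([[0.1, 0.7, 0.2, 0.0],
                      [0.0, 0.3, 0.6, 0.1]])
unit_indices = np.array([1, 2])  # each unit's current class
# trend_up sums the mass above the unit's class, trend_down the mass below
up0 = prob_dist[0, unit_indices[0] + 1:].sum()          # 0.2 + 0.0 = 0.2
down0 = prob_dist[0, :unit_indices[0]].sum()            # 0.1
trend0 = (up0 - down0) / prob_dist[0, unit_indices[0]]  # 0.1 / 0.7 ~ 0.143
volatility = prob_dist.std(axis=1)                      # spread of each row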

View File

@@ -0,0 +1,5 @@
joblib==0.8.3
numpy==1.6.1
scipy==0.14.0
pysal==1.11.2
scikit-learn==0.14.1

View File

@@ -0,0 +1,49 @@
"""
CartoDB Spatial Analysis Python Library
See:
https://github.com/CartoDB/crankshaft
"""
from setuptools import setup, find_packages
setup(
name='crankshaft',
version='0.5.1',
description='CartoDB Spatial Analysis Python Library',
url='https://github.com/CartoDB/crankshaft',
author='Data Services Team - CartoDB',
author_email='dataservices@cartodb.com',
license='MIT',
classifiers=[
'Development Status :: 3 - Alpha',
'Intended Audience :: Mapping community',
'Topic :: Maps :: Mapping Tools',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 2.7',
],
keywords='maps mapping tools spatial analysis geostatistics',
packages=find_packages(exclude=['contrib', 'docs', 'tests']),
extras_require={
'dev': ['unittest'],
'test': ['unittest', 'nose', 'mock'],
},
# The choice of component versions is dictated by what's
# provisioned in the production servers.
# IMPORTANT NOTE: please don't change this line. Instead issue a ticket to systems for evaluation.
install_requires=['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.11.2', 'scikit-learn==0.14.1'],
requires=['pysal', 'numpy', 'sklearn'],
test_suite='test'
)

View File

@@ -0,0 +1 @@
[[0.004793783909323601, 0.17999999999999999, 0.49808756424021061], [-1.0701189472090842, 0.079000000000000001, 0.14228288580832316], [-0.67867750971877305, 0.42099999999999999, 0.24867110969448558], [-0.67407386707620487, 0.246, 0.25013217644612995], [-0.79495689068870035, 0.33200000000000002, 0.21331928959090596], [-0.49279481022182703, 0.058999999999999997, 0.31107878905057329], [-0.38075627530057132, 0.28399999999999997, 0.35169205342069643], [-0.86710921611314895, 0.23699999999999999, 0.19294108571294855], [-0.78618647240956485, 0.050000000000000003, 0.2158791250244505], [-0.76108527223116984, 0.064000000000000001, 0.22330306830813684], [-0.13340753531942209, 0.247, 0.44693554317763651], [-0.57584545722033043, 0.48999999999999999, 0.28235982246156488], [-0.78882694661192831, 0.433, 0.2151065788731219], [-0.38769767950046219, 0.375, 0.34911988661484239], [-0.56057819488052207, 0.41399999999999998, 0.28754255985169652], [-0.41354017495644935, 0.45500000000000002, 0.339605447117173], [-0.23993577722243081, 0.49099999999999999, 0.40519002230969337], [-0.1389080156677496, 0.40400000000000003, 0.44476141839645233], [-0.25485737510500855, 0.376, 0.39941662953554224], [-0.71218610582902353, 0.17399999999999999, 0.23817476979886087], [-0.54533105995872144, 0.13700000000000001, 0.2927629228714812], [-0.39547917847510977, 0.033000000000000002, 0.34624464252424236], [-0.43052658996257548, 0.35399999999999998, 0.33340631435564982], [-0.37296719193774736, 0.40300000000000002, 0.35458643102865428], [-0.66482612169465694, 0.31900000000000001, 0.25308085650392698], [-0.13772133540823422, 0.34699999999999998, 0.44523032843016275], [-0.6765304487868502, 0.20999999999999999, 0.24935196033890672], [-0.64518763494323472, 0.32200000000000001, 0.25940279912025543], [-0.5078622084312413, 0.41099999999999998, 0.30577498972600159], [-0.12652006733772059, 0.42899999999999999, 0.44966013262301163], [-0.32691133022814595, 0.498, 0.37186747562269029], [0.25533848511500978, 0.42399999999999999, 0.39923083899077472], [2.7045138116476508, 0.0050000000000000001, 0.0034202212972238577], [-0.1551614486076057, 0.44400000000000001, 0.43834701985429037], [1.9524487722567723, 0.012999999999999999, 0.025442473674991528], [-1.2055816465306763, 0.017000000000000001, 0.11398941970467646], [3.478472976017831, 0.002, 0.00025213964072468009], [-1.4621715757903719, 0.002, 0.071847099325659136], [-0.84010307600180256, 0.085000000000000006, 0.20042529779230778], [5.7097646237318243, 0.0030000000000000001, 5.6566262784940591e-09], [1.5082367956567375, 0.065000000000000002, 0.065746966514827365], [-0.58337270103430816, 0.44, 0.27982121546450034], [-0.083271860457022437, 0.45100000000000001, 0.46681768733385554], [-0.46872337815000953, 0.34599999999999997, 0.31963368715684204], [0.18490279849545319, 0.23799999999999999, 0.42665263797981101], [3.470424529947997, 0.012, 0.00025981817437825683], [-0.99942612137154796, 0.032000000000000001, 0.15879415560388499], [-1.3650387953594485, 0.034000000000000002, 0.08612042845912049], [1.8617160516432014, 0.081000000000000003, 0.03132156240215267], [1.1321188945775384, 0.11600000000000001, 0.12879222611766061], [0.064116686050580601, 0.27300000000000002, 0.4744386578180424], [-0.42032194540259099, 0.29999999999999999, 0.33712514016213468], [-0.79581215423980922, 0.123, 0.21307061309098785], [-0.42792753720906046, 0.45600000000000002, 0.33435193892883741], [-1.0629378527428395, 0.051999999999999998, 0.14390506780140866], [-0.54164761752225477, 0.33700000000000002, 0.29403064095211839], 
[1.0934778886820793, 0.13700000000000001, 0.13709201601893539], [-0.094068785378413719, 0.38200000000000001, 0.46252725802998929], [0.13482026574801856, 0.36799999999999999, 0.44637699118865737], [-0.13976995315653129, 0.34699999999999998, 0.44442087706276601], [-0.051047663924746682, 0.32000000000000001, 0.47964376985626245], [-0.21468297736730158, 0.41699999999999998, 0.41500724761906527], [-0.20873154637330626, 0.38800000000000001, 0.41732890604390893], [-0.32427876152583485, 0.49199999999999999, 0.37286349875557478], [-0.65254842943280977, 0.374, 0.25702372075306734], [-0.48611858196118796, 0.23300000000000001, 0.31344154643990074], [-0.14482354344529477, 0.32600000000000001, 0.44242509660469886], [-0.51052030974200002, 0.439, 0.30484349480873729], [0.56814382285283538, 0.14999999999999999, 0.28496865660103166], [0.58680919931668207, 0.161, 0.27866592887231878], [0.013390357044409013, 0.25800000000000001, 0.49465818005865647], [-0.19050728887961568, 0.41399999999999998, 0.4244558160399462], [-0.60531777422216049, 0.35199999999999998, 0.2724839368239631], [1.0899331115425805, 0.127, 0.13787130480311838], [0.17015055382651084, 0.36899999999999999, 0.43244586845546418], [-0.21738337124409801, 0.40600000000000003, 0.41395479459421991], [1.0329303331079593, 0.079000000000000001, 0.15081825117169467], [1.0218317101096221, 0.104, 0.15343027913308094]]

View File

@@ -0,0 +1 @@
[{"xs": [9.917239463463458, 9.042767302696836, 10.798929825304187, 8.763751051762995, 11.383882954810852, 11.018206993460897, 8.939526075734316, 9.636159342565252, 10.136336896960058, 11.480610059427342, 12.115011910725082, 9.173267848893428, 10.239300931201738, 8.00012512174072, 8.979962292282131, 9.318376124429575, 10.82259513754284, 10.391747171927115, 10.04904588886165, 9.96007160443463, -0.78825626804569, -0.3511819898577426, -1.2796410003764271, -0.3977049391203402, 2.4792311265774667, 1.3670311632092624, 1.2963504112955613, 2.0404844103073025, -1.6439708506073223, 0.39122885445645805, 1.026031821452462, -0.04044477160482201, -0.7442346929085072, -0.34687120826243034, -0.23420359971379054, -0.5919629143336708, -0.202903054395391, -0.1893399644841902, 1.9331834251176807, -0.12321054392851609], "ys": [8.735627063679981, 9.857615954045011, 10.81439096759407, 10.586727233537191, 9.232919976568622, 11.54281262696508, 8.392787912674466, 9.355119689665944, 9.22380703532752, 10.542142541823122, 10.111980619367035, 10.760836265570738, 8.819773453269804, 10.25325722424816, 9.802077905695608, 8.955420161552611, 9.833801181904477, 10.491684241001613, 12.076108669877556, 11.74289693140474, -0.5685725015474191, -0.5715728344759778, -0.20180907868635137, 0.38431336480089595, -0.3402202083684184, -2.4652736827783586, 0.08295159401756182, 0.8503818775816505, 0.6488691600321166, 0.5794762568230527, -0.6770063922144103, -0.6557616416449478, -1.2834289177624947, 0.1096318195532717, -0.38986922166834853, -1.6224497706950238, 0.09429787743230483, 0.4005097316394031, -0.508002811195673, -1.2473463371366507], "ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]}]

View File

@@ -0,0 +1 @@
[[0.11111111111111112, 0.10000000000000001, 0.0, 0.35213633723318016, 0], [0.03125, 0.030303030303030304, 0.0, 0.3850273981640871, 1], [0.03125, 0.030303030303030304, 0.0, 0.3850273981640871, 2], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 3], [0.0, 0.065217391304347824, 0.065217391304347824, 0.33605067580764519, 4], [-0.054054054054054057, 0.0, 0.05128205128205128, 0.37488547451276033, 5], [0.1875, 0.23999999999999999, 0.12, 0.23731835158706122, 6], [0.034482758620689655, 0.0625, 0.03125, 0.35388469167230169, 7], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 8], [0.19047619047619049, 0.16, 0.0, 0.32594478059941379, 9], [-0.23529411764705882, 0.0, 0.19047619047619047, 0.31356338348865387, 10], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 11], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 12], [0.027777777777777783, 0.11111111111111112, 0.088888888888888892, 0.30339641183779581, 13], [0.03125, 0.030303030303030304, 0.0, 0.3850273981640871, 14], [0.052631578947368425, 0.090909090909090912, 0.045454545454545456, 0.33352611505171165, 15], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 16], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 17], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 18], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 19], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 20], [0.078947368421052641, 0.073170731707317083, 0.0, 0.36451788667842738, 21], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 22], [-0.16666666666666663, 0.18181818181818182, 0.27272727272727271, 0.20246415864836445, 23], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 24], [0.1875, 0.23999999999999999, 0.12, 0.23731835158706122, 25], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 26], [-0.043478260869565216, 0.0, 0.041666666666666664, 0.37950991789118999, 27], [0.22222222222222221, 0.18181818181818182, 0.0, 0.31701083225750354, 28], [-0.054054054054054057, 0.0, 0.05128205128205128, 0.37488547451276033, 29], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 30], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 31], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 32], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 33], [0.034482758620689655, 0.0625, 0.03125, 0.35388469167230169, 34], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 35], [-0.054054054054054057, 0.0, 0.05128205128205128, 0.37488547451276033, 36], [0.11111111111111112, 0.10000000000000001, 0.0, 0.35213633723318016, 37], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 38], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 39], [0.034482758620689655, 0.0625, 0.03125, 0.35388469167230169, 40], [0.11111111111111112, 0.10000000000000001, 0.0, 0.35213633723318016, 41], [0.052631578947368425, 0.090909090909090912, 0.045454545454545456, 0.33352611505171165, 42], [0.0, 0.0, 0.0, 0.40000000000000002, 43], [0.0, 0.065217391304347824, 0.065217391304347824, 0.33605067580764519, 44], [0.078947368421052641, 0.073170731707317083, 0.0, 0.36451788667842738, 45], [0.052631578947368425, 
0.090909090909090912, 0.045454545454545456, 0.33352611505171165, 46], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 47]]

View File

@@ -0,0 +1,52 @@
[[0.9319096128346788, "HH"],
[-1.135787401862846, "HL"],
[0.11732030672508517, "LL"],
[0.6152779669180425, "LL"],
[-0.14657336660125297, "LH"],
[0.6967858120189607, "LL"],
[0.07949310115714454, "HH"],
[0.4703198759258987, "HH"],
[0.4421125200498064, "HH"],
[0.5724288737143592, "LL"],
[0.8970743435692062, "LL"],
[0.18327334401918674, "LL"],
[-0.01466729201304962, "HL"],
[0.3481559372544409, "LL"],
[0.06547094736902978, "LL"],
[0.15482141569329988, "HH"],
[0.4373841193538136, "HH"],
[0.15971286468915544, "LL"],
[1.0543588860308968, "HH"],
[1.7372866900020818, "HH"],
[1.091998586053999, "LL"],
[0.1171572584252222, "HH"],
[0.08438455015300014, "LL"],
[0.06547094736902978, "LL"],
[0.15482141569329985, "HH"],
[1.1627044812890683, "HH"],
[0.06547094736902978, "LL"],
[0.795275137550483, "HH"],
[0.18562939195219, "LL"],
[0.3010757406693439, "LL"],
[2.8205795942839376, "HH"],
[0.11259190602909264, "LL"],
[-0.07116352791516614, "HL"],
[-0.09945240794119009, "LH"],
[0.18562939195219, "LL"],
[0.1832733440191868, "LL"],
[-0.39054253768447705, "HL"],
[-0.1672071289487642, "HL"],
[0.3337669247916343, "HH"],
[0.2584386102554792, "HH"],
[-0.19733845476322634, "HL"],
[-0.9379282899805409, "LH"],
[-0.028770969951095866, "LH"],
[0.051367269430983485, "LL"],
[-0.2172548045913472, "LH"],
[0.05136726943098351, "LL"],
[0.04191046803899837, "LL"],
[0.7482357030403517, "HH"],
[-0.014585767863118111, "LH"],
[0.5410013139159929, "HH"],
[1.0223932668429925, "LL"],
[1.4179402898927476, "LL"]]

View File

@@ -0,0 +1,54 @@
[
{"neighbors": [48, 26, 20, 9, 31], "id": 1, "value": 0.5},
{"neighbors": [30, 16, 46, 3, 4], "id": 2, "value": 0.7},
{"neighbors": [46, 30, 2, 12, 16], "id": 3, "value": 0.2},
{"neighbors": [18, 30, 23, 2, 52], "id": 4, "value": 0.1},
{"neighbors": [47, 40, 45, 37, 28], "id": 5, "value": 0.3},
{"neighbors": [10, 21, 41, 14, 37], "id": 6, "value": 0.05},
{"neighbors": [8, 17, 43, 25, 12], "id": 7, "value": 0.4},
{"neighbors": [17, 25, 43, 22, 7], "id": 8, "value": 0.7},
{"neighbors": [39, 34, 1, 26, 48], "id": 9, "value": 0.5},
{"neighbors": [6, 37, 5, 45, 49], "id": 10, "value": 0.04},
{"neighbors": [51, 41, 29, 21, 14], "id": 11, "value": 0.08},
{"neighbors": [44, 46, 43, 50, 3], "id": 12, "value": 0.2},
{"neighbors": [45, 23, 14, 28, 18], "id": 13, "value": 0.4},
{"neighbors": [41, 29, 13, 23, 6], "id": 14, "value": 0.2},
{"neighbors": [36, 27, 32, 33, 24], "id": 15, "value": 0.3},
{"neighbors": [19, 2, 46, 44, 28], "id": 16, "value": 0.4},
{"neighbors": [8, 25, 43, 7, 22], "id": 17, "value": 0.6},
{"neighbors": [23, 4, 29, 14, 13], "id": 18, "value": 0.3},
{"neighbors": [42, 16, 28, 26, 40], "id": 19, "value": 0.7},
{"neighbors": [1, 48, 31, 26, 42], "id": 20, "value": 0.8},
{"neighbors": [41, 6, 11, 14, 10], "id": 21, "value": 0.1},
{"neighbors": [25, 50, 43, 31, 44], "id": 22, "value": 0.4},
{"neighbors": [18, 13, 14, 4, 2], "id": 23, "value": 0.1},
{"neighbors": [33, 49, 34, 47, 27], "id": 24, "value": 0.3},
{"neighbors": [43, 8, 22, 17, 50], "id": 25, "value": 0.4},
{"neighbors": [1, 42, 20, 31, 48], "id": 26, "value": 0.6},
{"neighbors": [32, 15, 36, 33, 24], "id": 27, "value": 0.3},
{"neighbors": [40, 45, 19, 5, 13], "id": 28, "value": 0.8},
{"neighbors": [11, 51, 41, 14, 18], "id": 29, "value": 0.3},
{"neighbors": [2, 3, 4, 46, 18], "id": 30, "value": 0.1},
{"neighbors": [20, 26, 1, 50, 48], "id": 31, "value": 0.9},
{"neighbors": [27, 36, 15, 49, 24], "id": 32, "value": 0.3},
{"neighbors": [24, 27, 49, 34, 32], "id": 33, "value": 0.4},
{"neighbors": [47, 9, 39, 40, 24], "id": 34, "value": 0.3},
{"neighbors": [38, 51, 11, 21, 41], "id": 35, "value": 0.3},
{"neighbors": [15, 32, 27, 49, 33], "id": 36, "value": 0.2},
{"neighbors": [49, 10, 5, 47, 24], "id": 37, "value": 0.5},
{"neighbors": [35, 21, 51, 11, 41], "id": 38, "value": 0.4},
{"neighbors": [9, 34, 48, 1, 47], "id": 39, "value": 0.6},
{"neighbors": [28, 47, 5, 9, 34], "id": 40, "value": 0.5},
{"neighbors": [11, 14, 29, 21, 6], "id": 41, "value": 0.4},
{"neighbors": [26, 19, 1, 9, 31], "id": 42, "value": 0.2},
{"neighbors": [25, 12, 8, 22, 44], "id": 43, "value": 0.3},
{"neighbors": [12, 50, 46, 16, 43], "id": 44, "value": 0.2},
{"neighbors": [28, 13, 5, 40, 19], "id": 45, "value": 0.3},
{"neighbors": [3, 12, 44, 2, 16], "id": 46, "value": 0.2},
{"neighbors": [34, 40, 5, 49, 24], "id": 47, "value": 0.3},
{"neighbors": [1, 20, 26, 9, 39], "id": 48, "value": 0.5},
{"neighbors": [24, 37, 47, 5, 33], "id": 49, "value": 0.2},
{"neighbors": [44, 22, 31, 42, 26], "id": 50, "value": 0.6},
{"neighbors": [11, 29, 41, 14, 21], "id": 51, "value": 0.01},
{"neighbors": [4, 18, 29, 51, 23], "id": 52, "value": 0.01}
]

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,13 @@
import unittest
from mock_plpy import MockPlPy
plpy = MockPlPy()
import sys
sys.modules['plpy'] = plpy
import os
def fixture_file(name):
dir = os.path.dirname(os.path.realpath(__file__))
return os.path.join(dir, 'fixtures', name)
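# --- Hedged usage note: the tests below resolve fixtures relative to this
# module, e.g. json.loads(open(fixture_file('moran.json')).read())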

View File

@@ -0,0 +1,54 @@
import re
class MockCursor:
def __init__(self, data):
self.cursor_pos = 0
self.data = data
def fetch(self, batch_size):
batch = self.data[self.cursor_pos:self.cursor_pos + batch_size]
self.cursor_pos += batch_size
return batch
class MockPlPy:
def __init__(self):
self._reset()
def _reset(self):
self.infos = []
self.notices = []
self.debugs = []
self.logs = []
self.warnings = []
self.errors = []
self.fatals = []
self.executes = []
self.results = []
self.prepares = []
self.results = []
def _define_result(self, query, result):
pattern = re.compile(query, re.IGNORECASE | re.MULTILINE)
self.results.append([pattern, result])
def notice(self, msg):
self.notices.append(msg)
def debug(self, msg):
self.notices.append(msg)
def info(self, msg):
self.infos.append(msg)
def cursor(self, query):
data = self.execute(query)
return MockCursor(data)
# TODO: additional arguments
def execute(self, query):
for result in self.results:
if result[0].match(query):
return result[1]
return []
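# --- Hedged usage sketch (relies on the MockPlPy/MockCursor classes above).
# Register a regex -> canned rows; any matching execute()/cursor() call then
# returns them. The query text and rows here are invented for illustration.
mock = MockPlPy()
mock._define_result(r'select \* from points',
                    [{'features': [1.0, 2.0]}, {'features': [3.0, 4.0]}])
cursor = mock.cursor('SELECT * FROM points')
first = cursor.fetch(1)  # -> [{'features': [1.0, 2.0]}]
rest = cursor.fetch(5)   # -> the remaining row; later calls return []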

View File

@@ -0,0 +1,78 @@
import unittest
import numpy as np
from helper import fixture_file
from crankshaft.clustering import Getis
import crankshaft.pysal_utils as pu
from crankshaft import random_seeds
import json
from crankshaft.analysis_data_provider import AnalysisDataProvider
# Fixture files produced as follows
#
# import pysal as ps
# import numpy as np
# import random
#
# # setup variables
# f = ps.open(ps.examples.get_path("stl_hom.dbf"))
# y = np.array(f.by_col['HR8893'])
# w_queen = ps.queen_from_shapefile(ps.examples.get_path("stl_hom.shp"))
#
# out_queen = [{"id": index + 1,
# "neighbors": [x+1 for x in w_queen.neighbors[index]],
# "value": val} for index, val in enumerate(y)]
#
# with open('neighbors_queen_getis.json', 'w') as f:
# f.write(str(out_queen))
#
# random.seed(1234)
# np.random.seed(1234)
# lgstar_queen = ps.esda.getisord.G_Local(y, w_queen, star=True,
# permutations=999)
#
# with open('getis_queen.json', 'w') as f:
# f.write(str(zip(lgstar_queen.z_sim,
# lgstar_queen.p_sim, lgstar_queen.p_z_sim)))
class FakeDataProvider(AnalysisDataProvider):
def __init__(self, mock_data):
self.mock_result = mock_data
def get_getis(self, w_type, param):
return self.mock_result
class GetisTest(unittest.TestCase):
"""Testing class for Getis-Ord's G* funtion
This test replicates the work done in PySAL documentation:
https://pysal.readthedocs.io/en/v1.11.0/users/tutorials/autocorrelation.html#local-g-and-g
"""
def setUp(self):
# load raw data for analysis
self.neighbors_data = json.loads(
open(fixture_file('neighbors_getis.json')).read())
# load pre-computed/known values
self.getis_data = json.loads(
open(fixture_file('getis.json')).read())
def test_getis_ord(self):
"""Test Getis-Ord's G*"""
data = [{'id': d['id'],
'attr1': d['value'],
'neighbors': d['neighbors']} for d in self.neighbors_data]
random_seeds.set_random_seeds(1234)
getis = Getis(FakeDataProvider(data))
result = getis.getis_ord('subquery', 'value',
'queen', None, 999, 'the_geom',
'cartodb_id')
result = [(row[0], row[1]) for row in result]
expected = np.array(self.getis_data)[:, 0:2]
for ([res_z, res_p], [exp_z, exp_p]) in zip(result, expected):
self.assertAlmostEqual(res_z, exp_z, delta=1e-2)

View File

@@ -0,0 +1,56 @@
import unittest
import numpy as np
# from mock_plpy import MockPlPy
# plpy = MockPlPy()
#
# import sys
# sys.modules['plpy'] = plpy
from helper import fixture_file
from crankshaft.clustering import Kmeans
from crankshaft.analysis_data_provider import AnalysisDataProvider
import crankshaft.clustering as cc
from crankshaft import random_seeds
import json
from collections import OrderedDict
class FakeDataProvider(AnalysisDataProvider):
def __init__(self, mocked_result):
self.mocked_result = mocked_result
def get_spatial_kmeans(self, query):
return self.mocked_result
def get_nonspatial_kmeans(self, query, standarize):
return self.mocked_result
class KMeansTest(unittest.TestCase):
"""Testing class for k-means spatial"""
def setUp(self):
self.cluster_data = json.loads(
open(fixture_file('kmeans.json')).read())
self.params = {"subquery": "select * from table",
"no_clusters": "10"}
def test_kmeans(self):
"""
"""
data = [{'xs': d['xs'],
'ys': d['ys'],
'ids': d['ids']} for d in self.cluster_data]
random_seeds.set_random_seeds(1234)
kmeans = Kmeans(FakeDataProvider(data))
clusters = kmeans.spatial('subquery', 2)
labels = [a[1] for a in clusters]
c1 = [a for a in clusters if a[1] == 0]
c2 = [a for a in clusters if a[1] == 1]
self.assertEqual(len(np.unique(labels)), 2)
self.assertEqual(len(c1), 20)
self.assertEqual(len(c2), 20)

View File

@@ -0,0 +1,112 @@
import unittest
import numpy as np
from helper import fixture_file
from crankshaft.clustering import Moran
from crankshaft.analysis_data_provider import AnalysisDataProvider
import crankshaft.pysal_utils as pu
from crankshaft import random_seeds
import json
from collections import OrderedDict
class FakeDataProvider(AnalysisDataProvider):
def __init__(self, mock_data):
self.mock_result = mock_data
def get_moran(self, w_type, params):
return self.mock_result
class MoranTest(unittest.TestCase):
"""Testing class for Moran's I functions"""
def setUp(self):
self.params = {"id_col": "cartodb_id",
"attr1": "andy",
"attr2": "jay_z",
"subquery": "SELECT * FROM a_list",
"geom_col": "the_geom",
"num_ngbrs": 321}
self.params_markov = {"id_col": "cartodb_id",
"time_cols": ["_2013_dec", "_2014_jan",
"_2014_feb"],
"subquery": "SELECT * FROM a_list",
"geom_col": "the_geom",
"num_ngbrs": 321}
self.neighbors_data = json.loads(
open(fixture_file('neighbors.json')).read())
self.moran_data = json.loads(
open(fixture_file('moran.json')).read())
def test_map_quads(self):
"""Test map_quads"""
from crankshaft.clustering import map_quads
self.assertEqual(map_quads(1), 'HH')
self.assertEqual(map_quads(2), 'LH')
self.assertEqual(map_quads(3), 'LL')
self.assertEqual(map_quads(4), 'HL')
self.assertEqual(map_quads(33), None)
self.assertEqual(map_quads('andy'), None)
def test_quad_position(self):
"""Test lisa_sig_vals"""
from crankshaft.clustering import quad_position
quads = np.array([1, 2, 3, 4], np.int)
ans = np.array(['HH', 'LH', 'LL', 'HL'])
test_ans = quad_position(quads)
self.assertTrue((test_ans == ans).all())
def test_local_stat(self):
"""Test Moran's I local"""
data = [OrderedDict([('id', d['id']),
('attr1', d['value']),
('neighbors', d['neighbors'])])
for d in self.neighbors_data]
moran = Moran(FakeDataProvider(data))
random_seeds.set_random_seeds(1234)
result = moran.local_stat('subquery', 'value',
'knn', 5, 99, 'the_geom', 'cartodb_id')
result = [(row[0], row[1]) for row in result]
zipped_values = zip(result, self.moran_data)
for ([res_val, res_quad], [exp_val, exp_quad]) in zipped_values:
self.assertAlmostEqual(res_val, exp_val)
self.assertEqual(res_quad, exp_quad)
def test_moran_local_rate(self):
"""Test Moran's I rate"""
data = [{'id': d['id'],
'attr1': d['value'],
'attr2': 1,
'neighbors': d['neighbors']} for d in self.neighbors_data]
random_seeds.set_random_seeds(1234)
moran = Moran(FakeDataProvider(data))
result = moran.local_rate_stat('subquery', 'numerator', 'denominator',
'knn', 5, 99, 'the_geom', 'cartodb_id')
result = [(row[0], row[1]) for row in result]
zipped_values = zip(result, self.moran_data)
for ([res_val, res_quad], [exp_val, exp_quad]) in zipped_values:
self.assertAlmostEqual(res_val, exp_val)
def test_moran(self):
"""Test Moran's I global"""
data = [{'id': d['id'],
'attr1': d['value'],
'neighbors': d['neighbors']} for d in self.neighbors_data]
random_seeds.set_random_seeds(1235)
moran = Moran(FakeDataProvider(data))
result = moran.global_stat('table', 'value',
'knn', 5, 99, 'the_geom',
'cartodb_id')
result_moran = result[0][0]
expected_moran = np.array([row[0] for row in self.moran_data]).mean()
self.assertAlmostEqual(expected_moran, result_moran, delta=10e-2)

View File

@@ -0,0 +1,160 @@
import unittest
import crankshaft.pysal_utils as pu
from crankshaft import random_seeds
from collections import OrderedDict
class PysalUtilsTest(unittest.TestCase):
"""Testing class for utility functions related to PySAL integrations"""
def setUp(self):
self.params1 = OrderedDict([("id_col", "cartodb_id"),
("attr1", "andy"),
("attr2", "jay_z"),
("subquery", "SELECT * FROM a_list"),
("geom_col", "the_geom"),
("num_ngbrs", 321)])
self.params2 = OrderedDict([("id_col", "cartodb_id"),
("numerator", "price"),
("denominator", "sq_meters"),
("subquery", "SELECT * FROM pecan"),
("geom_col", "the_geom"),
("num_ngbrs", 321)])
self.params3 = OrderedDict([("id_col", "cartodb_id"),
("numerator", "sq_meters"),
("denominator", "price"),
("subquery", "SELECT * FROM pecan"),
("geom_col", "the_geom"),
("num_ngbrs", 321)])
self.params_array = {"id_col": "cartodb_id",
"time_cols": ["_2013_dec", "_2014_jan", "_2014_feb"],
"subquery": "SELECT * FROM a_list",
"geom_col": "the_geom",
"num_ngbrs": 321}
def test_query_attr_select(self):
"""Test query_attr_select"""
ans1 = ("i.\"andy\"::numeric As attr1, "
"i.\"jay_z\"::numeric As attr2, ")
ans2 = ("i.\"price\"::numeric As attr1, "
"i.\"sq_meters\"::numeric As attr2, ")
ans3 = ("i.\"sq_meters\"::numeric As attr1, "
"i.\"price\"::numeric As attr2, ")
ans_array = ("i.\"_2013_dec\"::numeric As attr1, "
"i.\"_2014_jan\"::numeric As attr2, "
"i.\"_2014_feb\"::numeric As attr3, ")
self.assertEqual(pu.query_attr_select(self.params1), ans1)
self.assertEqual(pu.query_attr_select(self.params2), ans2)
self.assertEqual(pu.query_attr_select(self.params3), ans3)
self.assertEqual(pu.query_attr_select(self.params_array), ans_array)
def test_query_attr_where(self):
"""Test pu.query_attr_where"""
ans1 = ("idx_replace.\"andy\" IS NOT NULL AND "
"idx_replace.\"jay_z\" IS NOT NULL")
ans_array = ("idx_replace.\"_2013_dec\" IS NOT NULL AND "
"idx_replace.\"_2014_jan\" IS NOT NULL AND "
"idx_replace.\"_2014_feb\" IS NOT NULL")
self.assertEqual(pu.query_attr_where(self.params1), ans1)
self.assertEqual(pu.query_attr_where(self.params_array), ans_array)
def test_knn(self):
"""Test knn neighbors constructor"""
ans1 = "SELECT i.\"cartodb_id\" As id, " \
"i.\"andy\"::numeric As attr1, " \
"i.\"jay_z\"::numeric As attr2, " \
"(SELECT ARRAY(SELECT j.\"cartodb_id\" " \
"FROM (SELECT * FROM a_list) As j " \
"WHERE " \
"i.\"cartodb_id\" <> j.\"cartodb_id\" AND " \
"j.\"andy\" IS NOT NULL AND " \
"j.\"jay_z\" IS NOT NULL " \
"ORDER BY " \
"j.\"the_geom\" <-> i.\"the_geom\" ASC " \
"LIMIT 321)) As neighbors " \
"FROM (SELECT * FROM a_list) As i " \
"WHERE i.\"andy\" IS NOT NULL AND " \
"i.\"jay_z\" IS NOT NULL " \
"ORDER BY i.\"cartodb_id\" ASC;"
ans_array = "SELECT i.\"cartodb_id\" As id, " \
"i.\"_2013_dec\"::numeric As attr1, " \
"i.\"_2014_jan\"::numeric As attr2, " \
"i.\"_2014_feb\"::numeric As attr3, " \
"(SELECT ARRAY(SELECT j.\"cartodb_id\" " \
"FROM (SELECT * FROM a_list) As j " \
"WHERE i.\"cartodb_id\" <> j.\"cartodb_id\" AND " \
"j.\"_2013_dec\" IS NOT NULL AND " \
"j.\"_2014_jan\" IS NOT NULL AND " \
"j.\"_2014_feb\" IS NOT NULL " \
"ORDER BY j.\"the_geom\" <-> i.\"the_geom\" ASC " \
"LIMIT 321)) As neighbors " \
"FROM (SELECT * FROM a_list) As i " \
"WHERE i.\"_2013_dec\" IS NOT NULL AND " \
"i.\"_2014_jan\" IS NOT NULL AND " \
"i.\"_2014_feb\" IS NOT NULL "\
"ORDER BY i.\"cartodb_id\" ASC;"
self.assertEqual(pu.knn(self.params1), ans1)
self.assertEqual(pu.knn(self.params_array), ans_array)
def test_queen(self):
"""Test queen neighbors constructor"""
ans1 = "SELECT i.\"cartodb_id\" As id, " \
"i.\"andy\"::numeric As attr1, " \
"i.\"jay_z\"::numeric As attr2, " \
"(SELECT ARRAY(SELECT j.\"cartodb_id\" " \
"FROM (SELECT * FROM a_list) As j " \
"WHERE " \
"i.\"cartodb_id\" <> j.\"cartodb_id\" AND " \
"ST_Touches(i.\"the_geom\", " \
"j.\"the_geom\") AND " \
"j.\"andy\" IS NOT NULL AND " \
"j.\"jay_z\" IS NOT NULL)" \
") As neighbors " \
"FROM (SELECT * FROM a_list) As i " \
"WHERE i.\"andy\" IS NOT NULL AND " \
"i.\"jay_z\" IS NOT NULL " \
"ORDER BY i.\"cartodb_id\" ASC;"
self.assertEqual(pu.queen(self.params1), ans1)
def test_construct_neighbor_query(self):
"""Test construct_neighbor_query"""
# Compare to raw knn query
self.assertEqual(pu.construct_neighbor_query('knn', self.params1),
pu.knn(self.params1))
def test_get_attributes(self):
"""Test get_attributes"""
## need to add tests
self.assertEqual(True, True)
def test_get_weight(self):
"""Test get_weight"""
self.assertEqual(True, True)
def test_empty_zipped_array(self):
"""Test empty_zipped_array"""
ans2 = [(None, None)]
ans4 = [(None, None, None, None)]
self.assertEqual(pu.empty_zipped_array(2), ans2)
self.assertEqual(pu.empty_zipped_array(4), ans4)

View File

@@ -0,0 +1,64 @@
import unittest
import numpy as np
from helper import plpy, fixture_file
import crankshaft.segmentation as segmentation
import json
class SegmentationTest(unittest.TestCase):
"""Testing class for Moran's I functions"""
def setUp(self):
plpy._reset()
def generate_random_data(self, n_samples, random_state, row_type=False):
x1 = random_state.uniform(size=n_samples)
x2 = random_state.uniform(size=n_samples)
x3 = random_state.randint(0, 4, size=n_samples)
y = x1 + x2 * x2 + x3
cartodb_id = range(len(x1))
if row_type:
return [{'features': vals} for vals in zip(x1, x2, x3)], y
else:
return [dict(zip(['x1', 'x2', 'x3', 'target', 'cartodb_id'],
[x1, x2, x3, y, cartodb_id]))]
def test_replace_nan_with_mean(self):
"""Test replace_nan_with_mean"""
# TODO: incomplete -- no assertion is made on the fixture below
test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan])
def test_create_and_predict_segment(self):
n_samples = 1000
random_state_train = np.random.RandomState(13)
random_state_test = np.random.RandomState(134)
training_data = self.generate_random_data(n_samples, random_state_train)
test_data, test_y = self.generate_random_data(n_samples, random_state_test, row_type=True)
ids = [{'cartodb_ids': range(len(test_data))}]
rows = [{'x1': 0, 'x2': 0, 'x3': 0, 'y': 0, 'cartodb_id': 0}]
plpy._define_result('select \* from \(select \* from training\) a limit 1', rows)
plpy._define_result('.*from \(select \* from training\) as a', training_data)
plpy._define_result('select array_agg\(cartodb\_id order by cartodb\_id\) as cartodb_ids from \(.*\) a', ids)
plpy._define_result('.*select \* from test.*', test_data)
model_parameters = {'n_estimators': 1200,
'max_depth': 3,
'subsample': 0.5,
'learning_rate': 0.01,
'min_samples_leaf': 1}
result = segmentation.create_and_predict_segment(
'select * from training',
'target',
'select * from test',
model_parameters)
prediction = [r[1] for r in result]
accuracy = np.sqrt(np.mean(np.square(np.array(prediction) -
np.array(test_y))))
self.assertEqual(len(result), len(test_data))
self.assertTrue(result[0][2] < 0.01)
self.assertTrue(accuracy < 0.5 * np.mean(test_y))

View File

@@ -0,0 +1,349 @@
import unittest
import numpy as np
import unittest
from helper import fixture_file
from crankshaft.space_time_dynamics import Markov
import crankshaft.space_time_dynamics as std
from crankshaft import random_seeds
from crankshaft.analysis_data_provider import AnalysisDataProvider
import json
class FakeDataProvider(AnalysisDataProvider):
def __init__(self, data):
self.mock_result = data
def get_markov(self, w_type, params):
return self.mock_result
class SpaceTimeTests(unittest.TestCase):
"""Testing class for Markov Functions."""
def setUp(self):
self.params = {"id_col": "cartodb_id",
"time_cols": ['dec_2013', 'jan_2014', 'feb_2014'],
"subquery": "SELECT * FROM a_list",
"geom_col": "the_geom",
"num_ngbrs": 321}
self.neighbors_data = json.loads(
open(fixture_file('neighbors_markov.json')).read())
self.markov_data = json.loads(open(fixture_file('markov.json')).read())
self.time_data = np.array([i * np.ones(10, dtype=float)
for i in range(10)]).T
self.transition_matrix = np.array([
[[0.96341463, 0.0304878, 0.00609756, 0., 0.],
[0.06040268, 0.83221477, 0.10738255, 0., 0.],
[0., 0.14, 0.74, 0.12, 0.],
[0., 0.03571429, 0.32142857, 0.57142857, 0.07142857],
[0., 0., 0., 0.16666667, 0.83333333]],
[[0.79831933, 0.16806723, 0.03361345, 0., 0.],
[0.0754717, 0.88207547, 0.04245283, 0., 0.],
[0.00537634, 0.06989247, 0.8655914, 0.05913978, 0.],
[0., 0., 0.06372549, 0.90196078, 0.03431373],
[0., 0., 0., 0.19444444, 0.80555556]],
[[0.84693878, 0.15306122, 0., 0., 0.],
[0.08133971, 0.78947368, 0.1291866, 0., 0.],
[0.00518135, 0.0984456, 0.79274611, 0.0984456, 0.00518135],
[0., 0., 0.09411765, 0.87058824, 0.03529412],
[0., 0., 0., 0.10204082, 0.89795918]],
[[0.8852459, 0.09836066, 0., 0.01639344, 0.],
[0.03875969, 0.81395349, 0.13953488, 0., 0.00775194],
[0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505],
[0., 0.02339181, 0.12865497, 0.75438596, 0.09356725],
[0., 0., 0., 0.09661836, 0.90338164]],
[[0.33333333, 0.66666667, 0., 0., 0.],
[0.0483871, 0.77419355, 0.16129032, 0.01612903, 0.],
[0.01149425, 0.16091954, 0.74712644, 0.08045977, 0.],
[0., 0.01036269, 0.06217617, 0.89637306, 0.03108808],
[0., 0., 0., 0.02352941, 0.97647059]]]
)
def test_spatial_markov(self):
"""Test Spatial Markov."""
data = [{'id': d['id'],
'attr1': d['y1995'],
'attr2': d['y1996'],
'attr3': d['y1997'],
'attr4': d['y1998'],
'attr5': d['y1999'],
'attr6': d['y2000'],
'attr7': d['y2001'],
'attr8': d['y2002'],
'attr9': d['y2003'],
'attr10': d['y2004'],
'attr11': d['y2005'],
'attr12': d['y2006'],
'attr13': d['y2007'],
'attr14': d['y2008'],
'attr15': d['y2009'],
'neighbors': d['neighbors']} for d in self.neighbors_data]
# print(str(data[0]))
markov = Markov(FakeDataProvider(data))
random_seeds.set_random_seeds(1234)
result = markov.spatial_trend('subquery',
['y1995', 'y1996', 'y1997', 'y1998',
'y1999', 'y2000', 'y2001', 'y2002',
'y2003', 'y2004', 'y2005', 'y2006',
'y2007', 'y2008', 'y2009'],
5, 'knn', 5, 0, 'the_geom',
'cartodb_id')
self.assertTrue(result is not None)
result = [(row[0], row[1], row[2], row[3], row[4]) for row in result]
# print(result[0])
expected = self.markov_data
for ([res_trend, res_up, res_down, res_vol, res_id],
[exp_trend, exp_up, exp_down, exp_vol, exp_id]
) in zip(result, expected):
self.assertAlmostEqual(res_trend, exp_trend)
def test_get_time_data(self):
"""Test get_time_data"""
data = [{'attr1': d['y1995'],
'attr2': d['y1996'],
'attr3': d['y1997'],
'attr4': d['y1998'],
'attr5': d['y1999'],
'attr6': d['y2000'],
'attr7': d['y2001'],
'attr8': d['y2002'],
'attr9': d['y2003'],
'attr10': d['y2004'],
'attr11': d['y2005'],
'attr12': d['y2006'],
'attr13': d['y2007'],
'attr14': d['y2008'],
'attr15': d['y2009']} for d in self.neighbors_data]
result = std.get_time_data(data, ['y1995', 'y1996', 'y1997', 'y1998',
'y1999', 'y2000', 'y2001', 'y2002',
'y2003', 'y2004', 'y2005', 'y2006',
'y2007', 'y2008', 'y2009'])
# expected was prepared from PySAL example:
# f = ps.open(ps.examples.get_path("usjoin.csv"))
# pci = np.array([f.by_col[str(y)]
# for y in range(1995, 2010)]).transpose()
# rpci = pci / (pci.mean(axis = 0))
expected = np.array(
[[0.87654416, 0.863147, 0.85637567, 0.84811668, 0.8446154,
0.83271652, 0.83786314, 0.85012593, 0.85509656, 0.86416612,
0.87119375, 0.86302631, 0.86148267, 0.86252252, 0.86746356],
[0.9188951, 0.91757931, 0.92333258, 0.92517289, 0.92552388,
0.90746978, 0.89830489, 0.89431991, 0.88924794, 0.89815176,
0.91832091, 0.91706054, 0.90139505, 0.87897455, 0.86216858],
[0.82591007, 0.82548596, 0.81989793, 0.81503235, 0.81731522,
0.78964559, 0.80584442, 0.8084998, 0.82258551, 0.82668196,
0.82373724, 0.81814804, 0.83675961, 0.83574199, 0.84647177],
[1.09088176, 1.08537689, 1.08456418, 1.08415404, 1.09898841,
1.14506948, 1.12151133, 1.11160697, 1.10888621, 1.11399806,
1.12168029, 1.13164797, 1.12958508, 1.11371818, 1.09936775],
[1.10731446, 1.11373944, 1.13283638, 1.14472559, 1.15910025,
1.16898201, 1.17212488, 1.14752303, 1.11843284, 1.11024964,
1.11943471, 1.11736468, 1.10863242, 1.09642516, 1.07762337],
[1.42269757, 1.42118434, 1.44273502, 1.43577571, 1.44400684,
1.44184737, 1.44782832, 1.41978227, 1.39092208, 1.4059372,
1.40788646, 1.44052766, 1.45241216, 1.43306098, 1.4174431],
[1.13073885, 1.13110513, 1.11074708, 1.13364636, 1.13088149,
1.10888138, 1.11856629, 1.13062931, 1.11944984, 1.12446239,
1.11671008, 1.10880034, 1.08401709, 1.06959206, 1.07875225],
[1.04706124, 1.04516831, 1.04253372, 1.03239987, 1.02072545,
0.99854316, 0.9880258, 0.99669587, 0.99327676, 1.01400905,
1.03176742, 1.040511, 1.01749645, 0.9936394, 0.98279746],
[0.98996986, 1.00143564, 0.99491, 1.00188408, 1.00455845,
0.99127006, 0.97925917, 0.9683482, 0.95335147, 0.93694787,
0.94308213, 0.92232874, 0.91284091, 0.89689833, 0.88928858],
[0.87418391, 0.86416601, 0.84425695, 0.8404494, 0.83903044,
0.8578708, 0.86036185, 0.86107306, 0.8500772, 0.86981998,
0.86837929, 0.87204141, 0.86633032, 0.84946077, 0.83287146],
[1.14196118, 1.14660262, 1.14892712, 1.14909594, 1.14436624,
1.14450183, 1.12349752, 1.12596664, 1.12213996, 1.1119989,
1.10257792, 1.10491258, 1.11059842, 1.10509795, 1.10020097],
[0.97282463, 0.96700147, 0.96252588, 0.9653878, 0.96057687,
0.95831051, 0.94480909, 0.94804195, 0.95430286, 0.94103989,
0.92122519, 0.91010201, 0.89280392, 0.89298243, 0.89165385],
[0.94325468, 0.96436902, 0.96455242, 0.95243009, 0.94117647,
0.9480927, 0.93539182, 0.95388718, 0.94597005, 0.96918424,
0.94781281, 0.93466815, 0.94281559, 0.96520315, 0.96715441],
[0.97478408, 0.98169225, 0.98712809, 0.98474769, 0.98559897,
0.98687073, 0.99237486, 0.98209969, 0.9877653, 0.97399471,
0.96910087, 0.98416665, 0.98423613, 0.99823861, 0.99545704],
[0.85570269, 0.85575915, 0.85986132, 0.85693406, 0.8538012,
0.86191535, 0.84981451, 0.85472102, 0.84564835, 0.83998883,
0.83478547, 0.82803648, 0.8198736, 0.82265395, 0.8399404],
[0.87022047, 0.85996258, 0.85961813, 0.85689572, 0.83947136,
0.82785597, 0.86008789, 0.86776298, 0.86720209, 0.8676334,
0.89179317, 0.94202108, 0.9422231, 0.93902708, 0.94479184],
[0.90134907, 0.90407738, 0.90403991, 0.90201769, 0.90399238,
0.90906632, 0.92693339, 0.93695966, 0.94242697, 0.94338265,
0.91981796, 0.91108804, 0.90543476, 0.91737138, 0.94793657],
[1.1977611, 1.18222564, 1.18439158, 1.18267865, 1.19286723,
1.20172869, 1.21328691, 1.22624778, 1.22397075, 1.23857042,
1.24419893, 1.23929384, 1.23418676, 1.23626739, 1.26754398],
[1.24919678, 1.25754773, 1.26991161, 1.28020651, 1.30625667,
1.34790023, 1.34399863, 1.32575181, 1.30795492, 1.30544841,
1.30303302, 1.32107766, 1.32936244, 1.33001241, 1.33288462],
[1.06768004, 1.03799276, 1.03637303, 1.02768449, 1.03296093,
1.05059016, 1.03405057, 1.02747623, 1.03162734, 0.9961416,
0.97356208, 0.94241549, 0.92754547, 0.92549227, 0.92138102],
[1.09475614, 1.11526796, 1.11654299, 1.13103948, 1.13143264,
1.13889622, 1.12442212, 1.13367018, 1.13982256, 1.14029944,
1.11979401, 1.10905389, 1.10577769, 1.11166825, 1.09985155],
[0.76530058, 0.76612841, 0.76542451, 0.76722683, 0.76014284,
0.74480073, 0.76098396, 0.76156903, 0.76651952, 0.76533288,
0.78205934, 0.76842416, 0.77487118, 0.77768683, 0.78801192],
[0.98391336, 0.98075816, 0.98295341, 0.97386015, 0.96913803,
0.97370819, 0.96419154, 0.97209861, 0.97441313, 0.96356162,
0.94745352, 0.93965462, 0.93069645, 0.94020973, 0.94358232],
[0.83561828, 0.82298088, 0.81738502, 0.81748588, 0.80904801,
0.80071489, 0.83358256, 0.83451613, 0.85175032, 0.85954307,
0.86790024, 0.87170334, 0.87863799, 0.87497981, 0.87888675],
[0.98845573, 1.02092428, 0.99665283, 0.99141823, 0.99386619,
0.98733195, 0.99644997, 0.99669587, 1.02559097, 1.01116651,
0.99988024, 0.97906749, 0.99323123, 1.00204939, 0.99602148],
[1.14930913, 1.15241949, 1.14300962, 1.14265542, 1.13984683,
1.08312397, 1.05192626, 1.04230892, 1.05577278, 1.08569751,
1.12443486, 1.08891079, 1.08603695, 1.05997314, 1.02160943],
[1.11368269, 1.1057147, 1.11893431, 1.13778669, 1.1432272,
1.18257029, 1.16226243, 1.16009196, 1.14467789, 1.14820235,
1.12386598, 1.12680236, 1.12357937, 1.1159258, 1.12570828],
[1.30379431, 1.30752186, 1.31206366, 1.31532267, 1.30625667,
1.31210239, 1.29989156, 1.29203193, 1.27183516, 1.26830786,
1.2617743, 1.28656675, 1.29734097, 1.29390205, 1.29345446],
[0.83953719, 0.82701448, 0.82006005, 0.81188876, 0.80294864,
0.78772975, 0.82848011, 0.8259679, 0.82435705, 0.83108634,
0.84373784, 0.83891093, 0.84349247, 0.85637272, 0.86539395],
[1.23450087, 1.2426022, 1.23537935, 1.23581293, 1.24522626,
1.2256767, 1.21126648, 1.19377804, 1.18355337, 1.19674434,
1.21536573, 1.23653297, 1.27962009, 1.27968392, 1.25907738],
[0.9769662, 0.97400719, 0.98035944, 0.97581531, 0.95543282,
0.96480308, 0.94686376, 0.93679073, 0.92540049, 0.92988835,
0.93442917, 0.92100464, 0.91475304, 0.90249622, 0.9021363],
[0.84986886, 0.8986851, 0.84295997, 0.87280534, 0.85659368,
0.88937573, 0.894401, 0.90448993, 0.95495898, 0.92698333,
0.94745352, 0.92562488, 0.96635366, 1.02520312, 1.0394296],
[1.01922808, 1.00258203, 1.00974428, 1.00303417, 0.99765073,
1.00759019, 0.99192968, 0.99747298, 0.99550759, 0.97583768,
0.9610168, 0.94779638, 0.93759089, 0.93353431, 0.94121705],
[0.86367411, 0.85558932, 0.85544346, 0.85103025, 0.84336613,
0.83434854, 0.85813595, 0.84667961, 0.84374558, 0.85951183,
0.87194227, 0.89455097, 0.88283929, 0.90349491, 0.90600675],
[1.00947534, 1.00411055, 1.00698819, 0.99513687, 0.99291086,
1.00581626, 0.98850522, 0.99291168, 0.98983209, 0.97511924,
0.96134615, 0.96382634, 0.95011401, 0.9434686, 0.94637765],
[1.05712571, 1.05459419, 1.05753012, 1.04880786, 1.05103857,
1.04800023, 1.03024941, 1.04200483, 1.0402554, 1.03296979,
1.02191682, 1.02476275, 1.02347523, 1.02517684, 1.04359571],
[1.07084189, 1.06669497, 1.07937623, 1.07387988, 1.0794043,
1.0531801, 1.07452771, 1.09383478, 1.1052447, 1.10322136,
1.09167939, 1.08772756, 1.08859544, 1.09177338, 1.1096083],
[0.86719222, 0.86628896, 0.86675156, 0.86425632, 0.86511809,
0.86287327, 0.85169796, 0.85411285, 0.84886336, 0.84517414,
0.84843858, 0.84488343, 0.83374329, 0.82812044, 0.82878599],
[0.88389211, 0.92288667, 0.90282398, 0.91229186, 0.92023286,
0.92652175, 0.94278865, 0.93682452, 0.98655146, 0.992237,
0.9798497, 0.93869677, 0.96947771, 1.00362626, 0.98102351],
[0.97082064, 0.95320233, 0.94534081, 0.94215593, 0.93967,
0.93092109, 0.92662519, 0.93412152, 0.93501274, 0.92879506,
0.92110542, 0.91035556, 0.90430364, 0.89994694, 0.90073864],
[0.95861858, 0.95774543, 0.98254811, 0.98919472, 0.98684824,
0.98882205, 0.97662234, 0.95601578, 0.94905385, 0.94934888,
0.97152609, 0.97163004, 0.9700702, 0.97158948, 0.95884908],
[0.83980439, 0.84726737, 0.85747, 0.85467221, 0.8556751,
0.84818516, 0.85265681, 0.84502402, 0.82645665, 0.81743586,
0.83550406, 0.83338919, 0.83511679, 0.82136617, 0.80921874],
[0.95118156, 0.9466212, 0.94688098, 0.9508583, 0.9512441,
0.95440787, 0.96364363, 0.96804412, 0.97136214, 0.97583768,
0.95571724, 0.96895368, 0.97001634, 0.97082733, 0.98782366],
[1.08910044, 1.08248968, 1.08492895, 1.08656923, 1.09454249,
1.10558188, 1.1214086, 1.12292577, 1.13021031, 1.13342735,
1.14686068, 1.14502975, 1.14474747, 1.14084037, 1.16142926],
[1.06336033, 1.07365823, 1.08691496, 1.09764846, 1.11669863,
1.11856702, 1.09764283, 1.08815849, 1.08044313, 1.09278827,
1.07003204, 1.08398066, 1.09831768, 1.09298232, 1.09176125],
[0.79772065, 0.78829196, 0.78581151, 0.77615922, 0.77035744,
0.77751194, 0.79902974, 0.81437881, 0.80788828, 0.79603865,
0.78966436, 0.79949807, 0.80172182, 0.82168155, 0.85587911],
[1.0052447, 1.00007696, 1.00475899, 1.00613942, 1.00639561,
1.00162979, 0.99860739, 1.00814981, 1.00574316, 0.99030032,
0.97682565, 0.97292596, 0.96519561, 0.96173403, 0.95890284],
[0.95808419, 0.9382568, 0.9654441, 0.95561201, 0.96987289,
0.96608031, 0.99727185, 1.00781194, 1.03484236, 1.05333619,
1.0983263, 1.1704974, 1.17025154, 1.18730553, 1.14242645]])
self.assertTrue(np.allclose(result, expected))
self.assertTrue(type(result) == type(expected))
self.assertTrue(result.shape == expected.shape)
def test_rebin_data(self):
"""Test rebin_data"""
# sample in double the time (even case since 10 % 2 = 0):
# (0+1)/2, (2+3)/2, (4+5)/2, (6+7)/2, (8+9)/2
# = 0.5, 2.5, 4.5, 6.5, 8.5
ans_even = np.array([(i + 0.5) * np.ones(10, dtype=float)
for i in range(0, 10, 2)]).T
self.assertTrue(
np.array_equal(std.rebin_data(self.time_data, 2), ans_even))
# sample in triple the time (uneven since 10 % 3 = 1):
# (0+1+2)/3, (3+4+5)/3, (6+7+8)/3, (9)/1
# = 1, 4, 7, 9
ans_odd = np.array([i * np.ones(10, dtype=float)
for i in (1, 4, 7, 9)]).T
self.assertTrue(
np.array_equal(std.rebin_data(self.time_data, 3), ans_odd))
def test_get_prob_dist(self):
"""Test get_prob_dist"""
lag_indices = np.array([1, 2, 3, 4])
unit_indices = np.array([1, 3, 2, 4])
answer = np.array([
[0.0754717, 0.88207547, 0.04245283, 0., 0.],
[0., 0., 0.09411765, 0.87058824, 0.03529412],
[0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505],
[0., 0., 0., 0.02352941, 0.97647059]
])
result = std.get_prob_dist(self.transition_matrix,
lag_indices, unit_indices)
self.assertTrue(np.array_equal(result, answer))
def test_get_prob_stats(self):
"""Test get_prob_stats"""
probs = np.array([
[0.0754717, 0.88207547, 0.04245283, 0., 0.],
[0., 0., 0.09411765, 0.87058824, 0.03529412],
[0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505],
[0., 0., 0., 0.02352941, 0.97647059]
])
unit_indices = np.array([1, 3, 2, 4])
answer_up = np.array([0.04245283, 0.03529412, 0.12376238, 0.])
answer_down = np.array([0.0754717, 0.09411765, 0.0990099, 0.02352941])
answer_trend = np.array([-0.03301887 / 0.88207547,
-0.05882353 / 0.87058824,
0.02475248 / 0.77722772,
-0.02352941 / 0.97647059])
answer_volatility = np.array([0.34221495, 0.33705421,
0.29226542, 0.38834223])
result = std.get_prob_stats(probs, unit_indices)
result_up = result[0]
result_down = result[1]
result_trend = result[2]
result_volatility = result[3]
self.assertTrue(np.allclose(result_up, answer_up))
self.assertTrue(np.allclose(result_down, answer_down))
self.assertTrue(np.allclose(result_trend, answer_trend))
self.assertTrue(np.allclose(result_volatility, answer_volatility))

View File

@@ -0,0 +1,6 @@
"""Import all modules"""
import crankshaft.random_seeds
import crankshaft.clustering
import crankshaft.space_time_dynamics
import crankshaft.segmentation
import analysis_data_provider

View File

@@ -0,0 +1,67 @@
"""class for fetching data"""
import plpy
import pysal_utils as pu
class AnalysisDataProvider:
def get_getis(self, w_type, params):
"""fetch data for getis ord's g"""
try:
query = pu.construct_neighbor_query(w_type, params)
result = plpy.execute(query)
# if there are no neighbors, exit
if len(result) == 0:
return pu.empty_zipped_array(4)
else:
return result
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
def get_markov(self, w_type, params):
"""fetch data for spatial markov"""
try:
query = pu.construct_neighbor_query(w_type, params)
data = plpy.execute(query)
if len(data) == 0:
return pu.empty_zipped_array(4)
return data
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
def get_moran(self, w_type, params):
"""fetch data for moran's i analyses"""
try:
query = pu.construct_neighbor_query(w_type, params)
data = plpy.execute(query)
# if there are no neighbors, exit
if len(data) == 0:
return pu.empty_zipped_array(2)
return data
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
return pu.empty_zipped_array(2)
def get_nonspatial_kmeans(self, query):
"""fetch data for non-spatial kmeans"""
try:
data = plpy.execute(query)
return data
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
def get_spatial_kmeans(self, params):
"""fetch data for spatial kmeans"""
query = ("SELECT "
"array_agg({id_col} ORDER BY {id_col}) as ids,"
"array_agg(ST_X({geom_col}) ORDER BY {id_col}) As xs,"
"array_agg(ST_Y({geom_col}) ORDER BY {id_col}) As ys "
"FROM ({subquery}) As a "
"WHERE {geom_col} IS NOT NULL").format(**params)
try:
data = plpy.execute(query)
return data
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
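# --- Hedged illustration of the rendered spatial k-means query. With a
# typical params dict (the subquery is invented for this example):
#   {"subquery": "SELECT * FROM stores",
#    "geom_col": "the_geom", "id_col": "cartodb_id"}
# the template above formats to:
#   SELECT array_agg(cartodb_id ORDER BY cartodb_id) as ids,
#          array_agg(ST_X(the_geom) ORDER BY cartodb_id) As xs,
#          array_agg(ST_Y(the_geom) ORDER BY cartodb_id) As ys
#   FROM (SELECT * FROM stores) As a
#   WHERE the_geom IS NOT NULL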

View File

@@ -0,0 +1,4 @@
"""Import all functions from for clustering"""
from moran import *
from kmeans import *
from getis import *

View File

@@ -0,0 +1,50 @@
"""
Getis-Ord's G geostatistics (hotspot/coldspot analysis)
"""
import pysal as ps
from collections import OrderedDict
# crankshaft modules
import crankshaft.pysal_utils as pu
from crankshaft.analysis_data_provider import AnalysisDataProvider
# High level interface ---------------------------------------
class Getis:
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()
else:
self.data_provider = data_provider
def getis_ord(self, subquery, attr,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Getis-Ord's G*
Implementation building neighbors with a PostGIS database and PySAL's
Getis-Ord's G* hotspot/coldspot module.
Andy Eschbacher
"""
# geometries with null attributes are ignored, so a kNN neighborhood
# can end up with more distant neighbors than expected
qvals = OrderedDict([("id_col", id_col),
("attr1", attr),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_getis(w_type, qvals)
attr_vals = pu.get_attributes(result)
# build PySAL weight object
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate Getis-Ord's G* z- and p-values
getis = ps.esda.getisord.G_Local(attr_vals, weight,
star=True, permutations=permutations)
return zip(getis.z_sim, getis.p_sim, getis.p_z_sim, weight.id_order)
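# --- Hedged usage sketch: driving Getis.getis_ord() without a database by
# injecting a stand-in provider (the same pattern the unit tests use). The
# class name, ids, values, and neighbor lists below are invented.
class FakeGetisProvider(object):
    def get_getis(self, w_type, params):
        return [{'id': 1, 'attr1': 0.9, 'neighbors': [2, 3]},
                {'id': 2, 'attr1': 0.1, 'neighbors': [1, 3]},
                {'id': 3, 'attr1': 0.5, 'neighbors': [1, 2]}]

# rows come back as (z_score, p_sim, p_z_sim, id) tuples
rows = Getis(FakeGetisProvider()).getis_ord('subquery', 'attr', 'knn',
                                            2, 99, 'the_geom', 'cartodb_id')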

View File

@@ -0,0 +1,32 @@
from sklearn.cluster import KMeans
import numpy as np
from crankshaft.analysis_data_provider import AnalysisDataProvider
class Kmeans:
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()
else:
self.data_provider = data_provider
def spatial(self, query, no_clusters, no_init=20):
"""
find centers based on clusters of latitude/longitude pairs
query: SQL query that has a WGS84 geometry (the_geom)
"""
params = {"subquery": query,
"geom_col": "the_geom",
"id_col": "cartodb_id"}
data = self.data_provider.get_spatial_kmeans(params)
# Unpack query response
xs = data[0]['xs']
ys = data[0]['ys']
ids = data[0]['ids']
km = KMeans(n_clusters=no_clusters, n_init=no_init)
labels = km.fit_predict(zip(xs, ys))
return zip(ids, labels)
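# --- Hedged usage sketch: k-means on mocked coordinates, mirroring the unit
# tests. A stand-in provider supplies xs/ys/ids so spatial() runs without a
# database; the coordinates are invented (two well-separated groups, so two
# clean clusters come back, though the label numbering may be swapped).
class FakeKmeansProvider(object):
    def get_spatial_kmeans(self, params):
        return [{'xs': [0.0, 0.1, 10.0, 10.1],
                 'ys': [0.0, 0.1, 10.0, 10.1],
                 'ids': [1, 2, 3, 4]}]

kmeans = Kmeans(FakeKmeansProvider())
print(kmeans.spatial('subquery ignored by the fake provider', 2))
# e.g. [(1, 0), (2, 0), (3, 1), (4, 1)]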

View File

@@ -0,0 +1,208 @@
"""
Moran's I geostatistics (global clustering & outliers presence)
"""
# TODO: Fill in local neighbors which have null/NoneType values with the
# average of their neighborhood
import pysal as ps
from collections import OrderedDict
from crankshaft.analysis_data_provider import AnalysisDataProvider
# crankshaft module
import crankshaft.pysal_utils as pu
# High level interface ---------------------------------------
class Moran:
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()
else:
self.data_provider = data_provider
def global_stat(self, subquery, attr_name,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I (global)
Implementation building neighbors with a PostGIS database and Moran's I
core clusters with PySAL.
Andy Eschbacher
"""
params = OrderedDict([("id_col", id_col),
("attr1", attr_name),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
# collect attributes
attr_vals = pu.get_attributes(result)
# calculate weights
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate moran global
moran_global = ps.esda.moran.Moran(attr_vals, weight,
permutations=permutations)
return zip([moran_global.I], [moran_global.EI])
def local_stat(self, subquery, attr,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I implementation for PL/Python
Andy Eschbacher
"""
# Geometries with null-valued attributes are ignored. For kNN weights,
# this can yield neighbors that are farther away than expected.
params = OrderedDict([("id_col", id_col),
("attr1", attr),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
attr_vals = pu.get_attributes(result)
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local(attr_vals, weight,
permutations=permutations)
# find quadrants for each geometry
quads = quad_position(lisa.q)
return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)
def global_rate_stat(self, subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I Rate (global)
Andy Eschbacher
"""
params = OrderedDict([("id_col", id_col),
("attr1", numerator),
("attr2", denominator),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
# collect attributes
numer = pu.get_attributes(result, 1)
denom = pu.get_attributes(result, 2)
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate moran global rate
lisa_rate = ps.esda.moran.Moran_Rate(numer, denom, weight,
permutations=permutations)
return zip([lisa_rate.I], [lisa_rate.EI])
def local_rate_stat(self, subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I Local Rate
Andy Eschbacher
"""
# Geometries with null values are ignored. For kNN weights,
# this can yield neighbors that are farther away than expected.
params = OrderedDict([("id_col", id_col),
("numerator", numerator),
("denominator", denominator),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
# collect attributes
numer = pu.get_attributes(result, 1)
denom = pu.get_attributes(result, 2)
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local_Rate(numer, denom, weight,
permutations=permutations)
# find quadrants for each geometry
quads = quad_position(lisa.q)
return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)
def local_bivariate_stat(self, subquery, attr1, attr2,
permutations, geom_col, id_col,
w_type, num_ngbrs):
"""
Moran's I (local) Bivariate (untested)
"""
params = OrderedDict([("id_col", id_col),
("attr1", attr1),
("attr2", attr2),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
# collect attributes
attr1_vals = pu.get_attributes(result, 1)
attr2_vals = pu.get_attributes(result, 2)
# create weights
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local_BV(attr1_vals, attr2_vals, weight,
permutations=permutations)
# find clustering of significance
lisa_sig = quad_position(lisa.q)
return zip(lisa.Is, lisa_sig, lisa.p_sim, weight.id_order)
# Low level functions ----------------------------------------
def map_quads(coord):
"""
Map a quadrant number to Moran's I designation
HH=1, LH=2, LL=3, HL=4
Input:
@param coord (int): quadrant of a specific measurement
Output:
classification (one of 'HH', 'LH', 'LL', or 'HL')
"""
if coord == 1:
return 'HH'
elif coord == 2:
return 'LH'
elif coord == 3:
return 'LL'
elif coord == 4:
return 'HL'
else:
return None
def quad_position(quads):
"""
Produce Moran's I classifications from quadrant numbers
Input:
@param quads ndarray: an array of quads classified by
1-4 (PySAL default)
Output:
list: an array of quads classified as 'HH', 'LL', etc.
"""
return [map_quads(q) for q in quads]

View File
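The quadrant helpers at the bottom are easy to sanity-check by hand; for example:

print(quad_position([1, 3, 2, 4, 7]))
# ['HH', 'LL', 'LH', 'HL', None]  -- codes outside 1-4 map to None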

@@ -0,0 +1,2 @@
"""Import all functions for pysal_utils"""
from crankshaft.pysal_utils.pysal_utils import *

View File

@@ -0,0 +1,211 @@
"""
Utilities module for generic PySAL functionality, mainly centered on
translating queries into numpy arrays or PySAL weights objects
"""
import numpy as np
import pysal as ps
def construct_neighbor_query(w_type, query_vals):
"""Return query (a string) used for finding neighbors
@param w_type text: type of neighbors to calculate ('knn' or 'queen')
@param query_vals dict: values used to construct the query
"""
if w_type.lower() == 'knn':
return knn(query_vals)
else:
return queen(query_vals)
# Build weight object
def get_weight(query_res, w_type='knn', num_ngbrs=5):
"""
Construct PySAL weight from return value of query
@param query_res dict-like: query results with attributes and neighbors
"""
# if w_type.lower() == 'knn':
# row_normed_weights = [1.0 / float(num_ngbrs)] * num_ngbrs
# weights = {x['id']: row_normed_weights for x in query_res}
# else:
# weights = {x['id']: [1.0 / len(x['neighbors'])] * len(x['neighbors'])
# if len(x['neighbors']) > 0
# else [] for x in query_res}
neighbors = {x['id']: x['neighbors'] for x in query_res}
built_weight = ps.W(neighbors)
built_weight.transform = 'r'
return built_weight
def query_attr_select(params):
"""
Create portion of SELECT statement for attributes involved in the query.
Aliases (attr1, attr2, ...) default to the order of the columns in params.
@param params: dict of information used in query (column names,
table name, etc.)
Example:
OrderedDict([('numerator', 'price'),
('denominator', 'sq_meters'),
('subquery', 'SELECT * FROM interesting_data')])
Output:
"i.\"price\"::numeric As attr1, " \
"i.\"sq_meters\"::numeric As attr2, "
"""
attr_string = ""
template = "i.\"%(col)s\"::numeric As attr%(alias_num)s, "
if 'time_cols' in params:
# if markov analysis
attrs = params['time_cols']
for idx, val in enumerate(attrs):
attr_string += template % {"col": val, "alias_num": idx + 1}
else:
# if moran's analysis
attrs = [k for k in params
if k not in ('id_col', 'geom_col', 'subquery', 'num_ngbrs')]
for idx, val in enumerate(attrs):
attr_string += template % {"col": params[val],
"alias_num": idx + 1}
return attr_string
def query_attr_where(params):
"""
Construct WHERE conditions for the neighbors query,
weeding out rows whose attribute values are NULL
Input: dict of params:
{'subquery': ...,
'numerator': 'data1',
'denominator': 'data2',
'etc': ...}
Output:
'idx_replace."data1" IS NOT NULL AND idx_replace."data2" IS NOT NULL'
Input:
{'subquery': ...,
'time_cols': ['time1', 'time2', 'time3'],
'etc': ...}
Output: 'idx_replace."time1" IS NOT NULL AND idx_replace."time2" IS NOT
NULL AND idx_replace."time3" IS NOT NULL'
"""
attr_string = []
template = "idx_replace.\"%s\" IS NOT NULL"
if 'time_cols' in params:
# markov where clauses
attrs = params['time_cols']
# add values to template
for attr in attrs:
attr_string.append(template % attr)
else:
# moran where clauses
# get keys
attrs = [k for k in params
if k not in ('id_col', 'geom_col', 'subquery', 'num_ngbrs')]
# add values to template
for attr in attrs:
attr_string.append(template % params[attr])
if 'denominator' in attrs:
attr_string.append(
"idx_replace.\"%s\" <> 0" % params['denominator'])
out = " AND ".join(attr_string)
return out
def knn(params):
"""SQL query for k-nearest neighbors.
@param params dict: values used to fill the query template
"""
attr_select = query_attr_select(params)
attr_where = query_attr_where(params)
replacements = {"attr_select": attr_select,
"attr_where_i": attr_where.replace("idx_replace", "i"),
"attr_where_j": attr_where.replace("idx_replace", "j")}
query = "SELECT " \
"i.\"{id_col}\" As id, " \
"%(attr_select)s" \
"(SELECT ARRAY(SELECT j.\"{id_col}\" " \
"FROM ({subquery}) As j " \
"WHERE " \
"i.\"{id_col}\" <> j.\"{id_col}\" AND " \
"%(attr_where_j)s " \
"ORDER BY " \
"j.\"{geom_col}\" <-> i.\"{geom_col}\" ASC " \
"LIMIT {num_ngbrs})" \
") As neighbors " \
"FROM ({subquery}) As i " \
"WHERE " \
"%(attr_where_i)s " \
"ORDER BY i.\"{id_col}\" ASC;" % replacements
return query.format(**params)
# SQL query for finding queen neighbors (all contiguous polygons)
def queen(params):
"""SQL query for queen neighbors.
@param params dict: information to fill query
"""
attr_select = query_attr_select(params)
attr_where = query_attr_where(params)
replacements = {"attr_select": attr_select,
"attr_where_i": attr_where.replace("idx_replace", "i"),
"attr_where_j": attr_where.replace("idx_replace", "j")}
query = "SELECT " \
"i.\"{id_col}\" As id, " \
"%(attr_select)s" \
"(SELECT ARRAY(SELECT j.\"{id_col}\" " \
"FROM ({subquery}) As j " \
"WHERE i.\"{id_col}\" <> j.\"{id_col}\" AND " \
"ST_Touches(i.\"{geom_col}\", j.\"{geom_col}\") AND " \
"%(attr_where_j)s)" \
") As neighbors " \
"FROM ({subquery}) As i " \
"WHERE " \
"%(attr_where_i)s " \
"ORDER BY i.\"{id_col}\" ASC;" % replacements
return query.format(**params)
# To add more weight methods, open a ticket or pull request
def get_attributes(query_res, attr_num=1):
"""
@param query_res: query results with attributes and neighbors
@param attr_num: attribute number (1, 2, ...)
"""
return np.array([x['attr' + str(attr_num)] for x in query_res],
dtype=np.float)
def empty_zipped_array(num_nones):
"""
prepare return values for cases of empty weights objects (no neighbors)
Input:
@param num_nones int: number of columns (e.g., 4)
Output:
[(None, None, None, None)]
"""
return [tuple([None] * num_nones)]

View File
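A sketch tying these helpers together with invented inputs: generate the kNN SQL from an OrderedDict of query values, then build a row-standardized weights object from a mock result set:

from collections import OrderedDict

qvals = OrderedDict([('id_col', 'cartodb_id'),
                     ('attr1', 'price'),
                     ('geom_col', 'the_geom'),
                     ('subquery', 'SELECT * FROM homes'),
                     ('num_ngbrs', 5)])
sql = construct_neighbor_query('knn', qvals)  # ready for plpy.execute

# mock rows shaped like the query's output
rows = [{'id': 1, 'attr1': 10.0, 'neighbors': [2, 3]},
        {'id': 2, 'attr1': 12.0, 'neighbors': [1, 3]},
        {'id': 3, 'attr1': 11.0, 'neighbors': [1, 2]}]
weight = get_weight(rows, 'knn', 2)   # PySAL W, row-standardized
vals = get_attributes(rows)           # numpy array([10., 12., 11.])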

@@ -0,0 +1,11 @@
"""Random seed generator used for non-deterministic functions in crankshaft"""
import random
import numpy
def set_random_seeds(value):
"""
Set the seeds of the RNGs (Random Number Generators)
used internally.
"""
random.seed(value)
numpy.random.seed(value)

View File
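Seeding both RNGs makes permutation-based results (e.g., p-values from the Moran and Getis analyses) reproducible across runs; a quick check:

set_random_seeds(1234)
first = numpy.random.rand(3)
set_random_seeds(1234)
second = numpy.random.rand(3)
assert (first == second).all()  # identical draws after re-seeding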

@@ -0,0 +1 @@
from segmentation import *

Some files were not shown because too many files have changed in this diff.