Compare commits

...

83 Commits

Author SHA1 Message Date
Andy Eschbacher
2cd6cdd4a6 fixes variale name error 2018-02-16 13:31:17 -05:00
Andy Eschbacher
c48ae50ade small syntax fixes 2018-02-09 16:22:10 -05:00
Stuart Lynn
d2574c20ef update to the kmeans - balanced code 2018-02-09 15:55:04 -05:00
Stuart Lynn
7b42beb82f Merge branch 'develop' into balanced_kmeans 2018-01-29 16:41:58 -05:00
Stuart Lynn
9b7eda798c inital commit for balanced k means implementation 2018-01-29 10:51:13 -05:00
Andy Eschbacher
eefb0d7990 Merge pull request #165 from CartoDB/add-PIA
Add multipolygons and geometry collections support to PIA analyssis
2018-01-10 17:10:02 -05:00
Andy Eschbacher
628fd2b839 adds test on multipolygon 2018-01-10 16:45:36 -05:00
Andy Eschbacher
807a5373e8 adds simple test 2018-01-10 16:35:23 -05:00
Andy Eschbacher
2e0a6f7482 Merge branch 'develop' into add-PIA 2018-01-10 15:30:51 -05:00
Andy Eschbacher
edb1cb5049 Merge pull request #150 from CartoDB/add-nonspatial-kmeans-w-class-framework
Add nonspatial kmeans w class framework
2018-01-10 13:15:13 -05:00
Andy Eschbacher
e5285a2700 adds parallel marker for plpgsql function 2018-01-10 10:57:51 -05:00
Andy Eschbacher
068f43de10 adds test for exception if no data is present 2018-01-09 15:45:03 -05:00
Andy Eschbacher
604f20bb21 grrr adds comments to test expectation 2018-01-09 15:22:14 -05:00
Andy Eschbacher
cfd988c338 uses exact same query text for expectation 2018-01-09 15:14:38 -05:00
Andy Eschbacher
04f290cbad finalizes test query 2018-01-09 15:06:44 -05:00
Andy Eschbacher
18fbc2fa9e updates sql query / fixes error 2018-01-09 14:39:24 -05:00
Andy Eschbacher
b0e3f38f1e correctly finds the number of columns 2018-01-09 14:28:20 -05:00
Andy Eschbacher
b16c73f7d0 Merge branch 'develop' into add-nonspatial-kmeans-w-class-framework 2018-01-09 13:47:29 -05:00
Andy Eschbacher
20104c2df9 adds sql tests for nonspatial kmeans 2018-01-09 13:35:00 -05:00
Andy Eschbacher
49a317ae8e syntax updates / consistency 2018-01-09 13:29:36 -05:00
Andy Eschbacher
001062f660 adds inertia as an output column 2018-01-09 13:02:55 -05:00
Andy Eschbacher
5e0fbf0f6f syntax updates 2018-01-09 13:02:41 -05:00
Andy Eschbacher
e5a03fce82 Merge pull request #157 from CartoDB/add-errors-on-null-only
catch empty return values and error on them
2018-01-09 11:46:41 -05:00
Andy Eschbacher
32bb3b1276 adds missing decorator for gwr_predict 2018-01-09 11:37:27 -05:00
Andy Eschbacher
72260d02aa Merge branch 'develop' into add-errors-on-null-only 2018-01-09 11:34:23 -05:00
Andy Eschbacher
65c7841a7a Merge pull request #179 from CartoDB/update-moran-docs
Update moran docs
2018-01-09 11:30:45 -05:00
Andy Eschbacher
d89e07328e Merge branch 'develop' into update-moran-docs 2018-01-09 11:24:36 -05:00
Andy Eschbacher
4ece712cae Merge pull request #155 from CartoDB/update-markov-docs-null
adds caveats about usage
2018-01-09 11:22:10 -05:00
Andy Eschbacher
8bbfac0dbc removes redundant notes section 2018-01-09 11:17:09 -05:00
Andy Eschbacher
200c3da3cb adds better intro and placement for notes 2018-01-09 11:08:35 -05:00
Andy Eschbacher
83048cdf72 Merge branch 'develop' into update-markov-docs-null 2018-01-09 10:54:09 -05:00
Andy Eschbacher
f1428a3e36 Merge pull request #193 from CartoDB/add-schema-to-docs-examples
updates examples to include schema
2018-01-09 10:51:49 -05:00
Andy Eschbacher
b4ddfa1c5b Merge branch 'develop' into add-errors-on-null-only 2018-01-09 10:30:50 -05:00
Andy Eschbacher
77e73dbc75 updates error syntax 2018-01-09 10:23:38 -05:00
Andy Eschbacher
0e99a14653 Merge branch 'develop' into update-moran-docs 2018-01-08 16:41:19 -05:00
Andy Eschbacher
92becac280 syntax fixes / function name fix 2018-01-08 16:30:03 -05:00
Andy Eschbacher
e28f00d98b updates other examples to use correct schema and consistent syntax 2018-01-02 10:36:10 -05:00
Andy Eschbacher
7effd39f16 updates examples to include schema 2018-01-02 10:20:59 -05:00
Raúl Marín
0194c036f6 Merge branch 'develop' into master 2017-11-27 13:17:02 +01:00
Raul Marin
835f80b1b6 Update RELEASE process 2017-11-23 09:49:25 +01:00
Raul Marin
192f89bf4d Makefile: Strip PARALLEL tags on deploy
Since releases are commited into the proyect instead of stripping the labels
when generating the release file, we need to do it before deploying (depending
on the PG release)
2017-11-23 09:49:25 +01:00
Raul Marin
de0bbb0bd3 Categorize GWR sql functions 2017-11-23 09:49:25 +01:00
Raul Marin
1508e5090c Update CONTRIBUTING with information about PG function labels 2017-11-23 09:49:25 +01:00
Raul Marin
e71e9b10b6 Avoid regress error with exceptions under some verbosity levels 2017-11-23 09:49:25 +01:00
Raul Marin
0a15857a54 Add PARALLEL and VOLATILE categories to PG functions 2017-11-23 09:49:25 +01:00
Raul Marin
ddd2b4e3bf Makefile: Add support for PARALLEL categories 2017-11-23 09:49:25 +01:00
Andy Eschbacher
10a1d03287 updates description 2017-09-05 08:48:38 -04:00
Andy Eschbacher
b9d739327f add description of 'standardized' output 2017-09-05 08:42:00 -04:00
abelvm
7f5edb26b0 fixed corner case 2017-03-28 14:13:49 +02:00
abelvm
00327e6de2 fixed indexing 2017-03-28 13:38:21 +02:00
abelvm
c252c18adc fixed indexing 2017-03-28 13:32:10 +02:00
abelvm
3bfdb8d3cc Merge branch 'develop' of github.com:CartoDB/crankshaft into add-PIA 2017-03-28 13:03:01 +02:00
abelvm
47251daa5f fixed corner case centroid=PIA 2017-03-28 13:02:37 +02:00
Andy Eschbacher
06746b4c65 corrected code snippet on weighted mean 2017-01-19 08:40:10 -05:00
abelvm
d8604f3c9b agg set error fix 2017-01-18 21:40:36 +01:00
abelvm
e03c3eece2 semi colon fix 2017-01-18 21:32:42 +01:00
abelvm
32117c7480 Merge branch 'develop' of github.com:CartoDB/crankshaft into add-PIA 2017-01-18 17:28:14 +01:00
abelvm
9ab51027fc support multi 2017-01-18 17:28:06 +01:00
Andy Eschbacher
8e4bbb8a90 add default return value on verify_data wrapper 2017-01-13 14:07:31 -05:00
Andy Eschbacher
be2bf19c0a removes print line 2017-01-12 17:14:32 -05:00
Andy Eschbacher
ddd69bb457 adds mock error function 2017-01-12 17:12:40 -05:00
Andy Eschbacher
04bd067045 standardizing naming conventions in code 2017-01-12 17:12:09 -05:00
Andy Eschbacher
4b3481b1a6 adds decorators to reduce boilerplate code 2017-01-12 17:03:01 -05:00
Andy Eschbacher
7322931ca1 classes to inherit from objects 2017-01-12 12:00:36 -05:00
Andy Eschbacher
e456158cbf removes unneeded function / multilines some queries 2017-01-10 15:12:24 -05:00
Andy Eschbacher
50f6ef0fcc remove unnecessary code / tests 2017-01-10 15:01:44 -05:00
Andy Eschbacher
ca7a2d6e36 update verify_data to get full data reference 2017-01-10 15:00:59 -05:00
Andy Eschbacher
c114ccea33 add condition on null-valued geometries, ref: #143 2017-01-10 14:38:16 -05:00
Andy Eschbacher
10ce109d09 fix typo on error return 2017-01-10 14:37:07 -05:00
Andy Eschbacher
d679975f72 catch empty return values and error on them 2017-01-10 13:53:37 -05:00
Andy Eschbacher
ee5e7d81ae standardize id_col naming convention 2017-01-10 10:52:10 -05:00
Andy Eschbacher
a32b212412 finish docs for kmeans nonspatial 2017-01-10 10:43:42 -05:00
Andy Eschbacher
69f38dd52e change parameter name to align with kmeans.spatial 2017-01-10 09:52:46 -05:00
Andy Eschbacher
c6f64ad2f4 bug fixes and adding of internal docs 2017-01-10 09:49:16 -05:00
Andy Eschbacher
7afb6948a4 adds caveats about usage 2017-01-03 10:34:06 -05:00
Andy Eschbacher
3dad9c6044 update key name in test 2016-12-06 13:45:04 -05:00
Andy Eschbacher
bb5de09d6d update order of query gen 2016-12-06 12:49:27 -05:00
Andy Eschbacher
cc0a683a26 fix query templating / response access 2016-12-06 12:26:25 -05:00
Andy Eschbacher
c884eae90e fix data provider ref 2016-12-06 10:33:51 -05:00
Andy Eschbacher
b65fa0c613 remove erroneous queryrunner class 2016-12-06 10:27:24 -05:00
Andy Eschbacher
e98f1cbce5 fix query formatting with dict 2016-12-06 10:19:13 -05:00
Andy Eschbacher
9a80244e76 adds tests and pgsql file 2016-12-06 10:14:37 -05:00
Andy Eschbacher
798e754dfb stubs in kmeans non-spatial 2016-12-05 17:14:36 -05:00
28 changed files with 1514 additions and 290 deletions

View File

@@ -1,5 +1,14 @@
## Areas of Interest Functions
A family of analyses to uncover groupings of areas with consistently high or low values (clusters) and smaller areas with values unlike those around them (outliers). A cluster is labeled by an 'HH' (high value compared to the entire dataset in an area with other high values), or its opposite 'LL'. An outlier is labeled by an 'LH' (low value surrounded by high values) or an 'HL' (the opposite). Each cluster and outlier classification has an associated p-value, a measure of how significant the pattern of highs and lows is compared to a random distribution.
These functions have two forms: local and global. The local versions classify every input geometry while the global function gives a rating of the overall clustering characteristics of the dataset. Both forms accept an optional denominator (see the rate versions) if, for example, working with count data and a denominator is needed.
### Notes
* Rows with null values will be omitted from this analysis. To ensure they are added to the analysis, fill the null-valued cells with an appropriate value such as the mean of a column, the mean of the most recent two time steps, or use a `LEFT JOIN` to get null outputs from the analysis.
* Input query can only accept tables (datasets) in the user's database account. Common table expressions (CTEs) do not work as an input unless specified within the `subquery` argument.
### CDB_AreasOfInterestLocal(subquery text, column_name text)
This function classifies your data as being part of a cluster, as an outlier, or not part of a pattern based on the significance of a classification. The classification happens through an autocorrelation statistic called Local Moran's I.
@@ -29,6 +38,7 @@ A table with the following columns.
| vals | NUMERIC | Values from `'column_name'`. |
#### Example Usage
```sql
@@ -37,8 +47,10 @@ SELECT
aoi.quads,
aoi.significance,
c.num_cyclists_per_total_population
FROM CDB_AreasOfInterestLocal('SELECT * FROM commute_data'
'num_cyclists_per_total_population') As aoi
FROM
cdb_crankshaft.CDB_AreasOfInterestLocal(
'SELECT * FROM commute_data'
'num_cyclists_per_total_population') As aoi
JOIN commute_data As c
ON c.cartodb_id = aoi.rowid;
```
@@ -71,8 +83,12 @@ A table with the following columns.
#### Examples
```sql
SELECT *
FROM CDB_AreasOfInterestGlobal('SELECT * FROM commute_data', 'num_cyclists_per_total_population')
SELECT
*
FROM
cdb_crankshaft.CDB_AreasOfInterestGlobal(
'SELECT * FROM commute_data',
'num_cyclists_per_total_population')
```
### CDB_AreasOfInterestLocalRate(subquery text, numerator_column text, denominator_column text)
@@ -102,7 +118,7 @@ A table with the following columns.
| quads | TEXT | Classification of geometry. Result is one of 'HH' (a high value with neighbors high on average), 'LL' (opposite of 'HH'), 'HL' (a high value surrounded by lows on average), and 'LH' (opposite of 'HL'). Null values are returned when nulls exist in the original data. |
| significance | NUMERIC | The statistical significance (from 0 to 1) of a cluster or outlier classification. Lower numbers are more significant. |
| rowid | INT | Row id of the values which correspond to the input rows. |
| vals | NUMERIC | Values from `'column_name'`. |
| vals | NUMERIC | Standardized rate (centered on the mean and normalized by the standard deviation) calculated from `numerator` and `denominator`. This is calculated by [Assuncao Rate](http://pysal.readthedocs.io/en/latest/library/esda/smoothing.html?highlight=assuncao#pysal.esda.smoothing.assuncao_rate) in the PySAL library. |
#### Example Usage
@@ -113,9 +129,11 @@ SELECT
aoi.quads,
aoi.significance,
c.cyclists_per_total_population
FROM CDB_AreasOfInterestLocalRate('SELECT * FROM commute_data'
'num_cyclists',
'total_population') As aoi
FROM
cdb_crankshaft.CDB_AreasOfInterestLocalRate(
'SELECT * FROM commute_data'
'num_cyclists',
'total_population') As aoi
JOIN commute_data As c
ON c.cartodb_id = aoi.rowid;
```
@@ -149,10 +167,13 @@ A table with the following columns.
#### Examples
```sql
SELECT *
FROM CDB_AreasOfInterestGlobalRate('SELECT * FROM commute_data',
'num_cyclists',
'total_population')
SELECT
*
FROM
cdb_crankshaft.CDB_AreasOfInterestGlobalRate(
'SELECT * FROM commute_data',
'num_cyclists',
'total_population')
```
## Hotspot, Coldspot, and Outlier Functions

View File

@@ -8,7 +8,7 @@ This function takes time series data associated with geometries and outputs like
| Name | Type | Description |
|------|------|-------------|
| subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM real_estate_history`). This query must have the geometry column name `the_geom` and id column name `cartodb_id` unless otherwise specified in the input arguments |
| subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM real_estate_history`). This query must have the geometry column name `the_geom` and id column name `cartodb_id` unless otherwise specified in the input arguments. Tables in queries must exist in user's database (i.e., no CTEs at present) |
| column_names | TEXT Array | Names of column that form the history of measurements for the geometries (e.g., `Array['y2011', 'y2012', 'y2013', 'y2014', 'y2015', 'y2016']`). |
| num_classes (optional) | INT | Number of quantile classes to separate data into. |
| weight type (optional) | TEXT | Type of weight to use when finding neighbors. Currently available options are 'knn' (default) and 'queen'. Read more about weight types in [PySAL's weights documentation](https://pysal.readthedocs.io/en/v1.11.0/users/tutorials/weights.html). |
@@ -30,18 +30,29 @@ A table with the following columns.
| rowid | NUMERIC | id of the row that corresponds to the `id_col` (by default `cartodb_id` of the input rows) |
#### Notes
* Rows with null values will be omitted from this analysis. To ensure they are added to the analysis, fill the null-valued cells with an appropriate value such as the mean of a column, the mean of the most recent two time steps, etc.
* Input query can only accept tables (datasets) in the user's database account. Common table expressions (CTEs) do not work as an input unless specified in the `subquery` parameter.
#### Example Usage
```sql
SELECT
c.cartodb_id,
c.the_geom,
c.the_geom_webmercator,
m.trend,
m.trend_up,
m.trend_down,
m.volatility
FROM CDB_SpatialMarkovTrend('SELECT * FROM nyc_real_estate'
Array['m03y2009','m03y2010','m03y2011','m03y2012','m03y2013','m03y2014','m03y2015','m03y2016']) As m
FROM
cdb_crankshaft.CDB_SpatialMarkovTrend(
'SELECT * FROM nyc_real_estate'
Array['m03y2009', 'm03y2010', 'm03y2011',
'm03y2012', 'm03y2013', 'm03y2014',
'm03y2015','m03y2016']) As m
JOIN nyc_real_estate As c
ON c.cartodb_id = m.rowid;
```

View File

@@ -54,9 +54,9 @@ with t as (
SELECT
array_agg(cartodb_id::bigint) as id,
array_agg(the_geom) as g,
array_agg(coalesce(gla,0)::numeric) as w
array_agg(coalesce(gla, 0)::numeric) as w
FROM
abel.centros_comerciales_de_madrid
centros_comerciales_de_madrid
WHERE not no_cc
),
s as (
@@ -67,12 +67,15 @@ SELECT
FROM
sscc_madrid
)
select
SELECT
g.the_geom,
trunc(g.h,2) as h,
trunc(g.h, 2) as h,
round(g.hpop) as hpop,
trunc(g.dist/1000,2) as dist_km
FROM t, s, CDB_Gravity1(t.id, t.g, t.w, s.id, s.g, s.p, newmall_ID, 100000, 5000) g
trunc(g.dist/1000, 2) as dist_km
FROM
t,
s,
cdb_crankshaft.CDB_Gravity(t.id, t.g, t.w, s.id, s.g, s.p, newmall_ID, 100000, 5000) as g
```

View File

@@ -44,11 +44,18 @@ Default values:
#### Example Usage
```sql
with a as (
select
WITH a as (
SELECT
array_agg(the_geom) as geomin,
array_agg(temp::numeric) as colin
from table_4804232032
FROM table_4804232032
)
SELECT CDB_SpatialInterpolation(geomin, colin, CDB_latlng(41.38, 2.15),1) FROM a;
SELECT
cdb_crankshaft.CDB_SpatialInterpolation(
geomin,
colin,
CDB_latlng(41.38, 2.15),
1)
FROM
a
```

View File

@@ -27,12 +27,20 @@ PostGIS will include this in future versions ([doc for dev branch](http://postgis
```sql
WITH a AS (
SELECT
ARRAY[ST_GeomFromText('POINT(2.1744 41.403)', 4326),ST_GeomFromText('POINT(2.1228 41.380)', 4326),ST_GeomFromText('POINT(2.1511 41.374)', 4326),ST_GeomFromText('POINT(2.1528 41.413)', 4326),ST_GeomFromText('POINT(2.165 41.391)', 4326),ST_GeomFromText('POINT(2.1498 41.371)', 4326),ST_GeomFromText('POINT(2.1533 41.368)', 4326),ST_GeomFromText('POINT(2.131386 41.41399)', 4326)] AS geomin
ARRAY[
ST_GeomFromText('POINT(2.1744 41.403)', 4326),
ST_GeomFromText('POINT(2.1228 41.380)', 4326),
ST_GeomFromText('POINT(2.1511 41.374)', 4326),
ST_GeomFromText('POINT(2.1528 41.413)', 4326),
ST_GeomFromText('POINT(2.165 41.391)', 4326),
ST_GeomFromText('POINT(2.1498 41.371)', 4326),
ST_GeomFromText('POINT(2.1533 41.368)', 4326),
ST_GeomFromText('POINT(2.131386 41.41399)', 4326)
] AS geomin
)
SELECT
st_transform(
(st_dump(CDB_voronoi(geomin, 0.2, 1e-9)
)).geom
, 3857) as the_geom_webmercator
ST_TRANSFORM(
(ST_Dump(cdb_crankshaft.CDB_Voronoi(geomin, 0.2, 1e-9))).geom,
3857) as the_geom_webmercator
FROM a;
```

View File

@@ -1,17 +1,17 @@
## K-Means Functions
### CDB_KMeans(subquery text, no_clusters INTEGER)
k-means clustering is a popular technique for finding clusters in data by minimizing the intra-cluster 'distance' and maximizing the inter-cluster 'distance'. The distance is defined in the parameter space of the variables entered.
This function attempts to find n clusters within the input data. It will return a table to CartoDB ids and
the number of the cluster each point in the input was assigend to.
### CDB_KMeans(subquery text, no_clusters integer)
This function attempts to find `no_clusters` clusters within the input data based on the geographic distribution. It will return a table with ids and the cluster classification of each point input assuming `the_geom` is not null-valued. If `the_geom` is null-valued, the point will not be considered in the analysis.
#### Arguments
| Name | Type | Description |
|------|------|-------------|
| subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM interesting_table`). This query must have the geometry column name `the_geom` and id column name `cartodb_id` unless otherwise specified in the input arguments |
| no\_clusters | INTEGER | The number of clusters to try and find |
| no\_clusters | INTEGER | The number of clusters to find |
#### Returns
@@ -19,25 +19,28 @@ A table with the following columns.
| Column Name | Type | Description |
|-------------|------|-------------|
| cartodb\_id | INTEGER | The CartoDB id of the row in the input table.|
| cluster\_no | INTEGER | The cluster that this point belongs to. |
| cartodb\_id | INTEGER | The row id of the row from the input table |
| cluster\_no | INTEGER | The cluster that this point belongs to |
#### Example Usage
```sql
SELECT
customers.*,
km.cluster_no
FROM cdb_crankshaft.CDB_Kmeans('SELECT * from customers' , 6) km, customers_3
WHERE customers.cartodb_id = km.cartodb_id
SELECT
customers.*,
km.cluster_no
FROM
cdb_crankshaft.CDB_KMeans('SELECT * from customers' , 6) As km,
customers
WHERE
customers.cartodb_id = km.cartodb_id
```
### CDB_WeightedMean(subquery text, weight_column text, category_column text)
Function that computes the weighted centroid of a number of clusters by some weight column.
### Arguments
### Arguments
| Name | Type | Description |
|------|------|-------------|
@@ -45,18 +48,75 @@ Function that computes the weighted centroid of a number of clusters by some wei
| weight\_column | TEXT | The name of the column to use as a weight |
| category\_column | TEXT | The name of the column to use as a category |
### Returns
### Returns
A table with the following columns.
| Column Name | Type | Description |
|-------------|------|-------------|
| the\_geom | GEOMETRY | A point for the weighted cluster center |
| class | INTEGER | The cluster class |
| class | INTEGER | The cluster class |
### Example Usage
### Example Usage
```sql
SELECT ST_TRANSFORM(the_geom, 3857) as the_geom_webmercator, class
FROM cdb_weighted_mean('SELECT *, customer_value FROM customers','customer_value','cluster_no')
```sql
SELECT
ST_Transform(km.the_geom, 3857) As the_geom_webmercator,
km.class
FROM
cdb_crankshaft.CDB_WeightedMean(
'SELECT *, customer_value FROM customers',
'customer_value',
'cluster_no') As km
```
## CDB_KMeansNonspatial(subquery text, colnames text[], no_clusters int)
K-means clustering classifies the rows of your dataset into `no_clusters` by finding the centers (means) of the variables in `colnames` and classifying each row by its proximity to the nearest center. This method partitions space into distinct Voronoi cells.
As a standard machine learning method, k-means clustering is an unsupervised learning technique that finds the natural clustering of values. For instance, it is useful for finding subgroups in census data leading to demographic segmentation.
### Arguments
| Name | Type | Description |
|------|------|-------------|
| query | TEXT | SQL query to expose the data to be used in the analysis (e.g., `SELECT * FROM iris_data`). It should contain at least the columns specified in `colnames` and the `id_colname`. |
| colnames | TEXT[] | Array of columns to be used in the analysis (e.g., `Array['petal_width', 'sepal_length', 'petal_length']`). |
| no\_clusters | INTEGER | Number of clusters for the classification of the data |
| id\_col (optional) | TEXT | The id column (default: 'cartodb_id') for identifying rows |
| standarize (optional) | BOOLEAN | Setting this to true (default) standardizes the data to have a mean at zero and a standard deviation of 1 |
### Returns
A table with the following columns.
| Column | Type | Description |
|--------|------|-------------|
| cluster_label | TEXT | Label that a cluster belongs to, number from 0 to `no_clusters - 1`. |
| cluster_center | JSON | Center of the cluster that a row belongs to. The keys of the JSON object are the `colnames`, with values that are the center of the respective cluster |
| silhouettes | NUMERIC | [Silhouette score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html#sklearn.metrics.silhouette_score) of the cluster label |
| inertia | NUMERIC | Sum of squared distances of samples to their closest cluster center |
| rowid | BIGINT | id of the original row for associating back with the original data |
### Example Usage
```sql
SELECT
customers.*,
km.cluster_label,
km.cluster_center,
km.silhouettes
FROM
cdb_crankshaft.CDB_KMeansNonspatial(
'SELECT * FROM customers',
Array['customer_value', 'avg_amt_spent', 'home_median_income'],
7) As km,
customers
WHERE
customers.cartodb_id = km.rowid
```
### Resources
- Read more in [scikit-learn's documentation](http://scikit-learn.org/stable/modules/clustering.html#k-means)
- [K-means basics](https://www.datascience.com/blog/introduction-to-k-means-clustering-algorithm-learn-data-science-tutorials)

View File

@@ -3,7 +3,7 @@
### CDB_CreateAndPredictSegment(query TEXT, variable_name TEXT, target_query TEXT)
This function trains a [Gradient Boosting](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) model to attempt to predict the target data and then generates predictions for new data.
This function trains a [Gradient Boosting](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) model to attempt to predict the target data and then generates predictions for new data.
#### Arguments
@@ -34,12 +34,12 @@ A table with the following columns.
SELECT * from cdb_crankshaft.CDB_CreateAndPredictSegment(
'SELECT agg, median_rent::numeric, male_pop::numeric, female_pop::numeric FROM late_night_agg',
'agg',
'SELECT row_number() OVER () As cartodb_id, median_rent, male_pop, female_pop FROM ml_learning_ny');
'SELECT row_number() OVER () As cartodb_id, median_rent, male_pop, female_pop FROM ml_learning_ny');
```
### CDB_CreateAndPredictSegment(target numeric[], train_features numeric[], prediction_features numeric[], prediction_ids numeric[])
This function trains a [Gradient Boosting](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) model to attempt to predict the target data and then generates predictions for new data.
This function trains a [Gradient Boosting](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) model to attempt to predict the target data and then generates predictions for new data.
#### Arguments
@@ -76,7 +76,7 @@ WITH training As (
FROM late_night_agg),
target AS (
SELECT cdb_crankshaft.CDB_PyAgg(Array[median_rent, male_pop, female_pop]::Numeric[]) As features,
array_agg(cartodb_id) As cartodb_ids FROM late_night_agg)
array_agg(cartodb_id) As cartodb_ids FROM late_night_agg)
SELECT cdb_crankshaft.CDB_CreateAndPredictSegment(training.target, training.features, target.features, target.cartodb_ids)
FROM training, target;

View File

@@ -23,11 +23,17 @@ Function to find the [PIA](https://en.wikipedia.org/wiki/Pole_of_inaccessibility
#### Example Usage
```sql
with a as(
select st_geomfromtext('POLYGON((-432540.453078056 4949775.20452642,-432329.947920966 4951361.232584,-431245.028163694 4952223.31516671,-429131.071033529 4951768.00415574,-424622.07505895 4952843.13503987,-423688.327170174 4953499.20752423,-424086.294349759 4954968.38274191,-423068.388925945 4954378.63345336,-423387.653225542 4953355.67417084,-420594.869840519 4953781.00230592,-416026.095299382 4951484.06849063,-412483.018546414 4951024.5410983,-410490.399661215 4954502.24032205,-408186.197521284 4956398.91417441,-407627.262358013 4959300.94633864,-406948.770061627 4959874.85407739,-404949.583326472 4959047.74518163,-402570.908447199 4953743.46829807,-400971.358683991 4952193.11680804,-403533.488084088 4949649.89857885,-406335.177028373 4950193.19571096,-407790.456731515 4952391.46015616,-412060.672398345 4950381.2389307,-410716.93482498 4949156.7509561,-408464.162289794 4943912.8940387,-409350.599394983 4942819.84896006,-408087.791091424 4942451.6711778,-407274.045613725 4940572.4807777,-404446.196589102 4939976.71501489,-402422.964843936 4940450.3670813,-401010.654464241 4939054.8061663,-397647.247369412 4940679.80737878,-395658.413346901 4940528.84765185,-395536.852462953 4938829.79565997,-394268.923462818 4938003.7277717,-393388.720249116 4934757.80596815,-392393.301362444 4934326.71675815,-392573.527618037 4932323.40974412,-393464.640141837 4931903.10653605,-393085.597275686 4931094.7353605,-398426.261165985 4929156.87541607,-398261.174361137 4926238.00816416,-394045.059966834 4925765.18668498,-392982.960705174 4926391.81893628,-393090.272694301 4927176.84692181,-391648.240010564 4924626.06386961,-391889.914625075 4923086.14787613,-394345.177314013 4923235.086036,-395550.878718795 4917812.79243978,-399009.463978251 4912927.7157945,-398948.794855767 4911941.91010796,-398092.636652078 4911806.57392519,-401991.601817112 4911722.9204501,-406225.972607907 4914505.47286319,-411104.994569885 4912569.26941163,-412925.513522316 4913030.3608866,-414630.148884835 
4914436.69169949,-414207.691417276 4919205.78028405,-418306.141109809 4917994.9580478,-424184.700779621 4918938.12432889,-426816.961458921 4923664.37379373,-420956.324227126 4923381.98014807,-420186.661267781 4924286.48693378,-420943.411166194 4926812.76394433,-419779.45457046 4928527.43466337,-419768.767899344 4930681.94459216,-421911.668097113 4930432.40620397,-423482.386112205 4933451.28047252,-427272.814773717 4934151.56473242,-427144.908678797 4939731.77191996,-428982.125554848 4940522.84445172,-428986.133056516 4942437.17281266,-431237.792396792 4947309.68284815,-432476.889648814 4947791.74800037,-432540.453078056 4949775.20452642))', 3857) as g
WITH a as (
SELECT
ST_GeomFromText(
'POLYGON((-432540.453078056 4949775.20452642,-432329.947920966 4951361.232584,-431245.028163694 4952223.31516671,-429131.071033529 4951768.00415574,-424622.07505895 4952843.13503987,-423688.327170174 4953499.20752423,-424086.294349759 4954968.38274191,-423068.388925945 4954378.63345336,-423387.653225542 4953355.67417084,-420594.869840519 4953781.00230592,-416026.095299382 4951484.06849063,-412483.018546414 4951024.5410983,-410490.399661215 4954502.24032205,-408186.197521284 4956398.91417441,-407627.262358013 4959300.94633864,-406948.770061627 4959874.85407739,-404949.583326472 4959047.74518163,-402570.908447199 4953743.46829807,-400971.358683991 4952193.11680804,-403533.488084088 4949649.89857885,-406335.177028373 4950193.19571096,-407790.456731515 4952391.46015616,-412060.672398345 4950381.2389307,-410716.93482498 4949156.7509561,-408464.162289794 4943912.8940387,-409350.599394983 4942819.84896006,-408087.791091424 4942451.6711778,-407274.045613725 4940572.4807777,-404446.196589102 4939976.71501489,-402422.964843936 4940450.3670813,-401010.654464241 4939054.8061663,-397647.247369412 4940679.80737878,-395658.413346901 4940528.84765185,-395536.852462953 4938829.79565997,-394268.923462818 4938003.7277717,-393388.720249116 4934757.80596815,-392393.301362444 4934326.71675815,-392573.527618037 4932323.40974412,-393464.640141837 4931903.10653605,-393085.597275686 4931094.7353605,-398426.261165985 4929156.87541607,-398261.174361137 4926238.00816416,-394045.059966834 4925765.18668498,-392982.960705174 4926391.81893628,-393090.272694301 4927176.84692181,-391648.240010564 4924626.06386961,-391889.914625075 4923086.14787613,-394345.177314013 4923235.086036,-395550.878718795 4917812.79243978,-399009.463978251 4912927.7157945,-398948.794855767 4911941.91010796,-398092.636652078 4911806.57392519,-401991.601817112 4911722.9204501,-406225.972607907 4914505.47286319,-411104.994569885 4912569.26941163,-412925.513522316 4913030.3608866,-414630.148884835 
4914436.69169949,-414207.691417276 4919205.78028405,-418306.141109809 4917994.9580478,-424184.700779621 4918938.12432889,-426816.961458921 4923664.37379373,-420956.324227126 4923381.98014807,-420186.661267781 4924286.48693378,-420943.411166194 4926812.76394433,-419779.45457046 4928527.43466337,-419768.767899344 4930681.94459216,-421911.668097113 4930432.40620397,-423482.386112205 4933451.28047252,-427272.814773717 4934151.56473242,-427144.908678797 4939731.77191996,-428982.125554848 4940522.84445172,-428986.133056516 4942437.17281266,-431237.792396792 4947309.68284815,-432476.889648814 4947791.74800037,-432540.453078056 4949775.20452642))',
3857) as g
),
b as (
select ST_Transform(g, 4326) as g from a
SELECT ST_Transform(g, 4326) as g
FROM a
)
SELECT st_astext(CDB_PIA(g)) from b;
SELECT
ST_AsText(cdb_crankshaft.CDB_PIA(g))
FROM b
```

View File

@@ -24,12 +24,22 @@ Returns a table object
#### Example Usage
```sql
with data as (
select
ARRAY[7.0,8.0,1.0,2.0,3.0,5.0,6.0,4.0] as colin,
ARRAY[ST_GeomFromText('POINT(2.1744 41.4036)'),ST_GeomFromText('POINT(2.1228 41.3809)'),ST_GeomFromText('POINT(2.1511 41.3742)'),ST_GeomFromText('POINT(2.1528 41.4136)'),ST_GeomFromText('POINT(2.165 41.3917)'),ST_GeomFromText('POINT(2.1498 41.3713)'),ST_GeomFromText('POINT(2.1533 41.3683)'),ST_GeomFromText('POINT(2.131386 41.413998)')] as geomin
WITH data as (
SELECT
ARRAY[7.0,8.0,1.0,2.0,3.0,5.0,6.0,4.0] as colin,
ARRAY[
ST_GeomFromText('POINT(2.1744 41.4036)'),
ST_GeomFromText('POINT(2.1228 41.3809)'),
ST_GeomFromText('POINT(2.1511 41.3742)'),
ST_GeomFromText('POINT(2.1528 41.4136)'),
ST_GeomFromText('POINT(2.165 41.3917)'),
ST_GeomFromText('POINT(2.1498 41.3713)'),
ST_GeomFromText('POINT(2.1533 41.3683)'),
ST_GeomFromText('POINT(2.131386 41.413998)')
] as geomin
)
select CDB_Densify(geomin, colin, 2) from data;
SELECT cdb_crankshaft.CDB_Densify(geomin, colin, 2)
FROM data
```

View File

@@ -26,11 +26,19 @@ Returns a table object
#### Example Usage
```sql
with data as (
select
ARRAY[7.0,8.0,1.0,2.0,3.0,5.0,6.0,4.0] as colin,
ARRAY[ST_GeomFromText('POINT(2.1744 41.4036)'),ST_GeomFromText('POINT(2.1228 41.3809)'),ST_GeomFromText('POINT(2.1511 41.3742)'),ST_GeomFromText('POINT(2.1528 41.4136)'),ST_GeomFromText('POINT(2.165 41.3917)'),ST_GeomFromText('POINT(2.1498 41.3713)'),ST_GeomFromText('POINT(2.1533 41.3683)'),ST_GeomFromText('POINT(2.131386 41.413998)')] as geomin
WITH data as (
SELECT
ARRAY[7.0,8.0,1.0,2.0,3.0,5.0,6.0,4.0] as colin,
ARRAY[ST_GeomFromText('POINT(2.1744 41.4036)'),
ST_GeomFromText('POINT(2.1228 41.3809)'),
ST_GeomFromText('POINT(2.1511 41.3742)'),
ST_GeomFromText('POINT(2.1528 41.4136)'),
ST_GeomFromText('POINT(2.165 41.3917)'),
ST_GeomFromText('POINT(2.1498 41.3713)'),
ST_GeomFromText('POINT(2.1533 41.3683)'),
ST_GeomFromText('POINT(2.131386 41.413998)')] as geomin
)
select CDB_TINmap(geomin, colin, 2) from data;
SELECT cdb_crankshaft.CDB_TINmap(geomin, colin, 2)
FROM data
```

View File

@@ -43,7 +43,7 @@ With a table `website_visits` and a column of the number of website visits in un
```sql
SELECT
id,
CDB_StaticOutlier(visits_10k, 11.0) As outlier,
cdb_crankshaft.CDB_StaticOutlier(visits_10k, 11.0) As outlier,
visits_10k
FROM website_visits
```
@@ -93,7 +93,7 @@ WITH cte As (
unnest(Array[1,3,5,1,32,3,57,2]) As visits_10k
)
SELECT
(CDB_PercentOutlier(array_agg(visits_10k), 2.0, array_agg(id))).*
(cdb_crankshaft.CDB_PercentOutlier(array_agg(visits_10k), 2.0, array_agg(id))).*
FROM cte;
```
@@ -144,7 +144,7 @@ WITH cte As (
unnest(Array[1,3,5,1,32,3,57,2]) As visits_10k
)
SELECT
(CDB_StdDevOutlier(array_agg(visits_10k), 2.0, array_agg(id))).*
(cdb_crankshaft.CDB_StdDevOutlier(array_agg(visits_10k), 2.0, array_agg(id))).*
FROM cte;
```

View File

@@ -1,18 +1,76 @@
-- Spatial k-means clustering
CREATE OR REPLACE FUNCTION CDB_KMeans(query text, no_clusters integer, no_init integer default 20)
RETURNS table (cartodb_id integer, cluster_no integer) as $$
CREATE OR REPLACE FUNCTION CDB_KMeans(
query TEXT,
no_clusters INTEGER,
no_init INTEGER DEFAULT 20
)
RETURNS TABLE(
cartodb_id INTEGER,
cluster_no INTEGER
) AS $$
from crankshaft.clustering import Kmeans
kmeans = Kmeans()
return kmeans.spatial(query, no_clusters, no_init)
from crankshaft.clustering import Kmeans
kmeans = Kmeans()
return kmeans.spatial(query, no_clusters, no_init)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Non-spatial k-means clustering
-- query: sql query to retrieve all the needed data
-- colnames: text array of column names for doing the clustering analysis
-- no_clusters: number of requested clusters
-- standardize: whether to scale variables to a mean of zero and a standard
-- deviation of 1
-- id_colname: name of the id column
CREATE OR REPLACE FUNCTION CDB_KMeansBalanced(
query text,
no_clusters integer,
value_col TEXT default NULL,
no_init integer default 20,
max_per_cluster float default NULL)
RETURNS table(cartodb_id integer, cluster_no integer)
AS $$
from crankshaft.clustering import KmeansBalanced
kmeans = KmeansBalanced()
return kmeans.spatial_balanced(query,no_clusters,no_init, max_per_cluster, value_col)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(state Numeric[],the_geom GEOMETRY(Point, 4326), weight NUMERIC)
RETURNS Numeric[] AS
$$
CREATE OR REPLACE FUNCTION CDB_KMeansNonspatial(
query TEXT,
colnames TEXT[],
no_clusters INTEGER,
standardize BOOLEAN DEFAULT true,
id_col TEXT DEFAULT 'cartodb_id'
)
RETURNS TABLE(
cluster_label text,
cluster_center json,
silhouettes numeric,
inertia numeric,
rowid bigint
) AS $$
from crankshaft.clustering import Kmeans
kmeans = Kmeans()
return kmeans.nonspatial(query, colnames, no_clusters,
standardize=standardize,
id_col=id_col)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(
state NUMERIC[],
the_geom GEOMETRY(Point, 4326),
weight NUMERIC
)
RETURNS Numeric[] AS $$
DECLARE
newX NUMERIC;
newY NUMERIC;
@@ -32,7 +90,8 @@ BEGIN
END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
CREATE OR REPLACE FUNCTION CDB_WeightedMeanF(state Numeric[])
CREATE OR REPLACE FUNCTION CDB_WeightedMeanF(state NUMERIC[])
RETURNS GEOMETRY AS
$$
BEGIN

View File

@@ -31,7 +31,7 @@ DECLARE
sqr numeric;
p geometry;
BEGIN
sqr := |/2;
sqr := 0.5*(|/2.0);
polygon := ST_Transform(polygon, 3857);
-- grid #0 cell size
@@ -46,6 +46,7 @@ BEGIN
SELECT array_agg(c) INTO cells FROM c1;
-- 1st guess: centroid
best_c := polygon;
best_d := cdb_crankshaft._Signed_Dist(polygon, ST_Centroid(Polygon));
-- looping the loop
@@ -56,6 +57,7 @@ BEGIN
EXIT WHEN i > n;
cell := cells[i];
i := i+1;
-- cell side size, it's square
@@ -63,13 +65,14 @@ BEGIN
-- check distance
test_d := cdb_crankshaft._Signed_Dist(polygon, ST_Centroid(cell));
IF test_d > best_d THEN
best_d := test_d;
best_c := cells[i];
best_c := cell;
END IF;
-- longest distance within the cell
test_mx := test_d + (test_h/2 * sqr);
test_mx := test_d + (test_h * sqr);
-- if the cell has no chance to contains the desired point, continue
CONTINUE WHEN test_mx - best_d <= tolerance;
@@ -94,29 +97,46 @@ END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;
-- signed distance point to polygon with holes
-- negative is the point is out the polygon
-- rev 1. adding MULTIPOLYGON and GEOMETRYCOLLECTION support by @abelvm
CREATE OR REPLACE FUNCTION _Signed_Dist(
IN polygon geometry,
IN point geometry
)
RETURNS numeric AS $$
DECLARE
pols geometry[];
pol geometry;
i integer;
j integer;
within integer;
w integer;
holes integer;
dist numeric;
d numeric;
BEGIN
dist := 1e999;
SELECT LEAST(dist, ST_distance(point, ST_ExteriorRing(polygon))::numeric) INTO dist;
SELECT CASE WHEN ST_Within(point,polygon) THEN 1 ELSE -1 END INTO within;
SELECT ST_NumInteriorRings(polygon) INTO holes;
IF holes > 0 THEN
FOR i IN 1..holes
LOOP
SELECT LEAST(dist, ST_distance(point, ST_InteriorRingN(polygon, i))::numeric) INTO dist;
END LOOP;
END IF;
WITH collection as (SELECT (ST_dump(polygon)).geom as geom) SELECT array_agg(geom) into pols FROM collection;
FOR j in 1..array_length(pols, 1)
LOOP
pol := pols[j];
d := dist;
SELECT LEAST(dist, ST_distance(point, ST_ExteriorRing(pol))::numeric) INTO d;
SELECT CASE WHEN ST_Within(point,pol) THEN 1 ELSE -1 END INTO w;
SELECT ST_NumInteriorRings(pol) INTO holes;
IF holes > 0 THEN
FOR i IN 1..holes
LOOP
SELECT LEAST(d, ST_distance(point, ST_InteriorRingN(pol, i))::numeric) INTO d;
END LOOP;
END IF;
IF d < dist THEN
dist:= d;
within := w;
END IF;
END LOOP;
dist := dist * within::numeric;
RETURN dist;
END;

View File

@@ -1,10 +1,43 @@
\pset format unaligned
\set ECHO all
SELECT count(DISTINCT cluster_no) as clusters from cdb_crankshaft.cdb_kmeans('select * from ppoints', 2);
-- spatial kmeans
SELECT
count(DISTINCT cluster_no) as clusters
FROM
cdb_crankshaft.cdb_kmeans('select * from ppoints', 2);
clusters
2
(1 row)
SELECT count(*) clusters from (select cdb_crankshaft.CDB_WeightedMean(the_geom, value::NUMERIC), code from ppoints group by code) p;
-- weighted mean
SELECT
count(*) clusters
FROM (
SELECT
cdb_crankshaft.CDB_WeightedMean(the_geom, value::NUMERIC),
code
FROM ppoints
GROUP BY code
) p;
clusters
52
(1 row)
-- nonspatial kmeans
SELECT
cluster_label::int in (0, 1) As cluster_label,
cluster_center::json->>'col1' As cc_col1,
cluster_center::json->>'col2' As cc_col2,
silhouettes,
inertia,
rowid
FROM cdb_crankshaft.CDB_KMeansNonspatial(
'SELECT unnest(Array[1, 1, 10, 10]) As col1, ' ||
'unnest(Array[100, 100, 2, 2]) As col2, ' ||
'unnest(Array[1, 2, 3, 4]) As cartodb_id ',
Array['col1', 'col2']::text[],
2);
cluster_label|cc_col1|cc_col2|silhouettes|inertia|rowid
t|-1.0|1.0|1.0|0.0|1
t|-1.0|1.0|1.0|0.0|2
t|1.0|-1.0|1.0|0.0|3
t|1.0|-1.0|1.0|0.0|4
(4 rows)

View File

@@ -2,6 +2,16 @@ SET client_min_messages TO WARNING;
\set ECHO none
st_astext
-------------------------------------------
POINT(-3.67484492582767 40.4395084885993)
POINT(-3.67484492582767 40.4394914243877)
(1 row)
st_astext
------------
POINT(0 0)
(1 row)
st_astext
------------
POINT(0 0)
(1 row)

View File

@@ -1,6 +1,34 @@
\pset format unaligned
\set ECHO all
SELECT count(DISTINCT cluster_no) as clusters from cdb_crankshaft.cdb_kmeans('select * from ppoints', 2);
-- spatial kmeans
SELECT
count(DISTINCT cluster_no) as clusters
FROM
cdb_crankshaft.cdb_kmeans('select * from ppoints', 2);
SELECT count(*) clusters from (select cdb_crankshaft.CDB_WeightedMean(the_geom, value::NUMERIC), code from ppoints group by code) p;
-- weighted mean
SELECT
count(*) clusters
FROM (
SELECT
cdb_crankshaft.CDB_WeightedMean(the_geom, value::NUMERIC),
code
FROM ppoints
GROUP BY code
) p;
-- nonspatial kmeans
SELECT
cluster_label::int in (0, 1) As cluster_label,
cluster_center::json->>'col1' As cc_col1,
cluster_center::json->>'col2' As cc_col2,
silhouettes,
inertia,
rowid
FROM cdb_crankshaft.CDB_KMeansNonspatial(
'SELECT unnest(Array[1, 1, 10, 10]) As col1, ' ||
'unnest(Array[100, 100, 2, 2]) As col2, ' ||
'unnest(Array[1, 2, 3, 4]) As cartodb_id ',
Array['col1', 'col2']::text[],
2);

View File

@@ -5,3 +5,22 @@ with a as(
select st_geomfromtext('POLYGON((-432540.453078056 4949775.20452642,-432329.947920966 4951361.232584,-431245.028163694 4952223.31516671,-429131.071033529 4951768.00415574,-424622.07505895 4952843.13503987,-423688.327170174 4953499.20752423,-424086.294349759 4954968.38274191,-423068.388925945 4954378.63345336,-423387.653225542 4953355.67417084,-420594.869840519 4953781.00230592,-416026.095299382 4951484.06849063,-412483.018546414 4951024.5410983,-410490.399661215 4954502.24032205,-408186.197521284 4956398.91417441,-407627.262358013 4959300.94633864,-406948.770061627 4959874.85407739,-404949.583326472 4959047.74518163,-402570.908447199 4953743.46829807,-400971.358683991 4952193.11680804,-403533.488084088 4949649.89857885,-406335.177028373 4950193.19571096,-407790.456731515 4952391.46015616,-412060.672398345 4950381.2389307,-410716.93482498 4949156.7509561,-408464.162289794 4943912.8940387,-409350.599394983 4942819.84896006,-408087.791091424 4942451.6711778,-407274.045613725 4940572.4807777,-404446.196589102 4939976.71501489,-402422.964843936 4940450.3670813,-401010.654464241 4939054.8061663,-397647.247369412 4940679.80737878,-395658.413346901 4940528.84765185,-395536.852462953 4938829.79565997,-394268.923462818 4938003.7277717,-393388.720249116 4934757.80596815,-392393.301362444 4934326.71675815,-392573.527618037 4932323.40974412,-393464.640141837 4931903.10653605,-393085.597275686 4931094.7353605,-398426.261165985 4929156.87541607,-398261.174361137 4926238.00816416,-394045.059966834 4925765.18668498,-392982.960705174 4926391.81893628,-393090.272694301 4927176.84692181,-391648.240010564 4924626.06386961,-391889.914625075 4923086.14787613,-394345.177314013 4923235.086036,-395550.878718795 4917812.79243978,-399009.463978251 4912927.7157945,-398948.794855767 4911941.91010796,-398092.636652078 4911806.57392519,-401991.601817112 4911722.9204501,-406225.972607907 4914505.47286319,-411104.994569885 4912569.26941163,-412925.513522316 4913030.3608866,-414630.148884835 
4914436.69169949,-414207.691417276 4919205.78028405,-418306.141109809 4917994.9580478,-424184.700779621 4918938.12432889,-426816.961458921 4923664.37379373,-420956.324227126 4923381.98014807,-420186.661267781 4924286.48693378,-420943.411166194 4926812.76394433,-419779.45457046 4928527.43466337,-419768.767899344 4930681.94459216,-421911.668097113 4930432.40620397,-423482.386112205 4933451.28047252,-427272.814773717 4934151.56473242,-427144.908678797 4939731.77191996,-428982.125554848 4940522.84445172,-428986.133056516 4942437.17281266,-431237.792396792 4947309.68284815,-432476.889648814 4947791.74800037,-432540.453078056 4949775.20452642))', 3857) as g
)
SELECT st_astext(cdb_crankshaft.CDB_PIA(g)) from a;
-- square centered on 0,0 with sides of length 2
-- expectation: point(0, 0)
WITH square AS (
SELECT 'SRID=4326;POLYGON((-1 1, 1 1, 1 -1, -1 -1, -1 1))'::geometry as g
)
SELECT ST_AsText(cdb_crankshaft.CDB_PIA(g))
FROM square;
-- MultiPolygon test
-- square centered on 0,0 with sides of length 2
-- expectation: point(0, 0)
WITH square AS (
SELECT
ST_Multi('SRID=4326;POLYGON((-1 1, 1 1, 1 -1, -1 -1, -1 1))'::geometry) as g
)
SELECT ST_AsText(cdb_crankshaft.CDB_PIA(g))
FROM square

View File

@@ -2,62 +2,99 @@
import plpy
import pysal_utils as pu
NULL_VALUE_ERROR = ('No usable data passed to analysis. Check your input rows '
'for null values and fill in appropriately.')
class AnalysisDataProvider:
def verify_data(func):
"""decorator to verify data result before returning to algorithm"""
def wrapper(*args, **kwargs):
"""Error checking"""
try:
data = func(*args, **kwargs)
if not data:
plpy.error(NULL_VALUE_ERROR)
else:
return data
except Exception as err:
plpy.error('Analysis failed: {}'.format(err))
return []
return wrapper
class AnalysisDataProvider(object):
@verify_data
def get_getis(self, w_type, params):
"""fetch data for getis ord's g"""
try:
query = pu.construct_neighbor_query(w_type, params)
result = plpy.execute(query)
# if there are no neighbors, exit
if len(result) == 0:
return pu.empty_zipped_array(4)
else:
return result
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
query = pu.construct_neighbor_query(w_type, params)
return plpy.execute(query)
@verify_data
def get_markov(self, w_type, params):
"""fetch data for spatial markov"""
try:
query = pu.construct_neighbor_query(w_type, params)
data = plpy.execute(query)
if len(data) == 0:
return pu.empty_zipped_array(4)
return data
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
query = pu.construct_neighbor_query(w_type, params)
return plpy.execute(query)
@verify_data
def get_moran(self, w_type, params):
"""fetch data for moran's i analyses"""
try:
query = pu.construct_neighbor_query(w_type, params)
data = plpy.execute(query)
query = pu.construct_neighbor_query(w_type, params)
return plpy.execute(query)
# if there are no neighbors, exit
if len(data) == 0:
return pu.empty_zipped_array(2)
return data
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % e)
return pu.empty_zipped_array(2)
@verify_data
def get_nonspatial_kmeans(self, params):
"""
Fetch data for non-spatial k-means.
def get_nonspatial_kmeans(self, query):
"""fetch data for non-spatial kmeans"""
try:
data = plpy.execute(query)
return data
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
Inputs - a dict (params) with the following keys:
colnames: a (text) list of column names (e.g.,
`['andy', 'cookie']`)
id_col: the name of the id column (e.g., `'cartodb_id'`)
subquery: the subquery for exposing the data (e.g.,
SELECT * FROM favorite_things)
Output:
A SQL query for packaging the data for consumption within
`KMeans().nonspatial`. Format will be a list of length one,
with the first element a dict with keys ('rowid', 'attr1',
'attr2', ...)
"""
agg_cols = ', '.join([
'array_agg({0}) As arr_col{1}'.format(val, idx+1)
for idx, val in enumerate(params['colnames'])
])
query = '''
SELECT {cols}, array_agg({id_col}) As rowid
FROM ({subquery}) As a
'''.format(subquery=params['subquery'],
id_col=params['id_col'],
cols=agg_cols).strip()
return plpy.execute(query)
@verify_data
def get_spatial_kmeans(self, params):
"""fetch data for spatial kmeans"""
query = '''
SELECT
array_agg("{id_col}" ORDER BY "{id_col}") as ids,
array_agg(ST_X("{geom_col}") ORDER BY "{id_col}") As xs,
array_agg(ST_Y("{geom_col}") ORDER BY "{id_col}") As ys
FROM ({subquery}) As a
WHERE "{geom_col}" IS NOT NULL
'''.format(**params)
return plpy.execute(query)
def get_spatial_balanced_kmeans(self, params):
"""fetch data for spatial kmeans"""
params.setdefault('value_column', 1)
query = ("SELECT "
"array_agg({id_col} ORDER BY {id_col}) as ids,"
"array_agg(ST_X({geom_col}) ORDER BY {id_col}) As xs,"
"array_agg(ST_Y({geom_col}) ORDER BY {id_col}) As ys "
"array_agg({value_column} ORDER BY {id_col}) As values"
"FROM ({subquery}) As a "
"WHERE {geom_col} IS NOT NULL").format(**params)
try:
@@ -66,20 +103,14 @@ class AnalysisDataProvider:
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
@verify_data
def get_gwr(self, params):
"""fetch data for gwr analysis"""
query = pu.gwr_query(params)
try:
query_result = plpy.execute(query)
return query_result
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
return plpy.execute(query)
@verify_data
def get_gwr_predict(self, params):
"""fetch data for gwr predict"""
query = pu.gwr_predict_query(params)
try:
query_result = plpy.execute(query)
return query_result
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
return plpy.execute(query)

View File

@@ -0,0 +1,836 @@
"""
Balanced groups K-Means clustering (same size | max size | custom sizes)
utilizing the scikit-learn api and related
utilities.
BSD 3-Clause License
Copyright (c) 2017, Cayetano Benavent, José Manuel Camacho (Balanced-Sizes-K-Means
Implementation)
Copyright (c) 2017, Nathan Danielsen (Same-Size-K-Means Implementation)
Copyright (c) 2007-2017 The scikit-learn developers.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
BSD 3-clause "New" or "Revised" License
version 0.17.1
"""
import warnings
import numpy as np
import scipy.sparse as sp
from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
from sklearn.cluster import k_means_
from sklearn.cluster import _k_means
from sklearn.externals.joblib import Parallel
from sklearn.externals.joblib import delayed
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.utils.extmath import row_norms, squared_norm
from sklearn.utils.sparsefuncs import mean_variance_axis
from sklearn.utils import check_array
from sklearn.utils import check_random_state
from sklearn.utils import as_float_array
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.validation import FLOAT_DTYPES
class BalancedGroupsKMeans(BaseEstimator, ClusterMixin, TransformerMixin):
    """Balanced Groups K-Means clustering.

    K-Means variant that constrains cluster sizes (same size, max size,
    or custom sizes).  Roughly 90 percent of this is the stock KMeans
    implementation; the balancing logic lives in the module-level helpers
    this class delegates to (``k_means`` / ``_labels_inertia``), which
    follow the steps laid out in the Elki same-size k-means tutorial:
    https://elki-project.github.io/tutorial/same-size_k_means

    NOTE(review): pinned to scikit-learn 0.17.x internals -- later
    versions have breaking changes that this implementation relies on.

    Parameters
    ----------
    n_clusters : int, optional, default: 8
        Number of clusters to form, as well as the number of centroids
        to generate.
    max_cluster_size : int or None, optional, default: None
        Upper bound on the number of samples per cluster; ``None`` means
        unconstrained (plain k-means behavior).
    init : {'k-means++', 'random' or an ndarray}
        Method for initialization; an ndarray of shape
        (n_clusters, n_features) gives explicit initial centers.
    n_init : int, default: 10
        Number of independent runs with different centroid seeds; the
        best output (lowest inertia) is kept.
    max_iter : int, default: 300
        Maximum number of iterations of a single k-means run.
    tol : float, default: 1e-4
        Relative tolerance with regards to inertia to declare
        convergence.
    precompute_distances : {'auto', True, False}
        Precompute distances (faster but takes more memory); 'auto'
        precomputes only when n_samples * n_clusters < 12 million.
    verbose : int, default 0
        Verbosity mode.
    random_state : integer or numpy.RandomState, optional
        Generator used to initialize the centers.
    copy_x : boolean, default True
        If False, the original data may be centered in place and put
        back before returning (small numerical differences possible).
    n_jobs : int
        Number of jobs used to run the n_init initializations in
        parallel (-1 uses all CPUs).

    Attributes
    ----------
    cluster_centers_ : array, [n_clusters, n_features]
        Coordinates of cluster centers.
    labels_ :
        Labels of each point.
    inertia_ : float
        Sum of distances of samples to their closest cluster center.
    """

    def __init__(self, n_clusters=8, max_cluster_size=None,
                 init='k-means++', n_init=10, max_iter=300,
                 tol=1e-4, precompute_distances='auto',
                 verbose=0, random_state=None, copy_x=True, n_jobs=1):
        self.n_clusters = n_clusters
        self.max_cluster_size = max_cluster_size
        self.init = init
        self.max_iter = max_iter
        self.tol = tol
        self.precompute_distances = precompute_distances
        self.n_init = n_init
        self.verbose = verbose
        self.random_state = random_state
        self.copy_x = copy_x
        self.n_jobs = n_jobs
        self.cluster_centers_ = None
        self.labels_ = None

    def _check_fit_data(self, X):
        """Verify that the number of samples given is larger than k."""
        X = check_array(X, accept_sparse='csr', dtype=np.float64)
        if X.shape[0] < self.n_clusters:
            raise ValueError("n_samples=%d should be >= n_clusters=%d" % (
                X.shape[0], self.n_clusters))
        return X

    def _check_test_data(self, X):
        """Validate X and check it matches the fitted feature count."""
        X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES,
                        warn_on_dtype=True)
        n_samples, n_features = X.shape
        expected_n_features = self.cluster_centers_.shape[1]
        if not n_features == expected_n_features:
            raise ValueError("Incorrect number of features. "
                             "Got %d features, expected %d" % (
                                 n_features, expected_n_features))
        return X

    def get_cluster_labels(self):
        """Return the labels of the fitted data."""
        return self.labels_

    def get_cluster_centers(self):
        """Return the fitted cluster centers."""
        return self.cluster_centers_

    def get_inertia(self):
        """Return the inertia of the fitted clustering."""
        return self.inertia_

    def fit(self, X, y=None, values=None):
        """Compute balanced k-means clustering.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
        y : ignored (present for scikit-learn API compatibility)
        values : array-like, optional
            Per-sample values used by the balancing logic.
        """
        # Removed leftover debug print of `values` (fix).
        random_state = check_random_state(self.random_state)
        X = self._check_fit_data(X)

        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
            k_means(
                X, n_clusters=self.n_clusters,
                max_cluster_size=self.max_cluster_size,
                init=self.init, n_init=self.n_init, max_iter=self.max_iter,
                verbose=self.verbose, return_n_iter=True,
                precompute_distances=self.precompute_distances,
                tol=self.tol, random_state=random_state, copy_x=self.copy_x,
                n_jobs=self.n_jobs, values=values)
        return self

    def fit_predict(self, X, y=None, values=None):
        """Compute cluster centers and predict cluster index for each sample.

        Convenience method; equivalent to calling fit(X) followed by
        predict(X).
        """
        # Removed leftover debug print of `values` (fix).
        return self.fit(X, values=values).labels_

    def fit_transform(self, X, y=None):
        """Compute clustering and transform X to cluster-distance space.

        Equivalent to fit(X).transform(X), but more efficiently implemented.
        """
        # Currently, this just skips a copy of the data if it is not in
        # np.array or CSR format already.
        # XXX This skips _check_test_data, which may change the dtype;
        # we should refactor the input validation.
        X = self._check_fit_data(X)
        return self.fit(X)._transform(X)

    def transform(self, X, y=None):
        """Transform X to a cluster-distance space.

        In the new space, each dimension is the distance to the cluster
        centers.  Note that even if X is sparse, the array returned by
        `transform` will typically be dense.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            New data to transform.

        Returns
        -------
        X_new : array, shape [n_samples, k]
            X transformed in the new space.
        """
        check_is_fitted(self, 'cluster_centers_')
        X = self._check_test_data(X)
        return self._transform(X)

    def _transform(self, X):
        """guts of transform method; no input validation"""
        return euclidean_distances(X, self.cluster_centers_)

    def predict(self, X, values=None):
        """Predict the closest cluster each sample in X belongs to.

        In the vector quantization literature, `cluster_centers_` is called
        the code book and each value returned by `predict` is the index of
        the closest code in the code book.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            New data to predict.

        Returns
        -------
        labels : array, shape [n_samples,]
            Index of the cluster each sample belongs to.
        """
        check_is_fitted(self, 'cluster_centers_')
        X = self._check_test_data(X)
        x_squared_norms = row_norms(X, squared=True)
        return _labels_inertia(X, x_squared_norms, self.cluster_centers_,
                               self.max_cluster_size, values=values)[0]

    def score(self, X, y=None, values=None):
        """Opposite of the value of X on the K-means objective.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            New data.

        Returns
        -------
        score : float
            Opposite of the value of X on the K-means objective.
        """
        check_is_fitted(self, 'cluster_centers_')
        X = self._check_test_data(X)
        x_squared_norms = row_norms(X, squared=True)
        return -_labels_inertia(X, x_squared_norms, self.cluster_centers_,
                                self.max_cluster_size, values=values)[1]
def k_means(X, n_clusters, max_cluster_size=None, init='k-means++',
            precompute_distances='auto', n_init=10, max_iter=300,
            verbose=False, tol=1e-4, random_state=None, copy_x=True,
            n_jobs=1, return_n_iter=False, values=None):
    """K-means clustering algorithm with optional per-cluster size limits.

    Runs ``_kmeans_single`` n_init times (serially or in parallel) and
    keeps the result with the lowest inertia.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        The observations to cluster.
    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.
    max_cluster_size : int or None, optional
        Upper bound on cluster size, forwarded to ``_kmeans_single``.
    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization; an ndarray of shape
        (n_clusters, n_features) gives explicit initial centers, a
        callable takes (X, k, random_state) and returns centers.
    precompute_distances : {'auto', True, False}
        Precompute distances (faster but takes more memory); 'auto'
        precomputes only when n_samples * n_clusters < 12 million
        (about 100MB overhead per job in double precision).
    n_init : int, optional, default: 10
        Number of runs with different centroid seeds.
    max_iter : int, optional, default 300
        Maximum number of iterations per run.
    verbose : boolean, optional
        Verbosity mode.
    tol : float, optional
        The relative increment in the results before declaring
        convergence.
    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers.
    copy_x : boolean, optional
        If False, the data may be centered in place and restored before
        returning (small numerical differences possible).
    n_jobs : int
        Number of parallel jobs for the n_init runs (-1 uses all CPUs).
    return_n_iter : bool, optional
        Whether or not to return the number of iterations.
    values : array-like, optional
        Per-sample values forwarded to the balancing logic.

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.
    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the i'th
        observation is closest to.
    inertia : float
        The final value of the inertia criterion (sum of squared
        distances to the closest centroid over the training set).
    best_n_iter : int
        Number of iterations corresponding to the best results.
        Returned only if `return_n_iter` is set to True.

    Raises
    ------
    ValueError
        If n_init or max_iter is not positive, or if
        precompute_distances is not 'auto'/True/False.
    """
    if n_init <= 0:
        raise ValueError("Invalid number of initializations."
                         " n_init=%d must be bigger than zero." % n_init)
    random_state = check_random_state(random_state)

    if max_iter <= 0:
        raise ValueError('Number of iterations should be a positive number,'
                         ' got %d instead' % max_iter)

    # NOTE: a dead `best_inertia = np.infty` assignment was removed here;
    # best_inertia is initialized to None below before first use.
    X = as_float_array(X, copy=copy_x)
    tol = _tolerance(X, tol)

    # If the distances are precomputed every job will create a matrix of
    # shape (n_clusters, n_samples). To stop KMeans from eating up memory
    # we only activate this if the created matrix is guaranteed to be
    # under 100MB. 12 million entries consume a little under 100MB if
    # they are of type double.
    if precompute_distances == 'auto':
        n_samples = X.shape[0]
        precompute_distances = (n_clusters * n_samples) < 12e6
    elif isinstance(precompute_distances, bool):
        pass
    else:
        raise ValueError("precompute_distances should be 'auto' or True/False"
                         ", but a value of %r was passed" %
                         precompute_distances)

    # subtract of mean of x for more accurate distance computations
    if not sp.issparse(X) or hasattr(init, '__array__'):
        X_mean = X.mean(axis=0)
    if not sp.issparse(X):
        # The copy was already done above
        X -= X_mean

    if hasattr(init, '__array__'):
        init = check_array(init, dtype=np.float64, copy=True)
        _validate_center_shape(X, n_clusters, init)

        init -= X_mean
        if n_init != 1:
            warnings.warn(
                'Explicit initial center position passed: '
                'performing only one init in k-means instead of n_init=%d'
                % n_init, RuntimeWarning, stacklevel=2)
            n_init = 1

    # precompute squared norms of data points
    x_squared_norms = row_norms(X, squared=True)

    best_labels, best_inertia, best_centers = None, None, None
    if n_jobs == 1:
        # For a single thread, less memory is needed if we just store one
        # set of the best results (as opposed to one set per run per
        # thread).
        for it in range(n_init):
            # run a k-means once
            labels, inertia, centers, n_iter_ = _kmeans_single(
                X, n_clusters, max_cluster_size, max_iter=max_iter,
                init=init, verbose=verbose,
                precompute_distances=precompute_distances, tol=tol,
                x_squared_norms=x_squared_norms, random_state=random_state,
                values=values)
            # determine if these results are the best so far
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia
                best_n_iter = n_iter_
    else:
        # parallelisation of k-means runs
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(_kmeans_single)(X, n_clusters, max_cluster_size,
                                    max_iter=max_iter,
                                    init=init, verbose=verbose, tol=tol,
                                    precompute_distances=precompute_distances,
                                    x_squared_norms=x_squared_norms,
                                    # Change seed to ensure variety
                                    random_state=seed,
                                    values=values)
            for seed in seeds)
        # Get results with the lowest inertia
        labels, inertia, centers, n_iters = zip(*results)
        best = np.argmin(inertia)
        best_labels = labels[best]
        best_inertia = inertia[best]
        best_centers = centers[best]
        best_n_iter = n_iters[best]

    if not sp.issparse(X):
        if not copy_x:
            X += X_mean
        best_centers += X_mean

    if return_n_iter:
        return best_centers, best_labels, best_inertia, best_n_iter
    else:
        return best_centers, best_labels, best_inertia
def _kmeans_single(X, n_clusters, max_cluster_size, x_squared_norms, max_iter=300,
                   init='k-means++', verbose=False, random_state=None,
                   tol=1e-4, precompute_distances=True, values=None):
    """A single run of capacity-constrained k-means, assumes preparation
    completed prior.

    Parameters
    ----------
    X: array-like of floats, shape (n_samples, n_features)
        The observations to cluster.
    n_clusters: int
        The number of clusters to form as well as the number of
        centroids to generate.
    max_cluster_size: int, list, or falsy
        Capacity constraint forwarded to _labels_inertia; bounds how many
        samples (or how much total `values` weight) each cluster may hold.
    x_squared_norms: array
        Precomputed x_squared_norms.
    max_iter: int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.
    init: {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':
        'k-means++' : selects initial cluster centers for k-mean
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.
        'random': generate k centroids from a Gaussian with mean and
        variance estimated from the data.
        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.
        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.
    verbose: boolean, optional
        Verbosity mode
    random_state: integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.
    tol: float, optional
        The relative increment in the results before declaring convergence.
    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).
    values: array, shape (n_samples,), optional
        Per-sample weights used by the capacity check during label
        assignment (see _labels_inertia / is_cluster_full).

    Returns
    -------
    centroid: float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.
    label: integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.
    inertia: float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).
    n_iter : int
        Number of iterations run.
    """
    random_state = check_random_state(random_state)
    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = k_means_._init_centroids(X, n_clusters, init, random_state=random_state,
                                       x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")
    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=np.float64)
    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, inertia = \
            _labels_inertia(X, x_squared_norms, centers, max_cluster_size,
                            precompute_distances=precompute_distances,
                            distances=distances, values=values)
        # computation of the means is also called the M-step of EM
        if sp.issparse(X):
            centers = _k_means._centers_sparse(X, labels, n_clusters,
                                               distances)
        else:
            centers = _k_means._centers_dense(X, labels, n_clusters, distances)
        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))
        # track the best (lowest-inertia) state seen across iterations
        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia
        # squared movement of the centers since the previous iteration
        shift = squared_norm(centers_old - centers)
        if shift <= tol:
            if verbose:
                print("Converged at iteration %d" % i)
            break
    if shift > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, x_squared_norms, best_centers, max_cluster_size,
                            precompute_distances=precompute_distances,
                            distances=distances, values=values)
    return best_labels, best_inertia, best_centers, i + 1
def _validate_center_shape(X, n_centers, centers):
"""Check if centers is compatible with X and n_centers"""
if len(centers) != n_centers:
raise ValueError('The shape of the initial centers (%s) '
'does not match the number of clusters %i'
% (centers.shape, n_centers))
if centers.shape[1] != X.shape[1]:
raise ValueError(
"The number of features of the initial centers %s "
"does not match the number of features of the data %s."
% (centers.shape[1], X.shape[1]))
def _tolerance(X, tol):
"""Return a tolerance which is independent of the dataset"""
if sp.issparse(X):
variances = mean_variance_axis(X, axis=0)[1]
else:
variances = np.var(X, axis=0)
return np.mean(variances) * tol
def _labels_inertia(X, x_squared_norms, centers, max_cluster_size,
                    precompute_distances=True, distances=None, values=None):
    """E step of the K-means EM algorithm.

    Compute the labels and the inertia of the given samples and centers.
    This will compute the distances in-place.

    Parameters
    ----------
    X: float64 array-like or CSR sparse matrix, shape (n_samples, n_features)
        The input samples to assign to the labels.
    x_squared_norms: array, shape (n_samples,)
        Precomputed squared euclidean norm of each data point, to speed up
        computations.
    centers: float64 array, shape (k, n_features)
        The cluster centers.
    max_cluster_size: int, list, or falsy
        Capacity constraint used by the dense precomputed path.
    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).
    distances: float64 array, shape (n_samples,)
        Pre-allocated array to be filled in with each sample's distance
        to the closest center.
    values: array, shape (n_samples,), optional
        Per-sample weights forwarded to the capacity-aware assignment.

    Returns
    -------
    labels: int array of shape(n)
        The resulting assignment
    inertia : float
        Sum of distances of samples to their closest cluster center.
    """
    n_samples = X.shape[0]
    # sentinel value -1 makes any unassigned sample easy to detect
    labels = -np.ones(n_samples, np.int32)
    if distances is None:
        distances = np.zeros(shape=(0,), dtype=np.float64)
    # sparse input: delegate to the Cython CSR kernel (fills distances in-place)
    if sp.issparse(X):
        inertia = k_means_._k_means._assign_labels_csr(
            X, x_squared_norms, centers, labels, distances=distances)
        return labels, inertia
    # dense input with a full distance matrix: capacity-aware assignment
    if precompute_distances:
        return _labels_inertia_precompute_dense(
            X, x_squared_norms, centers, distances, max_cluster_size,
            values=values)
    # dense input, streaming kernel (fills distances in-place)
    inertia = k_means_._k_means._assign_labels_array(
        X, x_squared_norms, centers, labels, distances=distances)
    return labels, inertia
def _labels_inertia_precompute_dense(X, x_squared_norms, centers, distances,
                                     max_cluster_size, values=None):
    """Compute labels and inertia using a full distance matrix.

    This will overwrite the 'distances' array in-place.

    Parameters
    ----------
    X : numpy array, shape (n_sample, n_features)
        Input data.
    x_squared_norms : numpy array, shape (n_samples,)
        Precomputed squared norms of X.
    centers : numpy array, shape (n_clusters, n_features)
        Cluster centers which data is assigned to.
    distances : numpy array, shape (n_samples,)
        Pre-allocated array in which distances are stored.
    max_cluster_size : int, list, or falsy
        Capacity of each cluster. A single int applies to every cluster,
        a list gives one capacity per cluster, and a falsy value means
        samples are split as evenly as possible (see get_clusters_size).
    values : numpy array, shape (n_samples,), optional
        Per-sample weights; when given, a cluster's fill level is the sum
        of its members' values rather than its member count.

    Returns
    -------
    labels : numpy array, dtype=np.int, shape (n_samples,)
        Indices of clusters that samples are assigned to.
    inertia : float
        Sum of distances of samples to their closest cluster center.
    """
    n_samples = X.shape[0]
    k = centers.shape[0]
    all_distances = euclidean_distances(centers, X, x_squared_norms,
                                        squared=True)
    labels = np.empty(n_samples, dtype=np.int32)
    labels.fill(-1)
    mindist = np.empty(n_samples)
    mindist.fill(np.inf)  # np.infty is deprecated
    # validate the capacity specification against the data size
    if isinstance(max_cluster_size, int) and max_cluster_size * k < n_samples:
        raise ValueError("max_cluster_size * k must be greater than n_samples")
    elif isinstance(max_cluster_size, list):
        if sum(max_cluster_size) != n_samples:
            raise ValueError("max_cluster_size sum must be equal to n_samples")
        elif len(max_cluster_size) != k:
            raise ValueError("max_cluster_size len must be equal to k")
    if not max_cluster_size:
        max_cluster_size = get_clusters_size(n_samples, k)
    # Greedy capacity-aware assignment of every point to its best open
    # cluster.  NOTE: a second, duplicated assignment pass used to follow
    # this call; once every cluster was filled it could move a point out
    # of its (full) cluster into a strictly worse one, so it was removed.
    labels, mindist = initial_assignment(labels, mindist, n_samples,
                                         all_distances, max_cluster_size,
                                         values=values)
    # refinement of clustering
    transfer_list = []
    best_mindist = mindist.copy()
    best_labels = labels.copy()
    # visit points from the worst-assigned (largest distance) downwards
    points_by_high_distance = np.argsort(mindist)[::-1]
    for point in points_by_high_distance:
        point_cluster = labels[point]
        # see if there is an opening on the best cluster for this point
        cluster_id, point_dist = get_best_cluster_for_point(point,
                                                            all_distances)
        if (not is_cluster_full(cluster_id, max_cluster_size, labels,
                                values=values)
                and point_cluster != cluster_id):
            labels[point] = cluster_id
            mindist[point] = point_dist
            best_labels = labels.copy()
            best_mindist = mindist.copy()
            continue  # on to the next point
        # otherwise try swapping with a previously visited point
        for swap_candidate in transfer_list:
            cand_cluster = labels[swap_candidate]
            if point_cluster != cand_cluster:
                # get the current dist of swap candidate
                cand_distance = mindist[swap_candidate]
                # get the potential dist of point
                point_distance = all_distances[cand_cluster, point]
                # compare
                if point_distance < cand_distance:
                    labels[point] = cand_cluster
                    mindist[point] = all_distances[cand_cluster, point]
                    labels[swap_candidate] = point_cluster
                    mindist[swap_candidate] = \
                        all_distances[point_cluster, swap_candidate]
                    if (np.absolute(mindist).sum()
                            < np.absolute(best_mindist).sum()):
                        # keep the transfer: total distance improved
                        best_labels = labels.copy()
                        best_mindist = mindist.copy()
                        break
                    else:
                        # revert: the transfer made things worse
                        labels = best_labels.copy()
                        mindist = best_mindist.copy()
        transfer_list.append(point)
    if n_samples == distances.shape[0]:
        # distances will be changed in-place
        distances[:] = mindist
    inertia = best_mindist.sum()
    return best_labels, inertia
def get_best_cluster_for_point(point, all_distances):
    """Gets the best cluster by distance for a point

    Argument
    --------
    point : int
        the point index

    Returns
    --------
    tuple
        (cluster_id, distance_from_cluster_center)
    """
    # column `point` of all_distances holds this point's distance to each
    # cluster center; pick the nearest (ties resolve to the lowest id,
    # matching a stable sort's first element)
    return min(enumerate(all_distances[:, point]), key=lambda pair: pair[1])
def get_best_point_distances(point, all_distances):
    """Gets a sorted by best distance of clusters

    Argument
    --------
    point : int
        the point index

    Returns
    --------
    list of tuples sorted by point_dist
        example: [(cluster_id, point_dist), (cluster_id, point_dist)]
    """
    # column `point` holds this point's distance to every cluster center;
    # rank clusters from nearest to farthest
    column = all_distances[:, point]
    return sorted(enumerate(column), key=lambda pair: pair[1])
def sort_adjust_row(points_distances):
"Sorts the points row from smallest distance to lowest distance"
return sorted([(cluster_id, point_dist) for cluster_id, point_dist in enumerate(points_distances)], key=lambda x: x[1])
def is_cluster_full(cluster_id, max_cluster_size, labels, values=None):
    """Determines if a cluster is full"""
    # per-cluster capacity when a list was given, shared capacity otherwise
    capacity = (max_cluster_size[cluster_id]
                if isinstance(max_cluster_size, list)
                else max_cluster_size)
    members = np.where(labels == cluster_id)
    # fill level: weighted sum of members when values are supplied,
    # otherwise a plain member count
    if values is None:
        fill = len(members[0])
    else:
        fill = np.sum(values[members])
    return fill >= capacity
def get_clusters_size(n_samples, n_clusters):
    """Gets the number of members per cluster for equal groups kmeans"""
    # ceiling division: any remainder bumps the per-cluster quota by one
    quotient, remainder = divmod(n_samples, n_clusters)
    return quotient + 1 if remainder else quotient
def initial_assignment(labels, mindist, n_samples, all_distances,
                       max_cluster_size, values=None):
    """Initial assignment of labels and mindist

    Each point is visited in index order and greedily placed in the
    nearest cluster that still has spare capacity; labels and mindist
    are updated in place and returned.
    """
    for point in np.arange(n_samples):
        # walk this point's clusters from nearest to farthest and take
        # the first one that is not yet full
        for cluster_id, point_dist in get_best_point_distances(point,
                                                               all_distances):
            if not is_cluster_full(cluster_id, max_cluster_size, labels,
                                   values):
                labels[point] = cluster_id
                mindist[point] = point_dist
                break
    return labels, mindist

View File

@@ -12,7 +12,7 @@ from crankshaft.analysis_data_provider import AnalysisDataProvider
# High level interface ---------------------------------------
class Getis:
class Getis(object):
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()
@@ -31,13 +31,13 @@ class Getis:
# geometries with attributes that are null are ignored
# resulting in a collection of not as near neighbors if kNN is chosen
qvals = OrderedDict([("id_col", id_col),
("attr1", attr),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
params = OrderedDict([("id_col", id_col),
("attr1", attr),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_getis(w_type, qvals)
result = self.data_provider.get_getis(w_type, params)
attr_vals = pu.get_attributes(result)
# build PySAL weight object

View File

@@ -1,10 +1,12 @@
from sklearn.cluster import KMeans
from .balanced_kmeans import BalancedGroupsKMeans
import numpy as np
from crankshaft.analysis_data_provider import AnalysisDataProvider
class Kmeans:
class Kmeans(object):
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()
@@ -20,13 +22,126 @@ class Kmeans:
"geom_col": "the_geom",
"id_col": "cartodb_id"}
data = self.data_provider.get_spatial_kmeans(params)
result = self.data_provider.get_spatial_kmeans(params)
# Unpack query response
xs = data[0]['xs']
ys = data[0]['ys']
ids = data[0]['ids']
xs = result[0]['xs']
ys = result[0]['ys']
ids = result[0]['ids']
km = KMeans(n_clusters=no_clusters, n_init=no_init)
labels = km.fit_predict(zip(xs, ys))
return zip(ids, labels)
def spatial_balanced(self, query, no_clusters, no_init=20,
target_per_cluster=None, value_column=None):
params = {
"subquery": query,
"geom_col": "the_geom",
"id_col": "cartodb_id",
"value_column": value_column,
}
data = self.data_provider.get_spatial_balanced_kmeans(params)
lons = data[0]['xs']
lats = data[0]['ys']
ids = data[0]['ids']
values = data[0]['values']
total_value = np.sum(values)
if target_per_cluster is None:
target_per_cluster = total_value / float(no_clusters)
bal_kmeans = BalancedGroupsKMeans(
n_clusters=17,
max_iter=100,
max_cluster_size=target_per_cluster
)
labels = bal_kmeans.fit_predict(
zip(lons, lats),
values=values
)
return zip(ids, labels)
def nonspatial(self, subquery, colnames, no_clusters=5,
standardize=True, id_col='cartodb_id'):
"""
Arguments:
query (string): A SQL query to retrieve the data required to do the
k-means clustering analysis, like so:
SELECT * FROM iris_flower_data
colnames (list): a list of the column names which contain the data
of interest, like so: ['sepal_width',
'petal_width',
'sepal_length',
'petal_length']
no_clusters (int): number of clusters (greater than zero)
id_col (string): name of the input id_column
Returns:
A list of tuples with the following columns:
cluster labels: a label for the cluster that the row belongs to
centers: center of the cluster that this row belongs to
silhouettes: silhouette measure for this value
rowid: row that these values belong to (corresponds to the value in
`id_col`)
"""
import json
from sklearn import metrics
params = {
"colnames": colnames,
"subquery": subquery,
"id_col": id_col
}
data = self.data_provider.get_nonspatial_kmeans(params)
# fill array with values for k-means clustering
if standardize:
cluster_columns = _scale_data(
_extract_columns(data))
else:
cluster_columns = _extract_columns(data)
kmeans = KMeans(n_clusters=no_clusters,
random_state=0).fit(cluster_columns)
centers = [json.dumps(dict(zip(colnames, c)))
for c in kmeans.cluster_centers_[kmeans.labels_]]
silhouettes = metrics.silhouette_samples(cluster_columns,
kmeans.labels_,
metric='sqeuclidean')
return zip(kmeans.labels_,
centers,
silhouettes,
[kmeans.inertia_] * kmeans.labels_.shape[0],
data[0]['rowid'])
# -- Preprocessing steps
def _extract_columns(data):
"""
Extract the features from the query and pack them into a NumPy array
data (list of dicts): result of the kmeans request
"""
# number of columns minus rowid column
n_cols = len(data[0]) - 1
return np.array([data[0]['arr_col{0}'.format(i+1)]
for i in xrange(n_cols)],
dtype=float).T
def _scale_data(features):
"""
Scale all input columns to center on 0 with a standard devation of 1
features (numpy matrix): features of dimension (n_features, n_samples)
"""
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
return scaler.fit_transform(features)

View File

@@ -15,7 +15,7 @@ import crankshaft.pysal_utils as pu
# High level interface ---------------------------------------
class Moran:
class Moran(object):
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()

View File

@@ -25,13 +25,6 @@ def get_weight(query_res, w_type='knn', num_ngbrs=5):
Construct PySAL weight from return value of query
@param query_res dict-like: query results with attributes and neighbors
"""
# if w_type.lower() == 'knn':
# row_normed_weights = [1.0 / float(num_ngbrs)] * num_ngbrs
# weights = {x['id']: row_normed_weights for x in query_res}
# else:
# weights = {x['id']: [1.0 / len(x['neighbors'])] * len(x['neighbors'])
# if len(x['neighbors']) > 0
# else [] for x in query_res}
neighbors = {x['id']: x['neighbors'] for x in query_res}
print 'len of neighbors: %d' % len(neighbors)
@@ -148,22 +141,21 @@ def knn(params):
"attr_where_i": attr_where.replace("idx_replace", "i"),
"attr_where_j": attr_where.replace("idx_replace", "j")}
query = "SELECT " \
"i.\"{id_col}\" As id, " \
"%(attr_select)s" \
"(SELECT ARRAY(SELECT j.\"{id_col}\" " \
"FROM ({subquery}) As j " \
"WHERE " \
"i.\"{id_col}\" <> j.\"{id_col}\" AND " \
"%(attr_where_j)s " \
"ORDER BY " \
"j.\"{geom_col}\" <-> i.\"{geom_col}\" ASC " \
"LIMIT {num_ngbrs})" \
") As neighbors " \
"FROM ({subquery}) As i " \
"WHERE " \
"%(attr_where_i)s " \
"ORDER BY i.\"{id_col}\" ASC;" % replacements
query = '''
SELECT
i."{id_col}" As id,
%(attr_select)s
(SELECT ARRAY(SELECT j."{id_col}"
FROM ({subquery}) As j
WHERE i."{id_col}" <> j."{id_col}" AND
%(attr_where_j)s AND
j."{geom_col}" IS NOT NULL
ORDER BY j."{geom_col}" <-> i."{geom_col}" ASC
LIMIT {num_ngbrs})) As neighbors
FROM ({subquery}) As i
WHERE %(attr_where_i)s AND i."{geom_col}" IS NOT NULL
ORDER BY i."{id_col}" ASC;
''' % replacements
return query.format(**params)
@@ -180,19 +172,20 @@ def queen(params):
"attr_where_i": attr_where.replace("idx_replace", "i"),
"attr_where_j": attr_where.replace("idx_replace", "j")}
query = "SELECT " \
"i.\"{id_col}\" As id, " \
"%(attr_select)s" \
"(SELECT ARRAY(SELECT j.\"{id_col}\" " \
"FROM ({subquery}) As j " \
"WHERE i.\"{id_col}\" <> j.\"{id_col}\" AND " \
"ST_Touches(i.\"{geom_col}\", j.\"{geom_col}\") AND " \
"%(attr_where_j)s)" \
") As neighbors " \
"FROM ({subquery}) As i " \
"WHERE " \
"%(attr_where_i)s " \
"ORDER BY i.\"{id_col}\" ASC;" % replacements
query = '''
SELECT
i."{id_col}" As id,
%(attr_select)s
(SELECT ARRAY(SELECT j."{id_col}"
FROM ({subquery}) As j
WHERE i."{id_col}" <> j."{id_col}" AND
ST_Touches(i."{geom_col}", j."{geom_col}") AND
%(attr_where_j)s)) As neighbors
FROM ({subquery}) As i
WHERE
%(attr_where_i)s
ORDER BY i."{id_col}" ASC;
''' % replacements
return query.format(**params)
@@ -256,15 +249,3 @@ def get_attributes(query_res, attr_num=1):
"""
return np.array([x['attr' + str(attr_num)] for x in query_res],
dtype=np.float)
def empty_zipped_array(num_nones):
"""
prepare return values for cases of empty weights objects (no neighbors)
Input:
@param num_nones int: number of columns (e.g., 4)
Output:
[(None, None, None, None)]
"""
return [tuple([None] * num_nones)]

View File

@@ -2,6 +2,7 @@
import random
import numpy
def set_random_seeds(value):
"""
Set the seeds of the RNGs (Random Number Generators)

View File

@@ -11,7 +11,7 @@ import crankshaft.pysal_utils as pu
from crankshaft.analysis_data_provider import AnalysisDataProvider
class Markov:
class Markov(object):
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()
@@ -61,14 +61,14 @@ class Markov:
"subquery": subquery,
"num_ngbrs": num_ngbrs}
query_result = self.data_provider.get_markov(w_type, params)
result = self.data_provider.get_markov(w_type, params)
# build weight
weights = pu.get_weight(query_result, w_type)
weights = pu.get_weight(result, w_type)
weights.transform = 'r'
# prep time data
t_data = get_time_data(query_result, time_cols)
t_data = get_time_data(result, time_cols)
sp_markov_result = ps.Spatial_Markov(t_data,
weights,

View File

@@ -42,6 +42,9 @@ class MockPlPy:
def info(self, msg):
self.infos.append(msg)
def error(self, msg):
self.notices.append(msg)
def cursor(self, query):
data = self.execute(query)
return MockCursor(data)

View File

@@ -2,17 +2,12 @@ import unittest
import numpy as np
# from mock_plpy import MockPlPy
# plpy = MockPlPy()
#
# import sys
# sys.modules['plpy'] = plpy
from helper import fixture_file
from crankshaft.clustering import Kmeans
from crankshaft.analysis_data_provider import AnalysisDataProvider
import crankshaft.clustering as cc
from crankshaft import random_seeds
import json
from collections import OrderedDict
@@ -24,7 +19,7 @@ class FakeDataProvider(AnalysisDataProvider):
def get_spatial_kmeans(self, query):
return self.mocked_result
def get_nonspatial_kmeans(self, query, standarize):
def get_nonspatial_kmeans(self, query):
return self.mocked_result
@@ -54,3 +49,39 @@ class KMeansTest(unittest.TestCase):
self.assertEqual(len(np.unique(labels)), 2)
self.assertEqual(len(c1), 20)
self.assertEqual(len(c2), 20)
class KMeansNonspatialTest(unittest.TestCase):
"""Testing class for k-means non-spatial"""
def setUp(self):
self.params = {"subquery": "SELECT * FROM TABLE",
"n_clusters": 5}
def test_kmeans_nonspatial(self):
"""
test for k-means non-spatial
"""
# data from:
# http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn-cluster-kmeans
data_raw = [OrderedDict([("arr_col1", [1, 1, 1, 4, 4, 4]),
("arr_col2", [2, 4, 0, 2, 4, 0]),
("rowid", [1, 2, 3, 4, 5, 6])])]
random_seeds.set_random_seeds(1234)
kmeans = Kmeans(FakeDataProvider(data_raw))
clusters = kmeans.nonspatial('subquery', ['col1', 'col2'], 2)
cl1 = clusters[0][0]
cl2 = clusters[3][0]
for idx, val in enumerate(clusters):
if idx < 3:
self.assertEqual(val[0], cl1)
else:
self.assertEqual(val[0], cl2)
# raises exception for no data
with self.assertRaises(Exception):
kmeans = Kmeans(FakeDataProvider([]))
kmeans.nonspatial('subquery', ['col1', 'col2'], 2)

View File

@@ -70,80 +70,10 @@ class PysalUtilsTest(unittest.TestCase):
self.assertEqual(pu.query_attr_where(self.params1), ans1)
self.assertEqual(pu.query_attr_where(self.params_array), ans_array)
def test_knn(self):
"""Test knn neighbors constructor"""
ans1 = "SELECT i.\"cartodb_id\" As id, " \
"i.\"andy\"::numeric As attr1, " \
"i.\"jay_z\"::numeric As attr2, " \
"(SELECT ARRAY(SELECT j.\"cartodb_id\" " \
"FROM (SELECT * FROM a_list) As j " \
"WHERE " \
"i.\"cartodb_id\" <> j.\"cartodb_id\" AND " \
"j.\"andy\" IS NOT NULL AND " \
"j.\"jay_z\" IS NOT NULL " \
"ORDER BY " \
"j.\"the_geom\" <-> i.\"the_geom\" ASC " \
"LIMIT 321)) As neighbors " \
"FROM (SELECT * FROM a_list) As i " \
"WHERE i.\"andy\" IS NOT NULL AND " \
"i.\"jay_z\" IS NOT NULL " \
"ORDER BY i.\"cartodb_id\" ASC;"
ans_array = "SELECT i.\"cartodb_id\" As id, " \
"i.\"_2013_dec\"::numeric As attr1, " \
"i.\"_2014_jan\"::numeric As attr2, " \
"i.\"_2014_feb\"::numeric As attr3, " \
"(SELECT ARRAY(SELECT j.\"cartodb_id\" " \
"FROM (SELECT * FROM a_list) As j " \
"WHERE i.\"cartodb_id\" <> j.\"cartodb_id\" AND " \
"j.\"_2013_dec\" IS NOT NULL AND " \
"j.\"_2014_jan\" IS NOT NULL AND " \
"j.\"_2014_feb\" IS NOT NULL " \
"ORDER BY j.\"the_geom\" <-> i.\"the_geom\" ASC " \
"LIMIT 321)) As neighbors " \
"FROM (SELECT * FROM a_list) As i " \
"WHERE i.\"_2013_dec\" IS NOT NULL AND " \
"i.\"_2014_jan\" IS NOT NULL AND " \
"i.\"_2014_feb\" IS NOT NULL "\
"ORDER BY i.\"cartodb_id\" ASC;"
self.assertEqual(pu.knn(self.params1), ans1)
self.assertEqual(pu.knn(self.params_array), ans_array)
def test_queen(self):
"""Test queen neighbors constructor"""
ans1 = "SELECT i.\"cartodb_id\" As id, " \
"i.\"andy\"::numeric As attr1, " \
"i.\"jay_z\"::numeric As attr2, " \
"(SELECT ARRAY(SELECT j.\"cartodb_id\" " \
"FROM (SELECT * FROM a_list) As j " \
"WHERE " \
"i.\"cartodb_id\" <> j.\"cartodb_id\" AND " \
"ST_Touches(i.\"the_geom\", " \
"j.\"the_geom\") AND " \
"j.\"andy\" IS NOT NULL AND " \
"j.\"jay_z\" IS NOT NULL)" \
") As neighbors " \
"FROM (SELECT * FROM a_list) As i " \
"WHERE i.\"andy\" IS NOT NULL AND " \
"i.\"jay_z\" IS NOT NULL " \
"ORDER BY i.\"cartodb_id\" ASC;"
self.assertEqual(pu.queen(self.params1), ans1)
def test_construct_neighbor_query(self):
"""Test construct_neighbor_query"""
# Compare to raw knn query
self.assertEqual(pu.construct_neighbor_query('knn', self.params1),
pu.knn(self.params1))
def test_get_attributes(self):
"""Test get_attributes"""
## need to add tests
# need to add tests
self.assertEqual(True, True)
@@ -151,10 +81,3 @@ class PysalUtilsTest(unittest.TestCase):
"""Test get_weight"""
self.assertEqual(True, True)
def test_empty_zipped_array(self):
"""Test empty_zipped_array"""
ans2 = [(None, None)]
ans4 = [(None, None, None, None)]
self.assertEqual(pu.empty_zipped_array(2), ans2)
self.assertEqual(pu.empty_zipped_array(4), ans4)