diff --git a/doc/11_kmeans.md b/doc/11_kmeans.md new file mode 100644 index 0000000..6153010 --- /dev/null +++ b/doc/11_kmeans.md @@ -0,0 +1,62 @@ +## K-Means Functions + +### CDB_KMeans(subquery text, no_clusters INTEGER) + +This function attempts to find `no_clusters` clusters within the input data. It will return a table of CartoDB ids and +the number of the cluster each point in the input was assigned to. + + +#### Arguments + +| Name | Type | Description | +|------|------|-------------| +| subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM interesting_table`). This query must have the geometry column name `the_geom` and id column name `cartodb_id` unless otherwise specified in the input arguments | +| no\_clusters | INTEGER | The number of clusters to try and find | + +#### Returns + +A table with the following columns. + +| Column Name | Type | Description | +|-------------|------|-------------| +| cartodb\_id | INTEGER | The CartoDB id of the row in the input table.| +| cluster\_no | INTEGER | The cluster that this point belongs to. | + + +#### Example Usage + +```sql +SELECT + customers.*, + km.cluster_no + FROM cdb_crankshaft.CDB_KMeans('SELECT * from customers' , 6) km, customers + WHERE customers.cartodb_id = km.cartodb_id +``` + +### CDB_WeightedMean(subquery text, weight_column text, category_column text) + +Function that computes the weighted centroid of a number of clusters by some weight column. + +### Arguments + +| Name | Type | Description | +|------|------|-------------| +| subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM interesting_table`). This query must have the geometry column and the columns specified as the weight and category columns| +| weight\_column | TEXT | The name of the column to use as a weight | +| category\_column | TEXT | The name of the column to use as a category | + +### Returns + +A table with the following columns. 
+ +| Column Name | Type | Description | +|-------------|------|-------------| +| the\_geom | GEOMETRY | A point for the weighted cluster center | +| class | INTEGER | The cluster class | + +### Example Usage + +```sql +SELECT ST_TRANSFORM(the_geom, 3857) as the_geom_webmercator, class +FROM cdb_crankshaft.CDB_WeightedMean('SELECT *, customer_value FROM customers','customer_value','cluster_no') +``` diff --git a/src/pg/sql/11_kmeans.sql b/src/pg/sql/11_kmeans.sql new file mode 100644 index 0000000..73e2f1d --- /dev/null +++ b/src/pg/sql/11_kmeans.sql @@ -0,0 +1,35 @@ +-- Assign every row returned by `query` to one of no_clusters k-means clusters. +-- The work is delegated to crankshaft.clustering.kmeans(query, no_clusters, no_init). +CREATE OR REPLACE FUNCTION CDB_KMeans(query text, no_clusters integer,no_init integer default 20) +RETURNS table (cartodb_id integer, cluster_no integer) as $$ + + import plpy + plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()') + from crankshaft.clustering import kmeans + return kmeans(query,no_clusters,no_init) + +$$ language plpythonu; + +-- For each category_column value, return the weight_column-weighted mean of the_geom as an SRID 4326 point. +-- NOTE: category_column must be supplied; format('%I', NULL) raises an error despite the NULL default. +CREATE OR REPLACE FUNCTION CDB_WeightedMean(query text, weight_column text, category_column text default null ) +RETURNS table (the_geom geometry,class integer ) as $$ +BEGIN + +RETURN QUERY + EXECUTE format( $string$ + select ST_SETSRID(st_makepoint(cx, cy),4326) the_geom, class from ( + select + %I as class, + sum(st_x(the_geom)*%I)/sum(%I) cx, + sum(st_y(the_geom)*%I)/sum(%I) cy + from (%s) a + group by %I + ) q + + $string$, category_column, weight_column,weight_column,weight_column,weight_column,query, category_column + ); + -- no USING clause: the dynamic query contains no $n parameter placeholders + RETURN; +END; +$$ LANGUAGE plpgsql; diff --git a/src/pg/test/expected/05_kmeans_test.out b/src/pg/test/expected/05_kmeans_test.out new file mode 100644 index 0000000..4e6db09 --- /dev/null +++ b/src/pg/test/expected/05_kmeans_test.out @@ -0,0 +1,10 @@ +\pset format unaligned +\set ECHO all +SELECT count(DISTINCT cluster_no) as clusters from cdb_crankshaft.cdb_kmeans('select * from ppoints', 2); +clusters +2 +(1 row) +SELECT count(*) clusters from cdb_crankshaft.cdb_WeightedMean( 'select *, code::INTEGER as cluster from 
ppoints' , 'value', 'cluster' ); +clusters +52 +(1 row) diff --git a/src/pg/test/sql/05_kmeans_test.sql b/src/pg/test/sql/05_kmeans_test.sql new file mode 100644 index 0000000..a400e5e --- /dev/null +++ b/src/pg/test/sql/05_kmeans_test.sql @@ -0,0 +1,6 @@ +\pset format unaligned +\set ECHO all + +SELECT count(DISTINCT cluster_no) as clusters from cdb_crankshaft.cdb_kmeans('select * from ppoints', 2); + +SELECT count(*) clusters from cdb_crankshaft.cdb_WeightedMean( 'select *, code::INTEGER as cluster from ppoints' , 'value', 'cluster' ); diff --git a/src/py/crankshaft/crankshaft/clustering/__init__.py b/src/py/crankshaft/crankshaft/clustering/__init__.py index 0df080f..338e8ea 100644 --- a/src/py/crankshaft/crankshaft/clustering/__init__.py +++ b/src/py/crankshaft/crankshaft/clustering/__init__.py @@ -1 +1,2 @@ from moran import * +from kmeans import * diff --git a/src/py/crankshaft/crankshaft/clustering/kmeans.py b/src/py/crankshaft/crankshaft/clustering/kmeans.py new file mode 100644 index 0000000..3d9ed58 --- /dev/null +++ b/src/py/crankshaft/crankshaft/clustering/kmeans.py @@ -0,0 +1,28 @@ +from sklearn.cluster import KMeans +import plpy + +def kmeans(query, no_clusters, no_init=20): + """Cluster the points returned by `query` into `no_clusters` groups. + + `query` is interpolated verbatim as a subquery and must expose + `cartodb_id` and `the_geom`; `no_init` is passed to KMeans as n_init. + Returns a list of (cartodb_id, cluster label) pairs. + """ + data = plpy.execute('''select array_agg(cartodb_id order by cartodb_id) as ids, + array_agg(ST_X(the_geom) order by cartodb_id) xs, + array_agg(ST_Y(the_geom) order by cartodb_id) ys from ({query}) a + '''.format(query=query)) + + xs = data[0]['xs'] + ys = data[0]['ys'] + ids = data[0]['ids'] + + # array_agg over zero rows yields NULL (None): nothing to cluster + if not ids: + return [] + + km = KMeans(n_clusters= no_clusters, n_init=no_init) + # materialize the pairs: zip() is a one-shot iterator on Python 3 + labels = km.fit_predict(list(zip(xs, ys))) + return list(zip(ids, labels)) + diff --git a/src/py/crankshaft/test/fixtures/kmeans.json b/src/py/crankshaft/test/fixtures/kmeans.json new file mode 100644 index 0000000..8f31c79 --- /dev/null +++ b/src/py/crankshaft/test/fixtures/kmeans.json @@ -0,0 +1 @@ +[{"xs": [9.917239463463458, 9.042767302696836, 10.798929825304187, 8.763751051762995, 11.383882954810852, 11.018206993460897, 8.939526075734316, 
9.636159342565252, 10.136336896960058, 11.480610059427342, 12.115011910725082, 9.173267848893428, 10.239300931201738, 8.00012512174072, 8.979962292282131, 9.318376124429575, 10.82259513754284, 10.391747171927115, 10.04904588886165, 9.96007160443463, -0.78825626804569, -0.3511819898577426, -1.2796410003764271, -0.3977049391203402, 2.4792311265774667, 1.3670311632092624, 1.2963504112955613, 2.0404844103073025, -1.6439708506073223, 0.39122885445645805, 1.026031821452462, -0.04044477160482201, -0.7442346929085072, -0.34687120826243034, -0.23420359971379054, -0.5919629143336708, -0.202903054395391, -0.1893399644841902, 1.9331834251176807, -0.12321054392851609], "ys": [8.735627063679981, 9.857615954045011, 10.81439096759407, 10.586727233537191, 9.232919976568622, 11.54281262696508, 8.392787912674466, 9.355119689665944, 9.22380703532752, 10.542142541823122, 10.111980619367035, 10.760836265570738, 8.819773453269804, 10.25325722424816, 9.802077905695608, 8.955420161552611, 9.833801181904477, 10.491684241001613, 12.076108669877556, 11.74289693140474, -0.5685725015474191, -0.5715728344759778, -0.20180907868635137, 0.38431336480089595, -0.3402202083684184, -2.4652736827783586, 0.08295159401756182, 0.8503818775816505, 0.6488691600321166, 0.5794762568230527, -0.6770063922144103, -0.6557616416449478, -1.2834289177624947, 0.1096318195532717, -0.38986922166834853, -1.6224497706950238, 0.09429787743230483, 0.4005097316394031, -0.508002811195673, -1.2473463371366507], "ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]}] \ No newline at end of file diff --git a/src/py/crankshaft/test/test_cluster_kmeans.py b/src/py/crankshaft/test/test_cluster_kmeans.py new file mode 100644 index 0000000..aba8e07 --- /dev/null +++ b/src/py/crankshaft/test/test_cluster_kmeans.py @@ -0,0 +1,38 @@ +import unittest +import numpy as np + + +# from mock_plpy import MockPlPy +# plpy = MockPlPy() +# +# 
import sys +# sys.modules['plpy'] = plpy +from helper import plpy, fixture_file +import numpy as np +import crankshaft.clustering as cc +import crankshaft.pysal_utils as pu +from crankshaft import random_seeds +import json + +class KMeansTest(unittest.TestCase): + """Tests for the k-means clustering wrapper (crankshaft.clustering.kmeans)""" + + def setUp(self): + plpy._reset() + with open(fixture_file('kmeans.json')) as fixture: + self.cluster_data = json.load(fixture) + self.params = {"subquery": "select * from table", "no_clusters": "10"} + + def test_kmeans(self): + """The fixture's two 20-point groups must come back as two clusters of 20""" + data = self.cluster_data + plpy._define_result('select' ,data) + clusters = list(cc.kmeans('subquery', 2)) + labels = [a[1] for a in clusters] + c1 = [a for a in clusters if a[1]==0] + c2 = [a for a in clusters if a[1]==1] + + self.assertEqual(len(np.unique(labels)),2) + self.assertEqual(len(c1),20) + self.assertEqual(len(c2),20) +