Compare commits

..

58 Commits

Author SHA1 Message Date
Raul Ochoa
69713ecb0a Merge pull request #172 from CartoDB/fix-global-rate-stat
Fix missing comma for dict creation
2017-03-02 14:39:47 +01:00
Raul Ochoa
d07822c7a0 Fix missing comma for dict creation 2017-02-27 12:07:41 +01:00
Mario de Frutos
154d1a674d Added CLA part in the contributing document 2017-01-25 10:45:30 +01:00
Mario de Frutos
34161fd8a4 Merge pull request #152 from CartoDB/develop
Version 0.5.1
2016-12-12 14:18:05 +01:00
Mario de Frutos
850f3f6a31 Merge pull request #151 from CartoDB/fixes_050_deploy
Correct upgrade for 0.5.1 version
2016-12-12 13:59:05 +01:00
Mario de Frutos
021738d9f8 Correct upgrade for 0.5.1 version 2016-12-12 13:51:38 +01:00
Mario de Frutos
161bb14c08 Merge pull request #149 from CartoDB/develop
Release 0.5.0
2016-12-12 11:25:01 +01:00
Mario de Frutos
f8739b6a68 Version 0.5.0 release artifacts 2016-12-02 13:35:43 +01:00
Mario de Frutos
5df846fe66 Merge pull request #145 from CartoDB/adds-nonspatial-kmeans
updates internal framework for python functions
2016-12-02 13:23:18 +01:00
Mario de Frutos
b9c4e6e8ef Merge branch 'develop' into adds-nonspatial-kmeans 2016-12-02 13:09:59 +01:00
Mario de Frutos
5c34e08c7d Remove old configuration for postgresql 9.5 in travis 2016-12-02 13:09:09 +01:00
Andy Eschbacher
59dc9434f7 moves getis to class-based framework 2016-12-01 17:06:21 -05:00
Mario de Frutos
2c6fcfc294 Merge branch 'develop' into adds-nonspatial-kmeans 2016-12-01 16:26:52 +01:00
Andy Eschbacher
6ab1c285d9 places query gen in kmeans data provider 2016-11-30 10:08:36 -05:00
Andy Eschbacher
b22f79b0cc Merge branch 'develop' into adds-nonspatial-kmeans 2016-11-29 10:17:18 -05:00
Andy Eschbacher
db501a2f02 move query generation to inside analysis data provider class 2016-11-22 15:20:14 +00:00
Andy Eschbacher
6fe4fc9668 rename queryrunner in tests 2016-11-22 09:58:06 -05:00
Andy Eschbacher
280a5193ef rename queryrunner to analysisdataprovider 2016-11-22 09:32:39 -05:00
Andy Eschbacher
c27ec58948 Merge branch 'adds-nonspatial-kmeans' of https://github.com/CartoDB/crankshaft into adds-nonspatial-kmeans 2016-11-21 16:26:37 +00:00
Mario de Frutos
bb3ff43f0f Update .travis.yml 2016-11-21 17:25:08 +01:00
Andy Eschbacher
2f27622a6d strips out kmeans non spatial 2016-11-21 16:19:54 +00:00
Andy Eschbacher
c5a2746a53 Merge branch 'develop' into adds-nonspatial-kmeans 2016-11-21 15:46:44 +00:00
Andy Eschbacher
c8f5448b7c seprates out query runner 2016-11-19 14:20:06 +00:00
Andy Eschbacher
224fbc2fc5 move to class based markov 2016-11-19 09:05:35 +00:00
Andy Eschbacher
2738c1f29c move to class-based module 2016-11-18 17:46:55 +00:00
Andy Eschbacher
a8bd122762 remove mock plpy dependencies 2016-11-18 17:46:29 +00:00
Andy Eschbacher
a9add4b49c rename results file 2016-11-18 17:40:57 +00:00
Andy Eschbacher
83f1900512 creates class-based approach to analysis methods 2016-11-18 17:26:24 +00:00
Andy Eschbacher
7eee4faac1 rename to match numbering elsewhere 2016-11-18 17:22:02 +00:00
Andy Eschbacher
84d33d841f tests for new class 2016-11-15 12:03:54 +01:00
Andy Eschbacher
ded26dc46b adding class for database response 2016-11-15 12:03:24 +01:00
Andy Eschbacher
0d40080f6c move back to colnames 2016-11-15 12:02:42 +01:00
Andy Eschbacher
0867e69d1f replace plpy method colnames 2016-11-15 11:19:15 +01:00
Andy Eschbacher
cbe8571546 fixes argument in not-standardize 2016-11-15 10:10:07 +01:00
Andy Eschbacher
af536757fe adds silhouettes to output 2016-11-14 23:29:38 +00:00
Andy Eschbacher
b6dae5e380 adding silhouette 2016-11-15 00:15:23 +01:00
Andy Eschbacher
64c4b6611c changes cluster centers to json 2016-11-10 16:56:04 +00:00
Andy Eschbacher
a188b2e104 adds missing arguments 2016-10-21 15:51:54 -06:00
Andy Eschbacher
4389c9538d small updates for readability 2016-10-21 10:13:21 -06:00
Andy Eschbacher
3c6d73b7e2 Merge branch 'adds-nonspatial-kmeans' of https://github.com/CartoDB/crankshaft into adds-nonspatial-kmeans 2016-10-18 21:14:09 -06:00
Andy Eschbacher
3e0dba3522 update comments 2016-10-18 21:13:34 -06:00
Andy Eschbacher
5d8641732f change string formatting 2016-10-18 19:30:09 +00:00
Andy Eschbacher
f0c6cca766 fix key name 2016-10-18 13:05:56 -06:00
Andy Eschbacher
f800a35fd1 new format for input data 2016-10-18 13:01:31 -06:00
Andy Eschbacher
54bbd18b02 remove unneeded modules from test script 2016-10-18 12:12:38 -06:00
Andy Eschbacher
da23b002cf rename to match submodule name 2016-10-18 11:51:53 -06:00
Andy Eschbacher
a370a2da52 pep8 updates of test file 2016-10-18 11:50:59 -06:00
Andy Eschbacher
5404589058 Merge branch 'adds-nonspatial-kmeans' of https://github.com/CartoDB/crankshaft into adds-nonspatial-kmeans 2016-10-13 12:52:07 -04:00
Andy Eschbacher
b255fd3e06 make private functions more explictly private 2016-10-13 12:50:46 -04:00
Andy Eschbacher
0feaf36cf6 outputting consistent labels and centers 2016-10-13 15:52:00 +00:00
Andy Eschbacher
5d2a1881b1 make numpy with global scope in module 2016-10-13 15:00:28 +00:00
Andy Eschbacher
a95423174c adds back alias for kmeans removed by accident 2016-10-13 10:50:48 -04:00
Andy Eschbacher
4314f0f066 adds more robust data processing 2016-10-13 10:28:29 -04:00
Andy Eschbacher
c2e2359e65 addes minmax scaling for variables 2016-10-12 17:16:52 -04:00
Andy Eschbacher
361505fca9 fixes syntax errors 2016-10-12 21:13:51 +00:00
Andy Eschbacher
c47116571f properly close plpgsql function 2016-10-12 14:19:19 -04:00
Andy Eschbacher
3e1cef9958 fix output signature 2016-10-11 16:48:22 -04:00
Andy Eschbacher
947d6ba798 first add 2016-10-11 16:38:18 -04:00
89 changed files with 13024 additions and 825 deletions

View File

@@ -55,3 +55,7 @@ sudo make install
# Run the tests against the installed extension.
make test
```
## Submitting contributions
Before opening a pull request (or submitting any other contribution) you will need to sign a Contributor License Agreement (CLA); [learn more here](https://carto.com/contributions).

View File

@@ -1,3 +1,11 @@
0.5.0 (2016-12-15)
------------------
* Updated PULL_REQUEST_TEMPLATE
* Fixed a bug that flipped the order of the numerator and denominator when calculating Moran Local Rate, because the code previously sorted the keys alphabetically.
* Add new CDB_GetisOrdsG functions. Getis-Ord's G\* is a geo-statistical measurement of the intensity of clustering of high or low values
* Add new outlier detection functions: CDB_StaticOutlier, CDB_PercentOutlier and CDB_StdDevOutlier
* Updates in the framework for accessing the Python functions.
0.4.2 (2016-09-22)
------------------
* Bugfix for cdb_areasofinterestglobal: import correct modules

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,5 @@
comment = 'CartoDB Spatial Analysis extension'
default_version = '0.4.2'
default_version = '0.5.1'
requires = 'plpythonu, postgis'
superuser = true
schema = cdb_crankshaft

View File

@@ -0,0 +1,6 @@
"""Import all modules"""
import crankshaft.random_seeds
import crankshaft.clustering
import crankshaft.space_time_dynamics
import crankshaft.segmentation
import analysis_data_provider

View File

@@ -0,0 +1,67 @@
"""class for fetching data"""
import plpy
import pysal_utils as pu
class AnalysisDataProvider:
    """Run the SQL queries that feed the analysis classes.

    Each method executes its query through ``plpy.execute`` and reports a
    ``plpy.SPIError`` through ``plpy.error``.
    """

    def get_getis(self, w_type, params):
        """fetch data for getis ord's g

        @param w_type: neighbor type handed to pu.construct_neighbor_query
        @param params: values used to fill in the neighbor query
        Returns the query result, or pu.empty_zipped_array(4) when the
        query produces no rows.
        """
        try:
            query = pu.construct_neighbor_query(w_type, params)
            result = plpy.execute(query)
            # if there are no neighbors, exit
            if len(result) == 0:
                return pu.empty_zipped_array(4)
            return result
        except plpy.SPIError as err:
            plpy.error('Analysis failed: %s' % err)

    def get_markov(self, w_type, params):
        """fetch data for spatial markov

        @param w_type: neighbor type handed to pu.construct_neighbor_query
        @param params: values used to fill in the neighbor query
        Returns the query result, or pu.empty_zipped_array(4) when the
        query produces no rows.
        """
        try:
            query = pu.construct_neighbor_query(w_type, params)
            data = plpy.execute(query)
            if len(data) == 0:
                return pu.empty_zipped_array(4)
            return data
        except plpy.SPIError as err:
            plpy.error('Analysis failed: %s' % err)

    def get_moran(self, w_type, params):
        """fetch data for moran's i analyses

        @param w_type: neighbor type handed to pu.construct_neighbor_query
        @param params: values used to fill in the neighbor query
        Returns the query result, or pu.empty_zipped_array(2) when the
        query produces no rows.
        """
        try:
            query = pu.construct_neighbor_query(w_type, params)
            data = plpy.execute(query)
            # if there are no neighbors, exit
            if len(data) == 0:
                return pu.empty_zipped_array(2)
            return data
        except plpy.SPIError as err:
            # report the caught error itself; the handler previously
            # formatted an undefined name and died with a NameError
            plpy.error('Analysis failed: %s' % err)
            # only reached if plpy.error returns instead of raising
            return pu.empty_zipped_array(2)

    def get_nonspatial_kmeans(self, query):
        """fetch data for non-spatial kmeans

        @param query: complete SQL statement to execute as-is
        """
        try:
            data = plpy.execute(query)
            return data
        except plpy.SPIError as err:
            plpy.error('Analysis failed: %s' % err)

    def get_spatial_kmeans(self, params):
        """fetch data for spatial kmeans

        @param params dict: must provide id_col, geom_col and subquery
        Returns one row holding the arrays ids, xs and ys, each ordered
        by id_col; rows with a NULL geometry are left out.
        """
        query = ("SELECT "
                 "array_agg({id_col} ORDER BY {id_col}) as ids,"
                 "array_agg(ST_X({geom_col}) ORDER BY {id_col}) As xs,"
                 "array_agg(ST_Y({geom_col}) ORDER BY {id_col}) As ys "
                 "FROM ({subquery}) As a "
                 "WHERE {geom_col} IS NOT NULL").format(**params)
        try:
            data = plpy.execute(query)
            return data
        except plpy.SPIError as err:
            plpy.error('Analysis failed: %s' % err)

View File

@@ -0,0 +1,4 @@
"""Import all functions for clustering"""
from moran import *
from kmeans import *
from getis import *

View File

@@ -0,0 +1,50 @@
"""
Getis-Ord's G geostatistics (hotspot/coldspot analysis)
"""
import pysal as ps
from collections import OrderedDict
# crankshaft modules
import crankshaft.pysal_utils as pu
from crankshaft.analysis_data_provider import AnalysisDataProvider
# High level interface ---------------------------------------
class Getis:
    """Getis-Ord's G* hotspot/coldspot analysis on top of a data provider."""

    def __init__(self, data_provider=None):
        """Keep `data_provider`, or create an AnalysisDataProvider if None."""
        self.data_provider = (AnalysisDataProvider()
                              if data_provider is None
                              else data_provider)

    def getis_ord(self, subquery, attr,
                  w_type, num_ngbrs, permutations, geom_col, id_col):
        """
        Getis-Ord's G*
        Neighbors are built by the data provider's query; the statistic
        itself comes from PySAL's G_Local with star=True.
        Andy Eschbacher

        Returns tuples of (z_sim, p_sim, p_z_sim, id), zipped against the
        weight object's id_order.
        """
        # geometries with attributes that are null are ignored
        # resulting in a collection of not as near neighbors if kNN is chosen
        query_params = OrderedDict()
        query_params["id_col"] = id_col
        query_params["attr1"] = attr
        query_params["geom_col"] = geom_col
        query_params["subquery"] = subquery
        query_params["num_ngbrs"] = num_ngbrs

        rows = self.data_provider.get_getis(w_type, query_params)
        values = pu.get_attributes(rows)

        # build PySAL weight object
        w_obj = pu.get_weight(rows, w_type, num_ngbrs)

        # calculate Getis-Ord's G* z- and p-values
        g_local = ps.esda.getisord.G_Local(values, w_obj,
                                           star=True,
                                           permutations=permutations)
        return zip(g_local.z_sim, g_local.p_sim, g_local.p_z_sim,
                   w_obj.id_order)

View File

@@ -0,0 +1,32 @@
from sklearn.cluster import KMeans
import numpy as np
from crankshaft.analysis_data_provider import AnalysisDataProvider
class Kmeans:
    """K-means clustering on top of a data provider."""

    def __init__(self, data_provider=None):
        """Keep `data_provider`, or create an AnalysisDataProvider if None."""
        self.data_provider = (AnalysisDataProvider()
                              if data_provider is None
                              else data_provider)

    def spatial(self, query, no_clusters, no_init=20):
        """
        find centers based on clusters of latitude/longitude pairs
        query: SQL query that has a WGS84 geometry (the_geom)
        no_clusters: passed to KMeans as n_clusters
        no_init: passed to KMeans as n_init

        Returns (id, label) pairs.
        """
        response = self.data_provider.get_spatial_kmeans(
            {"subquery": query,
             "geom_col": "the_geom",
             "id_col": "cartodb_id"})

        # Unpack query response
        xs = response[0]['xs']
        ys = response[0]['ys']
        ids = response[0]['ids']

        clusterer = KMeans(n_clusters=no_clusters, n_init=no_init)
        cluster_labels = clusterer.fit_predict(zip(xs, ys))
        return zip(ids, cluster_labels)

View File

@@ -0,0 +1,208 @@
"""
Moran's I geostatistics (global clustering & outliers presence)
"""
# TODO: Fill in local neighbors which have null/NoneType values with the
# average of their neighborhood
import pysal as ps
from collections import OrderedDict
from crankshaft.analysis_data_provider import AnalysisDataProvider
# crankshaft module
import crankshaft.pysal_utils as pu
# High level interface ---------------------------------------
class Moran:
    """Moran's I statistics (global, local, rate and bivariate).

    Every method builds an ordered parameter mapping, asks the data
    provider's get_moran for attributes and neighbors, and hands the
    result to the matching PySAL class.
    """

    def __init__(self, data_provider=None):
        """Keep `data_provider`, or create an AnalysisDataProvider if None."""
        if data_provider is None:
            self.data_provider = AnalysisDataProvider()
        else:
            self.data_provider = data_provider

    def global_stat(self, subquery, attr_name,
                    w_type, num_ngbrs, permutations, geom_col, id_col):
        """
        Moran's I (global)
        Implementation building neighbors with a PostGIS database and Moran's I
        core clusters with PySAL.
        Andy Eschbacher

        Returns a single (I, EI) pair.
        """
        params = OrderedDict([("id_col", id_col),
                              ("attr1", attr_name),
                              ("geom_col", geom_col),
                              ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_moran(w_type, params)

        # collect attributes
        attr_vals = pu.get_attributes(result)

        # calculate weights
        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate moran global
        moran_global = ps.esda.moran.Moran(attr_vals, weight,
                                           permutations=permutations)

        return zip([moran_global.I], [moran_global.EI])

    def local_stat(self, subquery, attr,
                   w_type, num_ngbrs, permutations, geom_col, id_col):
        """
        Moran's I implementation for PL/Python
        Andy Eschbacher

        Returns tuples of (Is, quadrant label, p_sim, id, y).
        """
        # geometries with attributes that are null are ignored
        # resulting in a collection of not as near neighbors
        params = OrderedDict([("id_col", id_col),
                              ("attr1", attr),
                              ("geom_col", geom_col),
                              ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_moran(w_type, params)

        attr_vals = pu.get_attributes(result)
        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate LISA values
        lisa = ps.esda.moran.Moran_Local(attr_vals, weight,
                                         permutations=permutations)

        # find quadrants for each geometry
        quads = quad_position(lisa.q)

        return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)

    def global_rate_stat(self, subquery, numerator, denominator,
                         w_type, num_ngbrs, permutations, geom_col, id_col):
        """
        Moran's I Rate (global)
        Andy Eschbacher

        Returns a single (I, EI) pair.
        """
        # NOTE(review): local_rate_stat keys these columns as
        # 'numerator'/'denominator' while this method uses
        # 'attr1'/'attr2' -- confirm the difference is intended.
        params = OrderedDict([("id_col", id_col),
                              ("attr1", numerator),
                              ("attr2", denominator),
                              ("geom_col", geom_col),
                              ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_moran(w_type, params)

        # collect attributes
        numer = pu.get_attributes(result, 1)
        denom = pu.get_attributes(result, 2)

        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate moran global rate
        lisa_rate = ps.esda.moran.Moran_Rate(numer, denom, weight,
                                             permutations=permutations)

        return zip([lisa_rate.I], [lisa_rate.EI])

    def local_rate_stat(self, subquery, numerator, denominator,
                        w_type, num_ngbrs, permutations, geom_col, id_col):
        """
        Moran's I Local Rate
        Andy Eschbacher

        Returns tuples of (Is, quadrant label, p_sim, id, y).
        """
        # geometries with values that are null are ignored
        # resulting in a collection of not as near neighbors
        params = OrderedDict([("id_col", id_col),
                              ("numerator", numerator),
                              ("denominator", denominator),
                              ("geom_col", geom_col),
                              ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_moran(w_type, params)

        # collect attributes
        numer = pu.get_attributes(result, 1)
        denom = pu.get_attributes(result, 2)

        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate LISA values
        lisa = ps.esda.moran.Moran_Local_Rate(numer, denom, weight,
                                              permutations=permutations)

        # find quadrants for each geometry
        quads = quad_position(lisa.q)

        return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)

    def local_bivariate_stat(self, subquery, attr1, attr2,
                             permutations, geom_col, id_col,
                             w_type, num_ngbrs):
        """
        Moran's I (local) Bivariate (untested)

        Returns tuples of (Is, quadrant label, p_sim, id).
        """
        params = OrderedDict([("id_col", id_col),
                              ("attr1", attr1),
                              ("attr2", attr2),
                              ("geom_col", geom_col),
                              ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_moran(w_type, params)

        # collect attributes
        attr1_vals = pu.get_attributes(result, 1)
        attr2_vals = pu.get_attributes(result, 2)

        # create weights
        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate LISA values
        lisa = ps.esda.moran.Moran_Local_BV(attr1_vals, attr2_vals, weight,
                                            permutations=permutations)

        # find clustering of significance
        lisa_sig = quad_position(lisa.q)

        return zip(lisa.Is, lisa_sig, lisa.p_sim, weight.id_order)
# Low level functions ----------------------------------------
def map_quads(coord):
    """
    Translate a quadrant number into its Moran's I label
    (1 -> 'HH', 2 -> 'LH', 3 -> 'LL', 4 -> 'HL')
    Input:
    @param coord (int): quadrant of a specific measurement
    Output:
    one of 'HH', 'LH', 'LL', 'HL', or None for any other value
    """
    labels = ((1, 'HH'), (2, 'LH'), (3, 'LL'), (4, 'HL'))
    # compare with == in the original order so any input type behaves
    # exactly as it did with the if/elif chain
    for quadrant, label in labels:
        if coord == quadrant:
            return label
    return None
def quad_position(quads):
    """
    Label every quadrant number in `quads` via map_quads
    Input:
    @param quads ndarray: an array of quads classified by
    1-4 (PySAL default)
    Output:
    list of labels ('HH', 'LL', etc.), one per input entry
    """
    labelled = []
    for quadrant in quads:
        labelled.append(map_quads(quadrant))
    return labelled

View File

@@ -0,0 +1,2 @@
"""Import all functions for pysal_utils"""
from crankshaft.pysal_utils.pysal_utils import *

View File

@@ -0,0 +1,211 @@
"""
Utilities module for generic PySAL functionality, mainly centered on
translating queries into numpy arrays or PySAL weights objects
"""
import numpy as np
import pysal as ps
def construct_neighbor_query(w_type, query_vals):
    """Return query (a string) used for finding neighbors
    @param w_type text: type of neighbors to calculate; 'knn' (any
    letter case) selects the k-nearest-neighbors query, every other
    value selects the queen query
    @param query_vals dict: values used to construct the query
    """
    builder = knn if w_type.lower() == 'knn' else queen
    return builder(query_vals)
# Build weight object
def get_weight(query_res, w_type='knn', num_ngbrs=5):
    """
    Construct PySAL weight from return value of query
    @param query_res dict-like: query results; each row is read for its
    'id' and 'neighbors' entries
    @param w_type text: not used here; kept for existing callers
    @param num_ngbrs int: not used here; kept for existing callers
    Output:
    ps.W object built from the id -> neighbors mapping, with its
    transform set to 'r'
    """
    neighbors = {x['id']: x['neighbors'] for x in query_res}

    built_weight = ps.W(neighbors)
    built_weight.transform = 'r'

    return built_weight
def query_attr_select(params):
    """
    Build the attribute part of the SELECT statement, aliasing each
    column as attr1, attr2, ... in the order the columns are found.
    @param params: dict of information used in query (column names,
    table name, etc.). When it has a 'time_cols' entry that list
    supplies the columns (markov analysis); otherwise every value
    whose key is not id_col, geom_col, subquery or num_ngbrs is used,
    in the dict's iteration order (moran's analysis).
    Example:
    OrderedDict([('numerator', 'price'),
    ('denominator', 'sq_meters'),
    ('subquery', 'SELECT * FROM interesting_data')])
    Output:
    "i.\"price\"::numeric As attr1, " \
    "i.\"sq_meters\"::numeric As attr2, "
    """
    template = "i.\"%(col)s\"::numeric As attr%(alias_num)s, "

    if 'time_cols' in params:
        columns = params['time_cols']
    else:
        skipped = ('id_col', 'geom_col', 'subquery', 'num_ngbrs')
        columns = [params[key] for key in params if key not in skipped]

    return "".join(template % {"col": column, "alias_num": position}
                   for position, column in enumerate(columns, 1))
def query_attr_where(params):
    """
    Build the WHERE conditions that drop rows with NULL attributes when
    constructing the neighbors query. The alias is left as the
    placeholder idx_replace for the caller to substitute.
    Input: dict of params:
    {'subquery': ...,
    'numerator': 'data1',
    'denominator': 'data2',
    '': ...}
    Output:
    'idx_replace."data1" IS NOT NULL AND idx_replace."data2" IS NOT NULL
    AND idx_replace."data2" <> 0'
    (the <> 0 condition is added only when a 'denominator' key exists)
    Input:
    {'subquery': ...,
    'time_cols': ['time1', 'time2', 'time3'],
    'etc': ...}
    Output: 'idx_replace."time1" IS NOT NULL AND idx_replace."time2" IS NOT
    NULL AND idx_replace."time3" IS NOT NULL'
    """
    template = "idx_replace.\"%s\" IS NOT NULL"

    if 'time_cols' in params:
        # markov where clauses
        return " AND ".join([template % col for col in params['time_cols']])

    # moran where clauses
    skipped = ('id_col', 'geom_col', 'subquery', 'num_ngbrs')
    keys = [key for key in params if key not in skipped]
    conditions = [template % params[key] for key in keys]

    if 'denominator' in keys:
        conditions.append(
            "idx_replace.\"%s\" <> 0" % params['denominator'])

    return " AND ".join(conditions)
def knn(params):
    """SQL query for k-nearest neighbors.
    @param params dict: values to fill the template; id_col, geom_col,
    subquery and num_ngbrs are read directly, and the attribute
    fragments come from query_attr_select and query_attr_where
    """
    select_part = query_attr_select(params)
    where_part = query_attr_where(params)

    fill = {
        "attr_select": select_part,
        "attr_where_i": where_part.replace("idx_replace", "i"),
        "attr_where_j": where_part.replace("idx_replace", "j"),
    }

    template = (
        'SELECT '
        'i."{id_col}" As id, '
        '%(attr_select)s'
        '(SELECT ARRAY(SELECT j."{id_col}" '
        'FROM ({subquery}) As j '
        'WHERE '
        'i."{id_col}" <> j."{id_col}" AND '
        '%(attr_where_j)s '
        'ORDER BY '
        'j."{geom_col}" <-> i."{geom_col}" ASC '
        'LIMIT {num_ngbrs})'
        ') As neighbors '
        'FROM ({subquery}) As i '
        'WHERE '
        '%(attr_where_i)s '
        'ORDER BY i."{id_col}" ASC;'
    )

    # %-substitution inserts the attribute fragments first, then
    # str.format fills the {placeholders} from params
    return (template % fill).format(**params)
# SQL query for finding queens neighbors (all contiguous polygons)
def queen(params):
    """SQL query for queen neighbors (rows whose geometries ST_Touches).
    @param params dict: information to fill query; id_col, geom_col and
    subquery are read directly, and the attribute fragments come from
    query_attr_select and query_attr_where
    """
    select_part = query_attr_select(params)
    where_part = query_attr_where(params)

    fill = {
        "attr_select": select_part,
        "attr_where_i": where_part.replace("idx_replace", "i"),
        "attr_where_j": where_part.replace("idx_replace", "j"),
    }

    template = (
        'SELECT '
        'i."{id_col}" As id, '
        '%(attr_select)s'
        '(SELECT ARRAY(SELECT j."{id_col}" '
        'FROM ({subquery}) As j '
        'WHERE i."{id_col}" <> j."{id_col}" AND '
        'ST_Touches(i."{geom_col}", j."{geom_col}") AND '
        '%(attr_where_j)s)'
        ') As neighbors '
        'FROM ({subquery}) As i '
        'WHERE '
        '%(attr_where_i)s '
        'ORDER BY i."{id_col}" ASC;'
    )

    # %-substitution inserts the attribute fragments first, then
    # str.format fills the {placeholders} from params
    return (template % fill).format(**params)
# to add more weight methods open a ticket or pull request
def get_attributes(query_res, attr_num=1):
    """
    Collect one attribute column from the query results as a float array
    @param query_res: query results with attributes and neighbors
    @param attr_num: attribute number (1, 2, ...); selects the
    'attr<attr_num>' entry of every row
    Output:
    1D float numpy array with one value per row
    """
    key = 'attr' + str(attr_num)
    # builtin float is the dtype the removed np.float alias pointed to
    return np.array([x[key] for x in query_res], dtype=float)
def empty_zipped_array(num_nones):
    """
    Placeholder return value for empty weights objects (no neighbors):
    a list holding a single all-None tuple
    Input:
    @param num_nones int: number of columns (e.g., 4)
    Output:
    [(None, None, None, None)]
    """
    placeholder_row = (None,) * num_nones
    return [placeholder_row]

View File

@@ -0,0 +1,11 @@
"""Random seed generator used for non-deterministic functions in crankshaft"""
import random
import numpy
def set_random_seeds(value):
    """
    Seed both RNGs (Random Number Generators) used internally --
    the stdlib `random` module first, then `numpy.random` --
    with the same value.
    """
    for seed_rng in (random.seed, numpy.random.seed):
        seed_rng(value)

View File

@@ -0,0 +1 @@
from segmentation import *

View File

@@ -0,0 +1,176 @@
"""
Segmentation creation and prediction
"""
import sklearn
import numpy as np
import plpy
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics
from sklearn.cross_validation import train_test_split
# Lower level functions
#----------------------
def replace_nan_with_mean(array):
    """
    Fill NaN entries with the mean of the non-NaN values, in place.
    Input:
    @param array: a 1D or 2D numpy array of floats which may have
    null-valued (NaN) entries
    Output:
    the same array object; a 1D array has its NaNs replaced by the
    mean of its other entries, a 2D array has each NaN replaced by
    the mean of the other entries in the same column
    """
    nan_mask = np.isnan(array)
    if not nan_mask.any():
        return array

    if nan_mask.ndim == 1:
        # a 1D array has no column index to unpack, so fill it directly
        array[nan_mask] = np.mean(array[~nan_mask])
        return array

    # visit only the columns that actually contain a NaN
    for col in np.where(nan_mask.any(axis=0))[0]:
        missing = nan_mask[:, col]
        array[missing, col] = np.mean(array[~missing, col])
    return array
def get_data(variable, feature_columns, query):
    """
    Fetch data from the database, clean, and package into
    numpy arrays
    Input:
    @param variable: name of the target variable
    @param feature_columns: iterable of column names
    @param query: subquery that data is pulled from for the packaging
    Output:
    (target, features): target is the array built from the target
    column and features stacks one column per feature; both have
    been passed through replace_nan_with_mean
    """
    # materialize once so the SELECT list and the column stack below
    # walk the columns in the same order
    feature_columns = list(feature_columns)

    columns = ','.join(['array_agg("{col}") As "{col}"'.format(col=col) for col in feature_columns])

    try:
        data = plpy.execute('''SELECT array_agg("{variable}") As target, {columns} FROM ({query}) As a'''.format(
            variable=variable,
            columns=columns,
            query=query))
    except Exception as e:
        plpy.error('Failed to access data to build segmentation model: %s' % e)

    # extract target data from plpy object; dtype=float turns NULLs into
    # NaN, matching how the feature columns are read
    target = np.array(data[0]['target'], dtype=float)

    # put n feature data arrays into an n x m array of arrays
    features = np.column_stack([np.array(data[0][col], dtype=float) for col in feature_columns])

    return replace_nan_with_mean(target), replace_nan_with_mean(features)
# High level interface
# --------------------
def create_and_predict_segment_agg(target, features, target_features, target_ids, model_parameters):
    """
    Version of create_and_predict_segment that works on arrays that come
    straight from the SQL calling the function.

    Input:
    @param target: The 1D array of length NSamples containing the target variable we want the model to predict
    @param features: The 2D array of size NSamples * NFeatures that forms the input to the model
    @param target_features: The feature array the trained model predicts on
    @param target_ids: A 1D array of target_ids that will be used to associate the results of the prediction with the rows which they come from
    @param model_parameters: A dictionary containing parameters for the model.
    Output:
    tuples of (target id, prediction, accuracy)
    """
    training_target = replace_nan_with_mean(target)
    training_features = replace_nan_with_mean(features)
    target_features = replace_nan_with_mean(target_features)

    # 0.2 is the test_split handed to train_model
    model, accuracy = train_model(training_target, training_features,
                                  model_parameters, 0.2)
    prediction = model.predict(target_features)

    # repeat the single accuracy value once per prediction
    accuracy_array = [accuracy] * prediction.shape[0]
    accuracy_column = np.full(prediction.shape, accuracy_array)
    return zip(target_ids, prediction, accuracy_column)
def create_and_predict_segment(query, variable, target_query, model_params):
    """
    generate a segment with machine learning
    Stuart Lynn

    @param query: subquery supplying the training rows
    @param variable: name of the target column in `query`
    @param target_query: subquery supplying the rows to predict on
    @param model_params: dictionary of parameters handed to train_model
    Output:
    tuples of (cartodb_id, prediction, accuracy)
    """
    # fetch column names from a single row of the training query
    try:
        sample = plpy.execute('SELECT * FROM ({query}) As a LIMIT 1 '.format(query=query))
        columns = sample[0].keys()
    except Exception as e:
        plpy.error('Failed to build segmentation model: %s' % e)

    # every column except the target and the CARTO bookkeeping columns
    # is used as a feature
    excluded = [variable, 'cartodb_id', 'the_geom', 'the_geom_webmercator']
    feature_columns = set(columns).difference(excluded)

    # get data from database
    target, features = get_data(variable, feature_columns, query)

    # 0.2 is the test_split handed to train_model
    model, accuracy = train_model(target, features, model_params, 0.2)
    cartodb_ids, result = predict_segment(model, feature_columns, target_query)

    # repeat the single accuracy value once per prediction
    accuracy_array = [accuracy] * result.shape[0]
    return zip(cartodb_ids, result, accuracy_array)
def train_model(target, features, model_params, test_split):
    """
    Train the Gradient Boosting model on the provided data and calculate the accuracy of the model
    Input:
    @param target: 1D Array of the variable that the model is to be trained to predict
    @param features: 2D Array NSamples * NFeatures to use in training the model
    @param model_params: A dictionary of model parameters, the full specification can be found on the
    scikit learn page for [GradientBoostingRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
    @param test_split: The fraction of the data to be withheld for testing the model / calculating the accuracy
    Output:
    (model, accuracy): the fitted model and the value returned by
    calculate_model_accuracy for the withheld rows
    """
    features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=test_split)
    model = GradientBoostingRegressor(**model_params)
    model.fit(features_train, target_train)
    # score on the withheld rows only; scoring on the full data set would
    # mostly measure how well the model fits rows it was trained on
    accuracy = calculate_model_accuracy(model, features_test, target_test)
    return model, accuracy
def calculate_model_accuracy(model, features, target):
    """
    Calculate the mean squared error of the model prediction
    Input:
    @param model: model trained from input features
    @param features: features to make a prediction from
    @param target: target to compare prediction to
    Output:
    mean squared error of the model prediction compared to the target
    """
    predicted = model.predict(features)
    error = metrics.mean_squared_error(predicted, target)
    return error
def predict_segment(model, features, target_query):
    """
    Use the provided model to predict the values for the new feature set
    Input:
    @param model: The pretrained model
    @param features: A list of features to use in the model prediction (list of column names)
    @param target_query: The query to run to obtain the data to predict on and the cartodb_ids associated with it.
    Output:
    (cartodb_ids, predictions): the ids ordered by cartodb_id and a
    numpy array with one prediction per id, in the same order
    """
    batch_size = 1000
    joined_features = ','.join(['"{0}"::numeric'.format(a) for a in features])

    try:
        # ORDER BY keeps the feature rows in the same order as the ids
        # fetched below; without it the row order is unspecified and
        # predictions could be paired with the wrong ids
        cursor = plpy.cursor('SELECT Array[{joined_features}] As features FROM ({target_query}) As a ORDER BY a.cartodb_id'.format(
            joined_features=joined_features,
            target_query=target_query))
    except Exception as e:
        plpy.error('Failed to build segmentation model: %s' % e)

    results = []

    while True:
        rows = cursor.fetch(batch_size)
        if not rows:
            break
        batch = np.row_stack([np.array(row['features'], dtype=float) for row in rows])

        # Need to fix this. Should be global mean. This will cause weird effects
        batch = replace_nan_with_mean(batch)
        prediction = model.predict(batch)
        results.append(prediction)

    try:
        cartodb_ids = plpy.execute('''SELECT array_agg(cartodb_id ORDER BY cartodb_id) As cartodb_ids FROM ({0}) As a'''.format(target_query))[0]['cartodb_ids']
    except Exception as e:
        plpy.error('Failed to build segmentation model: %s' % e)

    if not results:
        # the target query had no rows: np.concatenate rejects an empty
        # list and array_agg yields NULL, so return empty containers
        return cartodb_ids or [], np.array([], dtype=float)

    return cartodb_ids, np.concatenate(results)

View File

@@ -0,0 +1,2 @@
"""Import all functions from clustering libraries."""
from markov import *

View File

@@ -0,0 +1,194 @@
"""
Spatial dynamics measurements using Spatial Markov
"""
# TODO: remove all plpy dependencies
import numpy as np
import pysal as ps
import plpy
import crankshaft.pysal_utils as pu
from crankshaft.analysis_data_provider import AnalysisDataProvider
class Markov:
    """Spatial Markov trend analysis on top of a data provider."""

    def __init__(self, data_provider=None):
        """Keep `data_provider`, or create an AnalysisDataProvider if None."""
        self.data_provider = (AnalysisDataProvider()
                              if data_provider is None
                              else data_provider)

    def spatial_trend(self, subquery, time_cols, num_classes=7,
                      w_type='knn', num_ngbrs=5, permutations=0,
                      geom_col='the_geom', id_col='cartodb_id'):
        """
        Predict the trends of a unit based on:
        1. history of its transitions to different classes (e.g., 1st
        quantile -> 2nd quantile)
        2. average class of its neighbors

        Inputs:
        @param subquery string: e.g., SELECT the_geom, cartodb_id,
        interesting_time_column FROM table_name
        @param time_cols list of strings: list of strings of column names
        @param num_classes (optional): number of classes to break
        distribution of values into. Currently uses quantile bins.
        @param w_type string (optional): weight type ('knn' or 'queen')
        @param num_ngbrs int (optional): number of neighbors (if knn type)
        @param permutations int (optional): number of permutations for test
        stats
        @param geom_col string (optional): name of column which contains
        the geometries
        @param id_col string (optional): name of column which has the ids
        of the table

        Outputs (zipped as trend, trend_up, trend_down, volatility, id):
        @param trend_up float: probability that a geom will move to a higher
        class
        @param trend_down float: probability that a geom will move to a
        lower class
        @param trend float: (trend_up - trend_down) / trend_static
        @param volatility float: a measure of the volatility based on
        probability stddev(prob array)
        """
        if len(time_cols) < 2:
            plpy.error('More than one time column needs to be passed')

        query_params = {"id_col": id_col,
                        "time_cols": time_cols,
                        "geom_col": geom_col,
                        "subquery": subquery,
                        "num_ngbrs": num_ngbrs}

        rows = self.data_provider.get_markov(w_type, query_params)

        # build weight
        w_obj = pu.get_weight(rows, w_type)
        w_obj.transform = 'r'

        # prep time data
        time_matrix = get_time_data(rows, time_cols)

        markov = ps.Spatial_Markov(time_matrix,
                                   w_obj,
                                   k=num_classes,
                                   fixed=False,
                                   permutations=permutations)

        # get lag classes, taken from the last time column
        spatial_lag = ps.lag_spatial(w_obj, time_matrix[:, -1])
        lag_classes = ps.Quantiles(spatial_lag, k=num_classes).yb

        # look up probability distribution for each unit according to class
        # and lag class
        prob_dist = get_prob_dist(markov.P,
                                  lag_classes,
                                  markov.classes[:, -1])

        # find the ups and down and overall distribution of each cell
        stats = get_prob_stats(prob_dist, markov.classes[:, -1])
        trend_up, trend_down, trend, volatility = stats

        # output the results
        return zip(trend, trend_up, trend_down, volatility, w_obj.id_order)
def get_time_data(markov_data, time_cols):
    """
    Pull the attr1..attrN values out of the query rows, where N is the
    number of time columns, and return them as a float array with one
    row per query row and one column per time column
    """
    per_column = []
    for position in range(1, len(time_cols) + 1):
        key = 'attr' + str(position)
        per_column.append([row[key] for row in markov_data])
    # values were gathered column by column, so flip to row-major
    return np.array(per_column, dtype=float).transpose()
# not currently used
def rebin_data(time_data, num_time_per_bin):
    """
    Convert an n x l matrix into an n x ceil(l / m) matrix where the
    values of every m consecutive columns are reduced (averaged):
    1 2 3 4      1.5 3.5
    5 6 7 8  ->  5.5 7.5
    9 8 7 6      8.5 6.5
    5 4 3 2      4.5 2.5
    if m = 2, the 4 x 4 matrix is transformed to a 4 x 2 matrix.
    This process effectively resamples the data at a time span m
    units longer than the input data.
    For cases when there is a remainder (remainder(5/3) = 2), the remaining
    two columns are binned together as the last time period, while the
    first three are binned together for the first period.
    Input:
    @param time_data n x l ndarray: measurements of an attribute at
    different time intervals
    @param num_time_per_bin int: number of columns (m) to average into a
    new column
    Output:
    n x ceil(l / m) ndarray of resampled time series
    """
    num_cols = time_data.shape[1]
    # ceiling division with integers only, so the count stays an int on
    # every Python version
    n_max = -(-num_cols // num_time_per_bin)

    return np.array(
        [time_data[:, num_time_per_bin * i:num_time_per_bin * (i + 1)].mean(axis=1)
         for i in range(n_max)]).T
def get_prob_dist(transition_matrix, lag_indices, unit_indices):
    """
    Look up one entry of the transition matrices per unit.

    For every position i, the entry is read from transition_matrix at index
    (lag_indices[i], unit_indices[i]).

    Input:
        @param transition_matrix ndarray[k,k,k]: indexed first by lag class,
          then by unit class
        @param lag_indices ndarray: lag class of each unit
        @param unit_indices ndarray: class of each unit
    Output:
        Array of probability distributions, one per entry of lag_indices
    """
    selected = []
    for pos, lag_class in enumerate(lag_indices):
        selected.append(transition_matrix[(lag_class, unit_indices[pos])])
    return np.array(selected)
def get_prob_stats(prob_dist, unit_indices):
    """
    Summarise each unit's probability distribution relative to its class.

    Inputs:
        @param prob_dist 2-D ndarray: one probability distribution per row
        @param unit_indices sequence of int: class index of each row
    Outputs:
        @param trend_up ndarray(float): sum of the probabilities above the
          unit index of that row (upward movement)
        @param trend_down ndarray(float): sum of the probabilities below the
          unit index of that row (downward movement)
        @param trend ndarray(float): (trend_up - trend_down) divided by the
          probability at the unit index; NaN when that probability is not
          greater than zero
        @param volatility ndarray(float): standard deviation of each row
    """
    count = len(unit_indices)
    trend_up = np.empty(count, dtype=float)
    trend_down = np.empty(count, dtype=float)
    trend = np.empty(count, dtype=float)

    for row in range(count):
        cls = unit_indices[row]
        trend_up[row] = prob_dist[row, (cls + 1):].sum()
        trend_down[row] = prob_dist[row, :cls].sum()
        stay_prob = prob_dist[row, cls]
        if stay_prob > 0.0:
            trend[row] = (trend_up[row] - trend_down[row]) / stay_prob
        else:
            # undefined ratio; stored as NaN in the float array
            trend[row] = np.nan

    # spread of each row's distribution
    volatility = prob_dist.std(axis=1)

    return trend_up, trend_down, trend, volatility

View File

@@ -0,0 +1,5 @@
joblib==0.8.3
numpy==1.6.1
scipy==0.14.0
pysal==1.11.2
scikit-learn==0.14.1

View File

@@ -0,0 +1,49 @@
"""
CartoDB Spatial Analysis Python Library
See:
https://github.com/CartoDB/crankshaft
"""
from setuptools import setup, find_packages
# Packaging entry point: declares the metadata, packages and dependencies
# of the `crankshaft` distribution through setuptools.
setup(
name='crankshaft',
version='0.5.0',
description='CartoDB Spatial Analysis Python Library',
url='https://github.com/CartoDB/crankshaft',
author='Data Services Team - CartoDB',
author_email='dataservices@cartodb.com',
license='MIT',
classifiers=[
'Development Status :: 3 - Alpha',
# NOTE(review): "comunity" is misspelled in the next entry; confirm
# whether this classifier string is accepted by the package index.
'Intended Audience :: Mapping comunity',
'Topic :: Maps :: Mapping Tools',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 2.7',
],
keywords='maps mapping tools spatial analysis geostatistics',
# Every discovered package is shipped except 'contrib', 'docs' and 'tests'.
packages=find_packages(exclude=['contrib', 'docs', 'tests']),
# Optional dependency groups, selected as crankshaft[dev] / crankshaft[test].
# NOTE(review): 'unittest' is a standard-library module; confirm that it
# is meant to be listed as an installable requirement.
extras_require={
'dev': ['unittest'],
'test': ['unittest', 'nose', 'mock'],
},
# The choice of component versions is dictated by what's
# provisioned in the production servers.
# IMPORTANT NOTE: please don't change this line. Instead issue a ticket to systems for evaluation.
install_requires=['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.11.2', 'scikit-learn==0.14.1'],
# NOTE(review): separate from install_requires above, and names 'sklearn'
# where install_requires names 'scikit-learn'; confirm it is still needed.
requires=['pysal', 'numpy', 'sklearn'],
test_suite='test'
)

View File

@@ -0,0 +1 @@
[[0.004793783909323601, 0.17999999999999999, 0.49808756424021061], [-1.0701189472090842, 0.079000000000000001, 0.14228288580832316], [-0.67867750971877305, 0.42099999999999999, 0.24867110969448558], [-0.67407386707620487, 0.246, 0.25013217644612995], [-0.79495689068870035, 0.33200000000000002, 0.21331928959090596], [-0.49279481022182703, 0.058999999999999997, 0.31107878905057329], [-0.38075627530057132, 0.28399999999999997, 0.35169205342069643], [-0.86710921611314895, 0.23699999999999999, 0.19294108571294855], [-0.78618647240956485, 0.050000000000000003, 0.2158791250244505], [-0.76108527223116984, 0.064000000000000001, 0.22330306830813684], [-0.13340753531942209, 0.247, 0.44693554317763651], [-0.57584545722033043, 0.48999999999999999, 0.28235982246156488], [-0.78882694661192831, 0.433, 0.2151065788731219], [-0.38769767950046219, 0.375, 0.34911988661484239], [-0.56057819488052207, 0.41399999999999998, 0.28754255985169652], [-0.41354017495644935, 0.45500000000000002, 0.339605447117173], [-0.23993577722243081, 0.49099999999999999, 0.40519002230969337], [-0.1389080156677496, 0.40400000000000003, 0.44476141839645233], [-0.25485737510500855, 0.376, 0.39941662953554224], [-0.71218610582902353, 0.17399999999999999, 0.23817476979886087], [-0.54533105995872144, 0.13700000000000001, 0.2927629228714812], [-0.39547917847510977, 0.033000000000000002, 0.34624464252424236], [-0.43052658996257548, 0.35399999999999998, 0.33340631435564982], [-0.37296719193774736, 0.40300000000000002, 0.35458643102865428], [-0.66482612169465694, 0.31900000000000001, 0.25308085650392698], [-0.13772133540823422, 0.34699999999999998, 0.44523032843016275], [-0.6765304487868502, 0.20999999999999999, 0.24935196033890672], [-0.64518763494323472, 0.32200000000000001, 0.25940279912025543], [-0.5078622084312413, 0.41099999999999998, 0.30577498972600159], [-0.12652006733772059, 0.42899999999999999, 0.44966013262301163], [-0.32691133022814595, 0.498, 0.37186747562269029], [0.25533848511500978, 
0.42399999999999999, 0.39923083899077472], [2.7045138116476508, 0.0050000000000000001, 0.0034202212972238577], [-0.1551614486076057, 0.44400000000000001, 0.43834701985429037], [1.9524487722567723, 0.012999999999999999, 0.025442473674991528], [-1.2055816465306763, 0.017000000000000001, 0.11398941970467646], [3.478472976017831, 0.002, 0.00025213964072468009], [-1.4621715757903719, 0.002, 0.071847099325659136], [-0.84010307600180256, 0.085000000000000006, 0.20042529779230778], [5.7097646237318243, 0.0030000000000000001, 5.6566262784940591e-09], [1.5082367956567375, 0.065000000000000002, 0.065746966514827365], [-0.58337270103430816, 0.44, 0.27982121546450034], [-0.083271860457022437, 0.45100000000000001, 0.46681768733385554], [-0.46872337815000953, 0.34599999999999997, 0.31963368715684204], [0.18490279849545319, 0.23799999999999999, 0.42665263797981101], [3.470424529947997, 0.012, 0.00025981817437825683], [-0.99942612137154796, 0.032000000000000001, 0.15879415560388499], [-1.3650387953594485, 0.034000000000000002, 0.08612042845912049], [1.8617160516432014, 0.081000000000000003, 0.03132156240215267], [1.1321188945775384, 0.11600000000000001, 0.12879222611766061], [0.064116686050580601, 0.27300000000000002, 0.4744386578180424], [-0.42032194540259099, 0.29999999999999999, 0.33712514016213468], [-0.79581215423980922, 0.123, 0.21307061309098785], [-0.42792753720906046, 0.45600000000000002, 0.33435193892883741], [-1.0629378527428395, 0.051999999999999998, 0.14390506780140866], [-0.54164761752225477, 0.33700000000000002, 0.29403064095211839], [1.0934778886820793, 0.13700000000000001, 0.13709201601893539], [-0.094068785378413719, 0.38200000000000001, 0.46252725802998929], [0.13482026574801856, 0.36799999999999999, 0.44637699118865737], [-0.13976995315653129, 0.34699999999999998, 0.44442087706276601], [-0.051047663924746682, 0.32000000000000001, 0.47964376985626245], [-0.21468297736730158, 0.41699999999999998, 0.41500724761906527], [-0.20873154637330626, 0.38800000000000001, 
0.41732890604390893], [-0.32427876152583485, 0.49199999999999999, 0.37286349875557478], [-0.65254842943280977, 0.374, 0.25702372075306734], [-0.48611858196118796, 0.23300000000000001, 0.31344154643990074], [-0.14482354344529477, 0.32600000000000001, 0.44242509660469886], [-0.51052030974200002, 0.439, 0.30484349480873729], [0.56814382285283538, 0.14999999999999999, 0.28496865660103166], [0.58680919931668207, 0.161, 0.27866592887231878], [0.013390357044409013, 0.25800000000000001, 0.49465818005865647], [-0.19050728887961568, 0.41399999999999998, 0.4244558160399462], [-0.60531777422216049, 0.35199999999999998, 0.2724839368239631], [1.0899331115425805, 0.127, 0.13787130480311838], [0.17015055382651084, 0.36899999999999999, 0.43244586845546418], [-0.21738337124409801, 0.40600000000000003, 0.41395479459421991], [1.0329303331079593, 0.079000000000000001, 0.15081825117169467], [1.0218317101096221, 0.104, 0.15343027913308094]]

View File

@@ -0,0 +1 @@
[{"xs": [9.917239463463458, 9.042767302696836, 10.798929825304187, 8.763751051762995, 11.383882954810852, 11.018206993460897, 8.939526075734316, 9.636159342565252, 10.136336896960058, 11.480610059427342, 12.115011910725082, 9.173267848893428, 10.239300931201738, 8.00012512174072, 8.979962292282131, 9.318376124429575, 10.82259513754284, 10.391747171927115, 10.04904588886165, 9.96007160443463, -0.78825626804569, -0.3511819898577426, -1.2796410003764271, -0.3977049391203402, 2.4792311265774667, 1.3670311632092624, 1.2963504112955613, 2.0404844103073025, -1.6439708506073223, 0.39122885445645805, 1.026031821452462, -0.04044477160482201, -0.7442346929085072, -0.34687120826243034, -0.23420359971379054, -0.5919629143336708, -0.202903054395391, -0.1893399644841902, 1.9331834251176807, -0.12321054392851609], "ys": [8.735627063679981, 9.857615954045011, 10.81439096759407, 10.586727233537191, 9.232919976568622, 11.54281262696508, 8.392787912674466, 9.355119689665944, 9.22380703532752, 10.542142541823122, 10.111980619367035, 10.760836265570738, 8.819773453269804, 10.25325722424816, 9.802077905695608, 8.955420161552611, 9.833801181904477, 10.491684241001613, 12.076108669877556, 11.74289693140474, -0.5685725015474191, -0.5715728344759778, -0.20180907868635137, 0.38431336480089595, -0.3402202083684184, -2.4652736827783586, 0.08295159401756182, 0.8503818775816505, 0.6488691600321166, 0.5794762568230527, -0.6770063922144103, -0.6557616416449478, -1.2834289177624947, 0.1096318195532717, -0.38986922166834853, -1.6224497706950238, 0.09429787743230483, 0.4005097316394031, -0.508002811195673, -1.2473463371366507], "ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]}]

View File

@@ -0,0 +1 @@
[[0.11111111111111112, 0.10000000000000001, 0.0, 0.35213633723318016, 0], [0.03125, 0.030303030303030304, 0.0, 0.3850273981640871, 1], [0.03125, 0.030303030303030304, 0.0, 0.3850273981640871, 2], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 3], [0.0, 0.065217391304347824, 0.065217391304347824, 0.33605067580764519, 4], [-0.054054054054054057, 0.0, 0.05128205128205128, 0.37488547451276033, 5], [0.1875, 0.23999999999999999, 0.12, 0.23731835158706122, 6], [0.034482758620689655, 0.0625, 0.03125, 0.35388469167230169, 7], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 8], [0.19047619047619049, 0.16, 0.0, 0.32594478059941379, 9], [-0.23529411764705882, 0.0, 0.19047619047619047, 0.31356338348865387, 10], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 11], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 12], [0.027777777777777783, 0.11111111111111112, 0.088888888888888892, 0.30339641183779581, 13], [0.03125, 0.030303030303030304, 0.0, 0.3850273981640871, 14], [0.052631578947368425, 0.090909090909090912, 0.045454545454545456, 0.33352611505171165, 15], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 16], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 17], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 18], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 19], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 20], [0.078947368421052641, 0.073170731707317083, 0.0, 0.36451788667842738, 21], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 22], [-0.16666666666666663, 0.18181818181818182, 0.27272727272727271, 0.20246415864836445, 23], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 24], [0.1875, 0.23999999999999999, 0.12, 
0.23731835158706122, 25], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 26], [-0.043478260869565216, 0.0, 0.041666666666666664, 0.37950991789118999, 27], [0.22222222222222221, 0.18181818181818182, 0.0, 0.31701083225750354, 28], [-0.054054054054054057, 0.0, 0.05128205128205128, 0.37488547451276033, 29], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 30], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 31], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 32], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 33], [0.034482758620689655, 0.0625, 0.03125, 0.35388469167230169, 34], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 35], [-0.054054054054054057, 0.0, 0.05128205128205128, 0.37488547451276033, 36], [0.11111111111111112, 0.10000000000000001, 0.0, 0.35213633723318016, 37], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 38], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 39], [0.034482758620689655, 0.0625, 0.03125, 0.35388469167230169, 40], [0.11111111111111112, 0.10000000000000001, 0.0, 0.35213633723318016, 41], [0.052631578947368425, 0.090909090909090912, 0.045454545454545456, 0.33352611505171165, 42], [0.0, 0.0, 0.0, 0.40000000000000002, 43], [0.0, 0.065217391304347824, 0.065217391304347824, 0.33605067580764519, 44], [0.078947368421052641, 0.073170731707317083, 0.0, 0.36451788667842738, 45], [0.052631578947368425, 0.090909090909090912, 0.045454545454545456, 0.33352611505171165, 46], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 47]]

View File

@@ -0,0 +1,52 @@
[[0.9319096128346788, "HH"],
[-1.135787401862846, "HL"],
[0.11732030672508517, "LL"],
[0.6152779669180425, "LL"],
[-0.14657336660125297, "LH"],
[0.6967858120189607, "LL"],
[0.07949310115714454, "HH"],
[0.4703198759258987, "HH"],
[0.4421125200498064, "HH"],
[0.5724288737143592, "LL"],
[0.8970743435692062, "LL"],
[0.18327334401918674, "LL"],
[-0.01466729201304962, "HL"],
[0.3481559372544409, "LL"],
[0.06547094736902978, "LL"],
[0.15482141569329988, "HH"],
[0.4373841193538136, "HH"],
[0.15971286468915544, "LL"],
[1.0543588860308968, "HH"],
[1.7372866900020818, "HH"],
[1.091998586053999, "LL"],
[0.1171572584252222, "HH"],
[0.08438455015300014, "LL"],
[0.06547094736902978, "LL"],
[0.15482141569329985, "HH"],
[1.1627044812890683, "HH"],
[0.06547094736902978, "LL"],
[0.795275137550483, "HH"],
[0.18562939195219, "LL"],
[0.3010757406693439, "LL"],
[2.8205795942839376, "HH"],
[0.11259190602909264, "LL"],
[-0.07116352791516614, "HL"],
[-0.09945240794119009, "LH"],
[0.18562939195219, "LL"],
[0.1832733440191868, "LL"],
[-0.39054253768447705, "HL"],
[-0.1672071289487642, "HL"],
[0.3337669247916343, "HH"],
[0.2584386102554792, "HH"],
[-0.19733845476322634, "HL"],
[-0.9379282899805409, "LH"],
[-0.028770969951095866, "LH"],
[0.051367269430983485, "LL"],
[-0.2172548045913472, "LH"],
[0.05136726943098351, "LL"],
[0.04191046803899837, "LL"],
[0.7482357030403517, "HH"],
[-0.014585767863118111, "LH"],
[0.5410013139159929, "HH"],
[1.0223932668429925, "LL"],
[1.4179402898927476, "LL"]]

View File

@@ -0,0 +1,54 @@
[
{"neighbors": [48, 26, 20, 9, 31], "id": 1, "value": 0.5},
{"neighbors": [30, 16, 46, 3, 4], "id": 2, "value": 0.7},
{"neighbors": [46, 30, 2, 12, 16], "id": 3, "value": 0.2},
{"neighbors": [18, 30, 23, 2, 52], "id": 4, "value": 0.1},
{"neighbors": [47, 40, 45, 37, 28], "id": 5, "value": 0.3},
{"neighbors": [10, 21, 41, 14, 37], "id": 6, "value": 0.05},
{"neighbors": [8, 17, 43, 25, 12], "id": 7, "value": 0.4},
{"neighbors": [17, 25, 43, 22, 7], "id": 8, "value": 0.7},
{"neighbors": [39, 34, 1, 26, 48], "id": 9, "value": 0.5},
{"neighbors": [6, 37, 5, 45, 49], "id": 10, "value": 0.04},
{"neighbors": [51, 41, 29, 21, 14], "id": 11, "value": 0.08},
{"neighbors": [44, 46, 43, 50, 3], "id": 12, "value": 0.2},
{"neighbors": [45, 23, 14, 28, 18], "id": 13, "value": 0.4},
{"neighbors": [41, 29, 13, 23, 6], "id": 14, "value": 0.2},
{"neighbors": [36, 27, 32, 33, 24], "id": 15, "value": 0.3},
{"neighbors": [19, 2, 46, 44, 28], "id": 16, "value": 0.4},
{"neighbors": [8, 25, 43, 7, 22], "id": 17, "value": 0.6},
{"neighbors": [23, 4, 29, 14, 13], "id": 18, "value": 0.3},
{"neighbors": [42, 16, 28, 26, 40], "id": 19, "value": 0.7},
{"neighbors": [1, 48, 31, 26, 42], "id": 20, "value": 0.8},
{"neighbors": [41, 6, 11, 14, 10], "id": 21, "value": 0.1},
{"neighbors": [25, 50, 43, 31, 44], "id": 22, "value": 0.4},
{"neighbors": [18, 13, 14, 4, 2], "id": 23, "value": 0.1},
{"neighbors": [33, 49, 34, 47, 27], "id": 24, "value": 0.3},
{"neighbors": [43, 8, 22, 17, 50], "id": 25, "value": 0.4},
{"neighbors": [1, 42, 20, 31, 48], "id": 26, "value": 0.6},
{"neighbors": [32, 15, 36, 33, 24], "id": 27, "value": 0.3},
{"neighbors": [40, 45, 19, 5, 13], "id": 28, "value": 0.8},
{"neighbors": [11, 51, 41, 14, 18], "id": 29, "value": 0.3},
{"neighbors": [2, 3, 4, 46, 18], "id": 30, "value": 0.1},
{"neighbors": [20, 26, 1, 50, 48], "id": 31, "value": 0.9},
{"neighbors": [27, 36, 15, 49, 24], "id": 32, "value": 0.3},
{"neighbors": [24, 27, 49, 34, 32], "id": 33, "value": 0.4},
{"neighbors": [47, 9, 39, 40, 24], "id": 34, "value": 0.3},
{"neighbors": [38, 51, 11, 21, 41], "id": 35, "value": 0.3},
{"neighbors": [15, 32, 27, 49, 33], "id": 36, "value": 0.2},
{"neighbors": [49, 10, 5, 47, 24], "id": 37, "value": 0.5},
{"neighbors": [35, 21, 51, 11, 41], "id": 38, "value": 0.4},
{"neighbors": [9, 34, 48, 1, 47], "id": 39, "value": 0.6},
{"neighbors": [28, 47, 5, 9, 34], "id": 40, "value": 0.5},
{"neighbors": [11, 14, 29, 21, 6], "id": 41, "value": 0.4},
{"neighbors": [26, 19, 1, 9, 31], "id": 42, "value": 0.2},
{"neighbors": [25, 12, 8, 22, 44], "id": 43, "value": 0.3},
{"neighbors": [12, 50, 46, 16, 43], "id": 44, "value": 0.2},
{"neighbors": [28, 13, 5, 40, 19], "id": 45, "value": 0.3},
{"neighbors": [3, 12, 44, 2, 16], "id": 46, "value": 0.2},
{"neighbors": [34, 40, 5, 49, 24], "id": 47, "value": 0.3},
{"neighbors": [1, 20, 26, 9, 39], "id": 48, "value": 0.5},
{"neighbors": [24, 37, 47, 5, 33], "id": 49, "value": 0.2},
{"neighbors": [44, 22, 31, 42, 26], "id": 50, "value": 0.6},
{"neighbors": [11, 29, 41, 14, 21], "id": 51, "value": 0.01},
{"neighbors": [4, 18, 29, 51, 23], "id": 52, "value": 0.01}
]

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,13 @@
import unittest
from mock_plpy import MockPlPy
# Single shared mock instance; test modules import it from here as `plpy`.
plpy = MockPlPy()
import sys
# Register the mock under the module name 'plpy' so that any later
# `import plpy` resolves to this object.
sys.modules['plpy'] = plpy
import os
def fixture_file(name):
    """Return the path of `name` inside the 'fixtures' directory that sits
    next to this module (symlinks in the module path are resolved)."""
    module_path = os.path.realpath(__file__)
    fixtures_dir = os.path.join(os.path.dirname(module_path), 'fixtures')
    return os.path.join(fixtures_dir, name)

View File

@@ -0,0 +1,54 @@
import re
class MockCursor:
    """In-memory cursor that hands out a fixed sequence of rows in batches."""

    def __init__(self, data):
        # offset of the next row to serve
        self.cursor_pos = 0
        # complete sequence of rows backing the cursor
        self.data = data

    def fetch(self, batch_size):
        """Return the next `batch_size` rows and advance the position.

        The position moves forward by `batch_size` even when fewer rows
        (or none) remain, in which case the returned slice is shorter.
        """
        start = self.cursor_pos
        stop = start + batch_size
        rows = self.data[start:stop]
        self.cursor_pos = stop
        return rows
class MockPlPy:
    """Test double for the `plpy` module.

    Records the messages passed to notice/debug/info and answers execute()
    and cursor() from results registered with _define_result().
    """

    def __init__(self):
        self._reset()

    def _reset(self):
        """Forget every recorded message and every registered query result."""
        self.infos = []
        self.notices = []
        self.debugs = []
        self.logs = []
        self.warnings = []
        self.errors = []
        self.fatals = []
        self.executes = []
        self.results = []
        self.prepares = []

    def _define_result(self, query, result):
        """Register `result` as the answer to queries matching `query`.

        `query` is a regular expression, compiled case-insensitively and in
        multi-line mode.
        """
        pattern = re.compile(query, re.IGNORECASE | re.MULTILINE)
        self.results.append([pattern, result])

    def notice(self, msg):
        """Record a notice-level message."""
        self.notices.append(msg)

    def debug(self, msg):
        """Record a debug-level message."""
        # stored in `debugs`, not `notices`, so the levels stay separate
        self.debugs.append(msg)

    def info(self, msg):
        """Record an info-level message."""
        self.infos.append(msg)

    def cursor(self, query):
        """Return a MockCursor over the rows execute() gives for `query`."""
        data = self.execute(query)
        return MockCursor(data)

    # TODO: additional arguments
    def execute(self, query):
        """Return the result of the first registered pattern that matches
        the start of `query`, or an empty list when none does."""
        for pattern, result in self.results:
            if pattern.match(query):
                return result
        return []

View File

@@ -0,0 +1,78 @@
import unittest
import numpy as np
from helper import fixture_file
from crankshaft.clustering import Getis
import crankshaft.pysal_utils as pu
from crankshaft import random_seeds
import json
from crankshaft.analysis_data_provider import AnalysisDataProvider
# Fixture files produced as follows
#
# import pysal as ps
# import numpy as np
# import random
#
# # setup variables
# f = ps.open(ps.examples.get_path("stl_hom.dbf"))
# y = np.array(f.by_col['HR8893'])
# w_queen = ps.queen_from_shapefile(ps.examples.get_path("stl_hom.shp"))
#
# out_queen = [{"id": index + 1,
# "neighbors": [x+1 for x in w_queen.neighbors[index]],
# "value": val} for index, val in enumerate(y)]
#
# with open('neighbors_queen_getis.json', 'w') as f:
# f.write(str(out_queen))
#
# random.seed(1234)
# np.random.seed(1234)
# lgstar_queen = ps.esda.getisord.G_Local(y, w_queen, star=True,
# permutations=999)
#
# with open('getis_queen.json', 'w') as f:
# f.write(str(zip(lgstar_queen.z_sim,
# lgstar_queen.p_sim, lgstar_queen.p_z_sim)))
class FakeDataProvider(AnalysisDataProvider):
    """Data provider stub that hands back the data it was constructed with."""

    def __init__(self, mock_data):
        # canned rows returned by every lookup
        self.mock_result = mock_data

    def get_getis(self, w_type, param):
        """Return the canned rows; `w_type` and `param` are ignored."""
        return self.mock_result
class GetisTest(unittest.TestCase):
    """Testing class for Getis-Ord's G* function

    This test replicates the work done in PySAL documentation:
    https://pysal.readthedocs.io/en/v1.11.0/users/tutorials/autocorrelation.html#local-g-and-g
    """

    def setUp(self):
        # load raw data for analysis; `with` closes the fixture file
        with open(fixture_file('neighbors_getis.json')) as fixture:
            self.neighbors_data = json.load(fixture)

        # load pre-computed/known values
        with open(fixture_file('getis.json')) as fixture:
            self.getis_data = json.load(fixture)

    def test_getis_ord(self):
        """Test Getis-Ord's G*"""
        data = [{'id': d['id'],
                 'attr1': d['value'],
                 'neighbors': d['neighbors']} for d in self.neighbors_data]

        random_seeds.set_random_seeds(1234)
        getis = Getis(FakeDataProvider(data))

        result = getis.getis_ord('subquery', 'value',
                                 'queen', None, 999, 'the_geom',
                                 'cartodb_id')
        result = [(row[0], row[1]) for row in result]
        expected = np.array(self.getis_data)[:, 0:2]
        # only the first value of each row (z-score) is compared; the second
        # (p-value) is unpacked but not asserted on
        for ([res_z, res_p], [exp_z, exp_p]) in zip(result, expected):
            self.assertAlmostEqual(res_z, exp_z, delta=1e-2)

View File

@@ -0,0 +1,56 @@
import unittest
import numpy as np
# from mock_plpy import MockPlPy
# plpy = MockPlPy()
#
# import sys
# sys.modules['plpy'] = plpy
from helper import fixture_file
from crankshaft.clustering import Kmeans
from crankshaft.analysis_data_provider import AnalysisDataProvider
import crankshaft.clustering as cc
from crankshaft import random_seeds
import json
from collections import OrderedDict
class FakeDataProvider(AnalysisDataProvider):
    """Data provider stub that hands back the data it was constructed with."""

    def __init__(self, mocked_result):
        # canned rows returned by every lookup
        self.mocked_result = mocked_result

    def get_spatial_kmeans(self, query):
        """Return the canned rows; `query` is ignored."""
        return self.mocked_result

    def get_nonspatial_kmeans(self, query, standarize):
        """Return the canned rows; `query` and `standarize` are ignored."""
        return self.mocked_result
class KMeansTest(unittest.TestCase):
    """Testing class for k-means spatial"""

    def setUp(self):
        # `with` closes the fixture file once it has been parsed
        with open(fixture_file('kmeans.json')) as fixture:
            self.cluster_data = json.load(fixture)
        self.params = {"subquery": "select * from table",
                       "no_clusters": "10"}

    def test_kmeans(self):
        """Test that spatial k-means splits the fixture into two clusters
        of 20 points each"""
        data = [{'xs': d['xs'],
                 'ys': d['ys'],
                 'ids': d['ids']} for d in self.cluster_data]

        random_seeds.set_random_seeds(1234)
        kmeans = Kmeans(FakeDataProvider(data))
        clusters = kmeans.spatial('subquery', 2)
        # second element of each returned row is the cluster label
        labels = [a[1] for a in clusters]
        c1 = [a for a in clusters if a[1] == 0]
        c2 = [a for a in clusters if a[1] == 1]

        self.assertEqual(len(np.unique(labels)), 2)
        self.assertEqual(len(c1), 20)
        self.assertEqual(len(c2), 20)

View File

@@ -0,0 +1,112 @@
import unittest
import numpy as np
from helper import fixture_file
from crankshaft.clustering import Moran
from crankshaft.analysis_data_provider import AnalysisDataProvider
import crankshaft.pysal_utils as pu
from crankshaft import random_seeds
import json
from collections import OrderedDict
class FakeDataProvider(AnalysisDataProvider):
    """Data provider stub that hands back the data it was constructed with."""

    def __init__(self, mock_data):
        # canned rows returned by every lookup
        self.mock_result = mock_data

    def get_moran(self, w_type, params):
        """Return the canned rows; `w_type` and `params` are ignored."""
        return self.mock_result
class MoranTest(unittest.TestCase):
    """Testing class for Moran's I functions"""

    def setUp(self):
        self.params = {"id_col": "cartodb_id",
                       "attr1": "andy",
                       "attr2": "jay_z",
                       "subquery": "SELECT * FROM a_list",
                       "geom_col": "the_geom",
                       "num_ngbrs": 321}
        self.params_markov = {"id_col": "cartodb_id",
                              "time_cols": ["_2013_dec", "_2014_jan",
                                            "_2014_feb"],
                              "subquery": "SELECT * FROM a_list",
                              "geom_col": "the_geom",
                              "num_ngbrs": 321}
        # `with` closes each fixture file once it has been parsed
        with open(fixture_file('neighbors.json')) as fixture:
            self.neighbors_data = json.load(fixture)
        with open(fixture_file('moran.json')) as fixture:
            self.moran_data = json.load(fixture)

    def test_map_quads(self):
        """Test map_quads"""
        from crankshaft.clustering import map_quads
        self.assertEqual(map_quads(1), 'HH')
        self.assertEqual(map_quads(2), 'LH')
        self.assertEqual(map_quads(3), 'LL')
        self.assertEqual(map_quads(4), 'HL')
        self.assertEqual(map_quads(33), None)
        self.assertEqual(map_quads('andy'), None)

    def test_quad_position(self):
        """Test quad_position"""
        from crankshaft.clustering import quad_position

        # builtin `int` instead of the `np.int` alias, which newer NumPy
        # releases no longer provide
        quads = np.array([1, 2, 3, 4], int)

        ans = np.array(['HH', 'LH', 'LL', 'HL'])
        test_ans = quad_position(quads)

        self.assertTrue((test_ans == ans).all())

    def test_local_stat(self):
        """Test Moran's I local"""
        data = [OrderedDict([('id', d['id']),
                             ('attr1', d['value']),
                             ('neighbors', d['neighbors'])])
                for d in self.neighbors_data]

        moran = Moran(FakeDataProvider(data))
        random_seeds.set_random_seeds(1234)
        result = moran.local_stat('subquery', 'value',
                                  'knn', 5, 99, 'the_geom', 'cartodb_id')
        result = [(row[0], row[1]) for row in result]
        zipped_values = zip(result, self.moran_data)

        for ([res_val, res_quad], [exp_val, exp_quad]) in zipped_values:
            self.assertAlmostEqual(res_val, exp_val)
            self.assertEqual(res_quad, exp_quad)

    def test_moran_local_rate(self):
        """Test Moran's I rate"""
        data = [{'id': d['id'],
                 'attr1': d['value'],
                 'attr2': 1,
                 'neighbors': d['neighbors']} for d in self.neighbors_data]

        random_seeds.set_random_seeds(1234)
        moran = Moran(FakeDataProvider(data))
        result = moran.local_rate_stat('subquery', 'numerator', 'denominator',
                                       'knn', 5, 99, 'the_geom', 'cartodb_id')
        result = [(row[0], row[1]) for row in result]

        zipped_values = zip(result, self.moran_data)

        # only the statistic is compared here; the quadrant labels are
        # unpacked but not asserted on
        for ([res_val, res_quad], [exp_val, exp_quad]) in zipped_values:
            self.assertAlmostEqual(res_val, exp_val)

    def test_moran(self):
        """Test Moran's I global"""
        data = [{'id': d['id'],
                 'attr1': d['value'],
                 'neighbors': d['neighbors']} for d in self.neighbors_data]
        random_seeds.set_random_seeds(1235)

        moran = Moran(FakeDataProvider(data))
        result = moran.global_stat('table', 'value',
                                   'knn', 5, 99, 'the_geom',
                                   'cartodb_id')

        result_moran = result[0][0]
        # the expected global value is taken as the mean of the local values
        expected_moran = np.array([row[0] for row in self.moran_data]).mean()
        self.assertAlmostEqual(expected_moran, result_moran, delta=10e-2)

View File

@@ -0,0 +1,160 @@
import unittest
import crankshaft.pysal_utils as pu
from crankshaft import random_seeds
from collections import OrderedDict
class PysalUtilsTest(unittest.TestCase):
    """Testing class for utility functions related to PySAL integrations"""

    def setUp(self):
        # ordered mappings keep the attribute columns in a fixed order
        self.params1 = OrderedDict([("id_col", "cartodb_id"),
                                    ("attr1", "andy"),
                                    ("attr2", "jay_z"),
                                    ("subquery", "SELECT * FROM a_list"),
                                    ("geom_col", "the_geom"),
                                    ("num_ngbrs", 321)])

        self.params2 = OrderedDict([("id_col", "cartodb_id"),
                                    ("numerator", "price"),
                                    ("denominator", "sq_meters"),
                                    ("subquery", "SELECT * FROM pecan"),
                                    ("geom_col", "the_geom"),
                                    ("num_ngbrs", 321)])

        self.params3 = OrderedDict([("id_col", "cartodb_id"),
                                    ("numerator", "sq_meters"),
                                    ("denominator", "price"),
                                    ("subquery", "SELECT * FROM pecan"),
                                    ("geom_col", "the_geom"),
                                    ("num_ngbrs", 321)])

        self.params_array = {"id_col": "cartodb_id",
                             "time_cols": ["_2013_dec", "_2014_jan",
                                           "_2014_feb"],
                             "subquery": "SELECT * FROM a_list",
                             "geom_col": "the_geom",
                             "num_ngbrs": 321}

    def test_query_attr_select(self):
        """Test query_attr_select"""
        expected1 = ('i."andy"::numeric As attr1, '
                     'i."jay_z"::numeric As attr2, ')

        expected2 = ('i."price"::numeric As attr1, '
                     'i."sq_meters"::numeric As attr2, ')

        expected3 = ('i."sq_meters"::numeric As attr1, '
                     'i."price"::numeric As attr2, ')

        expected_array = ('i."_2013_dec"::numeric As attr1, '
                          'i."_2014_jan"::numeric As attr2, '
                          'i."_2014_feb"::numeric As attr3, ')

        self.assertEqual(pu.query_attr_select(self.params1), expected1)
        self.assertEqual(pu.query_attr_select(self.params2), expected2)
        self.assertEqual(pu.query_attr_select(self.params3), expected3)
        self.assertEqual(pu.query_attr_select(self.params_array),
                         expected_array)

    def test_query_attr_where(self):
        """Test pu.query_attr_where"""
        expected1 = ('idx_replace."andy" IS NOT NULL AND '
                     'idx_replace."jay_z" IS NOT NULL')

        expected_array = ('idx_replace."_2013_dec" IS NOT NULL AND '
                          'idx_replace."_2014_jan" IS NOT NULL AND '
                          'idx_replace."_2014_feb" IS NOT NULL')

        self.assertEqual(pu.query_attr_where(self.params1), expected1)
        self.assertEqual(pu.query_attr_where(self.params_array),
                         expected_array)

    def test_knn(self):
        """Test knn neighbors constructor"""
        expected1 = ('SELECT i."cartodb_id" As id, '
                     'i."andy"::numeric As attr1, '
                     'i."jay_z"::numeric As attr2, '
                     '(SELECT ARRAY(SELECT j."cartodb_id" '
                     'FROM (SELECT * FROM a_list) As j '
                     'WHERE '
                     'i."cartodb_id" <> j."cartodb_id" AND '
                     'j."andy" IS NOT NULL AND '
                     'j."jay_z" IS NOT NULL '
                     'ORDER BY '
                     'j."the_geom" <-> i."the_geom" ASC '
                     'LIMIT 321)) As neighbors '
                     'FROM (SELECT * FROM a_list) As i '
                     'WHERE i."andy" IS NOT NULL AND '
                     'i."jay_z" IS NOT NULL '
                     'ORDER BY i."cartodb_id" ASC;')

        expected_array = ('SELECT i."cartodb_id" As id, '
                          'i."_2013_dec"::numeric As attr1, '
                          'i."_2014_jan"::numeric As attr2, '
                          'i."_2014_feb"::numeric As attr3, '
                          '(SELECT ARRAY(SELECT j."cartodb_id" '
                          'FROM (SELECT * FROM a_list) As j '
                          'WHERE i."cartodb_id" <> j."cartodb_id" AND '
                          'j."_2013_dec" IS NOT NULL AND '
                          'j."_2014_jan" IS NOT NULL AND '
                          'j."_2014_feb" IS NOT NULL '
                          'ORDER BY j."the_geom" <-> i."the_geom" ASC '
                          'LIMIT 321)) As neighbors '
                          'FROM (SELECT * FROM a_list) As i '
                          'WHERE i."_2013_dec" IS NOT NULL AND '
                          'i."_2014_jan" IS NOT NULL AND '
                          'i."_2014_feb" IS NOT NULL '
                          'ORDER BY i."cartodb_id" ASC;')

        self.assertEqual(pu.knn(self.params1), expected1)
        self.assertEqual(pu.knn(self.params_array), expected_array)

    def test_queen(self):
        """Test queen neighbors constructor"""
        expected1 = ('SELECT i."cartodb_id" As id, '
                     'i."andy"::numeric As attr1, '
                     'i."jay_z"::numeric As attr2, '
                     '(SELECT ARRAY(SELECT j."cartodb_id" '
                     'FROM (SELECT * FROM a_list) As j '
                     'WHERE '
                     'i."cartodb_id" <> j."cartodb_id" AND '
                     'ST_Touches(i."the_geom", '
                     'j."the_geom") AND '
                     'j."andy" IS NOT NULL AND '
                     'j."jay_z" IS NOT NULL)'
                     ') As neighbors '
                     'FROM (SELECT * FROM a_list) As i '
                     'WHERE i."andy" IS NOT NULL AND '
                     'i."jay_z" IS NOT NULL '
                     'ORDER BY i."cartodb_id" ASC;')

        self.assertEqual(pu.queen(self.params1), expected1)

    def test_construct_neighbor_query(self):
        """Test construct_neighbor_query"""
        # the 'knn' dispatch must produce exactly the raw knn query
        built = pu.construct_neighbor_query('knn', self.params1)
        self.assertEqual(built, pu.knn(self.params1))

    def test_get_attributes(self):
        """Test get_attributes"""
        # placeholder: real tests still need to be added
        self.assertTrue(True)

    def test_get_weight(self):
        """Test get_weight"""
        # placeholder: nothing is exercised yet
        self.assertTrue(True)

    def test_empty_zipped_array(self):
        """Test empty_zipped_array"""
        pair_of_nones = [(None, None)]
        quad_of_nones = [(None, None, None, None)]
        self.assertEqual(pu.empty_zipped_array(2), pair_of_nones)
        self.assertEqual(pu.empty_zipped_array(4), quad_of_nones)

View File

@@ -0,0 +1,64 @@
import unittest
import numpy as np
from helper import plpy, fixture_file
import crankshaft.segmentation as segmentation
import json
class SegmentationTest(unittest.TestCase):
    """Testing class for the segmentation functions"""

    def setUp(self):
        # start every test from a mock with no recorded state
        plpy._reset()

    def generate_random_data(self, n_samples, random_state, row_type=False):
        """Build a synthetic data set where target = x1 + x2*x2 + x3.

        With row_type=True returns (rows, y), each row being
        {'features': (x1, x2, x3)}; otherwise returns a one-element list
        holding a dict of whole columns keyed 'x1', 'x2', 'x3', 'target'
        and 'cartodb_id'.
        """
        x1 = random_state.uniform(size=n_samples)
        x2 = random_state.uniform(size=n_samples)
        x3 = random_state.randint(0, 4, size=n_samples)
        y = x1+x2*x2+x3
        cartodb_id = range(len(x1))

        if row_type:
            return [{'features': vals} for vals in zip(x1, x2, x3)], y
        else:
            return [dict(zip(['x1', 'x2', 'x3', 'target', 'cartodb_id'],
                             [x1, x2, x3, y, cartodb_id]))]

    def test_replace_nan_with_mean(self):
        # TODO: this test only builds its input and asserts nothing yet
        test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan])

    def test_create_and_predict_segment(self):
        """Train on synthetic data and check the predictions made for a
        second synthetic set"""
        n_samples = 1000

        random_state_train = np.random.RandomState(13)
        random_state_test = np.random.RandomState(134)
        training_data = self.generate_random_data(n_samples,
                                                  random_state_train)
        test_data, test_y = self.generate_random_data(n_samples,
                                                      random_state_test,
                                                      row_type=True)

        ids = [{'cartodb_ids': range(len(test_data))}]
        rows = [{'x1': 0, 'x2': 0, 'x3': 0, 'y': 0, 'cartodb_id': 0}]

        # raw strings: the backslashes belong to the regular expressions
        plpy._define_result(
            r'select \* from \(select \* from training\) a limit 1', rows)
        plpy._define_result(
            r'.*from \(select \* from training\) as a', training_data)
        plpy._define_result(
            r'select array_agg\(cartodb\_id order by cartodb\_id\) as cartodb_ids from \(.*\) a',
            ids)
        plpy._define_result(r'.*select \* from test.*', test_data)

        model_parameters = {'n_estimators': 1200,
                            'max_depth': 3,
                            'subsample': 0.5,
                            'learning_rate': 0.01,
                            'min_samples_leaf': 1}

        result = segmentation.create_and_predict_segment(
            'select * from training',
            'target',
            'select * from test',
            model_parameters)

        prediction = [r[1] for r in result]

        # root-mean-square error of the predictions against the known targets
        accuracy = np.sqrt(np.mean(
            np.square(np.array(prediction) - np.array(test_y))))

        self.assertEqual(len(result), len(test_data))
        self.assertTrue(result[0][2] < 0.01)
        self.assertTrue(accuracy < 0.5*np.mean(test_y))

View File

@@ -0,0 +1,349 @@
import unittest
import numpy as np
import unittest
from helper import fixture_file
from crankshaft.space_time_dynamics import Markov
import crankshaft.space_time_dynamics as std
from crankshaft import random_seeds
from crankshaft.analysis_data_provider import AnalysisDataProvider
import json
class FakeDataProvider(AnalysisDataProvider):
    """Data provider stub that hands back canned data instead of querying."""

    def __init__(self, data):
        # canned result returned by every get_markov call
        self.mock_result = data

    def get_markov(self, w_type, params):
        """Return the canned data; both arguments are ignored."""
        return self.mock_result
class SpaceTimeTests(unittest.TestCase):
"""Testing class for Markov Functions."""
def setUp(self):
    """Load the JSON fixtures and build the arrays shared by the tests."""
    self.params = {"id_col": "cartodb_id",
                   "time_cols": ['dec_2013', 'jan_2014', 'feb_2014'],
                   "subquery": "SELECT * FROM a_list",
                   "geom_col": "the_geom",
                   "num_ngbrs": 321}
    # context managers close the fixture files deterministically
    with open(fixture_file('neighbors_markov.json')) as fixture:
        self.neighbors_data = json.loads(fixture.read())
    with open(fixture_file('markov.json')) as fixture:
        self.markov_data = json.loads(fixture.read())

    # 10 x 10 array whose column j is filled with the value j
    self.time_data = np.array([i * np.ones(10, dtype=float)
                               for i in range(10)]).T

    # five 5 x 5 transition matrices, used by test_get_prob_dist
    self.transition_matrix = np.array([
        [[0.96341463, 0.0304878, 0.00609756, 0., 0.],
         [0.06040268, 0.83221477, 0.10738255, 0., 0.],
         [0., 0.14, 0.74, 0.12, 0.],
         [0., 0.03571429, 0.32142857, 0.57142857, 0.07142857],
         [0., 0., 0., 0.16666667, 0.83333333]],
        [[0.79831933, 0.16806723, 0.03361345, 0., 0.],
         [0.0754717, 0.88207547, 0.04245283, 0., 0.],
         [0.00537634, 0.06989247, 0.8655914, 0.05913978, 0.],
         [0., 0., 0.06372549, 0.90196078, 0.03431373],
         [0., 0., 0., 0.19444444, 0.80555556]],
        [[0.84693878, 0.15306122, 0., 0., 0.],
         [0.08133971, 0.78947368, 0.1291866, 0., 0.],
         [0.00518135, 0.0984456, 0.79274611, 0.0984456, 0.00518135],
         [0., 0., 0.09411765, 0.87058824, 0.03529412],
         [0., 0., 0., 0.10204082, 0.89795918]],
        [[0.8852459, 0.09836066, 0., 0.01639344, 0.],
         [0.03875969, 0.81395349, 0.13953488, 0., 0.00775194],
         [0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505],
         [0., 0.02339181, 0.12865497, 0.75438596, 0.09356725],
         [0., 0., 0., 0.09661836, 0.90338164]],
        [[0.33333333, 0.66666667, 0., 0., 0.],
         [0.0483871, 0.77419355, 0.16129032, 0.01612903, 0.],
         [0.01149425, 0.16091954, 0.74712644, 0.08045977, 0.],
         [0., 0.01036269, 0.06217617, 0.89637306, 0.03108808],
         [0., 0., 0., 0.02352941, 0.97647059]]]
    )
def test_spatial_markov(self):
    """Test Spatial Markov."""
    # fixture columns y1995 .. y2009 are exposed as attr1 .. attr15
    time_cols = ['y%d' % year for year in range(1995, 2010)]
    data = []
    for d in self.neighbors_data:
        row = {'id': d['id'], 'neighbors': d['neighbors']}
        for idx, col in enumerate(time_cols):
            row['attr%d' % (idx + 1)] = d[col]
        data.append(row)

    markov = Markov(FakeDataProvider(data))
    # fix the seeds before running the analysis
    random_seeds.set_random_seeds(1234)

    result = markov.spatial_trend('subquery', time_cols,
                                  5, 'knn', 5, 0, 'the_geom',
                                  'cartodb_id')

    self.assertTrue(result is not None)
    result = [(row[0], row[1], row[2], row[3], row[4]) for row in result]
    expected = self.markov_data
    # only the trend (first column) is compared against the fixture
    for res_row, exp_row in zip(result, expected):
        self.assertAlmostEqual(res_row[0], exp_row[0])
def test_get_time_data(self):
    """Test get_time_data"""
    # fixture columns y1995 .. y2009 are exposed as attr1 .. attr15
    time_cols = ['y%d' % year for year in range(1995, 2010)]
    data = [dict(('attr%d' % (idx + 1), d[col])
                 for idx, col in enumerate(time_cols))
            for d in self.neighbors_data]

    result = std.get_time_data(data, time_cols)

    # expected was prepared from PySAL example:
    #       f = ps.open(ps.examples.get_path("usjoin.csv"))
    #       pci = np.array([f.by_col[str(y)]
    #                      for y in range(1995, 2010)]).transpose()
    #       rpci = pci / (pci.mean(axis = 0))
    expected = np.array(
        [[0.87654416, 0.863147, 0.85637567, 0.84811668, 0.8446154,
          0.83271652, 0.83786314, 0.85012593, 0.85509656, 0.86416612,
          0.87119375, 0.86302631, 0.86148267, 0.86252252, 0.86746356],
         [0.9188951, 0.91757931, 0.92333258, 0.92517289, 0.92552388,
          0.90746978, 0.89830489, 0.89431991, 0.88924794, 0.89815176,
          0.91832091, 0.91706054, 0.90139505, 0.87897455, 0.86216858],
         [0.82591007, 0.82548596, 0.81989793, 0.81503235, 0.81731522,
          0.78964559, 0.80584442, 0.8084998, 0.82258551, 0.82668196,
          0.82373724, 0.81814804, 0.83675961, 0.83574199, 0.84647177],
         [1.09088176, 1.08537689, 1.08456418, 1.08415404, 1.09898841,
          1.14506948, 1.12151133, 1.11160697, 1.10888621, 1.11399806,
          1.12168029, 1.13164797, 1.12958508, 1.11371818, 1.09936775],
         [1.10731446, 1.11373944, 1.13283638, 1.14472559, 1.15910025,
          1.16898201, 1.17212488, 1.14752303, 1.11843284, 1.11024964,
          1.11943471, 1.11736468, 1.10863242, 1.09642516, 1.07762337],
         [1.42269757, 1.42118434, 1.44273502, 1.43577571, 1.44400684,
          1.44184737, 1.44782832, 1.41978227, 1.39092208, 1.4059372,
          1.40788646, 1.44052766, 1.45241216, 1.43306098, 1.4174431],
         [1.13073885, 1.13110513, 1.11074708, 1.13364636, 1.13088149,
          1.10888138, 1.11856629, 1.13062931, 1.11944984, 1.12446239,
          1.11671008, 1.10880034, 1.08401709, 1.06959206, 1.07875225],
         [1.04706124, 1.04516831, 1.04253372, 1.03239987, 1.02072545,
          0.99854316, 0.9880258, 0.99669587, 0.99327676, 1.01400905,
          1.03176742, 1.040511, 1.01749645, 0.9936394, 0.98279746],
         [0.98996986, 1.00143564, 0.99491, 1.00188408, 1.00455845,
          0.99127006, 0.97925917, 0.9683482, 0.95335147, 0.93694787,
          0.94308213, 0.92232874, 0.91284091, 0.89689833, 0.88928858],
         [0.87418391, 0.86416601, 0.84425695, 0.8404494, 0.83903044,
          0.8578708, 0.86036185, 0.86107306, 0.8500772, 0.86981998,
          0.86837929, 0.87204141, 0.86633032, 0.84946077, 0.83287146],
         [1.14196118, 1.14660262, 1.14892712, 1.14909594, 1.14436624,
          1.14450183, 1.12349752, 1.12596664, 1.12213996, 1.1119989,
          1.10257792, 1.10491258, 1.11059842, 1.10509795, 1.10020097],
         [0.97282463, 0.96700147, 0.96252588, 0.9653878, 0.96057687,
          0.95831051, 0.94480909, 0.94804195, 0.95430286, 0.94103989,
          0.92122519, 0.91010201, 0.89280392, 0.89298243, 0.89165385],
         [0.94325468, 0.96436902, 0.96455242, 0.95243009, 0.94117647,
          0.9480927, 0.93539182, 0.95388718, 0.94597005, 0.96918424,
          0.94781281, 0.93466815, 0.94281559, 0.96520315, 0.96715441],
         [0.97478408, 0.98169225, 0.98712809, 0.98474769, 0.98559897,
          0.98687073, 0.99237486, 0.98209969, 0.9877653, 0.97399471,
          0.96910087, 0.98416665, 0.98423613, 0.99823861, 0.99545704],
         [0.85570269, 0.85575915, 0.85986132, 0.85693406, 0.8538012,
          0.86191535, 0.84981451, 0.85472102, 0.84564835, 0.83998883,
          0.83478547, 0.82803648, 0.8198736, 0.82265395, 0.8399404],
         [0.87022047, 0.85996258, 0.85961813, 0.85689572, 0.83947136,
          0.82785597, 0.86008789, 0.86776298, 0.86720209, 0.8676334,
          0.89179317, 0.94202108, 0.9422231, 0.93902708, 0.94479184],
         [0.90134907, 0.90407738, 0.90403991, 0.90201769, 0.90399238,
          0.90906632, 0.92693339, 0.93695966, 0.94242697, 0.94338265,
          0.91981796, 0.91108804, 0.90543476, 0.91737138, 0.94793657],
         [1.1977611, 1.18222564, 1.18439158, 1.18267865, 1.19286723,
          1.20172869, 1.21328691, 1.22624778, 1.22397075, 1.23857042,
          1.24419893, 1.23929384, 1.23418676, 1.23626739, 1.26754398],
         [1.24919678, 1.25754773, 1.26991161, 1.28020651, 1.30625667,
          1.34790023, 1.34399863, 1.32575181, 1.30795492, 1.30544841,
          1.30303302, 1.32107766, 1.32936244, 1.33001241, 1.33288462],
         [1.06768004, 1.03799276, 1.03637303, 1.02768449, 1.03296093,
          1.05059016, 1.03405057, 1.02747623, 1.03162734, 0.9961416,
          0.97356208, 0.94241549, 0.92754547, 0.92549227, 0.92138102],
         [1.09475614, 1.11526796, 1.11654299, 1.13103948, 1.13143264,
          1.13889622, 1.12442212, 1.13367018, 1.13982256, 1.14029944,
          1.11979401, 1.10905389, 1.10577769, 1.11166825, 1.09985155],
         [0.76530058, 0.76612841, 0.76542451, 0.76722683, 0.76014284,
          0.74480073, 0.76098396, 0.76156903, 0.76651952, 0.76533288,
          0.78205934, 0.76842416, 0.77487118, 0.77768683, 0.78801192],
         [0.98391336, 0.98075816, 0.98295341, 0.97386015, 0.96913803,
          0.97370819, 0.96419154, 0.97209861, 0.97441313, 0.96356162,
          0.94745352, 0.93965462, 0.93069645, 0.94020973, 0.94358232],
         [0.83561828, 0.82298088, 0.81738502, 0.81748588, 0.80904801,
          0.80071489, 0.83358256, 0.83451613, 0.85175032, 0.85954307,
          0.86790024, 0.87170334, 0.87863799, 0.87497981, 0.87888675],
         [0.98845573, 1.02092428, 0.99665283, 0.99141823, 0.99386619,
          0.98733195, 0.99644997, 0.99669587, 1.02559097, 1.01116651,
          0.99988024, 0.97906749, 0.99323123, 1.00204939, 0.99602148],
         [1.14930913, 1.15241949, 1.14300962, 1.14265542, 1.13984683,
          1.08312397, 1.05192626, 1.04230892, 1.05577278, 1.08569751,
          1.12443486, 1.08891079, 1.08603695, 1.05997314, 1.02160943],
         [1.11368269, 1.1057147, 1.11893431, 1.13778669, 1.1432272,
          1.18257029, 1.16226243, 1.16009196, 1.14467789, 1.14820235,
          1.12386598, 1.12680236, 1.12357937, 1.1159258, 1.12570828],
         [1.30379431, 1.30752186, 1.31206366, 1.31532267, 1.30625667,
          1.31210239, 1.29989156, 1.29203193, 1.27183516, 1.26830786,
          1.2617743, 1.28656675, 1.29734097, 1.29390205, 1.29345446],
         [0.83953719, 0.82701448, 0.82006005, 0.81188876, 0.80294864,
          0.78772975, 0.82848011, 0.8259679, 0.82435705, 0.83108634,
          0.84373784, 0.83891093, 0.84349247, 0.85637272, 0.86539395],
         [1.23450087, 1.2426022, 1.23537935, 1.23581293, 1.24522626,
          1.2256767, 1.21126648, 1.19377804, 1.18355337, 1.19674434,
          1.21536573, 1.23653297, 1.27962009, 1.27968392, 1.25907738],
         [0.9769662, 0.97400719, 0.98035944, 0.97581531, 0.95543282,
          0.96480308, 0.94686376, 0.93679073, 0.92540049, 0.92988835,
          0.93442917, 0.92100464, 0.91475304, 0.90249622, 0.9021363],
         [0.84986886, 0.8986851, 0.84295997, 0.87280534, 0.85659368,
          0.88937573, 0.894401, 0.90448993, 0.95495898, 0.92698333,
          0.94745352, 0.92562488, 0.96635366, 1.02520312, 1.0394296],
         [1.01922808, 1.00258203, 1.00974428, 1.00303417, 0.99765073,
          1.00759019, 0.99192968, 0.99747298, 0.99550759, 0.97583768,
          0.9610168, 0.94779638, 0.93759089, 0.93353431, 0.94121705],
         [0.86367411, 0.85558932, 0.85544346, 0.85103025, 0.84336613,
          0.83434854, 0.85813595, 0.84667961, 0.84374558, 0.85951183,
          0.87194227, 0.89455097, 0.88283929, 0.90349491, 0.90600675],
         [1.00947534, 1.00411055, 1.00698819, 0.99513687, 0.99291086,
          1.00581626, 0.98850522, 0.99291168, 0.98983209, 0.97511924,
          0.96134615, 0.96382634, 0.95011401, 0.9434686, 0.94637765],
         [1.05712571, 1.05459419, 1.05753012, 1.04880786, 1.05103857,
          1.04800023, 1.03024941, 1.04200483, 1.0402554, 1.03296979,
          1.02191682, 1.02476275, 1.02347523, 1.02517684, 1.04359571],
         [1.07084189, 1.06669497, 1.07937623, 1.07387988, 1.0794043,
          1.0531801, 1.07452771, 1.09383478, 1.1052447, 1.10322136,
          1.09167939, 1.08772756, 1.08859544, 1.09177338, 1.1096083],
         [0.86719222, 0.86628896, 0.86675156, 0.86425632, 0.86511809,
          0.86287327, 0.85169796, 0.85411285, 0.84886336, 0.84517414,
          0.84843858, 0.84488343, 0.83374329, 0.82812044, 0.82878599],
         [0.88389211, 0.92288667, 0.90282398, 0.91229186, 0.92023286,
          0.92652175, 0.94278865, 0.93682452, 0.98655146, 0.992237,
          0.9798497, 0.93869677, 0.96947771, 1.00362626, 0.98102351],
         [0.97082064, 0.95320233, 0.94534081, 0.94215593, 0.93967,
          0.93092109, 0.92662519, 0.93412152, 0.93501274, 0.92879506,
          0.92110542, 0.91035556, 0.90430364, 0.89994694, 0.90073864],
         [0.95861858, 0.95774543, 0.98254811, 0.98919472, 0.98684824,
          0.98882205, 0.97662234, 0.95601578, 0.94905385, 0.94934888,
          0.97152609, 0.97163004, 0.9700702, 0.97158948, 0.95884908],
         [0.83980439, 0.84726737, 0.85747, 0.85467221, 0.8556751,
          0.84818516, 0.85265681, 0.84502402, 0.82645665, 0.81743586,
          0.83550406, 0.83338919, 0.83511679, 0.82136617, 0.80921874],
         [0.95118156, 0.9466212, 0.94688098, 0.9508583, 0.9512441,
          0.95440787, 0.96364363, 0.96804412, 0.97136214, 0.97583768,
          0.95571724, 0.96895368, 0.97001634, 0.97082733, 0.98782366],
         [1.08910044, 1.08248968, 1.08492895, 1.08656923, 1.09454249,
          1.10558188, 1.1214086, 1.12292577, 1.13021031, 1.13342735,
          1.14686068, 1.14502975, 1.14474747, 1.14084037, 1.16142926],
         [1.06336033, 1.07365823, 1.08691496, 1.09764846, 1.11669863,
          1.11856702, 1.09764283, 1.08815849, 1.08044313, 1.09278827,
          1.07003204, 1.08398066, 1.09831768, 1.09298232, 1.09176125],
         [0.79772065, 0.78829196, 0.78581151, 0.77615922, 0.77035744,
          0.77751194, 0.79902974, 0.81437881, 0.80788828, 0.79603865,
          0.78966436, 0.79949807, 0.80172182, 0.82168155, 0.85587911],
         [1.0052447, 1.00007696, 1.00475899, 1.00613942, 1.00639561,
          1.00162979, 0.99860739, 1.00814981, 1.00574316, 0.99030032,
          0.97682565, 0.97292596, 0.96519561, 0.96173403, 0.95890284],
         [0.95808419, 0.9382568, 0.9654441, 0.95561201, 0.96987289,
          0.96608031, 0.99727185, 1.00781194, 1.03484236, 1.05333619,
          1.0983263, 1.1704974, 1.17025154, 1.18730553, 1.14242645]])

    # isinstance / assertEqual report the offending values on failure
    self.assertTrue(isinstance(result, type(expected)))
    self.assertEqual(result.shape, expected.shape)
    self.assertTrue(np.allclose(result, expected))
def test_rebin_data(self):
    """Check rebin_data for a bin size that divides the columns and one that does not."""
    unit_column = np.ones(10, dtype=float)

    # bin size 2 (even case since 10 % 2 = 0):
    #   (0+1)/2, (2+3)/2, (4+5)/2, (6+7)/2, (8+9)/2
    #   = 0.5, 2.5, 4.5, 6.5, 8.5
    expected_even = np.array([(start + 0.5) * unit_column
                              for start in range(0, 10, 2)]).T
    rebinned_even = std.rebin_data(self.time_data, 2)
    self.assertTrue(np.array_equal(rebinned_even, expected_even))

    # bin size 3 (uneven since 10 % 3 = 1):
    #   (0+1+2)/3, (3+4+5)/3, (6+7+8)/3, (9)/1
    #   = 1, 4, 7, 9
    expected_odd = np.array([center * unit_column
                             for center in (1, 4, 7, 9)]).T
    rebinned_odd = std.rebin_data(self.time_data, 3)
    self.assertTrue(np.array_equal(rebinned_odd, expected_odd))
def test_get_prob_dist(self):
    """Check the rows get_prob_dist picks out of the transition matrices."""
    lags = np.array([1, 2, 3, 4])
    units = np.array([1, 3, 2, 4])

    # row k equals self.transition_matrix[lags[k]][units[k]]
    expected = np.array([
        [0.0754717, 0.88207547, 0.04245283, 0., 0.],
        [0., 0., 0.09411765, 0.87058824, 0.03529412],
        [0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505],
        [0., 0., 0., 0.02352941, 0.97647059]
    ])

    actual = std.get_prob_dist(self.transition_matrix, lags, units)

    self.assertTrue(np.array_equal(actual, expected))
def test_get_prob_stats(self):
    """Check the up, down, trend and volatility values from get_prob_stats."""
    probs = np.array([
        [0.0754717, 0.88207547, 0.04245283, 0., 0.],
        [0., 0., 0.09411765, 0.87058824, 0.03529412],
        [0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505],
        [0., 0., 0., 0.02352941, 0.97647059]
    ])
    unit_indices = np.array([1, 3, 2, 4])

    # expected values, listed in the order they appear in the result
    expected_by_position = (
        # up
        np.array([0.04245283, 0.03529412, 0.12376238, 0.]),
        # down
        np.array([0.0754717, 0.09411765, 0.0990099, 0.02352941]),
        # trend
        np.array([-0.03301887 / 0.88207547,
                  -0.05882353 / 0.87058824,
                  0.02475248 / 0.77722772,
                  -0.02352941 / 0.97647059]),
        # volatility
        np.array([0.34221495, 0.33705421,
                  0.29226542, 0.38834223]),
    )

    result = std.get_prob_stats(probs, unit_indices)

    for position, expected in enumerate(expected_by_position):
        self.assertTrue(np.allclose(result[position], expected))

View File

@@ -0,0 +1,6 @@
"""Import all modules"""
import crankshaft.random_seeds
import crankshaft.clustering
import crankshaft.space_time_dynamics
import crankshaft.segmentation
import analysis_data_provider

View File

@@ -0,0 +1,67 @@
"""class for fetching data"""
import plpy
import pysal_utils as pu
class AnalysisDataProvider:
    """
        Runs the SQL each analysis needs through plpy and returns the rows.

        Every method reports plpy.SPIError failures through plpy.error.
    """

    def get_getis(self, w_type, params):
        """fetch data for getis ord's g

        @param w_type: neighbor type passed to construct_neighbor_query
        @param params: values used to build the neighbors query
        Returns the query result, or a single row of four Nones when the
        query returns no rows.
        """
        try:
            query = pu.construct_neighbor_query(w_type, params)
            result = plpy.execute(query)
            # if there are no neighbors, exit
            if len(result) == 0:
                return pu.empty_zipped_array(4)
            else:
                return result
        except plpy.SPIError as err:
            plpy.error('Analysis failed: %s' % err)

    def get_markov(self, w_type, params):
        """fetch data for spatial markov

        @param w_type: neighbor type passed to construct_neighbor_query
        @param params: values used to build the neighbors query
        Returns the query result, or a single row of four Nones when the
        query returns no rows.
        """
        try:
            query = pu.construct_neighbor_query(w_type, params)
            data = plpy.execute(query)

            if len(data) == 0:
                return pu.empty_zipped_array(4)

            return data
        except plpy.SPIError as err:
            plpy.error('Analysis failed: %s' % err)

    def get_moran(self, w_type, params):
        """fetch data for moran's i analyses

        @param w_type: neighbor type passed to construct_neighbor_query
        @param params: values used to build the neighbors query
        Returns the query result, or a single row of two Nones when the
        query returns no rows.
        """
        try:
            query = pu.construct_neighbor_query(w_type, params)
            data = plpy.execute(query)

            # if there are no neighbors, exit
            if len(data) == 0:
                return pu.empty_zipped_array(2)
            return data
        except plpy.SPIError as err:
            # format the bound exception; the previous code referenced an
            # undefined name here and raised NameError instead
            plpy.error('Analysis failed: %s' % err)
            # fallback kept for the case where plpy.error returns
            return pu.empty_zipped_array(2)

    def get_nonspatial_kmeans(self, query):
        """fetch data for non-spatial kmeans

        @param query: SQL text executed as-is
        """
        try:
            data = plpy.execute(query)
            return data
        except plpy.SPIError as err:
            plpy.error('Analysis failed: %s' % err)

    def get_spatial_kmeans(self, params):
        """fetch data for spatial kmeans

        @param params: dict with the keys id_col, geom_col and subquery
        Returns one row holding the aggregated ids, xs and ys arrays, each
        ordered by the id column.
        """
        query = ("SELECT "
                 "array_agg({id_col} ORDER BY {id_col}) as ids,"
                 "array_agg(ST_X({geom_col}) ORDER BY {id_col}) As xs,"
                 "array_agg(ST_Y({geom_col}) ORDER BY {id_col}) As ys "
                 "FROM ({subquery}) As a "
                 "WHERE {geom_col} IS NOT NULL").format(**params)
        try:
            data = plpy.execute(query)
            return data
        except plpy.SPIError as err:
            plpy.error('Analysis failed: %s' % err)

View File

@@ -0,0 +1,4 @@
"""Import all functions for clustering"""
from moran import *
from kmeans import *
from getis import *

View File

@@ -0,0 +1,50 @@
"""
Getis-Ord's G geostatistics (hotspot/coldspot analysis)
"""
import pysal as ps
from collections import OrderedDict
# crankshaft modules
import crankshaft.pysal_utils as pu
from crankshaft.analysis_data_provider import AnalysisDataProvider
# High level interface ---------------------------------------
class Getis:
    """Getis-Ord's G* analysis on top of a pluggable data provider."""

    def __init__(self, data_provider=None):
        # use the plpy-backed provider unless one is injected (e.g. in tests)
        self.data_provider = (AnalysisDataProvider()
                              if data_provider is None
                              else data_provider)

    def getis_ord(self, subquery, attr,
                  w_type, num_ngbrs, permutations, geom_col, id_col):
        """
        Getis-Ord's G*
        Neighbors are built by a PostGIS query; the statistic itself comes
        from PySAL's Getis-Ord's G* hotspot/coldspot module.
        Andy Eschbacher

        Returns rows of (z_sim, p_sim, p_z_sim, id).
        """
        # geometries with attributes that are null are ignored
        # resulting in a collection of not as near neighbors if kNN is chosen
        query_params = OrderedDict()
        query_params["id_col"] = id_col
        query_params["attr1"] = attr
        query_params["geom_col"] = geom_col
        query_params["subquery"] = subquery
        query_params["num_ngbrs"] = num_ngbrs

        rows = self.data_provider.get_getis(w_type, query_params)

        values = pu.get_attributes(rows)

        # build PySAL weight object
        w = pu.get_weight(rows, w_type, num_ngbrs)

        # calculate Getis-Ord's G* z- and p-values
        g_local = ps.esda.getisord.G_Local(values, w,
                                           star=True,
                                           permutations=permutations)

        return zip(g_local.z_sim, g_local.p_sim, g_local.p_z_sim, w.id_order)

View File

@@ -0,0 +1,32 @@
from sklearn.cluster import KMeans
import numpy as np
from crankshaft.analysis_data_provider import AnalysisDataProvider
class Kmeans:
    """K-means clustering on top of a pluggable data provider."""

    def __init__(self, data_provider=None):
        # use the plpy-backed provider unless one is injected (e.g. in tests)
        self.data_provider = (AnalysisDataProvider()
                              if data_provider is None
                              else data_provider)

    def spatial(self, query, no_clusters, no_init=20):
        """
            find centers based on clusters of latitude/longitude pairs
            query: SQL query that has a WGS84 geometry (the_geom)
            no_clusters: passed to KMeans as n_clusters
            no_init: passed to KMeans as n_init
            Returns (id, label) pairs.
        """
        provider_params = {"subquery": query,
                           "geom_col": "the_geom",
                           "id_col": "cartodb_id"}

        # the provider answers with one row of aggregated arrays
        first_row = self.data_provider.get_spatial_kmeans(provider_params)[0]
        xs = first_row['xs']
        ys = first_row['ys']
        ids = first_row['ids']

        model = KMeans(n_clusters=no_clusters, n_init=no_init)
        cluster_labels = model.fit_predict(zip(xs, ys))
        return zip(ids, cluster_labels)

View File

@@ -0,0 +1,208 @@
"""
Moran's I geostatistics (global clustering & outliers presence)
"""
# TODO: Fill in local neighbors which have null/NoneType values with the
# average of their neighborhood
import pysal as ps
from collections import OrderedDict
from crankshaft.analysis_data_provider import AnalysisDataProvider
# crankshaft module
import crankshaft.pysal_utils as pu
# High level interface ---------------------------------------
class Moran:
    """
        Moran's I analyses on top of a pluggable data provider.

        Every method fetches attributes and neighbors through the data
        provider, builds a PySAL weights object and runs the matching
        PySAL Moran statistic.
    """

    def __init__(self, data_provider=None):
        # use the plpy-backed provider unless one is injected (e.g. in tests)
        if data_provider is None:
            self.data_provider = AnalysisDataProvider()
        else:
            self.data_provider = data_provider

    def global_stat(self, subquery, attr_name,
                    w_type, num_ngbrs, permutations, geom_col, id_col):
        """
        Moran's I (global)
        Implementation building neighbors with a PostGIS database and Moran's I
         core clusters with PySAL.
        Andy Eschbacher

        Returns a single (I, EI) row.
        """
        params = OrderedDict([("id_col", id_col),
                              ("attr1", attr_name),
                              ("geom_col", geom_col),
                              ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_moran(w_type, params)

        # collect attributes
        attr_vals = pu.get_attributes(result)

        # calculate weights
        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate moran global
        moran_global = ps.esda.moran.Moran(attr_vals, weight,
                                           permutations=permutations)

        return zip([moran_global.I], [moran_global.EI])

    def local_stat(self, subquery, attr,
                   w_type, num_ngbrs, permutations, geom_col, id_col):
        """
        Moran's I implementation for PL/Python
        Andy Eschbacher

        Returns rows of (Is, quadrant label, p_sim, id, y).
        """

        # geometries with attributes that are null are ignored
        # resulting in a collection of not as near neighbors

        params = OrderedDict([("id_col", id_col),
                              ("attr1", attr),
                              ("geom_col", geom_col),
                              ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_moran(w_type, params)

        attr_vals = pu.get_attributes(result)
        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate LISA values
        lisa = ps.esda.moran.Moran_Local(attr_vals, weight,
                                         permutations=permutations)

        # find quadrants for each geometry
        quads = quad_position(lisa.q)

        return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)

    def global_rate_stat(self, subquery, numerator, denominator,
                         w_type, num_ngbrs, permutations, geom_col, id_col):
        """
        Moran's I Rate (global)
        Andy Eschbacher

        Returns a single (I, EI) row.
        """
        # Every pair must be comma separated: without the comma after the
        # attr2 pair, Python tries to call that tuple and raises TypeError.
        params = OrderedDict([("id_col", id_col),
                              ("attr1", numerator),
                              ("attr2", denominator),
                              ("geom_col", geom_col),
                              ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_moran(w_type, params)

        # collect attributes
        numer = pu.get_attributes(result, 1)
        denom = pu.get_attributes(result, 2)

        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate moran global rate
        lisa_rate = ps.esda.moran.Moran_Rate(numer, denom, weight,
                                             permutations=permutations)

        return zip([lisa_rate.I], [lisa_rate.EI])

    def local_rate_stat(self, subquery, numerator, denominator,
                        w_type, num_ngbrs, permutations, geom_col, id_col):
        """
            Moran's I Local Rate
            Andy Eschbacher

            Returns rows of (Is, quadrant label, p_sim, id, y).
        """
        # geometries with values that are null are ignored
        # resulting in a collection of not as near neighbors

        params = OrderedDict([("id_col", id_col),
                              ("numerator", numerator),
                              ("denominator", denominator),
                              ("geom_col", geom_col),
                              ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_moran(w_type, params)

        # collect attributes
        numer = pu.get_attributes(result, 1)
        denom = pu.get_attributes(result, 2)

        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate LISA values
        lisa = ps.esda.moran.Moran_Local_Rate(numer, denom, weight,
                                              permutations=permutations)

        # find quadrants for each geometry
        quads = quad_position(lisa.q)

        return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)

    def local_bivariate_stat(self, subquery, attr1, attr2,
                             permutations, geom_col, id_col,
                             w_type, num_ngbrs):
        """
            Moran's I (local) Bivariate (untested)

            Note that permutations, geom_col and id_col come before w_type
            and num_ngbrs here, unlike in the other methods.
            Returns rows of (Is, quadrant label, p_sim, id).
        """

        params = OrderedDict([("id_col", id_col),
                              ("attr1", attr1),
                              ("attr2", attr2),
                              ("geom_col", geom_col),
                              ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_moran(w_type, params)

        # collect attributes
        attr1_vals = pu.get_attributes(result, 1)
        attr2_vals = pu.get_attributes(result, 2)

        # create weights
        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate LISA values
        lisa = ps.esda.moran.Moran_Local_BV(attr1_vals, attr2_vals, weight,
                                            permutations=permutations)

        # find clustering of significance
        lisa_sig = quad_position(lisa.q)

        return zip(lisa.Is, lisa_sig, lisa.p_sim, weight.id_order)
# Low level functions ----------------------------------------
def map_quads(coord):
    """
        Translate a quadrant number into its Moran's I label
        (1 -> 'HH', 2 -> 'LH', 3 -> 'LL', 4 -> 'HL')
        Input:
        @param coord (int): quadrant of a specific measurement
        Output:
        the label string, or None when coord is not one of 1-4
    """
    labels_by_quadrant = ((1, 'HH'), (2, 'LH'), (3, 'LL'), (4, 'HL'))
    for quadrant, label in labels_by_quadrant:
        if coord == quadrant:
            return label
    return None
def quad_position(quads):
    """
        Label every quadrant number in a sequence with map_quads
        Input:
        @param quads ndarray: an array of quads classified by
          1-4 (PySAL default)
        Output:
        @param list: the matching labels ('HH', 'LL', etc.), in input order
    """
    labels = []
    for quad in quads:
        labels.append(map_quads(quad))
    return labels

View File

@@ -0,0 +1,2 @@
"""Import all functions for pysal_utils"""
from crankshaft.pysal_utils.pysal_utils import *

View File

@@ -0,0 +1,211 @@
"""
Utilities module for generic PySAL functionality, mainly centered on
translating queries into numpy arrays or PySAL weights objects
"""
import numpy as np
import pysal as ps
def construct_neighbor_query(w_type, query_vals):
    """Return the SQL string used for finding neighbors

        @param w_type text: type of neighbors to calculate ('knn' or 'queen');
          matched case-insensitively, anything other than 'knn' gives queen
        @param query_vals dict: values used to construct the query
    """
    builder = knn if w_type.lower() == 'knn' else queen
    return builder(query_vals)
# Build weight object
def get_weight(query_res, w_type='knn', num_ngbrs=5):
    """
        Construct PySAL weight from return value of query
        @param query_res dict-like: query results with attributes and
          neighbors; every row needs the keys 'id' and 'neighbors'
        @param w_type text: currently unused, kept for the callers
        @param num_ngbrs int: currently unused, kept for the callers
        Returns the ps.W object with its transform set to 'r'.
    """
    # map every id to the ids of its neighbors
    neighbors = {x['id']: x['neighbors'] for x in query_res}

    built_weight = ps.W(neighbors)
    # NOTE(review): 'r' should be PySAL's row-standardized transform;
    # confirm against the PySAL W documentation
    built_weight.transform = 'r'

    return built_weight
def query_attr_select(params):
    """
        Build the attribute part of the SELECT list of a neighbors query.
        Columns are aliased attr1, attr2, ... following the order of params.
        @param params: dict of information used in query (column names,
                       table name, etc.)
        Example:
        OrderedDict([('numerator', 'price'),
                     ('denominator', 'sq_meters'),
                     ('subquery', 'SELECT * FROM interesting_data')])
        Output:
          "i.\"price\"::numeric As attr1, " \
          "i.\"sq_meters\"::numeric As attr2, "
    """
    template = "i.\"%(col)s\"::numeric As attr%(alias_num)s, "

    if 'time_cols' in params:
        # markov analysis: the column names are listed directly
        columns = list(params['time_cols'])
    else:
        # moran's analysis: every key that is not query plumbing names a
        # column
        reserved = ('id_col', 'geom_col', 'subquery', 'num_ngbrs')
        columns = [params[key] for key in params if key not in reserved]

    pieces = [template % {"col": column, "alias_num": position + 1}
              for position, column in enumerate(columns)]
    return "".join(pieces)
def query_attr_where(params):
    """
        Construct where conditions when building neighbors query
        Create portion of WHERE clauses for weeding out NULL-valued geometries
        Input: dict of params:
            {'subquery': ...,
             'numerator': 'data1',
             'denominator': 'data2',
             '': ...}
        Output:
          'idx_replace."data1" IS NOT NULL AND idx_replace."data2" IS NOT NULL
           AND idx_replace."data2" <> 0'
        Input:
        {'subquery': ...,
         'time_cols': ['time1', 'time2', 'time3'],
         'etc': ...}
        Output: 'idx_replace."time1" IS NOT NULL AND idx_replace."time2" IS NOT
          NULL AND idx_replace."time3" IS NOT NULL'

        The literal prefix idx_replace is a placeholder that the callers
        swap for the table alias.
    """
    template = "idx_replace.\"%s\" IS NOT NULL"

    if 'time_cols' in params:
        # markov where clauses: the column names are listed directly
        conditions = [template % col for col in params['time_cols']]
    else:
        # moran where clauses: every key that is not query plumbing names
        # a column
        attrs = [k for k in params
                 if k not in ('id_col', 'geom_col', 'subquery', 'num_ngbrs')]
        conditions = [template % params[attr] for attr in attrs]

        # The zero check is tied to the 'denominator' key, which only
        # exists in this branch. Keeping it here means a markov time
        # column named "denominator" cannot trigger a failing lookup.
        if 'denominator' in attrs:
            conditions.append(
                "idx_replace.\"%s\" <> 0" % params['denominator'])

    return " AND ".join(conditions)
def knn(params):
    """SQL query for k-nearest neighbors.

        @param params: dict of values to fill the template; needs id_col,
          geom_col, subquery and num_ngbrs plus the attribute entries
          consumed by query_attr_select / query_attr_where
        Returns the finished query string.
    """
    select_part = query_attr_select(params)
    where_part = query_attr_where(params)

    # {name} fields are filled by str.format from params at the end;
    # %(name)s fields are filled first with the generated fragments.
    template = (
        'SELECT '
        'i."{id_col}" As id, '
        '%(attr_select)s'
        '(SELECT ARRAY(SELECT j."{id_col}" '
        'FROM ({subquery}) As j '
        'WHERE '
        'i."{id_col}" <> j."{id_col}" AND '
        '%(attr_where_j)s '
        'ORDER BY '
        'j."{geom_col}" <-> i."{geom_col}" ASC '
        'LIMIT {num_ngbrs})'
        ') As neighbors '
        'FROM ({subquery}) As i '
        'WHERE '
        '%(attr_where_i)s '
        'ORDER BY i."{id_col}" ASC;'
    )

    # the same conditions are applied to the outer (i) and inner (j) alias
    fragments = {
        "attr_select": select_part,
        "attr_where_i": where_part.replace("idx_replace", "i"),
        "attr_where_j": where_part.replace("idx_replace", "j"),
    }

    with_fragments = template % fragments
    return with_fragments.format(**params)
# SQL query for finding queens neighbors (all contiguous polygons)
def queen(params):
    """SQL query for queen neighbors.

        @param params dict: information to fill query; needs id_col,
          geom_col and subquery plus the attribute entries consumed by
          query_attr_select / query_attr_where
        Returns the finished query string.
    """
    select_part = query_attr_select(params)
    where_part = query_attr_where(params)

    # {name} fields are filled by str.format from params at the end;
    # %(name)s fields are filled first with the generated fragments.
    template = (
        'SELECT '
        'i."{id_col}" As id, '
        '%(attr_select)s'
        '(SELECT ARRAY(SELECT j."{id_col}" '
        'FROM ({subquery}) As j '
        'WHERE i."{id_col}" <> j."{id_col}" AND '
        'ST_Touches(i."{geom_col}", j."{geom_col}") AND '
        '%(attr_where_j)s)'
        ') As neighbors '
        'FROM ({subquery}) As i '
        'WHERE '
        '%(attr_where_i)s '
        'ORDER BY i."{id_col}" ASC;'
    )

    # the same conditions are applied to the outer (i) and inner (j) alias
    fragments = {
        "attr_select": select_part,
        "attr_where_i": where_part.replace("idx_replace", "i"),
        "attr_where_j": where_part.replace("idx_replace", "j"),
    }

    with_fragments = template % fragments
    return with_fragments.format(**params)
# to add more weight methods open a ticket or pull request
def get_attributes(query_res, attr_num=1):
    """
        Collect one attribute column from the query result as floats
        @param query_res: query results with attributes and neighbors;
          every row needs the key 'attr<attr_num>'
        @param attr_num: attribute number (1, 2, ...)
        Returns a float numpy array with one value per row, in row order.
    """
    key = 'attr' + str(attr_num)
    # np.float was only an alias of the builtin float and no longer
    # exists in newer NumPy releases; the builtin gives the same dtype.
    return np.array([x[key] for x in query_res], dtype=float)
def empty_zipped_array(num_nones):
    """
        Build the placeholder result used when a query returns no rows
        Input:
        @param num_nones int: number of columns (e.g., 4)
        Output:
        a list holding one tuple of num_nones Nones, e.g.
        [(None, None, None, None)]
    """
    placeholder_row = (None,) * num_nones
    return [placeholder_row]

View File

@@ -0,0 +1,11 @@
"""Random seed generator used for non-deterministic functions in crankshaft"""
import random
import numpy
def set_random_seeds(value):
    """
    Seed both RNGs (Random Number Generators) used internally, the
    standard library's and numpy's, with the same value.
    """
    for seed_rng in (random.seed, numpy.random.seed):
        seed_rng(value)

View File

@@ -0,0 +1 @@
from segmentation import *

View File

@@ -0,0 +1,176 @@
"""
Segmentation creation and prediction
"""
import sklearn
import numpy as np
import plpy
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics
from sklearn.cross_validation import train_test_split
# Lower level functions
#----------------------
def replace_nan_with_mean(array):
    """
        Fill NaN entries with the mean of the non-NaN values, in place
        Input:
        @param array: a 1-D or 2-D array of floats which may have NaN entries
        Output:
        the same array; for 1-D input NaNs are replaced by the mean of
        the remaining values, for 2-D input by the mean of their column
    """
    nan_mask = np.isnan(array)

    if np.ndim(array) == 1:
        # A 1-D array has no column index, so the (row, col) loop below
        # cannot unpack its NaN positions; fill it in one step instead.
        if nan_mask.any():
            array[nan_mask] = np.mean(array[~nan_mask])
        return array

    # np.where returns one index array per dimension (rows, columns)
    indices = np.where(nan_mask)

    # iterate through entries which have nan values
    for row, col in zip(*indices):
        array[row, col] = np.mean(array[~np.isnan(array[:, col]), col])

    return array
def get_data(variable, feature_columns, query):
    """
    Fetch data from the database, clean, and package into
    numpy arrays

    Input:
    @param variable: name of the target variable
    @param feature_columns: iterable of column names
    @param query: subquery that data is pulled from for the packaging
    Output:
    tuple (target, features) of NumPy float arrays after being passed
    through replace_nan_with_mean; `features` has one column per entry of
    feature_columns
    """
    # freeze the iteration order so the SELECT list and the stacked feature
    # columns are guaranteed to line up
    feature_columns = list(feature_columns)

    columns = ','.join(['array_agg("{col}") As "{col}"'.format(col=col)
                        for col in feature_columns])

    try:
        data = plpy.execute('''SELECT array_agg("{variable}") As target, {columns} FROM ({query}) As a'''.format(
            variable=variable,
            columns=columns,
            query=query))
    except Exception as e:
        plpy.error('Failed to access data to build segmentation model: %s' % e)

    # extract target data from plpy object; dtype=float (as for the features
    # below) turns SQL NULLs (None) into NaN instead of producing an object
    # array that np.isnan cannot handle
    target = np.array(data[0]['target'], dtype=float)

    # put n feature data arrays into an n x m array of arrays
    features = np.column_stack([np.array(data[0][col], dtype=float)
                                for col in feature_columns])

    return replace_nan_with_mean(target), replace_nan_with_mean(features)
# High level interface
# --------------------
def create_and_predict_segment_agg(target, features, target_features, target_ids, model_parameters):
    """
    Version of create_and_predict_segment that works on arrays that come
    straight from the SQL calling the function.

    Input:
    @param target: The 1D array of length NSamples containing the target
      variable we want the model to predict
    @param features: The 2D array of size NSamples * NFeatures that form the
      input to the model
    @param target_features: The 2D array of feature rows the trained model
      is asked to predict on
    @param target_ids: A 1D array of target_ids that will be used to
      associate the results of the prediction with the rows which they come
      from
    @param model_parameters: A dictionary containing parameters for the
      model.
    Output:
    list of (target_id, prediction, accuracy) tuples; the accuracy value
    returned by train_model is repeated for every row
    """
    clean_target = replace_nan_with_mean(target)
    clean_features = replace_nan_with_mean(features)
    target_features = replace_nan_with_mean(target_features)

    model, accuracy = train_model(clean_target, clean_features, model_parameters, 0.2)
    prediction = model.predict(target_features)

    # one accuracy entry per predicted row; the plain list is zipped directly
    # (as create_and_predict_segment does) so np.full, which the pinned
    # numpy 1.6.1 does not provide, is not needed
    accuracy_array = [accuracy] * prediction.shape[0]
    return list(zip(target_ids, prediction, accuracy_array))
def create_and_predict_segment(query, variable, target_query, model_params):
    """
    generate a segment with machine learning
    Stuart Lynn

    Input:
    @param query: subquery providing the training rows
    @param variable: name of the target column inside `query`
    @param target_query: query providing the rows to predict on
    @param model_params: dictionary of parameters passed to train_model
    Output:
    list of (cartodb_id, prediction, accuracy) tuples; the accuracy value
    returned by train_model is repeated for every row
    """
    # fetch column names
    try:
        columns = plpy.execute('SELECT * FROM ({query}) As a LIMIT 1 '.format(query=query))[0].keys()
    except Exception as e:
        plpy.error('Failed to build segmentation model: %s' % e)

    # extract column names to be used in building the segmentation model:
    # everything except the target and the id/geometry columns
    feature_columns = set(columns) - set([variable, 'cartodb_id', 'the_geom', 'the_geom_webmercator'])

    # get data from database
    target, features = get_data(variable, feature_columns, query)

    model, accuracy = train_model(target, features, model_params, 0.2)
    cartodb_ids, result = predict_segment(model, feature_columns, target_query)

    accuracy_array = [accuracy] * result.shape[0]
    return list(zip(cartodb_ids, result, accuracy_array))
def train_model(target, features, model_params, test_split):
    """
    Train the Gradient Boosting model on the provided data and calculate the
    accuracy of the model

    Input:
    @param target: 1D Array of the variable that the model is to be trained
      to predict
    @param features: 2D Array NSamples * NFeatures to use in training the
      model
    @param model_params: A dictionary of model parameters, the full
      specification can be found on the scikit learn page for
      [GradientBoostingRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
    @param test_split: The fraction of the data to be withheld for testing
      the model / calculating the accuracy
    Output:
    tuple (model, accuracy) where accuracy is the value returned by
    calculate_model_accuracy for the withheld test rows
    """
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=test_split)

    model = GradientBoostingRegressor(**model_params)
    model.fit(features_train, target_train)

    # score on the withheld rows only; scoring on the full data set would
    # include the rows the model was fitted on
    accuracy = calculate_model_accuracy(model, features_test, target_test)
    return model, accuracy
def calculate_model_accuracy(model, features, target):
    """
    Score a model by the mean squared error of its predictions.

    Input:
    @param model: fitted model exposing a `predict` method
    @param features: features to make a prediction from
    @param target: known values the prediction is compared to
    Output:
    mean squared error between the model's prediction and the target
    """
    return metrics.mean_squared_error(model.predict(features), target)
def predict_segment(model, features, target_query):
    """
    Use the provided model to predict the values for the new feature set

    Input:
    @param model: The pretrained model
    @param features: A list of features to use in the model prediction
      (list of column names)
    @param target_query: The query to run to obtain the data to predict on
      and the cartodb_ids associated with it.
    Output:
    tuple (cartodb_ids, predictions); predictions is a 1D NumPy array
    (empty when target_query yields no rows)
    """
    batch_size = 1000
    joined_features = ','.join(['"{0}"::numeric'.format(a) for a in features])

    try:
        # rows are ordered by cartodb_id so that they line up with the id
        # array fetched below, which is aggregated in the same order
        cursor = plpy.cursor('SELECT Array[{joined_features}] As features FROM ({target_query}) As a ORDER BY a.cartodb_id'.format(
            joined_features=joined_features,
            target_query=target_query))
    except Exception as e:
        plpy.error('Failed to build segmentation model: %s' % e)

    results = []
    while True:
        rows = cursor.fetch(batch_size)
        if not rows:
            break
        batch = np.row_stack([np.array(row['features'], dtype=float) for row in rows])

        # Need to fix this. Should be global mean. This will cause weird effects
        batch = replace_nan_with_mean(batch)
        prediction = model.predict(batch)
        results.append(prediction)

    try:
        cartodb_ids = plpy.execute('''SELECT array_agg(cartodb_id ORDER BY cartodb_id) As cartodb_ids FROM ({0}) As a'''.format(target_query))[0]['cartodb_ids']
    except Exception as e:
        plpy.error('Failed to build segmentation model: %s' % e)

    if not results:
        # no rows to predict on: np.concatenate rejects an empty list, so
        # hand back empty containers instead
        return cartodb_ids or [], np.array([], dtype=float)

    return cartodb_ids, np.concatenate(results)

View File

@@ -0,0 +1,2 @@
"""Import all functions from clustering libraries."""
from markov import *

View File

@@ -0,0 +1,194 @@
"""
Spatial dynamics measurements using Spatial Markov
"""
# TODO: remove all plpy dependencies
import numpy as np
import pysal as ps
import plpy
import crankshaft.pysal_utils as pu
from crankshaft.analysis_data_provider import AnalysisDataProvider
class Markov:
    """Spatial Markov trend analysis fed by a pluggable data provider."""

    def __init__(self, data_provider=None):
        """
        @param data_provider (optional): object exposing `get_markov`; a
          fresh AnalysisDataProvider is created when none is given
        """
        self.data_provider = (AnalysisDataProvider()
                              if data_provider is None
                              else data_provider)

    def spatial_trend(self, subquery, time_cols, num_classes=7,
                      w_type='knn', num_ngbrs=5, permutations=0,
                      geom_col='the_geom', id_col='cartodb_id'):
        """
        Predict the trends of a unit based on:
        1. history of its transitions to different classes (e.g., 1st
           quantile -> 2nd quantile)
        2. average class of its neighbors

        Inputs:
        @param subquery string: e.g., SELECT the_geom, cartodb_id,
          interesting_time_column FROM table_name
        @param time_cols list of strings: list of strings of column names
        @param num_classes (optional): number of classes to break
          distribution of values into. Currently uses quantile bins.
        @param w_type string (optional): weight type ('knn' or 'queen')
        @param num_ngbrs int (optional): number of neighbors (if knn type)
        @param permutations int (optional): number of permutations for test
          stats
        @param geom_col string (optional): name of column which contains
          the geometries
        @param id_col string (optional): name of column which has the ids
          of the table

        Outputs:
        zip of (trend, trend_up, trend_down, volatility, id) per unit, where
        @param trend float: (trend_up - trend_down) / trend_static
        @param trend_up float: probablity that a geom will move to a higher
          class
        @param trend_down float: probablity that a geom will move to a
          lower class
        @param volatility float: a measure of the volatility based on
          probability stddev(prob array)
        """
        if len(time_cols) < 2:
            plpy.error('More than one time column needs to be passed')

        params = dict(id_col=id_col,
                      time_cols=time_cols,
                      geom_col=geom_col,
                      subquery=subquery,
                      num_ngbrs=num_ngbrs)

        query_result = self.data_provider.get_markov(w_type, params)

        # build the weights object and row-standardize it
        weights = pu.get_weight(query_result, w_type)
        weights.transform = 'r'

        # one row per unit, one column per time step
        t_data = get_time_data(query_result, time_cols)

        sp_markov_result = ps.Spatial_Markov(t_data,
                                             weights,
                                             k=num_classes,
                                             fixed=False,
                                             permutations=permutations)

        # quantile class of the spatial lag of the last time column
        lagged_last_period = ps.lag_spatial(weights, t_data[:, -1])
        lag_classes = ps.Quantiles(lagged_last_period, k=num_classes).yb

        # class of every unit in the last time column
        last_classes = sp_markov_result.classes[:, -1]

        # probability distribution of each unit given its own class and the
        # class of its lag
        prob_dist = get_prob_dist(sp_markov_result.P,
                                  lag_classes,
                                  last_classes)

        # ups, downs and overall distribution of each cell
        trend_up, trend_down, trend, volatility = get_prob_stats(
            prob_dist, last_classes)

        return zip(trend, trend_up, trend_down, volatility, weights.id_order)
def get_time_data(markov_data, time_cols):
    """
    Collect the time attributes ('attr1', 'attr2', ...) of every row into a
    float array with one row per entry of markov_data and one column per
    time column.

    @param markov_data: query results; each entry is indexable by
      'attr1' ... 'attr<len(time_cols)>'
    @param time_cols: list of time column names (only its length is used)
    """
    per_attribute = []
    for attr_number in range(1, len(time_cols) + 1):
        key = 'attr' + str(attr_number)
        per_attribute.append([row[key] for row in markov_data])

    # gathered attribute-major above; flip so that units are the rows
    return np.array(per_attribute, dtype=float).T
# not currently used
def rebin_data(time_data, num_time_per_bin):
    """
    Convert an n x l matrix into an n x ceil(l / m) matrix where every run
    of m consecutive columns (time steps) is reduced (averaged) to a single
    column:

    1 2 3 4    1.5 3.5
    5 6 7 8 -> 5.5 7.5
    9 8 7 6    8.5 6.5
    5 4 3 2    4.5 2.5

    if m = 2, the 4 x 4 matrix is transformed to a 4 x 2 matrix.

    This process effectively resamples the data at a time span m units
    longer than the input data.
    For cases when there is a remainder (remainder(5/3) = 2), the remaining
    two columns are binned together as the last time period, while the
    first three are binned together for the first period.

    Input:
    @param time_data n x l ndarray: measurements of an attribute at
      different time intervals
    @param num_time_per_bin int: number of columns to average into a new
      column
    Output:
    n x ceil(l / m) ndarray of resampled time series
    """
    num_cols = time_data.shape[1]

    # ceiling division done with floor division so the bin count stays an
    # integer under both Python 2 and Python 3 division rules; a partial
    # last bin therefore gets its own column
    n_max = -(-num_cols // num_time_per_bin)

    return np.array(
        [time_data[:, num_time_per_bin * i:num_time_per_bin * (i + 1)].mean(axis=1)
         for i in range(n_max)]).T
def get_prob_dist(transition_matrix, lag_indices, unit_indices):
    """
    Given an array of transition matrices, look up the probability
    distribution associated with each (lag class, unit class) pair

    Input:
    @param transition_matrix ndarray[k,k,k]: indexed first by lag class,
      then by unit class
    @param lag_indices ndarray: lag class of every unit
    @param unit_indices ndarray: own class of every unit
    Output:
    Array of probability distributions, one row per entry of lag_indices
    """
    distributions = []
    for position, lag_class in enumerate(lag_indices):
        unit_class = unit_indices[position]
        distributions.append(transition_matrix[(lag_class, unit_class)])
    return np.array(distributions)
def get_prob_stats(prob_dist, unit_indices):
    """
    get the statistics of the probability distributions

    Inputs:
    @param prob_dist ndarray(float): 2D array with one probability
      distribution (over classes) per row
    @param unit_indices ndarray(int): class index of each unit, i.e. the
      column of its row in prob_dist that corresponds to staying put

    Outputs:
    @param trend_up ndarray(float): sum of probabilities for upward
      movement (relative to the unit index of that prob)
    @param trend_down ndarray(float): sum of probabilities for downward
      movement (relative to the unit index of that prob)
    @param trend ndarray(float): difference of upward and downward
      movements divided by the probability of staying in the same class;
      NaN where that probability is not positive
    @param volatility ndarray(float): standard deviation of each row of
      prob_dist
    """
    num_elements = len(unit_indices)
    trend_up = np.empty(num_elements, dtype=float)
    trend_down = np.empty(num_elements, dtype=float)
    trend = np.empty(num_elements, dtype=float)

    for i, unit_class in enumerate(unit_indices):
        # classes above / below the unit's current class
        trend_up[i] = prob_dist[i, (unit_class + 1):].sum()
        trend_down[i] = prob_dist[i, :unit_class].sum()

        stay_prob = prob_dist[i, unit_class]
        if stay_prob > 0.0:
            trend[i] = (trend_up[i] - trend_down[i]) / stay_prob
        else:
            # undefined ratio: store NaN explicitly rather than relying on
            # None being coerced by the float array
            trend[i] = np.nan

    # calculate volatility of distribution
    volatility = prob_dist.std(axis=1)

    return trend_up, trend_down, trend, volatility

View File

@@ -0,0 +1,5 @@
joblib==0.8.3
numpy==1.6.1
scipy==0.14.0
pysal==1.11.2
scikit-learn==0.14.1

View File

@@ -0,0 +1,49 @@
"""
CartoDB Spatial Analysis Python Library
See:
https://github.com/CartoDB/crankshaft
"""
from setuptools import setup, find_packages
# Trove-style classifier strings describing the package.
_CLASSIFIERS = [
    'Development Status :: 3 - Alpha',
    'Intended Audience :: Mapping comunity',
    'Topic :: Maps :: Mapping Tools',
    'License :: OSI Approved :: MIT License',
    'Programming Language :: Python :: 2.7',
]

# Optional dependency groups.
_EXTRAS_REQUIRE = {
    'dev': ['unittest'],
    'test': ['unittest', 'nose', 'mock'],
}

# The choice of component versions is dictated by what's
# provisioned in the production servers.
# IMPORTANT NOTE: please don't change this line. Instead issue a ticket to systems for evaluation.
_INSTALL_REQUIRES = ['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.11.2', 'scikit-learn==0.14.1']

setup(
    name='crankshaft',
    version='0.5.1',
    description='CartoDB Spatial Analysis Python Library',
    url='https://github.com/CartoDB/crankshaft',
    author='Data Services Team - CartoDB',
    author_email='dataservices@cartodb.com',
    license='MIT',
    classifiers=_CLASSIFIERS,
    keywords='maps mapping tools spatial analysis geostatistics',
    packages=find_packages(exclude=['contrib', 'docs', 'tests']),
    extras_require=_EXTRAS_REQUIRE,
    install_requires=_INSTALL_REQUIRES,
    requires=['pysal', 'numpy', 'sklearn'],
    test_suite='test'
)

View File

@@ -0,0 +1 @@
[[0.004793783909323601, 0.17999999999999999, 0.49808756424021061], [-1.0701189472090842, 0.079000000000000001, 0.14228288580832316], [-0.67867750971877305, 0.42099999999999999, 0.24867110969448558], [-0.67407386707620487, 0.246, 0.25013217644612995], [-0.79495689068870035, 0.33200000000000002, 0.21331928959090596], [-0.49279481022182703, 0.058999999999999997, 0.31107878905057329], [-0.38075627530057132, 0.28399999999999997, 0.35169205342069643], [-0.86710921611314895, 0.23699999999999999, 0.19294108571294855], [-0.78618647240956485, 0.050000000000000003, 0.2158791250244505], [-0.76108527223116984, 0.064000000000000001, 0.22330306830813684], [-0.13340753531942209, 0.247, 0.44693554317763651], [-0.57584545722033043, 0.48999999999999999, 0.28235982246156488], [-0.78882694661192831, 0.433, 0.2151065788731219], [-0.38769767950046219, 0.375, 0.34911988661484239], [-0.56057819488052207, 0.41399999999999998, 0.28754255985169652], [-0.41354017495644935, 0.45500000000000002, 0.339605447117173], [-0.23993577722243081, 0.49099999999999999, 0.40519002230969337], [-0.1389080156677496, 0.40400000000000003, 0.44476141839645233], [-0.25485737510500855, 0.376, 0.39941662953554224], [-0.71218610582902353, 0.17399999999999999, 0.23817476979886087], [-0.54533105995872144, 0.13700000000000001, 0.2927629228714812], [-0.39547917847510977, 0.033000000000000002, 0.34624464252424236], [-0.43052658996257548, 0.35399999999999998, 0.33340631435564982], [-0.37296719193774736, 0.40300000000000002, 0.35458643102865428], [-0.66482612169465694, 0.31900000000000001, 0.25308085650392698], [-0.13772133540823422, 0.34699999999999998, 0.44523032843016275], [-0.6765304487868502, 0.20999999999999999, 0.24935196033890672], [-0.64518763494323472, 0.32200000000000001, 0.25940279912025543], [-0.5078622084312413, 0.41099999999999998, 0.30577498972600159], [-0.12652006733772059, 0.42899999999999999, 0.44966013262301163], [-0.32691133022814595, 0.498, 0.37186747562269029], [0.25533848511500978, 
0.42399999999999999, 0.39923083899077472], [2.7045138116476508, 0.0050000000000000001, 0.0034202212972238577], [-0.1551614486076057, 0.44400000000000001, 0.43834701985429037], [1.9524487722567723, 0.012999999999999999, 0.025442473674991528], [-1.2055816465306763, 0.017000000000000001, 0.11398941970467646], [3.478472976017831, 0.002, 0.00025213964072468009], [-1.4621715757903719, 0.002, 0.071847099325659136], [-0.84010307600180256, 0.085000000000000006, 0.20042529779230778], [5.7097646237318243, 0.0030000000000000001, 5.6566262784940591e-09], [1.5082367956567375, 0.065000000000000002, 0.065746966514827365], [-0.58337270103430816, 0.44, 0.27982121546450034], [-0.083271860457022437, 0.45100000000000001, 0.46681768733385554], [-0.46872337815000953, 0.34599999999999997, 0.31963368715684204], [0.18490279849545319, 0.23799999999999999, 0.42665263797981101], [3.470424529947997, 0.012, 0.00025981817437825683], [-0.99942612137154796, 0.032000000000000001, 0.15879415560388499], [-1.3650387953594485, 0.034000000000000002, 0.08612042845912049], [1.8617160516432014, 0.081000000000000003, 0.03132156240215267], [1.1321188945775384, 0.11600000000000001, 0.12879222611766061], [0.064116686050580601, 0.27300000000000002, 0.4744386578180424], [-0.42032194540259099, 0.29999999999999999, 0.33712514016213468], [-0.79581215423980922, 0.123, 0.21307061309098785], [-0.42792753720906046, 0.45600000000000002, 0.33435193892883741], [-1.0629378527428395, 0.051999999999999998, 0.14390506780140866], [-0.54164761752225477, 0.33700000000000002, 0.29403064095211839], [1.0934778886820793, 0.13700000000000001, 0.13709201601893539], [-0.094068785378413719, 0.38200000000000001, 0.46252725802998929], [0.13482026574801856, 0.36799999999999999, 0.44637699118865737], [-0.13976995315653129, 0.34699999999999998, 0.44442087706276601], [-0.051047663924746682, 0.32000000000000001, 0.47964376985626245], [-0.21468297736730158, 0.41699999999999998, 0.41500724761906527], [-0.20873154637330626, 0.38800000000000001, 
0.41732890604390893], [-0.32427876152583485, 0.49199999999999999, 0.37286349875557478], [-0.65254842943280977, 0.374, 0.25702372075306734], [-0.48611858196118796, 0.23300000000000001, 0.31344154643990074], [-0.14482354344529477, 0.32600000000000001, 0.44242509660469886], [-0.51052030974200002, 0.439, 0.30484349480873729], [0.56814382285283538, 0.14999999999999999, 0.28496865660103166], [0.58680919931668207, 0.161, 0.27866592887231878], [0.013390357044409013, 0.25800000000000001, 0.49465818005865647], [-0.19050728887961568, 0.41399999999999998, 0.4244558160399462], [-0.60531777422216049, 0.35199999999999998, 0.2724839368239631], [1.0899331115425805, 0.127, 0.13787130480311838], [0.17015055382651084, 0.36899999999999999, 0.43244586845546418], [-0.21738337124409801, 0.40600000000000003, 0.41395479459421991], [1.0329303331079593, 0.079000000000000001, 0.15081825117169467], [1.0218317101096221, 0.104, 0.15343027913308094]]

View File

@@ -0,0 +1 @@
[{"xs": [9.917239463463458, 9.042767302696836, 10.798929825304187, 8.763751051762995, 11.383882954810852, 11.018206993460897, 8.939526075734316, 9.636159342565252, 10.136336896960058, 11.480610059427342, 12.115011910725082, 9.173267848893428, 10.239300931201738, 8.00012512174072, 8.979962292282131, 9.318376124429575, 10.82259513754284, 10.391747171927115, 10.04904588886165, 9.96007160443463, -0.78825626804569, -0.3511819898577426, -1.2796410003764271, -0.3977049391203402, 2.4792311265774667, 1.3670311632092624, 1.2963504112955613, 2.0404844103073025, -1.6439708506073223, 0.39122885445645805, 1.026031821452462, -0.04044477160482201, -0.7442346929085072, -0.34687120826243034, -0.23420359971379054, -0.5919629143336708, -0.202903054395391, -0.1893399644841902, 1.9331834251176807, -0.12321054392851609], "ys": [8.735627063679981, 9.857615954045011, 10.81439096759407, 10.586727233537191, 9.232919976568622, 11.54281262696508, 8.392787912674466, 9.355119689665944, 9.22380703532752, 10.542142541823122, 10.111980619367035, 10.760836265570738, 8.819773453269804, 10.25325722424816, 9.802077905695608, 8.955420161552611, 9.833801181904477, 10.491684241001613, 12.076108669877556, 11.74289693140474, -0.5685725015474191, -0.5715728344759778, -0.20180907868635137, 0.38431336480089595, -0.3402202083684184, -2.4652736827783586, 0.08295159401756182, 0.8503818775816505, 0.6488691600321166, 0.5794762568230527, -0.6770063922144103, -0.6557616416449478, -1.2834289177624947, 0.1096318195532717, -0.38986922166834853, -1.6224497706950238, 0.09429787743230483, 0.4005097316394031, -0.508002811195673, -1.2473463371366507], "ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]}]

View File

@@ -0,0 +1 @@
[[0.11111111111111112, 0.10000000000000001, 0.0, 0.35213633723318016, 0], [0.03125, 0.030303030303030304, 0.0, 0.3850273981640871, 1], [0.03125, 0.030303030303030304, 0.0, 0.3850273981640871, 2], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 3], [0.0, 0.065217391304347824, 0.065217391304347824, 0.33605067580764519, 4], [-0.054054054054054057, 0.0, 0.05128205128205128, 0.37488547451276033, 5], [0.1875, 0.23999999999999999, 0.12, 0.23731835158706122, 6], [0.034482758620689655, 0.0625, 0.03125, 0.35388469167230169, 7], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 8], [0.19047619047619049, 0.16, 0.0, 0.32594478059941379, 9], [-0.23529411764705882, 0.0, 0.19047619047619047, 0.31356338348865387, 10], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 11], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 12], [0.027777777777777783, 0.11111111111111112, 0.088888888888888892, 0.30339641183779581, 13], [0.03125, 0.030303030303030304, 0.0, 0.3850273981640871, 14], [0.052631578947368425, 0.090909090909090912, 0.045454545454545456, 0.33352611505171165, 15], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 16], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 17], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 18], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 19], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 20], [0.078947368421052641, 0.073170731707317083, 0.0, 0.36451788667842738, 21], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 22], [-0.16666666666666663, 0.18181818181818182, 0.27272727272727271, 0.20246415864836445, 23], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 24], [0.1875, 0.23999999999999999, 0.12, 
0.23731835158706122, 25], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 26], [-0.043478260869565216, 0.0, 0.041666666666666664, 0.37950991789118999, 27], [0.22222222222222221, 0.18181818181818182, 0.0, 0.31701083225750354, 28], [-0.054054054054054057, 0.0, 0.05128205128205128, 0.37488547451276033, 29], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 30], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 31], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 32], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 33], [0.034482758620689655, 0.0625, 0.03125, 0.35388469167230169, 34], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 35], [-0.054054054054054057, 0.0, 0.05128205128205128, 0.37488547451276033, 36], [0.11111111111111112, 0.10000000000000001, 0.0, 0.35213633723318016, 37], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 38], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 39], [0.034482758620689655, 0.0625, 0.03125, 0.35388469167230169, 40], [0.11111111111111112, 0.10000000000000001, 0.0, 0.35213633723318016, 41], [0.052631578947368425, 0.090909090909090912, 0.045454545454545456, 0.33352611505171165, 42], [0.0, 0.0, 0.0, 0.40000000000000002, 43], [0.0, 0.065217391304347824, 0.065217391304347824, 0.33605067580764519, 44], [0.078947368421052641, 0.073170731707317083, 0.0, 0.36451788667842738, 45], [0.052631578947368425, 0.090909090909090912, 0.045454545454545456, 0.33352611505171165, 46], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 47]]

View File

@@ -0,0 +1,52 @@
[[0.9319096128346788, "HH"],
[-1.135787401862846, "HL"],
[0.11732030672508517, "LL"],
[0.6152779669180425, "LL"],
[-0.14657336660125297, "LH"],
[0.6967858120189607, "LL"],
[0.07949310115714454, "HH"],
[0.4703198759258987, "HH"],
[0.4421125200498064, "HH"],
[0.5724288737143592, "LL"],
[0.8970743435692062, "LL"],
[0.18327334401918674, "LL"],
[-0.01466729201304962, "HL"],
[0.3481559372544409, "LL"],
[0.06547094736902978, "LL"],
[0.15482141569329988, "HH"],
[0.4373841193538136, "HH"],
[0.15971286468915544, "LL"],
[1.0543588860308968, "HH"],
[1.7372866900020818, "HH"],
[1.091998586053999, "LL"],
[0.1171572584252222, "HH"],
[0.08438455015300014, "LL"],
[0.06547094736902978, "LL"],
[0.15482141569329985, "HH"],
[1.1627044812890683, "HH"],
[0.06547094736902978, "LL"],
[0.795275137550483, "HH"],
[0.18562939195219, "LL"],
[0.3010757406693439, "LL"],
[2.8205795942839376, "HH"],
[0.11259190602909264, "LL"],
[-0.07116352791516614, "HL"],
[-0.09945240794119009, "LH"],
[0.18562939195219, "LL"],
[0.1832733440191868, "LL"],
[-0.39054253768447705, "HL"],
[-0.1672071289487642, "HL"],
[0.3337669247916343, "HH"],
[0.2584386102554792, "HH"],
[-0.19733845476322634, "HL"],
[-0.9379282899805409, "LH"],
[-0.028770969951095866, "LH"],
[0.051367269430983485, "LL"],
[-0.2172548045913472, "LH"],
[0.05136726943098351, "LL"],
[0.04191046803899837, "LL"],
[0.7482357030403517, "HH"],
[-0.014585767863118111, "LH"],
[0.5410013139159929, "HH"],
[1.0223932668429925, "LL"],
[1.4179402898927476, "LL"]]

View File

@@ -0,0 +1,54 @@
[
{"neighbors": [48, 26, 20, 9, 31], "id": 1, "value": 0.5},
{"neighbors": [30, 16, 46, 3, 4], "id": 2, "value": 0.7},
{"neighbors": [46, 30, 2, 12, 16], "id": 3, "value": 0.2},
{"neighbors": [18, 30, 23, 2, 52], "id": 4, "value": 0.1},
{"neighbors": [47, 40, 45, 37, 28], "id": 5, "value": 0.3},
{"neighbors": [10, 21, 41, 14, 37], "id": 6, "value": 0.05},
{"neighbors": [8, 17, 43, 25, 12], "id": 7, "value": 0.4},
{"neighbors": [17, 25, 43, 22, 7], "id": 8, "value": 0.7},
{"neighbors": [39, 34, 1, 26, 48], "id": 9, "value": 0.5},
{"neighbors": [6, 37, 5, 45, 49], "id": 10, "value": 0.04},
{"neighbors": [51, 41, 29, 21, 14], "id": 11, "value": 0.08},
{"neighbors": [44, 46, 43, 50, 3], "id": 12, "value": 0.2},
{"neighbors": [45, 23, 14, 28, 18], "id": 13, "value": 0.4},
{"neighbors": [41, 29, 13, 23, 6], "id": 14, "value": 0.2},
{"neighbors": [36, 27, 32, 33, 24], "id": 15, "value": 0.3},
{"neighbors": [19, 2, 46, 44, 28], "id": 16, "value": 0.4},
{"neighbors": [8, 25, 43, 7, 22], "id": 17, "value": 0.6},
{"neighbors": [23, 4, 29, 14, 13], "id": 18, "value": 0.3},
{"neighbors": [42, 16, 28, 26, 40], "id": 19, "value": 0.7},
{"neighbors": [1, 48, 31, 26, 42], "id": 20, "value": 0.8},
{"neighbors": [41, 6, 11, 14, 10], "id": 21, "value": 0.1},
{"neighbors": [25, 50, 43, 31, 44], "id": 22, "value": 0.4},
{"neighbors": [18, 13, 14, 4, 2], "id": 23, "value": 0.1},
{"neighbors": [33, 49, 34, 47, 27], "id": 24, "value": 0.3},
{"neighbors": [43, 8, 22, 17, 50], "id": 25, "value": 0.4},
{"neighbors": [1, 42, 20, 31, 48], "id": 26, "value": 0.6},
{"neighbors": [32, 15, 36, 33, 24], "id": 27, "value": 0.3},
{"neighbors": [40, 45, 19, 5, 13], "id": 28, "value": 0.8},
{"neighbors": [11, 51, 41, 14, 18], "id": 29, "value": 0.3},
{"neighbors": [2, 3, 4, 46, 18], "id": 30, "value": 0.1},
{"neighbors": [20, 26, 1, 50, 48], "id": 31, "value": 0.9},
{"neighbors": [27, 36, 15, 49, 24], "id": 32, "value": 0.3},
{"neighbors": [24, 27, 49, 34, 32], "id": 33, "value": 0.4},
{"neighbors": [47, 9, 39, 40, 24], "id": 34, "value": 0.3},
{"neighbors": [38, 51, 11, 21, 41], "id": 35, "value": 0.3},
{"neighbors": [15, 32, 27, 49, 33], "id": 36, "value": 0.2},
{"neighbors": [49, 10, 5, 47, 24], "id": 37, "value": 0.5},
{"neighbors": [35, 21, 51, 11, 41], "id": 38, "value": 0.4},
{"neighbors": [9, 34, 48, 1, 47], "id": 39, "value": 0.6},
{"neighbors": [28, 47, 5, 9, 34], "id": 40, "value": 0.5},
{"neighbors": [11, 14, 29, 21, 6], "id": 41, "value": 0.4},
{"neighbors": [26, 19, 1, 9, 31], "id": 42, "value": 0.2},
{"neighbors": [25, 12, 8, 22, 44], "id": 43, "value": 0.3},
{"neighbors": [12, 50, 46, 16, 43], "id": 44, "value": 0.2},
{"neighbors": [28, 13, 5, 40, 19], "id": 45, "value": 0.3},
{"neighbors": [3, 12, 44, 2, 16], "id": 46, "value": 0.2},
{"neighbors": [34, 40, 5, 49, 24], "id": 47, "value": 0.3},
{"neighbors": [1, 20, 26, 9, 39], "id": 48, "value": 0.5},
{"neighbors": [24, 37, 47, 5, 33], "id": 49, "value": 0.2},
{"neighbors": [44, 22, 31, 42, 26], "id": 50, "value": 0.6},
{"neighbors": [11, 29, 41, 14, 21], "id": 51, "value": 0.01},
{"neighbors": [4, 18, 29, 51, 23], "id": 52, "value": 0.01}
]

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,13 @@
import unittest
from mock_plpy import MockPlPy
# Single MockPlPy instance that stands in for the `plpy` module in tests.
plpy = MockPlPy()
import sys
# Register the mock in sys.modules under the name 'plpy' so that any
# `import plpy` executed after this module has been imported resolves to the
# mock instance. NOTE(review): presumably needed because the real module is
# not importable in the test environment - confirm.
sys.modules['plpy'] = plpy
import os
def fixture_file(name):
    """Return the path of fixture `name` inside the 'fixtures' directory
    that sits next to this module (symlinks in the module path resolved)."""
    module_dir, _ = os.path.split(os.path.realpath(__file__))
    return os.path.join(module_dir, 'fixtures', name)

View File

@@ -0,0 +1,54 @@
import re
class MockCursor:
    """Minimal cursor over an in-memory sequence, read in batches."""

    def __init__(self, data):
        """
        @param data: sliceable sequence of rows the cursor walks over
        """
        self.cursor_pos = 0
        self.data = data

    def fetch(self, batch_size):
        """
        Return the next `batch_size` rows (fewer, or none, once the data is
        exhausted) and advance the cursor position by `batch_size`.
        """
        start = self.cursor_pos
        stop = start + batch_size
        rows = self.data[start:stop]
        self.cursor_pos = stop
        return rows
class MockPlPy:
    """
    In-memory stand-in for the `plpy` module: records logged messages and
    answers queries from a list of (pattern, result) pairs registered with
    `_define_result`.
    """

    def __init__(self):
        self._reset()

    def _reset(self):
        """Clear every recorded message and every registered query result."""
        self.infos = []
        self.notices = []
        self.debugs = []
        self.logs = []
        self.warnings = []
        self.errors = []
        self.fatals = []
        self.executes = []
        self.results = []
        self.prepares = []

    def _define_result(self, query, result):
        """
        Register `result` as the answer for queries matching `query`.

        @param query: regular expression, compiled case-insensitively and in
          multiline mode; it is matched from the start of the query text
        @param result: value `execute` returns for a matching query
        """
        pattern = re.compile(query, re.IGNORECASE | re.MULTILINE)
        self.results.append([pattern, result])

    def notice(self, msg):
        """Record a notice-level message."""
        self.notices.append(msg)

    def debug(self, msg):
        """Record a debug-level message."""
        # stored in `debugs` (not `notices`) so each level stays separate
        self.debugs.append(msg)

    def info(self, msg):
        """Record an info-level message."""
        self.infos.append(msg)

    def cursor(self, query):
        """Return a MockCursor over the result registered for `query`."""
        data = self.execute(query)
        return MockCursor(data)

    # TODO: additional arguments
    def execute(self, query):
        """
        Return the result of the first registered pattern that matches
        `query`, or an empty list when none does.
        """
        for pattern, result in self.results:
            if pattern.match(query):
                return result
        return []

View File

@@ -0,0 +1,78 @@
import unittest
import numpy as np
from helper import fixture_file
from crankshaft.clustering import Getis
import crankshaft.pysal_utils as pu
from crankshaft import random_seeds
import json
from crankshaft.analysis_data_provider import AnalysisDataProvider
# Fixture files produced as follows
#
# import pysal as ps
# import numpy as np
# import random
#
# # setup variables
# f = ps.open(ps.examples.get_path("stl_hom.dbf"))
# y = np.array(f.by_col['HR8893'])
# w_queen = ps.queen_from_shapefile(ps.examples.get_path("stl_hom.shp"))
#
# out_queen = [{"id": index + 1,
# "neighbors": [x+1 for x in w_queen.neighbors[index]],
# "value": val} for index, val in enumerate(y)]
#
# with open('neighbors_queen_getis.json', 'w') as f:
# f.write(str(out_queen))
#
# random.seed(1234)
# np.random.seed(1234)
# lgstar_queen = ps.esda.getisord.G_Local(y, w_queen, star=True,
# permutations=999)
#
# with open('getis_queen.json', 'w') as f:
# f.write(str(zip(lgstar_queen.z_sim,
# lgstar_queen.p_sim, lgstar_queen.p_z_sim)))
class FakeDataProvider(AnalysisDataProvider):
    """Data provider stub that serves canned Getis-Ord input data."""

    def __init__(self, mock_data):
        """
        @param mock_data: rows to hand back from `get_getis`. The parent
          initializer is not called.
        """
        self.mock_result = mock_data

    def get_getis(self, w_type, param):
        """Return the canned rows; both arguments are ignored."""
        return self.mock_result
class GetisTest(unittest.TestCase):
    """Testing class for Getis-Ord's G* function

    This test replicates the work done in PySAL documentation:
    https://pysal.readthedocs.io/en/v1.11.0/users/tutorials/autocorrelation.html#local-g-and-g
    """

    def setUp(self):
        # load raw data for analysis; `with` closes the fixture file again
        with open(fixture_file('neighbors_getis.json')) as fixture:
            self.neighbors_data = json.loads(fixture.read())

        # load pre-computed/known values
        with open(fixture_file('getis.json')) as fixture:
            self.getis_data = json.loads(fixture.read())

    def test_getis_ord(self):
        """Test Getis-Ord's G*"""
        data = [{'id': d['id'],
                 'attr1': d['value'],
                 'neighbors': d['neighbors']} for d in self.neighbors_data]

        random_seeds.set_random_seeds(1234)
        getis = Getis(FakeDataProvider(data))

        result = getis.getis_ord('subquery', 'value',
                                 'queen', None, 999, 'the_geom',
                                 'cartodb_id')
        result = [(row[0], row[1]) for row in result]
        expected = np.array(self.getis_data)[:, 0:2]

        # only the first value of each pair (z-score) is compared; the
        # second one is unpacked but not asserted
        for ([res_z, res_p], [exp_z, exp_p]) in zip(result, expected):
            self.assertAlmostEqual(res_z, exp_z, delta=1e-2)

View File

@@ -0,0 +1,56 @@
import unittest
import numpy as np
# from mock_plpy import MockPlPy
# plpy = MockPlPy()
#
# import sys
# sys.modules['plpy'] = plpy
from helper import fixture_file
from crankshaft.clustering import Kmeans
from crankshaft.analysis_data_provider import AnalysisDataProvider
import crankshaft.clustering as cc
from crankshaft import random_seeds
import json
from collections import OrderedDict
class FakeDataProvider(AnalysisDataProvider):
    """Stand-in data provider that hands back canned rows for k-means tests."""

    def __init__(self, mocked_result):
        """Remember ``mocked_result`` so both getters can serve it."""
        self.mocked_result = mocked_result

    def get_spatial_kmeans(self, query):
        """Return the canned rows; ``query`` is ignored."""
        return self.mocked_result

    def get_nonspatial_kmeans(self, query, standarize):
        """Return the canned rows; ``query`` and ``standarize`` are ignored."""
        return self.mocked_result
class KMeansTest(unittest.TestCase):
    """Testing class for k-means spatial"""

    def setUp(self):
        """Load the clustering fixture and define sample parameters."""
        # `with` closes the fixture file instead of leaking the handle
        with open(fixture_file('kmeans.json')) as fixture:
            self.cluster_data = json.load(fixture)
        self.params = {"subquery": "select * from table",
                       "no_clusters": "10"}

    def test_kmeans(self):
        """Test that spatial k-means splits the fixture into two clusters
        of 20 members each."""
        data = [{'xs': d['xs'],
                 'ys': d['ys'],
                 'ids': d['ids']} for d in self.cluster_data]

        # seed first so the cluster assignment is reproducible
        random_seeds.set_random_seeds(1234)
        kmeans = Kmeans(FakeDataProvider(data))
        clusters = kmeans.spatial('subquery', 2)
        # each returned row carries its cluster label at index 1
        labels = [a[1] for a in clusters]
        c1 = [a for a in clusters if a[1] == 0]
        c2 = [a for a in clusters if a[1] == 1]

        self.assertEqual(len(np.unique(labels)), 2)
        self.assertEqual(len(c1), 20)
        self.assertEqual(len(c2), 20)

View File

@@ -0,0 +1,112 @@
import unittest
import numpy as np
from helper import fixture_file
from crankshaft.clustering import Moran
from crankshaft.analysis_data_provider import AnalysisDataProvider
import crankshaft.pysal_utils as pu
from crankshaft import random_seeds
import json
from collections import OrderedDict
class FakeDataProvider(AnalysisDataProvider):
    """Stand-in data provider that hands back canned rows for Moran tests."""

    def __init__(self, mock_data):
        """Remember ``mock_data`` so it can be served by ``get_moran``."""
        self.mock_result = mock_data

    def get_moran(self, w_type, params):
        """Return the canned rows; ``w_type`` and ``params`` are ignored."""
        return self.mock_result
class MoranTest(unittest.TestCase):
    """Testing class for Moran's I functions"""

    def setUp(self):
        """Define sample parameters and load neighbor / expected fixtures."""
        self.params = {"id_col": "cartodb_id",
                       "attr1": "andy",
                       "attr2": "jay_z",
                       "subquery": "SELECT * FROM a_list",
                       "geom_col": "the_geom",
                       "num_ngbrs": 321}
        self.params_markov = {"id_col": "cartodb_id",
                              "time_cols": ["_2013_dec", "_2014_jan",
                                            "_2014_feb"],
                              "subquery": "SELECT * FROM a_list",
                              "geom_col": "the_geom",
                              "num_ngbrs": 321}
        # `with` closes each fixture file instead of leaking the handle
        with open(fixture_file('neighbors.json')) as fixture:
            self.neighbors_data = json.load(fixture)
        with open(fixture_file('moran.json')) as fixture:
            self.moran_data = json.load(fixture)

    def test_map_quads(self):
        """Test map_quads"""
        from crankshaft.clustering import map_quads
        self.assertEqual(map_quads(1), 'HH')
        self.assertEqual(map_quads(2), 'LH')
        self.assertEqual(map_quads(3), 'LL')
        self.assertEqual(map_quads(4), 'HL')
        # anything outside 1..4 maps to None
        self.assertEqual(map_quads(33), None)
        self.assertEqual(map_quads('andy'), None)

    def test_quad_position(self):
        """Test quad_position"""
        from crankshaft.clustering import quad_position

        # builtin `int` is what the removed `np.int` alias pointed to
        quads = np.array([1, 2, 3, 4], int)

        ans = np.array(['HH', 'LH', 'LL', 'HL'])
        test_ans = quad_position(quads)

        self.assertTrue((test_ans == ans).all())

    def test_local_stat(self):
        """Test Moran's I local"""
        data = [OrderedDict([('id', d['id']),
                             ('attr1', d['value']),
                             ('neighbors', d['neighbors'])])
                for d in self.neighbors_data]

        moran = Moran(FakeDataProvider(data))
        # seed so the permutation-based results are reproducible
        random_seeds.set_random_seeds(1234)
        result = moran.local_stat('subquery', 'value',
                                  'knn', 5, 99, 'the_geom', 'cartodb_id')
        result = [(row[0], row[1]) for row in result]
        zipped_values = zip(result, self.moran_data)

        for ([res_val, res_quad], [exp_val, exp_quad]) in zipped_values:
            self.assertAlmostEqual(res_val, exp_val)
            self.assertEqual(res_quad, exp_quad)

    def test_moran_local_rate(self):
        """Test Moran's I rate"""
        # a constant denominator (attr2 = 1) is fed in so the result is
        # compared against the same fixture as the non-rate test
        data = [{'id': d['id'],
                 'attr1': d['value'],
                 'attr2': 1,
                 'neighbors': d['neighbors']} for d in self.neighbors_data]

        random_seeds.set_random_seeds(1234)
        moran = Moran(FakeDataProvider(data))
        result = moran.local_rate_stat('subquery', 'numerator', 'denominator',
                                       'knn', 5, 99, 'the_geom', 'cartodb_id')
        result = [(row[0], row[1]) for row in result]

        zipped_values = zip(result, self.moran_data)

        # only the statistic is compared here; the quadrant labels are
        # unpacked but not asserted
        for ([res_val, _res_quad], [exp_val, _exp_quad]) in zipped_values:
            self.assertAlmostEqual(res_val, exp_val)

    def test_moran(self):
        """Test Moran's I global"""
        data = [{'id': d['id'],
                 'attr1': d['value'],
                 'neighbors': d['neighbors']} for d in self.neighbors_data]
        random_seeds.set_random_seeds(1235)

        moran = Moran(FakeDataProvider(data))
        result = moran.global_stat('table', 'value',
                                   'knn', 5, 99, 'the_geom',
                                   'cartodb_id')

        result_moran = result[0][0]
        # the mean of the local fixture values serves as the expected
        # global statistic, within a loose tolerance
        expected_moran = np.array([row[0] for row in self.moran_data]).mean()
        self.assertAlmostEqual(expected_moran, result_moran, delta=10e-2)

View File

@@ -0,0 +1,160 @@
import unittest
import crankshaft.pysal_utils as pu
from crankshaft import random_seeds
from collections import OrderedDict
class PysalUtilsTest(unittest.TestCase):
    """Tests for the query-building helpers in crankshaft.pysal_utils"""

    def setUp(self):
        """Create the parameter sets shared by the tests below."""
        self.params1 = OrderedDict([
            ('id_col', 'cartodb_id'),
            ('attr1', 'andy'),
            ('attr2', 'jay_z'),
            ('subquery', 'SELECT * FROM a_list'),
            ('geom_col', 'the_geom'),
            ('num_ngbrs', 321),
        ])

        self.params2 = OrderedDict([
            ('id_col', 'cartodb_id'),
            ('numerator', 'price'),
            ('denominator', 'sq_meters'),
            ('subquery', 'SELECT * FROM pecan'),
            ('geom_col', 'the_geom'),
            ('num_ngbrs', 321),
        ])

        self.params3 = OrderedDict([
            ('id_col', 'cartodb_id'),
            ('numerator', 'sq_meters'),
            ('denominator', 'price'),
            ('subquery', 'SELECT * FROM pecan'),
            ('geom_col', 'the_geom'),
            ('num_ngbrs', 321),
        ])

        self.params_array = {
            'id_col': 'cartodb_id',
            'time_cols': ['_2013_dec', '_2014_jan', '_2014_feb'],
            'subquery': 'SELECT * FROM a_list',
            'geom_col': 'the_geom',
            'num_ngbrs': 321,
        }

    def test_query_attr_select(self):
        """Test query_attr_select"""
        cases = [
            (self.params1,
             ('i."andy"::numeric As attr1, '
              'i."jay_z"::numeric As attr2, ')),
            (self.params2,
             ('i."price"::numeric As attr1, '
              'i."sq_meters"::numeric As attr2, ')),
            (self.params3,
             ('i."sq_meters"::numeric As attr1, '
              'i."price"::numeric As attr2, ')),
            (self.params_array,
             ('i."_2013_dec"::numeric As attr1, '
              'i."_2014_jan"::numeric As attr2, '
              'i."_2014_feb"::numeric As attr3, ')),
        ]

        for params, expected in cases:
            self.assertEqual(pu.query_attr_select(params), expected)

    def test_query_attr_where(self):
        """Test pu.query_attr_where"""
        cases = [
            (self.params1,
             ('idx_replace."andy" IS NOT NULL AND '
              'idx_replace."jay_z" IS NOT NULL')),
            (self.params_array,
             ('idx_replace."_2013_dec" IS NOT NULL AND '
              'idx_replace."_2014_jan" IS NOT NULL AND '
              'idx_replace."_2014_feb" IS NOT NULL')),
        ]

        for params, expected in cases:
            self.assertEqual(pu.query_attr_where(params), expected)

    def test_knn(self):
        """Test knn neighbors constructor"""
        expected_two_attrs = (
            'SELECT i."cartodb_id" As id, '
            'i."andy"::numeric As attr1, '
            'i."jay_z"::numeric As attr2, '
            '(SELECT ARRAY(SELECT j."cartodb_id" '
            'FROM (SELECT * FROM a_list) As j '
            'WHERE '
            'i."cartodb_id" <> j."cartodb_id" AND '
            'j."andy" IS NOT NULL AND '
            'j."jay_z" IS NOT NULL '
            'ORDER BY '
            'j."the_geom" <-> i."the_geom" ASC '
            'LIMIT 321)) As neighbors '
            'FROM (SELECT * FROM a_list) As i '
            'WHERE i."andy" IS NOT NULL AND '
            'i."jay_z" IS NOT NULL '
            'ORDER BY i."cartodb_id" ASC;'
        )

        expected_time_cols = (
            'SELECT i."cartodb_id" As id, '
            'i."_2013_dec"::numeric As attr1, '
            'i."_2014_jan"::numeric As attr2, '
            'i."_2014_feb"::numeric As attr3, '
            '(SELECT ARRAY(SELECT j."cartodb_id" '
            'FROM (SELECT * FROM a_list) As j '
            'WHERE i."cartodb_id" <> j."cartodb_id" AND '
            'j."_2013_dec" IS NOT NULL AND '
            'j."_2014_jan" IS NOT NULL AND '
            'j."_2014_feb" IS NOT NULL '
            'ORDER BY j."the_geom" <-> i."the_geom" ASC '
            'LIMIT 321)) As neighbors '
            'FROM (SELECT * FROM a_list) As i '
            'WHERE i."_2013_dec" IS NOT NULL AND '
            'i."_2014_jan" IS NOT NULL AND '
            'i."_2014_feb" IS NOT NULL '
            'ORDER BY i."cartodb_id" ASC;'
        )

        self.assertEqual(pu.knn(self.params1), expected_two_attrs)
        self.assertEqual(pu.knn(self.params_array), expected_time_cols)

    def test_queen(self):
        """Test queen neighbors constructor"""
        expected_query = (
            'SELECT i."cartodb_id" As id, '
            'i."andy"::numeric As attr1, '
            'i."jay_z"::numeric As attr2, '
            '(SELECT ARRAY(SELECT j."cartodb_id" '
            'FROM (SELECT * FROM a_list) As j '
            'WHERE '
            'i."cartodb_id" <> j."cartodb_id" AND '
            'ST_Touches(i."the_geom", '
            'j."the_geom") AND '
            'j."andy" IS NOT NULL AND '
            'j."jay_z" IS NOT NULL)'
            ') As neighbors '
            'FROM (SELECT * FROM a_list) As i '
            'WHERE i."andy" IS NOT NULL AND '
            'i."jay_z" IS NOT NULL '
            'ORDER BY i."cartodb_id" ASC;'
        )

        self.assertEqual(pu.queen(self.params1), expected_query)

    def test_construct_neighbor_query(self):
        """Test construct_neighbor_query"""
        # The dispatcher must produce exactly the raw knn query
        via_dispatcher = pu.construct_neighbor_query('knn', self.params1)
        self.assertEqual(via_dispatcher, pu.knn(self.params1))

    def test_get_attributes(self):
        """Test get_attributes"""
        # placeholder: real tests still need to be written
        self.assertEqual(True, True)

    def test_get_weight(self):
        """Test get_weight"""
        # placeholder: real tests still need to be written
        self.assertEqual(True, True)

    def test_empty_zipped_array(self):
        """Test empty_zipped_array"""
        for width in (2, 4):
            self.assertEqual(pu.empty_zipped_array(width),
                             [(None,) * width])

View File

@@ -0,0 +1,64 @@
import unittest
import numpy as np
from helper import plpy, fixture_file
import crankshaft.segmentation as segmentation
import json
class SegmentationTest(unittest.TestCase):
    """Testing class for segmentation functions"""

    def setUp(self):
        """Reset the mocked plpy module before every test."""
        plpy._reset()

    def generate_random_data(self, n_samples, random_state, row_type=False):
        """Build a synthetic regression data set.

        The target is ``x1 + x2 * x2 + x3`` where x1 and x2 are uniform
        draws and x3 comes from ``randint(0, 4)``.

        :param n_samples: number of samples to draw
        :param random_state: object providing ``uniform`` and ``randint``
            (the tests pass ``np.random.RandomState`` instances)
        :param row_type: when true, return ``(rows, y)`` where each row is
            ``{'features': (x1, x2, x3)}``; otherwise return a one-element
            list holding a dict of whole columns
        """
        x1 = random_state.uniform(size=n_samples)
        x2 = random_state.uniform(size=n_samples)
        x3 = random_state.randint(0, 4, size=n_samples)
        y = x1 + x2 * x2 + x3
        cartodb_id = range(len(x1))

        if row_type:
            return [{'features': vals} for vals in zip(x1, x2, x3)], y
        else:
            return [dict(zip(['x1', 'x2', 'x3', 'target', 'cartodb_id'],
                             [x1, x2, x3, y, cartodb_id]))]

    def test_replace_nan_with_mean(self):
        # TODO: this test makes no assertion yet; it only builds an input
        # array and never calls the function it is named after.
        test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan])

    def test_create_and_predict_segment(self):
        """Train on synthetic data and check the prediction quality."""
        n_samples = 1000

        random_state_train = np.random.RandomState(13)
        random_state_test = np.random.RandomState(134)
        training_data = self.generate_random_data(n_samples,
                                                  random_state_train)
        test_data, test_y = self.generate_random_data(n_samples,
                                                      random_state_test,
                                                      row_type=True)

        ids = [{'cartodb_ids': range(len(test_data))}]
        rows = [{'x1': 0, 'x2': 0, 'x3': 0, 'y': 0, 'cartodb_id': 0}]

        # Patterns are raw strings: `\*`, `\(` and `\_` are not valid
        # string escapes, so plain literals only worked by accident.
        plpy._define_result(r'select \* from \(select \* from training\) a limit 1', rows)
        plpy._define_result(r'.*from \(select \* from training\) as a', training_data)
        plpy._define_result(r'select array_agg\(cartodb\_id order by cartodb\_id\) as cartodb_ids from \(.*\) a', ids)
        plpy._define_result(r'.*select \* from test.*', test_data)

        model_parameters = {'n_estimators': 1200,
                            'max_depth': 3,
                            'subsample': 0.5,
                            'learning_rate': 0.01,
                            'min_samples_leaf': 1}

        result = segmentation.create_and_predict_segment(
            'select * from training',
            'target',
            'select * from test',
            model_parameters)

        prediction = [r[1] for r in result]

        # root-mean-square error of the predictions against the true targets
        rmse = np.sqrt(np.mean(np.square(np.array(prediction) -
                                         np.array(test_y))))

        self.assertEqual(len(result), len(test_data))
        self.assertTrue(result[0][2] < 0.01)
        self.assertTrue(rmse < 0.5 * np.mean(test_y))

View File

@@ -0,0 +1,349 @@
import unittest
import numpy as np
import unittest
from helper import fixture_file
from crankshaft.space_time_dynamics import Markov
import crankshaft.space_time_dynamics as std
from crankshaft import random_seeds
from crankshaft.analysis_data_provider import AnalysisDataProvider
import json
class FakeDataProvider(AnalysisDataProvider):
    """Stand-in data provider that hands back canned rows for Markov tests."""

    def __init__(self, data):
        """Remember ``data`` so it can be served by ``get_markov``."""
        self.mock_result = data

    def get_markov(self, w_type, params):
        """Return the canned rows; ``w_type`` and ``params`` are ignored."""
        return self.mock_result
class SpaceTimeTests(unittest.TestCase):
    """Testing class for Markov Functions."""

    def setUp(self):
        """Define sample parameters, load fixtures and reference arrays."""
        self.params = {"id_col": "cartodb_id",
                       "time_cols": ['dec_2013', 'jan_2014', 'feb_2014'],
                       "subquery": "SELECT * FROM a_list",
                       "geom_col": "the_geom",
                       "num_ngbrs": 321}
        # `with` closes each fixture file instead of leaking the handle
        with open(fixture_file('neighbors_markov.json')) as fixture:
            self.neighbors_data = json.load(fixture)
        with open(fixture_file('markov.json')) as fixture:
            self.markov_data = json.load(fixture)

        # 10 x 10 array whose column i is filled with the value i
        self.time_data = np.array([i * np.ones(10, dtype=float)
                                   for i in range(10)]).T

        # 5 x 5 x 5 array of transition probabilities used by
        # test_get_prob_dist
        self.transition_matrix = np.array([
            [[0.96341463, 0.0304878, 0.00609756, 0., 0.],
             [0.06040268, 0.83221477, 0.10738255, 0., 0.],
             [0., 0.14, 0.74, 0.12, 0.],
             [0., 0.03571429, 0.32142857, 0.57142857, 0.07142857],
             [0., 0., 0., 0.16666667, 0.83333333]],
            [[0.79831933, 0.16806723, 0.03361345, 0., 0.],
             [0.0754717, 0.88207547, 0.04245283, 0., 0.],
             [0.00537634, 0.06989247, 0.8655914, 0.05913978, 0.],
             [0., 0., 0.06372549, 0.90196078, 0.03431373],
             [0., 0., 0., 0.19444444, 0.80555556]],
            [[0.84693878, 0.15306122, 0., 0., 0.],
             [0.08133971, 0.78947368, 0.1291866, 0., 0.],
             [0.00518135, 0.0984456, 0.79274611, 0.0984456, 0.00518135],
             [0., 0., 0.09411765, 0.87058824, 0.03529412],
             [0., 0., 0., 0.10204082, 0.89795918]],
            [[0.8852459, 0.09836066, 0., 0.01639344, 0.],
             [0.03875969, 0.81395349, 0.13953488, 0., 0.00775194],
             [0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505],
             [0., 0.02339181, 0.12865497, 0.75438596, 0.09356725],
             [0., 0., 0., 0.09661836, 0.90338164]],
            [[0.33333333, 0.66666667, 0., 0., 0.],
             [0.0483871, 0.77419355, 0.16129032, 0.01612903, 0.],
             [0.01149425, 0.16091954, 0.74712644, 0.08045977, 0.],
             [0., 0.01036269, 0.06217617, 0.89637306, 0.03108808],
             [0., 0., 0., 0.02352941, 0.97647059]]]
        )

    def test_spatial_markov(self):
        """Test Spatial Markov."""
        data = [{'id': d['id'],
                 'attr1': d['y1995'],
                 'attr2': d['y1996'],
                 'attr3': d['y1997'],
                 'attr4': d['y1998'],
                 'attr5': d['y1999'],
                 'attr6': d['y2000'],
                 'attr7': d['y2001'],
                 'attr8': d['y2002'],
                 'attr9': d['y2003'],
                 'attr10': d['y2004'],
                 'attr11': d['y2005'],
                 'attr12': d['y2006'],
                 'attr13': d['y2007'],
                 'attr14': d['y2008'],
                 'attr15': d['y2009'],
                 'neighbors': d['neighbors']} for d in self.neighbors_data]
        markov = Markov(FakeDataProvider(data))
        random_seeds.set_random_seeds(1234)

        result = markov.spatial_trend('subquery',
                                      ['y1995', 'y1996', 'y1997', 'y1998',
                                       'y1999', 'y2000', 'y2001', 'y2002',
                                       'y2003', 'y2004', 'y2005', 'y2006',
                                       'y2007', 'y2008', 'y2009'],
                                      5, 'knn', 5, 0, 'the_geom',
                                      'cartodb_id')

        self.assertTrue(result is not None)
        result = [(row[0], row[1], row[2], row[3], row[4]) for row in result]
        # (leftover debug print of result[0] removed: it cluttered test
        # output and is a syntax error on Python 3)
        expected = self.markov_data
        # only the trend column is compared against the fixture
        for ([res_trend, res_up, res_down, res_vol, res_id],
             [exp_trend, exp_up, exp_down, exp_vol, exp_id]
             ) in zip(result, expected):
            self.assertAlmostEqual(res_trend, exp_trend)

    def test_get_time_data(self):
        """Test get_time_data"""
        data = [{'attr1': d['y1995'],
                 'attr2': d['y1996'],
                 'attr3': d['y1997'],
                 'attr4': d['y1998'],
                 'attr5': d['y1999'],
                 'attr6': d['y2000'],
                 'attr7': d['y2001'],
                 'attr8': d['y2002'],
                 'attr9': d['y2003'],
                 'attr10': d['y2004'],
                 'attr11': d['y2005'],
                 'attr12': d['y2006'],
                 'attr13': d['y2007'],
                 'attr14': d['y2008'],
                 'attr15': d['y2009']} for d in self.neighbors_data]

        result = std.get_time_data(data, ['y1995', 'y1996', 'y1997', 'y1998',
                                          'y1999', 'y2000', 'y2001', 'y2002',
                                          'y2003', 'y2004', 'y2005', 'y2006',
                                          'y2007', 'y2008', 'y2009'])

        # expected was prepared from PySAL example:
        # f = ps.open(ps.examples.get_path("usjoin.csv"))
        # pci = np.array([f.by_col[str(y)]
        #                 for y in range(1995, 2010)]).transpose()
        # rpci = pci / (pci.mean(axis = 0))

        expected = np.array(
            [[0.87654416, 0.863147, 0.85637567, 0.84811668, 0.8446154,
              0.83271652, 0.83786314, 0.85012593, 0.85509656, 0.86416612,
              0.87119375, 0.86302631, 0.86148267, 0.86252252, 0.86746356],
             [0.9188951, 0.91757931, 0.92333258, 0.92517289, 0.92552388,
              0.90746978, 0.89830489, 0.89431991, 0.88924794, 0.89815176,
              0.91832091, 0.91706054, 0.90139505, 0.87897455, 0.86216858],
             [0.82591007, 0.82548596, 0.81989793, 0.81503235, 0.81731522,
              0.78964559, 0.80584442, 0.8084998, 0.82258551, 0.82668196,
              0.82373724, 0.81814804, 0.83675961, 0.83574199, 0.84647177],
             [1.09088176, 1.08537689, 1.08456418, 1.08415404, 1.09898841,
              1.14506948, 1.12151133, 1.11160697, 1.10888621, 1.11399806,
              1.12168029, 1.13164797, 1.12958508, 1.11371818, 1.09936775],
             [1.10731446, 1.11373944, 1.13283638, 1.14472559, 1.15910025,
              1.16898201, 1.17212488, 1.14752303, 1.11843284, 1.11024964,
              1.11943471, 1.11736468, 1.10863242, 1.09642516, 1.07762337],
             [1.42269757, 1.42118434, 1.44273502, 1.43577571, 1.44400684,
              1.44184737, 1.44782832, 1.41978227, 1.39092208, 1.4059372,
              1.40788646, 1.44052766, 1.45241216, 1.43306098, 1.4174431],
             [1.13073885, 1.13110513, 1.11074708, 1.13364636, 1.13088149,
              1.10888138, 1.11856629, 1.13062931, 1.11944984, 1.12446239,
              1.11671008, 1.10880034, 1.08401709, 1.06959206, 1.07875225],
             [1.04706124, 1.04516831, 1.04253372, 1.03239987, 1.02072545,
              0.99854316, 0.9880258, 0.99669587, 0.99327676, 1.01400905,
              1.03176742, 1.040511, 1.01749645, 0.9936394, 0.98279746],
             [0.98996986, 1.00143564, 0.99491, 1.00188408, 1.00455845,
              0.99127006, 0.97925917, 0.9683482, 0.95335147, 0.93694787,
              0.94308213, 0.92232874, 0.91284091, 0.89689833, 0.88928858],
             [0.87418391, 0.86416601, 0.84425695, 0.8404494, 0.83903044,
              0.8578708, 0.86036185, 0.86107306, 0.8500772, 0.86981998,
              0.86837929, 0.87204141, 0.86633032, 0.84946077, 0.83287146],
             [1.14196118, 1.14660262, 1.14892712, 1.14909594, 1.14436624,
              1.14450183, 1.12349752, 1.12596664, 1.12213996, 1.1119989,
              1.10257792, 1.10491258, 1.11059842, 1.10509795, 1.10020097],
             [0.97282463, 0.96700147, 0.96252588, 0.9653878, 0.96057687,
              0.95831051, 0.94480909, 0.94804195, 0.95430286, 0.94103989,
              0.92122519, 0.91010201, 0.89280392, 0.89298243, 0.89165385],
             [0.94325468, 0.96436902, 0.96455242, 0.95243009, 0.94117647,
              0.9480927, 0.93539182, 0.95388718, 0.94597005, 0.96918424,
              0.94781281, 0.93466815, 0.94281559, 0.96520315, 0.96715441],
             [0.97478408, 0.98169225, 0.98712809, 0.98474769, 0.98559897,
              0.98687073, 0.99237486, 0.98209969, 0.9877653, 0.97399471,
              0.96910087, 0.98416665, 0.98423613, 0.99823861, 0.99545704],
             [0.85570269, 0.85575915, 0.85986132, 0.85693406, 0.8538012,
              0.86191535, 0.84981451, 0.85472102, 0.84564835, 0.83998883,
              0.83478547, 0.82803648, 0.8198736, 0.82265395, 0.8399404],
             [0.87022047, 0.85996258, 0.85961813, 0.85689572, 0.83947136,
              0.82785597, 0.86008789, 0.86776298, 0.86720209, 0.8676334,
              0.89179317, 0.94202108, 0.9422231, 0.93902708, 0.94479184],
             [0.90134907, 0.90407738, 0.90403991, 0.90201769, 0.90399238,
              0.90906632, 0.92693339, 0.93695966, 0.94242697, 0.94338265,
              0.91981796, 0.91108804, 0.90543476, 0.91737138, 0.94793657],
             [1.1977611, 1.18222564, 1.18439158, 1.18267865, 1.19286723,
              1.20172869, 1.21328691, 1.22624778, 1.22397075, 1.23857042,
              1.24419893, 1.23929384, 1.23418676, 1.23626739, 1.26754398],
             [1.24919678, 1.25754773, 1.26991161, 1.28020651, 1.30625667,
              1.34790023, 1.34399863, 1.32575181, 1.30795492, 1.30544841,
              1.30303302, 1.32107766, 1.32936244, 1.33001241, 1.33288462],
             [1.06768004, 1.03799276, 1.03637303, 1.02768449, 1.03296093,
              1.05059016, 1.03405057, 1.02747623, 1.03162734, 0.9961416,
              0.97356208, 0.94241549, 0.92754547, 0.92549227, 0.92138102],
             [1.09475614, 1.11526796, 1.11654299, 1.13103948, 1.13143264,
              1.13889622, 1.12442212, 1.13367018, 1.13982256, 1.14029944,
              1.11979401, 1.10905389, 1.10577769, 1.11166825, 1.09985155],
             [0.76530058, 0.76612841, 0.76542451, 0.76722683, 0.76014284,
              0.74480073, 0.76098396, 0.76156903, 0.76651952, 0.76533288,
              0.78205934, 0.76842416, 0.77487118, 0.77768683, 0.78801192],
             [0.98391336, 0.98075816, 0.98295341, 0.97386015, 0.96913803,
              0.97370819, 0.96419154, 0.97209861, 0.97441313, 0.96356162,
              0.94745352, 0.93965462, 0.93069645, 0.94020973, 0.94358232],
             [0.83561828, 0.82298088, 0.81738502, 0.81748588, 0.80904801,
              0.80071489, 0.83358256, 0.83451613, 0.85175032, 0.85954307,
              0.86790024, 0.87170334, 0.87863799, 0.87497981, 0.87888675],
             [0.98845573, 1.02092428, 0.99665283, 0.99141823, 0.99386619,
              0.98733195, 0.99644997, 0.99669587, 1.02559097, 1.01116651,
              0.99988024, 0.97906749, 0.99323123, 1.00204939, 0.99602148],
             [1.14930913, 1.15241949, 1.14300962, 1.14265542, 1.13984683,
              1.08312397, 1.05192626, 1.04230892, 1.05577278, 1.08569751,
              1.12443486, 1.08891079, 1.08603695, 1.05997314, 1.02160943],
             [1.11368269, 1.1057147, 1.11893431, 1.13778669, 1.1432272,
              1.18257029, 1.16226243, 1.16009196, 1.14467789, 1.14820235,
              1.12386598, 1.12680236, 1.12357937, 1.1159258, 1.12570828],
             [1.30379431, 1.30752186, 1.31206366, 1.31532267, 1.30625667,
              1.31210239, 1.29989156, 1.29203193, 1.27183516, 1.26830786,
              1.2617743, 1.28656675, 1.29734097, 1.29390205, 1.29345446],
             [0.83953719, 0.82701448, 0.82006005, 0.81188876, 0.80294864,
              0.78772975, 0.82848011, 0.8259679, 0.82435705, 0.83108634,
              0.84373784, 0.83891093, 0.84349247, 0.85637272, 0.86539395],
             [1.23450087, 1.2426022, 1.23537935, 1.23581293, 1.24522626,
              1.2256767, 1.21126648, 1.19377804, 1.18355337, 1.19674434,
              1.21536573, 1.23653297, 1.27962009, 1.27968392, 1.25907738],
             [0.9769662, 0.97400719, 0.98035944, 0.97581531, 0.95543282,
              0.96480308, 0.94686376, 0.93679073, 0.92540049, 0.92988835,
              0.93442917, 0.92100464, 0.91475304, 0.90249622, 0.9021363],
             [0.84986886, 0.8986851, 0.84295997, 0.87280534, 0.85659368,
              0.88937573, 0.894401, 0.90448993, 0.95495898, 0.92698333,
              0.94745352, 0.92562488, 0.96635366, 1.02520312, 1.0394296],
             [1.01922808, 1.00258203, 1.00974428, 1.00303417, 0.99765073,
              1.00759019, 0.99192968, 0.99747298, 0.99550759, 0.97583768,
              0.9610168, 0.94779638, 0.93759089, 0.93353431, 0.94121705],
             [0.86367411, 0.85558932, 0.85544346, 0.85103025, 0.84336613,
              0.83434854, 0.85813595, 0.84667961, 0.84374558, 0.85951183,
              0.87194227, 0.89455097, 0.88283929, 0.90349491, 0.90600675],
             [1.00947534, 1.00411055, 1.00698819, 0.99513687, 0.99291086,
              1.00581626, 0.98850522, 0.99291168, 0.98983209, 0.97511924,
              0.96134615, 0.96382634, 0.95011401, 0.9434686, 0.94637765],
             [1.05712571, 1.05459419, 1.05753012, 1.04880786, 1.05103857,
              1.04800023, 1.03024941, 1.04200483, 1.0402554, 1.03296979,
              1.02191682, 1.02476275, 1.02347523, 1.02517684, 1.04359571],
             [1.07084189, 1.06669497, 1.07937623, 1.07387988, 1.0794043,
              1.0531801, 1.07452771, 1.09383478, 1.1052447, 1.10322136,
              1.09167939, 1.08772756, 1.08859544, 1.09177338, 1.1096083],
             [0.86719222, 0.86628896, 0.86675156, 0.86425632, 0.86511809,
              0.86287327, 0.85169796, 0.85411285, 0.84886336, 0.84517414,
              0.84843858, 0.84488343, 0.83374329, 0.82812044, 0.82878599],
             [0.88389211, 0.92288667, 0.90282398, 0.91229186, 0.92023286,
              0.92652175, 0.94278865, 0.93682452, 0.98655146, 0.992237,
              0.9798497, 0.93869677, 0.96947771, 1.00362626, 0.98102351],
             [0.97082064, 0.95320233, 0.94534081, 0.94215593, 0.93967,
              0.93092109, 0.92662519, 0.93412152, 0.93501274, 0.92879506,
              0.92110542, 0.91035556, 0.90430364, 0.89994694, 0.90073864],
             [0.95861858, 0.95774543, 0.98254811, 0.98919472, 0.98684824,
              0.98882205, 0.97662234, 0.95601578, 0.94905385, 0.94934888,
              0.97152609, 0.97163004, 0.9700702, 0.97158948, 0.95884908],
             [0.83980439, 0.84726737, 0.85747, 0.85467221, 0.8556751,
              0.84818516, 0.85265681, 0.84502402, 0.82645665, 0.81743586,
              0.83550406, 0.83338919, 0.83511679, 0.82136617, 0.80921874],
             [0.95118156, 0.9466212, 0.94688098, 0.9508583, 0.9512441,
              0.95440787, 0.96364363, 0.96804412, 0.97136214, 0.97583768,
              0.95571724, 0.96895368, 0.97001634, 0.97082733, 0.98782366],
             [1.08910044, 1.08248968, 1.08492895, 1.08656923, 1.09454249,
              1.10558188, 1.1214086, 1.12292577, 1.13021031, 1.13342735,
              1.14686068, 1.14502975, 1.14474747, 1.14084037, 1.16142926],
             [1.06336033, 1.07365823, 1.08691496, 1.09764846, 1.11669863,
              1.11856702, 1.09764283, 1.08815849, 1.08044313, 1.09278827,
              1.07003204, 1.08398066, 1.09831768, 1.09298232, 1.09176125],
             [0.79772065, 0.78829196, 0.78581151, 0.77615922, 0.77035744,
              0.77751194, 0.79902974, 0.81437881, 0.80788828, 0.79603865,
              0.78966436, 0.79949807, 0.80172182, 0.82168155, 0.85587911],
             [1.0052447, 1.00007696, 1.00475899, 1.00613942, 1.00639561,
              1.00162979, 0.99860739, 1.00814981, 1.00574316, 0.99030032,
              0.97682565, 0.97292596, 0.96519561, 0.96173403, 0.95890284],
             [0.95808419, 0.9382568, 0.9654441, 0.95561201, 0.96987289,
              0.96608031, 0.99727185, 1.00781194, 1.03484236, 1.05333619,
              1.0983263, 1.1704974, 1.17025154, 1.18730553, 1.14242645]])

        self.assertTrue(np.allclose(result, expected))
        # isinstance check instead of comparing type objects directly
        self.assertIsInstance(result, np.ndarray)
        self.assertTrue(result.shape == expected.shape)

    def test_rebin_data(self):
        """Test rebin_data"""
        # sample in double the time (even case since 10 % 2 = 0):
        #   (0+1)/2, (2+3)/2, (4+5)/2, (6+7)/2, (8+9)/2
        #   = 0.5, 2.5, 4.5, 6.5, 8.5
        ans_even = np.array([(i + 0.5) * np.ones(10, dtype=float)
                             for i in range(0, 10, 2)]).T

        self.assertTrue(
            np.array_equal(std.rebin_data(self.time_data, 2), ans_even))

        # sample in triple the time (uneven since 10 % 3 = 1):
        #   (0+1+2)/3, (3+4+5)/3, (6+7+8)/3, (9)/1
        #   = 1, 4, 7, 9
        ans_odd = np.array([i * np.ones(10, dtype=float)
                            for i in (1, 4, 7, 9)]).T
        self.assertTrue(
            np.array_equal(std.rebin_data(self.time_data, 3), ans_odd))

    def test_get_prob_dist(self):
        """Test get_prob_dist"""
        lag_indices = np.array([1, 2, 3, 4])
        unit_indices = np.array([1, 3, 2, 4])
        # row k equals transition_matrix[lag_indices[k]][unit_indices[k]]
        answer = np.array([
            [0.0754717, 0.88207547, 0.04245283, 0., 0.],
            [0., 0., 0.09411765, 0.87058824, 0.03529412],
            [0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505],
            [0., 0., 0., 0.02352941, 0.97647059]
        ])
        result = std.get_prob_dist(self.transition_matrix,
                                   lag_indices, unit_indices)

        self.assertTrue(np.array_equal(result, answer))

    def test_get_prob_stats(self):
        """Test get_prob_stats"""
        probs = np.array([
            [0.0754717, 0.88207547, 0.04245283, 0., 0.],
            [0., 0., 0.09411765, 0.87058824, 0.03529412],
            [0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505],
            [0., 0., 0., 0.02352941, 0.97647059]
        ])
        unit_indices = np.array([1, 3, 2, 4])
        answer_up = np.array([0.04245283, 0.03529412, 0.12376238, 0.])
        answer_down = np.array([0.0754717, 0.09411765, 0.0990099, 0.02352941])
        answer_trend = np.array([-0.03301887 / 0.88207547,
                                 -0.05882353 / 0.87058824,
                                 0.02475248 / 0.77722772,
                                 -0.02352941 / 0.97647059])
        answer_volatility = np.array([0.34221495, 0.33705421,
                                      0.29226542, 0.38834223])

        # result is indexed as (up, down, trend, volatility)
        result = std.get_prob_stats(probs, unit_indices)
        result_up = result[0]
        result_down = result[1]
        result_trend = result[2]
        result_volatility = result[3]

        self.assertTrue(np.allclose(result_up, answer_up))
        self.assertTrue(np.allclose(result_down, answer_down))
        self.assertTrue(np.allclose(result_trend, answer_trend))
        self.assertTrue(np.allclose(result_volatility, answer_volatility))

View File

@@ -1,5 +1,5 @@
comment = 'CartoDB Spatial Analysis extension'
default_version = '0.4.2'
default_version = '0.5.1'
requires = 'plpythonu, postgis'
superuser = true
schema = cdb_crankshaft

View File

@@ -10,9 +10,11 @@ CREATE OR REPLACE FUNCTION
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, significance NUMERIC)
AS $$
from crankshaft.clustering import moran
from crankshaft.clustering import Moran
# TODO: use named parameters or a dictionary
return moran(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
moran = Moran()
return moran.global_stat(subquery, column_name, w_type,
num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;
-- Moran's I Local (internal function)
@@ -27,9 +29,11 @@ CREATE OR REPLACE FUNCTION
id_col TEXT)
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
from crankshaft.clustering import moran_local
from crankshaft.clustering import Moran
moran = Moran()
# TODO: use named parameters or a dictionary
return moran_local(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
return moran.local_stat(subquery, column_name, w_type,
num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;
-- Moran's I Local (public-facing function)
@@ -120,9 +124,11 @@ CREATE OR REPLACE FUNCTION
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran FLOAT, significance FLOAT)
AS $$
from crankshaft.clustering import moran_local
from crankshaft.clustering import Moran
moran = Moran()
# TODO: use named parameters or a dictionary
return moran_rate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
return moran.global_rate_stat(subquery, numerator, denominator, w_type,
num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;
@@ -140,9 +146,10 @@ CREATE OR REPLACE FUNCTION
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
from crankshaft.clustering import moran_local_rate
from crankshaft.clustering import Moran
moran = Moran()
# TODO: use named parameters or a dictionary
return moran_local_rate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
return moran.local_rate_stat(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;
-- Moran's I Local Rate (public-facing function)

View File

@@ -1,21 +1,24 @@
CREATE OR REPLACE FUNCTION CDB_KMeans(query text, no_clusters integer,no_init integer default 20)
RETURNS table (cartodb_id integer, cluster_no integer) as $$
from crankshaft.clustering import kmeans
return kmeans(query,no_clusters,no_init)
-- Spatial k-means clustering
$$ language plpythonu;
-- CDB_KMeans: PL/Python wrapper around crankshaft's Kmeans class.
--   query       - text passed through to Kmeans.spatial
--   no_clusters - integer passed through to Kmeans.spatial
--   no_init     - integer passed through to Kmeans.spatial (default 20)
-- Returns a table of (cartodb_id, cluster_no).
CREATE OR REPLACE FUNCTION CDB_KMeans(query text, no_clusters integer, no_init integer default 20)
RETURNS table (cartodb_id integer, cluster_no integer) as $$
from crankshaft.clustering import Kmeans
# Kmeans is built with no arguments here; the unit tests instead pass a
# fake data provider to the constructor.
kmeans = Kmeans()
return kmeans.spatial(query, no_clusters, no_init)
$$ LANGUAGE plpythonu;
CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(state Numeric[],the_geom GEOMETRY(Point, 4326), weight NUMERIC)
RETURNS Numeric[] AS
RETURNS Numeric[] AS
$$
DECLARE
DECLARE
newX NUMERIC;
newY NUMERIC;
newW NUMERIC;
BEGIN
IF weight IS NULL OR the_geom IS NULL THEN
IF weight IS NULL OR the_geom IS NULL THEN
newX = state[1];
newY = state[2];
newW = state[3];
@@ -30,12 +33,12 @@ END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION CDB_WeightedMeanF(state Numeric[])
RETURNS GEOMETRY AS
RETURNS GEOMETRY AS
$$
BEGIN
IF state[3] = 0 THEN
IF state[3] = 0 THEN
RETURN ST_SetSRID(ST_MakePoint(state[1],state[2]), 4326);
ELSE
ELSE
RETURN ST_SETSRID(ST_MakePoint(state[1]/state[3], state[2]/state[3]),4326);
END IF;
END
@@ -56,7 +59,7 @@ BEGIN
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
INITCOND = "{0.0,0.0,0.0}"
INITCOND = "{0.0,0.0,0.0}"
);
END IF;
END

View File

@@ -22,10 +22,11 @@ CREATE OR REPLACE FUNCTION
RETURNS TABLE (trend NUMERIC, trend_up NUMERIC, trend_down NUMERIC, volatility NUMERIC, rowid INT)
AS $$
from crankshaft.space_time_dynamics import spatial_markov_trend
from crankshaft.space_time_dynamics import Markov
markov = Markov()
## TODO: use named parameters or a dictionary
return spatial_markov_trend(subquery, time_cols, num_classes, w_type, num_ngbrs, permutations, geom_col, id_col)
return markov.spatial_trend(subquery, time_cols, num_classes, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;
-- input table format: identical to above but in a predictable format

View File

@@ -11,8 +11,9 @@ CREATE OR REPLACE FUNCTION
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (z_score NUMERIC, p_value NUMERIC, p_z_sim NUMERIC, rowid BIGINT)
AS $$
from crankshaft.clustering import getis_ord
return getis_ord(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
from crankshaft.clustering import Getis
getis = Getis()
return getis.getis_ord(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;
-- TODO: make a version that accepts the values as arrays

View File

@@ -10,192 +10,45 @@
-- misses per point the function accepts before giving up.
--
-- Returns: Multipoint with the requested points
CREATE OR REPLACE FUNCTION CDB_DotDensity(g geometry(Polygon, 4326), no_points integer, max_iter integer DEFAULT 1000)
RETURNS SETOF geometry(Point, 4326)
AS $$
DECLARE
extent GEOMETRY;
eq_area_geom GEOMETRY;
test_point Geometry;
iter NUMERIC;
width NUMERIC;
height NUMERIC;
x0 NUMERIC;
y0 NUMERIC;
no_left INTEGER;
sample_points GEOMETRY[];
points GEOMETRY[];
BEGIN
eq_area_geom := ST_TRANSFORM(g, 2163);
extent := ST_Envelope(eq_area_geom);
iter := 0;
width := ST_XMax(extent) - ST_XMIN(extent);
height := ST_YMax(extent) - ST_YMIN(extent);
x0 := ST_XMin(extent);
y0 := ST_YMin(extent);
no_left := no_points;
LOOP
IF(no_left <= 0 or iter >= max_iter) THEN
RETURN;
END IF;
with random_points as(
SELECT ST_SetSRID(ST_MAKEPOINT( x0 + width*random(), y0 + height*random()), 2163) as p
FROM generate_series(1,no_left)
)
SELECT array_agg(p) from random_points
WHERE ST_WITHIN(p, eq_area_geom)
into sample_points;
RETURN QUERY select ST_TRANSFORM(a, 4326) from unnest(sample_points) as a;
IF sample_points IS NOT null THEN
no_left := no_left - array_length(sample_points, 1);
END IF;
iter = iter + 1;
END LOOP;
RETURN;
END;
$$ LANGUAGE plpgsql;
-- DEPRECATED
CREATE OR REPLACE FUNCTION cdb_dot_density(geom geometry, no_points Integer, max_iter_per_point Integer DEFAULT 1000)
RETURNS GEOMETRY
AS $$
CREATE OR REPLACE FUNCTION cdb_dot_density(geom geometry , no_points Integer, max_iter_per_point Integer DEFAULT 1000)
RETURNS GEOMETRY AS $$
DECLARE
final_points GEOMETRY;
extent GEOMETRY;
test_point Geometry;
width NUMERIC;
height NUMERIC;
x0 NUMERIC;
y0 NUMERIC;
xp NUMERIC;
yp NUMERIC;
no_left INTEGER;
remaining_iterations INTEGER;
points GEOMETRY[];
bbox_line GEOMETRY;
intersection_line GEOMETRY;
BEGIN
extent := ST_Envelope(geom);
width := ST_XMax(extent) - ST_XMIN(extent);
height := ST_YMax(extent) - ST_YMIN(extent);
x0 := ST_XMin(extent);
y0 := ST_YMin(extent);
no_left := no_points;
with new_points as(
SELECT * FROM CDB_DotDensity(geom, no_points, max_iter_per_point) as a
)
SELECT ST_Collect(a) FROM new_points
into final_points;
RETURN final_points;
LOOP
if(no_left=0) THEN
EXIT;
END IF;
yp = y0 + height*random();
bbox_line = ST_MakeLine(
ST_SetSRID(ST_MakePoint(yp, x0),4326),
ST_SetSRID(ST_MakePoint(yp, x0+width),4326)
);
intersection_line = ST_Intersection(bbox_line,geom);
test_point = ST_LineInterpolatePoint(st_makeline(st_linemerge(intersection_line)),random());
points := points || test_point;
no_left = no_left - 1 ;
END LOOP;
RETURN ST_Collect(points);
END;
$$ LANGUAGE plpgsql;
--
-- Creates N points randomly distributed in the specified secondary polygons
--
-- @param g - array of the geometries to be turned in to points
--
-- @param no_points - the number of points to generate
--
-- @params max_iter_per_point - the function generates points in the polygon's bounding box
-- and discards points which don't lie in the polygon. max_iter_per_point specifies how many
-- misses per point the function accepts before giving up.
--
-- Returns: Multipoint with the requested points
--
-- Generate a random response based on the weights given
--
-- @param array_ids an array of ids representing the category to return
--
-- @param weights an array of weights for each category
--
-- Returns : The randomly selected ID.
CREATE OR REPLACE function _cdb_SelectRandomWeights(array_ids numeric[], weights numeric[]) returns NUMERIC
as $$
DECLARE
result NUMERIC;
BEGIN
WITH idw as (
select unnest(array_ids) as id, unnest(weights) as percent
),
CTE AS (
SELECT random() * (SELECT SUM(percent) FROM idw) R
)
SELECT *
FROM (
SELECT id, SUM(percent) OVER (ORDER BY id) S, R
FROM idw as percent CROSS JOIN CTE
) Q
WHERE S >= R
ORDER BY id
LIMIT 1
into result;
return result;
END
$$ LANGUAGE plpgsql;
--
-- Weighted Dot Density
--
-- @param no_points the number of points to generate
--
-- @param geoms the target geometries to place the points in
--
-- @param weights the weight for each of the target polygons
--
-- RETURNS set of points
CREATE OR REPLACE FUNCTION _cdb_WeightedDD(no_points numeric, geoms geometry[], weights numeric[])
RETURNS SETOF geometry
AS $$
DECLARE
i NUMERIC;
ids NUMERIC[];
perGeom NUMERIC[];
selected_poly NUMERIC;
BEGIN
with idseries as (
select generate_series(1,array_upper(geoms,1)) as id
)
select array_agg(id) from idseries into ids;
FOR i in 1..no_points
LOOP
select cdb_crankshaft._cdb_SelectRandomWeights(ids, weights) INTO selected_poly;
perGeom[selected_poly] = coalesce(perGeom[selected_poly] + 1, 0 );
END LOOP;
raise notice 'pergeom %', perGeom;
FOR i in 1..array_length(ids,1)
LOOP
return QUERY
select cdb_crankshaft.CDB_DotDensity(geoms[i], coalesce(perGeom[i],0)::INTEGER);
END LOOP;
END
$$
LANGUAGE plpgsql;
--
-- Dasymetric Dot Density
--
-- @param geom: the geometry that has the
--
-- @param no_points: the total number of points to create
--
-- @param targetGeoms: the geometry that has the
--
-- @param weights: targetGeom weights
--
-- RETURNS setof points
CREATE OR REPLACE FUNCTION CDB_DasymetricDotDensity(geom GEOMETRY, no_points NUMERIC, targetGeoms GEOMETRY[], weights numeric [])
RETURNS setof GEOMETRY
AS $$
BEGIN
RAISE NOTICE 'running Dasymetric';
RETURN QUERY
SELECT cdb_crankshaft._CDB_WeightedDD(no_points, array_agg( ST_INTERSECTION(geom,g)), array_agg(ST_AREA(ST_INTERSECTION(geom,g))*w)::NUMERIC[])
FROM unnest(targetGeoms) as g , unnest(weights) as w
WHERE geom && g;
END
$$
LANGUAGE plpgsql;
LANGUAGE plpgsql VOLATILE;

View File

@@ -0,0 +1,16 @@
WITH g AS (
SELECT ST_Buffer(ST_SetSRID(ST_MakePoint(0,0),4326)::geometry, 1000)::geometry AS g
),
points AS(
SELECT (
ST_Dump(
cdb_crankshaft.cdb_dot_density(g.g, 100)
)
).geom AS p FROM g
)
SELECT count(*), sum(CASE WHEN ST_Contains(g,p) THEN 1 ELSE 0 END) FROM points, g
count | sum
-------+-----
100 | 100
(1 row)

View File

@@ -0,0 +1,12 @@
WITH g AS (
SELECT ST_Buffer(ST_SetSRID(ST_MakePoint(0,0),4326)::geometry, 1000)::geometry AS g
),
points AS(
SELECT (
ST_Dump(
cdb_crankshaft.cdb_dot_density(g.g, 100)
)
).geom AS p FROM g
)
SELECT count(*), sum(CASE WHEN ST_Contains(g,p) THEN 1 ELSE 0 END) FROM points, g

View File

@@ -3,3 +3,4 @@ import crankshaft.random_seeds
import crankshaft.clustering
import crankshaft.space_time_dynamics
import crankshaft.segmentation
import analysis_data_provider

View File

@@ -0,0 +1,67 @@
"""class for fetching data"""
import plpy
import pysal_utils as pu
class AnalysisDataProvider:
    """Run the database queries that feed the crankshaft analyses.

    Every method executes its query through ``plpy.execute`` and reports a
    ``plpy.SPIError`` through ``plpy.error``. Analysis classes receive an
    instance of this class, so tests can substitute a subclass that returns
    canned rows instead of touching the database.
    """

    def get_getis(self, w_type, params):
        """fetch data for getis ord's g

        w_type and params are handed to ``pu.construct_neighbor_query`` to
        build the query. Returns the query result, or
        ``pu.empty_zipped_array(4)`` when the result has no rows.
        """
        try:
            query = pu.construct_neighbor_query(w_type, params)
            result = plpy.execute(query)
            # if there are no neighbors, exit
            if len(result) == 0:
                return pu.empty_zipped_array(4)
            else:
                return result
        except plpy.SPIError as err:
            plpy.error('Analysis failed: %s' % err)

    def get_markov(self, w_type, params):
        """fetch data for spatial markov

        w_type and params are handed to ``pu.construct_neighbor_query`` to
        build the query. Returns the query result, or
        ``pu.empty_zipped_array(4)`` when the result has no rows.
        """
        try:
            query = pu.construct_neighbor_query(w_type, params)
            data = plpy.execute(query)

            if len(data) == 0:
                return pu.empty_zipped_array(4)

            return data
        except plpy.SPIError as err:
            plpy.error('Analysis failed: %s' % err)

    def get_moran(self, w_type, params):
        """fetch data for moran's i analyses

        w_type and params are handed to ``pu.construct_neighbor_query`` to
        build the query. Returns the query result, or
        ``pu.empty_zipped_array(2)`` when the result has no rows.
        """
        try:
            query = pu.construct_neighbor_query(w_type, params)
            data = plpy.execute(query)

            # if there are no neighbors, exit
            if len(data) == 0:
                return pu.empty_zipped_array(2)
            return data
        except plpy.SPIError as err:
            # the message must use the bound exception name (``err``);
            # formatting it with an undefined name raised NameError here
            plpy.error('Analysis failed: %s' % err)
            # NOTE(review): plpy.error presumably raises, making this
            # return a fallback only -- confirm against PL/Python docs
            return pu.empty_zipped_array(2)

    def get_nonspatial_kmeans(self, query):
        """fetch data for non-spatial kmeans

        query is executed as given; the result is returned unchanged.
        """
        try:
            data = plpy.execute(query)
            return data
        except plpy.SPIError as err:
            plpy.error('Analysis failed: %s' % err)

    def get_spatial_kmeans(self, params):
        """fetch data for spatial kmeans

        params must provide the ``id_col``, ``geom_col`` and ``subquery``
        keys that are interpolated into the query text. The query
        aggregates ids, x and y coordinates into arrays ordered by
        ``id_col`` and skips rows whose geometry is NULL.
        """
        query = ("SELECT "
                 "array_agg({id_col} ORDER BY {id_col}) as ids,"
                 "array_agg(ST_X({geom_col}) ORDER BY {id_col}) As xs,"
                 "array_agg(ST_Y({geom_col}) ORDER BY {id_col}) As ys "
                 "FROM ({subquery}) As a "
                 "WHERE {geom_col} IS NOT NULL").format(**params)
        try:
            data = plpy.execute(query)
            return data
        except plpy.SPIError as err:
            plpy.error('Analysis failed: %s' % err)

View File

@@ -3,50 +3,48 @@ Getis-Ord's G geostatistics (hotspot/coldspot analysis)
"""
import pysal as ps
import plpy
from collections import OrderedDict
# crankshaft module
# crankshaft modules
import crankshaft.pysal_utils as pu
from crankshaft.analysis_data_provider import AnalysisDataProvider
# High level interface ---------------------------------------
def getis_ord(subquery, attr,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Getis-Ord's G*
Implementation building neighbors with a PostGIS database and PySAL's
Getis-Ord's G* hotspot/coldspot module.
Andy Eschbacher
"""
class Getis:
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()
else:
self.data_provider = data_provider
# geometries with attributes that are null are ignored
# resulting in a collection of not as near neighbors if kNN is chosen
def getis_ord(self, subquery, attr,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Getis-Ord's G*
Implementation building neighbors with a PostGIS database and PySAL's
Getis-Ord's G* hotspot/coldspot module.
Andy Eschbacher
"""
qvals = OrderedDict([("id_col", id_col),
("attr1", attr),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
# geometries with attributes that are null are ignored
# resulting in a collection of not as near neighbors if kNN is chosen
query = pu.construct_neighbor_query(w_type, qvals)
qvals = OrderedDict([("id_col", id_col),
("attr1", attr),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
try:
result = plpy.execute(query)
# if there are no neighbors, exit
if len(result) == 0:
return pu.empty_zipped_array(4)
except plpy.SPIError, err:
plpy.error('Query failed: %s' % err)
result = self.data_provider.get_getis(w_type, qvals)
attr_vals = pu.get_attributes(result)
attr_vals = pu.get_attributes(result)
# build PySAL weight object
weight = pu.get_weight(result, w_type, num_ngbrs)
# build PySAL weight object
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate Getis-Ord's G* z- and p-values
getis = ps.esda.getisord.G_Local(attr_vals, weight,
star=True, permutations=permutations)
# calculate Getis-Ord's G* z- and p-values
getis = ps.esda.getisord.G_Local(attr_vals, weight,
star=True, permutations=permutations)
return zip(getis.z_sim, getis.p_sim, getis.p_z_sim, weight.id_order)
return zip(getis.z_sim, getis.p_sim, getis.p_z_sim, weight.id_order)

View File

@@ -1,18 +1,32 @@
from sklearn.cluster import KMeans
import plpy
import numpy as np
def kmeans(query, no_clusters, no_init=20):
data = plpy.execute('''select array_agg(cartodb_id order by cartodb_id) as ids,
array_agg(ST_X(the_geom) order by cartodb_id) xs,
array_agg(ST_Y(the_geom) order by cartodb_id) ys from ({query}) a
where the_geom is not null
'''.format(query=query))
from crankshaft.analysis_data_provider import AnalysisDataProvider
xs = data[0]['xs']
ys = data[0]['ys']
ids = data[0]['ids']
km = KMeans(n_clusters= no_clusters, n_init=no_init)
labels = km.fit_predict(zip(xs,ys))
return zip(ids,labels)
class Kmeans:
    """K-means clustering that reads its input through a data provider."""

    def __init__(self, data_provider=None):
        # use the default provider unless the caller injects one
        self.data_provider = (AnalysisDataProvider()
                              if data_provider is None
                              else data_provider)

    def spatial(self, query, no_clusters, no_init=20):
        """
        find centers based on clusters of latitude/longitude pairs
        query: SQL query that has a WGS84 geometry (the_geom)

        no_clusters and no_init are passed to KMeans as n_clusters and
        n_init. Returns the ids zipped with their cluster labels.
        """
        response = self.data_provider.get_spatial_kmeans(
            {"subquery": query,
             "geom_col": "the_geom",
             "id_col": "cartodb_id"})

        # the coordinate and id arrays are read from the first row
        first_row = response[0]
        xs, ys, ids = first_row['xs'], first_row['ys'], first_row['ids']

        model = KMeans(n_clusters=no_clusters, n_init=no_init)
        cluster_labels = model.fit_predict(zip(xs, ys))
        return zip(ids, cluster_labels)

View File

@@ -6,8 +6,8 @@ Moran's I geostatistics (global clustering & outliers presence)
# average of the their neighborhood
import pysal as ps
import plpy
from collections import OrderedDict
from crankshaft.analysis_data_provider import AnalysisDataProvider
# crankshaft module
import crankshaft.pysal_utils as pu
@@ -15,204 +15,162 @@ import crankshaft.pysal_utils as pu
# High level interface ---------------------------------------
def moran(subquery, attr_name,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I (global)
Implementation building neighbors with a PostGIS database and Moran's I
core clusters with PySAL.
Andy Eschbacher
"""
qvals = OrderedDict([("id_col", id_col),
("attr1", attr_name),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
class Moran:
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()
else:
self.data_provider = data_provider
query = pu.construct_neighbor_query(w_type, qvals)
def global_stat(self, subquery, attr_name,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I (global)
Implementation building neighbors with a PostGIS database and Moran's I
core clusters with PySAL.
Andy Eschbacher
"""
params = OrderedDict([("id_col", id_col),
("attr1", attr_name),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
try:
result = plpy.execute(query)
# if there are no neighbors, exit
if len(result) == 0:
return pu.empty_zipped_array(2)
except plpy.SPIError, e:
plpy.error('Analysis failed: %s' % e)
return pu.empty_zipped_array(2)
result = self.data_provider.get_moran(w_type, params)
# collect attributes
attr_vals = pu.get_attributes(result)
# collect attributes
attr_vals = pu.get_attributes(result)
# calculate weights
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate weights
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate moran global
moran_global = ps.esda.moran.Moran(attr_vals, weight,
permutations=permutations)
# calculate moran global
moran_global = ps.esda.moran.Moran(attr_vals, weight,
permutations=permutations)
return zip([moran_global.I], [moran_global.EI])
return zip([moran_global.I], [moran_global.EI])
def local_stat(self, subquery, attr,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I implementation for PL/Python
Andy Eschbacher
"""
def moran_local(subquery, attr,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I implementation for PL/Python
Andy Eschbacher
"""
# geometries with attributes that are null are ignored
# resulting in a collection of not as near neighbors
# geometries with attributes that are null are ignored
# resulting in a collection of not as near neighbors
params = OrderedDict([("id_col", id_col),
("attr1", attr),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
qvals = OrderedDict([("id_col", id_col),
("attr1", attr),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
query = pu.construct_neighbor_query(w_type, qvals)
attr_vals = pu.get_attributes(result)
weight = pu.get_weight(result, w_type, num_ngbrs)
try:
result = plpy.execute(query)
# if there are no neighbors, exit
if len(result) == 0:
return pu.empty_zipped_array(5)
except plpy.SPIError, e:
plpy.error('Analysis failed: %s' % e)
return pu.empty_zipped_array(5)
attr_vals = pu.get_attributes(result)
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local(attr_vals, weight,
permutations=permutations)
# find quadrants for each geometry
quads = quad_position(lisa.q)
return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)
def moran_rate(subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I Rate (global)
Andy Eschbacher
"""
qvals = OrderedDict([("id_col", id_col),
("attr1", numerator),
("attr2", denominator)
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
query = pu.construct_neighbor_query(w_type, qvals)
try:
result = plpy.execute(query)
# if there are no neighbors, exit
if len(result) == 0:
return pu.empty_zipped_array(2)
except plpy.SPIError, e:
plpy.error('Analysis failed: %s' % e)
return pu.empty_zipped_array(2)
# collect attributes
numer = pu.get_attributes(result, 1)
denom = pu.get_attributes(result, 2)
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate moran global rate
lisa_rate = ps.esda.moran.Moran_Rate(numer, denom, weight,
# calculate LISA values
lisa = ps.esda.moran.Moran_Local(attr_vals, weight,
permutations=permutations)
return zip([lisa_rate.I], [lisa_rate.EI])
# find quadrants for each geometry
quads = quad_position(lisa.q)
return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)
def moran_local_rate(subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I Local Rate
def global_rate_stat(self, subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I Rate (global)
Andy Eschbacher
"""
# geometries with values that are null are ignored
# resulting in a collection of not as near neighbors
"""
params = OrderedDict([("id_col", id_col),
("attr1", numerator),
("attr2", denominator),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
qvals = OrderedDict([("id_col", id_col),
("numerator", numerator),
("denominator", denominator),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
query = pu.construct_neighbor_query(w_type, qvals)
# collect attributes
numer = pu.get_attributes(result, 1)
denom = pu.get_attributes(result, 2)
try:
result = plpy.execute(query)
# if there are no neighbors, exit
if len(result) == 0:
return pu.empty_zipped_array(5)
except plpy.SPIError, e:
plpy.error('Analysis failed: %s' % e)
return pu.empty_zipped_array(5)
weight = pu.get_weight(result, w_type, num_ngbrs)
# collect attributes
numer = pu.get_attributes(result, 1)
denom = pu.get_attributes(result, 2)
# calculate moran global rate
lisa_rate = ps.esda.moran.Moran_Rate(numer, denom, weight,
permutations=permutations)
weight = pu.get_weight(result, w_type, num_ngbrs)
return zip([lisa_rate.I], [lisa_rate.EI])
# calculate LISA values
lisa = ps.esda.moran.Moran_Local_Rate(numer, denom, weight,
permutations=permutations)
def local_rate_stat(self, subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I Local Rate
Andy Eschbacher
"""
# geometries with values that are null are ignored
# resulting in a collection of not as near neighbors
# find quadrants for each geometry
quads = quad_position(lisa.q)
params = OrderedDict([("id_col", id_col),
("numerator", numerator),
("denominator", denominator),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)
result = self.data_provider.get_moran(w_type, params)
# collect attributes
numer = pu.get_attributes(result, 1)
denom = pu.get_attributes(result, 2)
def moran_local_bv(subquery, attr1, attr2,
permutations, geom_col, id_col, w_type, num_ngbrs):
"""
Moran's I (local) Bivariate (untested)
"""
weight = pu.get_weight(result, w_type, num_ngbrs)
qvals = OrderedDict([("id_col", id_col),
("attr1", attr1),
("attr2", attr2),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
# calculate LISA values
lisa = ps.esda.moran.Moran_Local_Rate(numer, denom, weight,
permutations=permutations)
query = pu.construct_neighbor_query(w_type, qvals)
# find quadrants for each geometry
quads = quad_position(lisa.q)
try:
result = plpy.execute(query)
# if there are no neighbors, exit
if len(result) == 0:
return pu.empty_zipped_array(4)
except plpy.SPIError:
plpy.error("Error: areas of interest query failed, "
"check input parameters")
return pu.empty_zipped_array(4)
return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)
# collect attributes
attr1_vals = pu.get_attributes(result, 1)
attr2_vals = pu.get_attributes(result, 2)
def local_bivariate_stat(self, subquery, attr1, attr2,
permutations, geom_col, id_col,
w_type, num_ngbrs):
"""
Moran's I (local) Bivariate (untested)
"""
# create weights
weight = pu.get_weight(result, w_type, num_ngbrs)
params = OrderedDict([("id_col", id_col),
("attr1", attr1),
("attr2", attr2),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
# calculate LISA values
lisa = ps.esda.moran.Moran_Local_BV(attr1_vals, attr2_vals, weight,
permutations=permutations)
result = self.data_provider.get_moran(w_type, params)
# find clustering of significance
lisa_sig = quad_position(lisa.q)
# collect attributes
attr1_vals = pu.get_attributes(result, 1)
attr2_vals = pu.get_attributes(result, 2)
return zip(lisa.Is, lisa_sig, lisa.p_sim, weight.id_order)
# create weights
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local_BV(attr1_vals, attr2_vals, weight,
permutations=permutations)
# find clustering of significance
lisa_sig = quad_position(lisa.q)
return zip(lisa.Is, lisa_sig, lisa.p_sim, weight.id_order)
# Low level functions ----------------------------------------

View File

@@ -2,101 +2,97 @@
Spatial dynamics measurements using Spatial Markov
"""
# TODO: remove all plpy dependencies
import numpy as np
import pysal as ps
import plpy
import crankshaft.pysal_utils as pu
from crankshaft.analysis_data_provider import AnalysisDataProvider
def spatial_markov_trend(subquery, time_cols, num_classes=7,
w_type='knn', num_ngbrs=5, permutations=0,
geom_col='the_geom', id_col='cartodb_id'):
"""
Predict the trends of a unit based on:
1. history of its transitions to different classes (e.g., 1st quantile
-> 2nd quantile)
2. average class of its neighbors
class Markov:
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()
else:
self.data_provider = data_provider
Inputs:
@param subquery string: e.g., SELECT the_geom, cartodb_id,
interesting_time_column FROM table_name
@param time_cols list of strings: list of strings of column names
@param num_classes (optional): number of classes to break distribution
of values into. Currently uses quantile bins.
@param w_type string (optional): weight type ('knn' or 'queen')
@param num_ngbrs int (optional): number of neighbors (if knn type)
@param permutations int (optional): number of permutations for test
stats
@param geom_col string (optional): name of column which contains the
geometries
@param id_col string (optional): name of column which has the ids of
the table
def spatial_trend(self, subquery, time_cols, num_classes=7,
w_type='knn', num_ngbrs=5, permutations=0,
geom_col='the_geom', id_col='cartodb_id'):
"""
Predict the trends of a unit based on:
1. history of its transitions to different classes (e.g., 1st
quantile -> 2nd quantile)
2. average class of its neighbors
Outputs:
@param trend_up float: probablity that a geom will move to a higher
class
@param trend_down float: probablity that a geom will move to a lower
class
@param trend float: (trend_up - trend_down) / trend_static
@param volatility float: a measure of the volatility based on
probability stddev(prob array)
"""
Inputs:
@param subquery string: e.g., SELECT the_geom, cartodb_id,
interesting_time_column FROM table_name
@param time_cols list of strings: list of strings of column names
@param num_classes (optional): number of classes to break
distribution of values into. Currently uses quantile bins.
@param w_type string (optional): weight type ('knn' or 'queen')
@param num_ngbrs int (optional): number of neighbors (if knn type)
@param permutations int (optional): number of permutations for test
stats
@param geom_col string (optional): name of column which contains
the geometries
@param id_col string (optional): name of column which has the ids
of the table
if len(time_cols) < 2:
plpy.error('More than one time column needs to be passed')
Outputs:
@param trend_up float: probablity that a geom will move to a higher
class
@param trend_down float: probablity that a geom will move to a
lower class
@param trend float: (trend_up - trend_down) / trend_static
@param volatility float: a measure of the volatility based on
probability stddev(prob array)
"""
qvals = {"id_col": id_col,
"time_cols": time_cols,
"geom_col": geom_col,
"subquery": subquery,
"num_ngbrs": num_ngbrs}
if len(time_cols) < 2:
plpy.error('More than one time column needs to be passed')
try:
query_result = plpy.execute(
pu.construct_neighbor_query(w_type, qvals)
)
if len(query_result) == 0:
return zip([None], [None], [None], [None], [None])
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
return zip([None], [None], [None], [None], [None])
params = {"id_col": id_col,
"time_cols": time_cols,
"geom_col": geom_col,
"subquery": subquery,
"num_ngbrs": num_ngbrs}
# build weight
weights = pu.get_weight(query_result, w_type)
weights.transform = 'r'
query_result = self.data_provider.get_markov(w_type, params)
# prep time data
t_data = get_time_data(query_result, time_cols)
# build weight
weights = pu.get_weight(query_result, w_type)
weights.transform = 'r'
plpy.debug('shape of t_data %d, %d' % t_data.shape)
plpy.debug('number of weight objects: %d, %d' % (weights.sparse).shape)
plpy.debug('first num elements: %f' % t_data[0, 0])
# prep time data
t_data = get_time_data(query_result, time_cols)
sp_markov_result = ps.Spatial_Markov(t_data,
weights,
k=num_classes,
fixed=False,
permutations=permutations)
sp_markov_result = ps.Spatial_Markov(t_data,
weights,
k=num_classes,
fixed=False,
permutations=permutations)
# get lag classes
lag_classes = ps.Quantiles(
ps.lag_spatial(weights, t_data[:, -1]),
k=num_classes).yb
# get lag classes
lag_classes = ps.Quantiles(
ps.lag_spatial(weights, t_data[:, -1]),
k=num_classes).yb
# look up probablity distribution for each unit according to class and lag
# class
prob_dist = get_prob_dist(sp_markov_result.P,
lag_classes,
sp_markov_result.classes[:, -1])
# look up probablity distribution for each unit according to class and
# lag class
prob_dist = get_prob_dist(sp_markov_result.P,
lag_classes,
sp_markov_result.classes[:, -1])
# find the ups and down and overall distribution of each cell
trend_up, trend_down, trend, volatility = get_prob_stats(
prob_dist,
sp_markov_result.classes[:, -1])
# find the ups and down and overall distribution of each cell
trend_up, trend_down, trend, volatility = get_prob_stats(prob_dist, sp_markov_result.classes[:, -1])
# output the results
return zip(trend, trend_up, trend_down, volatility, weights.id_order)
# output the results
return zip(trend, trend_up, trend_down, volatility, weights.id_order)
def get_time_data(markov_data, time_cols):
@@ -187,8 +183,8 @@ def get_prob_stats(prob_dist, unit_indices):
trend_up[i] = prob_dist[i, (unit_indices[i]+1):].sum()
trend_down[i] = prob_dist[i, :unit_indices[i]].sum()
if prob_dist[i, unit_indices[i]] > 0.0:
trend[i] = ((trend_up[i] - trend_down[i]) /
(prob_dist[i, unit_indices[i]]))
trend[i] = (trend_up[i] - trend_down[i]) / (
prob_dist[i, unit_indices[i]])
else:
trend[i] = None

View File

@@ -1,12 +1,13 @@
import re
class MockCursor:
def __init__(self, data):
self.cursor_pos = 0
self.data = data
def fetch(self, batch_size):
batch = self.data[self.cursor_pos : self.cursor_pos + batch_size]
batch = self.data[self.cursor_pos:self.cursor_pos + batch_size]
self.cursor_pos += batch_size
return batch
@@ -45,8 +46,9 @@ class MockPlPy:
data = self.execute(query)
return MockCursor(data)
def execute(self, query): # TODO: additional arguments
for result in self.results:
if result[0].match(query):
return result[1]
return []
# TODO: additional arguments
def execute(self, query):
for result in self.results:
if result[0].match(query):
return result[1]
return []

View File

@@ -1,38 +0,0 @@
import unittest
import numpy as np
# from mock_plpy import MockPlPy
# plpy = MockPlPy()
#
# import sys
# sys.modules['plpy'] = plpy
from helper import plpy, fixture_file
import numpy as np
import crankshaft.clustering as cc
import crankshaft.pysal_utils as pu
from crankshaft import random_seeds
import json
class KMeansTest(unittest.TestCase):
"""Testing class for Moran's I functions"""
def setUp(self):
plpy._reset()
self.cluster_data = json.loads(open(fixture_file('kmeans.json')).read())
self.params = {"subquery": "select * from table",
"no_clusters": "10"
}
def test_kmeans(self):
data = self.cluster_data
plpy._define_result('select' ,data)
clusters = cc.kmeans('subquery', 2)
labels = [a[1] for a in clusters]
c1 = [a for a in clusters if a[1]==0]
c2 = [a for a in clusters if a[1]==1]
self.assertEqual(len(np.unique(labels)),2)
self.assertEqual(len(c1),20)
self.assertEqual(len(c2),20)

View File

@@ -1,18 +1,13 @@
import unittest
import numpy as np
from helper import fixture_file
# from mock_plpy import MockPlPy
# plpy = MockPlPy()
#
# import sys
# sys.modules['plpy'] = plpy
from helper import plpy, fixture_file
import crankshaft.clustering as cc
from crankshaft.clustering import Getis
import crankshaft.pysal_utils as pu
from crankshaft import random_seeds
import json
from crankshaft.analysis_data_provider import AnalysisDataProvider
# Fixture files produced as follows
#
@@ -42,6 +37,14 @@ import json
# lgstar_queen.p_sim, lgstar_queen.p_z_sim)))
class FakeDataProvider(AnalysisDataProvider):
def __init__(self, mock_data):
self.mock_result = mock_data
def get_getis(self, w_type, param):
return self.mock_result
class GetisTest(unittest.TestCase):
"""Testing class for Getis-Ord's G* funtion
This test replicates the work done in PySAL documentation:
@@ -49,8 +52,6 @@ class GetisTest(unittest.TestCase):
"""
def setUp(self):
plpy._reset()
# load raw data for analysis
self.neighbors_data = json.loads(
open(fixture_file('neighbors_getis.json')).read())
@@ -64,10 +65,13 @@ class GetisTest(unittest.TestCase):
data = [{'id': d['id'],
'attr1': d['value'],
'neighbors': d['neighbors']} for d in self.neighbors_data]
plpy._define_result('select', data)
random_seeds.set_random_seeds(1234)
result = cc.getis_ord('subquery', 'value',
'queen', None, 999, 'the_geom', 'cartodb_id')
getis = Getis(FakeDataProvider(data))
result = getis.getis_ord('subquery', 'value',
'queen', None, 999, 'the_geom',
'cartodb_id')
result = [(row[0], row[1]) for row in result]
expected = np.array(self.getis_data)[:, 0:2]
for ([res_z, res_p], [exp_z, exp_p]) in zip(result, expected):

View File

@@ -0,0 +1,56 @@
import unittest
import numpy as np
# from mock_plpy import MockPlPy
# plpy = MockPlPy()
#
# import sys
# sys.modules['plpy'] = plpy
from helper import fixture_file
from crankshaft.clustering import Kmeans
from crankshaft.analysis_data_provider import AnalysisDataProvider
import crankshaft.clustering as cc
from crankshaft import random_seeds
import json
from collections import OrderedDict
class FakeDataProvider(AnalysisDataProvider):
    """Data provider stub that returns a canned result for k-means tests."""

    def __init__(self, mocked_result):
        # rows handed back by every overridden fetch method
        self.mocked_result = mocked_result

    def get_spatial_kmeans(self, query):
        """Return the canned result; the argument is ignored."""
        return self.mocked_result

    def get_nonspatial_kmeans(self, query, standarize=None):
        """Return the canned result; both arguments are ignored.

        ``standarize`` defaults to None so the stub also accepts the
        single-argument call that the base class method takes.
        """
        return self.mocked_result
class KMeansTest(unittest.TestCase):
    """Testing class for k-means spatial"""

    def setUp(self):
        # read the fixture inside a context manager so the file handle is
        # closed instead of being left to the garbage collector
        with open(fixture_file('kmeans.json')) as fixture:
            self.cluster_data = json.loads(fixture.read())
        self.params = {"subquery": "select * from table",
                       "no_clusters": "10"}

    def test_kmeans(self):
        """Spatial k-means on the fixture yields two clusters of 20 rows."""
        data = [{'xs': d['xs'],
                 'ys': d['ys'],
                 'ids': d['ids']} for d in self.cluster_data]

        # seed the generators so the cluster assignment is repeatable
        random_seeds.set_random_seeds(1234)
        kmeans = Kmeans(FakeDataProvider(data))
        clusters = kmeans.spatial('subquery', 2)
        labels = [a[1] for a in clusters]
        c1 = [a for a in clusters if a[1] == 0]
        c2 = [a for a in clusters if a[1] == 1]

        self.assertEqual(len(np.unique(labels)), 2)
        self.assertEqual(len(c1), 20)
        self.assertEqual(len(c2), 20)

View File

@@ -1,25 +1,27 @@
import unittest
import numpy as np
# from mock_plpy import MockPlPy
# plpy = MockPlPy()
#
# import sys
# sys.modules['plpy'] = plpy
from helper import plpy, fixture_file
import crankshaft.clustering as cc
from helper import fixture_file
from crankshaft.clustering import Moran
from crankshaft.analysis_data_provider import AnalysisDataProvider
import crankshaft.pysal_utils as pu
from crankshaft import random_seeds
import json
from collections import OrderedDict
class FakeDataProvider(AnalysisDataProvider):
    """Data-provider stub that serves a canned result to Moran's I code.

    No query is executed: ``get_moran`` ignores its arguments and returns
    the object given to the constructor.
    """

    def __init__(self, mock_data):
        """Keep ``mock_data`` as the result every lookup will return."""
        self.mock_result = mock_data

    def get_moran(self, w_type, params):
        """Return the stored result; ``w_type`` and ``params`` are ignored."""
        return self.mock_result
class MoranTest(unittest.TestCase):
"""Testing class for Moran's I functions"""
def setUp(self):
plpy._reset()
self.params = {"id_col": "cartodb_id",
"attr1": "andy",
"attr2": "jay_z",
@@ -39,33 +41,36 @@ class MoranTest(unittest.TestCase):
def test_map_quads(self):
"""Test map_quads"""
self.assertEqual(cc.map_quads(1), 'HH')
self.assertEqual(cc.map_quads(2), 'LH')
self.assertEqual(cc.map_quads(3), 'LL')
self.assertEqual(cc.map_quads(4), 'HL')
self.assertEqual(cc.map_quads(33), None)
self.assertEqual(cc.map_quads('andy'), None)
from crankshaft.clustering import map_quads
self.assertEqual(map_quads(1), 'HH')
self.assertEqual(map_quads(2), 'LH')
self.assertEqual(map_quads(3), 'LL')
self.assertEqual(map_quads(4), 'HL')
self.assertEqual(map_quads(33), None)
self.assertEqual(map_quads('andy'), None)
def test_quad_position(self):
"""Test lisa_sig_vals"""
from crankshaft.clustering import quad_position
quads = np.array([1, 2, 3, 4], np.int)
ans = np.array(['HH', 'LH', 'LL', 'HL'])
test_ans = cc.quad_position(quads)
test_ans = quad_position(quads)
self.assertTrue((test_ans == ans).all())
def test_moran_local(self):
def test_local_stat(self):
"""Test Moran's I local"""
data = [{'id': d['id'],
'attr1': d['value'],
'neighbors': d['neighbors']} for d in self.neighbors_data]
data = [OrderedDict([('id', d['id']),
('attr1', d['value']),
('neighbors', d['neighbors'])])
for d in self.neighbors_data]
plpy._define_result('select', data)
moran = Moran(FakeDataProvider(data))
random_seeds.set_random_seeds(1234)
result = cc.moran_local('subquery', 'value',
'knn', 5, 99, 'the_geom', 'cartodb_id')
result = moran.local_stat('subquery', 'value',
'knn', 5, 99, 'the_geom', 'cartodb_id')
result = [(row[0], row[1]) for row in result]
zipped_values = zip(result, self.moran_data)
@@ -80,10 +85,10 @@ class MoranTest(unittest.TestCase):
'attr2': 1,
'neighbors': d['neighbors']} for d in self.neighbors_data]
plpy._define_result('select', data)
random_seeds.set_random_seeds(1234)
result = cc.moran_local_rate('subquery', 'numerator', 'denominator',
'knn', 5, 99, 'the_geom', 'cartodb_id')
moran = Moran(FakeDataProvider(data))
result = moran.local_rate_stat('subquery', 'numerator', 'denominator',
'knn', 5, 99, 'the_geom', 'cartodb_id')
result = [(row[0], row[1]) for row in result]
zipped_values = zip(result, self.moran_data)
@@ -96,10 +101,11 @@ class MoranTest(unittest.TestCase):
data = [{'id': d['id'],
'attr1': d['value'],
'neighbors': d['neighbors']} for d in self.neighbors_data]
plpy._define_result('select', data)
random_seeds.set_random_seeds(1235)
result = cc.moran('table', 'value',
'knn', 5, 99, 'the_geom', 'cartodb_id')
moran = Moran(FakeDataProvider(data))
result = moran.global_stat('table', 'value',
'knn', 5, 99, 'the_geom',
'cartodb_id')
result_moran = result[0][0]
expected_moran = np.array([row[0] for row in self.moran_data]).mean()

View File

@@ -4,86 +4,99 @@ import numpy as np
import unittest
# from mock_plpy import MockPlPy
# plpy = MockPlPy()
#
# import sys
# sys.modules['plpy'] = plpy
from helper import plpy, fixture_file
from helper import fixture_file
from crankshaft.space_time_dynamics import Markov
import crankshaft.space_time_dynamics as std
from crankshaft import random_seeds
from crankshaft.analysis_data_provider import AnalysisDataProvider
import json
class FakeDataProvider(AnalysisDataProvider):
    """Data-provider stub that serves a canned result to Markov code.

    No query is executed: ``get_markov`` ignores its arguments and returns
    the object given to the constructor.
    """

    def __init__(self, data):
        """Keep ``data`` as the result every lookup will return."""
        self.mock_result = data

    def get_markov(self, w_type, params):
        """Return the stored result; ``w_type`` and ``params`` are ignored."""
        return self.mock_result
class SpaceTimeTests(unittest.TestCase):
"""Testing class for Markov Functions."""
def setUp(self):
plpy._reset()
self.params = {"id_col": "cartodb_id",
"time_cols": ['dec_2013', 'jan_2014', 'feb_2014'],
"subquery": "SELECT * FROM a_list",
"geom_col": "the_geom",
"num_ngbrs": 321}
self.neighbors_data = json.loads(open(fixture_file('neighbors_markov.json')).read())
self.neighbors_data = json.loads(
open(fixture_file('neighbors_markov.json')).read())
self.markov_data = json.loads(open(fixture_file('markov.json')).read())
self.time_data = np.array([i * np.ones(10, dtype=float) for i in range(10)]).T
self.time_data = np.array([i * np.ones(10, dtype=float)
for i in range(10)]).T
self.transition_matrix = np.array([
[[ 0.96341463, 0.0304878 , 0.00609756, 0. , 0. ],
[ 0.06040268, 0.83221477, 0.10738255, 0. , 0. ],
[ 0. , 0.14 , 0.74 , 0.12 , 0. ],
[ 0. , 0.03571429, 0.32142857, 0.57142857, 0.07142857],
[ 0. , 0. , 0. , 0.16666667, 0.83333333]],
[[ 0.79831933, 0.16806723, 0.03361345, 0. , 0. ],
[ 0.0754717 , 0.88207547, 0.04245283, 0. , 0. ],
[ 0.00537634, 0.06989247, 0.8655914 , 0.05913978, 0. ],
[ 0. , 0. , 0.06372549, 0.90196078, 0.03431373],
[ 0. , 0. , 0. , 0.19444444, 0.80555556]],
[[ 0.84693878, 0.15306122, 0. , 0. , 0. ],
[ 0.08133971, 0.78947368, 0.1291866 , 0. , 0. ],
[ 0.00518135, 0.0984456 , 0.79274611, 0.0984456 , 0.00518135],
[ 0. , 0. , 0.09411765, 0.87058824, 0.03529412],
[ 0. , 0. , 0. , 0.10204082, 0.89795918]],
[[ 0.8852459 , 0.09836066, 0. , 0.01639344, 0. ],
[ 0.03875969, 0.81395349, 0.13953488, 0. , 0.00775194],
[ 0.0049505 , 0.09405941, 0.77722772, 0.11881188, 0.0049505 ],
[ 0. , 0.02339181, 0.12865497, 0.75438596, 0.09356725],
[ 0. , 0. , 0. , 0.09661836, 0.90338164]],
[[ 0.33333333, 0.66666667, 0. , 0. , 0. ],
[ 0.0483871 , 0.77419355, 0.16129032, 0.01612903, 0. ],
[ 0.01149425, 0.16091954, 0.74712644, 0.08045977, 0. ],
[ 0. , 0.01036269, 0.06217617, 0.89637306, 0.03108808],
[ 0. , 0. , 0. , 0.02352941, 0.97647059]]]
[[0.96341463, 0.0304878, 0.00609756, 0., 0.],
[0.06040268, 0.83221477, 0.10738255, 0., 0.],
[0., 0.14, 0.74, 0.12, 0.],
[0., 0.03571429, 0.32142857, 0.57142857, 0.07142857],
[0., 0., 0., 0.16666667, 0.83333333]],
[[0.79831933, 0.16806723, 0.03361345, 0., 0.],
[0.0754717, 0.88207547, 0.04245283, 0., 0.],
[0.00537634, 0.06989247, 0.8655914, 0.05913978, 0.],
[0., 0., 0.06372549, 0.90196078, 0.03431373],
[0., 0., 0., 0.19444444, 0.80555556]],
[[0.84693878, 0.15306122, 0., 0., 0.],
[0.08133971, 0.78947368, 0.1291866, 0., 0.],
[0.00518135, 0.0984456, 0.79274611, 0.0984456, 0.00518135],
[0., 0., 0.09411765, 0.87058824, 0.03529412],
[0., 0., 0., 0.10204082, 0.89795918]],
[[0.8852459, 0.09836066, 0., 0.01639344, 0.],
[0.03875969, 0.81395349, 0.13953488, 0., 0.00775194],
[0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505],
[0., 0.02339181, 0.12865497, 0.75438596, 0.09356725],
[0., 0., 0., 0.09661836, 0.90338164]],
[[0.33333333, 0.66666667, 0., 0., 0.],
[0.0483871, 0.77419355, 0.16129032, 0.01612903, 0.],
[0.01149425, 0.16091954, 0.74712644, 0.08045977, 0.],
[0., 0.01036269, 0.06217617, 0.89637306, 0.03108808],
[0., 0., 0., 0.02352941, 0.97647059]]]
)
def test_spatial_markov(self):
"""Test Spatial Markov."""
data = [ { 'id': d['id'],
'attr1': d['y1995'],
'attr2': d['y1996'],
'attr3': d['y1997'],
'attr4': d['y1998'],
'attr5': d['y1999'],
'attr6': d['y2000'],
'attr7': d['y2001'],
'attr8': d['y2002'],
'attr9': d['y2003'],
'attr10': d['y2004'],
'attr11': d['y2005'],
'attr12': d['y2006'],
'attr13': d['y2007'],
'attr14': d['y2008'],
'attr15': d['y2009'],
'neighbors': d['neighbors'] } for d in self.neighbors_data]
print(str(data[0]))
plpy._define_result('select', data)
data = [{'id': d['id'],
'attr1': d['y1995'],
'attr2': d['y1996'],
'attr3': d['y1997'],
'attr4': d['y1998'],
'attr5': d['y1999'],
'attr6': d['y2000'],
'attr7': d['y2001'],
'attr8': d['y2002'],
'attr9': d['y2003'],
'attr10': d['y2004'],
'attr11': d['y2005'],
'attr12': d['y2006'],
'attr13': d['y2007'],
'attr14': d['y2008'],
'attr15': d['y2009'],
'neighbors': d['neighbors']} for d in self.neighbors_data]
# print(str(data[0]))
markov = Markov(FakeDataProvider(data))
random_seeds.set_random_seeds(1234)
result = std.spatial_markov_trend('subquery', ['y1995', 'y1996', 'y1997', 'y1998', 'y1999', 'y2000', 'y2001', 'y2002', 'y2003', 'y2004', 'y2005', 'y2006', 'y2007', 'y2008', 'y2009'], 5, 'knn', 5, 0, 'the_geom', 'cartodb_id')
result = markov.spatial_trend('subquery',
['y1995', 'y1996', 'y1997', 'y1998',
'y1999', 'y2000', 'y2001', 'y2002',
'y2003', 'y2004', 'y2005', 'y2006',
'y2007', 'y2008', 'y2009'],
5, 'knn', 5, 0, 'the_geom',
'cartodb_id')
self.assertTrue(result != None)
self.assertTrue(result is not None)
result = [(row[0], row[1], row[2], row[3], row[4]) for row in result]
print result[0]
expected = self.markov_data
@@ -94,173 +107,178 @@ class SpaceTimeTests(unittest.TestCase):
def test_get_time_data(self):
"""Test get_time_data"""
data = [ { 'attr1': d['y1995'],
'attr2': d['y1996'],
'attr3': d['y1997'],
'attr4': d['y1998'],
'attr5': d['y1999'],
'attr6': d['y2000'],
'attr7': d['y2001'],
'attr8': d['y2002'],
'attr9': d['y2003'],
'attr10': d['y2004'],
'attr11': d['y2005'],
'attr12': d['y2006'],
'attr13': d['y2007'],
'attr14': d['y2008'],
'attr15': d['y2009'] } for d in self.neighbors_data]
data = [{'attr1': d['y1995'],
'attr2': d['y1996'],
'attr3': d['y1997'],
'attr4': d['y1998'],
'attr5': d['y1999'],
'attr6': d['y2000'],
'attr7': d['y2001'],
'attr8': d['y2002'],
'attr9': d['y2003'],
'attr10': d['y2004'],
'attr11': d['y2005'],
'attr12': d['y2006'],
'attr13': d['y2007'],
'attr14': d['y2008'],
'attr15': d['y2009']} for d in self.neighbors_data]
result = std.get_time_data(data, ['y1995', 'y1996', 'y1997', 'y1998', 'y1999', 'y2000', 'y2001', 'y2002', 'y2003', 'y2004', 'y2005', 'y2006', 'y2007', 'y2008', 'y2009'])
result = std.get_time_data(data, ['y1995', 'y1996', 'y1997', 'y1998',
'y1999', 'y2000', 'y2001', 'y2002',
'y2003', 'y2004', 'y2005', 'y2006',
'y2007', 'y2008', 'y2009'])
## expected was prepared from PySAL example:
### f = ps.open(ps.examples.get_path("usjoin.csv"))
### pci = np.array([f.by_col[str(y)] for y in range(1995, 2010)]).transpose()
### rpci = pci / (pci.mean(axis = 0))
# expected was prepared from PySAL example:
# f = ps.open(ps.examples.get_path("usjoin.csv"))
# pci = np.array([f.by_col[str(y)]
# for y in range(1995, 2010)]).transpose()
# rpci = pci / (pci.mean(axis = 0))
expected = np.array([[ 0.87654416, 0.863147, 0.85637567, 0.84811668, 0.8446154, 0.83271652
, 0.83786314, 0.85012593, 0.85509656, 0.86416612, 0.87119375, 0.86302631
, 0.86148267, 0.86252252, 0.86746356],
[ 0.9188951, 0.91757931, 0.92333258, 0.92517289, 0.92552388, 0.90746978
, 0.89830489, 0.89431991, 0.88924794, 0.89815176, 0.91832091, 0.91706054
, 0.90139505, 0.87897455, 0.86216858],
[ 0.82591007, 0.82548596, 0.81989793, 0.81503235, 0.81731522, 0.78964559
, 0.80584442, 0.8084998, 0.82258551, 0.82668196, 0.82373724, 0.81814804
, 0.83675961, 0.83574199, 0.84647177],
[ 1.09088176, 1.08537689, 1.08456418, 1.08415404, 1.09898841, 1.14506948
, 1.12151133, 1.11160697, 1.10888621, 1.11399806, 1.12168029, 1.13164797
, 1.12958508, 1.11371818, 1.09936775],
[ 1.10731446, 1.11373944, 1.13283638, 1.14472559, 1.15910025, 1.16898201
, 1.17212488, 1.14752303, 1.11843284, 1.11024964, 1.11943471, 1.11736468
, 1.10863242, 1.09642516, 1.07762337],
[ 1.42269757, 1.42118434, 1.44273502, 1.43577571, 1.44400684, 1.44184737
, 1.44782832, 1.41978227, 1.39092208, 1.4059372, 1.40788646, 1.44052766
, 1.45241216, 1.43306098, 1.4174431 ],
[ 1.13073885, 1.13110513, 1.11074708, 1.13364636, 1.13088149, 1.10888138
, 1.11856629, 1.13062931, 1.11944984, 1.12446239, 1.11671008, 1.10880034
, 1.08401709, 1.06959206, 1.07875225],
[ 1.04706124, 1.04516831, 1.04253372, 1.03239987, 1.02072545, 0.99854316
, 0.9880258, 0.99669587, 0.99327676, 1.01400905, 1.03176742, 1.040511
, 1.01749645, 0.9936394, 0.98279746],
[ 0.98996986, 1.00143564, 0.99491, 1.00188408, 1.00455845, 0.99127006
, 0.97925917, 0.9683482, 0.95335147, 0.93694787, 0.94308213, 0.92232874
, 0.91284091, 0.89689833, 0.88928858],
[ 0.87418391, 0.86416601, 0.84425695, 0.8404494, 0.83903044, 0.8578708
, 0.86036185, 0.86107306, 0.8500772, 0.86981998, 0.86837929, 0.87204141
, 0.86633032, 0.84946077, 0.83287146],
[ 1.14196118, 1.14660262, 1.14892712, 1.14909594, 1.14436624, 1.14450183
, 1.12349752, 1.12596664, 1.12213996, 1.1119989, 1.10257792, 1.10491258
, 1.11059842, 1.10509795, 1.10020097],
[ 0.97282463, 0.96700147, 0.96252588, 0.9653878, 0.96057687, 0.95831051
, 0.94480909, 0.94804195, 0.95430286, 0.94103989, 0.92122519, 0.91010201
, 0.89280392, 0.89298243, 0.89165385],
[ 0.94325468, 0.96436902, 0.96455242, 0.95243009, 0.94117647, 0.9480927
, 0.93539182, 0.95388718, 0.94597005, 0.96918424, 0.94781281, 0.93466815
, 0.94281559, 0.96520315, 0.96715441],
[ 0.97478408, 0.98169225, 0.98712809, 0.98474769, 0.98559897, 0.98687073
, 0.99237486, 0.98209969, 0.9877653, 0.97399471, 0.96910087, 0.98416665
, 0.98423613, 0.99823861, 0.99545704],
[ 0.85570269, 0.85575915, 0.85986132, 0.85693406, 0.8538012, 0.86191535
, 0.84981451, 0.85472102, 0.84564835, 0.83998883, 0.83478547, 0.82803648
, 0.8198736, 0.82265395, 0.8399404 ],
[ 0.87022047, 0.85996258, 0.85961813, 0.85689572, 0.83947136, 0.82785597
, 0.86008789, 0.86776298, 0.86720209, 0.8676334, 0.89179317, 0.94202108
, 0.9422231, 0.93902708, 0.94479184],
[ 0.90134907, 0.90407738, 0.90403991, 0.90201769, 0.90399238, 0.90906632
, 0.92693339, 0.93695966, 0.94242697, 0.94338265, 0.91981796, 0.91108804
, 0.90543476, 0.91737138, 0.94793657],
[ 1.1977611, 1.18222564, 1.18439158, 1.18267865, 1.19286723, 1.20172869
, 1.21328691, 1.22624778, 1.22397075, 1.23857042, 1.24419893, 1.23929384
, 1.23418676, 1.23626739, 1.26754398],
[ 1.24919678, 1.25754773, 1.26991161, 1.28020651, 1.30625667, 1.34790023
, 1.34399863, 1.32575181, 1.30795492, 1.30544841, 1.30303302, 1.32107766
, 1.32936244, 1.33001241, 1.33288462],
[ 1.06768004, 1.03799276, 1.03637303, 1.02768449, 1.03296093, 1.05059016
, 1.03405057, 1.02747623, 1.03162734, 0.9961416, 0.97356208, 0.94241549
, 0.92754547, 0.92549227, 0.92138102],
[ 1.09475614, 1.11526796, 1.11654299, 1.13103948, 1.13143264, 1.13889622
, 1.12442212, 1.13367018, 1.13982256, 1.14029944, 1.11979401, 1.10905389
, 1.10577769, 1.11166825, 1.09985155],
[ 0.76530058, 0.76612841, 0.76542451, 0.76722683, 0.76014284, 0.74480073
, 0.76098396, 0.76156903, 0.76651952, 0.76533288, 0.78205934, 0.76842416
, 0.77487118, 0.77768683, 0.78801192],
[ 0.98391336, 0.98075816, 0.98295341, 0.97386015, 0.96913803, 0.97370819
, 0.96419154, 0.97209861, 0.97441313, 0.96356162, 0.94745352, 0.93965462
, 0.93069645, 0.94020973, 0.94358232],
[ 0.83561828, 0.82298088, 0.81738502, 0.81748588, 0.80904801, 0.80071489
, 0.83358256, 0.83451613, 0.85175032, 0.85954307, 0.86790024, 0.87170334
, 0.87863799, 0.87497981, 0.87888675],
[ 0.98845573, 1.02092428, 0.99665283, 0.99141823, 0.99386619, 0.98733195
, 0.99644997, 0.99669587, 1.02559097, 1.01116651, 0.99988024, 0.97906749
, 0.99323123, 1.00204939, 0.99602148],
[ 1.14930913, 1.15241949, 1.14300962, 1.14265542, 1.13984683, 1.08312397
, 1.05192626, 1.04230892, 1.05577278, 1.08569751, 1.12443486, 1.08891079
, 1.08603695, 1.05997314, 1.02160943],
[ 1.11368269, 1.1057147, 1.11893431, 1.13778669, 1.1432272, 1.18257029
, 1.16226243, 1.16009196, 1.14467789, 1.14820235, 1.12386598, 1.12680236
, 1.12357937, 1.1159258, 1.12570828],
[ 1.30379431, 1.30752186, 1.31206366, 1.31532267, 1.30625667, 1.31210239
, 1.29989156, 1.29203193, 1.27183516, 1.26830786, 1.2617743, 1.28656675
, 1.29734097, 1.29390205, 1.29345446],
[ 0.83953719, 0.82701448, 0.82006005, 0.81188876, 0.80294864, 0.78772975
, 0.82848011, 0.8259679, 0.82435705, 0.83108634, 0.84373784, 0.83891093
, 0.84349247, 0.85637272, 0.86539395],
[ 1.23450087, 1.2426022, 1.23537935, 1.23581293, 1.24522626, 1.2256767
, 1.21126648, 1.19377804, 1.18355337, 1.19674434, 1.21536573, 1.23653297
, 1.27962009, 1.27968392, 1.25907738],
[ 0.9769662, 0.97400719, 0.98035944, 0.97581531, 0.95543282, 0.96480308
, 0.94686376, 0.93679073, 0.92540049, 0.92988835, 0.93442917, 0.92100464
, 0.91475304, 0.90249622, 0.9021363 ],
[ 0.84986886, 0.8986851, 0.84295997, 0.87280534, 0.85659368, 0.88937573
, 0.894401, 0.90448993, 0.95495898, 0.92698333, 0.94745352, 0.92562488
, 0.96635366, 1.02520312, 1.0394296 ],
[ 1.01922808, 1.00258203, 1.00974428, 1.00303417, 0.99765073, 1.00759019
, 0.99192968, 0.99747298, 0.99550759, 0.97583768, 0.9610168, 0.94779638
, 0.93759089, 0.93353431, 0.94121705],
[ 0.86367411, 0.85558932, 0.85544346, 0.85103025, 0.84336613, 0.83434854
, 0.85813595, 0.84667961, 0.84374558, 0.85951183, 0.87194227, 0.89455097
, 0.88283929, 0.90349491, 0.90600675],
[ 1.00947534, 1.00411055, 1.00698819, 0.99513687, 0.99291086, 1.00581626
, 0.98850522, 0.99291168, 0.98983209, 0.97511924, 0.96134615, 0.96382634
, 0.95011401, 0.9434686, 0.94637765],
[ 1.05712571, 1.05459419, 1.05753012, 1.04880786, 1.05103857, 1.04800023
, 1.03024941, 1.04200483, 1.0402554, 1.03296979, 1.02191682, 1.02476275
, 1.02347523, 1.02517684, 1.04359571],
[ 1.07084189, 1.06669497, 1.07937623, 1.07387988, 1.0794043, 1.0531801
, 1.07452771, 1.09383478, 1.1052447, 1.10322136, 1.09167939, 1.08772756
, 1.08859544, 1.09177338, 1.1096083 ],
[ 0.86719222, 0.86628896, 0.86675156, 0.86425632, 0.86511809, 0.86287327
, 0.85169796, 0.85411285, 0.84886336, 0.84517414, 0.84843858, 0.84488343
, 0.83374329, 0.82812044, 0.82878599],
[ 0.88389211, 0.92288667, 0.90282398, 0.91229186, 0.92023286, 0.92652175
, 0.94278865, 0.93682452, 0.98655146, 0.992237, 0.9798497, 0.93869677
, 0.96947771, 1.00362626, 0.98102351],
[ 0.97082064, 0.95320233, 0.94534081, 0.94215593, 0.93967, 0.93092109
, 0.92662519, 0.93412152, 0.93501274, 0.92879506, 0.92110542, 0.91035556
, 0.90430364, 0.89994694, 0.90073864],
[ 0.95861858, 0.95774543, 0.98254811, 0.98919472, 0.98684824, 0.98882205
, 0.97662234, 0.95601578, 0.94905385, 0.94934888, 0.97152609, 0.97163004
, 0.9700702, 0.97158948, 0.95884908],
[ 0.83980439, 0.84726737, 0.85747, 0.85467221, 0.8556751, 0.84818516
, 0.85265681, 0.84502402, 0.82645665, 0.81743586, 0.83550406, 0.83338919
, 0.83511679, 0.82136617, 0.80921874],
[ 0.95118156, 0.9466212, 0.94688098, 0.9508583, 0.9512441, 0.95440787
, 0.96364363, 0.96804412, 0.97136214, 0.97583768, 0.95571724, 0.96895368
, 0.97001634, 0.97082733, 0.98782366],
[ 1.08910044, 1.08248968, 1.08492895, 1.08656923, 1.09454249, 1.10558188
, 1.1214086, 1.12292577, 1.13021031, 1.13342735, 1.14686068, 1.14502975
, 1.14474747, 1.14084037, 1.16142926],
[ 1.06336033, 1.07365823, 1.08691496, 1.09764846, 1.11669863, 1.11856702
, 1.09764283, 1.08815849, 1.08044313, 1.09278827, 1.07003204, 1.08398066
, 1.09831768, 1.09298232, 1.09176125],
[ 0.79772065, 0.78829196, 0.78581151, 0.77615922, 0.77035744, 0.77751194
, 0.79902974, 0.81437881, 0.80788828, 0.79603865, 0.78966436, 0.79949807
, 0.80172182, 0.82168155, 0.85587911],
[ 1.0052447, 1.00007696, 1.00475899, 1.00613942, 1.00639561, 1.00162979
, 0.99860739, 1.00814981, 1.00574316, 0.99030032, 0.97682565, 0.97292596
, 0.96519561, 0.96173403, 0.95890284],
[ 0.95808419, 0.9382568, 0.9654441, 0.95561201, 0.96987289, 0.96608031
, 0.99727185, 1.00781194, 1.03484236, 1.05333619, 1.0983263, 1.1704974
, 1.17025154, 1.18730553, 1.14242645]])
expected = np.array(
[[0.87654416, 0.863147, 0.85637567, 0.84811668, 0.8446154,
0.83271652, 0.83786314, 0.85012593, 0.85509656, 0.86416612,
0.87119375, 0.86302631, 0.86148267, 0.86252252, 0.86746356],
[0.9188951, 0.91757931, 0.92333258, 0.92517289, 0.92552388,
0.90746978, 0.89830489, 0.89431991, 0.88924794, 0.89815176,
0.91832091, 0.91706054, 0.90139505, 0.87897455, 0.86216858],
[0.82591007, 0.82548596, 0.81989793, 0.81503235, 0.81731522,
0.78964559, 0.80584442, 0.8084998, 0.82258551, 0.82668196,
0.82373724, 0.81814804, 0.83675961, 0.83574199, 0.84647177],
[1.09088176, 1.08537689, 1.08456418, 1.08415404, 1.09898841,
1.14506948, 1.12151133, 1.11160697, 1.10888621, 1.11399806,
1.12168029, 1.13164797, 1.12958508, 1.11371818, 1.09936775],
[1.10731446, 1.11373944, 1.13283638, 1.14472559, 1.15910025,
1.16898201, 1.17212488, 1.14752303, 1.11843284, 1.11024964,
1.11943471, 1.11736468, 1.10863242, 1.09642516, 1.07762337],
[1.42269757, 1.42118434, 1.44273502, 1.43577571, 1.44400684,
1.44184737, 1.44782832, 1.41978227, 1.39092208, 1.4059372,
1.40788646, 1.44052766, 1.45241216, 1.43306098, 1.4174431],
[1.13073885, 1.13110513, 1.11074708, 1.13364636, 1.13088149,
1.10888138, 1.11856629, 1.13062931, 1.11944984, 1.12446239,
1.11671008, 1.10880034, 1.08401709, 1.06959206, 1.07875225],
[1.04706124, 1.04516831, 1.04253372, 1.03239987, 1.02072545,
0.99854316, 0.9880258, 0.99669587, 0.99327676, 1.01400905,
1.03176742, 1.040511, 1.01749645, 0.9936394, 0.98279746],
[0.98996986, 1.00143564, 0.99491, 1.00188408, 1.00455845,
0.99127006, 0.97925917, 0.9683482, 0.95335147, 0.93694787,
0.94308213, 0.92232874, 0.91284091, 0.89689833, 0.88928858],
[0.87418391, 0.86416601, 0.84425695, 0.8404494, 0.83903044,
0.8578708, 0.86036185, 0.86107306, 0.8500772, 0.86981998,
0.86837929, 0.87204141, 0.86633032, 0.84946077, 0.83287146],
[1.14196118, 1.14660262, 1.14892712, 1.14909594, 1.14436624,
1.14450183, 1.12349752, 1.12596664, 1.12213996, 1.1119989,
1.10257792, 1.10491258, 1.11059842, 1.10509795, 1.10020097],
[0.97282463, 0.96700147, 0.96252588, 0.9653878, 0.96057687,
0.95831051, 0.94480909, 0.94804195, 0.95430286, 0.94103989,
0.92122519, 0.91010201, 0.89280392, 0.89298243, 0.89165385],
[0.94325468, 0.96436902, 0.96455242, 0.95243009, 0.94117647,
0.9480927, 0.93539182, 0.95388718, 0.94597005, 0.96918424,
0.94781281, 0.93466815, 0.94281559, 0.96520315, 0.96715441],
[0.97478408, 0.98169225, 0.98712809, 0.98474769, 0.98559897,
0.98687073, 0.99237486, 0.98209969, 0.9877653, 0.97399471,
0.96910087, 0.98416665, 0.98423613, 0.99823861, 0.99545704],
[0.85570269, 0.85575915, 0.85986132, 0.85693406, 0.8538012,
0.86191535, 0.84981451, 0.85472102, 0.84564835, 0.83998883,
0.83478547, 0.82803648, 0.8198736, 0.82265395, 0.8399404],
[0.87022047, 0.85996258, 0.85961813, 0.85689572, 0.83947136,
0.82785597, 0.86008789, 0.86776298, 0.86720209, 0.8676334,
0.89179317, 0.94202108, 0.9422231, 0.93902708, 0.94479184],
[0.90134907, 0.90407738, 0.90403991, 0.90201769, 0.90399238,
0.90906632, 0.92693339, 0.93695966, 0.94242697, 0.94338265,
0.91981796, 0.91108804, 0.90543476, 0.91737138, 0.94793657],
[1.1977611, 1.18222564, 1.18439158, 1.18267865, 1.19286723,
1.20172869, 1.21328691, 1.22624778, 1.22397075, 1.23857042,
1.24419893, 1.23929384, 1.23418676, 1.23626739, 1.26754398],
[1.24919678, 1.25754773, 1.26991161, 1.28020651, 1.30625667,
1.34790023, 1.34399863, 1.32575181, 1.30795492, 1.30544841,
1.30303302, 1.32107766, 1.32936244, 1.33001241, 1.33288462],
[1.06768004, 1.03799276, 1.03637303, 1.02768449, 1.03296093,
1.05059016, 1.03405057, 1.02747623, 1.03162734, 0.9961416,
0.97356208, 0.94241549, 0.92754547, 0.92549227, 0.92138102],
[1.09475614, 1.11526796, 1.11654299, 1.13103948, 1.13143264,
1.13889622, 1.12442212, 1.13367018, 1.13982256, 1.14029944,
1.11979401, 1.10905389, 1.10577769, 1.11166825, 1.09985155],
[0.76530058, 0.76612841, 0.76542451, 0.76722683, 0.76014284,
0.74480073, 0.76098396, 0.76156903, 0.76651952, 0.76533288,
0.78205934, 0.76842416, 0.77487118, 0.77768683, 0.78801192],
[0.98391336, 0.98075816, 0.98295341, 0.97386015, 0.96913803,
0.97370819, 0.96419154, 0.97209861, 0.97441313, 0.96356162,
0.94745352, 0.93965462, 0.93069645, 0.94020973, 0.94358232],
[0.83561828, 0.82298088, 0.81738502, 0.81748588, 0.80904801,
0.80071489, 0.83358256, 0.83451613, 0.85175032, 0.85954307,
0.86790024, 0.87170334, 0.87863799, 0.87497981, 0.87888675],
[0.98845573, 1.02092428, 0.99665283, 0.99141823, 0.99386619,
0.98733195, 0.99644997, 0.99669587, 1.02559097, 1.01116651,
0.99988024, 0.97906749, 0.99323123, 1.00204939, 0.99602148],
[1.14930913, 1.15241949, 1.14300962, 1.14265542, 1.13984683,
1.08312397, 1.05192626, 1.04230892, 1.05577278, 1.08569751,
1.12443486, 1.08891079, 1.08603695, 1.05997314, 1.02160943],
[1.11368269, 1.1057147, 1.11893431, 1.13778669, 1.1432272,
1.18257029, 1.16226243, 1.16009196, 1.14467789, 1.14820235,
1.12386598, 1.12680236, 1.12357937, 1.1159258, 1.12570828],
[1.30379431, 1.30752186, 1.31206366, 1.31532267, 1.30625667,
1.31210239, 1.29989156, 1.29203193, 1.27183516, 1.26830786,
1.2617743, 1.28656675, 1.29734097, 1.29390205, 1.29345446],
[0.83953719, 0.82701448, 0.82006005, 0.81188876, 0.80294864,
0.78772975, 0.82848011, 0.8259679, 0.82435705, 0.83108634,
0.84373784, 0.83891093, 0.84349247, 0.85637272, 0.86539395],
[1.23450087, 1.2426022, 1.23537935, 1.23581293, 1.24522626,
1.2256767, 1.21126648, 1.19377804, 1.18355337, 1.19674434,
1.21536573, 1.23653297, 1.27962009, 1.27968392, 1.25907738],
[0.9769662, 0.97400719, 0.98035944, 0.97581531, 0.95543282,
0.96480308, 0.94686376, 0.93679073, 0.92540049, 0.92988835,
0.93442917, 0.92100464, 0.91475304, 0.90249622, 0.9021363],
[0.84986886, 0.8986851, 0.84295997, 0.87280534, 0.85659368,
0.88937573, 0.894401, 0.90448993, 0.95495898, 0.92698333,
0.94745352, 0.92562488, 0.96635366, 1.02520312, 1.0394296],
[1.01922808, 1.00258203, 1.00974428, 1.00303417, 0.99765073,
1.00759019, 0.99192968, 0.99747298, 0.99550759, 0.97583768,
0.9610168, 0.94779638, 0.93759089, 0.93353431, 0.94121705],
[0.86367411, 0.85558932, 0.85544346, 0.85103025, 0.84336613,
0.83434854, 0.85813595, 0.84667961, 0.84374558, 0.85951183,
0.87194227, 0.89455097, 0.88283929, 0.90349491, 0.90600675],
[1.00947534, 1.00411055, 1.00698819, 0.99513687, 0.99291086,
1.00581626, 0.98850522, 0.99291168, 0.98983209, 0.97511924,
0.96134615, 0.96382634, 0.95011401, 0.9434686, 0.94637765],
[1.05712571, 1.05459419, 1.05753012, 1.04880786, 1.05103857,
1.04800023, 1.03024941, 1.04200483, 1.0402554, 1.03296979,
1.02191682, 1.02476275, 1.02347523, 1.02517684, 1.04359571],
[1.07084189, 1.06669497, 1.07937623, 1.07387988, 1.0794043,
1.0531801, 1.07452771, 1.09383478, 1.1052447, 1.10322136,
1.09167939, 1.08772756, 1.08859544, 1.09177338, 1.1096083],
[0.86719222, 0.86628896, 0.86675156, 0.86425632, 0.86511809,
0.86287327, 0.85169796, 0.85411285, 0.84886336, 0.84517414,
0.84843858, 0.84488343, 0.83374329, 0.82812044, 0.82878599],
[0.88389211, 0.92288667, 0.90282398, 0.91229186, 0.92023286,
0.92652175, 0.94278865, 0.93682452, 0.98655146, 0.992237,
0.9798497, 0.93869677, 0.96947771, 1.00362626, 0.98102351],
[0.97082064, 0.95320233, 0.94534081, 0.94215593, 0.93967,
0.93092109, 0.92662519, 0.93412152, 0.93501274, 0.92879506,
0.92110542, 0.91035556, 0.90430364, 0.89994694, 0.90073864],
[0.95861858, 0.95774543, 0.98254811, 0.98919472, 0.98684824,
0.98882205, 0.97662234, 0.95601578, 0.94905385, 0.94934888,
0.97152609, 0.97163004, 0.9700702, 0.97158948, 0.95884908],
[0.83980439, 0.84726737, 0.85747, 0.85467221, 0.8556751,
0.84818516, 0.85265681, 0.84502402, 0.82645665, 0.81743586,
0.83550406, 0.83338919, 0.83511679, 0.82136617, 0.80921874],
[0.95118156, 0.9466212, 0.94688098, 0.9508583, 0.9512441,
0.95440787, 0.96364363, 0.96804412, 0.97136214, 0.97583768,
0.95571724, 0.96895368, 0.97001634, 0.97082733, 0.98782366],
[1.08910044, 1.08248968, 1.08492895, 1.08656923, 1.09454249,
1.10558188, 1.1214086, 1.12292577, 1.13021031, 1.13342735,
1.14686068, 1.14502975, 1.14474747, 1.14084037, 1.16142926],
[1.06336033, 1.07365823, 1.08691496, 1.09764846, 1.11669863,
1.11856702, 1.09764283, 1.08815849, 1.08044313, 1.09278827,
1.07003204, 1.08398066, 1.09831768, 1.09298232, 1.09176125],
[0.79772065, 0.78829196, 0.78581151, 0.77615922, 0.77035744,
0.77751194, 0.79902974, 0.81437881, 0.80788828, 0.79603865,
0.78966436, 0.79949807, 0.80172182, 0.82168155, 0.85587911],
[1.0052447, 1.00007696, 1.00475899, 1.00613942, 1.00639561,
1.00162979, 0.99860739, 1.00814981, 1.00574316, 0.99030032,
0.97682565, 0.97292596, 0.96519561, 0.96173403, 0.95890284],
[0.95808419, 0.9382568, 0.9654441, 0.95561201, 0.96987289,
0.96608031, 0.99727185, 1.00781194, 1.03484236, 1.05333619,
1.0983263, 1.1704974, 1.17025154, 1.18730553, 1.14242645]])
self.assertTrue(np.allclose(result, expected))
self.assertTrue(type(result) == type(expected))
@@ -268,32 +286,35 @@ class SpaceTimeTests(unittest.TestCase):
def test_rebin_data(self):
"""Test rebin_data"""
## sample in double the time (even case since 10 % 2 = 0):
## (0+1)/2, (2+3)/2, (4+5)/2, (6+7)/2, (8+9)/2
## = 0.5, 2.5, 4.5, 6.5, 8.5
# sample in double the time (even case since 10 % 2 = 0):
# (0+1)/2, (2+3)/2, (4+5)/2, (6+7)/2, (8+9)/2
# = 0.5, 2.5, 4.5, 6.5, 8.5
ans_even = np.array([(i + 0.5) * np.ones(10, dtype=float)
for i in range(0, 10, 2)]).T
self.assertTrue(np.array_equal(std.rebin_data(self.time_data, 2), ans_even))
self.assertTrue(
np.array_equal(std.rebin_data(self.time_data, 2), ans_even))
## sample in triple the time (uneven since 10 % 3 = 1):
## (0+1+2)/3, (3+4+5)/3, (6+7+8)/3, (9)/1
## = 1, 4, 7, 9
ans_odd = np.array([i * np.ones(10, dtype=float)
for i in (1, 4, 7, 9)]).T
self.assertTrue(np.array_equal(std.rebin_data(self.time_data, 3), ans_odd))
# sample in triple the time (uneven since 10 % 3 = 1):
# (0+1+2)/3, (3+4+5)/3, (6+7+8)/3, (9)/1
# = 1, 4, 7, 9
ans_odd = np.array([i * np.ones(10, dtype=float)
for i in (1, 4, 7, 9)]).T
self.assertTrue(
np.array_equal(std.rebin_data(self.time_data, 3), ans_odd))
def test_get_prob_dist(self):
"""Test get_prob_dist"""
lag_indices = np.array([1, 2, 3, 4])
unit_indices = np.array([1, 3, 2, 4])
answer = np.array([
[ 0.0754717 , 0.88207547, 0.04245283, 0. , 0. ],
[ 0. , 0. , 0.09411765, 0.87058824, 0.03529412],
[ 0.0049505 , 0.09405941, 0.77722772, 0.11881188, 0.0049505 ],
[ 0. , 0. , 0. , 0.02352941, 0.97647059]
[0.0754717, 0.88207547, 0.04245283, 0., 0.],
[0., 0., 0.09411765, 0.87058824, 0.03529412],
[0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505],
[0., 0., 0., 0.02352941, 0.97647059]
])
result = std.get_prob_dist(self.transition_matrix, lag_indices, unit_indices)
result = std.get_prob_dist(self.transition_matrix,
lag_indices, unit_indices)
self.assertTrue(np.array_equal(result, answer))
@@ -301,16 +322,20 @@ class SpaceTimeTests(unittest.TestCase):
"""Test get_prob_stats"""
probs = np.array([
[ 0.0754717 , 0.88207547, 0.04245283, 0. , 0. ],
[ 0. , 0. , 0.09411765, 0.87058824, 0.03529412],
[ 0.0049505 , 0.09405941, 0.77722772, 0.11881188, 0.0049505 ],
[ 0. , 0. , 0. , 0.02352941, 0.97647059]
[0.0754717, 0.88207547, 0.04245283, 0., 0.],
[0., 0., 0.09411765, 0.87058824, 0.03529412],
[0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505],
[0., 0., 0., 0.02352941, 0.97647059]
])
unit_indices = np.array([1, 3, 2, 4])
answer_up = np.array([0.04245283, 0.03529412, 0.12376238, 0.])
answer_down = np.array([0.0754717, 0.09411765, 0.0990099, 0.02352941])
answer_trend = np.array([-0.03301887 / 0.88207547, -0.05882353 / 0.87058824, 0.02475248 / 0.77722772, -0.02352941 / 0.97647059])
answer_volatility = np.array([ 0.34221495, 0.33705421, 0.29226542, 0.38834223])
answer_trend = np.array([-0.03301887 / 0.88207547,
-0.05882353 / 0.87058824,
0.02475248 / 0.77722772,
-0.02352941 / 0.97647059])
answer_volatility = np.array([0.34221495, 0.33705421,
0.29226542, 0.38834223])
result = std.get_prob_stats(probs, unit_indices)
result_up = result[0]