Compare commits

..

15 Commits

Author SHA1 Message Date
Andy Eschbacher  daba2f9597  release 0.9.5 [ci skip]  2018-04-09 15:22:35 -04:00
Andy Eschbacher  8f28f41060  corrects incorrect variable name  2018-04-09 15:16:52 -04:00
Andy Eschbacher  7509afa5a6  release feature name validation  2018-04-09 14:14:39 -04:00
Andy Eschbacher  a28c68502c  adds feature name validation [ci skip]  2018-04-09 14:09:31 -04:00
Andy Eschbacher  5b4443ca88  new faux release  2018-03-22 13:14:20 -04:00
Andy Eschbacher  2048db33fc  avoids accuracy calculation without model being defined  2018-03-22 13:12:17 -04:00
Andy Eschbacher  99e78800b3  adds latest release file  2018-03-22 11:46:42 -04:00
Andy Eschbacher  800648a710  adds upgrade path for 0.9.2 faux release  2018-03-22 11:08:46 -04:00
Andy Eschbacher  91ee6ecc48  new faux release  2018-03-22 11:02:45 -04:00
Andy Eschbacher  9a5ab17240  replaces petname with uuid for now  2018-03-22 11:01:39 -04:00
Andy Eschbacher  65be9befb1  faux release for staging testing  2018-03-22 10:19:29 -04:00
Andy Eschbacher  37e6b4a228  fixes release path copy error [ci skip]  2018-03-20 11:59:15 -04:00
Andy Eschbacher  766bfed9be  dummy version bump  2018-03-19 13:30:37 -04:00
Andy Eschbacher  e8a601e945  adds model module [ci skip]  2018-03-16 16:45:39 -04:00
Andy Eschbacher  c2be340c07  prototype of model writing  2018-03-16 16:21:00 -04:00
423 changed files with 85288 additions and 246 deletions

.gitignore vendored
View File

@@ -2,3 +2,4 @@ envs/
*.pyc
.DS_Store
.idea/
.*.sw[nop]

View File

@@ -1,40 +1,60 @@
dist: xenial
language: c
dist: precise
sudo: required
env:
global:
- PAGER=cat
- PGUSER=postgres
- PGDATABASE=postgres
- PGOPTIONS='-c client_min_messages=NOTICE'
- PGPORT=5432
- POSTGIS_VERSION="2.5"
matrix:
- POSTGRESQL_VERSION="10"
- POSTGRESQL_VERSION="11"
before_install:
- ./check-up-to-date-with-master.sh
- sudo apt-get -y install python-pip
- sudo service postgresql stop;
- sudo apt-get remove postgresql* -y
- sudo apt-get install -y --allow-unauthenticated --no-install-recommends --no-install-suggests postgresql-$POSTGRESQL_VERSION postgresql-client-$POSTGRESQL_VERSION postgresql-server-dev-$POSTGRESQL_VERSION postgresql-common
- sudo apt-get install -y --allow-unauthenticated postgresql-$POSTGRESQL_VERSION-postgis-$POSTGIS_VERSION postgresql-$POSTGRESQL_VERSION-postgis-$POSTGIS_VERSION-scripts postgis postgresql-plpython-$POSTGRESQL_VERSION
- sudo pg_dropcluster --stop $POSTGRESQL_VERSION main
- sudo rm -rf /etc/postgresql/$POSTGRESQL_VERSION /var/lib/postgresql/$POSTGRESQL_VERSION
- sudo pg_createcluster -u postgres $POSTGRESQL_VERSION main -- -A trust
- sudo /etc/init.d/postgresql start $POSTGRESQL_VERSION || sudo journalctl -xe
- sudo apt-get -y install python-software-properties
- sudo add-apt-repository -y ppa:cartodb/sci
- sudo add-apt-repository -y ppa:cartodb/postgresql-9.5
- sudo add-apt-repository -y ppa:cartodb/gis
- sudo add-apt-repository -y ppa:cartodb/gis-testing
- sudo apt-get update
- sudo apt-get -y install python-pip python-software-properties python-joblib python-nose
- sudo apt-get -y install python-joblib=0.8.3-1-cdb1
- sudo apt-get -y install python-numpy=1:1.6.1-6ubuntu1
- sudo apt-get -y install python-scipy=0.14.0-2-cdb6
- sudo apt-get -y --no-install-recommends install python-sklearn-lib=0.14.1-3-cdb2
- sudo apt-get -y --no-install-recommends install python-sklearn=0.14.1-3-cdb2
- sudo apt-get -y --no-install-recommends install python-scikits-learn=0.14.1-3-cdb2
# Force installation of libgeos-3.5.0 (presumably needed because of the existing version of postgis)
- sudo apt-get -y install libgeos-3.5.0=3.5.0-1cdb2
# Install postgres db and build deps
- sudo /etc/init.d/postgresql stop # stop travis default instance
- sudo apt-get -y remove --purge postgresql-9.1
- sudo apt-get -y remove --purge postgresql-9.2
- sudo apt-get -y remove --purge postgresql-9.3
- sudo apt-get -y remove --purge postgresql-9.4
- sudo apt-get -y remove --purge postgresql-9.5
- sudo rm -rf /var/lib/postgresql/
- sudo rm -rf /var/log/postgresql/
- sudo rm -rf /etc/postgresql/
- sudo apt-get -y remove --purge postgis-2.2
- sudo apt-get -y autoremove
- sudo apt-get -y install postgresql-9.5=9.5.2-3cdb3
- sudo apt-get -y install postgresql-server-dev-9.5=9.5.2-3cdb3
- sudo apt-get -y install postgresql-plpython-9.5=9.5.2-3cdb3
- sudo apt-get -y install postgresql-9.5-postgis-scripts=2.2.2.0-cdb2
- sudo apt-get -y install postgresql-9.5-postgis-2.2=2.2.2.0-cdb2
# configure it to accept local connections from postgres
- echo -e "# TYPE DATABASE USER ADDRESS METHOD \nlocal all postgres trust\nlocal all all trust\nhost all all 127.0.0.1/32 trust" \
| sudo tee /etc/postgresql/9.5/main/pg_hba.conf
- sudo /etc/init.d/postgresql restart 9.5
install:
- sudo make install
script:
- make test
- make test || { cat src/pg/test/regression.diffs; false; }
- ./check-compatibility.sh
after_failure:
- pg_lsclusters
- cat src/pg/test/regression.diffs

View File

@@ -1,9 +1,3 @@
0.8.2 (2019-02-07)
------------------
* Update dependencies to match what is being used in production.
* Update travis to xenial, PG10 and 11, and postgis 2.5
* Compatibility with PG11
0.8.1 (2018-03-12)
------------------
* Adds improperly added version files

View File

@@ -1,20 +0,0 @@
{
"name": "crankshaft",
"current_version": {
"requires": {
"postgres": ">=9.5.0",
"postgis": ">=2.2.0.0",
"python": ">=2.7.0",
"joblib": "0.8.3",
"numpy": "1.6.1",
"scipy": "0.14.0",
"pysal": "1.14.3",
"scikit-learn": "0.14.1"
},
"works_with": {
}
},
"exceptional_versions": {
}
}

View File

@@ -27,7 +27,7 @@ psql -c "SELECT * FROM pg_available_extension_versions WHERE name LIKE 'cranksha
psql $DBNAME <<'EOF'
-- Install dependencies
CREATE EXTENSION plpythonu;
CREATE EXTENSION postgis;
CREATE EXTENSION postgis VERSION '2.2.2';
-- Create role publicuser if it does not exist
DO
@@ -48,49 +48,26 @@ CREATE EXTENSION crankshaft;
\dx
EOF
# Check PG version
PG_VERSION=`psql -q -t -c "SELECT current_setting('server_version_num')"`
# Save public function signatures
if [[ "$PG_VERSION" -lt 110000 ]]; then
psql $DBNAME -c "
CREATE TABLE release_function_signatures AS
SELECT
p.proname as name,
pg_catalog.pg_get_function_result(p.oid) as result_type,
pg_catalog.pg_get_function_arguments(p.oid) as arguments,
CASE
WHEN p.proisagg THEN 'agg'
WHEN p.proiswindow THEN 'window'
WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
ELSE 'normal'
END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
n.nspname = 'cdb_crankshaft'
AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;"
else
psql $DBNAME -c "
CREATE TABLE release_function_signatures AS
SELECT
p.proname as name,
pg_catalog.pg_get_function_result(p.oid) as result_type,
pg_catalog.pg_get_function_arguments(p.oid) as arguments,
CASE WHEN p.prokind = 'a' THEN 'agg'
WHEN p.prokind = 'w' THEN 'window'
WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
ELSE 'normal'
END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
n.nspname = 'cdb_crankshaft'
AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;"
fi
psql $DBNAME <<'EOF'
CREATE TABLE release_function_signatures AS
SELECT
p.proname as name,
pg_catalog.pg_get_function_result(p.oid) as result_type,
pg_catalog.pg_get_function_arguments(p.oid) as arguments,
CASE
WHEN p.proisagg THEN 'agg'
WHEN p.proiswindow THEN 'window'
WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
ELSE 'normal'
END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
n.nspname = 'cdb_crankshaft'
AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;
EOF
# Deploy current dev branch
make clean-dev || die "Could not clean dev files"
@@ -99,42 +76,26 @@ sudo make install || die "Could not deploy current dev branch"
# Check it can be upgraded
psql $DBNAME -c "ALTER EXTENSION crankshaft update to 'dev';" || die "Cannot upgrade to dev version"
if [[ $PG_VERSION -lt 110000 ]]; then
psql $DBNAME -c "
CREATE TABLE dev_function_signatures AS
SELECT p.proname as name,
pg_catalog.pg_get_function_result(p.oid) as result_type,
pg_catalog.pg_get_function_arguments(p.oid) as arguments,
CASE WHEN p.proisagg THEN 'agg'
WHEN p.proiswindow THEN 'window'
WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
ELSE 'normal'
END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
n.nspname = 'cdb_crankshaft'
AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;"
else
psql $DBNAME -c "
CREATE TABLE dev_function_signatures AS
SELECT p.proname as name,
pg_catalog.pg_get_function_result(p.oid) as result_type,
pg_catalog.pg_get_function_arguments(p.oid) as arguments,
CASE WHEN p.prokind = 'a' THEN 'agg'
WHEN p.prokind = 'w' THEN 'window'
WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
ELSE 'normal'
END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
n.nspname = 'cdb_crankshaft'
AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;"
fi
# Check against saved public function signatures
psql $DBNAME <<'EOF'
CREATE TABLE dev_function_signatures AS
SELECT
p.proname as name,
pg_catalog.pg_get_function_result(p.oid) as result_type,
pg_catalog.pg_get_function_arguments(p.oid) as arguments,
CASE
WHEN p.proisagg THEN 'agg'
WHEN p.proiswindow THEN 'window'
WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
ELSE 'normal'
END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
n.nspname = 'cdb_crankshaft'
AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;
EOF
echo "Functions in development not in latest release (ok):"
psql $DBNAME -c "SELECT * FROM dev_function_signatures EXCEPT SELECT * FROM release_function_signatures;"
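For local debugging, the same breaking-change check can be scripted outside psql; a minimal sketch, assuming psycopg2 is available (it is not a dependency of this repo) and that the two signature tables above have already been created — the database name is a placeholder:

import psycopg2  # assumption: installed in the test environment, not part of the repo

def signatures_missing_in_dev(dbname):
    """Rows present in the release signatures but absent from dev (potential breaking changes)."""
    conn = psycopg2.connect(dbname=dbname)
    try:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT * FROM release_function_signatures
                EXCEPT
                SELECT * FROM dev_function_signatures
                ORDER BY 1, 2;
            """)
            return cur.fetchall()
    finally:
        conn.close()

# any returned row is a function that was dropped or changed signature since the release
# print(signatures_missing_in_dev('crankshaft_compat_test'))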

View File

@@ -4,7 +4,7 @@
-- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$
SELECT '0.8.2'::text;
SELECT '0.9.0'::text;
$$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE;
-- Internal identifier of the installed extension instance
@@ -35,16 +35,25 @@ CREATE OR REPLACE FUNCTION
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist
DO $$ BEGIN
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
SFUNC = CDB_PyAggS,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{}"
);
EXCEPTION
WHEN duplicate_function THEN NULL;
END $$;
DO $$
BEGIN
IF NOT EXISTS (
SELECT *
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE n.nspname = 'cdb_crankshaft'
AND p.proname = 'cdb_pyagg'
AND p.proisagg)
THEN
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
SFUNC = CDB_PyAggS,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{}"
);
END IF;
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment(
@@ -89,6 +98,7 @@ CREATE OR REPLACE FUNCTION
query TEXT,
variable_name TEXT,
target_table TEXT,
model_name text DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -105,24 +115,59 @@ AS $$
'learning_rate': learning_rate,
'min_samples_leaf': min_samples_leaf
}
feature_cols = set(plpy.execute('''
all_cols = list(plpy.execute('''
select * from ({query}) as _w limit 0
'''.format(query=query)).colnames()) - set([variable_name, 'cartodb_id', ])
'''.format(query=query)).colnames())
feature_cols = [a for a in all_cols
if a not in [variable_name, 'cartodb_id', ]]
return seg.create_and_predict_segment(
query,
variable_name,
feature_cols,
target_table,
model_params
model_params,
model_name=model_name
)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_RetrieveModelParams(
model_name text,
param_name text
)
RETURNS TABLE(param numeric, feature_name text) AS $$
import pickle
from collections import Iterable
plan = plpy.prepare('''
SELECT model, feature_names FROM model_storage
WHERE name = $1;
''', ['text', ])
try:
model_encoded = plpy.execute(plan, [model_name, ])
except plpy.SPIError as err:
plpy.error('ERROR: {}'.format(err))
plpy.notice(model_encoded[0]['feature_names'])
model = pickle.loads(
model_encoded[0]['model']
)
res = getattr(model, param_name)
if not isinstance(res, Iterable):
raise Exception('Cannot return `{}` as a table'.format(param_name))
return zip(res, model_encoded[0]['feature_names'])
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment(
query TEXT,
variable TEXT,
feature_columns TEXT[],
target_query TEXT,
model_name TEXT DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -144,7 +189,8 @@ AS $$
variable,
feature_columns,
target_query,
model_params
model_params,
model_name=model_name
)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_Gravity(
@@ -1104,19 +1150,27 @@ BEGIN
END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist
DO $$ BEGIN
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{0.0,0.0,0.0}"
);
EXCEPTION
WHEN duplicate_function THEN NULL;
END $$;
DO $$
BEGIN
IF NOT EXISTS (
SELECT *
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE n.nspname = 'cdb_crankshaft'
AND p.proname = 'cdb_weightedmean'
AND p.proisagg)
THEN
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{0.0,0.0,0.0}"
);
END IF;
END
$$ LANGUAGE plpgsql;
-- Spatial Markov
-- input table format:

View File

@@ -4,7 +4,7 @@
-- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$
SELECT '0.8.2'::text;
SELECT '0.9.1'::text;
$$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE;
-- Internal identifier of the installed extension instance
@@ -35,16 +35,25 @@ CREATE OR REPLACE FUNCTION
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist
DO $$ BEGIN
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
SFUNC = CDB_PyAggS,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{}"
);
EXCEPTION
WHEN duplicate_function THEN NULL;
END $$;
DO $$
BEGIN
IF NOT EXISTS (
SELECT *
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE n.nspname = 'cdb_crankshaft'
AND p.proname = 'cdb_pyagg'
AND p.proisagg)
THEN
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
SFUNC = CDB_PyAggS,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{}"
);
END IF;
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment(
@@ -89,6 +98,7 @@ CREATE OR REPLACE FUNCTION
query TEXT,
variable_name TEXT,
target_table TEXT,
model_name text DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -105,24 +115,59 @@ AS $$
'learning_rate': learning_rate,
'min_samples_leaf': min_samples_leaf
}
feature_cols = set(plpy.execute('''
all_cols = list(plpy.execute('''
select * from ({query}) as _w limit 0
'''.format(query=query)).colnames()) - set([variable_name, 'cartodb_id', ])
'''.format(query=query)).colnames())
feature_cols = [a for a in all_cols
if a not in [variable_name, 'cartodb_id', ]]
return seg.create_and_predict_segment(
query,
variable_name,
feature_cols,
target_table,
model_params
model_params,
model_name=model_name
)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_RetrieveModelParams(
model_name text,
param_name text
)
RETURNS TABLE(param numeric, feature_name text) AS $$
import pickle
from collections import Iterable
plan = plpy.prepare('''
SELECT model, feature_names FROM model_storage
WHERE name = $1;
''', ['text', ])
try:
model_encoded = plpy.execute(plan, [model_name, ])
except plpy.SPIError as err:
plpy.error('ERROR: {}'.format(err))
plpy.notice(model_encoded[0]['feature_names'])
model = pickle.loads(
model_encoded[0]['model']
)
res = getattr(model, param_name)
if not isinstance(res, Iterable):
raise Exception('Cannot return `{}` as a table'.format(param_name))
return zip(res, model_encoded[0]['feature_names'])
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment(
query TEXT,
variable TEXT,
feature_columns TEXT[],
target_query TEXT,
model_name TEXT DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -144,7 +189,8 @@ AS $$
variable,
feature_columns,
target_query,
model_params
model_params,
model_name=model_name
)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_Gravity(
@@ -1104,19 +1150,27 @@ BEGIN
END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist
DO $$ BEGIN
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{0.0,0.0,0.0}"
);
EXCEPTION
WHEN duplicate_function THEN NULL;
END $$;
DO $$
BEGIN
IF NOT EXISTS (
SELECT *
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE n.nspname = 'cdb_crankshaft'
AND p.proname = 'cdb_weightedmean'
AND p.proisagg)
THEN
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{0.0,0.0,0.0}"
);
END IF;
END
$$ LANGUAGE plpgsql;
-- Spatial Markov
-- input table format:

10 file diffs suppressed because they are too large

View File

@@ -1,5 +1,5 @@
comment = 'CartoDB Spatial Analysis extension'
default_version = '0.8.2'
default_version = '0.9.5'
requires = 'plpythonu, postgis'
superuser = true
schema = cdb_crankshaft

View File

@@ -1,5 +0,0 @@
joblib==0.9.4
numpy==1.11.0
scipy==0.17.0
pysal==1.14.3
scikit-learn==0.17.0

View File

@@ -0,0 +1,76 @@
"""
Based on the Weiszfeld algorithm:
https://en.wikipedia.org/wiki/Geometric_median
"""
# import plpy
import numpy as np
from numpy.linalg import norm
def median_center(tablename, geom_col, num_iters=50, tolerance=0.001):
query = '''
SELECT array_agg(ST_X({geom_col})) As x_coords,
array_agg(ST_Y({geom_col})) As y_coords
FROM {tablename}
'''.format(geom_col=geom_col, tablename=tablename)
try:
resp = plpy.execute(query)
data = np.vstack((resp['x_coords'][0],
resp['y_coords'][0])).T
plpy.notice('coords: %s' % str(data))
except Exception, err:
# plpy.error('Analysis failed: %s' % err)
print('No plpy')
data = np.array([[1.2 * np.random.random() + 10.,
1.1 * (np.random.random() - 1.) + 3.]
for i in range(1, 100)])
# initialize 'median center' to be the mean
coords_center_temp = data.mean(axis=0)
# plpy.notice('temp_center: %s' % str(coords_center_temp))
print('temp_center: %s' % str(coords_center_temp))
for i in range(0, num_iters):
old_coords_center = coords_center_temp.copy()
denom = denominator(coords_center_temp, data)
coords_center_temp = np.sum([data[j] * numerator(coords_center_temp,
data[j])
for j in range(len(data))], axis=0)
coords_center_temp = coords_center_temp / denom
print("Pass #%d" % i)
print("max, min of data: %0.4f, %0.4f" % (data.max(), data.min()))
print('temp_center: %s' % str(coords_center_temp))
print("Change in center: %0.4f" % np.linalg.norm(old_coords_center -
coords_center_temp))
print("Center coords: %s" % str(coords_center_temp))
print("Objective Function: %0.4f" % obj_func(coords_center_temp, data))
return coords_center_temp
def obj_func(center_coords, data):
"""
"""
return np.linalg.norm(center_coords - data)
def numerator(center_coords, data_i):
"""
"""
return np.reciprocal(np.linalg.norm(center_coords - data_i))
def denominator(center_coords, data):
"""
"""
return np.reciprocal(np.linalg.norm(data - center_coords))
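The prototype above mixes plpy calls with a local fallback; for reference, a minimal self-contained sketch of the textbook Weiszfeld recurrence (plain numpy, no database access) looks like the following. This is not the implementation shipped above, just the iteration it is based on:

import numpy as np

def weiszfeld(points, num_iters=50, tolerance=1e-4):
    """points: (n, 2) array; returns an approximate geometric median."""
    center = points.mean(axis=0)
    for _ in range(num_iters):
        dists = np.linalg.norm(points - center, axis=1)
        dists = np.where(dists == 0, 1e-12, dists)  # guard against division by zero
        weights = 1.0 / dists
        new_center = (points * weights[:, None]).sum(axis=0) / weights.sum()
        if np.linalg.norm(new_center - center) < tolerance:
            return new_center
        center = new_center
    return center

# usage sketch:
# pts = np.random.rand(100, 2)
# print(weiszfeld(pts))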

View File

@@ -0,0 +1 @@
from core import set_model, get_model, create_model_table

View File

@@ -0,0 +1,86 @@
import time
import plpy
import pickle
from petname import generate
def create_model_table():
q = '''
create table if not exists model_storage(
description text,
name text unique,
model bytea,
feature_names text[],
date_created timestamptz,
id serial primary key);
'''
plpy.notice(q)
plan = plpy.prepare(q)
resp = plpy.execute(plan)
plpy.notice('Model table successfully created')
plpy.notice(str(resp))
def get_model(model_name):
"""retrieve model if it exists"""
try:
plan = plpy.prepare('''
SELECT model FROM model_storage
WHERE name = $1;
''', ['text', ])
model_encoded = plpy.execute(plan, [model_name, ])
if len(model_encoded) == 1:
model = pickle.loads(
model_encoded[0]['model']
)
plpy.notice('Model successfully loaded')
else:
plpy.notice('Model not found, or too many models '
'({})'.format(len(model_encoded)))
model = None
except plpy.SPIError as err:
plpy.error('ERROR: {}'.format(err))
return model
def set_model(model, model_name, feature_names):
"""stores the model in the table model_storage"""
if model_name is None:
model_name = generate(words=2, separator='_', letters=8)
existing_names = plpy.execute('''
SELECT array_agg(name) as name
FROM model_storage
''')
plpy.notice('nrows: {}'.format(existing_names.nrows()))
plpy.notice('MODEL NAME: {}'.format(model_name))
plpy.notice('LEN of ms: {}'.format(len(existing_names)))
plpy.notice('existing_names: {}'.format(str(existing_names)))
plpy.notice('existing_names: {}'.format(str(existing_names[0]['name'])))
plpy.notice('type existing_names: {}'.format(type(existing_names[0]['name'])))
if existing_names[0]['name'] is not None:
while model_name in existing_names[0]['name']:
model_name = generate(words=2, separator='_', letters=10)
plpy.notice(model_name)
# store model
try:
plan = plpy.prepare('''
INSERT INTO model_storage(description, name, model, feature_names, date_created)
VALUES (
$1,
$2,
$3,
$4::text[],
to_timestamp($5));
''', ['text', 'text', 'bytea', 'text', 'numeric'])
plpy.notice('{%s}' % ','.join(feature_names))
plpy.notice(feature_names)
plpy.execute(
plan,
[' '.join(m.strip() for m in model.__repr__().split('\n')),
model_name,
pickle.dumps(model),
'{%s}' % ','.join(feature_names),
time.time()]
)
plpy.notice('model successfully stored as {}'.format(model_name))
except plpy.SPIError as err:
plpy.notice('ERROR: {}\nt: {}'.format(err, time.time()))
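A hedged usage sketch of the round trip this module provides, assuming it runs inside a plpythonu function (so plpy exists) and the crankshaft package is importable; the model name and feature list are placeholders, not anything the repo defines:

from sklearn.ensemble import GradientBoostingRegressor
from crankshaft import model_storage

model_storage.create_model_table()  # creates model_storage only if it does not exist
gbr = GradientBoostingRegressor(n_estimators=10).fit(
    [[0.0], [1.0], [2.0], [3.0]], [0.0, 1.0, 2.0, 3.0])
model_storage.set_model(gbr, 'demo_gbr', ['feature_1'])
restored = model_storage.get_model('demo_gbr')  # unpickled estimator, or None if missing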

View File

@@ -2,11 +2,14 @@
Segmentation creation and prediction
"""
import pickle
import plpy
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from crankshaft.analysis_data_provider import AnalysisDataProvider
from crankshaft import model_storage
# NOTE: added optional param here
@@ -51,6 +54,7 @@ class Segmentation(object):
def create_and_predict_segment(self, query, variable, feature_columns,
target_query, model_params,
model_name=None,
id_col='cartodb_id'):
"""
generate a segment with machine learning
@@ -70,15 +74,23 @@ class Segmentation(object):
(target, features, target_mean,
feature_means) = self.clean_data(query, variable, feature_columns)
model, accuracy = train_model(target, features, model_params, 0.2)
model_storage.create_model_table()
# find model if it exists and is specified
if model_name is not None:
model = model_storage.get_model(model_name)
if locals().get('model') is None:
model, accuracy = train_model(target, features, model_params, 0.2)
result = self.predict_segment(model, feature_columns, target_query,
feature_means)
accuracy_array = [accuracy] * result.shape[0]
rowid = self.data_provider.get_segmentation_data(params)
'''
rowid = [{'ids': [2.9, 4.9, 4, 5, 6]}]
'''
# store the model for later use
model_storage.set_model(model, model_name, feature_columns)
return zip(rowid[0]['ids'], result, accuracy_array)
def predict_segment(self, model, feature_columns, target_query,
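The control flow added above — reuse a stored model when a name is supplied, train otherwise, then persist whichever model ended up being used — can be read as the following standalone sketch (the function and parameter names are illustrative, not the repo's API):

def load_or_train(model_name, load, train, store):
    model = load(model_name) if model_name is not None else None
    if model is None:
        model = train()
    store(model, model_name)
    return model

In the diff itself the same effect comes from the locals().get('model') check, which is None both when no model_name was supplied and when get_model found nothing.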

View File

@@ -0,0 +1,5 @@
joblib==0.8.3
numpy==1.6.1
scipy==0.14.0
pysal==1.14.3
scikit-learn==0.14.1

View File

@@ -10,7 +10,7 @@ from setuptools import setup, find_packages
setup(
name='crankshaft',
version='0.8.2',
version='0.0.0',
description='CartoDB Spatial Analysis Python Library',
@@ -41,7 +41,7 @@ setup(
# The choice of component versions is dictated by what's
# provisioned in the production servers.
# IMPORTANT NOTE: please don't change this line. Instead issue a ticket to systems for evaluation.
install_requires=['joblib==0.9.4', 'numpy==1.11.0', 'scipy==0.17.0', 'pysal==1.14.3', 'scikit-learn==0.17.0'],
install_requires=['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.14.3', 'scikit-learn==0.14.1', 'petname==2.2'],
requires=['pysal', 'numpy', 'sklearn'],

View File

@@ -0,0 +1,49 @@
"""
CartoDB Spatial Analysis Python Library
See:
https://github.com/CartoDB/crankshaft
"""
from setuptools import setup, find_packages
setup(
name='crankshaft',
version='0.0.0',
description='CartoDB Spatial Analysis Python Library',
url='https://github.com/CartoDB/crankshaft',
author='Data Services Team - CartoDB',
author_email='dataservices@cartodb.com',
license='MIT',
classifiers=[
'Development Status :: 3 - Alpha',
'Intended Audience :: Mapping community',
'Topic :: Maps :: Mapping Tools',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 2.7',
],
keywords='maps mapping tools spatial analysis geostatistics',
packages=find_packages(exclude=['contrib', 'docs', 'tests']),
extras_require={
'dev': ['unittest'],
'test': ['unittest', 'nose', 'mock'],
},
# The choice of component versions is dictated by what's
# provisioned in the production servers.
# IMPORTANT NOTE: please don't change this line. Instead issue a ticket to systems for evaluation.
install_requires=['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.14.3', 'scikit-learn==0.14.1', 'petname==2.2'],
requires=['pysal', 'numpy', 'sklearn'],
test_suite='test'
)

View File

@@ -0,0 +1,6 @@
{
"production_col": [10, 10, 10],
"capacity_col": [0.09, 0.31],
"marginal_col": [5, 5],
"pairwise": [[1, 2, 3], [3, 2, 1]]
}

View File

@@ -0,0 +1,15 @@
from test.helper import plpy, fixture_file
from crankshaft.analysis_data_provider import AnalysisDataProvider
import json
import crankshaft
class RawDataProvider(AnalysisDataProvider):
def __init__(self, fixturedata):
self.your_algo_data = fixturedata
def get_moran(self, params):
"""
Replace this function name with the one used in your algorithm,
and make sure to use the same function signature that is written
for this algo in analysis_data_provider.py
"""
return self.your_algo_data
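A hedged sketch of how a stub provider like this might be driven from a test, assuming the test helper's plpy stub behaves as in the existing tests and a JSON fixture shaped like the rows get_moran normally returns (keys id, attr1, neighbors); the fixture name is a placeholder:

from test.helper import plpy, fixture_file  # the helper's plpy stub makes crankshaft importable
from crankshaft.clustering import Moran
import json

with open(fixture_file('moran.json')) as fh:
    rows = json.loads(fh.read())
moran = Moran(data_provider=RawDataProvider(rows))
# the subquery is ignored by the stub; only the fixture data is used
result = moran.global_stat('SELECT * FROM ignored', 'attr',
                           'knn', 5, 99, 'the_geom', 'cartodb_id')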

View File

@@ -0,0 +1,7 @@
"""Import all modules"""
import crankshaft.random_seeds
import crankshaft.clustering
import crankshaft.space_time_dynamics
import crankshaft.segmentation
import crankshaft.regression
import analysis_data_provider

View File

@@ -0,0 +1,149 @@
"""class for fetching data"""
import plpy
import pysal_utils as pu
NULL_VALUE_ERROR = ('No usable data passed to analysis. Check your input rows '
'for null values and fill in appropriately.')
def verify_data(func):
"""decorator to verify data result before returning to algorithm"""
def wrapper(*args, **kwargs):
"""Error checking"""
try:
data = func(*args, **kwargs)
if not data:
plpy.error(NULL_VALUE_ERROR)
else:
return data
except plpy.SPIError as err:
plpy.error('Analysis failed: {}'.format(err))
return []
return wrapper
class AnalysisDataProvider(object):
"""Data fetching class for pl/python functions"""
@verify_data
def get_getis(self, w_type, params): # pylint: disable=no-self-use
"""fetch data for getis ord's g"""
query = pu.construct_neighbor_query(w_type, params)
return plpy.execute(query)
@verify_data
def get_markov(self, w_type, params): # pylint: disable=no-self-use
"""fetch data for spatial markov"""
query = pu.construct_neighbor_query(w_type, params)
return plpy.execute(query)
@verify_data
def get_moran(self, w_type, params): # pylint: disable=no-self-use
"""fetch data for moran's i analyses"""
query = pu.construct_neighbor_query(w_type, params)
return plpy.execute(query)
@verify_data
def get_nonspatial_kmeans(self, params): # pylint: disable=no-self-use
"""
Fetch data for non-spatial k-means.
Inputs - a dict (params) with the following keys:
colnames: a (text) list of column names (e.g.,
`['andy', 'cookie']`)
id_col: the name of the id column (e.g., `'cartodb_id'`)
subquery: the subquery for exposing the data (e.g.,
SELECT * FROM favorite_things)
Output:
A SQL query for packaging the data for consumption within
`KMeans().nonspatial`. Format will be a list of length one,
with the first element a dict with keys ('rowid', 'attr1',
'attr2', ...)
"""
agg_cols = ', '.join([
'array_agg({0}) As arr_col{1}'.format(val, idx+1)
for idx, val in enumerate(params['colnames'])
])
query = '''
SELECT {cols}, array_agg({id_col}) As rowid
FROM ({subquery}) As a
'''.format(subquery=params['subquery'],
id_col=params['id_col'],
cols=agg_cols).strip()
return plpy.execute(query)
@verify_data
def get_segmentation_model_data(self, params): # pylint: disable=R0201
"""
fetch data for Segmentation
params = {"subquery": query,
"target": variable,
"features": feature_columns}
"""
columns = ', '.join(['array_agg("{col}") As "{col}"'.format(col=col)
for col in params['features']])
query = '''
SELECT
array_agg("{target}") As target,
{columns}
FROM ({subquery}) As q
'''.format(subquery=params['subquery'],
target=params['target'],
columns=columns)
return plpy.execute(query)
@verify_data
def get_segmentation_data(self, params): # pylint: disable=no-self-use
"""
params = {"subquery": target_query,
"id_col": id_col}
"""
query = '''
SELECT
array_agg("{id_col}" ORDER BY "{id_col}") as "ids"
FROM ({subquery}) as q
'''.format(**params)
return plpy.execute(query)
@verify_data
def get_segmentation_predict_data(self, params): # pylint: disable=R0201
"""
fetch data for Segmentation
params = {"subquery": target_query,
"feature_columns": feature_columns}
"""
joined_features = ', '.join(['"{}"::numeric'.format(a)
for a in params['feature_columns']])
query = '''
SELECT
Array[{joined_features}] As features
FROM ({subquery}) as q
'''.format(subquery=params['subquery'],
joined_features=joined_features)
return plpy.cursor(query)
@verify_data
def get_spatial_kmeans(self, params): # pylint: disable=no-self-use
"""fetch data for spatial kmeans"""
query = '''
SELECT
array_agg("{id_col}" ORDER BY "{id_col}") as ids,
array_agg(ST_X("{geom_col}") ORDER BY "{id_col}") As xs,
array_agg(ST_Y("{geom_col}") ORDER BY "{id_col}") As ys
FROM ({subquery}) As a
WHERE "{geom_col}" IS NOT NULL
'''.format(**params)
return plpy.execute(query)
@verify_data
def get_gwr(self, params): # pylint: disable=no-self-use
"""fetch data for gwr analysis"""
query = pu.gwr_query(params)
return plpy.execute(query)
@verify_data
def get_gwr_predict(self, params): # pylint: disable=no-self-use
"""fetch data for gwr predict"""
query = pu.gwr_predict_query(params)
return plpy.execute(query)

View File

@@ -0,0 +1,76 @@
"""
Based on the Weiszfeld algorithm:
https://en.wikipedia.org/wiki/Geometric_median
"""
# import plpy
import numpy as np
from numpy.linalg import norm
def median_center(tablename, geom_col, num_iters=50, tolerance=0.001):
query = '''
SELECT array_agg(ST_X({geom_col})) As x_coords,
array_agg(ST_Y({geom_col})) As y_coords
FROM {tablename}
'''.format(geom_col=geom_col, tablename=tablename)
try:
resp = plpy.execute(query)
data = np.vstack((resp['x_coords'][0],
resp['y_coords'][0])).T
plpy.notice('coords: %s' % str(data))
except Exception, err:
# plpy.error('Analysis failed: %s' % err)
print('No plpy')
data = np.array([[1.2 * np.random.random() + 10.,
1.1 * (np.random.random() - 1.) + 3.]
for i in range(1, 100)])
# initialize 'median center' to be the mean
coords_center_temp = data.mean(axis=0)
# plpy.notice('temp_center: %s' % str(coords_center_temp))
print('temp_center: %s' % str(coords_center_temp))
for i in range(0, num_iters):
old_coords_center = coords_center_temp.copy()
denom = denominator(coords_center_temp, data)
coords_center_temp = np.sum([data[j] * numerator(coords_center_temp,
data[j])
for j in range(len(data))], axis=0)
coords_center_temp = coords_center_temp / denom
print("Pass #%d" % i)
print("max, min of data: %0.4f, %0.4f" % (data.max(), data.min()))
print('temp_center: %s' % str(coords_center_temp))
print("Change in center: %0.4f" % np.linalg.norm(old_coords_center -
coords_center_temp))
print("Center coords: %s" % str(coords_center_temp))
print("Objective Function: %0.4f" % obj_func(coords_center_temp, data))
return coords_center_temp
def obj_func(center_coords, data):
"""
"""
return np.linalg.norm(center_coords - data)
def numerator(center_coords, data_i):
"""
"""
return np.reciprocal(np.linalg.norm(center_coords - data_i))
def denominator(center_coords, data):
"""
"""
return np.reciprocal(np.linalg.norm(data - center_coords))

View File

@@ -0,0 +1,4 @@
"""Import all functions from for clustering"""
from moran import *
from kmeans import *
from getis import *

View File

@@ -0,0 +1,50 @@
"""
Getis-Ord's G geostatistics (hotspot/coldspot analysis)
"""
import pysal as ps
from collections import OrderedDict
# crankshaft modules
import crankshaft.pysal_utils as pu
from crankshaft.analysis_data_provider import AnalysisDataProvider
# High level interface ---------------------------------------
class Getis(object):
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()
else:
self.data_provider = data_provider
def getis_ord(self, subquery, attr,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Getis-Ord's G*
Implementation building neighbors with a PostGIS database and PySAL's
Getis-Ord's G* hotspot/coldspot module.
Andy Eschbacher
"""
# geometries with null attribute values are ignored,
# which can make kNN neighborhoods less local than requested
params = OrderedDict([("id_col", id_col),
("attr1", attr),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_getis(w_type, params)
attr_vals = pu.get_attributes(result)
# build PySAL weight object
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate Getis-Ord's G* z- and p-values
getis = ps.esda.getisord.G_Local(attr_vals, weight,
star=True, permutations=permutations)
return zip(getis.z_sim, getis.p_sim, getis.p_z_sim, weight.id_order)
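An illustrative call, assuming a plpython environment where the default data provider can reach the database; the table and column names are placeholders:

getis = Getis()
rows = getis.getis_ord('SELECT * FROM my_table', 'attr_value',
                       'knn', 5, 99, 'the_geom', 'cartodb_id')
# each row is (z_score, p_value, p_z_sim, row_id); a large positive z with a
# small p-value flags a hot spot, a large negative z a cold spot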

View File

@@ -0,0 +1,113 @@
from sklearn.cluster import KMeans
import numpy as np
from crankshaft.analysis_data_provider import AnalysisDataProvider
class Kmeans(object):
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()
else:
self.data_provider = data_provider
def spatial(self, query, no_clusters, no_init=20):
"""
find centers based on clusters of latitude/longitude pairs
query: SQL query that has a WGS84 geometry (the_geom)
"""
params = {"subquery": query,
"geom_col": "the_geom",
"id_col": "cartodb_id"}
result = self.data_provider.get_spatial_kmeans(params)
# Unpack query response
xs = result[0]['xs']
ys = result[0]['ys']
ids = result[0]['ids']
km = KMeans(n_clusters=no_clusters, n_init=no_init)
labels = km.fit_predict(zip(xs, ys))
return zip(ids, labels)
def nonspatial(self, subquery, colnames, no_clusters=5,
standardize=True, id_col='cartodb_id'):
"""
Arguments:
query (string): A SQL query to retrieve the data required to do the
k-means clustering analysis, like so:
SELECT * FROM iris_flower_data
colnames (list): a list of the column names which contain the data
of interest, like so: ['sepal_width',
'petal_width',
'sepal_length',
'petal_length']
no_clusters (int): number of clusters (greater than zero)
id_col (string): name of the input id_column
Returns:
A list of tuples with the following columns:
cluster labels: a label for the cluster that the row belongs to
centers: center of the cluster that this row belongs to
silhouettes: silhouette measure for this value
rowid: row that these values belong to (corresponds to the value in
`id_col`)
"""
import json
from sklearn import metrics
params = {
"colnames": colnames,
"subquery": subquery,
"id_col": id_col
}
data = self.data_provider.get_nonspatial_kmeans(params)
# fill array with values for k-means clustering
if standardize:
cluster_columns = _scale_data(
_extract_columns(data))
else:
cluster_columns = _extract_columns(data)
kmeans = KMeans(n_clusters=no_clusters,
random_state=0).fit(cluster_columns)
centers = [json.dumps(dict(zip(colnames, c)))
for c in kmeans.cluster_centers_[kmeans.labels_]]
silhouettes = metrics.silhouette_samples(cluster_columns,
kmeans.labels_,
metric='sqeuclidean')
return zip(kmeans.labels_,
centers,
silhouettes,
[kmeans.inertia_] * kmeans.labels_.shape[0],
data[0]['rowid'])
# -- Preprocessing steps
def _extract_columns(data):
"""
Extract the features from the query and pack them into a NumPy array
data (list of dicts): result of the kmeans request
"""
# number of columns minus rowid column
n_cols = len(data[0]) - 1
return np.array([data[0]['arr_col{0}'.format(i+1)]
for i in xrange(n_cols)],
dtype=float).T
def _scale_data(features):
"""
Scale all input columns to center on 0 with a standard deviation of 1
features (numpy matrix): features of dimension (n_features, n_samples)
"""
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
return scaler.fit_transform(features)
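An illustrative call of the non-spatial variant, again assuming a plpython environment where the default provider can execute the generated SQL; the table and column names are placeholders:

km = Kmeans()
rows = km.nonspatial('SELECT * FROM iris_flower_data',
                     ['sepal_width', 'petal_width'], no_clusters=3)
# each row is (cluster_label, center_as_json, silhouette, inertia, rowid)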

View File

@@ -0,0 +1,341 @@
"""
Moran's I geostatistics (global clustering & outliers presence)
Functionality relies on a combination of `PySAL
<http://pysal.readthedocs.io/en/latest/>`__ and the data provider provided in
the class instantiation (which defaults to PostgreSQL's plpy module's `database
access functions <https://www.postgresql.org/docs/10/static/plpython.html>`__).
"""
from collections import OrderedDict
import pysal as ps
# crankshaft module
import crankshaft.pysal_utils as pu
from crankshaft.analysis_data_provider import AnalysisDataProvider
# High level interface ---------------------------------------
class Moran(object):
"""Class for calculation of Moran's I statistics (global, local, and local
rate)
Parameters:
data_provider (:obj:`AnalysisDataProvider`): Class for fetching data. See
the `crankshaft.analysis_data_provider` module for more information.
"""
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()
else:
self.data_provider = data_provider
def global_stat(self, subquery, attr_name,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I (global)
Implementation building neighbors with a PostGIS database and Moran's I
core clusters with PySAL.
Args:
subquery (str): Query to give access to the data needed. This query
must give access to ``attr_name``, ``geom_col``, and ``id_col``.
attr_name (str): Column name of data to analyze
w_type (str): Type of spatial weight. Must be one of `knn`
or `queen`. See `PySAL documentation
<http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
for more information.
num_ngbrs (int): If using `knn` for ``w_type``, this
specifies the number of neighbors to be used to define the spatial
neighborhoods.
permutations (int): Number of permutations for performing
conditional randomization to find the p-value. Higher numbers
take a longer time for getting results.
geom_col (str): Name of the geometry column in the dataset for
finding the spatial neighborhoods.
id_col (str): Row index for each value. Usually the database index.
"""
params = OrderedDict([("id_col", id_col),
("attr1", attr_name),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
# collect attributes
attr_vals = pu.get_attributes(result)
# calculate weights
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate moran global
moran_global = ps.esda.moran.Moran(attr_vals, weight,
permutations=permutations)
return zip([moran_global.I], [moran_global.EI])
def local_stat(self, subquery, attr,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I (local)
Args:
subquery (str): Query to give access to the data needed. This query
must give access to ``attr_name``, ``geom_col``, and ``id_col``.
attr (str): Column name of data to analyze
w_type (str): Type of spatial weight. Must be one of `knn`
or `queen`. See `PySAL documentation
<http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
for more information.
num_ngbrs (int): If using `knn` for ``w_type``, this
specifies the number of neighbors to be used to define the spatial
neighborhoods.
permutations (int): Number of permutations for performing
conditional randomization to find the p-value. Higher numbers
take a longer time for getting results.
geom_col (str): Name of the geometry column in the dataset for
finding the spatial neighborhoods.
id_col (str): Row index for each value. Usually the database index.
Returns:
list of tuples: Where each tuple consists of the following values:
- quadrants classification (one of `HH`, `HL`, `LL`, or `LH`)
- p-value
- spatial lag
- standardized spatial lag (centered on the mean, normalized by the
standard deviation)
- original value
- standardized value
- Moran's I statistic
- original row index
"""
# geometries with null attribute values are ignored,
# which can make the resulting neighborhoods less local
params = OrderedDict([("id_col", id_col),
("attr1", attr),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
attr_vals = pu.get_attributes(result)
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local(attr_vals, weight,
permutations=permutations)
# find quadrants for each geometry
quads = quad_position(lisa.q)
# calculate spatial lag
lag = ps.weights.spatial_lag.lag_spatial(weight, lisa.y)
lag_std = ps.weights.spatial_lag.lag_spatial(weight, lisa.z)
return zip(
quads,
lisa.p_sim,
lag,
lag_std,
lisa.y,
lisa.z,
lisa.Is,
weight.id_order
)
def global_rate_stat(self, subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I Rate (global)
Args:
subquery (str): Query to give access to the data needed. This query
must give access to ``attr_name``, ``geom_col``, and ``id_col``.
numerator (str): Column name of numerator to analyze
denominator (str): Column name of the denominator
w_type (str): Type of spatial weight. Must be one of `knn`
or `queen`. See `PySAL documentation
<http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
for more information.
num_ngbrs (int): If using `knn` for ``w_type``, this
specifies the number of neighbors to be used to define the spatial
neighborhoods.
permutations (int): Number of permutations for performing
conditional randomization to find the p-value. Higher numbers
take a longer time for getting results.
geom_col (str): Name of the geometry column in the dataset for
finding the spatial neighborhoods.
id_col (str): Row index for each value. Usually the database index.
"""
params = OrderedDict([("id_col", id_col),
("attr1", numerator),
("attr2", denominator),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
# collect attributes
numer = pu.get_attributes(result, 1)
denom = pu.get_attributes(result, 2)
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate moran global rate
lisa_rate = ps.esda.moran.Moran_Rate(numer, denom, weight,
permutations=permutations)
return zip([lisa_rate.I], [lisa_rate.EI])
def local_rate_stat(self, subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I Local Rate
Args:
subquery (str): Query to give access to the data needed. This query
must give access to ``attr_name``, ``geom_col``, and ``id_col``.
numerator (str): Column name of numerator to analyze
denominator (str): Column name of the denominator
w_type (str): Type of spatial weight. Must be one of `knn`
or `queen`. See `PySAL documentation
<http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
for more information.
num_ngbrs (int): If using `knn` for ``w_type``, this
specifies the number of neighbors to be used to define the spatial
neighborhoods.
permutations (int): Number of permutations for performing
conditional randomization to find the p-value. Higher numbers
take a longer time for getting results.
geom_col (str): Name of the geometry column in the dataset for
finding the spatial neighborhoods.
id_col (str): Row index for each value. Usually the database index.
Returns:
list of tuples: Where each tuple consists of the following values:
- quadrants classification (one of `HH`, `HL`, `LL`, or `LH`)
- p-value
- spatial lag
- standardized spatial lag (centered on the mean, normalized by the
standard deviation)
- original value (roughly numerator divided by denominator)
- standardized value
- Moran's I statistic
- original row index
"""
# geometries with null values are ignored,
# which can make the resulting neighborhoods less local
params = OrderedDict([("id_col", id_col),
("numerator", numerator),
("denominator", denominator),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
# collect attributes
numer = pu.get_attributes(result, 1)
denom = pu.get_attributes(result, 2)
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local_Rate(numer, denom, weight,
permutations=permutations)
# find quadrants for each geometry
quads = quad_position(lisa.q)
# spatial lag
lag = ps.weights.spatial_lag.lag_spatial(weight, lisa.y)
lag_std = ps.weights.spatial_lag.lag_spatial(weight, lisa.z)
return zip(
quads,
lisa.p_sim,
lag,
lag_std,
lisa.y,
lisa.z,
lisa.Is,
weight.id_order
)
def local_bivariate_stat(self, subquery, attr1, attr2,
permutations, geom_col, id_col,
w_type, num_ngbrs):
"""
Moran's I (local) Bivariate (untested)
"""
params = OrderedDict([("id_col", id_col),
("attr1", attr1),
("attr2", attr2),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
# collect attributes
attr1_vals = pu.get_attributes(result, 1)
attr2_vals = pu.get_attributes(result, 2)
# create weights
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local_BV(attr1_vals, attr2_vals, weight,
permutations=permutations)
# find clustering of significance
lisa_sig = quad_position(lisa.q)
return zip(lisa.Is, lisa_sig, lisa.p_sim, weight.id_order)
# Low level functions ----------------------------------------
def map_quads(coord):
"""
Map a quadrant number to Moran's I designation
HH=1, LH=2, LL=3, HL=4
Args:
coord (int): quadrant of a specific measurement
Returns:
classification (one of 'HH', 'LH', 'LL', or 'HL')
"""
if coord == 1:
return 'HH'
elif coord == 2:
return 'LH'
elif coord == 3:
return 'LL'
elif coord == 4:
return 'HL'
return None
def quad_position(quads):
"""
Map all quads
Args:
quads (:obj:`numpy.ndarray`): an array of quads classified by
1-4 (PySAL default)
Returns:
list: an array of quads classified by 'HH', 'LL', etc.
"""
return [map_quads(q) for q in quads]

View File

@@ -0,0 +1 @@
from core import set_model, get_model, create_model_table

View File

@@ -0,0 +1,86 @@
import time
import plpy
import pickle
from petname import generate
def create_model_table():
q = '''
create table if not exists model_storage(
description text,
name text unique,
model bytea,
feature_names text[],
date_created timestamptz,
id serial primary key);
'''
plpy.notice(q)
plan = plpy.prepare(q)
resp = plpy.execute(plan)
plpy.notice('Model table successfully created')
plpy.notice(str(resp))
def get_model(model_name):
"""retrieve model if it exists"""
try:
plan = plpy.prepare('''
SELECT model FROM model_storage
WHERE name = $1;
''', ['text', ])
model_encoded = plpy.execute(plan, [model_name, ])
if len(model_encoded) == 1:
model = pickle.loads(
model_encoded[0]['model']
)
plpy.notice('Model successfully loaded')
else:
plpy.notice('Model not found, or too many models '
'({})'.format(len(model_encoded)))
model = None
except plpy.SPIError as err:
plpy.error('ERROR: {}'.format(err))
return model
def set_model(model, model_name, feature_names):
"""stores the model in the table model_storage"""
if model_name is None:
model_name = generate(words=2, separator='_', letters=8)
existing_names = plpy.execute('''
SELECT array_agg(name) as name
FROM model_storage
''')
plpy.notice('nrows: {}'.format(existing_names.nrows()))
plpy.notice('MODEL NAME: {}'.format(model_name))
plpy.notice('LEN of ms: {}'.format(len(existing_names)))
plpy.notice('existing_names: {}'.format(str(existing_names)))
plpy.notice('existing_names: {}'.format(str(existing_names[0]['name'])))
plpy.notice('type existing_names: {}'.format(type(existing_names[0]['name'])))
if existing_names[0]['name'] is not None:
while model_name in existing_names[0]['name']:
model_name = generate(words=2, separator='_', letters=10)
plpy.notice(model_name)
# store model
try:
plan = plpy.prepare('''
INSERT INTO model_storage(description, name, model, feature_names, date_created)
VALUES (
$1,
$2,
$3,
$4::text[],
to_timestamp($5));
''', ['text', 'text', 'bytea', 'text', 'numeric'])
plpy.notice('{%s}' % ','.join(feature_names))
plpy.notice(feature_names)
plpy.execute(
plan,
[' '.join(m.strip() for m in model.__repr__().split('\n')),
model_name,
pickle.dumps(model),
'{%s}' % ','.join(feature_names),
time.time()]
)
plpy.notice('model successfully stored as {}'.format(model_name))
except plpy.SPIError as err:
plpy.notice('ERROR: {}\nt: {}'.format(err, time.time()))

View File

@@ -0,0 +1,2 @@
"""Import all functions for pysal_utils"""
from crankshaft.pysal_utils.pysal_utils import *

View File

@@ -0,0 +1,251 @@
"""
Utilities module for generic PySAL functionality, mainly centered on
translating queries into numpy arrays or PySAL weights objects
"""
import numpy as np
import pysal as ps
def construct_neighbor_query(w_type, query_vals):
"""Return query (a string) used for finding neighbors
@param w_type text: type of neighbors to calculate ('knn' or 'queen')
@param query_vals dict: values used to construct the query
"""
if w_type.lower() == 'knn':
return knn(query_vals)
else:
return queen(query_vals)
# Build weight object
def get_weight(query_res, w_type='knn', num_ngbrs=5):
"""
Construct PySAL weight from return value of query
@param query_res dict-like: query results with attributes and neighbors
"""
neighbors = {x['id']: x['neighbors'] for x in query_res}
print 'len of neighbors: %d' % len(neighbors)
built_weight = ps.W(neighbors)
built_weight.transform = 'r'
return built_weight
def query_attr_select(params, table_ref=True):
"""
Create portion of SELECT statement for attributes involved in the query.
Defaults to order in the params
@param params: dict of information used in query (column names,
table name, etc.)
Example:
OrderedDict([('numerator', 'price'),
('denominator', 'sq_meters'),
('subquery', 'SELECT * FROM interesting_data')])
Output:
"i.\"price\"::numeric As attr1, " \
"i.\"sq_meters\"::numeric As attr2, "
"""
attr_string = ""
template = "\"%(col)s\"::numeric As attr%(alias_num)s, "
if table_ref:
template = "i." + template
if ('time_cols' in params) or ('ind_vars' in params):
# if markov or gwr analysis
attrs = (params['time_cols'] if 'time_cols' in params
else params['ind_vars'])
if 'ind_vars' in params:
template = "array_agg(\"%(col)s\"::numeric) As attr%(alias_num)s, "
for idx, val in enumerate(attrs):
attr_string += template % {"col": val, "alias_num": idx + 1}
else:
# if moran's analysis
attrs = [k for k in params
if k not in ('id_col', 'geom_col', 'subquery',
'num_ngbrs', 'subquery')]
for idx, val in enumerate(attrs):
attr_string += template % {"col": params[val],
"alias_num": idx + 1}
return attr_string
def query_attr_where(params, table_ref=True):
"""
Construct where conditions when building neighbors query
Create portion of WHERE clauses for weeding out NULL-valued geometries
Input: dict of params:
{'subquery': ...,
'numerator': 'data1',
'denominator': 'data2',
'': ...}
Output:
'idx_replace."data1" IS NOT NULL AND idx_replace."data2" IS NOT NULL'
Input:
{'subquery': ...,
'time_cols': ['time1', 'time2', 'time3'],
'etc': ...}
Output: 'idx_replace."time1" IS NOT NULL AND idx_replace."time2" IS NOT
NULL AND idx_replace."time3" IS NOT NULL'
"""
attr_string = []
template = "\"%s\" IS NOT NULL"
if table_ref:
template = "idx_replace." + template
if ('time_cols' in params) or ('ind_vars' in params):
# markov or gwr where clauses
attrs = (params['time_cols'] if 'time_cols' in params
else params['ind_vars'])
# add values to template
for attr in attrs:
attr_string.append(template % attr)
else:
# moran where clauses
# get keys
attrs = [k for k in params
if k not in ('id_col', 'geom_col', 'subquery',
'num_ngbrs', 'subquery')]
# add values to template
for attr in attrs:
attr_string.append(template % params[attr])
if 'denominator' in attrs:
attr_string.append(
"idx_replace.\"%s\" <> 0" % params['denominator'])
out = " AND ".join(attr_string)
return out
def knn(params):
"""SQL query for k-nearest neighbors.
@param vars: dict of values to fill template
"""
attr_select = query_attr_select(params, table_ref=True)
attr_where = query_attr_where(params, table_ref=True)
replacements = {"attr_select": attr_select,
"attr_where_i": attr_where.replace("idx_replace", "i"),
"attr_where_j": attr_where.replace("idx_replace", "j")}
query = '''
SELECT
i."{id_col}" As id,
%(attr_select)s
(SELECT ARRAY(SELECT j."{id_col}"
FROM ({subquery}) As j
WHERE i."{id_col}" <> j."{id_col}" AND
%(attr_where_j)s AND
j."{geom_col}" IS NOT NULL
ORDER BY j."{geom_col}" <-> i."{geom_col}" ASC
LIMIT {num_ngbrs})) As neighbors
FROM ({subquery}) As i
WHERE %(attr_where_i)s AND i."{geom_col}" IS NOT NULL
ORDER BY i."{id_col}" ASC;
''' % replacements
return query.format(**params)
# SQL query for finding queens neighbors (all contiguous polygons)
def queen(params):
"""SQL query for queen neighbors.
@param params dict: information to fill query
"""
attr_select = query_attr_select(params)
attr_where = query_attr_where(params)
replacements = {"attr_select": attr_select,
"attr_where_i": attr_where.replace("idx_replace", "i"),
"attr_where_j": attr_where.replace("idx_replace", "j")}
query = '''
SELECT
i."{id_col}" As id,
%(attr_select)s
(SELECT ARRAY(SELECT j."{id_col}"
FROM ({subquery}) As j
WHERE i."{id_col}" <> j."{id_col}" AND
ST_Touches(i."{geom_col}", j."{geom_col}") AND
%(attr_where_j)s)) As neighbors
FROM ({subquery}) As i
WHERE
%(attr_where_i)s
ORDER BY i."{id_col}" ASC;
''' % replacements
return query.format(**params)
def gwr_query(params):
"""
GWR query
"""
replacements = {"ind_vars_select": query_attr_select(params,
table_ref=None),
"ind_vars_where": query_attr_where(params,
table_ref=None)}
query = '''
SELECT
array_agg(ST_X(ST_Centroid("{geom_col}"))) As x,
array_agg(ST_Y(ST_Centroid("{geom_col}"))) As y,
array_agg("{dep_var}") As dep_var,
%(ind_vars_select)s
array_agg("{id_col}") As rowid
FROM ({subquery}) As q
WHERE
"{dep_var}" IS NOT NULL AND
%(ind_vars_where)s
''' % replacements
return query.format(**params).strip()
def gwr_predict_query(params):
"""
GWR query
"""
replacements = {"ind_vars_select": query_attr_select(params,
table_ref=None),
"ind_vars_where": query_attr_where(params,
table_ref=None)}
query = '''
SELECT
array_agg(ST_X(ST_Centroid({geom_col}))) As x,
array_agg(ST_Y(ST_Centroid({geom_col}))) As y,
array_agg({dep_var}) As dep_var,
%(ind_vars_select)s
array_agg({id_col}) As rowid
FROM ({subquery}) As q
WHERE
%(ind_vars_where)s
''' % replacements
return query.format(**params).strip()
# to add more weight methods open a ticket or pull request
def get_attributes(query_res, attr_num=1):
"""
@param query_res: query results with attributes and neighbors
@param attr_num: attribute number (1, 2, ...)
"""
return np.array([x['attr' + str(attr_num)] for x in query_res],
dtype=np.float)
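The query builders return plain SQL strings, so they can be eyeballed without touching a database; a small sketch, assuming an environment where the crankshaft package imports cleanly (inside plpython, or with the test suite's plpy stub on the path) and with placeholder subquery and column names:

from collections import OrderedDict
import crankshaft.pysal_utils as pu

params = OrderedDict([('id_col', 'cartodb_id'),
                      ('attr1', 'price'),
                      ('geom_col', 'the_geom'),
                      ('subquery', 'SELECT * FROM interesting_data'),
                      ('num_ngbrs', 5)])
print(pu.construct_neighbor_query('knn', params))    # kNN query (ORDER BY <-> ... LIMIT)
print(pu.construct_neighbor_query('queen', params))  # contiguity query (ST_Touches)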

View File

@@ -0,0 +1,12 @@
"""Random seed generator used for non-deterministic functions in crankshaft"""
import random
import numpy
def set_random_seeds(value):
"""
Set the seeds of the RNGs (Random Number Generators)
used internally.
"""
random.seed(value)
numpy.random.seed(value)

View File

@@ -0,0 +1,3 @@
from crankshaft.regression.gwr import *
from crankshaft.regression.glm import *
from crankshaft.regression.gwr_cs import *

Some files were not shown because too many files have changed in this diff.