Compare commits
11 Commits
model-stor … 0.8.2
| Author | SHA1 | Date |
|---|---|---|
| | dcf4e4beb5 | |
| | cfebe6e854 | |
| | bce2fc5f1f | |
| | 54646acc56 | |
| | 698f8e1a60 | |
| | c86b989527 | |
| | a24d7861d8 | |
| | 3e660a4d5f | |
| | 7c43596512 | |
| | 19397eec5b | |
| | 87c3fa8fc5 | |
1  .gitignore (vendored)
@@ -2,4 +2,3 @@ envs/
*.pyc
.DS_Store
.idea/
.*.sw[nop]
68  .travis.yml
@@ -1,60 +1,40 @@
dist: xenial
language: c
dist: precise
sudo: required

env:
  global:
    - PAGER=cat
    - PGUSER=postgres
    - PGDATABASE=postgres
    - PGOPTIONS='-c client_min_messages=NOTICE'
    - PGPORT=5432
    - POSTGIS_VERSION="2.5"

  matrix:
    - POSTGRESQL_VERSION="10"
    - POSTGRESQL_VERSION="11"

before_install:
  - ./check-up-to-date-with-master.sh
  - sudo apt-get -y install python-pip

  - sudo apt-get -y install python-software-properties
  - sudo add-apt-repository -y ppa:cartodb/sci
  - sudo add-apt-repository -y ppa:cartodb/postgresql-9.5
  - sudo add-apt-repository -y ppa:cartodb/gis
  - sudo add-apt-repository -y ppa:cartodb/gis-testing
  - sudo apt-get update
  - sudo service postgresql stop;
  - sudo apt-get remove postgresql* -y
  - sudo apt-get install -y --allow-unauthenticated --no-install-recommends --no-install-suggests postgresql-$POSTGRESQL_VERSION postgresql-client-$POSTGRESQL_VERSION postgresql-server-dev-$POSTGRESQL_VERSION postgresql-common
  - sudo apt-get install -y --allow-unauthenticated postgresql-$POSTGRESQL_VERSION-postgis-$POSTGIS_VERSION postgresql-$POSTGRESQL_VERSION-postgis-$POSTGIS_VERSION-scripts postgis postgresql-plpython-$POSTGRESQL_VERSION
  - sudo pg_dropcluster --stop $POSTGRESQL_VERSION main
  - sudo rm -rf /etc/postgresql/$POSTGRESQL_VERSION /var/lib/postgresql/$POSTGRESQL_VERSION
  - sudo pg_createcluster -u postgres $POSTGRESQL_VERSION main -- -A trust
  - sudo /etc/init.d/postgresql start $POSTGRESQL_VERSION || sudo journalctl -xe

  - sudo apt-get -y install python-joblib=0.8.3-1-cdb1
  - sudo apt-get -y install python-numpy=1:1.6.1-6ubuntu1

  - sudo apt-get -y install python-scipy=0.14.0-2-cdb6
  - sudo apt-get -y --no-install-recommends install python-sklearn-lib=0.14.1-3-cdb2
  - sudo apt-get -y --no-install-recommends install python-sklearn=0.14.1-3-cdb2
  - sudo apt-get -y --no-install-recommends install python-scikits-learn=0.14.1-3-cdb2

  # Force installation of libgeos-3.5.0 (presumably needed because of existing version of postgis)
  - sudo apt-get -y install libgeos-3.5.0=3.5.0-1cdb2

  # Install postgres db and build deps
  - sudo /etc/init.d/postgresql stop # stop travis default instance
  - sudo apt-get -y remove --purge postgresql-9.1
  - sudo apt-get -y remove --purge postgresql-9.2
  - sudo apt-get -y remove --purge postgresql-9.3
  - sudo apt-get -y remove --purge postgresql-9.4
  - sudo apt-get -y remove --purge postgresql-9.5
  - sudo rm -rf /var/lib/postgresql/
  - sudo rm -rf /var/log/postgresql/
  - sudo rm -rf /etc/postgresql/
  - sudo apt-get -y remove --purge postgis-2.2
  - sudo apt-get -y autoremove

  - sudo apt-get -y install postgresql-9.5=9.5.2-3cdb3
  - sudo apt-get -y install postgresql-server-dev-9.5=9.5.2-3cdb3
  - sudo apt-get -y install postgresql-plpython-9.5=9.5.2-3cdb3
  - sudo apt-get -y install postgresql-9.5-postgis-scripts=2.2.2.0-cdb2
  - sudo apt-get -y install postgresql-9.5-postgis-2.2=2.2.2.0-cdb2

  # configure it to accept local connections from postgres
  - echo -e "# TYPE DATABASE USER ADDRESS METHOD \nlocal all postgres trust\nlocal all all trust\nhost all all 127.0.0.1/32 trust" \
    | sudo tee /etc/postgresql/9.5/main/pg_hba.conf
  - sudo /etc/init.d/postgresql restart 9.5
  - sudo apt-get -y install python-pip python-software-properties python-joblib python-nose

install:
  - sudo make install

script:
  - make test || { cat src/pg/test/regression.diffs; false; }
  - make test
  - ./check-compatibility.sh

after_failure:
  - pg_lsclusters
  - cat src/pg/test/regression.diffs
6  NEWS.md
@@ -1,3 +1,9 @@
0.8.2 (2019-02-07)
------------------
* Update dependencies to match what is being used in production.
* Update travis to xenial, PG 10 and 11, and PostGIS 2.5
* Compatibility with PG11

0.8.1 (2018-03-12)
------------------
* Adds improperly added version files
20  carto-package.json (new file)
@@ -0,0 +1,20 @@
{
  "name": "crankshaft",
  "current_version": {
    "requires": {
      "postgres": ">=9.5.0",
      "postgis": ">=2.2.0.0",
      "python": ">=2.7.0",
      "joblib": "0.8.3",
      "numpy": "1.6.1",
      "scipy": "0.14.0",
      "pysal": "1.14.3",
      "scikit-learn": "0.14.1"
    },
    "works_with": {
    }
  },

  "exceptional_versions": {
  }
}
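This manifest pins the runtime stack the extension expects. Since it is plain JSON, it can be checked mechanically; a minimal sketch (the `check_requires` helper is hypothetical, not part of the repo, and assumes setuptools' `pkg_resources` is importable):

```python
import json

import pkg_resources  # from setuptools; an assumption, not a repo dependency


def check_requires(path='carto-package.json'):
    """Hypothetical helper: compare manifest pins with installed packages."""
    with open(path) as fh:
        requires = json.load(fh)['current_version']['requires']
    for name, pin in requires.items():
        if name in ('postgres', 'postgis', 'python'):
            continue  # server-side requirements, not pip-installable packages
        try:
            installed = pkg_resources.get_distribution(name).version
            print('{}: pinned {}, installed {}'.format(name, pin, installed))
        except pkg_resources.DistributionNotFound:
            print('{}: pinned {}, not installed'.format(name, pin))
```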
@@ -27,7 +27,7 @@ psql -c "SELECT * FROM pg_available_extension_versions WHERE name LIKE 'cranksha
psql $DBNAME <<'EOF'
-- Install dependencies
CREATE EXTENSION plpythonu;
CREATE EXTENSION postgis VERSION '2.2.2';
CREATE EXTENSION postgis;

-- Create role publicuser if it does not exist
DO
@@ -48,26 +48,49 @@ CREATE EXTENSION crankshaft;
\dx
EOF

# Check PG version
PG_VERSION=`psql -q -t -c "SELECT current_setting('server_version_num')"`

# Save public function signatures
psql $DBNAME <<'EOF'
CREATE TABLE release_function_signatures AS
SELECT
    p.proname as name,
    pg_catalog.pg_get_function_result(p.oid) as result_type,
    pg_catalog.pg_get_function_arguments(p.oid) as arguments,
    CASE
        WHEN p.proisagg THEN 'agg'
        WHEN p.proiswindow THEN 'window'
        WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
        ELSE 'normal'
    END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
    n.nspname = 'cdb_crankshaft'
    AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;
EOF
if [[ "$PG_VERSION" -lt 110000 ]]; then
    psql $DBNAME -c "
    CREATE TABLE release_function_signatures AS
    SELECT
        p.proname as name,
        pg_catalog.pg_get_function_result(p.oid) as result_type,
        pg_catalog.pg_get_function_arguments(p.oid) as arguments,
        CASE
            WHEN p.proisagg THEN 'agg'
            WHEN p.proiswindow THEN 'window'
            WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
            ELSE 'normal'
        END as type
    FROM pg_catalog.pg_proc p
    LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
    WHERE
        n.nspname = 'cdb_crankshaft'
        AND p.proname LIKE 'cdb_%'
    ORDER BY 1, 2, 4;"
else
    psql $DBNAME -c "
    CREATE TABLE release_function_signatures AS
    SELECT
        p.proname as name,
        pg_catalog.pg_get_function_result(p.oid) as result_type,
        pg_catalog.pg_get_function_arguments(p.oid) as arguments,
        CASE WHEN p.prokind = 'a' THEN 'agg'
            WHEN p.prokind = 'w' THEN 'window'
            WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
            ELSE 'normal'
        END as type
    FROM pg_catalog.pg_proc p
    LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
    WHERE
        n.nspname = 'cdb_crankshaft'
        AND p.proname LIKE 'cdb_%'
    ORDER BY 1, 2, 4;"
fi

# Deploy current dev branch
make clean-dev || die "Could not clean dev files"
@@ -76,26 +99,42 @@ sudo make install || die "Could not deploy current dev branch"
# Check it can be upgraded
psql $DBNAME -c "ALTER EXTENSION crankshaft update to 'dev';" || die "Cannot upgrade to dev version"

# Check against saved public function signatures
psql $DBNAME <<'EOF'
CREATE TABLE dev_function_signatures AS
SELECT
    p.proname as name,
    pg_catalog.pg_get_function_result(p.oid) as result_type,
    pg_catalog.pg_get_function_arguments(p.oid) as arguments,
    CASE
        WHEN p.proisagg THEN 'agg'
        WHEN p.proiswindow THEN 'window'
        WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
        ELSE 'normal'
    END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
    n.nspname = 'cdb_crankshaft'
    AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;
EOF
if [[ $PG_VERSION -lt 110000 ]]; then
    psql $DBNAME -c "
    CREATE TABLE dev_function_signatures AS
    SELECT p.proname as name,
        pg_catalog.pg_get_function_result(p.oid) as result_type,
        pg_catalog.pg_get_function_arguments(p.oid) as arguments,
        CASE WHEN p.proisagg THEN 'agg'
            WHEN p.proiswindow THEN 'window'
            WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
            ELSE 'normal'
        END as type
    FROM pg_catalog.pg_proc p
    LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
    WHERE
        n.nspname = 'cdb_crankshaft'
        AND p.proname LIKE 'cdb_%'
    ORDER BY 1, 2, 4;"
else
    psql $DBNAME -c "
    CREATE TABLE dev_function_signatures AS
    SELECT p.proname as name,
        pg_catalog.pg_get_function_result(p.oid) as result_type,
        pg_catalog.pg_get_function_arguments(p.oid) as arguments,
        CASE WHEN p.prokind = 'a' THEN 'agg'
            WHEN p.prokind = 'w' THEN 'window'
            WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
            ELSE 'normal'
        END as type
    FROM pg_catalog.pg_proc p
    LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
    WHERE
        n.nspname = 'cdb_crankshaft'
        AND p.proname LIKE 'cdb_%'
    ORDER BY 1, 2, 4;"
fi

echo "Functions in development not in latest release (ok):"
psql $DBNAME -c "SELECT * FROM dev_function_signatures EXCEPT SELECT * FROM release_function_signatures;"
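The duplicated query above exists because PostgreSQL 11 replaced pg_proc's boolean `proisagg`/`proiswindow` columns with a single `prokind` column, so the catalog query has to branch on `server_version_num`. A minimal sketch of the same branch from Python (psycopg2 and the connection settings are assumptions, not part of this script):

```python
import psycopg2  # an assumption; any PostgreSQL driver works the same way

conn = psycopg2.connect(dbname='postgres')
cur = conn.cursor()
cur.execute("SELECT current_setting('server_version_num')::int")
pg_version = cur.fetchone()[0]

if pg_version < 110000:
    # PostgreSQL < 11: pg_proc has boolean proisagg/proiswindow columns
    kind = ("CASE WHEN p.proisagg THEN 'agg' "
            "WHEN p.proiswindow THEN 'window' ELSE 'normal' END")
else:
    # PostgreSQL >= 11: both flags were folded into the prokind column
    kind = ("CASE WHEN p.prokind = 'a' THEN 'agg' "
            "WHEN p.prokind = 'w' THEN 'window' ELSE 'normal' END")

# No parameters are passed, so the literal % in LIKE needs no escaping
cur.execute("""
    SELECT p.proname, {} AS type
    FROM pg_catalog.pg_proc p
    JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
    WHERE n.nspname = 'cdb_crankshaft' AND p.proname LIKE 'cdb_%'
""".format(kind))
print(cur.fetchall())
```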
@@ -4,7 +4,7 @@
-- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$
SELECT '0.9.0'::text;
SELECT '0.8.2'::text;
$$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE;

-- Internal identifier of the installed extension instance
@@ -35,25 +35,16 @@ CREATE OR REPLACE FUNCTION
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;

-- Create aggregate if it did not exist
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT *
        FROM pg_catalog.pg_proc p
        LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
        WHERE n.nspname = 'cdb_crankshaft'
            AND p.proname = 'cdb_pyagg'
            AND p.proisagg)
    THEN
        CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
            SFUNC = CDB_PyAggS,
            STYPE = Numeric[],
            PARALLEL = SAFE,
            INITCOND = "{}"
        );
    END IF;
END
$$ LANGUAGE plpgsql;
DO $$ BEGIN
    CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
        SFUNC = CDB_PyAggS,
        STYPE = Numeric[],
        PARALLEL = SAFE,
        INITCOND = "{}"
    );
EXCEPTION
    WHEN duplicate_function THEN NULL;
END $$;

CREATE OR REPLACE FUNCTION
    CDB_CreateAndPredictSegment(
@@ -98,7 +89,6 @@ CREATE OR REPLACE FUNCTION
    query TEXT,
    variable_name TEXT,
    target_table TEXT,
    model_name text DEFAULT NULL,
    n_estimators INTEGER DEFAULT 1200,
    max_depth INTEGER DEFAULT 3,
    subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -115,59 +105,24 @@ AS $$
    'learning_rate': learning_rate,
    'min_samples_leaf': min_samples_leaf
}
all_cols = list(plpy.execute('''
feature_cols = set(plpy.execute('''
    select * from ({query}) as _w limit 0
'''.format(query=query)).colnames())
feature_cols = [a for a in all_cols
                if a not in [variable_name, 'cartodb_id', ]]
'''.format(query=query)).colnames()) - set([variable_name, 'cartodb_id', ])
return seg.create_and_predict_segment(
    query,
    variable_name,
    feature_cols,
    target_table,
    model_params,
    model_name=model_name
    model_params
)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;

CREATE OR REPLACE FUNCTION
    CDB_RetrieveModelParams(
    model_name text,
    param_name text
)
RETURNS TABLE(param numeric, feature_name text) AS $$

import pickle
from collections import Iterable

plan = plpy.prepare('''
    SELECT model, feature_names FROM model_storage
    WHERE name = $1;
''', ['text', ])

try:
    model_encoded = plpy.execute(plan, [model_name, ])
except plpy.SPIError as err:
    plpy.error('ERROR: {}'.format(err))
plpy.notice(model_encoded[0]['feature_names'])
model = pickle.loads(
    model_encoded[0]['model']
)

res = getattr(model, param_name)
if not isinstance(res, Iterable):
    raise Exception('Cannot return `{}` as a table'.format(param_name))
return zip(res, model_encoded[0]['feature_names'])

$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;

CREATE OR REPLACE FUNCTION
    CDB_CreateAndPredictSegment(
    query TEXT,
    variable TEXT,
    feature_columns TEXT[],
    target_query TEXT,
    model_name TEXT DEFAULT NULL,
    n_estimators INTEGER DEFAULT 1200,
    max_depth INTEGER DEFAULT 3,
    subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -189,8 +144,7 @@ AS $$
    variable,
    feature_columns,
    target_query,
    model_params,
    model_name=model_name
    model_params
)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_Gravity(
@@ -1150,27 +1104,19 @@ BEGIN
END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;


-- Create aggregate if it did not exist
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT *
        FROM pg_catalog.pg_proc p
        LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
        WHERE n.nspname = 'cdb_crankshaft'
            AND p.proname = 'cdb_weightedmean'
            AND p.proisagg)
    THEN
        CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
            SFUNC = CDB_WeightedMeanS,
            FINALFUNC = CDB_WeightedMeanF,
            STYPE = Numeric[],
            PARALLEL = SAFE,
            INITCOND = "{0.0,0.0,0.0}"
        );
    END IF;
END
$$ LANGUAGE plpgsql;
DO $$ BEGIN
    CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
        SFUNC = CDB_WeightedMeanS,
        FINALFUNC = CDB_WeightedMeanF,
        STYPE = Numeric[],
        PARALLEL = SAFE,
        INITCOND = "{0.0,0.0,0.0}"
    );
EXCEPTION
    WHEN duplicate_function THEN NULL;
END $$;
-- Spatial Markov

-- input table format:
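One behavioral detail of the `feature_cols` refactor shown above: the old list comprehension preserved the column order returned by the query, while the new set difference has the same membership but no defined order, since Python sets are unordered. A tiny standalone illustration:

```python
all_cols = ['cartodb_id', 'price', 'rooms', 'area']  # invented column names

# old style: keeps the query's column order
feature_cols_list = [a for a in all_cols
                     if a not in ['price', 'cartodb_id']]

# new style: same membership, arbitrary iteration order
feature_cols_set = set(all_cols) - set(['price', 'cartodb_id'])

assert set(feature_cols_list) == feature_cols_set
print(feature_cols_list)          # ['rooms', 'area'] -- ordered
print(sorted(feature_cols_set))   # order must be imposed explicitly
```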
@@ -4,7 +4,7 @@
-- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$
SELECT '0.9.1'::text;
SELECT '0.8.2'::text;
$$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE;

-- Internal identifier of the installed extension instance
@@ -35,25 +35,16 @@ CREATE OR REPLACE FUNCTION
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;

-- Create aggregate if it did not exist
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT *
        FROM pg_catalog.pg_proc p
        LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
        WHERE n.nspname = 'cdb_crankshaft'
            AND p.proname = 'cdb_pyagg'
            AND p.proisagg)
    THEN
        CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
            SFUNC = CDB_PyAggS,
            STYPE = Numeric[],
            PARALLEL = SAFE,
            INITCOND = "{}"
        );
    END IF;
END
$$ LANGUAGE plpgsql;
DO $$ BEGIN
    CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
        SFUNC = CDB_PyAggS,
        STYPE = Numeric[],
        PARALLEL = SAFE,
        INITCOND = "{}"
    );
EXCEPTION
    WHEN duplicate_function THEN NULL;
END $$;

CREATE OR REPLACE FUNCTION
    CDB_CreateAndPredictSegment(
@@ -98,7 +89,6 @@ CREATE OR REPLACE FUNCTION
    query TEXT,
    variable_name TEXT,
    target_table TEXT,
    model_name text DEFAULT NULL,
    n_estimators INTEGER DEFAULT 1200,
    max_depth INTEGER DEFAULT 3,
    subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -115,59 +105,24 @@ AS $$
    'learning_rate': learning_rate,
    'min_samples_leaf': min_samples_leaf
}
all_cols = list(plpy.execute('''
feature_cols = set(plpy.execute('''
    select * from ({query}) as _w limit 0
'''.format(query=query)).colnames())
feature_cols = [a for a in all_cols
                if a not in [variable_name, 'cartodb_id', ]]
'''.format(query=query)).colnames()) - set([variable_name, 'cartodb_id', ])
return seg.create_and_predict_segment(
    query,
    variable_name,
    feature_cols,
    target_table,
    model_params,
    model_name=model_name
    model_params
)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;

CREATE OR REPLACE FUNCTION
    CDB_RetrieveModelParams(
    model_name text,
    param_name text
)
RETURNS TABLE(param numeric, feature_name text) AS $$

import pickle
from collections import Iterable

plan = plpy.prepare('''
    SELECT model, feature_names FROM model_storage
    WHERE name = $1;
''', ['text', ])

try:
    model_encoded = plpy.execute(plan, [model_name, ])
except plpy.SPIError as err:
    plpy.error('ERROR: {}'.format(err))
plpy.notice(model_encoded[0]['feature_names'])
model = pickle.loads(
    model_encoded[0]['model']
)

res = getattr(model, param_name)
if not isinstance(res, Iterable):
    raise Exception('Cannot return `{}` as a table'.format(param_name))
return zip(res, model_encoded[0]['feature_names'])

$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;

CREATE OR REPLACE FUNCTION
    CDB_CreateAndPredictSegment(
    query TEXT,
    variable TEXT,
    feature_columns TEXT[],
    target_query TEXT,
    model_name TEXT DEFAULT NULL,
    n_estimators INTEGER DEFAULT 1200,
    max_depth INTEGER DEFAULT 3,
    subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -189,8 +144,7 @@ AS $$
    variable,
    feature_columns,
    target_query,
    model_params,
    model_name=model_name
    model_params
)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_Gravity(
@@ -1150,27 +1104,19 @@ BEGIN
END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;


-- Create aggregate if it did not exist
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT *
        FROM pg_catalog.pg_proc p
        LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
        WHERE n.nspname = 'cdb_crankshaft'
            AND p.proname = 'cdb_weightedmean'
            AND p.proisagg)
    THEN
        CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
            SFUNC = CDB_WeightedMeanS,
            FINALFUNC = CDB_WeightedMeanF,
            STYPE = Numeric[],
            PARALLEL = SAFE,
            INITCOND = "{0.0,0.0,0.0}"
        );
    END IF;
END
$$ LANGUAGE plpgsql;
DO $$ BEGIN
    CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
        SFUNC = CDB_WeightedMeanS,
        FINALFUNC = CDB_WeightedMeanF,
        STYPE = Numeric[],
        PARALLEL = SAFE,
        INITCOND = "{0.0,0.0,0.0}"
    );
EXCEPTION
    WHEN duplicate_function THEN NULL;
END $$;
-- Spatial Markov

-- input table format:
10 file diffs suppressed because they are too large.
@@ -1,5 +1,5 @@
comment = 'CartoDB Spatial Analysis extension'
default_version = '0.9.5'
default_version = '0.8.2'
requires = 'plpythonu, postgis'
superuser = true
schema = cdb_crankshaft
Binary file not shown.
Binary file not shown.
@@ -2,14 +2,11 @@
Segmentation creation and prediction
"""

import pickle
import plpy
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from crankshaft.analysis_data_provider import AnalysisDataProvider
from crankshaft import model_storage

# NOTE: added optional param here

@@ -54,7 +51,6 @@ class Segmentation(object):

    def create_and_predict_segment(self, query, variable, feature_columns,
                                   target_query, model_params,
                                   model_name=None,
                                   id_col='cartodb_id'):
        """
        generate a segment with machine learning
@@ -74,23 +70,15 @@ class Segmentation(object):
        (target, features, target_mean,
            feature_means) = self.clean_data(query, variable, feature_columns)

        model_storage.create_model_table()

        # find model if it exists and is specified
        if model_name is not None:
            model = model_storage.get_model(model_name)

        if locals().get('model') is None:
            model, accuracy = train_model(target, features, model_params, 0.2)

        model, accuracy = train_model(target, features, model_params, 0.2)
        result = self.predict_segment(model, feature_columns, target_query,
                                      feature_means)
        accuracy_array = [accuracy] * result.shape[0]

        rowid = self.data_provider.get_segmentation_data(params)

        # store the model for later use
        model_storage.set_model(model, model_name, feature_columns)
        '''
        rowid = [{'ids': [2.9, 4.9, 4, 5, 6]}]
        '''
        return zip(rowid[0]['ids'], result, accuracy_array)

    def predict_segment(self, model, feature_columns, target_query,
5  release/python/0.8.2/crankshaft/requirements.txt (new file)
@@ -0,0 +1,5 @@
joblib==0.9.4
numpy==1.11.0
scipy==0.17.0
pysal==1.14.3
scikit-learn==0.17.0
@@ -10,7 +10,7 @@ from setuptools import setup, find_packages
setup(
    name='crankshaft',

    version='0.0.0',
    version='0.8.2',

    description='CartoDB Spatial Analysis Python Library',

@@ -41,7 +41,7 @@ setup(
    # The choice of component versions is dictated by what's
    # provisioned in the production servers.
    # IMPORTANT NOTE: please don't change this line. Instead issue a ticket to systems for evaluation.
    install_requires=['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.14.3', 'scikit-learn==0.14.1', 'petname==2.2'],
    install_requires=['joblib==0.9.4', 'numpy==1.11.0', 'scipy==0.17.0', 'pysal==1.14.3', 'scikit-learn==0.17.0'],

    requires=['pysal', 'numpy', 'sklearn'],
@@ -1,76 +0,0 @@
"""
Based on the Weiszfeld algorithm:
https://en.wikipedia.org/wiki/Geometric_median
"""

# import plpy
import numpy as np
from numpy.linalg import norm


def median_center(tablename, geom_col, num_iters=50, tolerance=0.001):

    query = '''
        SELECT array_agg(ST_X({geom_col})) As x_coords,
               array_agg(ST_Y({geom_col})) As y_coords
        FROM {tablename}
    '''.format(geom_col=geom_col, tablename=tablename)

    try:
        resp = plpy.execute(query)
        data = np.vstack((resp['x_coords'][0],
                          resp['y_coords'][0])).T
        plpy.notice('coords: %s' % str(coords))
    except Exception, err:
        # plpy.error('Analysis failed: %s' % err)
        print('No plpy')
        data = np.array([[1.2 * np.random.random() + 10.,
                          1.1 * (np.random.random() - 1.) + 3.]
                         for i in range(1, 100)])

    # initialize 'median center' to be the mean
    coords_center_temp = data.mean(axis=0)

    # plpy.notice('temp_center: %s' % str(coords_center_temp))
    print('temp_center: %s' % str(coords_center_temp))

    for i in range(0, num_iters):
        old_coords_center = coords_center_temp.copy()
        denom = denominator(coords_center_temp, data)
        coords_center_temp = np.sum([data[j] * numerator(coords_center_temp,
                                                         data[j])
                                     for j in range(len(data))], axis=0)
        coords_center_temp = coords_center_temp / denom

        print("Pass #%d" % i)
        print("max, min of data: %0.4f, %0.4f" % (data.max(), data.min()))
        print('temp_center: %s' % str(coords_center_temp))
        print("Change in center: %0.4f" % np.linalg.norm(old_coords_center -
                                                         coords_center_temp))
        print("Center coords: %s" % str(coords_center_temp))
        print("Objective Function: %0.4f" % obj_func(coords_center_temp, data))

    return coords_center_temp


def obj_func(center_coords, data):
    """
    """
    return np.linalg.norm(center_coords - data)


def numerator(center_coords, data_i):
    """
    """
    return np.reciprocal(np.linalg.norm(center_coords - data_i))


def denominator(center_coords, data):
    """
    """
    return np.reciprocal(np.linalg.norm(data - center_coords))
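For reference, the update this deleted module was approximating is the Weiszfeld iteration for the geometric median of points $p_1, \dots, p_n$:

$$
x_{k+1} = \frac{\sum_{i=1}^{n} p_i / \lVert x_k - p_i \rVert}{\sum_{i=1}^{n} 1 / \lVert x_k - p_i \rVert}
$$

Note that the deleted `denominator()` takes the reciprocal of a single norm over the whole data matrix rather than summing per-point reciprocals as the formula requires, so the loop above does not actually implement this update; that may be one reason the module never shipped.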
@@ -1 +0,0 @@
from core import set_model, get_model, create_model_table
@@ -1,86 +0,0 @@
import time
import plpy
import pickle
from petname import generate


def create_model_table():
    q = '''
        create table if not exists model_storage(
            description text,
            name text unique,
            model bytea,
            feature_names text[],
            date_created timestamptz,
            id serial primary key);
    '''
    plpy.notice(q)
    plan = plpy.prepare(q)
    resp = plpy.execute(plan)
    plpy.notice('Model table successfully created')
    plpy.notice(str(resp))


def get_model(model_name):
    """retrieve model if it exists"""
    try:
        plan = plpy.prepare('''
            SELECT model FROM model_storage
            WHERE name = $1;
        ''', ['text', ])
        model_encoded = plpy.execute(plan, [model_name, ])
        if len(model_encoded) == 1:
            model = pickle.loads(
                model_encoded[0]['model']
            )
            plpy.notice('Model successfully loaded')
        else:
            plpy.notice('Model not found, or too many models '
                        '({})'.format(len(model_encoded)))
            model = None
    except plpy.SPIError as err:
        plpy.error('ERROR: {}'.format(err))

    return model


def set_model(model, model_name, feature_names):
    """stores the model in the table model_storage"""
    if model_name is None:
        model_name = generate(words=2, separator='_', letters=8)
        existing_names = plpy.execute('''
            SELECT array_agg(name) as name
            FROM model_storage
        ''')
        plpy.notice('nrows: {}'.format(existing_names.nrows()))
        plpy.notice('MODEL NAME: {}'.format(model_name))
        plpy.notice('LEN of ms: {}'.format(len(existing_names)))
        plpy.notice('existing_names: {}'.format(str(existing_names)))
        plpy.notice('existing_names: {}'.format(str(existing_names[0]['name'])))
        plpy.notice('type existing_names: {}'.format(type(existing_names[0]['name'])))
        if existing_names[0]['name'] is not None:
            while model_name in existing_names[0]['name']:
                model_name = generate(words=2, separator='_', letters=10)
                plpy.notice(model_name)

    # store model
    try:
        plan = plpy.prepare('''
            INSERT INTO model_storage(description, name, model, feature_names, date_created)
            VALUES (
                $1,
                $2,
                $3,
                $4::text[],
                to_timestamp($5));
        ''', ['text', 'text', 'bytea', 'text', 'numeric'])
        plpy.notice('{%s}' % ','.join(feature_names))
        plpy.notice(feature_names)
        plpy.execute(
            plan,
            [' '.join(m.strip() for m in model.__repr__().split('\n')),
             model_name,
             pickle.dumps(model),
             '{%s}' % ','.join(feature_names),
             time.time()]
        )
        plpy.notice('model successfully stored as {}'.format(model_name))
    except plpy.SPIError as err:
        plpy.notice('ERROR: {}\nt: {}'.format(err, time.time()))
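The storage format above is simply a pickled estimator in a bytea column. A self-contained sketch of the same round trip outside the database (scikit-learn is assumed here; any picklable model works):

```python
import pickle

from sklearn.ensemble import GradientBoostingRegressor

# Train a toy model, serialize it the way set_model() does, then restore
# it the way get_model() does; in crankshaft the bytes live in a bytea column.
model = GradientBoostingRegressor(n_estimators=10).fit(
    [[0.0], [1.0], [2.0], [3.0]],
    [0.0, 1.0, 2.0, 3.0],
)

blob = pickle.dumps(model)     # what the INSERT stores as $3
restored = pickle.loads(blob)  # what get_model() hands back to the caller

print(restored.predict([[1.5]]))
```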
@@ -1,5 +0,0 @@
joblib==0.8.3
numpy==1.6.1
scipy==0.14.0
pysal==1.14.3
scikit-learn==0.14.1
@@ -1,49 +0,0 @@

"""
CartoDB Spatial Analysis Python Library
See:
https://github.com/CartoDB/crankshaft
"""

from setuptools import setup, find_packages

setup(
    name='crankshaft',

    version='0.0.0',

    description='CartoDB Spatial Analysis Python Library',

    url='https://github.com/CartoDB/crankshaft',

    author='Data Services Team - CartoDB',
    author_email='dataservices@cartodb.com',

    license='MIT',

    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Mapping community',
        'Topic :: Maps :: Mapping Tools',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 2.7',
    ],

    keywords='maps mapping tools spatial analysis geostatistics',

    packages=find_packages(exclude=['contrib', 'docs', 'tests']),

    extras_require={
        'dev': ['unittest'],
        'test': ['unittest', 'nose', 'mock'],
    },

    # The choice of component versions is dictated by what's
    # provisioned in the production servers.
    # IMPORTANT NOTE: please don't change this line. Instead issue a ticket to systems for evaluation.
    install_requires=['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.14.3', 'scikit-learn==0.14.1', 'petname==2.2'],

    requires=['pysal', 'numpy', 'sklearn'],

    test_suite='test'
)
@@ -1,49 +0,0 @@

"""
CartoDB Spatial Analysis Python Library
See:
https://github.com/CartoDB/crankshaft
"""

from setuptools import setup, find_packages

setup(
    name='crankshaft',

    version='0.0.0',

    description='CartoDB Spatial Analysis Python Library',

    url='https://github.com/CartoDB/crankshaft',

    author='Data Services Team - CartoDB',
    author_email='dataservices@cartodb.com',

    license='MIT',

    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Mapping community',
        'Topic :: Maps :: Mapping Tools',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 2.7',
    ],

    keywords='maps mapping tools spatial analysis geostatistics',

    packages=find_packages(exclude=['contrib', 'docs', 'tests']),

    extras_require={
        'dev': ['unittest'],
        'test': ['unittest', 'nose', 'mock'],
    },

    # The choice of component versions is dictated by what's
    # provisioned in the production servers.
    # IMPORTANT NOTE: please don't change this line. Instead issue a ticket to systems for evaluation.
    install_requires=['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.14.3', 'scikit-learn==0.14.1', 'petname==2.2'],

    requires=['pysal', 'numpy', 'sklearn'],

    test_suite='test'
)
@@ -1,6 +0,0 @@
{
    "production_col": [10, 10, 10],
    "capacity_col": [0.09, 0.31],
    "marginal_col": [5, 5],
    "pairwise": [[1, 2, 3], [3, 2, 1]]
}
@@ -1,15 +0,0 @@
from test.helper import plpy, fixture_file
from crankshaft.analysis_data_provider import AnalysisDataProvider
import json
import crankshaft


class RawDataProvider(AnalysisDataProvider):
    def __init__(self, fixturedata):
        self.your_algo_data = fixturedata

    def get_moran(self, params):
        """
        Replace this function name with the one used in your algorithm,
        and make sure to use the same function signature that is written
        for this algo in analysis_data_provider.py
        """
        return self.your_algo_data
@@ -1,7 +0,0 @@
"""Import all modules"""
import crankshaft.random_seeds
import crankshaft.clustering
import crankshaft.space_time_dynamics
import crankshaft.segmentation
import crankshaft.regression
import analysis_data_provider
@@ -1,149 +0,0 @@
"""class for fetching data"""
import plpy
import pysal_utils as pu

NULL_VALUE_ERROR = ('No usable data passed to analysis. Check your input rows '
                    'for null values and fill in appropriately.')


def verify_data(func):
    """decorator to verify data result before returning to algorithm"""
    def wrapper(*args, **kwargs):
        """Error checking"""
        try:
            data = func(*args, **kwargs)
            if not data:
                plpy.error(NULL_VALUE_ERROR)
            else:
                return data
        except plpy.SPIError as err:
            plpy.error('Analysis failed: {}'.format(err))

        return []

    return wrapper


class AnalysisDataProvider(object):
    """Data fetching class for pl/python functions"""
    @verify_data
    def get_getis(self, w_type, params):  # pylint: disable=no-self-use
        """fetch data for getis ord's g"""
        query = pu.construct_neighbor_query(w_type, params)
        return plpy.execute(query)

    @verify_data
    def get_markov(self, w_type, params):  # pylint: disable=no-self-use
        """fetch data for spatial markov"""
        query = pu.construct_neighbor_query(w_type, params)
        return plpy.execute(query)

    @verify_data
    def get_moran(self, w_type, params):  # pylint: disable=no-self-use
        """fetch data for moran's i analyses"""
        query = pu.construct_neighbor_query(w_type, params)
        return plpy.execute(query)

    @verify_data
    def get_nonspatial_kmeans(self, params):  # pylint: disable=no-self-use
        """
        Fetch data for non-spatial k-means.

        Inputs - a dict (params) with the following keys:
            colnames: a (text) list of column names (e.g.,
                      `['andy', 'cookie']`)
            id_col: the name of the id column (e.g., `'cartodb_id'`)
            subquery: the subquery for exposing the data (e.g.,
                      SELECT * FROM favorite_things)
        Output:
            A SQL query for packaging the data for consumption within
            `KMeans().nonspatial`. Format will be a list of length one,
            with the first element a dict with keys ('rowid', 'attr1',
            'attr2', ...)
        """
        agg_cols = ', '.join([
            'array_agg({0}) As arr_col{1}'.format(val, idx+1)
            for idx, val in enumerate(params['colnames'])
        ])
        query = '''
            SELECT {cols}, array_agg({id_col}) As rowid
            FROM ({subquery}) As a
        '''.format(subquery=params['subquery'],
                   id_col=params['id_col'],
                   cols=agg_cols).strip()
        return plpy.execute(query)

    @verify_data
    def get_segmentation_model_data(self, params):  # pylint: disable=R0201
        """
        fetch data for Segmentation
        params = {"subquery": query,
                  "target": variable,
                  "features": feature_columns}
        """
        columns = ', '.join(['array_agg("{col}") As "{col}"'.format(col=col)
                             for col in params['features']])
        query = '''
            SELECT
                array_agg("{target}") As target,
                {columns}
            FROM ({subquery}) As q
        '''.format(subquery=params['subquery'],
                   target=params['target'],
                   columns=columns)
        return plpy.execute(query)

    @verify_data
    def get_segmentation_data(self, params):  # pylint: disable=no-self-use
        """
        params = {"subquery": target_query,
                  "id_col": id_col}
        """
        query = '''
            SELECT
                array_agg("{id_col}" ORDER BY "{id_col}") as "ids"
            FROM ({subquery}) as q
        '''.format(**params)
        return plpy.execute(query)

    @verify_data
    def get_segmentation_predict_data(self, params):  # pylint: disable=R0201
        """
        fetch data for Segmentation
        params = {"subquery": target_query,
                  "feature_columns": feature_columns}
        """
        joined_features = ', '.join(['"{}"::numeric'.format(a)
                                     for a in params['feature_columns']])
        query = '''
            SELECT
                Array[{joined_features}] As features
            FROM ({subquery}) as q
        '''.format(subquery=params['subquery'],
                   joined_features=joined_features)
        return plpy.cursor(query)

    @verify_data
    def get_spatial_kmeans(self, params):  # pylint: disable=no-self-use
        """fetch data for spatial kmeans"""
        query = '''
            SELECT
                array_agg("{id_col}" ORDER BY "{id_col}") as ids,
                array_agg(ST_X("{geom_col}") ORDER BY "{id_col}") As xs,
                array_agg(ST_Y("{geom_col}") ORDER BY "{id_col}") As ys
            FROM ({subquery}) As a
            WHERE "{geom_col}" IS NOT NULL
        '''.format(**params)
        return plpy.execute(query)

    @verify_data
    def get_gwr(self, params):  # pylint: disable=no-self-use
        """fetch data for gwr analysis"""
        query = pu.gwr_query(params)
        return plpy.execute(query)

    @verify_data
    def get_gwr_predict(self, params):  # pylint: disable=no-self-use
        """fetch data for gwr predict"""
        query = pu.gwr_predict_query(params)
        return plpy.execute(query)
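Because the provider methods above are plain string templating plus `plpy.execute`, the generated SQL can be previewed without a database. A small sketch of what `get_nonspatial_kmeans` assembles for a given params dict (values taken from its docstring):

```python
params = {
    'colnames': ['andy', 'cookie'],
    'id_col': 'cartodb_id',
    'subquery': 'SELECT * FROM favorite_things',
}

# same assembly as get_nonspatial_kmeans, minus the plpy.execute call
agg_cols = ', '.join([
    'array_agg({0}) As arr_col{1}'.format(val, idx + 1)
    for idx, val in enumerate(params['colnames'])
])
query = '''
    SELECT {cols}, array_agg({id_col}) As rowid
    FROM ({subquery}) As a
'''.format(subquery=params['subquery'],
           id_col=params['id_col'],
           cols=agg_cols).strip()

print(query)
# -> SELECT array_agg(andy) As arr_col1, array_agg(cookie) As arr_col2,
#    array_agg(cartodb_id) As rowid FROM (...) As a   (modulo whitespace)
```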
@@ -1,76 +0,0 @@
"""
Based on the Weiszfeld algorithm:
https://en.wikipedia.org/wiki/Geometric_median
"""

# import plpy
import numpy as np
from numpy.linalg import norm


def median_center(tablename, geom_col, num_iters=50, tolerance=0.001):

    query = '''
        SELECT array_agg(ST_X({geom_col})) As x_coords,
               array_agg(ST_Y({geom_col})) As y_coords
        FROM {tablename}
    '''.format(geom_col=geom_col, tablename=tablename)

    try:
        resp = plpy.execute(query)
        data = np.vstack((resp['x_coords'][0],
                          resp['y_coords'][0])).T
        plpy.notice('coords: %s' % str(coords))
    except Exception, err:
        # plpy.error('Analysis failed: %s' % err)
        print('No plpy')
        data = np.array([[1.2 * np.random.random() + 10.,
                          1.1 * (np.random.random() - 1.) + 3.]
                         for i in range(1, 100)])

    # initialize 'median center' to be the mean
    coords_center_temp = data.mean(axis=0)

    # plpy.notice('temp_center: %s' % str(coords_center_temp))
    print('temp_center: %s' % str(coords_center_temp))

    for i in range(0, num_iters):
        old_coords_center = coords_center_temp.copy()
        denom = denominator(coords_center_temp, data)
        coords_center_temp = np.sum([data[j] * numerator(coords_center_temp,
                                                         data[j])
                                     for j in range(len(data))], axis=0)
        coords_center_temp = coords_center_temp / denom

        print("Pass #%d" % i)
        print("max, min of data: %0.4f, %0.4f" % (data.max(), data.min()))
        print('temp_center: %s' % str(coords_center_temp))
        print("Change in center: %0.4f" % np.linalg.norm(old_coords_center -
                                                         coords_center_temp))
        print("Center coords: %s" % str(coords_center_temp))
        print("Objective Function: %0.4f" % obj_func(coords_center_temp, data))

    return coords_center_temp


def obj_func(center_coords, data):
    """
    """
    return np.linalg.norm(center_coords - data)


def numerator(center_coords, data_i):
    """
    """
    return np.reciprocal(np.linalg.norm(center_coords - data_i))


def denominator(center_coords, data):
    """
    """
    return np.reciprocal(np.linalg.norm(data - center_coords))
@@ -1,4 +0,0 @@
"""Import all functions for clustering"""
from moran import *
from kmeans import *
from getis import *
@@ -1,50 +0,0 @@
"""
Getis-Ord's G geostatistics (hotspot/coldspot analysis)
"""

import pysal as ps
from collections import OrderedDict

# crankshaft modules
import crankshaft.pysal_utils as pu
from crankshaft.analysis_data_provider import AnalysisDataProvider

# High level interface ---------------------------------------


class Getis(object):
    def __init__(self, data_provider=None):
        if data_provider is None:
            self.data_provider = AnalysisDataProvider()
        else:
            self.data_provider = data_provider

    def getis_ord(self, subquery, attr,
                  w_type, num_ngbrs, permutations, geom_col, id_col):
        """
        Getis-Ord's G*
        Implementation building neighbors with a PostGIS database and PySAL's
        Getis-Ord's G* hotspot/coldspot module.
        Andy Eschbacher
        """

        # geometries with attributes that are null are ignored
        # resulting in a collection of not as near neighbors if kNN is chosen

        params = OrderedDict([("id_col", id_col),
                              ("attr1", attr),
                              ("geom_col", geom_col),
                              ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_getis(w_type, params)
        attr_vals = pu.get_attributes(result)

        # build PySAL weight object
        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate Getis-Ord's G* z- and p-values
        getis = ps.esda.getisord.G_Local(attr_vals, weight,
                                         star=True, permutations=permutations)

        return zip(getis.z_sim, getis.p_sim, getis.p_z_sim, weight.id_order)
@@ -1,113 +0,0 @@
from sklearn.cluster import KMeans
import numpy as np

from crankshaft.analysis_data_provider import AnalysisDataProvider


class Kmeans(object):
    def __init__(self, data_provider=None):
        if data_provider is None:
            self.data_provider = AnalysisDataProvider()
        else:
            self.data_provider = data_provider

    def spatial(self, query, no_clusters, no_init=20):
        """
        find centers based on clusters of latitude/longitude pairs
        query: SQL query that has a WGS84 geometry (the_geom)
        """
        params = {"subquery": query,
                  "geom_col": "the_geom",
                  "id_col": "cartodb_id"}

        result = self.data_provider.get_spatial_kmeans(params)

        # Unpack query response
        xs = result[0]['xs']
        ys = result[0]['ys']
        ids = result[0]['ids']

        km = KMeans(n_clusters=no_clusters, n_init=no_init)
        labels = km.fit_predict(zip(xs, ys))
        return zip(ids, labels)

    def nonspatial(self, subquery, colnames, no_clusters=5,
                   standardize=True, id_col='cartodb_id'):
        """
        Arguments:
            query (string): A SQL query to retrieve the data required to do the
                k-means clustering analysis, like so:
                SELECT * FROM iris_flower_data
            colnames (list): a list of the column names which contain the data
                of interest, like so: ['sepal_width',
                                       'petal_width',
                                       'sepal_length',
                                       'petal_length']
            no_clusters (int): number of clusters (greater than zero)
            id_col (string): name of the input id_column

        Returns:
            A list of tuples with the following columns:
            cluster labels: a label for the cluster that the row belongs to
            centers: center of the cluster that this row belongs to
            silhouettes: silhouette measure for this value
            rowid: row that these values belong to (corresponds to the value in
                `id_col`)
        """
        import json
        from sklearn import metrics

        params = {
            "colnames": colnames,
            "subquery": subquery,
            "id_col": id_col
        }

        data = self.data_provider.get_nonspatial_kmeans(params)

        # fill array with values for k-means clustering
        if standardize:
            cluster_columns = _scale_data(
                _extract_columns(data))
        else:
            cluster_columns = _extract_columns(data)

        kmeans = KMeans(n_clusters=no_clusters,
                        random_state=0).fit(cluster_columns)

        centers = [json.dumps(dict(zip(colnames, c)))
                   for c in kmeans.cluster_centers_[kmeans.labels_]]

        silhouettes = metrics.silhouette_samples(cluster_columns,
                                                 kmeans.labels_,
                                                 metric='sqeuclidean')

        return zip(kmeans.labels_,
                   centers,
                   silhouettes,
                   [kmeans.inertia_] * kmeans.labels_.shape[0],
                   data[0]['rowid'])


# -- Preprocessing steps

def _extract_columns(data):
    """
    Extract the features from the query and pack them into a NumPy array
    data (list of dicts): result of the kmeans request
    """
    # number of columns minus rowid column
    n_cols = len(data[0]) - 1
    return np.array([data[0]['arr_col{0}'.format(i+1)]
                     for i in xrange(n_cols)],
                    dtype=float).T


def _scale_data(features):
    """
    Scale all input columns to center on 0 with a standard deviation of 1
    features (numpy matrix): features of dimension (n_features, n_samples)
    """
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    return scaler.fit_transform(features)
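Kmeans, like the other analysis classes, accepts any provider object that implements the fetch method it calls, which is exactly how the test-only RawDataProvider later in this diff works. A hedged usage sketch for the nonspatial path (the stub data is invented; the module targets Python 2, where the `xrange` in `_extract_columns` is available):

```python
from crankshaft.clustering import Kmeans


class StubProvider(object):
    """Stands in for AnalysisDataProvider; no database needed."""
    def get_nonspatial_kmeans(self, params):
        # one row, matching the documented packaging:
        # ('rowid', 'arr_col1', 'arr_col2', ...)
        return [{'arr_col1': [1.0, 1.1, 5.0, 5.2],
                 'arr_col2': [0.2, 0.1, 3.9, 4.1],
                 'rowid': [1, 2, 3, 4]}]


km = Kmeans(data_provider=StubProvider())
# each row: (label, center as JSON, silhouette, inertia, rowid)
for row in km.nonspatial('ignored subquery', ['x', 'y'], no_clusters=2):
    print(row)
```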
@@ -1,341 +0,0 @@
|
||||
"""
|
||||
Moran's I geostatistics (global clustering & outliers presence)
|
||||
Functionality relies on a combination of `PySAL
|
||||
<http://pysal.readthedocs.io/en/latest/>`__ and the data providered provided in
|
||||
the class instantiation (which defaults to PostgreSQL's plpy module's `database
|
||||
access functions <https://www.postgresql.org/docs/10/static/plpython.html>`__).
|
||||
"""
|
||||
|
||||
from collections import OrderedDict
|
||||
import pysal as ps
|
||||
|
||||
# crankshaft module
|
||||
import crankshaft.pysal_utils as pu
|
||||
from crankshaft.analysis_data_provider import AnalysisDataProvider
|
||||
|
||||
# High level interface ---------------------------------------
|
||||
|
||||
|
||||
class Moran(object):
|
||||
"""Class for calculation of Moran's I statistics (global, local, and local
|
||||
rate)
|
||||
|
||||
Parameters:
|
||||
data_provider (:obj:`AnalysisDataProvider`): Class for fetching data. See
|
||||
the `crankshaft.analysis_data_provider` module for more information.
|
||||
"""
|
||||
def __init__(self, data_provider=None):
|
||||
if data_provider is None:
|
||||
self.data_provider = AnalysisDataProvider()
|
||||
else:
|
||||
self.data_provider = data_provider
|
||||
|
||||
def global_stat(self, subquery, attr_name,
|
||||
w_type, num_ngbrs, permutations, geom_col, id_col):
|
||||
"""
|
||||
Moran's I (global)
|
||||
Implementation building neighbors with a PostGIS database and Moran's I
|
||||
core clusters with PySAL.
|
||||
|
||||
Args:
|
||||
|
||||
subquery (str): Query to give access to the data needed. This query
|
||||
must give access to ``attr_name``, ``geom_col``, and ``id_col``.
|
||||
attr_name (str): Column name of data to analyze
|
||||
w_type (str): Type of spatial weight. Must be one of `knn`
|
||||
or `queen`. See `PySAL documentation
|
||||
<http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
|
||||
for more information.
|
||||
num_ngbrs (int): If using `knn` for ``w_type``, this
|
||||
specifies the number of neighbors to be used to define the spatial
|
||||
neighborhoods.
|
||||
permutations (int): Number of permutations for performing
|
||||
conditional randomization to find the p-value. Higher numbers
|
||||
takes a longer time for getting results.
|
||||
geom_col (str): Name of the geometry column in the dataset for
|
||||
finding the spatial neighborhoods.
|
||||
id_col (str): Row index for each value. Usually the database index.
|
||||
|
||||
"""
|
||||
params = OrderedDict([("id_col", id_col),
|
||||
("attr1", attr_name),
|
||||
("geom_col", geom_col),
|
||||
("subquery", subquery),
|
||||
("num_ngbrs", num_ngbrs)])
|
||||
|
||||
result = self.data_provider.get_moran(w_type, params)
|
||||
|
||||
# collect attributes
|
||||
attr_vals = pu.get_attributes(result)
|
||||
|
||||
# calculate weights
|
||||
weight = pu.get_weight(result, w_type, num_ngbrs)
|
||||
|
||||
# calculate moran global
|
||||
moran_global = ps.esda.moran.Moran(attr_vals, weight,
|
||||
permutations=permutations)
|
||||
|
||||
return zip([moran_global.I], [moran_global.EI])
|
||||
|
||||
def local_stat(self, subquery, attr,
|
||||
w_type, num_ngbrs, permutations, geom_col, id_col):
|
||||
"""
|
||||
Moran's I (local)
|
||||
|
||||
Args:
|
||||
|
||||
subquery (str): Query to give access to the data needed. This query
|
||||
must give access to ``attr_name``, ``geom_col``, and ``id_col``.
|
||||
attr (str): Column name of data to analyze
|
||||
w_type (str): Type of spatial weight. Must be one of `knn`
|
||||
or `queen`. See `PySAL documentation
|
||||
<http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
|
||||
for more information.
|
||||
num_ngbrs (int): If using `knn` for ``w_type``, this
|
||||
specifies the number of neighbors to be used to define the spatial
|
||||
neighborhoods.
|
||||
permutations (int): Number of permutations for performing
|
||||
conditional randomization to find the p-value. Higher numbers
|
||||
takes a longer time for getting results.
|
||||
geom_col (str): Name of the geometry column in the dataset for
|
||||
finding the spatial neighborhoods.
|
||||
id_col (str): Row index for each value. Usually the database index.
|
||||
|
||||
Returns:
|
||||
list of tuples: Where each tuple consists of the following values:
|
||||
- quadrants classification (one of `HH`, `HL`, `LL`, or `LH`)
|
||||
- p-value
|
||||
- spatial lag
|
||||
- standardized spatial lag (centered on the mean, normalized by the
|
||||
standard deviation)
|
||||
- original value
|
||||
- standardized value
|
||||
- Moran's I statistic
|
||||
- original row index
|
||||
"""
|
||||
|
||||
# geometries with attributes that are null are ignored
|
||||
# resulting in a collection of not as near neighbors
|
||||
|
||||
params = OrderedDict([("id_col", id_col),
|
||||
("attr1", attr),
|
||||
("geom_col", geom_col),
|
||||
("subquery", subquery),
|
||||
("num_ngbrs", num_ngbrs)])
|
||||
|
||||
result = self.data_provider.get_moran(w_type, params)
|
||||
|
||||
attr_vals = pu.get_attributes(result)
|
||||
weight = pu.get_weight(result, w_type, num_ngbrs)
|
||||
|
||||
# calculate LISA values
|
||||
lisa = ps.esda.moran.Moran_Local(attr_vals, weight,
|
||||
permutations=permutations)
|
||||
|
||||
# find quadrants for each geometry
|
||||
quads = quad_position(lisa.q)
|
||||
|
||||
# calculate spatial lag
|
||||
lag = ps.weights.spatial_lag.lag_spatial(weight, lisa.y)
|
||||
lag_std = ps.weights.spatial_lag.lag_spatial(weight, lisa.z)
|
||||
|
||||
return zip(
|
||||
quads,
|
||||
lisa.p_sim,
|
||||
lag,
|
||||
lag_std,
|
||||
lisa.y,
|
||||
lisa.z,
|
||||
lisa.Is,
|
||||
weight.id_order
|
||||
)
|
||||
|
||||
def global_rate_stat(self, subquery, numerator, denominator,
|
||||
w_type, num_ngbrs, permutations, geom_col, id_col):
|
||||
"""
|
||||
Moran's I Rate (global)
|
||||
|
||||
Args:
|
||||
|
||||
subquery (str): Query to give access to the data needed. This query
|
||||
must give access to ``attr_name``, ``geom_col``, and ``id_col``.
|
||||
numerator (str): Column name of numerator to analyze
|
||||
denominator (str): Column name of the denominator
|
||||
w_type (str): Type of spatial weight. Must be one of `knn`
|
||||
or `queen`. See `PySAL documentation
|
||||
<http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
|
||||
for more information.
|
||||
num_ngbrs (int): If using `knn` for ``w_type``, this
|
||||
specifies the number of neighbors to be used to define the spatial
|
||||
neighborhoods.
|
||||
permutations (int): Number of permutations for performing
|
||||
conditional randomization to find the p-value. Higher numbers
|
||||
takes a longer time for getting results.
|
||||
geom_col (str): Name of the geometry column in the dataset for
|
||||
finding the spatial neighborhoods.
|
||||
id_col (str): Row index for each value. Usually the database index.
|
||||
"""
|
||||
params = OrderedDict([("id_col", id_col),
|
||||
("attr1", numerator),
|
||||
("attr2", denominator),
|
||||
("geom_col", geom_col),
|
||||
("subquery", subquery),
|
||||
("num_ngbrs", num_ngbrs)])
|
||||
|
||||
result = self.data_provider.get_moran(w_type, params)
|
||||
|
||||
# collect attributes
|
||||
numer = pu.get_attributes(result, 1)
|
||||
denom = pu.get_attributes(result, 2)
|
||||
|
||||
weight = pu.get_weight(result, w_type, num_ngbrs)
|
||||
|
||||
# calculate moran global rate
|
||||
lisa_rate = ps.esda.moran.Moran_Rate(numer, denom, weight,
|
||||
permutations=permutations)
|
||||
|
||||
return zip([lisa_rate.I], [lisa_rate.EI])
|
||||
|
||||
def local_rate_stat(self, subquery, numerator, denominator,
                    w_type, num_ngbrs, permutations, geom_col, id_col):
    """
    Moran's I Local Rate

    Args:

      subquery (str): Query to give access to the data needed. This query
        must give access to ``numerator``, ``denominator``, ``geom_col``,
        and ``id_col``.
      numerator (str): Column name of the numerator to analyze
      denominator (str): Column name of the denominator
      w_type (str): Type of spatial weight. Must be one of `knn`
        or `queen`. See `PySAL documentation
        <http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
        for more information.
      num_ngbrs (int): If using `knn` for ``w_type``, this specifies the
        number of neighbors used to define the spatial neighborhoods.
      permutations (int): Number of permutations for performing
        conditional randomization to find the p-value. Higher numbers
        take longer to compute but give more stable p-values.
      geom_col (str): Name of the geometry column in the dataset for
        finding the spatial neighborhoods.
      id_col (str): Row index for each value. Usually the database index.

    Returns:
      list of tuples: Where each tuple consists of the following values:

        - quadrant classification (one of `HH`, `HL`, `LL`, or `LH`)
        - p-value
        - spatial lag
        - standardized spatial lag (centered on the mean, normalized by
          the standard deviation)
        - original value (roughly the numerator divided by the denominator)
        - standardized value
        - Moran's I statistic
        - original row index
    """
    # Geometries with null-valued attributes are excluded, so the
    # resulting neighborhoods may be sparser than requested.
    params = OrderedDict([("id_col", id_col),
                          ("numerator", numerator),
                          ("denominator", denominator),
                          ("geom_col", geom_col),
                          ("subquery", subquery),
                          ("num_ngbrs", num_ngbrs)])

    result = self.data_provider.get_moran(w_type, params)

    # collect attributes
    numer = pu.get_attributes(result, 1)
    denom = pu.get_attributes(result, 2)

    weight = pu.get_weight(result, w_type, num_ngbrs)

    # calculate LISA values
    lisa = ps.esda.moran.Moran_Local_Rate(numer, denom, weight,
                                          permutations=permutations)

    # find quadrants for each geometry
    quads = quad_position(lisa.q)

    # spatial lag
    lag = ps.weights.spatial_lag.lag_spatial(weight, lisa.y)
    lag_std = ps.weights.spatial_lag.lag_spatial(weight, lisa.z)

    return zip(
        quads,
        lisa.p_sim,
        lag,
        lag_std,
        lisa.y,
        lisa.z,
        lisa.Is,
        weight.id_order
    )

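# A minimal usage sketch (hypothetical column names; same assumptions as
# the sketch after ``global_rate_stat`` above):
#
#   rows = moran.local_rate_stat(
#       'SELECT * FROM observatory_data', 'numerator_col',
#       'denominator_col', 'queen', 5, 99, 'the_geom', 'cartodb_id')
#   for quad, p_val, lag, lag_std, y, z, moran_i, rowid in rows:
#       pass  # e.g., keep rows where p_val < 0.05 and quad == 'HH'
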
def local_bivariate_stat(self, subquery, attr1, attr2,
                         permutations, geom_col, id_col,
                         w_type, num_ngbrs):
    """
    Moran's I (local) Bivariate (untested)

    Arguments mirror :func:`local_rate_stat`, with ``attr1`` and ``attr2``
    taking the place of the numerator and denominator.
    """
    params = OrderedDict([("id_col", id_col),
                          ("attr1", attr1),
                          ("attr2", attr2),
                          ("geom_col", geom_col),
                          ("subquery", subquery),
                          ("num_ngbrs", num_ngbrs)])

    result = self.data_provider.get_moran(w_type, params)

    # collect attributes
    attr1_vals = pu.get_attributes(result, 1)
    attr2_vals = pu.get_attributes(result, 2)

    # create weights
    weight = pu.get_weight(result, w_type, num_ngbrs)

    # calculate LISA values
    lisa = ps.esda.moran.Moran_Local_BV(attr1_vals, attr2_vals, weight,
                                        permutations=permutations)

    # find clustering of significance
    lisa_sig = quad_position(lisa.q)

    return zip(lisa.Is, lisa_sig, lisa.p_sim, weight.id_order)

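# A minimal usage sketch (hypothetical column names; note the method is
# marked untested above):
#
#   rows = moran.local_bivariate_stat(
#       'SELECT * FROM observatory_data', 'attr_a', 'attr_b',
#       99, 'the_geom', 'cartodb_id', 'knn', 5)
#   # each row is (I_i, quadrant, p-value, row id)
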
# Low level functions ----------------------------------------


def map_quads(coord):
    """
    Map a quadrant number to its Moran's I designation:
    HH=1, LH=2, LL=3, HL=4
    Args:
      coord (int): quadrant of a specific measurement
    Returns:
      classification (one of 'HH', 'LH', 'LL', or 'HL'), or None for an
      unrecognized quadrant number
    """
    return {1: 'HH', 2: 'LH', 3: 'LL', 4: 'HL'}.get(coord)


def quad_position(quads):
    """
    Map all quads

    Args:
      quads (:obj:`numpy.ndarray`): an array of quadrants classified by
        1-4 (PySAL default)
    Returns:
      list: an array of quadrants classified by 'HH', 'LH', etc.
    """
    return [map_quads(q) for q in quads]

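# Example: quadrant numbers (as in PySAL's ``q`` attribute) map to the
# string designations used throughout this module, with None for anything
# out of range:
#
#   >>> quad_position([1, 2, 3, 4, 99])
#   ['HH', 'LH', 'LL', 'HL', None]
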
@@ -1 +0,0 @@
from core import set_model, get_model, create_model_table
@@ -1,86 +0,0 @@
import time
import pickle

import plpy
from petname import generate


def create_model_table():
    """Create the model_storage table if it does not already exist."""
    q = '''
        create table if not exists model_storage(
            description text,
            name text unique,
            model bytea,
            feature_names text[],
            date_created timestamptz,
            id serial primary key);
        '''
    plpy.notice(q)
    plan = plpy.prepare(q)
    resp = plpy.execute(plan)
    plpy.notice('Model table successfully created')
    plpy.notice(str(resp))


def get_model(model_name):
    """Retrieve a stored model by name, or None if it is not found."""
    model = None
    try:
        plan = plpy.prepare('''
            SELECT model FROM model_storage
            WHERE name = $1;
            ''', ['text'])
        model_encoded = plpy.execute(plan, [model_name])
        if len(model_encoded) == 1:
            model = pickle.loads(
                model_encoded[0]['model']
            )
            plpy.notice('Model successfully loaded')
        else:
            plpy.notice('Model not found, or too many models '
                        '({})'.format(len(model_encoded)))
    except plpy.SPIError as err:
        plpy.error('ERROR: {}'.format(err))

    return model

def set_model(model, model_name, feature_names):
    """Store a pickled model in the model_storage table."""
    if model_name is None:
        model_name = generate(words=2, separator='_', letters=8)
    existing_names = plpy.execute('''
        SELECT array_agg(name) as name
        FROM model_storage
        ''')
    plpy.notice('nrows: {}'.format(existing_names.nrows()))
    plpy.notice('MODEL NAME: {}'.format(model_name))
    plpy.notice('LEN of ms: {}'.format(len(existing_names)))
    plpy.notice('existing_names: {}'.format(str(existing_names)))
    plpy.notice('existing_names: {}'.format(str(existing_names[0]['name'])))
    plpy.notice('type existing_names: {}'.format(type(existing_names[0]['name'])))
    # regenerate the name until it no longer collides with a stored one
    if existing_names[0]['name'] is not None:
        while model_name in existing_names[0]['name']:
            model_name = generate(words=2, separator='_', letters=10)
            plpy.notice(model_name)

    # store model
    try:
        plan = plpy.prepare('''
            INSERT INTO model_storage(description, name, model,
                                      feature_names, date_created)
            VALUES (
                $1,
                $2,
                $3,
                $4::text[],
                to_timestamp($5));
            ''', ['text', 'text', 'bytea', 'text', 'numeric'])
        plpy.notice('{%s}' % ','.join(feature_names))
        plpy.notice(feature_names)
        plpy.execute(
            plan,
            [' '.join(m.strip() for m in repr(model).split('\n')),
             model_name,
             pickle.dumps(model),
             '{%s}' % ','.join(feature_names),
             time.time()]
        )
        plpy.notice('model successfully stored as {}'.format(model_name))
    except plpy.SPIError as err:
        plpy.notice('ERROR: {}\nt: {}'.format(err, time.time()))

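# A round-trip sketch (runs inside PL/Python, so ``plpy`` is available;
# the scikit-learn model below is only an illustration, not a dependency
# of this module):
#
#   from sklearn.linear_model import LinearRegression
#   create_model_table()
#   set_model(LinearRegression(), 'my_model', ['x1', 'x2'])
#   model = get_model('my_model')  # unpickled LinearRegression instance
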
@@ -1,2 +0,0 @@
"""Import all functions for pysal_utils"""
from crankshaft.pysal_utils.pysal_utils import *
@@ -1,251 +0,0 @@
"""
Utilities module for generic PySAL functionality, mainly centered on
translating queries into numpy arrays or PySAL weights objects
"""

import numpy as np
import pysal as ps

def construct_neighbor_query(w_type, query_vals):
    """Return the query (a string) used for finding neighbors
    @param w_type text: type of neighbors to calculate ('knn' or 'queen')
    @param query_vals dict: values used to construct the query
    """
    if w_type.lower() == 'knn':
        return knn(query_vals)
    else:
        # anything other than 'knn' falls back to queen neighbors
        return queen(query_vals)

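# Example: dispatch on the weight type (hypothetical params; assumes
# ``from collections import OrderedDict``, since key order determines the
# attr1/attr2 aliases built by ``query_attr_select`` below):
#
#   params = OrderedDict([('id_col', 'cartodb_id'),
#                         ('attr1', 'price'),
#                         ('geom_col', 'the_geom'),
#                         ('subquery', 'SELECT * FROM homes'),
#                         ('num_ngbrs', 5)])
#   sql = construct_neighbor_query('knn', params)
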
# Build weight object
def get_weight(query_res, w_type='knn', num_ngbrs=5):
    """
    Construct a PySAL weight object from the return value of a query
    @param query_res dict-like: query results with attributes and neighbors
    """
    neighbors = {x['id']: x['neighbors'] for x in query_res}
    print('len of neighbors: %d' % len(neighbors))

    built_weight = ps.W(neighbors)
    built_weight.transform = 'r'

    return built_weight

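# Example: a synthetic query result with two mutually neighboring rows
# yields a row-standardized PySAL weights object (the first output line is
# the debug print inside the function):
#
#   >>> w = get_weight([{'id': 1, 'neighbors': [2]},
#   ...                 {'id': 2, 'neighbors': [1]}])
#   len of neighbors: 2
#   >>> w.n
#   2
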
def query_attr_select(params, table_ref=True):
    """
    Create the portion of the SELECT statement for attributes involved in
    the query. Defaults to the order of the keys in params.
    @param params: dict of information used in query (column names,
                   table name, etc.)
    Example:
      OrderedDict([('numerator', 'price'),
                   ('denominator', 'sq_meters'),
                   ('subquery', 'SELECT * FROM interesting_data')])
    Output:
      "i.\"price\"::numeric As attr1, " \
      "i.\"sq_meters\"::numeric As attr2, "
    """
    attr_string = ""
    template = "\"%(col)s\"::numeric As attr%(alias_num)s, "

    if table_ref:
        template = "i." + template

    if ('time_cols' in params) or ('ind_vars' in params):
        # markov or gwr analysis
        attrs = (params['time_cols'] if 'time_cols' in params
                 else params['ind_vars'])
        if 'ind_vars' in params:
            template = "array_agg(\"%(col)s\"::numeric) As attr%(alias_num)s, "

        for idx, val in enumerate(attrs):
            attr_string += template % {"col": val, "alias_num": idx + 1}
    else:
        # moran's analysis
        attrs = [k for k in params
                 if k not in ('id_col', 'geom_col', 'subquery',
                              'num_ngbrs')]

        for idx, val in enumerate(attrs):
            attr_string += template % {"col": params[val],
                                       "alias_num": idx + 1}

    return attr_string

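# Example: for Markov analyses, the ``time_cols`` entries are aliased in
# list order:
#
#   >>> query_attr_select({'time_cols': ['y2011', 'y2012'],
#   ...                    'subquery': 'SELECT * FROM data'})
#   'i."y2011"::numeric As attr1, i."y2012"::numeric As attr2, '
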
def query_attr_where(params, table_ref=True):
    """
    Construct the WHERE conditions used when building the neighbors query,
    weeding out NULL-valued geometries and attributes.
    Input: dict of params:
      {'subquery': ...,
       'numerator': 'data1',
       'denominator': 'data2',
       ...}
    Output:
      'idx_replace."data1" IS NOT NULL AND idx_replace."data2" IS NOT NULL'
    Input:
      {'subquery': ...,
       'time_cols': ['time1', 'time2', 'time3'],
       'etc': ...}
    Output: 'idx_replace."time1" IS NOT NULL AND idx_replace."time2" IS NOT
      NULL AND idx_replace."time3" IS NOT NULL'
    """
    attr_string = []
    template = "\"%s\" IS NOT NULL"
    if table_ref:
        template = "idx_replace." + template

    if ('time_cols' in params) or ('ind_vars' in params):
        # markov or gwr where clauses
        attrs = (params['time_cols'] if 'time_cols' in params
                 else params['ind_vars'])
        # add values to template
        for attr in attrs:
            attr_string.append(template % attr)
    else:
        # moran where clauses

        # get keys
        attrs = [k for k in params
                 if k not in ('id_col', 'geom_col', 'subquery',
                              'num_ngbrs')]

        # add values to template
        for attr in attrs:
            attr_string.append(template % params[attr])

        # guard against division by zero in rate analyses
        if 'denominator' in attrs:
            attr_string.append(
                "idx_replace.\"%s\" <> 0" % params['denominator'])

    return " AND ".join(attr_string)

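# Example: for a Moran's rate analysis the denominator also gets a
# non-zero guard (assumes ``from collections import OrderedDict``):
#
#   >>> query_attr_where(OrderedDict([('id_col', 'cartodb_id'),
#   ...                               ('numerator', 'n'),
#   ...                               ('denominator', 'd'),
#   ...                               ('geom_col', 'the_geom'),
#   ...                               ('subquery', 'SELECT ...'),
#   ...                               ('num_ngbrs', 5)]))
#   'idx_replace."n" IS NOT NULL AND idx_replace."d" IS NOT NULL AND idx_replace."d" <> 0'
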
def knn(params):
    """SQL query for k-nearest neighbors.
    @param params: dict of values to fill the query template
    """
    attr_select = query_attr_select(params, table_ref=True)
    attr_where = query_attr_where(params, table_ref=True)

    replacements = {"attr_select": attr_select,
                    "attr_where_i": attr_where.replace("idx_replace", "i"),
                    "attr_where_j": attr_where.replace("idx_replace", "j")}

    query = '''
        SELECT
          i."{id_col}" As id,
          %(attr_select)s
          (SELECT ARRAY(SELECT j."{id_col}"
                        FROM ({subquery}) As j
                        WHERE i."{id_col}" <> j."{id_col}" AND
                              %(attr_where_j)s AND
                              j."{geom_col}" IS NOT NULL
                        ORDER BY j."{geom_col}" <-> i."{geom_col}" ASC
                        LIMIT {num_ngbrs})) As neighbors
        FROM ({subquery}) As i
        WHERE %(attr_where_i)s AND i."{geom_col}" IS NOT NULL
        ORDER BY i."{id_col}" ASC;
    ''' % replacements

    return query.format(**params)

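# Example: the returned SQL selects each row's id, its numeric attributes,
# and an array of its k nearest neighbor ids, ordered by the PostGIS
# distance operator ``<->`` (hypothetical params):
#
#   params = OrderedDict([('id_col', 'cartodb_id'),
#                         ('attr1', 'price'),
#                         ('geom_col', 'the_geom'),
#                         ('subquery', 'SELECT * FROM homes'),
#                         ('num_ngbrs', 5)])
#   sql = knn(params)  # ready to be executed by the data provider
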
# SQL query for finding queen neighbors (all contiguous polygons)
def queen(params):
    """SQL query for queen neighbors.
    @param params dict: information to fill query
    """
    attr_select = query_attr_select(params)
    attr_where = query_attr_where(params)

    replacements = {"attr_select": attr_select,
                    "attr_where_i": attr_where.replace("idx_replace", "i"),
                    "attr_where_j": attr_where.replace("idx_replace", "j")}

    query = '''
        SELECT
          i."{id_col}" As id,
          %(attr_select)s
          (SELECT ARRAY(SELECT j."{id_col}"
                        FROM ({subquery}) As j
                        WHERE i."{id_col}" <> j."{id_col}" AND
                              ST_Touches(i."{geom_col}", j."{geom_col}") AND
                              %(attr_where_j)s)) As neighbors
        FROM ({subquery}) As i
        WHERE
          %(attr_where_i)s
        ORDER BY i."{id_col}" ASC;
    ''' % replacements

    return query.format(**params)

def gwr_query(params):
    """
    GWR query: collects centroid coordinates, the dependent variable, and
    arrays of independent variables for geographically weighted regression.
    """
    replacements = {"ind_vars_select": query_attr_select(params,
                                                         table_ref=None),
                    "ind_vars_where": query_attr_where(params,
                                                       table_ref=None)}

    query = '''
        SELECT
          array_agg(ST_X(ST_Centroid("{geom_col}"))) As x,
          array_agg(ST_Y(ST_Centroid("{geom_col}"))) As y,
          array_agg("{dep_var}") As dep_var,
          %(ind_vars_select)s
          array_agg("{id_col}") As rowid
        FROM ({subquery}) As q
        WHERE
          "{dep_var}" IS NOT NULL AND
          %(ind_vars_where)s
    ''' % replacements

    return query.format(**params).strip()

def gwr_predict_query(params):
    """
    GWR prediction query: like ``gwr_query``, but does not require the
    dependent variable to be non-NULL.
    """
    replacements = {"ind_vars_select": query_attr_select(params,
                                                         table_ref=None),
                    "ind_vars_where": query_attr_where(params,
                                                       table_ref=None)}

    query = '''
        SELECT
          array_agg(ST_X(ST_Centroid({geom_col}))) As x,
          array_agg(ST_Y(ST_Centroid({geom_col}))) As y,
          array_agg({dep_var}) As dep_var,
          %(ind_vars_select)s
          array_agg({id_col}) As rowid
        FROM ({subquery}) As q
        WHERE
          %(ind_vars_where)s
    ''' % replacements

    return query.format(**params).strip()

# to add more weight methods, open a ticket or pull request

def get_attributes(query_res, attr_num=1):
    """
    @param query_res: query results with attributes and neighbors
    @param attr_num: attribute number (1, 2, ...)
    """
    return np.array([x['attr' + str(attr_num)] for x in query_res],
                    dtype=float)
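# Example: pull the n-th aliased attribute out of a query result as a
# float array:
#
#   >>> vals = get_attributes([{'attr1': 1.0}, {'attr1': 2.0}])
#   >>> vals.tolist()
#   [1.0, 2.0]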
@@ -1,12 +0,0 @@
"""Random seed generator used for non-deterministic functions in crankshaft"""
import random

import numpy


def set_random_seeds(value):
    """
    Set the seeds of the RNGs (Random Number Generators)
    used internally.
    """
    random.seed(value)
    numpy.random.seed(value)
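# Example: seeding both RNGs makes permutation-based analyses reproducible:
#
#   >>> set_random_seeds(1234)
#   >>> a = numpy.random.rand()
#   >>> set_random_seeds(1234)
#   >>> numpy.random.rand() == a
#   True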