Compare commits
15 commits: 0.8.2...model-stor
| Author | SHA1 | Date |
|---|---|---|
|  | daba2f9597 |  |
|  | 8f28f41060 |  |
|  | 7509afa5a6 |  |
|  | a28c68502c |  |
|  | 5b4443ca88 |  |
|  | 2048db33fc |  |
|  | 99e78800b3 |  |
|  | 800648a710 |  |
|  | 91ee6ecc48 |  |
|  | 9a5ab17240 |  |
|  | 65be9befb1 |  |
|  | 37e6b4a228 |  |
|  | 766bfed9be |  |
|  | e8a601e945 |  |
|  | c2be340c07 |  |
.gitignore (vendored): 1 change
@@ -2,3 +2,4 @@ envs/
*.pyc
.DS_Store
.idea/
.*.sw[nop]
.travis.yml: 68 changes
@@ -1,40 +1,60 @@
dist: xenial
language: c
dist: precise
sudo: required

env:
  global:
    - PAGER=cat
    - PGUSER=postgres
    - PGDATABASE=postgres
    - PGOPTIONS='-c client_min_messages=NOTICE'
    - PGPORT=5432
    - POSTGIS_VERSION="2.5"

  matrix:
    - POSTGRESQL_VERSION="10"
    - POSTGRESQL_VERSION="11"

before_install:
  - ./check-up-to-date-with-master.sh
  - sudo apt-get -y install python-pip

  - sudo service postgresql stop;
  - sudo apt-get remove postgresql* -y
  - sudo apt-get install -y --allow-unauthenticated --no-install-recommends --no-install-suggests postgresql-$POSTGRESQL_VERSION postgresql-client-$POSTGRESQL_VERSION postgresql-server-dev-$POSTGRESQL_VERSION postgresql-common
  - sudo apt-get install -y --allow-unauthenticated postgresql-$POSTGRESQL_VERSION-postgis-$POSTGIS_VERSION postgresql-$POSTGRESQL_VERSION-postgis-$POSTGIS_VERSION-scripts postgis postgresql-plpython-$POSTGRESQL_VERSION
  - sudo pg_dropcluster --stop $POSTGRESQL_VERSION main
  - sudo rm -rf /etc/postgresql/$POSTGRESQL_VERSION /var/lib/postgresql/$POSTGRESQL_VERSION
  - sudo pg_createcluster -u postgres $POSTGRESQL_VERSION main -- -A trust
  - sudo /etc/init.d/postgresql start $POSTGRESQL_VERSION || sudo journalctl -xe
  - sudo apt-get -y install python-software-properties
  - sudo add-apt-repository -y ppa:cartodb/sci
  - sudo add-apt-repository -y ppa:cartodb/postgresql-9.5
  - sudo add-apt-repository -y ppa:cartodb/gis
  - sudo add-apt-repository -y ppa:cartodb/gis-testing
  - sudo apt-get update

  - sudo apt-get -y install python-pip python-software-properties python-joblib python-nose
  - sudo apt-get -y install python-joblib=0.8.3-1-cdb1
  - sudo apt-get -y install python-numpy=1:1.6.1-6ubuntu1

  - sudo apt-get -y install python-scipy=0.14.0-2-cdb6
  - sudo apt-get -y --no-install-recommends install python-sklearn-lib=0.14.1-3-cdb2
  - sudo apt-get -y --no-install-recommends install python-sklearn=0.14.1-3-cdb2
  - sudo apt-get -y --no-install-recommends install python-scikits-learn=0.14.1-3-cdb2

  # Force installation of libgeos-3.5.0 (presumably needed because of existing version of postgis)
  - sudo apt-get -y install libgeos-3.5.0=3.5.0-1cdb2

  # Install postgres db and build deps
  - sudo /etc/init.d/postgresql stop # stop travis default instance
  - sudo apt-get -y remove --purge postgresql-9.1
  - sudo apt-get -y remove --purge postgresql-9.2
  - sudo apt-get -y remove --purge postgresql-9.3
  - sudo apt-get -y remove --purge postgresql-9.4
  - sudo apt-get -y remove --purge postgresql-9.5
  - sudo rm -rf /var/lib/postgresql/
  - sudo rm -rf /var/log/postgresql/
  - sudo rm -rf /etc/postgresql/
  - sudo apt-get -y remove --purge postgis-2.2
  - sudo apt-get -y autoremove

  - sudo apt-get -y install postgresql-9.5=9.5.2-3cdb3
  - sudo apt-get -y install postgresql-server-dev-9.5=9.5.2-3cdb3
  - sudo apt-get -y install postgresql-plpython-9.5=9.5.2-3cdb3
  - sudo apt-get -y install postgresql-9.5-postgis-scripts=2.2.2.0-cdb2
  - sudo apt-get -y install postgresql-9.5-postgis-2.2=2.2.2.0-cdb2

  # configure it to accept local connections from postgres
  - echo -e "# TYPE DATABASE USER ADDRESS METHOD \nlocal all postgres trust\nlocal all all trust\nhost all all 127.0.0.1/32 trust" \
    | sudo tee /etc/postgresql/9.5/main/pg_hba.conf
  - sudo /etc/init.d/postgresql restart 9.5

install:
  - sudo make install

script:
  - make test
  - make test || { cat src/pg/test/regression.diffs; false; }
  - ./check-compatibility.sh

after_failure:
  - pg_lsclusters
  - cat src/pg/test/regression.diffs
NEWS.md: 6 changes
@@ -1,9 +1,3 @@
0.8.2 (2019-02-07)
------------------
* Update dependencies to match what is being used in production.
* Update travis to xenial, PG10 and 11, and postgis 2.6
* Compatibility with PG11

0.8.1 (2018-03-12)
------------------
* Adds improperly added version files
@@ -1,20 +0,0 @@
{
    "name": "crankshaft",
    "current_version": {
        "requires": {
            "postgres": ">=9.5.0",
            "postgis": ">=2.2.0.0",
            "python": ">=2.7.0",
            "joblib": "0.8.3",
            "numpy": "1.6.1",
            "scipy": "0.14.0",
            "pysal": "1.14.3",
            "scikit-learn": "0.14.1"
        },
        "works_with": {
        }
    },

    "exceptional_versions": {
    }
}
@@ -27,7 +27,7 @@ psql -c "SELECT * FROM pg_available_extension_versions WHERE name LIKE 'cranksha
psql $DBNAME <<'EOF'
-- Install dependencies
CREATE EXTENSION plpythonu;
CREATE EXTENSION postgis;
CREATE EXTENSION postgis VERSION '2.2.2';

-- Create role publicuser if it does not exist
DO
@@ -48,49 +48,26 @@ CREATE EXTENSION crankshaft;
\dx
EOF


# Check PG version
PG_VERSION=`psql -q -t -c "SELECT current_setting('server_version_num')"`

# Save public function signatures
if [[ "$PG_VERSION" -lt 110000 ]]; then
    psql $DBNAME -c "
        CREATE TABLE release_function_signatures AS
        SELECT
            p.proname as name,
            pg_catalog.pg_get_function_result(p.oid) as result_type,
            pg_catalog.pg_get_function_arguments(p.oid) as arguments,
            CASE
                WHEN p.proisagg THEN 'agg'
                WHEN p.proiswindow THEN 'window'
                WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
                ELSE 'normal'
            END as type
        FROM pg_catalog.pg_proc p
        LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
        WHERE
            n.nspname = 'cdb_crankshaft'
            AND p.proname LIKE 'cdb_%'
        ORDER BY 1, 2, 4;"
else
    psql $DBNAME -c "
        CREATE TABLE release_function_signatures AS
        SELECT
            p.proname as name,
            pg_catalog.pg_get_function_result(p.oid) as result_type,
            pg_catalog.pg_get_function_arguments(p.oid) as arguments,
            CASE WHEN p.prokind = 'a' THEN 'agg'
                WHEN p.prokind = 'w' THEN 'window'
                WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
                ELSE 'normal'
            END as type
        FROM pg_catalog.pg_proc p
        LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
        WHERE
            n.nspname = 'cdb_crankshaft'
            AND p.proname LIKE 'cdb_%'
        ORDER BY 1, 2, 4;"
fi
psql $DBNAME <<'EOF'
CREATE TABLE release_function_signatures AS
SELECT
    p.proname as name,
    pg_catalog.pg_get_function_result(p.oid) as result_type,
    pg_catalog.pg_get_function_arguments(p.oid) as arguments,
    CASE
        WHEN p.proisagg THEN 'agg'
        WHEN p.proiswindow THEN 'window'
        WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
        ELSE 'normal'
    END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
    n.nspname = 'cdb_crankshaft'
    AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;
EOF

# Deploy current dev branch
make clean-dev || die "Could not clean dev files"
@@ -99,42 +76,26 @@ sudo make install || die "Could not deploy current dev branch"
# Check it can be upgraded
psql $DBNAME -c "ALTER EXTENSION crankshaft update to 'dev';" || die "Cannot upgrade to dev version"

if [[ $PG_VERSION -lt 110000 ]]; then
    psql $DBNAME -c "
        CREATE TABLE dev_function_signatures AS
        SELECT p.proname as name,
            pg_catalog.pg_get_function_result(p.oid) as result_type,
            pg_catalog.pg_get_function_arguments(p.oid) as arguments,
            CASE WHEN p.proisagg THEN 'agg'
                WHEN p.proiswindow THEN 'window'
                WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
                ELSE 'normal'
            END as type
        FROM pg_catalog.pg_proc p
        LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
        WHERE
            n.nspname = 'cdb_crankshaft'
            AND p.proname LIKE 'cdb_%'
        ORDER BY 1, 2, 4;"
else
    psql $DBNAME -c "
        CREATE TABLE dev_function_signatures AS
        SELECT p.proname as name,
            pg_catalog.pg_get_function_result(p.oid) as result_type,
            pg_catalog.pg_get_function_arguments(p.oid) as arguments,
            CASE WHEN p.prokind = 'a' THEN 'agg'
                WHEN p.prokind = 'w' THEN 'window'
                WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
                ELSE 'normal'
            END as type
        FROM pg_catalog.pg_proc p
        LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
        WHERE
            n.nspname = 'cdb_crankshaft'
            AND p.proname LIKE 'cdb_%'
        ORDER BY 1, 2, 4;"
fi

# Check against saved public function signatures
psql $DBNAME <<'EOF'
CREATE TABLE dev_function_signatures AS
SELECT
    p.proname as name,
    pg_catalog.pg_get_function_result(p.oid) as result_type,
    pg_catalog.pg_get_function_arguments(p.oid) as arguments,
    CASE
        WHEN p.proisagg THEN 'agg'
        WHEN p.proiswindow THEN 'window'
        WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
        ELSE 'normal'
    END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
    n.nspname = 'cdb_crankshaft'
    AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;
EOF

echo "Functions in development not in latest release (ok):"
psql $DBNAME -c "SELECT * FROM dev_function_signatures EXCEPT SELECT * FROM release_function_signatures;"
@@ -4,7 +4,7 @@
-- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$
    SELECT '0.8.2'::text;
    SELECT '0.9.0'::text;
$$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE;

-- Internal identifier of the installed extension instance
@@ -35,16 +35,25 @@ CREATE OR REPLACE FUNCTION
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;

-- Create aggregate if it did not exist
DO $$ BEGIN
    CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
        SFUNC = CDB_PyAggS,
        STYPE = Numeric[],
        PARALLEL = SAFE,
        INITCOND = "{}"
    );
EXCEPTION
    WHEN duplicate_function THEN NULL;
END $$;
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT *
        FROM pg_catalog.pg_proc p
        LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
        WHERE n.nspname = 'cdb_crankshaft'
            AND p.proname = 'cdb_pyagg'
            AND p.proisagg)
    THEN
        CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
            SFUNC = CDB_PyAggS,
            STYPE = Numeric[],
            PARALLEL = SAFE,
            INITCOND = "{}"
        );
    END IF;
END
$$ LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION
    CDB_CreateAndPredictSegment(
@@ -89,6 +98,7 @@ CREATE OR REPLACE FUNCTION
        query TEXT,
        variable_name TEXT,
        target_table TEXT,
        model_name text DEFAULT NULL,
        n_estimators INTEGER DEFAULT 1200,
        max_depth INTEGER DEFAULT 3,
        subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -105,24 +115,59 @@ AS $$
        'learning_rate': learning_rate,
        'min_samples_leaf': min_samples_leaf
    }
    feature_cols = set(plpy.execute('''
    all_cols = list(plpy.execute('''
        select * from ({query}) as _w limit 0
    '''.format(query=query)).colnames()) - set([variable_name, 'cartodb_id', ])
    '''.format(query=query)).colnames())
    feature_cols = [a for a in all_cols
                    if a not in [variable_name, 'cartodb_id', ]]
    return seg.create_and_predict_segment(
        query,
        variable_name,
        feature_cols,
        target_table,
        model_params
        model_params,
        model_name=model_name
    )
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;

CREATE OR REPLACE FUNCTION
    CDB_RetrieveModelParams(
        model_name text,
        param_name text
    )
RETURNS TABLE(param numeric, feature_name text) AS $$

import pickle
from collections import Iterable

plan = plpy.prepare('''
    SELECT model, feature_names FROM model_storage
    WHERE name = $1;
''', ['text', ])

try:
    model_encoded = plpy.execute(plan, [model_name, ])
except plpy.SPIError as err:
    plpy.error('ERROR: {}'.format(err))
plpy.notice(model_encoded[0]['feature_names'])
model = pickle.loads(
    model_encoded[0]['model']
)

res = getattr(model, param_name)
if not isinstance(res, Iterable):
    raise Exception('Cannot return `{}` as a table'.format(param_name))
return zip(res, model_encoded[0]['feature_names'])

$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;

CREATE OR REPLACE FUNCTION
    CDB_CreateAndPredictSegment(
        query TEXT,
        variable TEXT,
        feature_columns TEXT[],
        target_query TEXT,
        model_name TEXT DEFAULT NULL,
        n_estimators INTEGER DEFAULT 1200,
        max_depth INTEGER DEFAULT 3,
        subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -144,7 +189,8 @@ AS $$
        variable,
        feature_columns,
        target_query,
        model_params
        model_params,
        model_name=model_name
    )
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_Gravity(
@@ -1104,19 +1150,27 @@ BEGIN
END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;


-- Create aggregate if it did not exist
DO $$ BEGIN
    CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
        SFUNC = CDB_WeightedMeanS,
        FINALFUNC = CDB_WeightedMeanF,
        STYPE = Numeric[],
        PARALLEL = SAFE,
        INITCOND = "{0.0,0.0,0.0}"
    );
EXCEPTION
    WHEN duplicate_function THEN NULL;
END $$;
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT *
        FROM pg_catalog.pg_proc p
        LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
        WHERE n.nspname = 'cdb_crankshaft'
            AND p.proname = 'cdb_weightedmean'
            AND p.proisagg)
    THEN
        CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
            SFUNC = CDB_WeightedMeanS,
            FINALFUNC = CDB_WeightedMeanF,
            STYPE = Numeric[],
            PARALLEL = SAFE,
            INITCOND = "{0.0,0.0,0.0}"
        );
    END IF;
END
$$ LANGUAGE plpgsql;
-- Spatial Markov

-- input table format:
@@ -4,7 +4,7 @@
-- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$
    SELECT '0.8.2'::text;
    SELECT '0.9.1'::text;
$$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE;

-- Internal identifier of the installed extension instance
@@ -35,16 +35,25 @@ CREATE OR REPLACE FUNCTION
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;

-- Create aggregate if it did not exist
DO $$ BEGIN
    CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
        SFUNC = CDB_PyAggS,
        STYPE = Numeric[],
        PARALLEL = SAFE,
        INITCOND = "{}"
    );
EXCEPTION
    WHEN duplicate_function THEN NULL;
END $$;
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT *
        FROM pg_catalog.pg_proc p
        LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
        WHERE n.nspname = 'cdb_crankshaft'
            AND p.proname = 'cdb_pyagg'
            AND p.proisagg)
    THEN
        CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
            SFUNC = CDB_PyAggS,
            STYPE = Numeric[],
            PARALLEL = SAFE,
            INITCOND = "{}"
        );
    END IF;
END
$$ LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION
    CDB_CreateAndPredictSegment(
@@ -89,6 +98,7 @@ CREATE OR REPLACE FUNCTION
        query TEXT,
        variable_name TEXT,
        target_table TEXT,
        model_name text DEFAULT NULL,
        n_estimators INTEGER DEFAULT 1200,
        max_depth INTEGER DEFAULT 3,
        subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -105,24 +115,59 @@ AS $$
        'learning_rate': learning_rate,
        'min_samples_leaf': min_samples_leaf
    }
    feature_cols = set(plpy.execute('''
    all_cols = list(plpy.execute('''
        select * from ({query}) as _w limit 0
    '''.format(query=query)).colnames()) - set([variable_name, 'cartodb_id', ])
    '''.format(query=query)).colnames())
    feature_cols = [a for a in all_cols
                    if a not in [variable_name, 'cartodb_id', ]]
    return seg.create_and_predict_segment(
        query,
        variable_name,
        feature_cols,
        target_table,
        model_params
        model_params,
        model_name=model_name
    )
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;

CREATE OR REPLACE FUNCTION
    CDB_RetrieveModelParams(
        model_name text,
        param_name text
    )
RETURNS TABLE(param numeric, feature_name text) AS $$

import pickle
from collections import Iterable

plan = plpy.prepare('''
    SELECT model, feature_names FROM model_storage
    WHERE name = $1;
''', ['text', ])

try:
    model_encoded = plpy.execute(plan, [model_name, ])
except plpy.SPIError as err:
    plpy.error('ERROR: {}'.format(err))
plpy.notice(model_encoded[0]['feature_names'])
model = pickle.loads(
    model_encoded[0]['model']
)

res = getattr(model, param_name)
if not isinstance(res, Iterable):
    raise Exception('Cannot return `{}` as a table'.format(param_name))
return zip(res, model_encoded[0]['feature_names'])

$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;

CREATE OR REPLACE FUNCTION
    CDB_CreateAndPredictSegment(
        query TEXT,
        variable TEXT,
        feature_columns TEXT[],
        target_query TEXT,
        model_name TEXT DEFAULT NULL,
        n_estimators INTEGER DEFAULT 1200,
        max_depth INTEGER DEFAULT 3,
        subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -144,7 +189,8 @@ AS $$
        variable,
        feature_columns,
        target_query,
        model_params
        model_params,
        model_name=model_name
    )
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_Gravity(
@@ -1104,19 +1150,27 @@ BEGIN
END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;


-- Create aggregate if it did not exist
DO $$ BEGIN
    CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
        SFUNC = CDB_WeightedMeanS,
        FINALFUNC = CDB_WeightedMeanF,
        STYPE = Numeric[],
        PARALLEL = SAFE,
        INITCOND = "{0.0,0.0,0.0}"
    );
EXCEPTION
    WHEN duplicate_function THEN NULL;
END $$;
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT *
        FROM pg_catalog.pg_proc p
        LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
        WHERE n.nspname = 'cdb_crankshaft'
            AND p.proname = 'cdb_weightedmean'
            AND p.proisagg)
    THEN
        CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
            SFUNC = CDB_WeightedMeanS,
            FINALFUNC = CDB_WeightedMeanF,
            STYPE = Numeric[],
            PARALLEL = SAFE,
            INITCOND = "{0.0,0.0,0.0}"
        );
    END IF;
END
$$ LANGUAGE plpgsql;
-- Spatial Markov

-- input table format:
release/crankshaft--0.9.0.sql: 2393 changes, new file (diff suppressed because it is too large)
release/crankshaft--0.9.1--0.9.2.sql: 2393 changes, new file (diff suppressed because it is too large)
release/crankshaft--0.9.1.sql: 2393 changes, new file (diff suppressed because it is too large)
release/crankshaft--0.9.2--0.9.3.sql: 2393 changes, new file (diff suppressed because it is too large)
release/crankshaft--0.9.2.sql: 2393 changes, new file (diff suppressed because it is too large)
release/crankshaft--0.9.3--0.9.4.sql: 2421 changes, new file (diff suppressed because it is too large)
release/crankshaft--0.9.3.sql: 2393 changes, new file (diff suppressed because it is too large)
release/crankshaft--0.9.4--0.9.5.sql: 2421 changes, new file (diff suppressed because it is too large)
release/crankshaft--0.9.4.sql: 2421 changes, new file (diff suppressed because it is too large)
release/crankshaft--0.9.5.sql: 2421 changes, new file (diff suppressed because it is too large)
@@ -1,5 +1,5 @@
comment = 'CartoDB Spatial Analysis extension'
default_version = '0.8.2'
default_version = '0.9.5'
requires = 'plpythonu, postgis'
superuser = true
schema = cdb_crankshaft
Binary file not shown.
@@ -1,5 +0,0 @@
joblib==0.9.4
numpy==1.11.0
scipy==0.17.0
pysal==1.14.3
scikit-learn==0.17.0
@@ -0,0 +1,76 @@
"""
Based on the Weiszfeld algorithm:
https://en.wikipedia.org/wiki/Geometric_median
"""


# import plpy
import numpy as np
from numpy.linalg import norm


def median_center(tablename, geom_col, num_iters=50, tolerance=0.001):

    query = '''
        SELECT array_agg(ST_X({geom_col})) As x_coords,
               array_agg(ST_Y({geom_col})) As y_coords
        FROM {tablename}
    '''.format(geom_col=geom_col, tablename=tablename)

    try:
        resp = plpy.execute(query)
        data = np.vstack((resp['x_coords'][0],
                          resp['y_coords'][0])).T

        plpy.notice('coords: %s' % str(data))
    except Exception, err:
        # plpy.error('Analysis failed: %s' % err)
        print('No plpy')
        data = np.array([[1.2 * np.random.random() + 10.,
                          1.1 * (np.random.random() - 1.) + 3.]
                         for i in range(1, 100)])

    # initialize 'median center' to be the mean
    coords_center_temp = data.mean(axis=0)

    # plpy.notice('temp_center: %s' % str(coords_center_temp))
    print('temp_center: %s' % str(coords_center_temp))

    for i in range(0, num_iters):
        old_coords_center = coords_center_temp.copy()
        denom = denominator(coords_center_temp, data)
        coords_center_temp = np.sum([data[j] * numerator(coords_center_temp,
                                                         data[j])
                                     for j in range(len(data))], axis=0)
        coords_center_temp = coords_center_temp / denom

        print("Pass #%d" % i)
        print("max, min of data: %0.4f, %0.4f" % (data.max(), data.min()))
        print('temp_center: %s' % str(coords_center_temp))
        print("Change in center: %0.4f" % np.linalg.norm(old_coords_center -
                                                         coords_center_temp))
        print("Center coords: %s" % str(coords_center_temp))
        print("Objective Function: %0.4f" % obj_func(coords_center_temp, data))

    return coords_center_temp


def obj_func(center_coords, data):
    """
    Norm of the residuals between the candidate center and the data.
    """
    return np.linalg.norm(center_coords - data)


def numerator(center_coords, data_i):
    """
    Reciprocal of the distance from the center to a single point.
    """
    return np.reciprocal(np.linalg.norm(center_coords - data_i))


def denominator(center_coords, data):
    """
    Reciprocal of the norm of all center-to-data residuals.
    """
    return np.reciprocal(np.linalg.norm(data - center_coords))
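One caveat on the new module above: numerator() and denominator() collapse all residuals through a single matrix norm, whereas the textbook Weiszfeld update weights each point by the reciprocal of its own distance to the current center. A minimal reference sketch of that update (a hypothetical helper, not part of this diff):

    import numpy as np

    def weiszfeld_demo(points, num_iters=50, tolerance=1e-6):
        """Geometric median of an (n, 2) array via plain Weiszfeld iterations."""
        center = points.mean(axis=0)  # start from the centroid, as above
        for _ in range(num_iters):
            dists = np.linalg.norm(points - center, axis=1)
            dists = np.where(dists == 0, 1e-12, dists)  # guard against division by zero
            weights = 1.0 / dists
            new_center = (points * weights[:, None]).sum(axis=0) / weights.sum()
            if np.linalg.norm(new_center - center) < tolerance:
                break
            center = new_center
        return center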
@@ -0,0 +1 @@
from core import set_model, get_model, create_model_table
@@ -0,0 +1,86 @@
import time
import plpy
import pickle
from petname import generate


def create_model_table():
    q = '''
        create table if not exists model_storage(
            description text,
            name text unique,
            model bytea,
            feature_names text[],
            date_created timestamptz,
            id serial primary key);
    '''
    plpy.notice(q)
    plan = plpy.prepare(q)
    resp = plpy.execute(plan)
    plpy.notice('Model table successfully created')
    plpy.notice(str(resp))


def get_model(model_name):
    """retrieve model if it exists"""
    try:
        plan = plpy.prepare('''
            SELECT model FROM model_storage
            WHERE name = $1;
        ''', ['text', ])
        model_encoded = plpy.execute(plan, [model_name, ])
        if len(model_encoded) == 1:
            model = pickle.loads(
                model_encoded[0]['model']
            )
            plpy.notice('Model successfully loaded')
        else:
            plpy.notice('Model not found, or too many models '
                        '({})'.format(len(model_encoded)))
            model = None
    except plpy.SPIError as err:
        plpy.error('ERROR: {}'.format(err))

    return model


def set_model(model, model_name, feature_names):
    """stores the model in the table model_storage"""
    if model_name is None:
        model_name = generate(words=2, separator='_', letters=8)
        existing_names = plpy.execute('''
            SELECT array_agg(name) as name
            FROM model_storage
        ''')
        plpy.notice('nrows: {}'.format(existing_names.nrows()))
        plpy.notice('MODEL NAME: {}'.format(model_name))
        plpy.notice('LEN of ms: {}'.format(len(existing_names)))
        plpy.notice('existing_names: {}'.format(str(existing_names)))
        plpy.notice('existing_names: {}'.format(str(existing_names[0]['name'])))
        plpy.notice('type existing_names: {}'.format(type(existing_names[0]['name'])))
        if existing_names[0]['name'] is not None:
            while model_name in existing_names[0]['name']:
                model_name = generate(words=2, separator='_', letters=10)
                plpy.notice(model_name)

    # store model
    try:
        plan = plpy.prepare('''
            INSERT INTO model_storage(description, name, model, feature_names, date_created)
            VALUES (
                $1,
                $2,
                $3,
                $4::text[],
                to_timestamp($5));
        ''', ['text', 'text', 'bytea', 'text', 'numeric'])
        plpy.notice('{%s}' % ','.join(feature_names))
        plpy.notice(feature_names)
        plpy.execute(
            plan,
            [' '.join(m.strip() for m in model.__repr__().split('\n')),
             model_name,
             pickle.dumps(model),
             '{%s}' % ','.join(feature_names),
             time.time()]
        )
        plpy.notice('model successfully stored as {}'.format(model_name))
    except plpy.SPIError as err:
        plpy.notice('ERROR: {}\nt: {}'.format(err, time.time()))
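For context on how the new model_storage helpers are meant to be called, here is a hypothetical round-trip. It assumes execution inside PL/Python (core.py imports plpy, which only exists inside the database) and uses made-up model and feature names:

    import numpy as np
    from sklearn.ensemble import GradientBoostingRegressor
    from crankshaft import model_storage

    features = np.array([[0.0], [1.0], [2.0], [3.0]])  # toy training data
    target = np.array([0.0, 1.1, 1.9, 3.2])

    model_storage.create_model_table()  # idempotent: create table if not exists
    model = GradientBoostingRegressor(n_estimators=10).fit(features, target)
    model_storage.set_model(model, 'demo_model', ['col1'])  # pickled into a bytea column
    restored = model_storage.get_model('demo_model')  # None when the name is absent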
@@ -2,11 +2,14 @@
Segmentation creation and prediction
"""

import pickle
import plpy
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from crankshaft.analysis_data_provider import AnalysisDataProvider
from crankshaft import model_storage

# NOTE: added optional param here

@@ -51,6 +54,7 @@ class Segmentation(object):

    def create_and_predict_segment(self, query, variable, feature_columns,
                                   target_query, model_params,
                                   model_name=None,
                                   id_col='cartodb_id'):
        """
        generate a segment with machine learning
@@ -70,15 +74,23 @@ class Segmentation(object):
        (target, features, target_mean,
         feature_means) = self.clean_data(query, variable, feature_columns)

        model, accuracy = train_model(target, features, model_params, 0.2)
        model_storage.create_model_table()

        # find model if it exists and is specified
        if model_name is not None:
            model = model_storage.get_model(model_name)

        if locals().get('model') is None:
            model, accuracy = train_model(target, features, model_params, 0.2)

        result = self.predict_segment(model, feature_columns, target_query,
                                      feature_means)
        accuracy_array = [accuracy] * result.shape[0]

        rowid = self.data_provider.get_segmentation_data(params)
        '''
        rowid = [{'ids': [2.9, 4.9, 4, 5, 6]}]
        '''

        # store the model for later use
        model_storage.set_model(model, model_name, feature_columns)
        return zip(rowid[0]['ids'], result, accuracy_array)

    def predict_segment(self, model, feature_columns, target_query,
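The hunk above is the heart of the branch: look up a stored model by name, train only when the lookup fails, then persist the result. A condensed sketch of that pattern (hypothetical helper; note that, as in the diff, accuracy stays unset when a cached model is reused):

    def get_or_train(model_name, target, features, feature_columns, model_params):
        # model_storage and train_model are the module-level names used above
        model, accuracy = None, None
        if model_name is not None:
            model = model_storage.get_model(model_name)  # cached lookup by name
        if model is None:
            model, accuracy = train_model(target, features, model_params, 0.2)
        model_storage.set_model(model, model_name, feature_columns)  # persist for reuse
        return model, accuracy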
release/python/0.9.0/crankshaft/requirements.txt: 5 changes (new file)
@@ -0,0 +1,5 @@
joblib==0.8.3
numpy==1.6.1
scipy==0.14.0
pysal==1.14.3
scikit-learn==0.14.1
@@ -10,7 +10,7 @@ from setuptools import setup, find_packages
setup(
    name='crankshaft',

    version='0.8.2',
    version='0.0.0',

    description='CartoDB Spatial Analysis Python Library',

@@ -41,7 +41,7 @@ setup(
    # The choice of component versions is dictated by what's
    # provisioned in the production servers.
    # IMPORTANT NOTE: please don't change this line. Instead issue a ticket to systems for evaluation.
    install_requires=['joblib==0.9.4', 'numpy==1.11.0', 'scipy==0.17.0', 'pysal==1.14.3', 'scikit-learn==0.17.0'],
    install_requires=['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.14.3', 'scikit-learn==0.14.1', 'petname==2.2'],

    requires=['pysal', 'numpy', 'sklearn'],
release/python/0.9.0/crankshaft/setup.py-r: 49 changes (new file)
@@ -0,0 +1,49 @@

"""
CartoDB Spatial Analysis Python Library
See:
https://github.com/CartoDB/crankshaft
"""

from setuptools import setup, find_packages

setup(
    name='crankshaft',

    version='0.0.0',

    description='CartoDB Spatial Analysis Python Library',

    url='https://github.com/CartoDB/crankshaft',

    author='Data Services Team - CartoDB',
    author_email='dataservices@cartodb.com',

    license='MIT',

    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Mapping community',
        'Topic :: Maps :: Mapping Tools',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 2.7',
    ],

    keywords='maps mapping tools spatial analysis geostatistics',

    packages=find_packages(exclude=['contrib', 'docs', 'tests']),

    extras_require={
        'dev': ['unittest'],
        'test': ['unittest', 'nose', 'mock'],
    },

    # The choice of component versions is dictated by what's
    # provisioned in the production servers.
    # IMPORTANT NOTE: please don't change this line. Instead issue a ticket to systems for evaluation.
    install_requires=['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.14.3', 'scikit-learn==0.14.1', 'petname==2.2'],

    requires=['pysal', 'numpy', 'sklearn'],

    test_suite='test'
)
release/python/0.9.0/crankshaft/test/fixtures/optim.json (vendored): 6 changes (new file)
@@ -0,0 +1,6 @@
{
    "production_col": [10, 10, 10],
    "capacity_col": [0.09, 0.31],
    "marginal_col": [5, 5],
    "pairwise": [[1, 2, 3], [3, 2, 1]]
}
release/python/0.9.0/crankshaft/tools/setup.py: 15 changes (new file)
@@ -0,0 +1,15 @@
from test.helper import plpy, fixture_file
from crankshaft.analysis_data_provider import AnalysisDataProvider
import json
import crankshaft

class RawDataProvider(AnalysisDataProvider):
    def __init__(self, fixturedata):
        self.your_algo_data = fixturedata

    def get_moran(self, params):
        """
        Replace this function name with the one used in your algorithm,
        and make sure to use the same function signature that is written
        for this algo in analysis_data_provider.py
        """
        return self.your_algo_data
release/python/0.9.1/crankshaft/crankshaft/__init__.py: 7 changes (new file)
@@ -0,0 +1,7 @@
"""Import all modules"""
import crankshaft.random_seeds
import crankshaft.clustering
import crankshaft.space_time_dynamics
import crankshaft.segmentation
import crankshaft.regression
import analysis_data_provider
@@ -0,0 +1,149 @@
"""class for fetching data"""
import plpy
import pysal_utils as pu

NULL_VALUE_ERROR = ('No usable data passed to analysis. Check your input rows '
                    'for null values and fill in appropriately.')


def verify_data(func):
    """decorator to verify data result before returning to algorithm"""
    def wrapper(*args, **kwargs):
        """Error checking"""
        try:
            data = func(*args, **kwargs)
            if not data:
                plpy.error(NULL_VALUE_ERROR)
            else:
                return data
        except plpy.SPIError as err:
            plpy.error('Analysis failed: {}'.format(err))

        return []

    return wrapper


class AnalysisDataProvider(object):
    """Data fetching class for pl/python functions"""
    @verify_data
    def get_getis(self, w_type, params):  # pylint: disable=no-self-use
        """fetch data for getis ord's g"""
        query = pu.construct_neighbor_query(w_type, params)
        return plpy.execute(query)

    @verify_data
    def get_markov(self, w_type, params):  # pylint: disable=no-self-use
        """fetch data for spatial markov"""
        query = pu.construct_neighbor_query(w_type, params)
        return plpy.execute(query)

    @verify_data
    def get_moran(self, w_type, params):  # pylint: disable=no-self-use
        """fetch data for moran's i analyses"""
        query = pu.construct_neighbor_query(w_type, params)
        return plpy.execute(query)

    @verify_data
    def get_nonspatial_kmeans(self, params):  # pylint: disable=no-self-use
        """
        Fetch data for non-spatial k-means.

        Inputs - a dict (params) with the following keys:
            colnames: a (text) list of column names (e.g.,
                      `['andy', 'cookie']`)
            id_col: the name of the id column (e.g., `'cartodb_id'`)
            subquery: the subquery for exposing the data (e.g.,
                      SELECT * FROM favorite_things)
        Output:
            A SQL query for packaging the data for consumption within
            `KMeans().nonspatial`. Format will be a list of length one,
            with the first element a dict with keys ('rowid', 'attr1',
            'attr2', ...)
        """
        agg_cols = ', '.join([
            'array_agg({0}) As arr_col{1}'.format(val, idx+1)
            for idx, val in enumerate(params['colnames'])
        ])
        query = '''
            SELECT {cols}, array_agg({id_col}) As rowid
            FROM ({subquery}) As a
        '''.format(subquery=params['subquery'],
                   id_col=params['id_col'],
                   cols=agg_cols).strip()
        return plpy.execute(query)

    @verify_data
    def get_segmentation_model_data(self, params):  # pylint: disable=R0201
        """
        fetch data for Segmentation
        params = {"subquery": query,
                  "target": variable,
                  "features": feature_columns}
        """
        columns = ', '.join(['array_agg("{col}") As "{col}"'.format(col=col)
                             for col in params['features']])
        query = '''
            SELECT
                array_agg("{target}") As target,
                {columns}
            FROM ({subquery}) As q
        '''.format(subquery=params['subquery'],
                   target=params['target'],
                   columns=columns)
        return plpy.execute(query)

    @verify_data
    def get_segmentation_data(self, params):  # pylint: disable=no-self-use
        """
        params = {"subquery": target_query,
                  "id_col": id_col}
        """
        query = '''
            SELECT
                array_agg("{id_col}" ORDER BY "{id_col}") as "ids"
            FROM ({subquery}) as q
        '''.format(**params)
        return plpy.execute(query)

    @verify_data
    def get_segmentation_predict_data(self, params):  # pylint: disable=R0201
        """
        fetch data for Segmentation
        params = {"subquery": target_query,
                  "feature_columns": feature_columns}
        """
        joined_features = ', '.join(['"{}"::numeric'.format(a)
                                     for a in params['feature_columns']])
        query = '''
            SELECT
                Array[{joined_features}] As features
            FROM ({subquery}) as q
        '''.format(subquery=params['subquery'],
                   joined_features=joined_features)
        return plpy.cursor(query)

    @verify_data
    def get_spatial_kmeans(self, params):  # pylint: disable=no-self-use
        """fetch data for spatial kmeans"""
        query = '''
            SELECT
                array_agg("{id_col}" ORDER BY "{id_col}") as ids,
                array_agg(ST_X("{geom_col}") ORDER BY "{id_col}") As xs,
                array_agg(ST_Y("{geom_col}") ORDER BY "{id_col}") As ys
            FROM ({subquery}) As a
            WHERE "{geom_col}" IS NOT NULL
        '''.format(**params)
        return plpy.execute(query)

    @verify_data
    def get_gwr(self, params):  # pylint: disable=no-self-use
        """fetch data for gwr analysis"""
        query = pu.gwr_query(params)
        return plpy.execute(query)

    @verify_data
    def get_gwr_predict(self, params):  # pylint: disable=no-self-use
        """fetch data for gwr predict"""
        query = pu.gwr_predict_query(params)
        return plpy.execute(query)
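To make the get_nonspatial_kmeans packaging concrete, this snippet rebuilds the query string the method above generates for hypothetical params (pure string work, runnable outside the database):

    params = {'colnames': ['sepal_width', 'petal_width'],
              'id_col': 'cartodb_id',
              'subquery': 'SELECT * FROM iris_flower_data'}
    agg_cols = ', '.join(['array_agg({0}) As arr_col{1}'.format(val, idx + 1)
                          for idx, val in enumerate(params['colnames'])])
    query = '''
        SELECT {cols}, array_agg({id_col}) As rowid
        FROM ({subquery}) As a
    '''.format(subquery=params['subquery'], id_col=params['id_col'],
               cols=agg_cols).strip()
    print(query)  # a single row of arr_col1, arr_col2, and rowid arrays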
@@ -0,0 +1,76 @@
"""
Based on the Weiszfeld algorithm:
https://en.wikipedia.org/wiki/Geometric_median
"""


# import plpy
import numpy as np
from numpy.linalg import norm


def median_center(tablename, geom_col, num_iters=50, tolerance=0.001):

    query = '''
        SELECT array_agg(ST_X({geom_col})) As x_coords,
               array_agg(ST_Y({geom_col})) As y_coords
        FROM {tablename}
    '''.format(geom_col=geom_col, tablename=tablename)

    try:
        resp = plpy.execute(query)
        data = np.vstack((resp['x_coords'][0],
                          resp['y_coords'][0])).T

        plpy.notice('coords: %s' % str(data))
    except Exception, err:
        # plpy.error('Analysis failed: %s' % err)
        print('No plpy')
        data = np.array([[1.2 * np.random.random() + 10.,
                          1.1 * (np.random.random() - 1.) + 3.]
                         for i in range(1, 100)])

    # initialize 'median center' to be the mean
    coords_center_temp = data.mean(axis=0)

    # plpy.notice('temp_center: %s' % str(coords_center_temp))
    print('temp_center: %s' % str(coords_center_temp))

    for i in range(0, num_iters):
        old_coords_center = coords_center_temp.copy()
        denom = denominator(coords_center_temp, data)
        coords_center_temp = np.sum([data[j] * numerator(coords_center_temp,
                                                         data[j])
                                     for j in range(len(data))], axis=0)
        coords_center_temp = coords_center_temp / denom

        print("Pass #%d" % i)
        print("max, min of data: %0.4f, %0.4f" % (data.max(), data.min()))
        print('temp_center: %s' % str(coords_center_temp))
        print("Change in center: %0.4f" % np.linalg.norm(old_coords_center -
                                                         coords_center_temp))
        print("Center coords: %s" % str(coords_center_temp))
        print("Objective Function: %0.4f" % obj_func(coords_center_temp, data))

    return coords_center_temp


def obj_func(center_coords, data):
    """
    Norm of the residuals between the candidate center and the data.
    """
    return np.linalg.norm(center_coords - data)


def numerator(center_coords, data_i):
    """
    Reciprocal of the distance from the center to a single point.
    """
    return np.reciprocal(np.linalg.norm(center_coords - data_i))


def denominator(center_coords, data):
    """
    Reciprocal of the norm of all center-to-data residuals.
    """
    return np.reciprocal(np.linalg.norm(data - center_coords))
@@ -0,0 +1,4 @@
"""Import all functions for clustering"""
from moran import *
from kmeans import *
from getis import *
@@ -0,0 +1,50 @@
"""
Getis-Ord's G geostatistics (hotspot/coldspot analysis)
"""

import pysal as ps
from collections import OrderedDict

# crankshaft modules
import crankshaft.pysal_utils as pu
from crankshaft.analysis_data_provider import AnalysisDataProvider

# High level interface ---------------------------------------


class Getis(object):
    def __init__(self, data_provider=None):
        if data_provider is None:
            self.data_provider = AnalysisDataProvider()
        else:
            self.data_provider = data_provider

    def getis_ord(self, subquery, attr,
                  w_type, num_ngbrs, permutations, geom_col, id_col):
        """
        Getis-Ord's G*
        Implementation building neighbors with a PostGIS database and PySAL's
        Getis-Ord's G* hotspot/coldspot module.
        Andy Eschbacher
        """

        # geometries with attributes that are null are ignored
        # resulting in a collection of not as near neighbors if kNN is chosen

        params = OrderedDict([("id_col", id_col),
                              ("attr1", attr),
                              ("geom_col", geom_col),
                              ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_getis(w_type, params)
        attr_vals = pu.get_attributes(result)

        # build PySAL weight object
        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate Getis-Ord's G* z- and p-values
        getis = ps.esda.getisord.G_Local(attr_vals, weight,
                                         star=True, permutations=permutations)

        return zip(getis.z_sim, getis.p_sim, getis.p_z_sim, weight.id_order)
release/python/0.9.1/crankshaft/crankshaft/clustering/kmeans.py: 113 changes (new file)
@@ -0,0 +1,113 @@
from sklearn.cluster import KMeans
import numpy as np

from crankshaft.analysis_data_provider import AnalysisDataProvider


class Kmeans(object):
    def __init__(self, data_provider=None):
        if data_provider is None:
            self.data_provider = AnalysisDataProvider()
        else:
            self.data_provider = data_provider

    def spatial(self, query, no_clusters, no_init=20):
        """
        find centers based on clusters of latitude/longitude pairs
        query: SQL query that has a WGS84 geometry (the_geom)
        """
        params = {"subquery": query,
                  "geom_col": "the_geom",
                  "id_col": "cartodb_id"}

        result = self.data_provider.get_spatial_kmeans(params)

        # Unpack query response
        xs = result[0]['xs']
        ys = result[0]['ys']
        ids = result[0]['ids']

        km = KMeans(n_clusters=no_clusters, n_init=no_init)
        labels = km.fit_predict(zip(xs, ys))
        return zip(ids, labels)

    def nonspatial(self, subquery, colnames, no_clusters=5,
                   standardize=True, id_col='cartodb_id'):
        """
        Arguments:
            query (string): A SQL query to retrieve the data required to do the
                k-means clustering analysis, like so:
                SELECT * FROM iris_flower_data
            colnames (list): a list of the column names which contain the data
                of interest, like so: ['sepal_width',
                                       'petal_width',
                                       'sepal_length',
                                       'petal_length']
            no_clusters (int): number of clusters (greater than zero)
            id_col (string): name of the input id_column

        Returns:
            A list of tuples with the following columns:
                cluster labels: a label for the cluster that the row belongs to
                centers: center of the cluster that this row belongs to
                silhouettes: silhouette measure for this value
                inertia: k-means inertia of the fit (repeated for each row)
                rowid: row that these values belong to (corresponds to the
                    value in `id_col`)
        """
        import json
        from sklearn import metrics

        params = {
            "colnames": colnames,
            "subquery": subquery,
            "id_col": id_col
        }

        data = self.data_provider.get_nonspatial_kmeans(params)

        # fill array with values for k-means clustering
        if standardize:
            cluster_columns = _scale_data(
                _extract_columns(data))
        else:
            cluster_columns = _extract_columns(data)

        kmeans = KMeans(n_clusters=no_clusters,
                        random_state=0).fit(cluster_columns)

        centers = [json.dumps(dict(zip(colnames, c)))
                   for c in kmeans.cluster_centers_[kmeans.labels_]]

        silhouettes = metrics.silhouette_samples(cluster_columns,
                                                 kmeans.labels_,
                                                 metric='sqeuclidean')

        return zip(kmeans.labels_,
                   centers,
                   silhouettes,
                   [kmeans.inertia_] * kmeans.labels_.shape[0],
                   data[0]['rowid'])


# -- Preprocessing steps

def _extract_columns(data):
    """
    Extract the features from the query and pack them into a NumPy array
    data (list of dicts): result of the kmeans request
    """
    # number of columns minus rowid column
    n_cols = len(data[0]) - 1
    return np.array([data[0]['arr_col{0}'.format(i+1)]
                     for i in xrange(n_cols)],
                    dtype=float).T


def _scale_data(features):
    """
    Scale all input columns to center on 0 with a standard deviation of 1
    features (numpy matrix): features of dimension (n_features, n_samples)
    """
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    return scaler.fit_transform(features)
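A usage sketch for the class above, with made-up table and column names; inside the database the default data provider routes the query through plpy:

    from crankshaft.clustering import Kmeans

    km = Kmeans()  # defaults to the plpy-backed AnalysisDataProvider
    rows = km.nonspatial(subquery='SELECT * FROM iris_flower_data',
                         colnames=['sepal_width', 'petal_width'],
                         no_clusters=3)
    for label, center, silhouette, inertia, rowid in rows:
        pass  # one tuple per input row, matching the Returns section above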
release/python/0.9.1/crankshaft/crankshaft/clustering/moran.py: 341 changes (new file)
@@ -0,0 +1,341 @@
|
||||
"""
|
||||
Moran's I geostatistics (global clustering & outliers presence)
|
||||
Functionality relies on a combination of `PySAL
|
||||
<http://pysal.readthedocs.io/en/latest/>`__ and the data providered provided in
|
||||
the class instantiation (which defaults to PostgreSQL's plpy module's `database
|
||||
access functions <https://www.postgresql.org/docs/10/static/plpython.html>`__).
|
||||
"""
|
||||
|
||||
from collections import OrderedDict
|
||||
import pysal as ps
|
||||
|
||||
# crankshaft module
|
||||
import crankshaft.pysal_utils as pu
|
||||
from crankshaft.analysis_data_provider import AnalysisDataProvider
|
||||
|
||||
# High level interface ---------------------------------------
|
||||
|
||||
|
||||
class Moran(object):
|
||||
"""Class for calculation of Moran's I statistics (global, local, and local
|
||||
rate)
|
||||
|
||||
Parameters:
|
||||
data_provider (:obj:`AnalysisDataProvider`): Class for fetching data. See
|
||||
the `crankshaft.analysis_data_provider` module for more information.
|
||||
"""
|
||||
def __init__(self, data_provider=None):
|
||||
if data_provider is None:
|
||||
self.data_provider = AnalysisDataProvider()
|
||||
else:
|
||||
self.data_provider = data_provider
|
||||
|
||||
def global_stat(self, subquery, attr_name,
|
||||
w_type, num_ngbrs, permutations, geom_col, id_col):
|
||||
"""
|
||||
Moran's I (global)
|
||||
Implementation building neighbors with a PostGIS database and Moran's I
|
||||
core clusters with PySAL.
|
||||
|
||||
Args:
|
||||
|
||||
subquery (str): Query to give access to the data needed. This query
|
||||
must give access to ``attr_name``, ``geom_col``, and ``id_col``.
|
||||
attr_name (str): Column name of data to analyze
|
||||
w_type (str): Type of spatial weight. Must be one of `knn`
|
||||
or `queen`. See `PySAL documentation
|
||||
<http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
|
||||
for more information.
|
||||
num_ngbrs (int): If using `knn` for ``w_type``, this
|
||||
specifies the number of neighbors to be used to define the spatial
|
||||
neighborhoods.
|
||||
permutations (int): Number of permutations for performing
|
||||
conditional randomization to find the p-value. Higher numbers
|
||||
takes a longer time for getting results.
|
||||
geom_col (str): Name of the geometry column in the dataset for
|
||||
finding the spatial neighborhoods.
|
||||
id_col (str): Row index for each value. Usually the database index.
|
||||
|
||||
"""
|
||||
params = OrderedDict([("id_col", id_col),
|
||||
("attr1", attr_name),
|
||||
("geom_col", geom_col),
|
||||
("subquery", subquery),
|
||||
("num_ngbrs", num_ngbrs)])
|
||||
|
||||
result = self.data_provider.get_moran(w_type, params)
|
||||
|
||||
# collect attributes
|
||||
attr_vals = pu.get_attributes(result)
|
||||
|
||||
# calculate weights
|
||||
weight = pu.get_weight(result, w_type, num_ngbrs)
|
||||
|
||||
# calculate moran global
|
||||
moran_global = ps.esda.moran.Moran(attr_vals, weight,
|
||||
permutations=permutations)
|
||||
|
||||
return zip([moran_global.I], [moran_global.EI])
|
||||
|
||||
def local_stat(self, subquery, attr,
|
||||
w_type, num_ngbrs, permutations, geom_col, id_col):
|
||||
"""
|
||||
Moran's I (local)
|
||||
|
||||
Args:
|
||||
|
||||
subquery (str): Query to give access to the data needed. This query
|
||||
must give access to ``attr_name``, ``geom_col``, and ``id_col``.
|
||||
attr (str): Column name of data to analyze
|
||||
w_type (str): Type of spatial weight. Must be one of `knn`
|
||||
or `queen`. See `PySAL documentation
|
||||
<http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
|
||||
for more information.
|
||||
num_ngbrs (int): If using `knn` for ``w_type``, this
|
||||
specifies the number of neighbors to be used to define the spatial
|
||||
neighborhoods.
|
||||
permutations (int): Number of permutations for performing
|
||||
conditional randomization to find the p-value. Higher numbers
|
||||
takes a longer time for getting results.
|
||||
geom_col (str): Name of the geometry column in the dataset for
|
||||
finding the spatial neighborhoods.
|
||||
id_col (str): Row index for each value. Usually the database index.
|
||||
|
||||
Returns:
|
||||
list of tuples: Where each tuple consists of the following values:
|
||||
- quadrants classification (one of `HH`, `HL`, `LL`, or `LH`)
|
||||
- p-value
|
||||
- spatial lag
|
||||
- standardized spatial lag (centered on the mean, normalized by the
|
||||
standard deviation)
|
||||
- original value
|
||||
- standardized value
|
||||
- Moran's I statistic
|
||||
- original row index
|
||||
"""
|
||||
|
||||
# geometries with attributes that are null are ignored
|
||||
# resulting in a collection of not as near neighbors
|
||||
|
||||
params = OrderedDict([("id_col", id_col),
|
||||
("attr1", attr),
|
||||
("geom_col", geom_col),
|
||||
("subquery", subquery),
|
||||
("num_ngbrs", num_ngbrs)])
|
||||
|
||||
result = self.data_provider.get_moran(w_type, params)
|
||||
|
||||
attr_vals = pu.get_attributes(result)
|
||||
weight = pu.get_weight(result, w_type, num_ngbrs)
|
||||
|
||||
# calculate LISA values
|
||||
lisa = ps.esda.moran.Moran_Local(attr_vals, weight,
|
||||
permutations=permutations)
|
||||
|
||||
# find quadrants for each geometry
|
||||
quads = quad_position(lisa.q)
|
||||
|
||||
# calculate spatial lag
|
||||
lag = ps.weights.spatial_lag.lag_spatial(weight, lisa.y)
|
||||
lag_std = ps.weights.spatial_lag.lag_spatial(weight, lisa.z)
|
||||
|
||||
return zip(
|
||||
quads,
|
||||
lisa.p_sim,
|
||||
lag,
|
||||
lag_std,
|
||||
lisa.y,
|
||||
lisa.z,
|
||||
lisa.Is,
|
||||
weight.id_order
|
||||
)
|
||||
|
||||
def global_rate_stat(self, subquery, numerator, denominator,
|
||||
w_type, num_ngbrs, permutations, geom_col, id_col):
|
||||
"""
|
||||
Moran's I Rate (global)
|
||||
|
||||
Args:
|
||||
|
||||
subquery (str): Query to give access to the data needed. This query
|
||||
must give access to ``attr_name``, ``geom_col``, and ``id_col``.
|
||||
numerator (str): Column name of numerator to analyze
|
||||
denominator (str): Column name of the denominator
|
||||
w_type (str): Type of spatial weight. Must be one of `knn`
|
||||
or `queen`. See `PySAL documentation
|
||||
<http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
|
||||
for more information.
|
||||
num_ngbrs (int): If using `knn` for ``w_type``, this
|
||||
specifies the number of neighbors to be used to define the spatial
|
||||
neighborhoods.
|
||||
permutations (int): Number of permutations for performing
|
||||
conditional randomization to find the p-value. Higher numbers
|
||||
takes a longer time for getting results.
|
||||
geom_col (str): Name of the geometry column in the dataset for
|
||||
finding the spatial neighborhoods.
|
||||
id_col (str): Row index for each value. Usually the database index.
|
||||
"""
|
||||
params = OrderedDict([("id_col", id_col),
|
||||
("attr1", numerator),
|
||||
("attr2", denominator),
|
||||
("geom_col", geom_col),
|
||||
("subquery", subquery),
|
||||
("num_ngbrs", num_ngbrs)])
|
||||
|
||||
result = self.data_provider.get_moran(w_type, params)
|
||||
|
||||
# collect attributes
|
||||
numer = pu.get_attributes(result, 1)
|
||||
denom = pu.get_attributes(result, 2)
|
||||
|
||||
weight = pu.get_weight(result, w_type, num_ngbrs)
|
||||
|
||||
# calculate moran global rate
|
||||
lisa_rate = ps.esda.moran.Moran_Rate(numer, denom, weight,
|
||||
permutations=permutations)
|
||||
|
||||
return zip([lisa_rate.I], [lisa_rate.EI])
|
||||
|
||||
    def local_rate_stat(self, subquery, numerator, denominator,
                        w_type, num_ngbrs, permutations, geom_col, id_col):
        """
        Moran's I Local Rate

        Args:

          subquery (str): Query to give access to the data needed. This query
            must give access to ``numerator``, ``denominator``, ``geom_col``,
            and ``id_col``.
          numerator (str): Column name of the numerator to analyze
          denominator (str): Column name of the denominator
          w_type (str): Type of spatial weight. Must be one of `knn`
            or `queen`. See `PySAL documentation
            <http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
            for more information.
          num_ngbrs (int): If using `knn` for ``w_type``, this
            specifies the number of neighbors to be used to define the spatial
            neighborhoods.
          permutations (int): Number of permutations for performing
            conditional randomization to find the p-value. Higher numbers
            take longer but give more precise p-values.
          geom_col (str): Name of the geometry column in the dataset for
            finding the spatial neighborhoods.
          id_col (str): Row index for each value. Usually the database index.

        Returns:
          list of tuples: Where each tuple consists of the following values:
            - quadrant classification (one of `HH`, `HL`, `LL`, or `LH`)
            - p-value
            - spatial lag
            - standardized spatial lag (centered on the mean, normalized by
              the standard deviation)
            - original value (roughly the numerator divided by the
              denominator)
            - standardized value
            - Moran's I statistic
            - original row index
        """
        # Geometries with null-valued attributes are excluded by the query,
        # so the neighbors found for a geometry can be farther away than
        # they would be with the full dataset.

        params = OrderedDict([("id_col", id_col),
                              ("numerator", numerator),
                              ("denominator", denominator),
                              ("geom_col", geom_col),
                              ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_moran(w_type, params)

        # collect attributes
        numer = pu.get_attributes(result, 1)
        denom = pu.get_attributes(result, 2)

        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate LISA values
        lisa = ps.esda.moran.Moran_Local_Rate(numer, denom, weight,
                                              permutations=permutations)

        # find quadrants for each geometry
        quads = quad_position(lisa.q)

        # spatial lag
        lag = ps.weights.spatial_lag.lag_spatial(weight, lisa.y)
        lag_std = ps.weights.spatial_lag.lag_spatial(weight, lisa.z)

        return zip(
            quads,
            lisa.p_sim,
            lag,
            lag_std,
            lisa.y,
            lisa.z,
            lisa.Is,
            weight.id_order
        )
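
    # Sketch of consuming the return value (tuple order follows the Returns
    # section above; names are illustrative):
    #
    #     rows = moran.local_rate_stat('SELECT * FROM county_health',
    #                                  'deaths', 'population',
    #                                  'knn', 5, 99, 'the_geom', 'cartodb_id')
    #     flagged = [(rowid, quad)
    #                for quad, p_val, _, _, _, _, _, rowid in rows
    #                if p_val <= 0.05]
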
    def local_bivariate_stat(self, subquery, attr1, attr2,
                             permutations, geom_col, id_col,
                             w_type, num_ngbrs):
        """
        Moran's I (local) Bivariate (untested)
        """

        params = OrderedDict([("id_col", id_col),
                              ("attr1", attr1),
                              ("attr2", attr2),
                              ("geom_col", geom_col),
                              ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_moran(w_type, params)

        # collect attributes
        attr1_vals = pu.get_attributes(result, 1)
        attr2_vals = pu.get_attributes(result, 2)

        # create weights
        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate LISA values
        lisa = ps.esda.moran.Moran_Local_BV(attr1_vals, attr2_vals, weight,
                                            permutations=permutations)

        # find clustering of significance
        lisa_sig = quad_position(lisa.q)

        return zip(lisa.Is, lisa_sig, lisa.p_sim, weight.id_order)

# Low level functions ----------------------------------------


def map_quads(coord):
    """
    Map a quadrant number to Moran's I designation
    HH=1, LH=2, LL=3, HL=4
    Args:
      coord (int): quadrant of a specific measurement
    Returns:
      classification (one of 'HH', 'LH', 'LL', or 'HL')
    """
    if coord == 1:
        return 'HH'
    elif coord == 2:
        return 'LH'
    elif coord == 3:
        return 'LL'
    elif coord == 4:
        return 'HL'
    return None


def quad_position(quads):
    """
    Map all quads

    Args:
      quads (:obj:`numpy.ndarray`): an array of quads classified by
        1-4 (PySAL default)
    Returns:
      list: an array of quads classified by 'HH', 'LL', etc.
    """
    return [map_quads(q) for q in quads]
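Since the quadrant mapping above is fixed (HH=1, LH=2, LL=3, HL=4), a one-line sketch of the expected behavior, assuming `quad_position` is re-exported from `crankshaft.clustering` as the package's star imports suggest:

from crankshaft.clustering import quad_position

assert quad_position([1, 2, 3, 4]) == ['HH', 'LH', 'LL', 'HL']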
@@ -0,0 +1 @@
from core import set_model, get_model, create_model_table
@@ -0,0 +1,86 @@
import time
import plpy
import pickle
from petname import generate


def create_model_table():
    """Create the model_storage table if it does not already exist"""
    q = '''
        create table if not exists model_storage(
            description text,
            name text unique,
            model bytea,
            feature_names text[],
            date_created timestamptz,
            id serial primary key);
        '''
    plpy.notice(q)
    plan = plpy.prepare(q)
    resp = plpy.execute(plan)
    plpy.notice('Model table successfully created')
    plpy.notice(str(resp))


def get_model(model_name):
    """retrieve model if it exists"""
    try:
        plan = plpy.prepare('''
            SELECT model FROM model_storage
            WHERE name = $1;
        ''', ['text', ])
        model_encoded = plpy.execute(plan, [model_name, ])
        if len(model_encoded) == 1:
            model = pickle.loads(
                model_encoded[0]['model']
            )
            plpy.notice('Model successfully loaded')
        else:
            plpy.notice('Model not found, or too many models '
                        '({})'.format(len(model_encoded)))
            model = None
    except plpy.SPIError as err:
        # plpy.error aborts the function, so `model` is always bound below
        plpy.error('ERROR: {}'.format(err))

    return model


def set_model(model, model_name, feature_names):
    """stores the model in the table model_storage"""
    if model_name is None:
        model_name = generate(words=2, separator='_', letters=8)
    existing_names = plpy.execute('''
        SELECT array_agg(name) as name
        FROM model_storage
    ''')
    plpy.notice('nrows: {}'.format(existing_names.nrows()))
    plpy.notice('MODEL NAME: {}'.format(model_name))
    plpy.notice('LEN of ms: {}'.format(len(existing_names)))
    plpy.notice('existing_names: {}'.format(str(existing_names)))
    plpy.notice('existing_names: {}'.format(str(existing_names[0]['name'])))
    plpy.notice('type existing_names: {}'.format(type(existing_names[0]['name'])))
    if existing_names[0]['name'] is not None:
        # regenerate the name until it no longer collides with an existing one
        while model_name in existing_names[0]['name']:
            model_name = generate(words=2, separator='_', letters=10)
            plpy.notice(model_name)

    # store model
    try:
        plan = plpy.prepare('''
            INSERT INTO model_storage(description, name, model, feature_names, date_created)
            VALUES (
              $1,
              $2,
              $3,
              $4::text[],
              to_timestamp($5));
        ''', ['text', 'text', 'bytea', 'text', 'numeric'])
        plpy.notice('{%s}' % ','.join(feature_names))
        plpy.notice(feature_names)
        plpy.execute(
            plan,
            [' '.join(m.strip() for m in model.__repr__().split('\n')),
             model_name,
             pickle.dumps(model),
             '{%s}' % ','.join(feature_names),
             time.time()]
        )
        plpy.notice('model successfully stored as {}'.format(model_name))
    except plpy.SPIError as err:
        plpy.notice('ERROR: {}\nt: {}'.format(err, time.time()))
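A minimal usage sketch of the storage API above. It assumes the code runs where `plpy` exists (i.e. inside a PL/Python function body) and that the module is importable as `crankshaft.model_storage` -- the import path is an assumption; the model and names are illustrative:

from sklearn.linear_model import LinearRegression
from crankshaft.model_storage import create_model_table, set_model, get_model

create_model_table()  # idempotent: CREATE TABLE IF NOT EXISTS
clf = LinearRegression().fit([[0], [1], [2]], [0, 1, 2])
set_model(clf, 'demo_model', ['x'])   # pickled into the bytea column
restored = get_model('demo_model')    # unpickled back out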
@@ -0,0 +1,2 @@
"""Import all functions for pysal_utils"""
from crankshaft.pysal_utils.pysal_utils import *
@@ -0,0 +1,251 @@
"""
Utilities module for generic PySAL functionality, mainly centered on
translating queries into numpy arrays or PySAL weights objects
"""

import numpy as np
import pysal as ps


def construct_neighbor_query(w_type, query_vals):
    """Return query (a string) used for finding neighbors
    @param w_type text: type of neighbors to calculate ('knn' or 'queen')
    @param query_vals dict: values used to construct the query
    """

    if w_type.lower() == 'knn':
        return knn(query_vals)
    else:
        return queen(query_vals)


# Build weight object
def get_weight(query_res, w_type='knn', num_ngbrs=5):
    """
    Construct PySAL weight from return value of query
    @param query_res dict-like: query results with attributes and neighbors
    """

    neighbors = {x['id']: x['neighbors'] for x in query_res}
    print 'len of neighbors: %d' % len(neighbors)

    built_weight = ps.W(neighbors)
    built_weight.transform = 'r'

    return built_weight

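# Sketch: rows shaped like the neighbor queries below return them give a
# row-standardized W (ids and neighbor lists are illustrative):
#
#     rows = [{'id': 1, 'neighbors': [2, 3]},
#             {'id': 2, 'neighbors': [1]},
#             {'id': 3, 'neighbors': [1]}]
#     w = get_weight(rows)
#     w.weights[1]  # -> [0.5, 0.5] after the 'r' transform
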
def query_attr_select(params, table_ref=True):
    """
    Create portion of SELECT statement for attributes involved in query.
    Defaults to order in the params
    @param params: dict of information used in query (column names,
                   table name, etc.)
    Example:
        OrderedDict([('numerator', 'price'),
                     ('denominator', 'sq_meters'),
                     ('subquery', 'SELECT * FROM interesting_data')])
    Output:
        "i.\"price\"::numeric As attr1, " \
        "i.\"sq_meters\"::numeric As attr2, "
    """

    attr_string = ""
    template = "\"%(col)s\"::numeric As attr%(alias_num)s, "

    if table_ref:
        template = "i." + template

    if ('time_cols' in params) or ('ind_vars' in params):
        # if markov or gwr analysis
        attrs = (params['time_cols'] if 'time_cols' in params
                 else params['ind_vars'])
        if 'ind_vars' in params:
            template = "array_agg(\"%(col)s\"::numeric) As attr%(alias_num)s, "

        for idx, val in enumerate(attrs):
            attr_string += template % {"col": val, "alias_num": idx + 1}
    else:
        # if moran's analysis
        attrs = [k for k in params
                 if k not in ('id_col', 'geom_col', 'subquery', 'num_ngbrs')]

        for idx, val in enumerate(attrs):
            attr_string += template % {"col": params[val],
                                       "alias_num": idx + 1}

    return attr_string

def query_attr_where(params, table_ref=True):
    """
    Construct where conditions when building neighbors query
    Create portion of WHERE clauses for weeding out NULL-valued geometries
    Input: dict of params:
        {'subquery': ...,
         'numerator': 'data1',
         'denominator': 'data2',
         '': ...}
    Output:
      'idx_replace."data1" IS NOT NULL AND idx_replace."data2" IS NOT NULL'
    Input:
        {'subquery': ...,
         'time_cols': ['time1', 'time2', 'time3'],
         'etc': ...}
    Output: 'idx_replace."time1" IS NOT NULL AND idx_replace."time2" IS NOT
      NULL AND idx_replace."time3" IS NOT NULL'
    """
    attr_string = []
    template = "\"%s\" IS NOT NULL"
    if table_ref:
        template = "idx_replace." + template

    if ('time_cols' in params) or ('ind_vars' in params):
        # markov or gwr where clauses
        attrs = (params['time_cols'] if 'time_cols' in params
                 else params['ind_vars'])
        # add values to template
        for attr in attrs:
            attr_string.append(template % attr)
    else:
        # moran where clauses

        # get keys
        attrs = [k for k in params
                 if k not in ('id_col', 'geom_col', 'subquery', 'num_ngbrs')]

        # add values to template
        for attr in attrs:
            attr_string.append(template % params[attr])

        if 'denominator' in attrs:
            attr_string.append(
                "idx_replace.\"%s\" <> 0" % params['denominator'])

    out = " AND ".join(attr_string)

    return out

def knn(params):
    """SQL query for k-nearest neighbors.
    @param params: dict of values to fill template
    """

    attr_select = query_attr_select(params, table_ref=True)
    attr_where = query_attr_where(params, table_ref=True)

    replacements = {"attr_select": attr_select,
                    "attr_where_i": attr_where.replace("idx_replace", "i"),
                    "attr_where_j": attr_where.replace("idx_replace", "j")}

    query = '''
      SELECT
        i."{id_col}" As id,
        %(attr_select)s
        (SELECT ARRAY(SELECT j."{id_col}"
                      FROM ({subquery}) As j
                      WHERE i."{id_col}" <> j."{id_col}" AND
                            %(attr_where_j)s AND
                            j."{geom_col}" IS NOT NULL
                      ORDER BY j."{geom_col}" <-> i."{geom_col}" ASC
                      LIMIT {num_ngbrs})) As neighbors
      FROM ({subquery}) As i
      WHERE %(attr_where_i)s AND i."{geom_col}" IS NOT NULL
      ORDER BY i."{id_col}" ASC;
    ''' % replacements

    return query.format(**params)

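# Sketch: the same OrderedDict the Moran methods build (values illustrative)
# yields a complete neighbors query:
#
#     from collections import OrderedDict
#     params = OrderedDict([('id_col', 'cartodb_id'),
#                           ('attr1', 'price'),
#                           ('geom_col', 'the_geom'),
#                           ('subquery', 'SELECT * FROM homes'),
#                           ('num_ngbrs', 5)])
#     print construct_neighbor_query('knn', params)
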
# SQL query for finding queens neighbors (all contiguous polygons)
def queen(params):
    """SQL query for queen neighbors.
    @param params dict: information to fill query
    """
    attr_select = query_attr_select(params)
    attr_where = query_attr_where(params)

    replacements = {"attr_select": attr_select,
                    "attr_where_i": attr_where.replace("idx_replace", "i"),
                    "attr_where_j": attr_where.replace("idx_replace", "j")}

    query = '''
      SELECT
        i."{id_col}" As id,
        %(attr_select)s
        (SELECT ARRAY(SELECT j."{id_col}"
                      FROM ({subquery}) As j
                      WHERE i."{id_col}" <> j."{id_col}" AND
                            ST_Touches(i."{geom_col}", j."{geom_col}") AND
                            %(attr_where_j)s)) As neighbors
      FROM ({subquery}) As i
      WHERE
        %(attr_where_i)s
      ORDER BY i."{id_col}" ASC;
    ''' % replacements

    return query.format(**params)

def gwr_query(params):
    """
    SQL query for GWR analysis
    """

    replacements = {"ind_vars_select": query_attr_select(params,
                                                         table_ref=False),
                    "ind_vars_where": query_attr_where(params,
                                                       table_ref=False)}

    query = '''
      SELECT
        array_agg(ST_X(ST_Centroid("{geom_col}"))) As x,
        array_agg(ST_Y(ST_Centroid("{geom_col}"))) As y,
        array_agg("{dep_var}") As dep_var,
        %(ind_vars_select)s
        array_agg("{id_col}") As rowid
      FROM ({subquery}) As q
      WHERE
        "{dep_var}" IS NOT NULL AND
        %(ind_vars_where)s
    ''' % replacements

    return query.format(**params).strip()

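# Sketch: a GWR params dict (values illustrative); the 'ind_vars' key routes
# query_attr_select/query_attr_where into their array_agg branch:
#
#     gwr_params = {'id_col': 'cartodb_id',
#                   'dep_var': 'median_rent',
#                   'ind_vars': ['income', 'pop_density'],
#                   'geom_col': 'the_geom',
#                   'subquery': 'SELECT * FROM census_tracts'}
#     print gwr_query(gwr_params)
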
def gwr_predict_query(params):
    """
    SQL query for GWR prediction
    """

    replacements = {"ind_vars_select": query_attr_select(params,
                                                         table_ref=False),
                    "ind_vars_where": query_attr_where(params,
                                                       table_ref=False)}

    query = '''
      SELECT
        array_agg(ST_X(ST_Centroid({geom_col}))) As x,
        array_agg(ST_Y(ST_Centroid({geom_col}))) As y,
        array_agg({dep_var}) As dep_var,
        %(ind_vars_select)s
        array_agg({id_col}) As rowid
      FROM ({subquery}) As q
      WHERE
        %(ind_vars_where)s
    ''' % replacements

    return query.format(**params).strip()

# To add more weight methods, open a ticket or pull request


def get_attributes(query_res, attr_num=1):
    """
    @param query_res: query results with attributes and neighbors
    @param attr_num: attribute number (1, 2, ...)
    """
    return np.array([x['attr' + str(attr_num)] for x in query_res],
                    dtype=np.float)
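A small sketch of the attribute extraction above (rows are illustrative):

rows = [{'attr1': 1.0, 'attr2': 10.0}, {'attr1': 2.0, 'attr2': 20.0}]
get_attributes(rows, 1)  # -> array([ 1.,  2.])
get_attributes(rows, 2)  # -> array([ 10.,  20.])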
12
release/python/0.9.1/crankshaft/crankshaft/random_seeds.py
Normal file
@@ -0,0 +1,12 @@
"""Random seed generator used for non-deterministic functions in crankshaft"""
import random
import numpy


def set_random_seeds(value):
    """
    Set the seeds of the RNGs (Random Number Generators)
    used internally.
    """
    random.seed(value)
    numpy.random.seed(value)
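Seeding both generators up front makes the permutation-based p-values from the Moran functions above reproducible across runs, for example:

from crankshaft.random_seeds import set_random_seeds

set_random_seeds(1234)  # same permutations, same p_sim on every run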
@@ -0,0 +1,3 @@
from crankshaft.regression.gwr import *
from crankshaft.regression.glm import *
from crankshaft.regression.gwr_cs import *
Some files were not shown because too many files have changed in this diff