Compare commits

..

15 Commits

Author SHA1 Message Date
Andy Eschbacher
daba2f9597 release 0.9.5 [ci skip] 2018-04-09 15:22:35 -04:00
Andy Eschbacher
8f28f41060 corrects incorrect variable name 2018-04-09 15:16:52 -04:00
Andy Eschbacher
7509afa5a6 release feature name validation 2018-04-09 14:14:39 -04:00
Andy Eschbacher
a28c68502c adds feature name validation [ci skip] 2018-04-09 14:09:31 -04:00
Andy Eschbacher
5b4443ca88 new faux release 2018-03-22 13:14:20 -04:00
Andy Eschbacher
2048db33fc avoids accuracy calculation without model being defined 2018-03-22 13:12:17 -04:00
Andy Eschbacher
99e78800b3 adds latest release file 2018-03-22 11:46:42 -04:00
Andy Eschbacher
800648a710 adds upgrade path for 0.9.2 faux release 2018-03-22 11:08:46 -04:00
Andy Eschbacher
91ee6ecc48 new faux release 2018-03-22 11:02:45 -04:00
Andy Eschbacher
9a5ab17240 replaces petname with uuid for now 2018-03-22 11:01:39 -04:00
Andy Eschbacher
65be9befb1 faux release for staging testing 2018-03-22 10:19:29 -04:00
Andy Eschbacher
37e6b4a228 fixes release path copy error [ci skip] 2018-03-20 11:59:15 -04:00
Andy Eschbacher
766bfed9be dummy version bump 2018-03-19 13:30:37 -04:00
Andy Eschbacher
e8a601e945 adds model module [ci skip] 2018-03-16 16:45:39 -04:00
Andy Eschbacher
c2be340c07 prototype of model writing 2018-03-16 16:21:00 -04:00
439 changed files with 69214 additions and 746 deletions

3
.brackets.json Normal file
View File

@@ -0,0 +1,3 @@
{
"sbruchmann.staticpreview.basepath": "/home/carto/Projects/crankshaft/"
}

1
.gitignore vendored
View File

@@ -2,3 +2,4 @@ envs/
*.pyc *.pyc
.DS_Store .DS_Store
.idea/ .idea/
.*.sw[nop]

View File

@@ -1,48 +1,60 @@
language: c language: c
dist: precise
sudo: required sudo: required
env: env:
global: global:
- PAGER=cat - PAGER=cat
- PGUSER=postgres
- PGDATABASE=postgres
- PGOPTIONS='-c client_min_messages=NOTICE'
jobs:
include:
- env: POSTGRESQL_VERSION="9.6" POSTGIS_VERSION="2.5"
dist: xenial
- env: POSTGRESQL_VERSION="10" POSTGIS_VERSION="2.5"
dist: xenial
- env: POSTGRESQL_VERSION="11" POSTGIS_VERSION="2.5"
dist: xenial
- env: POSTGRESQL_VERSION="12" POSTGIS_VERSION="3"
dist: bionic
before_install: before_install:
- ./check-up-to-date-with-master.sh
- sudo apt-get -y install python-pip
- sudo apt-get install -y --allow-unauthenticated --no-install-recommends --no-install-suggests postgresql-$POSTGRESQL_VERSION postgresql-client-$POSTGRESQL_VERSION postgresql-server-dev-$POSTGRESQL_VERSION postgresql-common - sudo apt-get -y install python-software-properties
- if [[ $POSTGRESQL_VERSION == '9.6' ]]; then sudo apt-get install -y postgresql-contrib-9.6; fi; - sudo add-apt-repository -y ppa:cartodb/sci
- sudo apt-get install -y --allow-unauthenticated postgresql-$POSTGRESQL_VERSION-postgis-$POSTGIS_VERSION postgresql-$POSTGRESQL_VERSION-postgis-$POSTGIS_VERSION-scripts postgis - sudo add-apt-repository -y ppa:cartodb/postgresql-9.5
- sudo add-apt-repository -y ppa:cartodb/gis
- sudo add-apt-repository -y ppa:cartodb/gis-testing
- sudo apt-get update
# For pre12, install plpython2. For PG12 install plpython3 - sudo apt-get -y install python-joblib=0.8.3-1-cdb1
- if [[ $POSTGRESQL_VERSION != '12' ]]; then sudo apt-get install -y postgresql-plpython-$POSTGRESQL_VERSION python python-pip python-software-properties python-joblib python-nose python-setuptools; else sudo apt-get install -y postgresql-plpython3-12 python3 python3-pip python3-software-properties python3-joblib python3-nose python3-setuptools; fi; - sudo apt-get -y install python-numpy=1:1.6.1-6ubuntu1
- if [[ $POSTGRESQL_VERSION == '12' ]]; then echo -e "joblib==0.11\nnumpy==1.13.3\nscipy==0.19.1\npysal==1.14.3\nscikit-learn==0.19.1" > ./src/py/crankshaft/requirements.txt && sed -i -e "s/.*install_requires.*$/ install_requires=['joblib==0.11.0', 'numpy==1.13.3', 'scipy==0.19.1', 'pysal==1.14.3', 'scikit-learn==0.19.1'],/g" ./src/py/crankshaft/setup.py; fi;
- sudo pg_dropcluster --stop $POSTGRESQL_VERSION main - sudo apt-get -y install python-scipy=0.14.0-2-cdb6
- sudo rm -rf /etc/postgresql/$POSTGRESQL_VERSION /var/lib/postgresql/$POSTGRESQL_VERSION - sudo apt-get -y --no-install-recommends install python-sklearn-lib=0.14.1-3-cdb2
- sudo pg_createcluster -u postgres $POSTGRESQL_VERSION main --start -- -A trust - sudo apt-get -y --no-install-recommends install python-sklearn=0.14.1-3-cdb2
- export PGPORT=$(pg_lsclusters | grep $POSTGRESQL_VERSION | awk '{print $3}') - sudo apt-get -y --no-install-recommends install python-scikits-learn=0.14.1-3-cdb2
# Force instalation of libgeos-3.5.0 (presumably needed because of existing version of postgis)
- sudo apt-get -y install libgeos-3.5.0=3.5.0-1cdb2
# Install postgres db and build deps
- sudo /etc/init.d/postgresql stop # stop travis default instance
- sudo apt-get -y remove --purge postgresql-9.1
- sudo apt-get -y remove --purge postgresql-9.2
- sudo apt-get -y remove --purge postgresql-9.3
- sudo apt-get -y remove --purge postgresql-9.4
- sudo apt-get -y remove --purge postgresql-9.5
- sudo rm -rf /var/lib/postgresql/
- sudo rm -rf /var/log/postgresql/
- sudo rm -rf /etc/postgresql/
- sudo apt-get -y remove --purge postgis-2.2
- sudo apt-get -y autoremove
- sudo apt-get -y install postgresql-9.5=9.5.2-3cdb3
- sudo apt-get -y install postgresql-server-dev-9.5=9.5.2-3cdb3
- sudo apt-get -y install postgresql-plpython-9.5=9.5.2-3cdb3
- sudo apt-get -y install postgresql-9.5-postgis-scripts=2.2.2.0-cdb2
- sudo apt-get -y install postgresql-9.5-postgis-2.2=2.2.2.0-cdb2
# configure it to accept local connections from postgres
- echo -e "# TYPE DATABASE USER ADDRESS METHOD \nlocal all postgres trust\nlocal all all trust\nhost all all 127.0.0.1/32 trust" \
| sudo tee /etc/postgresql/9.5/main/pg_hba.conf
- sudo /etc/init.d/postgresql restart 9.5
install: install:
- sudo make install - sudo make install
script: script:
- make test - make test || { cat src/pg/test/regression.diffs; false; }
- ./check-compatibility.sh - ./check-compatibility.sh
after_failure:
- pg_lsclusters
- cat src/pg/test/regression.diffs
- echo $PGPORT
- cat /var/log/postgresql/postgresql-$POSTGRESQL_VERSION-main.log

View File

@@ -39,7 +39,9 @@ ALTER EXTENSION crankshaft UPDATE TO 'dev';
If the extension has not previously been installed in a database, If the extension has not previously been installed in a database,
it can be installed directly with: it can be installed directly with:
```sql ```sql
CREATE EXTENSION crankshaft WITH VERSION 'dev' CASCADE; CREATE EXTENSION IF NOT EXISTS plpythonu;
CREATE EXTENSION IF NOT EXISTS postgis;
CREATE EXTENSION crankshaft WITH VERSION 'dev';
``` ```
Once the feature or bugfix is completed and all the tests are passing Once the feature or bugfix is completed and all the tests are passing

View File

@@ -23,7 +23,7 @@ test: ## Run the tests for the development version of the extension
$(MAKE) -C $(EXT_DIR) test $(MAKE) -C $(EXT_DIR) test
# Generate a new release into release # Generate a new release into release
release: ## Generate a new release of the extension. release: ## Generate a new release of the extension. Only for telease manager
$(MAKE) -C $(EXT_DIR) release $(MAKE) -C $(EXT_DIR) release
$(MAKE) -C $(PYP_DIR) release $(MAKE) -C $(PYP_DIR) release
@@ -31,7 +31,7 @@ release: ## Generate a new release of the extension.
# Requires sudo. # Requires sudo.
# Use the RELEASE_VERSION environment variable to deploy a specific version: # Use the RELEASE_VERSION environment variable to deploy a specific version:
# sudo make deploy RELEASE_VERSION=1.0.0 # sudo make deploy RELEASE_VERSION=1.0.0
deploy: deploy: ## Deploy a released extension. Only for release manager. Requires sudo.
$(MAKE) -C $(EXT_DIR) deploy $(MAKE) -C $(EXT_DIR) deploy
$(MAKE) -C $(PYP_DIR) deploy $(MAKE) -C $(PYP_DIR) deploy

View File

@@ -3,21 +3,9 @@ EXTENSION = crankshaft
PACKAGE = crankshaft PACKAGE = crankshaft
EXTVERSION = $(shell grep default_version $(SELF_DIR)/src/pg/$(EXTENSION).control | sed -e "s/default_version[[:space:]]*=[[:space:]]*'\([^']*\)'/\1/") EXTVERSION = $(shell grep default_version $(SELF_DIR)/src/pg/$(EXTENSION).control | sed -e "s/default_version[[:space:]]*=[[:space:]]*'\([^']*\)'/\1/")
RELEASE_VERSION ?= $(EXTVERSION) RELEASE_VERSION ?= $(EXTVERSION)
SED = sed SED = sed
AWK = awk PIP = pip
PG_CONFIG = pg_config
PG_VERSION_1000 := $(shell $(PG_CONFIG) --version | $(AWK) '{$$2*=1000; print $$2}')
PG_PARALLEL := $(shell [ $(PG_VERSION_1000) -ge 9600 ] && echo true)
PG_12plus := $(shell [ $(PG_VERSION_1000) -ge 12000 ] && echo true)
PYTHON3 ?= $(PG_12plus)
ifeq ($(PYTHON3), true)
PIP := python3 -m pip
NOSETESTS = nosetests3
else
PIP := python2 -m pip
NOSETESTS = nosetests NOSETESTS = nosetests
endif AWK = awk
PG_CONFIG = pg_config
PG_PARALLEL := $(shell $(PG_CONFIG) --version | ($(AWK) '{$$2*=1000; if ($$2 >= 9600) print 1; else print 0;}' 2> /dev/null || echo 0))

11
NEWS.md
View File

@@ -1,14 +1,3 @@
0.9.0 (2019-12-23)
------------------
* Compatibility with PG12.
* Compatibility with python3 (enable with PYTHON3=true env variable, default in PG12+).
0.8.2 (2019-02-07)
------------------
* Update dependencies to match what it's being used in production.
* Update travis to xenial, PG10 and 11, and postgis 2.5
* Compatibility with PG11
0.8.1 (2018-03-12) 0.8.1 (2018-03-12)
------------------ ------------------
* Adds improperly added version files * Adds improperly added version files

View File

@@ -8,21 +8,28 @@ CARTO Spatial Analysis extension for PostgreSQL.
* `src/` source code * `src/` source code
- `pg/` contains the PostgreSQL extension source code - `pg/` contains the PostgreSQL extension source code
- `py/` Python module source code - `py/` Python module source code
* `release` released versions * `release` reseleased versions
## Requirements ## Requirements
* PostgreSQL * PostgreSQL
* plpythonu (for PG12+, plpython3u) and postgis extensions * plpythonu and postgis extensions
* python-scipy system package (see [src/py/README.md](https://github.com/CartoDB/crankshaft/blob/develop/src/py/README.md)) * python-scipy system package (see [src/py/README.md](https://github.com/CartoDB/crankshaft/blob/develop/src/py/README.md))
# Development Process # Development Process
We distinguish two roles:
* *developers* will implement new functionality and bugfixes into
the codebase.
* A *release manager* will handle the release process.
We use the branch `develop` as the main integration branch for development. The `master` is reserved to handle releases. We use the branch `develop` as the main integration branch for development. The `master` is reserved to handle releases.
The process is as follows: The process is as follows:
1. Create a new **topic branch** from `develop` for any new feature or bugfix and commit their changes to it: 1. Create a new **topic branch** from `develop` for any new feature
or bugfix and commit their changes to it:
```shell ```shell
git fetch && git checkout -b my-cool-feature origin/develop git fetch && git checkout -b my-cool-feature origin/develop
@@ -32,6 +39,7 @@ The process is as follows:
1. Update the [NEWS.md](https://github.com/CartoDB/crankshaft/blob/develop/NEWS.md) doc. 1. Update the [NEWS.md](https://github.com/CartoDB/crankshaft/blob/develop/NEWS.md) doc.
1. Create a pull request and mention relevant people for a **peer review**. 1. Create a pull request and mention relevant people for a **peer review**.
1. Address the comments and improvements you get from the peer review. 1. Address the comments and improvements you get from the peer review.
1. Mention `@CartoDB/dataservices` in the PR to get it merged into `develop`.
In order for a pull request to be accepted, the following criteria should be met: In order for a pull request to be accepted, the following criteria should be met:
* The peer review should pass and no major issue should be left unaddressed. * The peer review should pass and no major issue should be left unaddressed.

View File

@@ -1,6 +1,7 @@
# Release & Deployment Process # Release & Deployment Process
:warning: Do not forget about updating dependencies in `cartodb-platform` and `carto-postgres-artifacts` :warning: The release process of a new version of the extension
shall be performed by the designated *Release Manager*.
## Release steps ## Release steps
* Make sure `develop` branch passes all the tests. * Make sure `develop` branch passes all the tests.

View File

@@ -1,20 +0,0 @@
{
"name": "crankshaft",
"current_version": {
"requires": {
"postgres": ">=9.5.0",
"postgis": ">=2.2.0.0",
"python": ">=2.7.0",
"joblib": "0.8.3",
"numpy": "1.6.1",
"scipy": "0.14.0",
"pysal": "1.14.3",
"scikit-learn": "0.14.1"
},
"works_with": {
}
},
"exceptional_versions": {
}
}

View File

@@ -25,6 +25,10 @@ psql -c "SELECT * FROM pg_available_extension_versions WHERE name LIKE 'cranksha
# Install in the fresh DB # Install in the fresh DB
psql $DBNAME <<'EOF' psql $DBNAME <<'EOF'
-- Install dependencies
CREATE EXTENSION plpythonu;
CREATE EXTENSION postgis VERSION '2.2.2';
-- Create role publicuser if it does not exist -- Create role publicuser if it does not exist
DO DO
$$ $$
@@ -40,53 +44,30 @@ END
$$ LANGUAGE plpgsql; $$ LANGUAGE plpgsql;
-- Install the default version -- Install the default version
CREATE EXTENSION crankshaft CASCADE; CREATE EXTENSION crankshaft;
\dx \dx
EOF EOF
# Check PG version
PG_VERSION=`psql -q -t -c "SELECT current_setting('server_version_num')"`
# Save public function signatures # Save public function signatures
if [[ "$PG_VERSION" -lt 110000 ]]; then psql $DBNAME <<'EOF'
psql $DBNAME -c " CREATE TABLE release_function_signatures AS
CREATE TABLE release_function_signatures AS SELECT
SELECT p.proname as name,
p.proname as name, pg_catalog.pg_get_function_result(p.oid) as result_type,
pg_catalog.pg_get_function_result(p.oid) as result_type, pg_catalog.pg_get_function_arguments(p.oid) as arguments,
pg_catalog.pg_get_function_arguments(p.oid) as arguments, CASE
CASE WHEN p.proisagg THEN 'agg'
WHEN p.proisagg THEN 'agg' WHEN p.proiswindow THEN 'window'
WHEN p.proiswindow THEN 'window' WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger' ELSE 'normal'
ELSE 'normal' END as type
END as type FROM pg_catalog.pg_proc p
FROM pg_catalog.pg_proc p LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace WHERE
WHERE n.nspname = 'cdb_crankshaft'
n.nspname = 'cdb_crankshaft' AND p.proname LIKE 'cdb_%'
AND p.proname LIKE 'cdb_%' ORDER BY 1, 2, 4;
ORDER BY 1, 2, 4;" EOF
else
psql $DBNAME -c "
CREATE TABLE release_function_signatures AS
SELECT
p.proname as name,
pg_catalog.pg_get_function_result(p.oid) as result_type,
pg_catalog.pg_get_function_arguments(p.oid) as arguments,
CASE WHEN p.prokind = 'a' THEN 'agg'
WHEN p.prokind = 'w' THEN 'window'
WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
ELSE 'normal'
END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
n.nspname = 'cdb_crankshaft'
AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;"
fi
# Deploy current dev branch # Deploy current dev branch
make clean-dev || die "Could not clean dev files" make clean-dev || die "Could not clean dev files"
@@ -95,42 +76,26 @@ sudo make install || die "Could not deploy current dev branch"
# Check it can be upgraded # Check it can be upgraded
psql $DBNAME -c "ALTER EXTENSION crankshaft update to 'dev';" || die "Cannot upgrade to dev version" psql $DBNAME -c "ALTER EXTENSION crankshaft update to 'dev';" || die "Cannot upgrade to dev version"
if [[ $PG_VERSION -lt 110000 ]]; then # Check against saved public function signatures
psql $DBNAME -c " psql $DBNAME <<'EOF'
CREATE TABLE dev_function_signatures AS CREATE TABLE dev_function_signatures AS
SELECT p.proname as name, SELECT
pg_catalog.pg_get_function_result(p.oid) as result_type, p.proname as name,
pg_catalog.pg_get_function_arguments(p.oid) as arguments, pg_catalog.pg_get_function_result(p.oid) as result_type,
CASE WHEN p.proisagg THEN 'agg' pg_catalog.pg_get_function_arguments(p.oid) as arguments,
WHEN p.proiswindow THEN 'window' CASE
WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger' WHEN p.proisagg THEN 'agg'
ELSE 'normal' WHEN p.proiswindow THEN 'window'
END as type WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
FROM pg_catalog.pg_proc p ELSE 'normal'
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace END as type
WHERE FROM pg_catalog.pg_proc p
n.nspname = 'cdb_crankshaft' LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
AND p.proname LIKE 'cdb_%' WHERE
ORDER BY 1, 2, 4;" n.nspname = 'cdb_crankshaft'
else AND p.proname LIKE 'cdb_%'
psql $DBNAME -c " ORDER BY 1, 2, 4;
CREATE TABLE dev_function_signatures AS EOF
SELECT p.proname as name,
pg_catalog.pg_get_function_result(p.oid) as result_type,
pg_catalog.pg_get_function_arguments(p.oid) as arguments,
CASE WHEN p.prokind = 'a' THEN 'agg'
WHEN p.prokind = 'w' THEN 'window'
WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
ELSE 'normal'
END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
n.nspname = 'cdb_crankshaft'
AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;"
fi
echo "Functions in development not in latest release (ok):" echo "Functions in development not in latest release (ok):"
psql $DBNAME -c "SELECT * FROM dev_function_signatures EXCEPT SELECT * FROM release_function_signatures;" psql $DBNAME -c "SELECT * FROM dev_function_signatures EXCEPT SELECT * FROM release_function_signatures;"

View File

@@ -4,7 +4,7 @@
-- Version number of the extension release -- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version() CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$ RETURNS text AS $$
SELECT '0.8.2'::text; SELECT '0.9.0'::text;
$$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE; $$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE;
-- Internal identifier of the installed extension instence -- Internal identifier of the installed extension instence
@@ -35,16 +35,25 @@ CREATE OR REPLACE FUNCTION
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; $$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist -- Create aggregate if it did not exist
DO $$ BEGIN DO $$
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) ( BEGIN
SFUNC = CDB_PyAggS, IF NOT EXISTS (
STYPE = Numeric[], SELECT *
PARALLEL = SAFE, FROM pg_catalog.pg_proc p
INITCOND = "{}" LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
); WHERE n.nspname = 'cdb_crankshaft'
EXCEPTION AND p.proname = 'cdb_pyagg'
WHEN duplicate_function THEN NULL; AND p.proisagg)
END $$; THEN
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
SFUNC = CDB_PyAggS,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{}"
);
END IF;
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment( CDB_CreateAndPredictSegment(
@@ -89,6 +98,7 @@ CREATE OR REPLACE FUNCTION
query TEXT, query TEXT,
variable_name TEXT, variable_name TEXT,
target_table TEXT, target_table TEXT,
model_name text DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200, n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3, max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5, subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -105,24 +115,59 @@ AS $$
'learning_rate': learning_rate, 'learning_rate': learning_rate,
'min_samples_leaf': min_samples_leaf 'min_samples_leaf': min_samples_leaf
} }
feature_cols = set(plpy.execute(''' all_cols = list(plpy.execute('''
select * from ({query}) as _w limit 0 select * from ({query}) as _w limit 0
'''.format(query=query)).colnames()) - set([variable_name, 'cartodb_id', ]) '''.format(query=query)).colnames())
feature_cols = [a for a in all_cols
if a not in [variable_name, 'cartodb_id', ]]
return seg.create_and_predict_segment( return seg.create_and_predict_segment(
query, query,
variable_name, variable_name,
feature_cols, feature_cols,
target_table, target_table,
model_params model_params,
model_name=model_name
) )
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_RetrieveModelParams(
model_name text,
param_name text
)
RETURNS TABLE(param numeric, feature_name text) AS $$
import pickle
from collections import Iterable
plan = plpy.prepare('''
SELECT model, feature_names FROM model_storage
WHERE name = $1;
''', ['text', ])
try:
model_encoded = plpy.execute(plan, [model_name, ])
except plpy.SPIError as err:
plpy.error('ERROR: {}'.format(err))
plpy.notice(model_encoded[0]['feature_names'])
model = pickle.loads(
model_encoded[0]['model']
)
res = getattr(model, param_name)
if not isinstance(res, Iterable):
raise Exception('Cannot return `{}` as a table'.format(param_name))
return zip(res, model_encoded[0]['feature_names'])
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment( CDB_CreateAndPredictSegment(
query TEXT, query TEXT,
variable TEXT, variable TEXT,
feature_columns TEXT[], feature_columns TEXT[],
target_query TEXT, target_query TEXT,
model_name TEXT DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200, n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3, max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5, subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -144,7 +189,8 @@ AS $$
variable, variable,
feature_columns, feature_columns,
target_query, target_query,
model_params model_params,
model_name=model_name
) )
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_Gravity( CREATE OR REPLACE FUNCTION CDB_Gravity(
@@ -1104,19 +1150,27 @@ BEGIN
END END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; $$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist -- Create aggregate if it did not exist
DO $$ BEGIN DO $$
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) ( BEGIN
SFUNC = CDB_WeightedMeanS, IF NOT EXISTS (
FINALFUNC = CDB_WeightedMeanF, SELECT *
STYPE = Numeric[], FROM pg_catalog.pg_proc p
PARALLEL = SAFE, LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
INITCOND = "{0.0,0.0,0.0}" WHERE n.nspname = 'cdb_crankshaft'
); AND p.proname = 'cdb_weightedmean'
EXCEPTION AND p.proisagg)
WHEN duplicate_function THEN NULL; THEN
END $$; CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{0.0,0.0,0.0}"
);
END IF;
END
$$ LANGUAGE plpgsql;
-- Spatial Markov -- Spatial Markov
-- input table format: -- input table format:

View File

@@ -4,7 +4,7 @@
-- Version number of the extension release -- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version() CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$ RETURNS text AS $$
SELECT '0.8.2'::text; SELECT '0.9.1'::text;
$$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE; $$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE;
-- Internal identifier of the installed extension instence -- Internal identifier of the installed extension instence
@@ -35,16 +35,25 @@ CREATE OR REPLACE FUNCTION
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; $$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist -- Create aggregate if it did not exist
DO $$ BEGIN DO $$
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) ( BEGIN
SFUNC = CDB_PyAggS, IF NOT EXISTS (
STYPE = Numeric[], SELECT *
PARALLEL = SAFE, FROM pg_catalog.pg_proc p
INITCOND = "{}" LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
); WHERE n.nspname = 'cdb_crankshaft'
EXCEPTION AND p.proname = 'cdb_pyagg'
WHEN duplicate_function THEN NULL; AND p.proisagg)
END $$; THEN
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
SFUNC = CDB_PyAggS,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{}"
);
END IF;
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment( CDB_CreateAndPredictSegment(
@@ -89,6 +98,7 @@ CREATE OR REPLACE FUNCTION
query TEXT, query TEXT,
variable_name TEXT, variable_name TEXT,
target_table TEXT, target_table TEXT,
model_name text DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200, n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3, max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5, subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -105,24 +115,59 @@ AS $$
'learning_rate': learning_rate, 'learning_rate': learning_rate,
'min_samples_leaf': min_samples_leaf 'min_samples_leaf': min_samples_leaf
} }
feature_cols = set(plpy.execute(''' all_cols = list(plpy.execute('''
select * from ({query}) as _w limit 0 select * from ({query}) as _w limit 0
'''.format(query=query)).colnames()) - set([variable_name, 'cartodb_id', ]) '''.format(query=query)).colnames())
feature_cols = [a for a in all_cols
if a not in [variable_name, 'cartodb_id', ]]
return seg.create_and_predict_segment( return seg.create_and_predict_segment(
query, query,
variable_name, variable_name,
feature_cols, feature_cols,
target_table, target_table,
model_params model_params,
model_name=model_name
) )
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_RetrieveModelParams(
model_name text,
param_name text
)
RETURNS TABLE(param numeric, feature_name text) AS $$
import pickle
from collections import Iterable
plan = plpy.prepare('''
SELECT model, feature_names FROM model_storage
WHERE name = $1;
''', ['text', ])
try:
model_encoded = plpy.execute(plan, [model_name, ])
except plpy.SPIError as err:
plpy.error('ERROR: {}'.format(err))
plpy.notice(model_encoded[0]['feature_names'])
model = pickle.loads(
model_encoded[0]['model']
)
res = getattr(model, param_name)
if not isinstance(res, Iterable):
raise Exception('Cannot return `{}` as a table'.format(param_name))
return zip(res, model_encoded[0]['feature_names'])
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment( CDB_CreateAndPredictSegment(
query TEXT, query TEXT,
variable TEXT, variable TEXT,
feature_columns TEXT[], feature_columns TEXT[],
target_query TEXT, target_query TEXT,
model_name TEXT DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200, n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3, max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5, subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -144,7 +189,8 @@ AS $$
variable, variable,
feature_columns, feature_columns,
target_query, target_query,
model_params model_params,
model_name=model_name
) )
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_Gravity( CREATE OR REPLACE FUNCTION CDB_Gravity(
@@ -1104,19 +1150,27 @@ BEGIN
END END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; $$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist -- Create aggregate if it did not exist
DO $$ BEGIN DO $$
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) ( BEGIN
SFUNC = CDB_WeightedMeanS, IF NOT EXISTS (
FINALFUNC = CDB_WeightedMeanF, SELECT *
STYPE = Numeric[], FROM pg_catalog.pg_proc p
PARALLEL = SAFE, LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
INITCOND = "{0.0,0.0,0.0}" WHERE n.nspname = 'cdb_crankshaft'
); AND p.proname = 'cdb_weightedmean'
EXCEPTION AND p.proisagg)
WHEN duplicate_function THEN NULL; THEN
END $$; CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{0.0,0.0,0.0}"
);
END IF;
END
$$ LANGUAGE plpgsql;
-- Spatial Markov -- Spatial Markov
-- input table format: -- input table format:

View File

@@ -21,7 +21,7 @@ _cdb_random_seeds (seed_value INTEGER) RETURNS VOID
AS $$ AS $$
from crankshaft import random_seeds from crankshaft import random_seeds
random_seeds.set_random_seeds(seed_value) random_seeds.set_random_seeds(seed_value)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION
CDB_PyAggS(current_state Numeric[], current_row Numeric[]) CDB_PyAggS(current_state Numeric[], current_row Numeric[])
returns NUMERIC[] as $$ returns NUMERIC[] as $$
@@ -35,16 +35,25 @@ CREATE OR REPLACE FUNCTION
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; $$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist -- Create aggregate if it did not exist
DO $$ BEGIN DO $$
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) ( BEGIN
SFUNC = CDB_PyAggS, IF NOT EXISTS (
STYPE = Numeric[], SELECT *
PARALLEL = SAFE, FROM pg_catalog.pg_proc p
INITCOND = "{}" LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
); WHERE n.nspname = 'cdb_crankshaft'
EXCEPTION AND p.proname = 'cdb_pyagg'
WHEN duplicate_function THEN NULL; AND p.proisagg)
END $$; THEN
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
SFUNC = CDB_PyAggS,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{}"
);
END IF;
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment( CDB_CreateAndPredictSegment(
@@ -82,13 +91,14 @@ AS $$
target_ids, target_ids,
model_params) model_params)
$$ LANGUAGE plpython3u VOLATILE PARALLEL RESTRICTED; $$ LANGUAGE plpythonu VOLATILE PARALLEL RESTRICTED;
CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment( CDB_CreateAndPredictSegment(
query TEXT, query TEXT,
variable_name TEXT, variable_name TEXT,
target_table TEXT, target_table TEXT,
model_name text DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200, n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3, max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5, subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -105,17 +115,51 @@ AS $$
'learning_rate': learning_rate, 'learning_rate': learning_rate,
'min_samples_leaf': min_samples_leaf 'min_samples_leaf': min_samples_leaf
} }
feature_cols = set(plpy.execute(''' all_cols = list(plpy.execute('''
select * from ({query}) as _w limit 0 select * from ({query}) as _w limit 0
'''.format(query=query)).colnames()) - set([variable_name, 'cartodb_id', ]) '''.format(query=query)).colnames())
feature_cols = [a for a in all_cols
if a not in [variable_name, 'cartodb_id', ]]
return seg.create_and_predict_segment( return seg.create_and_predict_segment(
query, query,
variable_name, variable_name,
feature_cols, feature_cols,
target_table, target_table,
model_params model_params,
model_name=model_name
) )
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_RetrieveModelParams(
model_name text,
param_name text
)
RETURNS TABLE(param numeric, feature_name text) AS $$
import pickle
from collections import Iterable
plan = plpy.prepare('''
SELECT model, feature_names FROM model_storage
WHERE name = $1;
''', ['text', ])
try:
model_encoded = plpy.execute(plan, [model_name, ])
except plpy.SPIError as err:
plpy.error('ERROR: {}'.format(err))
plpy.notice(model_encoded[0]['feature_names'])
model = pickle.loads(
model_encoded[0]['model']
)
res = getattr(model, param_name)
if not isinstance(res, Iterable):
raise Exception('Cannot return `{}` as a table'.format(param_name))
return zip(res, model_encoded[0]['feature_names'])
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment( CDB_CreateAndPredictSegment(
@@ -123,6 +167,7 @@ CREATE OR REPLACE FUNCTION
variable TEXT, variable TEXT,
feature_columns TEXT[], feature_columns TEXT[],
target_query TEXT, target_query TEXT,
model_name TEXT DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200, n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3, max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5, subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -144,9 +189,10 @@ AS $$
variable, variable,
feature_columns, feature_columns,
target_query, target_query,
model_params model_params,
model_name=model_name
) )
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_Gravity( CREATE OR REPLACE FUNCTION CDB_Gravity(
IN target_query text, IN target_query text,
IN weight_column text, IN weight_column text,
@@ -656,7 +702,7 @@ AS $$
moran = Moran() moran = Moran()
return moran.global_stat(subquery, column_name, w_type, return moran.global_stat(subquery, column_name, w_type,
num_ngbrs, permutations, geom_col, id_col) num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Local (internal function) - DEPRECATED -- Moran's I Local (internal function) - DEPRECATED
CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION
@@ -681,7 +727,7 @@ AS $$
num_ngbrs, permutations, geom_col, id_col) num_ngbrs, permutations, geom_col, id_col)
# remove spatial lag # remove spatial lag
return [(r[6], r[0], r[1], r[7], r[5]) for r in result] return [(r[6], r[0], r[1], r[7], r[5]) for r in result]
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Local (internal function) -- Moran's I Local (internal function)
CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION
@@ -709,7 +755,7 @@ moran = Moran()
return moran.local_stat(subquery, column_name, w_type, return moran.local_stat(subquery, column_name, w_type,
num_ngbrs, permutations, geom_col, id_col) num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Local (public-facing function) -- Moran's I Local (public-facing function)
@@ -836,7 +882,7 @@ AS $$
# TODO: use named parameters or a dictionary # TODO: use named parameters or a dictionary
return moran.global_rate_stat(subquery, numerator, denominator, w_type, return moran.global_rate_stat(subquery, numerator, denominator, w_type,
num_ngbrs, permutations, geom_col, id_col) num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Local Rate (internal function) - DEPRECATED -- Moran's I Local Rate (internal function) - DEPRECATED
@@ -864,7 +910,7 @@ AS $$
result = moran.local_rate_stat(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col) result = moran.local_rate_stat(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
# remove spatial lag # remove spatial lag
return [(r[6], r[0], r[1], r[7], r[4]) for r in result] return [(r[6], r[0], r[1], r[7], r[4]) for r in result]
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Local Rate (public-facing function) - DEPRECATED -- Moran's I Local Rate (public-facing function) - DEPRECATED
CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION
@@ -920,7 +966,7 @@ return moran.local_rate_stat(
geom_col, geom_col,
id_col id_col
) )
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Rate -- Moran's I Rate
-- Replaces CDB_AreasOfInterestLocalRate -- Replaces CDB_AreasOfInterestLocalRate
@@ -1033,7 +1079,7 @@ from crankshaft.clustering import Kmeans
kmeans = Kmeans() kmeans = Kmeans()
return kmeans.spatial(query, no_clusters, no_init) return kmeans.spatial(query, no_clusters, no_init)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Non-spatial k-means clustering -- Non-spatial k-means clustering
-- query: sql query to retrieve all the needed data -- query: sql query to retrieve all the needed data
@@ -1063,7 +1109,7 @@ kmeans = Kmeans()
return kmeans.nonspatial(query, colnames, no_clusters, return kmeans.nonspatial(query, colnames, no_clusters,
standardize=standardize, standardize=standardize,
id_col=id_col) id_col=id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_WeightedMeanS( CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(
@@ -1104,19 +1150,27 @@ BEGIN
END END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; $$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist -- Create aggregate if it did not exist
DO $$ BEGIN DO $$
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) ( BEGIN
SFUNC = CDB_WeightedMeanS, IF NOT EXISTS (
FINALFUNC = CDB_WeightedMeanF, SELECT *
STYPE = Numeric[], FROM pg_catalog.pg_proc p
PARALLEL = SAFE, LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
INITCOND = "{0.0,0.0,0.0}" WHERE n.nspname = 'cdb_crankshaft'
); AND p.proname = 'cdb_weightedmean'
EXCEPTION AND p.proisagg)
WHEN duplicate_function THEN NULL; THEN
END $$; CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{0.0,0.0,0.0}"
);
END IF;
END
$$ LANGUAGE plpgsql;
-- Spatial Markov -- Spatial Markov
-- input table format: -- input table format:
@@ -1146,7 +1200,7 @@ AS $$
## TODO: use named parameters or a dictionary ## TODO: use named parameters or a dictionary
return markov.spatial_trend(subquery, time_cols, num_classes, w_type, num_ngbrs, permutations, geom_col, id_col) return markov.spatial_trend(subquery, time_cols, num_classes, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- input table format: identical to above but in a predictable format -- input table format: identical to above but in a predictable format
-- Sample function call: -- Sample function call:
@@ -1172,7 +1226,7 @@ $$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
-- from crankshaft.clustering import moran_local -- from crankshaft.clustering import moran_local
-- # TODO: use named parameters or a dictionary -- # TODO: use named parameters or a dictionary
-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs) -- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs)
-- $$ LANGUAGE plpython3u; -- $$ LANGUAGE plpythonu;
-- --
-- -- input table format: -- -- input table format:
-- -- id | geom | date | measurement -- -- id | geom | date | measurement
@@ -1198,7 +1252,7 @@ $$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
-- from crankshaft.clustering import moran_local -- from crankshaft.clustering import moran_local
-- # TODO: use named parameters or a dictionary -- # TODO: use named parameters or a dictionary
-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs) -- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs)
-- $$ LANGUAGE plpython3u; -- $$ LANGUAGE plpythonu;
-- Based on: -- Based on:
-- https://github.com/mapbox/polylabel/blob/master/index.js -- https://github.com/mapbox/polylabel/blob/master/index.js
-- https://sites.google.com/site/polesofinaccessibility/ -- https://sites.google.com/site/polesofinaccessibility/
@@ -1468,7 +1522,7 @@ AS $$
from crankshaft.clustering import Getis from crankshaft.clustering import Getis
getis = Getis() getis = Getis()
return getis.getis_ord(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col) return getis.getis_ord(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- TODO: make a version that accepts the values as arrays -- TODO: make a version that accepts the values as arrays
@@ -1808,7 +1862,7 @@ gwr = GWR()
return gwr.gwr(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col) return gwr.gwr(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION
@@ -1826,7 +1880,7 @@ gwr = GWR()
return gwr.gwr_predict(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col) return gwr.gwr_predict(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- --
-- Creates N points randomly distributed arround the polygon -- Creates N points randomly distributed arround the polygon
-- --

View File

@@ -4,7 +4,7 @@
-- Version number of the extension release -- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version() CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$ RETURNS text AS $$
SELECT '0.9.0'::text; SELECT '0.9.2'::text;
$$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE; $$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE;
-- Internal identifier of the installed extension instence -- Internal identifier of the installed extension instence
@@ -21,7 +21,7 @@ _cdb_random_seeds (seed_value INTEGER) RETURNS VOID
AS $$ AS $$
from crankshaft import random_seeds from crankshaft import random_seeds
random_seeds.set_random_seeds(seed_value) random_seeds.set_random_seeds(seed_value)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION
CDB_PyAggS(current_state Numeric[], current_row Numeric[]) CDB_PyAggS(current_state Numeric[], current_row Numeric[])
returns NUMERIC[] as $$ returns NUMERIC[] as $$
@@ -35,16 +35,25 @@ CREATE OR REPLACE FUNCTION
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; $$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist -- Create aggregate if it did not exist
DO $$ BEGIN DO $$
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) ( BEGIN
SFUNC = CDB_PyAggS, IF NOT EXISTS (
STYPE = Numeric[], SELECT *
PARALLEL = SAFE, FROM pg_catalog.pg_proc p
INITCOND = "{}" LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
); WHERE n.nspname = 'cdb_crankshaft'
EXCEPTION AND p.proname = 'cdb_pyagg'
WHEN duplicate_function THEN NULL; AND p.proisagg)
END $$; THEN
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
SFUNC = CDB_PyAggS,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{}"
);
END IF;
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment( CDB_CreateAndPredictSegment(
@@ -82,13 +91,14 @@ AS $$
target_ids, target_ids,
model_params) model_params)
$$ LANGUAGE plpython3u VOLATILE PARALLEL RESTRICTED; $$ LANGUAGE plpythonu VOLATILE PARALLEL RESTRICTED;
CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment( CDB_CreateAndPredictSegment(
query TEXT, query TEXT,
variable_name TEXT, variable_name TEXT,
target_table TEXT, target_table TEXT,
model_name text DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200, n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3, max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5, subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -105,17 +115,51 @@ AS $$
'learning_rate': learning_rate, 'learning_rate': learning_rate,
'min_samples_leaf': min_samples_leaf 'min_samples_leaf': min_samples_leaf
} }
feature_cols = set(plpy.execute(''' all_cols = list(plpy.execute('''
select * from ({query}) as _w limit 0 select * from ({query}) as _w limit 0
'''.format(query=query)).colnames()) - set([variable_name, 'cartodb_id', ]) '''.format(query=query)).colnames())
feature_cols = [a for a in all_cols
if a not in [variable_name, 'cartodb_id', ]]
return seg.create_and_predict_segment( return seg.create_and_predict_segment(
query, query,
variable_name, variable_name,
feature_cols, feature_cols,
target_table, target_table,
model_params model_params,
model_name=model_name
) )
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_RetrieveModelParams(
model_name text,
param_name text
)
RETURNS TABLE(param numeric, feature_name text) AS $$
# Retrieve a stored model by name and expose one of its attributes
# (e.g. ``feature_importances_``) as a table, one row per feature.
import pickle
from collections import Iterable  # NOTE(review): moved to collections.abc in py3; fine under plpythonu (py2)
plan = plpy.prepare('''
    SELECT model, feature_names FROM model_storage
    WHERE name = $1;
''', ['text', ])
try:
    model_encoded = plpy.execute(plan, [model_name, ])
except plpy.SPIError as err:
    # plpy.error raises, aborting the function with the wrapped message
    plpy.error('ERROR: {}'.format(err))
plpy.notice(model_encoded[0]['feature_names'])
# model column is a pickled (bytea) estimator. Unpickling executes
# arbitrary code, so only database-local, trusted rows may live here.
model = pickle.loads(
    model_encoded[0]['model']
)
res = getattr(model, param_name)
if not isinstance(res, Iterable):
    # scalar attributes (e.g. n_estimators) cannot be returned as rows
    raise Exception('Cannot return `{}` as a table'.format(param_name))
# pair each attribute value with its feature name -> table rows
return zip(res, model_encoded[0]['feature_names'])
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment( CDB_CreateAndPredictSegment(
@@ -123,6 +167,7 @@ CREATE OR REPLACE FUNCTION
variable TEXT, variable TEXT,
feature_columns TEXT[], feature_columns TEXT[],
target_query TEXT, target_query TEXT,
model_name TEXT DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200, n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3, max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5, subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -144,9 +189,10 @@ AS $$
variable, variable,
feature_columns, feature_columns,
target_query, target_query,
model_params model_params,
model_name=model_name
) )
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_Gravity( CREATE OR REPLACE FUNCTION CDB_Gravity(
IN target_query text, IN target_query text,
IN weight_column text, IN weight_column text,
@@ -656,7 +702,7 @@ AS $$
moran = Moran() moran = Moran()
return moran.global_stat(subquery, column_name, w_type, return moran.global_stat(subquery, column_name, w_type,
num_ngbrs, permutations, geom_col, id_col) num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Local (internal function) - DEPRECATED -- Moran's I Local (internal function) - DEPRECATED
CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION
@@ -681,7 +727,7 @@ AS $$
num_ngbrs, permutations, geom_col, id_col) num_ngbrs, permutations, geom_col, id_col)
# remove spatial lag # remove spatial lag
return [(r[6], r[0], r[1], r[7], r[5]) for r in result] return [(r[6], r[0], r[1], r[7], r[5]) for r in result]
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Local (internal function) -- Moran's I Local (internal function)
CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION
@@ -709,7 +755,7 @@ moran = Moran()
return moran.local_stat(subquery, column_name, w_type, return moran.local_stat(subquery, column_name, w_type,
num_ngbrs, permutations, geom_col, id_col) num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Local (public-facing function) -- Moran's I Local (public-facing function)
@@ -836,7 +882,7 @@ AS $$
# TODO: use named parameters or a dictionary # TODO: use named parameters or a dictionary
return moran.global_rate_stat(subquery, numerator, denominator, w_type, return moran.global_rate_stat(subquery, numerator, denominator, w_type,
num_ngbrs, permutations, geom_col, id_col) num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Local Rate (internal function) - DEPRECATED -- Moran's I Local Rate (internal function) - DEPRECATED
@@ -864,7 +910,7 @@ AS $$
result = moran.local_rate_stat(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col) result = moran.local_rate_stat(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
# remove spatial lag # remove spatial lag
return [(r[6], r[0], r[1], r[7], r[4]) for r in result] return [(r[6], r[0], r[1], r[7], r[4]) for r in result]
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Local Rate (public-facing function) - DEPRECATED -- Moran's I Local Rate (public-facing function) - DEPRECATED
CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION
@@ -920,7 +966,7 @@ return moran.local_rate_stat(
geom_col, geom_col,
id_col id_col
) )
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Rate -- Moran's I Rate
-- Replaces CDB_AreasOfInterestLocalRate -- Replaces CDB_AreasOfInterestLocalRate
@@ -1033,7 +1079,7 @@ from crankshaft.clustering import Kmeans
kmeans = Kmeans() kmeans = Kmeans()
return kmeans.spatial(query, no_clusters, no_init) return kmeans.spatial(query, no_clusters, no_init)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Non-spatial k-means clustering -- Non-spatial k-means clustering
-- query: sql query to retrieve all the needed data -- query: sql query to retrieve all the needed data
@@ -1063,7 +1109,7 @@ kmeans = Kmeans()
return kmeans.nonspatial(query, colnames, no_clusters, return kmeans.nonspatial(query, colnames, no_clusters,
standardize=standardize, standardize=standardize,
id_col=id_col) id_col=id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_WeightedMeanS( CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(
@@ -1104,19 +1150,27 @@ BEGIN
END END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; $$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist -- Create aggregate if it did not exist
DO $$ BEGIN DO $$
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) ( BEGIN
SFUNC = CDB_WeightedMeanS, IF NOT EXISTS (
FINALFUNC = CDB_WeightedMeanF, SELECT *
STYPE = Numeric[], FROM pg_catalog.pg_proc p
PARALLEL = SAFE, LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
INITCOND = "{0.0,0.0,0.0}" WHERE n.nspname = 'cdb_crankshaft'
); AND p.proname = 'cdb_weightedmean'
EXCEPTION AND p.proisagg)
WHEN duplicate_function THEN NULL; THEN
END $$; CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{0.0,0.0,0.0}"
);
END IF;
END
$$ LANGUAGE plpgsql;
-- Spatial Markov -- Spatial Markov
-- input table format: -- input table format:
@@ -1146,7 +1200,7 @@ AS $$
## TODO: use named parameters or a dictionary ## TODO: use named parameters or a dictionary
return markov.spatial_trend(subquery, time_cols, num_classes, w_type, num_ngbrs, permutations, geom_col, id_col) return markov.spatial_trend(subquery, time_cols, num_classes, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- input table format: identical to above but in a predictable format -- input table format: identical to above but in a predictable format
-- Sample function call: -- Sample function call:
@@ -1172,7 +1226,7 @@ $$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
-- from crankshaft.clustering import moran_local -- from crankshaft.clustering import moran_local
-- # TODO: use named parameters or a dictionary -- # TODO: use named parameters or a dictionary
-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs) -- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs)
-- $$ LANGUAGE plpython3u; -- $$ LANGUAGE plpythonu;
-- --
-- -- input table format: -- -- input table format:
-- -- id | geom | date | measurement -- -- id | geom | date | measurement
@@ -1198,7 +1252,7 @@ $$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
-- from crankshaft.clustering import moran_local -- from crankshaft.clustering import moran_local
-- # TODO: use named parameters or a dictionary -- # TODO: use named parameters or a dictionary
-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs) -- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs)
-- $$ LANGUAGE plpython3u; -- $$ LANGUAGE plpythonu;
-- Based on: -- Based on:
-- https://github.com/mapbox/polylabel/blob/master/index.js -- https://github.com/mapbox/polylabel/blob/master/index.js
-- https://sites.google.com/site/polesofinaccessibility/ -- https://sites.google.com/site/polesofinaccessibility/
@@ -1468,7 +1522,7 @@ AS $$
from crankshaft.clustering import Getis from crankshaft.clustering import Getis
getis = Getis() getis = Getis()
return getis.getis_ord(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col) return getis.getis_ord(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- TODO: make a version that accepts the values as arrays -- TODO: make a version that accepts the values as arrays
@@ -1808,7 +1862,7 @@ gwr = GWR()
return gwr.gwr(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col) return gwr.gwr(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION
@@ -1826,7 +1880,7 @@ gwr = GWR()
return gwr.gwr_predict(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col) return gwr.gwr_predict(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- --
-- Creates N points randomly distributed arround the polygon -- Creates N points randomly distributed arround the polygon
-- --

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,5 @@
comment = 'CartoDB Spatial Analysis extension' comment = 'CartoDB Spatial Analysis extension'
default_version = '0.9.0' default_version = '0.9.5'
requires = 'plpython3u, postgis' requires = 'plpythonu, postgis'
superuser = true superuser = true
schema = cdb_crankshaft schema = cdb_crankshaft

View File

@@ -1,5 +0,0 @@
joblib==0.9.4
numpy==1.11.0
scipy==0.17.0
pysal==1.14.3
scikit-learn==0.17.0

View File

@@ -4,4 +4,4 @@ import crankshaft.clustering
import crankshaft.space_time_dynamics import crankshaft.space_time_dynamics
import crankshaft.segmentation import crankshaft.segmentation
import crankshaft.regression import crankshaft.regression
from . import analysis_data_provider import analysis_data_provider

View File

@@ -1,6 +1,6 @@
"""class for fetching data""" """class for fetching data"""
import plpy import plpy
from . import pysal_utils as pu import pysal_utils as pu
NULL_VALUE_ERROR = ('No usable data passed to analysis. Check your input rows ' NULL_VALUE_ERROR = ('No usable data passed to analysis. Check your input rows '
'for null values and fill in appropriately.') 'for null values and fill in appropriately.')

View File

@@ -0,0 +1,76 @@
"""
Based on the Weiszfeld algorithm:
https://en.wikipedia.org/wiki/Geometric_median
"""
# import plpy
import numpy as np
from numpy.linalg import norm
def median_center(tablename, geom_col, num_iters=50, tolerance=0.001):
    """Approximate the geometric median of the points in a table.

    Uses the Weiszfeld iteration (see module docstring). Fixes over the
    draft version: Python 3 ``except`` syntax, the undefined ``coords``
    reference, plpy row indexing (``resp[0]['x_coords']`` -- plpy returns
    a list of row dicts), and the previously-unused ``tolerance``
    parameter now stops the iteration once the center has converged.

    :param tablename: table (or subquery alias) holding the geometries
    :param geom_col: name of the point-geometry column
    :param num_iters: maximum number of Weiszfeld iterations
    :param tolerance: stop early once the center moves less than this
    :returns: numpy array ``[x, y]`` of the estimated median center
    """
    query = '''
        SELECT array_agg(ST_X({geom_col})) As x_coords,
               array_agg(ST_Y({geom_col})) As y_coords
        FROM {tablename}
    '''.format(geom_col=geom_col, tablename=tablename)
    try:
        resp = plpy.execute(query)
        data = np.vstack((resp[0]['x_coords'],
                          resp[0]['y_coords'])).T
        plpy.notice('coords: %s' % str(data))
    except Exception as err:
        # plpy only exists inside the database; fall back to synthetic
        # data for local testing
        print('No plpy')
        data = np.array([[1.2 * np.random.random() + 10.,
                          1.1 * (np.random.random() - 1.) + 3.]
                         for i in range(1, 100)])

    # initialize 'median center' to be the mean
    coords_center_temp = data.mean(axis=0)
    print('temp_center: %s' % str(coords_center_temp))

    for i in range(0, num_iters):
        old_coords_center = coords_center_temp.copy()
        # Weiszfeld update: weighted mean with weights 1/distance
        denom = denominator(coords_center_temp, data)
        coords_center_temp = np.sum([data[j] * numerator(coords_center_temp,
                                                         data[j])
                                     for j in range(len(data))], axis=0)
        coords_center_temp = coords_center_temp / denom
        shift = np.linalg.norm(old_coords_center - coords_center_temp)
        print("Pass #%d" % i)
        print("max, min of data: %0.4f, %0.4f" % (data.max(), data.min()))
        print('temp_center: %s' % str(coords_center_temp))
        print("Change in center: %0.4f" % shift)
        if shift < tolerance:
            # converged -- further passes cannot move the center meaningfully
            break

    print("Center coords: %s" % str(coords_center_temp))
    print("Objective Function: %0.4f" % obj_func(coords_center_temp, data))
    return coords_center_temp
def obj_func(center_coords, data):
    """Weiszfeld objective: total Euclidean distance from the center.

    The geometric median minimizes the *sum* of point-to-center
    distances. The draft computed ``np.linalg.norm(center - data)``,
    i.e. the Frobenius norm of the residual matrix (sqrt of the sum of
    *squared* distances), which is a different objective.

    :param center_coords: candidate center, shape (2,)
    :param data: point coordinates, shape (n, 2)
    :returns: float, sum of Euclidean distances
    """
    return float(np.sum(np.linalg.norm(center_coords - data, axis=1)))
def numerator(center_coords, data_i):
    """Weiszfeld weight for one point: 1 / distance(center, point).

    :param center_coords: current center estimate, shape (2,)
    :param data_i: a single point, shape (2,)
    :returns: reciprocal of the Euclidean distance
    """
    distance = np.linalg.norm(center_coords - data_i)
    return 1.0 / distance
def denominator(center_coords, data):
    """Weiszfeld denominator: sum of reciprocal distances to each point.

    The Weiszfeld update divides by ``sum_i 1/d_i`` where ``d_i`` is the
    distance from the current center to point ``i``. The draft returned
    the reciprocal of a single matrix norm over all residuals, which is
    not that quantity and skews the update.

    :param center_coords: current center estimate, shape (2,)
    :param data: point coordinates, shape (n, 2)
    :returns: float, sum over points of 1/distance
    """
    dists = np.linalg.norm(data - center_coords, axis=1)
    return float(np.sum(np.reciprocal(dists)))

View File

@@ -1,4 +1,4 @@
"""Import all functions from for clustering""" """Import all functions from for clustering"""
from .moran import * from moran import *
from .kmeans import * from kmeans import *
from .getis import * from getis import *

View File

@@ -47,4 +47,4 @@ class Getis(object):
getis = ps.esda.getisord.G_Local(attr_vals, weight, getis = ps.esda.getisord.G_Local(attr_vals, weight,
star=True, permutations=permutations) star=True, permutations=permutations)
return list(zip(getis.z_sim, getis.p_sim, getis.p_z_sim, weight.id_order)) return zip(getis.z_sim, getis.p_sim, getis.p_z_sim, weight.id_order)

View File

@@ -28,8 +28,8 @@ class Kmeans(object):
ids = result[0]['ids'] ids = result[0]['ids']
km = KMeans(n_clusters=no_clusters, n_init=no_init) km = KMeans(n_clusters=no_clusters, n_init=no_init)
labels = km.fit_predict(list(zip(xs, ys))) labels = km.fit_predict(zip(xs, ys))
return list(zip(ids, labels)) return zip(ids, labels)
def nonspatial(self, subquery, colnames, no_clusters=5, def nonspatial(self, subquery, colnames, no_clusters=5,
standardize=True, id_col='cartodb_id'): standardize=True, id_col='cartodb_id'):
@@ -75,18 +75,18 @@ class Kmeans(object):
kmeans = KMeans(n_clusters=no_clusters, kmeans = KMeans(n_clusters=no_clusters,
random_state=0).fit(cluster_columns) random_state=0).fit(cluster_columns)
centers = [json.dumps(dict(list(zip(colnames, c)))) centers = [json.dumps(dict(zip(colnames, c)))
for c in kmeans.cluster_centers_[kmeans.labels_]] for c in kmeans.cluster_centers_[kmeans.labels_]]
silhouettes = metrics.silhouette_samples(cluster_columns, silhouettes = metrics.silhouette_samples(cluster_columns,
kmeans.labels_, kmeans.labels_,
metric='sqeuclidean') metric='sqeuclidean')
return list(zip(kmeans.labels_, return zip(kmeans.labels_,
centers, centers,
silhouettes, silhouettes,
[kmeans.inertia_] * kmeans.labels_.shape[0], [kmeans.inertia_] * kmeans.labels_.shape[0],
data[0]['rowid'])) data[0]['rowid'])
# -- Preprocessing steps # -- Preprocessing steps
@@ -99,7 +99,7 @@ def _extract_columns(data):
# number of columns minus rowid column # number of columns minus rowid column
n_cols = len(data[0]) - 1 n_cols = len(data[0]) - 1
return np.array([data[0]['arr_col{0}'.format(i+1)] return np.array([data[0]['arr_col{0}'.format(i+1)]
for i in range(n_cols)], for i in xrange(n_cols)],
dtype=float).T dtype=float).T

View File

@@ -75,7 +75,7 @@ class Moran(object):
moran_global = ps.esda.moran.Moran(attr_vals, weight, moran_global = ps.esda.moran.Moran(attr_vals, weight,
permutations=permutations) permutations=permutations)
return list(zip([moran_global.I], [moran_global.EI])) return zip([moran_global.I], [moran_global.EI])
def local_stat(self, subquery, attr, def local_stat(self, subquery, attr,
w_type, num_ngbrs, permutations, geom_col, id_col): w_type, num_ngbrs, permutations, geom_col, id_col):
@@ -139,7 +139,7 @@ class Moran(object):
lag = ps.weights.spatial_lag.lag_spatial(weight, lisa.y) lag = ps.weights.spatial_lag.lag_spatial(weight, lisa.y)
lag_std = ps.weights.spatial_lag.lag_spatial(weight, lisa.z) lag_std = ps.weights.spatial_lag.lag_spatial(weight, lisa.z)
return list(zip( return zip(
quads, quads,
lisa.p_sim, lisa.p_sim,
lag, lag,
@@ -148,7 +148,7 @@ class Moran(object):
lisa.z, lisa.z,
lisa.Is, lisa.Is,
weight.id_order weight.id_order
)) )
def global_rate_stat(self, subquery, numerator, denominator, def global_rate_stat(self, subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col): w_type, num_ngbrs, permutations, geom_col, id_col):
@@ -194,7 +194,7 @@ class Moran(object):
lisa_rate = ps.esda.moran.Moran_Rate(numer, denom, weight, lisa_rate = ps.esda.moran.Moran_Rate(numer, denom, weight,
permutations=permutations) permutations=permutations)
return list(zip([lisa_rate.I], [lisa_rate.EI])) return zip([lisa_rate.I], [lisa_rate.EI])
def local_rate_stat(self, subquery, numerator, denominator, def local_rate_stat(self, subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col): w_type, num_ngbrs, permutations, geom_col, id_col):
@@ -262,7 +262,7 @@ class Moran(object):
lag = ps.weights.spatial_lag.lag_spatial(weight, lisa.y) lag = ps.weights.spatial_lag.lag_spatial(weight, lisa.y)
lag_std = ps.weights.spatial_lag.lag_spatial(weight, lisa.z) lag_std = ps.weights.spatial_lag.lag_spatial(weight, lisa.z)
return list(zip( return zip(
quads, quads,
lisa.p_sim, lisa.p_sim,
lag, lag,
@@ -271,7 +271,7 @@ class Moran(object):
lisa.z, lisa.z,
lisa.Is, lisa.Is,
weight.id_order weight.id_order
)) )
def local_bivariate_stat(self, subquery, attr1, attr2, def local_bivariate_stat(self, subquery, attr1, attr2,
permutations, geom_col, id_col, permutations, geom_col, id_col,
@@ -303,7 +303,7 @@ class Moran(object):
# find clustering of significance # find clustering of significance
lisa_sig = quad_position(lisa.q) lisa_sig = quad_position(lisa.q)
return list(zip(lisa.Is, lisa_sig, lisa.p_sim, weight.id_order)) return zip(lisa.Is, lisa_sig, lisa.p_sim, weight.id_order)
# Low level functions ---------------------------------------- # Low level functions ----------------------------------------

View File

@@ -0,0 +1 @@
from core import set_model, get_model, create_model_table

View File

@@ -0,0 +1,86 @@
import time
import plpy
import pickle
from petname import generate
def create_model_table():
    """Create the model_storage table used to persist pickled models.

    Idempotent: uses CREATE TABLE IF NOT EXISTS, so repeated calls are
    safe. Emits plpy notices so progress shows up in the client logs.
    """
    ddl = '''
        create table if not exists model_storage(
            description text,
            name text unique,
            model bytea,
            feature_names text[],
            date_created timestamptz,
            id serial primary key);
        '''
    plpy.notice(ddl)
    result = plpy.execute(plpy.prepare(ddl))
    plpy.notice('Model table successfully created')
    plpy.notice(str(result))
def get_model(model_name):
    """Fetch and unpickle the model stored under ``model_name``.

    Returns the unpickled estimator, or None unless exactly one
    matching row exists in model_storage. SPI failures abort via
    plpy.error.
    """
    try:
        lookup = plpy.prepare('''
            SELECT model FROM model_storage
            WHERE name = $1;
        ''', ['text', ])
        rows = plpy.execute(lookup, [model_name, ])
        if len(rows) != 1:
            plpy.notice('Model not found, or too many models '
                        '({})'.format(len(rows)))
            model = None
        else:
            # stored as pickled bytea -- database-local, trusted data
            model = pickle.loads(rows[0]['model'])
            plpy.notice('Model successfully loaded')
    except plpy.SPIError as err:
        plpy.error('ERROR: {}'.format(err))
    return model
def set_model(model, model_name, feature_names):
    """stores the model in the table model_storage"""
    # Generate a two-word readable name when the caller supplied none.
    # ``generate`` comes from the third-party ``petname`` package.
    if model_name is None:
        model_name = generate(words=2, separator='_', letters=8)
    # Collect every name already present so we can avoid violating the
    # UNIQUE constraint on model_storage.name (array_agg -> one row whose
    # 'name' field is a list, or None when the table is empty).
    existing_names = plpy.execute('''
        SELECT array_agg(name) as name
        FROM model_storage
    ''')
    # Debug output: shape and content of the name-aggregation result
    plpy.notice('nrows: {}'.format(existing_names.nrows()))
    plpy.notice('MODEL NAME: {}'.format(model_name))
    plpy.notice('LEN of ms: {}'.format(len(existing_names)))
    plpy.notice('existing_names: {}'.format(str(existing_names)))
    plpy.notice('existing_names: {}'.format(str(existing_names[0]['name'])))
    plpy.notice('type existing_names: {}'.format(type(existing_names[0]['name'])))
    # Regenerate until the name is unique.  NOTE(review): this also
    # silently renames a caller-supplied model_name on collision instead
    # of erroring — confirm that is the intended contract.
    if existing_names[0]['name'] is not None:
        while model_name in existing_names[0]['name']:
            model_name = generate(words=2, separator='_', letters=10)
            plpy.notice(model_name)
    # store model
    try:
        # feature_names is bound as a text parameter holding a Postgres
        # array literal ('{a,b,c}') and cast server-side via $4::text[]
        plan = plpy.prepare('''
            INSERT INTO model_storage(description, name, model, feature_names, date_created)
            VALUES (
                $1,
                $2,
                $3,
                $4::text[],
                to_timestamp($5));
        ''', ['text', 'text', 'bytea', 'text', 'numeric'])
        plpy.notice('{%s}' % ','.join(feature_names))
        plpy.notice(feature_names)
        # description: the model's repr collapsed to a single line;
        # model: pickled into the bytea column; timestamp: epoch seconds
        plpy.execute(
            plan,
            [' '.join(m.strip() for m in model.__repr__().split('\n')),
             model_name,
             pickle.dumps(model),
             '{%s}' % ','.join(feature_names),
             time.time()]
        )
        plpy.notice('model successfully stored as {}'.format(model_name))
    except plpy.SPIError as err:
        # Best-effort: report the failure as a notice rather than aborting
        plpy.notice('ERROR: {}\nt: {}'.format(err, time.time()))

View File

@@ -27,7 +27,7 @@ def get_weight(query_res, w_type='knn', num_ngbrs=5):
""" """
neighbors = {x['id']: x['neighbors'] for x in query_res} neighbors = {x['id']: x['neighbors'] for x in query_res}
print('len of neighbors: %d' % len(neighbors)) print 'len of neighbors: %d' % len(neighbors)
built_weight = ps.W(neighbors) built_weight = ps.W(neighbors)
built_weight.transform = 'r' built_weight.transform = 'r'

View File

@@ -1,4 +1,4 @@
from . import glm import glm
from . import family import family
from . import utils import utils
from . import iwls import iwls

View File

@@ -1,9 +1,8 @@
from __future__ import print_function
import numpy as np import numpy as np
from scipy import stats from scipy import stats
from .utils import cache_readonly from utils import cache_readonly
from functools import reduce
class Results(object): class Results(object):
""" """

View File

@@ -7,8 +7,8 @@ The one parameter exponential family distributions used by GLM.
import numpy as np import numpy as np
from scipy import special from scipy import special
from . import links as L import links as L
from . import varfuncs as V import varfuncs as V
FLOAT_EPS = np.finfo(float).eps FLOAT_EPS = np.finfo(float).eps

View File

@@ -3,10 +3,10 @@ import numpy as np
import numpy.linalg as la import numpy.linalg as la
from pysal.spreg.utils import RegressionPropsY, spdot from pysal.spreg.utils import RegressionPropsY, spdot
import pysal.spreg.user_output as USER import pysal.spreg.user_output as USER
from .utils import cache_readonly from utils import cache_readonly
from .base import LikelihoodModelResults from base import LikelihoodModelResults
from . import family import family
from .iwls import iwls from iwls import iwls
__all__ = ['GLM'] __all__ = ['GLM']

View File

@@ -3,7 +3,7 @@ import numpy.linalg as la
from scipy import sparse as sp from scipy import sparse as sp
from scipy.sparse import linalg as spla from scipy.sparse import linalg as spla
from pysal.spreg.utils import spdot, spmultiply from pysal.spreg.utils import spdot, spmultiply
from .family import Binomial, Poisson from family import Binomial, Poisson
def _compute_betas(y, x): def _compute_betas(y, x):
""" """
@@ -49,7 +49,7 @@ def iwls(y, x, family, offset=1.0, ini_betas=None, tol=1.0e-8, max_iter=200, wi=
if isinstance(family, Binomial): if isinstance(family, Binomial):
y = family.link._clean(y) y = family.link._clean(y)
if isinstance(family, Poisson): if isinstance(family, Poisson):
y_off = y/offset y_off = y/offset
y_off = family.starting_mu(y_off) y_off = family.starting_mu(y_off)
v = family.predict(y_off) v = family.predict(y_off)
mu = family.starting_mu(y) mu = family.starting_mu(y)
@@ -58,13 +58,13 @@ def iwls(y, x, family, offset=1.0, ini_betas=None, tol=1.0e-8, max_iter=200, wi=
v = family.predict(mu) v = family.predict(mu)
while diff > tol and n_iter < max_iter: while diff > tol and n_iter < max_iter:
n_iter += 1 n_iter += 1
w = family.weights(mu) w = family.weights(mu)
z = v + (family.link.deriv(mu)*(y-mu)) z = v + (family.link.deriv(mu)*(y-mu))
w = np.sqrt(w) w = np.sqrt(w)
if type(x) != np.ndarray: if type(x) != np.ndarray:
w = sp.csr_matrix(w) w = sp.csr_matrix(w)
z = sp.csr_matrix(z) z = sp.csr_matrix(z)
wx = spmultiply(x, w, array_out=False) wx = spmultiply(x, w, array_out=False)
wz = spmultiply(z, w, array_out=False) wz = spmultiply(z, w, array_out=False)
if wi is None: if wi is None:

View File

@@ -1,5 +1,5 @@
from __future__ import absolute_import, print_function
import numpy as np import numpy as np
import warnings import warnings
@@ -17,7 +17,7 @@ try:
from scipy.lib._version import NumpyVersion from scipy.lib._version import NumpyVersion
except ImportError: except ImportError:
import re import re
string_types = str string_types = basestring
class NumpyVersion(): class NumpyVersion():
"""Parse and compare numpy version strings. """Parse and compare numpy version strings.

View File

@@ -1 +1 @@
from .base import * from base import *

View File

@@ -1,4 +1,4 @@
from . import gwr import gwr
from . import sel_bw import sel_bw
from . import diagnostics import diagnostics
from . import kernels import kernels

View File

@@ -7,8 +7,8 @@ __author__ = "Taylor Oshan Tayoshan@gmail.com"
import numpy as np import numpy as np
import numpy.linalg as la import numpy.linalg as la
from scipy.stats import t from scipy.stats import t
from .kernels import * from kernels import *
from .diagnostics import get_AIC, get_AICc, get_BIC from diagnostics import get_AIC, get_AICc, get_BIC
import pysal.spreg.user_output as USER import pysal.spreg.user_output as USER
from crankshaft.regression.glm.family import Gaussian, Binomial, Poisson from crankshaft.regression.glm.family import Gaussian, Binomial, Poisson
from crankshaft.regression.glm.glm import GLM, GLMResults from crankshaft.regression.glm.glm import GLM, GLMResults
@@ -156,7 +156,7 @@ class GWR(GLM):
self.kernel = kernel self.kernel = kernel
self.fixed = fixed self.fixed = fixed
if offset is None: if offset is None:
self.offset = np.ones((self.n, 1)) self.offset = np.ones((self.n, 1))
else: else:
self.offset = offset * 1.0 self.offset = offset * 1.0
self.fit_params = {} self.fit_params = {}
@@ -169,7 +169,7 @@ class GWR(GLM):
def _build_W(self, fixed, kernel, coords, bw, points=None): def _build_W(self, fixed, kernel, coords, bw, points=None):
if fixed: if fixed:
try: try:
W = fk[kernel](coords, bw, points) W = fk[kernel](coords, bw, points)
except: except:
raise TypeError('Unsupported kernel function ', kernel) raise TypeError('Unsupported kernel function ', kernel)
else: else:
@@ -177,6 +177,7 @@ class GWR(GLM):
W = ak[kernel](coords, bw, points) W = ak[kernel](coords, bw, points)
except: except:
raise TypeError('Unsupported kernel function ', kernel) raise TypeError('Unsupported kernel function ', kernel)
return W return W
def fit(self, ini_params=None, tol=1.0e-5, max_iter=20, solve='iwls'): def fit(self, ini_params=None, tol=1.0e-5, max_iter=20, solve='iwls'):
@@ -217,7 +218,8 @@ class GWR(GLM):
p = np.zeros((m, 1)) p = np.zeros((m, 1))
for i in range(m): for i in range(m):
wi = self.W[i].reshape((-1,1)) wi = self.W[i].reshape((-1,1))
rslt = iwls(self.y, self.X, self.family, self.offset, ini_params, tol, max_iter, wi=wi) rslt = iwls(self.y, self.X, self.family, self.offset,
ini_params, tol, max_iter, wi=wi)
params[i,:] = rslt[0].T params[i,:] = rslt[0].T
predy[i] = rslt[1][i] predy[i] = rslt[1][i]
v[i] = rslt[2][i] v[i] = rslt[2][i]
@@ -257,7 +259,7 @@ class GWR(GLM):
fit_params : dict fit_params : dict
key-value pairs of parameters that will be passed into fit method to define estimation key-value pairs of parameters that will be passed into fit method to define estimation
routine; see fit method for more details routine; see fit method for more details
""" """
if (exog_scale is None) & (exog_resid is None): if (exog_scale is None) & (exog_resid is None):
train_gwr = self.fit(**fit_params) train_gwr = self.fit(**fit_params)
@@ -496,7 +498,7 @@ class GWRResults(GLMResults):
""" """
if exog_scale is not None: if exog_scale is not None:
return cov*exog_scale return cov*exog_scale
else: else:
return cov*self.scale return cov*self.scale
@@ -520,7 +522,7 @@ class GWRResults(GLMResults):
weighted mean of y weighted mean of y
""" """
if self.model.points is not None: if self.model.points is not None:
n = len(self.model.points) n = len(self.model.points)
else: else:
n = self.n n = self.n
off = self.offset.reshape((-1,1)) off = self.offset.reshape((-1,1))
@@ -543,13 +545,13 @@ class GWRResults(GLMResults):
""" """
if self.model.points is not None: if self.model.points is not None:
n = len(self.model.points) n = len(self.model.points)
else: else:
n = self.n n = self.n
TSS = np.zeros(shape=(n,1)) TSS = np.zeros(shape=(n,1))
for i in range(n): for i in range(n):
TSS[i] = np.sum(np.reshape(np.array(self.W[i]), (-1,1)) * TSS[i] = np.sum(np.reshape(np.array(self.W[i]), (-1,1)) *
(self.y.reshape((-1,1)) - self.y_bar[i])**2) (self.y.reshape((-1,1)) - self.y_bar[i])**2)
return TSS return TSS
@cache_readonly @cache_readonly
@@ -563,15 +565,15 @@ class GWRResults(GLMResults):
relationships. relationships.
""" """
if self.model.points is not None: if self.model.points is not None:
n = len(self.model.points) n = len(self.model.points)
resid = self.model.exog_resid.reshape((-1,1)) resid = self.model.exog_resid.reshape((-1,1))
else: else:
n = self.n n = self.n
resid = self.resid_response.reshape((-1,1)) resid = self.resid_response.reshape((-1,1))
RSS = np.zeros(shape=(n,1)) RSS = np.zeros(shape=(n,1))
for i in range(n): for i in range(n):
RSS[i] = np.sum(np.reshape(np.array(self.W[i]), (-1,1)) RSS[i] = np.sum(np.reshape(np.array(self.W[i]), (-1,1))
* resid**2) * resid**2)
return RSS return RSS
@cache_readonly @cache_readonly
@@ -617,10 +619,10 @@ class GWRResults(GLMResults):
""" """
if isinstance(self.family, (Poisson, Binomial)): if isinstance(self.family, (Poisson, Binomial)):
return self.resid_ss/(self.n - 2.0*self.tr_S + return self.resid_ss/(self.n - 2.0*self.tr_S +
self.tr_STS) #could be changed to SWSTW - nothing to test against self.tr_STS) #could be changed to SWSTW - nothing to test against
else: else:
return self.resid_ss/(self.n - 2.0*self.tr_S + return self.resid_ss/(self.n - 2.0*self.tr_S +
self.tr_STS) #could be changed to SWSTW - nothing to test against self.tr_STS) #could be changed to SWSTW - nothing to test against
@cache_readonly @cache_readonly
def sigma2_ML(self): def sigma2_ML(self):
""" """
@@ -673,14 +675,14 @@ class GWRResults(GLMResults):
Note: in (9.11), p should be tr(S), that is, the effective number of parameters Note: in (9.11), p should be tr(S), that is, the effective number of parameters
""" """
return self.std_res**2 * self.influ / (self.tr_S * (1.0-self.influ)) return self.std_res**2 * self.influ / (self.tr_S * (1.0-self.influ))
@cache_readonly @cache_readonly
def deviance(self): def deviance(self):
off = self.offset.reshape((-1,1)).T off = self.offset.reshape((-1,1)).T
y = self.y y = self.y
ybar = self.y_bar ybar = self.y_bar
if isinstance(self.family, Gaussian): if isinstance(self.family, Gaussian):
raise NotImplementedError('deviance not currently used for Gaussian') raise NotImplementedError('deviance not currently used for Gaussian')
elif isinstance(self.family, Poisson): elif isinstance(self.family, Poisson):
dev = np.sum(2.0*self.W*(y*np.log(y/(ybar*off))-(y-ybar*off)),axis=1) dev = np.sum(2.0*self.W*(y*np.log(y/(ybar*off))-(y-ybar*off)),axis=1)
elif isinstance(self.family, Binomial): elif isinstance(self.family, Binomial):
@@ -690,7 +692,7 @@ class GWRResults(GLMResults):
@cache_readonly @cache_readonly
def resid_deviance(self): def resid_deviance(self):
if isinstance(self.family, Gaussian): if isinstance(self.family, Gaussian):
raise NotImplementedError('deviance not currently used for Gaussian') raise NotImplementedError('deviance not currently used for Gaussian')
else: else:
off = self.offset.reshape((-1,1)).T off = self.offset.reshape((-1,1)).T
y = self.y y = self.y
@@ -708,7 +710,7 @@ class GWRResults(GLMResults):
manual. Equivalent to 1 - (deviance/null deviance) manual. Equivalent to 1 - (deviance/null deviance)
""" """
if isinstance(self.family, Gaussian): if isinstance(self.family, Gaussian):
raise NotImplementedError('Not implemented for Gaussian') raise NotImplementedError('Not implemented for Gaussian')
else: else:
return 1.0 - (self.resid_deviance/self.deviance) return 1.0 - (self.resid_deviance/self.deviance)
@@ -831,8 +833,8 @@ class GWRResults(GLMResults):
def predictions(self): def predictions(self):
P = self.model.P P = self.model.P
if P is None: if P is None:
raise NotImplementedError('predictions only avaialble if predict' raise NotImplementedError('predictions only avaialble if predict'
'method called on GWR model') 'method called on GWR model')
else: else:
predictions = np.sum(P*self.params, axis=1).reshape((-1,1)) predictions = np.sum(P*self.params, axis=1).reshape((-1,1))
return predictions return predictions
@@ -985,7 +987,7 @@ class FBGWR(GWR):
self.fixed = fixed self.fixed = fixed
self.constant = constant self.constant = constant
if constant: if constant:
self.X = USER.check_constant(self.X) self.X = USER.check_constant(self.X)
def fit(self, ini_params=None, tol=1.0e-5, max_iter=20, solve='iwls'): def fit(self, ini_params=None, tol=1.0e-5, max_iter=20, solve='iwls'):
""" """

View File

@@ -47,14 +47,14 @@ def golden_section(a, c, delta, function, tol, max_iter, int_score=False):
while np.abs(diff) > tol and iters < max_iter: while np.abs(diff) > tol and iters < max_iter:
iters += 1 iters += 1
if int_score: if int_score:
b = np.round(b) b = np.round(b)
d = np.round(d) d = np.round(d)
score_a = function(a) score_a = function(a)
score_b = function(b) score_b = function(b)
score_c = function(c) score_c = function(c)
score_d = function(d) score_d = function(d)
if score_b <= score_d: if score_b <= score_d:
opt_val = b opt_val = b
opt_score = score_b opt_score = score_b
@@ -73,7 +73,7 @@ def golden_section(a, c, delta, function, tol, max_iter, int_score=False):
#d = np.round(b) #d = np.round(b)
#if int_score: #if int_score:
# opt_val = np.round(opt_val) # opt_val = np.round(opt_val)
output.append((opt_val, opt_score)) output.append((opt_val, opt_score))
diff = score_b - score_d diff = score_b - score_d
score = opt_score score = opt_score
@@ -146,7 +146,7 @@ def flexible_bw(init, y, X, n, k, family, tol, max_iter, rss_score,
gwr_func, bw_func, sel_func): gwr_func, bw_func, sel_func):
if init: if init:
bw = sel_func(bw_func(y, X)) bw = sel_func(bw_func(y, X))
print(bw) print bw
optim_model = gwr_func(y, X, bw) optim_model = gwr_func(y, X, bw)
err = optim_model.resid_response.reshape((-1,1)) err = optim_model.resid_response.reshape((-1,1))
est = optim_model.params est = optim_model.params
@@ -198,7 +198,7 @@ def flexible_bw(init, y, X, n, k, family, tol, max_iter, rss_score,
new_rss = np.sum((y - predy)**2) new_rss = np.sum((y - predy)**2)
score = np.abs((new_rss - rss)/new_rss) score = np.abs((new_rss - rss)/new_rss)
rss = new_rss rss = new_rss
print(score) print score
scores.append(score) scores.append(score)
delta = score delta = score
BWs.append(bws) BWs.append(bws)

View File

@@ -8,12 +8,12 @@
__author__ = "Taylor Oshan Tayoshan@gmail.com" __author__ = "Taylor Oshan Tayoshan@gmail.com"
from .kernels import * from kernels import *
from .search import golden_section, equal_interval, flexible_bw from search import golden_section, equal_interval, flexible_bw
from .gwr import GWR from gwr import GWR
from crankshaft.regression.glm.family import Gaussian, Poisson, Binomial from crankshaft.regression.glm.family import Gaussian, Poisson, Binomial
import pysal.spreg.user_output as USER import pysal.spreg.user_output as USER
from .diagnostics import get_AICc, get_AIC, get_BIC, get_CV from diagnostics import get_AICc, get_AIC, get_BIC, get_CV
from scipy.spatial.distance import pdist, squareform from scipy.spatial.distance import pdist, squareform
from pysal.common import KDTree from pysal.common import KDTree
import numpy as np import numpy as np
@@ -197,7 +197,7 @@ class Sel_BW(object):
if self.fb: if self.fb:
self._fbw() self._fbw()
print(self.bw[1]) print self.bw[1]
self.XB = self.bw[4] self.XB = self.bw[4]
self.err = self.bw[5] self.err = self.bw[5]
else: else:

View File

@@ -14,7 +14,7 @@ import pysal
class TestGWRGaussian(unittest.TestCase): class TestGWRGaussian(unittest.TestCase):
def setUp(self): def setUp(self):
data = pysal.open(pysal.examples.get_path('GData_utm.csv')) data = pysal.open(pysal.examples.get_path('GData_utm.csv'))
self.coords = list(zip(data.by_col('X'), data.by_col('Y'))) self.coords = zip(data.by_col('X'), data.by_col('Y'))
self.y = np.array(data.by_col('PctBach')).reshape((-1,1)) self.y = np.array(data.by_col('PctBach')).reshape((-1,1))
rural = np.array(data.by_col('PctRural')).reshape((-1,1)) rural = np.array(data.by_col('PctRural')).reshape((-1,1))
pov = np.array(data.by_col('PctPov')).reshape((-1,1)) pov = np.array(data.by_col('PctPov')).reshape((-1,1))
@@ -56,10 +56,10 @@ class TestGWRGaussian(unittest.TestCase):
BIC = get_BIC(rslt) BIC = get_BIC(rslt)
CV = get_CV(rslt) CV = get_CV(rslt)
self.assertAlmostEqual(np.floor(AICc), 894.0) self.assertAlmostEquals(np.floor(AICc), 894.0)
self.assertAlmostEqual(np.floor(AIC), 890.0) self.assertAlmostEquals(np.floor(AIC), 890.0)
self.assertAlmostEqual(np.floor(BIC), 944.0) self.assertAlmostEquals(np.floor(BIC), 944.0)
self.assertAlmostEqual(np.round(CV,2), 18.25) self.assertAlmostEquals(np.round(CV,2), 18.25)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04) np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-04) np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-04)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-04) np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-04)
@@ -107,10 +107,10 @@ class TestGWRGaussian(unittest.TestCase):
BIC = get_BIC(rslt) BIC = get_BIC(rslt)
CV = get_CV(rslt) CV = get_CV(rslt)
self.assertAlmostEqual(np.floor(AICc), 896.0) self.assertAlmostEquals(np.floor(AICc), 896.0)
self.assertAlmostEqual(np.floor(AIC), 892.0) self.assertAlmostEquals(np.floor(AIC), 892.0)
self.assertAlmostEqual(np.floor(BIC), 941.0) self.assertAlmostEquals(np.floor(BIC), 941.0)
self.assertAlmostEqual(np.around(CV, 2), 19.19) self.assertAlmostEquals(np.around(CV, 2), 19.19)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04) np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-04) np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-04)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-04) np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-04)
@@ -159,10 +159,10 @@ class TestGWRGaussian(unittest.TestCase):
BIC = get_BIC(rslt) BIC = get_BIC(rslt)
CV = get_CV(rslt) CV = get_CV(rslt)
self.assertAlmostEqual(np.floor(AICc), 895.0) self.assertAlmostEquals(np.floor(AICc), 895.0)
self.assertAlmostEqual(np.floor(AIC), 890.0) self.assertAlmostEquals(np.floor(AIC), 890.0)
self.assertAlmostEqual(np.floor(BIC), 943.0) self.assertAlmostEquals(np.floor(BIC), 943.0)
self.assertAlmostEqual(np.around(CV, 2), 18.21) self.assertAlmostEquals(np.around(CV, 2), 18.21)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04) np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-04) np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-04)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-04) np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-04)
@@ -211,10 +211,10 @@ class TestGWRGaussian(unittest.TestCase):
BIC = get_BIC(rslt) BIC = get_BIC(rslt)
CV = get_CV(rslt) CV = get_CV(rslt)
self.assertAlmostEqual(np.floor(AICc), 896) self.assertAlmostEquals(np.floor(AICc), 896)
self.assertAlmostEqual(np.floor(AIC), 894.0) self.assertAlmostEquals(np.floor(AIC), 894.0)
self.assertAlmostEqual(np.floor(BIC), 922.0) self.assertAlmostEquals(np.floor(BIC), 922.0)
self.assertAlmostEqual(np.around(CV, 2), 17.91) self.assertAlmostEquals(np.around(CV, 2), 17.91)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04) np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-04) np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-04)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-04) np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-04)
@@ -314,7 +314,7 @@ class TestGWRGaussian(unittest.TestCase):
class TestGWRPoisson(unittest.TestCase): class TestGWRPoisson(unittest.TestCase):
def setUp(self): def setUp(self):
data = pysal.open(pysal.examples.get_path('Tokyomortality.csv'), mode='Ur') data = pysal.open(pysal.examples.get_path('Tokyomortality.csv'), mode='Ur')
self.coords = list(zip(data.by_col('X_CENTROID'), data.by_col('Y_CENTROID'))) self.coords = zip(data.by_col('X_CENTROID'), data.by_col('Y_CENTROID'))
self.y = np.array(data.by_col('db2564')).reshape((-1,1)) self.y = np.array(data.by_col('db2564')).reshape((-1,1))
self.off = np.array(data.by_col('eb2564')).reshape((-1,1)) self.off = np.array(data.by_col('eb2564')).reshape((-1,1))
OCC = np.array(data.by_col('OCC_TEC')).reshape((-1,1)) OCC = np.array(data.by_col('OCC_TEC')).reshape((-1,1))
@@ -355,9 +355,9 @@ class TestGWRPoisson(unittest.TestCase):
AIC = get_AIC(rslt) AIC = get_AIC(rslt)
BIC = get_BIC(rslt) BIC = get_BIC(rslt)
self.assertAlmostEqual(np.floor(AICc), 13294.0) self.assertAlmostEquals(np.floor(AICc), 13294.0)
self.assertAlmostEqual(np.floor(AIC), 13247.0) self.assertAlmostEquals(np.floor(AIC), 13247.0)
self.assertAlmostEqual(np.floor(BIC), 13485.0) self.assertAlmostEquals(np.floor(BIC), 13485.0)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-05) np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-05)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-03) np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-03)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-03) np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-03)
@@ -404,9 +404,9 @@ class TestGWRPoisson(unittest.TestCase):
AIC = get_AIC(rslt) AIC = get_AIC(rslt)
BIC = get_BIC(rslt) BIC = get_BIC(rslt)
self.assertAlmostEqual(np.floor(AICc), 13285) self.assertAlmostEquals(np.floor(AICc), 13285)
self.assertAlmostEqual(np.floor(AIC), 13259.0) self.assertAlmostEquals(np.floor(AIC), 13259.0)
self.assertAlmostEqual(np.floor(BIC), 13442.0) self.assertAlmostEquals(np.floor(BIC), 13442.0)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04) np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-02) np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-02)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-02) np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-02)
@@ -452,9 +452,9 @@ class TestGWRPoisson(unittest.TestCase):
AIC = get_AIC(rslt) AIC = get_AIC(rslt)
BIC = get_BIC(rslt) BIC = get_BIC(rslt)
self.assertAlmostEqual(np.floor(AICc), 367.0) self.assertAlmostEquals(np.floor(AICc), 367.0)
self.assertAlmostEqual(np.floor(AIC), 361.0) self.assertAlmostEquals(np.floor(AIC), 361.0)
self.assertAlmostEqual(np.floor(BIC), 451.0) self.assertAlmostEquals(np.floor(BIC), 451.0)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-02, np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-02,
atol=1e-02) atol=1e-02)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-02, atol=1e-02) np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-02, atol=1e-02)
@@ -511,9 +511,9 @@ class TestGWRPoisson(unittest.TestCase):
AIC = get_AIC(rslt) AIC = get_AIC(rslt)
BIC = get_BIC(rslt) BIC = get_BIC(rslt)
self.assertAlmostEqual(np.floor(AICc), 11283.0) self.assertAlmostEquals(np.floor(AICc), 11283.0)
self.assertAlmostEqual(np.floor(AIC), 11211.0) self.assertAlmostEquals(np.floor(AIC), 11211.0)
self.assertAlmostEqual(np.floor(BIC), 11497.0) self.assertAlmostEquals(np.floor(BIC), 11497.0)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-03) np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-03)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-02) np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-02)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-02) np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-02)
@@ -559,9 +559,9 @@ class TestGWRPoisson(unittest.TestCase):
AIC = get_AIC(rslt) AIC = get_AIC(rslt)
BIC = get_BIC(rslt) BIC = get_BIC(rslt)
self.assertAlmostEqual(np.floor(AICc), 21070.0) self.assertAlmostEquals(np.floor(AICc), 21070.0)
self.assertAlmostEqual(np.floor(AIC), 21069.0) self.assertAlmostEquals(np.floor(AIC), 21069.0)
self.assertAlmostEqual(np.floor(BIC), 21111.0) self.assertAlmostEquals(np.floor(BIC), 21111.0)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04) np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-02) np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-02)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-02) np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-02)
@@ -583,7 +583,7 @@ class TestGWRPoisson(unittest.TestCase):
class TestGWRBinomial(unittest.TestCase): class TestGWRBinomial(unittest.TestCase):
def setUp(self): def setUp(self):
data = pysal.open(pysal.examples.get_path('landslides.csv')) data = pysal.open(pysal.examples.get_path('landslides.csv'))
self.coords = list(zip(data.by_col('X'), data.by_col('Y'))) self.coords = zip(data.by_col('X'), data.by_col('Y'))
self.y = np.array(data.by_col('Landslid')).reshape((-1,1)) self.y = np.array(data.by_col('Landslid')).reshape((-1,1))
ELEV = np.array(data.by_col('Elev')).reshape((-1,1)) ELEV = np.array(data.by_col('Elev')).reshape((-1,1))
SLOPE = np.array(data.by_col('Slope')).reshape((-1,1)) SLOPE = np.array(data.by_col('Slope')).reshape((-1,1))
@@ -630,9 +630,9 @@ class TestGWRBinomial(unittest.TestCase):
AIC = get_AIC(rslt) AIC = get_AIC(rslt)
BIC = get_BIC(rslt) BIC = get_BIC(rslt)
self.assertAlmostEqual(np.floor(AICc), 275.0) self.assertAlmostEquals(np.floor(AICc), 275.0)
self.assertAlmostEqual(np.floor(AIC), 271.0) self.assertAlmostEquals(np.floor(AIC), 271.0)
self.assertAlmostEqual(np.floor(BIC), 349.0) self.assertAlmostEquals(np.floor(BIC), 349.0)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-00) np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-00)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-00) np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-00)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-00) np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-00)
@@ -693,9 +693,9 @@ class TestGWRBinomial(unittest.TestCase):
AIC = get_AIC(rslt) AIC = get_AIC(rslt)
BIC = get_BIC(rslt) BIC = get_BIC(rslt)
self.assertAlmostEqual(np.floor(AICc), 277.0) self.assertAlmostEquals(np.floor(AICc), 277.0)
self.assertAlmostEqual(np.floor(AIC), 271.0) self.assertAlmostEquals(np.floor(AIC), 271.0)
self.assertAlmostEqual(np.floor(BIC), 358.0) self.assertAlmostEquals(np.floor(BIC), 358.0)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-00) np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-00)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-00) np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-00)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-00) np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-00)
@@ -756,9 +756,9 @@ class TestGWRBinomial(unittest.TestCase):
AIC = get_AIC(rslt) AIC = get_AIC(rslt)
BIC = get_BIC(rslt) BIC = get_BIC(rslt)
self.assertAlmostEqual(np.floor(AICc), 276.0) self.assertAlmostEquals(np.floor(AICc), 276.0)
self.assertAlmostEqual(np.floor(AIC), 272.0) self.assertAlmostEquals(np.floor(AIC), 272.0)
self.assertAlmostEqual(np.floor(BIC), 341.0) self.assertAlmostEquals(np.floor(BIC), 341.0)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-00) np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-00)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-00) np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-00)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-00) np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-00)
@@ -819,9 +819,9 @@ class TestGWRBinomial(unittest.TestCase):
AIC = get_AIC(rslt) AIC = get_AIC(rslt)
BIC = get_BIC(rslt) BIC = get_BIC(rslt)
self.assertAlmostEqual(np.floor(AICc), 276.0) self.assertAlmostEquals(np.floor(AICc), 276.0)
self.assertAlmostEqual(np.floor(AIC), 273.0) self.assertAlmostEquals(np.floor(AIC), 273.0)
self.assertAlmostEqual(np.floor(BIC), 331.0) self.assertAlmostEquals(np.floor(BIC), 331.0)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-00) np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-00)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-00) np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-00)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-00) np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-00)

View File

@@ -12,7 +12,7 @@ class TestKernels(unittest.TestCase):
y = np.arange(5,0, -1) y = np.arange(5,0, -1)
np.random.shuffle(x) np.random.shuffle(x)
np.random.shuffle(y) np.random.shuffle(y)
self.coords = np.array(list(zip(x, y))) self.coords = np.array(zip(x, y))
self.fix_gauss_kern = np.array([ self.fix_gauss_kern = np.array([
[ 1. , 0.38889556, 0.48567179, 0.48567179, 0.89483932], [ 1. , 0.38889556, 0.48567179, 0.48567179, 0.89483932],
[ 0.38889556, 1. , 0.89483932, 0.64118039, 0.48567179], [ 0.38889556, 1. , 0.89483932, 0.64118039, 0.48567179],

View File

@@ -13,7 +13,7 @@ import pysal
class TestSelBW(unittest.TestCase): class TestSelBW(unittest.TestCase):
def setUp(self): def setUp(self):
data = pysal.open(pysal.examples.get_path('GData_utm.csv')) data = pysal.open(pysal.examples.get_path('GData_utm.csv'))
self.coords = list(zip(data.by_col('X'), data.by_col('Y'))) self.coords = zip(data.by_col('X'), data.by_col('Y'))
self.y = np.array(data.by_col('PctBach')).reshape((-1,1)) self.y = np.array(data.by_col('PctBach')).reshape((-1,1))
rural = np.array(data.by_col('PctRural')).reshape((-1,1)) rural = np.array(data.by_col('PctRural')).reshape((-1,1))
pov = np.array(data.by_col('PctPov')).reshape((-1,1)) pov = np.array(data.by_col('PctPov')).reshape((-1,1))

View File

@@ -2,8 +2,8 @@
Geographically weighted regression Geographically weighted regression
""" """
import numpy as np import numpy as np
from .gwr.base.gwr import GWR as PySAL_GWR from gwr.base.gwr import GWR as PySAL_GWR
from .gwr.base.sel_bw import Sel_BW from gwr.base.sel_bw import Sel_BW
import json import json
from crankshaft.analysis_data_provider import AnalysisDataProvider from crankshaft.analysis_data_provider import AnalysisDataProvider
import plpy import plpy
@@ -48,7 +48,7 @@ class GWR:
# x, y are centroids of input geometries # x, y are centroids of input geometries
x = np.array(query_result[0]['x'], dtype=np.float) x = np.array(query_result[0]['x'], dtype=np.float)
y = np.array(query_result[0]['y'], dtype=np.float) y = np.array(query_result[0]['y'], dtype=np.float)
coords = list(zip(x, y)) coords = zip(x, y)
# extract dependent variable # extract dependent variable
Y = np.array(query_result[0]['dep_var'], dtype=np.float).reshape((-1, 1)) Y = np.array(query_result[0]['dep_var'], dtype=np.float).reshape((-1, 1))
@@ -88,7 +88,7 @@ class GWR:
bw = np.repeat(float(bw), n) bw = np.repeat(float(bw), n)
# create lists of json objs for model outputs # create lists of json objs for model outputs
for idx in range(n): for idx in xrange(n):
coeffs.append(json.dumps({var: model.params[idx, k] coeffs.append(json.dumps({var: model.params[idx, k]
for k, var in enumerate(ind_vars)})) for k, var in enumerate(ind_vars)}))
stand_errs.append(json.dumps({var: model.bse[idx, k] stand_errs.append(json.dumps({var: model.bse[idx, k]
@@ -99,8 +99,8 @@ class GWR:
json.dumps({var: filtered_t[idx, k] json.dumps({var: filtered_t[idx, k]
for k, var in enumerate(ind_vars)})) for k, var in enumerate(ind_vars)}))
return list(zip(coeffs, stand_errs, t_vals, filtered_t_vals, return zip(coeffs, stand_errs, t_vals, filtered_t_vals,
predicted, residuals, r_squared, bw, rowid)) predicted, residuals, r_squared, bw, rowid)
def gwr_predict(self, subquery, dep_var, ind_vars, def gwr_predict(self, subquery, dep_var, ind_vars,
bw=None, fixed=False, kernel='bisquare', bw=None, fixed=False, kernel='bisquare',
@@ -133,7 +133,7 @@ class GWR:
x = np.array(query_result[0]['x'], dtype=np.float) x = np.array(query_result[0]['x'], dtype=np.float)
y = np.array(query_result[0]['y'], dtype=np.float) y = np.array(query_result[0]['y'], dtype=np.float)
coords = np.array(list(zip(x, y)), dtype=np.float) coords = np.array(zip(x, y), dtype=np.float)
# extract dependent variable # extract dependent variable
Y = np.array(query_result[0]['dep_var']).reshape((-1, 1)) Y = np.array(query_result[0]['dep_var']).reshape((-1, 1))
@@ -190,7 +190,7 @@ class GWR:
predicted = model.predy.flatten() predicted = model.predy.flatten()
m = len(model.predy) m = len(model.predy)
for idx in range(m): for idx in xrange(m):
coeffs.append(json.dumps({var: model.params[idx, k] coeffs.append(json.dumps({var: model.params[idx, k]
for k, var in enumerate(ind_vars)})) for k, var in enumerate(ind_vars)}))
stand_errs.append(json.dumps({var: model.bse[idx, k] stand_errs.append(json.dumps({var: model.bse[idx, k]
@@ -198,5 +198,5 @@ class GWR:
t_vals.append(json.dumps({var: model.tvalues[idx, k] t_vals.append(json.dumps({var: model.tvalues[idx, k]
for k, var in enumerate(ind_vars)})) for k, var in enumerate(ind_vars)}))
return list(zip(coeffs, stand_errs, t_vals, return zip(coeffs, stand_errs, t_vals,
r_squared, predicted, rowid[test])) r_squared, predicted, rowid[test])

View File

@@ -1,2 +1,2 @@
"""Import all functions from for segmentation""" """Import all functions from for segmentation"""
from .segmentation import * from segmentation import *

View File

@@ -2,11 +2,14 @@
Segmentation creation and prediction Segmentation creation and prediction
""" """
import pickle
import plpy
import numpy as np import numpy as np
from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics from sklearn import metrics
from sklearn.cross_validation import train_test_split from sklearn.cross_validation import train_test_split
from crankshaft.analysis_data_provider import AnalysisDataProvider from crankshaft.analysis_data_provider import AnalysisDataProvider
from crankshaft import model_storage
# NOTE: added optional param here # NOTE: added optional param here
@@ -47,10 +50,11 @@ class Segmentation(object):
model_parameters, 0.2) model_parameters, 0.2)
prediction = model.predict(target_features) prediction = model.predict(target_features)
accuracy_array = [accuracy] * prediction.shape[0] accuracy_array = [accuracy] * prediction.shape[0]
return list(zip(target_ids, prediction, accuracy_array)) return zip(target_ids, prediction, accuracy_array)
def create_and_predict_segment(self, query, variable, feature_columns, def create_and_predict_segment(self, query, variable, feature_columns,
target_query, model_params, target_query, model_params,
model_name=None,
id_col='cartodb_id'): id_col='cartodb_id'):
""" """
generate a segment with machine learning generate a segment with machine learning
@@ -70,16 +74,24 @@ class Segmentation(object):
(target, features, target_mean, (target, features, target_mean,
feature_means) = self.clean_data(query, variable, feature_columns) feature_means) = self.clean_data(query, variable, feature_columns)
model, accuracy = train_model(target, features, model_params, 0.2) model_storage.create_model_table()
# find model if it exists and is specified
if model_name is not None:
model = model_storage.get_model(model_name)
if locals().get('model') is None:
model, accuracy = train_model(target, features, model_params, 0.2)
result = self.predict_segment(model, feature_columns, target_query, result = self.predict_segment(model, feature_columns, target_query,
feature_means) feature_means)
accuracy_array = [accuracy] * result.shape[0] accuracy_array = [accuracy] * result.shape[0]
rowid = self.data_provider.get_segmentation_data(params) rowid = self.data_provider.get_segmentation_data(params)
'''
rowid = [{'ids': [2.9, 4.9, 4, 5, 6]}] # store the model for later use
''' model_storage.set_model(model, model_name, feature_columns)
return list(zip(rowid[0]['ids'], result, accuracy_array)) return zip(rowid[0]['ids'], result, accuracy_array)
def predict_segment(self, model, feature_columns, target_query, def predict_segment(self, model, feature_columns, target_query,
feature_means): feature_means):

View File

@@ -1,2 +1,2 @@
"""Import all functions from clustering libraries.""" """Import all functions from clustering libraries."""
from .markov import * from markov import *

View File

@@ -91,7 +91,7 @@ class Markov(object):
trend_up, trend_down, trend, volatility = get_prob_stats(prob_dist, sp_markov_result.classes[:, -1]) trend_up, trend_down, trend, volatility = get_prob_stats(prob_dist, sp_markov_result.classes[:, -1])
# output the results # output the results
return list(zip(trend, trend_up, trend_down, volatility, weights.id_order)) return zip(trend, trend_up, trend_down, volatility, weights.id_order)
@@ -140,7 +140,7 @@ def rebin_data(time_data, num_time_per_bin):
return np.array( return np.array(
[time_data[:, num_time_per_bin * i:num_time_per_bin * (i+1)].mean(axis=1) [time_data[:, num_time_per_bin * i:num_time_per_bin * (i+1)].mean(axis=1)
for i in range(int(n_max))]).T for i in range(n_max)]).T
def get_prob_dist(transition_matrix, lag_indices, unit_indices): def get_prob_dist(transition_matrix, lag_indices, unit_indices):

View File

@@ -1,5 +1,5 @@
joblib==0.9.4 joblib==0.8.3
numpy==1.11.0 numpy==1.6.1
scipy==0.17.0 scipy==0.14.0
pysal==1.14.3 pysal==1.14.3
scikit-learn==0.17.0 scikit-learn==0.14.1

View File

@@ -10,7 +10,7 @@ from setuptools import setup, find_packages
setup( setup(
name='crankshaft', name='crankshaft',
version='0.9.0', version='0.0.0',
description='CartoDB Spatial Analysis Python Library', description='CartoDB Spatial Analysis Python Library',
@@ -26,7 +26,7 @@ setup(
'Intended Audience :: Mapping comunity', 'Intended Audience :: Mapping comunity',
'Topic :: Maps :: Mapping Tools', 'Topic :: Maps :: Mapping Tools',
'License :: OSI Approved :: MIT License', 'License :: OSI Approved :: MIT License',
'Programming Language :: Python', 'Programming Language :: Python :: 2.7',
], ],
keywords='maps mapping tools spatial analysis geostatistics', keywords='maps mapping tools spatial analysis geostatistics',
@@ -41,7 +41,7 @@ setup(
# The choice of component versions is dictated by what's # The choice of component versions is dictated by what's
# provisioned in the production servers. # provisioned in the production servers.
# IMPORTANT NOTE: please don't change this line. Instead issue a ticket to systems for evaluation. # IMPORTANT NOTE: please don't change this line. Instead issue a ticket to systems for evaluation.
install_requires=['joblib==0.9.4', 'numpy==1.11.0', 'scipy==0.17.0', 'pysal==1.14.3', 'scikit-learn==0.17.0'], install_requires=['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.14.3', 'scikit-learn==0.14.1', 'petname==2.2'],
requires=['pysal', 'numpy', 'sklearn'], requires=['pysal', 'numpy', 'sklearn'],

View File

@@ -0,0 +1,49 @@
"""
CartoDB Spatial Analysis Python Library
See:
https://github.com/CartoDB/crankshaft
"""
from setuptools import setup, find_packages
setup(
name='crankshaft',
version='0.0.0',
description='CartoDB Spatial Analysis Python Library',
url='https://github.com/CartoDB/crankshaft',
author='Data Services Team - CartoDB',
author_email='dataservices@cartodb.com',
license='MIT',
classifiers=[
'Development Status :: 3 - Alpha',
'Intended Audience :: Mapping comunity',
'Topic :: Maps :: Mapping Tools',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 2.7',
],
keywords='maps mapping tools spatial analysis geostatistics',
packages=find_packages(exclude=['contrib', 'docs', 'tests']),
extras_require={
'dev': ['unittest'],
'test': ['unittest', 'nose', 'mock'],
},
# The choice of component versions is dictated by what's
# provisioned in the production servers.
# IMPORTANT NOTE: please don't change this line. Instead issue a ticket to systems for evaluation.
install_requires=['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.14.3', 'scikit-learn==0.14.1', 'petname==2.2'],
requires=['pysal', 'numpy', 'sklearn'],
test_suite='test'
)

View File

@@ -0,0 +1,6 @@
{
"production_col": [10, 10, 10],
"capacity_col": [0.09, 0.31],
"marginal_col": [5, 5],
"pairwise": [[1, 2, 3], [3, 2, 1]]
}

View File

@@ -72,7 +72,7 @@ class MoranTest(unittest.TestCase):
result = moran.local_stat('subquery', 'value', result = moran.local_stat('subquery', 'value',
'knn', 5, 99, 'the_geom', 'cartodb_id') 'knn', 5, 99, 'the_geom', 'cartodb_id')
result = [(row[0], row[6]) for row in result] result = [(row[0], row[6]) for row in result]
zipped_values = list(zip(result, self.moran_data)) zipped_values = zip(result, self.moran_data)
for ([res_quad, res_val], [exp_val, exp_quad]) in zipped_values: for ([res_quad, res_val], [exp_val, exp_quad]) in zipped_values:
self.assertAlmostEqual(res_val, exp_val) self.assertAlmostEqual(res_val, exp_val)
@@ -91,7 +91,7 @@ class MoranTest(unittest.TestCase):
'knn', 5, 99, 'the_geom', 'cartodb_id') 'knn', 5, 99, 'the_geom', 'cartodb_id')
result = [(row[0], row[6]) for row in result] result = [(row[0], row[6]) for row in result]
zipped_values = list(zip(result, self.moran_data)) zipped_values = zip(result, self.moran_data)
for ([res_quad, res_val], [exp_val, exp_quad]) in zipped_values: for ([res_quad, res_val], [exp_val, exp_quad]) in zipped_values:
self.assertAlmostEqual(res_val, exp_val) self.assertAlmostEqual(res_val, exp_val)

View File

@@ -86,7 +86,7 @@ class GWRTest(unittest.TestCase):
# unpack response # unpack response
coeffs, stand_errs, t_vals, t_vals_filtered, predicteds, \ coeffs, stand_errs, t_vals, t_vals_filtered, predicteds, \
residuals, r_squareds, bws, rowids = list(zip(*gwr_resp)) residuals, r_squareds, bws, rowids = zip(*gwr_resp)
# prepare for comparision # prepare for comparision
coeff_known_pctpov = self.knowns['est_pctpov'] coeff_known_pctpov = self.knowns['est_pctpov']
@@ -98,13 +98,13 @@ class GWRTest(unittest.TestCase):
# test pctpov coefficient estimates # test pctpov coefficient estimates
for idx, val in enumerate(coeff_known_pctpov): for idx, val in enumerate(coeff_known_pctpov):
resp_idx = rowids.index(ids[idx]) resp_idx = rowids.index(ids[idx])
self.assertAlmostEqual(val, self.assertAlmostEquals(val,
json.loads(coeffs[resp_idx])['pctpov'], json.loads(coeffs[resp_idx])['pctpov'],
places=4) places=4)
# test pctrural tvals # test pctrural tvals
for idx, val in enumerate(tval_known_pctblack): for idx, val in enumerate(tval_known_pctblack):
resp_idx = rowids.index(ids[idx]) resp_idx = rowids.index(ids[idx])
self.assertAlmostEqual(val, self.assertAlmostEquals(val,
json.loads(t_vals[resp_idx])['pctrural'], json.loads(t_vals[resp_idx])['pctrural'],
places=4) places=4)
@@ -119,7 +119,7 @@ class GWRTest(unittest.TestCase):
# unpack response # unpack response
coeffs, stand_errs, t_vals, \ coeffs, stand_errs, t_vals, \
r_squareds, predicteds, rowid = list(zip(*gwr_resp)) r_squareds, predicteds, rowid = zip(*gwr_resp)
threshold = 0.01 threshold = 0.01
for i, idx in enumerate(self.idx_ids_of_unknowns): for i, idx in enumerate(self.idx_ids_of_unknowns):

View File

@@ -66,7 +66,7 @@ class SegmentationTest(unittest.TestCase):
test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan]) test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan])
result = replace_nan_with_mean(test_array, means=None)[0] result = replace_nan_with_mean(test_array, means=None)[0]
expectation = np.array([1.2, 2.2, 3.2, 2.2, 2.2], dtype=float) expectation = np.array([1.2, 2.2, 3.2, 2.2, 2.2], dtype=float)
self.assertEqual(sorted(result), sorted(expectation)) self.assertItemsEqual(result, expectation)
def test_create_and_predict_segment(self): def test_create_and_predict_segment(self):
"""test segmentation.test_create_and_predict""" """test segmentation.test_create_and_predict"""
@@ -118,7 +118,7 @@ class SegmentationTest(unittest.TestCase):
model_parameters, model_parameters,
id_col='cartodb_id') id_col='cartodb_id')
results = [(row[1], row[2]) for row in result] results = [(row[1], row[2]) for row in result]
zipped_values = list(zip(results, self.result_seg)) zipped_values = zip(results, self.result_seg)
pre_res = [r[0] for r in self.true_result] pre_res = [r[0] for r in self.true_result]
acc_res = [r[1] for r in self.result_seg] acc_res = [r[1] for r in self.result_seg]

View File

@@ -98,7 +98,7 @@ class SpaceTimeTests(unittest.TestCase):
self.assertTrue(result is not None) self.assertTrue(result is not None)
result = [(row[0], row[1], row[2], row[3], row[4]) for row in result] result = [(row[0], row[1], row[2], row[3], row[4]) for row in result]
print(result[0]) print result[0]
expected = self.markov_data expected = self.markov_data
for ([res_trend, res_up, res_down, res_vol, res_id], for ([res_trend, res_up, res_down, res_vol, res_id],
[exp_trend, exp_up, exp_down, exp_vol, exp_id] [exp_trend, exp_up, exp_down, exp_vol, exp_id]

View File

@@ -0,0 +1,15 @@
from test.helper import plpy, fixture_file
from crankshaft.analysis_data_provider import AnalysisDataProvider
import json
import crankshaft
class RawDataProvider(AnalysisDataProvider):
def __init__(self, fixturedata):
self.your_algo_data = fixturedata
def get_moran(self, params):
"""
Replace this function name with the one used in your algorithm,
and make sure to use the same function signature that is written
for this algo in analysis_data_provider.py
"""
return self.your_algo_data

View File

@@ -0,0 +1,76 @@
"""
Based on the Weiszfeld algorithm:
https://en.wikipedia.org/wiki/Geometric_median
"""
# import plpy
import numpy as np
from numpy.linalg import norm
def median_center(tablename, geom_col, num_iters=50, tolerance=0.001):
query = '''
SELECT array_agg(ST_X({geom_col})) As x_coords,
array_agg(ST_Y({geom_col})) As y_coords
FROM {tablename}
'''.format(geom_col=geom_col, tablename=tablename)
try:
resp = plpy.execute(query)
data = np.vstack((resp['x_coords'][0],
resp['y_coords'][0])).T
plpy.notice('coords: %s' % str(coords))
except Exception, err:
# plpy.error('Analysis failed: %s' % err)
print('No plpy')
data = np.array([[1.2 * np.random.random() + 10.,
1.1 * (np.random.random() - 1.) + 3.]
for i in range(1, 100)])
# initialize 'median center' to be the mean
coords_center_temp = data.mean(axis=0)
# plpy.notice('temp_center: %s' % str(coords_center_temp))
print('temp_center: %s' % str(coords_center_temp))
for i in range(0, num_iters):
old_coords_center = coords_center_temp.copy()
denom = denominator(coords_center_temp, data)
coords_center_temp = np.sum([data[j] * numerator(coords_center_temp,
data[j])
for j in range(len(data))], axis=0)
coords_center_temp = coords_center_temp / denom
print("Pass #%d" % i)
print("max, min of data: %0.4f, %0.4f" % (data.max(), data.min()))
print('temp_center: %s' % str(coords_center_temp))
print("Change in center: %0.4f" % np.linalg.norm(old_coords_center -
coords_center_temp))
print("Center coords: %s" % str(coords_center_temp))
print("Objective Function: %0.4f" % obj_func(coords_center_temp, data))
return coords_center_temp
def obj_func(center_coords, data):
"""
"""
return np.linalg.norm(center_coords - data)
def numerator(center_coords, data_i):
"""
"""
return np.reciprocal(np.linalg.norm(center_coords - data_i))
def denominator(center_coords, data):
"""
"""
return np.reciprocal(np.linalg.norm(data - center_coords))

View File

@@ -0,0 +1 @@
from core import set_model, get_model, create_model_table

View File

@@ -0,0 +1,86 @@
import time
import plpy
import pickle
from petname import generate
def create_model_table():
q = '''
create table if not exists model_storage(
description text,
name text unique,
model bytea,
feature_names text[],
date_created timestamptz,
id serial primary key);
'''
plpy.notice(q)
plan = plpy.prepare(q)
resp = plpy.execute(plan)
plpy.notice('Model table successfully created')
plpy.notice(str(resp))
def get_model(model_name):
"""retrieve model if it exists"""
try:
plan = plpy.prepare('''
SELECT model FROM model_storage
WHERE name = $1;
''', ['text', ])
model_encoded = plpy.execute(plan, [model_name, ])
if len(model_encoded) == 1:
model = pickle.loads(
model_encoded[0]['model']
)
plpy.notice('Model successfully loaded')
else:
plpy.notice('Model not found, or too many models '
'({})'.format(len(model_encoded)))
model = None
except plpy.SPIError as err:
plpy.error('ERROR: {}'.format(err))
return model
def set_model(model, model_name, feature_names):
"""stores the model in the table model_storage"""
if model_name is None:
model_name = generate(words=2, separator='_', letters=8)
existing_names = plpy.execute('''
SELECT array_agg(name) as name
FROM model_storage
''')
plpy.notice('nrows: {}'.format(existing_names.nrows()))
plpy.notice('MODEL NAME: {}'.format(model_name))
plpy.notice('LEN of ms: {}'.format(len(existing_names)))
plpy.notice('existing_names: {}'.format(str(existing_names)))
plpy.notice('existing_names: {}'.format(str(existing_names[0]['name'])))
plpy.notice('type existing_names: {}'.format(type(existing_names[0]['name'])))
if existing_names[0]['name'] is not None:
while model_name in existing_names[0]['name']:
model_name = generate(words=2, separator='_', letters=10)
plpy.notice(model_name)
# store model
try:
plan = plpy.prepare('''
INSERT INTO model_storage(description, name, model, feature_names, date_created)
VALUES (
$1,
$2,
$3,
$4::text[],
to_timestamp($5));
''', ['text', 'text', 'bytea', 'text', 'numeric'])
plpy.notice('{%s}' % ','.join(feature_names))
plpy.notice(feature_names)
plpy.execute(
plan,
[' '.join(m.strip() for m in model.__repr__().split('\n')),
model_name,
pickle.dumps(model),
'{%s}' % ','.join(feature_names),
time.time()]
)
plpy.notice('model successfully stored as {}'.format(model_name))
except plpy.SPIError as err:
plpy.notice('ERROR: {}\nt: {}'.format(err, time.time()))

Some files were not shown because too many files have changed in this diff Show More