Compare commits


15 Commits

Author | SHA1 | Message | Date
Andy Eschbacher | daba2f9597 | release 0.9.5 [ci skip] | 2018-04-09 15:22:35 -04:00
Andy Eschbacher | 8f28f41060 | corrects incorrect variable name | 2018-04-09 15:16:52 -04:00
Andy Eschbacher | 7509afa5a6 | release feature name validation | 2018-04-09 14:14:39 -04:00
Andy Eschbacher | a28c68502c | adds feature name validation [ci skip] | 2018-04-09 14:09:31 -04:00
Andy Eschbacher | 5b4443ca88 | new faux release | 2018-03-22 13:14:20 -04:00
Andy Eschbacher | 2048db33fc | avoids accuracy calculation without model being defined | 2018-03-22 13:12:17 -04:00
Andy Eschbacher | 99e78800b3 | adds latest release file | 2018-03-22 11:46:42 -04:00
Andy Eschbacher | 800648a710 | adds upgrade path for 0.9.2 faux release | 2018-03-22 11:08:46 -04:00
Andy Eschbacher | 91ee6ecc48 | new faux release | 2018-03-22 11:02:45 -04:00
Andy Eschbacher | 9a5ab17240 | replaces petname with uuid for now | 2018-03-22 11:01:39 -04:00
Andy Eschbacher | 65be9befb1 | faux release for staging testing | 2018-03-22 10:19:29 -04:00
Andy Eschbacher | 37e6b4a228 | fixes release path copy error [ci skip] | 2018-03-20 11:59:15 -04:00
Andy Eschbacher | 766bfed9be | dummy version bump | 2018-03-19 13:30:37 -04:00
Andy Eschbacher | e8a601e945 | adds model module [ci skip] | 2018-03-16 16:45:39 -04:00
Andy Eschbacher | c2be340c07 | prototype of model writing | 2018-03-16 16:21:00 -04:00
439 changed files with 69214 additions and 746 deletions

3
.brackets.json Normal file
View File

@@ -0,0 +1,3 @@
{
"sbruchmann.staticpreview.basepath": "/home/carto/Projects/crankshaft/"
}

1
.gitignore vendored
View File

@@ -2,3 +2,4 @@ envs/
*.pyc
.DS_Store
.idea/
.*.sw[nop]

View File

@@ -1,48 +1,60 @@
language: c
dist: precise
sudo: required
env:
global:
- PAGER=cat
- PGUSER=postgres
- PGDATABASE=postgres
- PGOPTIONS='-c client_min_messages=NOTICE'
jobs:
include:
- env: POSTGRESQL_VERSION="9.6" POSTGIS_VERSION="2.5"
dist: xenial
- env: POSTGRESQL_VERSION="10" POSTGIS_VERSION="2.5"
dist: xenial
- env: POSTGRESQL_VERSION="11" POSTGIS_VERSION="2.5"
dist: xenial
- env: POSTGRESQL_VERSION="12" POSTGIS_VERSION="3"
dist: bionic
before_install:
- ./check-up-to-date-with-master.sh
- sudo apt-get -y install python-pip
- sudo apt-get install -y --allow-unauthenticated --no-install-recommends --no-install-suggests postgresql-$POSTGRESQL_VERSION postgresql-client-$POSTGRESQL_VERSION postgresql-server-dev-$POSTGRESQL_VERSION postgresql-common
- if [[ $POSTGRESQL_VERSION == '9.6' ]]; then sudo apt-get install -y postgresql-contrib-9.6; fi;
- sudo apt-get install -y --allow-unauthenticated postgresql-$POSTGRESQL_VERSION-postgis-$POSTGIS_VERSION postgresql-$POSTGRESQL_VERSION-postgis-$POSTGIS_VERSION-scripts postgis
- sudo apt-get -y install python-software-properties
- sudo add-apt-repository -y ppa:cartodb/sci
- sudo add-apt-repository -y ppa:cartodb/postgresql-9.5
- sudo add-apt-repository -y ppa:cartodb/gis
- sudo add-apt-repository -y ppa:cartodb/gis-testing
- sudo apt-get update
# For pre12, install plpython2. For PG12 install plpython3
- if [[ $POSTGRESQL_VERSION != '12' ]]; then sudo apt-get install -y postgresql-plpython-$POSTGRESQL_VERSION python python-pip python-software-properties python-joblib python-nose python-setuptools; else sudo apt-get install -y postgresql-plpython3-12 python3 python3-pip python3-software-properties python3-joblib python3-nose python3-setuptools; fi;
- if [[ $POSTGRESQL_VERSION == '12' ]]; then echo -e "joblib==0.11\nnumpy==1.13.3\nscipy==0.19.1\npysal==1.14.3\nscikit-learn==0.19.1" > ./src/py/crankshaft/requirements.txt && sed -i -e "s/.*install_requires.*$/ install_requires=['joblib==0.11.0', 'numpy==1.13.3', 'scipy==0.19.1', 'pysal==1.14.3', 'scikit-learn==0.19.1'],/g" ./src/py/crankshaft/setup.py; fi;
- sudo apt-get -y install python-joblib=0.8.3-1-cdb1
- sudo apt-get -y install python-numpy=1:1.6.1-6ubuntu1
- sudo pg_dropcluster --stop $POSTGRESQL_VERSION main
- sudo rm -rf /etc/postgresql/$POSTGRESQL_VERSION /var/lib/postgresql/$POSTGRESQL_VERSION
- sudo pg_createcluster -u postgres $POSTGRESQL_VERSION main --start -- -A trust
- export PGPORT=$(pg_lsclusters | grep $POSTGRESQL_VERSION | awk '{print $3}')
- sudo apt-get -y install python-scipy=0.14.0-2-cdb6
- sudo apt-get -y --no-install-recommends install python-sklearn-lib=0.14.1-3-cdb2
- sudo apt-get -y --no-install-recommends install python-sklearn=0.14.1-3-cdb2
- sudo apt-get -y --no-install-recommends install python-scikits-learn=0.14.1-3-cdb2
# Force installation of libgeos-3.5.0 (presumably needed because of existing version of postgis)
- sudo apt-get -y install libgeos-3.5.0=3.5.0-1cdb2
# Install postgres db and build deps
- sudo /etc/init.d/postgresql stop # stop travis default instance
- sudo apt-get -y remove --purge postgresql-9.1
- sudo apt-get -y remove --purge postgresql-9.2
- sudo apt-get -y remove --purge postgresql-9.3
- sudo apt-get -y remove --purge postgresql-9.4
- sudo apt-get -y remove --purge postgresql-9.5
- sudo rm -rf /var/lib/postgresql/
- sudo rm -rf /var/log/postgresql/
- sudo rm -rf /etc/postgresql/
- sudo apt-get -y remove --purge postgis-2.2
- sudo apt-get -y autoremove
- sudo apt-get -y install postgresql-9.5=9.5.2-3cdb3
- sudo apt-get -y install postgresql-server-dev-9.5=9.5.2-3cdb3
- sudo apt-get -y install postgresql-plpython-9.5=9.5.2-3cdb3
- sudo apt-get -y install postgresql-9.5-postgis-scripts=2.2.2.0-cdb2
- sudo apt-get -y install postgresql-9.5-postgis-2.2=2.2.2.0-cdb2
# configure it to accept local connections from postgres
- echo -e "# TYPE DATABASE USER ADDRESS METHOD \nlocal all postgres trust\nlocal all all trust\nhost all all 127.0.0.1/32 trust" \
| sudo tee /etc/postgresql/9.5/main/pg_hba.conf
- sudo /etc/init.d/postgresql restart 9.5
install:
- sudo make install
script:
- make test
- make test || { cat src/pg/test/regression.diffs; false; }
- ./check-compatibility.sh
after_failure:
- pg_lsclusters
- cat src/pg/test/regression.diffs
- echo $PGPORT
- cat /var/log/postgresql/postgresql-$POSTGRESQL_VERSION-main.log

View File

@@ -39,7 +39,9 @@ ALTER EXTENSION crankshaft UPDATE TO 'dev';
If the extension has not previously been installed in a database,
it can be installed directly with:
```sql
CREATE EXTENSION crankshaft WITH VERSION 'dev' CASCADE;
CREATE EXTENSION IF NOT EXISTS plpythonu;
CREATE EXTENSION IF NOT EXISTS postgis;
CREATE EXTENSION crankshaft WITH VERSION 'dev';
```
Once the feature or bugfix is completed and all the tests are passing

View File

@@ -23,7 +23,7 @@ test: ## Run the tests for the development version of the extension
$(MAKE) -C $(EXT_DIR) test
# Generate a new release into release
release: ## Generate a new release of the extension.
release: ## Generate a new release of the extension. Only for release manager
$(MAKE) -C $(EXT_DIR) release
$(MAKE) -C $(PYP_DIR) release
@@ -31,7 +31,7 @@ release: ## Generate a new release of the extension.
# Requires sudo.
# Use the RELEASE_VERSION environment variable to deploy a specific version:
# sudo make deploy RELEASE_VERSION=1.0.0
deploy:
deploy: ## Deploy a released extension. Only for release manager. Requires sudo.
$(MAKE) -C $(EXT_DIR) deploy
$(MAKE) -C $(PYP_DIR) deploy

View File

@@ -3,21 +3,9 @@ EXTENSION = crankshaft
PACKAGE = crankshaft
EXTVERSION = $(shell grep default_version $(SELF_DIR)/src/pg/$(EXTENSION).control | sed -e "s/default_version[[:space:]]*=[[:space:]]*'\([^']*\)'/\1/")
RELEASE_VERSION ?= $(EXTVERSION)
SED = sed
AWK = awk
PG_CONFIG = pg_config
PG_VERSION_1000 := $(shell $(PG_CONFIG) --version | $(AWK) '{$$2*=1000; print $$2}')
PG_PARALLEL := $(shell [ $(PG_VERSION_1000) -ge 9600 ] && echo true)
PG_12plus := $(shell [ $(PG_VERSION_1000) -ge 12000 ] && echo true)
PYTHON3 ?= $(PG_12plus)
ifeq ($(PYTHON3), true)
PIP := python3 -m pip
NOSETESTS = nosetests3
else
PIP := python2 -m pip
PIP = pip
NOSETESTS = nosetests
endif
AWK = awk
PG_CONFIG = pg_config
PG_PARALLEL := $(shell $(PG_CONFIG) --version | ($(AWK) '{$$2*=1000; if ($$2 >= 9600) print 1; else print 0;}' 2> /dev/null || echo 0))
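The version-detection logic in this hunk turns `pg_config --version` output into a comparable number (9600 for PostgreSQL 9.6, 12000 and up for PG12) to derive `PG_PARALLEL` and `PG_12plus`. For intuition only, a small Python sketch of what the awk expression computes; it is not part of the build system:

```python
def pg_version_1000(pg_config_output):
    # Mirrors awk '{$2*=1000}': "PostgreSQL 9.6.20" -> 9600, "PostgreSQL 12.1" -> 12100,
    # so ">= 9600" means 9.6+ (parallel markers allowed) and ">= 12000" means PG12+.
    version = pg_config_output.split()[1]
    major_minor = ".".join(version.split(".")[:2])
    return int(float(major_minor) * 1000)

print(pg_version_1000("PostgreSQL 9.6.20"))  # 9600
print(pg_version_1000("PostgreSQL 12.1"))    # 12100
```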

11
NEWS.md
View File

@@ -1,14 +1,3 @@
0.9.0 (2019-12-23)
------------------
* Compatibility with PG12.
* Compatibility with python3 (enable with PYTHON3=true env variable, default in PG12+).
0.8.2 (2019-02-07)
------------------
* Update dependencies to match what it's being used in production.
* Update travis to xenial, PG10 and 11, and postgis 2.5
* Compatibility with PG11
0.8.1 (2018-03-12)
------------------
* Adds improperly added version files

View File

@@ -8,21 +8,28 @@ CARTO Spatial Analysis extension for PostgreSQL.
* `src/` source code
- `pg/` contains the PostgreSQL extension source code
- `py/` Python module source code
* `release` released versions
* `release` reseleased versions
## Requirements
* PostgreSQL
* plpythonu (for PG12+, plpython3u) and postgis extensions
* plpythonu and postgis extensions
* python-scipy system package (see [src/py/README.md](https://github.com/CartoDB/crankshaft/blob/develop/src/py/README.md))
# Development Process
We distinguish two roles:
* *developers* will implement new functionality and bugfixes into
the codebase.
* A *release manager* will handle the release process.
We use the branch `develop` as the main integration branch for development. The `master` is reserved to handle releases.
The process is as follows:
1. Create a new **topic branch** from `develop` for any new feature or bugfix and commit their changes to it:
1. Create a new **topic branch** from `develop` for any new feature
or bugfix and commit their changes to it:
```shell
git fetch && git checkout -b my-cool-feature origin/develop
@@ -32,6 +39,7 @@ The process is as follows:
1. Update the [NEWS.md](https://github.com/CartoDB/crankshaft/blob/develop/NEWS.md) doc.
1. Create a pull request and mention relevant people for a **peer review**.
1. Address the comments and improvements you get from the peer review.
1. Mention `@CartoDB/dataservices` in the PR to get it merged into `develop`.
In order for a pull request to be accepted, the following criteria should be met:
* The peer review should pass and no major issue should be left unaddressed.

View File

@@ -1,6 +1,7 @@
# Release & Deployment Process
:warning: Do not forget about updating dependencies in `cartodb-platform` and `carto-postgres-artifacts` :warning:
The release process of a new version of the extension
shall be performed by the designated *Release Manager*.
## Release steps
* Make sure `develop` branch passes all the tests.

View File

@@ -1,20 +0,0 @@
{
"name": "crankshaft",
"current_version": {
"requires": {
"postgres": ">=9.5.0",
"postgis": ">=2.2.0.0",
"python": ">=2.7.0",
"joblib": "0.8.3",
"numpy": "1.6.1",
"scipy": "0.14.0",
"pysal": "1.14.3",
"scikit-learn": "0.14.1"
},
"works_with": {
}
},
"exceptional_versions": {
}
}

View File

@@ -25,6 +25,10 @@ psql -c "SELECT * FROM pg_available_extension_versions WHERE name LIKE 'cranksha
# Install in the fresh DB
psql $DBNAME <<'EOF'
-- Install dependencies
CREATE EXTENSION plpythonu;
CREATE EXTENSION postgis VERSION '2.2.2';
-- Create role publicuser if it does not exist
DO
$$
@@ -40,53 +44,30 @@ END
$$ LANGUAGE plpgsql;
-- Install the default version
CREATE EXTENSION crankshaft CASCADE;
CREATE EXTENSION crankshaft;
\dx
EOF
# Check PG version
PG_VERSION=`psql -q -t -c "SELECT current_setting('server_version_num')"`
# Save public function signatures
if [[ "$PG_VERSION" -lt 110000 ]]; then
psql $DBNAME -c "
CREATE TABLE release_function_signatures AS
SELECT
p.proname as name,
pg_catalog.pg_get_function_result(p.oid) as result_type,
pg_catalog.pg_get_function_arguments(p.oid) as arguments,
CASE
WHEN p.proisagg THEN 'agg'
WHEN p.proiswindow THEN 'window'
WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
ELSE 'normal'
END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
n.nspname = 'cdb_crankshaft'
AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;"
else
psql $DBNAME -c "
CREATE TABLE release_function_signatures AS
SELECT
p.proname as name,
pg_catalog.pg_get_function_result(p.oid) as result_type,
pg_catalog.pg_get_function_arguments(p.oid) as arguments,
CASE WHEN p.prokind = 'a' THEN 'agg'
WHEN p.prokind = 'w' THEN 'window'
WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
ELSE 'normal'
END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
n.nspname = 'cdb_crankshaft'
AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;"
fi
psql $DBNAME <<'EOF'
CREATE TABLE release_function_signatures AS
SELECT
p.proname as name,
pg_catalog.pg_get_function_result(p.oid) as result_type,
pg_catalog.pg_get_function_arguments(p.oid) as arguments,
CASE
WHEN p.proisagg THEN 'agg'
WHEN p.proiswindow THEN 'window'
WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
ELSE 'normal'
END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
n.nspname = 'cdb_crankshaft'
AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;
EOF
# Deploy current dev branch
make clean-dev || die "Could not clean dev files"
@@ -95,42 +76,26 @@ sudo make install || die "Could not deploy current dev branch"
# Check it can be upgraded
psql $DBNAME -c "ALTER EXTENSION crankshaft update to 'dev';" || die "Cannot upgrade to dev version"
if [[ $PG_VERSION -lt 110000 ]]; then
psql $DBNAME -c "
CREATE TABLE dev_function_signatures AS
SELECT p.proname as name,
pg_catalog.pg_get_function_result(p.oid) as result_type,
pg_catalog.pg_get_function_arguments(p.oid) as arguments,
CASE WHEN p.proisagg THEN 'agg'
WHEN p.proiswindow THEN 'window'
WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
ELSE 'normal'
END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
n.nspname = 'cdb_crankshaft'
AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;"
else
psql $DBNAME -c "
CREATE TABLE dev_function_signatures AS
SELECT p.proname as name,
pg_catalog.pg_get_function_result(p.oid) as result_type,
pg_catalog.pg_get_function_arguments(p.oid) as arguments,
CASE WHEN p.prokind = 'a' THEN 'agg'
WHEN p.prokind = 'w' THEN 'window'
WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
ELSE 'normal'
END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
n.nspname = 'cdb_crankshaft'
AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;"
fi
# Check against saved public function signatures
psql $DBNAME <<'EOF'
CREATE TABLE dev_function_signatures AS
SELECT
p.proname as name,
pg_catalog.pg_get_function_result(p.oid) as result_type,
pg_catalog.pg_get_function_arguments(p.oid) as arguments,
CASE
WHEN p.proisagg THEN 'agg'
WHEN p.proiswindow THEN 'window'
WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
ELSE 'normal'
END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
n.nspname = 'cdb_crankshaft'
AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;
EOF
echo "Functions in development not in latest release (ok):"
psql $DBNAME -c "SELECT * FROM dev_function_signatures EXCEPT SELECT * FROM release_function_signatures;"

View File

@@ -4,7 +4,7 @@
-- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$
SELECT '0.8.2'::text;
SELECT '0.9.0'::text;
$$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE;
-- Internal identifier of the installed extension instance
@@ -35,16 +35,25 @@ CREATE OR REPLACE FUNCTION
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist
DO $$ BEGIN
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
SFUNC = CDB_PyAggS,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{}"
);
EXCEPTION
WHEN duplicate_function THEN NULL;
END $$;
DO $$
BEGIN
IF NOT EXISTS (
SELECT *
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE n.nspname = 'cdb_crankshaft'
AND p.proname = 'cdb_pyagg'
AND p.proisagg)
THEN
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
SFUNC = CDB_PyAggS,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{}"
);
END IF;
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment(
@@ -89,6 +98,7 @@ CREATE OR REPLACE FUNCTION
query TEXT,
variable_name TEXT,
target_table TEXT,
model_name text DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -105,24 +115,59 @@ AS $$
'learning_rate': learning_rate,
'min_samples_leaf': min_samples_leaf
}
feature_cols = set(plpy.execute('''
all_cols = list(plpy.execute('''
select * from ({query}) as _w limit 0
'''.format(query=query)).colnames()) - set([variable_name, 'cartodb_id', ])
'''.format(query=query)).colnames())
feature_cols = [a for a in all_cols
if a not in [variable_name, 'cartodb_id', ]]
return seg.create_and_predict_segment(
query,
variable_name,
feature_cols,
target_table,
model_params
model_params,
model_name=model_name
)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
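The two variants of the `feature_cols` computation above select the same columns, but the set-difference form discards the order in which the query returned them, while the list comprehension keeps it. A tiny sketch with hypothetical column names:

```python
# Hypothetical column list as plpy's colnames() might return it.
all_cols = ["cartodb_id", "pop_density", "median_income", "price"]
variable_name = "price"

as_set = set(all_cols) - set([variable_name, "cartodb_id"])                 # unordered
as_list = [c for c in all_cols if c not in (variable_name, "cartodb_id")]   # query order kept

print(sorted(as_set) == sorted(as_list))  # True: same columns either way
print(as_list)                            # ['pop_density', 'median_income']
```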
CREATE OR REPLACE FUNCTION
CDB_RetrieveModelParams(
model_name text,
param_name text
)
RETURNS TABLE(param numeric, feature_name text) AS $$
import pickle
from collections import Iterable
plan = plpy.prepare('''
SELECT model, feature_names FROM model_storage
WHERE name = $1;
''', ['text', ])
try:
model_encoded = plpy.execute(plan, [model_name, ])
except plpy.SPIError as err:
plpy.error('ERROR: {}'.format(err))
plpy.notice(model_encoded[0]['feature_names'])
model = pickle.loads(
model_encoded[0]['model']
)
res = getattr(model, param_name)
if not isinstance(res, Iterable):
raise Exception('Cannot return `{}` as a table'.format(param_name))
return zip(res, model_encoded[0]['feature_names'])
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment(
query TEXT,
variable TEXT,
feature_columns TEXT[],
target_query TEXT,
model_name TEXT DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -144,7 +189,8 @@ AS $$
variable,
feature_columns,
target_query,
model_params
model_params,
model_name=model_name
)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_Gravity(
@@ -1104,19 +1150,27 @@ BEGIN
END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist
DO $$ BEGIN
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{0.0,0.0,0.0}"
);
EXCEPTION
WHEN duplicate_function THEN NULL;
END $$;
DO $$
BEGIN
IF NOT EXISTS (
SELECT *
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE n.nspname = 'cdb_crankshaft'
AND p.proname = 'cdb_weightedmean'
AND p.proisagg)
THEN
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{0.0,0.0,0.0}"
);
END IF;
END
$$ LANGUAGE plpgsql;
-- Spatial Markov
-- input table format:

View File

@@ -4,7 +4,7 @@
-- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$
SELECT '0.8.2'::text;
SELECT '0.9.1'::text;
$$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE;
-- Internal identifier of the installed extension instance
@@ -35,16 +35,25 @@ CREATE OR REPLACE FUNCTION
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist
DO $$ BEGIN
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
SFUNC = CDB_PyAggS,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{}"
);
EXCEPTION
WHEN duplicate_function THEN NULL;
END $$;
DO $$
BEGIN
IF NOT EXISTS (
SELECT *
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE n.nspname = 'cdb_crankshaft'
AND p.proname = 'cdb_pyagg'
AND p.proisagg)
THEN
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
SFUNC = CDB_PyAggS,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{}"
);
END IF;
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment(
@@ -89,6 +98,7 @@ CREATE OR REPLACE FUNCTION
query TEXT,
variable_name TEXT,
target_table TEXT,
model_name text DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -105,24 +115,59 @@ AS $$
'learning_rate': learning_rate,
'min_samples_leaf': min_samples_leaf
}
feature_cols = set(plpy.execute('''
all_cols = list(plpy.execute('''
select * from ({query}) as _w limit 0
'''.format(query=query)).colnames()) - set([variable_name, 'cartodb_id', ])
'''.format(query=query)).colnames())
feature_cols = [a for a in all_cols
if a not in [variable_name, 'cartodb_id', ]]
return seg.create_and_predict_segment(
query,
variable_name,
feature_cols,
target_table,
model_params
model_params,
model_name=model_name
)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_RetrieveModelParams(
model_name text,
param_name text
)
RETURNS TABLE(param numeric, feature_name text) AS $$
import pickle
from collections import Iterable
plan = plpy.prepare('''
SELECT model, feature_names FROM model_storage
WHERE name = $1;
''', ['text', ])
try:
model_encoded = plpy.execute(plan, [model_name, ])
except plpy.SPIError as err:
plpy.error('ERROR: {}'.format(err))
plpy.notice(model_encoded[0]['feature_names'])
model = pickle.loads(
model_encoded[0]['model']
)
res = getattr(model, param_name)
if not isinstance(res, Iterable):
raise Exception('Cannot return `{}` as a table'.format(param_name))
return zip(res, model_encoded[0]['feature_names'])
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment(
query TEXT,
variable TEXT,
feature_columns TEXT[],
target_query TEXT,
model_name TEXT DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -144,7 +189,8 @@ AS $$
variable,
feature_columns,
target_query,
model_params
model_params,
model_name=model_name
)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_Gravity(
@@ -1104,19 +1150,27 @@ BEGIN
END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist
DO $$ BEGIN
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{0.0,0.0,0.0}"
);
EXCEPTION
WHEN duplicate_function THEN NULL;
END $$;
DO $$
BEGIN
IF NOT EXISTS (
SELECT *
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE n.nspname = 'cdb_crankshaft'
AND p.proname = 'cdb_weightedmean'
AND p.proisagg)
THEN
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{0.0,0.0,0.0}"
);
END IF;
END
$$ LANGUAGE plpgsql;
-- Spatial Markov
-- input table format:

View File

@@ -21,7 +21,7 @@ _cdb_random_seeds (seed_value INTEGER) RETURNS VOID
AS $$
from crankshaft import random_seeds
random_seeds.set_random_seeds(seed_value)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_PyAggS(current_state Numeric[], current_row Numeric[])
returns NUMERIC[] as $$
@@ -35,16 +35,25 @@ CREATE OR REPLACE FUNCTION
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist
DO $$ BEGIN
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
SFUNC = CDB_PyAggS,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{}"
);
EXCEPTION
WHEN duplicate_function THEN NULL;
END $$;
DO $$
BEGIN
IF NOT EXISTS (
SELECT *
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE n.nspname = 'cdb_crankshaft'
AND p.proname = 'cdb_pyagg'
AND p.proisagg)
THEN
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
SFUNC = CDB_PyAggS,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{}"
);
END IF;
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment(
@@ -82,13 +91,14 @@ AS $$
target_ids,
model_params)
$$ LANGUAGE plpython3u VOLATILE PARALLEL RESTRICTED;
$$ LANGUAGE plpythonu VOLATILE PARALLEL RESTRICTED;
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment(
query TEXT,
variable_name TEXT,
target_table TEXT,
model_name text DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -105,17 +115,51 @@ AS $$
'learning_rate': learning_rate,
'min_samples_leaf': min_samples_leaf
}
feature_cols = set(plpy.execute('''
all_cols = list(plpy.execute('''
select * from ({query}) as _w limit 0
'''.format(query=query)).colnames()) - set([variable_name, 'cartodb_id', ])
'''.format(query=query)).colnames())
feature_cols = [a for a in all_cols
if a not in [variable_name, 'cartodb_id', ]]
return seg.create_and_predict_segment(
query,
variable_name,
feature_cols,
target_table,
model_params
model_params,
model_name=model_name
)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_RetrieveModelParams(
model_name text,
param_name text
)
RETURNS TABLE(param numeric, feature_name text) AS $$
import pickle
from collections import Iterable
plan = plpy.prepare('''
SELECT model, feature_names FROM model_storage
WHERE name = $1;
''', ['text', ])
try:
model_encoded = plpy.execute(plan, [model_name, ])
except plpy.SPIError as err:
plpy.error('ERROR: {}'.format(err))
plpy.notice(model_encoded[0]['feature_names'])
model = pickle.loads(
model_encoded[0]['model']
)
res = getattr(model, param_name)
if not isinstance(res, Iterable):
raise Exception('Cannot return `{}` as a table'.format(param_name))
return zip(res, model_encoded[0]['feature_names'])
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment(
@@ -123,6 +167,7 @@ CREATE OR REPLACE FUNCTION
variable TEXT,
feature_columns TEXT[],
target_query TEXT,
model_name TEXT DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -144,9 +189,10 @@ AS $$
variable,
feature_columns,
target_query,
model_params
model_params,
model_name=model_name
)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_Gravity(
IN target_query text,
IN weight_column text,
@@ -656,7 +702,7 @@ AS $$
moran = Moran()
return moran.global_stat(subquery, column_name, w_type,
num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Local (internal function) - DEPRECATED
CREATE OR REPLACE FUNCTION
@@ -681,7 +727,7 @@ AS $$
num_ngbrs, permutations, geom_col, id_col)
# remove spatial lag
return [(r[6], r[0], r[1], r[7], r[5]) for r in result]
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Local (internal function)
CREATE OR REPLACE FUNCTION
@@ -709,7 +755,7 @@ moran = Moran()
return moran.local_stat(subquery, column_name, w_type,
num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Local (public-facing function)
@@ -836,7 +882,7 @@ AS $$
# TODO: use named parameters or a dictionary
return moran.global_rate_stat(subquery, numerator, denominator, w_type,
num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Local Rate (internal function) - DEPRECATED
@@ -864,7 +910,7 @@ AS $$
result = moran.local_rate_stat(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
# remove spatial lag
return [(r[6], r[0], r[1], r[7], r[4]) for r in result]
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Local Rate (public-facing function) - DEPRECATED
CREATE OR REPLACE FUNCTION
@@ -920,7 +966,7 @@ return moran.local_rate_stat(
geom_col,
id_col
)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Rate
-- Replaces CDB_AreasOfInterestLocalRate
@@ -1033,7 +1079,7 @@ from crankshaft.clustering import Kmeans
kmeans = Kmeans()
return kmeans.spatial(query, no_clusters, no_init)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Non-spatial k-means clustering
-- query: sql query to retrieve all the needed data
@@ -1063,7 +1109,7 @@ kmeans = Kmeans()
return kmeans.nonspatial(query, colnames, no_clusters,
standardize=standardize,
id_col=id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(
@@ -1104,19 +1150,27 @@ BEGIN
END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist
DO $$ BEGIN
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{0.0,0.0,0.0}"
);
EXCEPTION
WHEN duplicate_function THEN NULL;
END $$;
DO $$
BEGIN
IF NOT EXISTS (
SELECT *
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE n.nspname = 'cdb_crankshaft'
AND p.proname = 'cdb_weightedmean'
AND p.proisagg)
THEN
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{0.0,0.0,0.0}"
);
END IF;
END
$$ LANGUAGE plpgsql;
-- Spatial Markov
-- input table format:
@@ -1146,7 +1200,7 @@ AS $$
## TODO: use named parameters or a dictionary
return markov.spatial_trend(subquery, time_cols, num_classes, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- input table format: identical to above but in a predictable format
-- Sample function call:
@@ -1172,7 +1226,7 @@ $$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
-- from crankshaft.clustering import moran_local
-- # TODO: use named parameters or a dictionary
-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs)
-- $$ LANGUAGE plpython3u;
-- $$ LANGUAGE plpythonu;
--
-- -- input table format:
-- -- id | geom | date | measurement
@@ -1198,7 +1252,7 @@ $$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
-- from crankshaft.clustering import moran_local
-- # TODO: use named parameters or a dictionary
-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs)
-- $$ LANGUAGE plpython3u;
-- $$ LANGUAGE plpythonu;
-- Based on:
-- https://github.com/mapbox/polylabel/blob/master/index.js
-- https://sites.google.com/site/polesofinaccessibility/
@@ -1468,7 +1522,7 @@ AS $$
from crankshaft.clustering import Getis
getis = Getis()
return getis.getis_ord(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- TODO: make a version that accepts the values as arrays
@@ -1808,7 +1862,7 @@ gwr = GWR()
return gwr.gwr(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
@@ -1826,7 +1880,7 @@ gwr = GWR()
return gwr.gwr_predict(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
--
-- Creates N points randomly distributed around the polygon
--

View File

@@ -4,7 +4,7 @@
-- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$
SELECT '0.9.0'::text;
SELECT '0.9.2'::text;
$$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE;
-- Internal identifier of the installed extension instance
@@ -21,7 +21,7 @@ _cdb_random_seeds (seed_value INTEGER) RETURNS VOID
AS $$
from crankshaft import random_seeds
random_seeds.set_random_seeds(seed_value)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_PyAggS(current_state Numeric[], current_row Numeric[])
returns NUMERIC[] as $$
@@ -35,16 +35,25 @@ CREATE OR REPLACE FUNCTION
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist
DO $$ BEGIN
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
SFUNC = CDB_PyAggS,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{}"
);
EXCEPTION
WHEN duplicate_function THEN NULL;
END $$;
DO $$
BEGIN
IF NOT EXISTS (
SELECT *
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE n.nspname = 'cdb_crankshaft'
AND p.proname = 'cdb_pyagg'
AND p.proisagg)
THEN
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
SFUNC = CDB_PyAggS,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{}"
);
END IF;
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment(
@@ -82,13 +91,14 @@ AS $$
target_ids,
model_params)
$$ LANGUAGE plpython3u VOLATILE PARALLEL RESTRICTED;
$$ LANGUAGE plpythonu VOLATILE PARALLEL RESTRICTED;
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment(
query TEXT,
variable_name TEXT,
target_table TEXT,
model_name text DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -105,17 +115,51 @@ AS $$
'learning_rate': learning_rate,
'min_samples_leaf': min_samples_leaf
}
feature_cols = set(plpy.execute('''
all_cols = list(plpy.execute('''
select * from ({query}) as _w limit 0
'''.format(query=query)).colnames()) - set([variable_name, 'cartodb_id', ])
'''.format(query=query)).colnames())
feature_cols = [a for a in all_cols
if a not in [variable_name, 'cartodb_id', ]]
return seg.create_and_predict_segment(
query,
variable_name,
feature_cols,
target_table,
model_params
model_params,
model_name=model_name
)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_RetrieveModelParams(
model_name text,
param_name text
)
RETURNS TABLE(param numeric, feature_name text) AS $$
import pickle
from collections import Iterable
plan = plpy.prepare('''
SELECT model, feature_names FROM model_storage
WHERE name = $1;
''', ['text', ])
try:
model_encoded = plpy.execute(plan, [model_name, ])
except plpy.SPIError as err:
plpy.error('ERROR: {}'.format(err))
plpy.notice(model_encoded[0]['feature_names'])
model = pickle.loads(
model_encoded[0]['model']
)
res = getattr(model, param_name)
if not isinstance(res, Iterable):
raise Exception('Cannot return `{}` as a table'.format(param_name))
return zip(res, model_encoded[0]['feature_names'])
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment(
@@ -123,6 +167,7 @@ CREATE OR REPLACE FUNCTION
variable TEXT,
feature_columns TEXT[],
target_query TEXT,
model_name TEXT DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -144,9 +189,10 @@ AS $$
variable,
feature_columns,
target_query,
model_params
model_params,
model_name=model_name
)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_Gravity(
IN target_query text,
IN weight_column text,
@@ -656,7 +702,7 @@ AS $$
moran = Moran()
return moran.global_stat(subquery, column_name, w_type,
num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Local (internal function) - DEPRECATED
CREATE OR REPLACE FUNCTION
@@ -681,7 +727,7 @@ AS $$
num_ngbrs, permutations, geom_col, id_col)
# remove spatial lag
return [(r[6], r[0], r[1], r[7], r[5]) for r in result]
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Local (internal function)
CREATE OR REPLACE FUNCTION
@@ -709,7 +755,7 @@ moran = Moran()
return moran.local_stat(subquery, column_name, w_type,
num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Local (public-facing function)
@@ -836,7 +882,7 @@ AS $$
# TODO: use named parameters or a dictionary
return moran.global_rate_stat(subquery, numerator, denominator, w_type,
num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Local Rate (internal function) - DEPRECATED
@@ -864,7 +910,7 @@ AS $$
result = moran.local_rate_stat(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
# remove spatial lag
return [(r[6], r[0], r[1], r[7], r[4]) for r in result]
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Local Rate (public-facing function) - DEPRECATED
CREATE OR REPLACE FUNCTION
@@ -920,7 +966,7 @@ return moran.local_rate_stat(
geom_col,
id_col
)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Moran's I Rate
-- Replaces CDB_AreasOfInterestLocalRate
@@ -1033,7 +1079,7 @@ from crankshaft.clustering import Kmeans
kmeans = Kmeans()
return kmeans.spatial(query, no_clusters, no_init)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- Non-spatial k-means clustering
-- query: sql query to retrieve all the needed data
@@ -1063,7 +1109,7 @@ kmeans = Kmeans()
return kmeans.nonspatial(query, colnames, no_clusters,
standardize=standardize,
id_col=id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(
@@ -1104,19 +1150,27 @@ BEGIN
END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist
DO $$ BEGIN
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{0.0,0.0,0.0}"
);
EXCEPTION
WHEN duplicate_function THEN NULL;
END $$;
DO $$
BEGIN
IF NOT EXISTS (
SELECT *
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE n.nspname = 'cdb_crankshaft'
AND p.proname = 'cdb_weightedmean'
AND p.proisagg)
THEN
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{0.0,0.0,0.0}"
);
END IF;
END
$$ LANGUAGE plpgsql;
-- Spatial Markov
-- input table format:
@@ -1146,7 +1200,7 @@ AS $$
## TODO: use named parameters or a dictionary
return markov.spatial_trend(subquery, time_cols, num_classes, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- input table format: identical to above but in a predictable format
-- Sample function call:
@@ -1172,7 +1226,7 @@ $$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
-- from crankshaft.clustering import moran_local
-- # TODO: use named parameters or a dictionary
-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs)
-- $$ LANGUAGE plpython3u;
-- $$ LANGUAGE plpythonu;
--
-- -- input table format:
-- -- id | geom | date | measurement
@@ -1198,7 +1252,7 @@ $$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
-- from crankshaft.clustering import moran_local
-- # TODO: use named parameters or a dictionary
-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs)
-- $$ LANGUAGE plpython3u;
-- $$ LANGUAGE plpythonu;
-- Based on:
-- https://github.com/mapbox/polylabel/blob/master/index.js
-- https://sites.google.com/site/polesofinaccessibility/
@@ -1468,7 +1522,7 @@ AS $$
from crankshaft.clustering import Getis
getis = Getis()
return getis.getis_ord(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- TODO: make a version that accepts the values as arrays
@@ -1808,7 +1862,7 @@ gwr = GWR()
return gwr.gwr(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
@@ -1826,7 +1880,7 @@ gwr = GWR()
return gwr.gwr_predict(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
--
-- Creates N points randomly distributed around the polygon
--

8 file diffs suppressed because they are too large.

View File

@@ -1,5 +1,5 @@
comment = 'CartoDB Spatial Analysis extension'
default_version = '0.9.0'
requires = 'plpython3u, postgis'
default_version = '0.9.5'
requires = 'plpythonu, postgis'
superuser = true
schema = cdb_crankshaft

View File

@@ -1,5 +0,0 @@
joblib==0.9.4
numpy==1.11.0
scipy==0.17.0
pysal==1.14.3
scikit-learn==0.17.0

View File

@@ -4,4 +4,4 @@ import crankshaft.clustering
import crankshaft.space_time_dynamics
import crankshaft.segmentation
import crankshaft.regression
from . import analysis_data_provider
import analysis_data_provider
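Several `__init__.py` diffs in this compare toggle between `import module` and `from . import module`. The bare form relies on Python 2's implicit relative imports, which Python 3 removed, so only the explicit relative form imports a sibling module of a package on both versions. A self-contained sketch using a throwaway, hypothetical package (`demo_pkg` and `helper` are illustrative names):

```python
import os
import sys
import tempfile

# Build a throwaway package on the fly to show the difference.
pkg_root = tempfile.mkdtemp()
os.mkdir(os.path.join(pkg_root, "demo_pkg"))
with open(os.path.join(pkg_root, "demo_pkg", "helper.py"), "w") as f:
    f.write("VALUE = 42\n")
with open(os.path.join(pkg_root, "demo_pkg", "__init__.py"), "w") as f:
    # "import helper" here would raise ModuleNotFoundError on Python 3;
    # the explicit relative import works on both Python 2 and 3.
    f.write("from . import helper\n")

sys.path.insert(0, pkg_root)
import demo_pkg
print(demo_pkg.helper.VALUE)  # 42
```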

View File

@@ -1,6 +1,6 @@
"""class for fetching data"""
import plpy
from . import pysal_utils as pu
import pysal_utils as pu
NULL_VALUE_ERROR = ('No usable data passed to analysis. Check your input rows '
'for null values and fill in appropriately.')

View File

@@ -0,0 +1,76 @@
"""
Based on the Weiszfeld algorithm:
https://en.wikipedia.org/wiki/Geometric_median
"""
# import plpy
import numpy as np
from numpy.linalg import norm
def median_center(tablename, geom_col, num_iters=50, tolerance=0.001):
query = '''
SELECT array_agg(ST_X({geom_col})) As x_coords,
array_agg(ST_Y({geom_col})) As y_coords
FROM {tablename}
'''.format(geom_col=geom_col, tablename=tablename)
try:
resp = plpy.execute(query)
data = np.vstack((resp['x_coords'][0],
resp['y_coords'][0])).T
plpy.notice('coords: %s' % str(coords))
except Exception, err:
# plpy.error('Analysis failed: %s' % err)
print('No plpy')
data = np.array([[1.2 * np.random.random() + 10.,
1.1 * (np.random.random() - 1.) + 3.]
for i in range(1, 100)])
# initialize 'median center' to be the mean
coords_center_temp = data.mean(axis=0)
# plpy.notice('temp_center: %s' % str(coords_center_temp))
print('temp_center: %s' % str(coords_center_temp))
for i in range(0, num_iters):
old_coords_center = coords_center_temp.copy()
denom = denominator(coords_center_temp, data)
coords_center_temp = np.sum([data[j] * numerator(coords_center_temp,
data[j])
for j in range(len(data))], axis=0)
coords_center_temp = coords_center_temp / denom
print("Pass #%d" % i)
print("max, min of data: %0.4f, %0.4f" % (data.max(), data.min()))
print('temp_center: %s' % str(coords_center_temp))
print("Change in center: %0.4f" % np.linalg.norm(old_coords_center -
coords_center_temp))
print("Center coords: %s" % str(coords_center_temp))
print("Objective Function: %0.4f" % obj_func(coords_center_temp, data))
return coords_center_temp
def obj_func(center_coords, data):
"""
"""
return np.linalg.norm(center_coords - data)
def numerator(center_coords, data_i):
"""
"""
return np.reciprocal(np.linalg.norm(center_coords - data_i))
def denominator(center_coords, data):
"""
"""
return np.reciprocal(np.linalg.norm(data - center_coords))
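The module above computes the geometric median via the Weiszfeld update inside a plpy context. A self-contained sketch of the same iteration on plain NumPy arrays (hypothetical sample points, not the module's own helpers):

```python
import numpy as np

def geometric_median(points, num_iters=50, tol=1e-6):
    """Weiszfeld iteration: reweight points by the inverse distance to the current center."""
    center = points.mean(axis=0)                 # start from the centroid, as the module does
    for _ in range(num_iters):
        dists = np.linalg.norm(points - center, axis=1)
        dists = np.maximum(dists, 1e-12)         # guard against a zero distance
        weights = 1.0 / dists
        new_center = (points * weights[:, None]).sum(axis=0) / weights.sum()
        if np.linalg.norm(new_center - center) < tol:
            break
        center = new_center
    return center

points = np.random.random((100, 2)) + [10.0, 3.0]
print(geometric_median(points))
```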

View File

@@ -1,4 +1,4 @@
"""Import all functions from for clustering"""
from .moran import *
from .kmeans import *
from .getis import *
from moran import *
from kmeans import *
from getis import *

View File

@@ -47,4 +47,4 @@ class Getis(object):
getis = ps.esda.getisord.G_Local(attr_vals, weight,
star=True, permutations=permutations)
return list(zip(getis.z_sim, getis.p_sim, getis.p_z_sim, weight.id_order))
return zip(getis.z_sim, getis.p_sim, getis.p_z_sim, weight.id_order)
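This `list(zip(...))` vs `zip(...)` change repeats across the clustering diffs (getis, kmeans, moran). A quick note on why it matters: Python 2's `zip` returns a list, but Python 3's returns a lazy iterator, so code that must hand concrete rows back to PL/Python wraps it in `list(...)`:

```python
pairs = zip([0.1, 0.2], [0.01, 0.05])
# Python 2: pairs is already [(0.1, 0.01), (0.2, 0.05)]
# Python 3: pairs is a zip object; materialize it before returning rows:
rows = list(pairs)
print(rows)  # [(0.1, 0.01), (0.2, 0.05)]
```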

View File

@@ -28,8 +28,8 @@ class Kmeans(object):
ids = result[0]['ids']
km = KMeans(n_clusters=no_clusters, n_init=no_init)
labels = km.fit_predict(list(zip(xs, ys)))
return list(zip(ids, labels))
labels = km.fit_predict(zip(xs, ys))
return zip(ids, labels)
def nonspatial(self, subquery, colnames, no_clusters=5,
standardize=True, id_col='cartodb_id'):
@@ -75,18 +75,18 @@ class Kmeans(object):
kmeans = KMeans(n_clusters=no_clusters,
random_state=0).fit(cluster_columns)
centers = [json.dumps(dict(list(zip(colnames, c))))
centers = [json.dumps(dict(zip(colnames, c)))
for c in kmeans.cluster_centers_[kmeans.labels_]]
silhouettes = metrics.silhouette_samples(cluster_columns,
kmeans.labels_,
metric='sqeuclidean')
return list(zip(kmeans.labels_,
return zip(kmeans.labels_,
centers,
silhouettes,
[kmeans.inertia_] * kmeans.labels_.shape[0],
data[0]['rowid']))
data[0]['rowid'])
# -- Preprocessing steps
@@ -99,7 +99,7 @@ def _extract_columns(data):
# number of columns minus rowid column
n_cols = len(data[0]) - 1
return np.array([data[0]['arr_col{0}'.format(i+1)]
for i in range(n_cols)],
for i in xrange(n_cols)],
dtype=float).T

View File

@@ -75,7 +75,7 @@ class Moran(object):
moran_global = ps.esda.moran.Moran(attr_vals, weight,
permutations=permutations)
return list(zip([moran_global.I], [moran_global.EI]))
return zip([moran_global.I], [moran_global.EI])
def local_stat(self, subquery, attr,
w_type, num_ngbrs, permutations, geom_col, id_col):
@@ -139,7 +139,7 @@ class Moran(object):
lag = ps.weights.spatial_lag.lag_spatial(weight, lisa.y)
lag_std = ps.weights.spatial_lag.lag_spatial(weight, lisa.z)
return list(zip(
return zip(
quads,
lisa.p_sim,
lag,
@@ -148,7 +148,7 @@ class Moran(object):
lisa.z,
lisa.Is,
weight.id_order
))
)
def global_rate_stat(self, subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col):
@@ -194,7 +194,7 @@ class Moran(object):
lisa_rate = ps.esda.moran.Moran_Rate(numer, denom, weight,
permutations=permutations)
return list(zip([lisa_rate.I], [lisa_rate.EI]))
return zip([lisa_rate.I], [lisa_rate.EI])
def local_rate_stat(self, subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col):
@@ -262,7 +262,7 @@ class Moran(object):
lag = ps.weights.spatial_lag.lag_spatial(weight, lisa.y)
lag_std = ps.weights.spatial_lag.lag_spatial(weight, lisa.z)
return list(zip(
return zip(
quads,
lisa.p_sim,
lag,
@@ -271,7 +271,7 @@ class Moran(object):
lisa.z,
lisa.Is,
weight.id_order
))
)
def local_bivariate_stat(self, subquery, attr1, attr2,
permutations, geom_col, id_col,
@@ -303,7 +303,7 @@ class Moran(object):
# find clustering of significance
lisa_sig = quad_position(lisa.q)
return list(zip(lisa.Is, lisa_sig, lisa.p_sim, weight.id_order))
return zip(lisa.Is, lisa_sig, lisa.p_sim, weight.id_order)
# Low level functions ----------------------------------------

View File

@@ -0,0 +1 @@
from core import set_model, get_model, create_model_table

View File

@@ -0,0 +1,86 @@
import time
import plpy
import pickle
from petname import generate
def create_model_table():
q = '''
create table if not exists model_storage(
description text,
name text unique,
model bytea,
feature_names text[],
date_created timestamptz,
id serial primary key);
'''
plpy.notice(q)
plan = plpy.prepare(q)
resp = plpy.execute(plan)
plpy.notice('Model table successfully created')
plpy.notice(str(resp))
def get_model(model_name):
"""retrieve model if it exists"""
try:
plan = plpy.prepare('''
SELECT model FROM model_storage
WHERE name = $1;
''', ['text', ])
model_encoded = plpy.execute(plan, [model_name, ])
if len(model_encoded) == 1:
model = pickle.loads(
model_encoded[0]['model']
)
plpy.notice('Model successfully loaded')
else:
plpy.notice('Model not found, or too many models '
'({})'.format(len(model_encoded)))
model = None
except plpy.SPIError as err:
plpy.error('ERROR: {}'.format(err))
return model
def set_model(model, model_name, feature_names):
"""stores the model in the table model_storage"""
if model_name is None:
model_name = generate(words=2, separator='_', letters=8)
existing_names = plpy.execute('''
SELECT array_agg(name) as name
FROM model_storage
''')
plpy.notice('nrows: {}'.format(existing_names.nrows()))
plpy.notice('MODEL NAME: {}'.format(model_name))
plpy.notice('LEN of ms: {}'.format(len(existing_names)))
plpy.notice('existing_names: {}'.format(str(existing_names)))
plpy.notice('existing_names: {}'.format(str(existing_names[0]['name'])))
plpy.notice('type existing_names: {}'.format(type(existing_names[0]['name'])))
if existing_names[0]['name'] is not None:
while model_name in existing_names[0]['name']:
model_name = generate(words=2, separator='_', letters=10)
plpy.notice(model_name)
# store model
try:
plan = plpy.prepare('''
INSERT INTO model_storage(description, name, model, feature_names, date_created)
VALUES (
$1,
$2,
$3,
$4::text[],
to_timestamp($5));
''', ['text', 'text', 'bytea', 'text', 'numeric'])
plpy.notice('{%s}' % ','.join(feature_names))
plpy.notice(feature_names)
plpy.execute(
plan,
[' '.join(m.strip() for m in model.__repr__().split('\n')),
model_name,
pickle.dumps(model),
'{%s}' % ','.join(feature_names),
time.time()]
)
plpy.notice('model successfully stored as {}'.format(model_name))
except plpy.SPIError as err:
plpy.notice('ERROR: {}\nt: {}'.format(err, time.time()))
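`set_model`/`get_model` above round-trip a fitted model through `pickle` into the `bytea` column of `model_storage`. A minimal sketch of that round trip without PL/Python, using a stand-in object instead of a fitted estimator:

```python
import pickle

class DummyModel(object):
    """Stands in for a fitted scikit-learn estimator."""
    feature_importances_ = [0.7, 0.3]

blob = pickle.dumps(DummyModel())     # roughly what set_model writes to the bytea column
restored = pickle.loads(blob)         # roughly what get_model reconstructs
print(restored.feature_importances_)  # [0.7, 0.3]
```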

View File

@@ -27,7 +27,7 @@ def get_weight(query_res, w_type='knn', num_ngbrs=5):
"""
neighbors = {x['id']: x['neighbors'] for x in query_res}
print('len of neighbors: %d' % len(neighbors))
print 'len of neighbors: %d' % len(neighbors)
built_weight = ps.W(neighbors)
built_weight.transform = 'r'

View File

@@ -1,4 +1,4 @@
from . import glm
from . import family
from . import utils
from . import iwls
import glm
import family
import utils
import iwls

View File

@@ -1,9 +1,8 @@
from __future__ import print_function
import numpy as np
from scipy import stats
from .utils import cache_readonly
from functools import reduce
from utils import cache_readonly
class Results(object):
"""

View File

@@ -7,8 +7,8 @@ The one parameter exponential family distributions used by GLM.
import numpy as np
from scipy import special
from . import links as L
from . import varfuncs as V
import links as L
import varfuncs as V
FLOAT_EPS = np.finfo(float).eps

View File

@@ -3,10 +3,10 @@ import numpy as np
import numpy.linalg as la
from pysal.spreg.utils import RegressionPropsY, spdot
import pysal.spreg.user_output as USER
from .utils import cache_readonly
from .base import LikelihoodModelResults
from . import family
from .iwls import iwls
from utils import cache_readonly
from base import LikelihoodModelResults
import family
from iwls import iwls
__all__ = ['GLM']

View File

@@ -3,7 +3,7 @@ import numpy.linalg as la
from scipy import sparse as sp
from scipy.sparse import linalg as spla
from pysal.spreg.utils import spdot, spmultiply
from .family import Binomial, Poisson
from family import Binomial, Poisson
def _compute_betas(y, x):
"""
@@ -49,7 +49,7 @@ def iwls(y, x, family, offset=1.0, ini_betas=None, tol=1.0e-8, max_iter=200, wi=
if isinstance(family, Binomial):
y = family.link._clean(y)
if isinstance(family, Poisson):
y_off = y/offset
y_off = y/offset
y_off = family.starting_mu(y_off)
v = family.predict(y_off)
mu = family.starting_mu(y)
@@ -58,13 +58,13 @@ def iwls(y, x, family, offset=1.0, ini_betas=None, tol=1.0e-8, max_iter=200, wi=
v = family.predict(mu)
while diff > tol and n_iter < max_iter:
n_iter += 1
n_iter += 1
w = family.weights(mu)
z = v + (family.link.deriv(mu)*(y-mu))
w = np.sqrt(w)
if type(x) != np.ndarray:
w = sp.csr_matrix(w)
z = sp.csr_matrix(z)
w = sp.csr_matrix(w)
z = sp.csr_matrix(z)
wx = spmultiply(x, w, array_out=False)
wz = spmultiply(z, w, array_out=False)
if wi is None:
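The `iwls` hunks above are mostly whitespace, but for context: each iteration of iteratively reweighted least squares solves a weighted least-squares problem for the working response `z` with weights `w`. A hedged sketch of a single such solve on synthetic data (plain NumPy, not the crankshaft implementation):

```python
import numpy as np

def wls_step(X, z, w):
    # One weighted least-squares solve: beta = (X' W X)^-1 X' W z, with W = diag(w).
    Xw = X * w[:, None]
    return np.linalg.solve(Xw.T @ X, Xw.T @ z)

rng = np.random.default_rng(0)
X = np.column_stack([np.ones(20), rng.random(20)])
z = 1.0 + 2.0 * X[:, 1] + 0.1 * rng.standard_normal(20)
w = np.ones(20)
print(wls_step(X, z, w))  # with unit weights this reduces to ordinary least squares
```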

View File

@@ -1,5 +1,5 @@
from __future__ import absolute_import, print_function
import numpy as np
import warnings
@@ -17,7 +17,7 @@ try:
from scipy.lib._version import NumpyVersion
except ImportError:
import re
string_types = str
string_types = basestring
class NumpyVersion():
"""Parse and compare numpy version strings.

View File

@@ -1 +1 @@
from .base import *
from base import *

View File

@@ -1,4 +1,4 @@
from . import gwr
from . import sel_bw
from . import diagnostics
from . import kernels
import gwr
import sel_bw
import diagnostics
import kernels

View File

@@ -7,8 +7,8 @@ __author__ = "Taylor Oshan Tayoshan@gmail.com"
import numpy as np
import numpy.linalg as la
from scipy.stats import t
from .kernels import *
from .diagnostics import get_AIC, get_AICc, get_BIC
from kernels import *
from diagnostics import get_AIC, get_AICc, get_BIC
import pysal.spreg.user_output as USER
from crankshaft.regression.glm.family import Gaussian, Binomial, Poisson
from crankshaft.regression.glm.glm import GLM, GLMResults
@@ -156,7 +156,7 @@ class GWR(GLM):
self.kernel = kernel
self.fixed = fixed
if offset is None:
self.offset = np.ones((self.n, 1))
self.offset = np.ones((self.n, 1))
else:
self.offset = offset * 1.0
self.fit_params = {}
@@ -169,7 +169,7 @@ class GWR(GLM):
def _build_W(self, fixed, kernel, coords, bw, points=None):
if fixed:
try:
W = fk[kernel](coords, bw, points)
W = fk[kernel](coords, bw, points)
except:
raise TypeError('Unsupported kernel function ', kernel)
else:
@@ -177,6 +177,7 @@ class GWR(GLM):
W = ak[kernel](coords, bw, points)
except:
raise TypeError('Unsupported kernel function ', kernel)
return W
def fit(self, ini_params=None, tol=1.0e-5, max_iter=20, solve='iwls'):
@@ -217,7 +218,8 @@ class GWR(GLM):
p = np.zeros((m, 1))
for i in range(m):
wi = self.W[i].reshape((-1,1))
rslt = iwls(self.y, self.X, self.family, self.offset, ini_params, tol, max_iter, wi=wi)
rslt = iwls(self.y, self.X, self.family, self.offset,
ini_params, tol, max_iter, wi=wi)
params[i,:] = rslt[0].T
predy[i] = rslt[1][i]
v[i] = rslt[2][i]
@@ -257,7 +259,7 @@ class GWR(GLM):
fit_params : dict
key-value pairs of parameters that will be passed into fit method to define estimation
routine; see fit method for more details
"""
if (exog_scale is None) & (exog_resid is None):
train_gwr = self.fit(**fit_params)
@@ -496,7 +498,7 @@ class GWRResults(GLMResults):
"""
if exog_scale is not None:
return cov*exog_scale
return cov*exog_scale
else:
return cov*self.scale
@@ -520,7 +522,7 @@ class GWRResults(GLMResults):
weighted mean of y
"""
if self.model.points is not None:
n = len(self.model.points)
n = len(self.model.points)
else:
n = self.n
off = self.offset.reshape((-1,1))
@@ -543,13 +545,13 @@ class GWRResults(GLMResults):
"""
if self.model.points is not None:
n = len(self.model.points)
n = len(self.model.points)
else:
n = self.n
TSS = np.zeros(shape=(n,1))
for i in range(n):
TSS[i] = np.sum(np.reshape(np.array(self.W[i]), (-1,1)) *
(self.y.reshape((-1,1)) - self.y_bar[i])**2)
TSS[i] = np.sum(np.reshape(np.array(self.W[i]), (-1,1)) *
(self.y.reshape((-1,1)) - self.y_bar[i])**2)
return TSS
@cache_readonly
@@ -563,15 +565,15 @@ class GWRResults(GLMResults):
relationships.
"""
if self.model.points is not None:
n = len(self.model.points)
resid = self.model.exog_resid.reshape((-1,1))
n = len(self.model.points)
resid = self.model.exog_resid.reshape((-1,1))
else:
n = self.n
resid = self.resid_response.reshape((-1,1))
RSS = np.zeros(shape=(n,1))
RSS = np.zeros(shape=(n,1))
for i in range(n):
RSS[i] = np.sum(np.reshape(np.array(self.W[i]), (-1,1))
* resid**2)
* resid**2)
return RSS
@cache_readonly
@@ -617,10 +619,10 @@ class GWRResults(GLMResults):
"""
if isinstance(self.family, (Poisson, Binomial)):
return self.resid_ss/(self.n - 2.0*self.tr_S +
self.tr_STS) #could be changed to SWSTW - nothing to test against
self.tr_STS) #could be changed to SWSTW - nothing to test against
else:
return self.resid_ss/(self.n - 2.0*self.tr_S +
self.tr_STS) #could be changed to SWSTW - nothing to test against
self.tr_STS) #could be changed to SWSTW - nothing to test against
@cache_readonly
def sigma2_ML(self):
"""
@@ -673,14 +675,14 @@ class GWRResults(GLMResults):
Note: in (9.11), p should be tr(S), that is, the effective number of parameters
"""
return self.std_res**2 * self.influ / (self.tr_S * (1.0-self.influ))
@cache_readonly
def deviance(self):
off = self.offset.reshape((-1,1)).T
y = self.y
ybar = self.y_bar
if isinstance(self.family, Gaussian):
raise NotImplementedError('deviance not currently used for Gaussian')
raise NotImplementedError('deviance not currently used for Gaussian')
elif isinstance(self.family, Poisson):
dev = np.sum(2.0*self.W*(y*np.log(y/(ybar*off))-(y-ybar*off)),axis=1)
elif isinstance(self.family, Binomial):
@@ -690,7 +692,7 @@ class GWRResults(GLMResults):
@cache_readonly
def resid_deviance(self):
if isinstance(self.family, Gaussian):
raise NotImplementedError('deviance not currently used for Gaussian')
raise NotImplementedError('deviance not currently used for Gaussian')
else:
off = self.offset.reshape((-1,1)).T
y = self.y
@@ -708,7 +710,7 @@ class GWRResults(GLMResults):
manual. Equivalent to 1 - (deviance/null deviance)
"""
if isinstance(self.family, Gaussian):
raise NotImplementedError('Not implemented for Gaussian')
raise NotImplementedError('Not implemented for Gaussian')
else:
return 1.0 - (self.resid_deviance/self.deviance)
@@ -831,8 +833,8 @@ class GWRResults(GLMResults):
def predictions(self):
P = self.model.P
if P is None:
raise NotImplementedError('predictions only avaialble if predict'
'method called on GWR model')
raise NotImplementedError('predictions only avaialble if predict'
'method called on GWR model')
else:
predictions = np.sum(P*self.params, axis=1).reshape((-1,1))
return predictions
@@ -985,7 +987,7 @@ class FBGWR(GWR):
self.fixed = fixed
self.constant = constant
if constant:
self.X = USER.check_constant(self.X)
self.X = USER.check_constant(self.X)
def fit(self, ini_params=None, tol=1.0e-5, max_iter=20, solve='iwls'):
"""
View File
@@ -47,14 +47,14 @@ def golden_section(a, c, delta, function, tol, max_iter, int_score=False):
while np.abs(diff) > tol and iters < max_iter:
iters += 1
if int_score:
b = np.round(b)
d = np.round(d)
b = np.round(b)
d = np.round(d)
score_a = function(a)
score_b = function(b)
score_c = function(c)
score_d = function(d)
if score_b <= score_d:
opt_val = b
opt_score = score_b
@@ -73,7 +73,7 @@ def golden_section(a, c, delta, function, tol, max_iter, int_score=False):
#d = np.round(b)
#if int_score:
# opt_val = np.round(opt_val)
# opt_val = np.round(opt_val)
output.append((opt_val, opt_score))
diff = score_b - score_d
score = opt_score
@@ -146,7 +146,7 @@ def flexible_bw(init, y, X, n, k, family, tol, max_iter, rss_score,
gwr_func, bw_func, sel_func):
if init:
bw = sel_func(bw_func(y, X))
print(bw)
print bw
optim_model = gwr_func(y, X, bw)
err = optim_model.resid_response.reshape((-1,1))
est = optim_model.params
@@ -198,7 +198,7 @@ def flexible_bw(init, y, X, n, k, family, tol, max_iter, rss_score,
new_rss = np.sum((y - predy)**2)
score = np.abs((new_rss - rss)/new_rss)
rss = new_rss
print(score)
print score
scores.append(score)
delta = score
BWs.append(bws)
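The search module above relies on a golden-section search over candidate bandwidths. A minimal standalone sketch of that interval-shrinking strategy (written here for illustration, not taken from the module) is:

```python
import numpy as np

def golden_section_search(a, c, function, tol=1.0e-5, max_iter=200):
    """Minimal golden-section minimizer over [a, c]."""
    delta = 0.38197  # 2 - golden ratio: proportion used to place interior points
    b = a + delta * np.abs(c - a)
    d = c - delta * np.abs(c - a)
    for _ in range(max_iter):
        if function(b) <= function(d):
            c = d  # minimum lies in [a, d]
        else:
            a = b  # minimum lies in [b, c]
        b = a + delta * np.abs(c - a)
        d = c - delta * np.abs(c - a)
        if np.abs(c - a) < tol:
            break
    opt_val = (a + c) / 2.0
    return opt_val, function(opt_val)

# e.g. golden_section_search(0.0, 10.0, lambda x: (x - 3.2) ** 2)
# converges to roughly (3.2, 0.0)
```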
View File
@@ -8,12 +8,12 @@
__author__ = "Taylor Oshan Tayoshan@gmail.com"
from .kernels import *
from .search import golden_section, equal_interval, flexible_bw
from .gwr import GWR
from kernels import *
from search import golden_section, equal_interval, flexible_bw
from gwr import GWR
from crankshaft.regression.glm.family import Gaussian, Poisson, Binomial
import pysal.spreg.user_output as USER
from .diagnostics import get_AICc, get_AIC, get_BIC, get_CV
from diagnostics import get_AICc, get_AIC, get_BIC, get_CV
from scipy.spatial.distance import pdist, squareform
from pysal.common import KDTree
import numpy as np
@@ -197,7 +197,7 @@ class Sel_BW(object):
if self.fb:
self._fbw()
print(self.bw[1])
print self.bw[1]
self.XB = self.bw[4]
self.err = self.bw[5]
else:
View File
@@ -14,7 +14,7 @@ import pysal
class TestGWRGaussian(unittest.TestCase):
def setUp(self):
data = pysal.open(pysal.examples.get_path('GData_utm.csv'))
self.coords = list(zip(data.by_col('X'), data.by_col('Y')))
self.coords = zip(data.by_col('X'), data.by_col('Y'))
self.y = np.array(data.by_col('PctBach')).reshape((-1,1))
rural = np.array(data.by_col('PctRural')).reshape((-1,1))
pov = np.array(data.by_col('PctPov')).reshape((-1,1))
@@ -56,10 +56,10 @@ class TestGWRGaussian(unittest.TestCase):
BIC = get_BIC(rslt)
CV = get_CV(rslt)
self.assertAlmostEqual(np.floor(AICc), 894.0)
self.assertAlmostEqual(np.floor(AIC), 890.0)
self.assertAlmostEqual(np.floor(BIC), 944.0)
self.assertAlmostEqual(np.round(CV,2), 18.25)
self.assertAlmostEquals(np.floor(AICc), 894.0)
self.assertAlmostEquals(np.floor(AIC), 890.0)
self.assertAlmostEquals(np.floor(BIC), 944.0)
self.assertAlmostEquals(np.round(CV,2), 18.25)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-04)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-04)
@@ -107,10 +107,10 @@ class TestGWRGaussian(unittest.TestCase):
BIC = get_BIC(rslt)
CV = get_CV(rslt)
self.assertAlmostEqual(np.floor(AICc), 896.0)
self.assertAlmostEqual(np.floor(AIC), 892.0)
self.assertAlmostEqual(np.floor(BIC), 941.0)
self.assertAlmostEqual(np.around(CV, 2), 19.19)
self.assertAlmostEquals(np.floor(AICc), 896.0)
self.assertAlmostEquals(np.floor(AIC), 892.0)
self.assertAlmostEquals(np.floor(BIC), 941.0)
self.assertAlmostEquals(np.around(CV, 2), 19.19)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-04)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-04)
@@ -159,10 +159,10 @@ class TestGWRGaussian(unittest.TestCase):
BIC = get_BIC(rslt)
CV = get_CV(rslt)
self.assertAlmostEqual(np.floor(AICc), 895.0)
self.assertAlmostEqual(np.floor(AIC), 890.0)
self.assertAlmostEqual(np.floor(BIC), 943.0)
self.assertAlmostEqual(np.around(CV, 2), 18.21)
self.assertAlmostEquals(np.floor(AICc), 895.0)
self.assertAlmostEquals(np.floor(AIC), 890.0)
self.assertAlmostEquals(np.floor(BIC), 943.0)
self.assertAlmostEquals(np.around(CV, 2), 18.21)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-04)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-04)
@@ -211,10 +211,10 @@ class TestGWRGaussian(unittest.TestCase):
BIC = get_BIC(rslt)
CV = get_CV(rslt)
self.assertAlmostEqual(np.floor(AICc), 896)
self.assertAlmostEqual(np.floor(AIC), 894.0)
self.assertAlmostEqual(np.floor(BIC), 922.0)
self.assertAlmostEqual(np.around(CV, 2), 17.91)
self.assertAlmostEquals(np.floor(AICc), 896)
self.assertAlmostEquals(np.floor(AIC), 894.0)
self.assertAlmostEquals(np.floor(BIC), 922.0)
self.assertAlmostEquals(np.around(CV, 2), 17.91)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-04)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-04)
@@ -314,7 +314,7 @@ class TestGWRGaussian(unittest.TestCase):
class TestGWRPoisson(unittest.TestCase):
def setUp(self):
data = pysal.open(pysal.examples.get_path('Tokyomortality.csv'), mode='Ur')
self.coords = list(zip(data.by_col('X_CENTROID'), data.by_col('Y_CENTROID')))
self.coords = zip(data.by_col('X_CENTROID'), data.by_col('Y_CENTROID'))
self.y = np.array(data.by_col('db2564')).reshape((-1,1))
self.off = np.array(data.by_col('eb2564')).reshape((-1,1))
OCC = np.array(data.by_col('OCC_TEC')).reshape((-1,1))
@@ -355,9 +355,9 @@ class TestGWRPoisson(unittest.TestCase):
AIC = get_AIC(rslt)
BIC = get_BIC(rslt)
self.assertAlmostEqual(np.floor(AICc), 13294.0)
self.assertAlmostEqual(np.floor(AIC), 13247.0)
self.assertAlmostEqual(np.floor(BIC), 13485.0)
self.assertAlmostEquals(np.floor(AICc), 13294.0)
self.assertAlmostEquals(np.floor(AIC), 13247.0)
self.assertAlmostEquals(np.floor(BIC), 13485.0)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-05)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-03)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-03)
@@ -404,9 +404,9 @@ class TestGWRPoisson(unittest.TestCase):
AIC = get_AIC(rslt)
BIC = get_BIC(rslt)
self.assertAlmostEqual(np.floor(AICc), 13285)
self.assertAlmostEqual(np.floor(AIC), 13259.0)
self.assertAlmostEqual(np.floor(BIC), 13442.0)
self.assertAlmostEquals(np.floor(AICc), 13285)
self.assertAlmostEquals(np.floor(AIC), 13259.0)
self.assertAlmostEquals(np.floor(BIC), 13442.0)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-02)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-02)
@@ -452,9 +452,9 @@ class TestGWRPoisson(unittest.TestCase):
AIC = get_AIC(rslt)
BIC = get_BIC(rslt)
self.assertAlmostEqual(np.floor(AICc), 367.0)
self.assertAlmostEqual(np.floor(AIC), 361.0)
self.assertAlmostEqual(np.floor(BIC), 451.0)
self.assertAlmostEquals(np.floor(AICc), 367.0)
self.assertAlmostEquals(np.floor(AIC), 361.0)
self.assertAlmostEquals(np.floor(BIC), 451.0)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-02,
atol=1e-02)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-02, atol=1e-02)
@@ -511,9 +511,9 @@ class TestGWRPoisson(unittest.TestCase):
AIC = get_AIC(rslt)
BIC = get_BIC(rslt)
self.assertAlmostEqual(np.floor(AICc), 11283.0)
self.assertAlmostEqual(np.floor(AIC), 11211.0)
self.assertAlmostEqual(np.floor(BIC), 11497.0)
self.assertAlmostEquals(np.floor(AICc), 11283.0)
self.assertAlmostEquals(np.floor(AIC), 11211.0)
self.assertAlmostEquals(np.floor(BIC), 11497.0)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-03)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-02)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-02)
@@ -559,9 +559,9 @@ class TestGWRPoisson(unittest.TestCase):
AIC = get_AIC(rslt)
BIC = get_BIC(rslt)
self.assertAlmostEqual(np.floor(AICc), 21070.0)
self.assertAlmostEqual(np.floor(AIC), 21069.0)
self.assertAlmostEqual(np.floor(BIC), 21111.0)
self.assertAlmostEquals(np.floor(AICc), 21070.0)
self.assertAlmostEquals(np.floor(AIC), 21069.0)
self.assertAlmostEquals(np.floor(BIC), 21111.0)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-02)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-02)
@@ -583,7 +583,7 @@ class TestGWRPoisson(unittest.TestCase):
class TestGWRBinomial(unittest.TestCase):
def setUp(self):
data = pysal.open(pysal.examples.get_path('landslides.csv'))
self.coords = list(zip(data.by_col('X'), data.by_col('Y')))
self.coords = zip(data.by_col('X'), data.by_col('Y'))
self.y = np.array(data.by_col('Landslid')).reshape((-1,1))
ELEV = np.array(data.by_col('Elev')).reshape((-1,1))
SLOPE = np.array(data.by_col('Slope')).reshape((-1,1))
@@ -630,9 +630,9 @@ class TestGWRBinomial(unittest.TestCase):
AIC = get_AIC(rslt)
BIC = get_BIC(rslt)
self.assertAlmostEqual(np.floor(AICc), 275.0)
self.assertAlmostEqual(np.floor(AIC), 271.0)
self.assertAlmostEqual(np.floor(BIC), 349.0)
self.assertAlmostEquals(np.floor(AICc), 275.0)
self.assertAlmostEquals(np.floor(AIC), 271.0)
self.assertAlmostEquals(np.floor(BIC), 349.0)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-00)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-00)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-00)
@@ -693,9 +693,9 @@ class TestGWRBinomial(unittest.TestCase):
AIC = get_AIC(rslt)
BIC = get_BIC(rslt)
self.assertAlmostEqual(np.floor(AICc), 277.0)
self.assertAlmostEqual(np.floor(AIC), 271.0)
self.assertAlmostEqual(np.floor(BIC), 358.0)
self.assertAlmostEquals(np.floor(AICc), 277.0)
self.assertAlmostEquals(np.floor(AIC), 271.0)
self.assertAlmostEquals(np.floor(BIC), 358.0)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-00)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-00)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-00)
@@ -756,9 +756,9 @@ class TestGWRBinomial(unittest.TestCase):
AIC = get_AIC(rslt)
BIC = get_BIC(rslt)
self.assertAlmostEqual(np.floor(AICc), 276.0)
self.assertAlmostEqual(np.floor(AIC), 272.0)
self.assertAlmostEqual(np.floor(BIC), 341.0)
self.assertAlmostEquals(np.floor(AICc), 276.0)
self.assertAlmostEquals(np.floor(AIC), 272.0)
self.assertAlmostEquals(np.floor(BIC), 341.0)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-00)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-00)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-00)
@@ -819,9 +819,9 @@ class TestGWRBinomial(unittest.TestCase):
AIC = get_AIC(rslt)
BIC = get_BIC(rslt)
self.assertAlmostEqual(np.floor(AICc), 276.0)
self.assertAlmostEqual(np.floor(AIC), 273.0)
self.assertAlmostEqual(np.floor(BIC), 331.0)
self.assertAlmostEquals(np.floor(AICc), 276.0)
self.assertAlmostEquals(np.floor(AIC), 273.0)
self.assertAlmostEquals(np.floor(BIC), 331.0)
np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-00)
np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-00)
np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-00)
View File
@@ -12,7 +12,7 @@ class TestKernels(unittest.TestCase):
y = np.arange(5,0, -1)
np.random.shuffle(x)
np.random.shuffle(y)
self.coords = np.array(list(zip(x, y)))
self.coords = np.array(zip(x, y))
self.fix_gauss_kern = np.array([
[ 1. , 0.38889556, 0.48567179, 0.48567179, 0.89483932],
[ 0.38889556, 1. , 0.89483932, 0.64118039, 0.48567179],
View File
@@ -13,7 +13,7 @@ import pysal
class TestSelBW(unittest.TestCase):
def setUp(self):
data = pysal.open(pysal.examples.get_path('GData_utm.csv'))
self.coords = list(zip(data.by_col('X'), data.by_col('Y')))
self.coords = zip(data.by_col('X'), data.by_col('Y'))
self.y = np.array(data.by_col('PctBach')).reshape((-1,1))
rural = np.array(data.by_col('PctRural')).reshape((-1,1))
pov = np.array(data.by_col('PctPov')).reshape((-1,1))
View File
@@ -2,8 +2,8 @@
Geographically weighted regression
"""
import numpy as np
from .gwr.base.gwr import GWR as PySAL_GWR
from .gwr.base.sel_bw import Sel_BW
from gwr.base.gwr import GWR as PySAL_GWR
from gwr.base.sel_bw import Sel_BW
import json
from crankshaft.analysis_data_provider import AnalysisDataProvider
import plpy
@@ -48,7 +48,7 @@ class GWR:
# x, y are centroids of input geometries
x = np.array(query_result[0]['x'], dtype=np.float)
y = np.array(query_result[0]['y'], dtype=np.float)
coords = list(zip(x, y))
coords = zip(x, y)
# extract dependent variable
Y = np.array(query_result[0]['dep_var'], dtype=np.float).reshape((-1, 1))
@@ -88,7 +88,7 @@ class GWR:
bw = np.repeat(float(bw), n)
# create lists of json objs for model outputs
for idx in range(n):
for idx in xrange(n):
coeffs.append(json.dumps({var: model.params[idx, k]
for k, var in enumerate(ind_vars)}))
stand_errs.append(json.dumps({var: model.bse[idx, k]
@@ -99,8 +99,8 @@ class GWR:
json.dumps({var: filtered_t[idx, k]
for k, var in enumerate(ind_vars)}))
return list(zip(coeffs, stand_errs, t_vals, filtered_t_vals,
predicted, residuals, r_squared, bw, rowid))
return zip(coeffs, stand_errs, t_vals, filtered_t_vals,
predicted, residuals, r_squared, bw, rowid)
def gwr_predict(self, subquery, dep_var, ind_vars,
bw=None, fixed=False, kernel='bisquare',
@@ -133,7 +133,7 @@ class GWR:
x = np.array(query_result[0]['x'], dtype=np.float)
y = np.array(query_result[0]['y'], dtype=np.float)
coords = np.array(list(zip(x, y)), dtype=np.float)
coords = np.array(zip(x, y), dtype=np.float)
# extract dependent variable
Y = np.array(query_result[0]['dep_var']).reshape((-1, 1))
@@ -190,7 +190,7 @@ class GWR:
predicted = model.predy.flatten()
m = len(model.predy)
for idx in range(m):
for idx in xrange(m):
coeffs.append(json.dumps({var: model.params[idx, k]
for k, var in enumerate(ind_vars)}))
stand_errs.append(json.dumps({var: model.bse[idx, k]
@@ -198,5 +198,5 @@ class GWR:
t_vals.append(json.dumps({var: model.tvalues[idx, k]
for k, var in enumerate(ind_vars)}))
return list(zip(coeffs, stand_errs, t_vals,
r_squared, predicted, rowid[test]))
return zip(coeffs, stand_errs, t_vals,
r_squared, predicted, rowid[test])
View File
@@ -1,2 +1,2 @@
"""Import all functions from for segmentation"""
from .segmentation import *
from segmentation import *
View File
@@ -2,11 +2,14 @@
Segmentation creation and prediction
"""
import pickle
import plpy
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from crankshaft.analysis_data_provider import AnalysisDataProvider
from crankshaft import model_storage
# NOTE: added optional param here
@@ -47,10 +50,11 @@ class Segmentation(object):
model_parameters, 0.2)
prediction = model.predict(target_features)
accuracy_array = [accuracy] * prediction.shape[0]
return list(zip(target_ids, prediction, accuracy_array))
return zip(target_ids, prediction, accuracy_array)
def create_and_predict_segment(self, query, variable, feature_columns,
target_query, model_params,
model_name=None,
id_col='cartodb_id'):
"""
generate a segment with machine learning
@@ -70,16 +74,24 @@ class Segmentation(object):
(target, features, target_mean,
feature_means) = self.clean_data(query, variable, feature_columns)
model, accuracy = train_model(target, features, model_params, 0.2)
model_storage.create_model_table()
# find model if it exists and is specified
if model_name is not None:
model = model_storage.get_model(model_name)
if locals().get('model') is None:
model, accuracy = train_model(target, features, model_params, 0.2)
result = self.predict_segment(model, feature_columns, target_query,
feature_means)
accuracy_array = [accuracy] * result.shape[0]
rowid = self.data_provider.get_segmentation_data(params)
'''
rowid = [{'ids': [2.9, 4.9, 4, 5, 6]}]
'''
return list(zip(rowid[0]['ids'], result, accuracy_array))
# store the model for later use
model_storage.set_model(model, model_name, feature_columns)
return zip(rowid[0]['ids'], result, accuracy_array)
def predict_segment(self, model, feature_columns, target_query,
feature_means):
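The changes above thread an optional model_name through create_and_predict_segment so a previously stored model can be reused. A hedged sketch of the intended lookup-then-train flow follows; the wrapper function is illustrative only, while get_model, set_model and train_model are the names used in the diff:

```python
def get_or_train_model(model_name, target, features, feature_columns,
                       model_params, model_storage, train_model):
    """Illustrative wrapper (not part of the module): reuse a stored model
    when a name is given, otherwise train one, then persist it.
    accuracy is only available when a new model is trained."""
    model, accuracy = None, None
    if model_name is not None:
        model = model_storage.get_model(model_name)
    if model is None:
        model, accuracy = train_model(target, features, model_params, 0.2)
    model_storage.set_model(model, model_name, feature_columns)
    return model, accuracy
```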
View File
@@ -1,2 +1,2 @@
"""Import all functions from clustering libraries."""
from .markov import *
from markov import *
View File
@@ -91,7 +91,7 @@ class Markov(object):
trend_up, trend_down, trend, volatility = get_prob_stats(prob_dist, sp_markov_result.classes[:, -1])
# output the results
return list(zip(trend, trend_up, trend_down, volatility, weights.id_order))
return zip(trend, trend_up, trend_down, volatility, weights.id_order)
@@ -140,7 +140,7 @@ def rebin_data(time_data, num_time_per_bin):
return np.array(
[time_data[:, num_time_per_bin * i:num_time_per_bin * (i+1)].mean(axis=1)
for i in range(int(n_max))]).T
for i in range(n_max)]).T
def get_prob_dist(transition_matrix, lag_indices, unit_indices):
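The rebin_data change above only touches the int() cast; the function itself averages every num_time_per_bin consecutive time columns. A small worked example with made-up data (not from the tests):

```python
import numpy as np

# Two units observed at four time steps, rebinned two columns per bin.
time_data = np.array([[1., 3., 5., 7.],
                      [2., 4., 6., 8.]])
num_time_per_bin = 2
n_max = time_data.shape[1] / float(num_time_per_bin)
rebinned = np.array(
    [time_data[:, num_time_per_bin * i:num_time_per_bin * (i + 1)].mean(axis=1)
     for i in range(int(n_max))]).T
# rebinned -> [[2., 6.],
#              [3., 7.]]
```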
View File
@@ -1,5 +1,5 @@
joblib==0.9.4
numpy==1.11.0
scipy==0.17.0
joblib==0.8.3
numpy==1.6.1
scipy==0.14.0
pysal==1.14.3
scikit-learn==0.17.0
scikit-learn==0.14.1
View File
@@ -10,7 +10,7 @@ from setuptools import setup, find_packages
setup(
name='crankshaft',
version='0.9.0',
version='0.0.0',
description='CartoDB Spatial Analysis Python Library',
@@ -26,7 +26,7 @@ setup(
'Intended Audience :: Mapping comunity',
'Topic :: Maps :: Mapping Tools',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python',
'Programming Language :: Python :: 2.7',
],
keywords='maps mapping tools spatial analysis geostatistics',
@@ -41,7 +41,7 @@ setup(
# The choice of component versions is dictated by what's
# provisioned in the production servers.
# IMPORTANT NOTE: please don't change this line. Instead issue a ticket to systems for evaluation.
install_requires=['joblib==0.9.4', 'numpy==1.11.0', 'scipy==0.17.0', 'pysal==1.14.3', 'scikit-learn==0.17.0'],
install_requires=['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.14.3', 'scikit-learn==0.14.1', 'petname==2.2'],
requires=['pysal', 'numpy', 'sklearn'],
View File
@@ -0,0 +1,49 @@
"""
CartoDB Spatial Analysis Python Library
See:
https://github.com/CartoDB/crankshaft
"""
from setuptools import setup, find_packages
setup(
name='crankshaft',
version='0.0.0',
description='CartoDB Spatial Analysis Python Library',
url='https://github.com/CartoDB/crankshaft',
author='Data Services Team - CartoDB',
author_email='dataservices@cartodb.com',
license='MIT',
classifiers=[
'Development Status :: 3 - Alpha',
'Intended Audience :: Mapping comunity',
'Topic :: Maps :: Mapping Tools',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 2.7',
],
keywords='maps mapping tools spatial analysis geostatistics',
packages=find_packages(exclude=['contrib', 'docs', 'tests']),
extras_require={
'dev': ['unittest'],
'test': ['unittest', 'nose', 'mock'],
},
# The choice of component versions is dictated by what's
# provisioned in the production servers.
# IMPORTANT NOTE: please don't change this line. Instead issue a ticket to systems for evaluation.
install_requires=['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.14.3', 'scikit-learn==0.14.1', 'petname==2.2'],
requires=['pysal', 'numpy', 'sklearn'],
test_suite='test'
)
View File
@@ -0,0 +1,6 @@
{
"production_col": [10, 10, 10],
"capacity_col": [0.09, 0.31],
"marginal_col": [5, 5],
"pairwise": [[1, 2, 3], [3, 2, 1]]
}
View File
@@ -72,7 +72,7 @@ class MoranTest(unittest.TestCase):
result = moran.local_stat('subquery', 'value',
'knn', 5, 99, 'the_geom', 'cartodb_id')
result = [(row[0], row[6]) for row in result]
zipped_values = list(zip(result, self.moran_data))
zipped_values = zip(result, self.moran_data)
for ([res_quad, res_val], [exp_val, exp_quad]) in zipped_values:
self.assertAlmostEqual(res_val, exp_val)
@@ -91,7 +91,7 @@ class MoranTest(unittest.TestCase):
'knn', 5, 99, 'the_geom', 'cartodb_id')
result = [(row[0], row[6]) for row in result]
zipped_values = list(zip(result, self.moran_data))
zipped_values = zip(result, self.moran_data)
for ([res_quad, res_val], [exp_val, exp_quad]) in zipped_values:
self.assertAlmostEqual(res_val, exp_val)
View File
@@ -86,7 +86,7 @@ class GWRTest(unittest.TestCase):
# unpack response
coeffs, stand_errs, t_vals, t_vals_filtered, predicteds, \
residuals, r_squareds, bws, rowids = list(zip(*gwr_resp))
residuals, r_squareds, bws, rowids = zip(*gwr_resp)
# prepare for comparision
coeff_known_pctpov = self.knowns['est_pctpov']
@@ -98,13 +98,13 @@ class GWRTest(unittest.TestCase):
# test pctpov coefficient estimates
for idx, val in enumerate(coeff_known_pctpov):
resp_idx = rowids.index(ids[idx])
self.assertAlmostEqual(val,
self.assertAlmostEquals(val,
json.loads(coeffs[resp_idx])['pctpov'],
places=4)
# test pctrural tvals
for idx, val in enumerate(tval_known_pctblack):
resp_idx = rowids.index(ids[idx])
self.assertAlmostEqual(val,
self.assertAlmostEquals(val,
json.loads(t_vals[resp_idx])['pctrural'],
places=4)
@@ -119,7 +119,7 @@ class GWRTest(unittest.TestCase):
# unpack response
coeffs, stand_errs, t_vals, \
r_squareds, predicteds, rowid = list(zip(*gwr_resp))
r_squareds, predicteds, rowid = zip(*gwr_resp)
threshold = 0.01
for i, idx in enumerate(self.idx_ids_of_unknowns):
View File
@@ -66,7 +66,7 @@ class SegmentationTest(unittest.TestCase):
test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan])
result = replace_nan_with_mean(test_array, means=None)[0]
expectation = np.array([1.2, 2.2, 3.2, 2.2, 2.2], dtype=float)
self.assertEqual(sorted(result), sorted(expectation))
self.assertItemsEqual(result, expectation)
def test_create_and_predict_segment(self):
"""test segmentation.test_create_and_predict"""
@@ -118,7 +118,7 @@ class SegmentationTest(unittest.TestCase):
model_parameters,
id_col='cartodb_id')
results = [(row[1], row[2]) for row in result]
zipped_values = list(zip(results, self.result_seg))
zipped_values = zip(results, self.result_seg)
pre_res = [r[0] for r in self.true_result]
acc_res = [r[1] for r in self.result_seg]
View File
@@ -98,7 +98,7 @@ class SpaceTimeTests(unittest.TestCase):
self.assertTrue(result is not None)
result = [(row[0], row[1], row[2], row[3], row[4]) for row in result]
print(result[0])
print result[0]
expected = self.markov_data
for ([res_trend, res_up, res_down, res_vol, res_id],
[exp_trend, exp_up, exp_down, exp_vol, exp_id]
View File
@@ -0,0 +1,15 @@
from test.helper import plpy, fixture_file
from crankshaft.analysis_data_provider import AnalysisDataProvider
import json
import crankshaft
class RawDataProvider(AnalysisDataProvider):
def __init__(self, fixturedata):
self.your_algo_data = fixturedata
def get_moran(self, params):
"""
Replace this function name with the one used in your algorithm,
and make sure to use the same function signature that is written
for this algo in analysis_data_provider.py
"""
return self.your_algo_data
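The fixture provider above shows the pattern for unit tests: subclass AnalysisDataProvider and override the single data-access method your algorithm calls. A hypothetical variant for a Markov test might look like this (the method name get_markov is an assumption about analysis_data_provider.py, as the docstring above directs):

```python
from crankshaft.analysis_data_provider import AnalysisDataProvider

class MarkovFixtureProvider(AnalysisDataProvider):
    """Hypothetical test-only provider: returns canned rows instead of
    querying the database."""
    def __init__(self, fixturedata):
        self.fixture = fixturedata

    def get_markov(self, params):
        # Must keep the same signature as the method it overrides.
        return self.fixture
```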
View File
@@ -0,0 +1,76 @@
"""
Based on the Weiszfeld algorithm:
https://en.wikipedia.org/wiki/Geometric_median
"""
# import plpy
import numpy as np
from numpy.linalg import norm
def median_center(tablename, geom_col, num_iters=50, tolerance=0.001):
query = '''
SELECT array_agg(ST_X({geom_col})) As x_coords,
array_agg(ST_Y({geom_col})) As y_coords
FROM {tablename}
'''.format(geom_col=geom_col, tablename=tablename)
try:
resp = plpy.execute(query)
data = np.vstack((resp['x_coords'][0],
resp['y_coords'][0])).T
plpy.notice('coords: %s' % str(coords))
except Exception, err:
# plpy.error('Analysis failed: %s' % err)
print('No plpy')
data = np.array([[1.2 * np.random.random() + 10.,
1.1 * (np.random.random() - 1.) + 3.]
for i in range(1, 100)])
# initialize 'median center' to be the mean
coords_center_temp = data.mean(axis=0)
# plpy.notice('temp_center: %s' % str(coords_center_temp))
print('temp_center: %s' % str(coords_center_temp))
for i in range(0, num_iters):
old_coords_center = coords_center_temp.copy()
denom = denominator(coords_center_temp, data)
coords_center_temp = np.sum([data[j] * numerator(coords_center_temp,
data[j])
for j in range(len(data))], axis=0)
coords_center_temp = coords_center_temp / denom
print("Pass #%d" % i)
print("max, min of data: %0.4f, %0.4f" % (data.max(), data.min()))
print('temp_center: %s' % str(coords_center_temp))
print("Change in center: %0.4f" % np.linalg.norm(old_coords_center -
coords_center_temp))
print("Center coords: %s" % str(coords_center_temp))
print("Objective Function: %0.4f" % obj_func(coords_center_temp, data))
return coords_center_temp
def obj_func(center_coords, data):
"""
"""
return np.linalg.norm(center_coords - data)
def numerator(center_coords, data_i):
"""
"""
return np.reciprocal(np.linalg.norm(center_coords - data_i))
def denominator(center_coords, data):
"""
"""
return np.reciprocal(np.linalg.norm(data - center_coords))
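The new median_center prototype above follows the Weiszfeld iteration, though as written it still references an undefined coords and its denominator helper takes the norm of the whole residual matrix rather than summing per-point reciprocals. A self-contained sketch of the textbook iteration, assuming plain NumPy and no plpy:

```python
import numpy as np

def weiszfeld_median(points, num_iters=50, tolerance=1e-3):
    """Approximate the geometric median of an (n, 2) array of points
    using the Weiszfeld iteration."""
    center = points.mean(axis=0)  # start from the centroid, as above
    for _ in range(num_iters):
        dists = np.linalg.norm(points - center, axis=1)
        dists = np.where(dists < 1e-12, 1e-12, dists)  # avoid division by zero
        weights = 1.0 / dists
        new_center = (points * weights[:, None]).sum(axis=0) / weights.sum()
        if np.linalg.norm(new_center - center) < tolerance:
            return new_center
        center = new_center
    return center

# e.g. weiszfeld_median(np.random.rand(100, 2))
```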
View File
@@ -0,0 +1 @@
from core import set_model, get_model, create_model_table
View File
@@ -0,0 +1,86 @@
import time
import plpy
import pickle
from petname import generate
def create_model_table():
q = '''
create table if not exists model_storage(
description text,
name text unique,
model bytea,
feature_names text[],
date_created timestamptz,
id serial primary key);
'''
plpy.notice(q)
plan = plpy.prepare(q)
resp = plpy.execute(plan)
plpy.notice('Model table successfully created')
plpy.notice(str(resp))
def get_model(model_name):
"""retrieve model if it exists"""
try:
plan = plpy.prepare('''
SELECT model FROM model_storage
WHERE name = $1;
''', ['text', ])
model_encoded = plpy.execute(plan, [model_name, ])
if len(model_encoded) == 1:
model = pickle.loads(
model_encoded[0]['model']
)
plpy.notice('Model successfully loaded')
else:
plpy.notice('Model not found, or too many models '
'({})'.format(len(model_encoded)))
model = None
except plpy.SPIError as err:
plpy.error('ERROR: {}'.format(err))
return model
def set_model(model, model_name, feature_names):
"""stores the model in the table model_storage"""
if model_name is None:
model_name = generate(words=2, separator='_', letters=8)
existing_names = plpy.execute('''
SELECT array_agg(name) as name
FROM model_storage
''')
plpy.notice('nrows: {}'.format(existing_names.nrows()))
plpy.notice('MODEL NAME: {}'.format(model_name))
plpy.notice('LEN of ms: {}'.format(len(existing_names)))
plpy.notice('existing_names: {}'.format(str(existing_names)))
plpy.notice('existing_names: {}'.format(str(existing_names[0]['name'])))
plpy.notice('type existing_names: {}'.format(type(existing_names[0]['name'])))
if existing_names[0]['name'] is not None:
while model_name in existing_names[0]['name']:
model_name = generate(words=2, separator='_', letters=10)
plpy.notice(model_name)
# store model
try:
plan = plpy.prepare('''
INSERT INTO model_storage(description, name, model, feature_names, date_created)
VALUES (
$1,
$2,
$3,
$4::text[],
to_timestamp($5));
''', ['text', 'text', 'bytea', 'text', 'numeric'])
plpy.notice('{%s}' % ','.join(feature_names))
plpy.notice(feature_names)
plpy.execute(
plan,
[' '.join(m.strip() for m in model.__repr__().split('\n')),
model_name,
pickle.dumps(model),
'{%s}' % ','.join(feature_names),
time.time()]
)
plpy.notice('model successfully stored as {}'.format(model_name))
except plpy.SPIError as err:
plpy.notice('ERROR: {}\nt: {}'.format(err, time.time()))
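set_model and get_model above round-trip the estimator through pickle into a bytea column. Outside of PL/Python, the serialization step they depend on looks like the following sketch (the estimator class is chosen to match the segmentation module; the table plumbing is omitted):

```python
import pickle
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor(n_estimators=10)
payload = pickle.dumps(model)      # bytes written to the model bytea column
restored = pickle.loads(payload)   # what get_model() reconstructs from a row
assert isinstance(restored, GradientBoostingRegressor)
```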
Some files were not shown because too many files have changed in this diff.