Compare commits

..

15 Commits

Author SHA1 Message Date
Andy Eschbacher  daba2f9597  release 0.9.5 [ci skip]  2018-04-09 15:22:35 -04:00
Andy Eschbacher  8f28f41060  corrects incorrect variable name  2018-04-09 15:16:52 -04:00
Andy Eschbacher  7509afa5a6  release feature name validation  2018-04-09 14:14:39 -04:00
Andy Eschbacher  a28c68502c  adds feature name validation [ci skip]  2018-04-09 14:09:31 -04:00
Andy Eschbacher  5b4443ca88  new faux release  2018-03-22 13:14:20 -04:00
Andy Eschbacher  2048db33fc  avoids accuracy calculation without model being defined  2018-03-22 13:12:17 -04:00
Andy Eschbacher  99e78800b3  adds latest release file  2018-03-22 11:46:42 -04:00
Andy Eschbacher  800648a710  adds upgrade path for 0.9.2 faux release  2018-03-22 11:08:46 -04:00
Andy Eschbacher  91ee6ecc48  new faux release  2018-03-22 11:02:45 -04:00
Andy Eschbacher  9a5ab17240  replaces petname with uuid for now  2018-03-22 11:01:39 -04:00
Andy Eschbacher  65be9befb1  faux release for staging testing  2018-03-22 10:19:29 -04:00
Andy Eschbacher  37e6b4a228  fixes release path copy error [ci skip]  2018-03-20 11:59:15 -04:00
Andy Eschbacher  766bfed9be  dummy version bump  2018-03-19 13:30:37 -04:00
Andy Eschbacher  e8a601e945  adds model module [ci skip]  2018-03-16 16:45:39 -04:00
Andy Eschbacher  c2be340c07  prototype of model writing  2018-03-16 16:21:00 -04:00
423 changed files with 85288 additions and 246 deletions

.gitignore vendored
View File

@@ -2,3 +2,4 @@ envs/
*.pyc
.DS_Store
.idea/
.*.sw[nop]

View File

@@ -1,40 +1,60 @@
dist: xenial
language: c
dist: precise
sudo: required
env:
global:
- PAGER=cat
- PGUSER=postgres
- PGDATABASE=postgres
- PGOPTIONS='-c client_min_messages=NOTICE'
- PGPORT=5432
- POSTGIS_VERSION="2.5"
matrix:
- POSTGRESQL_VERSION="10"
- POSTGRESQL_VERSION="11"
before_install:
- ./check-up-to-date-with-master.sh
- sudo apt-get -y install python-pip
- sudo service postgresql stop;
- sudo apt-get remove postgresql* -y
- sudo apt-get install -y --allow-unauthenticated --no-install-recommends --no-install-suggests postgresql-$POSTGRESQL_VERSION postgresql-client-$POSTGRESQL_VERSION postgresql-server-dev-$POSTGRESQL_VERSION postgresql-common
- sudo apt-get install -y --allow-unauthenticated postgresql-$POSTGRESQL_VERSION-postgis-$POSTGIS_VERSION postgresql-$POSTGRESQL_VERSION-postgis-$POSTGIS_VERSION-scripts postgis postgresql-plpython-$POSTGRESQL_VERSION
- sudo pg_dropcluster --stop $POSTGRESQL_VERSION main
- sudo rm -rf /etc/postgresql/$POSTGRESQL_VERSION /var/lib/postgresql/$POSTGRESQL_VERSION
- sudo pg_createcluster -u postgres $POSTGRESQL_VERSION main -- -A trust
- sudo /etc/init.d/postgresql start $POSTGRESQL_VERSION || sudo journalctl -xe
- sudo apt-get -y install python-software-properties
- sudo add-apt-repository -y ppa:cartodb/sci
- sudo add-apt-repository -y ppa:cartodb/postgresql-9.5
- sudo add-apt-repository -y ppa:cartodb/gis
- sudo add-apt-repository -y ppa:cartodb/gis-testing
- sudo apt-get update
- sudo apt-get -y install python-pip python-software-properties python-joblib python-nose
- sudo apt-get -y install python-joblib=0.8.3-1-cdb1
- sudo apt-get -y install python-numpy=1:1.6.1-6ubuntu1
- sudo apt-get -y install python-scipy=0.14.0-2-cdb6
- sudo apt-get -y --no-install-recommends install python-sklearn-lib=0.14.1-3-cdb2
- sudo apt-get -y --no-install-recommends install python-sklearn=0.14.1-3-cdb2
- sudo apt-get -y --no-install-recommends install python-scikits-learn=0.14.1-3-cdb2
# Force installation of libgeos-3.5.0 (presumably needed because of the existing version of postgis)
- sudo apt-get -y install libgeos-3.5.0=3.5.0-1cdb2
# Install postgres db and build deps
- sudo /etc/init.d/postgresql stop # stop travis default instance
- sudo apt-get -y remove --purge postgresql-9.1
- sudo apt-get -y remove --purge postgresql-9.2
- sudo apt-get -y remove --purge postgresql-9.3
- sudo apt-get -y remove --purge postgresql-9.4
- sudo apt-get -y remove --purge postgresql-9.5
- sudo rm -rf /var/lib/postgresql/
- sudo rm -rf /var/log/postgresql/
- sudo rm -rf /etc/postgresql/
- sudo apt-get -y remove --purge postgis-2.2
- sudo apt-get -y autoremove
- sudo apt-get -y install postgresql-9.5=9.5.2-3cdb3
- sudo apt-get -y install postgresql-server-dev-9.5=9.5.2-3cdb3
- sudo apt-get -y install postgresql-plpython-9.5=9.5.2-3cdb3
- sudo apt-get -y install postgresql-9.5-postgis-scripts=2.2.2.0-cdb2
- sudo apt-get -y install postgresql-9.5-postgis-2.2=2.2.2.0-cdb2
# configure it to accept local connections from postgres
- echo -e "# TYPE DATABASE USER ADDRESS METHOD \nlocal all postgres trust\nlocal all all trust\nhost all all 127.0.0.1/32 trust" \
| sudo tee /etc/postgresql/9.5/main/pg_hba.conf
- sudo /etc/init.d/postgresql restart 9.5
install:
- sudo make install
script:
- make test
- make test || { cat src/pg/test/regression.diffs; false; }
- ./check-compatibility.sh
after_failure:
- pg_lsclusters
- cat src/pg/test/regression.diffs

View File

@@ -1,9 +1,3 @@
0.8.2 (2019-02-07)
------------------
* Update dependencies to match what is being used in production.
* Update travis to xenial, PG10 and 11, and postgis 2.5
* Compatibility with PG11
0.8.1 (2018-03-12)
------------------
* Adds improperly added version files

View File

@@ -1,20 +0,0 @@
{
"name": "crankshaft",
"current_version": {
"requires": {
"postgres": ">=9.5.0",
"postgis": ">=2.2.0.0",
"python": ">=2.7.0",
"joblib": "0.8.3",
"numpy": "1.6.1",
"scipy": "0.14.0",
"pysal": "1.14.3",
"scikit-learn": "0.14.1"
},
"works_with": {
}
},
"exceptional_versions": {
}
}

View File

@@ -27,7 +27,7 @@ psql -c "SELECT * FROM pg_available_extension_versions WHERE name LIKE 'cranksha
psql $DBNAME <<'EOF'
-- Install dependencies
CREATE EXTENSION plpythonu;
CREATE EXTENSION postgis;
CREATE EXTENSION postgis VERSION '2.2.2';
-- Create role publicuser if it does not exist
DO
@@ -48,49 +48,26 @@ CREATE EXTENSION crankshaft;
\dx
EOF
# Check PG version
PG_VERSION=`psql -q -t -c "SELECT current_setting('server_version_num')"`
# Save public function signatures
if [[ "$PG_VERSION" -lt 110000 ]]; then
psql $DBNAME -c "
CREATE TABLE release_function_signatures AS
SELECT
p.proname as name,
pg_catalog.pg_get_function_result(p.oid) as result_type,
pg_catalog.pg_get_function_arguments(p.oid) as arguments,
CASE
WHEN p.proisagg THEN 'agg'
WHEN p.proiswindow THEN 'window'
WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
ELSE 'normal'
END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
n.nspname = 'cdb_crankshaft'
AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;"
else
psql $DBNAME -c "
CREATE TABLE release_function_signatures AS
SELECT
p.proname as name,
pg_catalog.pg_get_function_result(p.oid) as result_type,
pg_catalog.pg_get_function_arguments(p.oid) as arguments,
CASE WHEN p.prokind = 'a' THEN 'agg'
WHEN p.prokind = 'w' THEN 'window'
WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
ELSE 'normal'
END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
n.nspname = 'cdb_crankshaft'
AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;"
fi
psql $DBNAME <<'EOF'
CREATE TABLE release_function_signatures AS
SELECT
p.proname as name,
pg_catalog.pg_get_function_result(p.oid) as result_type,
pg_catalog.pg_get_function_arguments(p.oid) as arguments,
CASE
WHEN p.proisagg THEN 'agg'
WHEN p.proiswindow THEN 'window'
WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
ELSE 'normal'
END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
n.nspname = 'cdb_crankshaft'
AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;
EOF
# Deploy current dev branch
make clean-dev || die "Could not clean dev files"
@@ -99,42 +76,26 @@ sudo make install || die "Could not deploy current dev branch"
# Check it can be upgraded
psql $DBNAME -c "ALTER EXTENSION crankshaft update to 'dev';" || die "Cannot upgrade to dev version"
if [[ $PG_VERSION -lt 110000 ]]; then
psql $DBNAME -c "
CREATE TABLE dev_function_signatures AS
SELECT p.proname as name,
pg_catalog.pg_get_function_result(p.oid) as result_type,
pg_catalog.pg_get_function_arguments(p.oid) as arguments,
CASE WHEN p.proisagg THEN 'agg'
WHEN p.proiswindow THEN 'window'
WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
ELSE 'normal'
END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
n.nspname = 'cdb_crankshaft'
AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;"
else
psql $DBNAME -c "
CREATE TABLE dev_function_signatures AS
SELECT p.proname as name,
pg_catalog.pg_get_function_result(p.oid) as result_type,
pg_catalog.pg_get_function_arguments(p.oid) as arguments,
CASE WHEN p.prokind = 'a' THEN 'agg'
WHEN p.prokind = 'w' THEN 'window'
WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
ELSE 'normal'
END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
n.nspname = 'cdb_crankshaft'
AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;"
fi
# Check against saved public function signatures
psql $DBNAME <<'EOF'
CREATE TABLE dev_function_signatures AS
SELECT
p.proname as name,
pg_catalog.pg_get_function_result(p.oid) as result_type,
pg_catalog.pg_get_function_arguments(p.oid) as arguments,
CASE
WHEN p.proisagg THEN 'agg'
WHEN p.proiswindow THEN 'window'
WHEN p.prorettype = 'pg_catalog.trigger'::pg_catalog.regtype THEN 'trigger'
ELSE 'normal'
END as type
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE
n.nspname = 'cdb_crankshaft'
AND p.proname LIKE 'cdb_%'
ORDER BY 1, 2, 4;
EOF
echo "Functions in development not in latest release (ok):"
psql $DBNAME -c "SELECT * FROM dev_function_signatures EXCEPT SELECT * FROM release_function_signatures;"
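For local debugging, the same breaking-change check can be scripted outside psql; a minimal sketch, assuming psycopg2 is available (it is not a dependency of this repo) and that the two signature tables above have already been created — the database name is a placeholder:

import psycopg2  # assumption: installed in the test environment, not part of the repo

def signatures_missing_in_dev(dbname):
    """Rows present in the release signatures but absent from dev (potential breaking changes)."""
    conn = psycopg2.connect(dbname=dbname)
    try:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT * FROM release_function_signatures
                EXCEPT
                SELECT * FROM dev_function_signatures
                ORDER BY 1, 2;
            """)
            return cur.fetchall()
    finally:
        conn.close()

# any returned row is a function that was dropped or changed signature since the release
# print(signatures_missing_in_dev('crankshaft_compat_test'))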

View File

@@ -4,7 +4,7 @@
-- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$
SELECT '0.8.2'::text;
SELECT '0.9.0'::text;
$$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE;
-- Internal identifier of the installed extension instance
@@ -35,16 +35,25 @@ CREATE OR REPLACE FUNCTION
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist
DO $$ BEGIN
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
SFUNC = CDB_PyAggS,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{}"
);
EXCEPTION
WHEN duplicate_function THEN NULL;
END $$;
DO $$
BEGIN
IF NOT EXISTS (
SELECT *
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE n.nspname = 'cdb_crankshaft'
AND p.proname = 'cdb_pyagg'
AND p.proisagg)
THEN
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
SFUNC = CDB_PyAggS,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{}"
);
END IF;
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment(
@@ -89,6 +98,7 @@ CREATE OR REPLACE FUNCTION
query TEXT,
variable_name TEXT,
target_table TEXT,
model_name text DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -105,24 +115,59 @@ AS $$
'learning_rate': learning_rate,
'min_samples_leaf': min_samples_leaf
}
feature_cols = set(plpy.execute('''
all_cols = list(plpy.execute('''
select * from ({query}) as _w limit 0
'''.format(query=query)).colnames()) - set([variable_name, 'cartodb_id', ])
'''.format(query=query)).colnames())
feature_cols = [a for a in all_cols
if a not in [variable_name, 'cartodb_id', ]]
return seg.create_and_predict_segment(
query,
variable_name,
feature_cols,
target_table,
model_params
model_params,
model_name=model_name
)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_RetrieveModelParams(
model_name text,
param_name text
)
RETURNS TABLE(param numeric, feature_name text) AS $$
import pickle
from collections import Iterable
plan = plpy.prepare('''
SELECT model, feature_names FROM model_storage
WHERE name = $1;
''', ['text', ])
try:
model_encoded = plpy.execute(plan, [model_name, ])
except plpy.SPIError as err:
plpy.error('ERROR: {}'.format(err))
plpy.notice(model_encoded[0]['feature_names'])
model = pickle.loads(
model_encoded[0]['model']
)
res = getattr(model, param_name)
if not isinstance(res, Iterable):
raise Exception('Cannot return `{}` as a table'.format(param_name))
return zip(res, model_encoded[0]['feature_names'])
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment(
query TEXT,
variable TEXT,
feature_columns TEXT[],
target_query TEXT,
model_name TEXT DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -144,7 +189,8 @@ AS $$
variable,
feature_columns,
target_query,
model_params
model_params,
model_name=model_name
)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_Gravity(
@@ -1104,19 +1150,27 @@ BEGIN
END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist
DO $$ BEGIN
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{0.0,0.0,0.0}"
);
EXCEPTION
WHEN duplicate_function THEN NULL;
END $$;
DO $$
BEGIN
IF NOT EXISTS (
SELECT *
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE n.nspname = 'cdb_crankshaft'
AND p.proname = 'cdb_weightedmean'
AND p.proisagg)
THEN
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{0.0,0.0,0.0}"
);
END IF;
END
$$ LANGUAGE plpgsql;
-- Spatial Markov
-- input table format:

View File

@@ -4,7 +4,7 @@
-- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$
SELECT '0.8.2'::text;
SELECT '0.9.1'::text;
$$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE;
-- Internal identifier of the installed extension instance
@@ -35,16 +35,25 @@ CREATE OR REPLACE FUNCTION
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist
DO $$ BEGIN
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
SFUNC = CDB_PyAggS,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{}"
);
EXCEPTION
WHEN duplicate_function THEN NULL;
END $$;
DO $$
BEGIN
IF NOT EXISTS (
SELECT *
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE n.nspname = 'cdb_crankshaft'
AND p.proname = 'cdb_pyagg'
AND p.proisagg)
THEN
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
SFUNC = CDB_PyAggS,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{}"
);
END IF;
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment(
@@ -89,6 +98,7 @@ CREATE OR REPLACE FUNCTION
query TEXT,
variable_name TEXT,
target_table TEXT,
model_name text DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -105,24 +115,59 @@ AS $$
'learning_rate': learning_rate,
'min_samples_leaf': min_samples_leaf
}
feature_cols = set(plpy.execute('''
all_cols = list(plpy.execute('''
select * from ({query}) as _w limit 0
'''.format(query=query)).colnames()) - set([variable_name, 'cartodb_id', ])
'''.format(query=query)).colnames())
feature_cols = [a for a in all_cols
if a not in [variable_name, 'cartodb_id', ]]
return seg.create_and_predict_segment(
query,
variable_name,
feature_cols,
target_table,
model_params
model_params,
model_name=model_name
)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_RetrieveModelParams(
model_name text,
param_name text
)
RETURNS TABLE(param numeric, feature_name text) AS $$
import pickle
from collections import Iterable
plan = plpy.prepare('''
SELECT model, feature_names FROM model_storage
WHERE name = $1;
''', ['text', ])
try:
model_encoded = plpy.execute(plan, [model_name, ])
except plpy.SPIError as err:
plpy.error('ERROR: {}'.format(err))
plpy.notice(model_encoded[0]['feature_names'])
model = pickle.loads(
model_encoded[0]['model']
)
res = getattr(model, param_name)
if not isinstance(res, Iterable):
raise Exception('Cannot return `{}` as a table'.format(param_name))
return zip(res, model_encoded[0]['feature_names'])
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment(
query TEXT,
variable TEXT,
feature_columns TEXT[],
target_query TEXT,
model_name TEXT DEFAULT NULL,
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
@@ -144,7 +189,8 @@ AS $$
variable,
feature_columns,
target_query,
model_params
model_params,
model_name=model_name
)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_Gravity(
@@ -1104,19 +1150,27 @@ BEGIN
END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist
DO $$ BEGIN
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{0.0,0.0,0.0}"
);
EXCEPTION
WHEN duplicate_function THEN NULL;
END $$;
DO $$
BEGIN
IF NOT EXISTS (
SELECT *
FROM pg_catalog.pg_proc p
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
WHERE n.nspname = 'cdb_crankshaft'
AND p.proname = 'cdb_weightedmean'
AND p.proisagg)
THEN
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{0.0,0.0,0.0}"
);
END IF;
END
$$ LANGUAGE plpgsql;
-- Spatial Markov
-- input table format:

10 file diffs suppressed because they are too large

View File

@@ -1,5 +1,5 @@
comment = 'CartoDB Spatial Analysis extension'
default_version = '0.8.2'
default_version = '0.9.5'
requires = 'plpythonu, postgis'
superuser = true
schema = cdb_crankshaft

View File

@@ -1,5 +0,0 @@
joblib==0.9.4
numpy==1.11.0
scipy==0.17.0
pysal==1.14.3
scikit-learn==0.17.0

View File

@@ -0,0 +1,76 @@
"""
Based on the Weiszfeld algorithm:
https://en.wikipedia.org/wiki/Geometric_median
"""
# import plpy
import numpy as np
from numpy.linalg import norm
def median_center(tablename, geom_col, num_iters=50, tolerance=0.001):
query = '''
SELECT array_agg(ST_X({geom_col})) As x_coords,
array_agg(ST_Y({geom_col})) As y_coords
FROM {tablename}
'''.format(geom_col=geom_col, tablename=tablename)
try:
resp = plpy.execute(query)
data = np.vstack((resp['x_coords'][0],
resp['y_coords'][0])).T
plpy.notice('coords: %s' % str(data))
except Exception, err:
# plpy.error('Analysis failed: %s' % err)
print('No plpy')
data = np.array([[1.2 * np.random.random() + 10.,
1.1 * (np.random.random() - 1.) + 3.]
for i in range(1, 100)])
# initialize 'median center' to be the mean
coords_center_temp = data.mean(axis=0)
# plpy.notice('temp_center: %s' % str(coords_center_temp))
print('temp_center: %s' % str(coords_center_temp))
for i in range(0, num_iters):
old_coords_center = coords_center_temp.copy()
denom = denominator(coords_center_temp, data)
coords_center_temp = np.sum([data[j] * numerator(coords_center_temp,
data[j])
for j in range(len(data))], axis=0)
coords_center_temp = coords_center_temp / denom
print("Pass #%d" % i)
print("max, min of data: %0.4f, %0.4f" % (data.max(), data.min()))
print('temp_center: %s' % str(coords_center_temp))
print("Change in center: %0.4f" % np.linalg.norm(old_coords_center -
coords_center_temp))
print("Center coords: %s" % str(coords_center_temp))
print("Objective Function: %0.4f" % obj_func(coords_center_temp, data))
return coords_center_temp
def obj_func(center_coords, data):
"""
"""
return np.linalg.norm(center_coords - data)
def numerator(center_coords, data_i):
"""
"""
return np.reciprocal(np.linalg.norm(center_coords - data_i))
def denominator(center_coords, data):
"""
"""
return np.reciprocal(np.linalg.norm(data - center_coords))
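The prototype above mixes plpy calls with a local fallback; for reference, a minimal self-contained sketch of the textbook Weiszfeld recurrence (plain numpy, no database access) looks like the following. This is not the implementation shipped above, just the iteration it is based on:

import numpy as np

def weiszfeld(points, num_iters=50, tolerance=1e-4):
    """points: (n, 2) array; returns an approximate geometric median."""
    center = points.mean(axis=0)
    for _ in range(num_iters):
        dists = np.linalg.norm(points - center, axis=1)
        dists = np.where(dists == 0, 1e-12, dists)  # guard against division by zero
        weights = 1.0 / dists
        new_center = (points * weights[:, None]).sum(axis=0) / weights.sum()
        if np.linalg.norm(new_center - center) < tolerance:
            return new_center
        center = new_center
    return center

# usage sketch:
# pts = np.random.rand(100, 2)
# print(weiszfeld(pts))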

View File

@@ -0,0 +1 @@
from core import set_model, get_model, create_model_table

View File

@@ -0,0 +1,86 @@
import time
import plpy
import pickle
from petname import generate
def create_model_table():
q = '''
create table if not exists model_storage(
description text,
name text unique,
model bytea,
feature_names text[],
date_created timestamptz,
id serial primary key);
'''
plpy.notice(q)
plan = plpy.prepare(q)
resp = plpy.execute(plan)
plpy.notice('Model table successfully created')
plpy.notice(str(resp))
def get_model(model_name):
"""retrieve model if it exists"""
try:
plan = plpy.prepare('''
SELECT model FROM model_storage
WHERE name = $1;
''', ['text', ])
model_encoded = plpy.execute(plan, [model_name, ])
if len(model_encoded) == 1:
model = pickle.loads(
model_encoded[0]['model']
)
plpy.notice('Model successfully loaded')
else:
plpy.notice('Model not found, or too many models '
'({})'.format(len(model_encoded)))
model = None
except plpy.SPIError as err:
plpy.error('ERROR: {}'.format(err))
return model
def set_model(model, model_name, feature_names):
"""stores the model in the table model_storage"""
if model_name is None:
model_name = generate(words=2, separator='_', letters=8)
existing_names = plpy.execute('''
SELECT array_agg(name) as name
FROM model_storage
''')
plpy.notice('nrows: {}'.format(existing_names.nrows()))
plpy.notice('MODEL NAME: {}'.format(model_name))
plpy.notice('LEN of ms: {}'.format(len(existing_names)))
plpy.notice('existing_names: {}'.format(str(existing_names)))
plpy.notice('existing_names: {}'.format(str(existing_names[0]['name'])))
plpy.notice('type existing_names: {}'.format(type(existing_names[0]['name'])))
if existing_names[0]['name'] is not None:
while model_name in existing_names[0]['name']:
model_name = generate(words=2, separator='_', letters=10)
plpy.notice(model_name)
# store model
try:
plan = plpy.prepare('''
INSERT INTO model_storage(description, name, model, feature_names, date_created)
VALUES (
$1,
$2,
$3,
$4::text[],
to_timestamp($5));
''', ['text', 'text', 'bytea', 'text', 'numeric'])
plpy.notice('{%s}' % ','.join(feature_names))
plpy.notice(feature_names)
plpy.execute(
plan,
[' '.join(m.strip() for m in model.__repr__().split('\n')),
model_name,
pickle.dumps(model),
'{%s}' % ','.join(feature_names),
time.time()]
)
plpy.notice('model successfully stored as {}'.format(model_name))
except plpy.SPIError as err:
plpy.notice('ERROR: {}\nt: {}'.format(err, time.time()))
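A hedged usage sketch of the round trip this module provides, assuming it runs inside a plpythonu function (so plpy exists) and the crankshaft package is importable; the model name and feature list are placeholders, not anything the repo defines:

from sklearn.ensemble import GradientBoostingRegressor
from crankshaft import model_storage

model_storage.create_model_table()  # creates model_storage only if it does not exist
gbr = GradientBoostingRegressor(n_estimators=10).fit(
    [[0.0], [1.0], [2.0], [3.0]], [0.0, 1.0, 2.0, 3.0])
model_storage.set_model(gbr, 'demo_gbr', ['feature_1'])
restored = model_storage.get_model('demo_gbr')  # unpickled estimator, or None if missing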

View File

@@ -2,11 +2,14 @@
Segmentation creation and prediction
"""
import pickle
import plpy
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from crankshaft.analysis_data_provider import AnalysisDataProvider
from crankshaft import model_storage
# NOTE: added optional param here
@@ -51,6 +54,7 @@ class Segmentation(object):
def create_and_predict_segment(self, query, variable, feature_columns,
target_query, model_params,
model_name=None,
id_col='cartodb_id'):
"""
generate a segment with machine learning
@@ -70,15 +74,23 @@ class Segmentation(object):
(target, features, target_mean,
feature_means) = self.clean_data(query, variable, feature_columns)
model, accuracy = train_model(target, features, model_params, 0.2)
model_storage.create_model_table()
# find model if it exists and is specified
if model_name is not None:
model = model_storage.get_model(model_name)
if locals().get('model') is None:
model, accuracy = train_model(target, features, model_params, 0.2)
result = self.predict_segment(model, feature_columns, target_query,
feature_means)
accuracy_array = [accuracy] * result.shape[0]
rowid = self.data_provider.get_segmentation_data(params)
'''
rowid = [{'ids': [2.9, 4.9, 4, 5, 6]}]
'''
# store the model for later use
model_storage.set_model(model, model_name, feature_columns)
return zip(rowid[0]['ids'], result, accuracy_array)
def predict_segment(self, model, feature_columns, target_query,
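The control flow added above — reuse a stored model when a name is supplied, train otherwise, then persist whichever model ended up being used — can be read as the following standalone sketch (the function and parameter names are illustrative, not the repo's API):

def load_or_train(model_name, load, train, store):
    model = load(model_name) if model_name is not None else None
    if model is None:
        model = train()
    store(model, model_name)
    return model

In the diff itself the same effect comes from the locals().get('model') check, which is None both when no model_name was supplied and when get_model found nothing.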

View File

@@ -0,0 +1,5 @@
joblib==0.8.3
numpy==1.6.1
scipy==0.14.0
pysal==1.14.3
scikit-learn==0.14.1

View File

@@ -10,7 +10,7 @@ from setuptools import setup, find_packages
setup(
name='crankshaft',
version='0.8.2',
version='0.0.0',
description='CartoDB Spatial Analysis Python Library',
@@ -41,7 +41,7 @@ setup(
# The choice of component versions is dictated by what's
# provisioned in the production servers.
# IMPORTANT NOTE: please don't change this line. Instead issue a ticket to systems for evaluation.
install_requires=['joblib==0.9.4', 'numpy==1.11.0', 'scipy==0.17.0', 'pysal==1.14.3', 'scikit-learn==0.17.0'],
install_requires=['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.14.3', 'scikit-learn==0.14.1', 'petname==2.2'],
requires=['pysal', 'numpy', 'sklearn'],

View File

@@ -0,0 +1,49 @@
"""
CartoDB Spatial Analysis Python Library
See:
https://github.com/CartoDB/crankshaft
"""
from setuptools import setup, find_packages
setup(
name='crankshaft',
version='0.0.0',
description='CartoDB Spatial Analysis Python Library',
url='https://github.com/CartoDB/crankshaft',
author='Data Services Team - CartoDB',
author_email='dataservices@cartodb.com',
license='MIT',
classifiers=[
'Development Status :: 3 - Alpha',
'Intended Audience :: Mapping community',
'Topic :: Maps :: Mapping Tools',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 2.7',
],
keywords='maps mapping tools spatial analysis geostatistics',
packages=find_packages(exclude=['contrib', 'docs', 'tests']),
extras_require={
'dev': ['unittest'],
'test': ['unittest', 'nose', 'mock'],
},
# The choice of component versions is dictated by what's
# provisioned in the production servers.
# IMPORTANT NOTE: please don't change this line. Instead issue a ticket to systems for evaluation.
install_requires=['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.14.3', 'scikit-learn==0.14.1', 'petname==2.2'],
requires=['pysal', 'numpy', 'sklearn'],
test_suite='test'
)

View File

@@ -0,0 +1,6 @@
{
"production_col": [10, 10, 10],
"capacity_col": [0.09, 0.31],
"marginal_col": [5, 5],
"pairwise": [[1, 2, 3], [3, 2, 1]]
}

View File

@@ -0,0 +1,15 @@
from test.helper import plpy, fixture_file
from crankshaft.analysis_data_provider import AnalysisDataProvider
import json
import crankshaft
class RawDataProvider(AnalysisDataProvider):
def __init__(self, fixturedata):
self.your_algo_data = fixturedata
def get_moran(self, params):
"""
Replace this function name with the one used in your algorithm,
and make sure to use the same function signature that is written
for this algo in analysis_data_provider.py
"""
return self.your_algo_data
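A hedged sketch of how a stub provider like this might be driven from a test, assuming the test helper's plpy stub behaves as in the existing tests and a JSON fixture shaped like the rows get_moran normally returns (keys id, attr1, neighbors); the fixture name is a placeholder:

from test.helper import plpy, fixture_file  # the helper's plpy stub makes crankshaft importable
from crankshaft.clustering import Moran
import json

with open(fixture_file('moran.json')) as fh:
    rows = json.loads(fh.read())
moran = Moran(data_provider=RawDataProvider(rows))
# the subquery is ignored by the stub; only the fixture data is used
result = moran.global_stat('SELECT * FROM ignored', 'attr',
                           'knn', 5, 99, 'the_geom', 'cartodb_id')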

View File

@@ -0,0 +1,7 @@
"""Import all modules"""
import crankshaft.random_seeds
import crankshaft.clustering
import crankshaft.space_time_dynamics
import crankshaft.segmentation
import crankshaft.regression
import analysis_data_provider

View File

@@ -0,0 +1,149 @@
"""class for fetching data"""
import plpy
import pysal_utils as pu
NULL_VALUE_ERROR = ('No usable data passed to analysis. Check your input rows '
'for null values and fill in appropriately.')
def verify_data(func):
"""decorator to verify data result before returning to algorithm"""
def wrapper(*args, **kwargs):
"""Error checking"""
try:
data = func(*args, **kwargs)
if not data:
plpy.error(NULL_VALUE_ERROR)
else:
return data
except plpy.SPIError as err:
plpy.error('Analysis failed: {}'.format(err))
return []
return wrapper
class AnalysisDataProvider(object):
"""Data fetching class for pl/python functions"""
@verify_data
def get_getis(self, w_type, params): # pylint: disable=no-self-use
"""fetch data for getis ord's g"""
query = pu.construct_neighbor_query(w_type, params)
return plpy.execute(query)
@verify_data
def get_markov(self, w_type, params): # pylint: disable=no-self-use
"""fetch data for spatial markov"""
query = pu.construct_neighbor_query(w_type, params)
return plpy.execute(query)
@verify_data
def get_moran(self, w_type, params): # pylint: disable=no-self-use
"""fetch data for moran's i analyses"""
query = pu.construct_neighbor_query(w_type, params)
return plpy.execute(query)
@verify_data
def get_nonspatial_kmeans(self, params): # pylint: disable=no-self-use
"""
Fetch data for non-spatial k-means.
Inputs - a dict (params) with the following keys:
colnames: a (text) list of column names (e.g.,
`['andy', 'cookie']`)
id_col: the name of the id column (e.g., `'cartodb_id'`)
subquery: the subquery for exposing the data (e.g.,
SELECT * FROM favorite_things)
Output:
A SQL query for packaging the data for consumption within
`KMeans().nonspatial`. Format will be a list of length one,
with the first element a dict with keys ('rowid', 'attr1',
'attr2', ...)
"""
agg_cols = ', '.join([
'array_agg({0}) As arr_col{1}'.format(val, idx+1)
for idx, val in enumerate(params['colnames'])
])
query = '''
SELECT {cols}, array_agg({id_col}) As rowid
FROM ({subquery}) As a
'''.format(subquery=params['subquery'],
id_col=params['id_col'],
cols=agg_cols).strip()
return plpy.execute(query)
@verify_data
def get_segmentation_model_data(self, params): # pylint: disable=R0201
"""
fetch data for Segmentation
params = {"subquery": query,
"target": variable,
"features": feature_columns}
"""
columns = ', '.join(['array_agg("{col}") As "{col}"'.format(col=col)
for col in params['features']])
query = '''
SELECT
array_agg("{target}") As target,
{columns}
FROM ({subquery}) As q
'''.format(subquery=params['subquery'],
target=params['target'],
columns=columns)
return plpy.execute(query)
@verify_data
def get_segmentation_data(self, params): # pylint: disable=no-self-use
"""
params = {"subquery": target_query,
"id_col": id_col}
"""
query = '''
SELECT
array_agg("{id_col}" ORDER BY "{id_col}") as "ids"
FROM ({subquery}) as q
'''.format(**params)
return plpy.execute(query)
@verify_data
def get_segmentation_predict_data(self, params): # pylint: disable=R0201
"""
fetch data for Segmentation
params = {"subquery": target_query,
"feature_columns": feature_columns}
"""
joined_features = ', '.join(['"{}"::numeric'.format(a)
for a in params['feature_columns']])
query = '''
SELECT
Array[{joined_features}] As features
FROM ({subquery}) as q
'''.format(subquery=params['subquery'],
joined_features=joined_features)
return plpy.cursor(query)
@verify_data
def get_spatial_kmeans(self, params): # pylint: disable=no-self-use
"""fetch data for spatial kmeans"""
query = '''
SELECT
array_agg("{id_col}" ORDER BY "{id_col}") as ids,
array_agg(ST_X("{geom_col}") ORDER BY "{id_col}") As xs,
array_agg(ST_Y("{geom_col}") ORDER BY "{id_col}") As ys
FROM ({subquery}) As a
WHERE "{geom_col}" IS NOT NULL
'''.format(**params)
return plpy.execute(query)
@verify_data
def get_gwr(self, params): # pylint: disable=no-self-use
"""fetch data for gwr analysis"""
query = pu.gwr_query(params)
return plpy.execute(query)
@verify_data
def get_gwr_predict(self, params): # pylint: disable=no-self-use
"""fetch data for gwr predict"""
query = pu.gwr_predict_query(params)
return plpy.execute(query)

View File

@@ -0,0 +1,76 @@
"""
Based on the Weiszfeld algorithm:
https://en.wikipedia.org/wiki/Geometric_median
"""
# import plpy
import numpy as np
from numpy.linalg import norm
def median_center(tablename, geom_col, num_iters=50, tolerance=0.001):
query = '''
SELECT array_agg(ST_X({geom_col})) As x_coords,
array_agg(ST_Y({geom_col})) As y_coords
FROM {tablename}
'''.format(geom_col=geom_col, tablename=tablename)
try:
resp = plpy.execute(query)
data = np.vstack((resp['x_coords'][0],
resp['y_coords'][0])).T
plpy.notice('coords: %s' % str(data))
except Exception, err:
# plpy.error('Analysis failed: %s' % err)
print('No plpy')
data = np.array([[1.2 * np.random.random() + 10.,
1.1 * (np.random.random() - 1.) + 3.]
for i in range(1, 100)])
# initialize 'median center' to be the mean
coords_center_temp = data.mean(axis=0)
# plpy.notice('temp_center: %s' % str(coords_center_temp))
print('temp_center: %s' % str(coords_center_temp))
for i in range(0, num_iters):
old_coords_center = coords_center_temp.copy()
denom = denominator(coords_center_temp, data)
coords_center_temp = np.sum([data[j] * numerator(coords_center_temp,
data[j])
for j in range(len(data))], axis=0)
coords_center_temp = coords_center_temp / denom
print("Pass #%d" % i)
print("max, min of data: %0.4f, %0.4f" % (data.max(), data.min()))
print('temp_center: %s' % str(coords_center_temp))
print("Change in center: %0.4f" % np.linalg.norm(old_coords_center -
coords_center_temp))
print("Center coords: %s" % str(coords_center_temp))
print("Objective Function: %0.4f" % obj_func(coords_center_temp, data))
return coords_center_temp
def obj_func(center_coords, data):
"""
"""
return np.linalg.norm(center_coords - data)
def numerator(center_coords, data_i):
"""
"""
return np.reciprocal(np.linalg.norm(center_coords - data_i))
def denominator(center_coords, data):
"""
"""
return np.reciprocal(np.linalg.norm(data - center_coords))

View File

@@ -0,0 +1,4 @@
"""Import all functions from for clustering"""
from moran import *
from kmeans import *
from getis import *

View File

@@ -0,0 +1,50 @@
"""
Getis-Ord's G geostatistics (hotspot/coldspot analysis)
"""
import pysal as ps
from collections import OrderedDict
# crankshaft modules
import crankshaft.pysal_utils as pu
from crankshaft.analysis_data_provider import AnalysisDataProvider
# High level interface ---------------------------------------
class Getis(object):
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()
else:
self.data_provider = data_provider
def getis_ord(self, subquery, attr,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Getis-Ord's G*
Implementation building neighbors with a PostGIS database and PySAL's
Getis-Ord's G* hotspot/coldspot module.
Andy Eschbacher
"""
# geometries with null attribute values are ignored,
# which can make kNN neighborhoods less local than requested
params = OrderedDict([("id_col", id_col),
("attr1", attr),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_getis(w_type, params)
attr_vals = pu.get_attributes(result)
# build PySAL weight object
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate Getis-Ord's G* z- and p-values
getis = ps.esda.getisord.G_Local(attr_vals, weight,
star=True, permutations=permutations)
return zip(getis.z_sim, getis.p_sim, getis.p_z_sim, weight.id_order)
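An illustrative call, assuming a plpython environment where the default data provider can reach the database; the table and column names are placeholders:

getis = Getis()
rows = getis.getis_ord('SELECT * FROM my_table', 'attr_value',
                       'knn', 5, 99, 'the_geom', 'cartodb_id')
# each row is (z_score, p_value, p_z_sim, row_id); a large positive z with a
# small p-value flags a hot spot, a large negative z a cold spot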

View File

@@ -0,0 +1,113 @@
from sklearn.cluster import KMeans
import numpy as np
from crankshaft.analysis_data_provider import AnalysisDataProvider
class Kmeans(object):
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()
else:
self.data_provider = data_provider
def spatial(self, query, no_clusters, no_init=20):
"""
find centers based on clusters of latitude/longitude pairs
query: SQL query that has a WGS84 geometry (the_geom)
"""
params = {"subquery": query,
"geom_col": "the_geom",
"id_col": "cartodb_id"}
result = self.data_provider.get_spatial_kmeans(params)
# Unpack query response
xs = result[0]['xs']
ys = result[0]['ys']
ids = result[0]['ids']
km = KMeans(n_clusters=no_clusters, n_init=no_init)
labels = km.fit_predict(zip(xs, ys))
return zip(ids, labels)
def nonspatial(self, subquery, colnames, no_clusters=5,
standardize=True, id_col='cartodb_id'):
"""
Arguments:
query (string): A SQL query to retrieve the data required to do the
k-means clustering analysis, like so:
SELECT * FROM iris_flower_data
colnames (list): a list of the column names which contain the data
of interest, like so: ['sepal_width',
'petal_width',
'sepal_length',
'petal_length']
no_clusters (int): number of clusters (greater than zero)
id_col (string): name of the input id_column
Returns:
A list of tuples with the following columns:
cluster labels: a label for the cluster that the row belongs to
centers: center of the cluster that this row belongs to
silhouettes: silhouette measure for this value
rowid: row that these values belong to (corresponds to the value in
`id_col`)
"""
import json
from sklearn import metrics
params = {
"colnames": colnames,
"subquery": subquery,
"id_col": id_col
}
data = self.data_provider.get_nonspatial_kmeans(params)
# fill array with values for k-means clustering
if standardize:
cluster_columns = _scale_data(
_extract_columns(data))
else:
cluster_columns = _extract_columns(data)
kmeans = KMeans(n_clusters=no_clusters,
random_state=0).fit(cluster_columns)
centers = [json.dumps(dict(zip(colnames, c)))
for c in kmeans.cluster_centers_[kmeans.labels_]]
silhouettes = metrics.silhouette_samples(cluster_columns,
kmeans.labels_,
metric='sqeuclidean')
return zip(kmeans.labels_,
centers,
silhouettes,
[kmeans.inertia_] * kmeans.labels_.shape[0],
data[0]['rowid'])
# -- Preprocessing steps
def _extract_columns(data):
"""
Extract the features from the query and pack them into a NumPy array
data (list of dicts): result of the kmeans request
"""
# number of columns minus rowid column
n_cols = len(data[0]) - 1
return np.array([data[0]['arr_col{0}'.format(i+1)]
for i in xrange(n_cols)],
dtype=float).T
def _scale_data(features):
"""
Scale all input columns to center on 0 with a standard deviation of 1
features (numpy matrix): features of dimension (n_features, n_samples)
"""
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
return scaler.fit_transform(features)
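An illustrative call of the non-spatial variant, again assuming a plpython environment where the default provider can execute the generated SQL; the table and column names are placeholders:

km = Kmeans()
rows = km.nonspatial('SELECT * FROM iris_flower_data',
                     ['sepal_width', 'petal_width'], no_clusters=3)
# each row is (cluster_label, center_as_json, silhouette, inertia, rowid)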

View File

@@ -0,0 +1,341 @@
"""
Moran's I geostatistics (global clustering & outliers presence)
Functionality relies on a combination of `PySAL
<http://pysal.readthedocs.io/en/latest/>`__ and the data provider provided in
the class instantiation (which defaults to PostgreSQL's plpy module's `database
access functions <https://www.postgresql.org/docs/10/static/plpython.html>`__).
"""
from collections import OrderedDict
import pysal as ps
# crankshaft module
import crankshaft.pysal_utils as pu
from crankshaft.analysis_data_provider import AnalysisDataProvider
# High level interface ---------------------------------------
class Moran(object):
"""Class for calculation of Moran's I statistics (global, local, and local
rate)
Parameters:
data_provider (:obj:`AnalysisDataProvider`): Class for fetching data. See
the `crankshaft.analysis_data_provider` module for more information.
"""
def __init__(self, data_provider=None):
if data_provider is None:
self.data_provider = AnalysisDataProvider()
else:
self.data_provider = data_provider
def global_stat(self, subquery, attr_name,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I (global)
Implementation building neighbors with a PostGIS database and Moran's I
core clusters with PySAL.
Args:
subquery (str): Query to give access to the data needed. This query
must give access to ``attr_name``, ``geom_col``, and ``id_col``.
attr_name (str): Column name of data to analyze
w_type (str): Type of spatial weight. Must be one of `knn`
or `queen`. See `PySAL documentation
<http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
for more information.
num_ngbrs (int): If using `knn` for ``w_type``, this
specifies the number of neighbors to be used to define the spatial
neighborhoods.
permutations (int): Number of permutations for performing
conditional randomization to find the p-value. Higher numbers
take a longer time for getting results.
geom_col (str): Name of the geometry column in the dataset for
finding the spatial neighborhoods.
id_col (str): Row index for each value. Usually the database index.
"""
params = OrderedDict([("id_col", id_col),
("attr1", attr_name),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
# collect attributes
attr_vals = pu.get_attributes(result)
# calculate weights
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate moran global
moran_global = ps.esda.moran.Moran(attr_vals, weight,
permutations=permutations)
return zip([moran_global.I], [moran_global.EI])
def local_stat(self, subquery, attr,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I (local)
Args:
subquery (str): Query to give access to the data needed. This query
must give access to ``attr_name``, ``geom_col``, and ``id_col``.
attr (str): Column name of data to analyze
w_type (str): Type of spatial weight. Must be one of `knn`
or `queen`. See `PySAL documentation
<http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
for more information.
num_ngbrs (int): If using `knn` for ``w_type``, this
specifies the number of neighbors to be used to define the spatial
neighborhoods.
permutations (int): Number of permutations for performing
conditional randomization to find the p-value. Higher numbers
take a longer time for getting results.
geom_col (str): Name of the geometry column in the dataset for
finding the spatial neighborhoods.
id_col (str): Row index for each value. Usually the database index.
Returns:
list of tuples: Where each tuple consists of the following values:
- quadrants classification (one of `HH`, `HL`, `LL`, or `LH`)
- p-value
- spatial lag
- standardized spatial lag (centered on the mean, normalized by the
standard deviation)
- original value
- standardized value
- Moran's I statistic
- original row index
"""
# geometries with null attribute values are ignored,
# which can make the resulting neighborhoods less local
params = OrderedDict([("id_col", id_col),
("attr1", attr),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
attr_vals = pu.get_attributes(result)
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local(attr_vals, weight,
permutations=permutations)
# find quadrants for each geometry
quads = quad_position(lisa.q)
# calculate spatial lag
lag = ps.weights.spatial_lag.lag_spatial(weight, lisa.y)
lag_std = ps.weights.spatial_lag.lag_spatial(weight, lisa.z)
return zip(
quads,
lisa.p_sim,
lag,
lag_std,
lisa.y,
lisa.z,
lisa.Is,
weight.id_order
)
def global_rate_stat(self, subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I Rate (global)
Args:
subquery (str): Query to give access to the data needed. This query
must give access to ``attr_name``, ``geom_col``, and ``id_col``.
numerator (str): Column name of numerator to analyze
denominator (str): Column name of the denominator
w_type (str): Type of spatial weight. Must be one of `knn`
or `queen`. See `PySAL documentation
<http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
for more information.
num_ngbrs (int): If using `knn` for ``w_type``, this
specifies the number of neighbors to be used to define the spatial
neighborhoods.
permutations (int): Number of permutations for performing
conditional randomization to find the p-value. Higher numbers
take a longer time for getting results.
geom_col (str): Name of the geometry column in the dataset for
finding the spatial neighborhoods.
id_col (str): Row index for each value. Usually the database index.
"""
params = OrderedDict([("id_col", id_col),
("attr1", numerator),
("attr2", denominator),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
# collect attributes
numer = pu.get_attributes(result, 1)
denom = pu.get_attributes(result, 2)
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate moran global rate
lisa_rate = ps.esda.moran.Moran_Rate(numer, denom, weight,
permutations=permutations)
return zip([lisa_rate.I], [lisa_rate.EI])
def local_rate_stat(self, subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
Moran's I Local Rate
Args:
subquery (str): Query to give access to the data needed. This query
must give access to ``attr_name``, ``geom_col``, and ``id_col``.
numerator (str): Column name of numerator to analyze
denominator (str): Column name of the denominator
w_type (str): Type of spatial weight. Must be one of `knn`
or `queen`. See `PySAL documentation
<http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
for more information.
num_ngbrs (int): If using `knn` for ``w_type``, this
specifies the number of neighbors to be used to define the spatial
neighborhoods.
permutations (int): Number of permutations for performing
conditional randomization to find the p-value. Higher numbers
take a longer time for getting results.
geom_col (str): Name of the geometry column in the dataset for
finding the spatial neighborhoods.
id_col (str): Row index for each value. Usually the database index.
Returns:
list of tuples: Where each tuple consists of the following values:
- quadrants classification (one of `HH`, `HL`, `LL`, or `LH`)
- p-value
- spatial lag
- standardized spatial lag (centered on the mean, normalized by the
standard deviation)
- original value (roughly numerator divided by denominator)
- standardized value
- Moran's I statistic
- original row index
"""
# geometries with null values are ignored,
# which can make the resulting neighborhoods less local
params = OrderedDict([("id_col", id_col),
("numerator", numerator),
("denominator", denominator),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
# collect attributes
numer = pu.get_attributes(result, 1)
denom = pu.get_attributes(result, 2)
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local_Rate(numer, denom, weight,
permutations=permutations)
# find quadrants for each geometry
quads = quad_position(lisa.q)
# spatial lag
lag = ps.weights.spatial_lag.lag_spatial(weight, lisa.y)
lag_std = ps.weights.spatial_lag.lag_spatial(weight, lisa.z)
return zip(
quads,
lisa.p_sim,
lag,
lag_std,
lisa.y,
lisa.z,
lisa.Is,
weight.id_order
)
def local_bivariate_stat(self, subquery, attr1, attr2,
permutations, geom_col, id_col,
w_type, num_ngbrs):
"""
Moran's I (local) Bivariate (untested)
"""
params = OrderedDict([("id_col", id_col),
("attr1", attr1),
("attr2", attr2),
("geom_col", geom_col),
("subquery", subquery),
("num_ngbrs", num_ngbrs)])
result = self.data_provider.get_moran(w_type, params)
# collect attributes
attr1_vals = pu.get_attributes(result, 1)
attr2_vals = pu.get_attributes(result, 2)
# create weights
weight = pu.get_weight(result, w_type, num_ngbrs)
# calculate LISA values
lisa = ps.esda.moran.Moran_Local_BV(attr1_vals, attr2_vals, weight,
permutations=permutations)
# find clustering of significance
lisa_sig = quad_position(lisa.q)
return zip(lisa.Is, lisa_sig, lisa.p_sim, weight.id_order)
# Low level functions ----------------------------------------
def map_quads(coord):
"""
Map a quadrant number to Moran's I designation
HH=1, LH=2, LL=3, HL=4
Args:
coord (int): quadrant of a specific measurement
Returns:
classification (one of 'HH', 'LH', 'LL', or 'HL')
"""
if coord == 1:
return 'HH'
elif coord == 2:
return 'LH'
elif coord == 3:
return 'LL'
elif coord == 4:
return 'HL'
return None
def quad_position(quads):
"""
Map all quads
Args:
quads (:obj:`numpy.ndarray`): an array of quads classified by
1-4 (PySAL default)
Returns:
list: an array of quads classified by 'HH', 'LL', etc.
"""
return [map_quads(q) for q in quads]

View File

@@ -0,0 +1 @@
from core import set_model, get_model, create_model_table

View File

@@ -0,0 +1,86 @@
import time
import plpy
import pickle
from petname import generate
def create_model_table():
q = '''
create table if not exists model_storage(
description text,
name text unique,
model bytea,
feature_names text[],
date_created timestamptz,
id serial primary key);
'''
plpy.notice(q)
plan = plpy.prepare(q)
resp = plpy.execute(plan)
plpy.notice('Model table successfully created')
plpy.notice(str(resp))
def get_model(model_name):
"""retrieve model if it exists"""
try:
plan = plpy.prepare('''
SELECT model FROM model_storage
WHERE name = $1;
''', ['text', ])
model_encoded = plpy.execute(plan, [model_name, ])
if len(model_encoded) == 1:
model = pickle.loads(
model_encoded[0]['model']
)
plpy.notice('Model successfully loaded')
else:
plpy.notice('Model not found, or too many models '
'({})'.format(len(model_encoded)))
model = None
except plpy.SPIError as err:
plpy.error('ERROR: {}'.format(err))
return model
def set_model(model, model_name, feature_names):
"""stores the model in the table model_storage"""
if model_name is None:
model_name = generate(words=2, separator='_', letters=8)
existing_names = plpy.execute('''
SELECT array_agg(name) as name
FROM model_storage
''')
plpy.notice('nrows: {}'.format(existing_names.nrows()))
plpy.notice('MODEL NAME: {}'.format(model_name))
plpy.notice('LEN of ms: {}'.format(len(existing_names)))
plpy.notice('existing_names: {}'.format(str(existing_names)))
plpy.notice('existing_names: {}'.format(str(existing_names[0]['name'])))
plpy.notice('type existing_names: {}'.format(type(existing_names[0]['name'])))
if existing_names[0]['name'] is not None:
while model_name in existing_names[0]['name']:
model_name = generate(words=2, separator='_', letters=10)
plpy.notice(model_name)
# store model
try:
plan = plpy.prepare('''
INSERT INTO model_storage(description, name, model, feature_names, date_created)
VALUES (
$1,
$2,
$3,
$4::text[],
to_timestamp($5));
''', ['text', 'text', 'bytea', 'text', 'numeric'])
plpy.notice('{%s}' % ','.join(feature_names))
plpy.notice(feature_names)
plpy.execute(
plan,
[' '.join(m.strip() for m in model.__repr__().split('\n')),
model_name,
pickle.dumps(model),
'{%s}' % ','.join(feature_names),
time.time()]
)
plpy.notice('model successfully stored as {}'.format(model_name))
except plpy.SPIError as err:
plpy.notice('ERROR: {}\nt: {}'.format(err, time.time()))

View File

@@ -0,0 +1,2 @@
"""Import all functions for pysal_utils"""
from crankshaft.pysal_utils.pysal_utils import *

View File

@@ -0,0 +1,251 @@
"""
Utilities module for generic PySAL functionality, mainly centered on
translating queries into numpy arrays or PySAL weights objects
"""
import numpy as np
import pysal as ps
def construct_neighbor_query(w_type, query_vals):
"""Return query (a string) used for finding neighbors
@param w_type text: type of neighbors to calculate ('knn' or 'queen')
@param query_vals dict: values used to construct the query
"""
if w_type.lower() == 'knn':
return knn(query_vals)
else:
return queen(query_vals)
# Build weight object
def get_weight(query_res, w_type='knn', num_ngbrs=5):
"""
Construct PySAL weight from return value of query
@param query_res dict-like: query results with attributes and neighbors
"""
neighbors = {x['id']: x['neighbors'] for x in query_res}
print 'len of neighbors: %d' % len(neighbors)
built_weight = ps.W(neighbors)
built_weight.transform = 'r'
return built_weight
def query_attr_select(params, table_ref=True):
"""
Create portion of SELECT statement for attributes involved in the query.
Defaults to order in the params
@param params: dict of information used in query (column names,
table name, etc.)
Example:
OrderedDict([('numerator', 'price'),
('denominator', 'sq_meters'),
('subquery', 'SELECT * FROM interesting_data')])
Output:
"i.\"price\"::numeric As attr1, " \
"i.\"sq_meters\"::numeric As attr2, "
"""
attr_string = ""
template = "\"%(col)s\"::numeric As attr%(alias_num)s, "
if table_ref:
template = "i." + template
if ('time_cols' in params) or ('ind_vars' in params):
# if markov or gwr analysis
attrs = (params['time_cols'] if 'time_cols' in params
else params['ind_vars'])
if 'ind_vars' in params:
template = "array_agg(\"%(col)s\"::numeric) As attr%(alias_num)s, "
for idx, val in enumerate(attrs):
attr_string += template % {"col": val, "alias_num": idx + 1}
else:
# if moran's analysis
attrs = [k for k in params
if k not in ('id_col', 'geom_col', 'subquery',
'num_ngbrs', 'subquery')]
for idx, val in enumerate(attrs):
attr_string += template % {"col": params[val],
"alias_num": idx + 1}
return attr_string
def query_attr_where(params, table_ref=True):
"""
Construct where conditions when building neighbors query
Create portion of WHERE clauses for weeding out NULL-valued geometries
Input: dict of params:
{'subquery': ...,
'numerator': 'data1',
'denominator': 'data2',
'': ...}
Output:
'idx_replace."data1" IS NOT NULL AND idx_replace."data2" IS NOT NULL'
Input:
{'subquery': ...,
'time_cols': ['time1', 'time2', 'time3'],
'etc': ...}
Output: 'idx_replace."time1" IS NOT NULL AND idx_replace."time2" IS NOT
NULL AND idx_replace."time3" IS NOT NULL'
"""
attr_string = []
template = "\"%s\" IS NOT NULL"
if table_ref:
template = "idx_replace." + template
if ('time_cols' in params) or ('ind_vars' in params):
# markov or gwr where clauses
attrs = (params['time_cols'] if 'time_cols' in params
else params['ind_vars'])
# add values to template
for attr in attrs:
attr_string.append(template % attr)
else:
# moran where clauses
# get keys
attrs = [k for k in params
if k not in ('id_col', 'geom_col', 'subquery',
'num_ngbrs', 'subquery')]
# add values to template
for attr in attrs:
attr_string.append(template % params[attr])
if 'denominator' in attrs:
attr_string.append(
"idx_replace.\"%s\" <> 0" % params['denominator'])
out = " AND ".join(attr_string)
return out
def knn(params):
"""SQL query for k-nearest neighbors.
@param vars: dict of values to fill template
"""
attr_select = query_attr_select(params, table_ref=True)
attr_where = query_attr_where(params, table_ref=True)
replacements = {"attr_select": attr_select,
"attr_where_i": attr_where.replace("idx_replace", "i"),
"attr_where_j": attr_where.replace("idx_replace", "j")}
query = '''
SELECT
i."{id_col}" As id,
%(attr_select)s
(SELECT ARRAY(SELECT j."{id_col}"
FROM ({subquery}) As j
WHERE i."{id_col}" <> j."{id_col}" AND
%(attr_where_j)s AND
j."{geom_col}" IS NOT NULL
ORDER BY j."{geom_col}" <-> i."{geom_col}" ASC
LIMIT {num_ngbrs})) As neighbors
FROM ({subquery}) As i
WHERE %(attr_where_i)s AND i."{geom_col}" IS NOT NULL
ORDER BY i."{id_col}" ASC;
''' % replacements
return query.format(**params)
# SQL query for finding queens neighbors (all contiguous polygons)
def queen(params):
"""SQL query for queen neighbors.
@param params dict: information to fill query
"""
attr_select = query_attr_select(params)
attr_where = query_attr_where(params)
replacements = {"attr_select": attr_select,
"attr_where_i": attr_where.replace("idx_replace", "i"),
"attr_where_j": attr_where.replace("idx_replace", "j")}
query = '''
SELECT
i."{id_col}" As id,
%(attr_select)s
(SELECT ARRAY(SELECT j."{id_col}"
FROM ({subquery}) As j
WHERE i."{id_col}" <> j."{id_col}" AND
ST_Touches(i."{geom_col}", j."{geom_col}") AND
%(attr_where_j)s)) As neighbors
FROM ({subquery}) As i
WHERE
%(attr_where_i)s
ORDER BY i."{id_col}" ASC;
''' % replacements
return query.format(**params)
def gwr_query(params):
"""
GWR query
"""
replacements = {"ind_vars_select": query_attr_select(params,
table_ref=None),
"ind_vars_where": query_attr_where(params,
table_ref=None)}
query = '''
SELECT
array_agg(ST_X(ST_Centroid("{geom_col}"))) As x,
array_agg(ST_Y(ST_Centroid("{geom_col}"))) As y,
array_agg("{dep_var}") As dep_var,
%(ind_vars_select)s
array_agg("{id_col}") As rowid
FROM ({subquery}) As q
WHERE
"{dep_var}" IS NOT NULL AND
%(ind_vars_where)s
''' % replacements
return query.format(**params).strip()
def gwr_predict_query(params):
"""
GWR query
"""
replacements = {"ind_vars_select": query_attr_select(params,
table_ref=None),
"ind_vars_where": query_attr_where(params,
table_ref=None)}
query = '''
SELECT
array_agg(ST_X(ST_Centroid({geom_col}))) As x,
array_agg(ST_Y(ST_Centroid({geom_col}))) As y,
array_agg({dep_var}) As dep_var,
%(ind_vars_select)s
array_agg({id_col}) As rowid
FROM ({subquery}) As q
WHERE
%(ind_vars_where)s
''' % replacements
return query.format(**params).strip()
# to add more weight methods open a ticket or pull request
def get_attributes(query_res, attr_num=1):
"""
@param query_res: query results with attributes and neighbors
@param attr_num: attribute number (1, 2, ...)
"""
return np.array([x['attr' + str(attr_num)] for x in query_res],
dtype=np.float)
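The query builders return plain SQL strings, so they can be eyeballed without touching a database; a small sketch, assuming an environment where the crankshaft package imports cleanly (inside plpython, or with the test suite's plpy stub on the path) and with placeholder subquery and column names:

from collections import OrderedDict
import crankshaft.pysal_utils as pu

params = OrderedDict([('id_col', 'cartodb_id'),
                      ('attr1', 'price'),
                      ('geom_col', 'the_geom'),
                      ('subquery', 'SELECT * FROM interesting_data'),
                      ('num_ngbrs', 5)])
print(pu.construct_neighbor_query('knn', params))    # kNN query (ORDER BY <-> ... LIMIT)
print(pu.construct_neighbor_query('queen', params))  # contiguity query (ST_Touches)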

View File

@@ -0,0 +1,12 @@
"""Random seed generator used for non-deterministic functions in crankshaft"""
import random
import numpy
def set_random_seeds(value):
"""
Set the seeds of the RNGs (Random Number Generators)
used internally.
"""
random.seed(value)
numpy.random.seed(value)

View File

@@ -0,0 +1,3 @@
from crankshaft.regression.gwr import *
from crankshaft.regression.glm import *
from crankshaft.regression.gwr_cs import *

Some files were not shown because too many files have changed in this diff.