diff --git a/pg/crankshaft--0.0.1.sql b/pg/crankshaft--0.0.1.sql index 436beea..c72e5fc 100644 --- a/pg/crankshaft--0.0.1.sql +++ b/pg/crankshaft--0.0.1.sql @@ -137,6 +137,33 @@ BEGIN END; $$ LANGUAGE plpgsql VOLATILE; +CREATE OR REPLACE FUNCTION + cdb_create_segment ( + segment_name TEXT, + table_name TEXT, + column_name TEXT, + geoid_column TEXT DEFAULT 'geoid', + census_table TEXT DEFAULT 'block_groups' + ) +RETURNS NUMERIC +AS $$ + from crankshaft import segmentation + # TODO: use named parameters or a dictionary + return segmentation.create_segment(segment_name,table_name,column_name,geoid_column,census_table,'random_forest') +$$ LANGUAGE plpythonu; + +CREATE OR REPLACE FUNCTION + cdb_predict_segment ( + segment_name TEXT, + geoid_column TEXT DEFAULT 'geoid', + census_table TEXT DEFAULT 'block_groups' + ) +RETURNS TABLE(geoid TEXT, prediction NUMERIC) +AS $$ + from crankshaft import segmentation + # TODO: use named parameters or a dictionary + return segmentation.predict_segment(segment_name, geoid_column, census_table) +$$ LANGUAGE plpythonu; -- Make sure by default there are no permissions for publicuser -- NOTE: this happens at extension creation time, as part of an implicit transaction.
-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE; diff --git a/pg/sql/0.0.1/05_segmentation.sql b/pg/sql/0.0.1/05_segmentation.sql index 8e23ad6..cce29b6 100644 --- a/pg/sql/0.0.1/05_segmentation.sql +++ b/pg/sql/0.0.1/05_segmentation.sql @@ -8,9 +8,9 @@ CREATE OR REPLACE FUNCTION ) RETURNS NUMERIC AS $$ - from crankshaft.segmentation import create_segemnt + from crankshaft import segmentation # TODO: use named parameters or a dictionary - return create_segment('table') + return segmentation.create_segment(segment_name,table_name,column_name,geoid_column,census_table,'random_forest') $$ LANGUAGE plpythonu; CREATE OR REPLACE FUNCTION diff --git a/python/crankshaft/crankshaft/segmentation/segmentation.py b/python/crankshaft/crankshaft/segmentation/segmentation.py index acdae51..201e7e4 100644 --- a/python/crankshaft/crankshaft/segmentation/segmentation.py +++ b/python/crankshaft/crankshaft/segmentation/segmentation.py @@ -13,57 +13,71 @@ from sklearn.cross_validation import train_test_split # High level interface --------------------------------------- -def cdb_create_segment(segment_name,table_name,column_name,geoid_column,census_table,method): +def create_segment(segment_name,table_name,column_name,geoid_column,census_table,method): """ generate a segment with machine learning Stuart Lynn """ - data = pd.DataFrame(join_with_census(table_name, column_name,geoid_column, census_table,)) - features = data[data.columns.difference([column_name, 'geoid'])] + data = pd.DataFrame(join_with_census(table_name, column_name,geoid_column, census_table)) + features = data[data.columns.difference([column_name, 'geoid','the_geom'])] target, mean, std = normalize(data[column_name]) model, accuracy = train_model(target,features, test_split=0.2) - save_model(segment_name, model, accuracy, table_name, column_name, census_table, geoid_column, method) + # save_model(segment_name, model, accuracy, table_name, column_name, census_table, geoid_column, 
method) + # predict_segment return accuracy def normalize(target): mean = np.mean(target) - std = no.std(target) + std = np.std(target) + plpy.notice('mean '+str(mean)+" std : "+str(std)) return (target - mean)/std, mean, std def denormalize(target, mean ,std): return target*std + mean def train_model(target,features,test_split): + plpy.notice('training the model') + plpy.notice('dataframe shape '+ str(np.shape(features))) + plpy.notice('dataframe columns '+ str(features.dtypes)) + features = features.dropna(axis =1, how='all').fillna(0) + target = target.fillna(0) features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=test_split) + plpy.notice('training the model test train split') model = ExtraTreesRegressor(n_estimators = 40, max_features=len(features.columns)) + plpy.notice('training the model created tree') + plpy.notice('features '+str(np.shape(features_train))+" "+str(np.shape(features_test)) ) + model.fit(features_train, target_train) + plpy.notice('training the model fitting model') accuracy = calculate_model_accuracy(model,features,target) return model, accuracy def calculate_model_accuracy(model,features,target): - prediction = self.model.predict(features) + prediction = model.predict(features) return metrics.mean_squared_error(prediction,target)/np.std(target) def join_with_census(table_name, column_name, geoid_column, census_table): - coulmns = plpy.execute('select {census_table}.* limit 1 ') - feature_names = ",".join(columns.keys.difference(['the_geom','cartodb_id'])) + columns = plpy.execute('select * from {census_table} limit 1 '.format(**locals())) + combined_columns = [ a for a in columns[0].keys() if a not in ['the_geom','cartodb_id','geoid']] + feature_names = ",".join([ " {census_table}.\"{a}\" as \"{a}\" ".format(**locals()) for a in combined_columns]) + plpy.notice('joining with census data') join_data = plpy.execute(''' - WITH region_extent AS ( - SELECT ST_Extent(the_geom) as table_extent 
FROM {table_name}; - ) - SELECT {features_names}, {table_name}.{column_name} - FROM {table_name} ,region_extent + + SELECT {feature_names}, {table_name}.{column_name} + FROM {table_name} JOIN {census_table} - ON {table_name}.{geoid_column} = {census_table}.geoid - WHERE {census_table}.the_geom && region_extent.table_extent + ON {table_name}.{geoid_column}::numeric = {census_table}.geoid::numeric '''.format(**locals())) if len(join_data) == 0: plpy.notice('Failed to join with census data') - return join_data + return query_to_dictionary(join_data) -def cdb_predict_segment(segment_name,geoid_column,census_table): +def query_to_dictionary(result): + return [ dict(zip(r.keys(), r.values())) for r in result ] + +def predict_segment(model,features,geoid_column,census_table): """ predict a segment with machine learning Stuart Lynn @@ -89,30 +103,31 @@ def fetch_model(model_name): return data -def create_model_table(model_name): +def create_model_table(): """ create the model table if requred """ plpy.execute(''' CREATE table IF NOT EXISTS _cdb_models( name TEXT, - model BLOB, + model TEXT, features TEXT[], accuracy NUMERIC, table_name TEXT, + census_table_name TEXT, + method TEXT )''') def save_model(model_name,model,accuracy,table_name, column_name,census_table,geoid_column,method): """ save a model to the model table for later use """ + create_model_table() plpy.execute(''' - DELETE FROM _cdb_models WHERE model_name = {model_name} + DELETE FROM _cdb_models WHERE name = '{model_name}' '''.format(**locals())) - + model_pickle = pickle.dumps(model) plpy.execute(""" - INSERT INTO _cdb_models ({model_name},{model_pickle},{accuracy}) - """) - -def + INSERT INTO _cdb_models (name, model, accuracy, table_name, census_table_name, method) VALUES ('{model_name}','{model_pickle}',{accuracy}, '{table_name}', '{census_table}', '{method}') + """.format(**locals())) diff --git a/python/crankshaft/setup.py b/python/crankshaft/setup.py index c0f8c50..07ff9e9 100644 --- a/python/crankshaft/setup.py +++ b/python/crankshaft/setup.py @@ -40,9 +40,8 @@ setup( #
The choice of component versions is dictated by what's # provisioned in the production servers. - install_requires=['pysal==1.11.0','numpy==1.6.1','scipy==0.17.0'], + install_requires=['pysal==1.11.0','numpy==1.10.1','scipy==0.17.0','pandas','scikit-learn'], - requires=['pysal', 'numpy'], test_suite='test' )