From 531ad281589eaa838a76cd6fcd1d18c606b200d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Ignacio=20S=C3=A1nchez=20Lara?= Date: Tue, 10 Jul 2018 19:06:49 +0200 Subject: [PATCH] Send optimal batch size --- client/cdb_dataservices_client--0.25.0.sql | 64 +++++++++++-------- client/sql/16_custom_types.sql | 3 +- client/sql/21_bulk_geocoding_functions.sql | 61 ++++++++++-------- .../21_bulk_geocoding_functions_test.out | 20 +++--- .../sql/21_bulk_geocoding_functions_test.sql | 18 +++--- .../cdb_dataservices_server--0.32.0.sql | 17 +++-- server/extension/sql/200_quotas.sql | 17 +++-- .../cartodb_services/bulk_geocoders.py | 11 ++++ .../cartodb_services/here/bulk_geocoder.py | 13 ++-- test/integration/test_street_functions.py | 19 +++++- 10 files changed, 153 insertions(+), 90 deletions(-) create mode 100644 server/lib/python/cartodb_services/cartodb_services/bulk_geocoders.py diff --git a/client/cdb_dataservices_client--0.25.0.sql b/client/cdb_dataservices_client--0.25.0.sql index d21bb5f..2bed351 100644 --- a/client/cdb_dataservices_client--0.25.0.sql +++ b/client/cdb_dataservices_client--0.25.0.sql @@ -112,7 +112,8 @@ CREATE TYPE cdb_dataservices_client.service_quota_info AS ( monthly_quota NUMERIC, used_quota NUMERIC, soft_limit BOOLEAN, - provider TEXT + provider TEXT, + max_batch_size NUMERIC ); -- -- Public dataservices API function @@ -1987,25 +1988,36 @@ CREATE OR REPLACE FUNCTION cdb_dataservices_client._DST_DisconnectUserTable( TARGET cdb_dataservices_server._DST_DisconnectUserTable; $$ LANGUAGE plproxy VOLATILE PARALLEL UNSAFE; CREATE OR REPLACE FUNCTION cdb_dataservices_client.cdb_bulk_geocode_street_point (query text, - street_column text, city_column text default null, state_column text default null, country_column text default null, batch_size integer DEFAULT 50) + street_column text, city_column text default null, state_column text default null, country_column text default null, batch_size integer DEFAULT NULL) RETURNS SETOF cdb_dataservices_client.geocoding AS $$ DECLARE query_row_count integer; enough_quota boolean; remaining_quota integer; + max_batch_size integer; cartodb_id_batch integer; batches_n integer; DEFAULT_BATCH_SIZE CONSTANT numeric := 100; - MAX_BATCH_SIZE CONSTANT numeric := 10000; + MAX_SAFE_BATCH_SIZE CONSTANT numeric := 5000; current_row_count integer ; temp_table_name text; BEGIN + SELECT csqi.monthly_quota - csqi.used_quota AS remaining_quota, csqi.max_batch_size + INTO remaining_quota, max_batch_size + FROM cdb_dataservices_client.cdb_service_quota_info() csqi + WHERE service = 'hires_geocoder'; + RAISE DEBUG 'remaining_quota: %; max_batch_size: %', remaining_quota, max_batch_size; + IF batch_size IS NULL THEN - RAISE EXCEPTION 'batch_size can''t be null'; - ELSIF batch_size > MAX_BATCH_SIZE THEN - RAISE EXCEPTION 'batch_size must be lower than %', MAX_BATCH_SIZE + 1; + batch_size := max_batch_size; + ELSIF batch_size > max_batch_size THEN + RAISE EXCEPTION 'batch_size must be lower than %', max_batch_size + 1; + END IF; + + IF batch_size > MAX_SAFE_BATCH_SIZE THEN + batch_size := MAX_SAFE_BATCH_SIZE; END IF; EXECUTE format('SELECT COUNT(1) from (%s) _x', query) INTO query_row_count; @@ -2013,11 +2025,7 @@ BEGIN RAISE DEBUG 'cdb_bulk_geocode_street_point --> query_row_count: %; query: %; country: %; state: %; city: %; street: %', query_row_count, query, country_column, state_column, city_column, street_column; SELECT cdb_dataservices_client.cdb_enough_quota('hires_geocoder', query_row_count) INTO enough_quota; - IF enough_quota IS NOT NULL AND NOT enough_quota THEN - SELECT csqi.monthly_quota - csqi.used_quota AS remaining_quota - INTO remaining_quota - FROM cdb_dataservices_client.cdb_service_quota_info() csqi - WHERE service = 'hires_geocoder'; + IF remaining_quota < query_row_count THEN RAISE EXCEPTION 'Remaining quota: %. Estimated cost: %', remaining_quota, query_row_count; END IF; @@ -2036,25 +2044,27 @@ BEGIN coalesce(state_column, ''''''), coalesce(country_column, '''''') into street_column, city_column, state_column, country_column; - FOR cartodb_id_batch in 0..(batches_n - 1) - LOOP + IF batches_n > 0 THEN + FOR cartodb_id_batch in 0..(batches_n - 1) + LOOP - EXECUTE format( - 'WITH geocoding_data as (' || - ' SELECT ' || - ' json_build_object(''id'', cartodb_id, ''address'', %s, ''city'', %s, ''state'', %s, ''country'', %s) as data , ' || - ' floor((cartodb_id-1)::float/$1) as batch' || - ' FROM (%s) _x' || - ') ' || - 'INSERT INTO %s SELECT (cdb_dataservices_client._cdb_bulk_geocode_street_point(jsonb_agg(data))).* ' || - 'FROM geocoding_data ' || - 'WHERE batch = $2', street_column, city_column, state_column, country_column, query, temp_table_name) - USING batch_size, cartodb_id_batch; + EXECUTE format( + 'WITH geocoding_data as (' || + ' SELECT ' || + ' json_build_object(''id'', cartodb_id, ''address'', %s, ''city'', %s, ''state'', %s, ''country'', %s) as data , ' || + ' floor((cartodb_id-1)::float/$1) as batch' || + ' FROM (%s) _x' || + ') ' || + 'INSERT INTO %s SELECT (cdb_dataservices_client._cdb_bulk_geocode_street_point(jsonb_agg(data))).* ' || + 'FROM geocoding_data ' || + 'WHERE batch = $2', street_column, city_column, state_column, country_column, query, temp_table_name) + USING batch_size, cartodb_id_batch; - GET DIAGNOSTICS current_row_count = ROW_COUNT; - RAISE DEBUG 'Batch % --> %', cartodb_id_batch, current_row_count; + GET DIAGNOSTICS current_row_count = ROW_COUNT; + RAISE DEBUG 'Batch % --> %', cartodb_id_batch, current_row_count; - END LOOP; + END LOOP; + END IF; RETURN QUERY EXECUTE 'SELECT * FROM ' || quote_ident(temp_table_name); END; diff --git a/client/sql/16_custom_types.sql b/client/sql/16_custom_types.sql index 1b74cac..7458ffa 100644 --- a/client/sql/16_custom_types.sql +++ b/client/sql/16_custom_types.sql @@ -39,5 +39,6 @@ CREATE TYPE cdb_dataservices_client.service_quota_info AS ( monthly_quota NUMERIC, used_quota NUMERIC, soft_limit BOOLEAN, - provider TEXT + provider TEXT, + max_batch_size NUMERIC ); diff --git a/client/sql/21_bulk_geocoding_functions.sql b/client/sql/21_bulk_geocoding_functions.sql index 2e14965..f9ba824 100644 --- a/client/sql/21_bulk_geocoding_functions.sql +++ b/client/sql/21_bulk_geocoding_functions.sql @@ -1,23 +1,34 @@ CREATE OR REPLACE FUNCTION cdb_dataservices_client.cdb_bulk_geocode_street_point (query text, - street_column text, city_column text default null, state_column text default null, country_column text default null, batch_size integer DEFAULT 50) + street_column text, city_column text default null, state_column text default null, country_column text default null, batch_size integer DEFAULT NULL) RETURNS SETOF cdb_dataservices_client.geocoding AS $$ DECLARE query_row_count integer; enough_quota boolean; remaining_quota integer; + max_batch_size integer; cartodb_id_batch integer; batches_n integer; DEFAULT_BATCH_SIZE CONSTANT numeric := 100; - MAX_BATCH_SIZE CONSTANT numeric := 10000; + MAX_SAFE_BATCH_SIZE CONSTANT numeric := 5000; current_row_count integer ; temp_table_name text; BEGIN + SELECT csqi.monthly_quota - csqi.used_quota AS remaining_quota, csqi.max_batch_size + INTO remaining_quota, max_batch_size + FROM cdb_dataservices_client.cdb_service_quota_info() csqi + WHERE service = 'hires_geocoder'; + RAISE DEBUG 'remaining_quota: %; max_batch_size: %', remaining_quota, max_batch_size; + IF batch_size IS NULL THEN - RAISE EXCEPTION 'batch_size can''t be null'; - ELSIF batch_size > MAX_BATCH_SIZE THEN - RAISE EXCEPTION 'batch_size must be lower than %', MAX_BATCH_SIZE + 1; + batch_size := max_batch_size; + ELSIF batch_size > max_batch_size THEN + RAISE EXCEPTION 'batch_size must be lower than %', max_batch_size + 1; + END IF; + + IF batch_size > MAX_SAFE_BATCH_SIZE THEN + batch_size := MAX_SAFE_BATCH_SIZE; END IF; EXECUTE format('SELECT COUNT(1) from (%s) _x', query) INTO query_row_count; @@ -25,11 +36,7 @@ BEGIN RAISE DEBUG 'cdb_bulk_geocode_street_point --> query_row_count: %; query: %; country: %; state: %; city: %; street: %', query_row_count, query, country_column, state_column, city_column, street_column; SELECT cdb_dataservices_client.cdb_enough_quota('hires_geocoder', query_row_count) INTO enough_quota; - IF enough_quota IS NOT NULL AND NOT enough_quota THEN - SELECT csqi.monthly_quota - csqi.used_quota AS remaining_quota - INTO remaining_quota - FROM cdb_dataservices_client.cdb_service_quota_info() csqi - WHERE service = 'hires_geocoder'; + IF remaining_quota < query_row_count THEN RAISE EXCEPTION 'Remaining quota: %. Estimated cost: %', remaining_quota, query_row_count; END IF; @@ -48,25 +55,27 @@ BEGIN coalesce(state_column, ''''''), coalesce(country_column, '''''') into street_column, city_column, state_column, country_column; - FOR cartodb_id_batch in 0..(batches_n - 1) - LOOP + IF batches_n > 0 THEN + FOR cartodb_id_batch in 0..(batches_n - 1) + LOOP - EXECUTE format( - 'WITH geocoding_data as (' || - ' SELECT ' || - ' json_build_object(''id'', cartodb_id, ''address'', %s, ''city'', %s, ''state'', %s, ''country'', %s) as data , ' || - ' floor((cartodb_id-1)::float/$1) as batch' || - ' FROM (%s) _x' || - ') ' || - 'INSERT INTO %s SELECT (cdb_dataservices_client._cdb_bulk_geocode_street_point(jsonb_agg(data))).* ' || - 'FROM geocoding_data ' || - 'WHERE batch = $2', street_column, city_column, state_column, country_column, query, temp_table_name) - USING batch_size, cartodb_id_batch; + EXECUTE format( + 'WITH geocoding_data as (' || + ' SELECT ' || + ' json_build_object(''id'', cartodb_id, ''address'', %s, ''city'', %s, ''state'', %s, ''country'', %s) as data , ' || + ' floor((cartodb_id-1)::float/$1) as batch' || + ' FROM (%s) _x' || + ') ' || + 'INSERT INTO %s SELECT (cdb_dataservices_client._cdb_bulk_geocode_street_point(jsonb_agg(data))).* ' || + 'FROM geocoding_data ' || + 'WHERE batch = $2', street_column, city_column, state_column, country_column, query, temp_table_name) + USING batch_size, cartodb_id_batch; - GET DIAGNOSTICS current_row_count = ROW_COUNT; - RAISE DEBUG 'Batch % --> %', cartodb_id_batch, current_row_count; + GET DIAGNOSTICS current_row_count = ROW_COUNT; + RAISE DEBUG 'Batch % --> %', cartodb_id_batch, current_row_count; - END LOOP; + END LOOP; + END IF; RETURN QUERY EXECUTE 'SELECT * FROM ' || quote_ident(temp_table_name); END; diff --git a/client/test/expected/21_bulk_geocoding_functions_test.out b/client/test/expected/21_bulk_geocoding_functions_test.out index 7ca70db..d2f268b 100644 --- a/client/test/expected/21_bulk_geocoding_functions_test.out +++ b/client/test/expected/21_bulk_geocoding_functions_test.out @@ -1,21 +1,21 @@ \set VERBOSITY terse --- Test bulk size mandatory -SELECT cdb_dataservices_client.cdb_bulk_geocode_street_point('select 1 as cartodb_id', '''Valladolid, Spain''', null, null, null, null); -ERROR: batch_size can't be null --- Test quota check by mocking quota 0 +ALTER FUNCTION cdb_dataservices_client.cdb_service_quota_info() RENAME TO cdb_service_quota_info_mocked; +CREATE FUNCTION cdb_dataservices_client.cdb_service_quota_info () +RETURNS SETOF cdb_dataservices_client.service_quota_info AS $$ + SELECT 'hires_geocoder'::cdb_dataservices_client.service_type AS service, 0::NUMERIC AS monthly_quota, 0::NUMERIC AS used_quota, FALSE AS soft_limit, 'google' AS provider, 1::NUMERIC AS max_batch_size; +$$ LANGUAGE SQL; ALTER FUNCTION cdb_dataservices_client.cdb_enough_quota (service TEXT ,input_size NUMERIC) RENAME TO cdb_enough_quota_mocked; CREATE FUNCTION cdb_dataservices_client.cdb_enough_quota (service TEXT ,input_size NUMERIC) RETURNS BOOLEAN as $$ SELECT FALSE; $$ LANGUAGE SQL; -ALTER FUNCTION cdb_dataservices_client.cdb_service_quota_info() RENAME TO cdb_service_quota_info_mocked; -CREATE FUNCTION cdb_dataservices_client.cdb_service_quota_info () -RETURNS SETOF cdb_dataservices_client.service_quota_info AS $$ - SELECT 'hires_geocoder'::cdb_dataservices_client.service_type AS service, 0::NUMERIC AS monthly_quota, 0::NUMERIC AS used_quota, FALSE AS soft_limit, 'google' AS provider; -$$ LANGUAGE SQL; +-- Test bulk size not mandatory (it will get the optimal) +SELECT cdb_dataservices_client.cdb_bulk_geocode_street_point('select 1 as cartodb_id', '''Valladolid, Spain''', null, null, null, null); +ERROR: Remaining quota: 0. Estimated cost: 1 +-- Test quota check by mocking quota 0 SELECT cdb_dataservices_client.cdb_bulk_geocode_street_point('select 1 as cartodb_id', '''Valladolid, Spain'''); ERROR: Remaining quota: 0. Estimated cost: 1 DROP FUNCTION cdb_dataservices_client.cdb_service_quota_info; DROP FUNCTION cdb_dataservices_client.cdb_enough_quota; -ALTER FUNCTION cdb_dataservices_client.cdb_service_quota_info_mocked() RENAME TO cdb_service_quota_info; ALTER FUNCTION cdb_dataservices_client.cdb_enough_quota_mocked (service TEXT ,input_size NUMERIC) RENAME TO cdb_enough_quota; +ALTER FUNCTION cdb_dataservices_client.cdb_service_quota_info_mocked() RENAME TO cdb_service_quota_info; diff --git a/client/test/sql/21_bulk_geocoding_functions_test.sql b/client/test/sql/21_bulk_geocoding_functions_test.sql index 5433bd7..d17f470 100644 --- a/client/test/sql/21_bulk_geocoding_functions_test.sql +++ b/client/test/sql/21_bulk_geocoding_functions_test.sql @@ -1,26 +1,26 @@ \set VERBOSITY terse --- Test bulk size mandatory -SELECT cdb_dataservices_client.cdb_bulk_geocode_street_point('select 1 as cartodb_id', '''Valladolid, Spain''', null, null, null, null); +ALTER FUNCTION cdb_dataservices_client.cdb_service_quota_info() RENAME TO cdb_service_quota_info_mocked; +CREATE FUNCTION cdb_dataservices_client.cdb_service_quota_info () +RETURNS SETOF cdb_dataservices_client.service_quota_info AS $$ + SELECT 'hires_geocoder'::cdb_dataservices_client.service_type AS service, 0::NUMERIC AS monthly_quota, 0::NUMERIC AS used_quota, FALSE AS soft_limit, 'google' AS provider, 1::NUMERIC AS max_batch_size; +$$ LANGUAGE SQL; --- Test quota check by mocking quota 0 ALTER FUNCTION cdb_dataservices_client.cdb_enough_quota (service TEXT ,input_size NUMERIC) RENAME TO cdb_enough_quota_mocked; CREATE FUNCTION cdb_dataservices_client.cdb_enough_quota (service TEXT ,input_size NUMERIC) RETURNS BOOLEAN as $$ SELECT FALSE; $$ LANGUAGE SQL; -ALTER FUNCTION cdb_dataservices_client.cdb_service_quota_info() RENAME TO cdb_service_quota_info_mocked; -CREATE FUNCTION cdb_dataservices_client.cdb_service_quota_info () -RETURNS SETOF cdb_dataservices_client.service_quota_info AS $$ - SELECT 'hires_geocoder'::cdb_dataservices_client.service_type AS service, 0::NUMERIC AS monthly_quota, 0::NUMERIC AS used_quota, FALSE AS soft_limit, 'google' AS provider; -$$ LANGUAGE SQL; +-- Test bulk size not mandatory (it will get the optimal) +SELECT cdb_dataservices_client.cdb_bulk_geocode_street_point('select 1 as cartodb_id', '''Valladolid, Spain''', null, null, null, null); +-- Test quota check by mocking quota 0 SELECT cdb_dataservices_client.cdb_bulk_geocode_street_point('select 1 as cartodb_id', '''Valladolid, Spain'''); DROP FUNCTION cdb_dataservices_client.cdb_service_quota_info; DROP FUNCTION cdb_dataservices_client.cdb_enough_quota; -ALTER FUNCTION cdb_dataservices_client.cdb_service_quota_info_mocked() RENAME TO cdb_service_quota_info; ALTER FUNCTION cdb_dataservices_client.cdb_enough_quota_mocked (service TEXT ,input_size NUMERIC) RENAME TO cdb_enough_quota; +ALTER FUNCTION cdb_dataservices_client.cdb_service_quota_info_mocked() RENAME TO cdb_service_quota_info; diff --git a/server/extension/cdb_dataservices_server--0.32.0.sql b/server/extension/cdb_dataservices_server--0.32.0.sql index a013f3c..dc82322 100644 --- a/server/extension/cdb_dataservices_server--0.32.0.sql +++ b/server/extension/cdb_dataservices_server--0.32.0.sql @@ -1861,7 +1861,8 @@ BEGIN monthly_quota NUMERIC, used_quota NUMERIC, soft_limit BOOLEAN, - provider TEXT + provider TEXT, + max_batch_size NUMERIC ); END IF; END $$; @@ -1872,6 +1873,7 @@ CREATE OR REPLACE FUNCTION cdb_dataservices_server.cdb_service_quota_info( RETURNS SETOF cdb_dataservices_server.service_quota_info AS $$ from cartodb_services.metrics.user import UserMetricsService from datetime import date + from cartodb_services.bulk_geocoders import BATCH_GEOCODER_CLASS_BY_PROVIDER plpy.execute("SELECT cdb_dataservices_server._connect_to_redis('{0}')".format(username)) redis_conn = GD["redis_connection_{0}".format(username)]['redis_metrics_connection'] @@ -1889,7 +1891,7 @@ RETURNS SETOF cdb_dataservices_server.service_quota_info AS $$ used_quota = user_service.used_quota(user_isolines_config.service_type, today) soft_limit = user_isolines_config.soft_isolines_limit provider = user_isolines_config.provider - ret += [[service, monthly_quota, used_quota, soft_limit, provider]] + ret += [[service, monthly_quota, used_quota, soft_limit, provider, 1]] #-- Hires Geocoder service = 'hires_geocoder' @@ -1901,7 +1903,12 @@ RETURNS SETOF cdb_dataservices_server.service_quota_info AS $$ used_quota = user_service.used_quota(user_geocoder_config.service_type, today) soft_limit = user_geocoder_config.soft_geocoding_limit provider = user_geocoder_config.provider - ret += [[service, monthly_quota, used_quota, soft_limit, provider]] + batch_geocoder_class = BATCH_GEOCODER_CLASS_BY_PROVIDER.get(provider, None) + if batch_geocoder_class and hasattr(batch_geocoder_class, 'MAX_BATCH_SIZE'): + max_batch_size = batch_geocoder_class.MAX_BATCH_SIZE + else: + max_batch_size = 1 + ret += [[service, monthly_quota, used_quota, soft_limit, provider, max_batch_size]] #-- Routing service = 'routing' @@ -1913,7 +1920,7 @@ RETURNS SETOF cdb_dataservices_server.service_quota_info AS $$ used_quota = user_service.used_quota(user_routing_config.service_type, today) soft_limit = user_routing_config.soft_limit provider = user_routing_config.provider - ret += [[service, monthly_quota, used_quota, soft_limit, provider]] + ret += [[service, monthly_quota, used_quota, soft_limit, provider, 1]] #-- Observatory service = 'observatory' @@ -1925,7 +1932,7 @@ RETURNS SETOF cdb_dataservices_server.service_quota_info AS $$ used_quota = user_service.used_quota(user_obs_config.service_type, today) soft_limit = user_obs_config.soft_limit provider = user_obs_config.provider - ret += [[service, monthly_quota, used_quota, soft_limit, provider]] + ret += [[service, monthly_quota, used_quota, soft_limit, provider, 1]] return ret $$ LANGUAGE plpythonu STABLE PARALLEL RESTRICTED; diff --git a/server/extension/sql/200_quotas.sql b/server/extension/sql/200_quotas.sql index 28ee724..6f34083 100644 --- a/server/extension/sql/200_quotas.sql +++ b/server/extension/sql/200_quotas.sql @@ -22,7 +22,8 @@ BEGIN monthly_quota NUMERIC, used_quota NUMERIC, soft_limit BOOLEAN, - provider TEXT + provider TEXT, + max_batch_size NUMERIC ); END IF; END $$; @@ -33,6 +34,7 @@ CREATE OR REPLACE FUNCTION cdb_dataservices_server.cdb_service_quota_info( RETURNS SETOF cdb_dataservices_server.service_quota_info AS $$ from cartodb_services.metrics.user import UserMetricsService from datetime import date + from cartodb_services.bulk_geocoders import BATCH_GEOCODER_CLASS_BY_PROVIDER plpy.execute("SELECT cdb_dataservices_server._connect_to_redis('{0}')".format(username)) redis_conn = GD["redis_connection_{0}".format(username)]['redis_metrics_connection'] @@ -50,7 +52,7 @@ RETURNS SETOF cdb_dataservices_server.service_quota_info AS $$ used_quota = user_service.used_quota(user_isolines_config.service_type, today) soft_limit = user_isolines_config.soft_isolines_limit provider = user_isolines_config.provider - ret += [[service, monthly_quota, used_quota, soft_limit, provider]] + ret += [[service, monthly_quota, used_quota, soft_limit, provider, 1]] #-- Hires Geocoder service = 'hires_geocoder' @@ -62,7 +64,12 @@ RETURNS SETOF cdb_dataservices_server.service_quota_info AS $$ used_quota = user_service.used_quota(user_geocoder_config.service_type, today) soft_limit = user_geocoder_config.soft_geocoding_limit provider = user_geocoder_config.provider - ret += [[service, monthly_quota, used_quota, soft_limit, provider]] + batch_geocoder_class = BATCH_GEOCODER_CLASS_BY_PROVIDER.get(provider, None) + if batch_geocoder_class and hasattr(batch_geocoder_class, 'MAX_BATCH_SIZE'): + max_batch_size = batch_geocoder_class.MAX_BATCH_SIZE + else: + max_batch_size = 1 + ret += [[service, monthly_quota, used_quota, soft_limit, provider, max_batch_size]] #-- Routing service = 'routing' @@ -74,7 +81,7 @@ RETURNS SETOF cdb_dataservices_server.service_quota_info AS $$ used_quota = user_service.used_quota(user_routing_config.service_type, today) soft_limit = user_routing_config.soft_limit provider = user_routing_config.provider - ret += [[service, monthly_quota, used_quota, soft_limit, provider]] + ret += [[service, monthly_quota, used_quota, soft_limit, provider, 1]] #-- Observatory service = 'observatory' @@ -86,7 +93,7 @@ RETURNS SETOF cdb_dataservices_server.service_quota_info AS $$ used_quota = user_service.used_quota(user_obs_config.service_type, today) soft_limit = user_obs_config.soft_limit provider = user_obs_config.provider - ret += [[service, monthly_quota, used_quota, soft_limit, provider]] + ret += [[service, monthly_quota, used_quota, soft_limit, provider, 1]] return ret $$ LANGUAGE plpythonu STABLE PARALLEL RESTRICTED; diff --git a/server/lib/python/cartodb_services/cartodb_services/bulk_geocoders.py b/server/lib/python/cartodb_services/cartodb_services/bulk_geocoders.py new file mode 100644 index 0000000..6dfd555 --- /dev/null +++ b/server/lib/python/cartodb_services/cartodb_services/bulk_geocoders.py @@ -0,0 +1,11 @@ +from google import GoogleMapsBulkGeocoder +from here import HereMapsBulkGeocoder +from tomtom import TomTomBulkGeocoder +from mapbox import MapboxBulkGeocoder + +BATCH_GEOCODER_CLASS_BY_PROVIDER = { + 'google': GoogleMapsBulkGeocoder, + 'heremaps': HereMapsBulkGeocoder, + 'tomtom': TomTomBulkGeocoder, + 'mapbox': MapboxBulkGeocoder +} diff --git a/server/lib/python/cartodb_services/cartodb_services/here/bulk_geocoder.py b/server/lib/python/cartodb_services/cartodb_services/here/bulk_geocoder.py index e567e2e..3724bcd 100644 --- a/server/lib/python/cartodb_services/cartodb_services/here/bulk_geocoder.py +++ b/server/lib/python/cartodb_services/cartodb_services/here/bulk_geocoder.py @@ -16,7 +16,7 @@ HereJobStatus = namedtuple('HereJobStatus', 'total_count processed_count status' class HereMapsBulkGeocoder(HereMapsGeocoder, StreetPointBulkGeocoder): MAX_BATCH_SIZE = 1000000 # From the docs - MIN_BATCHED_SEARCH = 100 # Under this, serial will be used + MIN_BATCHED_SEARCH = 1000 # Under this, serial will be used BATCH_URL = 'https://batch.geocoder.cit.api.here.com/6.2/jobs' # https://developer.here.com/documentation/batch-geocoder/topics/read-batch-request-output.html META_COLS = ['relevance', 'matchType', 'matchCode', 'matchLevel', 'matchQualityStreet'] @@ -55,14 +55,17 @@ class HereMapsBulkGeocoder(HereMapsGeocoder, StreetPointBulkGeocoder): while True: job_info = self._job_status(request_id) if job_info.processed_count == last_processed: + self._logger.debug('--> no progress ({})'.format(last_processed)) stalled_retries += 1 if stalled_retries > self.MAX_STALLED_RETRIES: raise Exception('Too many retries for job {}'.format(request_id)) else: + self._logger.debug('--> progress ({} != {})'.format(job_info.processed_count, last_processed)) stalled_retries = 0 last_processed = job_info.processed_count - self._logger.debug('--> Job poll check: {}'.format(job_info)) + self._logger.debug('--> Job poll check ({}): {}'.format( + stalled_retries, job_info)) if job_info.status in self.JOB_FINAL_STATES: break else: @@ -95,7 +98,7 @@ class HereMapsBulkGeocoder(HereMapsGeocoder, StreetPointBulkGeocoder): request_params.update({ 'gen': 8, 'action': 'run', - #'mailto': 'juanignaciosl@carto.com', + # 'mailto': 'juanignaciosl@carto.com', 'header': 'true', 'inDelim': '|', 'outDelim': '|', @@ -121,8 +124,8 @@ class HereMapsBulkGeocoder(HereMapsGeocoder, StreetPointBulkGeocoder): timeout=(self.connect_timeout, self.read_timeout)) polling_root = ET.fromstring(polling_r.text) return HereJobStatus( - total_count=polling_root.find('./Response/TotalCount').text, - processed_count=polling_root.find('./Response/ProcessedCount').text, + total_count=int(polling_root.find('./Response/TotalCount').text), + processed_count=int(polling_root.find('./Response/ProcessedCount').text), status=polling_root.find('./Response/Status').text) def _download_results(self, job_id): diff --git a/test/integration/test_street_functions.py b/test/integration/test_street_functions.py index d66df30..52ffbe0 100644 --- a/test/integration/test_street_functions.py +++ b/test/integration/test_street_functions.py @@ -259,7 +259,8 @@ class TestBulkStreetFunctions(TestStreetFunctionsSetUp): """ Useful just to test a good batch size """ - n = 50 + n = 110 + batch_size = 'NULL' # NULL for optimal streets = [] for i in range(0, n): streets.append('{{"cartodb_id": {}, "address": "{} Yonge Street, ' \ @@ -270,7 +271,7 @@ class TestBulkStreetFunctions(TestStreetFunctionsSetUp): "'select * from jsonb_to_recordset(''[" \ "{}" \ "]''::jsonb) as (cartodb_id integer, address text)', " \ - "'address', null, null, null, {})".format(','.join(streets), n) + "'address', null, null, null, {})".format(','.join(streets), batch_size) response = self._run_authenticated(query) assert_equal(n - 1, len(response['rows'])) @@ -307,6 +308,20 @@ class TestBulkStreetFunctions(TestStreetFunctionsSetUp): assert_close_enough(self._x_y_by_cartodb_id(response)[1], self.fixture_points['Plaza EspaƱa 1, Barcelona']) + def _test_known_table(self): + subquery = 'select * from known_table where cartodb_id < 1100' + subquery_count = 'select count(1) from ({}) _x'.format(subquery) + count = self._run_authenticated(subquery_count)['rows'][0]['count'] + + query = "select cartodb_id, st_x(the_geom), st_y(the_geom) " \ + "FROM cdb_dataservices_client.cdb_bulk_geocode_street_point(" \ + "'{}' " \ + ", 'street', 'city', NULL, 'country')".format(subquery) + response = self._run_authenticated(query) + assert_equal(len(response['rows']), count) + assert_not_equal(response['rows'][0]['st_x'], None) + + def _run_authenticated(self, query): authenticated_query = "{}&api_key={}".format(query, self.env_variables[