diff --git a/geocoder/admin1/extension/.gitignore b/geocoder/admin1/extension/.gitignore
new file mode 100644
index 0000000..e710f0e
--- /dev/null
+++ b/geocoder/admin1/extension/.gitignore
@@ -0,0 +1,3 @@
+results/
+regression.diffs
+regression.out
diff --git a/geocoder/admin1/extension/Makefile b/geocoder/admin1/extension/Makefile
new file mode 100644
index 0000000..b472643
--- /dev/null
+++ b/geocoder/admin1/extension/Makefile
@@ -0,0 +1,8 @@
+EXTENSION = cdb_geocoder_admin1
+DATA = cdb_geocoder_admin1--0.0.1.sql
+REGRESS = cdb_geocoder_admin1_test
+
+# postgres build stuff
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
diff --git a/geocoder/admin1/extension/README.md b/geocoder/admin1/extension/README.md
new file mode 100644
index 0000000..ac99e36
--- /dev/null
+++ b/geocoder/admin1/extension/README.md
@@ -0,0 +1,36 @@
+# CartoDB admin1 geocoder extension
+Postgres extension for the CartoDB admin1 geocoder. It is meant to contain the functions and related objects needed to geocode by admin1 regions. It is not meant to contain the actual data used to geocode them.
+
+## Dependencies
+This extension is designed to be used on top of the CartoDB platform, so a CartoDB user is required in order to install it.
+
+The following is a non-comprehensive list of dependencies:
+
+- Postgres 9.3+
+- PostGIS extension
+- Schema triggers extension
+- CartoDB extension
+
+## Installation into the db cluster
+This requires root privileges:
+```
+sudo make all install
+```
+
+## Execute tests
+```
+PGUSER=postgres make installcheck
+```
+
+## Install onto a user's database
+```
+psql -U development_cartodb_user_fe3b850a-01c0-48f9-8a26-a82f09e9b53f cartodb_dev_user_fe3b850a-01c0-48f9-8a26-a82f09e9b53f_db
+```
+
+and then:
+
+```sql
+CREATE EXTENSION cdb_geocoder_admin1;
+```
+
+Creating the extension in the user's database does not require special privileges. It can even be created from the SQL API.
diff --git a/geocoder/admin1/extension/cdb_geocoder_admin1--0.0.1.sql b/geocoder/admin1/extension/cdb_geocoder_admin1--0.0.1.sql
new file mode 100644
index 0000000..617beee
--- /dev/null
+++ b/geocoder/admin1/extension/cdb_geocoder_admin1--0.0.1.sql
@@ -0,0 +1,348 @@
+-- Complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION cdb_geocoder_admin1" to load this file. \quit
+
+-- Response types for admin1 geocoder
+-- TODO: check if the types exist already in the db
+
+-- One row per input name: the name as given (q), the matched polygon (geom,
+-- NULL when nothing matched) and whether a polygon was found (success).
+CREATE TYPE geocode_admin_v1 AS (q TEXT, geom GEOMETRY, success BOOLEAN);
+-- Same as geocode_admin_v1 plus the country string paired with the name (c).
+CREATE TYPE geocode_admin_country_v1 AS (q TEXT, c TEXT, geom GEOMETRY, success BOOLEAN);
+
+
+-- Public API functions --
+--- Geocoding function ---
+-- TODO: deal with permissions
+-- NOTE(review): the functions below are SECURITY DEFINER and reference the
+-- support tables unqualified without pinning search_path. PostgreSQL's
+-- guidance is to set search_path on such functions; that cannot simply be
+-- hard-coded here while the control file declares the extension relocatable.
+
+-- Geocode admin1 region names without a country hint.
+--   name: region names. Each one is lower-cased, has '.' replaced by ' ' and
+--         is trimmed before being looked up in the synonyms arrays.
+-- Returns one row per input name; among polygons sharing a synonym the first
+-- one by descending frequency is used.
+CREATE OR REPLACE FUNCTION geocode_admin1_polygons(name text[]) RETURNS SETOF geocode_admin_v1
+    LANGUAGE plpgsql SECURITY DEFINER
+    AS $$
+BEGIN
+    RETURN QUERY
+    SELECT
+        v.q,
+        v.geom,
+        v.geom IS NOT NULL AS success
+    FROM (
+        SELECT
+            d.q,
+            (
+                SELECT gpp.the_geom
+                FROM global_province_polygons AS gpp
+                WHERE d.c = ANY (gpp.synonyms)
+                ORDER BY gpp.frequency DESC
+                LIMIT 1
+            ) AS geom
+        FROM (
+            SELECT
+                trim(replace(lower(u.q), '.', ' ')) AS c,
+                u.q
+            FROM unnest(name) AS u(q)
+        ) AS d
+    ) AS v;
+    RETURN;
+END
+$$;
+
+
+-- Geocode admin1 region names restricted to a single country.
+--   name:         region names, normalized as in the one-argument variant.
+--   inputcountry: country string; it is lower-cased and matched against
+--                 country_decoder.synonyms to obtain an iso3 code.
+-- Returns one row per input name; success is false when either the country
+-- or the region cannot be resolved.
+CREATE OR REPLACE FUNCTION geocode_admin1_polygons(name text[], inputcountry text) RETURNS SETOF geocode_admin_v1
+    LANGUAGE plpgsql SECURITY DEFINER
+    AS $$
+BEGIN
+    RETURN QUERY
+    WITH p AS (
+        SELECT
+            trim(replace(lower(u.q), '.', ' ')) AS c,
+            u.q,
+            (
+                -- LIMIT 1 keeps this a scalar subquery even when a synonym is
+                -- listed for more than one country (which would otherwise
+                -- abort the call); ordering by users makes the pick stable.
+                SELECT cd.iso3
+                FROM country_decoder AS cd
+                WHERE lower(inputcountry) = ANY (cd.synonyms)
+                ORDER BY cd.users DESC NULLS LAST
+                LIMIT 1
+            ) AS i
+        FROM unnest(name) AS u(q)
+    )
+    SELECT
+        n.q,
+        n.geom,
+        n.geom IS NOT NULL AS success
+    FROM (
+        SELECT
+            p.q,
+            (
+                SELECT gpp.the_geom
+                FROM global_province_polygons AS gpp
+                WHERE p.c = ANY (gpp.synonyms)
+                    AND gpp.iso3 = p.i
+                -- To calculate frequency, I simply counted the number of users
+                -- we had signed up in each country. Countries with more users,
+                -- we favor higher in the geocoder :)
+                ORDER BY gpp.frequency DESC
+                LIMIT 1
+            ) AS geom
+        FROM p
+    ) AS n;
+    RETURN;
+END
+$$;
+
+
+-- Geocode admin1 region names, each paired with its own country.
+--   names:   region names, normalized as in the one-argument variant.
+--   country: country string for the name at the same position.
+-- Names whose country is NULL, or that have no counterpart because country is
+-- shorter than names, are geocoded without a country restriction and are
+-- returned first with c = NULL. Surplus country entries are ignored.
+CREATE OR REPLACE FUNCTION geocode_admin1_polygons(names text[], country text[]) RETURNS SETOF geocode_admin_country_v1
+    LANGUAGE plpgsql SECURITY DEFINER
+    AS $$
+DECLARE
+    nans TEXT[];
+BEGIN
+    -- Pair names and countries by position with explicit subscripts. Two
+    -- unnest() calls in one SELECT list only line up when both arrays have the
+    -- same length, and what happens otherwise depends on the server version.
+    SELECT array_agg(g.q ORDER BY g.k) INTO nans
+    FROM (
+        SELECT
+            s.k,
+            names[array_lower(names, 1) + s.k] AS q,
+            country[array_lower(country, 1) + s.k] AS c
+        FROM generate_series(0, coalesce(array_length(names, 1), 0) - 1) AS s(k)
+    ) AS g
+    WHERE g.c IS NULL;
+
+    IF 0 < array_length(nans, 1) THEN
+        RETURN QUERY
+        SELECT g.q, NULL::text AS c, g.geom, g.success
+        FROM geocode_admin1_polygons(nans) AS g;
+    END IF;
+
+    RETURN QUERY
+    WITH pairs AS (
+        SELECT
+            s.k,
+            names[array_lower(names, 1) + s.k] AS q,
+            country[array_lower(country, 1) + s.k] AS c
+        FROM generate_series(0, coalesce(array_length(names, 1), 0) - 1) AS s(k)
+    ),
+    p AS (
+        SELECT
+            pairs.k,
+            trim(replace(lower(pairs.q), '.', ' ')) AS norm,
+            pairs.q,
+            pairs.c,
+            (
+                -- LIMIT 1 keeps this scalar when a synonym is listed for more
+                -- than one country; ordering by users makes the pick stable.
+                SELECT cd.iso3
+                FROM country_decoder AS cd
+                WHERE lower(pairs.c) = ANY (cd.synonyms)
+                ORDER BY cd.users DESC NULLS LAST
+                LIMIT 1
+            ) AS i
+        FROM pairs
+        WHERE pairs.c IS NOT NULL
+    )
+    SELECT
+        n.q,
+        n.c,
+        n.geom,
+        n.geom IS NOT NULL AS success
+    FROM (
+        SELECT
+            p.k,
+            p.q,
+            p.c,
+            (
+                SELECT gpp.the_geom
+                FROM global_province_polygons AS gpp
+                WHERE p.norm = ANY (gpp.synonyms)
+                    AND gpp.iso3 = p.i
+                -- To calculate frequency, I simply counted the number of users
+                -- we had signed up in each country. Countries with more users,
+                -- we favor higher in the geocoder :)
+                ORDER BY gpp.frequency DESC
+                LIMIT 1
+            ) AS geom
+        FROM p
+    ) AS n
+    ORDER BY n.k;
+    RETURN;
+END
+$$;
+
+--------------------------------------------------------------------------------
+
+-- Support tables
+
+-- Country lookup: the geocoding functions match a lower-cased country string
+-- against synonyms to obtain iso3, ranking candidates by users.
+CREATE TABLE country_decoder (
+    name text,
+    nativename text,
+    tld text,
+    iso2 text,
+    ccn3 text,
+    iso3 text,
+    currency text,
+    callingcode text,
+    capital text,
+    altspellings text,
+    relevance text,
+    region text,
+    subregion text,
+    language text,
+    languagescodes text,
+    translations text,
+    population text,
+    latlng text,
+    demonym text,
+    borders text,
+    the_geom geometry(Geometry,4326),
+    cartodb_id integer NOT NULL,
+    created_at timestamp with time zone DEFAULT now() NOT NULL,
+    updated_at timestamp with time zone DEFAULT now() NOT NULL,
+    the_geom_webmercator geometry(Geometry,3857),
+    synbu text[],
+    synonyms text[],
+    users double precision
+);
+
+CREATE SEQUENCE country_decoder_cartodb_id_seq
+    START WITH 1
+    INCREMENT BY 1
+    NO MINVALUE
+    NO MAXVALUE
+    CACHE 1;
+ALTER SEQUENCE country_decoder_cartodb_id_seq OWNED BY country_decoder.cartodb_id;
+ALTER TABLE ONLY country_decoder ALTER COLUMN cartodb_id SET DEFAULT nextval('country_decoder_cartodb_id_seq'::regclass);
+
+
+ALTER TABLE ONLY country_decoder
+    ADD CONSTRAINT country_decoder_cartodb_id_key UNIQUE (cartodb_id);
+ALTER TABLE ONLY country_decoder
+    ADD CONSTRAINT country_decoder_pkey PRIMARY KEY (cartodb_id);
+
+ALTER TABLE country_decoder CLUSTER ON country_decoder_pkey;
+
+
+CREATE INDEX country_decoder_the_geom_idx ON country_decoder USING gist (the_geom);
+CREATE INDEX country_decoder_the_geom_webmercator_idx ON country_decoder USING gist (the_geom_webmercator);
+
+
+CREATE TRIGGER track_updates AFTER INSERT OR DELETE OR UPDATE OR TRUNCATE ON country_decoder FOR EACH STATEMENT EXECUTE PROCEDURE cartodb.cdb_tablemetadata_trigger();
+CREATE TRIGGER update_the_geom_webmercator_trigger BEFORE INSERT OR UPDATE OF the_geom ON country_decoder FOR EACH ROW EXECUTE PROCEDURE cartodb._cdb_update_the_geom_webmercator();
+CREATE TRIGGER update_updated_at_trigger BEFORE UPDATE ON country_decoder FOR EACH ROW EXECUTE PROCEDURE cartodb._cdb_update_updated_at();
+
+
+-- Admin1 polygons: the geocoding functions match normalized names against
+-- synonyms, optionally filter by iso3, and rank candidates by frequency.
+CREATE TABLE global_province_polygons (
+    the_geom geometry(Geometry,4326),
+    adm1_code text,
+    objectid_1 integer,
+    diss_me integer,
+    adm1_cod_1 text,
+    iso_3166_2 text,
+    wikipedia text,
+    iso_a2 text,
+    adm0_sr integer,
+    name text,
+    name_alt text,
+    name_local text,
+    type text,
+    type_en text,
+    code_local text,
+    code_hasc text,
+    note text,
+    hasc_maybe text,
+    region text,
+    region_cod text,
+    provnum_ne integer,
+    gadm_level integer,
+    check_me integer,
+    scalerank integer,
+    datarank integer,
+    abbrev text,
+    postal text,
+    area_sqkm double precision,
+    sameascity integer,
+    labelrank integer,
+    featurecla text,
+    name_len integer,
+    mapcolor9 integer,
+    mapcolor13 integer,
+    fips text,
+    fips_alt text,
+    woe_id integer,
+    woe_label text,
+    woe_name text,
+    latitude double precision,
+    longitude double precision,
+    sov_a3 text,
+    iso3 text,
+    adm0_label integer,
+    admin text,
+    geonunit text,
+    gu_a3 text,
+    gn_id integer,
+    gn_name text,
+    gns_id integer,
+    gns_name text,
+    gn_level integer,
+    gn_region text,
+    gn_a1_code text,
+    region_sub text,
+    sub_code text,
+    gns_level integer,
+    gns_lang text,
+    gns_adm1 text,
+    gns_region text,
+    cartodb_id integer NOT NULL,
+    created_at timestamp with time zone DEFAULT now() NOT NULL,
+    updated_at timestamp with time zone DEFAULT now() NOT NULL,
+    the_geom_webmercator geometry(Geometry,3857),
+    synonyms text[],
+    frequency double precision
+);
+
+CREATE SEQUENCE global_province_polygons_cartodb_id_seq
+    START WITH 1
+    INCREMENT BY 1
+    NO MINVALUE
+    NO MAXVALUE
+    CACHE 1;
+ALTER SEQUENCE global_province_polygons_cartodb_id_seq OWNED BY global_province_polygons.cartodb_id;
+ALTER TABLE ONLY global_province_polygons ALTER COLUMN cartodb_id SET DEFAULT nextval('global_province_polygons_cartodb_id_seq'::regclass);
+ALTER TABLE ONLY global_province_polygons
+    ADD CONSTRAINT global_province_polygons_cartodb_id_key UNIQUE (cartodb_id);
+ALTER TABLE ONLY global_province_polygons
+    ADD CONSTRAINT global_province_polygons_pkey PRIMARY KEY (cartodb_id);
+
+CREATE INDEX global_province_polygons_the_geom_idx ON global_province_polygons USING gist (the_geom);
+CREATE INDEX global_province_polygons_the_geom_webmercator_idx ON global_province_polygons USING gist (the_geom_webmercator);
+
+CREATE TRIGGER track_updates AFTER INSERT OR DELETE OR UPDATE OR TRUNCATE ON global_province_polygons FOR EACH STATEMENT EXECUTE PROCEDURE cartodb.cdb_tablemetadata_trigger();
+CREATE TRIGGER update_the_geom_webmercator_trigger BEFORE INSERT OR UPDATE OF the_geom ON global_province_polygons FOR EACH ROW EXECUTE PROCEDURE cartodb._cdb_update_the_geom_webmercator();
+CREATE TRIGGER update_updated_at_trigger BEFORE UPDATE ON global_province_polygons FOR EACH ROW EXECUTE PROCEDURE cartodb._cdb_update_updated_at();
diff --git a/geocoder/admin1/extension/cdb_geocoder_admin1.control b/geocoder/admin1/extension/cdb_geocoder_admin1.control
new file mode 100644
index 0000000..7621dcb
--- /dev/null
+++ b/geocoder/admin1/extension/cdb_geocoder_admin1.control
@@ -0,0 +1,6 @@
+# cdb geocoder admin1 extension
+comment = 'CartoDB admin1 internal geocoder'
+default_version = '0.0.1'
+relocatable = true
+requires = cartodb
+superuser = false
diff --git a/geocoder/admin1/extension/expected/cdb_geocoder_admin1_test.out b/geocoder/admin1/extension/expected/cdb_geocoder_admin1_test.out
new file mode 100644
index 0000000..963689f
--- /dev/null
+++ b/geocoder/admin1/extension/expected/cdb_geocoder_admin1_test.out
@@ -0,0 +1,45 @@
+CREATE EXTENSION postgis;
+CREATE EXTENSION schema_triggers;
+CREATE EXTENSION plpythonu;
+CREATE EXTENSION cartodb;
+CREATE EXTENSION cdb_geocoder_admin1;
+-- Check that the geocoding functions are callable, should return NULL
+SELECT 
(geocode_admin1_polygons(Array['TX','Cuidad Real', 'sevilla'])).*; + q | geom | success +-------------+------+--------- + TX | | f + Cuidad Real | | f + sevilla | | f +(3 rows) +SELECT (geocode_admin1_polygons(Array['NH', 'Vermont'], 'United States')).*; + q | geom | success +---------+------+--------- + NH | | f + Vermont | | f +(2 rows) +SELECT (geocode_admin1_polygons(Array['az', 'az'], Array['Ecuador', 'USA'])).*; + q | c | geom | success +----+---------+------+--------- + az | Ecuador | | f + az | USA | | f +(2 rows) + +-- Mock the varnish invalidation function +CREATE OR REPLACE FUNCTION public.cdb_invalidate_varnish(table_name text) RETURNS void AS $$ +BEGIN + RETURN; +END +$$ +LANGUAGE plpgsql; + +-- Add a few data to the sources +COPY global_province_polygons (the_geom,adm1_code, objectid_1, diss_me, adm1_cod_1, iso_3166_2, wikipedia, iso_a2, adm0_sr, name, name_alt, name_local, type, type_en, code_local, code_hasc, note, hasc_maybe, region, region_cod, provnum_ne, gadm_level, check_me, scalerank, datarank, abbrev, postal, area_sqkm, sameascity, labelrank, featurecla, name_len, mapcolor9, mapcolor13, fips, fips_alt, woe_id, woe_label, woe_name, latitude, longitude, sov_a3, iso3, adm0_label, admin, geonunit, gu_a3, gn_id, gn_name, gns_id, gns_name, gn_level, gn_region, gn_a1_code, region_sub, sub_code, gns_level, gns_lang, gns_adm1, gns_region, cartodb_id, created_at, updated_at, synonyms, frequency) FROM stdin; +0106000020E6100000010000000103000000010000000400000000000000005009C07FB86AC523AF4340FFFFFFFFFFEF0EC0C3CF8DC4FADB42400000000000E0F0BFC3CF8DC4FADB424000000000005009C07FB86AC523AF4340 SVN-1035 1473 1035 SVN-1035 SI- \N SI 1 Vipava \N \N Opcine Commune|Municipality \N SI.SP.VI \N \N Goriška \N 162 2 0 10 8 \N VI 0 -99 10 Admin-1 scale rank 6 2 12 SIE1 \N -55848385 \N \N 45.8271000000000015 13.9723000000000006 SVN SVN 2 Slovenia Slovenia SVN 3239075 Obcina Vipava 243467 Vipava, Obcina 1 \N SI.E1 \N \N 1 div SI05 \N 3700 2014-02-18 19:53:50.080158+00 
2014-04-01 15:18:54.094644+00 {vipava,"obcina vipava","vipava,obcina",si.e1,si.sp.vi,svn-1035,svn-1035} 5 +\. + +-- Check that the synonym function is callable, should return true +SELECT (geocode_admin1_polygons(Array['obcina vipava'])).success; + success +--------- + t +(1 row) \ No newline at end of file diff --git a/geocoder/admin1/extension/sql/cdb_geocoder_admin1_test.sql b/geocoder/admin1/extension/sql/cdb_geocoder_admin1_test.sql new file mode 100644 index 0000000..276317d --- /dev/null +++ b/geocoder/admin1/extension/sql/cdb_geocoder_admin1_test.sql @@ -0,0 +1,29 @@ +CREATE EXTENSION postgis; +CREATE EXTENSION schema_triggers; +CREATE EXTENSION plpythonu; +CREATE EXTENSION cartodb; +CREATE EXTENSION cdb_geocoder_admin1; + +-- Check that the geocoding functions are callable, should return NULL +SELECT (geocode_admin1_polygons(Array['TX','Cuidad Real', 'sevilla'])).*; +SELECT (geocode_admin1_polygons(Array['NH', 'Vermont'], 'United States')).*; +SELECT (geocode_admin1_polygons(Array['az', 'az'], Array['Ecuador', 'USA'])).*; + +-- Mock the varnish invalidation function +CREATE OR REPLACE FUNCTION public.cdb_invalidate_varnish(table_name text) RETURNS void AS $$ +BEGIN + RETURN; +END +$$ +LANGUAGE plpgsql; + +-- Add a few data to the sources +COPY global_province_polygons (the_geom, adm1_code, objectid_1, diss_me, adm1_cod_1, iso_3166_2, wikipedia, iso_a2, adm0_sr, name, name_alt, name_local, type, type_en, code_local, code_hasc, note, hasc_maybe, region, region_cod, provnum_ne, gadm_level, check_me, scalerank, datarank, abbrev, postal, area_sqkm, sameascity, labelrank, featurecla, name_len, mapcolor9, mapcolor13, fips, fips_alt, woe_id, woe_label, woe_name, latitude, longitude, sov_a3, iso3, adm0_label, admin, geonunit, gu_a3, gn_id, gn_name, gns_id, gns_name, gn_level, gn_region, gn_a1_code, region_sub, sub_code, gns_level, gns_lang, gns_adm1, gns_region, cartodb_id, created_at, updated_at, the_geom_webmercator, synonyms, frequency) FROM stdin; 
+0106000020E6100000010000000103000000010000001C000000701C468693172C405BCFC9FB20EB464010E0C037081E2C40F7FF710981E74640606B2301AC1A2C40B407F55702E64640200BD34801192C40A44EE24698E34640503B8E674E162C4084868DF1EEE246400025D4480D132C40B43156A168E14640701EE41F19082C400CFADB0CA1E24640E0F6B6C345062C401815CAFBB1E24640A00B4B5DB3022C40EF46C124BCE2464010E49A3E6E012C40CB84EFBA8CE24640105943E9D0FA2B40ECD210DDEBE14640008981CA8FF72B40071D4DCAEFE24640B05E6F7FDDDC2B40F3246E6C88E5464060FB4FFA19CE2B40A46CBD87B5E746404058ECF694C52B408CF11C3411EA4640FFF7D7E59FC82B40BCC34EE79AEB46400077C737A4D02B403C3FEF3AB8EC4640E04B53FA05D82B4090EF60F885ED464090EBDCBC6BDD2B40E048DFA985EE464000A5771CC0E92B40BC5B8EF1D4EF4640005BBBFD66F22B40208A181748F04640805CE1823AF92B402CA5060659F0464040FC6A45A0FE2B404B4FC4C11DF046408061640B9F022C4060B922EEBEEF4640FFD76EE28A072C405421278BBDEE4640E09BADEC49092C40FB7EEC1D7AED4640AF09D3481D0B2C40D86DDB0C8DEC4640701C468693172C405BCFC9FB20EB4640 SVN-1035 1473 1035 SVN-1035 SI- \N SI 1 Vipava \N \N Opcine Commune|Municipality \N SI.SP.VI \N + \N Goriška \N 162 2 0 10 8 \N VI 0 -99 10 Admin-1 scale rank 6 2 12 SIE1 \N -55848385 + \N \N 45.8271000000000015 13.9723000000000006 SVN SVN 2 Slovenia Slovenia SVN 3239075 Obcina Vipava 243467 Vipava, Obcina 1 + \N SI.E1 \N \N 1 div SI05 \N 3700 2014-02-18 19:53:50.080158+00 2014-04-01 15:18:54.094644+00 
0106000020110F0000010000000103000000010000001C00000023340ED8CEDB3741C0404B7264F35541C85B157A4AE13741B323798AF9EE5541CEDCBDE56FDE37414004235027ED5541489EE87B05DD3741D970658A36EA55418D272EB8BADA374116C6965868E95541A8917D26F7D73741F9BBEB238DE75541A6C99293A9CE37417F31C28009E95541DA9DBCA51CCD3741A5FAE31E1EE9554121BD100C14CA37414E69CD7D2AE95541124037ECFFC8374162390EC3F0E85541332B2FC661C337417AB6C1E72CE855418A0578349EC037417308826069E95541BCFC69D6F1A93741B086FDCC92EC55419F7FD8E3679D3741462E0A8039EF554160E7068E2B963741A150E52919F25541861B631AC198374148B84A13F9F3554123289F23909F3741CCF276F254F55541F49EE41CD5A53741895CFAD14FF655415E8C59A46AAA3741EE95F4A487F755414B69D859E3B43741E59BB99720F95541A6ECFD723CBC3741762F910EADF9554144635A9E08C23741DD3BD2B6C1F95541C650CF259EC637415377246A79F9554105D616CA02CA374149377DBD05F9554170AA9CC530CE3741A52C14CCCBF755419A5F7271ACCF37414FD3A85D41F65541A2FB415F39D13741C55D614C20F5554123340ED8CEDB3741C0404B7264F35541 {vipava,"obcina vipava","vipava, obcina",si.e1,si.sp.vi,svn-1035,svn-1035} 5 +\. + +-- Check that the synonym function is callable, should return true +SELECT (geocode_admin1_polygons(Array['obcina vipava'])).success;