diff --git a/geocoder/admin0/README.md b/geocoder/admin0/README.md new file mode 100644 index 0000000..a2d48ef --- /dev/null +++ b/geocoder/admin0/README.md @@ -0,0 +1,56 @@ +Admin0 Geocoder +=============== + +### Function + +Accepts a list of terms. Terms are searched against the ```name_``` column in ```admin0_synonyms```. The ```name_``` column is an automatically cleaned and populated column based on the raw values in ```name_``` . The synonym table returns the proper ISO code (based on rank values in table below). The iso code is then matched against the single row in ```ne_admin0_v3``` to return the polygon. + +### Creation steps + +1. Upload fresh NaturalEarth data to ```ne_admin0_v3```. +2. Delete all rows in the ```admin0_synonyms``` table. +3. Ensure that all [indexes and triggers](https://github.com/CartoDB/data-services/wiki/Indexes-and-triggers) exist on these two tables. +4. Upload the data/wikipedia_countries_native_names.csv table if it doesn't already exist +5. Run the sql/subdivide_polygons.sql +6. Run the sql/build_synonym_table.sql +7. If needed, load or replace the function with sql/geocoder.sql + +### Data Sources + +(see the wiki page: [Geocoder Data Sources #admin0-countries](https://github.com/CartoDB/data-services/wiki/Geocoder-Datasources#admin0-countries)) + +- natural earth data: ne_10m_admin_0_countries (version 3.0) which is currently stored in Geocoding.CartoDB as ne_admin0_v3 + +- native language spellings were gathered from http://en.wikipedia.org/wiki/List_of_countries_and_dependencies_and_their_capitals_in_native_languages and stored in data/wikipedia_countries_native_names.csv + +### Preparation details + +Users dislike the NaturalEarth aggregation of French regions into the mainland France polygon. We have done a minimal amount of subdivision. This can be done by executing, + +sql/subdivide_polygons.sql + +## Admin0_synonyms + +Documentation for the creation of the geocoder synonym tables. + +For use with the admin0_geocoder. + + +### Ranks + +| rank number | origin data | origin column | description | +|-------------|-----------------------------|---------------|----------------------| +| 0 | natural earth 10m countries | name | literal name | +| 1 | natural earth 10m countries | name_alt | alternate name | +| 2 | wiki country navive names | country_endonym | local variation | +| 3 | natural earth 10m countries | adm0_a3 | 3 digit country code | +| 4 | natural earth 10m countries | abbrev | abbreviation | +| 5 | natural earth 10m countries | formal_en | formal english | +| 6 | natural earth 10m countries | brk_name | ? | +| 7 | natural earth 10m countries | formal_fr | formal french | + +__notes:__ + +- The column `adm0_a3` will be used as a unique identifier. +- The ranks are somewhat arbitrarily organized and should be modified later based on our users use of the geocoder (will users more commonly geocode an adm0_a3 or abbreviation?) +- I also forgot to assign a `rank` of `2` to a synonym. \ No newline at end of file diff --git a/geocoder/admin0/data/wikipedia_countries_native_names.csv b/geocoder/admin0/data/wikipedia_countries_native_names.csv new file mode 100644 index 0000000..d8eb1e1 --- /dev/null +++ b/geocoder/admin0/data/wikipedia_countries_native_names.csv @@ -0,0 +1,176 @@ +adm0_a3,country_exonym,country_endonym +,Abkhazia,Apsny +AFG,Afghanistan,Afghanestan +ALB,Albania,Shqipëria +DZA,Algeria,Dzayer +ASM,American Samoa,Amerika Sāmoa +AND,Andorra,Andorra +AGO,Angola,Angola +AIA,Anguilla,Anguilla +ATG,Antigua and Barbuda,Antigua and Barbuda +ARG,Argentina,Argentina +ARM,Armenia,Hayastán +ABW,Aruba,Aruba +AUS,Australia,Australia +AUT,Austria,Österreich +AZE,Azerbaijan,Azərbaycan +BHR,Bahrain,Al-Baḥrayn +BGD,Bangladesh,Bangladesh +BRB,Barbados,Barbados +BLR,Belarus,Belarus’ +BEL,Belgium,België +BEN,Benin,Bénin +BTN,Bhutan,Druk Yul +BOL,Bolivia,Bolivia +BIH,Bosnia and Herzegovina,Bosna i Hercegovina +BRA,Brazil,Brasil +BRN,Brunei,Brunei +BGR,Bulgaria,Bulgariya or Bălgarija +,Burma,Myanma +KHM,Cambodia,Kampuchea +CMR,Cameroon,Cameroun +CPV,Cape Verde,Cabo Verde +CAF,Central African Republic,République Centrafricaine +TCD,Chad,Tchad +COM,Comoros,Komori +HRV,Croatia,Hrvatska +CUB,Cuba,Cuba +CUW,Curaçao,Curaçao +CYP,Cyprus,Kypros +CZE,Czech Republic,Česká republika +COD,Democratic Republic of the Congo,République démocratique du Congo +DNK,Denmark,Danmark +DJI,Djibouti,Jībūtī +DOM,Dominican Republic,República Dominicana +TLS,East Timor,Timor Lorosa'e +BHS,Bahamas,The Bahamas +CHN,China,Zhōngguó +EGY,Egypt,Misr or Masr +,Equatorial Guinea,Guinea Ecuatorial +ERI,Eritrea,Iritriya +EST,Estonia,Eesti +ETH,Ethiopia,Ityop'ia +FLK,Falkland Islands,Falkland Islands +,Faroe Islands,Føroyar +FJI,Fiji,Fiji +FIN,Finland,Suomi +,French Guiana,Guyane +PYF,French Polynesia,Polynésie française +GEO,Georgia,Sak'art'velo +DEU,Germany,Deutschland +GRC,Greece,Hellas +GRL,Greenland,Kalaallit Nunaat +GUM,Guam,Guåhån +GTM,Guatemala,Guatemala +GIN,Guinea,Guinée +GNB,Guinea-Bissau,Guiné-Bissau +HTI,Haiti,Haïti +HKG,Hong Kong,Hong Kong +HUN,Hungary,Magyarország +ISL,Iceland,Ísland +IND,India,Bharôt +IDN,Indonesia,Indonesia +IRN,Iran,Īrān +IRQ,Iraq,Al-'Iraq +IRL,Ireland,Éire +IMN,Isle of Man,Isle of Man +ISR,Israel,yisrael +ITA,Italy,Italia +JPN,Japan,Nihon +JEY,Jersey,Jersey +JOR,Jordan,Al-’Urdun +KAZ,Kazakhstan,Qazaqstan +KOS,Kosovo,"Kosova, Косово" +KWT,Kuwait,Al-Kuwayt +KGZ,Kyrgyzstan,Kyrgyzstan +LAO,Laos,Lao +LVA,Latvia,Latvija +PHL,Philippines,Pilipinas +PRT,Portugal,Portugal +QAT,Qatar,Qaṭar +,Republic of the Congo,République du Congo +,Réunion,Réunion +ROU,Romania,România +RUS,Russia,Rossiya or Rossiâ +BLM,Saint Barthélemy,Saint-Barthélemy +MAF,Saint Martin,Saint-Martin +USA,United States,United States +URY,Uruguay,República Oriental del Uruguay +UZB,Uzbekistan,O‘zbekiston +VUT,Vanuatu,Vanuatu +,Vatican City,Città del Vaticano +VNM,Vietnam,Việt Nam +LBY,Libya,Libya +LIE,Liechtenstein,Liechtenstein +LTU,Lithuania,Lietuva +LUX,Luxembourg,Lëtzebuerg +MKD,Macedonia,Makedonija +MDG,Madagascar,Madagasikara +MYS,Malaysia,Malaysia +MDV,Maldives,Dhivehi Raajje +MLI,Mali,Mali +MLT,Malta,Malta +MRT,Mauritania,Muritan / Agawec +MUS,Mauritius,Maurice +,Mayotte,Mayotte +MEX,Mexico,México +MDA,Moldova,Moldova +MCO,Monaco,Monaco +MNG,Mongolia,Mongol Uls +MNE,Montenegro,Crna Gora +MAR,Morocco,Amerruk / Elmeɣrib +MOZ,Mozambique,Moçambique +NAM,Namibia,Namibia +NRU,Nauru,Nauru +NPL,Nepal,Nepāla +NLD,Netherlands,Nederland +NCL,New Caledonia,Nouvelle-Calédonie +NZL,New Zealand,New Zealand +NIU,Niue,Niuē +,North Korea,Chosŏn as called in NK +,Northern Cyprus,Kuzey Kıbrıs +NOR,Norway,Norge +OMN,Oman,‘Umān +PAK,Pakistan,Pākistān (Islamic Republic of Pakistan) +PLW,Palau,Belau +,Palestinian National Authority,Filastīn +PAN,Panama,Panamá +PNG,Papua New Guinea,Papua New Guinea +PRY,Paraguay,Paraguay +PER,Peru,Perú +POL,Poland,Polska +LBN,Lebanon,Lubnān +SPM,Saint Pierre and Miquelon,Saint-Pierre et Miquelon +,São Tomé and Príncipe,São Tomé e Príncipe +SAU,Saudi Arabia,Al-Mamlaka Al-‘Arabiyyah as Sa‘ūdiyyah +,Wallis and Futuna,Wallis-et-Futuna +YEM,Yemen,Al-Yaman +SEN,Senegal,Sénégal +SRB,Serbia,Srbija +SYC,Seychelles,Sesel +SGP,Singapore,Singapura +SXM,Sint Maarten,Sint Maarten +SVK,Slovakia,Slovensko +TON,Tonga,Tonga +SVN,Slovenia,Slovenija +,Solomon Islands,Solomon Islands +SOM,Somalia,Soomaaliya +ZAF,South Africa,South Africa +,South Korea,Hanguk as called in SK +,South Ossetia,Khussar Iryston +ESP,Spain,España +LKA,Sri Lanka,Sri Lankā +SDN,Sudan,As-Sudan +,Svalbard,Svalbard +SWE,Sweden,Sverige +CHE,Switzerland,Schweiz +SYR,Syria,Suriyah +,Taiwan (Republic of China),Zhōnghuá Mínguó or Táiwan +TJK,Tajikistan,Tojikistan +TUN,Tunisia,Tunes +THA,Thailand,"Mueang Thai, Prathet Thai, Ratcha-anachak Thai" +TUR,Turkey,Türkiye +TKM,Turkmenistan,Türkmenistan +UKR,Ukraine,Ukraїna +ARE,United Arab Emirates,Al-’Imārat Al-‘Arabiyyah Al-Muttaḥidah +GBR,United Kingdom,United Kingdom diff --git a/geocoder/admin0/sql/build_synonym_table.sql b/geocoder/admin0/sql/build_synonym_table.sql new file mode 100644 index 0000000..3a332c2 --- /dev/null +++ b/geocoder/admin0/sql/build_synonym_table.sql @@ -0,0 +1,75 @@ + +---- ADMIN0_SYNONYMS --- +-- insert data from ne_admin_0 into admin0_synonyms +-- the name column from ne_10m_countries is assigned a rank of 0 +INSERT INTO admin0_synonyms (name, rank, adm0_a3) + SELECT name, 0, adm0_a3 + FROM ne_admin0_v3; + +-- separate data from the name_alt column from ne_admin0_v3 using `|` as a delimiter +-- and insert into admin1_synonyms as new rows with a rank=1 +INSERT INTO admin0_synonyms (name, rank, adm0_a3) +SELECT + regexp_split_to_table(ne_admin0_v3.name_alt, E'\\|' ) AS name, 1, adm0_a3 +FROM + ne_admin0_v3; + +-- insert ad0_a3 codes as synonyms with a rank = 3 +INSERT INTO admin0_synonyms (name, rank, adm0_a3) +SELECT + adm0_a3, 3, adm0_a3 +FROM + ne_admin0_v3; + +-- insert abbrv as name with a rank = 4 +INSERT INTO admin0_synonyms (name, rank, adm0_a3) +SELECT + abbrev, 4, adm0_a3 +FROM + ne_admin0_v3; + +-- insert formal_en as name with a rank = 5 +INSERT INTO admin0_synonyms (name, rank, adm0_a3) +SELECT + formal_en, 5, adm0_a3 +FROM + ne_admin0_v3; + +-- insert brk_name as name with a rank = 6 +INSERT INTO admin0_synonyms (name, rank, adm0_a3) +SELECT + brk_name, 6, adm0_a3 +FROM + ne_admin0_v3; + +-- insert formal_fr as name with a rank = 7 +INSERT INTO admin0_synonyms (name, rank, adm0_a3) +SELECT + formal_fr, 7, adm0_a3 +FROM + ne_admin0_v3; + +-- insert iso_a2 as name with a rank = 8 +INSERT INTO admin0_synonyms (name, rank, adm0_a3) +SELECT + iso_a2, 8, adm0_a3 +FROM + ne_admin0_v3; + +-- remove all cases where name is NULL +DELETE FROM admin0_synonyms WHERE name IS NULL; + +-- remove all cases where a name is duplicated with a higher rank +DELETE FROM admin0_synonyms + WHERE cartodb_id IN ( + SELECT + cartodb_id + FROM + admin0_synonyms a + WHERE + 0 < ( + SELECT count(*) + FROM admin0_synonyms + WHERE name_ = a.name_ + AND adm0_a3 = a.adm0_a3 + AND rank < a.rank)); \ No newline at end of file diff --git a/geocoder/admin0/sql/geocoder.sql b/geocoder/admin0/sql/geocoder.sql new file mode 100644 index 0000000..7d372dd --- /dev/null +++ b/geocoder/admin0/sql/geocoder.sql @@ -0,0 +1,27 @@ +CREATE OR REPLACE FUNCTION test_geocode_admin0_polygons(name text[]) + RETURNS SETOF geocode_admin_v1 AS $$ + DECLARE + ret geocode_admin_v1%rowtype; + BEGIN + FOR ret IN + SELECT + q, geom, CASE WHEN geom IS NULL THEN FALSE ELSE TRUE END AS success + FROM ( + SELECT + q, ( + SELECT the_geom + FROM ne_admin0_v3 + WHERE adm0_a3 = ( + SELECT adm0_a3 FROM admin0_synonyms + WHERE name_ = lower(regexp_replace(d.q, '[^a-zA-Z]', '', 'g')) + ORDER BY rank ASC LIMIT 1 + ) + ) geom + FROM (SELECT unnest(name) q) d + ) v + LOOP + RETURN NEXT ret; + END LOOP; + RETURN; +END +$$ LANGUAGE 'plpgsql' SECURITY DEFINER; \ No newline at end of file diff --git a/geocoder/admin0/sql/subdivide_polygons.sql b/geocoder/admin0/sql/subdivide_polygons.sql new file mode 100644 index 0000000..35b0d85 --- /dev/null +++ b/geocoder/admin0/sql/subdivide_polygons.sql @@ -0,0 +1,26 @@ +---- Subdivide France into subregions ---- +--- Assumes fresh NaturalEarth admin0 dataset +-- Split French Guiane from France +INSERT INTO ne_admin0_v3 (the_geom, adm0_a3, name) + +WITH a AS (SELECT (ST_Dump(the_geom)).geom geom,adm0_a3 FROM ne_admin0_v3 WHERE ST_Intersects(the_geom, CDB_LatLNg(4, -53))) + +SELECT geom, 'GUF', 'French Guiane' FROM a WHERE ST_Intersects(geom, ST_Buffer(CDB_LatLNg(4, -53), 8)); + +-- Split Corse from France +INSERT INTO ne_admin0_v3 (the_geom, adm0_a3, name) + +WITH a AS (SELECT (ST_Dump(the_geom)).geom geom,adm0_a3 FROM ne_admin0_v3 WHERE ST_Intersects(the_geom, CDB_LatLNg(42.14, 9.12))) + +SELECT ST_Collect(geom), 'FRH', 'Corse' FROM a WHERE ST_Intersects(geom, ST_Buffer(CDB_LatLNg(42.14, 9.12), 2)); + +-- Split Reunion from France +INSERT INTO ne_admin0_v3 (the_geom, adm0_a3, name) + +WITH a AS (SELECT (ST_Dump(the_geom)).geom geom,adm0_a3 FROM ne_admin0_v3 WHERE ST_Intersects(the_geom, CDB_LatLNg(-21.12, 55.51))) + +SELECT ST_Collect(geom), 'REU', 'Reunion' FROM a WHERE ST_Intersects(geom, ST_Buffer(CDB_LatLNg(-21.12, 55.51),2)); + +-- Remove the above three from the FRA polygon +WITH a AS (SELECT (ST_Dump(the_geom)).geom geom FROM ne_admin0_v3 WHERE adm0_a3 = 'FRA') +UPDATE ne_admin0_v3 SET the_geom = (SELECT ST_Union(geom) FROM a WHERE NOT ST_intersects(geom, (SELECT ST_Union(the_geom) FROM ne_admin0_v3 WHERE adm0_a3 IN ('GUF', 'FRH', 'REU')))) WHERE adm0_a3 = 'FRA';