structuring for a make file
This commit is contained in:
56
geocoder/admin0/README.md
Normal file
56
geocoder/admin0/README.md
Normal file
@@ -0,0 +1,56 @@
|
||||
Admin0 Geocoder
|
||||
===============
|
||||
|
||||
### Function
|
||||
|
||||
Accepts a list of terms. Terms are searched against the ```name_``` column in ```admin0_synonyms```. The ```name_``` column is an automatically cleaned and populated column based on the raw values in ```name_``` . The synonym table returns the proper ISO code (based on rank values in table below). The iso code is then matched against the single row in ```ne_admin0_v3``` to return the polygon.
|
||||
|
||||
### Creation steps
|
||||
|
||||
1. Upload fresh NaturalEarth data to ```ne_admin0_v3```.
|
||||
2. Delete all rows in the ```admin0_synonyms``` table.
|
||||
3. Ensure that all [indexes and triggers](https://github.com/CartoDB/data-services/wiki/Indexes-and-triggers) exist on these two tables.
|
||||
4. Upload the data/wikipedia_countries_native_names.csv table if it doesn't already exist
|
||||
5. Run the sql/subdivide_polygons.sql
|
||||
6. Run the sql/build_synonym_table.sql
|
||||
7. If needed, load or replace the function with sql/geocoder.sql
|
||||
|
||||
### Data Sources
|
||||
|
||||
(see the wiki page: [Geocoder Data Sources #admin0-countries](https://github.com/CartoDB/data-services/wiki/Geocoder-Datasources#admin0-countries))
|
||||
|
||||
- natural earth data: ne_10m_admin_0_countries (version 3.0) which is currently stored in Geocoding.CartoDB as ne_admin0_v3
|
||||
|
||||
- native language spellings were gathered from http://en.wikipedia.org/wiki/List_of_countries_and_dependencies_and_their_capitals_in_native_languages and stored in data/wikipedia_countries_native_names.csv
|
||||
|
||||
### Preparation details
|
||||
|
||||
Users dislike the NaturalEarth aggregation of French regions into the mainland France polygon. We have done a minimal amount of subdivision. This can be done by executing,
|
||||
|
||||
sql/subdivide_polygons.sql
|
||||
|
||||
## Admin0_synonyms
|
||||
|
||||
Documentation for the creation of the geocoder synonym tables.
|
||||
|
||||
For use with the admin0_geocoder.
|
||||
|
||||
|
||||
### Ranks
|
||||
|
||||
| rank number | origin data | origin column | description |
|
||||
|-------------|-----------------------------|---------------|----------------------|
|
||||
| 0 | natural earth 10m countries | name | literal name |
|
||||
| 1 | natural earth 10m countries | name_alt | alternate name |
|
||||
| 2 | wiki country navive names | country_endonym | local variation |
|
||||
| 3 | natural earth 10m countries | adm0_a3 | 3 digit country code |
|
||||
| 4 | natural earth 10m countries | abbrev | abbreviation |
|
||||
| 5 | natural earth 10m countries | formal_en | formal english |
|
||||
| 6 | natural earth 10m countries | brk_name | ? |
|
||||
| 7 | natural earth 10m countries | formal_fr | formal french |
|
||||
|
||||
__notes:__
|
||||
|
||||
- The column `adm0_a3` will be used as a unique identifier.
|
||||
- The ranks are somewhat arbitrarily organized and should be modified later based on our users use of the geocoder (will users more commonly geocode an adm0_a3 or abbreviation?)
|
||||
- I also forgot to assign a `rank` of `2` to a synonym.
|
||||
176
geocoder/admin0/data/wikipedia_countries_native_names.csv
Normal file
176
geocoder/admin0/data/wikipedia_countries_native_names.csv
Normal file
@@ -0,0 +1,176 @@
|
||||
adm0_a3,country_exonym,country_endonym
|
||||
,Abkhazia,Apsny
|
||||
AFG,Afghanistan,Afghanestan
|
||||
ALB,Albania,Shqipëria
|
||||
DZA,Algeria,Dzayer
|
||||
ASM,American Samoa,Amerika Sāmoa
|
||||
AND,Andorra,Andorra
|
||||
AGO,Angola,Angola
|
||||
AIA,Anguilla,Anguilla
|
||||
ATG,Antigua and Barbuda,Antigua and Barbuda
|
||||
ARG,Argentina,Argentina
|
||||
ARM,Armenia,Hayastán
|
||||
ABW,Aruba,Aruba
|
||||
AUS,Australia,Australia
|
||||
AUT,Austria,Österreich
|
||||
AZE,Azerbaijan,Azərbaycan
|
||||
BHR,Bahrain,Al-Baḥrayn
|
||||
BGD,Bangladesh,Bangladesh
|
||||
BRB,Barbados,Barbados
|
||||
BLR,Belarus,Belarus’
|
||||
BEL,Belgium,België
|
||||
BEN,Benin,Bénin
|
||||
BTN,Bhutan,Druk Yul
|
||||
BOL,Bolivia,Bolivia
|
||||
BIH,Bosnia and Herzegovina,Bosna i Hercegovina
|
||||
BRA,Brazil,Brasil
|
||||
BRN,Brunei,Brunei
|
||||
BGR,Bulgaria,Bulgariya or Bălgarija
|
||||
,Burma,Myanma
|
||||
KHM,Cambodia,Kampuchea
|
||||
CMR,Cameroon,Cameroun
|
||||
CPV,Cape Verde,Cabo Verde
|
||||
CAF,Central African Republic,République Centrafricaine
|
||||
TCD,Chad,Tchad
|
||||
COM,Comoros,Komori
|
||||
HRV,Croatia,Hrvatska
|
||||
CUB,Cuba,Cuba
|
||||
CUW,Curaçao,Curaçao
|
||||
CYP,Cyprus,Kypros
|
||||
CZE,Czech Republic,Česká republika
|
||||
COD,Democratic Republic of the Congo,République démocratique du Congo
|
||||
DNK,Denmark,Danmark
|
||||
DJI,Djibouti,Jībūtī
|
||||
DOM,Dominican Republic,República Dominicana
|
||||
TLS,East Timor,Timor Lorosa'e
|
||||
BHS,Bahamas,The Bahamas
|
||||
CHN,China,Zhōngguó
|
||||
EGY,Egypt,Misr or Masr
|
||||
,Equatorial Guinea,Guinea Ecuatorial
|
||||
ERI,Eritrea,Iritriya
|
||||
EST,Estonia,Eesti
|
||||
ETH,Ethiopia,Ityop'ia
|
||||
FLK,Falkland Islands,Falkland Islands
|
||||
,Faroe Islands,Føroyar
|
||||
FJI,Fiji,Fiji
|
||||
FIN,Finland,Suomi
|
||||
,French Guiana,Guyane
|
||||
PYF,French Polynesia,Polynésie française
|
||||
GEO,Georgia,Sak'art'velo
|
||||
DEU,Germany,Deutschland
|
||||
GRC,Greece,Hellas
|
||||
GRL,Greenland,Kalaallit Nunaat
|
||||
GUM,Guam,Guåhån
|
||||
GTM,Guatemala,Guatemala
|
||||
GIN,Guinea,Guinée
|
||||
GNB,Guinea-Bissau,Guiné-Bissau
|
||||
HTI,Haiti,Haïti
|
||||
HKG,Hong Kong,Hong Kong
|
||||
HUN,Hungary,Magyarország
|
||||
ISL,Iceland,Ísland
|
||||
IND,India,Bharôt
|
||||
IDN,Indonesia,Indonesia
|
||||
IRN,Iran,Īrān
|
||||
IRQ,Iraq,Al-'Iraq
|
||||
IRL,Ireland,Éire
|
||||
IMN,Isle of Man,Isle of Man
|
||||
ISR,Israel,yisrael
|
||||
ITA,Italy,Italia
|
||||
JPN,Japan,Nihon
|
||||
JEY,Jersey,Jersey
|
||||
JOR,Jordan,Al-’Urdun
|
||||
KAZ,Kazakhstan,Qazaqstan
|
||||
KOS,Kosovo,"Kosova, Косово"
|
||||
KWT,Kuwait,Al-Kuwayt
|
||||
KGZ,Kyrgyzstan,Kyrgyzstan
|
||||
LAO,Laos,Lao
|
||||
LVA,Latvia,Latvija
|
||||
PHL,Philippines,Pilipinas
|
||||
PRT,Portugal,Portugal
|
||||
QAT,Qatar,Qaṭar
|
||||
,Republic of the Congo,République du Congo
|
||||
,Réunion,Réunion
|
||||
ROU,Romania,România
|
||||
RUS,Russia,Rossiya or Rossiâ
|
||||
BLM,Saint Barthélemy,Saint-Barthélemy
|
||||
MAF,Saint Martin,Saint-Martin
|
||||
USA,United States,United States
|
||||
URY,Uruguay,República Oriental del Uruguay
|
||||
UZB,Uzbekistan,O‘zbekiston
|
||||
VUT,Vanuatu,Vanuatu
|
||||
,Vatican City,Città del Vaticano
|
||||
VNM,Vietnam,Việt Nam
|
||||
LBY,Libya,Libya
|
||||
LIE,Liechtenstein,Liechtenstein
|
||||
LTU,Lithuania,Lietuva
|
||||
LUX,Luxembourg,Lëtzebuerg
|
||||
MKD,Macedonia,Makedonija
|
||||
MDG,Madagascar,Madagasikara
|
||||
MYS,Malaysia,Malaysia
|
||||
MDV,Maldives,Dhivehi Raajje
|
||||
MLI,Mali,Mali
|
||||
MLT,Malta,Malta
|
||||
MRT,Mauritania,Muritan / Agawec
|
||||
MUS,Mauritius,Maurice
|
||||
,Mayotte,Mayotte
|
||||
MEX,Mexico,México
|
||||
MDA,Moldova,Moldova
|
||||
MCO,Monaco,Monaco
|
||||
MNG,Mongolia,Mongol Uls
|
||||
MNE,Montenegro,Crna Gora
|
||||
MAR,Morocco,Amerruk / Elmeɣrib
|
||||
MOZ,Mozambique,Moçambique
|
||||
NAM,Namibia,Namibia
|
||||
NRU,Nauru,Nauru
|
||||
NPL,Nepal,Nepāla
|
||||
NLD,Netherlands,Nederland
|
||||
NCL,New Caledonia,Nouvelle-Calédonie
|
||||
NZL,New Zealand,New Zealand
|
||||
NIU,Niue,Niuē
|
||||
,North Korea,Chosŏn as called in NK
|
||||
,Northern Cyprus,Kuzey Kıbrıs
|
||||
NOR,Norway,Norge
|
||||
OMN,Oman,‘Umān
|
||||
PAK,Pakistan,Pākistān (Islamic Republic of Pakistan)
|
||||
PLW,Palau,Belau
|
||||
,Palestinian National Authority,Filastīn
|
||||
PAN,Panama,Panamá
|
||||
PNG,Papua New Guinea,Papua New Guinea
|
||||
PRY,Paraguay,Paraguay
|
||||
PER,Peru,Perú
|
||||
POL,Poland,Polska
|
||||
LBN,Lebanon,Lubnān
|
||||
SPM,Saint Pierre and Miquelon,Saint-Pierre et Miquelon
|
||||
,São Tomé and Príncipe,São Tomé e Príncipe
|
||||
SAU,Saudi Arabia,Al-Mamlaka Al-‘Arabiyyah as Sa‘ūdiyyah
|
||||
,Wallis and Futuna,Wallis-et-Futuna
|
||||
YEM,Yemen,Al-Yaman
|
||||
SEN,Senegal,Sénégal
|
||||
SRB,Serbia,Srbija
|
||||
SYC,Seychelles,Sesel
|
||||
SGP,Singapore,Singapura
|
||||
SXM,Sint Maarten,Sint Maarten
|
||||
SVK,Slovakia,Slovensko
|
||||
TON,Tonga,Tonga
|
||||
SVN,Slovenia,Slovenija
|
||||
,Solomon Islands,Solomon Islands
|
||||
SOM,Somalia,Soomaaliya
|
||||
ZAF,South Africa,South Africa
|
||||
,South Korea,Hanguk as called in SK
|
||||
,South Ossetia,Khussar Iryston
|
||||
ESP,Spain,España
|
||||
LKA,Sri Lanka,Sri Lankā
|
||||
SDN,Sudan,As-Sudan
|
||||
,Svalbard,Svalbard
|
||||
SWE,Sweden,Sverige
|
||||
CHE,Switzerland,Schweiz
|
||||
SYR,Syria,Suriyah
|
||||
,Taiwan (Republic of China),Zhōnghuá Mínguó or Táiwan
|
||||
TJK,Tajikistan,Tojikistan
|
||||
TUN,Tunisia,Tunes
|
||||
THA,Thailand,"Mueang Thai, Prathet Thai, Ratcha-anachak Thai"
|
||||
TUR,Turkey,Türkiye
|
||||
TKM,Turkmenistan,Türkmenistan
|
||||
UKR,Ukraine,Ukraїna
|
||||
ARE,United Arab Emirates,Al-’Imārat Al-‘Arabiyyah Al-Muttaḥidah
|
||||
GBR,United Kingdom,United Kingdom
|
||||
|
75
geocoder/admin0/sql/build_synonym_table.sql
Normal file
75
geocoder/admin0/sql/build_synonym_table.sql
Normal file
@@ -0,0 +1,75 @@
|
||||
|
||||
---- ADMIN0_SYNONYMS ---
|
||||
-- insert data from ne_admin_0 into admin0_synonyms
|
||||
-- the name column from ne_10m_countries is assigned a rank of 0
|
||||
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
|
||||
SELECT name, 0, adm0_a3
|
||||
FROM ne_admin0_v3;
|
||||
|
||||
-- separate data from the name_alt column from ne_admin0_v3 using `|` as a delimiter
|
||||
-- and insert into admin1_synonyms as new rows with a rank=1
|
||||
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
|
||||
SELECT
|
||||
regexp_split_to_table(ne_admin0_v3.name_alt, E'\\|' ) AS name, 1, adm0_a3
|
||||
FROM
|
||||
ne_admin0_v3;
|
||||
|
||||
-- insert ad0_a3 codes as synonyms with a rank = 3
|
||||
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
|
||||
SELECT
|
||||
adm0_a3, 3, adm0_a3
|
||||
FROM
|
||||
ne_admin0_v3;
|
||||
|
||||
-- insert abbrv as name with a rank = 4
|
||||
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
|
||||
SELECT
|
||||
abbrev, 4, adm0_a3
|
||||
FROM
|
||||
ne_admin0_v3;
|
||||
|
||||
-- insert formal_en as name with a rank = 5
|
||||
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
|
||||
SELECT
|
||||
formal_en, 5, adm0_a3
|
||||
FROM
|
||||
ne_admin0_v3;
|
||||
|
||||
-- insert brk_name as name with a rank = 6
|
||||
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
|
||||
SELECT
|
||||
brk_name, 6, adm0_a3
|
||||
FROM
|
||||
ne_admin0_v3;
|
||||
|
||||
-- insert formal_fr as name with a rank = 7
|
||||
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
|
||||
SELECT
|
||||
formal_fr, 7, adm0_a3
|
||||
FROM
|
||||
ne_admin0_v3;
|
||||
|
||||
-- insert iso_a2 as name with a rank = 8
|
||||
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
|
||||
SELECT
|
||||
iso_a2, 8, adm0_a3
|
||||
FROM
|
||||
ne_admin0_v3;
|
||||
|
||||
-- remove all cases where name is NULL
|
||||
DELETE FROM admin0_synonyms WHERE name IS NULL;
|
||||
|
||||
-- remove all cases where a name is duplicated with a higher rank
|
||||
DELETE FROM admin0_synonyms
|
||||
WHERE cartodb_id IN (
|
||||
SELECT
|
||||
cartodb_id
|
||||
FROM
|
||||
admin0_synonyms a
|
||||
WHERE
|
||||
0 < (
|
||||
SELECT count(*)
|
||||
FROM admin0_synonyms
|
||||
WHERE name_ = a.name_
|
||||
AND adm0_a3 = a.adm0_a3
|
||||
AND rank < a.rank));
|
||||
27
geocoder/admin0/sql/geocoder.sql
Normal file
27
geocoder/admin0/sql/geocoder.sql
Normal file
@@ -0,0 +1,27 @@
|
||||
CREATE OR REPLACE FUNCTION test_geocode_admin0_polygons(name text[])
|
||||
RETURNS SETOF geocode_admin_v1 AS $$
|
||||
DECLARE
|
||||
ret geocode_admin_v1%rowtype;
|
||||
BEGIN
|
||||
FOR ret IN
|
||||
SELECT
|
||||
q, geom, CASE WHEN geom IS NULL THEN FALSE ELSE TRUE END AS success
|
||||
FROM (
|
||||
SELECT
|
||||
q, (
|
||||
SELECT the_geom
|
||||
FROM ne_admin0_v3
|
||||
WHERE adm0_a3 = (
|
||||
SELECT adm0_a3 FROM admin0_synonyms
|
||||
WHERE name_ = lower(regexp_replace(d.q, '[^a-zA-Z]', '', 'g'))
|
||||
ORDER BY rank ASC LIMIT 1
|
||||
)
|
||||
) geom
|
||||
FROM (SELECT unnest(name) q) d
|
||||
) v
|
||||
LOOP
|
||||
RETURN NEXT ret;
|
||||
END LOOP;
|
||||
RETURN;
|
||||
END
|
||||
$$ LANGUAGE 'plpgsql' SECURITY DEFINER;
|
||||
26
geocoder/admin0/sql/subdivide_polygons.sql
Normal file
26
geocoder/admin0/sql/subdivide_polygons.sql
Normal file
@@ -0,0 +1,26 @@
|
||||
---- Subdivide France into subregions ----
|
||||
--- Assumes fresh NaturalEarth admin0 dataset
|
||||
-- Split French Guiane from France
|
||||
INSERT INTO ne_admin0_v3 (the_geom, adm0_a3, name)
|
||||
|
||||
WITH a AS (SELECT (ST_Dump(the_geom)).geom geom,adm0_a3 FROM ne_admin0_v3 WHERE ST_Intersects(the_geom, CDB_LatLNg(4, -53)))
|
||||
|
||||
SELECT geom, 'GUF', 'French Guiane' FROM a WHERE ST_Intersects(geom, ST_Buffer(CDB_LatLNg(4, -53), 8));
|
||||
|
||||
-- Split Corse from France
|
||||
INSERT INTO ne_admin0_v3 (the_geom, adm0_a3, name)
|
||||
|
||||
WITH a AS (SELECT (ST_Dump(the_geom)).geom geom,adm0_a3 FROM ne_admin0_v3 WHERE ST_Intersects(the_geom, CDB_LatLNg(42.14, 9.12)))
|
||||
|
||||
SELECT ST_Collect(geom), 'FRH', 'Corse' FROM a WHERE ST_Intersects(geom, ST_Buffer(CDB_LatLNg(42.14, 9.12), 2));
|
||||
|
||||
-- Split Reunion from France
|
||||
INSERT INTO ne_admin0_v3 (the_geom, adm0_a3, name)
|
||||
|
||||
WITH a AS (SELECT (ST_Dump(the_geom)).geom geom,adm0_a3 FROM ne_admin0_v3 WHERE ST_Intersects(the_geom, CDB_LatLNg(-21.12, 55.51)))
|
||||
|
||||
SELECT ST_Collect(geom), 'REU', 'Reunion' FROM a WHERE ST_Intersects(geom, ST_Buffer(CDB_LatLNg(-21.12, 55.51),2));
|
||||
|
||||
-- Remove the above three from the FRA polygon
|
||||
WITH a AS (SELECT (ST_Dump(the_geom)).geom geom FROM ne_admin0_v3 WHERE adm0_a3 = 'FRA')
|
||||
UPDATE ne_admin0_v3 SET the_geom = (SELECT ST_Union(geom) FROM a WHERE NOT ST_intersects(geom, (SELECT ST_Union(the_geom) FROM ne_admin0_v3 WHERE adm0_a3 IN ('GUF', 'FRH', 'REU')))) WHERE adm0_a3 = 'FRA';
|
||||
Reference in New Issue
Block a user