structuring for a make file

This commit is contained in:
andrewxhill
2014-08-04 18:29:28 -04:00
parent 0156731a20
commit 3e813b9868
5 changed files with 360 additions and 0 deletions

56
geocoder/admin0/README.md Normal file
View File

@@ -0,0 +1,56 @@
Admin0 Geocoder
===============
### Function
Accepts a list of terms. Terms are searched against the ```name_``` column in ```admin0_synonyms```. The ```name_``` column is an automatically cleaned and populated column based on the raw values in ```name```. The synonym table returns the proper ISO code (based on rank values in table below). The ISO code is then matched against the single row in ```ne_admin0_v3``` to return the polygon.
### Creation steps
1. Upload fresh NaturalEarth data to ```ne_admin0_v3```.
2. Delete all rows in the ```admin0_synonyms``` table.
3. Ensure that all [indexes and triggers](https://github.com/CartoDB/data-services/wiki/Indexes-and-triggers) exist on these two tables.
4. Upload the data/wikipedia_countries_native_names.csv table if it doesn't already exist
5. Run the sql/subdivide_polygons.sql
6. Run the sql/build_synonym_table.sql
7. If needed, load or replace the function with sql/geocoder.sql
### Data Sources
(see the wiki page: [Geocoder Data Sources #admin0-countries](https://github.com/CartoDB/data-services/wiki/Geocoder-Datasources#admin0-countries))
- natural earth data: ne_10m_admin_0_countries (version 3.0) which is currently stored in Geocoding.CartoDB as ne_admin0_v3
- native language spellings were gathered from http://en.wikipedia.org/wiki/List_of_countries_and_dependencies_and_their_capitals_in_native_languages and stored in data/wikipedia_countries_native_names.csv
### Preparation details
Users dislike the NaturalEarth aggregation of French regions into the mainland France polygon. We have done a minimal amount of subdivision. This can be done by executing,
sql/subdivide_polygons.sql
## Admin0_synonyms
Documentation for the creation of the geocoder synonym tables.
For use with the admin0_geocoder.
### Ranks
| rank number | origin data | origin column | description |
|-------------|-----------------------------|---------------|----------------------|
| 0 | natural earth 10m countries | name | literal name |
| 1 | natural earth 10m countries | name_alt | alternate name |
| 2 | wiki country native names | country_endonym | local variation |
| 3 | natural earth 10m countries | adm0_a3 | 3 letter country code |
| 4 | natural earth 10m countries | abbrev | abbreviation |
| 5 | natural earth 10m countries | formal_en | formal english |
| 6 | natural earth 10m countries | brk_name | ? |
| 7 | natural earth 10m countries | formal_fr | formal french |
| 8 | natural earth 10m countries | iso_a2 | 2 letter ISO country code |
__notes:__
- The column `adm0_a3` will be used as a unique identifier.
- The ranks are somewhat arbitrarily organized and should be modified later based on our users' use of the geocoder (will users more commonly geocode an adm0_a3 or abbreviation?)
- I also forgot to assign a `rank` of `2` to a synonym.

View File

@@ -0,0 +1,176 @@
adm0_a3,country_exonym,country_endonym
,Abkhazia,Apsny
AFG,Afghanistan,Afghanestan
ALB,Albania,Shqipëria
DZA,Algeria,Dzayer
ASM,American Samoa,Amerika Sāmoa
AND,Andorra,Andorra
AGO,Angola,Angola
AIA,Anguilla,Anguilla
ATG,Antigua and Barbuda,Antigua and Barbuda
ARG,Argentina,Argentina
ARM,Armenia,Hayastán
ABW,Aruba,Aruba
AUS,Australia,Australia
AUT,Austria,Österreich
AZE,Azerbaijan,Azərbaycan
BHR,Bahrain,Al-Baḥrayn
BGD,Bangladesh,Bangladesh
BRB,Barbados,Barbados
BLR,Belarus,Belarus
BEL,Belgium,België
BEN,Benin,Bénin
BTN,Bhutan,Druk Yul
BOL,Bolivia,Bolivia
BIH,Bosnia and Herzegovina,Bosna i Hercegovina
BRA,Brazil,Brasil
BRN,Brunei,Brunei
BGR,Bulgaria,Bulgariya or Bălgarija
,Burma,Myanma
KHM,Cambodia,Kampuchea
CMR,Cameroon,Cameroun
CPV,Cape Verde,Cabo Verde
CAF,Central African Republic,République Centrafricaine
TCD,Chad,Tchad
COM,Comoros,Komori
HRV,Croatia,Hrvatska
CUB,Cuba,Cuba
CUW,Curaçao,Curaçao
CYP,Cyprus,Kypros
CZE,Czech Republic,Česká republika
COD,Democratic Republic of the Congo,République démocratique du Congo
DNK,Denmark,Danmark
DJI,Djibouti,Jībūtī
DOM,Dominican Republic,República Dominicana
TLS,East Timor,Timor Lorosa'e
BHS,Bahamas,The Bahamas
CHN,China,Zhōngguó
EGY,Egypt,Misr or Masr
,Equatorial Guinea,Guinea Ecuatorial
ERI,Eritrea,Iritriya
EST,Estonia,Eesti
ETH,Ethiopia,Ityop'ia
FLK,Falkland Islands,Falkland Islands
,Faroe Islands,Føroyar
FJI,Fiji,Fiji
FIN,Finland,Suomi
,French Guiana,Guyane
PYF,French Polynesia,Polynésie française
GEO,Georgia,Sak'art'velo
DEU,Germany,Deutschland
GRC,Greece,Hellas
GRL,Greenland,Kalaallit Nunaat
GUM,Guam,Guåhån
GTM,Guatemala,Guatemala
GIN,Guinea,Guinée
GNB,Guinea-Bissau,Guiné-Bissau
HTI,Haiti,Haïti
HKG,Hong Kong,Hong Kong
HUN,Hungary,Magyarország
ISL,Iceland,Ísland
IND,India,Bharôt
IDN,Indonesia,Indonesia
IRN,Iran,Īrān
IRQ,Iraq,Al-'Iraq
IRL,Ireland,Éire
IMN,Isle of Man,Isle of Man
ISR,Israel,yisrael
ITA,Italy,Italia
JPN,Japan,Nihon
JEY,Jersey,Jersey
JOR,Jordan,Al-Urdun
KAZ,Kazakhstan,Qazaqstan
KOS,Kosovo,"Kosova, Косово"
KWT,Kuwait,Al-Kuwayt
KGZ,Kyrgyzstan,Kyrgyzstan
LAO,Laos,Lao
LVA,Latvia,Latvija
PHL,Philippines,Pilipinas
PRT,Portugal,Portugal
QAT,Qatar,Qaṭar
,Republic of the Congo,République du Congo
,Réunion,Réunion
ROU,Romania,România
RUS,Russia,Rossiya or Rossiâ
BLM,Saint Barthélemy,Saint-Barthélemy
MAF,Saint Martin,Saint-Martin
USA,United States,United States
URY,Uruguay,República Oriental del Uruguay
UZB,Uzbekistan,Ozbekiston
VUT,Vanuatu,Vanuatu
,Vatican City,Città del Vaticano
VNM,Vietnam,Việt Nam
LBY,Libya,Libya
LIE,Liechtenstein,Liechtenstein
LTU,Lithuania,Lietuva
LUX,Luxembourg,Lëtzebuerg
MKD,Macedonia,Makedonija
MDG,Madagascar,Madagasikara
MYS,Malaysia,Malaysia
MDV,Maldives,Dhivehi Raajje
MLI,Mali,Mali
MLT,Malta,Malta
MRT,Mauritania,Muritan / Agawec
MUS,Mauritius,Maurice
,Mayotte,Mayotte
MEX,Mexico,México
MDA,Moldova,Moldova
MCO,Monaco,Monaco
MNG,Mongolia,Mongol Uls
MNE,Montenegro,Crna Gora
MAR,Morocco,Amerruk / Elmeɣrib
MOZ,Mozambique,Moçambique
NAM,Namibia,Namibia
NRU,Nauru,Nauru
NPL,Nepal,Nepāla
NLD,Netherlands,Nederland
NCL,New Caledonia,Nouvelle-Calédonie
NZL,New Zealand,New Zealand
NIU,Niue,Niuē
,North Korea,Chosŏn as called in NK
,Northern Cyprus,Kuzey Kıbrıs
NOR,Norway,Norge
OMN,Oman,Umān
PAK,Pakistan,Pākistān (Islamic Republic of Pakistan)
PLW,Palau,Belau
,Palestinian National Authority,Filastīn
PAN,Panama,Panamá
PNG,Papua New Guinea,Papua New Guinea
PRY,Paraguay,Paraguay
PER,Peru,Perú
POL,Poland,Polska
LBN,Lebanon,Lubnān
SPM,Saint Pierre and Miquelon,Saint-Pierre et Miquelon
,São Tomé and Príncipe,São Tomé e Príncipe
SAU,Saudi Arabia,Al-Mamlaka Al-Arabiyyah as Saūdiyyah
,Wallis and Futuna,Wallis-et-Futuna
YEM,Yemen,Al-Yaman
SEN,Senegal,Sénégal
SRB,Serbia,Srbija
SYC,Seychelles,Sesel
SGP,Singapore,Singapura
SXM,Sint Maarten,Sint Maarten
SVK,Slovakia,Slovensko
TON,Tonga,Tonga
SVN,Slovenia,Slovenija
,Solomon Islands,Solomon Islands
SOM,Somalia,Soomaaliya
ZAF,South Africa,South Africa
,South Korea,Hanguk as called in SK
,South Ossetia,Khussar Iryston
ESP,Spain,España
LKA,Sri Lanka,Sri Lankā
SDN,Sudan,As-Sudan
,Svalbard,Svalbard
SWE,Sweden,Sverige
CHE,Switzerland,Schweiz
SYR,Syria,Suriyah
,Taiwan (Republic of China),Zhōnghuá Mínguó or Táiwan
TJK,Tajikistan,Tojikistan
TUN,Tunisia,Tunes
THA,Thailand,"Mueang Thai, Prathet Thai, Ratcha-anachak Thai"
TUR,Turkey,Türkiye
TKM,Turkmenistan,Türkmenistan
UKR,Ukraine,Ukraїna
ARE,United Arab Emirates,Al-Imārat Al-Arabiyyah Al-Muttaḥidah
GBR,United Kingdom,United Kingdom
1 adm0_a3 country_exonym country_endonym
2 Abkhazia Apsny
3 AFG Afghanistan Afghanestan
4 ALB Albania Shqipëria
5 DZA Algeria Dzayer
6 ASM American Samoa Amerika Sāmoa
7 AND Andorra Andorra
8 AGO Angola Angola
9 AIA Anguilla Anguilla
10 ATG Antigua and Barbuda Antigua and Barbuda
11 ARG Argentina Argentina
12 ARM Armenia Hayastán
13 ABW Aruba Aruba
14 AUS Australia Australia
15 AUT Austria Österreich
16 AZE Azerbaijan Azərbaycan
17 BHR Bahrain Al-Baḥrayn
18 BGD Bangladesh Bangladesh
19 BRB Barbados Barbados
20 BLR Belarus Belarus’
21 BEL Belgium België
22 BEN Benin Bénin
23 BTN Bhutan Druk Yul
24 BOL Bolivia Bolivia
25 BIH Bosnia and Herzegovina Bosna i Hercegovina
26 BRA Brazil Brasil
27 BRN Brunei Brunei
28 BGR Bulgaria Bulgariya or Bălgarija
29 Burma Myanma
30 KHM Cambodia Kampuchea
31 CMR Cameroon Cameroun
32 CPV Cape Verde Cabo Verde
33 CAF Central African Republic République Centrafricaine
34 TCD Chad Tchad
35 COM Comoros Komori
36 HRV Croatia Hrvatska
37 CUB Cuba Cuba
38 CUW Curaçao Curaçao
39 CYP Cyprus Kypros
40 CZE Czech Republic Česká republika
41 COD Democratic Republic of the Congo République démocratique du Congo
42 DNK Denmark Danmark
43 DJI Djibouti Jībūtī
44 DOM Dominican Republic República Dominicana
45 TLS East Timor Timor Lorosa'e
46 BHS Bahamas The Bahamas
47 CHN China Zhōngguó
48 EGY Egypt Misr or Masr
49 Equatorial Guinea Guinea Ecuatorial
50 ERI Eritrea Iritriya
51 EST Estonia Eesti
52 ETH Ethiopia Ityop'ia
53 FLK Falkland Islands Falkland Islands
54 Faroe Islands Føroyar
55 FJI Fiji Fiji
56 FIN Finland Suomi
57 French Guiana Guyane
58 PYF French Polynesia Polynésie française
59 GEO Georgia Sak'art'velo
60 DEU Germany Deutschland
61 GRC Greece Hellas
62 GRL Greenland Kalaallit Nunaat
63 GUM Guam Guåhån
64 GTM Guatemala Guatemala
65 GIN Guinea Guinée
66 GNB Guinea-Bissau Guiné-Bissau
67 HTI Haiti Haïti
68 HKG Hong Kong Hong Kong
69 HUN Hungary Magyarország
70 ISL Iceland Ísland
71 IND India Bharôt
72 IDN Indonesia Indonesia
73 IRN Iran Īrān
74 IRQ Iraq Al-'Iraq
75 IRL Ireland Éire
76 IMN Isle of Man Isle of Man
77 ISR Israel yisrael
78 ITA Italy Italia
79 JPN Japan Nihon
80 JEY Jersey Jersey
81 JOR Jordan Al-’Urdun
82 KAZ Kazakhstan Qazaqstan
83 KOS Kosovo Kosova, Косово
84 KWT Kuwait Al-Kuwayt
85 KGZ Kyrgyzstan Kyrgyzstan
86 LAO Laos Lao
87 LVA Latvia Latvija
88 PHL Philippines Pilipinas
89 PRT Portugal Portugal
90 QAT Qatar Qaṭar
91 Republic of the Congo République du Congo
92 Réunion Réunion
93 ROU Romania România
94 RUS Russia Rossiya or Rossiâ
95 BLM Saint Barthélemy Saint-Barthélemy
96 MAF Saint Martin Saint-Martin
97 USA United States United States
98 URY Uruguay República Oriental del Uruguay
99 UZB Uzbekistan O‘zbekiston
100 VUT Vanuatu Vanuatu
101 Vatican City Città del Vaticano
102 VNM Vietnam Việt Nam
103 LBY Libya Libya
104 LIE Liechtenstein Liechtenstein
105 LTU Lithuania Lietuva
106 LUX Luxembourg Lëtzebuerg
107 MKD Macedonia Makedonija
108 MDG Madagascar Madagasikara
109 MYS Malaysia Malaysia
110 MDV Maldives Dhivehi Raajje
111 MLI Mali Mali
112 MLT Malta Malta
113 MRT Mauritania Muritan / Agawec
114 MUS Mauritius Maurice
115 Mayotte Mayotte
116 MEX Mexico México
117 MDA Moldova Moldova
118 MCO Monaco Monaco
119 MNG Mongolia Mongol Uls
120 MNE Montenegro Crna Gora
121 MAR Morocco Amerruk / Elmeɣrib
122 MOZ Mozambique Moçambique
123 NAM Namibia Namibia
124 NRU Nauru Nauru
125 NPL Nepal Nepāla
126 NLD Netherlands Nederland
127 NCL New Caledonia Nouvelle-Calédonie
128 NZL New Zealand New Zealand
129 NIU Niue Niuē
130 North Korea Chosŏn as called in NK
131 Northern Cyprus Kuzey Kıbrıs
132 NOR Norway Norge
133 OMN Oman ‘Umān
134 PAK Pakistan Pākistān (Islamic Republic of Pakistan)
135 PLW Palau Belau
136 Palestinian National Authority Filastīn
137 PAN Panama Panamá
138 PNG Papua New Guinea Papua New Guinea
139 PRY Paraguay Paraguay
140 PER Peru Perú
141 POL Poland Polska
142 LBN Lebanon Lubnān
143 SPM Saint Pierre and Miquelon Saint-Pierre et Miquelon
144 São Tomé and Príncipe São Tomé e Príncipe
145 SAU Saudi Arabia Al-Mamlaka Al-‘Arabiyyah as Sa‘ūdiyyah
146 Wallis and Futuna Wallis-et-Futuna
147 YEM Yemen Al-Yaman
148 SEN Senegal Sénégal
149 SRB Serbia Srbija
150 SYC Seychelles Sesel
151 SGP Singapore Singapura
152 SXM Sint Maarten Sint Maarten
153 SVK Slovakia Slovensko
154 TON Tonga Tonga
155 SVN Slovenia Slovenija
156 Solomon Islands Solomon Islands
157 SOM Somalia Soomaaliya
158 ZAF South Africa South Africa
159 South Korea Hanguk as called in SK
160 South Ossetia Khussar Iryston
161 ESP Spain España
162 LKA Sri Lanka Sri Lankā
163 SDN Sudan As-Sudan
164 Svalbard Svalbard
165 SWE Sweden Sverige
166 CHE Switzerland Schweiz
167 SYR Syria Suriyah
168 Taiwan (Republic of China) Zhōnghuá Mínguó or Táiwan
169 TJK Tajikistan Tojikistan
170 TUN Tunisia Tunes
171 THA Thailand Mueang Thai, Prathet Thai, Ratcha-anachak Thai
172 TUR Turkey Türkiye
173 TKM Turkmenistan Türkmenistan
174 UKR Ukraine Ukraїna
175 ARE United Arab Emirates Al-’Imārat Al-‘Arabiyyah Al-Muttaḥidah
176 GBR United Kingdom United Kingdom

View File

@@ -0,0 +1,75 @@
---- ADMIN0_SYNONYMS ---
-- Populate admin0_synonyms, one INSERT per source column.
-- Every synonym row carries the adm0_a3 code of the country it refers to and a
-- rank identifying which source column it came from (see the rank table in the
-- README). NULL names inserted here are expected to be removed by the cleanup
-- statements that follow the inserts.

-- rank 0: the literal name column from ne_admin0_v3
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
    name,
    0,
    adm0_a3
FROM ne_admin0_v3;

-- rank 1: alternate names. name_alt packs several names into one value,
-- delimited by `|`; each piece becomes its own row in admin0_synonyms.
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
    regexp_split_to_table(ne_admin0_v3.name_alt, E'\\|') AS name,
    1,
    adm0_a3
FROM ne_admin0_v3;

-- rank 2: native-language names (country_endonym) from the Wikipedia CSV.
-- NOTE(review): assumes data/wikipedia_countries_native_names.csv was uploaded
-- as table wikipedia_countries_native_names (README creation step 4) -- confirm
-- the table name before running.
-- Only codes that exist in ne_admin0_v3 are loaded: CSV rows with a blank or
-- unknown adm0_a3 cannot be resolved to a polygon.
-- Multi-valued endonyms (e.g. 'Misr or Masr') are inserted unsplit.
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
    w.country_endonym,
    2,
    w.adm0_a3
FROM wikipedia_countries_native_names AS w
WHERE EXISTS (
    SELECT 1
    FROM ne_admin0_v3 AS n
    WHERE n.adm0_a3 = w.adm0_a3
);

-- rank 3: the adm0_a3 code itself, so the code can be geocoded directly
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
    adm0_a3,
    3,
    adm0_a3
FROM ne_admin0_v3;

-- rank 4: abbrev
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
    abbrev,
    4,
    adm0_a3
FROM ne_admin0_v3;

-- rank 5: formal_en
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
    formal_en,
    5,
    adm0_a3
FROM ne_admin0_v3;

-- rank 6: brk_name
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
    brk_name,
    6,
    adm0_a3
FROM ne_admin0_v3;

-- rank 7: formal_fr
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
    formal_fr,
    7,
    adm0_a3
FROM ne_admin0_v3;

-- rank 8: iso_a2
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
    iso_a2,
    8,
    adm0_a3
FROM ne_admin0_v3;
-- Remove synonyms that have no name at all (the source column was NULL).
DELETE FROM admin0_synonyms
WHERE name IS NULL;

-- Remove redundant synonyms: a row is deleted when another row for the same
-- country (adm0_a3) has the same name_ value and a numerically lower rank, so
-- only the lowest-ranked copy of each (name_, adm0_a3) pair survives.
-- Comparison is on name_, not name (per the README, name_ is the cleaned form
-- of name that is populated automatically).
-- EXISTS stops at the first lower-ranked match instead of counting them all.
DELETE FROM admin0_synonyms AS a
WHERE EXISTS (
    SELECT 1
    FROM admin0_synonyms AS b
    WHERE b.name_ = a.name_
      AND b.adm0_a3 = a.adm0_a3
      AND b.rank < a.rank
);

View File

@@ -0,0 +1,27 @@
-- test_geocode_admin0_polygons(name text[])
--
-- Resolves each search term in `name` to a country polygon.
--
-- For every array element the term is normalised (everything except ASCII
-- letters is stripped, then lower-cased) and looked up in
-- admin0_synonyms.name_; when several synonyms match, the one with the lowest
-- rank is used. Its adm0_a3 code selects the polygon from ne_admin0_v3.
--
-- Returns one row per input term: the original term, the polygon (NULL when
-- nothing matched) and a success flag that is TRUE exactly when a polygon was
-- found.
-- NOTE(review): the row variable is filled positionally, so geocode_admin_v1
-- is assumed to have three columns in the order (term, geometry, success) --
-- confirm against the type definition.
-- NOTE(review): the polygon lookup is a scalar subquery and therefore expects
-- at most one ne_admin0_v3 row per adm0_a3.
-- SECURITY DEFINER: executes with the privileges of the function's owner.
CREATE OR REPLACE FUNCTION test_geocode_admin0_polygons(name text[])
    RETURNS SETOF geocode_admin_v1 AS $$
DECLARE
    result_row geocode_admin_v1%rowtype;
BEGIN
    FOR result_row IN
        SELECT
            matched.q,
            matched.geom,
            NOT (matched.geom IS NULL) AS success
        FROM (
            SELECT
                terms.q,
                (
                    -- polygon of the best-ranked synonym for this term
                    SELECT the_geom
                    FROM ne_admin0_v3
                    WHERE adm0_a3 = (
                        SELECT adm0_a3
                        FROM admin0_synonyms
                        WHERE name_ = lower(regexp_replace(terms.q, '[^a-zA-Z]', '', 'g'))
                        ORDER BY rank ASC
                        LIMIT 1
                    )
                ) AS geom
            FROM (SELECT unnest(name) AS q) AS terms
        ) AS matched
    LOOP
        RETURN NEXT result_row;
    END LOOP;
    RETURN;
END
$$ LANGUAGE 'plpgsql' SECURITY DEFINER;

View File

@@ -0,0 +1,26 @@
---- Subdivide France into subregions ----
--- Assumes fresh NaturalEarth admin0 dataset
--
-- Each INSERT below locates the ne_admin0_v3 row whose geometry contains a
-- reference point inside the target region, explodes that geometry into its
-- individual parts with ST_Dump, keeps the parts that intersect a buffer
-- around the reference point, and stores them as ONE new row (ST_Collect)
-- under a new adm0_a3 code. The final UPDATE removes those parts from FRA.
-- Buffer radii are in the units of the geometry's spatial reference
-- (presumably degrees -- confirm the SRID returned by CDB_LatLng).
-- Not re-runnable: after one run the reference points fall inside the newly
-- inserted rows, so a second run would insert copies of them.

-- Split French Guiana from France
-- ST_Collect keeps this to a single GUF row even when several parts fall
-- inside the buffer, matching the Corse and Reunion inserts below.
INSERT INTO ne_admin0_v3 (the_geom, adm0_a3, name)
WITH parts AS (
    SELECT (ST_Dump(the_geom)).geom AS geom
    FROM ne_admin0_v3
    WHERE ST_Intersects(the_geom, CDB_LatLng(4, -53))
)
SELECT ST_Collect(geom), 'GUF', 'French Guiana'
FROM parts
WHERE ST_Intersects(geom, ST_Buffer(CDB_LatLng(4, -53), 8));

-- Split Corse from France
INSERT INTO ne_admin0_v3 (the_geom, adm0_a3, name)
WITH parts AS (
    SELECT (ST_Dump(the_geom)).geom AS geom
    FROM ne_admin0_v3
    WHERE ST_Intersects(the_geom, CDB_LatLng(42.14, 9.12))
)
SELECT ST_Collect(geom), 'FRH', 'Corse'
FROM parts
WHERE ST_Intersects(geom, ST_Buffer(CDB_LatLng(42.14, 9.12), 2));

-- Split Reunion from France
INSERT INTO ne_admin0_v3 (the_geom, adm0_a3, name)
WITH parts AS (
    SELECT (ST_Dump(the_geom)).geom AS geom
    FROM ne_admin0_v3
    WHERE ST_Intersects(the_geom, CDB_LatLng(-21.12, 55.51))
)
SELECT ST_Collect(geom), 'REU', 'Reunion'
FROM parts
WHERE ST_Intersects(geom, ST_Buffer(CDB_LatLng(-21.12, 55.51), 2));

-- Remove the above three from the FRA polygon:
-- rebuild FRA from only those of its parts that do not intersect the union of
-- the GUF, FRH and REU geometries inserted above.
WITH fra_parts AS (
    SELECT (ST_Dump(the_geom)).geom AS geom
    FROM ne_admin0_v3
    WHERE adm0_a3 = 'FRA'
)
UPDATE ne_admin0_v3
SET the_geom = (
    SELECT ST_Union(geom)
    FROM fra_parts
    WHERE NOT ST_Intersects(
        geom,
        (
            SELECT ST_Union(the_geom)
            FROM ne_admin0_v3
            WHERE adm0_a3 IN ('GUF', 'FRH', 'REU')
        )
    )
)
WHERE adm0_a3 = 'FRA';