Merge branch 'master' of https://github.com/CartoDB/data-services

merge
2014-08-31 19:36:17 -04:00
parent 27b1678cfa a2d3addcef
commit 342fc8b4c1
6 changed files with 159 additions and 117 deletions
--- a/geocoder/admin1/sql/geocoder.sql
+++ b/geocoder/admin1/sql/geocoder.sql
@@ -1,97 +1,74 @@
 --- Usage

--SELECT (geocode_admin1_polygons(Array['TX','Cuidad Real', 'sevilla'])).*
-
--- Function
-
-CREATE OR REPLACE FUNCTION test_geocode_admin1_polygons(name text[])
-  RETURNS SETOF geocode_admin_v1 AS $$
-  DECLARE 
-    ret geocode_admin_v1%rowtype;
-  BEGIN
-  FOR ret IN
-    SELECT
-       q, geom, CASE WHEN geom IS NULL THEN FALSE ELSE TRUE END AS success
-    FROM (
-      SELECT 
-        q, (
-          SELECT the_geom 
-          FROM global_province_polygons
-          WHERE d.c = ANY (synonyms) 
-          -- To calculate frequency, I simply counted the number of users
-          -- we had signed up in each country. Countries with more users, 
-          -- we favor higher in the geocoder :)
-          ORDER BY frequency DESC LIMIT 1
-        ) geom
-      FROM (SELECT trim(replace(lower(unnest(name)),'.',' ')) c, unnest(name) q) d
-    ) v
-  LOOP 
-    RETURN NEXT ret;
-  END LOOP;
-  RETURN;
-END
-$$ LANGUAGE 'plpgsql' SECURITY DEFINER;
-Text array, country name
-
-
-- CREATE OR REPLACE FUNCTION test_geocode_admin1_polygons(name text[])
--   RETURNS SETOF geocode_admin_v1 AS $$
--   DECLARE 
--     ret geocode_admin_v1%rowtype;
--   BEGIN
--   -- FOR ret IN
--   RETURN QUERY
--     SELECT 
--       d.q, n.the_geom as geom, 
--       CASE WHEN s.adm1_code IS NULL then FALSE ELSE TRUE END AS success 
--       FROM (
--         SELECT 
--           q, lower(regexp_replace(q, '[^a-zA-Z\u00C0-\u00ff]+', '', 'g'))::text x 
--         FROM (SELECT unnest(name) q) g
--         ) d 
--       LEFT OUTER JOIN 
--         admin1_synonyms s ON name_ = d.x 
--       LEFT OUTER JOIN 
--         ne_admin1_v3 n ON s.adm1_code = n.adm1_code;
-- END
-- $$ LANGUAGE 'plpgsql' SECURITY DEFINER;
-
-
--- Usage
-
 --- SELECT (geocode_admin1_polygons(Array['az', 'Texas'], 'Ecuador')).*

+
 --- Function
 CREATE OR REPLACE FUNCTION test_geocode_admin1_polygons(name text[], inputcountry text)
-  RETURNS SETOF geocode_admin_v1 AS $$
+  RETURNS SETOF geocode_admin_country_v1 AS $$ -- one row per element of name: (q, inputcountry, geom, success)
  DECLARE 
-    ret geocode_admin_v1%rowtype;
+    ret geocode_admin_country_v1%rowtype;
+    adm0 TEXT; -- adm0_a3 value looked up for inputcountry; stays NULL when no admin0_synonyms row matches
+    adm0_check BOOLEAN := TRUE; -- TRUE while inputcountry is usable as a country filter
  BEGIN

-  FOR ret IN WITH 
-    p AS (SELECT r.c, r.q, (SELECT iso3 FROM country_decoder WHERE lower(inputcountry) = ANY (synonyms)) i FROM (SELECT  trim(replace(lower(unnest(name)),'.',' ')) c, unnest(name) q) r)
-    SELECT
-       q, geom, CASE WHEN geom IS NULL THEN FALSE ELSE TRUE END AS success
-    FROM (
-      SELECT 
-        q, (
-          SELECT the_geom 
-          FROM global_province_polygons
-          WHERE p.c = ANY (synonyms) 
-          AND iso3 = p.i
-          -- To calculate frequency, I simply counted the number of users
-          -- we had signed up in each country. Countries with more users, 
-          -- we favor higher in the geocoder :)
-          ORDER BY frequency DESC LIMIT 1
-        ) geom
-      FROM p) n
-    LOOP
-    RETURN NEXT ret;
-  END LOOP;
+  IF inputcountry IS NULL THEN -- no country supplied
+    adm0_check = FALSE;
+  END IF;
+  IF trim(inputcountry)='' THEN -- a blank country is treated like no country
+    adm0_check = FALSE;
+  END IF;
+
+  IF adm0_check IS TRUE THEN -- country supplied: resolve it, then match names within that country only
+    SELECT INTO adm0 adm0_a3 FROM admin0_synonyms WHERE name_ = lower(regexp_replace(inputcountry, '[^a-zA-Z\u00C0-\u00ff]+', '', 'g'))::text LIMIT 1; -- lookup key: inputcountry with every character outside the bracketed ranges removed, then lowercased
+
+    FOR ret IN
+      SELECT
+        q, inputcountry, geom, CASE WHEN geom IS NULL THEN FALSE ELSE TRUE END AS success -- success is TRUE only when a geometry was found
+      FROM (
+        SELECT 
+          q, (
+            SELECT the_geom FROM qs_adm1 WHERE global_id = (
+              SELECT global_id
+              FROM admin1_synonyms
+              WHERE name_ =  lower(regexp_replace(d.q, '[^a-zA-Z\u00C0-\u00ff]+', '', 'g'))::text -- same normalisation as applied to inputcountry above
+                AND adm0_a3 = adm0 -- adm0 is presumably the variable set above (assumes admin1_synonyms has no adm0 column); when it is NULL this matches no row
+              LIMIT 1 -- no ORDER BY, so which synonym row is picked is unspecified when several match
+              )
+            ) geom
+        FROM (SELECT unnest(name) q) d
+      ) v
+    LOOP 
+      RETURN NEXT ret;
+    END LOOP;
+
+  --No usable country (inputcountry is NULL or blank): match names without any country filter
+  ELSE
+    FOR ret IN
+      SELECT
+        q, inputcountry, geom, CASE WHEN geom IS NULL THEN FALSE ELSE TRUE END AS success
+      FROM (
+        SELECT 
+          q, (
+            SELECT the_geom FROM qs_adm1 WHERE global_id = (
+              SELECT global_id
+              FROM admin1_synonyms
+              WHERE name_ =  lower(regexp_replace(d.q, '[^a-zA-Z\u00C0-\u00ff]+', '', 'g'))::text
+              LIMIT 1 -- no ORDER BY, so which synonym row is picked is unspecified when several match
+              )
+            ) geom
+        FROM (SELECT unnest(name) q) d
+      ) v
+    LOOP 
+      RETURN NEXT ret;
+    END LOOP;
+  END IF;
  RETURN;
 END
-$$ LANGUAGE 'plpgsql' SECURITY DEFINER;
-Text array, country array
+$$ LANGUAGE 'plpgsql';
+
+
+--Text array, country array

 --- Usage

@@ -103,40 +80,12 @@ CREATE OR REPLACE FUNCTION test_geocode_admin1_polygons(names text[], country te
  RETURNS SETOF geocode_admin_country_v1 AS $$
  DECLARE 
    ret geocode_admin_country_v1%rowtype;
-    nans TEXT[];
  BEGIN

-
-  SELECT array_agg(p) INTO nans FROM (SELECT unnest(names) p, unnest(country) c) g WHERE c IS NULL;
-
-  IF 0 < array_length(nans, 1) THEN
-    SELECT array_agg(p), array_agg(c) INTO names, country FROM (SELECT unnest(names) p, unnest(country) c) g WHERE c IS NOT NULL;
-    FOR ret IN SELECT g.q, NULL as c, g.geom, g.success FROM (SELECT (geocode_admin1_polygons(nans)).*) g LOOP
-      RETURN NEXT ret;
-    END LOOP;
-  END IF;
-
-
-  FOR ret IN WITH 
-    p AS (SELECT r.p, r.q, c, (SELECT iso3 FROM country_decoder WHERE lower(r.c) = ANY (synonyms)) i FROM (SELECT  trim(replace(lower(unnest(names)),'.',' ')) p, unnest(names) q, unnest(country) c) r)
-    SELECT
-       q, c, geom, CASE WHEN geom IS NULL THEN FALSE ELSE TRUE END AS success
-    FROM (
-      SELECT 
-        q, c, (
-          SELECT the_geom 
-          FROM global_province_polygons
-          WHERE p.p = ANY (synonyms) 
-          AND iso3 = p.i
-          -- To calculate frequency, I simply counted the number of users
-          -- we had signed up in each country. Countries with more users, 
-          -- we favor higher in the geocoder :)
-          ORDER BY frequency DESC LIMIT 1
-        ) geom
-      FROM p) n
-    LOOP
+  FOR ret IN SELECT (test_geocode_admin1_polygons(array_agg(n), c)).* FROM (SELECT unnest(names) n, unnest(country) c) a GROUP BY c LOOP 
    RETURN NEXT ret;
  END LOOP;
  RETURN;
 END
-$$ LANGUAGE 'plpgsql' SECURITY DEFINER;
+$$ LANGUAGE 'plpgsql';
+
--- a/geocoder/ip-addresses/README.md
+++ b/geocoder/ip-addresses/README.md
@@ -5,11 +5,14 @@ IP address geocoder

 ### Creation steps

-1. upload a new dataset to the geocoder table, call it latest_ip_address_locations
+1. Upload a new dataset to the geocoder table, call it latest_ip_address_locations
 2. Run the sql/build_data_table script to update the table

 ### Data Sources

+GeoLite2 open source database, [created by MaxMind](http://www.maxmind.com) -
+http://dev.maxmind.com/geoip/geoip2/geolite2/ (download the [GeoLite2 City CSV](http://geolite.maxmind.com/download/geoip/database/GeoLite2-City-CSV.zip)).
+
 ### Preparation details


--- a/geocoder/ip-addresses/sql/build_data_table.sql
+++ b/geocoder/ip-addresses/sql/build_data_table.sql
@@ -1,5 +1,5 @@

---- Postal Code Polygon table ---
+---- IP addresses table ---
 --- ---

 -- Clear table
--- a/geocoder/postal-codes/README.md
+++ b/geocoder/postal-codes/README.md
@@ -3,8 +3,14 @@ Postal code geocoder (polygons)

 ### Function

+Following the steps below populates a table with postal codes from Australia, Canada, the USA and France (each identified by its iso3 country code), together with the polygon that covers each postal code.
+
 ### Creation steps

+1. Import the four files listed in the "Data Sources" section below.
+
+2. Run sql/build_data_table.sql. Note that the table "postal_code_polygons" must already exist, with the columns _the_geom_, _adm0_a3_ and _postal_code_.
+
 ### Data Sources

 Australian polygons - http://www.abs.gov.au/AUSSTATS/abs@.nsf/DetailsPage/2033.0.55.0012011?OpenDocument
@@ -20,6 +26,58 @@ French polygons - http://www.data.gouv.fr/dataset/fond-de-carte-des-codes-postau

 ### Preparation details

+The names of the imported files are:
+
+- doc for Australia table
+- gfsa000a11a_e for Canada table
+- tl_2013_us_zcta510 for USA table
+- codes_postaux for France table
+
 # Postal code geocoder (points)

-todo
+### Function
+
+Following the steps below populates a table with postal codes from different countries (each identified by its iso3 country code), together with the point location of each postal code.
+
+This dataset includes data for the following countries:
+
+````
+CH, ES, GU, ZA, MX, SJ, NL, RU, AX, TH, AR, MY, RE, LK, GB, IS, GL, JE, DK, IN,
+SI, GP, MQ, BR, SM, BG, NZ, MP, CZ, DO, MD, PK, TR, VI, BD, GG, LT, PM, MC, US,
+IT, LU, SK, LI, PR, IM, NO, PT, PL, FI, JP, CA, DE, HU, PH, SE, VA, YT, MK, FR,
+MH, RO, FO, GF, AD, HR, DZ, GT, AU, AS, BE, AT
+````
+
+### Creation steps
+
+1. Download the allCountries.zip file from [GeoNames](http://www.geonames.org). Import it and rename the resulting table to tmp_zipcode_points. Alternatively, follow the manual process explained below.
+
+
+The columns that are loaded are the following:
+- field_1: corresponds to ISO2
+- field_10: corresponds to latitude
+- field_11: corresponds to longitude
+- field_2: corresponds to ZIP code
+
+2. Georeference the table using field_11 as longitude and field_10 as latitude in order to construct the_geom.
+
+3. Add column iso3 (text) and run sql/build_zipcode_points_table.sql.
+
+
+**Alternative manual process**
+
+Open the allCountries.txt file with Excel and add a new row on top. Delete columns C-I and L.
+
+In the first row, add the following columns: iso2, zipcode, lat, long.
+
+Import the file ignoring step 2.
+
+### Data Sources
+
+All countries points [GeoNames](http://www.geonames.org) - http://download.geonames.org/export/zip/allCountries.zip
+
+### Preparation details
+
+_The large size of the dataset may cause interruptions while the coordinates are processed after the file is uploaded; manipulating the file before importing is a faster workaround._
+
+
--- a/geocoder/postal-codes/sql/build_zipcode_points_table.sql
+++ b/geocoder/postal-codes/sql/build_zipcode_points_table.sql
@@ -0,0 +1,26 @@
+
+---- Postal Code Points table ---
+-- Rebuilds zipcode_points from the imported tmp_zipcode_points table.
+
+-- Clear table (intentionally unfiltered: the whole table is rebuilt)
+
+DELETE FROM zipcode_points;
+
+-- Insert points
+-- NOTE(review): the original inserted into zip_code_points while clearing
+-- zipcode_points; the name used by the DELETE is assumed correct -- confirm.
+
+INSERT INTO zipcode_points (the_geom, zipcode, iso3)
+SELECT the_geom, zipcode,
+		(
+		-- Translate the ISO2 code of each imported row into its ISO3 code
+		SELECT country_decoder.iso3 FROM country_decoder
+		WHERE tmp_zipcode_points.iso2 = country_decoder.iso2
+		)
+FROM tmp_zipcode_points;
+
+-- Drops temporary table
+
+DROP TABLE tmp_zipcode_points;
+
+
--- a/geocoder/setup/indexes.sql
+++ b/geocoder/setup/indexes.sql
@@ -4,6 +4,12 @@ CREATE INDEX idx_admin0_synonyms_name_ ON admin0_synonyms (name_);
 CREATE INDEX idx_admin0_synonyms_rank ON admin0_synonyms (rank);
 -- CREATE INDEX idx_admin0_synonyms_name_rank ON admin0_synonyms (name_, rank);

+-- Indexes for the admin1 lookups: qs_adm1 by global_id, admin1_synonyms by (name_, adm0_a3)
+CREATE UNIQUE INDEX idx_qs_adm1_global_id ON qs_adm1 (global_id);
+CREATE INDEX idx_admin1_synonyms_name_adm0 ON admin1_synonyms (name_, adm0_a3);
+
 -- create indexes on polygon table
 CREATE UNIQUE INDEX idx_ne_admin0_v3_adm0_a3 ON ne_admin0_v3 (adm0_a3);

+-- create indexes on postal code polygon table (one row per country code / postal code pair)
+CREATE UNIQUE INDEX idx_postal_code_polygons_a3_code ON postal_code_polygons (adm0_a3, postal_code);