New version 0.19.0

Merge pull request #298 from CartoDB/295-estimate-row-count
Add CDB_EstimateRowCount function
2017-04-11 11:22:20 +02:00 · 2017-04-11 11:01:31 +02:00 · 2017-04-10 15:58:49 +02:00 · 2017-04-10 13:50:37 +02:00 · 2017-04-10 12:17:47 +02:00 · 2017-04-10 08:08:59 +02:00
26 changed files with 598 additions and 66 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,14 +1,41 @@
 language: c

 addons:
-  postgresql: 9.3
+  postgresql: 9.5

 before_install:
+  # Add custom PPAs from cartodb
+  - sudo add-apt-repository -y ppa:cartodb/postgresql-9.5
+  - sudo add-apt-repository -y ppa:cartodb/gis
+  - sudo add-apt-repository -y ppa:cartodb/gis-testing
  - sudo apt-get update
-  #- sudo apt-get install -q postgresql-9.3-postgis-2.1
-  - sudo apt-get update
-  - sudo apt-get install -q postgresql-server-dev-9.3
-  - sudo apt-get install -q postgresql-plpython-9.3
+
+  # Force installation of libgeos-3.5.0 (presumably needed because of existing version of postgis)
+  - sudo apt-get -y install libgeos-3.5.0=3.5.0-1cdb2
+
+  # Install postgres db and build deps
+  - sudo /etc/init.d/postgresql stop # stop travis default instance
+  - sudo apt-get -y remove --purge postgresql-9.1
+  - sudo apt-get -y remove --purge postgresql-9.2
+  - sudo apt-get -y remove --purge postgresql-9.3
+  - sudo apt-get -y remove --purge postgresql-9.4
+  - sudo apt-get -y remove --purge postgresql-9.5
+  - sudo rm -rf /var/lib/postgresql/
+  - sudo rm -rf /var/log/postgresql/
+  - sudo rm -rf /etc/postgresql/
+  - sudo apt-get -y remove --purge postgis-2.2
+  - sudo apt-get -y autoremove
+
+  - sudo apt-get -y install postgresql-9.5=9.5.2-3cdb2
+  - sudo apt-get -y install postgresql-server-dev-9.5=9.5.2-3cdb2
+  - sudo apt-get -y install postgresql-plpython-9.5=9.5.2-3cdb2
+  - sudo apt-get -y install postgresql-9.5-postgis-scripts=2.2.2.0-cdb2
+  - sudo apt-get -y install postgresql-9.5-postgis-2.2=2.2.2.0-cdb2
+
+  # configure it to accept local connections from postgres
+  - echo -e "# TYPE  DATABASE        USER            ADDRESS                 METHOD \nlocal   all             postgres                                trust\nlocal   all             all                                     trust\nhost    all             all             127.0.0.1/32            trust" \
+    | sudo tee /etc/postgresql/9.5/main/pg_hba.conf
+  - sudo /etc/init.d/postgresql restart 9.5

 script:
  - make
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -62,3 +62,7 @@ A useful query:
 ```sql
 SELECT * FROM pg_extension_update_paths('cartodb') WHERE path IS NOT NULL AND source = cdb_version();
 ```
+
+## Submitting Contributions
+
+* You will need to sign a Contributor License Agreement (CLA) before making a submission. [Learn more here](https://carto.com/contributions).
--- a/4
+++ b/4
@@ -1,7 +1,7 @@
 # cartodb/Makefile

 EXTENSION = cartodb
-EXTVERSION = 0.18.4
+EXTVERSION = 0.19.0

 SED = sed

@@ -78,6 +78,8 @@ UPGRADABLE = \
  0.18.2 \
  0.18.3 \
  0.18.4 \
+  0.18.5 \
+  0.19.0 \
  $(EXTVERSION)dev \
  $(EXTVERSION)next \
  $(END)
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,15 @@
+0.19.0 (2017-04-11)
+
+* Add new function `CDB_EstimateRowCount` #295
+
+0.18.5 (2016-11-30)
+
+* Add two new overview creation strategies #290
+* Fix tests: race condition with publicuser #157
+* Fix: CDB_Stats divisions by zero #181
+* Better implementation of `CDB_EqualIntervalBins` #244
+* New tests for binning functions #249
+
 0.18.4 (2016-11-04)

 * No functional changes; fixes the migration from previous versions #288
--- a/doc/CDB_EstimateRowCount.md
+++ b/doc/CDB_EstimateRowCount.md
@@ -0,0 +1,25 @@
+Estimate the number of rows of a query.
+
+
+#### Using the function
+
+```sql
+SELECT CDB_EstimateRowCount($$
+  UPDATE addresses SET the_geom = cdb_geocode_street_point(addr, city, state, 'US');
+$$) AS row_count;
+```
+
+Result:
+
+```
+ row_count
+-----------
+         5
+(1 row)
+```
+
+#### Arguments
+
+CDB_EstimateRowCount(query)
+
+* **query** text: the SQL query to estimate the row count for.
--- a/scripts-available/CDB_EqualIntervalBins.sql
+++ b/scripts-available/CDB_EqualIntervalBins.sql
@@ -1,8 +1,8 @@
 --
 -- Calculate the equal interval bins for a given column
 --
-- @param in_array A numeric array of numbers to determine the best
--                   to determine the bin boundary
+-- @param in_array An array of numbers to determine the best
+--                   bin boundary
 --
 -- @param breaks The number of bins you want to find.
 --  
@@ -11,27 +11,14 @@
 -- 
 --

-CREATE OR REPLACE FUNCTION CDB_EqualIntervalBins ( in_array NUMERIC[], breaks INT ) RETURNS NUMERIC[] as $$
-DECLARE 
-    diff numeric;
-    min_val numeric;
-    max_val numeric;
-    tmp_val numeric;
-    i INT := 1;
-    reply numeric[];
-BEGIN
-    SELECT min(e), max(e) INTO min_val, max_val FROM ( SELECT unnest(in_array) e ) x WHERE e IS NOT NULL;
-    diff = (max_val - min_val) / breaks::numeric;
-    LOOP
-        IF i < breaks THEN
-            tmp_val = min_val + i::numeric * diff;
-            reply = array_append(reply, tmp_val);
-            i := i+1;
-        ELSE
-            reply = array_append(reply, max_val);
-            EXIT;
-        END IF;
-    END LOOP;
-    RETURN reply;
-END;
-$$ language plpgsql IMMUTABLE;
+CREATE OR REPLACE FUNCTION CDB_EqualIntervalBins ( in_array anyarray, breaks INT ) RETURNS anyarray as $$
+-- NULL elements are ignored by min()/max(). The last boundary is pinned to the maximum because
+-- lo + breaks*del can fall short of it (integer division truncates, numeric division rounds).
+WITH stats AS (
+  SELECT min(e) AS lo, max(e) AS hi, (max(e)-min(e))/breaks AS del
+    FROM (SELECT unnest(in_array) e) AS p)
+SELECT array_agg(CASE WHEN i = breaks THEN hi ELSE lo + i*del END ORDER BY i)
+  FROM stats CROSS JOIN generate_series(1, breaks) AS i;
+$$ LANGUAGE SQL IMMUTABLE;
+
+DROP FUNCTION IF EXISTS CDB_EqualIntervalBins( numeric[], integer);
--- a/scripts-available/CDB_EstimateRowCount.sql
+++ b/scripts-available/CDB_EstimateRowCount.sql
@@ -0,0 +1,31 @@
+-- Internal function: run ANALYZE on a table when pg_statistic holds no rows for it (reloid: the table)
+CREATE OR REPLACE FUNCTION _CDB_GenerateStats(reloid REGCLASS)
+RETURNS VOID
+AS $$
+DECLARE
+  has_stats BOOLEAN;
+BEGIN
+  SELECT EXISTS (
+    SELECT * FROM pg_catalog.pg_statistic WHERE starelid = reloid
+  ) INTO has_stats;
+  IF NOT has_stats THEN
+    EXECUTE Format('ANALYZE %s;', reloid); -- a regclass renders as a quoted (and, if needed, schema-qualified) name
+  END IF;
+END
+$$ LANGUAGE 'plpgsql' VOLATILE STRICT SECURITY DEFINER SET search_path = pg_temp; -- pinned so the caller's search_path cannot hijack name lookups
+
+-- Estimate the number of rows of the result of a query, based on the planner's statistics
+CREATE OR REPLACE FUNCTION CDB_EstimateRowCount(query text)
+RETURNS Numeric
+AS $$
+DECLARE
+  explain_output JSON;
+BEGIN
+  -- Every table referenced by the query gets statistics generated if it has none yet
+  PERFORM _CDB_GenerateStats(tabname) FROM  unnest(CDB_QueryTablesText(query)) AS tabname;
+
+  -- Ask the planner (EXPLAIN in JSON format) and read the row estimate of the top plan node
+  EXECUTE 'EXPLAIN (FORMAT JSON) ' || query INTO STRICT explain_output;
+  RETURN explain_output->0->'Plan'->'Plan Rows';
+END
+$$ LANGUAGE 'plpgsql' VOLATILE STRICT;
--- a/scripts-available/CDB_Overviews.sql
+++ b/scripts-available/CDB_Overviews.sql
@@ -697,6 +697,356 @@ AS $$
  END;
 $$ LANGUAGE PLPGSQL;

+
+CREATE OR REPLACE FUNCTION _CDB_GridCluster_Reduce_Strategy(reloid REGCLASS, ref_z INTEGER, overview_z INTEGER, grid_px FLOAT8 DEFAULT NULL)
+RETURNS REGCLASS
+AS $$
+  DECLARE
+    overview_rel TEXT;
+    reduction FLOAT8;
+    base_name TEXT;
+    pixel_m FLOAT8;
+    grid_m FLOAT8;
+    offset_m FLOAT8;
+    offset_x TEXT;
+    offset_y TEXT;
+    cell_x TEXT;
+    cell_y TEXT;
+    aggr_attributes TEXT;
+    attributes TEXT;
+    columns TEXT;
+    gtypes TEXT[];
+    schema_name TEXT;
+    table_name TEXT;
+    point_geom TEXT;
+  BEGIN
+    SELECT _CDB_GeometryTypes(reloid) INTO gtypes;
+    IF gtypes IS NULL OR array_upper(gtypes, 1) <> 1 OR gtypes[1] <> 'ST_Point' THEN
+      -- This strategy only supports datasets with point geometry
+      RETURN NULL;
+    END IF;
+
+    --TODO: check applicability: geometry type, minimum number of points...
+
+    overview_rel := _CDB_Overview_Name(reloid, ref_z, overview_z);
+
+    -- Grid size in pixels at Z level overview_z
+    IF grid_px IS NULL THEN
+      grid_px := 1.0;
+    END IF;
+
+    SELECT * FROM _cdb_split_table_name(reloid) INTO schema_name, table_name;
+
+    -- pixel_m: size of a pixel in webmercator units (meters)
+    SELECT CDB_XYZ_Resolution(overview_z) INTO pixel_m;
+    -- grid size in meters
+    grid_m = grid_px * pixel_m;
+
+    attributes := _CDB_Aggregable_Attributes_Expression(reloid);
+    aggr_attributes := _CDB_Aggregated_Attributes_Expression(reloid);
+    IF attributes <> '' THEN
+      attributes := ', ' || attributes;
+    END IF;
+    IF aggr_attributes <> '' THEN
+      aggr_attributes := aggr_attributes || ', ';
+    END IF;
+
+    -- Center of each cell:
+    cell_x := Format('gx*%1$s + %2$s', grid_m, grid_m/2);
+    cell_y := Format('gy*%1$s + %2$s', grid_m, grid_m/2);
+
+    -- Displacement to the nearest pixel center:
+    IF MOD(grid_px::numeric, 1.0::numeric) = 0 THEN
+      offset_m := pixel_m/2 - MOD((grid_m/2)::numeric, pixel_m::numeric)::float8;
+      offset_x := Format('%s', offset_m);
+      offset_y := Format('%s', offset_m);
+    ELSE
+      offset_x := Format('%2$s/2 - MOD((%1$s)::numeric, (%2$s)::numeric)::float8', cell_x, pixel_m);
+      offset_y := Format('%2$s/2 - MOD((%1$s)::numeric, (%2$s)::numeric)::float8', cell_y, pixel_m);
+    END IF;
+
+    point_geom := Format('ST_SetSRID(ST_MakePoint(%1$s + %3$s, %2$s + %4$s), 3857)', cell_x, cell_y, offset_x, offset_y);
+
+    -- compute the resulting columns in the same order as in the base table
+    WITH cols AS (
+      SELECT
+        CASE c
+        WHEN 'cartodb_id' THEN 'cartodb_id'
+        WHEN 'the_geom' THEN
+          Format('ST_Transform(%s, 4326) AS the_geom', point_geom)
+        WHEN 'the_geom_webmercator' THEN
+           Format('%s AS the_geom_webmercator', point_geom)
+        ELSE c
+        END AS column
+        FROM CDB_ColumnNames(reloid) c
+    )
+    SELECT string_agg(s.column, ',') FROM (
+      SELECT * FROM cols
+    ) AS s INTO columns;
+
+    IF NOT columns LIKE '%_feature_count%' THEN
+      columns := columns || ', n AS _feature_count';
+    END IF;
+
+    EXECUTE Format('DROP TABLE IF EXISTS %I.%I CASCADE;', schema_name, overview_rel);
+
+    -- Now we cluster the data using a grid of size grid_m
+    -- and select the centroid (average coordinates) of each cluster.
+    -- If we had a selected numeric attribute of interest we could use it
+    -- as a weight for the average coordinates.
+    EXECUTE Format('
+      CREATE TABLE %7$I.%3$I AS
+         WITH clusters AS (
+           SELECT
+             %5$s
+             count(*) AS n,
+             Floor(ST_X(f.the_geom_webmercator)/%2$s)::int AS gx,
+             Floor(ST_Y(f.the_geom_webmercator)/%2$s)::int AS gy,
+             MIN(cartodb_id) AS cartodb_id
+          FROM %1$s f
+          GROUP BY gx, gy
+         )
+         SELECT %6$s FROM clusters
+    ', reloid::text, grid_m, overview_rel, attributes, aggr_attributes, columns, schema_name);
+
+    RETURN Format('%I.%I', schema_name, overview_rel)::regclass;
+  END;
+$$ LANGUAGE PLPGSQL;
+
+-- This strategy places the aggregation of each cluster at the centroid of the cluster members.
+CREATE OR REPLACE FUNCTION _CDB_GridClusterCentroid_Reduce_Strategy(reloid REGCLASS, ref_z INTEGER, overview_z INTEGER, grid_px FLOAT8 DEFAULT NULL)
+RETURNS REGCLASS
+AS $$
+  DECLARE
+    overview_rel TEXT;
+    reduction FLOAT8;
+    base_name TEXT;
+    pixel_m FLOAT8;
+    grid_m FLOAT8;
+    offset_m FLOAT8;
+    offset_x TEXT;
+    offset_y TEXT;
+    cell_x TEXT;
+    cell_y TEXT;
+    aggr_attributes TEXT;
+    attributes TEXT;
+    columns TEXT;
+    gtypes TEXT[];
+    schema_name TEXT;
+    table_name TEXT;
+    point_geom TEXT;
+  BEGIN
+    SELECT _CDB_GeometryTypes(reloid) INTO gtypes;
+    IF gtypes IS NULL OR array_upper(gtypes, 1) <> 1 OR gtypes[1] <> 'ST_Point' THEN
+      -- This strategy only supports datasets with point geometry
+      RETURN NULL;
+    END IF;
+
+    --TODO: check applicability: geometry type, minimum number of points...
+
+    overview_rel := _CDB_Overview_Name(reloid, ref_z, overview_z);
+
+    -- Grid size in pixels at Z level overview_z
+    IF grid_px IS NULL THEN
+      grid_px := 1.0;
+    END IF;
+
+    SELECT * FROM _cdb_split_table_name(reloid) INTO schema_name, table_name;
+
+    -- pixel_m: size of a pixel in webmercator units (meters)
+    SELECT CDB_XYZ_Resolution(overview_z) INTO pixel_m;
+    -- grid size in meters
+    grid_m = grid_px * pixel_m;
+
+    attributes := _CDB_Aggregable_Attributes_Expression(reloid);
+    aggr_attributes := _CDB_Aggregated_Attributes_Expression(reloid);
+    IF attributes <> '' THEN
+      attributes := ', ' || attributes;
+    END IF;
+    IF aggr_attributes <> '' THEN
+      aggr_attributes := aggr_attributes || ', ';
+    END IF;
+
+    -- Center of each cell:
+    cell_x := Format('gx*%1$s + %2$s', grid_m, grid_m/2);
+    cell_y := Format('gy*%1$s + %2$s', grid_m, grid_m/2);
+
+    -- Displacement to the nearest pixel center:
+    IF MOD(grid_px::numeric, 1.0::numeric) = 0 THEN
+      offset_m := pixel_m/2 - MOD((grid_m/2)::numeric, pixel_m::numeric)::float8;
+      offset_x := Format('%s', offset_m);
+      offset_y := Format('%s', offset_m);
+    ELSE
+      offset_x := Format('%2$s/2 - MOD((%1$s)::numeric, (%2$s)::numeric)::float8', cell_x, pixel_m);
+      offset_y := Format('%2$s/2 - MOD((%1$s)::numeric, (%2$s)::numeric)::float8', cell_y, pixel_m);
+    END IF;
+
+    point_geom := Format('ST_SetSRID(ST_MakePoint(%1$s + %3$s, %2$s + %4$s), 3857)', cell_x, cell_y, offset_x, offset_y);
+
+    -- compute the resulting columns in the same order as in the base table
+    WITH cols AS (
+      SELECT
+        CASE c
+        WHEN 'cartodb_id' THEN 'cartodb_id'
+        WHEN 'the_geom' THEN
+          'ST_Transform(ST_SetSRID(ST_MakePoint(_sum_of_x/n, _sum_of_y/n), 3857), 4326) AS the_geom'
+        WHEN 'the_geom_webmercator' THEN
+          'ST_SetSRID(ST_MakePoint(_sum_of_x/n, _sum_of_y/n), 3857) AS the_geom_webmercator'
+        ELSE c
+        END AS column
+        FROM CDB_ColumnNames(reloid) c
+    )
+    SELECT string_agg(s.column, ',') FROM (
+      SELECT * FROM cols
+    ) AS s INTO columns;
+
+    IF NOT columns LIKE '%_feature_count%' THEN
+      columns := columns || ', n AS _feature_count';
+    END IF;
+
+    EXECUTE Format('DROP TABLE IF EXISTS %I.%I CASCADE;', schema_name, overview_rel);
+
+    -- Now we cluster the data using a grid of size grid_m
+    -- and select the centroid (average coordinates) of each cluster.
+    -- If we had a selected numeric attribute of interest we could use it
+    -- as a weight for the average coordinates.
+    EXECUTE Format('
+      CREATE TABLE %7$I.%3$I AS
+         WITH clusters AS (
+           SELECT
+             %5$s
+             count(*) AS n,
+             SUM(ST_X(f.the_geom_webmercator)) AS _sum_of_x,
+             SUM(ST_Y(f.the_geom_webmercator)) AS _sum_of_y,
+             Floor(ST_Y(f.the_geom_webmercator)/%2$s)::int AS gy,
+             Floor(ST_X(f.the_geom_webmercator)/%2$s)::int AS gx,
+             MIN(cartodb_id) AS cartodb_id
+          FROM %1$s f
+          GROUP BY gx, gy
+         )
+         SELECT %6$s FROM clusters
+    ', reloid::text, grid_m, overview_rel, attributes, aggr_attributes, columns, schema_name);
+
+    RETURN Format('%I.%I', schema_name, overview_rel)::regclass;
+  END;
+$$ LANGUAGE PLPGSQL;
+
+-- This strategy places the aggregation of each cluster at the position of one of the cluster members.
+CREATE OR REPLACE FUNCTION _CDB_GridClusterSample_Reduce_Strategy(reloid REGCLASS, ref_z INTEGER, overview_z INTEGER, grid_px FLOAT8 DEFAULT NULL)
+RETURNS REGCLASS
+AS $$
+  DECLARE
+    overview_rel TEXT;
+    reduction FLOAT8;
+    base_name TEXT;
+    pixel_m FLOAT8;
+    grid_m FLOAT8;
+    offset_m FLOAT8;
+    offset_x TEXT;
+    offset_y TEXT;
+    cell_x TEXT;
+    cell_y TEXT;
+    aggr_attributes TEXT;
+    attributes TEXT;
+    columns TEXT;
+    gtypes TEXT[];
+    schema_name TEXT;
+    table_name TEXT;
+    point_geom TEXT;
+  BEGIN
+    SELECT _CDB_GeometryTypes(reloid) INTO gtypes;
+    IF gtypes IS NULL OR array_upper(gtypes, 1) <> 1 OR gtypes[1] <> 'ST_Point' THEN
+      -- This strategy only supports datasets with point geometry
+      RETURN NULL;
+    END IF;
+
+    --TODO: check applicability: geometry type, minimum number of points...
+
+    overview_rel := _CDB_Overview_Name(reloid, ref_z, overview_z);
+
+    -- Grid size in pixels at Z level overview_z
+    IF grid_px IS NULL THEN
+      grid_px := 1.0;
+    END IF;
+
+    SELECT * FROM _cdb_split_table_name(reloid) INTO schema_name, table_name;
+
+    -- pixel_m: size of a pixel in webmercator units (meters)
+    SELECT CDB_XYZ_Resolution(overview_z) INTO pixel_m;
+    -- grid size in meters
+    grid_m = grid_px * pixel_m;
+
+    attributes := _CDB_Aggregable_Attributes_Expression(reloid);
+    aggr_attributes := _CDB_Aggregated_Attributes_Expression(reloid);
+    IF attributes <> '' THEN
+      attributes := ', ' || attributes;
+    END IF;
+    IF aggr_attributes <> '' THEN
+      aggr_attributes := aggr_attributes || ', ';
+    END IF;
+
+    -- Center of each cell:
+    cell_x := Format('gx*%1$s + %2$s', grid_m, grid_m/2);
+    cell_y := Format('gy*%1$s + %2$s', grid_m, grid_m/2);
+
+    -- Displacement to the nearest pixel center:
+    IF MOD(grid_px::numeric, 1.0::numeric) = 0 THEN
+      offset_m := pixel_m/2 - MOD((grid_m/2)::numeric, pixel_m::numeric)::float8;
+      offset_x := Format('%s', offset_m);
+      offset_y := Format('%s', offset_m);
+    ELSE
+      offset_x := Format('%2$s/2 - MOD((%1$s)::numeric, (%2$s)::numeric)::float8', cell_x, pixel_m);
+      offset_y := Format('%2$s/2 - MOD((%1$s)::numeric, (%2$s)::numeric)::float8', cell_y, pixel_m);
+    END IF;
+
+    point_geom := Format('ST_SetSRID(ST_MakePoint(%1$s + %3$s, %2$s + %4$s), 3857)', cell_x, cell_y, offset_x, offset_y);
+
+    -- compute the resulting columns in the same order as in the base table
+    WITH cols AS (
+      SELECT
+        CASE c
+        WHEN 'cartodb_id' THEN 'cartodb_id'
+        ELSE c
+        END AS column
+        FROM CDB_ColumnNames(reloid) c
+    )
+    SELECT string_agg(s.column, ',') FROM (
+      SELECT * FROM cols
+    ) AS s INTO columns;
+
+    IF NOT columns LIKE '%_feature_count%' THEN
+      columns := columns || ', n AS _feature_count';
+    END IF;
+
+    EXECUTE Format('DROP TABLE IF EXISTS %I.%I CASCADE;', schema_name, overview_rel);
+
+    -- Now we cluster the data using a grid of size grid_m
+    -- and keep the geometry of one member of each cluster (the one with the lowest cartodb_id).
+    -- (With a centroid-based strategy, a selected numeric attribute of interest
+    -- could be used as a weight for the average coordinates.)
+    EXECUTE Format('
+      CREATE TABLE %7$I.%3$I AS
+         WITH clusters AS (
+           SELECT
+             %5$s
+             count(*) AS n,
+             Floor(ST_X(_f.the_geom_webmercator)/%2$s)::int AS gx,
+             Floor(ST_Y(_f.the_geom_webmercator)/%2$s)::int AS gy,
+             MIN(cartodb_id) AS cartodb_id
+          FROM %1$s _f
+          GROUP BY gx, gy
+         ),
+         cluster_geom AS (
+           SELECT the_geom, the_geom_webmercator, clusters.*
+             FROM clusters INNER JOIN %1$s _g ON (clusters.cartodb_id = _g.cartodb_id)
+         )
+         SELECT %6$s FROM cluster_geom
+    ', reloid::text, grid_m, overview_rel, attributes, aggr_attributes, columns, schema_name);
+
+    RETURN Format('%I.%I', schema_name, overview_rel)::regclass;
+  END;
+$$ LANGUAGE PLPGSQL;
+
 -- Create overview tables for a dataset.
 -- Scope: public
 -- Parameters:
--- a/scripts-available/CDB_Stats.sql
+++ b/scripts-available/CDB_Stats.sql
@@ -4,7 +4,7 @@
 -- @param in_array A numeric array of numbers
 --
 -- Returns: statistical quantity chosen
-- 
+--
 -- References: http://www.itl.nist.gov/div898/handbook/eda/section3/eda35b.htm
 --

@@ -13,17 +13,21 @@ CREATE OR REPLACE FUNCTION CDB_Kurtosis ( in_array NUMERIC[] ) RETURNS NUMERIC a
 DECLARE
    a numeric;
    c numeric;
-    s numeric;
    k numeric;
 BEGIN
-    SELECT AVG(e), COUNT(e)::numeric, stddev(e) INTO a, c, s FROM ( SELECT unnest(in_array) e ) x;
+    SELECT AVG(e), COUNT(e)::numeric * power(stddev(e),4) INTO a, c FROM ( SELECT unnest(in_array) e ) x;

-    EXECUTE 'SELECT sum(power($1 - e, 4)) / ( $2 * power($3, 4)) - 3
-             FROM (SELECT unnest($4) e ) x'
-    INTO k
-    USING a, c, s, in_array;
+    IF c=0 THEN
+      RETURN 0;
+    ELSE

-    RETURN k;
+      EXECUTE 'SELECT sum(power($1 - e, 4)) / ($2 ) - 3
+             FROM (SELECT unnest($3) e ) x'
+      INTO k
+      USING a, c, in_array;
+
+      RETURN k;
+    END IF;
 END;
 $$ language plpgsql IMMUTABLE;

@@ -32,16 +36,18 @@ CREATE OR REPLACE FUNCTION CDB_Skewness ( in_array NUMERIC[] ) RETURNS NUMERIC a
 DECLARE
    a numeric;
    c numeric;
-    s numeric;
    sk numeric;
 BEGIN
-    SELECT AVG(e), COUNT(e)::numeric, stddev(e) INTO a, c, s FROM ( SELECT unnest(in_array) e ) x;
+    SELECT AVG(e), COUNT(e)::numeric * power(stddev(e),3) INTO a, c FROM ( SELECT unnest(in_array) e ) x;
+    IF c=0 THEN
+      RETURN 0;
+    ELSE
+      EXECUTE 'SELECT sum(power($1 - e, 3)) / ( $2 )
+             FROM (SELECT unnest($3) e ) x'
+      INTO sk
+      USING a, c, in_array;

-    EXECUTE 'SELECT sum(power($1 - e, 3)) / ( $2 * power($3, 3))
-             FROM (SELECT unnest($4) e ) x'
-    INTO sk
-    USING a, c, s, in_array;
-
-    RETURN sk;
+      RETURN sk;
+    END IF;
 END;
 $$ language plpgsql IMMUTABLE;
--- a/scripts-enabled/280-CDB_EstimateRowCount.sql
+++ b/scripts-enabled/280-CDB_EstimateRowCount.sql
@@ -0,0 +1 @@
+../scripts-available/CDB_EstimateRowCount.sql
--- a/test/CDB_DigitSeparatorTest_expect
+++ b/test/CDB_DigitSeparatorTest_expect
@@ -1,5 +1,6 @@
 BEGIN
 CREATE TABLE
+COPY 3
 none||
 only_com_dec|.|,
 only_dot_dec|,|.
--- a/test/CDB_EqualIntervalBinsTest.sql
+++ b/test/CDB_EqualIntervalBinsTest.sql
@@ -2,4 +2,10 @@ WITH data AS (
    SELECT array_agg(x::numeric) s FROM generate_series(1,300) x 
        WHERE x % 5 != 0 AND x % 7 != 0
    ) 
-SELECT round(unnest(CDB_EqualIntervalBins(s, 7)),7) FROM data
+SELECT round(unnest(CDB_EqualIntervalBins(s, 7)),7) FROM data;
+
+WITH data_nulls AS (
+    SELECT array_agg(CASE WHEN x % 2 != 0 THEN x ELSE NULL END::numeric) s FROM generate_series(1,100) x
+        WHERE x % 5 != 0 AND x % 7 != 0
+    )
+SELECT round(unnest(CDB_EqualIntervalBins(s, 7)),7) FROM data_nulls;
--- a/test/CDB_EqualIntervalBinsTest_expect
+++ b/test/CDB_EqualIntervalBinsTest_expect
@@ -5,3 +5,10 @@
 213.8571429
 256.4285714
 299.0000000
+15.0000000
+29.0000000
+43.0000000
+57.0000000
+71.0000000
+85.0000000
+99.0000000
--- a/test/CDB_EstimateRowCountTest.sql
+++ b/test/CDB_EstimateRowCountTest.sql
@@ -0,0 +1,10 @@
+SET client_min_messages TO error;
+\set VERBOSITY terse
+CREATE TABLE tmptab1(id INT);
+INSERT INTO tmptab1(id) VALUES (1), (2), (3);
+CREATE TABLE tmptab2(id INT, value NUMERIC);
+INSERT INTO tmptab2(id, value) VALUES (1, 10.0), (2, 20.0);
+SELECT CDB_EstimateRowCount('SELECT SUM(value) FROM tmptab1 INNER JOIN tmptab2 ON (tmptab1.id = tmptab2.id);') AS row_count;
+SELECT CDB_EstimateRowCount('UPDATE tmptab2 SET value = 30 WHERE id=2;') AS row_count;
+DROP TABLE tmptab2;
+DROP TABLE tmptab1;
--- a/test/CDB_EstimateRowCountTest_expect
+++ b/test/CDB_EstimateRowCountTest_expect
@@ -0,0 +1,9 @@
+SET
+CREATE TABLE
+INSERT 0 3
+CREATE TABLE
+INSERT 0 2
+1
+1
+DROP TABLE
+DROP TABLE
--- a/test/CDB_HeadsTailsBinsTest.sql
+++ b/test/CDB_HeadsTailsBinsTest.sql
@@ -1,5 +1,11 @@
 WITH data AS (
-    SELECT array_agg(x) x FROM generate_series(1,100) x 
+    SELECT array_agg(x::numeric) s FROM generate_series(1,100) x
        WHERE x % 5 != 0 AND x % 7 != 0
-    ) 
-SELECT round(unnest(CDB_HeadsTailsBins(x, 7)),2) FROM data
+    )
+SELECT round(unnest(CDB_HeadsTailsBins(s, 7)),2) FROM data;
+
+WITH data_nulls AS (
+    SELECT array_agg(CASE WHEN x % 2 != 0 THEN x ELSE NULL END::numeric) s FROM generate_series(1,100) x
+        WHERE x % 5 != 0 AND x % 7 != 0
+    )
+SELECT round(unnest(CDB_HeadsTailsBins(s, 7)),2) FROM data_nulls;
--- a/test/CDB_HeadsTailsBinsTest_expect
+++ b/test/CDB_HeadsTailsBinsTest_expect
@@ -5,3 +5,9 @@
 96.50
 98.00
 99.00
+49.76
+74.65
+88.50
+94.50
+98.00
+99.00
--- a/test/CDB_JenksBinsTest.sql
+++ b/test/CDB_JenksBinsTest.sql
@@ -1,5 +1,11 @@
 WITH data AS (
-    SELECT array_agg(x) x FROM generate_series(1,100) x 
+    SELECT array_agg(x::numeric) s FROM generate_series(1,300) x
        WHERE x % 5 != 0 AND x % 7 != 0
    ) 
-SELECT unnest(CDB_JenksBins(x, 7)) FROM data
+SELECT unnest(CDB_JenksBins(s, 7)) FROM data;
+
+WITH data_nulls AS (
+    SELECT array_agg(CASE WHEN x % 2 != 0 THEN x ELSE NULL END::numeric) s FROM generate_series(1,300) x
+        WHERE x % 5 != 0 AND x % 7 != 0
+    )
+SELECT unnest(CDB_JenksBins(s, 7)) FROM data_nulls;
--- a/test/CDB_JenksBinsTest_expect
+++ b/test/CDB_JenksBinsTest_expect
@@ -1,7 +1,14 @@
-13
-29
 43
-57
-71
-83
-99
+86
+129
+172
+213
+257
+299
+37
+51
+97
+157
+213
+241
+
--- a/test/CDB_QuantileBinsTest.sql
+++ b/test/CDB_QuantileBinsTest.sql
@@ -1,5 +1,11 @@
 WITH data AS (
-    SELECT array_agg(x) x FROM generate_series(1,100) x 
+    SELECT array_agg(x::numeric) s FROM generate_series(1,100) x
        WHERE x % 5 != 0 AND x % 7 != 0
    ) 
-SELECT unnest(CDB_QuantileBins(x, 7)) FROM data
+SELECT unnest(CDB_QuantileBins(s, 7)) FROM data;
+
+WITH data_nulls AS (
+    SELECT array_agg(CASE WHEN x % 2 != 0 THEN x ELSE NULL END::numeric) s FROM generate_series(1,100) x
+        WHERE x % 5 != 0 AND x % 7 != 0
+    )
+SELECT unnest(CDB_QuantileBins(s, 7)) FROM data_nulls;
--- a/test/CDB_QuantileBinsTest_expect
+++ b/test/CDB_QuantileBinsTest_expect
@@ -4,4 +4,11 @@
 57
 71
 86
+99
+29
+57
+87
+
+
+
 99
--- a/test/CDB_QueryStatementsTest.sql
+++ b/test/CDB_QueryStatementsTest.sql
@@ -1,3 +1,6 @@
+SET client_min_messages TO error;
+\set VERBOSITY terse
+
 WITH q AS ( SELECT CDB_QueryStatements('
 SELECT * FROM geometry_columns;
 ') as statement )
--- a/test/CDB_QueryStatementsTest_expect
+++ b/test/CDB_QueryStatementsTest_expect
@@ -1,3 +1,4 @@
+SET
 1|1|SELECT * FROM geometry_columns
 2|1|SELECT * FROM geometry_columns
 3|1|SELECT * FROM geometry_columns
--- a/test/CDB_QueryTablesTest.sql
+++ b/test/CDB_QueryTablesTest.sql
@@ -1,3 +1,5 @@
+SET client_min_messages TO warning;
+\set VERBOSITY terse

 WITH inp AS ( select 'SELECT * FROM geometry_columns'::text as q )
 SELECT q, CDB_QueryTables(q) from inp;
--- a/test/CDB_QueryTablesTest_expect
+++ b/test/CDB_QueryTablesTest_expect
@@ -1,17 +1,14 @@
+SET
 SELECT * FROM geometry_columns|{pg_catalog.pg_attribute,pg_catalog.pg_class,pg_catalog.pg_constraint,pg_catalog.pg_namespace,pg_catalog.pg_type}
 SELECT a.attname FROM pg_class c JOIN pg_attribute a on (a.attrelid = c.oid)|{pg_catalog.pg_attribute,pg_catalog.pg_class}
 CREATE table "my'tab;le" as select 1|{}
 SELECT a.oid, b.oid FROM pg_class a, pg_class b|{pg_catalog.pg_class}
 SELECT 1 as col1; select 2 as col2|{}
 WARNING:  CDB_QueryTables cannot explain query: select 1 from nonexistant (42P01: relation "nonexistant" does not exist)
-CONTEXT:  PL/pgSQL function cdb_querytables(text) line 3 at RETURN
 ERROR:  relation "nonexistant" does not exist
-CONTEXT:  PL/pgSQL function cdb_querytables(text) line 3 at RETURN
 begin; select * from pg_class; commit;|{pg_catalog.pg_class}
 WARNING:  CDB_QueryTables cannot explain query: select * from test (42P01: relation "test" does not exist)
-CONTEXT:  PL/pgSQL function cdb_querytables(text) line 3 at RETURN
 ERROR:  relation "test" does not exist
-CONTEXT:  PL/pgSQL function cdb_querytables(text) line 3 at RETURN
 WITH a AS (select * from pg_class) select * from a|{pg_catalog.pg_class}
 CREATE SCHEMA
 CREATE TABLE
--- a/test/extension/test.sh
+++ b/test/extension/test.sh
@@ -228,6 +228,7 @@ function tear_down() {
    sql 'DROP ROLE cdb_testmember_2;'

    tear_down_database
+    DATABASE=postgres sql postgres 'DROP ROLE IF EXISTS publicuser';
 }


@@ -486,6 +487,18 @@ function test_foreign_tables() {
    ${CMD} -d fdw_target -f scripts-available/CDB_QueryTables.sql
    ${CMD} -d fdw_target -f scripts-available/CDB_TableMetadata.sql

+    DATABASE=fdw_target sql postgres "DO
+\$\$
+BEGIN
+   IF NOT EXISTS (
+      SELECT *
+      FROM   pg_catalog.pg_user
+      WHERE  usename = 'publicuser') THEN
+
+      CREATE ROLE publicuser LOGIN;
+   END IF;
+END
+\$\$;"
    DATABASE=fdw_target sql postgres 'CREATE SCHEMA test_fdw;'
    DATABASE=fdw_target sql postgres 'CREATE TABLE test_fdw.foo (a int);'
    DATABASE=fdw_target sql postgres 'INSERT INTO test_fdw.foo (a) values (42);'
Author	SHA1	Message	Date
Javier Goizueta	2d473cf693	New version 0.19.0	2017-04-11 11:22:20 +02:00
Javier Goizueta	4193ff3874	Merge pull request #298 from CartoDB/295-estimate-row-count Add CDB_EstimateRowCount function	2017-04-11 11:01:31 +02:00
Javier Goizueta	68a0752849	Use PG 9.5 for travis tests; fix tests	2017-04-10 15:58:49 +02:00
Javier Goizueta	815b5b429d	Fix tests	2017-04-10 13:50:37 +02:00
Javier Goizueta	76bdb3657a	Fix tests	2017-04-10 12:17:47 +02:00
Javier Goizueta	234373df11	Replace unnecessary count	2017-04-10 08:08:59 +02:00
Javier Goizueta	a486eed2e3	Add CDB_EstimateRowCount function See #295	2017-04-07 15:35:48 +02:00
Mario de Frutos	795d92da8d	Added CLA paragraph	2017-01-25 10:54:12 +01:00
Javier Goizueta	58e2e7e238	Release 0.18.5	2016-11-30 17:17:45 +01:00
Javier Goizueta	25d27263cb	Merge pull request #249 from CartoDB/nullbins Test behavior of binning fuctions with nulls	2016-11-30 16:09:23 +01:00
Javier Goizueta	bbadcc838e	Merge pull request #244 from CartoDB/equalbins Convert CDB_EqualIntervalBins to a single SQL statement and add float version	2016-11-30 16:09:05 +01:00
Javier Goizueta	b1a0904c07	Merge pull request #181 from CartoDB/update_to_cdb_stats Fix for division by zero error on empty or homogenous array	2016-11-30 16:08:40 +01:00
Javier Goizueta	399b680b41	Merge pull request #283 from CartoDB/157-test-fixes Fix tests: race condition with publicuser #157	2016-11-30 16:08:21 +01:00
Javier Goizueta	7c0636c5f9	Merge pull request #290 from CartoDB/286-overview-strategies Add point overview strategies	2016-11-30 11:46:29 +01:00
Javier Goizueta	f58f870457	Remove use of first aggregator in sample-cluster overviews strategy This is not more efficient but the geometry now corresponds to the cartodb_id and the dependency with custom aggregators (firt) is removed.	2016-11-29 14:08:08 +01:00
Javier Goizueta	5992304b47	Add a couple of overview clustering strategies	2016-11-03 13:31:04 +01:00
Rafa de la Torre	30cd4cf1f9	Fix tests: race condition with publicuser #157	2016-10-17 16:31:10 +02:00
Paul Norman	3122a0479d	Test behavior of binning fuctions with nulls All test results are based off of existing behavior, which doesn't always make sense (ref #247)	2016-04-28 09:59:33 -07:00
Paul Norman	956e56cd37	Use anyarray for equalintervalbins	2016-04-27 16:10:01 -07:00
Paul Norman	b19a5fc3dc	Convert CDB_EqualIntervalBins to a single SQL statement and add float version	2016-04-25 14:35:26 -07:00
Stuart Lynn	0ecbbd8e71	Make sure that empty arrays or arrays with all the same entry return 0 for Skewness and Kurtosis rather than throwing a division by zero error	2015-12-04 14:54:15 -05:00
				`@@ -0,0 +1 @@`
				`../scripts-available/CDB_EstimateRowCount.sql`