New version 0.19.0

Merge pull request #298 from CartoDB/295-estimate-row-count
Add CDB_EstimateRowCount function
2017-04-11 11:22:20 +02:00 · 2017-04-11 11:01:31 +02:00 · 2017-04-10 15:58:49 +02:00 · 2017-04-10 13:50:37 +02:00 · 2017-04-10 12:17:47 +02:00 · 2017-04-10 08:08:59 +02:00
26 changed files with 604 additions and 66 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,14 +1,41 @@
 language: c

 addons:
-  postgresql: 9.3
+  postgresql: 9.5

 before_install:
+  # Add custom PPAs from cartodb
+  - sudo add-apt-repository -y ppa:cartodb/postgresql-9.5
+  - sudo add-apt-repository -y ppa:cartodb/gis
+  - sudo add-apt-repository -y ppa:cartodb/gis-testing
  - sudo apt-get update
-  #- sudo apt-get install -q postgresql-9.3-postgis-2.1
-  - sudo apt-get update
-  - sudo apt-get install -q postgresql-server-dev-9.3
-  - sudo apt-get install -q postgresql-plpython-9.3
+
+  # Force installation of libgeos-3.5.0 (presumably needed because of existing version of postgis)
+  - sudo apt-get -y install libgeos-3.5.0=3.5.0-1cdb2
+
+  # Install postgres db and build deps
+  - sudo /etc/init.d/postgresql stop # stop travis default instance
+  - sudo apt-get -y remove --purge postgresql-9.1
+  - sudo apt-get -y remove --purge postgresql-9.2
+  - sudo apt-get -y remove --purge postgresql-9.3
+  - sudo apt-get -y remove --purge postgresql-9.4
+  - sudo apt-get -y remove --purge postgresql-9.5
+  - sudo rm -rf /var/lib/postgresql/
+  - sudo rm -rf /var/log/postgresql/
+  - sudo rm -rf /etc/postgresql/
+  - sudo apt-get -y remove --purge postgis-2.2
+  - sudo apt-get -y autoremove
+
+  - sudo apt-get -y install postgresql-9.5=9.5.2-3cdb2
+  - sudo apt-get -y install postgresql-server-dev-9.5=9.5.2-3cdb2
+  - sudo apt-get -y install postgresql-plpython-9.5=9.5.2-3cdb2
+  - sudo apt-get -y install postgresql-9.5-postgis-scripts=2.2.2.0-cdb2
+  - sudo apt-get -y install postgresql-9.5-postgis-2.2=2.2.2.0-cdb2
+
+  # configure it to accept local connections from postgres
+  - echo -e "# TYPE  DATABASE        USER            ADDRESS                 METHOD \nlocal   all             postgres                                trust\nlocal   all             all                                     trust\nhost    all             all             127.0.0.1/32            trust" \
+    | sudo tee /etc/postgresql/9.5/main/pg_hba.conf
+  - sudo /etc/init.d/postgresql restart 9.5

 script:
  - make
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -62,3 +62,7 @@ A useful query:
 ```sql
 SELECT * FROM pg_extension_update_paths('cartodb') WHERE path IS NOT NULL AND source = cdb_version();
 ```
+
+## Submitting Contributions
+
+* You will need to sign a Contributor License Agreement (CLA) before making a submission. [Learn more here](https://carto.com/contributions).
--- a/6
+++ b/6
@@ -1,7 +1,7 @@
 # cartodb/Makefile

 EXTENSION = cartodb
-EXTVERSION = 0.18.3
+EXTVERSION = 0.19.0

 SED = sed

@@ -75,7 +75,11 @@ UPGRADABLE = \
  0.17.1 \
  0.18.0 \
  0.18.1 \
+  0.18.2 \
  0.18.3 \
+  0.18.4 \
+  0.18.5 \
+  0.19.0 \
  $(EXTVERSION)dev \
  $(EXTVERSION)next \
  $(END)
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,19 @@
+0.19.0 (2017-04-11)
+
+* Add new function `CDB_EstimateRowCount` #295
+
+0.18.5 (2016-11-30)
+
+* Add two new overview creation strategies #290
+* Fix tests: race condition with publicuser #157
+* Fix: CDB_Stats divisions by zero #181
+* Better implementation of `CDB_EqualIntervalBins` #244
+* New tests for binning functions #249
+
+0.18.4 (2016-11-04)
+
+* No functional changes; fixes the migration from previous versions #288
+
 0.18.3 (2016-11-03)

 * Exclude analysis cache tables from the quota #281
--- a/doc/CDB_EstimateRowCount.md
+++ b/doc/CDB_EstimateRowCount.md
@@ -0,0 +1,25 @@
+Estimate the number of rows of a query.
+
+
+#### Using the function
+
+```sql
+SELECT CDB_EstimateRowCount($$
+  UPDATE addresses SET the_geom = cdb_geocode_street_point(addr, city, state, 'US');
+$$) AS row_count;
+```
+
+Result:
+
+```
+ row_count
+-----------
+         5
+(1 row)
+```
+
+#### Arguments
+
+CDB_EstimateRowCount(query)
+
+* **query** text: the SQL query to estimate the row count for.
--- a/scripts-available/CDB_EqualIntervalBins.sql
+++ b/scripts-available/CDB_EqualIntervalBins.sql
@@ -1,8 +1,8 @@
 --
 -- Calculate the equal interval bins for a given column
 --
-- @param in_array A numeric array of numbers to determine the best
--                   to determine the bin boundary
+-- @param in_array An array of numbers to determine the best
+--                   bin boundary
 --
 -- @param breaks The number of bins you want to find.
 --  
@@ -11,27 +11,14 @@
 -- 
 --

-CREATE OR REPLACE FUNCTION CDB_EqualIntervalBins ( in_array NUMERIC[], breaks INT ) RETURNS NUMERIC[] as $$
-DECLARE 
-    diff numeric;
-    min_val numeric;
-    max_val numeric;
-    tmp_val numeric;
-    i INT := 1;
-    reply numeric[];
-BEGIN
-    SELECT min(e), max(e) INTO min_val, max_val FROM ( SELECT unnest(in_array) e ) x WHERE e IS NOT NULL;
-    diff = (max_val - min_val) / breaks::numeric;
-    LOOP
-        IF i < breaks THEN
-            tmp_val = min_val + i::numeric * diff;
-            reply = array_append(reply, tmp_val);
-            i := i+1;
-        ELSE
-            reply = array_append(reply, max_val);
-            EXIT;
-        END IF;
-    END LOOP;
-    RETURN reply;
-END;
-$$ language plpgsql IMMUTABLE;
+CREATE OR REPLACE FUNCTION CDB_EqualIntervalBins ( in_array anyarray, breaks INT ) RETURNS anyarray as $$
+WITH stats AS (
+  SELECT min(e), (max(e)-min(e))/breaks AS del
+    FROM (SELECT unnest(in_array) e) AS p)
+SELECT array_agg(bins)
+  FROM (
+    SELECT min + generate_series(1,breaks)*del AS bins
+      FROM stats) q;
+$$ LANGUAGE SQL IMMUTABLE;
+
+DROP FUNCTION IF EXISTS CDB_EqualIntervalBins( numeric[], integer);
--- a/scripts-available/CDB_EstimateRowCount.sql
+++ b/scripts-available/CDB_EstimateRowCount.sql
@@ -0,0 +1,31 @@
+-- Internal function to generate stats for a table if they don't exist
+CREATE OR REPLACE FUNCTION _CDB_GenerateStats(reloid REGCLASS)
+RETURNS VOID
+AS $$
+DECLARE
+  has_stats BOOLEAN;
+BEGIN
+  SELECT EXISTS (
+    SELECT * FROM pg_catalog.pg_statistic WHERE starelid = reloid
+  ) INTO has_stats;
+  IF NOT has_stats THEN
+    EXECUTE Format('ANALYZE %s;', reloid);
+  END IF;
+END
+$$ LANGUAGE 'plpgsql' VOLATILE STRICT SECURITY DEFINER;
+
+-- Return a row count estimate of the result of a query using statistics
+CREATE OR REPLACE FUNCTION CDB_EstimateRowCount(query text)
+RETURNS Numeric
+AS $$
+DECLARE
+  plan JSON;
+BEGIN
+  -- Make sure statistics exist for all the tables of the query
+  PERFORM _CDB_GenerateStats(tabname) FROM  unnest(CDB_QueryTablesText(query)) AS tabname;
+
+  -- Use the query planner to obtain an estimate of the number of result rows
+  EXECUTE 'EXPLAIN (FORMAT JSON) ' || query INTO STRICT plan;
+  RETURN plan->0->'Plan'->'Plan Rows';
+END
+$$ LANGUAGE 'plpgsql' VOLATILE STRICT;
--- a/scripts-available/CDB_Overviews.sql
+++ b/scripts-available/CDB_Overviews.sql
@@ -697,6 +697,356 @@ AS $$
  END;
 $$ LANGUAGE PLPGSQL;

+
+CREATE OR REPLACE FUNCTION _CDB_GridCluster_Reduce_Strategy(reloid REGCLASS, ref_z INTEGER, overview_z INTEGER, grid_px FLOAT8 DEFAULT NULL)
+RETURNS REGCLASS
+AS $$
+  DECLARE
+    overview_rel TEXT;
+    reduction FLOAT8;
+    base_name TEXT;
+    pixel_m FLOAT8;
+    grid_m FLOAT8;
+    offset_m FLOAT8;
+    offset_x TEXT;
+    offset_y TEXT;
+    cell_x TEXT;
+    cell_y TEXT;
+    aggr_attributes TEXT;
+    attributes TEXT;
+    columns TEXT;
+    gtypes TEXT[];
+    schema_name TEXT;
+    table_name TEXT;
+    point_geom TEXT;
+  BEGIN
+    SELECT _CDB_GeometryTypes(reloid) INTO gtypes;
+    IF gtypes IS NULL OR array_upper(gtypes, 1) <> 1 OR gtypes[1] <> 'ST_Point' THEN
+      -- This strategy only supports datasets with point geometry
+      RETURN NULL;
+    END IF;
+
+    --TODO: check applicability: geometry type, minimum number of points...
+
+    overview_rel := _CDB_Overview_Name(reloid, ref_z, overview_z);
+
+    -- Grid size in pixels at Z level overview_z
+    IF grid_px IS NULL THEN
+      grid_px := 1.0;
+    END IF;
+
+    SELECT * FROM _cdb_split_table_name(reloid) INTO schema_name, table_name;
+
+    -- pixel_m: size of a pixel in webmercator units (meters)
+    SELECT CDB_XYZ_Resolution(overview_z) INTO pixel_m;
+    -- grid size in meters
+    grid_m = grid_px * pixel_m;
+
+    attributes := _CDB_Aggregable_Attributes_Expression(reloid);
+    aggr_attributes := _CDB_Aggregated_Attributes_Expression(reloid);
+    IF attributes <> '' THEN
+      attributes := ', ' || attributes;
+    END IF;
+    IF aggr_attributes <> '' THEN
+      aggr_attributes := aggr_attributes || ', ';
+    END IF;
+
+    -- Center of each cell:
+    cell_x := Format('gx*%1$s + %2$s', grid_m, grid_m/2);
+    cell_y := Format('gy*%1$s + %2$s', grid_m, grid_m/2);
+
+    -- Displacement to the nearest pixel center:
+    IF MOD(grid_px::numeric, 1.0::numeric) = 0 THEN
+      offset_m := pixel_m/2 - MOD((grid_m/2)::numeric, pixel_m::numeric)::float8;
+      offset_x := Format('%s', offset_m);
+      offset_y := Format('%s', offset_m);
+    ELSE
+      offset_x := Format('%2$s/2 - MOD((%1$s)::numeric, (%2$s)::numeric)::float8', cell_x, pixel_m);
+      offset_y := Format('%2$s/2 - MOD((%1$s)::numeric, (%2$s)::numeric)::float8', cell_y, pixel_m);
+    END IF;
+
+    point_geom := Format('ST_SetSRID(ST_MakePoint(%1$s + %3$s, %2$s + %4$s), 3857)', cell_x, cell_y, offset_x, offset_y);
+
+    -- compute the resulting columns in the same order as in the base table
+    WITH cols AS (
+      SELECT
+        CASE c
+        WHEN 'cartodb_id' THEN 'cartodb_id'
+        WHEN 'the_geom' THEN
+          Format('ST_Transform(%s, 4326) AS the_geom', point_geom)
+        WHEN 'the_geom_webmercator' THEN
+           Format('%s AS the_geom_webmercator', point_geom)
+        ELSE c
+        END AS column
+        FROM CDB_ColumnNames(reloid) c
+    )
+    SELECT string_agg(s.column, ',') FROM (
+      SELECT * FROM cols
+    ) AS s INTO columns;
+
+    IF NOT columns LIKE '%_feature_count%' THEN
+      columns := columns || ', n AS _feature_count';
+    END IF;
+
+    EXECUTE Format('DROP TABLE IF EXISTS %I.%I CASCADE;', schema_name, overview_rel);
+
+    -- Now we cluster the data using a grid of size grid_m
+    -- and select the centroid (average coordinates) of each cluster.
+    -- If we had a selected numeric attribute of interest we could use it
+    -- as a weight for the average coordinates.
+    EXECUTE Format('
+      CREATE TABLE %7$I.%3$I AS
+         WITH clusters AS (
+           SELECT
+             %5$s
+             count(*) AS n,
+             Floor(ST_X(f.the_geom_webmercator)/%2$s)::int AS gx,
+             Floor(ST_Y(f.the_geom_webmercator)/%2$s)::int AS gy,
+             MIN(cartodb_id) AS cartodb_id
+          FROM %1$s f
+          GROUP BY gx, gy
+         )
+         SELECT %6$s FROM clusters
+    ', reloid::text, grid_m, overview_rel, attributes, aggr_attributes, columns, schema_name);
+
+    RETURN Format('%I.%I', schema_name, overview_rel)::regclass;
+  END;
+$$ LANGUAGE PLPGSQL;
+
+-- This strategy places the aggregation of each cluster at the centroid of the cluster members.
+CREATE OR REPLACE FUNCTION _CDB_GridClusterCentroid_Reduce_Strategy(reloid REGCLASS, ref_z INTEGER, overview_z INTEGER, grid_px FLOAT8 DEFAULT NULL)
+RETURNS REGCLASS
+AS $$
+  DECLARE
+    overview_rel TEXT;
+    reduction FLOAT8;
+    base_name TEXT;
+    pixel_m FLOAT8;
+    grid_m FLOAT8;
+    offset_m FLOAT8;
+    offset_x TEXT;
+    offset_y TEXT;
+    cell_x TEXT;
+    cell_y TEXT;
+    aggr_attributes TEXT;
+    attributes TEXT;
+    columns TEXT;
+    gtypes TEXT[];
+    schema_name TEXT;
+    table_name TEXT;
+    point_geom TEXT;
+  BEGIN
+    SELECT _CDB_GeometryTypes(reloid) INTO gtypes;
+    IF gtypes IS NULL OR array_upper(gtypes, 1) <> 1 OR gtypes[1] <> 'ST_Point' THEN
+      -- This strategy only supports datasets with point geometry
+      RETURN NULL;
+    END IF;
+
+    --TODO: check applicability: geometry type, minimum number of points...
+
+    overview_rel := _CDB_Overview_Name(reloid, ref_z, overview_z);
+
+    -- Grid size in pixels at Z level overview_z
+    IF grid_px IS NULL THEN
+      grid_px := 1.0;
+    END IF;
+
+    SELECT * FROM _cdb_split_table_name(reloid) INTO schema_name, table_name;
+
+    -- pixel_m: size of a pixel in webmercator units (meters)
+    SELECT CDB_XYZ_Resolution(overview_z) INTO pixel_m;
+    -- grid size in meters
+    grid_m = grid_px * pixel_m;
+
+    attributes := _CDB_Aggregable_Attributes_Expression(reloid);
+    aggr_attributes := _CDB_Aggregated_Attributes_Expression(reloid);
+    IF attributes <> '' THEN
+      attributes := ', ' || attributes;
+    END IF;
+    IF aggr_attributes <> '' THEN
+      aggr_attributes := aggr_attributes || ', ';
+    END IF;
+
+    -- Center of each cell:
+    cell_x := Format('gx*%1$s + %2$s', grid_m, grid_m/2);
+    cell_y := Format('gy*%1$s + %2$s', grid_m, grid_m/2);
+
+    -- Displacement to the nearest pixel center:
+    IF MOD(grid_px::numeric, 1.0::numeric) = 0 THEN
+      offset_m := pixel_m/2 - MOD((grid_m/2)::numeric, pixel_m::numeric)::float8;
+      offset_x := Format('%s', offset_m);
+      offset_y := Format('%s', offset_m);
+    ELSE
+      offset_x := Format('%2$s/2 - MOD((%1$s)::numeric, (%2$s)::numeric)::float8', cell_x, pixel_m);
+      offset_y := Format('%2$s/2 - MOD((%1$s)::numeric, (%2$s)::numeric)::float8', cell_y, pixel_m);
+    END IF;
+
+    point_geom := Format('ST_SetSRID(ST_MakePoint(%1$s + %3$s, %2$s + %4$s), 3857)', cell_x, cell_y, offset_x, offset_y);
+
+    -- compute the resulting columns in the same order as in the base table
+    WITH cols AS (
+      SELECT
+        CASE c
+        WHEN 'cartodb_id' THEN 'cartodb_id'
+        WHEN 'the_geom' THEN
+          'ST_Transform(ST_SetSRID(ST_MakePoint(_sum_of_x/n, _sum_of_y/n), 3857), 4326) AS the_geom'
+        WHEN 'the_geom_webmercator' THEN
+          'ST_SetSRID(ST_MakePoint(_sum_of_x/n, _sum_of_y/n), 3857) AS the_geom_webmercator'
+        ELSE c
+        END AS column
+        FROM CDB_ColumnNames(reloid) c
+    )
+    SELECT string_agg(s.column, ',') FROM (
+      SELECT * FROM cols
+    ) AS s INTO columns;
+
+    IF NOT columns LIKE '%_feature_count%' THEN
+      columns := columns || ', n AS _feature_count';
+    END IF;
+
+    EXECUTE Format('DROP TABLE IF EXISTS %I.%I CASCADE;', schema_name, overview_rel);
+
+    -- Now we cluster the data using a grid of size grid_m
+    -- and select the centroid (average coordinates) of each cluster.
+    -- If we had a selected numeric attribute of interest we could use it
+    -- as a weight for the average coordinates.
+    EXECUTE Format('
+      CREATE TABLE %7$I.%3$I AS
+         WITH clusters AS (
+           SELECT
+             %5$s
+             count(*) AS n,
+             SUM(ST_X(f.the_geom_webmercator)) AS _sum_of_x,
+             SUM(ST_Y(f.the_geom_webmercator)) AS _sum_of_y,
+             Floor(ST_Y(f.the_geom_webmercator)/%2$s)::int AS gy,
+             Floor(ST_X(f.the_geom_webmercator)/%2$s)::int AS gx,
+             MIN(cartodb_id) AS cartodb_id
+          FROM %1$s f
+          GROUP BY gx, gy
+         )
+         SELECT %6$s FROM clusters
+    ', reloid::text, grid_m, overview_rel, attributes, aggr_attributes, columns, schema_name);
+
+    RETURN Format('%I.%I', schema_name, overview_rel)::regclass;
+  END;
+$$ LANGUAGE PLPGSQL;
+
+-- This strategy places the aggregation of each cluster at the position of one of the cluster members.
+CREATE OR REPLACE FUNCTION _CDB_GridClusterSample_Reduce_Strategy(reloid REGCLASS, ref_z INTEGER, overview_z INTEGER, grid_px FLOAT8 DEFAULT NULL)
+RETURNS REGCLASS
+AS $$
+  DECLARE
+    overview_rel TEXT;
+    reduction FLOAT8;
+    base_name TEXT;
+    pixel_m FLOAT8;
+    grid_m FLOAT8;
+    offset_m FLOAT8;
+    offset_x TEXT;
+    offset_y TEXT;
+    cell_x TEXT;
+    cell_y TEXT;
+    aggr_attributes TEXT;
+    attributes TEXT;
+    columns TEXT;
+    gtypes TEXT[];
+    schema_name TEXT;
+    table_name TEXT;
+    point_geom TEXT;
+  BEGIN
+    SELECT _CDB_GeometryTypes(reloid) INTO gtypes;
+    IF gtypes IS NULL OR array_upper(gtypes, 1) <> 1 OR gtypes[1] <> 'ST_Point' THEN
+      -- This strategy only supports datasets with point geometry
+      RETURN NULL;
+    END IF;
+
+    --TODO: check applicability: geometry type, minimum number of points...
+
+    overview_rel := _CDB_Overview_Name(reloid, ref_z, overview_z);
+
+    -- Grid size in pixels at Z level overview_z
+    IF grid_px IS NULL THEN
+      grid_px := 1.0;
+    END IF;
+
+    SELECT * FROM _cdb_split_table_name(reloid) INTO schema_name, table_name;
+
+    -- pixel_m: size of a pixel in webmercator units (meters)
+    SELECT CDB_XYZ_Resolution(overview_z) INTO pixel_m;
+    -- grid size in meters
+    grid_m = grid_px * pixel_m;
+
+    attributes := _CDB_Aggregable_Attributes_Expression(reloid);
+    aggr_attributes := _CDB_Aggregated_Attributes_Expression(reloid);
+    IF attributes <> '' THEN
+      attributes := ', ' || attributes;
+    END IF;
+    IF aggr_attributes <> '' THEN
+      aggr_attributes := aggr_attributes || ', ';
+    END IF;
+
+    -- Center of each cell:
+    cell_x := Format('gx*%1$s + %2$s', grid_m, grid_m/2);
+    cell_y := Format('gy*%1$s + %2$s', grid_m, grid_m/2);
+
+    -- Displacement to the nearest pixel center:
+    IF MOD(grid_px::numeric, 1.0::numeric) = 0 THEN
+      offset_m := pixel_m/2 - MOD((grid_m/2)::numeric, pixel_m::numeric)::float8;
+      offset_x := Format('%s', offset_m);
+      offset_y := Format('%s', offset_m);
+    ELSE
+      offset_x := Format('%2$s/2 - MOD((%1$s)::numeric, (%2$s)::numeric)::float8', cell_x, pixel_m);
+      offset_y := Format('%2$s/2 - MOD((%1$s)::numeric, (%2$s)::numeric)::float8', cell_y, pixel_m);
+    END IF;
+
+    point_geom := Format('ST_SetSRID(ST_MakePoint(%1$s + %3$s, %2$s + %4$s), 3857)', cell_x, cell_y, offset_x, offset_y);
+
+    -- compute the resulting columns in the same order as in the base table
+    WITH cols AS (
+      SELECT
+        CASE c
+        WHEN 'cartodb_id' THEN 'cartodb_id'
+        ELSE c
+        END AS column
+        FROM CDB_ColumnNames(reloid) c
+    )
+    SELECT string_agg(s.column, ',') FROM (
+      SELECT * FROM cols
+    ) AS s INTO columns;
+
+    IF NOT columns LIKE '%_feature_count%' THEN
+      columns := columns || ', n AS _feature_count';
+    END IF;
+
+    EXECUTE Format('DROP TABLE IF EXISTS %I.%I CASCADE;', schema_name, overview_rel);
+
+    -- Now we cluster the data using a grid of size grid_m
+    -- and keep, for each cluster, the geometry of the member with the
+    -- lowest cartodb_id (joined back from the base table), rather than
+    -- an average of the member coordinates.
+    EXECUTE Format('
+      CREATE TABLE %7$I.%3$I AS
+         WITH clusters AS (
+           SELECT
+             %5$s
+             count(*) AS n,
+             Floor(ST_X(_f.the_geom_webmercator)/%2$s)::int AS gx,
+             Floor(ST_Y(_f.the_geom_webmercator)/%2$s)::int AS gy,
+             MIN(cartodb_id) AS cartodb_id
+          FROM %1$s _f
+          GROUP BY gx, gy
+         ),
+         cluster_geom AS (
+           SELECT the_geom, the_geom_webmercator, clusters.*
+             FROM clusters INNER JOIN %1$s _g ON (clusters.cartodb_id = _g.cartodb_id)
+         )
+         SELECT %6$s FROM cluster_geom
+    ', reloid::text, grid_m, overview_rel, attributes, aggr_attributes, columns, schema_name);
+
+    RETURN Format('%I.%I', schema_name, overview_rel)::regclass;
+  END;
+$$ LANGUAGE PLPGSQL;
+
 -- Create overview tables for a dataset.
 -- Scope: public
 -- Parameters:
--- a/scripts-available/CDB_Stats.sql
+++ b/scripts-available/CDB_Stats.sql
@@ -4,7 +4,7 @@
 -- @param in_array A numeric array of numbers
 --
 -- Returns: statistical quantity chosen
-- 
+--
 -- References: http://www.itl.nist.gov/div898/handbook/eda/section3/eda35b.htm
 --

@@ -13,17 +13,21 @@ CREATE OR REPLACE FUNCTION CDB_Kurtosis ( in_array NUMERIC[] ) RETURNS NUMERIC a
 DECLARE
    a numeric;
    c numeric;
-    s numeric;
    k numeric;
 BEGIN
-    SELECT AVG(e), COUNT(e)::numeric, stddev(e) INTO a, c, s FROM ( SELECT unnest(in_array) e ) x;
+    SELECT AVG(e), COUNT(e)::numeric * power(stddev(e),4) INTO a, c FROM ( SELECT unnest(in_array) e ) x;

-    EXECUTE 'SELECT sum(power($1 - e, 4)) / ( $2 * power($3, 4)) - 3
-             FROM (SELECT unnest($4) e ) x'
-    INTO k
-    USING a, c, s, in_array;
+    IF c=0 THEN
+      RETURN 0;
+    ELSE

-    RETURN k;
+      EXECUTE 'SELECT sum(power($1 - e, 4)) / ($2 ) - 3
+             FROM (SELECT unnest($3) e ) x'
+      INTO k
+      USING a, c, in_array;
+
+      RETURN k;
+    END IF;
 END;
 $$ language plpgsql IMMUTABLE;

@@ -32,16 +36,18 @@ CREATE OR REPLACE FUNCTION CDB_Skewness ( in_array NUMERIC[] ) RETURNS NUMERIC a
 DECLARE
    a numeric;
    c numeric;
-    s numeric;
    sk numeric;
 BEGIN
-    SELECT AVG(e), COUNT(e)::numeric, stddev(e) INTO a, c, s FROM ( SELECT unnest(in_array) e ) x;
+    SELECT AVG(e), COUNT(e)::numeric * power(stddev(e),3) INTO a, c FROM ( SELECT unnest(in_array) e ) x;
+    IF c=0 THEN
+      RETURN 0;
+    ELSE
+      EXECUTE 'SELECT sum(power($1 - e, 3)) / ( $2 )
+             FROM (SELECT unnest($3) e ) x'
+      INTO sk
+      USING a, c, in_array;

-    EXECUTE 'SELECT sum(power($1 - e, 3)) / ( $2 * power($3, 3))
-             FROM (SELECT unnest($4) e ) x'
-    INTO sk
-    USING a, c, s, in_array;
-
-    RETURN sk;
+      RETURN sk;
+    END IF;
 END;
 $$ language plpgsql IMMUTABLE;
--- a/scripts-enabled/280-CDB_EstimateRowCount.sql
+++ b/scripts-enabled/280-CDB_EstimateRowCount.sql
@@ -0,0 +1 @@
+../scripts-available/CDB_EstimateRowCount.sql
--- a/test/CDB_DigitSeparatorTest_expect
+++ b/test/CDB_DigitSeparatorTest_expect
@@ -1,5 +1,6 @@
 BEGIN
 CREATE TABLE
+COPY 3
 none||
 only_com_dec|.|,
 only_dot_dec|,|.
--- a/test/CDB_EqualIntervalBinsTest.sql
+++ b/test/CDB_EqualIntervalBinsTest.sql
@@ -2,4 +2,10 @@ WITH data AS (
    SELECT array_agg(x::numeric) s FROM generate_series(1,300) x 
        WHERE x % 5 != 0 AND x % 7 != 0
    ) 
-SELECT round(unnest(CDB_EqualIntervalBins(s, 7)),7) FROM data
+SELECT round(unnest(CDB_EqualIntervalBins(s, 7)),7) FROM data;
+
+WITH data_nulls AS (
+    SELECT array_agg(CASE WHEN x % 2 != 0 THEN x ELSE NULL END::numeric) s FROM generate_series(1,100) x
+        WHERE x % 5 != 0 AND x % 7 != 0
+    )
+SELECT round(unnest(CDB_EqualIntervalBins(s, 7)),7) FROM data_nulls;
--- a/test/CDB_EqualIntervalBinsTest_expect
+++ b/test/CDB_EqualIntervalBinsTest_expect
@@ -5,3 +5,10 @@
 213.8571429
 256.4285714
 299.0000000
+15.0000000
+29.0000000
+43.0000000
+57.0000000
+71.0000000
+85.0000000
+99.0000000
--- a/test/CDB_EstimateRowCountTest.sql
+++ b/test/CDB_EstimateRowCountTest.sql
@@ -0,0 +1,10 @@
+SET client_min_messages TO error;
+\set VERBOSITY terse
+CREATE TABLE tmptab1(id INT);
+INSERT INTO tmptab1(id) VALUES (1), (2), (3);
+CREATE TABLE tmptab2(id INT, value NUMERIC);
+INSERT INTO tmptab2(id, value) VALUES (1, 10.0), (2, 20.0);
+SELECT CDB_EstimateRowCount('SELECT SUM(value) FROM tmptab1 INNER JOIN tmptab2 ON (tmptab1.id = tmptab2.id);') AS row_count;
+SELECT CDB_EstimateRowCount('UPDATE tmptab2 SET value = 30 WHERE id=2;') AS row_count;
+DROP TABLE tmptab2;
+DROP TABLE tmptab1;
--- a/test/CDB_EstimateRowCountTest_expect
+++ b/test/CDB_EstimateRowCountTest_expect
@@ -0,0 +1,9 @@
+SET
+CREATE TABLE
+INSERT 0 3
+CREATE TABLE
+INSERT 0 2
+1
+1
+DROP TABLE
+DROP TABLE
--- a/test/CDB_HeadsTailsBinsTest.sql
+++ b/test/CDB_HeadsTailsBinsTest.sql
@@ -1,5 +1,11 @@
 WITH data AS (
-    SELECT array_agg(x) x FROM generate_series(1,100) x 
+    SELECT array_agg(x::numeric) s FROM generate_series(1,100) x
        WHERE x % 5 != 0 AND x % 7 != 0
-    ) 
-SELECT round(unnest(CDB_HeadsTailsBins(x, 7)),2) FROM data
+    )
+SELECT round(unnest(CDB_HeadsTailsBins(s, 7)),2) FROM data;
+
+WITH data_nulls AS (
+    SELECT array_agg(CASE WHEN x % 2 != 0 THEN x ELSE NULL END::numeric) s FROM generate_series(1,100) x
+        WHERE x % 5 != 0 AND x % 7 != 0
+    )
+SELECT round(unnest(CDB_HeadsTailsBins(s, 7)),2) FROM data_nulls;
--- a/test/CDB_HeadsTailsBinsTest_expect
+++ b/test/CDB_HeadsTailsBinsTest_expect
@@ -5,3 +5,9 @@
 96.50
 98.00
 99.00
+49.76
+74.65
+88.50
+94.50
+98.00
+99.00
--- a/test/CDB_JenksBinsTest.sql
+++ b/test/CDB_JenksBinsTest.sql
@@ -1,5 +1,11 @@
 WITH data AS (
-    SELECT array_agg(x) x FROM generate_series(1,100) x 
+    SELECT array_agg(x::numeric) s FROM generate_series(1,300) x
        WHERE x % 5 != 0 AND x % 7 != 0
    ) 
-SELECT unnest(CDB_JenksBins(x, 7)) FROM data
+SELECT unnest(CDB_JenksBins(s, 7)) FROM data;
+
+WITH data_nulls AS (
+    SELECT array_agg(CASE WHEN x % 2 != 0 THEN x ELSE NULL END::numeric) s FROM generate_series(1,300) x
+        WHERE x % 5 != 0 AND x % 7 != 0
+    )
+SELECT unnest(CDB_JenksBins(s, 7)) FROM data_nulls;
--- a/test/CDB_JenksBinsTest_expect
+++ b/test/CDB_JenksBinsTest_expect
@@ -1,7 +1,14 @@
-13
-29
 43
-57
-71
-83
-99
+86
+129
+172
+213
+257
+299
+37
+51
+97
+157
+213
+241
+
--- a/test/CDB_QuantileBinsTest.sql
+++ b/test/CDB_QuantileBinsTest.sql
@@ -1,5 +1,11 @@
 WITH data AS (
-    SELECT array_agg(x) x FROM generate_series(1,100) x 
+    SELECT array_agg(x::numeric) s FROM generate_series(1,100) x
        WHERE x % 5 != 0 AND x % 7 != 0
    ) 
-SELECT unnest(CDB_QuantileBins(x, 7)) FROM data
+SELECT unnest(CDB_QuantileBins(s, 7)) FROM data;
+
+WITH data_nulls AS (
+    SELECT array_agg(CASE WHEN x % 2 != 0 THEN x ELSE NULL END::numeric) s FROM generate_series(1,100) x
+        WHERE x % 5 != 0 AND x % 7 != 0
+    )
+SELECT unnest(CDB_QuantileBins(s, 7)) FROM data_nulls;
--- a/test/CDB_QuantileBinsTest_expect
+++ b/test/CDB_QuantileBinsTest_expect
@@ -4,4 +4,11 @@
 57
 71
 86
+99
+29
+57
+87
+
+
+
 99
--- a/test/CDB_QueryStatementsTest.sql
+++ b/test/CDB_QueryStatementsTest.sql
@@ -1,3 +1,6 @@
+SET client_min_messages TO error;
+\set VERBOSITY terse
+
 WITH q AS ( SELECT CDB_QueryStatements('
 SELECT * FROM geometry_columns;
 ') as statement )
--- a/test/CDB_QueryStatementsTest_expect
+++ b/test/CDB_QueryStatementsTest_expect
@@ -1,3 +1,4 @@
+SET
 1|1|SELECT * FROM geometry_columns
 2|1|SELECT * FROM geometry_columns
 3|1|SELECT * FROM geometry_columns
--- a/test/CDB_QueryTablesTest.sql
+++ b/test/CDB_QueryTablesTest.sql
@@ -1,3 +1,5 @@
+SET client_min_messages TO warning;
+\set VERBOSITY terse

 WITH inp AS ( select 'SELECT * FROM geometry_columns'::text as q )
 SELECT q, CDB_QueryTables(q) from inp;
--- a/test/CDB_QueryTablesTest_expect
+++ b/test/CDB_QueryTablesTest_expect
@@ -1,17 +1,14 @@
+SET
 SELECT * FROM geometry_columns|{pg_catalog.pg_attribute,pg_catalog.pg_class,pg_catalog.pg_constraint,pg_catalog.pg_namespace,pg_catalog.pg_type}
 SELECT a.attname FROM pg_class c JOIN pg_attribute a on (a.attrelid = c.oid)|{pg_catalog.pg_attribute,pg_catalog.pg_class}
 CREATE table "my'tab;le" as select 1|{}
 SELECT a.oid, b.oid FROM pg_class a, pg_class b|{pg_catalog.pg_class}
 SELECT 1 as col1; select 2 as col2|{}
 WARNING:  CDB_QueryTables cannot explain query: select 1 from nonexistant (42P01: relation "nonexistant" does not exist)
-CONTEXT:  PL/pgSQL function cdb_querytables(text) line 3 at RETURN
 ERROR:  relation "nonexistant" does not exist
-CONTEXT:  PL/pgSQL function cdb_querytables(text) line 3 at RETURN
 begin; select * from pg_class; commit;|{pg_catalog.pg_class}
 WARNING:  CDB_QueryTables cannot explain query: select * from test (42P01: relation "test" does not exist)
-CONTEXT:  PL/pgSQL function cdb_querytables(text) line 3 at RETURN
 ERROR:  relation "test" does not exist
-CONTEXT:  PL/pgSQL function cdb_querytables(text) line 3 at RETURN
 WITH a AS (select * from pg_class) select * from a|{pg_catalog.pg_class}
 CREATE SCHEMA
 CREATE TABLE
--- a/test/extension/test.sh
+++ b/test/extension/test.sh
@@ -228,6 +228,7 @@ function tear_down() {
    sql 'DROP ROLE cdb_testmember_2;'

    tear_down_database
+    DATABASE=postgres sql postgres 'DROP ROLE IF EXISTS publicuser';
 }


@@ -486,6 +487,18 @@ function test_foreign_tables() {
    ${CMD} -d fdw_target -f scripts-available/CDB_QueryTables.sql
    ${CMD} -d fdw_target -f scripts-available/CDB_TableMetadata.sql

+    DATABASE=fdw_target sql postgres "DO
+\$\$
+BEGIN
+   IF NOT EXISTS (
+      SELECT *
+      FROM   pg_catalog.pg_user
+      WHERE  usename = 'publicuser') THEN
+
+      CREATE ROLE publicuser LOGIN;
+   END IF;
+END
+\$\$;"
    DATABASE=fdw_target sql postgres 'CREATE SCHEMA test_fdw;'
    DATABASE=fdw_target sql postgres 'CREATE TABLE test_fdw.foo (a int);'
    DATABASE=fdw_target sql postgres 'INSERT INTO test_fdw.foo (a) values (42);'
Author	SHA1	Message	Date
Javier Goizueta	2d473cf693	New version 0.19.0	2017-04-11 11:22:20 +02:00
Javier Goizueta	4193ff3874	Merge pull request #298 from CartoDB/295-estimate-row-count Add CDB_EstimateRowCount function	2017-04-11 11:01:31 +02:00
Javier Goizueta	68a0752849	Use PG 9.5 for travis tests; fix tests	2017-04-10 15:58:49 +02:00
Javier Goizueta	815b5b429d	Fix tests	2017-04-10 13:50:37 +02:00
Javier Goizueta	76bdb3657a	Fix tests	2017-04-10 12:17:47 +02:00
Javier Goizueta	234373df11	Replace unnecessary count	2017-04-10 08:08:59 +02:00
Javier Goizueta	a486eed2e3	Add CDB_EstimateRowCount function See #295	2017-04-07 15:35:48 +02:00
Mario de Frutos	795d92da8d	Added CLA paragraph	2017-01-25 10:54:12 +01:00
Javier Goizueta	58e2e7e238	Release 0.18.5	2016-11-30 17:17:45 +01:00
Javier Goizueta	25d27263cb	Merge pull request #249 from CartoDB/nullbins Test behavior of binning fuctions with nulls	2016-11-30 16:09:23 +01:00
Javier Goizueta	bbadcc838e	Merge pull request #244 from CartoDB/equalbins Convert CDB_EqualIntervalBins to a single SQL statement and add float version	2016-11-30 16:09:05 +01:00
Javier Goizueta	b1a0904c07	Merge pull request #181 from CartoDB/update_to_cdb_stats Fix for division by zero error on empty or homogenous array	2016-11-30 16:08:40 +01:00
Javier Goizueta	399b680b41	Merge pull request #283 from CartoDB/157-test-fixes Fix tests: race condition with publicuser #157	2016-11-30 16:08:21 +01:00
Javier Goizueta	7c0636c5f9	Merge pull request #290 from CartoDB/286-overview-strategies Add point overview strategies	2016-11-30 11:46:29 +01:00
Javier Goizueta	f58f870457	Remove use of first aggregator in sample-cluster overviews strategy This is not more efficient but the geometry now corresponds to the cartodb_id and the dependency with custom aggregators (firt) is removed.	2016-11-29 14:08:08 +01:00
Javier Goizueta	a7c8dc04e3	Release 0.18.4 This just fixes the lack of migration path from 0.18.2	2016-11-04 16:25:03 +01:00
Javier Goizueta	90ee56eb35	Merge pull request #288 from CartoDB/fix-migration Fix migration script generation	2016-11-04 16:22:56 +01:00
Javier Goizueta	1032737600	Fix migration script generation In the 0.18.3 release the script to migrate from 0.18.2 was missing. This will generate a new version 0.18.4 that when install will generate scripts to migrate from all old versions to it, so it will be possible to migrate existing users to 0.18.4 (but not to 0.18.3)	2016-11-04 16:19:06 +01:00
Javier Goizueta	5992304b47	Add a couple of overview clustering strategies	2016-11-03 13:31:04 +01:00
Rafa de la Torre	30cd4cf1f9	Fix tests: race condition with publicuser #157	2016-10-17 16:31:10 +02:00
Paul Norman	3122a0479d	Test behavior of binning fuctions with nulls All test results are based off of existing behavior, which doesn't always make sense (ref #247)	2016-04-28 09:59:33 -07:00
Paul Norman	956e56cd37	Use anyarray for equalintervalbins	2016-04-27 16:10:01 -07:00
Paul Norman	b19a5fc3dc	Convert CDB_EqualIntervalBins to a single SQL statement and add float version	2016-04-25 14:35:26 -07:00
Stuart Lynn	0ecbbd8e71	Make sure that empty arrays or arrays with all the same entry return 0 for Skewness and Kurtosis rather than throwing a division by zero error	2015-12-04 14:54:15 -05:00
				`@@ -0,0 +1 @@`
				`../scripts-available/CDB_EstimateRowCount.sql`