Aggregate small number of text items in overviews

Instead of nulling text fields for non-singleton aggregated records concatenate distinct text values when they're few (5 or less). Fixes #231
2016-04-15 12:37:16 +02:00
parent 20989e2f28
commit 45f063d469
2 changed files with 29 additions and 12 deletions
--- a/scripts-available/CDB_Overviews.sql
+++ b/scripts-available/CDB_Overviews.sql
@@ -515,6 +515,23 @@ BEGIN
 END
 $$ LANGUAGE PLPGSQL STABLE;

+-- Check if a column of a table is of an unlimited-length text type
+CREATE OR REPLACE FUNCTION _cdb_unlimited_text_column(reloid REGCLASS, col_name TEXT)
+RETURNS BOOLEAN
+AS $$
+  SELECT EXISTS (
+    SELECT  *
+      FROM information_schema.columns c, pg_class _tn, pg_namespace _sn
+      WHERE table_name = _tn.relname
+        AND table_schema = _sn.nspname
+        AND c.column_name = col_name
+        AND _tn.oid = reloid
+        AND _sn.oid = _tn.relnamespace
+        AND character_maximum_length IS NULL
+        AND c.data_type IN ('text', 'character varying', 'character')
+  );
+$$ LANGUAGE SQL STABLE;
+
 -- SQL Aggregation expression for a datase attribute
 -- Scope: private.
 -- Parameters
@@ -532,6 +549,7 @@ DECLARE
  has_counter_column BOOLEAN;
  feature_count TEXT;
  total_feature_count TEXT;
+  unlimited_text BOOLEAN;
 BEGIN
  IF table_alias <> '' THEN
    qualified_column := Format('%I.%I', table_alias, column_name);
@@ -559,16 +577,15 @@ BEGIN
    ELSE
      RETURN Format('SUM(%s*%s)/%s::' || column_type, qualified_column, feature_count, total_feature_count);
    END IF;
-  WHEN 'text' THEN
-    -- TODO: we could define a new aggregate function that returns distinct
-    -- separated values with a limit, adding ellipsis if more values existed
-    -- e.g. with '/' as separator and a limit of three:
-    --     'A', 'B', 'A', 'C', 'D' => 'A/B/C/...'
-    -- Other ideas: if value is unique then use it, otherwise use something
-    -- like '*' or '(varies)' or '(multiple values)', or NULL
-    -- Using 'string_agg(' || qualified_column || ',''/'')'
-    -- here causes
-    RETURN 'CASE count(*) WHEN 1 THEN MIN(' || qualified_column || ') ELSE NULL END::' || column_type;
+  WHEN 'text', 'character varying', 'character' THEN
+    SELECT _cdb_unlimited_text_column(reloid, column_name) INTO unlimited_text;
+    IF unlimited_text THEN
+      -- TODO: this should not be applied to columns containing largish text;
+      -- it is intended only to short names/identifiers
+      RETURN  'CASE WHEN count(*) = 1 THEN MIN(' || qualified_column || ') WHEN ' || total_feature_count || ' < 5 THEN string_agg(distinct ' || qualified_column || ','' / '') ELSE ''*'' END::' || column_type;
+    ELSE
+      RETURN 'CASE count(*) WHEN 1 THEN MIN(' || qualified_column || ') ELSE NULL END::' || column_type;
+    END IF;
  WHEN 'boolean' THEN
    RETURN 'CASE count(*) WHEN 1 THEN BOOL_AND(' || qualified_column || ') ELSE NULL END::' || column_type;
  ELSE
--- a/test/CDB_OverviewsTest_expect
+++ b/test/CDB_OverviewsTest_expect
@@ -12,8 +12,8 @@ SELECT 1114
 {_vovw_2_base_bare_t,_vovw_1_base_bare_t,_vovw_0_base_bare_t}
 126
 number,int_number,name,start
-SUM(number*1)/count(*)::double precision AS number,SUM(int_number*1)/count(*)::integer AS int_number,CASE count(*) WHEN 1 THEN MIN(name) ELSE NULL END::text AS name,CASE count(*) WHEN 1 THEN MIN(start) ELSE NULL END::date AS start
-SUM(tab.number*1)/count(*)::double precision AS number,SUM(tab.int_number*1)/count(*)::integer AS int_number,CASE count(*) WHEN 1 THEN MIN(tab.name) ELSE NULL END::text AS name,CASE count(*) WHEN 1 THEN MIN(tab.start) ELSE NULL END::date AS start
+SUM(number*1)/count(*)::double precision AS number,SUM(int_number*1)/count(*)::integer AS int_number,CASE WHEN count(*) = 1 THEN MIN(name) WHEN count(*) < 5 THEN string_agg(distinct name,' / ') ELSE '*' END::text AS name,CASE count(*) WHEN 1 THEN MIN(start) ELSE NULL END::date AS start
+SUM(tab.number*1)/count(*)::double precision AS number,SUM(tab.int_number*1)/count(*)::integer AS int_number,CASE WHEN count(*) = 1 THEN MIN(tab.name) WHEN count(*) < 5 THEN string_agg(distinct tab.name,' / ') ELSE '*' END::text AS name,CASE count(*) WHEN 1 THEN MIN(tab.start) ELSE NULL END::date AS start
 {_vovw_2_base_t,_vovw_1_base_t,_vovw_0_base_t}
 126