LISTING 2: This script, skewDscv.sql, determines whether column data is evenly distributed or skewed, according to a skew tolerance that you set.
/* 
Filename : skewDscv.sql		skewed data distribution discoverer
Author   : Julius J.Y. Lin		01/30/98
Purpose  : find out if data distribution in tables are skewed or not after  histAnalyzeProbe.sql 
produces the result in dba_histograms.  Run this script to plug the information into skew_discovery. 
Eventually the bucket size will be put in    CONDI_ANALYZE_CTRL. HSTGRM_BUCKETS
Environment : Oracle V.7.3
Note !!  : dba_histograms.endpoint is the bucket_number in most cases.
	* if a group of continual buckets have the same endpoint_value, only the 
	   last one in the group is put in the dba_histograms table.
	* you may have 5 groups in dba_histograms each represent 20% of data in 
	  an even distribution case, so you can't assume that if there is repeating
	 endpoint_value, then it is skewed. In other words, if the return value of 
     	  "select count(*) 
	  from dba_histograms 
	  where owner=x and table_name=y and column_name =z " 
  	  is less than 
     	  "select num_buckets 
	  from dba_tab_columns 
	  where owner=x and table_name=y and column_name =z "
         you can not jump to the conclusion that the data distribution is skewed.
         But if they are equal, then the data is highly evenly distributed.
	* run this script under user system
*/
-- may change to update, instead of truncate to keep the result of previous analyze and skew discovery
truncate table skew_discovery_detail reuse storage ;
truncate table skew_discovery           reuse storage ;

-- use dba_histograms.endpoint_value of bucket N minus the value in bucket N-1
-- to get Bucket-Density ( implied by 1/x_len, where x_len = v_x_end - v_x_begin) into skew_discovery_detail
Declare
	v_bucket_size	number := 75 ;	-- consult size in analyze statement in 
-- skewDscvInst.sql 
	-- the less the threshold the more restrictive the skew definition 
	v_threshold   	number := 0.2 ; -- of ratio std_dev/mean/num_buckets
	v_x_end		number ;
	v_x_begin 	number ;

	/* to demo the relative degree of skew, select all records from dba_histograms.
	    You can parameterize v_bucket_size, v_threshold, and where table_name is a scope.
	    Change where statement for different table scope or parameterize this condition 
	*/
cursor xlen_cs is
		select	owner, table_name, column_name, 
			endpoint_number, endpoint_value
		from 	dba_histograms 
		where   table_name in ('BUSINESS_UNITS', 'JOB_CONTROL' )
		order by 1,2,3,4
		;	
/*
-- next variation of cursor can be used in a real run to speed the process time because
-- skew_discovery_detail will keep information of possible skewed columns only.
-- if c.num_buckets = v_bucket_size then it is evenly distributed under current detail level of 
-- bucket_size specified in analyze command.
cursor xlen_cs is
		select	h.owner, h.table_name, h.column_name, 
			h.endpoint_number, h.endpoint_value
		from 	dba_histograms h, dba_tab_columns c
		where     h.owner       = c.owner
		and	h.table_name  = c.table_name 
		and	h.column_name = c.column_name 
		and	c.num_buckets < v_bucket_size
		and	h.table_name in ('BUSINESS_UNITS', 'JOB_CONTROL' )
		order by 1,2,3,4
		;
*/
	v_prev_owner 	dba_histograms.owner%TYPE  ;	
	v_prev_table	dba_histograms.table_name%TYPE ;
	v_prev_col	dba_histograms.column_name%TYPE ;
	v_prev_endp 	dba_histograms.endpoint_number%TYPE  := 0;
	v_prev_endv	dba_histograms.endpoint_value%TYPE  := 0;
BEGIN
  FOR xlen_rec IN xlen_cs LOOP
	-- if any one of the following 3 values changes then the mean column 
	-- changed. If so, reset v_prev_endv to 0
      IF xlen_rec.column_name  != v_prev_col   OR 
	 xlen_rec.table_name  != v_prev_table OR
	 xlen_rec.owner          != v_prev_owner THEN
	      v_prev_endv  := 0 ;
	      v_prev_endp  := 0 ;
      END IF ;
/*
Start to reconstruct missing records in dba_histograms because the statistic 
function needs data to explicitly exist.
If column NUM_BUCKETS in table DBA_TAB_COLUMNS is greater than 1, and 
DBA_HISTOGRAMS.ENDPOINT_NUMBER is less than or equal to the bucket size you 
specified in the analyze command, and it does not have a contiguous number, then there are 
missing buckets that need to be reconstructed. 
*/
      -- normally we will use endpoint_value to calculate Bucket-Density
      v_x_begin := v_prev_endv ; 
      v_x_end    := xlen_rec.endpoint_value ; 

      IF xlen_rec.endpoint_number < v_bucket_size  THEN
	--reconstruct missing buckets in a while loop
          WHILE ( xlen_rec.endpoint_number > v_prev_endp  ) LOOP
	    insert into skew_discovery_detail ( 
		   owner, table_name,column_name, bucket_number, Bucket_Density ) 
	    values ( xlen_rec.owner, xlen_rec.table_name,
		    xlen_rec.column_name, v_prev_endp, 
		    v_x_end - v_x_begin
	    ) ; 
                  v_prev_endp := v_prev_endp + 1 ;
 	    v_x_begin     := xlen_rec.endpoint_value ; 
          END LOOP ;
      ELSIF xlen_rec.endpoint_number > v_bucket_size THEN
/* 
If DBA_HISTOGRAMS.ENDPOINT_NUMBER is greater than the bucket size specified in the 
analyze command, then transpose them using endpoint_number as endpoint_value to calculate 
Bucket-Density without making up missing bucket.
The pair of (endpoint_value, endpoint_number ) is equal to the output of the next SQL statement :
select &column_name, count(*) 
from  &table_name
group by &column_name 
;
where "&" indicates a variable, so the reader can replace it with an actual value.
In this case, each group of "&column_name" represents a bucket and the Bucket-Densitys need 
to be calculated from the difference of count(*) value
*/
	v_x_begin := v_prev_endp ;
	v_x_end   := xlen_rec.endpoint_number ;
      END IF ;
	-- under normal situations, bucket comes from the difference of two end points
      insert into skew_discovery_detail ( 
	     owner, table_name, column_name, bucket_number, Bucket_Density ) 
      values ( xlen_rec.owner, xlen_rec.table_name,
	       xlen_rec.column_name, xlen_rec.endpoint_number, 
	       v_x_end - v_x_begin
      ) ;
      v_prev_owner  := xlen_rec.owner ;
      v_prev_table    := xlen_rec.table_name ;
      v_prev_col      := xlen_rec.column_name ;
      v_prev_endp   := xlen_rec.endpoint_number + 1;
      v_prev_endv   := xlen_rec.endpoint_value ;
  END LOOP;	-- FOR xlen_rec

/* 
   Now get the standard deviation and use the average as the mean, though statistically
   mean is different to average. Oracle's built-in functions do not provide a mean
   calculation, but for the skew test, average is close enough.
   skew_discovery.buckets represents real buckets after analyze
*/
  -- one record for every column in skew_discovery  header table
  INSERT INTO skew_discovery (
	 owner, table_name, column_name, std_dev, mean, table_buckets )
  SELECT owner, table_name, column_name, stddev(Bucket_Density),
 	 avg( Bucket_Density ), count(*)
  FROM	 skew_discovery_detail  
  GROUP BY owner, table_name, column_name
  ; 
  update skew_discovery s
  set    s.column_buckets = (
		select   c.num_buckets
		from    dba_tab_columns c
  		where  s.owner	        = c.owner
  		and      s.table_name     = c.table_name
  		and      s.column_name = c.column_name
	 )
  ;
/* 
why s.column_buckets is increased  by 1 when d.bucket_number > v_bucket_size:
The DBA_TAB_COLUMN.NUM_BUCKETS does not correctly represent the actual 
buckets in this case, so skew_discovery.column_buckets = dba_table_column.num_bucket + 1, 
because each record is a bucket.  It is not the normal case that each bucket equals the current 
endpoint_value - previous endpoint_value.
*/
update skew_discovery s
 set       s.column_buckets = s.column_buckets + 1  
 where 	exists ( 
		select   1
		from     skew_discovery_detail d
  		where    s.owner	 = d.owner
  		and      s.table_name    = d.table_name
  		and      s.column_name   = d.column_name
		and      d.bucket_number > v_bucket_size
	 )
  ;
  UPDATE	skew_discovery
  SET		distribution = 'SKEW'
  WHERE	std_dev / decode( sign(mean - 1),  -1, 1, mean)/ column_buckets > v_threshold
   AND		column_buckets > 1
   ;
/* next SQL statement can be run independently, once you are satisfied with the results
   you received. The reason it is not run here is because once it is put into condi_analyze_ctrl 
    it will affect production.
*/
   dbms_output.put_line('Please run the next update statement in this file');
   dbms_output.put_line('to update condi_analyze_ctrl table, if test OK');
/*
-- max(s.buckets) always <= tc.num_buckets, if they are equal,
-- mean it reach the max. resolution
	
   UPDATE condi_analyze_ctrl c
   SET	  c.hstgrm_buckets = (
select  distinct bucket_size
 		from   skew_discovery s
		where  c.owner 	       = s.owner
		and      c.object_name = s.table_name
		and      s.distribution   = 'SKEW'	-- if not skew, no histogram is need.
	 )
   WHERE 	c.object_name in ( 'BUSINESS_UNITS', 'JOB_CONTROL' )
   ;	-- change the second where condition if you want to expand or limit the scope of tables.
*/
END ;
/