LISTING 2: This script, skewDscv.sql, determines whether column data is evenly distributed or skewed, according to a skew tolerance that you set.
/*
Filename : skewDscv.sql skewed data distribution discoverer
Author : Julius J.Y. Lin 01/30/98
Purpose : find out if data distribution in tables are skewed or not after histAnalyzeProbe.sql
produces the result in dba_histograms. Run this script to plug the information into skew_discovery.
Eventually the bucket size will be put in CONDI_ANALYZE_CTRL. HSTGRM_BUCKETS
Environment : Oracle V.7.3
Note !! : dba_histograms.endpoint is the bucket_number in most cases.
* if a group of continual buckets have the same endpoint_value, only the
last one in the group is put in the dba_histograms table.
* you may have 5 groups in dba_histograms each represent 20% of data in
an even distribution case, so you can't assume that if there is repeating
endpoint_value, then it is skewed. In other words, if the return value of
"select count(*)
from dba_histograms
where owner=x and table_name=y and column_name =z "
is less than
"select num_buckets
from dba_tab_columns
where owner=x and table_name=y and column_name =z "
you can not jump to the conclusion that the data distribution is skewed.
But if they are equal, then the data is highly evenly distributed.
* run this script under user system
*/
-- may change to update, instead of truncate to keep the result of previous analyze and skew discovery
truncate table skew_discovery_detail reuse storage ;
truncate table skew_discovery reuse storage ;
-- use dba_histograms.endpoint_value of bucket N minus the value in bucket N-1
-- to get Bucket-Density ( implied by 1/x_len, where x_len = v_x_end - v_x_begin) into skew_discovery_detail
Declare
v_bucket_size number := 75 ; -- consult size in analyze statement in
-- skewDscvInst.sql
-- the less the threshold the more restrictive the skew definition
v_threshold number := 0.2 ; -- of ratio std_dev/mean/num_buckets
v_x_end number ;
v_x_begin number ;
/* to demo the relative degree of skew, select all records from dba_histograms.
You can parameterize v_bucket_size, v_threshold, and where table_name is a scope.
Change where statement for different table scope or parameterize this condition
*/
cursor xlen_cs is
select owner, table_name, column_name,
endpoint_number, endpoint_value
from dba_histograms
where table_name in ('BUSINESS_UNITS', 'JOB_CONTROL' )
order by 1,2,3,4
;
/*
-- next variation of cursor can be used in a real run to speed the process time because
-- skew_discovery_detail will keep information of possible skewed columns only.
-- if c.num_buckets = v_bucket_size then it is evenly distributed under current detail level of
-- bucket_size specified in analyze command.
cursor xlen_cs is
select h.owner, h.table_name, h.column_name,
h.endpoint_number, h.endpoint_value
from dba_histograms h, dba_tab_columns c
where h.owner = c.owner
and h.table_name = c.table_name
and h.column_name = c.column_name
and c.num_buckets < v_bucket_size
and h.table_name in ('BUSINESS_UNITS', 'JOB_CONTROL' )
order by 1,2,3,4
;
*/
v_prev_owner dba_histograms.owner%TYPE ;
v_prev_table dba_histograms.table_name%TYPE ;
v_prev_col dba_histograms.column_name%TYPE ;
v_prev_endp dba_histograms.endpoint_number%TYPE := 0;
v_prev_endv dba_histograms.endpoint_value%TYPE := 0;
BEGIN
FOR xlen_rec IN xlen_cs LOOP
-- if any one of the following 3 values changes then the mean column
-- changed. If so, reset v_prev_endv to 0
IF xlen_rec.column_name != v_prev_col OR
xlen_rec.table_name != v_prev_table OR
xlen_rec.owner != v_prev_owner THEN
v_prev_endv := 0 ;
v_prev_endp := 0 ;
END IF ;
/*
Start to reconstruct missing records in dba_histograms because the statistic
function needs data to explicitly exist.
If column NUM_BUCKETS in table DBA_TAB_COLUMNS is greater than 1, and
DBA_HISTOGRAMS.ENDPOINT_NUMBER is less than or equal to the bucket size you
specified in the analyze command, and it does not have a contiguous number, then there are
missing buckets that need to be reconstructed.
*/
-- normally we will use endpoint_value to calculate Bucket-Density
v_x_begin := v_prev_endv ;
v_x_end := xlen_rec.endpoint_value ;
IF xlen_rec.endpoint_number < v_bucket_size THEN
--reconstruct missing buckets in a while loop
WHILE ( xlen_rec.endpoint_number > v_prev_endp ) LOOP
insert into skew_discovery_detail (
owner, table_name,column_name, bucket_number, Bucket_Density )
values ( xlen_rec.owner, xlen_rec.table_name,
xlen_rec.column_name, v_prev_endp,
v_x_end - v_x_begin
) ;
v_prev_endp := v_prev_endp + 1 ;
v_x_begin := xlen_rec.endpoint_value ;
END LOOP ;
ELSIF xlen_rec.endpoint_number > v_bucket_size THEN
/*
If DBA_HISTOGRAMS.ENDPOINT_NUMBER is greater than the bucket size specified in the
analyze command, then transpose them using endpoint_number as endpoint_value to calculate
Bucket-Density without making up missing bucket.
The pair of (endpoint_value, endpoint_number ) is equal to the output of the next SQL statement :
select &column_name, count(*)
from &table_name
group by &column_name
;
where "&" indicates a variable, so the reader can replace it with an actual value.
In this case, each group of "&column_name" represents a bucket and the Bucket-Densitys need
to be calculated from the difference of count(*) value
*/
v_x_begin := v_prev_endp ;
v_x_end := xlen_rec.endpoint_number ;
END IF ;
-- under normal situations, bucket comes from the difference of two end points
insert into skew_discovery_detail (
owner, table_name, column_name, bucket_number, Bucket_Density )
values ( xlen_rec.owner, xlen_rec.table_name,
xlen_rec.column_name, xlen_rec.endpoint_number,
v_x_end - v_x_begin
) ;
v_prev_owner := xlen_rec.owner ;
v_prev_table := xlen_rec.table_name ;
v_prev_col := xlen_rec.column_name ;
v_prev_endp := xlen_rec.endpoint_number + 1;
v_prev_endv := xlen_rec.endpoint_value ;
END LOOP; -- FOR xlen_rec
/*
Now get the standard deviation and use the average as the mean, though statistically
mean is different to average. Oracle's built-in functions do not provide a mean
calculation, but for the skew test, average is close enough.
skew_discovery.buckets represents real buckets after analyze
*/
-- one record for every column in skew_discovery header table
INSERT INTO skew_discovery (
owner, table_name, column_name, std_dev, mean, table_buckets )
SELECT owner, table_name, column_name, stddev(Bucket_Density),
avg( Bucket_Density ), count(*)
FROM skew_discovery_detail
GROUP BY owner, table_name, column_name
;
update skew_discovery s
set s.column_buckets = (
select c.num_buckets
from dba_tab_columns c
where s.owner = c.owner
and s.table_name = c.table_name
and s.column_name = c.column_name
)
;
/*
why s.column_buckets is increased by 1 when d.bucket_number > v_bucket_size:
The DBA_TAB_COLUMN.NUM_BUCKETS does not correctly represent the actual
buckets in this case, so skew_discovery.column_buckets = dba_table_column.num_bucket + 1,
because each record is a bucket. It is not the normal case that each bucket equals the current
endpoint_value - previous endpoint_value.
*/
update skew_discovery s
set s.column_buckets = s.column_buckets + 1
where exists (
select 1
from skew_discovery_detail d
where s.owner = d.owner
and s.table_name = d.table_name
and s.column_name = d.column_name
and d.bucket_number > v_bucket_size
)
;
UPDATE skew_discovery
SET distribution = 'SKEW'
WHERE std_dev / decode( sign(mean - 1), -1, 1, mean)/ column_buckets > v_threshold
AND column_buckets > 1
;
/* next SQL statement can be run independently, once you are satisfied with the results
you received. The reason it is not run here is because once it is put into condi_analyze_ctrl
it will affect production.
*/
dbms_output.put_line('Please run the next update statement in this file');
dbms_output.put_line('to update condi_analyze_ctrl table, if test OK');
/*
-- max(s.buckets) always <= tc.num_buckets, if they are equal,
-- mean it reach the max. resolution
UPDATE condi_analyze_ctrl c
SET c.hstgrm_buckets = (
select distinct bucket_size
from skew_discovery s
where c.owner = s.owner
and c.object_name = s.table_name
and s.distribution = 'SKEW' -- if not skew, no histogram is need.
)
WHERE c.object_name in ( 'BUSINESS_UNITS', 'JOB_CONTROL' )
; -- change the second where condition if you want to expand or limit the scope of tables.
*/
END ;
/