LISTING 2: This script, skewDscv.sql, determines whether column data is evenly distributed or skewed, according to a skew tolerance that you set. /* Filename : skewDscv.sql skewed data distribution discoverer Author : Julius J.Y. Lin 01/30/98 Purpose : find out if data distribution in tables are skewed or not after histAnalyzeProbe.sql produces the result in dba_histograms. Run this script to plug the information into skew_discovery. Eventually the bucket size will be put in CONDI_ANALYZE_CTRL. HSTGRM_BUCKETS Environment : Oracle V.7.3 Note !! : dba_histograms.endpoint is the bucket_number in most cases. * if a group of continual buckets have the same endpoint_value, only the last one in the group is put in the dba_histograms table. * you may have 5 groups in dba_histograms each represent 20% of data in an even distribution case, so you can't assume that if there is repeating endpoint_value, then it is skewed. In other words, if the return value of "select count(*) from dba_histograms where owner=x and table_name=y and column_name =z " is less than "select num_buckets from dba_tab_columns where owner=x and table_name=y and column_name =z " you can not jump to the conclusion that the data distribution is skewed. But if they are equal, then the data is highly evenly distributed. * run this script under user system */ -- may change to update, instead of truncate to keep the result of previous analyze and skew discovery truncate table skew_discovery_detail reuse storage ; truncate table skew_discovery reuse storage ; -- use dba_histograms.endpoint_value of bucket N minus the value in bucket N-1 -- to get Bucket-Density ( implied by 1/x_len, where x_len = v_x_end - v_x_begin) into skew_discovery_detail Declare v_bucket_size number := 75 ; -- consult size in analyze statement in -- skewDscvInst.sql -- the less the threshold the more restrictive the skew definition v_threshold number := 0.2 ; -- of ratio std_dev/mean/num_buckets v_x_end number ; v_x_begin number ; /* to demo the relative degree of skew, select all records from dba_histograms. You can parameterize v_bucket_size, v_threshold, and where table_name is a scope. Change where statement for different table scope or parameterize this condition */ cursor xlen_cs is select owner, table_name, column_name, endpoint_number, endpoint_value from dba_histograms where table_name in ('BUSINESS_UNITS', 'JOB_CONTROL' ) order by 1,2,3,4 ; /* -- next variation of cursor can be used in a real run to speed the process time because -- skew_discovery_detail will keep information of possible skewed columns only. -- if c.num_buckets = v_bucket_size then it is evenly distributed under current detail level of -- bucket_size specified in analyze command. cursor xlen_cs is select h.owner, h.table_name, h.column_name, h.endpoint_number, h.endpoint_value from dba_histograms h, dba_tab_columns c where h.owner = c.owner and h.table_name = c.table_name and h.column_name = c.column_name and c.num_buckets < v_bucket_size and h.table_name in ('BUSINESS_UNITS', 'JOB_CONTROL' ) order by 1,2,3,4 ; */ v_prev_owner dba_histograms.owner%TYPE ; v_prev_table dba_histograms.table_name%TYPE ; v_prev_col dba_histograms.column_name%TYPE ; v_prev_endp dba_histograms.endpoint_number%TYPE := 0; v_prev_endv dba_histograms.endpoint_value%TYPE := 0; BEGIN FOR xlen_rec IN xlen_cs LOOP -- if any one of the following 3 values changes then the mean column -- changed. If so, reset v_prev_endv to 0 IF xlen_rec.column_name != v_prev_col OR xlen_rec.table_name != v_prev_table OR xlen_rec.owner != v_prev_owner THEN v_prev_endv := 0 ; v_prev_endp := 0 ; END IF ; /* Start to reconstruct missing records in dba_histograms because the statistic function needs data to explicitly exist. If column NUM_BUCKETS in table DBA_TAB_COLUMNS is greater than 1, and DBA_HISTOGRAMS.ENDPOINT_NUMBER is less than or equal to the bucket size you specified in the analyze command, and it does not have a contiguous number, then there are missing buckets that need to be reconstructed. */ -- normally we will use endpoint_value to calculate Bucket-Density v_x_begin := v_prev_endv ; v_x_end := xlen_rec.endpoint_value ; IF xlen_rec.endpoint_number < v_bucket_size THEN --reconstruct missing buckets in a while loop WHILE ( xlen_rec.endpoint_number > v_prev_endp ) LOOP insert into skew_discovery_detail ( owner, table_name,column_name, bucket_number, Bucket_Density ) values ( xlen_rec.owner, xlen_rec.table_name, xlen_rec.column_name, v_prev_endp, v_x_end - v_x_begin ) ; v_prev_endp := v_prev_endp + 1 ; v_x_begin := xlen_rec.endpoint_value ; END LOOP ; ELSIF xlen_rec.endpoint_number > v_bucket_size THEN /* If DBA_HISTOGRAMS.ENDPOINT_NUMBER is greater than the bucket size specified in the analyze command, then transpose them using endpoint_number as endpoint_value to calculate Bucket-Density without making up missing bucket. The pair of (endpoint_value, endpoint_number ) is equal to the output of the next SQL statement : select &column_name, count(*) from &table_name group by &column_name ; where "&" indicates a variable, so the reader can replace it with an actual value. In this case, each group of "&column_name" represents a bucket and the Bucket-Densitys need to be calculated from the difference of count(*) value */ v_x_begin := v_prev_endp ; v_x_end := xlen_rec.endpoint_number ; END IF ; -- under normal situations, bucket comes from the difference of two end points insert into skew_discovery_detail ( owner, table_name, column_name, bucket_number, Bucket_Density ) values ( xlen_rec.owner, xlen_rec.table_name, xlen_rec.column_name, xlen_rec.endpoint_number, v_x_end - v_x_begin ) ; v_prev_owner := xlen_rec.owner ; v_prev_table := xlen_rec.table_name ; v_prev_col := xlen_rec.column_name ; v_prev_endp := xlen_rec.endpoint_number + 1; v_prev_endv := xlen_rec.endpoint_value ; END LOOP; -- FOR xlen_rec /* Now get the standard deviation and use the average as the mean, though statistically mean is different to average. Oracle's built-in functions do not provide a mean calculation, but for the skew test, average is close enough. skew_discovery.buckets represents real buckets after analyze */ -- one record for every column in skew_discovery header table INSERT INTO skew_discovery ( owner, table_name, column_name, std_dev, mean, table_buckets ) SELECT owner, table_name, column_name, stddev(Bucket_Density), avg( Bucket_Density ), count(*) FROM skew_discovery_detail GROUP BY owner, table_name, column_name ; update skew_discovery s set s.column_buckets = ( select c.num_buckets from dba_tab_columns c where s.owner = c.owner and s.table_name = c.table_name and s.column_name = c.column_name ) ; /* why s.column_buckets is increased by 1 when d.bucket_number > v_bucket_size: The DBA_TAB_COLUMN.NUM_BUCKETS does not correctly represent the actual buckets in this case, so skew_discovery.column_buckets = dba_table_column.num_bucket + 1, because each record is a bucket. It is not the normal case that each bucket equals the current endpoint_value - previous endpoint_value. */ update skew_discovery s set s.column_buckets = s.column_buckets + 1 where exists ( select 1 from skew_discovery_detail d where s.owner = d.owner and s.table_name = d.table_name and s.column_name = d.column_name and d.bucket_number > v_bucket_size ) ; UPDATE skew_discovery SET distribution = 'SKEW' WHERE std_dev / decode( sign(mean - 1), -1, 1, mean)/ column_buckets > v_threshold AND column_buckets > 1 ; /* next SQL statement can be run independently, once you are satisfied with the results you received. The reason it is not run here is because once it is put into condi_analyze_ctrl it will affect production. */ dbms_output.put_line('Please run the next update statement in this file'); dbms_output.put_line('to update condi_analyze_ctrl table, if test OK'); /* -- max(s.buckets) always <= tc.num_buckets, if they are equal, -- mean it reach the max. resolution UPDATE condi_analyze_ctrl c SET c.hstgrm_buckets = ( select distinct bucket_size from skew_discovery s where c.owner = s.owner and c.object_name = s.table_name and s.distribution = 'SKEW' -- if not skew, no histogram is need. ) WHERE c.object_name in ( 'BUSINESS_UNITS', 'JOB_CONTROL' ) ; -- change the second where condition if you want to expand or limit the scope of tables. */ END ; /