Technical Article

Source system pre-integration analysis

,

Initial analysis of a new source system can be quite time consuming to manually examine tables and columns to determine which even have worthwhile values in the first place. Running this script against the current database will generate useful information for all the columns of all the tables. Details include data types with varchar lengths, row counts, minimum and maximum values, and a count of the nulls and the distincts. It does take a long time to run but my (your) time can be spent more productively doing something else while this examines all the columns. Please see inline comments for additional thoughts.

--  This sample script uses a temp table.  I recommend replacing the references to the temp table
--  with a static table due to the potentially lengthy processing times involved with this procedure.
--  A good strategy is to create a database for holding these tables and naming a table in that
--  database for each database examined.

create table #tmr_temp(
Table_Name varchar(200),
Col_No int,
Col_Nm varchar(200),
Data_Type varchar(200),
Row_Count int,
Null_Count int,
Distinct_Count int,
Min_Value varchar(200),
Max_Value varchar(200)
)

declare @next_table as varchar(200)
declare @next_column as varchar(200)
declare @next_data_type as varchar(200)
declare @runme as varchar(500)

--  Initial load of the temp table creates the list of tables and columns in the current db.
--  I borrowed and only slightly modified the next 8 code lines from another script found on sqlservercentral.com

insert into #tmr_temp (Table_Name, Col_No, Col_Nm, Data_Type) select
TABLE_NAME,
ORDINAL_POSITION 'Col No',
COLUMN_NAME 'Column Name',
substring(DATA_TYPE + case when DATA_TYPE like '%char%' then '(' + 
rtrim(convert(char(3),CHARACTER_MAXIMUM_LENGTH)) + ')' else '' end,1,25) 'DataType'
from INFORMATION_SCHEMA.COLUMNS 

--  Add any other undesired tables here; especially tables already examined and/or
--  suspected of containing large amounts of useful data (There's no need to do this level of
--  initial examination on a table you already know to be useful).

where TABLE_NAME not in ('dtproperties','sysconstraints','syssegments')
order by Table_Name, Ordinal_Position

--  This cursor will loop through to fill in the details into the temp table.

declare table_cursor cursor for select Table_Name, Col_Nm, Data_Type from #tmr_temp

open table_cursor
fetch next from table_cursor into @next_table, @next_column, @next_data_type

--  This WHILE loop takes a long time to process with any significant amount of data
--  even on a speedy server.  It should be run when other database use is at its lowest.

while (@@fetch_status <> -1)
begin

-- Both the count and the NULL count are not dependent upon data type and can be processed first.

set @runme = 'update #tmr_temp set Row_Count = (select count(1) from '+@next_table+') where Table_Name = '''+@next_table+''''
exec (@runme)
set @runme = 'update #tmr_temp set Null_Count = (select count(1) from '+@next_table+' where '+@next_column+' is null) where Table_Name = '''+@next_table+''' and Col_Nm = '''+@next_column+''''
exec (@runme)

--  SQL Server will error on aggregate functions with data types image, bit, text, binary and varbinary.
--  If the database in questions contains additional types that cannot be used with the
--  min and max functions, add those types here.

if @next_data_type not in ('image', 'bit', 'text', 'binary','varbinary')
begin
set @runme = 'update #tmr_temp set Min_Value = (select cast(min('+@next_column+') as varchar(200)) from '+@next_table +') where Table_Name = '''+@next_table+''' and Col_Nm = '''+@next_column+''''
exec (@runme)
set @runme = 'update #tmr_temp set Max_Value = (select cast(max('+@next_column+') as varchar(200)) from '+@next_table + ') where Table_Name = '''+@next_table+''' and Col_Nm = '''+@next_column+''''
exec (@runme)
set @runme = 'update #tmr_temp set Distinct_Count = (select count(distinct '+@next_column+') from '+@next_table+') where Table_Name = '''+@next_table+''' and Col_Nm = '''+@next_column+''''
exec (@runme)
end
fetch next from table_cursor into @next_table, @next_column, @next_data_type
end
close table_cursor
deallocate table_cursor

--  The Min and Max values columns will show 'NULL' when the row count is zero.
--  This is confusing at first glance as it appears the column in question
--  contains only NULL values when really there are no values at all.

update #tmr_temp set Min_Value = ' ** ZERO ROWS ** ' where Row_Count = 0
update #tmr_temp set Max_Value = ' ** ZERO ROWS ** ' where Row_Count = 0
select * from #tmr_temp

--  If using a static table or wish to further query the temp table, comment out the line below.

drop table #tmr_temp

Rate

5 (1)

You rated this post out of 5. Change rating

Share

Share

Rate

5 (1)

You rated this post out of 5. Change rating