% TD:100030 -- Berti-Equille, Dasu, Srivastava: "Discovery of Complex Glitch
% Patterns" (2011). The att_* fields are custom metadata that standard
% bibliography styles ignore; they are kept verbatim.
% NOTE(review): `institution` below holds a conference name rather than an
% issuing institution. If this work is the published conference paper,
% @inproceedings with `booktitle` would presumably be the better fit -- confirm
% against whatever tooling consumes the att_techdoc records before changing it.
@techreport{TD:100030,
	att_abstract = {{Data quality mining (DQM) is the use of data mining techniques to
detect, quantify, explain, and correct data quality problems. Current
DQM approaches focus on addressing each category of data
glitch separately. However, in real-life data, different types of data
glitches co-occur in complex patterns. These patterns and interactions
between glitches offer valuable clues for developing effective
data cleaning strategies that are more informed than blind, predefined
strategies.
In this paper, we propose a novel data quality mining framework
for the comprehensive definition, detection and cleaning of complex,
multi-type data glitches. We exploit the distributions and interaction
of different types of glitches to develop data-driven cleaning
strategies that offer significant advantages over blind strategies.
We develop a statistically rigorous framework for glitch scoring,
strategy evaluation and the selection of an optimal strategy from
the space of candidate quantitative cleaning strategies. We demonstrate
the efficacy and scalability of our framework on very large
real and synthetic data sets.}},
	att_authors = {td3863, ds8961},
	att_categories = {},
	att_copyright = {{IEEE}},
	att_copyright_notice = {{}},
	att_donotupload = {true},
	att_private = {false},
	att_projects = {},
	att_tags = {},
	att_techdoc = {true},
	att_techdoc_key = {TD:100030},
	att_url = {},
	author = {Berti-Equille, Laure and Dasu, Tamraparni and Srivastava, Divesh},
	institution = {{IEEE International Conference on Data Engineering (ICDE)}},
	month = apr,
	title = {Discovery of Complex Glitch Patterns: A Novel Approach to Quantitative Data Cleaning},
	year = 2011,
}