% DEXTER paper, published in the Proceedings of the VLDB Endowment, vol. 8, no. 13 (per att_copyright_notice). The att_* fields are custom metadata that standard bibliography styles ignore.
@article{TD:101596,
	att_abstract={{The web is a rich resource of structured data. There has been an increasing interest in using web structured data for many applications such as data integration, web search and question answering. In this paper, we present DEXTER, a system to find product sites on the web and, detect and extract product specifications from them. Since product specifications are not located in a single point on the web, but spread among different product sites, our focused crawler relies on search queries and backlinks to discover product sites. To perform the detection, and handle the high diversity of specifications in terms of content, size and format, our system uses supervised learning to classify html fragments (e.g., tables and lists) present in web pages as specifications or not. To perform large-scale extraction of the attribute-value pairs from the html fragments identified by the specification detector, DEXTER adopts two lightweight strategies: a domain-independent and unsupervised wrapper method, which relies on the observation that these html fragments have very similar structure; and a combination of this strategy with a previous approach, which infers extraction patterns by annotations generated by automatic but noisy annotators. The results show that (1) our crawler strategy to locate product pages is effective: it discovered product pages from 2,719 sites and 5 different categories; (2) the specification detector obtains high values of F-measure (close to 0.9) over a heterogeneous set of specifications; and (3) our efficient wrapper methods get very high values of precision and recall and obtain better results than a supervised rule-based wrapper.}},
	att_authors={ds8961},
	att_categories={C_BB.1, C_NSS.2, C_IIS.5},
	att_copyright={{VLDB Foundation}},
	att_copyright_notice={{The definitive version was published in Very Large Databases, 2015. {{, Volume 8}}{{, Issue 13}}{{, 2015-08-31}}
}},
	att_donotupload={},
	att_private={false},
	att_projects={},
	att_tags={},
	att_techdoc={true},
	att_techdoc_key={TD:101596},
	att_url={http://web1-clone.research.att.com:81/techdocs_downloads/TD:101596_DS1_2015-11-28T16:24:36.427Z.pdf},
	author={Srivastava, Divesh and Qiu, Disheng and Barbosa, Luciano and Dong, Xin Luna and Shen, Yanyan},
	journal={Proceedings of the {VLDB} Endowment},
	month=aug,
	number={13},
	title={{DEXTER}: Large-Scale Discovery and Extraction of Product Specifications on the Web},
	volume={8},
	year=2015,
}