% TD:100424 -- conference paper record exported from a techdocs database.
% Fields prefixed with att_ are non-standard metadata; standard BibTeX styles ignore them.
% NOTE(review): att_copyright_notice names EMNLP and att_copyright names ACL, while
% booktitle/institution name Interspeech -- confirm the actual venue and rights holder.
% NOTE(review): institution is not a field standard styles use for inproceedings
% entries; it is kept unchanged in case downstream tooling reads it.
@inproceedings{TD:100424,
	att_abstract={Parallel text acquisition from the Web is an attractive way for
augmenting statistical models (e.g., machine translation, cross-lingual
document retrieval) with domain representative data.
The basis for obtaining such data is a collection of pairs of bilingual
Web sites or pages. In this work, we propose a crawling
strategy that locates bilingual Web sites by constraining the visitation
policy of the crawler to the graph neighborhood of bilingual
sites on the Web. Subsequently, we use a novel recursive
mining technique that recursively extracts text and links from
the collection of bilingual Web sites obtained from the crawling.
Our method does not suffer from the computationally prohibitive
combinatorial matching typically used in previous work
that uses document retrieval techniques to match a collection of
bilingual webpages. We demonstrate the efficacy of our approach
in the context of machine translation in the tourism and
hospitality domain. The parallel text obtained using our novel
crawling strategy results in a relative improvement of 21% in
BLEU score (English-to-Spanish) over an out-of-domain seed
translation model trained on the European parliamentary proceedings.},
	att_authors={vk947h, lb091c, sb7658},
	att_categories={},
	att_copyright={ACL},
	att_copyright_notice={The definitive version was published in EMNLP. {{, 2011-08-27}}
},
	att_donotupload={},
	att_private={false},
	att_projects={},
	att_tags={},
	att_techdoc={true},
	att_techdoc_key={TD:100424},
	att_url={http://web1.research.att.com:81/techdocs_downloads/TD:100424_DS1_2013-01-30T20:54:44.114Z.pdf},
	author={Vivek Kumar Rangarajan Sridhar and Luciano Barbosa and Srinivas Bangalore},
	booktitle={Proceedings of Interspeech},
	institution={{INTERSPEECH}},
	month=aug,
	title={A Scalable Approach to Building a Parallel Corpus from the {Web}},
	year=2011,
}