% Conference paper by Flynn, Shirley and Wang (2016), cited under the key TD:101648.
% The att_* fields are non-standard exporter metadata; they are kept exactly as exported.
% NOTE(review): the entry type was changed from techreport to inproceedings because the
%   venue recorded for this work is a conference; att_techdoc is left as exported -- confirm
%   that downstream tooling keys off att_techdoc rather than the entry type.
% NOTE(review): att_copyright_notice contains two variants of the reprint notice (one naming
%   the conference, one saying "published in 2015"); confirm which is authoritative before
%   de-duplicating. It is intentionally not edited here.
@inproceedings{TD:101648,
	att_abstract={{Measurement of the lexical properties of domain names enables many types of relatively fast, lightweight web mining analyses. These include unsupervised learning tasks such as automatic categorization and clustering of websites, as well as supervised learning tasks, such as classifying websites as malicious or benign. In this paper we explore whether these tasks can be better accomplished by identifying semantically coherent groups of words in a large set of domain names using a combination of word segmentation and topic modeling methods. By segmenting domain names to generate a large set of new domain-level features, we compare three different unsupervised learning methods for identifying topics among domain name keywords: spherical k-means clustering (SKM), Latent Dirichlet Allocation (LDA), and the Biterm Topic Model (BTM). We successfully infer semantically coherent groups of words in two independent data sets, finding that BTM topics are quantitatively the most coherent. Using the BTM, we compare inferred topics across data sets and across time periods, and we also highlight instances of homophony within the topics. Finally, we show that the BTM topics can be used as features to improve the interpretability of a supervised learning model for the detection of malicious domain names. To our knowledge this is the first large-scale empirical analysis of the co-occurrence patterns of words within domain names.}},
	att_authors={cf5264, ks970b, ww727g},
	att_categories={C_CCF.9},
	att_copyright={{IEEE}},
	att_copyright_notice={{This version of the work is reprinted here with permission of IEEE for your personal use. Not for redistribution. The definitive version was published in IEEE Conference on Data Science and Advanced Analytics {{, 2016-10-17}}



This version of the work is reprinted here with permission of IEEE for your personal use. Not for redistribution. The definitive version was published in 2015. {{, 2016-10-17}}
}},
	att_donotupload={},
	att_private={false},
	att_projects={},
	att_tags={Domain Names,  Latent Dirichlet Allocation (LDA),  Text Mining,  Topic Models,  Word Segmentation},
	att_techdoc={true},
	att_techdoc_key={TD:101648},
	att_url={http://web1.research.att.com:81/techdocs_downloads/TD:101648_DS1_2016-09-07T17:34:40.635Z.pdf},
	author={Flynn, Cheryl and Shirley, Kenneth and Wang, Wei},
	booktitle={{IEEE} Conference on Data Science and Advanced Analytics},
	month=oct,
	title={Deconstructing Domain Names to Reveal Latent Topics},
	year=2016,
}