@techreport{TD:100324,
	att_abstract={{Video-based lip activity analysis has been successfully used to assist speech recognition for almost a decade. Surprisingly, this same activity has not been heavily used for near real-time visual speaker retrieval and verification, due to tracking complexity, inadequate or difficult feature determination, and the need for a large amount of pre-labeled data for model training. This paper explores the performance of several solutions using modern histogram of oriented gradients (HOG) features, different quantization techniques, and the benefits of temporal sampling and spatial partitioning to derive a representation called LipActs. Two datasets are used for evaluation: one with 81 participants derived from varying-quality YouTube content and one with 3 participants derived from a forward-facing mobile video camera with 10 varied lighting and capture-angle environments. Over these datasets, histograms of a moderate number of pooled temporal frames with multi-resolution spatial quantization offer an improvement of 37-73% over raw features when optimizing for lowest equal error rate (EER).}},
	att_authors={ez2685},
	att_categories={C_IIS.13, C_IIS.4, C_IIS.10},
	att_copyright={{IEEE}},
	att_copyright_notice={{This version of the work is reprinted here with permission of IEEE for your personal use. Not for redistribution. The definitive version was published in IEEE ICME, 2011-07-11.}},
	att_donotupload={},
	att_private={false},
	att_projects={Miracle},
	att_tags={recognition, verification, retrieval, video, mobile, face, lips},
	att_techdoc={true},
	att_techdoc_key={TD:100324},
	att_url={http://web1.research.att.com:81/techdocs_downloads/TD:100324_DS1_2011-03-16T19:34:39.556Z.pdf},
	author={Eric Zavesky},
	institution={{IEEE ICME}},
	month={July},
	title={{LipActs: Efficient Representations For Visual Speakers}},
	year={2011},
}