% Record TD:101148 -- "Incremental Emotion Recognition", Mishra and Dimitriadis, August 2013.
% The att_* fields are non-standard; standard BibTeX styles ignore unknown fields.
@techreport{TD:101148,
	att_abstract={{Most emotion recognition systems do not perform real-time emotion recognition due to latencies caused by phrase segmentation and resource-intensive feature acquisition, etc. To address this issue, we present an emotion recognition approach that can estimate speaker emotions with much lower latency.

The proposed approach does not rely on phrase-level features to recognize speaker emotion; rather, it estimates the speaker's emotional state over the course of the utterance incrementally, using a shifting n-word window on the basis of easily computable features. These features are obtained from three information streams, i.e. cepstral, prosodic and textual, at the word-level and combined at decision-level using a statistical framework. Our work shows that combining the three information streams yields higher emotion recognition accuracy than any single information stream.

Using features extracted from n-word sequences rather than phrases provides for the low-latency capabilities of the proposed system, without any loss in utterance-level emotion recognition accuracy. The performance of the proposed system on a binary utterance-level emotion recognition task using an in-house database shows a relative improvement of 41% over chance, compared to a relative improvement of 31.82% shown by the baseline phrase-level emotion recognition approach.
}},
	att_authors={tm330a, dd734j},
	att_categories={C_IIS.11},
	att_copyright={{ISCA}},
	att_copyright_notice={{The definitive version was published in   2013. {{, 2013-08-25}}
}},
	att_donotupload={},
	att_private={false},
	att_projects={},
	att_tags={Emotion,  sentiment,  emotion recognition, feature fusion},
	att_techdoc={true},
	att_techdoc_key={TD:101148},
	att_url={http://web1.research.att.com:81/techdocs_downloads/TD:101148_DS1_2013-03-26T15:48:41.543Z.pdf},
	author={Mishra, Taniya and Dimitriadis, Dimitrios},
	institution={{ISCA Interspeech}},
	month=aug,
	title={Incremental Emotion Recognition},
	year=2013,
}