% PhD thesis record exported from the Nagoya University institutional repository.
% Key kept as-is (colons/dots are legal in BibTeX keys) so existing \cite calls still resolve.
% The abstract was moved out of `note` (which standard styles PRINT in the bibliography)
% into `abstract` (ignored by classic .bst styles, used by biblatex); only the actual
% degree annotation remains in `note`. NOTE(review): entry contains UTF-8 Japanese text —
% requires biblatex+Biber or a UTF-8-aware BibTeX; confirm the build toolchain.
@phdthesis{oai:nagoya.repo.nii.ac.jp:00012410,
  author   = {坂井, 誠 and SAKAI, Makoto},
  title    = {Acoustic Feature Transformation Based on Generalized Criteria for Speech Recognition},
  school   = {名古屋大学, Nagoya University},
  year     = {2010},
  month    = sep,
  abstract = {This thesis deals with acoustic feature transformations in automatic speech recognition to improve basic performance of a speech recognizer. The aim of acoustic feature transformations is to reduce dimensionality of long-term speech features without losing discriminative information among the different phonetic classes.
First, we focus on optimizing acoustic feature transformations using criteria with which to maximize the ratio of between-class scatter to within-class scatter. This approach is based on a family of functions of scatter or covariance matrices, which is frequently used in practice. Typical methods in this approach include linear discriminant analysis (LDA), heteroscedastic linear discriminant analysis (HLDA), and heteroscedastic discriminant analysis (HDA). Although LDA, HLDA and HDA are the most widely used in speech recognition, the connections between them have been disregarded so far. By developing a unified mathematical framework, close relationships between them are identified and analyzed in detail. The framework termed power LDA (PLDA) can describe various criteria by varying its control parameter. PLDA includes LDA, HLDA and HDA as special cases. In order to determine a sub-optimal control parameter automatically, a control parameter selection method is also provided.
The effectiveness of the combinations of acoustic feature transformations and discriminative training techniques of acoustic models is investigated and additional performance improvement is obtained. Unfortunately, the transformation methods mentioned above may result in an unexpected dimensionality reduction if the data in a certain class consist of several clusters, because they implicitly assume that data are generated from a single Gaussian distribution. This study provides extensions of HDA and PLDA to deal with class distributions with several clusters.
Second, we focus attention on acoustic feature transformations which minimize a kind of classification error between different phonetic classes. As the performance of speech recognition systems generally correlates strongly with the classification accuracy of features, the features should have the power to discriminate between different classes. The existing methods for this approach attempt to minimize the average classification error between different classes. Although minimizing the average classification error suppresses total classification error, it cannot prevent the occurrence of considerable overlaps between distributions of some different classes with low frequencies, which is critical for speech recognition because there may be class pairs that have little or no discriminative information on each other. Instead of the average classification error, minimization methods of maximum classification error are proposed herewith so as to avoid considerable error between different classes. In addition, interpolation methods that minimize the maximization error while minimizing the average classification error are also proposed and achieved the best results.},
  note     = {名古屋大学博士学位論文 学位の種類:博士(情報科学)(課程) 学位授与年月日:平成22年9月30日},
}