@phdthesis{oai:nagoya.repo.nii.ac.jp:00009574,
  author   = {Hagiwara, Masato},
  title    = {Modeling and Selection of Context for Better Synonym Acquisition},
  school   = {名古屋大学, Nagoya University},
  year     = {2009},
  month    = mar,
  note     = {名古屋大学博士学位論文 学位の種類:博士(情報科学) (課程) 学位授与年月日:平成21年3月25日},
  abstract = {Lexical knowledge is one of the most fundamental yet important resources for natural language processing, having a broad range of applications such as query expansion for information retrieval and automatic thesaurus construction. However, construction and maintenance of lexical knowledge by hand are costly tasks. Therefore, they should be technically supported or fully automated. Among various kinds of lexical relations, synonym relation is frequently used as the basis in many applications. To automatically detect synonym relations from corpora, a concept called distributional hypothesis, which states that semantically similar words tend to share similar context, has been utilized. Distributional similarity is then computed from the extracted context. However, although there have been a number of researches which focused on the similarity computation itself and the use of context is actually the essence of distributional similarity, the importance of context has been surprisingly underestimated so far. Firstly, very few studies have ever paid attention to the formalization or comparison of effective context for synonym acquisition, let alone the extension. However, how to construct and extend the context is the most fundamental issue for distributional similarity. Secondly, the effective context for synonym acquisition has only been intuitively determined, although the context used for synonym acquisition greatly influences the performance of the task. Thirdly, past studies have paid attention to the simple vector space model almost exclusively. However, more complex, semantic representation of context must be beneficial to synonym acquisition.
In response to these problems, we, in this thesis, address the issues of formalization, extension, selection, and modeling of context in order to achieve better synonym acquisition from large corpora, which have been all unsolved or underestimated in the literature of synonym acquisition and distributional similarity. More specifically, the contribution of this thesis is three-fold — (1) As for the context representation and extension problem, we propose a new method to formalize and extend the conventional dependency-based context through the use of indirect dependency, and show its effectiveness. (2) As for the context selection problem, we propose three schemes to automatically select the extracted context types using statistical measures, and show that these schemes are effective in terms of their performance/cost trade-off. (3) As for the context modeling problem, we pursue the application of latent semantic models to formalize and model the context of words, or word-context co-occurrences, and show that these models can boost the synonym acquisition performance. Also, we propose supervised approaches to synonym acquisition, contextual feature-based classification and metric learning, and show that these both demonstrate the higher performance compared to conventional simpler models. Note that, although the methods in this thesis are based on English, they are not limited to any particular languages. This thesis consists of a total of eight chapters. Chapter 1 is the introduction to this thesis, where we introduce the current issues and problems regarding distributional similarity and clarify the position of this thesis. In Chapter 2, we introduce some basic concepts of distributional similarity, describe how to extract dependency-based context, and list a number of similarity measures and weighting functions, as the baseline for other experiments described in this thesis.
In Chapter 3, we actually apply the concepts introduced in Chapter 2 to the synonym acquisition task. To evaluate the performance, we introduce two evaluation measures, i.e., average precision (AP) and correlation coefficient (CC), which both use existing thesauri such as WordNet. As the preprocessing of experiments, we firstly investigate the effect of frequency cut-off to word and context types. We then compare similarity measures and weighting functions, and show that similarity measures which incorporate some type of normalization to the context vector or the probability distribution, namely cosine similarity, vector-based Jaccard coefficient, and Jensen-Shannon divergence, perform well. The experiment has also shown that PMI and t-test are among the best, meaning that the word-based normalization factor may have helped. In Chapter 4, we introduce and apply two latent semantic models, i.e., LSI and PLSI, to synonym acquisition and compare their performance with the simple vector space model. Latent semantic models formalize the co-occurrences of words and contexts through latent semantics, and solve the problems of noise and sparseness. Although each model behaves quite differently, LSI and PLSI have achieved higher performance when coupled with their own suitable measures. We have also shown that the performance peaks when the number of the latent classes is around 100 to 150. In Chapter 5, we formalize and extend the normal direct dependency to cover indirectly related words and enhance the contextual information for distributional similarity. The experiment shows that incorporating indirect dependency in addition to direct dependency is effective for the acquisition performance. We also compare the context representations for indirect dependency. The improvement is especially clear when fine-grained context representations are used.
In Chapter 6, we propose three schemes of context selection for distributional similarity: category-, type-, and co-occurrence based selection. In the experiment, these three selection schemes are compared, clarifying the characteristics of the schemes and the measures. It shows the effectiveness of the simplest category-based selection, while type- and co-occurrence based selection methods work well for both the word- and dependency-based context, showing that these methods can be generally and flexibly used for any kinds of context and dimensionality/computational cost constraints. In Chapter 7, two novel, supervised approaches to synonym acquisition are proposed. The first one is a classification model, where we propose context-based features called distributional features, which enabled us to use both context-based features and pattern-based features to build fully integrated classifiers. The comparison experiment shows that the context-based features greatly increase the performance (more than 60% on F-1 measure) compared to distributional similarity-based methods. The other approach learns Mahalanobis distance, which is the generalization of Euclidean distance. It significantly outperforms existing similarity metrics. Although we have to resort to aggressive feature reduction to make it possible to apply the learning, the performance gain from the supervised learning is enough to offset the disadvantage and justify its usage in some applications. In Chapter 8, we conclude this thesis, wrapping up the findings and results obtained from each chapter. We also present our possible future direction of this study.},
}