@inproceedings{oai:nagoya.repo.nii.ac.jp:00008724,
  author    = {OHISHI, Yasunori and GOTO, Masataka and ITOU, Katunobu and TAKEDA, Kazuya},
  booktitle = {4th Symposium on "Intelligent Media Integration for Social Information Infrastructure", December 7-8, 2006},
  month     = {Dec},
  note      = {This paper describes a music retrieval system that enables a user to retrieve a song by two different methods: by singing its melody or by saying its title. To allow the user to switch between these methods seamlessly without changing the voice input mode, a method for automatically discriminating between singing and speaking voices is indispensable. We therefore first investigated measures that characterize the differences between singing and speaking voices. From subjective experiments, we found that even short-term characteristics, such as the spectral envelope represented by MFCCs, can serve as a discrimination cue, while the temporal structure is the most important cue when longer signals are given. Based on these results, we developed an automatic method for discriminating between singing and speaking voices by combining two measures: MFCCs and the F0 (voice pitch) contour. Using this method, we built a music retrieval system that accepts both singing voices for the melody and speaking voices for the title.},
  pages     = {113--114},
  publisher = {INTELLIGENT MEDIA INTEGRATION NAGOYA UNIVERSITY / COE},
  title     = {AUTOMATIC DISCRIMINATION BETWEEN SINGING AND SPEAKING VOICES FOR A FLEXIBLE MUSIC RETRIEVAL SYSTEM},
  year      = {2006}
}