001    /**
002     * Stemmer.java
003     * jCOLIBRI2 framework. 
004     * @author Juan A. Recio-García.
005     * GAIA - Group for Artificial Intelligence Applications
006     * http://gaia.fdi.ucm.es
007     * 15/04/2007
008     */
009    package jcolibri.extensions.textual.stemmer;
010    
011    import java.util.StringTokenizer;
012    
013    /**
014     * Stemmes a word using the Snowball package. It works with several languages.
015     * @author Juan A. Recio-García
016     * @version 1.0
017     */
018    public class Stemmer {
019    
020            /** Available languages */
021            public enum Language { DANISH, DUTCH, ENGLISH, FINNISH, FRENCH, GERMAN, ITALIAN, NORWEGIAN, PORTUGUESE, RUSSIAN, SPANISH, SWEDISH};
022            
023            private net.sf.snowball.SnowballProgram _stemmer;
024            
025            /**
026             * Creates a stemmer for English
027             */
028            public Stemmer()
029            {
030                    this(Language.ENGLISH);
031            }
032            
033            /**
034             * Creates a stemmer for the given language
035             */
036            public Stemmer(Language language)
037            {
038                    if (language == Language.DANISH)
039                            _stemmer = new net.sf.snowball.ext.danishStemmer();
040                    else if (language == Language.DUTCH)
041                            _stemmer = new net.sf.snowball.ext.dutchStemmer();
042                    else if (language == Language.ENGLISH)
043                            _stemmer = new net.sf.snowball.ext.englishStemmer();
044                    else if (language == Language.FINNISH)
045                            _stemmer = new net.sf.snowball.ext.finnishStemmer();
046                    else if (language == Language.FRENCH)
047                            _stemmer = new net.sf.snowball.ext.frenchStemmer();
048                    else if (language == Language.GERMAN)
049                            _stemmer = new net.sf.snowball.ext.germanStemmer();
050                    else if (language == Language.ITALIAN)
051                            _stemmer = new net.sf.snowball.ext.italianStemmer();
052                    else if (language == Language.NORWEGIAN)
053                            _stemmer = new net.sf.snowball.ext.norwegianStemmer();
054                    else if (language == Language.PORTUGUESE)
055                            _stemmer = new net.sf.snowball.ext.portugueseStemmer();
056                    else if (language == Language.RUSSIAN)
057                            _stemmer = new net.sf.snowball.ext.russianStemmer();
058                    else if (language == Language.SPANISH)
059                            _stemmer = new net.sf.snowball.ext.spanishStemmer();
060                    else if (language == Language.SWEDISH)
061                            _stemmer = new net.sf.snowball.ext.swedishStemmer();
062                    else
063                            _stemmer = new net.sf.snowball.ext.englishStemmer();
064            }
065            
066            /**
067             * Stems a word
068             */
069            public String stem(String word)
070            {
071                    if (_stemmer == null)
072                            return word;
073                    _stemmer.setCurrent(word.toLowerCase());
074                    _stemmer.stem();
075                    return _stemmer.getCurrent();
076            }
077            
078            /**
079             * Stems a sentences. It returns the same sentence but with all words stemmed
080             */
081            public String stemSentence(String sentence)
082            {
083                    StringTokenizer st = new StringTokenizer(sentence, " ");
084                    StringBuffer res = new StringBuffer();
085                    while(st.hasMoreTokens())
086                    {
087                            String nextToken = st.nextToken();
088                            res.append(stem(nextToken));
089                            if(st.hasMoreTokens())
090                                    res.append(' ');
091                    }
092                    return res.toString();
093            }
094    }