001 /** 002 * Stemmer.java 003 * jCOLIBRI2 framework. 004 * @author Juan A. Recio-García. 005 * GAIA - Group for Artificial Intelligence Applications 006 * http://gaia.fdi.ucm.es 007 * 15/04/2007 008 */ 009 package jcolibri.extensions.textual.stemmer; 010 011 import java.util.StringTokenizer; 012 013 /** 014 * Stemmes a word using the Snowball package. It works with several languages. 015 * @author Juan A. Recio-García 016 * @version 1.0 017 */ 018 public class Stemmer { 019 020 /** Available languages */ 021 public enum Language { DANISH, DUTCH, ENGLISH, FINNISH, FRENCH, GERMAN, ITALIAN, NORWEGIAN, PORTUGUESE, RUSSIAN, SPANISH, SWEDISH}; 022 023 private net.sf.snowball.SnowballProgram _stemmer; 024 025 /** 026 * Creates a stemmer for English 027 */ 028 public Stemmer() 029 { 030 this(Language.ENGLISH); 031 } 032 033 /** 034 * Creates a stemmer for the given language 035 */ 036 public Stemmer(Language language) 037 { 038 if (language == Language.DANISH) 039 _stemmer = new net.sf.snowball.ext.danishStemmer(); 040 else if (language == Language.DUTCH) 041 _stemmer = new net.sf.snowball.ext.dutchStemmer(); 042 else if (language == Language.ENGLISH) 043 _stemmer = new net.sf.snowball.ext.englishStemmer(); 044 else if (language == Language.FINNISH) 045 _stemmer = new net.sf.snowball.ext.finnishStemmer(); 046 else if (language == Language.FRENCH) 047 _stemmer = new net.sf.snowball.ext.frenchStemmer(); 048 else if (language == Language.GERMAN) 049 _stemmer = new net.sf.snowball.ext.germanStemmer(); 050 else if (language == Language.ITALIAN) 051 _stemmer = new net.sf.snowball.ext.italianStemmer(); 052 else if (language == Language.NORWEGIAN) 053 _stemmer = new net.sf.snowball.ext.norwegianStemmer(); 054 else if (language == Language.PORTUGUESE) 055 _stemmer = new net.sf.snowball.ext.portugueseStemmer(); 056 else if (language == Language.RUSSIAN) 057 _stemmer = new net.sf.snowball.ext.russianStemmer(); 058 else if (language == Language.SPANISH) 059 _stemmer = new net.sf.snowball.ext.spanishStemmer(); 060 else if (language == Language.SWEDISH) 061 _stemmer = new net.sf.snowball.ext.swedishStemmer(); 062 else 063 _stemmer = new net.sf.snowball.ext.englishStemmer(); 064 } 065 066 /** 067 * Stems a word 068 */ 069 public String stem(String word) 070 { 071 if (_stemmer == null) 072 return word; 073 _stemmer.setCurrent(word.toLowerCase()); 074 _stemmer.stem(); 075 return _stemmer.getCurrent(); 076 } 077 078 /** 079 * Stems a sentences. It returns the same sentence but with all words stemmed 080 */ 081 public String stemSentence(String sentence) 082 { 083 StringTokenizer st = new StringTokenizer(sentence, " "); 084 StringBuffer res = new StringBuffer(); 085 while(st.hasMoreTokens()) 086 { 087 String nextToken = st.nextToken(); 088 res.append(stem(nextToken)); 089 if(st.hasMoreTokens()) 090 res.append(' '); 091 } 092 return res.toString(); 093 } 094 }