001 /** 002 * Stemmer.java 003 * jCOLIBRI2 framework. 004 * @author Juan A. Recio-García. 005 * GAIA - Group for Artificial Intelligence Applications 006 * http://gaia.fdi.ucm.es 007 * 20/06/2007 008 */ 009 package jcolibri.extensions.textual.IE.common; 010 011 import java.util.Collection; 012 013 import jcolibri.cbrcore.Attribute; 014 import jcolibri.cbrcore.CBRCase; 015 import jcolibri.cbrcore.CBRQuery; 016 import jcolibri.extensions.textual.IE.IEutils; 017 import jcolibri.extensions.textual.IE.gate.GatePhrasesExtractor; 018 import jcolibri.extensions.textual.IE.representation.IEText; 019 import jcolibri.extensions.textual.IE.representation.Token; 020 import jcolibri.extensions.textual.stemmer.Stemmer; 021 import jcolibri.util.AttributeUtils; 022 import jcolibri.util.ProgressController; 023 024 /** 025 * Stemes the tokens of the text using the SnowBall package. 026 * <a href="http://snowball.tartarus.org">http://snowball.tartarus.org</a> 027 * <br> 028 * It stores the stem in each token using the flag with the same name. 029 * <p> 030 * This method uses the SnowBall package: 031 * </p> 032 * <p> 033 * First version was developed at: Robert Gordon University - Aberdeen & Facultad Informática, 034 * Universidad Complutense de Madrid (GAIA) 035 * </p> 036 * @author Juan A. Recio-Garcia 037 * @version 2.0 038 * 039 */ 040 public class TextStemmer 041 { 042 static Stemmer stemmer = new Stemmer(); 043 044 /** 045 * Performs the algorithm in the given attributes of a collection of cases. 046 * These attributes must be IEText objects. 047 */ 048 public static void stem(Collection<CBRCase> cases, Collection<Attribute> attributes) 049 { 050 org.apache.commons.logging.LogFactory.getLog(TextStemmer.class).info("Stemming text."); 051 ProgressController.init(TextStemmer.class, "Stemming text...", cases.size()); 052 for(CBRCase c: cases) 053 { 054 for(Attribute a: attributes) 055 { 056 Object o = AttributeUtils.findValue(a, c); 057 stem((IEText)o); 058 } 059 ProgressController.step(GatePhrasesExtractor.class); 060 } 061 ProgressController.finish(GatePhrasesExtractor.class); 062 } 063 064 /** 065 * Performs the algorithm in the given attributes of a query. 066 * These attributes must be IEText objects. 067 */ 068 public static void stem(CBRQuery query, Collection<Attribute> attributes) 069 { 070 org.apache.commons.logging.LogFactory.getLog(TextStemmer.class).info("Stemming text."); 071 for(Attribute a: attributes) 072 { 073 Object o = AttributeUtils.findValue(a, query); 074 stem((IEText)o); 075 } 076 } 077 078 /** 079 * Performs the algorithm in all the attributes of a collection of cases 080 * These attributes must be IEText objects. 081 */ 082 public static void stem(Collection<CBRCase> cases) 083 { 084 org.apache.commons.logging.LogFactory.getLog(TextStemmer.class).info("Stemming text."); 085 ProgressController.init(TextStemmer.class, "Stemming text...", cases.size()); 086 for(CBRCase c: cases) 087 { 088 Collection<IEText> texts = IEutils.getTexts(c); 089 for(IEText t : texts) 090 stem(t); 091 ProgressController.step(GatePhrasesExtractor.class); 092 } 093 ProgressController.finish(GatePhrasesExtractor.class); 094 } 095 096 /** 097 * Performs the algorithm in all the attributes of a query 098 * These attributes must be IEText objects. 099 */ 100 public static void stem(CBRQuery query) 101 { 102 org.apache.commons.logging.LogFactory.getLog(TextStemmer.class).info("Stemming text."); 103 Collection<IEText> texts = IEutils.getTexts(query); 104 for(IEText t : texts) 105 stem(t); 106 } 107 108 109 /** 110 * Stems the tokens of the text. If no stem is found, it stores the original word as the stem. 111 * @param text to stem 112 */ 113 public static void stem(IEText text) 114 { 115 for(Token t: text.getAllTokens()) 116 if(!t.isStopWord()) 117 { 118 String stem = stemmer.stem(t.getRawContent()); 119 if(stem == null) 120 stem = t.getRawContent(); 121 t.setStem(stem); 122 } 123 } 124 }