001    /**
002     * Stemmer.java
003     * jCOLIBRI2 framework. 
004     * @author Juan A. Recio-García.
005     * GAIA - Group for Artificial Intelligence Applications
006     * http://gaia.fdi.ucm.es
007     * 20/06/2007
008     */
009    package jcolibri.extensions.textual.IE.common;
010    
011    import java.util.Collection;
012    
013    import jcolibri.cbrcore.Attribute;
014    import jcolibri.cbrcore.CBRCase;
015    import jcolibri.cbrcore.CBRQuery;
016    import jcolibri.extensions.textual.IE.IEutils;
017    import jcolibri.extensions.textual.IE.gate.GatePhrasesExtractor;
018    import jcolibri.extensions.textual.IE.representation.IEText;
019    import jcolibri.extensions.textual.IE.representation.Token;
020    import jcolibri.extensions.textual.stemmer.Stemmer;
021    import jcolibri.util.AttributeUtils;
022    import jcolibri.util.ProgressController;
023    
024    /**
025     * Stemes the tokens of the text using the SnowBall package. 
026     * <a href="http://snowball.tartarus.org">http://snowball.tartarus.org</a>
027     * <br>
028     * It stores the stem in each token using the flag with the same name.
029     * <p>
030     * This method uses the SnowBall package: 
031     * </p>
032     * <p>
033     * First version was developed at: Robert Gordon University - Aberdeen & Facultad Informática,
034     * Universidad Complutense de Madrid (GAIA)
035     * </p>
036     * @author Juan A. Recio-Garcia
037     * @version 2.0
038     *
039     */
040    public class TextStemmer
041    {
042        static Stemmer stemmer = new Stemmer();
043     
044        /**
045         * Performs the algorithm in the given attributes of a collection of cases.
046         * These attributes must be IEText objects.
047         */
048        public static void stem(Collection<CBRCase> cases, Collection<Attribute> attributes)
049        {
050            org.apache.commons.logging.LogFactory.getLog(TextStemmer.class).info("Stemming text.");
051            ProgressController.init(TextStemmer.class, "Stemming text...", cases.size());
052            for(CBRCase c: cases)
053            {
054                for(Attribute a: attributes)
055                {
056                    Object o = AttributeUtils.findValue(a, c);
057                    stem((IEText)o);
058                }
059                ProgressController.step(GatePhrasesExtractor.class);
060            }
061            ProgressController.finish(GatePhrasesExtractor.class);
062        }
063    
064        /**
065         * Performs the algorithm in the given attributes of a query.
066         * These attributes must be IEText objects.
067         */
068        public static void stem(CBRQuery query, Collection<Attribute> attributes)
069        {
070                org.apache.commons.logging.LogFactory.getLog(TextStemmer.class).info("Stemming text.");
071                for(Attribute a: attributes)
072                {
073                    Object o = AttributeUtils.findValue(a, query);
074                    stem((IEText)o);
075                }
076        }
077        
078        /**
079         * Performs the algorithm in all the attributes of a collection of cases
080         * These attributes must be IEText objects.
081         */
082        public static void stem(Collection<CBRCase> cases)
083        {
084            org.apache.commons.logging.LogFactory.getLog(TextStemmer.class).info("Stemming text.");
085            ProgressController.init(TextStemmer.class, "Stemming text...", cases.size());
086            for(CBRCase c: cases)
087            {
088                Collection<IEText> texts = IEutils.getTexts(c);
089                for(IEText t : texts)
090                    stem(t);
091                ProgressController.step(GatePhrasesExtractor.class);
092            }
093            ProgressController.finish(GatePhrasesExtractor.class);
094        }
095        
096        /**
097         * Performs the algorithm in all the attributes of a query
098         * These attributes must be IEText objects.
099         */
100        public static void stem(CBRQuery query)
101        {      
102            org.apache.commons.logging.LogFactory.getLog(TextStemmer.class).info("Stemming text.");
103            Collection<IEText> texts = IEutils.getTexts(query);
104            for(IEText t : texts)
105                stem(t);
106        }
107        
108        
109        /**
110         * Stems the tokens of the text. If no stem is found, it stores the original word as the stem.
111         * @param text to stem
112         */
113        public static void stem(IEText text)
114        {
115            for(Token t: text.getAllTokens())
116                if(!t.isStopWord())
117                {
118                    String stem = stemmer.stem(t.getRawContent());
119                    if(stem == null)
120                        stem = t.getRawContent();
121                    t.setStem(stem);
122                }
123        }
124    }