001    /**
002     * ThesaurusLinker.java
003     * jCOLIBRI2 framework. 
004     * @author Juan A. Recio-García.
005     * GAIA - Group for Artificial Intelligence Applications
006     * http://gaia.fdi.ucm.es
007     * 21/06/2007
008     */
009    package jcolibri.extensions.textual.IE.common;
010    
011    import java.util.ArrayList;
012    import java.util.Collection;
013    import java.util.List;
014    
015    import jcolibri.cbrcore.Attribute;
016    import jcolibri.cbrcore.CBRCase;
017    import jcolibri.cbrcore.CBRQuery;
018    import jcolibri.cbrcore.CaseComponent;
019    import jcolibri.exception.AttributeAccessException;
020    import jcolibri.extensions.textual.IE.IEutils;
021    import jcolibri.extensions.textual.IE.representation.IEText;
022    import jcolibri.extensions.textual.IE.representation.Token;
023    import jcolibri.extensions.textual.IE.representation.info.WeightedRelation;
024    import jcolibri.extensions.textual.wordnet.WordNetBridge;
025    import jcolibri.extensions.textual.wordnet.WordNetBridge.POS;
026    import jcolibri.util.AttributeUtils;
027    import jcolibri.util.ProgressController;
028    
029    /**
030     * Relates query words to cases words using WordNet. 
031     * Words are related if belong to the same synset.
032     * <p>
033     * Tokens are related using a list of WeightedRelation objects.
034     * These relations are stored in each Token instance.
035     * <p>
036     * First version was developed at: Robert Gordon University - Aberdeen & Facultad Informática,
037     * Universidad Complutense de Madrid (GAIA)
038     * @author Juan A. Recio-Garcia
039     * @version 2.0
040     *
041     */
042    public class ThesaurusLinker
043    {
044        /**
045         * Performs the algorithm in all the attributes of a collection of cases and a query.
046         * These attributes must be IEText objects.
047         */  
048        public static void linkWithWordNet(Collection<CBRCase> cases, CBRQuery query)
049        {
050            org.apache.commons.logging.LogFactory.getLog(ThesaurusLinker.class).info("Linking tokens with WordNet.");
051            ProgressController.init(ThesaurusLinker.class, "Linking tokens with WordNet ...", cases.size());
052            
053            List<IEText> queryTexts = new ArrayList<IEText>();
054            IEutils.addTexts(query.getDescription(), queryTexts);
055            
056            for(CBRCase c: cases)
057            {
058                List<IEText> caseTexts = new ArrayList<IEText>();
059                IEutils.addTexts(c.getDescription(), caseTexts);
060                
061                for(int i=0; i<queryTexts.size(); i++)
062                {
063                    IEText queryText = queryTexts.get(i);
064                    IEText caseText  = caseTexts.get(i);
065                    linkWithWordNet(caseText, queryText);
066                }
067                ProgressController.step(ThesaurusLinker.class);
068            }
069            ProgressController.finish(ThesaurusLinker.class);
070    
071        }
072    
073        /**
074         * Performs the algorithm in the given attributes of a collection of cases and a query.
075         * These attributes must be IEText objects.
076         */
077        public static void linkWithWordNet(Collection<CBRCase> cases, CBRQuery query, Collection<Attribute> attributes)
078        {
079            org.apache.commons.logging.LogFactory.getLog(ThesaurusLinker.class).info("Linking tokens with WordNet.");
080            ProgressController.init(ThesaurusLinker.class, "Linking tokens with WordNet ...", cases.size());
081            for(CBRCase c: cases)
082            {
083                for(Attribute at: attributes)
084                {
085                    CaseComponent caseCC  = AttributeUtils.findBelongingComponent(at, c);
086                    CaseComponent queryCC = AttributeUtils.findBelongingComponent(at, query);
087                    
088                    try
089                    {
090                        IEText queryText = (IEText)at.getValue(queryCC);
091                        IEText caseText  = (IEText)at.getValue(caseCC);
092                        linkWithWordNet(caseText, queryText);
093                    } catch (AttributeAccessException e)
094                    {
095                        org.apache.commons.logging.LogFactory.getLog(GlossaryLinker.class).error(e);
096                    }
097                }
098                ProgressController.step(ThesaurusLinker.class);
099            }
100            ProgressController.finish(ThesaurusLinker.class);
101    
102        }
103        
104        /**
105         * Links two texts using wordNet. It only relates words in the same sysnset.
106         */
107        public static void linkWithWordNet(IEText caseText, IEText queryText)
108        {
109            List<Token> queryTokens = queryText.getAllTokens();
110            List<Token> caseTokens  = caseText.getAllTokens();
111            
112            for(Token queryTok : queryTokens)
113            {
114                    for(Token caseTok: caseTokens)
115                    {
116                        WordNetBridge.POS queryPOS = lookupWordNetPos(queryTok.getPostag());
117                        WordNetBridge.POS casePOS  = lookupWordNetPos(caseTok.getPostag());
118                        if(queryPOS != casePOS)
119                                continue;
120                        if(queryTok.isStopWord())
121                                continue;
122                        if(caseTok.isStopWord())
123                                continue;
124                        if(queryTok.getStem().equals(caseTok.getStem()))
125                                continue;
126                        if(WordNetBridge.sameSynset(queryTok.getRawContent(), queryPOS, caseTok.getRawContent(), casePOS))
127                        {
128                            queryTok.addRelation(new WeightedRelation(queryTok, caseTok, 0.75));
129                            //org.apache.commons.logging.LogFactory.getLog(ThesaurusLinker.class).info("Adding relation: "+queryTok.getRawContent()+" --> "+caseTok.getRawContent()+". Weight: "+ 0.75);
130                        }
131                    }
132            }
133        }
134        
135        /**
136         * Initializes WordNet.
137         */
138        public static void loadWordNet()
139        {
140            WordNetBridge.init();
141        }
142        
143        /**
144         * This method transforms POS tags defined in PartofSpeechMethod to the
145         * tags used in the glossary file
146         * 
147         * @param tag
148         *                POS tag
149         * @return NOUN, VERB, ADJECTIVE or ADVERB
150         */
151        static WordNetBridge.POS lookupWordNetPos(String tag)
152        {
153            /*
154             * 12. NN Noun, singular or mass 13. NNS Noun, plural
155             */
156            if (tag.equals("NN") || tag.equals("NNS"))
157                return POS.NOUN;
158            /*
159             * 27. VB Verb, base form 28. VBD Verb, past tense 29. VBG Verb, gerund
160             * or present participle 30. VBN Verb, past participle 31. VBP Verb,
161             * non-3rd person singular present 32. VBZ Verb, 3rd person singular
162             * present
163             */
164            if (tag.startsWith("V"))
165                return POS.VERB;
166    
167            /*
168             * 7. JJ Adjective 8. JJR Adjective, comparative 9. JJS Adjective,
169             * superlative
170             */
171            if (tag.startsWith("J"))
172                return POS.ADJECTIVE;
173    
174            /*
175             * 20. RB Adverb 21. RBR Adverb, comparative 22. RBS Adverb, superlative
176             */
177            if (tag.startsWith("RB"))
178                return POS.ADVERB;
179    
180            return null;
181        }
182    
183    }