001    /**
002     * GlossaryLinker.java
003     * jCOLIBRI2 framework. 
004     * @author Juan A. Recio-García.
005     * GAIA - Group for Artificial Intelligence Applications
006     * http://gaia.fdi.ucm.es
007     * 21/06/2007
008     */
009    package jcolibri.extensions.textual.IE.common;
010    
011    import java.io.BufferedReader;
012    import java.io.InputStreamReader;
013    import java.net.URL;
014    import java.util.ArrayList;
015    import java.util.Collection;
016    import java.util.HashSet;
017    import java.util.List;
018    import java.util.Set;
019    import java.util.StringTokenizer;
020    
021    import jcolibri.cbrcore.Attribute;
022    import jcolibri.cbrcore.CBRCase;
023    import jcolibri.cbrcore.CBRQuery;
024    import jcolibri.cbrcore.CaseComponent;
025    import jcolibri.exception.AttributeAccessException;
026    import jcolibri.extensions.textual.IE.IEutils;
027    import jcolibri.extensions.textual.IE.representation.IEText;
028    import jcolibri.extensions.textual.IE.representation.Token;
029    import jcolibri.extensions.textual.IE.representation.info.WeightedRelation;
030    import jcolibri.extensions.textual.stemmer.Stemmer;
031    import jcolibri.util.AttributeUtils;
032    import jcolibri.util.ProgressController;
033    
034    /**
035     * 
036     * Relates query words to cases words using a domain specific glossary.
037     * <p>
038     * Tokens are related using a list of WeightedRelation objects.
039     * These relations are stored in each Token instance.
040     * <p>
041     * Glossary Format:
042     * <p>
043     * [Part-of-Speech Tag]{Similarity} word1 word2 ... wordn
044     * <ul>
045     * <li>Part-of-Speech Tag: Sometimes words can have different POS tags, this
046     * parameter marks that the following words are only related when they appear in
047     * a sentence with that tag.
048     * <p>
049     * Possible values: NOUN, VERB, ADJECTIVE, ADVERB
050     * <li>Similarity: Indicates the similarity relation.
051     * <p>
052     * Possible values: 1, 2, 3. (1 - very similar, 2 - similar, 3 - not very
053     * similar)
054     * <li>Words must be separated with white spaces.
055     * </ul>
056     * <p>
057     * The first version was developed at: Robert Gordon University - Aberdeen & Facultad Informática,
058     * Universidad Complutense de Madrid (GAIA)
059     * </p>
060     * @author Juan A. Recio-Garcia
061     * @version 2.0
062     * 
063     */
064    public class GlossaryLinker
065    {
066        /**
067         * Performs the algorithm in all the ttributes of a collection of cases and a query.
068         * These attributes must be IEText objects.
069         */
070        public static void LinkWithGlossary(Collection<CBRCase> cases, CBRQuery query)
071        {   
072            org.apache.commons.logging.LogFactory.getLog(GlossaryLinker.class).info("Linking tokens with user glossary.");
073            ProgressController.init(GlossaryLinker.class, "Linking tokens with user glossary ...", cases.size());
074    
075            List<IEText> queryTexts = new ArrayList<IEText>();
076            IEutils.addTexts(query.getDescription(), queryTexts);
077    
078            for(CBRCase c: cases)
079            {
080                List<IEText> caseTexts = new ArrayList<IEText>();
081                IEutils.addTexts(c.getDescription(), caseTexts);
082                
083                for(int i=0; i<queryTexts.size(); i++)
084                {
085                    IEText queryText = queryTexts.get(i);
086                    IEText caseText  = caseTexts.get(i);
087                    linkWithGlossary(caseText, queryText);
088                }
089                ProgressController.step(GlossaryLinker.class);
090            }
091            ProgressController.finish(GlossaryLinker.class);
092        }
093    
094        /**
095         * Performs the algorithm in the given attributes of a collection of cases and a query.
096         * These attributes must be IEText objects.
097         */
098        public static void linkWithGlossary(Collection<CBRCase> cases, CBRQuery query, Collection<Attribute> attributes)
099        {
100            org.apache.commons.logging.LogFactory.getLog(GlossaryLinker.class).info("Linking tokens with user glossary.");
101            ProgressController.init(GlossaryLinker.class, "Linking tokens with user glossary ...", cases.size());
102            
103            for(CBRCase c: cases)
104            {
105                for(Attribute at: attributes)
106                {
107                    CaseComponent caseCC  = AttributeUtils.findBelongingComponent(at, c);
108                    CaseComponent queryCC = AttributeUtils.findBelongingComponent(at, query);
109                    
110                    try
111                    {
112                        IEText queryText = (IEText)at.getValue(queryCC);
113                        IEText caseText  = (IEText)at.getValue(caseCC);
114                        linkWithGlossary(caseText, queryText);
115                    } catch (AttributeAccessException e)
116                    {
117                        org.apache.commons.logging.LogFactory.getLog(GlossaryLinker.class).error(e);
118                    }
119                }
120                ProgressController.step(GlossaryLinker.class);
121            }
122            ProgressController.finish(GlossaryLinker.class);
123        }
124        
125        
126        protected static ArrayList<GlossaryTriple> glossary;
127    
128        /**
129         * Links two text objects using the glossary.
130         */
131        public static void linkWithGlossary(IEText caseText, IEText queryText)
132        {
133            List<Token> queryTokens = queryText.getAllTokens();
134            List<Token> caseTokens  = caseText.getAllTokens();
135            
136            for(GlossaryTriple gt : glossary)
137            {
138                String posTag     = gt._posTag;
139                Set<String> words = gt._words;
140                int weight        = gt._weight;
141                
142                for(Token queryTok : queryTokens)
143                {
144                    String queryStem = queryTok.getStem();
145                    if(!words.contains(queryStem))
146                        continue;
147                    String queryPOS = lookupGlossaryPos(queryTok.getPostag());
148                    if(!queryPOS.equals(posTag))
149                        continue;
150    
151                    for(Token caseTok: caseTokens)
152                    {
153                        String caseStem = caseTok.getStem();
154                        if(!words.contains(caseStem))
155                            continue;
156                        if(caseStem.equals(queryStem))
157                            continue;
158                        
159                        String casePOS  = lookupGlossaryPos(caseTok.getPostag());
160                        
161                        if(!queryPOS.equals(casePOS))
162                            continue;
163                        
164                        queryTok.addRelation(new WeightedRelation(queryTok, caseTok, 1/weight));
165                        org.apache.commons.logging.LogFactory.getLog(GlossaryLinker.class).info("Adding relation: "+queryTok.getRawContent()+" --> "+caseTok.getRawContent()+". Weight: "+ 1/weight);
166                    }
167                }
168            }
169        }
170        
171        
172        /**
173         * Load glossary reations stored in GLOSSARY_FILE
174         */
175        public static void loadGlossary(String filename)
176        {
177            glossary = new ArrayList<GlossaryTriple>();
178    
179            try
180            {
181                URL file = jcolibri.util.FileIO.findFile(filename);
182                BufferedReader br = new BufferedReader( new InputStreamReader(file.openStream()));
183    
184    
185                String line = "";
186    
187                Stemmer stemmer = new Stemmer();
188                while ((line = br.readLine()) != null)
189                {
190                    if (line.startsWith("#"))
191                        continue;
192                    int pos = line.indexOf(']');
193                    if (pos == -1)
194                        throw new Exception(line + "  POSTag field not found");
195                    String _posTag = line.substring(1, pos);
196                    String _rest = line.substring(pos + 1);
197                    pos = _rest.indexOf('}');
198                    if (pos == -1)
199                        throw new Exception(line + "  Weight field not found");
200                    String _weight = _rest.substring(1, pos);
201                    int weight = Integer.parseInt(_weight);
202                    String _words = _rest.substring(pos + 1);
203                    StringTokenizer st = new StringTokenizer(_words, " ");
204                    Set<String> words = new HashSet<String>();
205                    while (st.hasMoreTokens())
206                    {
207                        String sw = st.nextToken();
208                        words.add(stemmer.stem(sw));
209                    }
210    
211                    glossary.add(new GlossaryTriple(_posTag, words, weight));
212                }
213                br.close();
214            } catch (Exception e)
215            {
216                org.apache.commons.logging.LogFactory.getLog(GlossaryLinker.class)
217                        .error(e);
218            }
219    
220        }
221    
222        /**
223         * This method transforms POS tags defined in PartofSpeechMethod to the
224         * tags used in the glossary file
225         * 
226         * @param tag
227         *                POS tag
228         * @return NOUN, VERB, ADJECTIVE or ADVERB
229         */
230        static String lookupGlossaryPos(String tag)
231        {
232            /*
233             * 12. NN Noun, singular or mass 13. NNS Noun, plural
234             */
235            if (tag.equals("NN") || tag.equals("NNS"))
236                return "NOUN";
237            /*
238             * 27. VB Verb, base form 28. VBD Verb, past tense 29. VBG Verb, gerund
239             * or present participle 30. VBN Verb, past participle 31. VBP Verb,
240             * non-3rd person singular present 32. VBZ Verb, 3rd person singular
241             * present
242             */
243            if (tag.startsWith("V"))
244                return "VERB";
245    
246            /*
247             * 7. JJ Adjective 8. JJR Adjective, comparative 9. JJS Adjective,
248             * superlative
249             */
250            if (tag.startsWith("J"))
251                return "ADJECTIVE";
252    
253            /*
254             * 20. RB Adverb 21. RBR Adverb, comparative 22. RBS Adverb, superlative
255             */
256            if (tag.startsWith("RB"))
257                return "ADVERB";
258    
259            return null;
260        }
261    
262        static private class GlossaryTriple
263        {
264            String _posTag;
265    
266            Set<String> _words;
267    
268            int _weight;
269    
270            GlossaryTriple(String p, Set<String> wor, int w)
271            {
272                _posTag = p;
273                _words = wor;
274                _weight = w;
275            }
276        }
277    
278    }