001 /** 002 * ThesaurusLinker.java 003 * jCOLIBRI2 framework. 004 * @author Juan A. Recio-García. 005 * GAIA - Group for Artificial Intelligence Applications 006 * http://gaia.fdi.ucm.es 007 * 21/06/2007 008 */ 009 package jcolibri.extensions.textual.IE.common; 010 011 import java.util.ArrayList; 012 import java.util.Collection; 013 import java.util.List; 014 015 import jcolibri.cbrcore.Attribute; 016 import jcolibri.cbrcore.CBRCase; 017 import jcolibri.cbrcore.CBRQuery; 018 import jcolibri.cbrcore.CaseComponent; 019 import jcolibri.exception.AttributeAccessException; 020 import jcolibri.extensions.textual.IE.IEutils; 021 import jcolibri.extensions.textual.IE.representation.IEText; 022 import jcolibri.extensions.textual.IE.representation.Token; 023 import jcolibri.extensions.textual.IE.representation.info.WeightedRelation; 024 import jcolibri.extensions.textual.wordnet.WordNetBridge; 025 import jcolibri.extensions.textual.wordnet.WordNetBridge.POS; 026 import jcolibri.util.AttributeUtils; 027 import jcolibri.util.ProgressController; 028 029 /** 030 * Relates query words to cases words using WordNet. 031 * Words are related if belong to the same synset. 032 * <p> 033 * Tokens are related using a list of WeightedRelation objects. 034 * These relations are stored in each Token instance. 035 * <p> 036 * First version was developed at: Robert Gordon University - Aberdeen & Facultad Informática, 037 * Universidad Complutense de Madrid (GAIA) 038 * @author Juan A. Recio-Garcia 039 * @version 2.0 040 * 041 */ 042 public class ThesaurusLinker 043 { 044 /** 045 * Performs the algorithm in all the attributes of a collection of cases and a query. 046 * These attributes must be IEText objects. 047 */ 048 public static void linkWithWordNet(Collection<CBRCase> cases, CBRQuery query) 049 { 050 org.apache.commons.logging.LogFactory.getLog(ThesaurusLinker.class).info("Linking tokens with WordNet."); 051 ProgressController.init(ThesaurusLinker.class, "Linking tokens with WordNet ...", cases.size()); 052 053 List<IEText> queryTexts = new ArrayList<IEText>(); 054 IEutils.addTexts(query.getDescription(), queryTexts); 055 056 for(CBRCase c: cases) 057 { 058 List<IEText> caseTexts = new ArrayList<IEText>(); 059 IEutils.addTexts(c.getDescription(), caseTexts); 060 061 for(int i=0; i<queryTexts.size(); i++) 062 { 063 IEText queryText = queryTexts.get(i); 064 IEText caseText = caseTexts.get(i); 065 linkWithWordNet(caseText, queryText); 066 } 067 ProgressController.step(ThesaurusLinker.class); 068 } 069 ProgressController.finish(ThesaurusLinker.class); 070 071 } 072 073 /** 074 * Performs the algorithm in the given attributes of a collection of cases and a query. 075 * These attributes must be IEText objects. 076 */ 077 public static void linkWithWordNet(Collection<CBRCase> cases, CBRQuery query, Collection<Attribute> attributes) 078 { 079 org.apache.commons.logging.LogFactory.getLog(ThesaurusLinker.class).info("Linking tokens with WordNet."); 080 ProgressController.init(ThesaurusLinker.class, "Linking tokens with WordNet ...", cases.size()); 081 for(CBRCase c: cases) 082 { 083 for(Attribute at: attributes) 084 { 085 CaseComponent caseCC = AttributeUtils.findBelongingComponent(at, c); 086 CaseComponent queryCC = AttributeUtils.findBelongingComponent(at, query); 087 088 try 089 { 090 IEText queryText = (IEText)at.getValue(queryCC); 091 IEText caseText = (IEText)at.getValue(caseCC); 092 linkWithWordNet(caseText, queryText); 093 } catch (AttributeAccessException e) 094 { 095 org.apache.commons.logging.LogFactory.getLog(GlossaryLinker.class).error(e); 096 } 097 } 098 ProgressController.step(ThesaurusLinker.class); 099 } 100 ProgressController.finish(ThesaurusLinker.class); 101 102 } 103 104 /** 105 * Links two texts using wordNet. It only relates words in the same sysnset. 106 */ 107 public static void linkWithWordNet(IEText caseText, IEText queryText) 108 { 109 List<Token> queryTokens = queryText.getAllTokens(); 110 List<Token> caseTokens = caseText.getAllTokens(); 111 112 for(Token queryTok : queryTokens) 113 { 114 for(Token caseTok: caseTokens) 115 { 116 WordNetBridge.POS queryPOS = lookupWordNetPos(queryTok.getPostag()); 117 WordNetBridge.POS casePOS = lookupWordNetPos(caseTok.getPostag()); 118 if(queryPOS != casePOS) 119 continue; 120 if(queryTok.isStopWord()) 121 continue; 122 if(caseTok.isStopWord()) 123 continue; 124 if(queryTok.getStem().equals(caseTok.getStem())) 125 continue; 126 if(WordNetBridge.sameSynset(queryTok.getRawContent(), queryPOS, caseTok.getRawContent(), casePOS)) 127 { 128 queryTok.addRelation(new WeightedRelation(queryTok, caseTok, 0.75)); 129 //org.apache.commons.logging.LogFactory.getLog(ThesaurusLinker.class).info("Adding relation: "+queryTok.getRawContent()+" --> "+caseTok.getRawContent()+". Weight: "+ 0.75); 130 } 131 } 132 } 133 } 134 135 /** 136 * Initializes WordNet. 137 */ 138 public static void loadWordNet() 139 { 140 WordNetBridge.init(); 141 } 142 143 /** 144 * This method transforms POS tags defined in PartofSpeechMethod to the 145 * tags used in the glossary file 146 * 147 * @param tag 148 * POS tag 149 * @return NOUN, VERB, ADJECTIVE or ADVERB 150 */ 151 static WordNetBridge.POS lookupWordNetPos(String tag) 152 { 153 /* 154 * 12. NN Noun, singular or mass 13. NNS Noun, plural 155 */ 156 if (tag.equals("NN") || tag.equals("NNS")) 157 return POS.NOUN; 158 /* 159 * 27. VB Verb, base form 28. VBD Verb, past tense 29. VBG Verb, gerund 160 * or present participle 30. VBN Verb, past participle 31. VBP Verb, 161 * non-3rd person singular present 32. VBZ Verb, 3rd person singular 162 * present 163 */ 164 if (tag.startsWith("V")) 165 return POS.VERB; 166 167 /* 168 * 7. JJ Adjective 8. JJR Adjective, comparative 9. JJS Adjective, 169 * superlative 170 */ 171 if (tag.startsWith("J")) 172 return POS.ADJECTIVE; 173 174 /* 175 * 20. RB Adverb 21. RBR Adverb, comparative 22. RBS Adverb, superlative 176 */ 177 if (tag.startsWith("RB")) 178 return POS.ADVERB; 179 180 return null; 181 } 182 183 }