/**
 * GlossaryLinker.java
 * jCOLIBRI2 framework.
 * @author Juan A. Recio-García.
 * GAIA - Group for Artificial Intelligence Applications
 * http://gaia.fdi.ucm.es
 * 21/06/2007
 */
package jcolibri.extensions.textual.IE.common;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;

import jcolibri.cbrcore.Attribute;
import jcolibri.cbrcore.CBRCase;
import jcolibri.cbrcore.CBRQuery;
import jcolibri.cbrcore.CaseComponent;
import jcolibri.exception.AttributeAccessException;
import jcolibri.extensions.textual.IE.IEutils;
import jcolibri.extensions.textual.IE.representation.IEText;
import jcolibri.extensions.textual.IE.representation.Token;
import jcolibri.extensions.textual.IE.representation.info.WeightedRelation;
import jcolibri.extensions.textual.stemmer.Stemmer;
import jcolibri.util.AttributeUtils;
import jcolibri.util.ProgressController;

/**
 * Relates query words to case words using a domain specific glossary.
 * <p>
 * Tokens are related using a list of WeightedRelation objects.
 * These relations are stored in each query Token instance.
 * <p>
 * Glossary Format:
 * <p>
 * [Part-of-Speech Tag]{Similarity} word1 word2 ... wordn
 * <ul>
 * <li>Part-of-Speech Tag: Sometimes words can have different POS tags, this
 * parameter marks that the following words are only related when they appear in
 * a sentence with that tag.
 * <p>
 * Possible values: NOUN, VERB, ADJECTIVE, ADVERB
 * <li>Similarity: Indicates the similarity relation.
 * <p>
 * Possible values: 1, 2, 3. (1 - very similar, 2 - similar, 3 - not very
 * similar). The weight of a created relation is <code>1.0 / similarity</code>.
 * <li>Words must be separated with white spaces.
 * </ul>
 * Lines starting with <code>#</code> and blank lines are ignored.
 * <p>
 * The first version was developed at: Robert Gordon University - Aberdeen &amp; Facultad Informática,
 * Universidad Complutense de Madrid (GAIA)
 * </p>
 * @author Juan A. Recio-Garcia
 * @version 2.0
 *
 */
public class GlossaryLinker
{
    /** Logger for this class, looked up once instead of on every call. */
    private static final org.apache.commons.logging.Log LOG =
        org.apache.commons.logging.LogFactory.getLog(GlossaryLinker.class);

    /**
     * Glossary entries filled by {@link #loadGlossary(String)}.
     * Stays <code>null</code> until a glossary has been loaded.
     */
    protected static ArrayList<GlossaryTriple> glossary;

    /**
     * Performs the algorithm in all the attributes of a collection of cases and a query.
     * These attributes must be IEText objects.
     * <p>
     * The texts of the query and of each case are paired by position. If a case
     * provides fewer (or more) texts than the query, only the positions present
     * in both are linked.
     *
     * @param cases cases whose description texts are linked against the query
     * @param query query whose tokens receive the relations
     */
    public static void LinkWithGlossary(Collection<CBRCase> cases, CBRQuery query)
    {
        LOG.info("Linking tokens with user glossary.");
        ProgressController.init(GlossaryLinker.class, "Linking tokens with user glossary ...", cases.size());

        List<IEText> queryTexts = new ArrayList<IEText>();
        IEutils.addTexts(query.getDescription(), queryTexts);

        for (CBRCase c : cases)
        {
            List<IEText> caseTexts = new ArrayList<IEText>();
            IEutils.addTexts(c.getDescription(), caseTexts);

            // Bound by the shorter list: indexing caseTexts with the query's size
            // throws IndexOutOfBoundsException when a case has fewer texts.
            int pairs = Math.min(queryTexts.size(), caseTexts.size());
            for (int i = 0; i < pairs; i++)
            {
                linkWithGlossary(caseTexts.get(i), queryTexts.get(i));
            }
            ProgressController.step(GlossaryLinker.class);
        }
        ProgressController.finish(GlossaryLinker.class);
    }

    /**
     * Performs the algorithm in the given attributes of a collection of cases and a query.
     * These attributes must be IEText objects.
     * <p>
     * An attribute whose value cannot be accessed is logged and skipped.
     *
     * @param cases      cases to link against the query
     * @param query      query whose tokens receive the relations
     * @param attributes attributes (holding IEText values) to process
     */
    public static void linkWithGlossary(Collection<CBRCase> cases, CBRQuery query, Collection<Attribute> attributes)
    {
        LOG.info("Linking tokens with user glossary.");
        ProgressController.init(GlossaryLinker.class, "Linking tokens with user glossary ...", cases.size());

        for (CBRCase c : cases)
        {
            for (Attribute at : attributes)
            {
                CaseComponent caseCC = AttributeUtils.findBelongingComponent(at, c);
                CaseComponent queryCC = AttributeUtils.findBelongingComponent(at, query);

                try
                {
                    IEText queryText = (IEText) at.getValue(queryCC);
                    IEText caseText = (IEText) at.getValue(caseCC);
                    linkWithGlossary(caseText, queryText);
                } catch (AttributeAccessException e)
                {
                    LOG.error(e);
                }
            }
            ProgressController.step(GlossaryLinker.class);
        }
        ProgressController.finish(GlossaryLinker.class);
    }

    /**
     * Links two text objects using the glossary.
     * <p>
     * For every glossary entry, a query token is related to a case token when
     * both stems belong to the entry, the stems differ, and both tokens map to
     * the entry's part-of-speech tag. The relation is added to the query token
     * with weight <code>1.0 / similarity</code>.
     * <p>
     * Nothing is done when no glossary has been loaded or when either text is
     * <code>null</code>.
     *
     * @param caseText  text of the case
     * @param queryText text of the query; its tokens receive the relations
     */
    public static void linkWithGlossary(IEText caseText, IEText queryText)
    {
        List<GlossaryTriple> entries = glossary;
        if (entries == null || caseText == null || queryText == null)
            return;

        List<Token> queryTokens = queryText.getAllTokens();
        List<Token> caseTokens = caseText.getAllTokens();

        for (GlossaryTriple gt : entries)
        {
            Set<String> words = gt._words;
            // Floating-point division: the integer form 1/weight truncates to 0
            // for the similarity levels 2 and 3.
            double relationWeight = 1.0 / gt._weight;

            for (Token queryTok : queryTokens)
            {
                String queryStem = queryTok.getStem();
                if (queryStem == null || !words.contains(queryStem))
                    continue;
                // lookupGlossaryPos returns null for tags outside the four glossary categories.
                String queryPOS = lookupGlossaryPos(queryTok.getPostag());
                if (queryPOS == null || !queryPOS.equals(gt._posTag))
                    continue;

                for (Token caseTok : caseTokens)
                {
                    String caseStem = caseTok.getStem();
                    if (caseStem == null || !words.contains(caseStem))
                        continue;
                    // Identical stems are the same word, not a glossary relation.
                    if (caseStem.equals(queryStem))
                        continue;
                    if (!queryPOS.equals(lookupGlossaryPos(caseTok.getPostag())))
                        continue;

                    queryTok.addRelation(new WeightedRelation(queryTok, caseTok, relationWeight));
                    LOG.info("Adding relation: " + queryTok.getRawContent() + " --> " + caseTok.getRawContent() + ". Weight: " + relationWeight);
                }
            }
        }
    }

    /**
     * Loads the glossary relations stored in the given file, replacing any
     * previously loaded glossary.
     * <p>
     * Blank lines and lines starting with <code>#</code> are ignored. A malformed
     * line is logged and skipped; the remaining lines are still loaded. I/O
     * problems are logged and leave the glossary with the entries read so far.
     *
     * @param filename glossary file, resolved through jcolibri.util.FileIO.findFile
     */
    public static void loadGlossary(String filename)
    {
        glossary = new ArrayList<GlossaryTriple>();

        BufferedReader br = null;
        try
        {
            URL file = jcolibri.util.FileIO.findFile(filename);
            if (file == null)
            {
                LOG.error("Glossary file not found: " + filename);
                return;
            }
            br = new BufferedReader(new InputStreamReader(file.openStream()));

            Stemmer stemmer = new Stemmer();
            String line;
            while ((line = br.readLine()) != null)
            {
                String entry = line.trim();
                if (entry.length() == 0 || entry.startsWith("#"))
                    continue;
                try
                {
                    glossary.add(parseGlossaryLine(entry, stemmer));
                } catch (IllegalArgumentException e)
                {
                    // One bad line must not discard the rest of the glossary.
                    LOG.error("Skipping malformed glossary line", e);
                }
            }
        } catch (Exception e)
        {
            LOG.error(e);
        } finally
        {
            // Close the reader on every path, including failures while reading.
            if (br != null)
            {
                try
                {
                    br.close();
                } catch (IOException e)
                {
                    LOG.error(e);
                }
            }
        }
    }

    /**
     * Parses one non-empty glossary line of the form
     * <code>[POS]{weight} word1 word2 ...</code> into a glossary entry.
     * The words are stored as stems.
     *
     * @param line    trimmed glossary line
     * @param stemmer stemmer applied to every word of the line
     * @return the parsed entry
     * @throws IllegalArgumentException if the POS tag or weight field is missing,
     *         or the weight is not a positive integer
     */
    private static GlossaryTriple parseGlossaryLine(String line, Stemmer stemmer)
    {
        // pos must be >= 1 so that substring(1, pos) is a valid range.
        int pos = line.indexOf(']');
        if (pos < 1)
            throw new IllegalArgumentException(line + " POSTag field not found");
        String posTag = line.substring(1, pos).trim();

        String rest = line.substring(pos + 1).trim();
        pos = rest.indexOf('}');
        if (pos < 1)
            throw new IllegalArgumentException(line + " Weight field not found");

        int weight;
        try
        {
            weight = Integer.parseInt(rest.substring(1, pos).trim());
        } catch (NumberFormatException e)
        {
            throw new IllegalArgumentException(line + " Weight field is not an integer", e);
        }
        // The weight is used as a divisor when relations are created.
        if (weight <= 0)
            throw new IllegalArgumentException(line + " Weight must be a positive integer");

        Set<String> words = new HashSet<String>();
        // Default delimiters split on any white space (space, tab, ...), as the format allows.
        StringTokenizer st = new StringTokenizer(rest.substring(pos + 1));
        while (st.hasMoreTokens())
        {
            words.add(stemmer.stem(st.nextToken()));
        }
        return new GlossaryTriple(posTag, words, weight);
    }

    /**
     * This method transforms POS tags defined in PartofSpeechMethod to the
     * tags used in the glossary file.
     *
     * @param tag
     *            POS tag, may be <code>null</code>
     * @return NOUN, VERB, ADJECTIVE or ADVERB, or <code>null</code> when the tag
     *         is <code>null</code> or belongs to none of these categories
     */
    static String lookupGlossaryPos(String tag)
    {
        if (tag == null)
            return null;

        /*
         * 12. NN Noun, singular or mass 13. NNS Noun, plural
         */
        if (tag.equals("NN") || tag.equals("NNS"))
            return "NOUN";

        /*
         * 27. VB Verb, base form 28. VBD Verb, past tense 29. VBG Verb, gerund
         * or present participle 30. VBN Verb, past participle 31. VBP Verb,
         * non-3rd person singular present 32. VBZ Verb, 3rd person singular
         * present
         */
        if (tag.startsWith("V"))
            return "VERB";

        /*
         * 7. JJ Adjective 8. JJR Adjective, comparative 9. JJS Adjective,
         * superlative
         */
        if (tag.startsWith("J"))
            return "ADJECTIVE";

        /*
         * 20. RB Adverb 21. RBR Adverb, comparative 22. RBS Adverb, superlative
         */
        if (tag.startsWith("RB"))
            return "ADVERB";

        return null;
    }

    /**
     * One glossary line: a POS tag, the stems of the related words and the
     * similarity level that relates them.
     */
    static private class GlossaryTriple
    {
        final String _posTag;

        final Set<String> _words;

        final int _weight;

        GlossaryTriple(String p, Set<String> wor, int w)
        {
            _posTag = p;
            _words = wor;
            _weight = w;
        }
    }

}