001 /** 002 * GatePhrasesExtractor.java 003 * jCOLIBRI2 framework. 004 * @author Juan A. Recio-García. 005 * GAIA - Group for Artificial Intelligence Applications 006 * http://gaia.fdi.ucm.es 007 * 21/06/2007 008 */ 009 package jcolibri.extensions.textual.IE.gate; 010 011 import gate.Annotation; 012 import gate.AnnotationSet; 013 import gate.Factory; 014 import gate.creole.ExecutionException; 015 import gate.creole.gazetteer.DefaultGazetteer; 016 017 import java.util.Collection; 018 import java.util.Iterator; 019 020 import jcolibri.cbrcore.Attribute; 021 import jcolibri.cbrcore.CBRCase; 022 import jcolibri.cbrcore.CBRQuery; 023 import jcolibri.extensions.textual.IE.IEutils; 024 import jcolibri.extensions.textual.IE.representation.IEText; 025 import jcolibri.extensions.textual.IE.representation.Token; 026 import jcolibri.extensions.textual.IE.representation.info.PhraseInfo; 027 import jcolibri.util.AttributeUtils; 028 import jcolibri.util.ProgressController; 029 030 /** 031 * Phrases extractor based on the Gate Gazetteer. 032 * It is compatible with the generic PhrasesExtractor so they can be executed together. 033 * GATE's default rules file or any other file can be loaded. 034 * <br> 035 * For more information see the GATE tutorial. 036 * @author Juan A. Recio-Garcia 037 * @version 1.0 038 * 039 */ 040 public class GatePhrasesExtractor 041 { 042 private static DefaultGazetteer gaze = null; 043 044 /** 045 * Performs the algorithm in the given attributes of a collection of cases. 046 * These attributes must be IETextGate objects. 047 */ 048 public static void extractPhrases(Collection<CBRCase> cases, Collection<Attribute> attributes) 049 { 050 org.apache.commons.logging.LogFactory.getLog(GatePhrasesExtractor.class).info("Extracting phrases."); 051 ProgressController.init(GatePhrasesExtractor.class, "Extracting phrases", cases.size()); 052 for(CBRCase c: cases) 053 { 054 for(Attribute a: attributes) 055 { 056 Object o = AttributeUtils.findValue(a, c); 057 if(o instanceof IETextGate) 058 extractPhrases((IETextGate)o); 059 } 060 ProgressController.step(GatePhrasesExtractor.class); 061 } 062 ProgressController.finish(GatePhrasesExtractor.class); 063 } 064 065 /** 066 * Performs the algorithm in the given attributes of a query. 067 * These attributes must be IETextGate objects. 068 */ 069 public static void extractPhrases(CBRQuery query, Collection<Attribute> attributes) 070 { 071 org.apache.commons.logging.LogFactory.getLog(GatePhrasesExtractor.class).info("Extracting phrases."); 072 for(Attribute a: attributes) 073 { 074 Object o = AttributeUtils.findValue(a, query); 075 if(o instanceof IETextGate) 076 extractPhrases((IETextGate)o); 077 } 078 } 079 080 /** 081 * Performs the algorithm in all the IETextGate typed attributes of a collection of cases. 082 */ 083 public static void extractPhrases(Collection<CBRCase> cases) 084 { 085 org.apache.commons.logging.LogFactory.getLog(GatePhrasesExtractor.class).info("Extracting phrases."); 086 ProgressController.init(GatePhrasesExtractor.class, "Extracting phrases", cases.size()); 087 for(CBRCase c: cases) 088 { 089 Collection<IEText> texts = IEutils.getTexts(c); 090 for(IEText t : texts) 091 if(t instanceof IETextGate) 092 extractPhrases((IETextGate)t); 093 ProgressController.step(GatePhrasesExtractor.class); 094 } 095 ProgressController.finish(GatePhrasesExtractor.class); 096 } 097 098 /** 099 * Performs the algorithm in all the IETextGate typed attributes of a query. 100 */ 101 public static void extractPhrases(CBRQuery query) 102 { 103 org.apache.commons.logging.LogFactory.getLog(GatePhrasesExtractor.class).info("Extracting phrases."); 104 Collection<IEText> texts = IEutils.getTexts(query); 105 for(IEText t : texts) 106 if(t instanceof IETextGate) 107 extractPhrases((IETextGate)t); 108 } 109 110 /** 111 * Performs the algorithm in a given IETextGate object 112 */ 113 public static void extractPhrases(IETextGate text) 114 { 115 try 116 { 117 gaze.setDocument(text.getDocument()); 118 gaze.execute(); 119 120 AnnotationSet lookupAnnotations = text.getDocument().getAnnotations().get("Lookup"); 121 122 for(Token t: text.getAllTokens()) 123 { 124 Annotation anotToken = text.getTokenMapping(t); 125 AnnotationSet lookupAnnots = lookupAnnotations.get(anotToken.getStartNode().getOffset(), anotToken.getEndNode().getOffset()); 126 for(Iterator iter = lookupAnnots.iterator(); iter.hasNext(); ) 127 { 128 Annotation anot = (Annotation)iter.next(); 129 String Type = (String)anot.getFeatures().get("majorType"); 130 String minorType = (String)anot.getFeatures().get("minorType"); 131 if(minorType!= null) 132 Type = Type+"."+minorType; 133 text.addPhrase(new PhraseInfo(Type, 134 anot.getStartNode().getOffset().intValue(), 135 anot.getEndNode().getOffset().intValue())); 136 } 137 138 } 139 140 //System.err.print(text.getDocument()); 141 142 } catch (ExecutionException e) 143 { 144 org.apache.commons.logging.LogFactory.getLog(GatePhrasesExtractor.class).error(e); 145 } 146 } 147 148 private static String defaultRulesFileName = "jcolibri/extensions/textual/IE/gate/gateinit/plugins/ANNIE/resources/gazetteer/lists.def"; 149 150 public static void loadDefaultRules() 151 { 152 loadRules(defaultRulesFileName); 153 } 154 155 /** 156 * Loads a rules file 157 */ 158 public static void loadRules(String filename) 159 { 160 try 161 { 162 gaze = (DefaultGazetteer) Factory 163 .createResource("gate.creole.gazetteer.DefaultGazetteer"); 164 gaze.setListsURL(jcolibri.util.FileIO.findFile(filename)); 165 gaze.init(); 166 } catch (Exception e) 167 { 168 org.apache.commons.logging.LogFactory.getLog(GatePhrasesExtractor.class).error(e); 169 170 } 171 172 } 173 174 175 }