001 /** 002 * GateFeaturesExtractor.java 003 * jCOLIBRI2 framework. 004 * @author Juan A. Recio-García. 005 * GAIA - Group for Artificial Intelligence Applications 006 * http://gaia.fdi.ucm.es 007 * 21/06/2007 008 */ 009 package jcolibri.extensions.textual.IE.gate; 010 011 import gate.Annotation; 012 import gate.AnnotationSet; 013 import gate.Factory; 014 import gate.creole.ExecutionException; 015 import gate.creole.Transducer; 016 017 import java.util.Collection; 018 import java.util.Iterator; 019 020 import jcolibri.cbrcore.Attribute; 021 import jcolibri.cbrcore.CBRCase; 022 import jcolibri.cbrcore.CBRQuery; 023 import jcolibri.extensions.textual.IE.IEutils; 024 import jcolibri.extensions.textual.IE.representation.IEText; 025 import jcolibri.extensions.textual.IE.representation.info.FeatureInfo; 026 import jcolibri.util.AttributeUtils; 027 import jcolibri.util.ProgressController; 028 029 /** 030 * Extract features from text using the GATE grammars in jape format. 031 * This method uses internally an ANNIETransducer object. 032 * GATE's default rules file or any other file can be loaded. 033 * <br> 034 * It is compatible with the generic FeaturesExtractor so they can be executed together. 035 * <br> 036 * For more information see the GATE tutorial. 037 * @author Juan A. Recio-Garcia 038 * @version 1.0 039 * 040 */ 041 public class GateFeaturesExtractor 042 { 043 044 /** 045 * Performs the algorithm in the given attributes of a collection of cases. 046 * These attributes must be IETextGate objects. 047 */ 048 public static void extractFeatures(Collection<CBRCase> cases, Collection<Attribute> attributes) 049 { 050 org.apache.commons.logging.LogFactory.getLog(GateFeaturesExtractor.class).info("Extracting features."); 051 ProgressController.init(GateFeaturesExtractor.class, "Extracting features ...", cases.size()); 052 for(CBRCase c: cases) 053 { 054 for(Attribute a: attributes) 055 { 056 Object o = AttributeUtils.findValue(a, c); 057 extractFeatures((IETextGate)o); 058 } 059 ProgressController.step(GateFeaturesExtractor.class); 060 } 061 ProgressController.finish(GateFeaturesExtractor.class); 062 } 063 064 /** 065 * Performs the algorithm in the given attributes of a query. 066 * These attributes must be IETextGate objects. 067 */ 068 public static void extractFeatures(CBRQuery query, Collection<Attribute> attributes) 069 { 070 org.apache.commons.logging.LogFactory.getLog(GateFeaturesExtractor.class).info("Extracting features."); 071 for(Attribute a: attributes) 072 { 073 Object o = AttributeUtils.findValue(a, query); 074 extractFeatures((IETextGate)o); 075 } 076 } 077 078 /** 079 * Performs the algorithm in all the IETextGate typed attributes of a collection of cases. 080 */ 081 public static void extractFeatures(Collection<CBRCase> cases) 082 { 083 org.apache.commons.logging.LogFactory.getLog(GateFeaturesExtractor.class).info("Extracting features."); 084 ProgressController.init(GateFeaturesExtractor.class, "Extracting features ...", cases.size()); 085 for(CBRCase c: cases) 086 { 087 Collection<IEText> texts = IEutils.getTexts(c); 088 for(IEText t : texts) 089 if(t instanceof IETextGate) 090 extractFeatures((IETextGate)t); 091 ProgressController.step(GateFeaturesExtractor.class); 092 } 093 ProgressController.finish(GateFeaturesExtractor.class); 094 } 095 096 /** 097 * Performs the algorithm in all the IETextGate typed attributes of a query. 098 */ 099 public static void extractFeatures(CBRQuery query) 100 { 101 org.apache.commons.logging.LogFactory.getLog(GateFeaturesExtractor.class).info("Extracting features."); 102 Collection<IEText> texts = IEutils.getTexts(query); 103 for(IEText t : texts) 104 if(t instanceof IETextGate) 105 extractFeatures((IETextGate)t); 106 } 107 108 /** 109 * Performs the algorithm in a given IETextGate object 110 */ 111 public static void extractFeatures(IETextGate text) 112 { 113 try 114 { 115 116 featureExtractor.setDocument(text.getDocument()); 117 featureExtractor.execute(); 118 119 String content = text.getRAWContent(); 120 121 AnnotationSet featuresAnnotations = text.getDocument().getAnnotations("Features"); 122 123 for (Iterator iter = featuresAnnotations.iterator(); iter.hasNext();) 124 { 125 Annotation anot = (Annotation) iter.next(); 126 String Type = (String) anot.getType(); 127 int begin = anot.getStartNode().getOffset().intValue(); 128 int end = anot.getEndNode().getOffset().intValue(); 129 String value = content.substring(begin, end); 130 text.addFeature(new FeatureInfo(Type, value, begin, end)); 131 } 132 133 // System.err.print(text.getDocument()); 134 135 } catch (ExecutionException e) 136 { 137 org.apache.commons.logging.LogFactory.getLog(GateFeaturesExtractor.class).error(e); 138 } 139 } 140 141 static Transducer featureExtractor; 142 143 private static String defaultRulesFileName ="jcolibri/extensions/textual/IE/gate/gateinit/plugins/ANNIE/resources/NE/main.jape"; 144 145 public static void loadDefaultRules() 146 { 147 loadRules(defaultRulesFileName); 148 } 149 150 /** 151 * Loads a rules file 152 */ 153 public static void loadRules(String filename) 154 { 155 try 156 { 157 featureExtractor = (Transducer) Factory.createResource("gate.creole.ANNIETransducer"); 158 featureExtractor.setGrammarURL(jcolibri.util.FileIO.findFile(filename)); 159 featureExtractor.setOutputASName("Features"); 160 featureExtractor.init(); 161 } catch (Exception e) 162 { 163 org.apache.commons.logging.LogFactory.getLog(GatePhrasesExtractor.class).error(e); 164 165 } 166 167 } 168 }