001 /** 002 * FeaturesExtractor.java 003 * jCOLIBRI2 framework. 004 * @author Juan A. Recio-García. 005 * GAIA - Group for Artificial Intelligence Applications 006 * http://gaia.fdi.ucm.es 007 * 21/06/2007 008 */ 009 package jcolibri.extensions.textual.IE.common; 010 011 import java.io.BufferedReader; 012 import java.io.InputStreamReader; 013 import java.net.URL; 014 import java.util.ArrayList; 015 import java.util.Collection; 016 import java.util.StringTokenizer; 017 import java.util.regex.Matcher; 018 import java.util.regex.Pattern; 019 020 import jcolibri.cbrcore.Attribute; 021 import jcolibri.cbrcore.CBRCase; 022 import jcolibri.cbrcore.CBRQuery; 023 import jcolibri.extensions.textual.IE.IEutils; 024 import jcolibri.extensions.textual.IE.gate.GatePhrasesExtractor; 025 import jcolibri.extensions.textual.IE.representation.IEText; 026 import jcolibri.extensions.textual.IE.representation.info.FeatureInfo; 027 import jcolibri.util.AttributeUtils; 028 import jcolibri.util.ProgressController; 029 030 /** 031 * 032 * <p> 033 * Extracts features using Regular Expressions. 034 * </p> 035 * <p> 036 * Rules format is: 037 * </p> 038 * <p> 039 * [FeatureName]{FeaturePosition}FeatureRegularExpresion 040 * <ul> 041 * <li>FeatureName is used to store the extracted information 042 * <li>FeaturePosition indicates the position of the information that we want 043 * to extract inside the regular expression. The feature is indicated by 044 * counting the opening parentheses from left to right. 045 * <p> 046 * In the expression ((A)(B(C))), for example, there are four such groups: 047 * <ol> 048 * <li> ((A)(B(C))) 049 * <li> (A) 050 * <li> (B(C))4(C) 051 * </ol> 052 * <p> 053 * Group zero always stands for the entire expression 054 * <li>Regular Expressions are deffined following java.util.regex.Pattern 055 * syntaxis. (See API for details) 056 * </ul> 057 * <p> 058 * The first version was developed at: Robert Gordon University - Aberdeen & Facultad Informática, 059 * Universidad Complutense de Madrid (GAIA) 060 * </p> 061 * @author Juan A. Recio-Garcia 062 * @version 2.0 063 * 064 */ 065 public class FeaturesExtractor 066 { 067 static ArrayList<FeatureRule> featuresRules; 068 069 /** 070 * Performs the algorithm in the given attributes of a collection of cases. 071 * These attributes must be IEText objects. 072 */ 073 public static void extractFeatures(Collection<CBRCase> cases, Collection<Attribute> attributes) 074 { 075 org.apache.commons.logging.LogFactory.getLog(FeaturesExtractor.class).info("Extracting features."); 076 ProgressController.init(PhrasesExtractor.class, "Extracting features ...", cases.size()); 077 for(CBRCase c: cases) 078 { 079 for(Attribute a: attributes) 080 { 081 Object o = AttributeUtils.findValue(a, c); 082 extractFeatures((IEText)o); 083 } 084 ProgressController.step(GatePhrasesExtractor.class); 085 } 086 ProgressController.finish(GatePhrasesExtractor.class); 087 } 088 089 /** 090 * Performs the algorithm in the given attributes of a query. 091 * These attributes must be IEText objects. 092 */ 093 public static void extractFeatures(CBRQuery query, Collection<Attribute> attributes) 094 { 095 org.apache.commons.logging.LogFactory.getLog(FeaturesExtractor.class).info("Extracting features."); 096 for(Attribute a: attributes) 097 { 098 Object o = AttributeUtils.findValue(a, query); 099 extractFeatures((IEText)o); 100 } 101 } 102 103 /** 104 * Performs the algorithm in all the attributes of a collection of cases 105 * These attributes must be IEText objects. 106 */ 107 public static void extractFeatures(Collection<CBRCase> cases) 108 { 109 org.apache.commons.logging.LogFactory.getLog(FeaturesExtractor.class).info("Extracting features."); 110 ProgressController.init(PhrasesExtractor.class, "Extracting features ...", cases.size()); 111 for(CBRCase c: cases) 112 { 113 Collection<IEText> texts = IEutils.getTexts(c); 114 for(IEText t : texts) 115 extractFeatures(t); 116 ProgressController.step(GatePhrasesExtractor.class); 117 } 118 ProgressController.finish(GatePhrasesExtractor.class); 119 } 120 121 /** 122 * Performs the algorithm in all the attributes of a query 123 * These attributes must be IEText objects. 124 */ 125 public static void extractFeatures(CBRQuery query) 126 { 127 org.apache.commons.logging.LogFactory.getLog(FeaturesExtractor.class).info("Extracting features."); 128 Collection<IEText> texts = IEutils.getTexts(query); 129 for(IEText t : texts) 130 extractFeatures(t); 131 } 132 133 /** 134 * Performs the algorithm in a given IEText object 135 */ 136 public static void extractFeatures(IEText text) 137 { 138 String rawText = text.getRAWContent(); 139 for (FeatureRule rule : featuresRules) 140 { 141 Matcher m = rule._pattern.matcher(rawText); 142 while (m.find()) 143 { 144 String group = m.group(rule._group); 145 group = cleanSpaces(group); 146 text.addFeature(new FeatureInfo(rule._feature, group, m.start(), m.end())); 147 } 148 } 149 } 150 151 static private String cleanSpaces(String w) 152 { 153 String res = ""; 154 StringTokenizer st = new StringTokenizer(w, " "); 155 while (st.hasMoreTokens()) 156 { 157 res += st.nextToken(); 158 if (st.hasMoreTokens()) 159 res += " "; 160 } 161 return res; 162 } 163 164 /** 165 * Load the features rules 166 */ 167 public static void loadRules(String filename) 168 { 169 try 170 { 171 featuresRules = new ArrayList<FeatureRule>(); 172 URL file = jcolibri.util.FileIO.findFile(filename); 173 BufferedReader br = new BufferedReader( new InputStreamReader(file.openStream())); 174 175 String line = ""; 176 while ((line = br.readLine()) != null) 177 { 178 if (line.startsWith("#")) 179 continue; 180 int pos = line.indexOf(']'); 181 if (pos == -1) 182 throw new Exception(line + " Feature field not found"); 183 String _feature = line.substring(1, pos); 184 String _rest = line.substring(pos + 1); 185 pos = _rest.indexOf('}'); 186 if (pos == -1) 187 throw new Exception(line 188 + " FeaturePostion field not found"); 189 String _group = _rest.substring(1, pos); 190 String _rule = _rest.substring(pos + 1); 191 int g = Integer.parseInt(_group); 192 featuresRules.add(new FeatureRule(_feature, Pattern.compile(_rule),g)); 193 } 194 br.close(); 195 } catch (Exception e) 196 { 197 org.apache.commons.logging.LogFactory.getLog( 198 FeaturesExtractor.class).error(e); 199 } 200 } 201 202 private static class FeatureRule 203 { 204 String _feature; 205 206 Pattern _pattern; 207 208 int _group; 209 210 FeatureRule(String _f, Pattern _p, int _g) 211 { 212 _feature = _f; 213 _pattern = _p; 214 _group = _g; 215 } 216 } 217 }