/**
 * PhrasesExtractor.java
 * jCOLIBRI2 framework.
 * @author Juan A. Recio-García.
 * GAIA - Group for Artificial Intelligence Applications
 * http://gaia.fdi.ucm.es
 * 20/06/2007
 */
package jcolibri.extensions.textual.IE.common;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import jcolibri.cbrcore.Attribute;
import jcolibri.cbrcore.CBRCase;
import jcolibri.cbrcore.CBRQuery;
import jcolibri.extensions.textual.IE.IEutils;
import jcolibri.extensions.textual.IE.gate.GatePhrasesExtractor;
import jcolibri.extensions.textual.IE.representation.IEText;
import jcolibri.extensions.textual.IE.representation.info.PhraseInfo;
import jcolibri.util.AttributeUtils;
import jcolibri.util.ProgressController;

/**
 * <p>
 * Extracts phrases using regular expressions.
 * </p>
 * <p>
 * The rules file contains one rule per line, in the format:
 * </p>
 * <p>
 * [PhraseName]PhraseRegularExpression
 * </p>
 * <ul>
 * <li>PhraseName is the name under which every match is stored. It is read
 * from the second character of the line up to the first ']' and its spaces
 * are normalised.</li>
 * <li>The regular expression is everything after the first ']' and follows
 * the java.util.regex.Pattern syntax (see the API for details).</li>
 * <li>Lines starting with '#' are comments; blank lines are ignored.</li>
 * </ul>
 * <p>
 * Rules must be loaded with {@link #loadRules(String)} before any of the
 * extractPhrases methods is called.
 * </p>
 * <p>
 * The first version was developed at: Robert Gordon University - Aberdeen and
 * Facultad Informática, Universidad Complutense de Madrid (GAIA)
 * </p>
 * @author Juan A. Recio-Garcia
 * @version 2.0
 */
public class PhrasesExtractor
{
    /** Logger shared by every method of this class. */
    private static final org.apache.commons.logging.Log LOG =
        org.apache.commons.logging.LogFactory.getLog(PhrasesExtractor.class);

    /**
     * Rules currently loaded: phrase name mapped to its compiled pattern.
     * Stays null until {@link #loadRules(String)} has managed to open a rules file.
     */
    static HashMap<String, Pattern> rulesList;

    /**
     * Performs the algorithm in the given attributes of a collection of cases.
     * These attributes must be IEText objects.
     *
     * @param cases cases to process
     * @param attributes attributes of each case whose value is processed
     * @throws ClassCastException if the value of an attribute is not an IEText
     * @throws IllegalStateException if no rules have been loaded
     */
    public static void extractPhrases(Collection<CBRCase> cases, Collection<Attribute> attributes)
    {
        LOG.info("Extracting phrases.");
        // Every progress event is reported under this class, so that the task
        // opened by init is the one that step and finish refer to.
        ProgressController.init(PhrasesExtractor.class, "Extracting phrases ...", cases.size());
        for (CBRCase c : cases)
        {
            for (Attribute a : attributes)
            {
                Object o = AttributeUtils.findValue(a, c);
                extractPhrases((IEText) o);
            }
            ProgressController.step(PhrasesExtractor.class);
        }
        ProgressController.finish(PhrasesExtractor.class);
    }

    /**
     * Performs the algorithm in the given attributes of a query.
     * These attributes must be IEText objects.
     *
     * @param query query to process
     * @param attributes attributes of the query whose value is processed
     * @throws ClassCastException if the value of an attribute is not an IEText
     * @throws IllegalStateException if no rules have been loaded
     */
    public static void extractPhrases(CBRQuery query, Collection<Attribute> attributes)
    {
        LOG.info("Extracting phrases.");
        for (Attribute a : attributes)
        {
            Object o = AttributeUtils.findValue(a, query);
            extractPhrases((IEText) o);
        }
    }

    /**
     * Performs the algorithm in all the IEText attributes of a collection of cases,
     * as returned by IEutils.getTexts for each case.
     *
     * @param cases cases to process
     * @throws IllegalStateException if no rules have been loaded
     */
    public static void extractPhrases(Collection<CBRCase> cases)
    {
        LOG.info("Extracting phrases.");
        // Same class key for init, step and finish (see the other collection overload).
        ProgressController.init(PhrasesExtractor.class, "Extracting phrases ...", cases.size());
        for (CBRCase c : cases)
        {
            Collection<IEText> texts = IEutils.getTexts(c);
            for (IEText t : texts)
            {
                extractPhrases(t);
            }
            ProgressController.step(PhrasesExtractor.class);
        }
        ProgressController.finish(PhrasesExtractor.class);
    }

    /**
     * Performs the algorithm in all the IEText attributes of a query,
     * as returned by IEutils.getTexts.
     *
     * @param query query to process
     * @throws IllegalStateException if no rules have been loaded
     */
    public static void extractPhrases(CBRQuery query)
    {
        LOG.info("Extracting phrases.");
        Collection<IEText> texts = IEutils.getTexts(query);
        for (IEText t : texts)
        {
            extractPhrases(t);
        }
    }

    /**
     * Performs the algorithm in a given IEText object: every match of every
     * loaded rule in the raw content is added to the text as a PhraseInfo
     * carrying the rule name and the start and end offsets of the match.
     *
     * @param text text to process; a null text, or a text without raw content,
     *            is skipped because there is nothing to match against
     * @throws IllegalStateException if no rules have been loaded
     */
    public static void extractPhrases(IEText text)
    {
        if (text == null)
        {
            return;
        }
        if (rulesList == null)
        {
            // Fail with an explicit message instead of a bare NullPointerException.
            throw new IllegalStateException(
                "Phrase rules have not been loaded; call PhrasesExtractor.loadRules(String) first");
        }
        String rawText = text.getRAWContent();
        if (rawText == null)
        {
            return;
        }

        for (Map.Entry<String, Pattern> rule : rulesList.entrySet())
        {
            Matcher m = rule.getValue().matcher(rawText);
            while (m.find())
            {
                text.addPhrase(new PhraseInfo(rule.getKey(), m.start(), m.end()));
            }
        }
    }

    /**
     * Loads a rules file, replacing any rules loaded before.
     * <p>
     * The file is located with jcolibri.util.FileIO.findFile and read with the
     * platform default charset. Errors are logged, not thrown: if the file
     * cannot be opened the previous rules are kept, and if a line is malformed
     * or its regular expression does not compile, loading stops and only the
     * rules read up to that line remain available.
     * </p>
     *
     * @param filename name of the rules file
     */
    public static void loadRules(String filename)
    {
        try
        {
            URL file = jcolibri.util.FileIO.findFile(filename);
            if (file == null)
            {
                throw new IllegalArgumentException("Rules file not found: " + filename);
            }
            BufferedReader br = new BufferedReader(new InputStreamReader(file.openStream()));
            try
            {
                // Only reset the rules once the file has actually been opened.
                rulesList = new HashMap<String, Pattern>();

                String line;
                while ((line = br.readLine()) != null)
                {
                    // Skip comments and blank lines.
                    if (line.startsWith("#") || line.trim().length() == 0)
                    {
                        continue;
                    }
                    int pos = line.indexOf(']');
                    // pos == 0 would leave no room for the opening character and the name.
                    if (pos < 1)
                    {
                        throw new IllegalArgumentException(line + " Feature field not found");
                    }
                    // The first character (expected to be '[') is dropped without being checked.
                    String feature = line.substring(1, pos);
                    String rule = line.substring(pos + 1);
                    rulesList.put(cleanSpaces(feature), Pattern.compile(rule));
                }
            }
            finally
            {
                // Release the stream even when a line fails to parse.
                br.close();
            }
        }
        catch (Exception e)
        {
            LOG.error("Error loading phrase rules from " + filename, e);
        }
    }

    /**
     * Normalises the spaces of a string: leading and trailing spaces are removed
     * and every run of spaces is collapsed into a single one. Only the space
     * character is treated as a separator.
     *
     * @param w string to normalise
     * @return the space-separated tokens of w joined by single spaces
     */
    private static String cleanSpaces(String w)
    {
        StringBuilder res = new StringBuilder();
        StringTokenizer st = new StringTokenizer(w, " ");
        while (st.hasMoreTokens())
        {
            res.append(st.nextToken());
            if (st.hasMoreTokens())
            {
                res.append(' ');
            }
        }
        return res.toString();
    }
}