/**
 * DomainTopicClassifier.java
 * jCOLIBRI2 framework.
 * @author Juan A. Recio-García.
 * GAIA - Group for Artificial Intelligence Applications
 * http://gaia.fdi.ucm.es
 * 21/06/2007
 */
package jcolibri.extensions.textual.IE.common;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.StringTokenizer;

import jcolibri.cbrcore.Attribute;
import jcolibri.cbrcore.CBRCase;
import jcolibri.cbrcore.CBRQuery;
import jcolibri.extensions.textual.IE.IEutils;
import jcolibri.extensions.textual.IE.representation.IEText;
import jcolibri.extensions.textual.IE.representation.info.FeatureInfo;
import jcolibri.extensions.textual.IE.representation.info.PhraseInfo;
import jcolibri.util.AttributeUtils;
import jcolibri.util.ProgressController;

/**
 * Classifies textual objects with a topic depending on their features and phrases.
 * <br>
 * This method uses a configuration file with rules following the syntax:
 * <p>[Topic] &lt;FeatureName,FeatureValue&gt; &lt;FeatureName,FeatureValue&gt; ... &lt;Phrase&gt; &lt;Phrase&gt;</p>
 * where:
 * <ul>
 * <li>Topic: Topic name.
 * <li>FeatureName: Feature name extracted by the feature extraction method.
 * <li>FeatureValue: Feature value. It can also be '?', meaning any value.
 * <li>Phrase: Any phrase identifier extracted by the phrase extraction method.
 * </ul>
 * <p>
 * First version was developed at: Robert Gordon University - Aberdeen &amp; Facultad Informática,
 * Universidad Complutense de Madrid (GAIA)
 * </p>
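 * <p>
 * Rule conditions are conjunctive: a text is tagged with a rule's topic only if every
 * feature condition and every phrase listed in that rule is found in the text. For example,
 * a (hypothetical) rule such as
 * </p>
 * <p>[Holidays] &lt;destination,?&gt; &lt;beach&gt;</p>
 * <p>
 * would assign the topic "Holidays" to texts where a feature named "destination" was
 * extracted with any value and the phrase "beach" was detected.
 * </p>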
 * @author Juan A. Recio-Garcia
 * @version 2.0
 *
 */
public class DomainTopicClassifier
{
    /** Rules loaded by loadRules(). Must be initialized before calling any classifyWithTopic() method. */
    static ArrayList<TopicRule> topicsRules;

    /**
     * Performs the algorithm on the given attributes of a collection of cases.
     * These attributes must be IEText objects.
     */
    public static void classifyWithTopic(Collection<CBRCase> cases, Collection<Attribute> attributes)
    {
        org.apache.commons.logging.LogFactory.getLog(DomainTopicClassifier.class).info("Classifying with topic.");
        ProgressController.init(DomainTopicClassifier.class, "Classifying with topic ...", cases.size());
        for (CBRCase c : cases)
        {
            for (Attribute a : attributes)
            {
                Object o = AttributeUtils.findValue(a, c);
                classifyWithTopic((IEText) o);
            }
            ProgressController.step(DomainTopicClassifier.class);
        }
        ProgressController.finish(DomainTopicClassifier.class);
    }

    /**
     * Performs the algorithm on the given attributes of a query.
     * These attributes must be IEText objects.
     */
    public static void classifyWithTopic(CBRQuery query, Collection<Attribute> attributes)
    {
        org.apache.commons.logging.LogFactory.getLog(DomainTopicClassifier.class).info("Classifying with topic.");
        for (Attribute a : attributes)
        {
            Object o = AttributeUtils.findValue(a, query);
            classifyWithTopic((IEText) o);
        }
    }

    /**
     * Performs the algorithm on all the attributes of a collection of cases.
     * These attributes must be IEText objects.
     */
    public static void classifyWithTopic(Collection<CBRCase> cases)
    {
        org.apache.commons.logging.LogFactory.getLog(DomainTopicClassifier.class).info("Classifying with topic.");
        ProgressController.init(DomainTopicClassifier.class, "Classifying with topic ...", cases.size());
        for (CBRCase c : cases)
        {
            Collection<IEText> texts = IEutils.getTexts(c);
            for (IEText t : texts)
                classifyWithTopic(t);
            ProgressController.step(DomainTopicClassifier.class);
        }
        ProgressController.finish(DomainTopicClassifier.class);
    }

    /**
     * Performs the algorithm on all the attributes of a query.
     * These attributes must be IEText objects.
     */
    public static void classifyWithTopic(CBRQuery query)
    {
        org.apache.commons.logging.LogFactory.getLog(DomainTopicClassifier.class).info("Classifying with topic.");
        Collection<IEText> texts = IEutils.getTexts(query);
        for (IEText t : texts)
            classifyWithTopic(t);
    }

    /**
     * Performs the algorithm on a given IEText object.
     */
    public static void classifyWithTopic(IEText text)
    {
        Collection<PhraseInfo> _phrases = text.getPhrases();
        Collection<FeatureInfo> _features = text.getFeatures();
        for (TopicRule rule : topicsRules)
        {
            // Check the rule conditions
            boolean valid = true;
            HashMap<String, String> conditions = rule._data;
            Iterator<String> fOpIter = conditions.keySet().iterator();
            // For each condition (stop as soon as one condition fails)
            while (fOpIter.hasNext() && valid)
            {
                String featureOrPhrase = fOpIter.next();
                String value = conditions.get(featureOrPhrase);
                // It's a phrase condition
                if (value == null)
                {
                    boolean found = false;
                    for (Iterator<PhraseInfo> it = _phrases.iterator(); it.hasNext() && !found;)
                    {
                        PhraseInfo pi = it.next();
                        if (pi.getPhrase().equals(featureOrPhrase))
                            found = true;
                    }
                    valid = found;
                }
                // It's a feature condition
                else
                {
                    boolean found = false;
                    for (Iterator<FeatureInfo> it = _features.iterator(); it.hasNext() && !found;)
                    {
                        FeatureInfo fi = it.next();
                        if (!value.equals("?"))
                            found = (fi.getFeature().equals(featureOrPhrase) && fi.getValue().equals(value));
                        else
                            found = fi.getFeature().equals(featureOrPhrase);
                    }
                    valid = found;
                }
            }
            // If all rule conditions hold -> add the rule name to the topics of the text
            if (valid)
                text.addTopic(rule._name);
        }
    }

    /**
     * Loads the topic classification rules file.
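     * <p>
     * Lines starting with '#' are ignored. An example file (the topic, feature, and phrase
     * names are hypothetical and only illustrate the expected format):
     * </p>
     * <pre>
     * # domain topic rules
     * [Holidays] &lt;destination,?&gt; &lt;beach&gt;
     * [Business] &lt;department,sales&gt;
     * </pre>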
     */
    public static void loadRules(String filename)
    {
        try
        {
            topicsRules = new ArrayList<TopicRule>();
            URL file = jcolibri.util.FileIO.findFile(filename);
            BufferedReader br = new BufferedReader(new InputStreamReader(file.openStream()));
            String line = "";
            while ((line = br.readLine()) != null)
            {
                // Skip comment lines
                if (line.startsWith("#"))
                    continue;
                // The topic name is enclosed in square brackets at the beginning of the line
                int pos = line.indexOf(']');
                if (pos == -1)
                    throw new Exception("Topic field not found in line: " + line);
                String _topic = line.substring(1, pos);
                String _rest = line.substring(pos + 1);

                // Parse each <...> condition of the rule
                HashMap<String, String> data = new HashMap<String, String>();
                int indexOpen;
                int indexClose;
                while (((indexOpen = _rest.indexOf("<")) != -1) && ((indexClose = _rest.indexOf(">")) != -1))
                {
                    String content = _rest.substring(indexOpen, indexClose);
                    // Consume this <...> block up front so that an empty block cannot loop forever
                    _rest = _rest.substring(indexClose + 1);
                    StringTokenizer st = new StringTokenizer(content, "<,>");
                    if (!st.hasMoreTokens())
                        continue;
                    String featureOrPhrase = st.nextToken();
                    String value = null;
                    if (st.hasMoreTokens())
                        value = st.nextToken();
                    // If it's a phrase condition -> value == null
                    data.put(cleanSpaces(featureOrPhrase), cleanSpaces(value));
                }

                TopicRule rule = new TopicRule(_topic, data);
                topicsRules.add(rule);
            }
            br.close();
        } catch (Exception e)
        {
            org.apache.commons.logging.LogFactory.getLog(DomainTopicClassifier.class).error(e);
        }
    }

    /** A topic rule: a topic name plus a map from feature/phrase names to feature values (null for phrase conditions). */
    static private class TopicRule
    {
        String _name;

        HashMap<String, String> _data;

        TopicRule(String n, HashMap<String, String> d)
        {
            _name = n;
            _data = d;
        }
    }

    /** Collapses runs of spaces into single spaces and trims leading/trailing spaces. */
    static private String cleanSpaces(String w)
    {
        if (w == null)
            return null;
        String res = "";
        StringTokenizer st = new StringTokenizer(w, " ");
        while (st.hasMoreTokens())
        {
            res += st.nextToken();
            if (st.hasMoreTokens())
                res += " ";
        }
        return res;
    }
}
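
/*
 * Usage sketch (the rules file name "domainTopics.txt" is hypothetical). It assumes the cases
 * and the query already contain IEText attributes processed by the feature and phrase
 * extraction methods:
 *
 *   DomainTopicClassifier.loadRules("domainTopics.txt");
 *   DomainTopicClassifier.classifyWithTopic(cases);   // Collection<CBRCase>
 *   DomainTopicClassifier.classifyWithTopic(query);   // CBRQuery
 *
 * loadRules(...) must be called before any classifyWithTopic(...) variant, because the rules
 * are kept in the static topicsRules field and are not loaded lazily.
 */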