001 /** 002 * Test13a.java 003 * jCOLIBRI2 framework. 004 * @author Juan A. Recio-García. 005 * GAIA - Group for Artificial Intelligence Applications 006 * http://gaia.fdi.ucm.es 007 * 23/06/2007 008 */ 009 package jcolibri.test.test13; 010 011 import java.util.Collection; 012 013 import jcolibri.casebase.LinealCaseBase; 014 import jcolibri.cbraplications.StandardCBRApplication; 015 import jcolibri.cbrcore.Attribute; 016 import jcolibri.cbrcore.CBRCase; 017 import jcolibri.cbrcore.CBRCaseBase; 018 import jcolibri.cbrcore.CBRQuery; 019 import jcolibri.cbrcore.Connector; 020 import jcolibri.exception.ExecutionException; 021 import jcolibri.extensions.textual.IE.common.BasicInformationExtractor; 022 import jcolibri.extensions.textual.IE.common.DomainTopicClassifier; 023 import jcolibri.extensions.textual.IE.common.FeaturesExtractor; 024 import jcolibri.extensions.textual.IE.common.GlossaryLinker; 025 import jcolibri.extensions.textual.IE.common.PhrasesExtractor; 026 import jcolibri.extensions.textual.IE.common.StopWordsDetector; 027 import jcolibri.extensions.textual.IE.common.TextStemmer; 028 import jcolibri.extensions.textual.IE.common.ThesaurusLinker; 029 import jcolibri.extensions.textual.IE.opennlp.IETextOpenNLP; 030 import jcolibri.extensions.textual.IE.opennlp.OpennlpMainNamesExtractor; 031 import jcolibri.extensions.textual.IE.opennlp.OpennlpPOStagger; 032 import jcolibri.extensions.textual.IE.opennlp.OpennlpSplitter; 033 import jcolibri.method.retrieve.RetrievalResult; 034 import jcolibri.method.retrieve.NNretrieval.NNConfig; 035 import jcolibri.method.retrieve.NNretrieval.NNScoringMethod; 036 import jcolibri.method.retrieve.NNretrieval.similarity.global.Average; 037 import jcolibri.method.retrieve.NNretrieval.similarity.local.Equal; 038 import jcolibri.method.retrieve.NNretrieval.similarity.local.textual.OverlapCoefficient; 039 import jcolibri.method.retrieve.selection.SelectCases; 040 import jcolibri.test.main.SwingProgressBar; 041 import jcolibri.test.test13.connector.RestaurantsConnector; 042 import jcolibri.test.test13.gui.ResultFrame; 043 import jcolibri.test.test13.similarity.AverageMultipleTextValues; 044 import jcolibri.test.test13.similarity.TokensContained; 045 046 /** 047 * This test shows how to use the Textual CBR extension in a Restaurant recommender. See the jcolibri.extensions.textual.IE package documentation for 048 * details about this extension. This example uses the OpenNLP implementation. 049 * <br> 050 * It uses a custum connector (RestaurantConnector) and similarity functions (AverageMultipleTextValues and TokensContained). 051 * The connector loads cases from a normal txt file and the similarity functions work with the information extracted by the textual CBR methods. 052 * These methods extract information from text and store it in the other attributes of the description. That information is stored as a string with 053 * several values separated with white spaces, so specific similarity measures are requiered to compare those attributes. 054 * See their javadoc for more information. 055 * <br> 056 * To compare the texts it uses a textual similarity function from the jcolibri.method.retrieve.NNretrieval.similarity.local.textual package. 057 * Test13b uses the Lucene similarity function instead that one. 058 * 059 * @author Juan A. Recio-Garcia 060 * @version 1.0 061 * 062 * @see jcolibri.test.test13.similarity.AverageMultipleTextValues 063 * @see jcolibri.test.test13.similarity.TokensContained 064 * @see jcolibri.test.test13.connector.RestaurantsConnector 065 * @see jcolibri.extensions.textual.IE 066 */ 067 public class Test13a implements StandardCBRApplication 068 { 069 070 Connector _connector; 071 CBRCaseBase _caseBase; 072 073 074 /* 075 * (non-Javadoc) 076 * 077 * @see jcolibri.cbraplications.BasicCBRApplication#configure() 078 */ 079 public void configure() throws ExecutionException 080 { 081 try 082 { 083 //Use a custom connector 084 _connector = new RestaurantsConnector("jcolibri/test/test13/restaurants-large-v2.txt"); 085 _caseBase = new LinealCaseBase(); 086 087 //To show the progress 088 jcolibri.util.ProgressController.clear(); 089 SwingProgressBar pb = new SwingProgressBar(); 090 jcolibri.util.ProgressController.register(pb); 091 } catch (Exception e) 092 { 093 throw new ExecutionException(e); 094 } 095 } 096 097 /* 098 * (non-Javadoc) 099 * 100 * @see jcolibri.cbraplications.StandardCBRApplication#preCycle() 101 */ 102 public CBRCaseBase preCycle() throws ExecutionException 103 { 104 //In the precycle we pre-compute the information extraction in the case base 105 106 //Initialize Wordnet 107 ThesaurusLinker.loadWordNet(); 108 //Load user-specific glossary 109 GlossaryLinker.loadGlossary("jcolibri/test/test13/glossary.txt"); 110 //Load phrases rules 111 PhrasesExtractor.loadRules("jcolibri/test/test13/phrasesRules.txt"); 112 //Load features rules 113 FeaturesExtractor.loadRules("jcolibri/test/test13/featuresRules.txt"); 114 //Load topic rules 115 DomainTopicClassifier.loadRules("jcolibri/test/test13/domainRules.txt"); 116 117 //Obtain cases 118 _caseBase.init(_connector); 119 Collection<CBRCase> cases = _caseBase.getCases(); 120 121 //Perform IE methods in the cases 122 123 //Organize cases into paragraphs, sentences and tokens 124 OpennlpSplitter.split(cases); 125 //Detect stopwords 126 StopWordsDetector.detectStopWords(cases); 127 //Stem text 128 TextStemmer.stem(cases); 129 //Perform POS tagging 130 OpennlpPOStagger.tag(cases); 131 //Extract main names 132 OpennlpMainNamesExtractor.extractMainNames(cases); 133 //Extract phrases 134 PhrasesExtractor.extractPhrases(cases); 135 //Extract features 136 FeaturesExtractor.extractFeatures(cases); 137 //Classify with a topic 138 DomainTopicClassifier.classifyWithTopic(cases); 139 //Perform IE copying extracted features or phrases into other attributes of the case 140 BasicInformationExtractor.extractInformation(cases); 141 142 return _caseBase; 143 } 144 145 /* 146 * (non-Javadoc) 147 * 148 * @see jcolibri.cbraplications.StandardCBRApplication#cycle(jcolibri.cbrcore.CBRQuery) 149 */ 150 public void cycle(CBRQuery query) throws ExecutionException 151 { 152 Collection<CBRCase> cases = _caseBase.getCases(); 153 154 //Perform IE methods in the cases 155 156 //Organize the query into paragraphs, sentences and tokens 157 OpennlpSplitter.split(query); 158 //Detect stopwords 159 StopWordsDetector.detectStopWords(query); 160 //Stem query 161 TextStemmer.stem(query); 162 //Perform POS tagging in the query 163 OpennlpPOStagger.tag(query); 164 //Extract main names 165 OpennlpMainNamesExtractor.extractMainNames(query); 166 167 //Now that we have the query we relate cases tokens with the query tokens 168 //Using the user-defined glossary 169 GlossaryLinker.LinkWithGlossary(cases, query); 170 //Using wordnet 171 ThesaurusLinker.linkWithWordNet(cases, query); 172 173 //Extract phrases 174 PhrasesExtractor.extractPhrases(query); 175 //Extract features 176 FeaturesExtractor.extractFeatures(query); 177 //Classify with a topic 178 DomainTopicClassifier.classifyWithTopic(query); 179 //Perform IE copying extracted features or phrases into other attributes of the query 180 BasicInformationExtractor.extractInformation(query); 181 182 //Now we configure the NN method with some user-defined similarity measures 183 NNConfig nnConfig = new NNConfig(); 184 nnConfig.setDescriptionSimFunction(new Average()); 185 186 nnConfig.addMapping(new Attribute("location", RestaurantDescription.class), new Equal()); 187 188 //To compare text we use the OverlapCofficient 189 nnConfig.addMapping(new Attribute("description", RestaurantDescription.class), new OverlapCoefficient()); 190 //This function takes a string with several numerical values and computes the average 191 nnConfig.addMapping(new Attribute("price", RestaurantDescription.class), new AverageMultipleTextValues(1000)); 192 //This function takes a string with several words separated by whitespaces, converts it to a set of tokens and 193 //computes the size of the intersection of the query set and the case set normalized with the case set 194 nnConfig.addMapping(new Attribute("foodType", RestaurantDescription.class), new TokensContained()); 195 nnConfig.addMapping(new Attribute("food", RestaurantDescription.class), new TokensContained()); 196 nnConfig.addMapping(new Attribute("alcohol", RestaurantDescription.class), new Equal()); 197 nnConfig.addMapping(new Attribute("takeout", RestaurantDescription.class), new Equal()); 198 nnConfig.addMapping(new Attribute("delivery", RestaurantDescription.class), new Equal()); 199 nnConfig.addMapping(new Attribute("parking", RestaurantDescription.class), new Equal()); 200 nnConfig.addMapping(new Attribute("catering", RestaurantDescription.class), new Equal()); 201 202 203 System.out.println("RESULT:"); 204 Collection<RetrievalResult> res = NNScoringMethod.evaluateSimilarity(cases, query, nnConfig); 205 res = SelectCases.selectTopKRR(res, 5); 206 207 for(RetrievalResult rr: res) 208 System.out.println(rr); 209 210 //Show the result 211 RestaurantDescription qrd = (RestaurantDescription)query.getDescription(); 212 CBRCase mostSimilar = res.iterator().next().get_case(); 213 RestaurantDescription rrd = (RestaurantDescription)mostSimilar.getDescription(); 214 new ResultFrame(qrd.getDescription().getRAWContent(), rrd.getName(), rrd.getAddress(), rrd.getDescription().getRAWContent()); 215 216 217 } 218 219 /* 220 * (non-Javadoc) 221 * 222 * @see jcolibri.cbraplications.StandardCBRApplication#postCycle() 223 */ 224 public void postCycle() throws ExecutionException 225 { 226 jcolibri.extensions.textual.wordnet.WordNetBridge.deInit(); 227 _connector.close(); 228 229 } 230 231 232 public static void main(String[] args) 233 { 234 Test13a test = new Test13a(); 235 try 236 { 237 test.configure(); 238 239 CBRCaseBase caseBase = test.preCycle(); 240 241 System.out.println("CASE BASE: "); 242 for(CBRCase c: caseBase.getCases()) 243 System.out.println(c); 244 System.out.println("Total: "+caseBase.getCases().size()+" cases"); 245 246 boolean _continue = true; 247 while(_continue) 248 { 249 String queryString = javax.swing.JOptionPane.showInputDialog("Please enter the restaurant description:"); 250 if(queryString == null) 251 _continue = false; 252 else 253 { 254 CBRQuery query = new CBRQuery(); 255 RestaurantDescription queryDescription = new RestaurantDescription(); 256 queryDescription.setDescription(new IETextOpenNLP(queryString)); 257 query.setDescription(queryDescription); 258 259 test.cycle(query); 260 } 261 } 262 test.postCycle(); 263 264 } catch (ExecutionException e) 265 { 266 org.apache.commons.logging.LogFactory.getLog(Test13a.class).error(e); 267 } 268 } 269 }