/**
 * SpamFilterApp.java
 * jCOLIBRI2 framework.
 * @author Juan A. Recio-García.
 * GAIA - Group for Artificial Intelligence Applications
 * http://gaia.fdi.ucm.es
 * 01/08/2007
 */
package jcolibri.test.test16;

import java.util.Collection;

import jcolibri.casebase.CachedLinealCaseBase;
import jcolibri.cbraplications.StandardCBRApplication;
import jcolibri.cbrcore.Attribute;
import jcolibri.cbrcore.CBRCase;
import jcolibri.cbrcore.CBRCaseBase;
import jcolibri.cbrcore.CBRQuery;
import jcolibri.cbrcore.Connector;
import jcolibri.evaluation.Evaluator;
import jcolibri.exception.ExecutionException;
import jcolibri.extensions.textual.IE.common.StopWordsDetector;
import jcolibri.extensions.textual.IE.common.TextStemmer;
import jcolibri.extensions.textual.IE.opennlp.OpennlpSplitter;
import jcolibri.method.retrieve.NNretrieval.similarity.LocalSimilarityFunction;
import jcolibri.method.retrieve.NNretrieval.similarity.global.Average;
import jcolibri.method.reuse.classification.KNNClassificationConfig;
import jcolibri.method.reuse.classification.KNNClassificationMethod;
import jcolibri.method.revise.classification.BasicClassificationOracle;
import jcolibri.method.revise.classification.ClassificationOracle;

/**
 * Spam filter application.
 * It is configured through the set() methods.
 * The cycle() method runs the application on one query and stores the
 * evaluation result in the evaluation report object.
 * After running all cycles, some statistics may be read using the get() methods.
 * <br>
 * The corpus used by this application must be a zip file with several textual
 * files (one per email). Each filename must start with "ham" or "spam"
 * depending on the class of the email.
 *
 * @author Juan A. Recio-Garcia
 * @version 1.0
 */
public class SpamFilterApp implements StandardCBRApplication
{
    Connector _connector;
    CBRCaseBase _caseBase;

    /** Number of neighbours handed to the KNN configuration (defaults to 3). */
    private int k = 3;
    /** Local similarity function mapped to the "content" attribute; null until set. */
    private LocalSimilarityFunction similFunc = null;
    /** Classification method handed to the KNN configuration; null until set. */
    private KNNClassificationMethod clasifMethod = null;
    /** Path of the zipped corpus passed to the connector in configure(). */
    private final String corpusZipFile;

    // Confusion-matrix counters. They are reset in preCycle() and updated in cycle().
    private double tp;
    private double tn;
    private double fp;
    private double fn;

    /** Single configuration instance that getKNNConfig() refreshes and returns. */
    private final KNNClassificationConfig spamFilterSimConfig;

    /**
     * Creates a spam filter application that uses the corpus indicated by the parameter.
     * @param corpusZipFile zip file that contains the corpus.
     */
    public SpamFilterApp(String corpusZipFile)
    {
        this.corpusZipFile = corpusZipFile;
        this.spamFilterSimConfig = new KNNClassificationConfig();
    }


    /**
     * Creates the email connector over the corpus file and the case base.
     * @see jcolibri.cbraplications.StandardCBRApplication#configure()
     */
    public void configure() throws ExecutionException
    {
        _connector = new EmailConnector(corpusZipFile);
        _caseBase = new CachedLinealCaseBase();
    }

    /**
     * Initializes the case base from the connector, runs the textual
     * pre-processing steps over all the cases and resets the counters.
     * @return the initialized case base.
     * @see jcolibri.cbraplications.StandardCBRApplication#preCycle()
     */
    public CBRCaseBase preCycle() throws ExecutionException
    {
        _caseBase.init(_connector);

        Collection<CBRCase> cases = _caseBase.getCases();

        // Organize cases into paragraphs, sentences and tokens
        OpennlpSplitter.split(cases);
        // Detect stopwords
        StopWordsDetector.detectStopWords(cases);
        // Stem text
        TextStemmer.stem(cases);

        tn = tp = fp = fn = 0;

        return _caseBase;
    }

    /**
     * Obtains the prediction cost of the query from a classification oracle,
     * updates the confusion-matrix counters and adds the cost to the
     * "Evaluation" series of the evaluation report.
     * <br>
     * A cost of 0 is counted as a correct prediction: a true positive when the
     * query is labelled as spam, a true negative otherwise. Any other cost is
     * counted as an error: a false negative when the query is labelled as spam,
     * a false positive otherwise.
     *
     * @param query query to classify. It is cast to CBRCase so that its
     *        solution (the real class of the email) can be read.
     * @see jcolibri.cbraplications.StandardCBRApplication#cycle(jcolibri.cbrcore.CBRQuery)
     */
    public void cycle(CBRQuery query) throws ExecutionException
    {
        // getKNNConfig() re-applies the current k, similarity function and
        // classification method, so setters called between cycles take effect.
        KNNClassificationConfig knnConfig = getKNNConfig();

        ClassificationOracle oracle = new BasicClassificationOracle();
        double predictionCost = oracle.getPredictionCost(query, _caseBase, knnConfig);

        CBRCase queryCase = (CBRCase) query;
        EmailSolution solution = (EmailSolution) queryCase.getSolution();
        boolean isSpam = solution.getEmailClass().equals(EmailSolution.SPAM);

        if (predictionCost == 0) // Prediction was ok
        {
            if (isSpam)
            {
                tp++;
            }
            else
            {
                tn++;
            }
        }
        else
        {
            if (isSpam)
            {
                fn++;
            }
            else
            {
                fp++;
            }
        }

        // Double.valueOf replaces the deprecated Double(double) constructor.
        Evaluator.getEvaluationReport().addDataToSeries("Evaluation", Double.valueOf(predictionCost));
    }


    /**
     * Does nothing: this application has no post-cycle work.
     * @see jcolibri.cbraplications.StandardCBRApplication#postCycle()
     */
    public void postCycle() throws ExecutionException
    {
        // Intentionally empty.
    }

    /**
     * Returns the KNN configuration.
     * Each call re-applies the Average global similarity function, the current
     * classification method, the current k and the mapping of the "content"
     * attribute to the current local similarity function on the same
     * configuration object, which is then returned.
     */
    public KNNClassificationConfig getKNNConfig()
    {
        spamFilterSimConfig.setDescriptionSimFunction(new Average());
        spamFilterSimConfig.setClassificationMethod(clasifMethod);
        spamFilterSimConfig.setK(k);
        spamFilterSimConfig.addMapping(new Attribute("content", EmailDescription.class), similFunc);
        return spamFilterSimConfig;
    }

    /**
     * Sets the k
     * @param k The k to set.
     */
    public void setK(int k)
    {
        this.k = k;
    }

    /**
     * Sets the similarity function
     * @param similFunc The similFunc to set.
     */
    public void setSimilFunc(LocalSimilarityFunction similFunc)
    {
        this.similFunc = similFunc;
    }

    /**
     * Sets the classification method.
     * @param clasifMethod The clasifMethod to set.
     */
    public void setClasifMethod(KNNClassificationMethod clasifMethod)
    {
        this.clasifMethod = clasifMethod;
    }

    /**
     * Returns the false negatives.
     */
    public double getFalseNegatives()
    {
        return fn;
    }


    /**
     * Returns the false positives.
     */
    public double getFalsePositives()
    {
        return fp;
    }


    /**
     * Returns the true positives
     */
    public double getTruePositives()
    {
        return tp;
    }

    /**
     * Returns the true negatives
     */
    public double getTrueNegatives()
    {
        return tn;
    }

}