001    /**
002     * SpamFilterApp.java
003     * jCOLIBRI2 framework. 
004     * @author Juan A. Recio-García.
005     * GAIA - Group for Artificial Intelligence Applications
006     * http://gaia.fdi.ucm.es
007     * 01/08/2007
008     */
009    package jcolibri.test.test16;
010    
011    import java.util.Collection;
012    
013    import jcolibri.casebase.CachedLinealCaseBase;
014    import jcolibri.cbraplications.StandardCBRApplication;
015    import jcolibri.cbrcore.Attribute;
016    import jcolibri.cbrcore.CBRCase;
017    import jcolibri.cbrcore.CBRCaseBase;
018    import jcolibri.cbrcore.CBRQuery;
019    import jcolibri.cbrcore.Connector;
020    import jcolibri.evaluation.Evaluator;
021    import jcolibri.exception.ExecutionException;
022    import jcolibri.extensions.textual.IE.common.StopWordsDetector;
023    import jcolibri.extensions.textual.IE.common.TextStemmer;
024    import jcolibri.extensions.textual.IE.opennlp.OpennlpSplitter;
025    import jcolibri.method.retrieve.NNretrieval.similarity.LocalSimilarityFunction;
026    import jcolibri.method.retrieve.NNretrieval.similarity.global.Average;
027    import jcolibri.method.reuse.classification.KNNClassificationConfig;
028    import jcolibri.method.reuse.classification.KNNClassificationMethod;
029    import jcolibri.method.revise.classification.BasicClassificationOracle;
030    import jcolibri.method.revise.classification.ClassificationOracle;
031    
032    /**
033     * Spam filter application.
034     * It is configured through the set() methods. 
035     * The cycle method() runs the application and stores the evaluation results in the evaluation report object.
036     * After running all cycles, some statistics may be read using the get() methods.
037     * <br>
038     * The corpus used by this application must be a zip file with several textual files (one per email).
039     * The filename must start by ham or spam depending on the class.
040     * 
041     * @author Juan A. Recio-Garcia
042     * @version 1.0
043     */
044    public class SpamFilterApp implements StandardCBRApplication
045    {
046        Connector _connector;
047        CBRCaseBase _caseBase;
048        
049        private int k = 3;
050        private LocalSimilarityFunction similFunc = null; 
051        private KNNClassificationMethod clasifMethod = null;
052        private String corpusZipFile = null;
053    
054        private double tp;
055        private double tn;
056        private double fp;
057        private double fn;
058        
059        private KNNClassificationConfig spamFilterSimConfig;        
060    
061        /**
062         * Creates a spam filter application that uses the corpus indicated by the parameter.
063         */
064        public SpamFilterApp(String corpusZipFile)
065        {
066            this.corpusZipFile = corpusZipFile;
067            spamFilterSimConfig = new KNNClassificationConfig();
068        }
069        
070        
071        /* (non-Javadoc)
072         * @see jcolibri.cbraplications.StandardCBRApplication#configure()
073         */
074        public void configure() throws ExecutionException
075        {
076            _connector = new EmailConnector(corpusZipFile);
077            _caseBase = new CachedLinealCaseBase();
078        }
079    
080        /* (non-Javadoc)
081         * @see jcolibri.cbraplications.StandardCBRApplication#preCycle()
082         */
083        public CBRCaseBase preCycle() throws ExecutionException
084        {
085            _caseBase.init(_connector);     
086            
087            Collection<CBRCase> cases = _caseBase.getCases();
088            
089            //Organize cases into paragraphs, sentences and tokens
090            OpennlpSplitter.split(cases);
091            //Detect stopwords
092            StopWordsDetector.detectStopWords(cases);
093            //Stem text
094            TextStemmer.stem(cases);
095    
096            tn = tp = fp = fn = 0;
097            
098            return _caseBase;
099    
100        }
101        
102        /* (non-Javadoc)
103         * @see jcolibri.cbraplications.StandardCBRApplication#cycle(jcolibri.cbrcore.CBRQuery)
104         */
105        public void cycle(CBRQuery query) throws ExecutionException
106        {
107            KNNClassificationConfig spamFilterSimConfig = getKNNConfig();
108            
109            double predictionCost;
110    
111            
112            ClassificationOracle oracle = new BasicClassificationOracle();
113            predictionCost = oracle.getPredictionCost(query, _caseBase, spamFilterSimConfig);
114            
115            CBRCase _case = (CBRCase)query;
116            EmailSolution sol = (EmailSolution)_case.getSolution();
117            String _class = sol.getEmailClass();
118            if(predictionCost == 0) // Prediction was ok
119            {
120                if(_class.equals(EmailSolution.SPAM))
121                    tp++;
122                else
123                    tn++;
124            }
125            else
126            {
127                if(_class.equals(EmailSolution.SPAM))
128                    fn++;
129                else
130                    fp++;
131            }
132                
133            Evaluator.getEvaluationReport().addDataToSeries("Evaluation", new Double(predictionCost));
134         }
135        
136    
137        /* (non-Javadoc)
138         * @see jcolibri.cbraplications.StandardCBRApplication#postCycle()
139         */
140        public void postCycle() throws ExecutionException
141        {
142            // TODO Auto-generated method stub
143    
144        }
145        
146        /**
147         * Returns the KNN configuration
148         */
149        public KNNClassificationConfig getKNNConfig()
150        {
151            spamFilterSimConfig.setDescriptionSimFunction(new Average());
152            spamFilterSimConfig.setClassificationMethod(clasifMethod);
153            spamFilterSimConfig.setK(k);
154            spamFilterSimConfig.addMapping(new Attribute("content",EmailDescription.class), similFunc);
155            return spamFilterSimConfig;
156        }
157        
158        /**
159         * Sets the k
160         * @param k The k to set.
161         */
162        public void setK(int k)
163        {
164            this.k = k;
165        }
166    
167        /**
168         * Sets the similarity function
169         * @param similFunc The similFunc to set.
170         */
171        public void setSimilFunc(LocalSimilarityFunction similFunc)
172        {
173            this.similFunc = similFunc;
174        }
175        
176        /**
177         * Sets the classification method.
178         * @param clasifMethod The clasifMethod to set.
179         */
180        public void setClasifMethod(KNNClassificationMethod clasifMethod)
181        {
182            this.clasifMethod = clasifMethod;
183        }
184    
185        /**
186         * Returns the false negatives.
187         */
188        public double getFalseNegatives()
189        {
190            return fn;
191        }
192    
193    
194        /**
195         * Returns the false positives.
196         */
197        public double getFalsePositives()
198        {
199            return fp;
200        }
201    
202    
203        /**
204         * Returns the true positives
205         */
206        public double getTruePositives()
207        {
208            return tp;
209        }
210        
211        /**
212         * Returns the true negatives
213         */
214        public double getTrueNegatives()
215        {
216            return tn;
217        }
218    
219    }