001    /**
002     * EmailConnector.java
003     * jCOLIBRI2 framework. 
004     * @author Juan A. Recio-García.
005     * GAIA - Group for Artificial Intelligence Applications
006     * http://gaia.fdi.ucm.es
007     * 01/08/2007
008     */
009    package jcolibri.test.test16;
010    
011    import java.io.BufferedInputStream;
012    import java.io.BufferedReader;
013    import java.io.IOException;
014    import java.io.InputStreamReader;
015    import java.net.URL;
016    import java.util.ArrayList;
017    import java.util.Collection;
018    import java.util.zip.ZipEntry;
019    import java.util.zip.ZipInputStream;
020    
021    import jcolibri.cbrcore.CBRCase;
022    import jcolibri.cbrcore.CaseBaseFilter;
023    import jcolibri.cbrcore.Connector;
024    import jcolibri.exception.InitializingException;
025    import jcolibri.extensions.textual.IE.opennlp.IETextOpenNLP;
026    
027    /**
028     * Connector that reads cases from a zip file with several textual files (one per email).
029     * The filename must start with ham or spam depending on the class.
030     * <br>
031     * The corpus is packed into the lib\textual\spamcorpus\spamcorpus.jar file and was extracted
032     * from the Apache Spamassassin project (http://spamassassin.apache.org/publiccorpus/).
033     * 
034     * @author Juan A. Recio-Garcia
035     * @version 1.0
036     *
037     */
038    public class EmailConnector implements Connector
039    {
040        String zipfile;
041        
042        /**
043         * Creates a connector for a given zip file.
044         */
045        public EmailConnector(String zipfile)
046        {
047            this.zipfile = zipfile;
048        }
049        
050        /* (non-Javadoc)
051         * @see jcolibri.cbrcore.Connector#close()
052         */
053        public void close()
054        {
055            // TODO Auto-generated method stub
056    
057        }
058    
059        /* (non-Javadoc)
060         * @see jcolibri.cbrcore.Connector#deleteCases(java.util.Collection)
061         */
062        public void deleteCases(Collection<CBRCase> cases)
063        {
064            // TODO Auto-generated method stub
065    
066        }
067    
068        /* (non-Javadoc)
069         * @see jcolibri.cbrcore.Connector#initFromXMLfile(java.net.URL)
070         */
071        public void initFromXMLfile(URL file) throws InitializingException
072        {
073            // TODO Auto-generated method stub
074    
075        }
076    
077        /* (non-Javadoc)
078         * @see jcolibri.cbrcore.Connector#retrieveAllCases()
079         */
080        public Collection<CBRCase> retrieveAllCases()
081        {
082            int ham_easy = 0;
083            int ham_hard = 0;
084            int spam = 0;
085            ArrayList<CBRCase> cases = new ArrayList<CBRCase>();
086            try
087            {
088        
089                BufferedInputStream source = new BufferedInputStream (jcolibri.util.FileIO.openFile(zipfile));
090                ZipInputStream zip_in_stream = new ZipInputStream (source);
091    
092                BufferedReader br = new BufferedReader(new InputStreamReader(zip_in_stream));
093                
094                ZipEntry entry;
095                while((entry=zip_in_stream.getNextEntry())!=null)
096                {
097                    String id = entry.getName();
098                    
099                    StringBuffer buffer = new StringBuffer();
100                    while(br.ready())
101                    {
102                        buffer.append(br.readLine());
103                        buffer.append("\n");
104                    }       
105                    
106                    String _class;
107                    if(id.startsWith("spam"))
108                    {
109                        _class = EmailSolution.SPAM;
110                        spam++;
111                    }
112                    else
113                    {
114                        _class = EmailSolution.HAM;
115                        if(id.startsWith("hard"))
116                            ham_hard++;
117                        else
118                            ham_easy++;
119                    }
120                    
121                    EmailDescription desc = new EmailDescription(id, new IETextOpenNLP(new String(buffer)));
122                    EmailSolution    sol  = new EmailSolution(_class);
123                    
124                    CBRCase _case = new CBRCase();
125                    _case.setDescription(desc);
126                    _case.setSolution(sol);
127                    
128                    cases.add(_case);
129                }
130                org.apache.commons.logging.LogFactory.getLog(this.getClass()).info("Loaded "+ham_easy+" easy ham - "+ham_hard+" hard ham - "+spam+" spam");
131                br.close();
132            } catch (IOException e)
133            {
134                org.apache.commons.logging.LogFactory.getLog(this.getClass()).error(e);
135                
136            }
137            return cases;
138        }
139    
140        /* (non-Javadoc)
141         * @see jcolibri.cbrcore.Connector#retrieveSomeCases(jcolibri.cbrcore.CaseBaseFilter)
142         */
143        public Collection<CBRCase> retrieveSomeCases(CaseBaseFilter filter)
144        {
145            // TODO Auto-generated method stub
146            return null;
147        }
148    
149        /* (non-Javadoc)
150         * @see jcolibri.cbrcore.Connector#storeCases(java.util.Collection)
151         */
152        public void storeCases(Collection<CBRCase> cases)
153        {
154            // TODO Auto-generated method stub
155    
156        }
157    
158    }