/**
 * EmailConnector.java
 * jCOLIBRI2 framework.
 * @author Juan A. Recio-García.
 * GAIA - Group for Artificial Intelligence Applications
 * http://gaia.fdi.ucm.es
 * 01/08/2007
 */
package jcolibri.test.test16;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import jcolibri.cbrcore.CBRCase;
import jcolibri.cbrcore.CaseBaseFilter;
import jcolibri.cbrcore.Connector;
import jcolibri.exception.InitializingException;
import jcolibri.extensions.textual.IE.opennlp.IETextOpenNLP;

/**
 * Connector that reads cases from a zip file containing several textual files (one per email).
 * The filename must start with ham or spam depending on the class.
 * <br>
 * The corpus is packed into the lib\textual\spamcorpus\spamcorpus.jar file and was extracted
 * from the Apache Spamassassin project (http://spamassassin.apache.org/publiccorpus/).
 * <br>
 * Only {@link #retrieveAllCases()} does real work; the remaining {@link Connector}
 * methods are empty stubs.
 *
 * @author Juan A. Recio-Garcia
 * @version 1.0
 *
 */
public class EmailConnector implements Connector
{
    /** Location of the zip archive, resolved through jcolibri.util.FileIO.openFile. */
    String zipfile;

    /**
     * Creates a connector for a given zip file.
     *
     * @param zipfile location of the zip archive holding the emails
     */
    public EmailConnector(String zipfile)
    {
        this.zipfile = zipfile;
    }

    /**
     * No-op: the archive is opened and closed inside {@link #retrieveAllCases()},
     * so there is nothing left to release here.
     *
     * @see jcolibri.cbrcore.Connector#close()
     */
    public void close()
    {
        // Intentionally empty.
    }

    /**
     * No-op: deleting cases is not implemented by this connector.
     *
     * @param cases ignored
     * @see jcolibri.cbrcore.Connector#deleteCases(java.util.Collection)
     */
    public void deleteCases(Collection<CBRCase> cases)
    {
        // Intentionally empty.
    }

    /**
     * No-op: this connector is configured through its constructor only.
     *
     * @param file ignored
     * @see jcolibri.cbrcore.Connector#initFromXMLfile(java.net.URL)
     */
    public void initFromXMLfile(URL file) throws InitializingException
    {
        // Intentionally empty.
    }

    /**
     * Reads every file entry of the zip archive and turns it into a case.
     * <p>
     * The entry name becomes the case id and the entry content becomes the text of the
     * description. Entries whose name starts with "spam" are labelled
     * {@link EmailSolution#SPAM}; all other entries are labelled {@link EmailSolution#HAM}
     * and counted as "hard" ham when the name starts with "hard", otherwise as "easy" ham.
     * Directory entries are skipped.
     * <p>
     * If an I/O error occurs it is logged and the cases loaded so far are returned.
     *
     * @return the loaded cases; never <code>null</code>, possibly empty
     * @see jcolibri.cbrcore.Connector#retrieveAllCases()
     */
    public Collection<CBRCase> retrieveAllCases()
    {
        org.apache.commons.logging.Log log = org.apache.commons.logging.LogFactory.getLog(this.getClass());

        int ham_easy = 0;
        int ham_hard = 0;
        int spam = 0;
        ArrayList<CBRCase> cases = new ArrayList<CBRCase>();

        ZipInputStream zip_in_stream = null;
        try
        {
            BufferedInputStream source = new BufferedInputStream(jcolibri.util.FileIO.openFile(zipfile));
            zip_in_stream = new ZipInputStream(source);

            ZipEntry entry;
            while ((entry = zip_in_stream.getNextEntry()) != null)
            {
                // A directory entry carries no email text.
                if (entry.isDirectory())
                    continue;

                String id = entry.getName();
                String text = readEntryText(zip_in_stream);

                String _class;
                if (id.startsWith("spam"))
                {
                    _class = EmailSolution.SPAM;
                    spam++;
                }
                else
                {
                    _class = EmailSolution.HAM;
                    if (id.startsWith("hard"))
                        ham_hard++;
                    else
                        ham_easy++;
                }

                EmailDescription desc = new EmailDescription(id, new IETextOpenNLP(text));
                EmailSolution sol = new EmailSolution(_class);

                CBRCase _case = new CBRCase();
                _case.setDescription(desc);
                _case.setSolution(sol);

                cases.add(_case);
            }

            log.info("Loaded "+ham_easy+" easy ham - "+ham_hard+" hard ham - "+spam+" spam");
        }
        catch (IOException e)
        {
            // Pass the exception as the throwable so the stack trace is logged too.
            log.error("Error reading emails from " + zipfile, e);
        }
        finally
        {
            // Release the archive on both the success and the failure path.
            if (zip_in_stream != null)
            {
                try
                {
                    zip_in_stream.close();
                }
                catch (IOException e)
                {
                    log.error("Error closing " + zipfile, e);
                }
            }
        }
        return cases;
    }

    /**
     * Reads the remaining content of the current zip entry as text, terminating every
     * line with "\n".
     * <p>
     * End of entry is detected by <code>readLine()</code> returning <code>null</code>.
     * <code>Reader.ready()</code> must not be used for that: it only reports whether a
     * read would block, so it can still be true at the end of an entry, in which case
     * the text "null" would be appended to the email.
     *
     * @param zip_in_stream zip stream positioned at the start of an entry
     * @return the entry content decoded with the platform default charset
     * @throws IOException if the entry cannot be read
     */
    private static String readEntryText(ZipInputStream zip_in_stream) throws IOException
    {
        // A fresh reader per entry keeps line-terminator state from leaking between
        // entries. It is deliberately not closed: closing it would close the archive.
        BufferedReader reader = new BufferedReader(new InputStreamReader(zip_in_stream));

        StringBuilder buffer = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null)
        {
            buffer.append(line);
            buffer.append("\n");
        }
        return buffer.toString();
    }

    /**
     * Not implemented: filtered retrieval is not supported by this connector.
     *
     * @param filter ignored
     * @return always <code>null</code>
     * @see jcolibri.cbrcore.Connector#retrieveSomeCases(jcolibri.cbrcore.CaseBaseFilter)
     */
    public Collection<CBRCase> retrieveSomeCases(CaseBaseFilter filter)
    {
        return null;
    }

    /**
     * No-op: storing cases is not implemented by this connector.
     *
     * @param cases ignored
     * @see jcolibri.cbrcore.Connector#storeCases(java.util.Collection)
     */
    public void storeCases(Collection<CBRCase> cases)
    {
        // Intentionally empty.
    }

}