001 /** 002 * TestCarrot.java 003 * jCOLIBRI2 framework. 004 * @author Juan A. Recio-García. 005 * GAIA - Group for Artificial Intelligence Applications 006 * http://gaia.fdi.ucm.es 007 * 17/05/2007 008 */ 009 package jcolibri.extensions.textual.carrot2; 010 011 import java.io.File; 012 import java.io.IOException; 013 import java.util.Collection; 014 015 import javax.swing.JFileChooser; 016 017 import jcolibri.datatypes.Text; 018 import jcolibri.extensions.textual.carrot2.CarrotClusteringResult.Cluster; 019 import jcolibri.extensions.textual.lucene.LuceneDocument; 020 import jcolibri.extensions.textual.lucene.LuceneIndex; 021 import jcolibri.extensions.textual.wordnet.WordNetBridge; 022 023 /** 024 * Class used to test and learn how to use Carrot2. 025 * It parses the documents of a directory and clusters them according to a query. 026 * <p> 027 * To avoid memory problems use the -Xms -Xmx VM params. For example to use a max of 1Gb of memory use: -Xms256m -Xmx1024m 028 * @author Juan A. Recio García. 029 * @version 1.0 030 * 031 */ 032 public class TestCarrot { 033 034 // Unique field of the document. 035 private static String CONTENT_FIELD = "content"; 036 037 /** 038 * @param args 039 */ 040 public static void main(String[] args) { 041 042 //Obtain the directory with the documents 043 044 JFileChooser jfc = new javax.swing.JFileChooser(); 045 jfc.setDialogType(JFileChooser.OPEN_DIALOG); 046 jfc.setFileSelectionMode(JFileChooser.DIRECTORIES_ONLY); 047 048 jfc.showOpenDialog(null); 049 File docDir = jfc.getSelectedFile(); 050 051 if (!docDir.exists() || !docDir.canRead()) 052 { 053 System.out.println("Document directory '" +docDir.getAbsolutePath()+ "' does not exist or is not readable, please check the path"); 054 System.exit(1); 055 } 056 057 // Convert the documents to the Lucene format. 058 059 Collection<LuceneDocument> docs = null;; 060 try { 061 docs = indexDocs(docDir); 062 } catch (IOException e) { 063 org.apache.commons.logging.LogFactory.getLog(WordNetBridge.class).error(e); 064 } 065 066 067 try { 068 CarrotClusteringResult ccr; 069 070 // First create the Lucene inverted index 071 LuceneIndex index = new LuceneIndex(docs); 072 073 // Create the Carrot Clusterer for the unique field of the Lucene documents. 074 String[] searchFields = { CONTENT_FIELD }; 075 CarrotClusterer clusterer = new CarrotClusterer(index, searchFields, 20); 076 077 // Ask for the query 078 String query = javax.swing.JOptionPane.showInputDialog("Query?"); 079 080 // Cluster the query 081 ccr = clusterer.cluster(query); 082 083 // Print the clusters 084 int i=0; 085 int total = ccr.getClusters().size(); 086 for(Cluster c: ccr.getClusters()) 087 { 088 System.out.println("Cluster "+i+++"/"+total+": "+ c.getLabels()); 089 System.out.println(c.getDocs().size()+ " documents in cluster"); 090 for(LuceneDocument doc: c.getDocs()) 091 System.out.println(" "+doc.getDocID()); 092 } 093 094 } catch (OutOfMemoryError e) { 095 org.apache.commons.logging.LogFactory.getLog(WordNetBridge.class).error("Carrot2 requires more memory. Launch the JVM with these flags: java -Xms256m -Xmx512m ..."); 096 } 097 } 098 099 /** 100 * Converts the documents in the directory to the lucene format 101 */ 102 private static Collection<LuceneDocument> indexDocs(File directory) throws IOException { 103 104 java.util.ArrayList<LuceneDocument> docs = new java.util.ArrayList<LuceneDocument>(); 105 106 if (!directory.canRead()) 107 return docs; 108 109 if (!directory.isDirectory()) 110 return docs; 111 112 File[] files = directory.listFiles(); 113 114 for(File f: files) 115 { 116 if(f.isDirectory()) 117 continue; 118 java.io.BufferedReader fr = new java.io.BufferedReader(new java.io.FileReader(f)); 119 StringBuffer sb = new StringBuffer(); 120 while (fr.ready()) 121 sb.append(fr.readLine()); 122 fr.close(); 123 124 // Put the content is our unique field 125 LuceneDocument doc = new LuceneDocument(f.getCanonicalPath()); 126 doc.addContentField(CONTENT_FIELD, new Text(sb.toString())); 127 128 docs.add(doc); 129 } 130 131 return docs; 132 133 } 134 }