001    /**
002     * TestCarrot.java
003     * jCOLIBRI2 framework. 
004     * @author Juan A. Recio-García.
005     * GAIA - Group for Artificial Intelligence Applications
006     * http://gaia.fdi.ucm.es
007     * 17/05/2007
008     */
009    package jcolibri.extensions.textual.carrot2;
010    
011    import java.io.File;
012    import java.io.IOException;
013    import java.util.Collection;
014    
015    import javax.swing.JFileChooser;
016    
017    import jcolibri.datatypes.Text;
018    import jcolibri.extensions.textual.carrot2.CarrotClusteringResult.Cluster;
019    import jcolibri.extensions.textual.lucene.LuceneDocument;
020    import jcolibri.extensions.textual.lucene.LuceneIndex;
021    import jcolibri.extensions.textual.wordnet.WordNetBridge;
022    
023    /**
024     * Class used to test and learn how to use Carrot2. 
025     * It parses the documents of a directory and clusters them according to a query.
026     * <p>
027     * To avoid memory problems use the -Xms -Xmx VM params. For example to use a max of 1Gb of memory use: -Xms256m -Xmx1024m 
028     * @author Juan A. Recio García.
029     * @version 1.0
030     *
031     */
032    public class TestCarrot {
033    
034            // Unique field of the document.
035            private static String CONTENT_FIELD = "content";
036            
037            /**
038             * @param args
039             */
040            public static void main(String[] args) {
041                    
042                    //Obtain the directory with the documents
043                    
044                JFileChooser jfc = new javax.swing.JFileChooser();
045                    jfc.setDialogType(JFileChooser.OPEN_DIALOG);
046                    jfc.setFileSelectionMode(JFileChooser.DIRECTORIES_ONLY);
047                    
048                    jfc.showOpenDialog(null);
049                    File docDir = jfc.getSelectedFile();
050    
051                    if (!docDir.exists() || !docDir.canRead()) 
052                    {
053                  System.out.println("Document directory '" +docDir.getAbsolutePath()+ "' does not exist or is not readable, please check the path");
054                  System.exit(1);
055                }
056                
057                    // Convert the documents to the Lucene format.
058                    
059                    Collection<LuceneDocument> docs = null;;
060                    try {
061                            docs = indexDocs(docDir);
062                    } catch (IOException e) {
063                            org.apache.commons.logging.LogFactory.getLog(WordNetBridge.class).error(e);
064                    }
065                    
066                    
067                    try {
068                            CarrotClusteringResult ccr;
069                            
070                            // First create the Lucene inverted index
071                            LuceneIndex index = new LuceneIndex(docs);
072                            
073                            // Create the Carrot Clusterer for the unique field of the Lucene documents.
074                            String[] searchFields = { CONTENT_FIELD };
075                            CarrotClusterer clusterer = new CarrotClusterer(index, searchFields, 20);
076                            
077                            // Ask for the query
078                            String query = javax.swing.JOptionPane.showInputDialog("Query?");
079                            
080                            // Cluster the query
081                            ccr = clusterer.cluster(query);
082                    
083                            // Print the clusters
084                            int i=0;
085                            int total = ccr.getClusters().size();
086                            for(Cluster c: ccr.getClusters())
087                            {
088                                    System.out.println("Cluster "+i+++"/"+total+": "+ c.getLabels());
089                                    System.out.println(c.getDocs().size()+ " documents in cluster");
090                                    for(LuceneDocument doc: c.getDocs())
091                                            System.out.println("  "+doc.getDocID());
092                            }
093                            
094                    } catch (OutOfMemoryError e) {
095                            org.apache.commons.logging.LogFactory.getLog(WordNetBridge.class).error("Carrot2 requires more memory. Launch the JVM with these flags: java -Xms256m -Xmx512m ...");
096                    }
097            }
098            
099            /**
100             * Converts the documents in the directory to the lucene format
101             */
102            private static Collection<LuceneDocument> indexDocs(File directory) throws IOException {
103                
104                    java.util.ArrayList<LuceneDocument> docs = new java.util.ArrayList<LuceneDocument>();
105                    
106                if (!directory.canRead())
107                    return docs;
108                
109                if (!directory.isDirectory())
110                    return docs;
111                
112                File[] files = directory.listFiles();
113                
114                for(File f: files)
115                {
116                    if(f.isDirectory())
117                            continue;
118                    java.io.BufferedReader fr = new java.io.BufferedReader(new java.io.FileReader(f));
119                    StringBuffer sb = new StringBuffer();
120                    while (fr.ready())
121                            sb.append(fr.readLine());
122                        fr.close();
123    
124                        // Put the content is our unique field      
125                    LuceneDocument doc = new LuceneDocument(f.getCanonicalPath());
126                    doc.addContentField(CONTENT_FIELD, new Text(sb.toString()));
127                    
128                    docs.add(doc);  
129                }
130                
131                return docs;
132                   
133              }
134    }