001    /**
002     * CarrotClusterer.java
003     * jCOLIBRI2 framework. 
004     * @author Juan A. Recio-García.
005     * GAIA - Group for Artificial Intelligence Applications
006     * http://gaia.fdi.ucm.es
007     * 17/05/2007
008     */
009    package jcolibri.extensions.textual.carrot2;
010    
011    
012    import java.util.HashMap;
013    
014    import org.apache.lucene.analysis.Analyzer;
015    import org.apache.lucene.analysis.standard.StandardAnalyzer;
016    import org.apache.lucene.search.IndexSearcher;
017    import org.apache.lucene.search.Searcher;
018    import org.carrot2.core.LocalComponent;
019    import org.carrot2.core.LocalComponentFactory;
020    import org.carrot2.core.LocalController;
021    import org.carrot2.core.LocalControllerBase;
022    import org.carrot2.core.LocalInputComponent;
023    import org.carrot2.core.LocalProcessBase;
024    import org.carrot2.core.ProcessingResult;
025    import org.carrot2.core.impl.ArrayOutputComponent;
026    import org.carrot2.filter.lingo.local.LingoLocalFilterComponent;
027    import org.carrot2.input.lucene.LuceneLocalInputComponent;
028    import org.carrot2.input.lucene.LuceneSearchConfig;
029    
030    import jcolibri.extensions.textual.lucene.LuceneDocument;
031    import jcolibri.extensions.textual.lucene.LuceneIndex;
032    import jcolibri.util.ProgressController;
033    
034    /**
035     * Clusters documents using the Carrot2 framework. 
 * This framework uses Lucene to index and retrieve the documents relevant to a query,
 * and then clusters them, assigning a "descriptive label" to each cluster.
038     * 
039     * <p>
040     * To learn how to use this class see the TestCarrot example.
041     * 
042     * @author Juan A. Recio-García
043     * @version 1.0
044     * @see jcolibri.extensions.textual.lucene.LuceneIndex
045     * @see jcolibri.extensions.textual.carrot2.TestCarrot
046     */
047    public class CarrotClusterer {
048            
049        LocalController controller;
050        LuceneIndex index;
051    
052        /**
053         * Creates a Carrot Clusterer for the given Lucene Index. 
054         * @param index Index of documents
055         * @param searchFields Fields where search inside the document. Each lucene index is divided in several fields an the search can be performed in some of them.
056         */
057            public CarrotClusterer(LuceneIndex index, String[] searchFields)
058            {
059                    this(index,searchFields, -1);
060            }
061        
062        /**
063         * Creates a Carrot Clusterer for the given Lucene Index that returns a maximum number of documents in each search.
064         * @param index Index of documents
065         * @param searchFields Fields where search inside the document. Each lucene index is divided in several fields an the search can be performed in some of them.
066         * @param maxclusters Max number of clusters to return (approximately).
067         */
068            public CarrotClusterer(LuceneIndex index, String[] searchFields, int maxclusters)
069            {
070                    this.index = index;
071            try{
072                    final int _maxclusters = maxclusters;
073                    
074                    controller = new LocalControllerBase();
075                    
076                            Searcher searcher  = new IndexSearcher(index.getDirectory());
077            
078                    // Create an Analyzer. This must be the same analyzer as the one
079                    // used to create your index. We use a standard analyzer here.
080                    final Analyzer analyzer = new StandardAnalyzer();
081            
082                    // Define your field configuration here. Search fields are the
083                    // fields used to retrieve matching documents when you query
084                    // Lucene through Carrot<sup>2</sup>. Title, URL and summary
085                    // fields are used for retriving data to be clustered (the URL
086                    // field is used for document identification, actually).
087                    final String urlField = searchFields[0];
088                    final String titleField = LuceneDocument.ID_FIELD;
089                    final String summaryField = searchFields[0];
090            
091                    final LuceneSearchConfig luceneConfig = new LuceneSearchConfig(
092                            searcher, analyzer, searchFields,
093                            titleField, summaryField, urlField); 
094            
095                    //
096                    // Create Lucene input component factory.
097                    //
098                    final LocalComponentFactory input = new LocalComponentFactory() {
099                        public LocalComponent getInstance() {
100                            return new LuceneLocalInputComponent(luceneConfig);
101                        }
102                    };
103                    
104                    // add lucene input as 'lucene-myindex'
105                    controller.addLocalComponentFactory("lucene-myindex", input);
106            
107            
108                    //
109                    // Now it's time to create filters. We will use Lingo clustering
110                    // component. 
111                    //
112                    final LocalComponentFactory lingo = new LocalComponentFactory() {
113                        public LocalComponent getInstance() {
114                            // we will use the defaults here, see {@link Example}
115                            // for more verbose configuration.
116                            final HashMap<String,String> parameters = new HashMap<String,String>();
117                            parameters.put("lsi.threshold.clusterAssignment", "0.01");
118                            parameters.put("lsi.threshold.candidateCluster",  "3.5");
119                            if(_maxclusters>0)
120                                    parameters.put("clusters.num", String.valueOf(_maxclusters));
121            
122                            return new LingoLocalFilterComponent(null, parameters);
123                        }
124                    };
125            
126                    // add the clustering component as "lingo-classic"
127                    controller.addLocalComponentFactory("lingo-classic", lingo);
128            
129                    
130                    //
131                    // Finally, create a result-catcher component
132                    //
133                    final LocalComponentFactory output = new LocalComponentFactory() {
134                        public LocalComponent getInstance() {
135                            return new ArrayOutputComponent();
136                        }
137                    };
138            
139                    // add the output component as "buffer"
140                    controller.addLocalComponentFactory("buffer", output);
141            
142                    
143                    //
144                    // In the final step, assemble a process from the above.
145                    //
146    
147                controller.addProcess("lucene-lingo", 
148                        new LocalProcessBase("lucene-myindex", "buffer", new String [] {"lingo-classic"}));
149            } catch (Exception e) {
150                            org.apache.commons.logging.LogFactory.getLog(this.getClass()).error(e);
151            }
152            }
153            
154            /**
155             * Clusters the documents for the given query.
156             */
157            public CarrotClusteringResult cluster(String query)
158            {
159                    return cluster(query,-1);
160            }
161    
162            /**
163             * Clusters the documents for the given query, retrieving a maximum of documents from Lucene.
164             */
165            public CarrotClusteringResult cluster(String query, int maxResults)
166            {
167            try {
168                            final HashMap<String,String> params = new HashMap<String,String>();
169                            if(maxResults>-1)
170                                    params.put(LocalInputComponent.PARAM_REQUESTED_RESULTS, Integer.toString(maxResults));
171                            ProgressController.init(this.getClass(),"Carrot2. Clustering documents", -1);
172                            ProgressController.step(this.getClass());
173                            final ProcessingResult pResult = controller.query("lucene-lingo", query, params);
174                            final ArrayOutputComponent.Result result = (ArrayOutputComponent.Result) pResult.getQueryResult();
175                            ProgressController.finish(this.getClass());
176                            return new CarrotClusteringResult(result, index);
177                    } catch (Exception e) {
178                            org.apache.commons.logging.LogFactory.getLog(this.getClass()).error(e);
179                    }
180                    return null;
181            
182            }
183    }