/**
 * CarrotClusterer.java
 * jCOLIBRI2 framework.
 * @author Juan A. Recio-García.
 * GAIA - Group for Artificial Intelligence Applications
 * http://gaia.fdi.ucm.es
 * 17/05/2007
 */
package jcolibri.extensions.textual.carrot2;


import java.util.HashMap;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Searcher;
import org.carrot2.core.LocalComponent;
import org.carrot2.core.LocalComponentFactory;
import org.carrot2.core.LocalController;
import org.carrot2.core.LocalControllerBase;
import org.carrot2.core.LocalInputComponent;
import org.carrot2.core.LocalProcessBase;
import org.carrot2.core.ProcessingResult;
import org.carrot2.core.impl.ArrayOutputComponent;
import org.carrot2.filter.lingo.local.LingoLocalFilterComponent;
import org.carrot2.input.lucene.LuceneLocalInputComponent;
import org.carrot2.input.lucene.LuceneSearchConfig;

import jcolibri.extensions.textual.lucene.LuceneDocument;
import jcolibri.extensions.textual.lucene.LuceneIndex;
import jcolibri.util.ProgressController;

/**
 * Clusters documents using the Carrot2 framework.
 * This framework uses Lucene to index and retrieve the relevant documents for a query,
 * and then clusters them, assigning a "descriptive label" to each cluster.
 *
 * <p>
 * The constructor wires a Carrot2 process named "lucene-lingo" made of three
 * components: a Lucene input ("lucene-myindex"), a Lingo clustering filter
 * ("lingo-classic") and an array output ("buffer"). The {@code cluster} methods
 * run a query through that process.
 *
 * <p>
 * To learn how to use this class see the TestCarrot example.
 *
 * @author Juan A. Recio-García
 * @version 1.0
 * @see jcolibri.extensions.textual.lucene.LuceneIndex
 * @see jcolibri.extensions.textual.carrot2.TestCarrot
 */
public class CarrotClusterer {

    /** Carrot2 controller holding the component factories and the "lucene-lingo" process. */
    LocalController controller;

    /** Index handed to the constructor; passed on to every {@link CarrotClusteringResult}. */
    LuceneIndex index;

    /**
     * Creates a Carrot Clusterer for the given Lucene Index.
     * Equivalent to calling the three-argument constructor with {@code maxclusters = -1}.
     *
     * @param index Index of documents
     * @param searchFields Fields to search inside the document. Each Lucene index is divided
     *        into several fields and the search can be performed in some of them.
     */
    public CarrotClusterer(LuceneIndex index, String[] searchFields)
    {
        this(index, searchFields, -1);
    }

    /**
     * Creates a Carrot Clusterer for the given Lucene Index, optionally hinting the number of
     * clusters to produce.
     *
     * <p>
     * The first entry of {@code searchFields} is used as both the URL field and the summary
     * field of the Lucene configuration; {@link LuceneDocument#ID_FIELD} is used as the title
     * field.
     *
     * <p>
     * Any exception raised while setting up the pipeline is logged and NOT rethrown, so the
     * instance may be left without a usable "lucene-lingo" process.
     *
     * @param index Index of documents
     * @param searchFields Fields to search inside the document. Each Lucene index is divided
     *        into several fields and the search can be performed in some of them. Must contain
     *        at least one field.
     * @param maxclusters Max number of clusters to return (approximately). When not positive,
     *        the "clusters.num" parameter is not passed to Lingo.
     */
    public CarrotClusterer(LuceneIndex index, String[] searchFields, int maxclusters)
    {
        this.index = index;
        try
        {
            // Validate the field list before opening the index, so that a bad argument
            // does not leave a freshly opened searcher behind.
            if (searchFields == null || searchFields.length == 0)
                throw new IllegalArgumentException("searchFields must contain at least one field name");

            final int _maxclusters = maxclusters;

            // Define your field configuration here. Search fields are the
            // fields used to retrieve matching documents when you query
            // Lucene through Carrot<sup>2</sup>. Title, URL and summary
            // fields are used for retrieving data to be clustered (the URL
            // field is used for document identification, actually).
            final String urlField = searchFields[0];
            final String titleField = LuceneDocument.ID_FIELD;
            final String summaryField = searchFields[0];

            controller = new LocalControllerBase();

            Searcher searcher = new IndexSearcher(index.getDirectory());

            // Create an Analyzer. This must be the same analyzer as the one
            // used to create your index. We use a standard analyzer here.
            final Analyzer analyzer = new StandardAnalyzer();

            final LuceneSearchConfig luceneConfig = new LuceneSearchConfig(
                    searcher, analyzer, searchFields,
                    titleField, summaryField, urlField);

            //
            // Create Lucene input component factory.
            //
            final LocalComponentFactory input = new LocalComponentFactory() {
                public LocalComponent getInstance() {
                    return new LuceneLocalInputComponent(luceneConfig);
                }
            };

            // add lucene input as 'lucene-myindex'
            controller.addLocalComponentFactory("lucene-myindex", input);


            //
            // Now it's time to create filters. We will use Lingo clustering
            // component.
            //
            final LocalComponentFactory lingo = new LocalComponentFactory() {
                public LocalComponent getInstance() {
                    // Override two Lingo thresholds and, when a positive limit was
                    // requested, the desired number of clusters.
                    final HashMap<String,String> parameters = new HashMap<String,String>();
                    parameters.put("lsi.threshold.clusterAssignment", "0.01");
                    parameters.put("lsi.threshold.candidateCluster", "3.5");
                    if (_maxclusters > 0)
                        parameters.put("clusters.num", String.valueOf(_maxclusters));

                    return new LingoLocalFilterComponent(null, parameters);
                }
            };

            // add the clustering component as "lingo-classic"
            controller.addLocalComponentFactory("lingo-classic", lingo);


            //
            // Finally, create a result-catcher component
            //
            final LocalComponentFactory output = new LocalComponentFactory() {
                public LocalComponent getInstance() {
                    return new ArrayOutputComponent();
                }
            };

            // add the output component as "buffer"
            controller.addLocalComponentFactory("buffer", output);


            //
            // In the final step, assemble a process from the above.
            //
            controller.addProcess("lucene-lingo",
                    new LocalProcessBase("lucene-myindex", "buffer", new String[] {"lingo-classic"}));
        }
        catch (Exception e)
        {
            // Best-effort construction: report the failure (with its stack trace) and continue.
            org.apache.commons.logging.LogFactory.getLog(this.getClass())
                    .error("Error initializing the Carrot2 clustering process", e);
        }
    }

    /**
     * Clusters the documents for the given query.
     * Equivalent to {@code cluster(query, -1)}, i.e. no explicit limit on requested results.
     *
     * @param query query sent to the "lucene-lingo" process
     * @return the clustering result, or {@code null} if an exception was raised (it is logged)
     */
    public CarrotClusteringResult cluster(String query)
    {
        return cluster(query, -1);
    }

    /**
     * Clusters the documents for the given query, retrieving a maximum of documents from Lucene.
     *
     * @param query query sent to the "lucene-lingo" process
     * @param maxResults number of results to request from the input component; when lower than
     *        zero the {@link LocalInputComponent#PARAM_REQUESTED_RESULTS} parameter is not set
     * @return the clustering result, or {@code null} if an exception was raised (it is logged)
     */
    public CarrotClusteringResult cluster(String query, int maxResults)
    {
        try
        {
            final HashMap<String,String> params = new HashMap<String,String>();
            if (maxResults > -1)
                params.put(LocalInputComponent.PARAM_REQUESTED_RESULTS, Integer.toString(maxResults));

            ProgressController.init(this.getClass(), "Carrot2. Clustering documents", -1);
            final ArrayOutputComponent.Result result;
            try
            {
                ProgressController.step(this.getClass());
                final ProcessingResult pResult = controller.query("lucene-lingo", query, params);
                result = (ArrayOutputComponent.Result) pResult.getQueryResult();
            }
            finally
            {
                // Always close the progress task that init() opened, even when the query fails.
                ProgressController.finish(this.getClass());
            }
            return new CarrotClusteringResult(result, index);
        }
        catch (Exception e)
        {
            org.apache.commons.logging.LogFactory.getLog(this.getClass())
                    .error("Error clustering documents for query: " + query, e);
        }
        return null;
    }
}