001 /** 002 * LuceneIndex.java 003 * jCOLIBRI2 framework. 004 * @author Juan A. Recio-García. 005 * GAIA - Group for Artificial Intelligence Applications 006 * http://gaia.fdi.ucm.es 007 * 10/04/2007 008 */ 009 package jcolibri.extensions.textual.lucene; 010 011 012 import java.io.File; 013 import java.io.IOException; 014 import java.util.Collection; 015 016 import jcolibri.util.ProgressController; 017 018 import org.apache.lucene.analysis.standard.StandardAnalyzer; 019 import org.apache.lucene.index.IndexWriter; 020 import org.apache.lucene.store.Directory; 021 import org.apache.lucene.store.FSDirectory; 022 import org.apache.lucene.store.RAMDirectory; 023 024 /** 025 * This class wraps the Lucene inverted terms index. 026 * This structure stores in which documents appears a word. 027 * <br> 028 * It also mantains a hash table that allows to retrieve a document form the index given its ID. 029 * <p> 030 * There are two ways to store the index: 031 * <ul> 032 * <li>In the file system. It saves the index in a directory. It is slower but does not consume memory. 033 * <li>In memory. It stores the index in memory. You will need very much RAM memory but it will work quickly. 034 * If you obtain an outOfMemoryException try the -Xms -Xmx VM params. 035 * </ul> 036 * @author Juan A. Recio-García 037 * @version 2.0 038 */ 039 public class LuceneIndex{ 040 041 private Directory directory; 042 private java.util.HashMap<String, LuceneDocument> docsMapping; 043 044 /** 045 * Creates a LuceneIndex stored in the File System. 046 * @param directory to store the index once generated 047 * @param documents to index 048 */ 049 public LuceneIndex(File directory, Collection<LuceneDocument> documents) 050 { 051 this.docsMapping = new java.util.HashMap<String, LuceneDocument>(); 052 053 org.apache.commons.logging.LogFactory.getLog(LuceneIndex.class).info("Creating File System Index in: "+directory.getPath()); 054 055 try { 056 this.directory = FSDirectory.getDirectory(directory); 057 } catch (IOException e) { 058 org.apache.commons.logging.LogFactory.getLog(LuceneIndex.class).error(e); 059 } 060 061 createIndex(documents); 062 063 } 064 065 /** 066 * Creates an index stored into memory. 067 * @param documents to index. 068 */ 069 public LuceneIndex(Collection<LuceneDocument> documents) 070 { 071 this.docsMapping = new java.util.HashMap<String, LuceneDocument>(); 072 org.apache.commons.logging.LogFactory.getLog(LuceneIndex.class).info("Creating In-Memory index"); 073 074 this.directory = new RAMDirectory(); 075 createIndex(documents); 076 } 077 078 private void createIndex(Collection<LuceneDocument> documents) 079 { 080 try { 081 082 IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(), true); 083 084 org.apache.commons.logging.LogFactory.getLog(LuceneIndex.class).info("Indexing "+documents.size()+" documents."); 085 ProgressController.init(this.getClass(),"Lucene. Indexing documents", documents.size()); 086 087 for(LuceneDocument doc: documents) 088 { 089 writer.addDocument(doc.getInternalDocument()); 090 docsMapping.put(doc.getDocID(), doc); 091 ProgressController.step(this.getClass()); 092 } 093 org.apache.commons.logging.LogFactory.getLog(LuceneIndex.class).info("Optimizing index."); 094 095 writer.optimize(); 096 writer.close(); 097 ProgressController.finish(this.getClass()); 098 } catch (Exception e) { 099 org.apache.commons.logging.LogFactory.getLog(LuceneIndex.class).error(e); 100 } 101 } 102 103 104 105 106 /** 107 * @return the directory 108 */ 109 public Directory getDirectory() { 110 return directory; 111 } 112 113 114 115 116 public int getNumberOfDocuments() 117 { 118 return docsMapping.size(); 119 } 120 121 public LuceneDocument getDocument(String docId) 122 { 123 return docsMapping.get(docId); 124 } 125 }