001    /**
002     * LuceneIndex.java
003     * jCOLIBRI2 framework. 
004     * @author Juan A. Recio-García.
005     * GAIA - Group for Artificial Intelligence Applications
006     * http://gaia.fdi.ucm.es
007     * 10/04/2007
008     */
009    package jcolibri.extensions.textual.lucene;
010    
011    
012    import java.io.File;
013    import java.io.IOException;
014    import java.util.Collection;
015    
016    import jcolibri.util.ProgressController;
017    
018    import org.apache.lucene.analysis.standard.StandardAnalyzer;
019    import org.apache.lucene.index.IndexWriter;
020    import org.apache.lucene.store.Directory;
021    import org.apache.lucene.store.FSDirectory;
022    import org.apache.lucene.store.RAMDirectory;
023    
024    /**
025     * This class wraps the Lucene inverted terms index. 
026     * This structure stores in which documents appears a word.
027     * <br>
028     * It also mantains a hash table that allows to retrieve a document form the index given its ID.
029     * <p>
030     * There are two ways to store the index:
031     * <ul>
032     * <li>In the file system. It saves the index in a directory. It is slower but does not consume memory.
033     * <li>In memory. It stores the index in memory. You will need very much RAM memory but it will work quickly. 
034     * If you obtain an outOfMemoryException try the -Xms -Xmx VM params.
035     * </ul>
036     * @author Juan A. Recio-García
037     * @version 2.0
038     */
039    public class LuceneIndex{
040            
041            private Directory directory;
042            private java.util.HashMap<String, LuceneDocument> docsMapping;
043    
044            /**
045             * Creates a LuceneIndex stored in the File System.
046             * @param directory to store the index once generated
047             * @param documents to index
048             */
049            public LuceneIndex(File directory,  Collection<LuceneDocument> documents)
050            {
051                    this.docsMapping = new java.util.HashMap<String, LuceneDocument>();
052    
053                org.apache.commons.logging.LogFactory.getLog(LuceneIndex.class).info("Creating File System Index in: "+directory.getPath());
054                    
055                    try {
056                            this.directory = FSDirectory.getDirectory(directory);
057                    } catch (IOException e) {
058                            org.apache.commons.logging.LogFactory.getLog(LuceneIndex.class).error(e);
059                    }
060    
061                    createIndex(documents);
062    
063            }
064            
065            /**
066             * Creates an index stored into memory.
067             * @param documents to index.
068             */
069            public LuceneIndex(Collection<LuceneDocument> documents)
070            {
071                    this.docsMapping = new java.util.HashMap<String, LuceneDocument>();
072                    org.apache.commons.logging.LogFactory.getLog(LuceneIndex.class).info("Creating In-Memory index");
073                    
074                this.directory = new RAMDirectory();
075                    createIndex(documents);
076            }
077    
078            private void createIndex(Collection<LuceneDocument> documents)
079            {
080                    try {
081                            
082                            IndexWriter writer = new IndexWriter(directory,  new StandardAnalyzer(), true);
083                        
084                            org.apache.commons.logging.LogFactory.getLog(LuceneIndex.class).info("Indexing "+documents.size()+" documents.");
085                            ProgressController.init(this.getClass(),"Lucene. Indexing documents", documents.size());
086                            
087                            for(LuceneDocument doc: documents)
088                            {
089                                    writer.addDocument(doc.getInternalDocument());
090                                    docsMapping.put(doc.getDocID(), doc);
091                                    ProgressController.step(this.getClass());
092                            }                   
093                            org.apache.commons.logging.LogFactory.getLog(LuceneIndex.class).info("Optimizing index.");
094                            
095                            writer.optimize();
096                        writer.close();
097                        ProgressController.finish(this.getClass());
098                    } catch (Exception e) {
099                            org.apache.commons.logging.LogFactory.getLog(LuceneIndex.class).error(e);               
100                    }
101            }
102    
103            
104            
105            
106            /**
107             * @return the directory
108             */
109            public Directory getDirectory() {
110                    return directory;
111            }
112            
113            
114            
115                    
116            public int getNumberOfDocuments()
117            {
118                    return docsMapping.size();
119            }
120            
121            public LuceneDocument getDocument(String docId)
122            {
123                    return docsMapping.get(docId);
124            }
125    }