001    /**-
002     * Copyright (c) 2006 Hugo Zaragoza and Jose R. Pérez-Agüera
003     * All rights reserved.
004     *
005     * Redistribution and use in source and binary forms, with or without
006     * modification, are permitted provided that the following conditions
007     * are met:
008     * 1. Redistributions of source code must retain the above copyright
009     *    notice, this list of conditions and the following disclaimer.
010     * 2. Redistributions in binary form must reproduce the above copyright
011     *    notice, this list of conditions and the following disclaimer in the
012     *    documentation and/or other materials provided with the distribution.
013     * 3. Neither the name of copyright holders nor the names of its
014     *    contributors may be used to endorse or promote products derived
015     *    from this software without specific prior written permission.
016     *
017     * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
018     * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019     * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
020     * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS
021     * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
022     * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
023     * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
024     * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
025     * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
026     * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
027     * POSSIBILITY OF SUCH DAMAGE.
028     */
029    package jcolibri.extensions.textual.lucene.spanish;
030    
031    import java.io.BufferedReader;
032    import java.io.IOException;
033    import java.io.InputStreamReader;
034    import java.io.Reader;
035    import java.util.ArrayList;
036    import java.util.Set;
037    
038    import jcolibri.util.FileIO;
039    
040    import org.apache.lucene.analysis.Analyzer;
041    import org.apache.lucene.analysis.LowerCaseFilter;
042    import org.apache.lucene.analysis.StopFilter;
043    import org.apache.lucene.analysis.TokenStream;
044    import org.apache.lucene.analysis.standard.StandardFilter;
045    import org.apache.lucene.analysis.standard.StandardTokenizer;
046    
047    
048    
049    /**
050     * Spanish Lucene analyzer
051     * @author Hugo Zaragoza and Jose R. Pérez-Agüera
052     */
053    public class SpanishAnalyzer extends Analyzer {
054            
055            private Set stopSet;
056            
057            /**
058             * Creates the Lucene Spanish Analyzer
059             * @throws IOException
060             */
061            public SpanishAnalyzer() throws IOException
062            {
063                    super();
064                    stopSet = StopFilter.makeStopSet(loadStopWords());
065            }
066            
067            /** Constructs a {@link StandardTokenizer} filtered by a {@link
068            StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
069            public TokenStream tokenStream(String fieldName, Reader reader) 
070            {
071                TokenStream result = new StandardTokenizer(reader);
072                result = new StandardFilter(result);
073                result = new LowerCaseFilter(result);
074                result = new StopFilter(result, stopSet);
075                result = new SpanishStemmerFilter(result);
076                return result;
077            }
078            
079            /**
080             * Loads the spanish stop-words list
081             * @throws IOException
082             */
083            private static String[] loadStopWords() throws IOException
084            {
085                    InputStreamReader isr = new InputStreamReader(FileIO.openFile("jcolibri/extensions/textual/lucene/spanish/stopwords-spanish.txt"));
086                    BufferedReader br = new BufferedReader(isr);
087                    String line = br.readLine();
088                    ArrayList<String> list = new ArrayList<String>();
089                    while(line != null)
090                    {
091                            list.add(line.trim());
092                            line = br.readLine();
093                    }
094                    String stopWords[] = new String[list.toArray().length];
095                    for(int i = 0; i<list.toArray().length;i++)
096                            stopWords[i]= (String) list.get(i);
097                    
098                    return stopWords;
099            }
100            
101            
102    
103    }