/**-
 * Copyright (c) 2006 Hugo Zaragoza and Jose R. Pérez-Agüera
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of copyright holders nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
*/
package jcolibri.extensions.textual.lucene.spanish;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Set;

import jcolibri.util.FileIO;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

/**
 * Spanish Lucene analyzer.
 * <p>
 * Tokenizes text with a {@link StandardTokenizer}, lower-cases it, removes
 * the Spanish stop words loaded at construction time and finally applies a
 * {@link SpanishStemmerFilter}.
 *
 * @author Hugo Zaragoza and Jose R. Pérez-Agüera
 */
public class SpanishAnalyzer extends Analyzer {

    /** Location of the stop-word list, one word per line, as passed to {@link FileIO#openFile}. */
    private static final String STOP_WORDS_FILE =
            "jcolibri/extensions/textual/lucene/spanish/stopwords-spanish.txt";

    /**
     * Stop words handed to every {@link StopFilter} created by this analyzer.
     * Kept as a raw {@code Set} because the declared return type of
     * {@code StopFilter.makeStopSet} is not visible from this file.
     */
    private final Set stopSet;

    /**
     * Creates the Lucene Spanish Analyzer and loads its stop-word list.
     *
     * @throws IOException if the stop-word file cannot be opened or read
     */
    public SpanishAnalyzer() throws IOException
    {
        super();
        stopSet = StopFilter.makeStopSet(loadStopWords());
    }

    /**
     * Constructs a {@link StandardTokenizer} filtered by a
     * {@link StandardFilter}, a {@link LowerCaseFilter}, a {@link StopFilter}
     * and a {@link SpanishStemmerFilter}, applied in that order.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @param reader    source of the text to tokenize
     * @return the fully filtered token stream
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader)
    {
        TokenStream result = new StandardTokenizer(reader);
        result = new StandardFilter(result);
        result = new LowerCaseFilter(result);
        // Stop words are removed after lower-casing and before stemming.
        result = new StopFilter(result, stopSet);
        result = new SpanishStemmerFilter(result);
        return result;
    }

    /**
     * Loads the Spanish stop-words list, one trimmed entry per line of the file.
     *
     * @return the stop words in file order
     * @throws IOException if the stop-word file cannot be opened or read
     */
    private static String[] loadStopWords() throws IOException
    {
        InputStream in = FileIO.openFile(STOP_WORDS_FILE);
        if (in == null)
        {
            // Report a missing resource as the declared exception type instead
            // of letting the reader constructor fail with a NullPointerException.
            throw new IOException("Stop-word file not found: " + STOP_WORDS_FILE);
        }

        // NOTE(review): the stream is decoded with the platform default charset,
        // as before; confirm the encoding of the stop-word file and make it explicit.
        BufferedReader br = new BufferedReader(new InputStreamReader(in));
        try
        {
            ArrayList<String> list = new ArrayList<String>();
            for (String line = br.readLine(); line != null; line = br.readLine())
            {
                list.add(line.trim());
            }
            return list.toArray(new String[list.size()]);
        }
        finally
        {
            // Closing the outer reader also releases the wrapped stream.
            br.close();
        }
    }

}