001    /**
002     * StopWordsDetector.java
003     * jCOLIBRI2 framework. 
004     * @author Juan A. Recio-García.
005     * GAIA - Group for Artificial Intelligence Applications
006     * http://gaia.fdi.ucm.es
007     * 20/06/2007
008     */
009    package jcolibri.extensions.textual.IE.common;
010    
011    import java.util.Collection;
012    import java.util.HashSet;
013    import java.util.Set;
014    
015    import jcolibri.cbrcore.Attribute;
016    import jcolibri.cbrcore.CBRCase;
017    import jcolibri.cbrcore.CBRQuery;
018    import jcolibri.extensions.textual.IE.IEutils;
019    import jcolibri.extensions.textual.IE.gate.GatePhrasesExtractor;
020    import jcolibri.extensions.textual.IE.representation.IEText;
021    import jcolibri.extensions.textual.IE.representation.Token;
022    import jcolibri.util.AttributeUtils;
023    import jcolibri.util.ProgressController;
024    
025    
026    /**
027     * Removes stop words (workds without relevant meaning) and punctuation symbols.
028     * It uses a built-in list and modifies the "isStopWord" flag of the tokens.
029     * <p>
030     * The first version was developed at: Robert Gordon University - Aberdeen & Facultad Informática,
031     * Universidad Complutense de Madrid (GAIA)
032     * </p>
033     * @author Juan A. Recio-Garcia
034     * @version 2.0
035     */
036    public class StopWordsDetector
037    {
038    
039        /**
040         * Performs the algorithm in the given attributes of a collection of cases.
041         * These attributes must be IEText objects.
042         */
043        public static void detectStopWords(Collection<CBRCase> cases, Collection<Attribute> attributes)
044        {
045            org.apache.commons.logging.LogFactory.getLog(StopWordsDetector.class).info("Detecting stop words.");
046            ProgressController.init(StopWordsDetector.class, "Detecting stop words ...", cases.size());
047            for(CBRCase c: cases)
048            {
049                for(Attribute a: attributes)
050                {
051                    Object o = AttributeUtils.findValue(a, c);
052                    detectStopWords((IEText)o);
053                }
054                ProgressController.step(GatePhrasesExtractor.class);
055            }
056            ProgressController.finish(GatePhrasesExtractor.class);
057        }
058    
059        /**
060         * Performs the algorithm in the given attributes of a query.
061         * These attributes must be IEText objects.
062         */
063        public static void detectStopWords(CBRQuery query, Collection<Attribute> attributes)
064        {
065                org.apache.commons.logging.LogFactory.getLog(StopWordsDetector.class).info("Detecting stop words.");
066                for(Attribute a: attributes)
067                {
068                    Object o = AttributeUtils.findValue(a, query);
069                    detectStopWords((IEText)o);
070                }
071        }
072        
073        /**
074         * Performs the algorithm in all the attributes of a collection of cases
075         * These attributes must be IEText objects.
076         */
077        public static void detectStopWords(Collection<CBRCase> cases)
078        {
079            org.apache.commons.logging.LogFactory.getLog(StopWordsDetector.class).info("Detecting stop words.");
080            ProgressController.init(StopWordsDetector.class, "Detecting stop words ...", cases.size());
081            for(CBRCase c: cases)
082            {
083                Collection<IEText> texts = IEutils.getTexts(c);
084                for(IEText t : texts)
085                    detectStopWords(t);
086                ProgressController.step(GatePhrasesExtractor.class);
087            }
088            ProgressController.finish(GatePhrasesExtractor.class);
089        }
090        
091        /**
092         * Performs the algorithm in all the attributes of a query
093         * These attributes must be IEText objects.
094         */
095        public static void detectStopWords(CBRQuery query)
096        {    
097            org.apache.commons.logging.LogFactory.getLog(StopWordsDetector.class).info("Detecting stop words.");
098            Collection<IEText> texts = IEutils.getTexts(query);
099            for(IEText t : texts)
100                detectStopWords(t);
101        }
102        
103        /**
104         * Performs the algorithm in a given IEText object
105         */
106        public static void detectStopWords(IEText text)
107        {
108            for(Token t: text.getAllTokens())
109            {
110                String word = t.getRawContent().toLowerCase();
111                if(stopWordSet.contains(word))
112                    t.setStopWord(true);
113            }
114        }
115            
116        /**
117         * Stop words list
118         */
119        static String[] stopWords = { "a", "a's", "able", "about", "above",
120                    "according", "accordingly", "across", "actually", "after",
121                    "afterwards", "again", "against", "ain't", "all", "allow",
122                    "allows", "almost", "alone", "along", "already", "also",
123                    "although", "always", "am", "among", "amongst", "an", "and",
124                    "another", "any", "anybody", "anyhow", "anyone", "anything",
125                    "anyway", "anyways", "anywhere", "apart", "appear", "appreciate",
126                    "appropriate", "are", "aren't", "around", "as", "aside", "ask",
127                    "asking", "associated", "at", "available", "away", "awfully", "b",
128                    "be", "became", "because", "become", "becomes", "becoming", "been",
129                    "before", "beforehand", "behind", "being", "believe", "below",
130                    "beside", "besides", "best", "better", "between", "beyond", "both",
131                    "brief", "but", "by", "c", "c'mon", "c's", "came", "can", "can't",
132                    "cannot", "cant", "cause", "causes", "certain", "certainly",
133                    "changes", "clearly", "co", "com", "come", "comes", "concerning",
134                    "consequently", "consider", "considering", "contain", "containing",
135                    "contains", "corresponding", "could", "couldn't", "course",
136                    "currently", "d", "definitely", "described", "despite", "did",
137                    "didn't", "different", "do", "does", "doesn't", "doing", "don't",
138                    "done", "down", "downwards", "during", "e", "each", "edu", "eg",
139                    "eight", "either", "else", "elsewhere", "enough", "entirely",
140                    "especially", "et", "etc", "even", "ever", "every", "everybody",
141                    "everyone", "everything", "everywhere", "ex", "exactly", "example",
142                    "except", "f", "far", "few", "fifth", "first", "five", "followed",
143                    "following", "follows", "for", "former", "formerly", "forth",
144                    "four", "from", "further", "furthermore", "g", "get", "gets",
145                    "getting", "given", "gives", "go", "goes", "going", "gone", "got",
146                    "gotten", "greetings", "h", "had", "hadn't", "happens", "hardly",
147                    "has", "hasn't", "have", "haven't", "having", "he", "he's",
148                    "hello", "help", "hence", "her", "here", "here's", "hereafter",
149                    "hereby", "herein", "hereupon", "hers", "herself", "hi", "him",
150                    "himself", "his", "hither", "hopefully", "how", "howbeit",
151                    "however", "i", "i'd", "i'll", "i'm", "i've", "ie", "if",
152                    "ignored", "immediate", "in", "inasmuch", "inc", "indeed",
153                    "indicate", "indicated", "indicates", "inner", "insofar",
154                    "instead", "into", "inward", "is", "isn't", "it", "it'd", "it'll",
155                    "it's", "its", "itself", "j", "just", "k", "keep", "keeps", "kept",
156                    "know", "knows", "known", "l", "last", "lately", "later", "latter",
157                    "latterly", "least", "less", "lest", "let", "let's", "like",
158                    "liked", "likely", "little", "look", "looking", "looks", "ltd",
159                    "m", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile",
160                    "merely", "might", "more", "moreover", "most", "mostly", "much",
161                    "must", "my", "myself", "n", "name", "namely", "nd", "near",
162                    "nearly", "necessary", "need", "needs", "neither", "never",
163                    "nevertheless", "new", "next", "nine", "no", "nobody", "non",
164                    "none", "noone", "nor", "normally", "not", "nothing", "novel",
165                    "now", "nowhere", "o", "obviously", "of", "off", "often", "oh",
166                    "ok", "okay", "old", "on", "once", "one", "ones", "only", "onto",
167                    "or", "other", "others", "otherwise", "ought", "our", "ours",
168                    "ourselves", "out", "outside", "over", "overall", "own", "p",
169                    "particular", "particularly", "per", "perhaps", "placed", "please",
170                    "plus", "possible", "presumably", "probably", "provides", "q",
171                    "que", "quite", "qv", "r", "rather", "rd", "re", "really",
172                    "reasonably", "regarding", "regardless", "regards", "relatively",
173                    "respectively", "right", "s", "said", "same", "saw", "say",
174                    "saying", "says", "second", "secondly", "see", "seeing", "seem",
175                    "seemed", "seeming", "seems", "seen", "self", "selves", "sensible",
176                    "sent", "serious", "seriously", "seven", "several", "shall", "she",
177                    "should", "shouldn't", "since", "six", "so", "some", "somebody",
178                    "somehow", "someone", "something", "sometime", "sometimes",
179                    "somewhat", "somewhere", "soon", "sorry", "specified", "specify",
180                    "specifying", "still", "sub", "such", "sup", "sure", "t", "t's",
181                    "take", "taken", "tell", "tends", "th", "than", "thank", "thanks",
182                    "thanx", "that", "that's", "thats", "the", "their", "theirs",
183                    "them", "themselves", "then", "thence", "there", "there's",
184                    "thereafter", "thereby", "therefore", "therein", "theres",
185                    "thereupon", "these", "they", "they'd", "they'll", "they're",
186                    "they've", "think", "third", "this", "thorough", "thoroughly",
187                    "those", "though", "three", "through", "throughout", "thru",
188                    "thus", "to", "together", "too", "took", "toward", "towards",
189                    "tried", "tries", "truly", "try", "trying", "twice", "two", "u",
190                    "un", "under", "unfortunately", "unless", "unlikely", "until",
191                    "unto", "up", "upon", "us", "use", "used", "useful", "uses",
192                    "using", "usually", "uucp", "v", "value", "various", "very", "via",
193                    "viz", "vs", "w", "want", "wants", "was", "wasn't", "way", "we",
194                    "we'd", "we'll", "we're", "we've", "welcome", "well", "went",
195                    "were", "weren't", "what", "what's", "whatever", "when", "whence",
196                    "whenever", "where", "where's", "whereafter", "whereas", "whereby",
197                    "wherein", "whereupon", "wherever", "whether", "which", "while",
198                    "whither", "who", "who's", "whoever", "whole", "whom", "whose",
199                    "why", "will", "willing", "wish", "with", "within", "without",
200                    "won't", "wonder", "would", "wouldn't", "x", "y", "yes", "yet",
201                    "you", "you'd", "you'll", "you're", "you've", "your", "yours",
202                    "yourself", "yourselves", "z", "zero", "albeit", "author", "av",
203                    "canst", "cf", "cfrd", "choose", "conducted", "considered",
204                    "contrariwise", "cos", "crd", "cu", "day", "describes", "designed",
205                    "determine", "determined", "discussed", "dost", "doth", "double",
206                    "dual", "due", "excepted", "excepting", "exception", "exclude",
207                    "excluding", "exclusive", "farther", "farthest", "ff", "forward",
208                    "found", "front", "furthest", "general", "halves", "hast", "hath",
209                    "henceforth", "hereabouts", "hereto", "hindmost", "hitherto",
210                    "howsoever", "I", "include", "included", "including", "indoors",
211                    "inside", "insomuch", "investigated", "inwards", "kind", "kg",
212                    "km", "made", "meantime", "mr", "mrs", "ms", "nonetheless", "nope",
213                    "notwithstandi", "ng", "nowadays", "obtained", "performance",
214                    "performed", "plenty", "present", "presented", "presents",
215                    "provide", "provided", "related", "report", "required", "results",
216                    "round", "sake", "sang", "save", "seldom", "selected", "sfrd",
217                    "shalt", "shown", "sideways", "significant", "slept", "slew",
218                    "slung", "slunk", "smote", "spake", "spat", "spoke", "spoken",
219                    "sprang", "sprung", "srd", "stave", "staves", "studies",
220                    "supposing", "tested", "thee", "thenceforth", "thereabout",
221                    "thereabouts", "thereof", "thereon", "thereto", "thou", "thrice",
222                    "thy", "thyself", "till", "types", "unable", "underneath",
223                    "unlike", "upward", "upwards", "week", "whatsoever", "whensoever",
224                    "whereabouts", "whereat", "wherefore", "wherefrom", "whereinto",
225                    "whereof", "whereon", "wheresoever", "whereto", "whereunto",
226                    "wherewith", "whew", "whichever", "whichsoevr", "whilst", "whoa",
227                    "whomever", "whomsoever", "whosoever", "wilt", "worse", "worst",
228                    "wow", "ye", "year", "yippee",
229                    //Also include puntuation
230                    ",", ";", ".", ":", "_", "{", "}", "[", "]", "+", "*", "¡", "¿", "?", "=", ")", "(", "/", "&", "%", "$", "·"
231                    };
232    
233        static Set<String> stopWordSet = new HashSet<String>(java.util.Arrays.asList(stopWords));
234    
235    }