001 /** 002 * StopWordsDetector.java 003 * jCOLIBRI2 framework. 004 * @author Juan A. Recio-García. 005 * GAIA - Group for Artificial Intelligence Applications 006 * http://gaia.fdi.ucm.es 007 * 20/06/2007 008 */ 009 package jcolibri.extensions.textual.IE.common; 010 011 import java.util.Collection; 012 import java.util.HashSet; 013 import java.util.Set; 014 015 import jcolibri.cbrcore.Attribute; 016 import jcolibri.cbrcore.CBRCase; 017 import jcolibri.cbrcore.CBRQuery; 018 import jcolibri.extensions.textual.IE.IEutils; 019 import jcolibri.extensions.textual.IE.gate.GatePhrasesExtractor; 020 import jcolibri.extensions.textual.IE.representation.IEText; 021 import jcolibri.extensions.textual.IE.representation.Token; 022 import jcolibri.util.AttributeUtils; 023 import jcolibri.util.ProgressController; 024 025 026 /** 027 * Removes stop words (workds without relevant meaning) and punctuation symbols. 028 * It uses a built-in list and modifies the "isStopWord" flag of the tokens. 029 * <p> 030 * The first version was developed at: Robert Gordon University - Aberdeen & Facultad Informática, 031 * Universidad Complutense de Madrid (GAIA) 032 * </p> 033 * @author Juan A. Recio-Garcia 034 * @version 2.0 035 */ 036 public class StopWordsDetector 037 { 038 039 /** 040 * Performs the algorithm in the given attributes of a collection of cases. 041 * These attributes must be IEText objects. 042 */ 043 public static void detectStopWords(Collection<CBRCase> cases, Collection<Attribute> attributes) 044 { 045 org.apache.commons.logging.LogFactory.getLog(StopWordsDetector.class).info("Detecting stop words."); 046 ProgressController.init(StopWordsDetector.class, "Detecting stop words ...", cases.size()); 047 for(CBRCase c: cases) 048 { 049 for(Attribute a: attributes) 050 { 051 Object o = AttributeUtils.findValue(a, c); 052 detectStopWords((IEText)o); 053 } 054 ProgressController.step(GatePhrasesExtractor.class); 055 } 056 ProgressController.finish(GatePhrasesExtractor.class); 057 } 058 059 /** 060 * Performs the algorithm in the given attributes of a query. 061 * These attributes must be IEText objects. 062 */ 063 public static void detectStopWords(CBRQuery query, Collection<Attribute> attributes) 064 { 065 org.apache.commons.logging.LogFactory.getLog(StopWordsDetector.class).info("Detecting stop words."); 066 for(Attribute a: attributes) 067 { 068 Object o = AttributeUtils.findValue(a, query); 069 detectStopWords((IEText)o); 070 } 071 } 072 073 /** 074 * Performs the algorithm in all the attributes of a collection of cases 075 * These attributes must be IEText objects. 076 */ 077 public static void detectStopWords(Collection<CBRCase> cases) 078 { 079 org.apache.commons.logging.LogFactory.getLog(StopWordsDetector.class).info("Detecting stop words."); 080 ProgressController.init(StopWordsDetector.class, "Detecting stop words ...", cases.size()); 081 for(CBRCase c: cases) 082 { 083 Collection<IEText> texts = IEutils.getTexts(c); 084 for(IEText t : texts) 085 detectStopWords(t); 086 ProgressController.step(GatePhrasesExtractor.class); 087 } 088 ProgressController.finish(GatePhrasesExtractor.class); 089 } 090 091 /** 092 * Performs the algorithm in all the attributes of a query 093 * These attributes must be IEText objects. 094 */ 095 public static void detectStopWords(CBRQuery query) 096 { 097 org.apache.commons.logging.LogFactory.getLog(StopWordsDetector.class).info("Detecting stop words."); 098 Collection<IEText> texts = IEutils.getTexts(query); 099 for(IEText t : texts) 100 detectStopWords(t); 101 } 102 103 /** 104 * Performs the algorithm in a given IEText object 105 */ 106 public static void detectStopWords(IEText text) 107 { 108 for(Token t: text.getAllTokens()) 109 { 110 String word = t.getRawContent().toLowerCase(); 111 if(stopWordSet.contains(word)) 112 t.setStopWord(true); 113 } 114 } 115 116 /** 117 * Stop words list 118 */ 119 static String[] stopWords = { "a", "a's", "able", "about", "above", 120 "according", "accordingly", "across", "actually", "after", 121 "afterwards", "again", "against", "ain't", "all", "allow", 122 "allows", "almost", "alone", "along", "already", "also", 123 "although", "always", "am", "among", "amongst", "an", "and", 124 "another", "any", "anybody", "anyhow", "anyone", "anything", 125 "anyway", "anyways", "anywhere", "apart", "appear", "appreciate", 126 "appropriate", "are", "aren't", "around", "as", "aside", "ask", 127 "asking", "associated", "at", "available", "away", "awfully", "b", 128 "be", "became", "because", "become", "becomes", "becoming", "been", 129 "before", "beforehand", "behind", "being", "believe", "below", 130 "beside", "besides", "best", "better", "between", "beyond", "both", 131 "brief", "but", "by", "c", "c'mon", "c's", "came", "can", "can't", 132 "cannot", "cant", "cause", "causes", "certain", "certainly", 133 "changes", "clearly", "co", "com", "come", "comes", "concerning", 134 "consequently", "consider", "considering", "contain", "containing", 135 "contains", "corresponding", "could", "couldn't", "course", 136 "currently", "d", "definitely", "described", "despite", "did", 137 "didn't", "different", "do", "does", "doesn't", "doing", "don't", 138 "done", "down", "downwards", "during", "e", "each", "edu", "eg", 139 "eight", "either", "else", "elsewhere", "enough", "entirely", 140 "especially", "et", "etc", "even", "ever", "every", "everybody", 141 "everyone", "everything", "everywhere", "ex", "exactly", "example", 142 "except", "f", "far", "few", "fifth", "first", "five", "followed", 143 "following", "follows", "for", "former", "formerly", "forth", 144 "four", "from", "further", "furthermore", "g", "get", "gets", 145 "getting", "given", "gives", "go", "goes", "going", "gone", "got", 146 "gotten", "greetings", "h", "had", "hadn't", "happens", "hardly", 147 "has", "hasn't", "have", "haven't", "having", "he", "he's", 148 "hello", "help", "hence", "her", "here", "here's", "hereafter", 149 "hereby", "herein", "hereupon", "hers", "herself", "hi", "him", 150 "himself", "his", "hither", "hopefully", "how", "howbeit", 151 "however", "i", "i'd", "i'll", "i'm", "i've", "ie", "if", 152 "ignored", "immediate", "in", "inasmuch", "inc", "indeed", 153 "indicate", "indicated", "indicates", "inner", "insofar", 154 "instead", "into", "inward", "is", "isn't", "it", "it'd", "it'll", 155 "it's", "its", "itself", "j", "just", "k", "keep", "keeps", "kept", 156 "know", "knows", "known", "l", "last", "lately", "later", "latter", 157 "latterly", "least", "less", "lest", "let", "let's", "like", 158 "liked", "likely", "little", "look", "looking", "looks", "ltd", 159 "m", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile", 160 "merely", "might", "more", "moreover", "most", "mostly", "much", 161 "must", "my", "myself", "n", "name", "namely", "nd", "near", 162 "nearly", "necessary", "need", "needs", "neither", "never", 163 "nevertheless", "new", "next", "nine", "no", "nobody", "non", 164 "none", "noone", "nor", "normally", "not", "nothing", "novel", 165 "now", "nowhere", "o", "obviously", "of", "off", "often", "oh", 166 "ok", "okay", "old", "on", "once", "one", "ones", "only", "onto", 167 "or", "other", "others", "otherwise", "ought", "our", "ours", 168 "ourselves", "out", "outside", "over", "overall", "own", "p", 169 "particular", "particularly", "per", "perhaps", "placed", "please", 170 "plus", "possible", "presumably", "probably", "provides", "q", 171 "que", "quite", "qv", "r", "rather", "rd", "re", "really", 172 "reasonably", "regarding", "regardless", "regards", "relatively", 173 "respectively", "right", "s", "said", "same", "saw", "say", 174 "saying", "says", "second", "secondly", "see", "seeing", "seem", 175 "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", 176 "sent", "serious", "seriously", "seven", "several", "shall", "she", 177 "should", "shouldn't", "since", "six", "so", "some", "somebody", 178 "somehow", "someone", "something", "sometime", "sometimes", 179 "somewhat", "somewhere", "soon", "sorry", "specified", "specify", 180 "specifying", "still", "sub", "such", "sup", "sure", "t", "t's", 181 "take", "taken", "tell", "tends", "th", "than", "thank", "thanks", 182 "thanx", "that", "that's", "thats", "the", "their", "theirs", 183 "them", "themselves", "then", "thence", "there", "there's", 184 "thereafter", "thereby", "therefore", "therein", "theres", 185 "thereupon", "these", "they", "they'd", "they'll", "they're", 186 "they've", "think", "third", "this", "thorough", "thoroughly", 187 "those", "though", "three", "through", "throughout", "thru", 188 "thus", "to", "together", "too", "took", "toward", "towards", 189 "tried", "tries", "truly", "try", "trying", "twice", "two", "u", 190 "un", "under", "unfortunately", "unless", "unlikely", "until", 191 "unto", "up", "upon", "us", "use", "used", "useful", "uses", 192 "using", "usually", "uucp", "v", "value", "various", "very", "via", 193 "viz", "vs", "w", "want", "wants", "was", "wasn't", "way", "we", 194 "we'd", "we'll", "we're", "we've", "welcome", "well", "went", 195 "were", "weren't", "what", "what's", "whatever", "when", "whence", 196 "whenever", "where", "where's", "whereafter", "whereas", "whereby", 197 "wherein", "whereupon", "wherever", "whether", "which", "while", 198 "whither", "who", "who's", "whoever", "whole", "whom", "whose", 199 "why", "will", "willing", "wish", "with", "within", "without", 200 "won't", "wonder", "would", "wouldn't", "x", "y", "yes", "yet", 201 "you", "you'd", "you'll", "you're", "you've", "your", "yours", 202 "yourself", "yourselves", "z", "zero", "albeit", "author", "av", 203 "canst", "cf", "cfrd", "choose", "conducted", "considered", 204 "contrariwise", "cos", "crd", "cu", "day", "describes", "designed", 205 "determine", "determined", "discussed", "dost", "doth", "double", 206 "dual", "due", "excepted", "excepting", "exception", "exclude", 207 "excluding", "exclusive", "farther", "farthest", "ff", "forward", 208 "found", "front", "furthest", "general", "halves", "hast", "hath", 209 "henceforth", "hereabouts", "hereto", "hindmost", "hitherto", 210 "howsoever", "I", "include", "included", "including", "indoors", 211 "inside", "insomuch", "investigated", "inwards", "kind", "kg", 212 "km", "made", "meantime", "mr", "mrs", "ms", "nonetheless", "nope", 213 "notwithstandi", "ng", "nowadays", "obtained", "performance", 214 "performed", "plenty", "present", "presented", "presents", 215 "provide", "provided", "related", "report", "required", "results", 216 "round", "sake", "sang", "save", "seldom", "selected", "sfrd", 217 "shalt", "shown", "sideways", "significant", "slept", "slew", 218 "slung", "slunk", "smote", "spake", "spat", "spoke", "spoken", 219 "sprang", "sprung", "srd", "stave", "staves", "studies", 220 "supposing", "tested", "thee", "thenceforth", "thereabout", 221 "thereabouts", "thereof", "thereon", "thereto", "thou", "thrice", 222 "thy", "thyself", "till", "types", "unable", "underneath", 223 "unlike", "upward", "upwards", "week", "whatsoever", "whensoever", 224 "whereabouts", "whereat", "wherefore", "wherefrom", "whereinto", 225 "whereof", "whereon", "wheresoever", "whereto", "whereunto", 226 "wherewith", "whew", "whichever", "whichsoevr", "whilst", "whoa", 227 "whomever", "whomsoever", "whosoever", "wilt", "worse", "worst", 228 "wow", "ye", "year", "yippee", 229 //Also include puntuation 230 ",", ";", ".", ":", "_", "{", "}", "[", "]", "+", "*", "¡", "¿", "?", "=", ")", "(", "/", "&", "%", "$", "·" 231 }; 232 233 static Set<String> stopWordSet = new HashSet<String>(java.util.Arrays.asList(stopWords)); 234 235 }