001 /** 002 * GatePOStagger.java 003 * jCOLIBRI2 framework. 004 * @author Juan A. Recio-García. 005 * GAIA - Group for Artificial Intelligence Applications 006 * http://gaia.fdi.ucm.es 007 * 20/06/2007 008 */ 009 package jcolibri.extensions.textual.IE.gate; 010 011 import java.util.Collection; 012 013 import gate.Annotation; 014 import gate.AnnotationSet; 015 import gate.Factory; 016 import gate.creole.POSTagger; 017 import jcolibri.cbrcore.Attribute; 018 import jcolibri.cbrcore.CBRCase; 019 import jcolibri.cbrcore.CBRQuery; 020 import jcolibri.extensions.textual.IE.IEutils; 021 import jcolibri.extensions.textual.IE.representation.IEText; 022 import jcolibri.extensions.textual.IE.representation.Token; 023 import jcolibri.util.AttributeUtils; 024 import jcolibri.util.ProgressController; 025 026 /** 027 * Performs the POS tagging using the GATE algorithm. 028 * <br> 029 * Part-Of-Speech tags (the original GATE set): 030 * <ul> 031 * <li>CC - coordinating conjunction: ”and”, ”but”, ”nor”, ”or”, ”yet”, plus, 032 * minus, less, times (multiplication), over (division). Also ”for” (because) 033 * and ”so” (i.e., ”so that”). 034 * <li>CD - cardinal number 035 * <li>DT - determiner: Articles including ”a”, ”an”, ”every”, ”no”, ”the”, 036 * ”another”, ”any”, ”some”, ”those”. 037 * <li>EX - existential there: Unstressed ”there” that triggers inversion of 038 * the inflected verb and the logical subject; ”There was a party in progress”. 039 * <li>FW - foreign word 040 * <li>IN - preposition or subordinating conjunction 041 * <li>JJ - adjective: Hyphenated compounds that are used as modifiers; 042 * happy-go-lucky. 043 * <li>JJR - adjective - comparative: Adjectives with the comparative ending 044 * ”-er” and a comparative meaning. Sometimes ”more” and ”less”. 045 * <li>JJS - adjective - superlative: Adjectives with the superlative ending 046 * ”-est” (and ”worst”). Sometimes ”most”and ”least”. 047 * <li>JJSS - -unknown-, but probably a variant of JJS 048 * <li>-LRB- - -unknown- 049 * <li>LS - list item marker: Numbers and letters used as identifiers of items 050 * in a list. 051 * <li>MD - modal: All verbs that don’t take an ”-s” ending in the third person 052 * singular present: ”can”, ”could”, ”dare”, ”may”, ”might”, ”must”, ”ought”, 053 * ”shall”, ”should”, ”will”, ”would”. 054 * <li>NN - noun - singular or mass 055 * <li>NNP - proper noun - singular: All words in names usually are capitalized 056 * but titles might not be. 057 * <li>NNPS - proper noun - plural: All words in names usually are capitalized 058 * but titles might not be. 059 * <li>NNS - noun - plural 060 * <li>NP - proper noun - singular 061 * <li>ML Configuration 283 062 * <li>NPS - proper noun - plural 063 * <li>PDT - predeterminer: Determinerlike elements preceding an article or 064 * possessive pronoun; 065 * <li>”all/PDT his marbles”, ”quite/PDT a mess”. 066 * <li>POS - possesive ending: Nouns ending in ”’s” or ”’”. 067 * <li>PP - personal pronoun 068 * <li>PRPR$ - unknown-, but probably possessive pronoun 069 * <li>PRP - unknown-, but probably possessive pronoun 070 * <li>PRP$ - unknown, but probably possessive pronoun,such as ”my”, ”your”, 071 * ”his”, ”his”, ”its”, ”one’s”, ”our”, and ”their”. 072 * <li>RB - adverb: most words ending in ”-ly”. Also ”quite”, ”too”, ”very”, 073 * ”enough”, ”indeed”, ”not”, ”-n’t”, and ”never”. 074 * <li>RBR - adverb - comparative: adverbs ending with ”-er” with a comparative 075 * meaning. 076 * <li>RBS - adverb - superlative 077 * <li>RP - particle: Mostly monosyllabic words that also double as directional 078 * adverbs. 079 * <li>STAART - start state marker (used internally) 080 * <li>SYM - symbol: technical symbols or expressions that aren’t English 081 * words. 082 * <li>TO - literal to 083 * <li>UH - interjection: Such as ”my”, ”oh”, ”please”, ”uh”, ”well”, ”yes”. 084 * <li>VBD - verb - past tense: includes conditional form of the verb ”to be”; 085 * ”If I were/VBD rich...”. 086 * <li>VBG - verb - gerund or present participle 087 * <li>VBN - verb - past participle 088 * <li>VBP - verb - non-3rd person singular present 089 * <li>VB - verb - base form: subsumes imperatives, infinitives and 090 * subjunctives. 091 * <li>VBZ - verb - 3rd person singular present 092 * <li>WDT - wh-determiner 093 * <li>WP$ - possesive wh-pronoun: includes ”whose” 094 * <li>WP - wh-pronoun: includes ”what”, ”who”, and ”whom”. 095 * <li>WRB - wh-adverb: includes ”how”, ”where”, ”why”. Includes ”when” when 096 * used in a temporal sense. 097 * <li>:: - literal colon 098 * <li>, - literal comma 099 * <li>$ - literal dollar sign 100 * <li>- - literal double-dash 101 * <li>- literal double quotes 102 * <li>- literal grave 103 * <li>( - literal left parenthesis 104 * <li>. - literal period 105 * <li># - literal pound sign 106 * <li>) - literal right parenthesis 107 * <li>- literal single quote or apostrophe 108 * </ul> 109 * 110 * @author Juan A. Recio-Garcia 111 * @version 1.0 112 * 113 */ 114 public class GatePOStagger 115 { 116 117 /** 118 * Performs the algorithm in the given attributes of a collection of cases. 119 * These attributes must be IETextGate objects. 120 */ 121 public static void tag(Collection<CBRCase> cases, Collection<Attribute> attributes) 122 { 123 org.apache.commons.logging.LogFactory.getLog(GatePOStagger.class).info("Gate POS tagging..."); 124 ProgressController.init(GatePOStagger.class, "Gate POS tagging...", cases.size()); 125 for (CBRCase c : cases) 126 { 127 for (Attribute a : attributes) 128 { 129 Object o = AttributeUtils.findValue(a, c); 130 if (o instanceof IETextGate) 131 tag((IETextGate) o); 132 } 133 ProgressController.step(GatePOStagger.class); 134 } 135 ProgressController.finish(GatePOStagger.class); 136 } 137 138 /** 139 * Performs the algorithm in the given attributes of a query. 140 * These attributes must be IETextGate objects. 141 */ 142 public static void tag(CBRQuery query, Collection<Attribute> attributes) 143 { 144 org.apache.commons.logging.LogFactory.getLog(GatePOStagger.class).info("Gate POS tagging..."); 145 for (Attribute a : attributes) 146 { 147 Object o = AttributeUtils.findValue(a, query); 148 if (o instanceof IETextGate) 149 tag((IETextGate) o); 150 } 151 } 152 153 /** 154 * Performs the algorithm in all the IETextGate typed attributes of a collection of cases. 155 */ 156 public static void tag(Collection<CBRCase> cases) 157 { 158 org.apache.commons.logging.LogFactory.getLog(GatePOStagger.class).info("Gate POS tagging..."); 159 ProgressController.init(GatePOStagger.class, "Gate POS tagging...", cases.size()); 160 for (CBRCase c : cases) 161 { 162 Collection<IEText> texts = IEutils.getTexts(c); 163 for (IEText t : texts) 164 if (t instanceof IETextGate) 165 tag((IETextGate) t); 166 ProgressController.step(GatePOStagger.class); 167 } 168 ProgressController.finish(GatePOStagger.class); 169 } 170 171 /** 172 * Performs the algorithm in all the IETextGate typed attributes of a query. 173 */ 174 public static void tag(CBRQuery query) 175 { 176 org.apache.commons.logging.LogFactory.getLog(GatePOStagger.class).info("Gate POS tagging..."); 177 Collection<IEText> texts = IEutils.getTexts(query); 178 for (IEText t : texts) 179 if (t instanceof IETextGate) 180 tag((IETextGate) t); 181 } 182 183 /** 184 * Performs the algorithm in a given IETextGate object 185 */ 186 public static void tag(IETextGate text) 187 { 188 try 189 { 190 POSTagger tagger = getTokeniser(); 191 tagger.setDocument(text.getDocument()); 192 tagger.execute(); 193 194 AnnotationSet posAnnotations = text.getDocument().getAnnotations() 195 .get("POS"); 196 197 for (Token t : text.getAllTokens()) 198 { 199 Annotation anotToken = text.getTokenMapping(t); 200 AnnotationSet posAnnots = posAnnotations.get(anotToken 201 .getStartNode().getOffset(), anotToken.getEndNode() 202 .getOffset()); 203 Annotation anot = (Annotation) posAnnots.iterator().next(); 204 String postag = (String) anot.getFeatures().get("category"); 205 t.setPostag(postag); 206 } 207 208 } catch (Exception e) 209 { 210 org.apache.commons.logging.LogFactory.getLog(GatePOStagger.class) 211 .error(e); 212 213 } 214 } 215 216 private static POSTagger tagger = null; 217 218 private static POSTagger getTokeniser() throws Exception 219 { 220 if (tagger == null) 221 { 222 tagger = (POSTagger) Factory 223 .createResource("gate.creole.POSTagger"); 224 tagger.setBaseSentenceAnnotationType("Sentence"); 225 tagger.setBaseTokenAnnotationType("Token"); 226 tagger.setOutputAnnotationType("POS"); 227 tagger.init(); 228 } 229 return tagger; 230 } 231 }