001 /** 002 * OpennlpPOStagger.java 003 * jCOLIBRI2 framework. 004 * @author Juan A. Recio-García. 005 * GAIA - Group for Artificial Intelligence Applications 006 * http://gaia.fdi.ucm.es 007 * 20/06/2007 008 */ 009 package jcolibri.extensions.textual.IE.opennlp; 010 011 import java.util.Collection; 012 013 import jcolibri.cbrcore.Attribute; 014 import jcolibri.cbrcore.CBRCase; 015 import jcolibri.cbrcore.CBRQuery; 016 import jcolibri.extensions.textual.IE.IEutils; 017 import jcolibri.extensions.textual.IE.representation.IEText; 018 import jcolibri.extensions.textual.IE.representation.Token; 019 import jcolibri.util.AttributeUtils; 020 import jcolibri.util.ProgressController; 021 import opennlp.grok.preprocess.postag.EnglishPOSTaggerME; 022 023 import org.jdom.Element; 024 025 /** 026 * Performs the POS tagging using a OpenNLP Maximum Entropy algorithm. This algorithm uses the same tags than GATE. 027 * <br> 028 * Part-Of-Speech tags (the original GATE set): 029 * <ul> 030 * <li>CC - coordinating conjunction: ”and”, ”but”, ”nor”, ”or”, ”yet”, plus, minus, less, times (multiplication), over (division). Also ”for” (because) and ”so” (i.e., ”so that”). 031 * <li>CD - cardinal number 032 * <li>DT - determiner: Articles including ”a”, ”an”, ”every”, ”no”, ”the”, ”another”, ”any”, ”some”, ”those”. 033 * <li>EX - existential there: Unstressed ”there” that triggers inversion of the inflected verb and the logical subject; ”There was a party in progress”. 034 * <li>FW - foreign word 035 * <li>IN - preposition or subordinating conjunction 036 * <li>JJ - adjective: Hyphenated compounds that are used as modifiers; happy-go-lucky. 037 * <li>JJR - adjective - comparative: Adjectives with the comparative ending ”-er” and a comparative meaning. Sometimes ”more” and ”less”. 038 * <li>JJS - adjective - superlative: Adjectives with the superlative ending ”-est” (and ”worst”). Sometimes ”most”and ”least”. 039 * <li>JJSS - -unknown-, but probably a variant of JJS 040 * <li>-LRB- - -unknown- 041 * <li>LS - list item marker: Numbers and letters used as identifiers of items in a list. 042 * <li>MD - modal: All verbs that don’t take an ”-s” ending in the third person singular present: ”can”, ”could”, ”dare”, ”may”, ”might”, ”must”, ”ought”, ”shall”, ”should”, ”will”, ”would”. 043 * <li>NN - noun - singular or mass 044 * <li>NNP - proper noun - singular: All words in names usually are capitalized but titles might not be. 045 * <li>NNPS - proper noun - plural: All words in names usually are capitalized but titles might not be. 046 * <li>NNS - noun - plural 047 * <li>NP - proper noun - singular 048 * <li>ML Configuration 283 049 * <li>NPS - proper noun - plural 050 * <li>PDT - predeterminer: Determinerlike elements preceding an article or possessive pronoun; 051 * <li>”all/PDT his marbles”, ”quite/PDT a mess”. 052 * <li>POS - possesive ending: Nouns ending in ”’s” or ”’”. 053 * <li>PP - personal pronoun 054 * <li>PRPR$ - unknown-, but probably possessive pronoun 055 * <li>PRP - unknown-, but probably possessive pronoun 056 * <li>PRP$ - unknown, but probably possessive pronoun,such as ”my”, ”your”, ”his”, ”his”, ”its”, ”one’s”, ”our”, and ”their”. 057 * <li>RB - adverb: most words ending in ”-ly”. Also ”quite”, ”too”, ”very”, ”enough”, ”indeed”, ”not”, ”-n’t”, and ”never”. 058 * <li>RBR - adverb - comparative: adverbs ending with ”-er” with a comparative meaning. 059 * <li>RBS - adverb - superlative 060 * <li>RP - particle: Mostly monosyllabic words that also double as directional adverbs. 061 * <li>STAART - start state marker (used internally) 062 * <li>SYM - symbol: technical symbols or expressions that aren’t English words. 063 * <li>TO - literal to 064 * <li>UH - interjection: Such as ”my”, ”oh”, ”please”, ”uh”, ”well”, ”yes”. 065 * <li>VBD - verb - past tense: includes conditional form of the verb ”to be”; ”If I were/VBD rich...”. 066 * <li>VBG - verb - gerund or present participle 067 * <li>VBN - verb - past participle 068 * <li>VBP - verb - non-3rd person singular present 069 * <li>VB - verb - base form: subsumes imperatives, infinitives and subjunctives. 070 * <li>VBZ - verb - 3rd person singular present 071 * <li>WDT - wh-determiner 072 * <li>WP$ - possesive wh-pronoun: includes ”whose” 073 * <li>WP - wh-pronoun: includes ”what”, ”who”, and ”whom”. 074 * <li>WRB - wh-adverb: includes ”how”, ”where”, ”why”. Includes ”when” when used in a temporal sense. 075 * <li>:: - literal colon 076 * <li>, - literal comma 077 * <li>$ - literal dollar sign 078 * <li>- - literal double-dash 079 * <li>- literal double quotes 080 * <li>- literal grave 081 * <li>( - literal left parenthesis 082 * <li>. - literal period 083 * <li># - literal pound sign 084 * <li>) - literal right parenthesis 085 * <li>- literal single quote or apostrophe 086 * </ul> 087 * 088 * @author Juan A. Recio-Garcia 089 * @version 2.0 090 * 091 */ 092 public class OpennlpPOStagger 093 { 094 095 /** 096 * Performs the algorithm in the given attributes of a collection of cases. 097 * These attributes must be IETextOpenNLP objects. 098 */ 099 public static void tag(Collection<CBRCase> cases, Collection<Attribute> attributes) 100 { 101 org.apache.commons.logging.LogFactory.getLog(OpennlpPOStagger.class).info("OpenNLP POS tagging."); 102 ProgressController.init(OpennlpPOStagger.class, "OpenNLP POS tagging", cases.size()); 103 for(CBRCase c: cases) 104 { 105 for(Attribute a: attributes) 106 { 107 Object o = AttributeUtils.findValue(a, c); 108 if(o instanceof IETextOpenNLP) 109 tag((IETextOpenNLP)o); 110 } 111 ProgressController.step(OpennlpPOStagger.class); 112 } 113 ProgressController.finish(OpennlpPOStagger.class); 114 } 115 116 /** 117 * Performs the algorithm in the given attributes of a query. 118 * These attributes must be IETextOpenNLP objects. 119 */ 120 public static void tag(CBRQuery query, Collection<Attribute> attributes) 121 { 122 org.apache.commons.logging.LogFactory.getLog(OpennlpPOStagger.class).info("OpenNLP POS tagging."); 123 for(Attribute a: attributes) 124 { 125 Object o = AttributeUtils.findValue(a, query); 126 if(o instanceof IETextOpenNLP) 127 tag((IETextOpenNLP)o); 128 } 129 } 130 131 /** 132 * Performs the algorithm in all the IETextOpenNLP typed attributes of a collection of cases. 133 */ 134 public static void tag(Collection<CBRCase> cases) 135 { 136 org.apache.commons.logging.LogFactory.getLog(OpennlpPOStagger.class).info("OpenNLP POS tagging."); 137 ProgressController.init(OpennlpPOStagger.class, "OpenNLP POS tagging", cases.size()); 138 for(CBRCase c: cases) 139 { 140 Collection<IEText> texts = IEutils.getTexts(c); 141 for(IEText t : texts) 142 if(t instanceof IETextOpenNLP) 143 tag((IETextOpenNLP)t); 144 ProgressController.step(OpennlpPOStagger.class); 145 } 146 ProgressController.finish(OpennlpPOStagger.class); 147 } 148 149 /** 150 * Performs the algorithm in all the IETextOpenNLP typed attributes of a query. 151 */ 152 public static void tag(CBRQuery query) 153 { 154 org.apache.commons.logging.LogFactory.getLog(OpennlpPOStagger.class).info("OpenNLP POS tagging."); 155 Collection<IEText> texts = IEutils.getTexts(query); 156 for(IEText t : texts) 157 if(t instanceof IETextOpenNLP) 158 tag((IETextOpenNLP)t); 159 } 160 161 /** 162 * Performs the algorithm in a given IETextOpenNLP object 163 */ 164 public static void tag(IETextOpenNLP text) 165 { 166 EnglishPOSTaggerME tagger = getSentenceDetector(); 167 168 tagger.process(text.getDocument()); 169 170 for(Token t: text.getAllTokens()) 171 { 172 Element elem = text.getTokenMapping(t); 173 Element word = elem.getChild("w"); 174 String posTag = word.getAttributeValue("pos"); 175 t.setPostag(posTag); 176 } 177 } 178 179 180 181 182 private static EnglishPOSTaggerME englishPOStagger = null; 183 private static EnglishPOSTaggerME getSentenceDetector() 184 { 185 if(englishPOStagger == null) 186 englishPOStagger = new EnglishPOSTaggerME(); 187 return englishPOStagger; 188 } 189 }