001    /**
002     * OpennlpPOStagger.java
003     * jCOLIBRI2 framework. 
004     * @author Juan A. Recio-García.
005     * GAIA - Group for Artificial Intelligence Applications
006     * http://gaia.fdi.ucm.es
007     * 20/06/2007
008     */
009    package jcolibri.extensions.textual.IE.opennlp;
010    
011    import java.util.Collection;
012    
013    import jcolibri.cbrcore.Attribute;
014    import jcolibri.cbrcore.CBRCase;
015    import jcolibri.cbrcore.CBRQuery;
016    import jcolibri.extensions.textual.IE.IEutils;
017    import jcolibri.extensions.textual.IE.representation.IEText;
018    import jcolibri.extensions.textual.IE.representation.Token;
019    import jcolibri.util.AttributeUtils;
020    import jcolibri.util.ProgressController;
021    import opennlp.grok.preprocess.postag.EnglishPOSTaggerME;
022    
023    import org.jdom.Element;
024    
025    /**
026     * Performs the POS tagging using a OpenNLP Maximum Entropy algorithm. This algorithm uses the same tags than GATE.
027     * <br>
028     * Part-Of-Speech tags (the original GATE set):
029     * <ul>
030     * <li>CC - coordinating conjunction: ”and”, ”but”, ”nor”, ”or”, ”yet”, plus, minus, less, times (multiplication), over (division). Also ”for” (because) and ”so” (i.e., ”so that”).
031     * <li>CD - cardinal number
032     * <li>DT - determiner: Articles including ”a”, ”an”, ”every”, ”no”, ”the”, ”another”, ”any”, ”some”, ”those”.
033     * <li>EX - existential there: Unstressed ”there” that triggers inversion of the inflected verb and the logical subject; ”There was a party in progress”.
034     * <li>FW - foreign word
035     * <li>IN - preposition or subordinating conjunction
036     * <li>JJ - adjective: Hyphenated compounds that are used as modifiers; happy-go-lucky.
037     * <li>JJR - adjective - comparative: Adjectives with the comparative ending ”-er” and a comparative meaning. Sometimes ”more” and ”less”.
038     * <li>JJS - adjective - superlative: Adjectives with the superlative ending ”-est” (and ”worst”). Sometimes ”most”and ”least”.
039     * <li>JJSS - -unknown-, but probably a variant of JJS
040     * <li>-LRB- - -unknown-
041     * <li>LS - list item marker: Numbers and letters used as identifiers of items in a list.
042     * <li>MD - modal: All verbs that don’t take an ”-s” ending in the third person singular present: ”can”, ”could”, ”dare”, ”may”, ”might”, ”must”, ”ought”, ”shall”, ”should”, ”will”, ”would”.
043     * <li>NN - noun - singular or mass
044     * <li>NNP - proper noun - singular: All words in names usually are capitalized but titles might not be.
045     * <li>NNPS - proper noun - plural: All words in names usually are capitalized but titles might not be.
046     * <li>NNS - noun - plural
047     * <li>NP - proper noun - singular
048     * <li>ML Configuration 283
049     * <li>NPS - proper noun - plural
050     * <li>PDT - predeterminer: Determinerlike elements preceding an article or possessive pronoun;
051     * <li>”all/PDT his marbles”, ”quite/PDT a mess”.
052     * <li>POS - possesive ending: Nouns ending in ”’s” or ”’”.
053     * <li>PP - personal pronoun
054     * <li>PRPR$ - unknown-, but probably possessive pronoun
055     * <li>PRP - unknown-, but probably possessive pronoun
056     * <li>PRP$ - unknown, but probably possessive pronoun,such as ”my”, ”your”, ”his”, ”his”, ”its”, ”one’s”, ”our”, and ”their”.
057     * <li>RB - adverb: most words ending in ”-ly”. Also ”quite”, ”too”, ”very”, ”enough”, ”indeed”, ”not”, ”-n’t”, and ”never”.
058     * <li>RBR - adverb - comparative: adverbs ending with ”-er” with a comparative meaning.
059     * <li>RBS - adverb - superlative
060     * <li>RP - particle: Mostly monosyllabic words that also double as directional adverbs.
061     * <li>STAART - start state marker (used internally)
062     * <li>SYM - symbol: technical symbols or expressions that aren’t English words.
063     * <li>TO - literal to
064     * <li>UH - interjection: Such as ”my”, ”oh”, ”please”, ”uh”, ”well”, ”yes”.
065     * <li>VBD - verb - past tense: includes conditional form of the verb ”to be”; ”If I were/VBD rich...”.
066     * <li>VBG - verb - gerund or present participle
067     * <li>VBN - verb - past participle
068     * <li>VBP - verb - non-3rd person singular present
069     * <li>VB - verb - base form: subsumes imperatives, infinitives and subjunctives.
070     * <li>VBZ - verb - 3rd person singular present
071     * <li>WDT - wh-determiner
072     * <li>WP$ - possesive wh-pronoun: includes ”whose”
073     * <li>WP - wh-pronoun: includes ”what”, ”who”, and ”whom”.
074     * <li>WRB - wh-adverb: includes ”how”, ”where”, ”why”. Includes ”when” when used in a temporal sense.
075     * <li>:: - literal colon
076     * <li>, - literal comma
077     * <li>$ - literal dollar sign
078     * <li>- - literal double-dash
079     * <li>- literal double quotes
080     * <li>- literal grave
081     * <li>( - literal left parenthesis
082     * <li>. - literal period
083     * <li># - literal pound sign
084     * <li>) - literal right parenthesis
085     * <li>- literal single quote or apostrophe
086     * </ul>
087     * 
088     * @author Juan A. Recio-Garcia
089     * @version 2.0
090     * 
091     */
092    public class OpennlpPOStagger
093    {
094        
095        /**
096         * Performs the algorithm in the given attributes of a collection of cases.
097         * These attributes must be IETextOpenNLP objects.
098         */
099        public static void tag(Collection<CBRCase> cases, Collection<Attribute> attributes)
100        {
101            org.apache.commons.logging.LogFactory.getLog(OpennlpPOStagger.class).info("OpenNLP POS tagging.");
102            ProgressController.init(OpennlpPOStagger.class, "OpenNLP POS tagging", cases.size());
103            for(CBRCase c: cases)
104            {
105                for(Attribute a: attributes)
106                {
107                    Object o = AttributeUtils.findValue(a, c);
108                    if(o instanceof IETextOpenNLP)
109                        tag((IETextOpenNLP)o);
110                }
111                ProgressController.step(OpennlpPOStagger.class);
112            }
113            ProgressController.finish(OpennlpPOStagger.class);
114        }
115    
116        /**
117         * Performs the algorithm in the given attributes of a query.
118         * These attributes must be IETextOpenNLP objects.
119         */
120        public static void tag(CBRQuery query, Collection<Attribute> attributes)
121        {
122            org.apache.commons.logging.LogFactory.getLog(OpennlpPOStagger.class).info("OpenNLP POS tagging.");
123                for(Attribute a: attributes)
124                {
125                    Object o = AttributeUtils.findValue(a, query);
126                    if(o instanceof IETextOpenNLP)
127                        tag((IETextOpenNLP)o);
128                }
129        }
130        
131        /**
132         * Performs the algorithm in all the IETextOpenNLP typed attributes of a collection of cases.
133         */
134        public static void tag(Collection<CBRCase> cases)
135        {
136            org.apache.commons.logging.LogFactory.getLog(OpennlpPOStagger.class).info("OpenNLP POS tagging.");
137            ProgressController.init(OpennlpPOStagger.class, "OpenNLP POS tagging", cases.size());
138            for(CBRCase c: cases)
139            {
140                Collection<IEText> texts = IEutils.getTexts(c);
141                for(IEText t : texts)
142                    if(t instanceof IETextOpenNLP)
143                        tag((IETextOpenNLP)t);
144                ProgressController.step(OpennlpPOStagger.class);
145            }
146            ProgressController.finish(OpennlpPOStagger.class);
147        }
148        
149        /**
150         * Performs the algorithm in all the IETextOpenNLP typed attributes of a query.
151         */ 
152        public static void tag(CBRQuery query)
153        {       
154            org.apache.commons.logging.LogFactory.getLog(OpennlpPOStagger.class).info("OpenNLP POS tagging.");
155            Collection<IEText> texts = IEutils.getTexts(query);
156            for(IEText t : texts)
157                if(t instanceof IETextOpenNLP)
158                    tag((IETextOpenNLP)t);
159        }
160        
161        /**
162         * Performs the algorithm in a given IETextOpenNLP object
163         */
164        public static void tag(IETextOpenNLP text)
165        {
166            EnglishPOSTaggerME tagger = getSentenceDetector();
167            
168            tagger.process(text.getDocument());
169            
170            for(Token t: text.getAllTokens())
171            {
172                Element elem = text.getTokenMapping(t);
173                Element word = elem.getChild("w");
174                String posTag = word.getAttributeValue("pos");
175                t.setPostag(posTag);
176            }
177        }
178        
179        
180        
181        
182        private static EnglishPOSTaggerME englishPOStagger = null;
183        private static EnglishPOSTaggerME getSentenceDetector()
184        {
185            if(englishPOStagger == null)
186                englishPOStagger = new EnglishPOSTaggerME();
187            return englishPOStagger;
188        }
189    }