001    /**
002     * GatePOStagger.java
003     * jCOLIBRI2 framework. 
004     * @author Juan A. Recio-García.
005     * GAIA - Group for Artificial Intelligence Applications
006     * http://gaia.fdi.ucm.es
007     * 20/06/2007
008     */
009    package jcolibri.extensions.textual.IE.gate;
010    
011    import java.util.Collection;
012    
013    import gate.Annotation;
014    import gate.AnnotationSet;
015    import gate.Factory;
016    import gate.creole.POSTagger;
017    import jcolibri.cbrcore.Attribute;
018    import jcolibri.cbrcore.CBRCase;
019    import jcolibri.cbrcore.CBRQuery;
020    import jcolibri.extensions.textual.IE.IEutils;
021    import jcolibri.extensions.textual.IE.representation.IEText;
022    import jcolibri.extensions.textual.IE.representation.Token;
023    import jcolibri.util.AttributeUtils;
024    import jcolibri.util.ProgressController;
025    
026    /**
027     * Performs the POS tagging using the GATE algorithm.
028     * <br>
029     * Part-Of-Speech tags (the original GATE set):
030     * <ul>
031     * <li>CC - coordinating conjunction: ”and”, ”but”, ”nor”, ”or”, ”yet”, plus,
032     * minus, less, times (multiplication), over (division). Also ”for” (because)
033     * and ”so” (i.e., ”so that”).
034     * <li>CD - cardinal number
035     * <li>DT - determiner: Articles including ”a”, ”an”, ”every”, ”no”, ”the”,
036     * ”another”, ”any”, ”some”, ”those”.
037     * <li>EX - existential there: Unstressed ”there” that triggers inversion of
038     * the inflected verb and the logical subject; ”There was a party in progress”.
039     * <li>FW - foreign word
040     * <li>IN - preposition or subordinating conjunction
041     * <li>JJ - adjective: Hyphenated compounds that are used as modifiers;
042     * happy-go-lucky.
043     * <li>JJR - adjective - comparative: Adjectives with the comparative ending
044     * ”-er” and a comparative meaning. Sometimes ”more” and ”less”.
045     * <li>JJS - adjective - superlative: Adjectives with the superlative ending
 * ”-est” (and ”worst”). Sometimes ”most” and ”least”.
047     * <li>JJSS - -unknown-, but probably a variant of JJS
048     * <li>-LRB- - -unknown-
049     * <li>LS - list item marker: Numbers and letters used as identifiers of items
050     * in a list.
051     * <li>MD - modal: All verbs that don’t take an ”-s” ending in the third person
052     * singular present: ”can”, ”could”, ”dare”, ”may”, ”might”, ”must”, ”ought”,
053     * ”shall”, ”should”, ”will”, ”would”.
054     * <li>NN - noun - singular or mass
055     * <li>NNP - proper noun - singular: All words in names usually are capitalized
056     * but titles might not be.
057     * <li>NNPS - proper noun - plural: All words in names usually are capitalized
058     * but titles might not be.
059     * <li>NNS - noun - plural
 * <li>NP - proper noun - singular
062     * <li>NPS - proper noun - plural
 * <li>PDT - predeterminer: Determinerlike elements preceding an article or
 * possessive pronoun; ”all/PDT his marbles”, ”quite/PDT a mess”.
 * <li>POS - possessive ending: Nouns ending in ”’s” or ”’”.
067     * <li>PP - personal pronoun
 * <li>PRPR$ - -unknown-, but probably possessive pronoun
 * <li>PRP - -unknown-, but probably possessive pronoun
 * <li>PRP$ - -unknown-, but probably possessive pronoun, such as ”my”, ”your”,
 * ”his”, ”her”, ”its”, ”one’s”, ”our”, and ”their”.
072     * <li>RB - adverb: most words ending in ”-ly”. Also ”quite”, ”too”, ”very”,
073     * ”enough”, ”indeed”, ”not”, ”-n’t”, and ”never”.
074     * <li>RBR - adverb - comparative: adverbs ending with ”-er” with a comparative
075     * meaning.
076     * <li>RBS - adverb - superlative
077     * <li>RP - particle: Mostly monosyllabic words that also double as directional
078     * adverbs.
079     * <li>STAART - start state marker (used internally)
080     * <li>SYM - symbol: technical symbols or expressions that aren’t English
081     * words.
082     * <li>TO - literal to
083     * <li>UH - interjection: Such as ”my”, ”oh”, ”please”, ”uh”, ”well”, ”yes”.
084     * <li>VBD - verb - past tense: includes conditional form of the verb ”to be”;
085     * ”If I were/VBD rich...”.
086     * <li>VBG - verb - gerund or present participle
087     * <li>VBN - verb - past participle
088     * <li>VBP - verb - non-3rd person singular present
089     * <li>VB - verb - base form: subsumes imperatives, infinitives and
090     * subjunctives.
091     * <li>VBZ - verb - 3rd person singular present
092     * <li>WDT - wh-determiner
 * <li>WP$ - possessive wh-pronoun: includes ”whose”
094     * <li>WP - wh-pronoun: includes ”what”, ”who”, and ”whom”.
095     * <li>WRB - wh-adverb: includes ”how”, ”where”, ”why”. Includes ”when” when
096     * used in a temporal sense.
 * <li>: - literal colon
 * <li>, - literal comma
 * <li>$ - literal dollar sign
 * <li>-- - literal double-dash
 * <li>" - literal double quotes
 * <li>` - literal grave
103     * <li>( - literal left parenthesis
104     * <li>. - literal period
105     * <li># - literal pound sign
106     * <li>) - literal right parenthesis
 * <li>' - literal single quote or apostrophe
108     * </ul>
109     * 
110     * @author Juan A. Recio-Garcia
111     * @version 1.0
112     * 
113     */
114    public class GatePOStagger
115    {
116        
117        /**
118         * Performs the algorithm in the given attributes of a collection of cases.
119         * These attributes must be IETextGate objects.
120         */
121        public static void tag(Collection<CBRCase> cases, Collection<Attribute> attributes)
122        {
123            org.apache.commons.logging.LogFactory.getLog(GatePOStagger.class).info("Gate POS tagging...");
124            ProgressController.init(GatePOStagger.class, "Gate POS tagging...", cases.size());
125            for (CBRCase c : cases)
126            {
127                for (Attribute a : attributes)
128                {
129                    Object o = AttributeUtils.findValue(a, c);
130                    if (o instanceof IETextGate)
131                        tag((IETextGate) o);
132                }
133                ProgressController.step(GatePOStagger.class);
134            }
135            ProgressController.finish(GatePOStagger.class);
136        }
137    
138        /**
139         * Performs the algorithm in the given attributes of a query.
140         * These attributes must be IETextGate objects.
141         */
142        public static void tag(CBRQuery query, Collection<Attribute> attributes)
143        {
144            org.apache.commons.logging.LogFactory.getLog(GatePOStagger.class).info("Gate POS tagging...");
145            for (Attribute a : attributes)
146            {
147                Object o = AttributeUtils.findValue(a, query);
148                if (o instanceof IETextGate)
149                    tag((IETextGate) o);
150            }
151        }
152        
153        /**
154         * Performs the algorithm in all the IETextGate typed attributes of a collection of cases.
155         */ 
156        public static void tag(Collection<CBRCase> cases)
157        {
158            org.apache.commons.logging.LogFactory.getLog(GatePOStagger.class).info("Gate POS tagging...");
159            ProgressController.init(GatePOStagger.class, "Gate POS tagging...", cases.size());
160            for (CBRCase c : cases)
161            {
162                Collection<IEText> texts = IEutils.getTexts(c);
163                for (IEText t : texts)
164                    if (t instanceof IETextGate)
165                        tag((IETextGate) t);
166                ProgressController.step(GatePOStagger.class);
167            }
168            ProgressController.finish(GatePOStagger.class);
169        }
170        
171        /**
172         * Performs the algorithm in all the IETextGate typed attributes of a query.
173         */
174        public static void tag(CBRQuery query)
175        {
176            org.apache.commons.logging.LogFactory.getLog(GatePOStagger.class).info("Gate POS tagging...");
177            Collection<IEText> texts = IEutils.getTexts(query);
178            for (IEText t : texts)
179                if (t instanceof IETextGate)
180                    tag((IETextGate) t);
181        }
182    
183        /**
184         * Performs the algorithm in a given IETextGate object
185         */
186        public static void tag(IETextGate text)
187        {
188            try
189            {
190                POSTagger tagger = getTokeniser();
191                tagger.setDocument(text.getDocument());
192                tagger.execute();
193    
194                AnnotationSet posAnnotations = text.getDocument().getAnnotations()
195                        .get("POS");
196    
197                for (Token t : text.getAllTokens())
198                {
199                    Annotation anotToken = text.getTokenMapping(t);
200                    AnnotationSet posAnnots = posAnnotations.get(anotToken
201                            .getStartNode().getOffset(), anotToken.getEndNode()
202                            .getOffset());
203                    Annotation anot = (Annotation) posAnnots.iterator().next();
204                    String postag = (String) anot.getFeatures().get("category");
205                    t.setPostag(postag);
206                }
207    
208            } catch (Exception e)
209            {
210                org.apache.commons.logging.LogFactory.getLog(GatePOStagger.class)
211                        .error(e);
212    
213            }
214        }
215    
216        private static POSTagger tagger = null;
217    
218        private static POSTagger getTokeniser() throws Exception
219        {
220            if (tagger == null)
221            {
222                tagger = (POSTagger) Factory
223                        .createResource("gate.creole.POSTagger");
224                tagger.setBaseSentenceAnnotationType("Sentence");
225                tagger.setBaseTokenAnnotationType("Token");
226                tagger.setOutputAnnotationType("POS");
227                tagger.init();
228            }
229            return tagger;
230        }
231    }