001    /**
002     * OpennlpSplitter.java
003     * jCOLIBRI2 framework. 
004     * @author Juan A. Recio-García.
005     * GAIA - Group for Artificial Intelligence Applications
006     * http://gaia.fdi.ucm.es
007     * 21/06/2007
008     */
009    
010    package jcolibri.extensions.textual.IE.opennlp;
011    
012    import java.util.Collection;
013    import java.util.List;
014    
015    import org.jdom.Element;
016    
017    import jcolibri.cbrcore.Attribute;
018    import jcolibri.cbrcore.CBRCase;
019    import jcolibri.cbrcore.CBRQuery;
020    import jcolibri.extensions.textual.IE.IEutils;
021    import jcolibri.extensions.textual.IE.gate.GateSplitter;
022    import jcolibri.extensions.textual.IE.representation.IEText;
023    import jcolibri.extensions.textual.IE.representation.Paragraph;
024    import jcolibri.extensions.textual.IE.representation.Sentence;
025    import jcolibri.extensions.textual.IE.representation.Token;
026    import jcolibri.util.AttributeUtils;
027    import jcolibri.util.ProgressController;
028    import opennlp.common.xml.NLPDocument;
029    import opennlp.grok.preprocess.sentdetect.EnglishSentenceDetectorME;
030    import opennlp.grok.preprocess.tokenize.EnglishTokenizerME;
031    import opennlp.grok.preprocess.tokenize.TokenizerME;
032    
033    /**
034     * Organizes an IETextOpenNLP object in paragraphs, sentences and tokens.
035     * This implementation uses maximum entropy algorithms to obtain sentences and tokens.
036     * @author Juan A. Recio-Garcia
037     * @version 1.0
038     *
039     */
040    public class OpennlpSplitter
041    {    
042        /**
043         * Performs the algorithm in the given attributes of a collection of cases.
044         * These attributes must be IETextOpenNLP objects.
045         */
046        public static void split(Collection<CBRCase> cases, Collection<Attribute> attributes)
047        {
048            org.apache.commons.logging.LogFactory.getLog(OpennlpSplitter.class).info("Splitting OpenNLP text.");
049            ProgressController.init(OpennlpSplitter.class, "Splitting OpenNLP text", cases.size());
050            for(CBRCase c: cases)
051            {
052                for(Attribute a: attributes)
053                {
054                    Object o = AttributeUtils.findValue(a, c);
055                    if(o instanceof IETextOpenNLP)
056                        split((IETextOpenNLP)o);
057                }
058                ProgressController.step(OpennlpSplitter.class);
059            }
060            ProgressController.finish(OpennlpSplitter.class);
061        }
062    
063        /**
064         * Performs the algorithm in the given attributes of a query.
065         * These attributes must be IETextOpenNLP objects.
066         */
067        public static void split(CBRQuery query, Collection<Attribute> attributes)
068        {
069            org.apache.commons.logging.LogFactory.getLog(OpennlpSplitter.class).info("Splitting OpenNLP text.");
070                for(Attribute a: attributes)
071                {
072                    Object o = AttributeUtils.findValue(a, query);
073                    if(o instanceof IETextOpenNLP)
074                        split((IETextOpenNLP)o);
075                }
076        }
077        
078        /**
079         * Performs the algorithm in all the IETextOpenNLP typed attributes of a collection of cases.
080         */
081        public static void split(Collection<CBRCase> cases)
082        {
083            org.apache.commons.logging.LogFactory.getLog(OpennlpSplitter.class).info("Splitting OpenNLP text.");
084            ProgressController.init(OpennlpSplitter.class, "Splitting OpenNLP text", cases.size());
085            for(CBRCase c: cases)
086            {
087                Collection<IEText> texts = IEutils.getTexts(c);
088                for(IEText t : texts)
089                    if(t instanceof IETextOpenNLP)
090                        split((IETextOpenNLP)t);
091                ProgressController.step(OpennlpSplitter.class);
092            }
093            ProgressController.finish(OpennlpSplitter.class);
094        }
095        
096        /**
097         * Performs the algorithm in all the IETextOpenNLP typed attributes of a query.
098         */ 
099        public static void split(CBRQuery query)
100        {    
101            org.apache.commons.logging.LogFactory.getLog(OpennlpSplitter.class).info("Splitting OpenNLP text.");
102            Collection<IEText> texts = IEutils.getTexts(query);
103            for(IEText t : texts)
104                if(t instanceof IETextOpenNLP)
105                    split((IETextOpenNLP)t);
106        }
107        
108        
109        
110        public static void split(IETextOpenNLP text)
111        {
112            try
113            {
114                TokenizerME tokeniser = getTokeniser();
115                tokeniser.process(text.getDocument());
116                
117                EnglishSentenceDetectorME sd = getSentenceDetector();
118                sd.process(text.getDocument());
119                
120                organizeText(text);
121                
122            } catch (Exception e)
123            {
124                org.apache.commons.logging.LogFactory.getLog(GateSplitter.class).error(e);   
125            }
126        }
127        
128        /**
129         * Performs the algorithm in a given IETextOpenNLP object
130         */
131        @SuppressWarnings("unchecked")
132        protected static void organizeText(IETextOpenNLP text)
133        {
134            NLPDocument doc = text.getDocument();
135            
136            Element root = doc.getRootElement();
137            Element texte = (Element)root.getChild("text");
138            List<Element> pars = texte.getChildren();
139            String[] parsText  = doc.getParagraphs();
140            for(int p = 0; p<parsText.length; p++)
141            {
142                Element par = pars.get(p);
143                String parText = parsText[p];
144               
145                Paragraph myPar= new Paragraph(parText);
146                text.setParagraphMapping(myPar, par);
147                text.addParagraph(myPar);
148                
149                
150                List<Element> sents = par.getChildren();
151                String[] sentsText = doc.getSentences(par);
152                for(int s=0; s<sentsText.length; s++)
153                {
154                    Element sent = sents.get(s);
155                    String sentText = sentsText[s];
156                    
157                    Sentence mySent = new Sentence(sentText);
158                    myPar.addSentence(mySent);
159                    text.setSentenceMapping(mySent, sent);
160                    
161                    List<Element> toks = sent.getChildren();
162                    String[] toksText =  doc.getWords(sent);
163                    for(int t=0; t<toksText.length; t++)
164                    {
165                        Element tok = toks.get(t);
166                        String tokText = toksText[t];
167                        
168                        Token myTok = new Token(tokText);
169                        mySent.addToken(myTok);
170                        text.setTokenMapping(myTok, tok);
171                    }
172                }
173            }
174            
175            
176        }
177        
178        
179        private static TokenizerME tokeniser = null;
180        private static TokenizerME getTokeniser() throws Exception
181        {
182            if(tokeniser == null)
183                tokeniser = new EnglishTokenizerME();
184            return tokeniser;
185        }
186        
187        private static EnglishSentenceDetectorME englishSentenceDetector = null;
188        private static EnglishSentenceDetectorME getSentenceDetector()
189        {
190            if(englishSentenceDetector == null)
191                englishSentenceDetector = new EnglishSentenceDetectorME();
192            return englishSentenceDetector;
193        }
194        
195        
196    }