001    /**
002     * GateSplitter.java
003     * jCOLIBRI2 framework. 
004     * @author Juan A. Recio-García.
005     * GAIA - Group for Artificial Intelligence Applications
006     * http://gaia.fdi.ucm.es
007     * 19/06/2007
008     */
009    package jcolibri.extensions.textual.IE.gate;
010    
011    import gate.Annotation;
012    import gate.AnnotationSet;
013    import gate.Document;
014    import gate.Factory;
015    import gate.GateConstants;
016    import gate.creole.splitter.SentenceSplitter;
017    import gate.creole.tokeniser.DefaultTokeniser;
018    
019    import java.util.ArrayList;
020    import java.util.Collection;
021    import java.util.List;
022    
023    import jcolibri.cbrcore.Attribute;
024    import jcolibri.cbrcore.CBRCase;
025    import jcolibri.cbrcore.CBRQuery;
026    import jcolibri.extensions.textual.IE.IEutils;
027    import jcolibri.extensions.textual.IE.representation.IEText;
028    import jcolibri.extensions.textual.IE.representation.Paragraph;
029    import jcolibri.extensions.textual.IE.representation.Sentence;
030    import jcolibri.extensions.textual.IE.representation.Token;
031    import jcolibri.util.AttributeUtils;
032    import jcolibri.util.ProgressController;
033    
034    /**
035     * Organizes an IETextGate object in paragraphs, sentences and tokens.
036     * This implementation uses the GATE algorithms to obtain sentences and tokens.
037     * @author Juan A. Recio-Garcia
038     * @version 1.0
039     */
040    public class GateSplitter
041    {
042        /**
043         * Performs the algorithm in the given attributes of a collection of cases.
044         * These attributes must be IETextGate objects.
045         */
046        public static void split(Collection<CBRCase> cases, Collection<Attribute> attributes)
047        {
048            org.apache.commons.logging.LogFactory.getLog(GateSplitter.class).info("Splitting Gate text.");
049            ProgressController.init(GateSplitter.class, "Splitting Gate text", cases.size());
050            for(CBRCase c: cases)
051            {
052                for(Attribute a: attributes)
053                {
054                    Object o = AttributeUtils.findValue(a, c);
055                    if(o instanceof IETextGate)
056                        split((IETextGate)o);
057                }
058                ProgressController.step(GateSplitter.class);
059            }
060            ProgressController.finish(GateSplitter.class);
061        }
062    
063        /**
064         * Performs the algorithm in the given attributes of a query.
065         * These attributes must be IETextGate objects.
066         */
067        public static void split(CBRQuery query, Collection<Attribute> attributes)
068        {
069                org.apache.commons.logging.LogFactory.getLog(GateSplitter.class).info("Splitting Gate text.");
070                for(Attribute a: attributes)
071                {
072                    Object o = AttributeUtils.findValue(a, query);
073                    if(o instanceof IETextGate)
074                        split((IETextGate)o);
075                }
076        }
077        
078        public static void split(Collection<CBRCase> cases)
079        {
080            org.apache.commons.logging.LogFactory.getLog(GateSplitter.class).info("Splitting Gate text.");
081            ProgressController.init(GateSplitter.class, "Splitting Gate text", cases.size());
082            for(CBRCase c: cases)
083            {
084                Collection<IEText> texts = IEutils.getTexts(c);
085                for(IEText t : texts)
086                    if(t instanceof IETextGate)
087                        split((IETextGate)t);
088                ProgressController.step(GateSplitter.class);
089            }
090            ProgressController.finish(GateSplitter.class);
091        }
092        
093        /**
094         * Performs the algorithm in all the IETextGate typed attributes of a collection of cases.
095         */ 
096        public static void split(CBRQuery query)
097        {       
098            org.apache.commons.logging.LogFactory.getLog(GateSplitter.class).info("Splitting Gate text.");
099            Collection<IEText> texts = IEutils.getTexts(query);
100            for(IEText t : texts)
101                if(t instanceof IETextGate)
102                    split((IETextGate)t);
103        }
104        
105        /**
106         * Performs the algorithm in all the IETextGate typed attributes of a query.
107         */
108        public static void split(IETextGate text)
109        {
110            try
111            {
112                DefaultTokeniser tokeniser = getTokeniser();
113                tokeniser.setDocument(text.getDocument());
114                tokeniser.execute();
115                
116                SentenceSplitter sentenceSplitter = getSentenceSplitter();
117                sentenceSplitter.setDocument(text.getDocument());
118                sentenceSplitter.execute();
119                
120                organizeText(text);
121                
122            } catch (Exception e)
123            {
124                org.apache.commons.logging.LogFactory.getLog(GateSplitter.class).error(e);   
125            }
126        }
127        
128        
129        /**
130         * Performs the algorithm in a given IETextGate object
131         */
132        @SuppressWarnings("unchecked")
133        protected static void organizeText(IETextGate text)
134        {
135            Document doc = text.getDocument();
136            String content = text.getRAWContent();
137            
138            //Paragraphs
139            AnnotationSet parAnnot = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
140            AnnotationSet paragraphs = parAnnot.get("paragraph");
141            AnnotationSet annot = doc.getAnnotations();
142    
143            List<Annotation> sents = new ArrayList<Annotation>(annot.get("Sentence"));
144            java.util.Collections.sort(sents);
145            
146            List<Annotation> tokens = new ArrayList<Annotation>(annot.get("Token"));
147            java.util.Collections.sort(tokens);
148            
149            ArrayList<Annotation> sentsToRemove = new ArrayList<Annotation>();
150            ArrayList<Annotation> tokensToRemove = new ArrayList<Annotation>();
151            for(int p=0; p<paragraphs.size(); p++)
152            {
153                Annotation par = paragraphs.get(p);
154                int beginP = par.getStartNode().getOffset().intValue();
155                int endP   = par.getEndNode().getOffset().intValue();
156                Paragraph myPar = new Paragraph(content.substring(beginP,endP));
157                text.addParagraph(myPar);
158                text.setParagraphMapping(myPar, par);
159                
160                //Sentences
161                sentsToRemove.clear();
162                for(int s=0; s<sents.size(); s++)
163                {
164                    Annotation sent = sents.get(s);
165                    int beginS = sent.getStartNode().getOffset().intValue();
166                    int endS   = sent.getEndNode().getOffset().intValue();
167                    if((beginS<beginP)||(endS>endP))
168                        continue;
169                    Sentence mySent = new Sentence(content.substring(beginS, endS));
170                    myPar.addSentence(mySent);
171                    text.setSentenceMapping(mySent, sent);
172                    sentsToRemove.remove(sent);
173                    
174                    
175                    //Tokens
176                    tokensToRemove.clear();
177                    for(int t=0; t<tokens.size(); t++)
178                    {
179                        Annotation token = tokens.get(t);
180                        int beginT = token.getStartNode().getOffset().intValue();
181                        int endT   = token.getEndNode().getOffset().intValue();
182                        if((beginT<beginS)||(endT>endS))
183                            continue;
184                        Token myToken = new Token(content.substring(beginT,endT));
185                        mySent.addToken(myToken);
186                        text.setTokenMapping(myToken, token);
187                        tokensToRemove.add(token);
188                    }
189                    tokens.removeAll(tokensToRemove);
190                }
191                sents.removeAll(sentsToRemove);
192                
193            }
194            
195                
196        }
197        
198        private static DefaultTokeniser tokeniser = null;
199        private static DefaultTokeniser getTokeniser() throws Exception
200        {
201            if(tokeniser == null)
202            {
203                tokeniser = (DefaultTokeniser) Factory.createResource(
204                                      "gate.creole.tokeniser.DefaultTokeniser");
205                tokeniser.init();
206            }
207            return tokeniser;
208        }
209        
210        private static  SentenceSplitter sentenceSplitter = null;
211        private static SentenceSplitter getSentenceSplitter() throws Exception{
212            
213            if(sentenceSplitter == null)
214            {
215                sentenceSplitter = (SentenceSplitter)Factory.createResource(
216                            "gate.creole.splitter.SentenceSplitter");
217                sentenceSplitter.init();
218            }
219            return sentenceSplitter;
220        }
221     
222        
223    }