001    /**
002     * GatePhrasesExtractor.java
003     * jCOLIBRI2 framework. 
004     * @author Juan A. Recio-García.
005     * GAIA - Group for Artificial Intelligence Applications
006     * http://gaia.fdi.ucm.es
007     * 21/06/2007
008     */
009    package jcolibri.extensions.textual.IE.gate;
010    
011    import gate.Annotation;
012    import gate.AnnotationSet;
013    import gate.Factory;
014    import gate.creole.ExecutionException;
015    import gate.creole.gazetteer.DefaultGazetteer;
016    
017    import java.util.Collection;
018    import java.util.Iterator;
019    
020    import jcolibri.cbrcore.Attribute;
021    import jcolibri.cbrcore.CBRCase;
022    import jcolibri.cbrcore.CBRQuery;
023    import jcolibri.extensions.textual.IE.IEutils;
024    import jcolibri.extensions.textual.IE.representation.IEText;
025    import jcolibri.extensions.textual.IE.representation.Token;
026    import jcolibri.extensions.textual.IE.representation.info.PhraseInfo;
027    import jcolibri.util.AttributeUtils;
028    import jcolibri.util.ProgressController;
029    
030    /**
031     * Phrases extractor based on the Gate Gazetteer. 
032     * It is compatible with the generic PhrasesExtractor so they can be executed together.
033     * GATE's default rules file or any other file can be loaded.
034     * <br>
035     * For more information see the GATE tutorial.
036     * @author Juan A. Recio-Garcia
037     * @version 1.0
038     *
039     */
040    public class GatePhrasesExtractor
041    {
042        private static DefaultGazetteer gaze = null;
043        
044        /**
045         * Performs the algorithm in the given attributes of a collection of cases.
046         * These attributes must be IETextGate objects.
047         */    
048        public static void extractPhrases(Collection<CBRCase> cases, Collection<Attribute> attributes)
049        {
050            org.apache.commons.logging.LogFactory.getLog(GatePhrasesExtractor.class).info("Extracting phrases.");
051            ProgressController.init(GatePhrasesExtractor.class, "Extracting phrases", cases.size());
052            for(CBRCase c: cases)
053            {
054                for(Attribute a: attributes)
055                {
056                    Object o = AttributeUtils.findValue(a, c);
057                    if(o instanceof IETextGate)
058                        extractPhrases((IETextGate)o);
059                }
060                ProgressController.step(GatePhrasesExtractor.class);
061            }
062            ProgressController.finish(GatePhrasesExtractor.class);
063        }
064    
065        /**
066         * Performs the algorithm in the given attributes of a query.
067         * These attributes must be IETextGate objects.
068         */
069        public static void extractPhrases(CBRQuery query, Collection<Attribute> attributes)
070        {
071                org.apache.commons.logging.LogFactory.getLog(GatePhrasesExtractor.class).info("Extracting phrases.");
072                for(Attribute a: attributes)
073                {
074                    Object o = AttributeUtils.findValue(a, query);
075                    if(o instanceof IETextGate)
076                        extractPhrases((IETextGate)o);
077                }
078        }
079        
080        /**
081         * Performs the algorithm in all the IETextGate typed attributes of a collection of cases.
082         */ 
083        public static void extractPhrases(Collection<CBRCase> cases)
084        {
085            org.apache.commons.logging.LogFactory.getLog(GatePhrasesExtractor.class).info("Extracting phrases.");
086            ProgressController.init(GatePhrasesExtractor.class, "Extracting phrases", cases.size());
087            for(CBRCase c: cases)
088            {
089                Collection<IEText> texts = IEutils.getTexts(c);
090                for(IEText t : texts)
091                    if(t instanceof IETextGate)
092                        extractPhrases((IETextGate)t);
093                ProgressController.step(GatePhrasesExtractor.class);
094            }
095            ProgressController.finish(GatePhrasesExtractor.class);
096        }
097        
098        /**
099         * Performs the algorithm in all the IETextGate typed attributes of a query.
100         */
101        public static void extractPhrases(CBRQuery query)
102        {       
103            org.apache.commons.logging.LogFactory.getLog(GatePhrasesExtractor.class).info("Extracting phrases.");
104            Collection<IEText> texts = IEutils.getTexts(query);
105            for(IEText t : texts)
106                if(t instanceof IETextGate)
107                    extractPhrases((IETextGate)t);
108        }
109        
110        /**
111         * Performs the algorithm in a given IETextGate object
112         */
113        public static void extractPhrases(IETextGate text)
114        {
115            try
116            {
117                gaze.setDocument(text.getDocument());
118                gaze.execute();
119                
120                AnnotationSet lookupAnnotations = text.getDocument().getAnnotations().get("Lookup");
121                
122                for(Token t: text.getAllTokens())
123                {
124                    Annotation anotToken = text.getTokenMapping(t);
125                    AnnotationSet lookupAnnots = lookupAnnotations.get(anotToken.getStartNode().getOffset(), anotToken.getEndNode().getOffset());
126                    for(Iterator iter = lookupAnnots.iterator(); iter.hasNext(); )
127                    {
128                        Annotation anot = (Annotation)iter.next();
129                        String Type = (String)anot.getFeatures().get("majorType");
130                        String minorType = (String)anot.getFeatures().get("minorType");
131                        if(minorType!= null)
132                            Type = Type+"."+minorType;
133                        text.addPhrase(new PhraseInfo(Type, 
134                                            anot.getStartNode().getOffset().intValue(), 
135                                            anot.getEndNode().getOffset().intValue()));
136                    }
137                    
138                }
139                
140                //System.err.print(text.getDocument());
141                
142            } catch (ExecutionException e)
143            {
144                org.apache.commons.logging.LogFactory.getLog(GatePhrasesExtractor.class).error(e);
145            }
146        }
147        
148        private static String defaultRulesFileName = "jcolibri/extensions/textual/IE/gate/gateinit/plugins/ANNIE/resources/gazetteer/lists.def";
149        
150        public static void loadDefaultRules()
151        {
152            loadRules(defaultRulesFileName);
153        }
154        
155        /**
156         * Loads a rules file
157         */
158         public static void loadRules(String filename)
159         {
160             try
161            {
162                gaze = (DefaultGazetteer) Factory
163                    .createResource("gate.creole.gazetteer.DefaultGazetteer");
164                gaze.setListsURL(jcolibri.util.FileIO.findFile(filename));
165                gaze.init();
166            } catch (Exception e)
167            {
168                org.apache.commons.logging.LogFactory.getLog(GatePhrasesExtractor.class).error(e);
169                
170            }
171             
172         }
173         
174    
175    }