001    /**
002     * GateFeaturesExtractor.java
003     * jCOLIBRI2 framework. 
004     * @author Juan A. Recio-García.
005     * GAIA - Group for Artificial Intelligence Applications
006     * http://gaia.fdi.ucm.es
007     * 21/06/2007
008     */
009    package jcolibri.extensions.textual.IE.gate;
010    
011    import gate.Annotation;
012    import gate.AnnotationSet;
013    import gate.Factory;
014    import gate.creole.ExecutionException;
015    import gate.creole.Transducer;
016    
017    import java.util.Collection;
018    import java.util.Iterator;
019    
020    import jcolibri.cbrcore.Attribute;
021    import jcolibri.cbrcore.CBRCase;
022    import jcolibri.cbrcore.CBRQuery;
023    import jcolibri.extensions.textual.IE.IEutils;
024    import jcolibri.extensions.textual.IE.representation.IEText;
025    import jcolibri.extensions.textual.IE.representation.info.FeatureInfo;
026    import jcolibri.util.AttributeUtils;
027    import jcolibri.util.ProgressController;
028    
029    /**
030     * Extract features from text using the GATE grammars in jape format.
031     * This method uses internally an ANNIETransducer object.
032     * GATE's default rules file or any other file can be loaded.
033     * <br>
034     * It is compatible with the generic FeaturesExtractor so they can be executed together.
035     * <br>
036     * For more information see the GATE tutorial.
037     * @author Juan A. Recio-Garcia
038     * @version 1.0
039     * 
040     */
041    public class GateFeaturesExtractor
042    {
043    
044        /**
045         * Performs the algorithm in the given attributes of a collection of cases.
046         * These attributes must be IETextGate objects.
047         */
048        public static void extractFeatures(Collection<CBRCase> cases, Collection<Attribute> attributes)
049        {
050            org.apache.commons.logging.LogFactory.getLog(GateFeaturesExtractor.class).info("Extracting features.");
051            ProgressController.init(GateFeaturesExtractor.class, "Extracting features ...", cases.size());
052            for(CBRCase c: cases)
053            {
054                for(Attribute a: attributes)
055                {
056                    Object o = AttributeUtils.findValue(a, c);
057                    extractFeatures((IETextGate)o);
058                }
059                ProgressController.step(GateFeaturesExtractor.class);
060            }
061            ProgressController.finish(GateFeaturesExtractor.class);
062        }
063    
064        /**
065         * Performs the algorithm in the given attributes of a query.
066         * These attributes must be IETextGate objects.
067         */
068        public static void extractFeatures(CBRQuery query, Collection<Attribute> attributes)
069        {
070            org.apache.commons.logging.LogFactory.getLog(GateFeaturesExtractor.class).info("Extracting features.");
071            for(Attribute a: attributes)
072            {
073                Object o = AttributeUtils.findValue(a, query);
074                extractFeatures((IETextGate)o);
075            }
076        }
077    
078        /**
079         * Performs the algorithm in all the IETextGate typed attributes of a collection of cases.
080         */  
081        public static void extractFeatures(Collection<CBRCase> cases)
082        {
083            org.apache.commons.logging.LogFactory.getLog(GateFeaturesExtractor.class).info("Extracting features.");
084            ProgressController.init(GateFeaturesExtractor.class, "Extracting features ...", cases.size());
085            for(CBRCase c: cases)
086            {
087                Collection<IEText> texts = IEutils.getTexts(c);
088                for(IEText t : texts)
089                    if(t instanceof IETextGate)
090                        extractFeatures((IETextGate)t);
091                ProgressController.step(GateFeaturesExtractor.class);
092            }
093            ProgressController.finish(GateFeaturesExtractor.class);
094        }
095        
096        /**
097         * Performs the algorithm in all the IETextGate typed attributes of a query.
098         */ 
099        public static void extractFeatures(CBRQuery query)
100        {       
101            org.apache.commons.logging.LogFactory.getLog(GateFeaturesExtractor.class).info("Extracting features.");
102            Collection<IEText> texts = IEutils.getTexts(query);
103            for(IEText t : texts)
104                if(t instanceof IETextGate)
105                    extractFeatures((IETextGate)t);
106        }   
107        
108        /**
109         * Performs the algorithm in a given IETextGate object
110         */
111        public static void extractFeatures(IETextGate text)
112        {
113            try
114            {
115    
116                featureExtractor.setDocument(text.getDocument());
117                featureExtractor.execute();
118    
119                String content = text.getRAWContent();
120    
121                AnnotationSet featuresAnnotations = text.getDocument().getAnnotations("Features");
122    
123                for (Iterator iter = featuresAnnotations.iterator(); iter.hasNext();)
124                {
125                    Annotation anot = (Annotation) iter.next();
126                    String Type = (String) anot.getType();
127                    int begin = anot.getStartNode().getOffset().intValue();
128                    int end = anot.getEndNode().getOffset().intValue();
129                    String value = content.substring(begin, end);
130                    text.addFeature(new FeatureInfo(Type, value, begin, end));
131                }
132    
133                // System.err.print(text.getDocument());
134    
135            } catch (ExecutionException e)
136            {
137                org.apache.commons.logging.LogFactory.getLog(GateFeaturesExtractor.class).error(e);
138            }
139        }
140    
141        static Transducer featureExtractor;
142    
143        private static String defaultRulesFileName ="jcolibri/extensions/textual/IE/gate/gateinit/plugins/ANNIE/resources/NE/main.jape"; 
144        
145        public static void loadDefaultRules()
146        {
147            loadRules(defaultRulesFileName);
148        }
149        
150        /**
151        * Loads a rules file
152        */
153        public static void loadRules(String filename)
154        {
155            try
156            {
157                featureExtractor = (Transducer) Factory.createResource("gate.creole.ANNIETransducer");
158                featureExtractor.setGrammarURL(jcolibri.util.FileIO.findFile(filename));
159                featureExtractor.setOutputASName("Features");
160                featureExtractor.init();
161            } catch (Exception e)
162            {
163                org.apache.commons.logging.LogFactory.getLog(GatePhrasesExtractor.class).error(e);
164    
165            }
166    
167        }
168    }