001    /**
002     * FeaturesExtractor.java
003     * jCOLIBRI2 framework. 
004     * @author Juan A. Recio-García.
005     * GAIA - Group for Artificial Intelligence Applications
006     * http://gaia.fdi.ucm.es
007     * 21/06/2007
008     */
009    package jcolibri.extensions.textual.IE.common;
010    
011    import java.io.BufferedReader;
012    import java.io.InputStreamReader;
013    import java.net.URL;
014    import java.util.ArrayList;
015    import java.util.Collection;
016    import java.util.StringTokenizer;
017    import java.util.regex.Matcher;
018    import java.util.regex.Pattern;
019    
020    import jcolibri.cbrcore.Attribute;
021    import jcolibri.cbrcore.CBRCase;
022    import jcolibri.cbrcore.CBRQuery;
023    import jcolibri.extensions.textual.IE.IEutils;
024    import jcolibri.extensions.textual.IE.gate.GatePhrasesExtractor;
025    import jcolibri.extensions.textual.IE.representation.IEText;
026    import jcolibri.extensions.textual.IE.representation.info.FeatureInfo;
027    import jcolibri.util.AttributeUtils;
028    import jcolibri.util.ProgressController;
029    
030    /**
031     *
032     * <p>
033     * Extracts features using Regular Expressions.
034     * </p>
035     * <p>
036     * Rules format is:
037     * </p>
038     * <p>
039     * [FeatureName]{FeaturePosition}FeatureRegularExpresion
040     * <ul>
041     * <li>FeatureName is used to store the extracted information
042     * <li>FeaturePosition indicates the position of the information that we want
043     * to extract inside the regular expression. The feature is indicated by
044     * counting the opening parentheses from left to right.
045     * <p>
046     * In the expression ((A)(B(C))), for example, there are four such groups:
047     * <ol>
048     * <li> ((A)(B(C)))
049     * <li> (A)
050     * <li> (B(C))4(C)
051     * </ol>
052     * <p>
053     * Group zero always stands for the entire expression
054     * <li>Regular Expressions are deffined following java.util.regex.Pattern
055     * syntaxis. (See API for details)
056     * </ul>
057     * <p>
058     * The first version was developed at: Robert Gordon University - Aberdeen & Facultad Informática,
059     * Universidad Complutense de Madrid (GAIA)
060     * </p>
061     * @author Juan A. Recio-Garcia
062     * @version 2.0
063     * 
064     */
065    public class FeaturesExtractor
066    {
067        static ArrayList<FeatureRule> featuresRules;
068    
069        /**
070         * Performs the algorithm in the given attributes of a collection of cases.
071         * These attributes must be IEText objects.
072         */
073        public static void extractFeatures(Collection<CBRCase> cases, Collection<Attribute> attributes)
074        {
075            org.apache.commons.logging.LogFactory.getLog(FeaturesExtractor.class).info("Extracting features.");
076            ProgressController.init(PhrasesExtractor.class, "Extracting features ...", cases.size());
077            for(CBRCase c: cases)
078            {
079                for(Attribute a: attributes)
080                {
081                    Object o = AttributeUtils.findValue(a, c);
082                    extractFeatures((IEText)o);
083                }
084                ProgressController.step(GatePhrasesExtractor.class);
085            }
086            ProgressController.finish(GatePhrasesExtractor.class);
087        }
088    
089        /**
090         * Performs the algorithm in the given attributes of a query.
091         * These attributes must be IEText objects.
092         */
093        public static void extractFeatures(CBRQuery query, Collection<Attribute> attributes)
094        {
095            org.apache.commons.logging.LogFactory.getLog(FeaturesExtractor.class).info("Extracting features.");
096            for(Attribute a: attributes)
097            {
098                Object o = AttributeUtils.findValue(a, query);
099                extractFeatures((IEText)o);
100            }
101        }
102        
103        /**
104         * Performs the algorithm in all the attributes of a collection of cases
105         * These attributes must be IEText objects.
106         */
107        public static void extractFeatures(Collection<CBRCase> cases)
108        {
109            org.apache.commons.logging.LogFactory.getLog(FeaturesExtractor.class).info("Extracting features.");
110            ProgressController.init(PhrasesExtractor.class, "Extracting features ...", cases.size());
111            for(CBRCase c: cases)
112            {
113                Collection<IEText> texts = IEutils.getTexts(c);
114                for(IEText t : texts)
115                    extractFeatures(t);
116                ProgressController.step(GatePhrasesExtractor.class);
117            }
118            ProgressController.finish(GatePhrasesExtractor.class);
119        }
120        
121        /**
122         * Performs the algorithm in all the attributes of a query
123         * These attributes must be IEText objects.
124         */
125        public static void extractFeatures(CBRQuery query)
126        {    
127            org.apache.commons.logging.LogFactory.getLog(FeaturesExtractor.class).info("Extracting features.");
128            Collection<IEText> texts = IEutils.getTexts(query);
129            for(IEText t : texts)
130                extractFeatures(t);
131        }    
132        
133        /**
134         * Performs the algorithm in a given IEText object
135         */
136        public static void extractFeatures(IEText text)
137        {
138            String rawText = text.getRAWContent();
139            for (FeatureRule rule : featuresRules)
140            {
141                Matcher m = rule._pattern.matcher(rawText);
142                while (m.find())
143                {
144                    String group = m.group(rule._group);
145                    group = cleanSpaces(group);
146                    text.addFeature(new FeatureInfo(rule._feature, group, m.start(), m.end()));
147                }
148            }
149        }
150    
151        static private String cleanSpaces(String w)
152        {
153            String res = "";
154            StringTokenizer st = new StringTokenizer(w, " ");
155            while (st.hasMoreTokens())
156            {
157                res += st.nextToken();
158                if (st.hasMoreTokens())
159                    res += " ";
160            }
161            return res;
162        }
163    
164        /**
165             * Load the features rules
166             */
167        public static void loadRules(String filename)
168        {
169            try
170            {
171                featuresRules = new ArrayList<FeatureRule>();
172                URL file = jcolibri.util.FileIO.findFile(filename);
173                BufferedReader br = new BufferedReader( new InputStreamReader(file.openStream()));
174    
175                String line = "";
176                while ((line = br.readLine()) != null)
177                {
178                    if (line.startsWith("#"))
179                        continue;
180                    int pos = line.indexOf(']');
181                    if (pos == -1)
182                        throw new Exception(line + "  Feature field not found");
183                    String _feature = line.substring(1, pos);
184                    String _rest = line.substring(pos + 1);
185                    pos = _rest.indexOf('}');
186                    if (pos == -1)
187                        throw new Exception(line
188                                + "  FeaturePostion field not found");
189                    String _group = _rest.substring(1, pos);
190                    String _rule = _rest.substring(pos + 1);
191                    int g = Integer.parseInt(_group);
192                    featuresRules.add(new FeatureRule(_feature, Pattern.compile(_rule),g));
193                }
194                br.close();
195            } catch (Exception e)
196            {
197                org.apache.commons.logging.LogFactory.getLog(
198                        FeaturesExtractor.class).error(e);
199            }
200        }
201    
202        private static class FeatureRule
203        {
204            String _feature;
205    
206            Pattern _pattern;
207    
208            int _group;
209    
210            FeatureRule(String _f, Pattern _p, int _g)
211            {
212                _feature = _f;
213                _pattern = _p;
214                _group = _g;
215            }
216        }
217    }