001    /**
002     * PhrasesExtractor.java
003     * jCOLIBRI2 framework. 
004     * @author Juan A. Recio-García.
005     * GAIA - Group for Artificial Intelligence Applications
006     * http://gaia.fdi.ucm.es
007     * 20/06/2007
008     */
009    package jcolibri.extensions.textual.IE.common;
010    
011    import java.io.BufferedReader;
012    import java.io.InputStreamReader;
013    import java.net.URL;
014    import java.util.Collection;
015    import java.util.HashMap;
016    import java.util.StringTokenizer;
017    import java.util.regex.Matcher;
018    import java.util.regex.Pattern;
019    
020    import jcolibri.cbrcore.Attribute;
021    import jcolibri.cbrcore.CBRCase;
022    import jcolibri.cbrcore.CBRQuery;
023    import jcolibri.extensions.textual.IE.IEutils;
024    import jcolibri.extensions.textual.IE.gate.GatePhrasesExtractor;
025    import jcolibri.extensions.textual.IE.representation.IEText;
026    import jcolibri.extensions.textual.IE.representation.info.PhraseInfo;
027    import jcolibri.util.AttributeUtils;
028    import jcolibri.util.ProgressController;
029    
030    
031    
032    /**
033     *
034     * <p>
035     * Extracts Phrases using Regular Expressions.
036     * </p>
037     * <p>
038     * Rules file format is:
039     * </p>
040     * <p>
041     * [PhraseName]PhraseRegularExpresion
042     * <ul>
043     * <il>PhraseName is used to store the extracted information <il>Regular
044     * Expressions are deffined following java.util.regex.Pattern syntaxis. (See API
045     * for details)
046     * </ul>
047     * <p>
048     * The first version was developed at: Robert Gordon University - Aberdeen & Facultad Informática,
049     * Universidad Complutense de Madrid (GAIA)
050     * </p>
051     * @author Juan A. Recio-Garcia
052     * @version 2.0
053     * 
054     */
055    public class PhrasesExtractor
056    {
057        /**
058         * Performs the algorithm in the given attributes of a collection of cases.
059         * These attributes must be IEText objects.
060         */
061        public static void extractPhrases(Collection<CBRCase> cases, Collection<Attribute> attributes)
062        {
063            org.apache.commons.logging.LogFactory.getLog(PhrasesExtractor.class).info("Extracting phrases.");
064            ProgressController.init(PhrasesExtractor.class, "Extracting phrases ...", cases.size());
065            for(CBRCase c: cases)
066            {
067                for(Attribute a: attributes)
068                {
069                    Object o = AttributeUtils.findValue(a, c);
070                    extractPhrases((IEText)o);
071                }
072                ProgressController.step(GatePhrasesExtractor.class);
073            }
074            ProgressController.finish(GatePhrasesExtractor.class);
075        }
076    
077        /**
078         * Performs the algorithm in the given attributes of a query.
079         * These attributes must be IEText objects.
080         */
081        public static void extractPhrases(CBRQuery query, Collection<Attribute> attributes)
082        {
083            org.apache.commons.logging.LogFactory.getLog(PhrasesExtractor.class).info("Extracting phrases.");
084            for(Attribute a: attributes)
085            {
086                Object o = AttributeUtils.findValue(a, query);
087                extractPhrases((IEText)o);
088            }
089        }
090        
091        /**
092         * Performs the algorithm in all the attributes of a collection of cases
093         * These attributes must be IEText objects.
094         */
095        public static void extractPhrases(Collection<CBRCase> cases)
096        {
097            org.apache.commons.logging.LogFactory.getLog(PhrasesExtractor.class).info("Extracting phrases.");
098            ProgressController.init(PhrasesExtractor.class, "Extracting phrases ...", cases.size());
099            for(CBRCase c: cases)
100            {
101                Collection<IEText> texts = IEutils.getTexts(c);
102                for(IEText t : texts)
103                    extractPhrases(t);
104                ProgressController.step(GatePhrasesExtractor.class);
105            }
106            ProgressController.finish(GatePhrasesExtractor.class);
107        }
108        
109        /**
110         * Performs the algorithm in all the attributes of a query
111         * These attributes must be IEText objects.
112         */
113        public static void extractPhrases(CBRQuery query)
114        {     
115            org.apache.commons.logging.LogFactory.getLog(PhrasesExtractor.class).info("Extracting phrases.");
116            Collection<IEText> texts = IEutils.getTexts(query);
117            for(IEText t : texts)
118                extractPhrases(t);
119        }
120    
121        
122        
123        static HashMap<String, Pattern> rulesList;
124        
125        /**
126         * Performs the algorithm in a given IEText object
127         */
128        public static void extractPhrases(IEText text)
129        {
130            String rawText = text.getRAWContent();
131            
132            for(String rule : rulesList.keySet())
133            {
134                Pattern pattern = rulesList.get(rule);
135                Matcher m = pattern.matcher(rawText);
136                while (m.find()) {
137                    text.addPhrase(new PhraseInfo(rule, m.start(), m.end()));
138                }
139                            
140            }
141        }
142        
143        
144        
145        /**
146        * Loads a rules file
147        */
148        public static void loadRules(String filename)
149        {
150            try
151            {
152                URL file = jcolibri.util.FileIO.findFile(filename);
153                BufferedReader br = new BufferedReader( new InputStreamReader(file.openStream()));
154                rulesList = new HashMap<String,Pattern>();
155              
156                String line = "";
157                while ((line = br.readLine()) != null)
158                {
159                    if (line.startsWith("#"))
160                    continue;
161                    int pos = line.indexOf(']');
162                    if (pos == -1)
163                    throw new Exception(line + "  Feature field not found");
164                    String _feature = line.substring(1, pos);
165                    String _rule = line.substring(pos + 1);
166                    rulesList.put(cleanSpaces(_feature), Pattern.compile(_rule));
167                }
168                br.close();
169            } catch (Exception e)
170            {
171                org.apache.commons.logging.LogFactory.getLog(PhrasesExtractor.class).error(e); 
172            }
173        }
174    
175        private static String cleanSpaces(String w)
176        {
177            String res = "";
178            StringTokenizer st = new StringTokenizer(w, " ");
179            while (st.hasMoreTokens())
180            {
181                res += st.nextToken();
182                if (st.hasMoreTokens())
183                    res += " ";
184            }
185            return res;
186        }
187    }