001    /**
002     * DomainTopicClassifier.java
003     * jCOLIBRI2 framework. 
004     * @author Juan A. Recio-García.
005     * GAIA - Group for Artificial Intelligence Applications
006     * http://gaia.fdi.ucm.es
007     * 21/06/2007
008     */
009    package jcolibri.extensions.textual.IE.common;
010    
011    import java.io.BufferedReader;
012    import java.io.InputStreamReader;
013    import java.net.URL;
014    import java.util.ArrayList;
015    import java.util.Collection;
016    import java.util.HashMap;
017    import java.util.Iterator;
018    import java.util.StringTokenizer;
019    
020    import jcolibri.cbrcore.Attribute;
021    import jcolibri.cbrcore.CBRCase;
022    import jcolibri.cbrcore.CBRQuery;
023    import jcolibri.extensions.textual.IE.IEutils;
024    import jcolibri.extensions.textual.IE.representation.IEText;
025    import jcolibri.extensions.textual.IE.representation.info.FeatureInfo;
026    import jcolibri.extensions.textual.IE.representation.info.PhraseInfo;
027    import jcolibri.util.AttributeUtils;
028    import jcolibri.util.ProgressController;
029    
030    /**
031     * Classifies textual objects with a topic depending on the features and phrases.
032     * <br>
033     * This method uses a configuration file with rules following the syntaxis:
034     * <p>[Topic] &lt;FeatureName,FeatureValue&gt; &lt;FeatureName,FeatureValue&gt; ... &lt;Phrase&gt; &lt;Phrase&gt;</p>
035     * where:
036     * <ul>
037     * <li>Topic: Topic name
038     * <li>FeatureName: FeatureName extracted by features extraction method
039     * <li>FeatureValue: Feature value. It also can be '?', meaning any value.
040     * <li>Phrase: Any phrase identifier extracted by the phrases extraction method.
041     * </ul>
042     * <p>
043     * First version was developed at: Robert Gordon University - Aberdeen & Facultad Informática,
044     * Universidad Complutense de Madrid (GAIA)
045     * </p>
046     * @author Juan A. Recio-Garcia
047     * @version 2.0
048     * 
049     */
050    public class DomainTopicClassifier
051    {
052        static ArrayList<TopicRule> topicsRules;
053    
054        /**
055         * Performs the algorithm in the given attributes of a collection of cases.
056         * These attributes must be IEText objects.
057         */
058        public static void classifyWithTopic(Collection<CBRCase> cases, Collection<Attribute> attributes)
059        {
060            org.apache.commons.logging.LogFactory.getLog(DomainTopicClassifier.class).info("Classifying with topic.");
061            ProgressController.init(DomainTopicClassifier.class, "Classifying with topic  ...", cases.size());
062            for(CBRCase c: cases)
063            {
064                for(Attribute a: attributes)
065                {
066                    Object o = AttributeUtils.findValue(a, c);
067                    classifyWithTopic((IEText)o);
068                }
069                ProgressController.step(DomainTopicClassifier.class);
070            }
071            ProgressController.finish(DomainTopicClassifier.class);
072        }
073    
074        /**
075         * Performs the algorithm in the given attributes of a query.
076         * These attributes must be IEText objects.
077         */
078        public static void classifyWithTopic(CBRQuery query, Collection<Attribute> attributes)
079        {
080            org.apache.commons.logging.LogFactory.getLog(DomainTopicClassifier.class).info("Classifying with topic.");
081            for(Attribute a: attributes)
082            {
083                Object o = AttributeUtils.findValue(a, query);
084                classifyWithTopic((IEText)o);
085            }
086        }
087        
088        /**
089         * Performs the algorithm in all the attributes of a collection of cases
090         * These attributes must be IEText objects.
091         */
092        public static void classifyWithTopic(Collection<CBRCase> cases)
093        {
094            org.apache.commons.logging.LogFactory.getLog(DomainTopicClassifier.class).info("Classifying with topic.");
095            ProgressController.init(DomainTopicClassifier.class, "Classifying with topic  ...", cases.size());
096            for(CBRCase c: cases)
097            {
098                Collection<IEText> texts = IEutils.getTexts(c);
099                for(IEText t : texts)
100                    classifyWithTopic(t);
101                ProgressController.step(DomainTopicClassifier.class);
102            }
103            ProgressController.finish(DomainTopicClassifier.class);
104        }
105        
106        /**
107         * Performs the algorithm in all the attributes of a query
108         * These attributes must be IEText objects.
109         */
110        public static void classifyWithTopic(CBRQuery query)
111        {
112            org.apache.commons.logging.LogFactory.getLog(DomainTopicClassifier.class).info("Classifying with topic.");
113            Collection<IEText> texts = IEutils.getTexts(query);
114            for(IEText t : texts)
115                classifyWithTopic(t);
116        }  
117        
118        /**
119         * Performs the algorithm in a given IEText object
120         */
121        public static void classifyWithTopic(IEText text)
122        {
123            Collection<PhraseInfo> _phrases = text.getPhrases();
124            Collection<FeatureInfo> _features = text.getFeatures();
125            for (TopicRule rule : topicsRules)
126            {
127                // Chech rule conditions
128                boolean valid = true;
129                HashMap<String, String> conditions = rule._data;
130                Iterator<String> fOpIter = conditions.keySet().iterator();
131                // For each condition
132                while (fOpIter.hasNext() && valid)
133                {
134                    String featureOrPhrase = (String) fOpIter.next();
135                    String value = (String) conditions.get(featureOrPhrase);
136                    // It's a phrase condition
137                    if (value == null)
138                    {
139                        boolean found = false;
140                        for (Iterator<PhraseInfo> it = _phrases.iterator(); it.hasNext() && !found;)
141                        {
142                            PhraseInfo pi = it.next();
143                            if (pi.getPhrase().equals(featureOrPhrase))
144                                found = true;
145                        }
146                        valid = found;
147                    }
148                    // It's a feature condition
149                    else
150                    {
151                        boolean found = false;
152                        for (Iterator<FeatureInfo> it = _features.iterator(); it.hasNext() && !found;)
153                        {
154                            FeatureInfo fi = it.next();
155                            if (!value.equals("?"))
156                                found = (fi.getFeature().equals(featureOrPhrase) && fi.getValue().equals(value));
157                            else
158                                found = fi.getFeature().equals(featureOrPhrase);
159                        }
160                        valid = found;
161                    }
162                }
163                // If rule conditions are true -> include rule name in
164                // Topics
165                if (valid)
166                    text.addTopic(rule._name);
167            }
168        }
169    
170        /**
171         * Load topic classification rules file.
172         */
173        public static void loadRules(String filename)
174        {
175            try
176            {
177                topicsRules = new ArrayList<TopicRule>();
178                URL file = jcolibri.util.FileIO.findFile(filename);
179                BufferedReader br = new BufferedReader( new InputStreamReader(file.openStream()));
180                String line = "";
181                while ((line = br.readLine()) != null)
182                {
183                    if (line.startsWith("#"))
184                        continue;
185                    int pos = line.indexOf(']');
186                    if (pos == -1)
187                        throw new Exception(line + "  Topic field not found");
188                    String _feature = line.substring(1, pos);
189                    String _rest = line.substring(pos + 1);
190    
191                    HashMap<String, String> data = new HashMap<String, String>();
192                    int indexOpen;
193                    int indexClose;
194                    while (((indexOpen = _rest.indexOf("<")) != -1) && ((indexClose = _rest.indexOf(">")) != -1))
195                    {
196                        String content = _rest.substring(indexOpen, indexClose);
197                        StringTokenizer st = new StringTokenizer(content, "<,>");
198                        if (!st.hasMoreTokens())
199                            continue;
200                        String featureOrPhrase = st.nextToken();
201                        String value = null;
202                        if (st.hasMoreTokens())
203                            value = st.nextToken();
204                        // If its a Phrase condition -> value == null
205                        data.put(cleanSpaces(featureOrPhrase), cleanSpaces(value));
206                        _rest = _rest.substring(indexClose + 1, _rest.length());
207                    }
208    
209                    TopicRule rule = new TopicRule(_feature, data);
210                    topicsRules.add(rule);
211                }
212                br.close();
213            } catch (Exception e)
214            {
215                org.apache.commons.logging.LogFactory.getLog(DomainTopicClassifier.class).error(e);
216            }
217        }
218    
219        static private class TopicRule
220        {
221            String _name;
222    
223            HashMap<String, String> _data;
224    
225            TopicRule(String n, HashMap<String, String> d)
226            {
227                _name = n;
228                _data = d;
229            }
230        }
231    
232        static private String cleanSpaces(String w)
233        {
234            if (w == null)
235                return null;
236            String res = "";
237            StringTokenizer st = new StringTokenizer(w, " ");
238            while (st.hasMoreTokens())
239            {
240                res += st.nextToken();
241                if (st.hasMoreTokens())
242                    res += " ";
243            }
244            return res;
245        }
246    }