001    /**
002     * OpennlpMainNamesExtractor.java
003     * jCOLIBRI2 framework. 
004     * @author Juan A. Recio-García.
005     * GAIA - Group for Artificial Intelligence Applications
006     * http://gaia.fdi.ucm.es
007     * 20/06/2007
008     */
009    package jcolibri.extensions.textual.IE.opennlp;
010    
011    import java.util.Collection;
012    
013    import jcolibri.cbrcore.Attribute;
014    import jcolibri.cbrcore.CBRCase;
015    import jcolibri.cbrcore.CBRQuery;
016    import jcolibri.extensions.textual.IE.IEutils;
017    import jcolibri.extensions.textual.IE.representation.IEText;
018    import jcolibri.extensions.textual.IE.representation.Token;
019    import jcolibri.util.AttributeUtils;
020    import jcolibri.util.ProgressController;
021    import opennlp.grok.preprocess.namefind.EnglishNameFinderME;
022    import opennlp.grok.preprocess.namefind.NameFinderME;
023    
024    import org.jdom.Element;
025    
026    /**
027     * Identifies the tokens that are main names in the sencence using a Maximum entrophy algorithm.
028     * The "isMainName" flag of the Token object is activated if a token is a main name.
029     * @author Juan A. Recio-Garcia
030     * @version 2.0
031     *
032     */
033    public class OpennlpMainNamesExtractor
034    {
035        /**
036         * Performs the algorithm in the given attributes of a collection of cases.
037         * These attributes must be IETextOpenNLP objects.
038         */
039        public static void extractMainNames(Collection<CBRCase> cases, Collection<Attribute> attributes)
040        {
041            org.apache.commons.logging.LogFactory.getLog(OpennlpMainNamesExtractor.class).info("Extracting main names.");
042            ProgressController.init(OpennlpMainNamesExtractor.class, "Extracting main names...", cases.size());
043            for(CBRCase c: cases)
044            {
045                for(Attribute a: attributes)
046                {
047                    Object o = AttributeUtils.findValue(a, c);
048                    if(o instanceof IETextOpenNLP)
049                        extractMainNames((IETextOpenNLP)o);
050                }
051                ProgressController.step(OpennlpMainNamesExtractor.class);
052            }
053            ProgressController.finish(OpennlpMainNamesExtractor.class);
054        }
055    
056        /**
057         * Performs the algorithm in the given attributes of a query.
058         * These attributes must be IETextOpenNLP objects.
059         */
060        public static void extractMainNames(CBRQuery query, Collection<Attribute> attributes)
061        {
062                org.apache.commons.logging.LogFactory.getLog(OpennlpMainNamesExtractor.class).info("Extracting main names.");
063                for(Attribute a: attributes)
064                {
065                    Object o = AttributeUtils.findValue(a, query);
066                    if(o instanceof IETextOpenNLP)
067                        extractMainNames((IETextOpenNLP)o);
068                }
069        }
070        
071        /**
072         * Performs the algorithm in all the IETextOpenNLP typed attributes of a collection of cases.
073         */
074        public static void extractMainNames(Collection<CBRCase> cases)
075        {
076            org.apache.commons.logging.LogFactory.getLog(OpennlpMainNamesExtractor.class).info("Extracting main names.");
077            ProgressController.init(OpennlpMainNamesExtractor.class, "Extracting main names", cases.size());
078            for(CBRCase c: cases)
079            {
080                Collection<IEText> texts = IEutils.getTexts(c);
081                for(IEText t : texts)
082                    if(t instanceof IETextOpenNLP)
083                    extractMainNames((IETextOpenNLP)t);
084                ProgressController.step(OpennlpMainNamesExtractor.class);
085            }
086            ProgressController.finish(OpennlpMainNamesExtractor.class);
087        }
088        
089        /**
090         * Performs the algorithm in all the IETextOpenNLP typed attributes of a query.
091         */ 
092        public static void extractMainNames(CBRQuery query)
093        {      
094            org.apache.commons.logging.LogFactory.getLog(OpennlpMainNamesExtractor.class).info("Extracting main names.");
095            Collection<IEText> texts = IEutils.getTexts(query);
096            for(IEText t : texts)
097                if(t instanceof IETextOpenNLP)
098                     extractMainNames((IETextOpenNLP)t);
099        }
100        
101        /**
102         * Performs the algorithm in a given IETextOpenNLP object
103         */
104        public static void extractMainNames(IETextOpenNLP text)
105        {
106            NameFinderME nameFinder = getNameFinder();
107            try
108            {
109                nameFinder.process(text.getDocument());
110            } catch (Exception e)
111            {
112                //org.apache.commons.logging.LogFactory.getLog(OpennlpMainNamesExtractor.class).warn("There was an error extracting main names. Continuing..."); 
113            }
114            
115            for(Token t: text.getAllTokens())
116            {
117                Element tok = text.getTokenMapping(t);
118                String val  = tok.getAttributeValue("type");
119                t.setMainName((val!=null)&&val.equals("name"));
120            }
121        }
122        
123        private static NameFinderME nameFinder = null;
124        private static NameFinderME getNameFinder()
125        {
126            if(nameFinder == null)
127                nameFinder = new EnglishNameFinderME();
128            return nameFinder;
129        }
130    }