001 /** 002 * OpennlpMainNamesExtractor.java 003 * jCOLIBRI2 framework. 004 * @author Juan A. Recio-García. 005 * GAIA - Group for Artificial Intelligence Applications 006 * http://gaia.fdi.ucm.es 007 * 20/06/2007 008 */ 009 package jcolibri.extensions.textual.IE.opennlp; 010 011 import java.util.Collection; 012 013 import jcolibri.cbrcore.Attribute; 014 import jcolibri.cbrcore.CBRCase; 015 import jcolibri.cbrcore.CBRQuery; 016 import jcolibri.extensions.textual.IE.IEutils; 017 import jcolibri.extensions.textual.IE.representation.IEText; 018 import jcolibri.extensions.textual.IE.representation.Token; 019 import jcolibri.util.AttributeUtils; 020 import jcolibri.util.ProgressController; 021 import opennlp.grok.preprocess.namefind.EnglishNameFinderME; 022 import opennlp.grok.preprocess.namefind.NameFinderME; 023 024 import org.jdom.Element; 025 026 /** 027 * Identifies the tokens that are main names in the sencence using a Maximum entrophy algorithm. 028 * The "isMainName" flag of the Token object is activated if a token is a main name. 029 * @author Juan A. Recio-Garcia 030 * @version 2.0 031 * 032 */ 033 public class OpennlpMainNamesExtractor 034 { 035 /** 036 * Performs the algorithm in the given attributes of a collection of cases. 037 * These attributes must be IETextOpenNLP objects. 038 */ 039 public static void extractMainNames(Collection<CBRCase> cases, Collection<Attribute> attributes) 040 { 041 org.apache.commons.logging.LogFactory.getLog(OpennlpMainNamesExtractor.class).info("Extracting main names."); 042 ProgressController.init(OpennlpMainNamesExtractor.class, "Extracting main names...", cases.size()); 043 for(CBRCase c: cases) 044 { 045 for(Attribute a: attributes) 046 { 047 Object o = AttributeUtils.findValue(a, c); 048 if(o instanceof IETextOpenNLP) 049 extractMainNames((IETextOpenNLP)o); 050 } 051 ProgressController.step(OpennlpMainNamesExtractor.class); 052 } 053 ProgressController.finish(OpennlpMainNamesExtractor.class); 054 } 055 056 /** 057 * Performs the algorithm in the given attributes of a query. 058 * These attributes must be IETextOpenNLP objects. 059 */ 060 public static void extractMainNames(CBRQuery query, Collection<Attribute> attributes) 061 { 062 org.apache.commons.logging.LogFactory.getLog(OpennlpMainNamesExtractor.class).info("Extracting main names."); 063 for(Attribute a: attributes) 064 { 065 Object o = AttributeUtils.findValue(a, query); 066 if(o instanceof IETextOpenNLP) 067 extractMainNames((IETextOpenNLP)o); 068 } 069 } 070 071 /** 072 * Performs the algorithm in all the IETextOpenNLP typed attributes of a collection of cases. 073 */ 074 public static void extractMainNames(Collection<CBRCase> cases) 075 { 076 org.apache.commons.logging.LogFactory.getLog(OpennlpMainNamesExtractor.class).info("Extracting main names."); 077 ProgressController.init(OpennlpMainNamesExtractor.class, "Extracting main names", cases.size()); 078 for(CBRCase c: cases) 079 { 080 Collection<IEText> texts = IEutils.getTexts(c); 081 for(IEText t : texts) 082 if(t instanceof IETextOpenNLP) 083 extractMainNames((IETextOpenNLP)t); 084 ProgressController.step(OpennlpMainNamesExtractor.class); 085 } 086 ProgressController.finish(OpennlpMainNamesExtractor.class); 087 } 088 089 /** 090 * Performs the algorithm in all the IETextOpenNLP typed attributes of a query. 091 */ 092 public static void extractMainNames(CBRQuery query) 093 { 094 org.apache.commons.logging.LogFactory.getLog(OpennlpMainNamesExtractor.class).info("Extracting main names."); 095 Collection<IEText> texts = IEutils.getTexts(query); 096 for(IEText t : texts) 097 if(t instanceof IETextOpenNLP) 098 extractMainNames((IETextOpenNLP)t); 099 } 100 101 /** 102 * Performs the algorithm in a given IETextOpenNLP object 103 */ 104 public static void extractMainNames(IETextOpenNLP text) 105 { 106 NameFinderME nameFinder = getNameFinder(); 107 try 108 { 109 nameFinder.process(text.getDocument()); 110 } catch (Exception e) 111 { 112 //org.apache.commons.logging.LogFactory.getLog(OpennlpMainNamesExtractor.class).warn("There was an error extracting main names. Continuing..."); 113 } 114 115 for(Token t: text.getAllTokens()) 116 { 117 Element tok = text.getTokenMapping(t); 118 String val = tok.getAttributeValue("type"); 119 t.setMainName((val!=null)&&val.equals("name")); 120 } 121 } 122 123 private static NameFinderME nameFinder = null; 124 private static NameFinderME getNameFinder() 125 { 126 if(nameFinder == null) 127 nameFinder = new EnglishNameFinderME(); 128 return nameFinder; 129 } 130 }