/**
 * OpennlpSplitter.java
 * jCOLIBRI2 framework.
 * @author Juan A. Recio-García.
 * GAIA - Group for Artificial Intelligence Applications
 * http://gaia.fdi.ucm.es
 * 21/06/2007
 */

package jcolibri.extensions.textual.IE.opennlp;

import java.util.Collection;
import java.util.List;

import org.jdom.Element;

import jcolibri.cbrcore.Attribute;
import jcolibri.cbrcore.CBRCase;
import jcolibri.cbrcore.CBRQuery;
import jcolibri.extensions.textual.IE.IEutils;
import jcolibri.extensions.textual.IE.gate.GateSplitter;
import jcolibri.extensions.textual.IE.representation.IEText;
import jcolibri.extensions.textual.IE.representation.Paragraph;
import jcolibri.extensions.textual.IE.representation.Sentence;
import jcolibri.extensions.textual.IE.representation.Token;
import jcolibri.util.AttributeUtils;
import jcolibri.util.ProgressController;
import opennlp.common.xml.NLPDocument;
import opennlp.grok.preprocess.sentdetect.EnglishSentenceDetectorME;
import opennlp.grok.preprocess.tokenize.EnglishTokenizerME;
import opennlp.grok.preprocess.tokenize.TokenizerME;

/**
 * Organizes an IETextOpenNLP object in paragraphs, sentences and tokens.
 * This implementation uses maximum entropy algorithms to obtain sentences and tokens.
 * <p>
 * All methods are static. The tokenizer and the sentence detector are created
 * lazily on first use and then reused; no synchronization is performed around
 * that lazy creation.
 *
 * @author Juan A. Recio-Garcia
 * @version 1.0
 */
public class OpennlpSplitter
{
    /** Logger used by every method of this class. */
    private static final org.apache.commons.logging.Log LOG =
        org.apache.commons.logging.LogFactory.getLog(OpennlpSplitter.class);

    /** Lazily created tokenizer, shared by all calls. */
    private static TokenizerME tokeniser = null;

    /** Lazily created sentence detector, shared by all calls. */
    private static EnglishSentenceDetectorME englishSentenceDetector = null;

    /**
     * Performs the algorithm in the given attributes of a collection of cases.
     * These attributes must be IETextOpenNLP objects; values of any other type
     * are skipped. Progress is reported through {@link ProgressController},
     * one step per case.
     *
     * @param cases cases whose attributes are going to be split
     * @param attributes attributes to look up in every case
     */
    public static void split(Collection<CBRCase> cases, Collection<Attribute> attributes)
    {
        LOG.info("Splitting OpenNLP text.");
        ProgressController.init(OpennlpSplitter.class, "Splitting OpenNLP text", cases.size());
        for (CBRCase c : cases)
        {
            for (Attribute a : attributes)
            {
                Object value = AttributeUtils.findValue(a, c);
                if (value instanceof IETextOpenNLP)
                {
                    split((IETextOpenNLP) value);
                }
            }
            ProgressController.step(OpennlpSplitter.class);
        }
        ProgressController.finish(OpennlpSplitter.class);
    }

    /**
     * Performs the algorithm in the given attributes of a query.
     * These attributes must be IETextOpenNLP objects; values of any other type
     * are skipped.
     *
     * @param query query whose attributes are going to be split
     * @param attributes attributes to look up in the query
     */
    public static void split(CBRQuery query, Collection<Attribute> attributes)
    {
        LOG.info("Splitting OpenNLP text.");
        for (Attribute a : attributes)
        {
            Object value = AttributeUtils.findValue(a, query);
            if (value instanceof IETextOpenNLP)
            {
                split((IETextOpenNLP) value);
            }
        }
    }

    /**
     * Performs the algorithm in all the IETextOpenNLP typed attributes of a
     * collection of cases. Progress is reported through
     * {@link ProgressController}, one step per case.
     *
     * @param cases cases whose texts are going to be split
     */
    public static void split(Collection<CBRCase> cases)
    {
        LOG.info("Splitting OpenNLP text.");
        ProgressController.init(OpennlpSplitter.class, "Splitting OpenNLP text", cases.size());
        for (CBRCase c : cases)
        {
            Collection<IEText> texts = IEutils.getTexts(c);
            for (IEText t : texts)
            {
                if (t instanceof IETextOpenNLP)
                {
                    split((IETextOpenNLP) t);
                }
            }
            ProgressController.step(OpennlpSplitter.class);
        }
        ProgressController.finish(OpennlpSplitter.class);
    }

    /**
     * Performs the algorithm in all the IETextOpenNLP typed attributes of a query.
     *
     * @param query query whose texts are going to be split
     */
    public static void split(CBRQuery query)
    {
        LOG.info("Splitting OpenNLP text.");
        Collection<IEText> texts = IEutils.getTexts(query);
        for (IEText t : texts)
        {
            if (t instanceof IETextOpenNLP)
            {
                split((IETextOpenNLP) t);
            }
        }
    }

    /**
     * Performs the algorithm in a given IETextOpenNLP object: runs the
     * tokenizer and then the sentence detector over the text's document, and
     * finally mirrors the resulting structure into the text through
     * {@link #organizeText(IETextOpenNLP)}.
     * <p>
     * Any exception raised along the way is caught and logged as an error;
     * it is not propagated to the caller.
     *
     * @param text text to split
     */
    public static void split(IETextOpenNLP text)
    {
        try
        {
            TokenizerME tok = getTokeniser();
            tok.process(text.getDocument());

            EnglishSentenceDetectorME detector = getSentenceDetector();
            detector.process(text.getDocument());

            organizeText(text);
        }
        catch (Exception e)
        {
            // Report under this class (not GateSplitter) and pass the exception
            // as the Throwable argument so that its stack trace is kept.
            LOG.error("Error splitting OpenNLP text.", e);
        }
    }

    /**
     * Copies the paragraph / sentence / token structure of the text's
     * NLPDocument into the IETextOpenNLP object.
     * <p>
     * The children of the root's "text" element are paired, by position, with
     * the strings returned by <code>getParagraphs()</code>; the children of
     * each paragraph element with <code>getSentences(paragraph)</code>; and
     * the children of each sentence element with <code>getWords(sentence)</code>.
     * For every pair a Paragraph, Sentence or Token is created, added to its
     * parent and registered in the text together with its source element.
     *
     * @param text text whose document has already been processed
     */
    @SuppressWarnings("unchecked")
    protected static void organizeText(IETextOpenNLP text)
    {
        NLPDocument doc = text.getDocument();

        Element root = doc.getRootElement();
        Element textElement = (Element) root.getChild("text");

        // The JDOM version in use returns an untyped list, hence the
        // "unchecked" suppression on this method.
        List<Element> paragraphElements = textElement.getChildren();
        String[] paragraphTexts = doc.getParagraphs();
        for (int p = 0; p < paragraphTexts.length; p++)
        {
            Element paragraphElement = paragraphElements.get(p);

            Paragraph paragraph = new Paragraph(paragraphTexts[p]);
            text.setParagraphMapping(paragraph, paragraphElement);
            text.addParagraph(paragraph);

            List<Element> sentenceElements = paragraphElement.getChildren();
            String[] sentenceTexts = doc.getSentences(paragraphElement);
            for (int s = 0; s < sentenceTexts.length; s++)
            {
                Element sentenceElement = sentenceElements.get(s);

                Sentence sentence = new Sentence(sentenceTexts[s]);
                paragraph.addSentence(sentence);
                text.setSentenceMapping(sentence, sentenceElement);

                List<Element> tokenElements = sentenceElement.getChildren();
                String[] tokenTexts = doc.getWords(sentenceElement);
                for (int t = 0; t < tokenTexts.length; t++)
                {
                    Element tokenElement = tokenElements.get(t);

                    Token token = new Token(tokenTexts[t]);
                    sentence.addToken(token);
                    text.setTokenMapping(token, tokenElement);
                }
            }
        }
    }

    /**
     * Returns the shared tokenizer, creating an EnglishTokenizerME the first
     * time it is requested.
     *
     * @return the shared tokenizer
     * @throws Exception declared for callers; propagates whatever the
     *         tokenizer construction raises
     */
    private static TokenizerME getTokeniser() throws Exception
    {
        if (tokeniser == null)
        {
            tokeniser = new EnglishTokenizerME();
        }
        return tokeniser;
    }

    /**
     * Returns the shared sentence detector, creating it the first time it is
     * requested.
     *
     * @return the shared sentence detector
     */
    private static EnglishSentenceDetectorME getSentenceDetector()
    {
        if (englishSentenceDetector == null)
        {
            englishSentenceDetector = new EnglishSentenceDetectorME();
        }
        return englishSentenceDetector;
    }
}