/**
 * GateSplitter.java
 * jCOLIBRI2 framework.
 * @author Juan A. Recio-García.
 * GAIA - Group for Artificial Intelligence Applications
 * http://gaia.fdi.ucm.es
 * 19/06/2007
 */
package jcolibri.extensions.textual.IE.gate;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.Factory;
import gate.GateConstants;
import gate.creole.splitter.SentenceSplitter;
import gate.creole.tokeniser.DefaultTokeniser;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import jcolibri.cbrcore.Attribute;
import jcolibri.cbrcore.CBRCase;
import jcolibri.cbrcore.CBRQuery;
import jcolibri.extensions.textual.IE.IEutils;
import jcolibri.extensions.textual.IE.representation.IEText;
import jcolibri.extensions.textual.IE.representation.Paragraph;
import jcolibri.extensions.textual.IE.representation.Sentence;
import jcolibri.extensions.textual.IE.representation.Token;
import jcolibri.util.AttributeUtils;
import jcolibri.util.ProgressController;

/**
 * Organizes an IETextGate object in paragraphs, sentences and tokens.
 * This implementation uses the GATE algorithms to obtain sentences and tokens.
 * @author Juan A. Recio-Garcia
 * @version 1.0
 */
public class GateSplitter
{
    /** Logger shared by every method of this class. */
    private static final org.apache.commons.logging.Log LOG =
        org.apache.commons.logging.LogFactory.getLog(GateSplitter.class);

    /**
     * Performs the algorithm in the given attributes of a collection of cases.
     * Attribute values that are not IETextGate objects are skipped.
     * @param cases cases whose attributes are processed
     * @param attributes attributes to look up in every case
     */
    public static void split(Collection<CBRCase> cases, Collection<Attribute> attributes)
    {
        LOG.info("Splitting Gate text.");
        ProgressController.init(GateSplitter.class, "Splitting Gate text", cases.size());
        for (CBRCase c : cases)
        {
            for (Attribute a : attributes)
            {
                splitIfGateText(AttributeUtils.findValue(a, c));
            }
            ProgressController.step(GateSplitter.class);
        }
        ProgressController.finish(GateSplitter.class);
    }

    /**
     * Performs the algorithm in the given attributes of a query.
     * Attribute values that are not IETextGate objects are skipped.
     * @param query query whose attributes are processed
     * @param attributes attributes to look up in the query
     */
    public static void split(CBRQuery query, Collection<Attribute> attributes)
    {
        LOG.info("Splitting Gate text.");
        for (Attribute a : attributes)
        {
            splitIfGateText(AttributeUtils.findValue(a, query));
        }
    }

    /**
     * Performs the algorithm in all the IETextGate typed attributes of a collection of cases.
     * @param cases cases whose texts (as returned by IEutils.getTexts) are processed
     */
    public static void split(Collection<CBRCase> cases)
    {
        LOG.info("Splitting Gate text.");
        ProgressController.init(GateSplitter.class, "Splitting Gate text", cases.size());
        for (CBRCase c : cases)
        {
            splitGateTexts(IEutils.getTexts(c));
            ProgressController.step(GateSplitter.class);
        }
        ProgressController.finish(GateSplitter.class);
    }

    /**
     * Performs the algorithm in all the IETextGate typed attributes of a query.
     * @param query query whose texts (as returned by IEutils.getTexts) are processed
     */
    public static void split(CBRQuery query)
    {
        LOG.info("Splitting Gate text.");
        splitGateTexts(IEutils.getTexts(query));
    }

    /**
     * Performs the algorithm in a given IETextGate object: runs the GATE
     * tokeniser and sentence splitter over its document and then organizes
     * the resulting annotations with {@link #organizeText(IETextGate)}.
     * Any exception raised while processing is logged and not propagated.
     * @param text text to tokenise and split
     */
    public static void split(IETextGate text)
    {
        try
        {
            DefaultTokeniser tok = getTokeniser();
            tok.setDocument(text.getDocument());
            tok.execute();

            SentenceSplitter splitter = getSentenceSplitter();
            splitter.setDocument(text.getDocument());
            splitter.execute();

            organizeText(text);
        }
        catch (Exception e)
        {
            // Pass the exception as the throwable too, so the stack trace is not lost.
            LOG.error(e, e);
        }
    }

    /**
     * Builds the Paragraph / Sentence / Token structure of the text from the
     * annotations of its GATE document and registers the mapping between each
     * created object and its annotation.
     * <p>
     * Paragraphs come from the "paragraph" annotations of the original markups
     * set; sentences and tokens come from the "Sentence" and "Token"
     * annotations of the default set. A sentence is attached to the first
     * paragraph whose offsets contain it, and a token to the first sentence
     * whose offsets contain it; once attached, an annotation is not
     * considered again.
     * @param text text whose document has already been tokenised and split
     */
    @SuppressWarnings("unchecked")
    protected static void organizeText(IETextGate text)
    {
        Document doc = text.getDocument();
        String content = text.getRAWContent();

        AnnotationSet originalMarkups = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
        AnnotationSet defaultAnnotations = doc.getAnnotations();

        // AnnotationSet.get(Integer) looks an annotation up by its id, not by
        // its position, so the paragraphs are copied into a list and iterated
        // directly instead of being fetched with a 0..size-1 counter.
        List<Annotation> paragraphs = sortedCopy(originalMarkups.get("paragraph"));
        List<Annotation> sents = sortedCopy(defaultAnnotations.get("Sentence"));
        List<Annotation> tokens = sortedCopy(defaultAnnotations.get("Token"));

        for (Annotation par : paragraphs)
        {
            int beginP = startOf(par);
            int endP = endOf(par);
            Paragraph myPar = new Paragraph(content.substring(beginP, endP));
            text.addParagraph(myPar);
            text.setParagraphMapping(myPar, par);

            //Sentences
            List<Annotation> assignedSents = new ArrayList<Annotation>();
            for (Annotation sent : sents)
            {
                int beginS = startOf(sent);
                int endS = endOf(sent);
                if ((beginS < beginP) || (endS > endP))
                    continue;
                Sentence mySent = new Sentence(content.substring(beginS, endS));
                myPar.addSentence(mySent);
                text.setSentenceMapping(mySent, sent);
                // Remember the sentence so it is dropped from the pending list
                // below (the list is not modified while it is being iterated).
                assignedSents.add(sent);

                //Tokens
                List<Annotation> assignedTokens = new ArrayList<Annotation>();
                for (Annotation token : tokens)
                {
                    int beginT = startOf(token);
                    int endT = endOf(token);
                    if ((beginT < beginS) || (endT > endS))
                        continue;
                    Token myToken = new Token(content.substring(beginT, endT));
                    mySent.addToken(myToken);
                    text.setTokenMapping(myToken, token);
                    assignedTokens.add(token);
                }
                tokens.removeAll(assignedTokens);
            }
            sents.removeAll(assignedSents);
        }
    }

    /** Splits the given value when it is an IETextGate; any other value is ignored. */
    private static void splitIfGateText(Object value)
    {
        if (value instanceof IETextGate)
            split((IETextGate) value);
    }

    /** Splits every IETextGate contained in the given texts. */
    private static void splitGateTexts(Collection<IEText> texts)
    {
        for (IEText t : texts)
            splitIfGateText(t);
    }

    /**
     * Copies the annotations into a new list sorted by their natural ordering.
     * A null set yields an empty list.
     * NOTE(review): some GATE versions appear to return null from get(type)
     * when no annotation of that type exists - confirm against the GATE
     * version in use.
     */
    @SuppressWarnings("unchecked")
    private static List<Annotation> sortedCopy(AnnotationSet annotations)
    {
        List<Annotation> copy = new ArrayList<Annotation>();
        if (annotations != null)
            copy.addAll(annotations);
        java.util.Collections.sort(copy);
        return copy;
    }

    /** Returns the start offset of the annotation as an int. */
    private static int startOf(Annotation annotation)
    {
        return annotation.getStartNode().getOffset().intValue();
    }

    /** Returns the end offset of the annotation as an int. */
    private static int endOf(Annotation annotation)
    {
        return annotation.getEndNode().getOffset().intValue();
    }

    /** Tokeniser instance, created on first use and reused afterwards. */
    private static DefaultTokeniser tokeniser = null;

    /**
     * Returns the shared tokeniser, creating it on the first call.
     * This lazy creation is not synchronized.
     * NOTE(review): Factory.createResource may already initialise the
     * resource - confirm whether the explicit init() is still required.
     */
    private static DefaultTokeniser getTokeniser() throws Exception
    {
        if (tokeniser == null)
        {
            tokeniser = (DefaultTokeniser) Factory.createResource(
                    "gate.creole.tokeniser.DefaultTokeniser");
            tokeniser.init();
        }
        return tokeniser;
    }

    /** Sentence splitter instance, created on first use and reused afterwards. */
    private static SentenceSplitter sentenceSplitter = null;

    /**
     * Returns the shared sentence splitter, creating it on the first call.
     * This lazy creation is not synchronized.
     * NOTE(review): Factory.createResource may already initialise the
     * resource - confirm whether the explicit init() is still required.
     */
    private static SentenceSplitter getSentenceSplitter() throws Exception
    {
        if (sentenceSplitter == null)
        {
            sentenceSplitter = (SentenceSplitter) Factory.createResource(
                    "gate.creole.splitter.SentenceSplitter");
            sentenceSplitter.init();
        }
        return sentenceSplitter;
    }

}