/**
 * IEText.java
 * jCOLIBRI2 framework.
 * @author Juan A. Recio-García.
 * GAIA - Group for Artificial Intelligence Applications
 * http://gaia.fdi.ucm.es
 * 15/06/2007
 */
package jcolibri.extensions.textual.IE.representation;

import java.util.ArrayList;
import java.util.List;

import jcolibri.datatypes.Text;
import jcolibri.extensions.textual.IE.representation.info.FeatureInfo;
import jcolibri.extensions.textual.IE.representation.info.PhraseInfo;

/**
 * Represents a textual attribute that will be processed to extract information.
 * A text is composed of paragraphs, paragraphs of sentences and sentences of tokens:
 * <p><center><img src="IETextRepresentation.jpg"/></center></p>
 * This organization is created by a specific method.
 * <br>
 * This object stores a list of paragraphs in the order they appear in the text.
 * <br>
 * This class also stores the extracted information:
 * <ul>
 * <li>Phrases identified in the text.
 * <li>Features: identifier-value pairs extracted from the text.
 * <li>Topics: combining phrases and features a topic can be associated to a text.
 *     A topic is a classification of the text.
 * </ul>
 * <p>
 * All getters return the internal list itself (not a copy), so changes made by
 * the caller through the returned list are visible in this object.
 *
 * @author Juan A. Recio Garcia
 * @version 1.0
 * @see jcolibri.extensions.textual.IE.representation.Paragraph
 * @see jcolibri.extensions.textual.IE.representation.Sentence
 * @see jcolibri.extensions.textual.IE.representation.Token
 */
public class IEText extends Text
{

    /** Paragraphs of the text. */
    protected List<Paragraph> paragraphs;

    /** Phrases identified in the text. */
    protected List<PhraseInfo> phrases;

    /** Features extracted from the text. */
    protected List<FeatureInfo> features;

    /** Topics associated with the text. */
    protected List<String> topics;

    /**
     * Creates an empty IEText. All internal lists start empty.
     */
    public IEText()
    {
        paragraphs = new ArrayList<Paragraph>();
        phrases = new ArrayList<PhraseInfo>();
        features = new ArrayList<FeatureInfo>();
        topics = new ArrayList<String>();
    }

    /**
     * Creates an IEText from a String. The content is handed to the
     * superclass constructor; all internal lists start empty.
     *
     * @param content text passed to the {@link Text} constructor
     */
    public IEText(String content)
    {
        super(content);
        paragraphs = new ArrayList<Paragraph>();
        phrases = new ArrayList<PhraseInfo>();
        features = new ArrayList<FeatureInfo>();
        topics = new ArrayList<String>();
    }

    /**
     * Returns the original text of this IEText object.
     *
     * @return the inherited {@code rawContent} value
     */
    public String getRAWContent()
    {
        return rawContent;
    }

    /**
     * Returns the annotations extracted in this text: the string form of every
     * paragraph, followed by the phrases list and the features list.
     *
     * @return the concatenated annotations
     */
    public String printAnnotations()
    {
        // The builder is a local that never escapes, so no synchronization is needed.
        StringBuilder sb = new StringBuilder();
        for (Paragraph par : paragraphs)
        {
            sb.append(par.toString());
        }
        sb.append("\nPHRASES: ").append(phrases.toString());
        sb.append("\nFEATURES: ").append(features.toString());
        return sb.toString();
    }

    /**
     * Returns the features.
     *
     * @return the internal list of features
     */
    public List<FeatureInfo> getFeatures()
    {
        return features;
    }

    /**
     * Adds features.
     *
     * @param features features to append to this text's feature list
     */
    public void addFeatures(List<FeatureInfo> features)
    {
        // "this." is required: the parameter shadows the field, and without the
        // qualifier the argument list would be appended to itself instead of
        // being stored in this object.
        this.features.addAll(features);
    }

    /**
     * Adds a feature.
     *
     * @param feature feature to append to this text's feature list
     */
    public void addFeature(FeatureInfo feature)
    {
        this.features.add(feature);
    }

    /**
     * Returns the paragraphs.
     *
     * @return the internal list of paragraphs
     */
    public List<Paragraph> getParagraphs()
    {
        return paragraphs;
    }

    /**
     * Adds paragraphs.
     *
     * @param paragraphs paragraphs to append to this text's paragraph list
     */
    public void addParagraphs(List<Paragraph> paragraphs)
    {
        this.paragraphs.addAll(paragraphs);
    }

    /**
     * Adds a paragraph.
     *
     * @param paragraph paragraph to append to this text's paragraph list
     */
    public void addParagraph(Paragraph paragraph)
    {
        this.paragraphs.add(paragraph);
    }

    /**
     * Returns the phrases.
     *
     * @return the internal list of phrases
     */
    public List<PhraseInfo> getPhrases()
    {
        return phrases;
    }

    /**
     * Adds phrases.
     *
     * @param phrases phrases to append to this text's phrase list
     */
    public void addPhrases(List<PhraseInfo> phrases)
    {
        this.phrases.addAll(phrases);
    }

    /**
     * Adds a phrase.
     *
     * @param phrase phrase to append to this text's phrase list
     */
    public void addPhrase(PhraseInfo phrase)
    {
        this.phrases.add(phrase);
    }

    /**
     * Returns the topics.
     *
     * @return the internal list of topics
     */
    public List<String> getTopics()
    {
        return topics;
    }

    /**
     * Adds topics.
     *
     * @param topics topics to append to this text's topic list
     */
    public void addTopics(List<String> topics)
    {
        this.topics.addAll(topics);
    }

    /**
     * Adds a topic.
     *
     * @param topics the single topic to append to this text's topic list
     */
    public void addTopic(String topics)
    {
        this.topics.add(topics);
    }

    /**
     * Returns all the sentences of this text, iterating over all paragraphs.
     *
     * @return a newly created list with the sentences of every paragraph
     */
    public List<Sentence> getAllSentences()
    {
        List<Sentence> sentences = new ArrayList<Sentence>();
        for (Paragraph p : paragraphs)
        {
            sentences.addAll(p.getSentences());
        }
        return sentences;
    }

    /**
     * Returns all the tokens of this text, iterating over all paragraphs and
     * their contained sentences.
     *
     * @return a newly created list with the tokens of every sentence
     */
    public List<Token> getAllTokens()
    {
        List<Token> tokens = new ArrayList<Token>();
        for (Paragraph p : paragraphs)
        {
            for (Sentence s : p.getSentences())
            {
                tokens.addAll(s.getTokens());
            }
        }
        return tokens;
    }

}