001 /** 002 * CRNTable.java 003 * jCOLIBRI2 framework. 004 * @author 005 * GAIA - Group for Artificial Intelligence Applications 006 * http://gaia.fdi.ucm.es 007 * 14/12/2007 008 */ 009 package jcolibri.extensions.textual.IE.common.crn; 010 011 import java.util.ArrayList; 012 import java.util.HashMap; 013 import java.util.Iterator; 014 import java.util.Set; 015 016 import jcolibri.extensions.textual.IE.common.crn.matrix.Matrix; 017 import jcolibri.extensions.textual.IE.common.crn.matrix.OneByteMatrix; 018 019 /** 020 * Stores the Index entries for a case base 021 * @author iaa 022 */ 023 public class CRNTable 024 { 025 private HashMap<String,ArrayList<CRNIndexEntry>> mappings; 026 private ArrayList<Object> caseIds; 027 private ArrayList<String> units; 028 029 private int nCases=0; 030 private short[][] units_cases; 031 private OneByteMatrix normalised; 032 033 //private float[] idf; 034 //private float[] maxTF; 035 036 /** Creates a new instance of CRNTable */ 037 public CRNTable() 038 { 039 mappings= new HashMap<String,ArrayList<CRNIndexEntry>>(); 040 caseIds= new ArrayList<Object>(); 041 } 042 043 /** adds a single entry to the indexing structure */ 044 public void addEntry(CRNIndexEntry e) 045 { 046 if (mappings.isEmpty()) 047 { 048 ArrayList<CRNIndexEntry> entryList= new ArrayList<CRNIndexEntry>(); 049 entryList.add(e); 050 mappings.put(e.getUnit(),entryList); 051 } 052 else 053 { 054 if (mappings.containsKey(e.getUnit())) 055 { 056 ArrayList<CRNIndexEntry> entryList= mappings.get(e.getUnit()); 057 entryList.add(e); 058 mappings.remove(e.getUnit()); 059 mappings.put(e.getUnit(),entryList); 060 } 061 else 062 { 063 ArrayList<CRNIndexEntry> entryList= new ArrayList<CRNIndexEntry>(); 064 entryList.add(e); 065 mappings.put(e.getUnit(),entryList); 066 } 067 } 068 if (!caseIds.contains(e.getCaseId())) 069 caseIds.add(e.getCaseId()); 070 nCases= caseIds.size(); 071 } 072 073 public void computeMatrix() 074 { 075 computeUnits(); 076 computeUnitsCasesMatrix(); 077 computeNormalisedUnitsCasesMatrix(); 078 //computeIDF(); 079 //computeMaxTF(); 080 } 081 082 083 // public float[] getIdf() 084 // { 085 // return idf; 086 // } 087 // 088 // public float[] getMaxTF() 089 // { 090 // return maxTF; 091 // } 092 093 public Matrix getNormalisedUnitsCasesMatrix() 094 { 095 return normalised; 096 } 097 098 public ArrayList<String> getUnits() 099 { 100 return units; 101 } 102 103 public short[][] getUnitsCasesMatrix() 104 { 105 return units_cases; 106 } 107 108 109 /** Print all the entries in the indexing structure */ 110 public void print() 111 { 112 Set<String> s= mappings.keySet(); 113 Iterator<String> i= s.iterator(); 114 while (i.hasNext()) 115 { 116 String key= i.next(); 117 ArrayList<CRNIndexEntry> list= mappings.get(key); 118 System.out.print(list.get(0).getUnit() + "-->"); 119 for(int a=0;a<list.size();a++) 120 { 121 System.out.print("["+ list.get(a).getCaseId().toString()+","+ list.get(a).getFreq()+ "] "); 122 } 123 System.out.println(); 124 } 125 } 126 127 /** Converts the Inverted file Index structure into a Matrix of the textualUnits by Cases 128 * @return Matrix containing the term frequencies of the units in the cases 129 */ 130 private void computeUnitsCasesMatrix() 131 { 132 org.apache.commons.logging.LogFactory.getLog(this.getClass()).info("Computing Unit-Cases Matrix. Size: "+ mappings.size()*nCases*2); 133 134 units_cases= new short[mappings.size()][nCases]; 135 136 for(String unit : units) 137 { 138 int unitIndex = units.indexOf(unit); 139 ArrayList<CRNIndexEntry> list= mappings.get(unit); 140 for(CRNIndexEntry indexEntry: list) 141 units_cases[unitIndex][caseIds.indexOf(indexEntry.getCaseId())]= indexEntry.getFreq(); 142 } 143 144 } 145 146 /** Computes the normalised TF in the CasesUnitsMatrix 147 */ 148 private void computeNormalisedUnitsCasesMatrix() 149 { 150 org.apache.commons.logging.LogFactory.getLog(this.getClass()).info("Normalizing Units-Cases Matrix. Size: "+ mappings.size()*nCases); 151 normalised= new OneByteMatrix(mappings.size(),nCases); //cases_units; 152 for (int i=0; i<nCases;i++) 153 { 154 double sum=0; 155 for (int j=0;j<mappings.size();j++) 156 sum+= units_cases[j][i] * units_cases[j][i]; 157 for (int j=0;j<mappings.size();j++) 158 normalised.setValue(j,i, (float)Math.sqrt(sum)); 159 } 160 } 161 162 /** 163 */ 164 public void computeUnits() 165 { 166 org.apache.commons.logging.LogFactory.getLog(this.getClass()).info("Computing Units list"); 167 168 units = new ArrayList<String>(); 169 170 units.addAll(mappings.keySet()); 171 /* 172 if (!mappings.isEmpty() && units==null) 173 { 174 Set<String> s= mappings.keySet(); 175 ArrayList<String> tempList= new ArrayList<String>(); 176 Iterator<String> x= s.iterator(); 177 while (x.hasNext()) 178 { 179 String key= x.next(); 180 tempList.add(key); 181 } 182 183 units= new String[tempList.size()]; 184 int j=0; 185 Iterator<String> i= tempList.iterator(); 186 while (i.hasNext()) 187 { 188 String key= i.next(); 189 units[j]= key; 190 j++; 191 } 192 } 193 */ 194 } 195 196 /** Computes the inverse document frequency of each textual unit 197 */ 198 // public void computeIDF() 199 // { 200 // org.apache.commons.logging.LogFactory.getLog(this.getClass()).info("Computing IDF"); 201 // 202 // float[][] indexedVector= units_cases; 203 // if (indexedVector.length >1) 204 // { 205 // idf= new float[units.length]; 206 // for (int i=0; i<units.length; i++) 207 // { 208 // int df=0; 209 // for (int j=0; j<nCases;j++) 210 // { 211 // if (indexedVector[i][j]!=0) 212 // df++; 213 // } 214 // idf[i]= (float)Math.log10((float)nCases/df); 215 // if (idf[i]==0) 216 // idf[i]= (float)0.00001; 217 // } 218 // } 219 // else 220 // { 221 // idf= new float[1]; 222 // idf[0]= 0; 223 // } 224 // } 225 226 /** Computes the maximum term frequency in each case that is used to normalise the term frequencies 227 */ 228 // public void computeMaxTF() 229 // { 230 // org.apache.commons.logging.LogFactory.getLog(this.getClass()).info("Computing Max Term frequencies"); 231 // 232 // float[] maxTF = new float[nCases]; 233 // float[][] indexedVector= cases_units; 234 // for (int i=0;i<nCases;i++) 235 // { 236 // maxTF[i]= indexedVector[i][0]; 237 // for (int j=0;j<mappings.size();j++) 238 // { 239 // if (indexedVector[i][j]> maxTF[i]) 240 // maxTF[i]= indexedVector[i][j]; 241 // } 242 // } 243 // } 244 245 public int getNumCases() 246 { 247 return nCases; 248 } 249 250 public ArrayList<Object> getCaseIds() 251 { 252 return caseIds; 253 } 254 } 255