001    /**
002     * CRNTable.java
003     * jCOLIBRI2 framework. 
004     * @author 
005     * GAIA - Group for Artificial Intelligence Applications
006     * http://gaia.fdi.ucm.es
007     * 14/12/2007
008     */
009    package jcolibri.extensions.textual.IE.common.crn;
010    
011    import java.util.ArrayList;
012    import java.util.HashMap;
013    import java.util.Iterator;
014    import java.util.Set;
015    
016    import jcolibri.extensions.textual.IE.common.crn.matrix.Matrix;
017    import jcolibri.extensions.textual.IE.common.crn.matrix.OneByteMatrix;
018    
019    /**
020     * Stores the Index entries for a case base
021     * @author iaa
022     */
023    public class CRNTable 
024    {
025        private HashMap<String,ArrayList<CRNIndexEntry>> mappings;
026        private ArrayList<Object> caseIds;
027        private ArrayList<String> units;
028        
029        private int nCases=0;
030        private short[][] units_cases;
031        private OneByteMatrix normalised;
032    
033        //private float[] idf;
034        //private float[] maxTF;
035        
036        /** Creates a new instance of CRNTable */
037        public CRNTable() 
038        {
039            mappings= new HashMap<String,ArrayList<CRNIndexEntry>>();
040            caseIds= new ArrayList<Object>();
041        }
042        
043        /** adds a single entry to the indexing structure */
044        public void addEntry(CRNIndexEntry e)
045        {
046            if (mappings.isEmpty())
047            {
048                ArrayList<CRNIndexEntry> entryList= new ArrayList<CRNIndexEntry>();
049                entryList.add(e);
050                mappings.put(e.getUnit(),entryList);
051            }
052            else
053            {
054                if (mappings.containsKey(e.getUnit()))
055                {
056                    ArrayList<CRNIndexEntry> entryList= mappings.get(e.getUnit());
057                    entryList.add(e);
058                    mappings.remove(e.getUnit());
059                    mappings.put(e.getUnit(),entryList);                
060                }
061                else
062                {
063                    ArrayList<CRNIndexEntry> entryList= new ArrayList<CRNIndexEntry>();
064                    entryList.add(e);
065                    mappings.put(e.getUnit(),entryList);
066                }
067            }
068            if (!caseIds.contains(e.getCaseId()))
069                    caseIds.add(e.getCaseId());
070            nCases= caseIds.size();                
071        }
072        
073        public void computeMatrix()
074        {
075            computeUnits();
076            computeUnitsCasesMatrix();
077            computeNormalisedUnitsCasesMatrix();
078            //computeIDF();
079            //computeMaxTF();
080        }
081        
082    
083    //    public float[] getIdf()
084    //    {
085    //        return idf;
086    //    }
087    //
088    //    public float[] getMaxTF()
089    //    {
090    //        return maxTF;
091    //    }
092    
093        public Matrix getNormalisedUnitsCasesMatrix()
094        {
095            return normalised;
096        }
097    
098        public ArrayList<String> getUnits()
099        {
100            return units;
101        }
102    
103        public short[][] getUnitsCasesMatrix()
104        {
105            return units_cases;
106        }
107        
108    
109        /** Print all the entries in the indexing structure */
110        public void print()
111        {
112            Set<String> s= mappings.keySet();
113            Iterator<String> i= s.iterator();
114            while (i.hasNext())
115            {            
116                String key= i.next();
117                ArrayList<CRNIndexEntry> list= mappings.get(key);
118                System.out.print(list.get(0).getUnit() + "-->");
119                for(int a=0;a<list.size();a++)
120                {
121                   System.out.print("["+ list.get(a).getCaseId().toString()+","+ list.get(a).getFreq()+ "] ");
122                }
123                System.out.println();            
124            }        
125        }
126        
127        /** Converts the Inverted file Index structure into a Matrix of the textualUnits by Cases
128         * @return Matrix containing the term frequencies of the units in the cases
129         */
130        private void computeUnitsCasesMatrix()
131        {
132            org.apache.commons.logging.LogFactory.getLog(this.getClass()).info("Computing Unit-Cases Matrix. Size: "+ mappings.size()*nCases*2);
133    
134            units_cases= new short[mappings.size()][nCases];
135        
136            for(String unit : units)
137            {           
138                int unitIndex = units.indexOf(unit);
139                ArrayList<CRNIndexEntry> list= mappings.get(unit);                
140                for(CRNIndexEntry indexEntry: list)
141                    units_cases[unitIndex][caseIds.indexOf(indexEntry.getCaseId())]= indexEntry.getFreq();            
142            }
143    
144        }
145        
146        /** Computes the normalised TF in the CasesUnitsMatrix 
147         */
148        private void computeNormalisedUnitsCasesMatrix()
149        {
150            org.apache.commons.logging.LogFactory.getLog(this.getClass()).info("Normalizing Units-Cases Matrix. Size: "+ mappings.size()*nCases);
151            normalised= new OneByteMatrix(mappings.size(),nCases);   //cases_units;
152            for (int i=0; i<nCases;i++)
153            {
154                double sum=0;
155                for (int j=0;j<mappings.size();j++)
156                    sum+= units_cases[j][i] * units_cases[j][i];
157                for (int j=0;j<mappings.size();j++)
158                    normalised.setValue(j,i, (float)Math.sqrt(sum));            
159            }           
160        }
161        
162        /**
163         */
164        public void computeUnits()
165        {
166            org.apache.commons.logging.LogFactory.getLog(this.getClass()).info("Computing Units list");
167    
168            units = new ArrayList<String>();
169            
170            units.addAll(mappings.keySet());
171            /*
172            if (!mappings.isEmpty() && units==null)
173            {
174                Set<String> s= mappings.keySet();
175                ArrayList<String> tempList= new ArrayList<String>();
176                Iterator<String> x= s.iterator();
177                while (x.hasNext())
178                {
179                    String key= x.next();
180                    tempList.add(key);
181                }            
182                
183                units= new String[tempList.size()];
184                int j=0;
185                Iterator<String> i= tempList.iterator();        
186                while (i.hasNext())
187                {
188                    String key= i.next();
189                    units[j]= key;                
190                    j++;
191                }
192            }
193            */
194        }
195        
196        /** Computes the inverse document frequency of each textual unit
197         */
198    //    public void computeIDF()
199    //    {
200    //      org.apache.commons.logging.LogFactory.getLog(this.getClass()).info("Computing IDF");
201    //
202    //      float[][] indexedVector= units_cases;
203    //        if (indexedVector.length >1)
204    //        {
205    //           idf= new float[units.length];
206    //           for (int i=0; i<units.length; i++)
207    //           {
208    //               int df=0;
209    //               for (int j=0; j<nCases;j++)
210    //               {
211    //                   if (indexedVector[i][j]!=0)
212    //                        df++;
213    //               }
214    //               idf[i]= (float)Math.log10((float)nCases/df);
215    //               if (idf[i]==0)
216    //                   idf[i]= (float)0.00001;
217    //           }            
218    //        }
219    //        else
220    //        {
221    //            idf= new float[1];
222    //            idf[0]= 0;
223    //        }
224    //    }
225        
226        /** Computes the maximum term frequency in each case that is used to normalise the term frequencies
227         */
228    //    public void computeMaxTF()
229    //    {
230    //      org.apache.commons.logging.LogFactory.getLog(this.getClass()).info("Computing Max Term frequencies");
231    //
232    //      float[] maxTF = new float[nCases];
233    //      float[][] indexedVector= cases_units;
234    //        for (int i=0;i<nCases;i++)
235    //        {
236    //             maxTF[i]= indexedVector[i][0];
237    //             for (int j=0;j<mappings.size();j++)
238    //             {
239    //                 if (indexedVector[i][j]> maxTF[i])
240    //                     maxTF[i]= indexedVector[i][j];
241    //             }            
242    //        }
243    //    }
244        
245        public int getNumCases()
246        {
247            return nCases;
248        }
249        
250        public ArrayList<Object> getCaseIds()
251        {
252            return caseIds;
253        }
254    }
255