001    package jcolibri.method.retrieve.NNretrieval.similarity.local.textual.compressionbased;
002    
003    import jcolibri.datatypes.Text;
004    import jcolibri.method.retrieve.NNretrieval.similarity.LocalSimilarityFunction;
005    
006    
007    /**
008     * This function returns the similarity of two strings using text compression.
009     *
010     * Compression Based Dissimilarity (CDM) is defined as follows
011     * CDM(x, y) = C(xy) / (C(x) + C(y))
012     * where C(x) is the size of string x after compression (and C(y) similarly)
013     * and C(xy) is the size, after compression, of the string that comprises y 
014     * concatenated to the end of x.
015     *
016     * See the following paper:
017     * <pre>
018     * inproceedings{Keogh04,
019     * author = {E. Keogh and S. Lonardi and C. Ratanamahatana},
020     * title ={Towards Parameter-Free Data Mining},
021     * booktitle = {Procs.\ of the 10th ACM SIGKDD, International Conference on Knowledge Discovery and Data Mining},
022     * year = {2004},
023     * pages = {206--215},
024     * address = {New York, NY, USA},
025     * }
026     *</pre>
027     *
028     * Note that CDM is not a distance metric. It is NOT in general the case that
029     * CDM(x, y) = 0 iff x = y
030     * CDM(x, y) = CDM(y, x)
031     * CDM(x, y) + CDM(y, z) >= CDM(x, z)
032     * Do not use this measure in any application that requires metric properties.
033     * 
034     * Values of CDM will lies in the range (0.5, 1.0].
035     *
036     * We convert to a similarity measure by returning 1.0 - CDM(x, y).
037     *
038     * This similarity measure is effective in case-based spam filtering using GZip and
039     * PPM text compression algorithms. For the details, see:
040     * <pre>
041     * inproceedings{Delany-Bridge-2006,
042     * author = "S. J. Delany and D. Bridge",
043     * title = "Feature-Based and Feature-Free Textual {CBR}: A Comparison in Spam Filtering", 
044     * booktitle = "Procs.\ of the 17th Irish Conference on Artificial Intelligence and Cognitive Science", 
045     * pages = "244--253", 
046     * address = "Belfast, Northern Ireland",
047     * year = "2006"
048     * }
049     * </pre>
050     * Its effectiveness outside of spam filtering (e.g. texts other than emails, short strings,
051     * images, etc.) has not been demonstrated. To be effective, the 'right' compression algorithm
052     * must be used.
053     *
054     * @author Derek Bridge
055     * 18/05/07 
056     * 
057     */
058    public class CompressionBased implements LocalSimilarityFunction 
059    {
060            private ICompressor compressor;
061    
062            /**
063             * @param compressor an object that encapsulates the compression algorithm to be used.
064             */
065            public CompressionBased(ICompressor compressor)
066            {
067                this.compressor = compressor;
068            }
069    
070            /**
071             * Applies the similarity function.
072             * 
073             * @param x
074             * @param y
075             * @return the simlarity value.
076             */
077            public double compare(String x, String y) 
078            {
079                return 1.0 - CDM(x, y);
080            }
081    
082            /**
083             * Applies the similarity function.
084             * 
085             * @param o1
086             *            String
087             * @param o2
088             *            String
089             * @return result of apply the similarity function.
090             */
091            public double compute(Object o1, Object o2) throws jcolibri.exception.NoApplicableSimilarityFunctionException{
092                    if ((o1 == null) || (o2 == null))
093                            return 0;
094                    if (!((o1 instanceof java.lang.String)||(o1 instanceof Text)))
095                            throw new jcolibri.exception.NoApplicableSimilarityFunctionException(this.getClass(), o1.getClass());
096                    if (!((o2 instanceof java.lang.String)||(o1 instanceof Text)))
097                            throw new jcolibri.exception.NoApplicableSimilarityFunctionException(this.getClass(), o2.getClass());
098    
099                    if(o1 instanceof Text)
100                        o1 = ((Text)o1).toString();
101                    if(o2 instanceof Text)
102                        o2 = ((Text)o2).toString();
103                    
104                    String s1 = (String) o1;
105                    String s2 = (String) o2;
106                    double res = compare(s1, s2);
107                    return res;
108            }
109    
110            public boolean isApplicable(Object o1, Object o2)
111            {
112                    if((o1==null)&&(o2==null))
113                            return true;
114                    else if(o1==null)
115                            return (o2 instanceof String)||(o2 instanceof Text);
116                    else if(o2==null)
117                            return (o1 instanceof String)||(o1 instanceof Text);
118                    else
119                            return ((o1 instanceof String)&& (o2 instanceof String)) ||
120                                   ((o1 instanceof Text)  && (o2 instanceof Text));
121            }
122    
123            /**
124             * Applies the dissimilarity function.
125             * 
126             * @param x
127             * @param y
128             * @return
129             */
130            private double CDM(String x, String y) 
131            {
132                return compressor.getCompressedSize(x + y) * 1.0f / 
133                            (compressor.getCompressedSize(x) + compressor.getCompressedSize(y));
134            }
135            
136            public static void main(String[] args)
137            {
138                    CompressionBased a = new CompressionBased(new GZipCompressor());
139                    double res = a.compare("Hello", "Goodbye");
140                    System.out.println("Hello-Goodbye: " + res);
141                    res = a.compare("aaaaaaaaaaaaaaaaaaaaaaa", "zzzzzzzzzzzzzzzzzzzzzzzz");
142                    System.out.println("a-z: " + res);
143                    res = a.compare("Hello", "Hello");
144                    System.out.println("Hello-Hello: " + res);
145                    res = a.compare("abcdefghijkl", "bcdefghijk");
146                    System.out.println("abcdefghijkl-bcd: " + res);
147                    
148                    
149            }
150    }