001    package jcolibri.method.retrieve.NNretrieval.similarity.local.textual.compressionbased;
002    
003    import jcolibri.datatypes.Text;
004    import jcolibri.method.retrieve.NNretrieval.similarity.LocalSimilarityFunction;
005    
006    /**
007     * This function returns the similarity of two strings using text compression.
008     * 
009     * Normalised Compression Distance (NCD) is defined as follows NCD(x, y) =
010     * (C(xy) - min(C(X), C(y)) / max(C(x), C(y)) where C(x) is the size of string x
011     * after compression (and C(y) similarly) and C(xy) is the size, after
012     * compression, of the string that comprises y concatenated to the end of x.
013     * 
014     * See the following paper
015     * <pre>
016     * inproceedings{Li03, author = "M. Li and X. Chen and X. Li and B. Ma and P.
017     *                      Vitanyi", title = "The Similarity Metric", booktitle =
018     *                      {Procs.\ of the 14th Annual ACM-SIAM Symposium on
019     *                      Discrete Algorithms}, pages = {863--872}, address =
020     *                      {Baltimore, Maryland}, year = "2003", }
021     * </pre>
022     * Note that NCD is not a distance metric. It is NOT in general the case that
023     * NCD(x, y) = 0 iff x = y NCD(x, y) = NCD(y, x) NCD(x, y) + NCD(y, z) >= NCD(x,
024     * z) Do not use this measure in any application that requires metric
025     * properties.
026     * 
027     * Values of NCD will lies in the range [0.0, 1.0 + e] where e is some small
028     * value.
029     * 
030     * We convert to a similarity measure by returning 1.0 - min(NCD(x, y), 1.0).
031     * 
032     * This similarity measure is effective in case-based spam filtering using the
033     * GZip text compression algorithm. For the details, see:
034     * <pre>
035     * inproceedings{Delany-Bridge-2007, author = "S. J. Delany and D. Bridge",
036     *                                    title = "Catching the Drift: Using
037     *                                    Feature-Free Case-Based Reasoning for Spam
038     *                                    Filtering", booktitle = "Procs.\ of the
039     *                                    7th International Conference on Case Based
040     *                                    Reasoning", address = "Belfast, Northern
041     *                                    Ireland", year = "2007" }
042     * </pre>
043     * Its effectiveness outside of spam filtering (e.g. texts other than emails,
044     * short strings, images, etc.) has not been demonstrated. To be effective, the 'right'
045     * compression algorithm must be used.
046     * 
047     * @author Derek Bridge 18/05/07
048     * 
049     */
050    public class NormalisedCompression implements LocalSimilarityFunction {
051            private ICompressor compressor;
052    
053            /**
054             * @param compressor
055             *            an object that encapsulates the compression algorithm to be
056             *            used.
057             */
058            public NormalisedCompression(ICompressor compressor) {
059                    this.compressor = compressor;
060            }
061    
062            /**
063             * Applies the similarity function.
064             * 
065             * @param x
066             * @param y
067             * @return the similarity value.
068             */
069            public double compare(String x, String y) {
070                    return 1.0 - Math.min(NCD(x, y), 1.0);
071            }
072    
073            /**
074             * Applies the similarity function.
075             * 
076             * @param o1
077             *            String
078             * @param o2
079             *            String
080             * @return result of apply the similarity function.
081             */
082            public double compute(Object o1, Object o2)
083                            throws jcolibri.exception.NoApplicableSimilarityFunctionException {
084                    if ((o1 == null) || (o2 == null))
085                            return 0;
086                    if (!((o1 instanceof java.lang.String)||(o1 instanceof Text)))
087                            throw new jcolibri.exception.NoApplicableSimilarityFunctionException(this.getClass(), o1.getClass());
088                    if (!((o2 instanceof java.lang.String)||(o1 instanceof Text)))
089                            throw new jcolibri.exception.NoApplicableSimilarityFunctionException(this.getClass(), o2.getClass());
090    
091                    if(o1 instanceof Text)
092                        o1 = ((Text)o1).toString();
093                    if(o2 instanceof Text)
094                        o2 = ((Text)o2).toString();
095    
096                    String s1 = (String) o1;
097                    String s2 = (String) o2;
098                    double res = compare(s1, s2);
099                    return res;
100            }
101    
102            public boolean isApplicable(Object o1, Object o2)
103            {
104                    if((o1==null)&&(o2==null))
105                            return true;
106                    else if(o1==null)
107                            return (o2 instanceof String)||(o2 instanceof Text);
108                    else if(o2==null)
109                            return (o1 instanceof String)||(o1 instanceof Text);
110                    else
111                            return ((o1 instanceof String)&& (o2 instanceof String)) ||
112                                   ((o1 instanceof Text)  && (o2 instanceof Text));
113            }
114    
115            /**
116             * Applies the dissimilarity function.
117             * 
118             * @param x
119             * @param y
120             * @return
121             */
122            private double NCD(String x, String y) {
123                    int cxy = compressor.getCompressedSize(x + y);
124                    int cx = compressor.getCompressedSize(x);
125                    int cy = compressor.getCompressedSize(y);
126                    return (cxy - Math.min(cx, cy)) * 1.0 / Math.max(cx, cy);
127            }
128            
129            public static void main(String[] args)
130            {
131                    NormalisedCompression a = new NormalisedCompression(new GZipCompressor());
132                    double res = a.compare("Hello", "Goodbye");
133                    System.out.println("Hello-Goodbye: " + res);
134                    res = a.compare("aaaaaaaaaaaaaaaaaaaaaaaaaa", "zzzzzzzzzzzzzzzzzzz");
135                    System.out.println("a-z: " + res);
136                    res = a.compare("Hello", "Hello");
137                    System.out.println("Hello-Hello: " + res);
138                    res = a.compare("abcdefghijkl", "bcdefghijk");
139                    System.out.println("abcdefghijkl-bcd: " + res);
140            }
141    }