001 package jcolibri.method.retrieve.NNretrieval.similarity.local.textual.compressionbased; 002 003 import jcolibri.datatypes.Text; 004 import jcolibri.method.retrieve.NNretrieval.similarity.LocalSimilarityFunction; 005 006 /** 007 * This function returns the similarity of two strings using text compression. 008 * 009 * Normalised Compression Distance (NCD) is defined as follows NCD(x, y) = 010 * (C(xy) - min(C(X), C(y)) / max(C(x), C(y)) where C(x) is the size of string x 011 * after compression (and C(y) similarly) and C(xy) is the size, after 012 * compression, of the string that comprises y concatenated to the end of x. 013 * 014 * See the following paper 015 * <pre> 016 * inproceedings{Li03, author = "M. Li and X. Chen and X. Li and B. Ma and P. 017 * Vitanyi", title = "The Similarity Metric", booktitle = 018 * {Procs.\ of the 14th Annual ACM-SIAM Symposium on 019 * Discrete Algorithms}, pages = {863--872}, address = 020 * {Baltimore, Maryland}, year = "2003", } 021 * </pre> 022 * Note that NCD is not a distance metric. It is NOT in general the case that 023 * NCD(x, y) = 0 iff x = y NCD(x, y) = NCD(y, x) NCD(x, y) + NCD(y, z) >= NCD(x, 024 * z) Do not use this measure in any application that requires metric 025 * properties. 026 * 027 * Values of NCD will lies in the range [0.0, 1.0 + e] where e is some small 028 * value. 029 * 030 * We convert to a similarity measure by returning 1.0 - min(NCD(x, y), 1.0). 031 * 032 * This similarity measure is effective in case-based spam filtering using the 033 * GZip text compression algorithm. For the details, see: 034 * <pre> 035 * inproceedings{Delany-Bridge-2007, author = "S. J. Delany and D. Bridge", 036 * title = "Catching the Drift: Using 037 * Feature-Free Case-Based Reasoning for Spam 038 * Filtering", booktitle = "Procs.\ of the 039 * 7th International Conference on Case Based 040 * Reasoning", address = "Belfast, Northern 041 * Ireland", year = "2007" } 042 * </pre> 043 * Its effectiveness outside of spam filtering (e.g. texts other than emails, 044 * short strings, images, etc.) has not been demonstrated. To be effective, the 'right' 045 * compression algorithm must be used. 046 * 047 * @author Derek Bridge 18/05/07 048 * 049 */ 050 public class NormalisedCompression implements LocalSimilarityFunction { 051 private ICompressor compressor; 052 053 /** 054 * @param compressor 055 * an object that encapsulates the compression algorithm to be 056 * used. 057 */ 058 public NormalisedCompression(ICompressor compressor) { 059 this.compressor = compressor; 060 } 061 062 /** 063 * Applies the similarity function. 064 * 065 * @param x 066 * @param y 067 * @return the similarity value. 068 */ 069 public double compare(String x, String y) { 070 return 1.0 - Math.min(NCD(x, y), 1.0); 071 } 072 073 /** 074 * Applies the similarity function. 075 * 076 * @param o1 077 * String 078 * @param o2 079 * String 080 * @return result of apply the similarity function. 081 */ 082 public double compute(Object o1, Object o2) 083 throws jcolibri.exception.NoApplicableSimilarityFunctionException { 084 if ((o1 == null) || (o2 == null)) 085 return 0; 086 if (!((o1 instanceof java.lang.String)||(o1 instanceof Text))) 087 throw new jcolibri.exception.NoApplicableSimilarityFunctionException(this.getClass(), o1.getClass()); 088 if (!((o2 instanceof java.lang.String)||(o1 instanceof Text))) 089 throw new jcolibri.exception.NoApplicableSimilarityFunctionException(this.getClass(), o2.getClass()); 090 091 if(o1 instanceof Text) 092 o1 = ((Text)o1).toString(); 093 if(o2 instanceof Text) 094 o2 = ((Text)o2).toString(); 095 096 String s1 = (String) o1; 097 String s2 = (String) o2; 098 double res = compare(s1, s2); 099 return res; 100 } 101 102 public boolean isApplicable(Object o1, Object o2) 103 { 104 if((o1==null)&&(o2==null)) 105 return true; 106 else if(o1==null) 107 return (o2 instanceof String)||(o2 instanceof Text); 108 else if(o2==null) 109 return (o1 instanceof String)||(o1 instanceof Text); 110 else 111 return ((o1 instanceof String)&& (o2 instanceof String)) || 112 ((o1 instanceof Text) && (o2 instanceof Text)); 113 } 114 115 /** 116 * Applies the dissimilarity function. 117 * 118 * @param x 119 * @param y 120 * @return 121 */ 122 private double NCD(String x, String y) { 123 int cxy = compressor.getCompressedSize(x + y); 124 int cx = compressor.getCompressedSize(x); 125 int cy = compressor.getCompressedSize(y); 126 return (cxy - Math.min(cx, cy)) * 1.0 / Math.max(cx, cy); 127 } 128 129 public static void main(String[] args) 130 { 131 NormalisedCompression a = new NormalisedCompression(new GZipCompressor()); 132 double res = a.compare("Hello", "Goodbye"); 133 System.out.println("Hello-Goodbye: " + res); 134 res = a.compare("aaaaaaaaaaaaaaaaaaaaaaaaaa", "zzzzzzzzzzzzzzzzzzz"); 135 System.out.println("a-z: " + res); 136 res = a.compare("Hello", "Hello"); 137 System.out.println("Hello-Hello: " + res); 138 res = a.compare("abcdefghijkl", "bcdefghijk"); 139 System.out.println("abcdefghijkl-bcd: " + res); 140 } 141 }