001 package jcolibri.method.retrieve.NNretrieval.similarity.local.textual.compressionbased; 002 003 import jcolibri.datatypes.Text; 004 import jcolibri.method.retrieve.NNretrieval.similarity.LocalSimilarityFunction; 005 006 007 /** 008 * This function returns the similarity of two strings using text compression. 009 * 010 * Compression Based Dissimilarity (CDM) is defined as follows 011 * CDM(x, y) = C(xy) / (C(x) + C(y)) 012 * where C(x) is the size of string x after compression (and C(y) similarly) 013 * and C(xy) is the size, after compression, of the string that comprises y 014 * concatenated to the end of x. 015 * 016 * See the following paper: 017 * <pre> 018 * inproceedings{Keogh04, 019 * author = {E. Keogh and S. Lonardi and C. Ratanamahatana}, 020 * title ={Towards Parameter-Free Data Mining}, 021 * booktitle = {Procs.\ of the 10th ACM SIGKDD, International Conference on Knowledge Discovery and Data Mining}, 022 * year = {2004}, 023 * pages = {206--215}, 024 * address = {New York, NY, USA}, 025 * } 026 *</pre> 027 * 028 * Note that CDM is not a distance metric. It is NOT in general the case that 029 * CDM(x, y) = 0 iff x = y 030 * CDM(x, y) = CDM(y, x) 031 * CDM(x, y) + CDM(y, z) >= CDM(x, z) 032 * Do not use this measure in any application that requires metric properties. 033 * 034 * Values of CDM will lies in the range (0.5, 1.0]. 035 * 036 * We convert to a similarity measure by returning 1.0 - CDM(x, y). 037 * 038 * This similarity measure is effective in case-based spam filtering using GZip and 039 * PPM text compression algorithms. For the details, see: 040 * <pre> 041 * inproceedings{Delany-Bridge-2006, 042 * author = "S. J. Delany and D. Bridge", 043 * title = "Feature-Based and Feature-Free Textual {CBR}: A Comparison in Spam Filtering", 044 * booktitle = "Procs.\ of the 17th Irish Conference on Artificial Intelligence and Cognitive Science", 045 * pages = "244--253", 046 * address = "Belfast, Northern Ireland", 047 * year = "2006" 048 * } 049 * </pre> 050 * Its effectiveness outside of spam filtering (e.g. texts other than emails, short strings, 051 * images, etc.) has not been demonstrated. To be effective, the 'right' compression algorithm 052 * must be used. 053 * 054 * @author Derek Bridge 055 * 18/05/07 056 * 057 */ 058 public class CompressionBased implements LocalSimilarityFunction 059 { 060 private ICompressor compressor; 061 062 /** 063 * @param compressor an object that encapsulates the compression algorithm to be used. 064 */ 065 public CompressionBased(ICompressor compressor) 066 { 067 this.compressor = compressor; 068 } 069 070 /** 071 * Applies the similarity function. 072 * 073 * @param x 074 * @param y 075 * @return the simlarity value. 076 */ 077 public double compare(String x, String y) 078 { 079 return 1.0 - CDM(x, y); 080 } 081 082 /** 083 * Applies the similarity function. 084 * 085 * @param o1 086 * String 087 * @param o2 088 * String 089 * @return result of apply the similarity function. 090 */ 091 public double compute(Object o1, Object o2) throws jcolibri.exception.NoApplicableSimilarityFunctionException{ 092 if ((o1 == null) || (o2 == null)) 093 return 0; 094 if (!((o1 instanceof java.lang.String)||(o1 instanceof Text))) 095 throw new jcolibri.exception.NoApplicableSimilarityFunctionException(this.getClass(), o1.getClass()); 096 if (!((o2 instanceof java.lang.String)||(o1 instanceof Text))) 097 throw new jcolibri.exception.NoApplicableSimilarityFunctionException(this.getClass(), o2.getClass()); 098 099 if(o1 instanceof Text) 100 o1 = ((Text)o1).toString(); 101 if(o2 instanceof Text) 102 o2 = ((Text)o2).toString(); 103 104 String s1 = (String) o1; 105 String s2 = (String) o2; 106 double res = compare(s1, s2); 107 return res; 108 } 109 110 public boolean isApplicable(Object o1, Object o2) 111 { 112 if((o1==null)&&(o2==null)) 113 return true; 114 else if(o1==null) 115 return (o2 instanceof String)||(o2 instanceof Text); 116 else if(o2==null) 117 return (o1 instanceof String)||(o1 instanceof Text); 118 else 119 return ((o1 instanceof String)&& (o2 instanceof String)) || 120 ((o1 instanceof Text) && (o2 instanceof Text)); 121 } 122 123 /** 124 * Applies the dissimilarity function. 125 * 126 * @param x 127 * @param y 128 * @return 129 */ 130 private double CDM(String x, String y) 131 { 132 return compressor.getCompressedSize(x + y) * 1.0f / 133 (compressor.getCompressedSize(x) + compressor.getCompressedSize(y)); 134 } 135 136 public static void main(String[] args) 137 { 138 CompressionBased a = new CompressionBased(new GZipCompressor()); 139 double res = a.compare("Hello", "Goodbye"); 140 System.out.println("Hello-Goodbye: " + res); 141 res = a.compare("aaaaaaaaaaaaaaaaaaaaaaa", "zzzzzzzzzzzzzzzzzzzzzzzz"); 142 System.out.println("a-z: " + res); 143 res = a.compare("Hello", "Hello"); 144 System.out.println("Hello-Hello: " + res); 145 res = a.compare("abcdefghijkl", "bcdefghijk"); 146 System.out.println("abcdefghijkl-bcd: " + res); 147 148 149 } 150 }