001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     * 
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     * 
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.codec.language;
019    
020    import java.util.Locale;
021    
022    import org.apache.commons.codec.EncoderException;
023    import org.apache.commons.codec.StringEncoder;
024    
025    /**
026     * <p>
027     * Encodes a string into a Cologne Phonetic value.
028     * </p>
029     * <p>
030     * Implements the <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">“Kölner Phonetic�</a> (Cologne Phonetic)
031     * algorithm issued by Hans Joachim Postel in 1969.
032     * </p>
033     * 
034     * <p>
035     * The <i>Kölner Phonetik</i> is a phonetic algorithm which is optimized for the German language. It is related to the
036     * well-known soundex algorithm.
037     * </p>
038     * 
039     * <h2>Algorithm</h2>
040     * 
041     * <ul>
042     * 
043     * <li>
044     * <h3>Step 1:</h3>
045     * After preprocessing (convertion to upper case, transcription of <a
046     * href="http://en.wikipedia.org/wiki/Germanic_umlaut">germanic umlauts</a>, removal of non alphabetical characters) the
047     * letters of the supplied text are replaced by their phonetic code according to the folowing table.
048     * <table border="1">
049     * <tbody>
050     * <tr>
051     * <th>Letter</th>
052     * <th>Context</th>
053     * <th align="center">Code</th>
054     * </tr>
055     * <tr>
056     * <td>A, E, I, J, O, U, Y</td>
057     * <td></td>
058     * <td align="center">0</td>
059     * </tr>
060     * <tr>
061     * 
062     * <td>H</td>
063     * <td></td>
064     * <td align="center">-</td>
065     * </tr>
066     * <tr>
067     * <td>B</td>
068     * <td></td>
069     * <td rowspan="2" align="center">1</td>
070     * </tr>
071     * <tr>
072     * <td>P</td>
073     * <td>not before H</td>
074     * 
075     * </tr>
076     * <tr>
077     * <td>D, T</td>
078     * <td>not before C, S, Z</td>
079     * <td align="center">2</td>
080     * </tr>
081     * <tr>
082     * <td>F, V, W</td>
083     * <td></td>
084     * <td rowspan="2" align="center">3</td>
085     * </tr>
086     * <tr>
087     * 
088     * <td>P</td>
089     * <td>before H</td>
090     * </tr>
091     * <tr>
092     * <td>G, K, Q</td>
093     * <td></td>
094     * <td rowspan="3" align="center">4</td>
095     * </tr>
096     * <tr>
097     * <td rowspan="2">C</td>
098     * <td>at onset before A, H, K, L, O, Q, R, U, X</td>
099     * 
100     * </tr>
101     * <tr>
102     * <td>before A, H, K, O, Q, U, X except after S, Z</td>
103     * </tr>
104     * <tr>
105     * <td>X</td>
106     * <td>not after C, K, Q</td>
107     * <td align="center">48</td>
108     * </tr>
109     * <tr>
110     * <td>L</td>
111     * <td></td>
112     * 
113     * <td align="center">5</td>
114     * </tr>
115     * <tr>
116     * <td>M, N</td>
117     * <td></td>
118     * <td align="center">6</td>
119     * </tr>
120     * <tr>
121     * <td>R</td>
122     * <td></td>
123     * <td align="center">7</td>
124     * </tr>
125     * 
126     * <tr>
127     * <td>S, Z</td>
128     * <td></td>
129     * <td rowspan="6" align="center">8</td>
130     * </tr>
131     * <tr>
132     * <td rowspan="3">C</td>
133     * <td>after S, Z</td>
134     * </tr>
135     * <tr>
136     * <td>at onset except before A, H, K, L, O, Q, R, U, X</td>
137     * </tr>
138     * 
139     * <tr>
140     * <td>not before A, H, K, O, Q, U, X</td>
141     * </tr>
142     * <tr>
143     * <td>D, T</td>
144     * <td>before C, S, Z</td>
145     * </tr>
146     * <tr>
147     * <td>X</td>
148     * <td>after C, K, Q</td>
149     * </tr>
150     * </tbody>
151     * </table>
152     * <p>
153     * <small><i>(Source: <a href= "http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik#Buchstabencodes" >Wikipedia (de):
154     * Kölner Phonetik – Buchstabencodes</a>)</i></small>
155     * </p>
156     * 
157     * <h4>Example:</h4>
158     * 
159     * {@code "Müller-Lüdenscheidt" => "MULLERLUDENSCHEIDT" => "6005507500206880022"}
160     * 
161     * </li>
162     * 
163     * <li>
164     * <h3>Step 2:</h3>
165     * Collapse of all multiple consecutive code digits.
166     * <h4>Example:</h4>
167     * {@code "6005507500206880022" => "6050750206802"}</li>
168     * 
169     * <li>
170     * <h3>Step 3:</h3>
171     * Removal of all codes “0� except at the beginning. This means that two or more identical consecutive digits can occur
172     * if they occur after removing the "0" digits.
173     * 
174     * <h4>Example:</h4>
175     * {@code "6050750206802" => "65752682"}</li>
176     * 
177     * </ul>
178     * 
179     * @see <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">Wikipedia (de): Kölner Phonetik (in German)</a>
180     * @author Apache Software Foundation
181     * @since 1.5
182     */
183    public class ColognePhonetic implements StringEncoder {
184    
185        private abstract class CologneBuffer {
186    
187            protected final char[] data;
188    
189            protected int length = 0;
190    
191            public CologneBuffer(char[] data) {
192                this.data = data;
193                this.length = data.length;
194            }
195    
196            public CologneBuffer(int buffSize) {
197                this.data = new char[buffSize];
198                this.length = 0;
199            }
200    
201            protected abstract char[] copyData(int start, final int length);
202    
203            public int length() {
204                return length;
205            }
206    
207            public String toString() {
208                return new String(copyData(0, length));
209            }
210        }
211    
212        private class CologneOutputBuffer extends CologneBuffer {
213    
214            public CologneOutputBuffer(int buffSize) {
215                super(buffSize);
216            }
217    
218            public void addRight(char chr) {
219                data[length] = chr;
220                length++;
221            }
222    
223            protected char[] copyData(int start, final int length) {
224                char[] newData = new char[length];
225                System.arraycopy(data, start, newData, 0, length);
226                return newData;
227            }
228        }
229    
230        private class CologneInputBuffer extends CologneBuffer {
231    
232            public CologneInputBuffer(char[] data) {
233                super(data);
234            }
235    
236            public void addLeft(char ch) {
237                length++;
238                data[getNextPos()] = ch;
239            }
240    
241            protected char[] copyData(int start, final int length) {
242                char[] newData = new char[length];
243                System.arraycopy(data, data.length - this.length + start, newData, 0, length);
244                return newData;
245            }
246    
247            public char getNextChar() {
248                return data[getNextPos()];
249            }
250    
251            protected int getNextPos() {
252                return data.length - length;
253            }
254    
255            public char removeNext() {
256                char ch = getNextChar();
257                length--;
258                return ch;
259            }
260        }
261    
262        private static final char[][] PREPROCESS_MAP = new char[][]{{'\u00C4', 'A'}, // Ä
263            {'\u00DC', 'U'}, // Ü
264            {'\u00D6', 'O'}, // Ö
265            {'\u00DF', 'S'} // ß
266        };
267    
268        /*
269         * Returns whether the array contains the key, or not.
270         */
271        private static boolean arrayContains(char[] arr, char key) {
272            for (int i = 0; i < arr.length; i++) {
273                if (arr[i] == key) {
274                    return true;
275                }
276            }
277            return false;
278        }
279    
280        /**
281         * <p>
282         * <b>colognePhonetic()</b> is the actual implementations of the <i>Kölner Phonetik</i> algorithm.
283         * </p>
284         * <p>
285         * In contrast to the initial description of the algorithm, this implementation does the encoding in one pass.
286         * </p>
287         * 
288         * @param text
289         * @return the corresponding encoding according to the <i>Kölner Phonetik</i> algorithm
290         */
291        public String colognePhonetic(String text) {
292            if (text == null) {
293                return null;
294            }
295    
296            text = preprocess(text);
297    
298            CologneOutputBuffer output = new CologneOutputBuffer(text.length() * 2);
299            CologneInputBuffer input = new CologneInputBuffer(text.toCharArray());
300    
301            char nextChar;
302    
303            char lastChar = '-';
304            char lastCode = '/';
305            char code;
306            char chr;
307    
308            int rightLength = input.length();
309    
310            while (rightLength > 0) {
311                chr = input.removeNext();
312    
313                if ((rightLength = input.length()) > 0) {
314                    nextChar = input.getNextChar();
315                } else {
316                    nextChar = '-';
317                }
318    
319                if (arrayContains(new char[]{'A', 'E', 'I', 'J', 'O', 'U', 'Y'}, chr)) {
320                    code = '0';
321                } else if (chr == 'H' || chr < 'A' || chr > 'Z') {
322                    if (lastCode == '/') {
323                        continue;
324                    }
325                    code = '-';
326                } else if (chr == 'B' || (chr == 'P' && nextChar != 'H')) {
327                    code = '1';
328                } else if ((chr == 'D' || chr == 'T') && !arrayContains(new char[]{'S', 'C', 'Z'}, nextChar)) {
329                    code = '2';
330                } else if (arrayContains(new char[]{'W', 'F', 'P', 'V'}, chr)) {
331                    code = '3';
332                } else if (arrayContains(new char[]{'G', 'K', 'Q'}, chr)) {
333                    code = '4';
334                } else if (chr == 'X' && !arrayContains(new char[]{'C', 'K', 'Q'}, lastChar)) {
335                    code = '4';
336                    input.addLeft('S');
337                    rightLength++;
338                } else if (chr == 'S' || chr == 'Z') {
339                    code = '8';
340                } else if (chr == 'C') {
341                    if (lastCode == '/') {
342                        if (arrayContains(new char[]{'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}, nextChar)) {
343                            code = '4';
344                        } else {
345                            code = '8';
346                        }
347                    } else {
348                        if (arrayContains(new char[]{'S', 'Z'}, lastChar) ||
349                            !arrayContains(new char[]{'A', 'H', 'O', 'U', 'K', 'Q', 'X'}, nextChar)) {
350                            code = '8';
351                        } else {
352                            code = '4';
353                        }
354                    }
355                } else if (arrayContains(new char[]{'T', 'D', 'X'}, chr)) {
356                    code = '8';
357                } else if (chr == 'R') {
358                    code = '7';
359                } else if (chr == 'L') {
360                    code = '5';
361                } else if (chr == 'M' || chr == 'N') {
362                    code = '6';
363                } else {
364                    code = chr;
365                }
366    
367                if (code != '-' && (lastCode != code && (code != '0' || lastCode == '/') || code < '0' || code > '8')) {
368                    output.addRight(code);
369                }
370    
371                lastChar = chr;
372                lastCode = code;
373            }
374            return output.toString();
375        }
376    
377        public Object encode(Object object) throws EncoderException {
378            if (!(object instanceof String)) {
379                throw new EncoderException("This method’s parameter was expected to be of the type " +
380                    String.class.getName() +
381                    ". But actually it was of the type " +
382                    object.getClass().getName() +
383                    ".");
384            }
385            return encode((String) object);
386        }
387    
388        public String encode(String text) {
389            return colognePhonetic(text);
390        }
391    
392        public boolean isEncodeEqual(String text1, String text2) {
393            return colognePhonetic(text1).equals(colognePhonetic(text2));
394        }
395    
396        /*
397         * Converts the string to upper case and replaces germanic umlauts, and the “ß�.
398         */
399        private String preprocess(String text) {
400            text = text.toUpperCase(Locale.GERMAN);
401    
402            char[] chrs = text.toCharArray();
403    
404            for (int index = 0; index < chrs.length; index++) {
405                if (chrs[index] > 'Z') {
406                    for (int replacement = 0; replacement < PREPROCESS_MAP.length; replacement++) {
407                        if (chrs[index] == PREPROCESS_MAP[replacement][0]) {
408                            chrs[index] = PREPROCESS_MAP[replacement][1];
409                            break;
410                        }
411                    }
412                }
413            }
414            return new String(chrs);
415        }
416    }