001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     * 
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     * 
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.codec.binary;
019    
020    /**
021     * Provides Base32 encoding and decoding as defined by <a href="http://www.ietf.org/rfc/rfc4648.txt">RFC 4648</a>.
022     * 
023     * <p>
024     * The class can be parameterized in the following manner with various constructors:
025     * <ul>
026     * <li>Whether to use the "base32hex" variant instead of the default "base32"</li>
027     * <li>Line length: Default 76. Line length that aren't multiples of 8 will still essentially end up being multiples of
028     * 8 in the encoded data.
029     * <li>Line separator: Default is CRLF ("\r\n")</li>
030     * </ul>
031     * </p>
032     * <p>
033     * This class operates directly on byte streams, and not character streams.
034     * </p>
035     * <p>
036     * This class is not thread-safe. Each thread should use its own instance.
037     * </p>
038     * 
039     * @see <a href="http://www.ietf.org/rfc/rfc4648.txt">RFC 4648</a>
040     * 
041     * @since 1.5
042     * @version $Revision: 1080712 $
043     */
044    public class Base32 extends BaseNCodec {
045    
046        /**
047         * BASE32 characters are 5 bits in length. 
048         * They are formed by taking a block of five octets to form a 40-bit string, 
049         * which is converted into eight BASE32 characters.
050         */
051        private static final int BITS_PER_ENCODED_BYTE = 5;
052        private static final int BYTES_PER_ENCODED_BLOCK = 8;
053        private static final int BYTES_PER_UNENCODED_BLOCK = 5;
054    
055        /**
056         * Chunk separator per RFC 2045 section 2.1.
057         *
058         * @see <a href="http://www.ietf.org/rfc/rfc2045.txt">RFC 2045 section 2.1</a>
059         */
060        private static final byte[] CHUNK_SEPARATOR = {'\r', '\n'};
061    
062        /**
063         * This array is a lookup table that translates Unicode characters drawn from the "Base32 Alphabet" (as specified in
064         * Table 3 of RFC 2045) into their 5-bit positive integer equivalents. Characters that are not in the Base32
065         * alphabet but fall within the bounds of the array are translated to -1.
066         * 
067         */
068        private static final byte[] DECODE_TABLE = {
069             //  0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
070                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 00-0f
071                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 10-1f
072                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 63, // 20-2f
073                -1, -1, 26, 27, 28, 29, 30, 31, -1, -1, -1, -1, -1, -1, -1, -1, // 30-3f 2-7
074                -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, // 40-4f A-N
075                15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,                     // 50-5a O-Z
076        };
077    
078        /**
079         * This array is a lookup table that translates 5-bit positive integer index values into their "Base32 Alphabet"
080         * equivalents as specified in Table 3 of RFC 2045.
081         */
082        private static final byte[] ENCODE_TABLE = {
083                'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
084                'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
085                '2', '3', '4', '5', '6', '7',
086        };
087    
088        /**
089         * This array is a lookup table that translates Unicode characters drawn from the "Base32 |Hex Alphabet" (as specified in
090         * Table 3 of RFC 2045) into their 5-bit positive integer equivalents. Characters that are not in the Base32 Hex
091         * alphabet but fall within the bounds of the array are translated to -1.
092         * 
093         */
094        private static final byte[] HEX_DECODE_TABLE = {
095             //  0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
096                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 00-0f
097                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 10-1f
098                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 63, // 20-2f
099                 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1, // 30-3f 2-7
100                -1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, // 40-4f A-N
101                25, 26, 27, 28, 29, 30, 31, 32,                                 // 50-57 O-V
102        };
103    
104        /**
105         * This array is a lookup table that translates 5-bit positive integer index values into their "Base32 Hex Alphabet"
106         * equivalents as specified in Table 3 of RFC 2045.
107         */
108        private static final byte[] HEX_ENCODE_TABLE = {
109                '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 
110                'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
111                'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
112        };
113    
114        /** Mask used to extract 5 bits, used when encoding Base32 bytes */
115        private static final int MASK_5BITS = 0x1f;
116    
117        // The static final fields above are used for the original static byte[] methods on Base32.
118        // The private member fields below are used with the new streaming approach, which requires
119        // some state be preserved between calls of encode() and decode().
120    
121        /**
122         * Place holder for the bytes we're dealing with for our based logic. 
123         * Bitwise operations store and extract the encoding or decoding from this variable.
124         */
125        private long bitWorkArea;
126    
127        /**
128         * Convenience variable to help us determine when our buffer is going to run out of room and needs resizing.
129         * <code>decodeSize = {@link BYTES_PER_ENCODED_BLOCK} - 1 + lineSeparator.length;</code>
130         */
131        private final int decodeSize;
132    
133        /**
134         * Decode table to use.
135         */
136        private final byte[] decodeTable;
137    
138        /**
139         * Convenience variable to help us determine when our buffer is going to run out of room and needs resizing.
140         * <code>encodeSize = {@link BYTES_PER_ENCODED_BLOCK} + lineSeparator.length;</code>
141         */
142        private final int encodeSize;
143    
144        /**
145         * Encode table to use.
146         */
147        private final byte[] encodeTable;
148    
149        /**
150         * Line separator for encoding. Not used when decoding. Only used if lineLength > 0.
151         */
152        private final byte[] lineSeparator;
153    
154        /**
155         * Creates a Base32 codec used for decoding and encoding.
156         * <p>
157         * When encoding the line length is 0 (no chunking).
158         * </p>
159         * 
160         */
161        public Base32() {
162            this(false);
163        }
164    
165        /**
166         * Creates a Base32 codec used for decoding and encoding.
167         * <p>
168         * When encoding the line length is 0 (no chunking).
169         * </p>
170         * @param useHex if <code>true</code> then use Base32 Hex alphabet
171         */
172        public Base32(boolean useHex) {
173            this(0, null, useHex);
174        }
175    
176        /**
177         * Creates a Base32 codec used for decoding and encoding.
178         * <p>
179         * When encoding the line length is given in the constructor, the line separator is CRLF.
180         * </p>
181         * 
182         * @param lineLength
183         *            Each line of encoded data will be at most of the given length (rounded down to nearest multiple of 8).
184         *            If lineLength <= 0, then the output will not be divided into lines (chunks). Ignored when decoding.
185         */
186        public Base32(int lineLength) {
187            this(lineLength, CHUNK_SEPARATOR);
188        }
189    
190        /**
191         * Creates a Base32 codec used for decoding and encoding.
192         * <p>
193         * When encoding the line length and line separator are given in the constructor.
194         * </p>
195         * <p>
196         * Line lengths that aren't multiples of 8 will still essentially end up being multiples of 8 in the encoded data.
197         * </p>
198         * 
199         * @param lineLength
200         *            Each line of encoded data will be at most of the given length (rounded down to nearest multiple of 8).
201         *            If lineLength <= 0, then the output will not be divided into lines (chunks). Ignored when decoding.
202         * @param lineSeparator
203         *            Each line of encoded data will end with this sequence of bytes.
204         * @throws IllegalArgumentException
205         *             The provided lineSeparator included some Base32 characters. That's not going to work!
206         */
207        public Base32(int lineLength, byte[] lineSeparator) {
208            this(lineLength, lineSeparator, false);
209        }
210        
211        /**
212         * Creates a Base32 / Base32 Hex codec used for decoding and encoding.
213         * <p>
214         * When encoding the line length and line separator are given in the constructor.
215         * </p>
216         * <p>
217         * Line lengths that aren't multiples of 8 will still essentially end up being multiples of 8 in the encoded data.
218         * </p>
219         * 
220         * @param lineLength
221         *            Each line of encoded data will be at most of the given length (rounded down to nearest multiple of 8).
222         *            If lineLength <= 0, then the output will not be divided into lines (chunks). Ignored when decoding.
223         * @param lineSeparator
224         *            Each line of encoded data will end with this sequence of bytes.
225         * @param useHex if <code>true</code>, then use Base32 Hex alphabet, otherwise use Base32 alphabet
226         * @throws IllegalArgumentException
227         *             The provided lineSeparator included some Base32 characters. That's not going to work!
228         *             Or the lineLength > 0 and lineSeparator is null.
229         */
230        public Base32(int lineLength, byte[] lineSeparator, boolean useHex) {
231            super(BYTES_PER_UNENCODED_BLOCK, BYTES_PER_ENCODED_BLOCK, 
232                    lineLength, 
233                    lineSeparator == null ? 0 : lineSeparator.length);
234            if (useHex){
235                this.encodeTable = HEX_ENCODE_TABLE;
236                this.decodeTable = HEX_DECODE_TABLE;            
237            } else {
238                this.encodeTable = ENCODE_TABLE;
239                this.decodeTable = DECODE_TABLE;            
240            }
241            if (lineLength > 0) {
242                if (lineSeparator == null) {
243                    throw new IllegalArgumentException("lineLength "+lineLength+" > 0, but lineSeparator is null");
244                }
245                // Must be done after initializing the tables
246                if (containsAlphabetOrPad(lineSeparator)) {
247                    String sep = StringUtils.newStringUtf8(lineSeparator);
248                    throw new IllegalArgumentException("lineSeparator must not contain Base32 characters: [" + sep + "]");
249                }
250                this.encodeSize = BYTES_PER_ENCODED_BLOCK + lineSeparator.length;
251                this.lineSeparator = new byte[lineSeparator.length];
252                System.arraycopy(lineSeparator, 0, this.lineSeparator, 0, lineSeparator.length);
253            } else {
254                this.encodeSize = BYTES_PER_ENCODED_BLOCK;
255                this.lineSeparator = null;
256            }
257            this.decodeSize = this.encodeSize - 1;
258        }
259    
260        /**
261         * <p>
262         * Decodes all of the provided data, starting at inPos, for inAvail bytes. Should be called at least twice: once
263         * with the data to decode, and once with inAvail set to "-1" to alert decoder that EOF has been reached. The "-1"
264         * call is not necessary when decoding, but it doesn't hurt, either.
265         * </p>
266         * <p>
267         * Ignores all non-Base32 characters. This is how chunked (e.g. 76 character) data is handled, since CR and LF are
268         * silently ignored, but has implications for other bytes, too. This method subscribes to the garbage-in,
269         * garbage-out philosophy: it will not check the provided data for validity.
270         * </p>
271         * 
272         * @param in
273         *            byte[] array of ascii data to Base32 decode.
274         * @param inPos
275         *            Position to start reading data from.
276         * @param inAvail
277         *            Amount of bytes available from input for encoding.
278         *
279         * Output is written to {@link #buffer} as 8-bit octets, using {@link pos} as the buffer position
280         */
281        void decode(byte[] in, int inPos, int inAvail) { // package protected for access from I/O streams
282            if (eof) {
283                return;
284            }
285            if (inAvail < 0) {
286                eof = true;
287            }
288            for (int i = 0; i < inAvail; i++) {
289                byte b = in[inPos++];
290                if (b == PAD) {
291                    // We're done.
292                    eof = true;
293                    break;
294                } else {
295                    ensureBufferSize(decodeSize);
296                    if (b >= 0 && b < this.decodeTable.length) {
297                        int result = this.decodeTable[b];
298                        if (result >= 0) {
299                            modulus = (modulus+1) % BYTES_PER_ENCODED_BLOCK;
300                            bitWorkArea = (bitWorkArea << BITS_PER_ENCODED_BYTE) + result; // collect decoded bytes
301                            if (modulus == 0) { // we can output the 5 bytes
302                                buffer[pos++] = (byte) ((bitWorkArea >> 32) & MASK_8BITS);
303                                buffer[pos++] = (byte) ((bitWorkArea >> 24) & MASK_8BITS);
304                                buffer[pos++] = (byte) ((bitWorkArea >> 16) & MASK_8BITS);
305                                buffer[pos++] = (byte) ((bitWorkArea >> 8) & MASK_8BITS);
306                                buffer[pos++] = (byte) (bitWorkArea & MASK_8BITS);
307                            }
308                        }
309                    }
310                }
311            }
312        
313            // Two forms of EOF as far as Base32 decoder is concerned: actual
314            // EOF (-1) and first time '=' character is encountered in stream.
315            // This approach makes the '=' padding characters completely optional.
316            if (eof && modulus >= 2) { // if modulus < 2, nothing to do
317                ensureBufferSize(decodeSize);
318        
319                //  we ignore partial bytes, i.e. only multiples of 8 count
320                switch (modulus) {
321                    case 2 : // 10 bits, drop 2 and output one byte
322                        buffer[pos++] = (byte) ((bitWorkArea >> 2) & MASK_8BITS);
323                        break;
324                    case 3 : // 15 bits, drop 7 and output 1 byte
325                        buffer[pos++] = (byte) ((bitWorkArea >> 7) & MASK_8BITS);
326                        break;
327                    case 4 : // 20 bits = 2*8 + 4
328                        bitWorkArea = bitWorkArea >> 4; // drop 4 bits
329                        buffer[pos++] = (byte) ((bitWorkArea >> 8) & MASK_8BITS);
330                        buffer[pos++] = (byte) ((bitWorkArea) & MASK_8BITS);
331                        break;
332                    case 5 : // 25bits = 3*8 + 1
333                        bitWorkArea = bitWorkArea >> 1;
334                        buffer[pos++] = (byte) ((bitWorkArea >> 16) & MASK_8BITS);
335                        buffer[pos++] = (byte) ((bitWorkArea >> 8) & MASK_8BITS);
336                        buffer[pos++] = (byte) ((bitWorkArea) & MASK_8BITS);
337                        break;
338                    case 6 : // 30bits = 3*8 + 6
339                        bitWorkArea = bitWorkArea >> 6;
340                        buffer[pos++] = (byte) ((bitWorkArea >> 16) & MASK_8BITS);
341                        buffer[pos++] = (byte) ((bitWorkArea >> 8) & MASK_8BITS);
342                        buffer[pos++] = (byte) ((bitWorkArea) & MASK_8BITS);
343                        break;
344                    case 7 : // 35 = 4*8 +3
345                        bitWorkArea = bitWorkArea >> 3;
346                        buffer[pos++] = (byte) ((bitWorkArea >> 24) & MASK_8BITS);
347                        buffer[pos++] = (byte) ((bitWorkArea >> 16) & MASK_8BITS);
348                        buffer[pos++] = (byte) ((bitWorkArea >> 8) & MASK_8BITS);
349                        buffer[pos++] = (byte) ((bitWorkArea) & MASK_8BITS);
350                        break;
351                }
352            }
353        }
354    
355        /**
356         * <p>
357         * Encodes all of the provided data, starting at inPos, for inAvail bytes. Must be called at least twice: once with
358         * the data to encode, and once with inAvail set to "-1" to alert encoder that EOF has been reached, so flush last
359         * remaining bytes (if not multiple of 5).
360         * </p>
361         * 
362         * @param in
363         *            byte[] array of binary data to Base32 encode.
364         * @param inPos
365         *            Position to start reading data from.
366         * @param inAvail
367         *            Amount of bytes available from input for encoding.
368         */
369        void encode(byte[] in, int inPos, int inAvail) { // package protected for access from I/O streams
370            if (eof) {
371                return;
372            }
373            // inAvail < 0 is how we're informed of EOF in the underlying data we're
374            // encoding.
375            if (inAvail < 0) {
376                eof = true;
377                if (0 == modulus && lineLength == 0) {
378                    return; // no leftovers to process and not using chunking
379                }
380                ensureBufferSize(encodeSize);
381                int savedPos = pos;
382                switch (modulus) { // % 5
383                    case 1 : // Only 1 octet; take top 5 bits then remainder
384                        buffer[pos++] = encodeTable[(int)(bitWorkArea >> 3) & MASK_5BITS]; // 8-1*5 = 3
385                        buffer[pos++] = encodeTable[(int)(bitWorkArea << 2) & MASK_5BITS]; // 5-3=2
386                        buffer[pos++] = PAD;
387                        buffer[pos++] = PAD;
388                        buffer[pos++] = PAD;
389                        buffer[pos++] = PAD;
390                        buffer[pos++] = PAD;
391                        buffer[pos++] = PAD;
392                        break;
393        
394                    case 2 : // 2 octets = 16 bits to use
395                        buffer[pos++] = encodeTable[(int)(bitWorkArea >> 11) & MASK_5BITS]; // 16-1*5 = 11
396                        buffer[pos++] = encodeTable[(int)(bitWorkArea >>  6) & MASK_5BITS]; // 16-2*5 = 6
397                        buffer[pos++] = encodeTable[(int)(bitWorkArea >>  1) & MASK_5BITS]; // 16-3*5 = 1
398                        buffer[pos++] = encodeTable[(int)(bitWorkArea <<  4) & MASK_5BITS]; // 5-1 = 4
399                        buffer[pos++] = PAD;
400                        buffer[pos++] = PAD;
401                        buffer[pos++] = PAD;
402                        buffer[pos++] = PAD;
403                        break;
404                    case 3 : // 3 octets = 24 bits to use
405                        buffer[pos++] = encodeTable[(int)(bitWorkArea >> 19) & MASK_5BITS]; // 24-1*5 = 19
406                        buffer[pos++] = encodeTable[(int)(bitWorkArea >> 14) & MASK_5BITS]; // 24-2*5 = 14
407                        buffer[pos++] = encodeTable[(int)(bitWorkArea >>  9) & MASK_5BITS]; // 24-3*5 = 9
408                        buffer[pos++] = encodeTable[(int)(bitWorkArea >>  4) & MASK_5BITS]; // 24-4*5 = 4
409                        buffer[pos++] = encodeTable[(int)(bitWorkArea <<  1) & MASK_5BITS]; // 5-4 = 1
410                        buffer[pos++] = PAD;
411                        buffer[pos++] = PAD;
412                        buffer[pos++] = PAD;
413                        break;
414                    case 4 : // 4 octets = 32 bits to use
415                        buffer[pos++] = encodeTable[(int)(bitWorkArea >> 27) & MASK_5BITS]; // 32-1*5 = 27
416                        buffer[pos++] = encodeTable[(int)(bitWorkArea >> 22) & MASK_5BITS]; // 32-2*5 = 22
417                        buffer[pos++] = encodeTable[(int)(bitWorkArea >> 17) & MASK_5BITS]; // 32-3*5 = 17
418                        buffer[pos++] = encodeTable[(int)(bitWorkArea >> 12) & MASK_5BITS]; // 32-4*5 = 12
419                        buffer[pos++] = encodeTable[(int)(bitWorkArea >>  7) & MASK_5BITS]; // 32-5*5 =  7
420                        buffer[pos++] = encodeTable[(int)(bitWorkArea >>  2) & MASK_5BITS]; // 32-6*5 =  2
421                        buffer[pos++] = encodeTable[(int)(bitWorkArea <<  3) & MASK_5BITS]; // 5-2 = 3
422                        buffer[pos++] = PAD;
423                        break;
424                }
425                currentLinePos += pos - savedPos; // keep track of current line position
426                // if currentPos == 0 we are at the start of a line, so don't add CRLF
427                if (lineLength > 0 && currentLinePos > 0){ // add chunk separator if required
428                    System.arraycopy(lineSeparator, 0, buffer, pos, lineSeparator.length);
429                    pos += lineSeparator.length;
430                }            
431            } else {
432                for (int i = 0; i < inAvail; i++) {
433                    ensureBufferSize(encodeSize);
434                    modulus = (modulus+1) % BYTES_PER_UNENCODED_BLOCK;
435                    int b = in[inPos++];
436                    if (b < 0) {
437                        b += 256;
438                    }
439                    bitWorkArea = (bitWorkArea << 8) + b; // BITS_PER_BYTE
440                    if (0 == modulus) { // we have enough bytes to create our output 
441                        buffer[pos++] = encodeTable[(int)(bitWorkArea >> 35) & MASK_5BITS];
442                        buffer[pos++] = encodeTable[(int)(bitWorkArea >> 30) & MASK_5BITS];
443                        buffer[pos++] = encodeTable[(int)(bitWorkArea >> 25) & MASK_5BITS];
444                        buffer[pos++] = encodeTable[(int)(bitWorkArea >> 20) & MASK_5BITS];
445                        buffer[pos++] = encodeTable[(int)(bitWorkArea >> 15) & MASK_5BITS];
446                        buffer[pos++] = encodeTable[(int)(bitWorkArea >> 10) & MASK_5BITS];
447                        buffer[pos++] = encodeTable[(int)(bitWorkArea >> 5) & MASK_5BITS];
448                        buffer[pos++] = encodeTable[(int)bitWorkArea & MASK_5BITS];
449                        currentLinePos += BYTES_PER_ENCODED_BLOCK;
450                        if (lineLength > 0 && lineLength <= currentLinePos) {
451                            System.arraycopy(lineSeparator, 0, buffer, pos, lineSeparator.length);
452                            pos += lineSeparator.length;
453                            currentLinePos = 0;
454                        }
455                    }
456                }
457            }
458        }
459    
460        /**
461         * Returns whether or not the <code>octet</code> is in the Base32 alphabet.
462         * 
463         * @param octet
464         *            The value to test
465         * @return <code>true</code> if the value is defined in the the Base32 alphabet <code>false</code> otherwise.
466         */
467        public boolean isInAlphabet(byte octet) {
468            return octet >= 0 && octet < decodeTable.length && decodeTable[octet] != -1;
469        }
470    }