001    /****************************************************************
002     * Licensed to the Apache Software Foundation (ASF) under one   *
003     * or more contributor license agreements.  See the NOTICE file *
004     * distributed with this work for additional information        *
005     * regarding copyright ownership.  The ASF licenses this file   *
006     * to you under the Apache License, Version 2.0 (the            *
007     * "License"); you may not use this file except in compliance   *
008     * with the License.  You may obtain a copy of the License at   *
009     *                                                              *
010     *   http://www.apache.org/licenses/LICENSE-2.0                 *
011     *                                                              *
012     * Unless required by applicable law or agreed to in writing,   *
013     * software distributed under the License is distributed on an  *
014     * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
015     * KIND, either express or implied.  See the License for the    *
016     * specific language governing permissions and limitations      *
017     * under the License.                                           *
018     ****************************************************************/
019    
020    package org.apache.james.mime4j.codec;
021    
022    import java.nio.ByteBuffer;
023    import java.nio.charset.Charset;
024    import java.util.BitSet;
025    import java.util.Locale;
026    
027    import org.apache.james.mime4j.util.CharsetUtil;
028    
029    /**
030     * Static methods for encoding header field values. This includes encoded-words
031     * as defined in <a href='http://www.faqs.org/rfcs/rfc2047.html'>RFC 2047</a>
032     * or display-names of an e-mail address, for example.
033     */
034    public class EncoderUtil {
035        private static final byte[] BASE64_TABLE = Base64OutputStream.BASE64_TABLE;
036        private static final char BASE64_PAD = '=';
037    
038        private static final BitSet Q_REGULAR_CHARS = initChars("=_?");
039    
040        private static final BitSet Q_RESTRICTED_CHARS = initChars("=_?\"#$%&'(),.:;<>@[\\]^`{|}~");
041    
042        private static final int MAX_USED_CHARACTERS = 50;
043    
044        private static final String ENC_WORD_PREFIX = "=?";
045        private static final String ENC_WORD_SUFFIX = "?=";
046    
047        private static final int ENCODED_WORD_MAX_LENGTH = 75; // RFC 2047
048    
049        private static final BitSet TOKEN_CHARS = initChars("()<>@,;:\\\"/[]?=");
050    
051        private static final BitSet ATEXT_CHARS = initChars("()<>@.,;:\\\"[]");
052    
053        private static BitSet initChars(String specials) {
054            BitSet bs = new BitSet(128);
055            for (char ch = 33; ch < 127; ch++) {
056                if (specials.indexOf(ch) == -1) {
057                    bs.set(ch);
058                }
059            }
060            return bs;
061        }
062    
063        /**
064         * Selects one of the two encodings specified in RFC 2047.
065         */
066        public enum Encoding {
067            /** The B encoding (identical to base64 defined in RFC 2045). */
068            B,
069            /** The Q encoding (similar to quoted-printable defined in RFC 2045). */
070            Q
071        }
072    
073        /**
074         * Indicates the intended usage of an encoded word.
075         */
076        public enum Usage {
077            /**
078             * Encoded word is used to replace a 'text' token in any Subject or
079             * Comments header field.
080             */
081            TEXT_TOKEN,
082            /**
083             * Encoded word is used to replace a 'word' entity within a 'phrase',
084             * for example, one that precedes an address in a From, To, or Cc
085             * header.
086             */
087            WORD_ENTITY
088        }
089    
090        private EncoderUtil() {
091        }
092    
093        /**
094         * Encodes the display-name portion of an address. See <a
095         * href='http://www.faqs.org/rfcs/rfc5322.html'>RFC 5322</a> section 3.4
096         * and <a href='http://www.faqs.org/rfcs/rfc2047.html'>RFC 2047</a> section
097         * 5.3. The specified string should not be folded.
098         * 
099         * @param displayName
100         *            display-name to encode.
101         * @return encoded display-name.
102         */
103        public static String encodeAddressDisplayName(String displayName) {
104            // display-name = phrase
105            // phrase = 1*( encoded-word / word )
106            // word = atom / quoted-string
107            // atom = [CFWS] 1*atext [CFWS]
108            // CFWS = comment or folding white space
109    
110            if (isAtomPhrase(displayName)) {
111                return displayName;
112            } else if (hasToBeEncoded(displayName, 0)) {
113                return encodeEncodedWord(displayName, Usage.WORD_ENTITY);
114            } else {
115                return quote(displayName);
116            }
117        }
118    
119        /**
120         * Encodes the local part of an address specification as described in RFC
121         * 5322 section 3.4.1. Leading and trailing CFWS should have been removed
122         * before calling this method. The specified string should not contain any
123         * illegal (control or non-ASCII) characters.
124         * 
125         * @param localPart
126         *            the local part to encode
127         * @return the encoded local part.
128         */
129        public static String encodeAddressLocalPart(String localPart) {
130            // local-part = dot-atom / quoted-string
131            // dot-atom = [CFWS] dot-atom-text [CFWS]
132            // CFWS = comment or folding white space
133    
134            if (isDotAtomText(localPart)) {
135                return localPart;
136            } else {
137                return quote(localPart);
138            }
139        }
140    
141        /**
142         * Encodes the specified strings into a header parameter as described in RFC
143         * 2045 section 5.1 and RFC 2183 section 2. The specified strings should not
144         * contain any illegal (control or non-ASCII) characters.
145         * 
146         * @param name
147         *            parameter name.
148         * @param value
149         *            parameter value.
150         * @return encoded result.
151         */
152        public static String encodeHeaderParameter(String name, String value) {
153            name = name.toLowerCase(Locale.US);
154    
155            // value := token / quoted-string
156            if (isToken(value)) {
157                return name + "=" + value;
158            } else {
159                return name + "=" + quote(value);
160            }
161        }
162    
163        /**
164         * Shortcut method that encodes the specified text into an encoded-word if
165         * the text has to be encoded.
166         * 
167         * @param text
168         *            text to encode.
169         * @param usage
170         *            whether the encoded-word is to be used to replace a text token
171         *            or a word entity (see RFC 822).
172         * @param usedCharacters
173         *            number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
174         * @return the specified text if encoding is not necessary or an encoded
175         *         word or a sequence of encoded words otherwise.
176         */
177        public static String encodeIfNecessary(String text, Usage usage,
178                int usedCharacters) {
179            if (hasToBeEncoded(text, usedCharacters))
180                return encodeEncodedWord(text, usage, usedCharacters);
181            else
182                return text;
183        }
184    
185        /**
186         * Determines if the specified string has to encoded into an encoded-word.
187         * Returns <code>true</code> if the text contains characters that don't
188         * fall into the printable ASCII character set or if the text contains a
189         * 'word' (sequence of non-whitespace characters) longer than 77 characters
190         * (including characters already used up in the line).
191         * 
192         * @param text
193         *            text to analyze.
194         * @param usedCharacters
195         *            number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
196         * @return <code>true</code> if the specified text has to be encoded into
197         *         an encoded-word, <code>false</code> otherwise.
198         */
199        public static boolean hasToBeEncoded(String text, int usedCharacters) {
200            if (text == null)
201                throw new IllegalArgumentException();
202            if (usedCharacters < 0 || usedCharacters > MAX_USED_CHARACTERS)
203                throw new IllegalArgumentException();
204    
205            int nonWhiteSpaceCount = usedCharacters;
206    
207            for (int idx = 0; idx < text.length(); idx++) {
208                char ch = text.charAt(idx);
209                if (ch == '\t' || ch == ' ') {
210                    nonWhiteSpaceCount = 0;
211                } else {
212                    nonWhiteSpaceCount++;
213                    if (nonWhiteSpaceCount > 77) {
214                        // Line cannot be folded into multiple lines with no more
215                        // than 78 characters each. Encoding as encoded-words makes
216                        // that possible. One character has to be reserved for
217                        // folding white space; that leaves 77 characters.
218                        return true;
219                    }
220    
221                    if (ch < 32 || ch >= 127) {
222                        // non-printable ascii character has to be encoded
223                        return true;
224                    }
225                }
226            }
227    
228            return false;
229        }
230    
231        /**
232         * Encodes the specified text into an encoded word or a sequence of encoded
233         * words separated by space. The text is separated into a sequence of
234         * encoded words if it does not fit in a single one.
235         * <p>
236         * The charset to encode the specified text into a byte array and the
237         * encoding to use for the encoded-word are detected automatically.
238         * <p>
239         * This method assumes that zero characters have already been used up in the
240         * current line.
241         * 
242         * @param text
243         *            text to encode.
244         * @param usage
245         *            whether the encoded-word is to be used to replace a text token
246         *            or a word entity (see RFC 822).
247         * @return the encoded word (or sequence of encoded words if the given text
248         *         does not fit in a single encoded word).
249         * @see #hasToBeEncoded(String, int)
250         */
251        public static String encodeEncodedWord(String text, Usage usage) {
252            return encodeEncodedWord(text, usage, 0, null, null);
253        }
254    
255        /**
256         * Encodes the specified text into an encoded word or a sequence of encoded
257         * words separated by space. The text is separated into a sequence of
258         * encoded words if it does not fit in a single one.
259         * <p>
260         * The charset to encode the specified text into a byte array and the
261         * encoding to use for the encoded-word are detected automatically.
262         * 
263         * @param text
264         *            text to encode.
265         * @param usage
266         *            whether the encoded-word is to be used to replace a text token
267         *            or a word entity (see RFC 822).
268         * @param usedCharacters
269         *            number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
270         * @return the encoded word (or sequence of encoded words if the given text
271         *         does not fit in a single encoded word).
272         * @see #hasToBeEncoded(String, int)
273         */
274        public static String encodeEncodedWord(String text, Usage usage,
275                int usedCharacters) {
276            return encodeEncodedWord(text, usage, usedCharacters, null, null);
277        }
278    
279        /**
280         * Encodes the specified text into an encoded word or a sequence of encoded
281         * words separated by space. The text is separated into a sequence of
282         * encoded words if it does not fit in a single one.
283         * 
284         * @param text
285         *            text to encode.
286         * @param usage
287         *            whether the encoded-word is to be used to replace a text token
288         *            or a word entity (see RFC 822).
289         * @param usedCharacters
290         *            number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
291         * @param charset
292         *            the Java charset that should be used to encode the specified
293         *            string into a byte array. A suitable charset is detected
294         *            automatically if this parameter is <code>null</code>.
295         * @param encoding
296         *            the encoding to use for the encoded-word (either B or Q). A
297         *            suitable encoding is automatically chosen if this parameter is
298         *            <code>null</code>.
299         * @return the encoded word (or sequence of encoded words if the given text
300         *         does not fit in a single encoded word).
301         * @see #hasToBeEncoded(String, int)
302         */
303        public static String encodeEncodedWord(String text, Usage usage,
304                int usedCharacters, Charset charset, Encoding encoding) {
305            if (text == null)
306                throw new IllegalArgumentException();
307            if (usedCharacters < 0 || usedCharacters > MAX_USED_CHARACTERS)
308                throw new IllegalArgumentException();
309    
310            if (charset == null)
311                charset = determineCharset(text);
312    
313            String mimeCharset = CharsetUtil.toMimeCharset(charset.name());
314            if (mimeCharset == null) {
315                // cannot happen if charset was originally null
316                throw new IllegalArgumentException("Unsupported charset");
317            }
318    
319            byte[] bytes = encode(text, charset);
320    
321            if (encoding == null)
322                encoding = determineEncoding(bytes, usage);
323    
324            if (encoding == Encoding.B) {
325                String prefix = ENC_WORD_PREFIX + mimeCharset + "?B?";
326                return encodeB(prefix, text, usedCharacters, charset, bytes);
327            } else {
328                String prefix = ENC_WORD_PREFIX + mimeCharset + "?Q?";
329                return encodeQ(prefix, text, usage, usedCharacters, charset, bytes);
330            }
331        }
332    
333        /**
334         * Encodes the specified byte array using the B encoding defined in RFC
335         * 2047.
336         * 
337         * @param bytes
338         *            byte array to encode.
339         * @return encoded string.
340         */
341        public static String encodeB(byte[] bytes) {
342            StringBuilder sb = new StringBuilder();
343    
344            int idx = 0;
345            final int end = bytes.length;
346            for (; idx < end - 2; idx += 3) {
347                int data = (bytes[idx] & 0xff) << 16 | (bytes[idx + 1] & 0xff) << 8
348                        | bytes[idx + 2] & 0xff;
349                sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]);
350                sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]);
351                sb.append((char) BASE64_TABLE[data >> 6 & 0x3f]);
352                sb.append((char) BASE64_TABLE[data & 0x3f]);
353            }
354    
355            if (idx == end - 2) {
356                int data = (bytes[idx] & 0xff) << 16 | (bytes[idx + 1] & 0xff) << 8;
357                sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]);
358                sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]);
359                sb.append((char) BASE64_TABLE[data >> 6 & 0x3f]);
360                sb.append(BASE64_PAD);
361    
362            } else if (idx == end - 1) {
363                int data = (bytes[idx] & 0xff) << 16;
364                sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]);
365                sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]);
366                sb.append(BASE64_PAD);
367                sb.append(BASE64_PAD);
368            }
369    
370            return sb.toString();
371        }
372    
373        /**
374         * Encodes the specified byte array using the Q encoding defined in RFC
375         * 2047.
376         * 
377         * @param bytes
378         *            byte array to encode.
379         * @param usage
380         *            whether the encoded-word is to be used to replace a text token
381         *            or a word entity (see RFC 822).
382         * @return encoded string.
383         */
384        public static String encodeQ(byte[] bytes, Usage usage) {
385            BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS
386                    : Q_RESTRICTED_CHARS;
387    
388            StringBuilder sb = new StringBuilder();
389    
390            final int end = bytes.length;
391            for (int idx = 0; idx < end; idx++) {
392                int v = bytes[idx] & 0xff;
393                if (v == 32) {
394                    sb.append('_');
395                } else if (!qChars.get(v)) {
396                    sb.append('=');
397                    sb.append(hexDigit(v >>> 4));
398                    sb.append(hexDigit(v & 0xf));
399                } else {
400                    sb.append((char) v);
401                }
402            }
403    
404            return sb.toString();
405        }
406    
407        /**
408         * Tests whether the specified string is a token as defined in RFC 2045
409         * section 5.1.
410         * 
411         * @param str
412         *            string to test.
413         * @return <code>true</code> if the specified string is a RFC 2045 token,
414         *         <code>false</code> otherwise.
415         */
416        public static boolean isToken(String str) {
417            // token := 1*<any (US-ASCII) CHAR except SPACE, CTLs, or tspecials>
418            // tspecials := "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" / "\" /
419            // <"> / "/" / "[" / "]" / "?" / "="
420            // CTL := 0.- 31., 127.
421    
422            final int length = str.length();
423            if (length == 0)
424                return false;
425    
426            for (int idx = 0; idx < length; idx++) {
427                char ch = str.charAt(idx);
428                if (!TOKEN_CHARS.get(ch))
429                    return false;
430            }
431    
432            return true;
433        }
434    
435        private static boolean isAtomPhrase(String str) {
436            // atom = [CFWS] 1*atext [CFWS]
437    
438            boolean containsAText = false;
439    
440            final int length = str.length();
441            for (int idx = 0; idx < length; idx++) {
442                char ch = str.charAt(idx);
443                if (ATEXT_CHARS.get(ch)) {
444                    containsAText = true;
445                } else if (!CharsetUtil.isWhitespace(ch)) {
446                    return false;
447                }
448            }
449    
450            return containsAText;
451        }
452    
453        // RFC 5322 section 3.2.3
454        private static boolean isDotAtomText(String str) {
455            // dot-atom-text = 1*atext *("." 1*atext)
456            // atext = ALPHA / DIGIT / "!" / "#" / "$" / "%" / "&" / "'" / "*" /
457            // "+" / "-" / "/" / "=" / "?" / "^" / "_" / "`" / "{" / "|" / "}" / "~"
458    
459            char prev = '.';
460    
461            final int length = str.length();
462            if (length == 0)
463                return false;
464    
465            for (int idx = 0; idx < length; idx++) {
466                char ch = str.charAt(idx);
467    
468                if (ch == '.') {
469                    if (prev == '.' || idx == length - 1)
470                        return false;
471                } else {
472                    if (!ATEXT_CHARS.get(ch))
473                        return false;
474                }
475    
476                prev = ch;
477            }
478    
479            return true;
480        }
481    
482        // RFC 5322 section 3.2.4
483        private static String quote(String str) {
484            // quoted-string = [CFWS] DQUOTE *([FWS] qcontent) [FWS] DQUOTE [CFWS]
485            // qcontent = qtext / quoted-pair
486            // qtext = %d33 / %d35-91 / %d93-126
487            // quoted-pair = ("\" (VCHAR / WSP))
488            // VCHAR = %x21-7E
489            // DQUOTE = %x22
490    
491            String escaped = str.replaceAll("[\\\\\"]", "\\\\$0");
492            return "\"" + escaped + "\"";
493        }
494    
495        private static String encodeB(String prefix, String text,
496                int usedCharacters, Charset charset, byte[] bytes) {
497            int encodedLength = bEncodedLength(bytes);
498    
499            int totalLength = prefix.length() + encodedLength
500                    + ENC_WORD_SUFFIX.length();
501            if (totalLength <= ENCODED_WORD_MAX_LENGTH - usedCharacters) {
502                return prefix + encodeB(bytes) + ENC_WORD_SUFFIX;
503            } else {
504                String part1 = text.substring(0, text.length() / 2);
505                byte[] bytes1 = encode(part1, charset);
506                String word1 = encodeB(prefix, part1, usedCharacters, charset,
507                        bytes1);
508    
509                String part2 = text.substring(text.length() / 2);
510                byte[] bytes2 = encode(part2, charset);
511                String word2 = encodeB(prefix, part2, 0, charset, bytes2);
512    
513                return word1 + " " + word2;
514            }
515        }
516    
517        private static int bEncodedLength(byte[] bytes) {
518            return (bytes.length + 2) / 3 * 4;
519        }
520    
521        private static String encodeQ(String prefix, String text, Usage usage,
522                int usedCharacters, Charset charset, byte[] bytes) {
523            int encodedLength = qEncodedLength(bytes, usage);
524    
525            int totalLength = prefix.length() + encodedLength
526                    + ENC_WORD_SUFFIX.length();
527            if (totalLength <= ENCODED_WORD_MAX_LENGTH - usedCharacters) {
528                return prefix + encodeQ(bytes, usage) + ENC_WORD_SUFFIX;
529            } else {
530                String part1 = text.substring(0, text.length() / 2);
531                byte[] bytes1 = encode(part1, charset);
532                String word1 = encodeQ(prefix, part1, usage, usedCharacters,
533                        charset, bytes1);
534    
535                String part2 = text.substring(text.length() / 2);
536                byte[] bytes2 = encode(part2, charset);
537                String word2 = encodeQ(prefix, part2, usage, 0, charset, bytes2);
538    
539                return word1 + " " + word2;
540            }
541        }
542    
543        private static int qEncodedLength(byte[] bytes, Usage usage) {
544            BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS
545                    : Q_RESTRICTED_CHARS;
546    
547            int count = 0;
548    
549            for (int idx = 0; idx < bytes.length; idx++) {
550                int v = bytes[idx] & 0xff;
551                if (v == 32) {
552                    count++;
553                } else if (!qChars.get(v)) {
554                    count += 3;
555                } else {
556                    count++;
557                }
558            }
559    
560            return count;
561        }
562    
563        private static byte[] encode(String text, Charset charset) {
564            ByteBuffer buffer = charset.encode(text);
565            byte[] bytes = new byte[buffer.limit()];
566            buffer.get(bytes);
567            return bytes;
568        }
569    
570        private static Charset determineCharset(String text) {
571            // it is an important property of iso-8859-1 that it directly maps
572            // unicode code points 0000 to 00ff to byte values 00 to ff.
573            boolean ascii = true;
574            final int len = text.length();
575            for (int index = 0; index < len; index++) {
576                char ch = text.charAt(index);
577                if (ch > 0xff) {
578                    return CharsetUtil.UTF_8;
579                }
580                if (ch > 0x7f) {
581                    ascii = false;
582                }
583            }
584            return ascii ? CharsetUtil.US_ASCII : CharsetUtil.ISO_8859_1;
585        }
586    
587        private static Encoding determineEncoding(byte[] bytes, Usage usage) {
588            if (bytes.length == 0)
589                return Encoding.Q;
590    
591            BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS
592                    : Q_RESTRICTED_CHARS;
593    
594            int qEncoded = 0;
595            for (int i = 0; i < bytes.length; i++) {
596                int v = bytes[i] & 0xff;
597                if (v != 32 && !qChars.get(v)) {
598                    qEncoded++;
599                }
600            }
601    
602            int percentage = qEncoded * 100 / bytes.length;
603            return percentage > 30 ? Encoding.B : Encoding.Q;
604        }
605    
606        private static char hexDigit(int i) {
607            return i < 10 ? (char) (i + '0') : (char) (i - 10 + 'A');
608        }
609    }