001    /****************************************************************
002     * Licensed to the Apache Software Foundation (ASF) under one   *
003     * or more contributor license agreements.  See the NOTICE file *
004     * distributed with this work for additional information        *
005     * regarding copyright ownership.  The ASF licenses this file   *
006     * to you under the Apache License, Version 2.0 (the            *
007     * "License"); you may not use this file except in compliance   *
008     * with the License.  You may obtain a copy of the License at   *
009     *                                                              *
010     *   http://www.apache.org/licenses/LICENSE-2.0                 *
011     *                                                              *
012     * Unless required by applicable law or agreed to in writing,   *
013     * software distributed under the License is distributed on an  *
014     * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
015     * KIND, either express or implied.  See the License for the    *
016     * specific language governing permissions and limitations      *
017     * under the License.                                           *
018     ****************************************************************/
019    
020    package org.apache.james.mime4j.codec;
021    
022    import java.io.ByteArrayInputStream;
023    import java.io.ByteArrayOutputStream;
024    import java.io.IOException;
025    import java.io.UnsupportedEncodingException;
026    
027    import org.apache.commons.logging.Log;
028    import org.apache.commons.logging.LogFactory;
029    import org.apache.james.mime4j.util.CharsetUtil;
030    
031    /**
032     * Static methods for decoding strings, byte arrays and encoded words.
033     */
034    public class DecoderUtil {
035        private static Log log = LogFactory.getLog(DecoderUtil.class);
036        
037        /**
038         * Decodes a string containing quoted-printable encoded data. 
039         * 
040         * @param s the string to decode.
041         * @return the decoded bytes.
042         */
043        public static byte[] decodeBaseQuotedPrintable(String s) {
044            ByteArrayOutputStream baos = new ByteArrayOutputStream();
045            
046            try {
047                byte[] bytes = s.getBytes("US-ASCII");
048                
049                QuotedPrintableInputStream is = new QuotedPrintableInputStream(
050                                                   new ByteArrayInputStream(bytes));
051                
052                int b = 0;
053                while ((b = is.read()) != -1) {
054                    baos.write(b);
055                }
056            } catch (IOException e) {
057                /*
058                 * This should never happen!
059                 */
060                log.error(e);
061            }
062            
063            return baos.toByteArray();
064        }
065        
066        /**
067         * Decodes a string containing base64 encoded data. 
068         * 
069         * @param s the string to decode.
070         * @return the decoded bytes.
071         */
072        public static byte[] decodeBase64(String s) {
073            ByteArrayOutputStream baos = new ByteArrayOutputStream();
074            
075            try {
076                byte[] bytes = s.getBytes("US-ASCII");
077                
078                Base64InputStream is = new Base64InputStream(
079                                            new ByteArrayInputStream(bytes));
080                
081                int b = 0;
082                while ((b = is.read()) != -1) {
083                    baos.write(b);
084                }
085            } catch (IOException e) {
086                /*
087                 * This should never happen!
088                 */
089                log.error(e);
090            }
091            
092            return baos.toByteArray();
093        }
094        
095        /**
096         * Decodes an encoded word encoded with the 'B' encoding (described in 
097         * RFC 2047) found in a header field body.
098         * 
099         * @param encodedWord the encoded word to decode.
100         * @param charset the Java charset to use.
101         * @return the decoded string.
102         * @throws UnsupportedEncodingException if the given Java charset isn't 
103         *         supported.
104         */
105        public static String decodeB(String encodedWord, String charset) 
106                throws UnsupportedEncodingException {
107            
108            return new String(decodeBase64(encodedWord), charset);
109        }
110        
111        /**
112         * Decodes an encoded word encoded with the 'Q' encoding (described in 
113         * RFC 2047) found in a header field body.
114         * 
115         * @param encodedWord the encoded word to decode.
116         * @param charset the Java charset to use.
117         * @return the decoded string.
118         * @throws UnsupportedEncodingException if the given Java charset isn't 
119         *         supported.
120         */
121        public static String decodeQ(String encodedWord, String charset)
122                throws UnsupportedEncodingException {
123               
124            /*
125             * Replace _ with =20
126             */
127            StringBuilder sb = new StringBuilder(128);
128            for (int i = 0; i < encodedWord.length(); i++) {
129                char c = encodedWord.charAt(i);
130                if (c == '_') {
131                    sb.append("=20");
132                } else {
133                    sb.append(c);
134                }
135            }
136            
137            return new String(decodeBaseQuotedPrintable(sb.toString()), charset);
138        }
139        
140        /**
141         * Decodes a string containing encoded words as defined by RFC 2047.
142         * Encoded words in have the form 
143         * =?charset?enc?Encoded word?= where enc is either 'Q' or 'q' for 
144         * quoted-printable and 'B' or 'b' for Base64.
145         * 
146         * @param body the string to decode.
147         * @return the decoded string.
148         */
149        public static String decodeEncodedWords(String body) {
150            int previousEnd = 0;
151            boolean previousWasEncoded = false;
152    
153            StringBuilder sb = new StringBuilder();
154    
155            while (true) {
156                int begin = body.indexOf("=?", previousEnd);
157                int end = begin == -1 ? -1 : body.indexOf("?=", begin + 2);
158                if (end == -1) {
159                    if (previousEnd == 0)
160                        return body;
161    
162                    sb.append(body.substring(previousEnd));
163                    return sb.toString();
164                }
165                end += 2;
166    
167                String sep = body.substring(previousEnd, begin);
168    
169                String decoded = decodeEncodedWord(body, begin, end);
170                if (decoded == null) {
171                    sb.append(sep);
172                    sb.append(body.substring(begin, end));
173                } else {
174                    if (!previousWasEncoded || !CharsetUtil.isWhitespace(sep)) {
175                        sb.append(sep);
176                    }
177                    sb.append(decoded);
178                }
179    
180                previousEnd = end;
181                previousWasEncoded = decoded != null;
182            }
183        }
184    
185        // return null on error
186        private static String decodeEncodedWord(String body, int begin, int end) {
187            int qm1 = body.indexOf('?', begin + 2);
188            if (qm1 == end - 2)
189                return null;
190    
191            int qm2 = body.indexOf('?', qm1 + 1);
192            if (qm2 == end - 2)
193                return null;
194    
195            String mimeCharset = body.substring(begin + 2, qm1);
196            String encoding = body.substring(qm1 + 1, qm2);
197            String encodedText = body.substring(qm2 + 1, end - 2);
198    
199            String charset = CharsetUtil.toJavaCharset(mimeCharset);
200            if (charset == null) {
201                if (log.isWarnEnabled()) {
202                    log.warn("MIME charset '" + mimeCharset + "' in encoded word '"
203                            + body.substring(begin, end) + "' doesn't have a "
204                            + "corresponding Java charset");
205                }
206                return null;
207            } else if (!CharsetUtil.isDecodingSupported(charset)) {
208                if (log.isWarnEnabled()) {
209                    log.warn("Current JDK doesn't support decoding of charset '"
210                            + charset + "' (MIME charset '" + mimeCharset
211                            + "' in encoded word '" + body.substring(begin, end)
212                            + "')");
213                }
214                return null;
215            }
216    
217            if (encodedText.length() == 0) {
218                if (log.isWarnEnabled()) {
219                    log.warn("Missing encoded text in encoded word: '"
220                            + body.substring(begin, end) + "'");
221                }
222                return null;
223            }
224    
225            try {
226                if (encoding.equalsIgnoreCase("Q")) {
227                    return DecoderUtil.decodeQ(encodedText, charset);
228                } else if (encoding.equalsIgnoreCase("B")) {
229                    return DecoderUtil.decodeB(encodedText, charset);
230                } else {
231                    if (log.isWarnEnabled()) {
232                        log.warn("Warning: Unknown encoding in encoded word '"
233                                + body.substring(begin, end) + "'");
234                    }
235                    return null;
236                }
237            } catch (UnsupportedEncodingException e) {
238                // should not happen because of isDecodingSupported check above
239                if (log.isWarnEnabled()) {
240                    log.warn("Unsupported encoding in encoded word '"
241                            + body.substring(begin, end) + "'", e);
242                }
243                return null;
244            } catch (RuntimeException e) {
245                if (log.isWarnEnabled()) {
246                    log.warn("Could not decode encoded word '"
247                            + body.substring(begin, end) + "'", e);
248                }
249                return null;
250            }
251        }
252    }