/****************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one   *
 * or more contributor license agreements.  See the NOTICE file *
 * distributed with this work for additional information        *
 * regarding copyright ownership.  The ASF licenses this file   *
 * to you under the Apache License, Version 2.0 (the            *
 * "License"); you may not use this file except in compliance   *
 * with the License.  You may obtain a copy of the License at   *
 *                                                              *
 *   http://www.apache.org/licenses/LICENSE-2.0                 *
 *                                                              *
 * Unless required by applicable law or agreed to in writing,   *
 * software distributed under the License is distributed on an  *
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
 * KIND, either express or implied.  See the License for the    *
 * specific language governing permissions and limitations      *
 * under the License.                                           *
 ****************************************************************/

package org.apache.james.mime4j.codec;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.james.mime4j.util.CharsetUtil;

/**
 * Static methods for decoding strings, byte arrays and encoded words.
 */
public class DecoderUtil {
    private static final Log log = LogFactory.getLog(DecoderUtil.class);

    /**
     * Decodes a string containing quoted-printable encoded data.
     * <p>
     * The string is converted to US-ASCII bytes and read through a
     * {@link QuotedPrintableInputStream} until end of stream. If an
     * {@link IOException} occurs it is logged and the bytes decoded so far
     * are returned.
     *
     * @param s the string to decode.
     * @return the decoded bytes.
     */
    public static byte[] decodeBaseQuotedPrintable(String s) {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();

        try {
            byte[] bytes = s.getBytes("US-ASCII");

            QuotedPrintableInputStream is = new QuotedPrintableInputStream(
                    new ByteArrayInputStream(bytes));

            int b = 0;
            while ((b = is.read()) != -1) {
                baos.write(b);
            }
        } catch (IOException e) {
            /*
             * This should never happen: the source is an in-memory array.
             */
            log.error(e);
        }

        return baos.toByteArray();
    }

    /**
     * Decodes a string containing base64 encoded data.
     * <p>
     * The string is converted to US-ASCII bytes and read through a
     * {@link Base64InputStream} until end of stream. If an
     * {@link IOException} occurs it is logged and the bytes decoded so far
     * are returned.
     *
     * @param s the string to decode.
     * @return the decoded bytes.
     */
    public static byte[] decodeBase64(String s) {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();

        try {
            byte[] bytes = s.getBytes("US-ASCII");

            Base64InputStream is = new Base64InputStream(
                    new ByteArrayInputStream(bytes));

            int b = 0;
            while ((b = is.read()) != -1) {
                baos.write(b);
            }
        } catch (IOException e) {
            /*
             * This should never happen: the source is an in-memory array.
             */
            log.error(e);
        }

        return baos.toByteArray();
    }

    /**
     * Decodes an encoded word encoded with the 'B' encoding (described in
     * RFC 2047) found in a header field body.
     *
     * @param encodedWord the encoded word to decode.
     * @param charset the Java charset to use.
     * @return the decoded string.
     * @throws UnsupportedEncodingException if the given Java charset isn't
     *         supported.
     */
    public static String decodeB(String encodedWord, String charset)
            throws UnsupportedEncodingException {

        return new String(decodeBase64(encodedWord), charset);
    }

    /**
     * Decodes an encoded word encoded with the 'Q' encoding (described in
     * RFC 2047) found in a header field body.
     *
     * @param encodedWord the encoded word to decode.
     * @param charset the Java charset to use.
     * @return the decoded string.
     * @throws UnsupportedEncodingException if the given Java charset isn't
     *         supported.
     */
    public static String decodeQ(String encodedWord, String charset)
            throws UnsupportedEncodingException {

        /*
         * In the Q encoding '_' stands for a space; rewrite it as =20 so the
         * plain quoted-printable decoder can handle it.
         */
        String quotedPrintable = encodedWord.replace("_", "=20");

        return new String(decodeBaseQuotedPrintable(quotedPrintable), charset);
    }

    /**
     * Decodes a string containing encoded words as defined by RFC 2047.
     * Encoded words have the form
     * <code>=?charset?enc?Encoded word?=</code> where enc is either 'Q' or
     * 'q' for quoted-printable and 'B' or 'b' for Base64.
     * <p>
     * Text outside encoded words is copied unchanged, except that a
     * separator consisting only of whitespace between two successfully
     * decoded words is dropped. Encoded words that cannot be decoded are
     * copied to the result verbatim.
     *
     * @param body the string to decode.
     * @return the decoded string.
     */
    public static String decodeEncodedWords(String body) {
        int previousEnd = 0;
        boolean previousWasEncoded = false;

        StringBuilder sb = new StringBuilder();

        while (true) {
            int begin = body.indexOf("=?", previousEnd);
            int end = begin == -1 ? -1 : findTerminator(body, begin);
            if (end == -1) {
                if (previousEnd == 0) {
                    // nothing was decoded; avoid copying the input
                    return body;
                }

                sb.append(body.substring(previousEnd));
                return sb.toString();
            }
            end += 2; // move past the "?=" terminator

            String sep = body.substring(previousEnd, begin);

            String decoded = decodeEncodedWord(body, begin, end);
            if (decoded == null) {
                sb.append(sep);
                sb.append(body.substring(begin, end));
            } else {
                if (!previousWasEncoded || !CharsetUtil.isWhitespace(sep)) {
                    sb.append(sep);
                }
                sb.append(decoded);
            }

            previousEnd = end;
            previousWasEncoded = decoded != null;
        }
    }

    /**
     * Finds the index of the "?=" that terminates the encoded word starting
     * at <code>begin</code>.
     * <p>
     * Normally this is the first "?=" after the leading "=?". The exception
     * is a Q-encoded word whose text starts with an escape, such as
     * <code>=?utf-8?Q?=41?=</code>: there the first "?=" is the '?' that
     * follows the encoding letter plus the '=' of the escape. In that case
     * the next '?' is used instead, provided it is directly followed by '='.
     *
     * @param body the string being scanned.
     * @param begin index of the "=?" that opens the candidate encoded word.
     * @return index of the terminating "?=", or -1 if there is none.
     */
    private static int findTerminator(String body, int begin) {
        int terminator = body.indexOf("?=", begin + 2);
        if (terminator == -1) {
            return -1;
        }

        // The terminator is itself a '?' at or after begin + 2, so this
        // search always succeeds and never goes past the terminator.
        int qm1 = body.indexOf('?', begin + 2);
        if (qm1 == terminator) {
            return terminator;
        }

        int qm2 = body.indexOf('?', qm1 + 1);
        if (qm2 != terminator) {
            return terminator;
        }

        // Only the Q encoding can have text that starts with '='.
        boolean qEncoding = qm2 - qm1 == 2
                && (body.charAt(qm1 + 1) == 'Q' || body.charAt(qm1 + 1) == 'q');
        if (!qEncoding) {
            return terminator;
        }

        int qm3 = body.indexOf('?', qm2 + 1);
        if (qm3 != -1 && qm3 + 1 < body.length()
                && body.charAt(qm3 + 1) == '=') {
            return qm3;
        }

        return terminator;
    }

    /**
     * Decodes the single encoded word found at <code>body[begin, end)</code>.
     * The range must start with "=?" and end with "?=".
     * <p>
     * A warning is logged (if enabled) for every failure after the word has
     * been split into charset, encoding and text.
     *
     * @param body the string containing the encoded word.
     * @param begin index of the leading "=?".
     * @param end index just past the trailing "?=".
     * @return the decoded text, or <code>null</code> on error.
     */
    private static String decodeEncodedWord(String body, int begin, int end) {
        int qm1 = body.indexOf('?', begin + 2);
        if (qm1 == end - 2) {
            return null;
        }

        int qm2 = body.indexOf('?', qm1 + 1);
        if (qm2 == end - 2) {
            return null;
        }

        String mimeCharset = body.substring(begin + 2, qm1);
        String encoding = body.substring(qm1 + 1, qm2);
        String encodedText = body.substring(qm2 + 1, end - 2);

        String charset = CharsetUtil.toJavaCharset(mimeCharset);
        if (charset == null) {
            if (log.isWarnEnabled()) {
                log.warn("MIME charset '" + mimeCharset + "' in encoded word '"
                        + body.substring(begin, end) + "' doesn't have a "
                        + "corresponding Java charset");
            }
            return null;
        } else if (!CharsetUtil.isDecodingSupported(charset)) {
            if (log.isWarnEnabled()) {
                log.warn("Current JDK doesn't support decoding of charset '"
                        + charset + "' (MIME charset '" + mimeCharset
                        + "' in encoded word '" + body.substring(begin, end)
                        + "')");
            }
            return null;
        }

        if (encodedText.length() == 0) {
            if (log.isWarnEnabled()) {
                log.warn("Missing encoded text in encoded word: '"
                        + body.substring(begin, end) + "'");
            }
            return null;
        }

        try {
            if (encoding.equalsIgnoreCase("Q")) {
                return DecoderUtil.decodeQ(encodedText, charset);
            } else if (encoding.equalsIgnoreCase("B")) {
                return DecoderUtil.decodeB(encodedText, charset);
            } else {
                if (log.isWarnEnabled()) {
                    log.warn("Warning: Unknown encoding in encoded word '"
                            + body.substring(begin, end) + "'");
                }
                return null;
            }
        } catch (UnsupportedEncodingException e) {
            // should not happen because of isDecodingSupported check above
            if (log.isWarnEnabled()) {
                log.warn("Unsupported encoding in encoded word '"
                        + body.substring(begin, end) + "'", e);
            }
            return null;
        } catch (RuntimeException e) {
            if (log.isWarnEnabled()) {
                log.warn("Could not decode encoded word '"
                        + body.substring(begin, end) + "'", e);
            }
            return null;
        }
    }
}