001 /**************************************************************** 002 * Licensed to the Apache Software Foundation (ASF) under one * 003 * or more contributor license agreements. See the NOTICE file * 004 * distributed with this work for additional information * 005 * regarding copyright ownership. The ASF licenses this file * 006 * to you under the Apache License, Version 2.0 (the * 007 * "License"); you may not use this file except in compliance * 008 * with the License. You may obtain a copy of the License at * 009 * * 010 * http://www.apache.org/licenses/LICENSE-2.0 * 011 * * 012 * Unless required by applicable law or agreed to in writing, * 013 * software distributed under the License is distributed on an * 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * 015 * KIND, either express or implied. See the License for the * 016 * specific language governing permissions and limitations * 017 * under the License. * 018 ****************************************************************/ 019 020 package org.apache.james.mime4j.parser; 021 022 import java.io.IOException; 023 import java.io.InputStream; 024 import java.io.InputStreamReader; 025 import java.io.Reader; 026 import java.nio.charset.Charset; 027 import java.nio.charset.IllegalCharsetNameException; 028 import java.nio.charset.UnsupportedCharsetException; 029 import java.util.LinkedList; 030 031 import org.apache.james.mime4j.MimeException; 032 import org.apache.james.mime4j.codec.Base64InputStream; 033 import org.apache.james.mime4j.codec.QuotedPrintableInputStream; 034 import org.apache.james.mime4j.descriptor.BodyDescriptor; 035 import org.apache.james.mime4j.io.BufferedLineReaderInputStream; 036 import org.apache.james.mime4j.io.LineNumberInputStream; 037 import org.apache.james.mime4j.io.LineNumberSource; 038 import org.apache.james.mime4j.util.CharsetUtil; 039 import org.apache.james.mime4j.util.MimeUtil; 040 041 /** 042 * <p> 043 * Parses MIME (or RFC822) message streams of bytes or characters. 044 * The stream is converted into an event stream. 045 * <p> 046 * <p> 047 * Typical usage: 048 * </p> 049 * <pre> 050 * MimeTokenStream stream = new MimeTokenStream(); 051 * stream.parse(new FileInputStream("mime.msg")); 052 * for (int state = stream.getState(); 053 * state != MimeTokenStream.T_END_OF_STREAM; 054 * state = stream.next()) { 055 * switch (state) { 056 * case MimeTokenStream.T_BODY: 057 * System.out.println("Body detected, contents = " 058 * + stream.getInputStream() + ", header data = " 059 * + stream.getBodyDescriptor()); 060 * break; 061 * case MimeTokenStream.T_FIELD: 062 * System.out.println("Header field detected: " 063 * + stream.getField()); 064 * break; 065 * case MimeTokenStream.T_START_MULTIPART: 066 * System.out.println("Multipart message detexted," 067 * + " header data = " 068 * + stream.getBodyDescriptor()); 069 * ... 070 * } 071 * } 072 * </pre> 073 * <p>Instances of {@link MimeTokenStream} are reusable: Invoking the 074 * method {@link #parse(InputStream)} resets the token streams internal 075 * state. However, they are definitely <em>not</em> thread safe. If you 076 * have a multi threaded application, then the suggested use is to have 077 * one instance per thread.</p> 078 */ 079 public class MimeTokenStream implements EntityStates, RecursionMode { 080 081 /** 082 * Creates a stream that creates a more detailed body descriptor. 083 * @return <code>MimeTokenStream</code>, not null 084 */ 085 public static final MimeTokenStream createMaximalDescriptorStream() { 086 MimeEntityConfig config = new MimeEntityConfig(); 087 config.setMaximalBodyDescriptor(true); 088 return new MimeTokenStream(config); 089 } 090 091 /** 092 * Creates a stream that strictly validates the input. 093 * @return <code>MimeTokenStream</code> which throws a 094 * <code>MimeException</code> whenever possible issues 095 * are dedicated in the input 096 */ 097 public static final MimeTokenStream createStrictValidationStream() { 098 MimeEntityConfig config = new MimeEntityConfig(); 099 config.setStrictParsing(true); 100 return new MimeTokenStream(config); 101 } 102 103 private final MimeEntityConfig config; 104 private final LinkedList<EntityStateMachine> entities = new LinkedList<EntityStateMachine>(); 105 106 private int state = T_END_OF_STREAM; 107 private EntityStateMachine currentStateMachine; 108 private int recursionMode = M_RECURSE; 109 private BufferedLineReaderInputStream inbuffer; 110 111 /** 112 * Constructs a standard (lax) stream. 113 * Optional validation events will be logged only. 114 * Use {@link #createStrictValidationStream()} to create 115 * a stream that strictly validates the input. 116 */ 117 public MimeTokenStream() { 118 this(new MimeEntityConfig()); 119 } 120 121 protected MimeTokenStream(final MimeEntityConfig config) { 122 super(); 123 this.config = config; 124 } 125 126 /** Instructs the {@code MimeTokenStream} to parse the given streams contents. 127 * If the {@code MimeTokenStream} has already been in use, resets the streams 128 * internal state. 129 */ 130 public void parse(InputStream stream) { 131 doParse(stream, null); 132 } 133 134 /** Instructs the {@code MimeTokenStream} to parse the given content with 135 * the content type. The message stream is assumed to have no message header 136 * and is expected to begin with a message body. This can be the case when 137 * the message content is transmitted using a different transport protocol 138 * such as HTTP. 139 * <p/> 140 * If the {@code MimeTokenStream} has already been in use, resets the streams 141 * internal state. 142 */ 143 public void parseHeadless(InputStream stream, String contentType) { 144 if (contentType == null) { 145 throw new IllegalArgumentException("Content type may not be null"); 146 } 147 doParse(stream, contentType); 148 } 149 150 private void doParse(InputStream stream, String contentType) { 151 entities.clear(); 152 153 LineNumberSource lineSource = null; 154 if (config.isCountLineNumbers()) { 155 LineNumberInputStream lineInput = new LineNumberInputStream(stream); 156 lineSource = lineInput; 157 stream = lineInput; 158 } 159 160 inbuffer = new BufferedLineReaderInputStream( 161 stream, 162 4 * 1024, 163 config.getMaxLineLen()); 164 switch (recursionMode) { 165 case M_RAW: 166 RawEntity rawentity = new RawEntity(inbuffer); 167 currentStateMachine = rawentity; 168 break; 169 case M_NO_RECURSE: 170 case M_FLAT: 171 // expected to be called only at start of paring 172 case M_RECURSE: 173 MimeEntity mimeentity = new MimeEntity( 174 lineSource, 175 inbuffer, 176 null, 177 T_START_MESSAGE, 178 T_END_MESSAGE, 179 config); 180 mimeentity.setRecursionMode(recursionMode); 181 if (contentType != null) { 182 mimeentity.skipHeader(contentType); 183 } 184 currentStateMachine = mimeentity; 185 break; 186 } 187 entities.add(currentStateMachine); 188 state = currentStateMachine.getState(); 189 } 190 191 /** 192 * Determines if this parser is currently in raw mode. 193 * 194 * @return <code>true</code> if in raw mode, <code>false</code> 195 * otherwise. 196 * @see #setRecursionMode(int) 197 */ 198 public boolean isRaw() { 199 return recursionMode == M_RAW; 200 } 201 202 /** 203 * Gets the current recursion mode. 204 * The recursion mode specifies the approach taken to parsing parts. 205 * {@link #M_RAW} mode does not parse the part at all. 206 * {@link #M_RECURSE} mode recursively parses each mail 207 * when an <code>message/rfc822</code> part is encounted; 208 * {@link #M_NO_RECURSE} does not. 209 * @return {@link #M_RECURSE}, {@link #M_RAW} or {@link #M_NO_RECURSE} 210 */ 211 public int getRecursionMode() { 212 return recursionMode; 213 } 214 215 /** 216 * Sets the current recursion. 217 * The recursion mode specifies the approach taken to parsing parts. 218 * {@link #M_RAW} mode does not parse the part at all. 219 * {@link #M_RECURSE} mode recursively parses each mail 220 * when an <code>message/rfc822</code> part is encounted; 221 * {@link #M_NO_RECURSE} does not. 222 * @param mode {@link #M_RECURSE}, {@link #M_RAW} or {@link #M_NO_RECURSE} 223 */ 224 public void setRecursionMode(int mode) { 225 recursionMode = mode; 226 if (currentStateMachine != null) { 227 currentStateMachine.setRecursionMode(mode); 228 } 229 } 230 231 /** 232 * Finishes the parsing and stops reading lines. 233 * NOTE: No more lines will be parsed but the parser 234 * will still call 235 * {@link ContentHandler#endMultipart()}, 236 * {@link ContentHandler#endBodyPart()}, 237 * {@link ContentHandler#endMessage()}, etc to match previous calls 238 * to 239 * {@link ContentHandler#startMultipart(BodyDescriptor)}, 240 * {@link ContentHandler#startBodyPart()}, 241 * {@link ContentHandler#startMessage()}, etc. 242 */ 243 public void stop() { 244 inbuffer.truncate(); 245 } 246 247 /** 248 * Returns the current state. 249 */ 250 public int getState() { 251 return state; 252 } 253 254 /** 255 * This method returns the raw entity, preamble, or epilogue contents. 256 * <p/> 257 * This method is valid, if {@link #getState()} returns either of 258 * {@link #T_RAW_ENTITY}, {@link #T_PREAMBLE}, or {@link #T_EPILOGUE}. 259 * 260 * @return Data stream, depending on the current state. 261 * @throws IllegalStateException {@link #getState()} returns an 262 * invalid value. 263 */ 264 public InputStream getInputStream() { 265 return currentStateMachine.getContentStream(); 266 } 267 268 /** 269 * This method returns a transfer decoded stream based on the MIME 270 * fields with the standard defaults. 271 * <p/> 272 * This method is valid, if {@link #getState()} returns either of 273 * {@link #T_RAW_ENTITY}, {@link #T_PREAMBLE}, or {@link #T_EPILOGUE}. 274 * 275 * @return Data stream, depending on the current state. 276 * @throws IllegalStateException {@link #getState()} returns an 277 * invalid value. 278 */ 279 public InputStream getDecodedInputStream() { 280 BodyDescriptor bodyDescriptor = getBodyDescriptor(); 281 String transferEncoding = bodyDescriptor.getTransferEncoding(); 282 InputStream dataStream = currentStateMachine.getContentStream(); 283 if (MimeUtil.isBase64Encoding(transferEncoding)) { 284 dataStream = new Base64InputStream(dataStream); 285 } else if (MimeUtil.isQuotedPrintableEncoded(transferEncoding)) { 286 dataStream = new QuotedPrintableInputStream(dataStream); 287 } 288 return dataStream; 289 } 290 291 /** 292 * Gets a reader configured for the current body or body part. 293 * The reader will return a transfer and charset decoded 294 * stream of characters based on the MIME fields with the standard 295 * defaults. 296 * This is a conveniance method and relies on {@link #getInputStream()}. 297 * Consult the javadoc for that method for known limitations. 298 * 299 * @return <code>Reader</code>, not null 300 * @see #getInputStream 301 * @throws IllegalStateException {@link #getState()} returns an 302 * invalid value 303 * @throws UnsupportedCharsetException if there is no JVM support 304 * for decoding the charset 305 * @throws IllegalCharsetNameException if the charset name specified 306 * in the mime type is illegal 307 */ 308 public Reader getReader() { 309 final BodyDescriptor bodyDescriptor = getBodyDescriptor(); 310 final String mimeCharset = bodyDescriptor.getCharset(); 311 final Charset charset; 312 if (mimeCharset == null || "".equals(mimeCharset)) { 313 charset = CharsetUtil.US_ASCII; 314 } else { 315 charset = Charset.forName(mimeCharset); 316 } 317 final InputStream instream = getDecodedInputStream(); 318 return new InputStreamReader(instream, charset); 319 } 320 321 /** 322 * <p>Gets a descriptor for the current entity. 323 * This method is valid if {@link #getState()} returns:</p> 324 * <ul> 325 * <li>{@link #T_BODY}</li> 326 * <li>{@link #T_START_MULTIPART}</li> 327 * <li>{@link #T_EPILOGUE}</li> 328 * <li>{@link #T_PREAMBLE}</li> 329 * </ul> 330 * @return <code>BodyDescriptor</code>, not nulls 331 */ 332 public BodyDescriptor getBodyDescriptor() { 333 return currentStateMachine.getBodyDescriptor(); 334 } 335 336 /** 337 * This method is valid, if {@link #getState()} returns {@link #T_FIELD}. 338 * @return String with the fields raw contents. 339 * @throws IllegalStateException {@link #getState()} returns another 340 * value than {@link #T_FIELD}. 341 */ 342 public Field getField() { 343 return currentStateMachine.getField(); 344 } 345 346 /** 347 * This method advances the token stream to the next token. 348 * @throws IllegalStateException The method has been called, although 349 * {@link #getState()} was already {@link #T_END_OF_STREAM}. 350 */ 351 public int next() throws IOException, MimeException { 352 if (state == T_END_OF_STREAM || currentStateMachine == null) { 353 throw new IllegalStateException("No more tokens are available."); 354 } 355 while (currentStateMachine != null) { 356 EntityStateMachine next = currentStateMachine.advance(); 357 if (next != null) { 358 entities.add(next); 359 currentStateMachine = next; 360 } 361 state = currentStateMachine.getState(); 362 if (state != T_END_OF_STREAM) { 363 return state; 364 } 365 entities.removeLast(); 366 if (entities.isEmpty()) { 367 currentStateMachine = null; 368 } else { 369 currentStateMachine = entities.getLast(); 370 currentStateMachine.setRecursionMode(recursionMode); 371 } 372 } 373 state = T_END_OF_STREAM; 374 return state; 375 } 376 377 /** 378 * Renders a state as a string suitable for logging. 379 * @param state 380 * @return rendered as string, not null 381 */ 382 public static final String stateToString(int state) { 383 return AbstractEntity.stateToString(state); 384 } 385 }