001    /****************************************************************
002     * Licensed to the Apache Software Foundation (ASF) under one   *
003     * or more contributor license agreements.  See the NOTICE file *
004     * distributed with this work for additional information        *
005     * regarding copyright ownership.  The ASF licenses this file   *
006     * to you under the Apache License, Version 2.0 (the            *
007     * "License"); you may not use this file except in compliance   *
008     * with the License.  You may obtain a copy of the License at   *
009     *                                                              *
010     *   http://www.apache.org/licenses/LICENSE-2.0                 *
011     *                                                              *
012     * Unless required by applicable law or agreed to in writing,   *
013     * software distributed under the License is distributed on an  *
014     * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
015     * KIND, either express or implied.  See the License for the    *
016     * specific language governing permissions and limitations      *
017     * under the License.                                           *
018     ****************************************************************/
019    
020    package org.apache.james.mime4j.parser;
021    
022    import java.io.IOException;
023    import java.io.InputStream;
024    import java.io.InputStreamReader;
025    import java.io.Reader;
026    import java.nio.charset.Charset;
027    import java.nio.charset.IllegalCharsetNameException;
028    import java.nio.charset.UnsupportedCharsetException;
029    import java.util.LinkedList;
030    
031    import org.apache.james.mime4j.MimeException;
032    import org.apache.james.mime4j.codec.Base64InputStream;
033    import org.apache.james.mime4j.codec.QuotedPrintableInputStream;
034    import org.apache.james.mime4j.descriptor.BodyDescriptor;
035    import org.apache.james.mime4j.io.BufferedLineReaderInputStream;
036    import org.apache.james.mime4j.io.LineNumberInputStream;
037    import org.apache.james.mime4j.io.LineNumberSource;
038    import org.apache.james.mime4j.util.CharsetUtil;
039    import org.apache.james.mime4j.util.MimeUtil;
040    
041    /**
042     * <p>
043     * Parses MIME (or RFC822) message streams of bytes or characters.
044     * The stream is converted into an event stream.
045     * <p>
046     * <p>
047     * Typical usage:
048     * </p>
049     * <pre>
050     *      MimeTokenStream stream = new MimeTokenStream();
051     *      stream.parse(new FileInputStream("mime.msg"));
052     *      for (int state = stream.getState();
053     *           state != MimeTokenStream.T_END_OF_STREAM;
054     *           state = stream.next()) {
055     *          switch (state) {
056     *            case MimeTokenStream.T_BODY:
057     *              System.out.println("Body detected, contents = "
058     *                + stream.getInputStream() + ", header data = "
059     *                + stream.getBodyDescriptor());
060     *              break;
061     *            case MimeTokenStream.T_FIELD:
062     *              System.out.println("Header field detected: "
063     *                + stream.getField());
064     *              break;
065     *            case MimeTokenStream.T_START_MULTIPART:
066     *              System.out.println("Multipart message detexted,"
067     *                + " header data = "
068     *                + stream.getBodyDescriptor());
069     *            ...
070     *          }
071     *      }
072     * </pre>
073     * <p>Instances of {@link MimeTokenStream} are reusable: Invoking the
074     * method {@link #parse(InputStream)} resets the token streams internal
075     * state. However, they are definitely <em>not</em> thread safe. If you
076     * have a multi threaded application, then the suggested use is to have
077     * one instance per thread.</p>
078     */
079    public class MimeTokenStream implements EntityStates, RecursionMode {
080        
081        /**
082         * Creates a stream that creates a more detailed body descriptor.
083         * @return <code>MimeTokenStream</code>, not null
084         */
085        public static final MimeTokenStream createMaximalDescriptorStream() {
086            MimeEntityConfig config = new MimeEntityConfig();
087            config.setMaximalBodyDescriptor(true);
088            return new MimeTokenStream(config);
089        }
090        
091        /**
092         * Creates a stream that strictly validates the input.
093         * @return <code>MimeTokenStream</code> which throws a 
094         * <code>MimeException</code> whenever possible issues 
095         * are dedicated in the input
096         */
097        public static final MimeTokenStream createStrictValidationStream() {
098            MimeEntityConfig config = new MimeEntityConfig();
099            config.setStrictParsing(true);
100            return new MimeTokenStream(config);
101        }
102        
103        private final MimeEntityConfig config;
104        private final LinkedList<EntityStateMachine> entities = new LinkedList<EntityStateMachine>();
105        
106        private int state = T_END_OF_STREAM;
107        private EntityStateMachine currentStateMachine;
108        private int recursionMode = M_RECURSE;
109        private BufferedLineReaderInputStream inbuffer;
110        
111        /**
112         * Constructs a standard (lax) stream.
113         * Optional validation events will be logged only.
114         * Use {@link #createStrictValidationStream()} to create
115         * a stream that strictly validates the input.
116         */
117        public MimeTokenStream() {
118            this(new MimeEntityConfig());
119        }
120        
121        protected MimeTokenStream(final MimeEntityConfig config) {
122            super();
123            this.config = config;
124        }
125        
126        /** Instructs the {@code MimeTokenStream} to parse the given streams contents.
127         * If the {@code MimeTokenStream} has already been in use, resets the streams
128         * internal state.
129         */
130        public void parse(InputStream stream) {
131            doParse(stream, null);
132        }
133    
134        /** Instructs the {@code MimeTokenStream} to parse the given content with 
135         * the content type. The message stream is assumed to have no message header
136         * and is expected to begin with a message body. This can be the case when 
137         * the message content is transmitted using a different transport protocol 
138         * such as HTTP.
139         * <p/>
140         * If the {@code MimeTokenStream} has already been in use, resets the streams
141         * internal state.
142         */    
143        public void parseHeadless(InputStream stream, String contentType) {
144            if (contentType == null) {
145                throw new IllegalArgumentException("Content type may not be null");
146            }
147            doParse(stream, contentType);
148        }
149    
150        private void doParse(InputStream stream, String contentType) {
151            entities.clear();
152    
153            LineNumberSource lineSource = null;
154            if (config.isCountLineNumbers()) {
155                LineNumberInputStream lineInput = new LineNumberInputStream(stream);
156                lineSource = lineInput;
157                stream = lineInput;
158            }
159    
160            inbuffer = new BufferedLineReaderInputStream(
161                    stream, 
162                    4 * 1024,
163                    config.getMaxLineLen());
164            switch (recursionMode) {
165            case M_RAW:
166                RawEntity rawentity = new RawEntity(inbuffer);
167                currentStateMachine = rawentity;
168                break;
169            case M_NO_RECURSE:
170            case M_FLAT:
171                // expected to be called only at start of paring
172            case M_RECURSE:
173                MimeEntity mimeentity = new MimeEntity(
174                        lineSource,
175                        inbuffer,
176                        null, 
177                        T_START_MESSAGE, 
178                        T_END_MESSAGE,
179                        config);
180                mimeentity.setRecursionMode(recursionMode);
181                if (contentType != null) {
182                    mimeentity.skipHeader(contentType);
183                }
184                currentStateMachine = mimeentity;
185                break;
186            }
187            entities.add(currentStateMachine);
188            state = currentStateMachine.getState();
189        }
190    
191        /**
192         * Determines if this parser is currently in raw mode.
193         * 
194         * @return <code>true</code> if in raw mode, <code>false</code>
195         *         otherwise.
196         * @see #setRecursionMode(int)
197         */
198        public boolean isRaw() {
199            return recursionMode == M_RAW;
200        }
201        
202        /**
203         * Gets the current recursion mode.
204         * The recursion mode specifies the approach taken to parsing parts.
205         * {@link #M_RAW}  mode does not parse the part at all.
206         * {@link #M_RECURSE} mode recursively parses each mail
207         * when an <code>message/rfc822</code> part is encounted;
208         * {@link #M_NO_RECURSE} does not.
209         * @return {@link #M_RECURSE}, {@link #M_RAW} or {@link #M_NO_RECURSE}
210         */
211        public int getRecursionMode() {
212            return recursionMode;
213        }
214        
215        /**
216         * Sets the current recursion.
217         * The recursion mode specifies the approach taken to parsing parts.
218         * {@link #M_RAW}  mode does not parse the part at all.
219         * {@link #M_RECURSE} mode recursively parses each mail
220         * when an <code>message/rfc822</code> part is encounted;
221         * {@link #M_NO_RECURSE} does not.
222         * @param mode {@link #M_RECURSE}, {@link #M_RAW} or {@link #M_NO_RECURSE}
223         */
224        public void setRecursionMode(int mode) {
225            recursionMode = mode;
226            if (currentStateMachine != null) {
227                currentStateMachine.setRecursionMode(mode);
228            }
229        }
230    
231        /**
232         * Finishes the parsing and stops reading lines.
233         * NOTE: No more lines will be parsed but the parser
234         * will still call 
235         * {@link ContentHandler#endMultipart()},
236         * {@link ContentHandler#endBodyPart()},
237         * {@link ContentHandler#endMessage()}, etc to match previous calls
238         * to 
239         * {@link ContentHandler#startMultipart(BodyDescriptor)},
240         * {@link ContentHandler#startBodyPart()},
241         * {@link ContentHandler#startMessage()}, etc.
242         */
243        public void stop() {
244            inbuffer.truncate();
245        }
246    
247        /**
248         * Returns the current state.
249         */
250        public int getState() {
251            return state;
252        }
253    
254        /**
255         * This method returns the raw entity, preamble, or epilogue contents.
256         * <p/>
257         * This method is valid, if {@link #getState()} returns either of
258         * {@link #T_RAW_ENTITY}, {@link #T_PREAMBLE}, or {@link #T_EPILOGUE}.
259         * 
260         * @return Data stream, depending on the current state.
261         * @throws IllegalStateException {@link #getState()} returns an
262         *   invalid value.
263         */
264        public InputStream getInputStream() {
265            return currentStateMachine.getContentStream();
266        }
267        
268        /**
269         * This method returns a transfer decoded stream based on the MIME 
270         * fields with the standard defaults.
271         * <p/>
272         * This method is valid, if {@link #getState()} returns either of
273         * {@link #T_RAW_ENTITY}, {@link #T_PREAMBLE}, or {@link #T_EPILOGUE}.
274         * 
275         * @return Data stream, depending on the current state.
276         * @throws IllegalStateException {@link #getState()} returns an
277         *   invalid value.
278         */
279        public InputStream getDecodedInputStream() {
280            BodyDescriptor bodyDescriptor = getBodyDescriptor();
281            String transferEncoding = bodyDescriptor.getTransferEncoding();
282            InputStream dataStream = currentStateMachine.getContentStream();
283            if (MimeUtil.isBase64Encoding(transferEncoding)) {
284                dataStream = new Base64InputStream(dataStream);
285            } else if (MimeUtil.isQuotedPrintableEncoded(transferEncoding)) {
286                dataStream = new QuotedPrintableInputStream(dataStream);
287            }
288            return dataStream;
289        }
290    
291        /**
292         * Gets a reader configured for the current body or body part.
293         * The reader will return a transfer and charset decoded 
294         * stream of characters based on the MIME fields with the standard
295         * defaults.
296         * This is a conveniance method and relies on {@link #getInputStream()}.
297         * Consult the javadoc for that method for known limitations.
298         * 
299         * @return <code>Reader</code>, not null
300         * @see #getInputStream 
301         * @throws IllegalStateException {@link #getState()} returns an
302         *   invalid value 
303         * @throws UnsupportedCharsetException if there is no JVM support 
304         * for decoding the charset
305         * @throws IllegalCharsetNameException if the charset name specified
306         * in the mime type is illegal
307         */
308        public Reader getReader() {
309            final BodyDescriptor bodyDescriptor = getBodyDescriptor();
310            final String mimeCharset = bodyDescriptor.getCharset();
311            final Charset charset;
312            if (mimeCharset == null || "".equals(mimeCharset)) {
313                charset = CharsetUtil.US_ASCII;
314            } else {
315                charset = Charset.forName(mimeCharset);
316            }
317            final InputStream instream = getDecodedInputStream();
318            return new InputStreamReader(instream, charset);
319        }
320        
321        /**
322         * <p>Gets a descriptor for the current entity.
323         * This method is valid if {@link #getState()} returns:</p>
324         * <ul>
325         * <li>{@link #T_BODY}</li>
326         * <li>{@link #T_START_MULTIPART}</li>
327         * <li>{@link #T_EPILOGUE}</li>
328         * <li>{@link #T_PREAMBLE}</li>
329         * </ul>
330         * @return <code>BodyDescriptor</code>, not nulls
331         */
332        public BodyDescriptor getBodyDescriptor() {
333            return currentStateMachine.getBodyDescriptor();
334        }
335    
336        /**
337         * This method is valid, if {@link #getState()} returns {@link #T_FIELD}.
338         * @return String with the fields raw contents.
339         * @throws IllegalStateException {@link #getState()} returns another
340         *   value than {@link #T_FIELD}.
341         */
342        public Field getField() {
343            return currentStateMachine.getField();
344        }
345        
346        /**
347         * This method advances the token stream to the next token.
348         * @throws IllegalStateException The method has been called, although
349         *   {@link #getState()} was already {@link #T_END_OF_STREAM}.
350         */
351        public int next() throws IOException, MimeException {
352            if (state == T_END_OF_STREAM  ||  currentStateMachine == null) {
353                throw new IllegalStateException("No more tokens are available.");
354            }
355            while (currentStateMachine != null) {
356                EntityStateMachine next = currentStateMachine.advance();
357                if (next != null) {
358                    entities.add(next);
359                    currentStateMachine = next;
360                }
361                state = currentStateMachine.getState();
362                if (state != T_END_OF_STREAM) {
363                    return state;
364                }
365                entities.removeLast();
366                if (entities.isEmpty()) {
367                    currentStateMachine = null;
368                } else {
369                    currentStateMachine = entities.getLast();
370                    currentStateMachine.setRecursionMode(recursionMode);
371                }
372            }
373            state = T_END_OF_STREAM;
374            return state;
375        }
376    
377        /**
378         * Renders a state as a string suitable for logging.
379         * @param state 
380         * @return rendered as string, not null
381         */
382        public static final String stateToString(int state) {
383            return AbstractEntity.stateToString(state);
384        }
385    }