javax.mail.internet.HeaderTokenizer.java Source code

Introduction

Here is the source code for javax.mail.internet.HeaderTokenizer.java
Source

/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
 *
 * Copyright (c) 1997-2017 Oracle and/or its affiliates. All rights reserved.
 *
 * The contents of this file are subject to the terms of either the GNU
 * General Public License Version 2 only ("GPL") or the Common Development
 * and Distribution License("CDDL") (collectively, the "License").  You
 * may not use this file except in compliance with the License.  You can
 * obtain a copy of the License at
 * https://oss.oracle.com/licenses/CDDL+GPL-1.1
 * or LICENSE.txt.  See the License for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing the software, include this License Header Notice in each
 * file and include the License file at LICENSE.txt.
 *
 * GPL Classpath Exception:
 * Oracle designates this particular file as subject to the "Classpath"
 * exception as provided by Oracle in the GPL Version 2 section of the License
 * file that accompanied this code.
 *
 * Modifications:
 * If applicable, add the following below the License Header, with the fields
 * enclosed by brackets [] replaced by your own identifying information:
 * "Portions Copyright [year] [name of copyright owner]"
 *
 * Contributor(s):
 * If you wish your version of this file to be governed by only the CDDL or
 * only the GPL Version 2, indicate your decision by adding "[Contributor]
 * elects to include this software in this distribution under the [CDDL or GPL
 * Version 2] license."  If you don't indicate a single choice of license, a
 * recipient has the option to distribute your version of this file under
 * either the CDDL, the GPL Version 2 or to extend the choice of license to
 * its licensees as provided above.  However, if you add GPL Version 2 code
 * and therefore, elected the GPL Version 2 license, then the option applies
 * only if the new code is made subject to such option by the copyright
 * holder.
 */

package javax.mail.internet;

import java.util.*;

/**
 * This class tokenizes RFC822 and MIME headers into the basic
 * symbols specified by RFC822 and MIME. <p>
 *
 * This class handles folded headers (i.e., headers with embedded
 * CRLF SPACE sequences). The folds are removed in the returned
 * tokens.
 *
 * @author  John Mani
 * @author  Bill Shannon
 */

public class HeaderTokenizer {

    /**
     * The Token class represents tokens returned by the
     * HeaderTokenizer. A token is an immutable (type, value) pair.
     */
    public static class Token {

        private final int type;
        private final String value;

        /**
         * Token type indicating an ATOM.
         */
        public static final int ATOM = -1;

        /**
         * Token type indicating a quoted string. The value
         * field contains the string without the quotes.
         */
        public static final int QUOTEDSTRING = -2;

        /**
         * Token type indicating a comment. The value field
         * contains the comment string without the comment
         * start and end symbols.
         */
        public static final int COMMENT = -3;

        /**
         * Token type indicating end of input.
         */
        public static final int EOF = -4;

        /**
         * Constructor.
         * @param   type   Token type
         * @param   value   Token value
         */
        public Token(int type, String value) {
            this.type = type;
            this.value = value;
        }

        /**
         * Return the type of the token. If the token represents a
         * delimiter or a control character, the type is that character
         * itself, converted to an integer. Otherwise, its value is
         * one of the following:
         * <ul>
         * <li><code>ATOM</code> A sequence of ASCII characters
         *   delimited by either SPACE, CTL, "(", &lt;"&gt; or the
         *   specified SPECIALS
         * <li><code>QUOTEDSTRING</code> A sequence of ASCII characters
         *   within quotes
         * <li><code>COMMENT</code> A sequence of ASCII characters
         *   within "(" and ")".
         * <li><code>EOF</code> End of header
         * </ul>
         *
         * @return   the token type
         */
        public int getType() {
            return type;
        }

        /**
         * Returns the value of the token just read. When the current
         * token is a quoted string, this field contains the body of the
         * string, without the quotes. When the current token is a comment,
         * this field contains the body of the comment.
         *
         * @return   token value
         */
        public String getValue() {
            return value;
        }
    }

    private String string; // the string to be tokenized
    private boolean skipComments; // should comments be skipped ?
    private String delimiters; // delimiter string
    private int currentPos; // current parse position
    private int maxPos; // string length
    private int nextPos; // track start of next Token for next()
    private int peekPos; // track start of next Token for peek()

    /**
     * RFC822 specials
     */
    public static final String RFC822 = "()<>@,;:\\\"\t .[]";

    /**
     * MIME specials
     */
    public static final String MIME = "()<>@,;:\\\"\t []/?=";

    // The EOF Token; shared because tokens are immutable
    private static final Token EOFToken = new Token(Token.EOF, null);

    /**
     * Constructor that takes a rfc822 style header.
     *
     * @param   header   The rfc822 header to be tokenized; a null
     *            header is treated as the empty string
     * @param   delimiters      Set of delimiter characters
     *            to be used to delimit ATOMS. These
     *            are usually <code>RFC822</code> or
     *            <code>MIME</code>
     * @param   skipComments  If true, comments are skipped and
     *            not returned as tokens
     */
    public HeaderTokenizer(String header, String delimiters, boolean skipComments) {
        string = (header == null) ? "" : header; // paranoia ?!
        this.skipComments = skipComments;
        this.delimiters = delimiters;
        currentPos = nextPos = peekPos = 0;
        maxPos = string.length();
    }

    /**
     * Constructor. Comments are ignored and not returned as tokens
     *
     * @param   header  The header that is tokenized
     * @param   delimiters  The delimiters to be used
     */
    public HeaderTokenizer(String header, String delimiters) {
        this(header, delimiters, true);
    }

    /**
     * Constructor. The RFC822 defined delimiters - RFC822 - are
     * used to delimit ATOMS. Also comments are skipped and not
     * returned as tokens
     *
     * @param   header   the header string
     */
    public HeaderTokenizer(String header) {
        this(header, RFC822);
    }

    /**
     * Parses the next token from this String. <p>
     *
     * Clients sit in a loop calling next() to parse successive
     * tokens until an EOF Token is returned.
     *
     * @return      the next Token
     * @exception   ParseException if the parse fails
     */
    public Token next() throws ParseException {
        return next('\0', false);
    }

    /**
     * Parses the next token from this String.
     * If endOfAtom is not NUL, the token extends until the
     * endOfAtom character is seen, or to the end of the header.
     * This method is useful when parsing headers that don't
     * obey the MIME specification, e.g., by failing to quote
     * parameter values that contain spaces.
     *
     * @param   endOfAtom   if not NUL, character marking end of token
     * @return      the next Token
     * @exception   ParseException if the parse fails
     * @since      JavaMail 1.5
     */
    public Token next(char endOfAtom) throws ParseException {
        return next(endOfAtom, false);
    }

    /**
     * Parses the next token from this String.
     * endOfAtom is handled as above.  If keepEscapes is true,
     * any backslash escapes are preserved in the returned string.
     * This method is useful when parsing headers that don't
     * obey the MIME specification, e.g., by failing to escape
     * backslashes in the filename parameter.
     *
     * @param   endOfAtom   if not NUL, character marking end of token
     * @param   keepEscapes   keep all backslashes in returned string?
     * @return      the next Token
     * @exception   ParseException if the parse fails
     * @since      JavaMail 1.5
     */
    public Token next(char endOfAtom, boolean keepEscapes) throws ParseException {
        currentPos = nextPos; // resume where the last next() stopped
        Token tk = getNext(endOfAtom, keepEscapes);
        // consuming a token also discards any look-ahead done by peek()
        nextPos = peekPos = currentPos;
        return tk;
    }

    /**
     * Peek at the next token, without actually removing the token
     * from the parse stream. Invoking this method multiple times
     * will return successive tokens, until <code>next()</code> is
     * called. <p>
     *
     * @return      the next Token
     * @exception   ParseException if the parse fails
     */
    public Token peek() throws ParseException {
        currentPos = peekPos; // resume where the last peek() stopped
        Token tk = getNext('\0', false);
        peekPos = currentPos; // advance only the look-ahead position
        return tk;
    }

    /**
     * Return the rest of the Header, i.e. everything that has not
     * yet been consumed by <code>next()</code>. Tokens that were only
     * peeked at are still part of the remainder.
     *
     * @return String   rest of header. null is returned if we are
     *         already at end of header
     */
    public String getRemainder() {
        if (nextPos >= string.length()) {
            return null;
        }
        return string.substring(nextPos);
    }

    /*
     * Return the next token starting from 'currentPos'. After the
     * parse, 'currentPos' is updated to point to the start of the
     * next token.
     */
    private Token getNext(char endOfAtom, boolean keepEscapes) throws ParseException {
        // If we're already at end of string, return EOF
        if (currentPos >= maxPos) {
            return EOFToken;
        }

        // Skip white-space, position currentPos beyond the space
        if (skipWhiteSpace() == Token.EOF) {
            return EOFToken;
        }

        char c;
        int start;
        boolean filter = false;

        c = string.charAt(currentPos);

        // Check or Skip comments and position currentPos
        // beyond the comment
        while (c == '(') {
            // Parsing comment; comments may nest, so count the parentheses
            int nesting;
            for (start = ++currentPos, nesting = 1; nesting > 0 && currentPos < maxPos; currentPos++) {
                c = string.charAt(currentPos);
                if (c == '\\') { // Escape sequence
                    currentPos++; // skip the escaped character
                    filter = true;
                } else if (c == '\r') {
                    filter = true;
                } else if (c == '(') {
                    nesting++;
                } else if (c == ')') {
                    nesting--;
                }
            }
            if (nesting != 0) {
                throw new ParseException("Unbalanced comments");
            }

            if (!skipComments) {
                // Return the comment, if we are asked to.
                // Note that the comment start & end markers are ignored.
                String s;
                if (filter) { // need to go thru the token again.
                    s = filterToken(string, start, currentPos - 1, keepEscapes);
                } else {
                    s = string.substring(start, currentPos - 1);
                }

                return new Token(Token.COMMENT, s);
            }

            // Skip any whitespace after the comment.
            if (skipWhiteSpace() == Token.EOF) {
                return EOFToken;
            }
            c = string.charAt(currentPos);
        }

        // Check for quoted-string and position currentPos
        //  beyond the terminating quote
        if (c == '"') {
            currentPos++; // skip initial quote
            return collectString('"', keepEscapes);
        }

        // Check for SPECIAL or CTL (040 is SPACE, 0177 is DEL, in octal)
        if (c < 040 || c >= 0177 || delimiters.indexOf(c) >= 0) {
            if (endOfAtom > 0 && c != endOfAtom) {
                // not expecting a special character here,
                // pretend it's a quoted string
                return collectString(endOfAtom, keepEscapes);
            }
            currentPos++; // re-position currentPos
            // the token type of a delimiter is the character itself
            return new Token(c, String.valueOf(c));
        }

        // Check for ATOM
        for (start = currentPos; currentPos < maxPos; currentPos++) {
            c = string.charAt(currentPos);
            // ATOM is delimited by either SPACE, CTL, "(", <">
            // or the specified SPECIALS
            if (c < 040 || c >= 0177 || c == '(' || c == ' ' || c == '"' || delimiters.indexOf(c) >= 0) {
                if (endOfAtom > 0 && c != endOfAtom) {
                    // not the expected atom after all;
                    // back up and pretend it's a quoted string
                    currentPos = start;
                    return collectString(endOfAtom, keepEscapes);
                }
                break;
            }
        }
        return new Token(Token.ATOM, string.substring(start, currentPos));
    }

    /*
     * Collect characters starting at 'currentPos' up to (but not
     * including) the first unescaped 'eos' character and return them
     * as a QUOTEDSTRING token.
     *
     * If 'eos' is the double quote, 'currentPos' ends up beyond the
     * closing quote, and a missing closing quote is a ParseException.
     * For any other 'eos' the value is trimmed of trailing white-space,
     * 'currentPos' is left on the 'eos' character so it is returned as
     * the next token, and running off the end of the header simply
     * ends the value.
     */
    private Token collectString(char eos, boolean keepEscapes) throws ParseException {
        int start;
        boolean filter = false;
        for (start = currentPos; currentPos < maxPos; currentPos++) {
            char c = string.charAt(currentPos);
            if (c == '\\') { // Escape sequence
                currentPos++; // skip the escaped character
                filter = true;
            } else if (c == '\r') {
                filter = true;
            } else if (c == eos) {
                currentPos++;
                String s;

                if (filter) {
                    s = filterToken(string, start, currentPos - 1, keepEscapes);
                } else {
                    s = string.substring(start, currentPos - 1);
                }

                if (c != '"') { // not a real quoted string
                    s = trimWhiteSpace(s);
                    currentPos--; // back up before the eos char
                }

                return new Token(Token.QUOTEDSTRING, s);
            }
        }

        // ran off the end of the string

        // if we're looking for a matching quote, that's an error
        if (eos == '"') {
            throw new ParseException("Unbalanced quoted string");
        }

        // A backslash as the very last character makes the loop above
        // step one position past the end of the header; clamp so the
        // extraction below (and later calls) stay within bounds.
        if (currentPos > maxPos) {
            currentPos = maxPos;
        }

        // otherwise, just return whatever's left
        String s;
        if (filter) {
            s = filterToken(string, start, currentPos, keepEscapes);
        } else {
            s = string.substring(start, currentPos);
        }
        s = trimWhiteSpace(s);
        return new Token(Token.QUOTEDSTRING, s);
    }

    // Skip SPACE, HT, CR and NL.  Returns the new position, or
    // Token.EOF if only white-space was left.
    private int skipWhiteSpace() {
        char c;
        for (; currentPos < maxPos; currentPos++) {
            if (((c = string.charAt(currentPos)) != ' ') && (c != '\t') && (c != '\r') && (c != '\n')) {
                return currentPos;
            }
        }
        return Token.EOF;
    }

    // Trim SPACE, HT, CR and NL from end of string
    private static String trimWhiteSpace(String s) {
        char c;
        int i;
        for (i = s.length() - 1; i >= 0; i--) {
            if (((c = s.charAt(i)) != ' ') && (c != '\t') && (c != '\r') && (c != '\n')) {
                break;
            }
        }
        // i is the index of the last non-white-space character, or -1
        // if there is none.  Index 0 is a valid hit, so only a negative
        // index means the result is empty.
        if (i < 0) {
            return "";
        }
        return s.substring(0, i + 1);
    }

    /* Process escape sequences and embedded LWSPs from a comment or
     * quoted string.  Characters of 's' in the range [start, end) are
     * copied, dropping unescaped CRs, the LF of an unescaped CRLF and,
     * unless keepEscapes is set, the backslash of each escape sequence.
     */
    private static String filterToken(String s, int start, int end, boolean keepEscapes) {
        StringBuilder sb = new StringBuilder();
        char c;
        boolean gotEscape = false;
        boolean gotCR = false;

        for (int i = start; i < end; i++) {
            c = s.charAt(i);
            if (c == '\n' && gotCR) {
                // This LF is part of an unescaped
                // CRLF sequence (i.e, LWSP). Skip it.
                gotCR = false;
                continue;
            }

            gotCR = false;
            if (!gotEscape) {
                // Previous character was NOT '\'
                if (c == '\\') { // skip this character
                    gotEscape = true;
                } else if (c == '\r') { // skip this character
                    gotCR = true;
                } else { // append this character
                    sb.append(c);
                }
            } else {
                // Previous character was '\'. So no need to
                // bother with any special processing, just
                // append this character.  If keepEscapes is
                // set, keep the backslash.  IE6 fails to escape
                // backslashes in quoted strings in HTTP headers,
                // e.g., in the filename parameter.
                if (keepEscapes) {
                    sb.append('\\');
                }
                sb.append(c);
                gotEscape = false;
            }
        }

        // A backslash that was the last character of the range has
        // nothing to escape; when escapes are being kept, keep it too
        // rather than silently losing it.
        if (gotEscape && keepEscapes) {
            sb.append('\\');
        }
        return sb.toString();
    }
}