com.microsoft.tfs.util.datetime.LenientDateTimeParser.java Source code

Java tutorial

Introduction

Here is the source code for com.microsoft.tfs.util.datetime.LenientDateTimeParser.java

Source

// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See License.txt in the repository root.

package com.microsoft.tfs.util.datetime;

import java.text.DateFormat;
import java.text.MessageFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.GregorianCalendar;
import java.util.Locale;
import java.util.TimeZone;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.microsoft.tfs.util.Check;
import com.microsoft.tfs.util.Messages;

/**
 * Attempts to parse a string containing a date and/or time into a Calendar
 * according to all the date format definitions it knows. Flags may be passed to
 * the parse method to use a default time if the given string contains a date
 * but not time, or use a default date if it contains a time but not date. An
 * exception is thrown if the parse did not match any available formats.
 * <p>
 * All parsing is done internally using many SimpleDateFormat instances arranged
 * in a list. The order in which they are matched against is:
 * <p>
 * <ol>
 * <li>SimpleDateFormat.getDateTimeInstance() for many precisions for the
 * instance's locale is tried</li>
 * <li>ISO8601 and ISO8601-like formats are tried next ("year-month-day")</li>
 * <li>If the instance's locale prefers "month-day-year" formats (United
 * States), those are tried next; if it prefers "day-month-year" instead, those
 * are tried next. Then the other (European or US) is tried.</li>
 * <li>Generic date-only formats (via SimpleDateFormat.getDateInstance()) for
 * the instance's locale are tried</li>
 * <li>Generic time-only formats (via SimpleDateFormat.getTimeInstance()) for
 * the instance's locale are tried</li>
 * </ol>
 * <p>
 * In total, many hundreds of formats are matched against, comprising all
 * commonly used combinations of precision, component ordering, different date
 * and time separators, month names vs. numbers, time zone presence, etc.
 * Because of the large number of formats constructed and matched, creating an
 * instance of LenientDateTimeParser may be a computationally intensive
 * operation. Consider reusing instances you create instead of creating many new
 * ones.
 * <p>
 * <b>"Lenient"</b>
 * <p>
 * The name "Lenient" as it applies to this class means "accepting of many
 * different date and time formats." It does not mean the same as the lenient
 * property of DateFormat, which means something like "nonsensical dates like
 * 2007/01/45 really mean 14 February 2007." The LenientDateTimeParser class
 * uses non-lenient SimpleDateFormat instances to do its parsing. Dates like
 * "2007/01/45" will not parse (unless some locale I don't know about accepts
 * it).
 * <p>
 * <b>Notes on Time Zones</b>
 * <p>
 * Each LenientDateTimeParser instance has an associated time zone (if not
 * specified during construction, it is the default time zone). The Calendar
 * built by {@link #parseExtended(String, boolean, boolean)} is always created
 * with the {@link LenientDateTimeParser} instance's associated time zone. When
 * a string <b>with</b> a time zone is parsed, the zone present in the input
 * string is used to interpret the text, so it determines the instant the
 * Calendar is set to rather than the Calendar's own zone.
 * <p>
 * <b>Thread Safety</b>
 * <p>
 * The parser's own fields never change after construction. The underlying
 * {@link DateFormat} instances are not safe for concurrent use, so each one is
 * locked while it parses. NOTE(review): the {@link DateFormat} handed to
 * {@link LenientDateTimeResult} appears to be the parser's own instance;
 * callers that use it from several threads should synchronize on it - confirm
 * against LenientDateTimeFormat.
 */
public class LenientDateTimeParser {
    private static final Log log = LogFactory.getLog(LenientDateTimeParser.class);

    /*
     * These formats are internationally recognized date formats. They will be
     * expanded in all the ways described by LenientDateTimeParserExpander.
     *
     * As a general rule, put more precise formats first in these arrays. Less
     * precise formats will match prematurely and cause additional precision in
     * the candidate string to be ignored.
     *
     * 12-hour time formats must include AM/PM marker.
     *
     * Milliseconds are generally not included (because TFS doesn't match with
     * that precision) except where standards require them (ISO8601).
     */

    // ISO8601 and variants
    private static final LenientDateTimePattern[] isoDateTimeFormats = new LenientDateTimePattern[]

    {
            // This comment blank to make the formatter happy.
            new LenientDateTimePattern("yyyy/MM/dd'T'HH:mm:ss.SSSz", true, true), // ISO8601-Standard //$NON-NLS-1$
            new LenientDateTimePattern("yyyy/MM/dd'T'HH:mm:ssz", true, true), // ISO8601-like //$NON-NLS-1$
            new LenientDateTimePattern("yyyy/MM/dd'T'HH:mm:ss", true, true), // ISO8601-like //$NON-NLS-1$
            new LenientDateTimePattern("yyyy/MM/dd'T'HH:mmz", true, true), // ISO8601-like //$NON-NLS-1$
            new LenientDateTimePattern("yyyy/MM/dd'T'HH:mm", true, true), // ISO8601-like //$NON-NLS-1$
    };

    // US date style
    private static final LenientDateTimePattern[] usDateFormats = new LenientDateTimePattern[]

    {
            // This comment blank to make the formatter happy.
            new LenientDateTimePattern("MM/dd/yy h:mm:ss a z", true, true), //$NON-NLS-1$
            new LenientDateTimePattern("MM/dd/yy h:mm:ss a", true, true), //$NON-NLS-1$
            new LenientDateTimePattern("MM/dd/yy h:mm a z", true, true), //$NON-NLS-1$
            new LenientDateTimePattern("MM/dd/yy h:mm a", true, true), //$NON-NLS-1$

            new LenientDateTimePattern("MM/dd/yy HH:mm:ss z", true, true), //$NON-NLS-1$
            new LenientDateTimePattern("MM/dd/yy HH:mm:ss", true, true), //$NON-NLS-1$
            new LenientDateTimePattern("MM/dd/yy HH:mm z", true, true), //$NON-NLS-1$
            new LenientDateTimePattern("MM/dd/yy HH:mm", true, true), //$NON-NLS-1$

            new LenientDateTimePattern("MM/dd/yy", true, false) //$NON-NLS-1$

    };

    // European date style
    private static final LenientDateTimePattern[] euDateFormats = new LenientDateTimePattern[]

    {
            // This comment blank to make the formatter happy.
            new LenientDateTimePattern("d/MM/yy h:mm:ss a z", true, true), //$NON-NLS-1$
            new LenientDateTimePattern("d/MM/yy h:mm:ss a", true, true), //$NON-NLS-1$
            new LenientDateTimePattern("d/MM/yy h:mm a z", true, true), //$NON-NLS-1$
            new LenientDateTimePattern("d/MM/yy h:mm a", true, true), //$NON-NLS-1$

            new LenientDateTimePattern("d/MM/yy HH:mm:ss z", true, true), //$NON-NLS-1$
            new LenientDateTimePattern("d/MM/yy HH:mm:ss", true, true), //$NON-NLS-1$
            new LenientDateTimePattern("d/MM/yy HH:mm z", true, true), //$NON-NLS-1$
            new LenientDateTimePattern("d/MM/yy HH:mm", true, true), //$NON-NLS-1$

            new LenientDateTimePattern("d/MM/yy", true, false) //$NON-NLS-1$

    };

    /*
     * Times only. These should be matched last because they're rarer, and they
     * may cause a false positive parse match with what should be a date
     * (because '.' is sometimes used as a separator for dates and times and
     * SimpleDateFormat is eager to match long inputs with short patterns).
     */
    private static final LenientDateTimePattern[] genericTimeFormats = new LenientDateTimePattern[]

    {
            // This comment blank to make the formatter happy.
            new LenientDateTimePattern("h:mm:ss a z", false, true), //$NON-NLS-1$
            new LenientDateTimePattern("h:mm:ss a", false, true), //$NON-NLS-1$
            new LenientDateTimePattern("h:mm a z", false, true), //$NON-NLS-1$
            new LenientDateTimePattern("h:mm a", false, true), //$NON-NLS-1$

            new LenientDateTimePattern("HH:mm:ss z", false, true), //$NON-NLS-1$
            new LenientDateTimePattern("HH:mm:ssz", false, true), //$NON-NLS-1$
            new LenientDateTimePattern("HH:mm:ss", false, true), //$NON-NLS-1$
            new LenientDateTimePattern("HH:mm z", false, true), //$NON-NLS-1$
            new LenientDateTimePattern("HH:mmz", false, true), //$NON-NLS-1$
            new LenientDateTimePattern("HH:mm", false, true) //$NON-NLS-1$

    };

    /**
     * These time zone names are equivalent to UTC but must be converted to a
     * string like "UTC" because {@link SimpleDateFormat} can't parse them
     * normally.
     */
    private static final String[] additionalUTCTimeZoneStrings = new String[] {
        "Z", //$NON-NLS-1$
        "z" //$NON-NLS-1$
    };

    /**
     * This array holds the sorted results of the expansion performed by
     * {@link #computeFormats()}. The contents are all the formats we'll test
     * against when parsing a date. It is assigned once, during construction.
     */
    private final LenientDateTimeFormat[] expandedFormats;

    private final TimeZone timeZone;
    private final Locale locale;

    /**
     * Constructs a lenient parser for the default locale and time zone.
     */
    public LenientDateTimeParser() {
        timeZone = TimeZone.getDefault();
        locale = Locale.getDefault();

        // Must run after timeZone and locale are assigned; it reads both.
        expandedFormats = computeFormats();
    }

    /**
     * Constructs a lenient parser for the given time zone and locale.
     *
     * @param timeZone
     *        the time zone where the returned dates and times are rooted (not
     *        null).
     * @param locale
     *        the locale to use for parsing and matching patterns as well as
     *        generated calendars (not null).
     */
    public LenientDateTimeParser(final TimeZone timeZone, final Locale locale) {
        Check.notNull(timeZone, "timeZone"); //$NON-NLS-1$
        Check.notNull(locale, "locale"); //$NON-NLS-1$

        this.timeZone = timeZone;
        this.locale = locale;

        // Must run after timeZone and locale are assigned; it reads both.
        expandedFormats = computeFormats();
    }

    /**
     * Parse a free-form date and time first with the default format for the
     * default locale, then by parsing a list of known formats until one
     * succeeds. If the date cannot be parsed, ParseException is thrown.
     * <p>
     * WARNING: GMT doesn't parse as a time zone on my platform (JDK 1.5,
     * Linux). Use UTC in your input instead.
     * <p>
     * This method may be somewhat slow, although it is probably not an area of
     * concern for most programs. Because of the way SimpleDateFormat parses,
     * all of the longer, more precise patterns must be tried first (and these
     * are least likely to be used by the user). This means we must walk an
     * array for each search and common formats are necessarily near the end. In
     * reality, most searches can be done under a few (6) milliseconds on a 3
     * GHz computer.
     * <p>
     * <b>Notes on Default Date and Time</b>
     * <p>
     * The defaultDate and defaultTime parameters control whether the returned
     * Calendar has its date and/or time components set to the current date
     * and/or time (instead of the Date class epoch) when the given date time
     * string matches a pattern that does not specify <b>both</b> date and time
     * components. These parameters do <b>not</b> affect the order in which
     * patterns are tested or the criteria for a pattern match. They simply
     * choose which date or time is returned in the date- or time-only pattern
     * match where one of the components was unspecified. When either
     * defaultDate or defaultTime is false the Date class's epoch (1970/01/01
     * 00:00:00 UTC) is used for the default date or time.
     * <p>
     * Both defaultDate and defaultTime may be set to true, but as these
     * parameters do not affect the matching behavior, a given string missing
     * <b>both</b> date and time components will still match no patterns
     * (causing a ParseException to be thrown).
     *
     * @param dateTimeString
     *        the date string to parse (not null).
     * @param defaultDate
     *        if true and the given date time string matches a time-only
     *        pattern, today's date is used in the returned Calendar. If false
     *        and a time-only pattern is matched, 1970/01/01 is used for the
     *        date. See the method Javadoc for more information.
     * @param defaultTime
     *        if true and the given date time string matches a date-only
     *        pattern, the current time is used in the returned Calendar. If
     *        false and a date-only pattern is matched, 00:00:00 is used for the
     *        time. See the method Javadoc for more information.
     * @return a {@link LenientDateTimeResult} containing the date/time that was
     *         in the given dateTimeString and additional match information.
     * @throws ParseException
     *         if the date time string could not be parsed.
     */
    public LenientDateTimeResult parseExtended(String dateTimeString, final boolean defaultDate,
            final boolean defaultTime) throws ParseException {
        Check.notNull(dateTimeString, "dateTimeString"); //$NON-NLS-1$

        final long start = System.currentTimeMillis();
        Calendar calendar = Calendar.getInstance(timeZone, locale);
        calendar.clear();

        // Java's SimpleDateFormat.parse() doesn't support some common time
        // zones, so we convert them before parsing.
        dateTimeString = convertUnsupportedTimeZones(dateTimeString);

        for (int i = 0; i < expandedFormats.length; i++) {
            final LenientDateTimeFormat format = expandedFormats[i];
            final DateFormat dateFormat = format.getDateFormat();

            try {
                /*
                 * DateFormat instances keep mutable parse state and are not
                 * safe for concurrent use, so hold the format's monitor while
                 * parsing. NOTE(review): assumes getDateFormat() returns the
                 * same shared instance on every call - confirm.
                 */
                synchronized (dateFormat) {
                    calendar.setTime(dateFormat.parse(dateTimeString));
                }

                if (defaultDate && !format.specifiesDate()) {
                    // Time-only match and the caller wants today's date.
                    copyCurrentDateInto(calendar);
                }

                if (defaultTime && !format.specifiesTime()) {
                    // Date-only match and the caller wants the current time.
                    calendar = withCurrentTime(calendar);
                }

                if (log.isTraceEnabled()) {
                    final String messageFormat = "matched at index {0} in {1} ms: {2}"; //$NON-NLS-1$
                    final String message = MessageFormat.format(messageFormat, i,
                            (System.currentTimeMillis() - start), dateTimeString);
                    log.trace(message);
                }

                return new LenientDateTimeResult(calendar, dateFormat, format.specifiesDate(),
                        format.specifiesTime());
            } catch (final ParseException e) {
                // This format did not match; keep looping over the rest.
            }
        }

        if (log.isTraceEnabled()) {
            final String traceFormat = "no match in {0} ms: {1}"; //$NON-NLS-1$
            log.trace(MessageFormat.format(traceFormat, (System.currentTimeMillis() - start), dateTimeString));
        }

        final String messageFormat = Messages.getString("LenientDateTimeParser.UnknownDateFormat"); //$NON-NLS-1$
        final String message = MessageFormat.format(messageFormat, dateTimeString);
        throw new ParseException(message, 0);
    }

    /**
     * Overwrites the era, year, month and day of the given calendar with
     * today's values, leaving its time-of-day fields as they are.
     * <p>
     * "Now" is created with the same factory as the parse calendar so that
     * both use the same calendar system; copying YEAR/ERA between different
     * calendar systems (for example Buddhist and Gregorian) would produce the
     * wrong year.
     *
     * @param target
     *        the calendar whose date fields are replaced (not null).
     */
    private void copyCurrentDateInto(final Calendar target) {
        final Calendar now = Calendar.getInstance(timeZone, locale);
        target.set(Calendar.ERA, now.get(Calendar.ERA));
        target.set(Calendar.YEAR, now.get(Calendar.YEAR));
        target.set(Calendar.MONTH, now.get(Calendar.MONTH));
        target.set(Calendar.DATE, now.get(Calendar.DATE));
    }

    /**
     * Builds a calendar holding the current time of day combined with the era,
     * year, month and day of the given calendar.
     * <p>
     * The new calendar comes from the same factory as the parse calendar so
     * the copied YEAR/ERA values keep their meaning.
     *
     * @param parsedDate
     *        the calendar supplying the date fields (not null).
     * @return a new calendar in this parser's time zone and locale.
     */
    private Calendar withCurrentTime(final Calendar parsedDate) {
        final Calendar result = Calendar.getInstance(timeZone, locale);
        result.set(Calendar.ERA, parsedDate.get(Calendar.ERA));
        result.set(Calendar.YEAR, parsedDate.get(Calendar.YEAR));
        result.set(Calendar.MONTH, parsedDate.get(Calendar.MONTH));
        result.set(Calendar.DATE, parsedDate.get(Calendar.DATE));
        return result;
    }

    /**
     * Like {@link #parseExtended(String, boolean, boolean)} but simply returns
     * the Calendar inside the {@link LenientDateTimeResult} instead of the
     * entire result.
     *
     * @param dateTimeString
     *        the date string to parse (not null).
     * @param defaultDate
     *        if true and the given date time string matches a time-only
     *        pattern, today's date is used in the returned Calendar. If false
     *        and a time-only pattern is matched, 1970/01/01 is used for the
     *        date. See the method Javadoc for more information.
     * @param defaultTime
     *        if true and the given date time string matches a date-only
     *        pattern, the current time is used in the returned Calendar. If
     *        false and a date-only pattern is matched, 00:00:00 is used for the
     *        time. See the method Javadoc for more information.
     * @return a new Calendar containing the date/time that was in the given
     *         dateTimeString.
     * @throws ParseException
     *         if the date time string could not be parsed.
     */
    public Calendar parse(final String dateTimeString, final boolean defaultDate, final boolean defaultTime)
            throws ParseException {
        return parseExtended(dateTimeString, defaultDate, defaultTime).getCalendar();
    }

    /**
     * Like calling {@link #parseExtended(String, boolean, boolean)} with
     * defaultDate and defaultTime set to true but simply returns the Calendar
     * inside the {@link LenientDateTimeResult} instead of the entire result.
     *
     * @see LenientDateTimeParser#parse(String, boolean, boolean)
     *
     * @param dateTimeString
     *        the date string to parse (not null).
     * @return a new Calendar containing the date/time that was in the given
     *         dateTimeString.
     * @throws ParseException
     *         if the date time string could not be parsed.
     */
    public Calendar parse(final String dateTimeString) throws ParseException {
        return parseExtended(dateTimeString, true, true).getCalendar();
    }

    /**
     * Looks for common time zone strings at the end of the given dateTimeString
     * that {@link SimpleDateFormat} can't parse and converts them to equivalent
     * ones it can. For example, "Z" is turned into "UTC".
     *
     * @param dateTimeString
     *        the date time string to parse. If null, null is returned.
     * @return the given dateTimeString with time zones converted to ones
     *         {@link SimpleDateFormat} can parse. Null if the given string was
     *         null.
     */
    protected String convertUnsupportedTimeZones(final String dateTimeString) {
        if (dateTimeString == null) {
            return null;
        }

        /*
         * Convert additional UTC zone strings (like "Z" and "z") to "UTC". We
         * consider the dateTimeString to contain a matching zone string if it
         * ends with it, and the character before it is whitespace or numeric.
         * This lets a date time string like "2007-05-02T22:04:12Foobaz" pass
         * unaltered, because "Foobaz" refers to a (hypothetical) time zone name
         * which SimpleDateFormat could parse.
         */
        for (final String zoneString : additionalUTCTimeZoneStrings) {
            if (!dateTimeString.endsWith(zoneString)) {
                continue;
            }

            final int zoneIndex = dateTimeString.length() - zoneString.length();

            // Make sure there's at least one character before the index.
            if (zoneIndex < 1) {
                continue;
            }

            final char previousCharacter = dateTimeString.charAt(zoneIndex - 1);

            if (Character.isDigit(previousCharacter) || Character.isWhitespace(previousCharacter)) {
                // It's a match: swap the trailing zone string for UTC. A plain
                // substring avoids treating the zone string as a regex.
                return dateTimeString.substring(0, zoneIndex) + "UTC"; //$NON-NLS-1$
            }
        }

        return dateTimeString;
    }

    /**
     * Builds the ordered list of formats this parser matches against. Reads
     * the {@code locale} and {@code timeZone} fields, so both must be assigned
     * before this is called.
     *
     * @return the sorted formats produced by the expander.
     */
    private LenientDateTimeFormat[] computeFormats() {
        /*
         * The expander takes pattern strings (and expands them) or takes
         * already created DateFormat instances and keeps them in the correct
         * order with the expanded strings.
         */
        final LenientDateTimeParserExpander expander = new LenientDateTimeParserExpander(false, locale);

        /*
         * Fill in the default locale date time formats. We go from FULL to
         * MEDIUM with dates to prevent 2-digit date matching (matches way too
         * early). We go from FULL to SHORT with times. The loops rely on the
         * DateFormat style constants being consecutive integers (FULL = 0,
         * LONG = 1, MEDIUM = 2, SHORT = 3).
         */
        for (int dateStyle = DateFormat.FULL; dateStyle <= DateFormat.MEDIUM; dateStyle++) {
            for (int timeStyle = DateFormat.FULL; timeStyle <= DateFormat.SHORT; timeStyle++) {
                final DateFormat df = DateFormat.getDateTimeInstance(dateStyle, timeStyle, locale);
                df.setLenient(false);
                expander.add(df, true, true);
            }
        }

        // ISO goes next.
        expander.addExpanded(isoDateTimeFormats);

        if (prefersMonthBeforeDay()) {
            /*
             * Month-before-day. Use US before EU.
             */
            expander.addExpanded(usDateFormats);
            expander.addExpanded(euDateFormats);
        } else {
            /*
             * Day-before-month. Use EU before US.
             */
            expander.addExpanded(euDateFormats);
            expander.addExpanded(usDateFormats);
        }

        // Add just the default local date instances (FULL through MEDIUM).
        for (int dateStyle = DateFormat.FULL; dateStyle <= DateFormat.MEDIUM; dateStyle++) {
            final DateFormat df = DateFormat.getDateInstance(dateStyle, locale);
            df.setLenient(false);
            expander.add(df, true, false);
        }

        // Add just the default local time instances (FULL through SHORT).
        for (int timeStyle = DateFormat.FULL; timeStyle <= DateFormat.SHORT; timeStyle++) {
            final DateFormat df = DateFormat.getTimeInstance(timeStyle, locale);
            df.setLenient(false);
            expander.add(df, false, true);
        }

        // Generic time goes last.
        expander.addExpanded(genericTimeFormats);

        return expander.getSortedResults();
    }

    /**
     * Detects whether this parser's locale writes the month before the day in
     * its short date format.
     * <p>
     * A date with a different month (1) and day (5) is formatted with the
     * short date formatter and the positions of the two digits are compared.
     * The short formatter is used so we don't get (possibly localized) month
     * names. If the digits cannot be found (both positions are -1), the result
     * is false, i.e. day-before-month.
     *
     * @return true if the "5" (day) appears after the "1" (month).
     */
    private boolean prefersMonthBeforeDay() {
        final Calendar probe = new GregorianCalendar(timeZone, locale);
        probe.clear();
        probe.set(Calendar.MONTH, Calendar.JANUARY);
        probe.set(Calendar.DATE, 5);
        probe.set(Calendar.YEAR, 9999);

        final DateFormat shortDate = DateFormat.getDateInstance(DateFormat.SHORT, locale);

        /*
         * The probe is midnight in this parser's time zone. Format it in that
         * same zone; otherwise a default zone far from it could render the
         * previous day and the "5" would not be found.
         */
        shortDate.setTimeZone(timeZone);

        final String formatted = shortDate.format(probe.getTime());

        final int monthIndex = formatted.indexOf("1"); //$NON-NLS-1$
        final int dateIndex = formatted.indexOf("5"); //$NON-NLS-1$

        return dateIndex > monthIndex;
    }

    /*
     * (non-Javadoc)
     *
     * @see java.lang.Object#toString()
     */
    @Override
    public String toString() {
        return locale.toString() + "@" + timeZone.getID() + " [" + expandedFormats.length + " formats]"; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
    }
}