org.apache.flume.ext.source.SyslogParser.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.flume.ext.source.SyslogParser.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */

package org.apache.flume.ext.source;

import java.nio.charset.Charset;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.regex.Pattern;

import org.apache.flume.Event;
import org.apache.flume.event.EventBuilder;
import org.apache.flume.source.SyslogUtils;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.collect.Maps;

public class SyslogParser {

    private static final Logger logger = LoggerFactory.getLogger(SyslogParser.class);

    private static final int TS_CACHE_MAX = 1000; // timestamp cache size limit
    private static final Pattern TWO_SPACES = Pattern.compile("  ");
    private static final DateTimeFormatter rfc3164Format = DateTimeFormat.forPattern("MMM d HH:mm:ss")
            .withZoneUTC();

    private static final String timePat = "yyyy-MM-dd'T'HH:mm:ss";
    private static final int RFC3164_LEN = 15;
    private static final int RFC5424_PREFIX_LEN = 19;

    private final DateTimeFormatter timeParser;

    private Cache<String, Long> timestampCache;

    public SyslogParser() {
        timeParser = DateTimeFormat.forPattern(timePat).withZoneUTC();
        timestampCache = CacheBuilder.newBuilder().maximumSize(TS_CACHE_MAX).build(new CacheLoader<String, Long>() {

            @Override
            public Long load(String key) throws Exception {
                return timeParser.parseMillis(key);
            }
        });
    }

    /**
     * Parses a Flume Event out of a syslog message string.
     * @param msg Syslog message, not including the newline character
     * @return Parsed Flume Event
     * @throws IllegalArgumentException if unable to successfully parse message
     */
    public Event parseMessage(String msg, Charset charset) {
        Map<String, String> headers = Maps.newHashMap();

        int msgLen = msg.length();

        int curPos = 0;

        Preconditions.checkArgument(msg.charAt(curPos) == '<',
                "Bad format: invalid priority: cannot find open bracket '<' (%s)", msg);

        int endBracketPos = msg.indexOf('>');
        Preconditions.checkArgument(endBracketPos > 0 && endBracketPos <= 6,
                "Bad format: invalid priority: cannot find end bracket '>' (%s)", msg);

        String priority = msg.substring(1, endBracketPos);
        int pri = Integer.parseInt(priority);
        int facility = pri / 8;
        int severity = pri % 8;

        // put fac / sev into header
        headers.put(SyslogUtils.SYSLOG_FACILITY, String.valueOf(facility));
        headers.put(SyslogUtils.SYSLOG_SEVERITY, String.valueOf(severity));

        Preconditions.checkArgument(msgLen > endBracketPos + 1, "Bad format: no data except priority (%s)", msg);

        // update parsing position
        curPos = endBracketPos + 1;

        // ignore version string
        if (msgLen > curPos + 2 && "1 ".equals(msg.substring(curPos, curPos + 2))) {
            curPos += 2;
        }

        // now parse timestamp (handle different varieties)
        long ts;
        char dateStartChar = msg.charAt(curPos);
        boolean rfc5424 = false;
        try {

            // no timestamp specified; use relay current time
            if (dateStartChar == '-') {
                ts = System.currentTimeMillis();
                if (msgLen <= curPos + 2) {
                    throw new IllegalArgumentException("bad syslog format (missing hostname)");
                }
                curPos += 2; // assume we skip past a space to get to the hostname

                // rfc3164 imestamp
            } else if (dateStartChar >= 'A' && dateStartChar <= 'Z') {
                if (msgLen <= curPos + RFC3164_LEN) {
                    throw new IllegalArgumentException("bad timestamp format");
                }
                ts = parseRfc3164Time(msg.substring(curPos, curPos + RFC3164_LEN));
                curPos += RFC3164_LEN + 1;
                // rfc 5424 timestamp
            } else {
                int nextSpace = msg.indexOf(' ', curPos);
                if (nextSpace == -1) {
                    throw new IllegalArgumentException("bad timestamp format");
                }
                ts = parseRfc5424Date(msg.substring(curPos, nextSpace));
                curPos = nextSpace + 1;
                rfc5424 = true;
            }

        } catch (IllegalArgumentException ex) {
            throw new IllegalArgumentException("Unable to parse message: " + msg, ex);
        }

        headers.put("timestamp", String.valueOf(ts));

        // parse out hostname
        int nextSpace = msg.indexOf(' ', curPos);
        if (nextSpace == -1) {
            throw new IllegalArgumentException("bad syslog format (missing hostname)");
        }
        // copy the host string to avoid holding the message string in memory
        // if using a memory-based queue
        String hostname = new String(msg.substring(curPos, nextSpace));
        headers.put("host", hostname);

        // Handle RFC-5424 fields
        if (rfc5424) {
            curPos = nextSpace + 1;

            nextSpace = msg.indexOf(' ', curPos);
            if (nextSpace != -1) {
                String app_name = new String(msg.substring(curPos, nextSpace));
                headers.put("app_name", app_name);
            }

            curPos = nextSpace + 1;
            nextSpace = msg.indexOf(' ', curPos);
            if (nextSpace != -1) {
                String procid = new String(msg.substring(curPos, nextSpace));
                headers.put("procid", procid);
            }

            curPos = nextSpace + 1;
            nextSpace = msg.indexOf(' ', curPos);
            if (nextSpace != -1) {
                String msgid = new String(msg.substring(curPos, nextSpace));
                headers.put("msgid", msgid);
            }
            curPos = nextSpace + 1;

            // Structured Data
            if (msg.charAt(curPos) != '-') {
                while (curPos < msg.length() && msg.charAt(curPos) == '[') {
                    curPos += 1; // Skip the '['
                    nextSpace = msg.indexOf(' ', curPos);
                    if (nextSpace == -1)
                        throw new IllegalArgumentException("bad syslog format (malformed structured data)");

                    String sdid = msg.substring(curPos, nextSpace);
                    int endPos = msg.indexOf(']', curPos);
                    if (endPos == -1)
                        throw new IllegalArgumentException("bad syslog format (malformed structured data)");

                    curPos = nextSpace + 1;

                    while (curPos < endPos) {
                        nextSpace = msg.indexOf(' ', curPos);
                        if (nextSpace > endPos || nextSpace == -1)
                            nextSpace = endPos;
                        int equals = msg.indexOf('=', curPos);
                        if (equals == -1)
                            throw new IllegalArgumentException("bad syslog format (malformed structured data)");
                        String field = new String(msg.substring(curPos, equals));
                        String value = new String(msg.substring(equals + 2, nextSpace - 1));
                        headers.put(sdid + "." + field, value);
                        curPos = nextSpace + 1;
                    }
                }

                nextSpace += 1; // Skip the last space
            } else {
                nextSpace = msg.indexOf(' ', curPos);
            }
        }

        // EventBuilder will do a copy of its own, so no defensive copy of the body
        String data = "";
        if (msgLen > nextSpace + 1) {
            curPos = nextSpace + 1;
            data = msg.substring(curPos);
        }

        Event event = EventBuilder.withBody(data, charset, headers);

        return event;
    }

    /**
     * Parse date in RFC 5424 format. Uses an LRU cache to speed up parsing for
     * multiple messages that occur in the same second.
     * @param msg
     * @return Typical (for Java) milliseconds since UNIX epoch
     */
    protected long parseRfc5424Date(String msg) {

        Long ts = null;
        int curPos = 0;

        int msgLen = msg.length();
        Preconditions.checkArgument(msgLen > RFC5424_PREFIX_LEN, "Bad format: Not a valid RFC5424 timestamp: %s",
                msg);
        String timestampPrefix = msg.substring(curPos, RFC5424_PREFIX_LEN);

        try {
            ts = timestampCache.get(timestampPrefix);
        } catch (ExecutionException ex) {
            throw new IllegalArgumentException("bad timestamp format", ex);
        }

        curPos += RFC5424_PREFIX_LEN;

        Preconditions.checkArgument(ts != null, "Parsing error: timestamp is null");

        // look for the optional fractional seconds
        if (msg.charAt(curPos) == '.') {
            // figure out how many numeric digits
            boolean foundEnd = false;
            int endMillisPos = curPos + 1;

            if (msgLen <= endMillisPos) {
                throw new IllegalArgumentException("bad timestamp format (no TZ)");
            }

            // FIXME: TODO: ensure we handle all bad formatting cases
            while (!foundEnd) {
                char curDigit = msg.charAt(endMillisPos);
                if (curDigit >= '0' && curDigit <= '9') {
                    endMillisPos++;
                } else {
                    foundEnd = true;
                }
            }

            // if they had a valid fractional second, append it rounded to millis
            if (endMillisPos - (curPos + 1) > 0) {
                float frac = Float.parseFloat(msg.substring(curPos, endMillisPos));
                long milliseconds = (long) (frac * 1000f);
                ts += milliseconds;
            } else {
                throw new IllegalArgumentException("Bad format: Invalid timestamp (fractional portion): " + msg);
            }

            curPos = endMillisPos;
        }

        // look for timezone
        char tzFirst = msg.charAt(curPos);

        // UTC
        if (tzFirst == 'Z') {
            // no-op
        } else if (tzFirst == '+' || tzFirst == '-') {

            Preconditions.checkArgument(msgLen > curPos + 5, "Bad format: Invalid timezone (%s)", msg);

            int polarity;
            if (tzFirst == '+') {
                polarity = +1;
            } else {
                polarity = -1;
            }

            char[] h = new char[5];
            for (int i = 0; i < 5; i++) {
                h[i] = msg.charAt(curPos + 1 + i);
            }

            if (h[0] >= '0' && h[0] <= '9' && h[1] >= '0' && h[1] <= '9' && h[2] == ':' && h[3] >= '0'
                    && h[3] <= '9' && h[4] >= '0' && h[4] <= '9') {
                int hourOffset = Integer.parseInt(msg.substring(curPos + 1, curPos + 3));
                int minOffset = Integer.parseInt(msg.substring(curPos + 4, curPos + 6));
                ts -= polarity * ((hourOffset * 60) + minOffset) * 60000;
            } else {
                throw new IllegalArgumentException("Bad format: Invalid timezone: " + msg);
            }

        }

        return ts;
    }

    /**
     * Parse the RFC3164 date format. This is trickier than it sounds because this
     * format does not specify a year so we get weird edge cases at year
     * boundaries. This implementation tries to "do what I mean".
     * @param ts RFC3164-compatible timestamp to be parsed
     * @return Typical (for Java) milliseconds since the UNIX epoch
     */
    protected long parseRfc3164Time(String ts) {
        DateTime now = DateTime.now();
        int year = now.getYear();

        ts = TWO_SPACES.matcher(ts).replaceFirst(" ");

        DateTime date;
        try {
            date = rfc3164Format.parseDateTime(ts);
        } catch (IllegalArgumentException e) {
            logger.debug("rfc3164 date parse failed on (" + ts + "): invalid format", e);
            return 0;
        }

        // try to deal with boundary cases, i.e. new year's eve.
        // rfc3164 dates are really dumb.
        // NB: cannot handle replaying of old logs or going back to the future
        if (date != null) {
            DateTime fixed = date.withYear(year);

            // flume clock is ahead or there is some latency, and the year rolled
            if (fixed.isAfter(now) && fixed.minusMonths(1).isAfter(now)) {
                fixed = date.withYear(year - 1);
                // flume clock is behind and the year rolled
            } else if (fixed.isBefore(now) && fixed.plusMonths(1).isBefore(now)) {
                fixed = date.withYear(year + 1);
            }
            date = fixed;
        }

        if (date == null) {
            return 0;
        }

        return date.getMillis();
    }

}