Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * To change this template, choose Tools | Templates * and open the template in the editor. */ package org.apache.flume.ext.source; import java.nio.charset.Charset; import java.util.Map; import java.util.concurrent.ExecutionException; import java.util.regex.Pattern; import org.apache.flume.Event; import org.apache.flume.event.EventBuilder; import org.apache.flume.source.SyslogUtils; import org.joda.time.DateTime; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.base.Preconditions; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import com.google.common.cache.CacheLoader; import com.google.common.collect.Maps; public class SyslogParser { private static final Logger logger = LoggerFactory.getLogger(SyslogParser.class); private static final int TS_CACHE_MAX = 1000; // timestamp cache size limit private static final Pattern TWO_SPACES = Pattern.compile(" "); private static final DateTimeFormatter rfc3164Format = DateTimeFormat.forPattern("MMM d HH:mm:ss") .withZoneUTC(); private static final String timePat = "yyyy-MM-dd'T'HH:mm:ss"; private static final int RFC3164_LEN = 15; private static final int RFC5424_PREFIX_LEN = 19; private final DateTimeFormatter timeParser; private Cache<String, Long> timestampCache; public SyslogParser() { timeParser = DateTimeFormat.forPattern(timePat).withZoneUTC(); timestampCache = CacheBuilder.newBuilder().maximumSize(TS_CACHE_MAX).build(new CacheLoader<String, Long>() { @Override public Long load(String key) throws Exception { return timeParser.parseMillis(key); } }); } /** * Parses a Flume Event out of a syslog message string. * @param msg Syslog message, not including the newline character * @return Parsed Flume Event * @throws IllegalArgumentException if unable to successfully parse message */ public Event parseMessage(String msg, Charset charset) { Map<String, String> headers = Maps.newHashMap(); int msgLen = msg.length(); int curPos = 0; Preconditions.checkArgument(msg.charAt(curPos) == '<', "Bad format: invalid priority: cannot find open bracket '<' (%s)", msg); int endBracketPos = msg.indexOf('>'); Preconditions.checkArgument(endBracketPos > 0 && endBracketPos <= 6, "Bad format: invalid priority: cannot find end bracket '>' (%s)", msg); String priority = msg.substring(1, endBracketPos); int pri = Integer.parseInt(priority); int facility = pri / 8; int severity = pri % 8; // put fac / sev into header headers.put(SyslogUtils.SYSLOG_FACILITY, String.valueOf(facility)); headers.put(SyslogUtils.SYSLOG_SEVERITY, String.valueOf(severity)); Preconditions.checkArgument(msgLen > endBracketPos + 1, "Bad format: no data except priority (%s)", msg); // update parsing position curPos = endBracketPos + 1; // ignore version string if (msgLen > curPos + 2 && "1 ".equals(msg.substring(curPos, curPos + 2))) { curPos += 2; } // now parse timestamp (handle different varieties) long ts; char dateStartChar = msg.charAt(curPos); boolean rfc5424 = false; try { // no timestamp specified; use relay current time if (dateStartChar == '-') { ts = System.currentTimeMillis(); if (msgLen <= curPos + 2) { throw new IllegalArgumentException("bad syslog format (missing hostname)"); } curPos += 2; // assume we skip past a space to get to the hostname // rfc3164 imestamp } else if (dateStartChar >= 'A' && dateStartChar <= 'Z') { if (msgLen <= curPos + RFC3164_LEN) { throw new IllegalArgumentException("bad timestamp format"); } ts = parseRfc3164Time(msg.substring(curPos, curPos + RFC3164_LEN)); curPos += RFC3164_LEN + 1; // rfc 5424 timestamp } else { int nextSpace = msg.indexOf(' ', curPos); if (nextSpace == -1) { throw new IllegalArgumentException("bad timestamp format"); } ts = parseRfc5424Date(msg.substring(curPos, nextSpace)); curPos = nextSpace + 1; rfc5424 = true; } } catch (IllegalArgumentException ex) { throw new IllegalArgumentException("Unable to parse message: " + msg, ex); } headers.put("timestamp", String.valueOf(ts)); // parse out hostname int nextSpace = msg.indexOf(' ', curPos); if (nextSpace == -1) { throw new IllegalArgumentException("bad syslog format (missing hostname)"); } // copy the host string to avoid holding the message string in memory // if using a memory-based queue String hostname = new String(msg.substring(curPos, nextSpace)); headers.put("host", hostname); // Handle RFC-5424 fields if (rfc5424) { curPos = nextSpace + 1; nextSpace = msg.indexOf(' ', curPos); if (nextSpace != -1) { String app_name = new String(msg.substring(curPos, nextSpace)); headers.put("app_name", app_name); } curPos = nextSpace + 1; nextSpace = msg.indexOf(' ', curPos); if (nextSpace != -1) { String procid = new String(msg.substring(curPos, nextSpace)); headers.put("procid", procid); } curPos = nextSpace + 1; nextSpace = msg.indexOf(' ', curPos); if (nextSpace != -1) { String msgid = new String(msg.substring(curPos, nextSpace)); headers.put("msgid", msgid); } curPos = nextSpace + 1; // Structured Data if (msg.charAt(curPos) != '-') { while (curPos < msg.length() && msg.charAt(curPos) == '[') { curPos += 1; // Skip the '[' nextSpace = msg.indexOf(' ', curPos); if (nextSpace == -1) throw new IllegalArgumentException("bad syslog format (malformed structured data)"); String sdid = msg.substring(curPos, nextSpace); int endPos = msg.indexOf(']', curPos); if (endPos == -1) throw new IllegalArgumentException("bad syslog format (malformed structured data)"); curPos = nextSpace + 1; while (curPos < endPos) { nextSpace = msg.indexOf(' ', curPos); if (nextSpace > endPos || nextSpace == -1) nextSpace = endPos; int equals = msg.indexOf('=', curPos); if (equals == -1) throw new IllegalArgumentException("bad syslog format (malformed structured data)"); String field = new String(msg.substring(curPos, equals)); String value = new String(msg.substring(equals + 2, nextSpace - 1)); headers.put(sdid + "." + field, value); curPos = nextSpace + 1; } } nextSpace += 1; // Skip the last space } else { nextSpace = msg.indexOf(' ', curPos); } } // EventBuilder will do a copy of its own, so no defensive copy of the body String data = ""; if (msgLen > nextSpace + 1) { curPos = nextSpace + 1; data = msg.substring(curPos); } Event event = EventBuilder.withBody(data, charset, headers); return event; } /** * Parse date in RFC 5424 format. Uses an LRU cache to speed up parsing for * multiple messages that occur in the same second. * @param msg * @return Typical (for Java) milliseconds since UNIX epoch */ protected long parseRfc5424Date(String msg) { Long ts = null; int curPos = 0; int msgLen = msg.length(); Preconditions.checkArgument(msgLen > RFC5424_PREFIX_LEN, "Bad format: Not a valid RFC5424 timestamp: %s", msg); String timestampPrefix = msg.substring(curPos, RFC5424_PREFIX_LEN); try { ts = timestampCache.get(timestampPrefix); } catch (ExecutionException ex) { throw new IllegalArgumentException("bad timestamp format", ex); } curPos += RFC5424_PREFIX_LEN; Preconditions.checkArgument(ts != null, "Parsing error: timestamp is null"); // look for the optional fractional seconds if (msg.charAt(curPos) == '.') { // figure out how many numeric digits boolean foundEnd = false; int endMillisPos = curPos + 1; if (msgLen <= endMillisPos) { throw new IllegalArgumentException("bad timestamp format (no TZ)"); } // FIXME: TODO: ensure we handle all bad formatting cases while (!foundEnd) { char curDigit = msg.charAt(endMillisPos); if (curDigit >= '0' && curDigit <= '9') { endMillisPos++; } else { foundEnd = true; } } // if they had a valid fractional second, append it rounded to millis if (endMillisPos - (curPos + 1) > 0) { float frac = Float.parseFloat(msg.substring(curPos, endMillisPos)); long milliseconds = (long) (frac * 1000f); ts += milliseconds; } else { throw new IllegalArgumentException("Bad format: Invalid timestamp (fractional portion): " + msg); } curPos = endMillisPos; } // look for timezone char tzFirst = msg.charAt(curPos); // UTC if (tzFirst == 'Z') { // no-op } else if (tzFirst == '+' || tzFirst == '-') { Preconditions.checkArgument(msgLen > curPos + 5, "Bad format: Invalid timezone (%s)", msg); int polarity; if (tzFirst == '+') { polarity = +1; } else { polarity = -1; } char[] h = new char[5]; for (int i = 0; i < 5; i++) { h[i] = msg.charAt(curPos + 1 + i); } if (h[0] >= '0' && h[0] <= '9' && h[1] >= '0' && h[1] <= '9' && h[2] == ':' && h[3] >= '0' && h[3] <= '9' && h[4] >= '0' && h[4] <= '9') { int hourOffset = Integer.parseInt(msg.substring(curPos + 1, curPos + 3)); int minOffset = Integer.parseInt(msg.substring(curPos + 4, curPos + 6)); ts -= polarity * ((hourOffset * 60) + minOffset) * 60000; } else { throw new IllegalArgumentException("Bad format: Invalid timezone: " + msg); } } return ts; } /** * Parse the RFC3164 date format. This is trickier than it sounds because this * format does not specify a year so we get weird edge cases at year * boundaries. This implementation tries to "do what I mean". * @param ts RFC3164-compatible timestamp to be parsed * @return Typical (for Java) milliseconds since the UNIX epoch */ protected long parseRfc3164Time(String ts) { DateTime now = DateTime.now(); int year = now.getYear(); ts = TWO_SPACES.matcher(ts).replaceFirst(" "); DateTime date; try { date = rfc3164Format.parseDateTime(ts); } catch (IllegalArgumentException e) { logger.debug("rfc3164 date parse failed on (" + ts + "): invalid format", e); return 0; } // try to deal with boundary cases, i.e. new year's eve. // rfc3164 dates are really dumb. // NB: cannot handle replaying of old logs or going back to the future if (date != null) { DateTime fixed = date.withYear(year); // flume clock is ahead or there is some latency, and the year rolled if (fixed.isAfter(now) && fixed.minusMonths(1).isAfter(now)) { fixed = date.withYear(year - 1); // flume clock is behind and the year rolled } else if (fixed.isBefore(now) && fixed.plusMonths(1).isBefore(now)) { fixed = date.withYear(year + 1); } date = fixed; } if (date == null) { return 0; } return date.getMillis(); } }