com.logsniffer.util.grok.Grok.java Source code

Java tutorial

Introduction

Here is the source code for com.logsniffer.util.grok.Grok.java

Source

/*******************************************************************************
 * logsniffer, open source tool for viewing, monitoring and analysing log data.
 * Copyright (c) 2015 Scaleborn UG, www.scaleborn.com
 *
 * logsniffer is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * logsniffer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *******************************************************************************/
package com.logsniffer.util.grok;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.annotation.JsonAutoDetect;
import com.fasterxml.jackson.annotation.JsonAutoDetect.Visibility;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.logsniffer.fields.FieldBaseTypes;
import com.logsniffer.fields.FieldsHost;
import com.logsniffer.reader.FormatException;

/**
 * GROK https://code.google.com/p/semicomplete/wiki/Grok pattern implementation.
 */
@JsonAutoDetect(creatorVisibility = Visibility.NONE, fieldVisibility = Visibility.NONE, getterVisibility = Visibility.NONE, isGetterVisibility = Visibility.NONE, setterVisibility = Visibility.NONE)
public final class Grok implements FieldsHost {
    /** Logger used for compile-time diagnostics of this class. */
    private static final Logger LOGGER = LoggerFactory.getLogger(Grok.class);

    /**
     * Recognizes a sub-grok reference inside a grok pattern. Accepted forms
     * are {@code %{NAME}}, {@code %{NAME:attribute}} and
     * {@code %{NAME:attribute:type}}; matching is case-insensitive.
     * <ul>
     * <li>group 1: name of the referenced grok</li>
     * <li>group 2: optional attribute name the match is exposed under</li>
     * <li>group 3: optional conversion type (int, long, float, double or
     * boolean); it can only be present when group 2 is present</li>
     * </ul>
     */
    protected static final Pattern PATTERN_SUBGROK = Pattern.compile(
            "%\\{([A-Z0-9_-]+)(?::([A-Z0-9_-]+)(?::(int|long|float|double|boolean))?)?\\}",
            Pattern.CASE_INSENSITIVE);

    /**
     * Pattern helper included from
     * https://github.com/tony19/named-regexp/blob/master
     * /src/main/java/com/google/code/regexp/Pattern.java.
     * 
     * @author mbok
     * 
     */
    private static final class PatternHelper {
        /**
         * Tells whether the character at the given position is escaped, either
         * by a preceding backslash or by being located inside a quoted section
         * (opened with \\Q and not yet closed with \\E).
         * 
         * @param s
         *            string to evaluate
         * @param pos
         *            the position of the character to evaluate
         * @return true if the character is escaped in one of the two ways;
         *         otherwise false
         */
        static private boolean isEscapedChar(final String s, final int pos) {
            // The backslash check comes first; the quote check only runs
            // when it fails.
            if (isSlashEscapedChar(s, pos)) {
                return true;
            }
            return isQuoteEscapedChar(s, pos);
        }

        /**
         * Tells whether the character at the given position is escaped by a
         * backslash.
         * 
         * @param s
         *            string to evaluate
         * @param pos
         *            the position of the character to evaluate
         * @return true if an odd number of backslashes directly precedes the
         *         position; otherwise false
         */
        static private boolean isSlashEscapedChar(final String s, int pos) {
            // Walk left over the run of backslashes directly in front of the
            // position, flipping the state once per backslash: an even run
            // consists of literal backslashes only, an odd run leaves one
            // backslash that escapes the character at the position.
            boolean escaped = false;
            for (int i = pos - 1; i >= 0 && s.charAt(i) == '\\'; i--) {
                escaped = !escaped;
            }
            return escaped;
        }

        /**
         * Tells whether the character at the given position lies inside a
         * quoted section, i.e. after an unescaped \\Q that has no \\E between
         * it and the position.
         * 
         * @param s
         *            string to evaluate
         * @param pos
         *            the position of the character to evaluate
         * @return true if the character is quote-escaped; otherwise false
         */
        static private boolean isQuoteEscapedChar(final String s, final int pos) {
            // Only the text in front of the position is relevant.
            final String prefix = s.substring(0, pos);

            // Search backwards for the closest open-quote whose backslash is
            // not itself escaped.
            int quoteStart = prefix.lastIndexOf("\\Q", pos - 1);
            while (quoteStart != -1 && isSlashEscapedChar(prefix, quoteStart)) {
                quoteStart = prefix.lastIndexOf("\\Q", quoteStart - 1);
            }
            if (quoteStart == -1) {
                return false;
            }

            // A close-quote needs no escape check: inside a quoted section a
            // backslash is a literal and cannot escape anything.
            return prefix.indexOf("\\E", quoteStart) == -1;
        }

        /**
         * Tells whether the character at the given position is located inside
         * a regex character class, i.e. after an unescaped '[' that has no
         * unescaped ']' between it and the position.
         * 
         * @param s
         *            string to evaluate
         * @param pos
         *            the position of the character to evaluate
         * @return true if the character is inside a character class; otherwise
         *         false
         */
        static private boolean isInsideCharClass(final String s, final int pos) {
            final String prefix = s.substring(0, pos);

            // Search backwards for the closest open-bracket that is not escaped.
            int open = prefix.lastIndexOf('[', pos - 1);
            while (open != -1 && isEscapedChar(prefix, open)) {
                open = prefix.lastIndexOf('[', open - 1);
            }
            if (open == -1) {
                return false;
            }

            // Between that bracket and the position, any unescaped
            // close-bracket means the class has already ended.
            final String tail = s.substring(open, pos);
            for (int close = tail.indexOf(']'); close != -1; close = tail.indexOf(']', close + 1)) {
                if (!isEscapedChar(tail, close)) {
                    return false;
                }
            }
            return true;
        }

        /**
         * Determines if the parenthesis at the specified position of a string
         * opens a non-capturing construct. That is the case when the
         * parenthesis is followed by "?" (flag specifiers such as (?s) or
         * (?m), plain non-capturing groups (?:pattern), lookaheads) unless it
         * starts a named group ({@code (?<name>}). The lookbehind constructs
         * {@code (?<=} and {@code (?<!} also begin with {@code ?<} but are
         * non-capturing.
         * <p>
         * The lookbehind constructs are recognized even when they are the very
         * last characters of {@code s}. This matters because the grok compiler
         * counts groups on partially assembled patterns, which may end right
         * after a lookbehind opener.
         *
         * @param s
         *            string to evaluate
         * @param pos
         *            the position of the parenthesis to evaluate
         * @return true if the parenthesis is non-capturing; otherwise false
         */
        static private boolean isNoncapturingParen(final String s, final int pos) {
            final int len = s.length();

            // At least two more characters are required after the
            // parenthesis, the first of them being '?'.
            if (pos < 0 || pos + 2 >= len || s.charAt(pos + 1) != '?') {
                return false;
            }

            // "(?" followed by anything but '<' is never capturing.
            if (s.charAt(pos + 2) != '<') {
                return true;
            }

            // "(?<" is a named (capturing) group unless it is a lookbehind.
            // startsWith does its own bounds handling, so a lookbehind opener
            // at the very end of the string is detected as well.
            return s.startsWith("(?<=", pos) || s.startsWith("(?<!", pos);
        }

        /**
         * Counts the capturing open-parentheses located to the left of a
         * string position. Parentheses that are escaped, that appear inside a
         * character class, or that open a non-capturing construct are not
         * counted.
         * 
         * @param s
         *            string to evaluate
         * @param pos
         *            ending position of string; characters to the left of this
         *            position are evaluated
         * @return number of capturing open parentheses
         */
        static private int countOpenParens(final String s, final int pos) {
            // Candidates are looked up in the prefix only; the classification
            // helpers still receive the complete string.
            final String head = s.substring(0, pos);

            int capturing = 0;
            for (int at = head.indexOf('('); at != -1; at = head.indexOf('(', at + 1)) {
                // A parenthesis inside a character class such as [0-9()a-f]
                // or an escaped one is just a literal.
                final boolean literal = isInsideCharClass(s, at) || isEscapedChar(s, at);
                if (!literal && !isNoncapturingParen(s, at)) {
                    capturing++;
                }
            }
            return capturing;
        }
    }

    /**
     * Converts matching text to the destined type. Each converter also
     * reports the field type its values map to.
     * 
     * @author mbok
     *
     * @param <T>
     *            destined type
     */
    protected static interface TypeConverter<T> {
        /**
         * Returns the converted value or null in case of errors.
         * 
         * @param input
         *            matching text to convert
         * @return converted value or null in case of errors
         */
        T convert(String input);

        /**
         * Returns the field type of the values produced by this converter.
         * 
         * @return the field base type corresponding to the converted values
         */
        FieldBaseTypes getTargetType();
    }

    // Converters keyed by the index of the capturing group whose text they
    // convert; created by compile().
    private Map<Integer, TypeConverter<Object>> typeConverters;
    // Attribute name -> index of the capturing group; compile() re-inserts
    // the entries ordered by group index.
    private final LinkedHashMap<String, Integer> groupNames = new LinkedHashMap<String, Integer>();
    // Standard regex the grok pattern was translated to; set by compile().
    private Pattern regexPattern;
    // Original grok pattern text; the only property exposed to Jackson.
    @JsonProperty
    private String grokPattern;
    // NOTE(review): never assigned in the code visible here (an eager
    // "new HashMap<Integer, GrokPredicate>()" initializer was disabled), so
    // this stays null unless it is populated elsewhere - confirm.
    private HashMap<Integer, GrokPredicate> groupPredicates;
    // Registry of the available converters, keyed by lower-case type name
    // ("int", "long", "float", "double", "boolean").
    private static Map<String, TypeConverter<? extends Object>> supportedTypeConverters = new HashMap<>();

    // Registers one converter per supported type name. Every converter
    // yields null for null/empty input; the numeric ones also yield null when
    // the trimmed text cannot be parsed.
    static {
        supportedTypeConverters.put("int", new TypeConverter<Integer>() {
            @Override
            public Integer convert(final String input) {
                if (StringUtils.isEmpty(input)) {
                    return null;
                }
                try {
                    return Integer.valueOf(input.trim());
                } catch (final NumberFormatException ignored) {
                    // Unparsable text deliberately maps to "no value".
                    return null;
                }
            }

            @Override
            public FieldBaseTypes getTargetType() {
                return FieldBaseTypes.INTEGER;
            }
        });

        supportedTypeConverters.put("long", new TypeConverter<Long>() {
            @Override
            public Long convert(final String input) {
                if (StringUtils.isEmpty(input)) {
                    return null;
                }
                try {
                    return Long.valueOf(input.trim());
                } catch (final NumberFormatException ignored) {
                    // Unparsable text deliberately maps to "no value".
                    return null;
                }
            }

            @Override
            public FieldBaseTypes getTargetType() {
                return FieldBaseTypes.LONG;
            }
        });

        supportedTypeConverters.put("float", new TypeConverter<Float>() {
            @Override
            public Float convert(final String input) {
                if (StringUtils.isEmpty(input)) {
                    return null;
                }
                try {
                    return Float.valueOf(input.trim());
                } catch (final NumberFormatException ignored) {
                    // Unparsable text deliberately maps to "no value".
                    return null;
                }
            }

            @Override
            public FieldBaseTypes getTargetType() {
                return FieldBaseTypes.FLOAT;
            }
        });

        supportedTypeConverters.put("double", new TypeConverter<Double>() {
            @Override
            public Double convert(final String input) {
                if (StringUtils.isEmpty(input)) {
                    return null;
                }
                try {
                    return Double.valueOf(input.trim());
                } catch (final NumberFormatException ignored) {
                    // Unparsable text deliberately maps to "no value".
                    return null;
                }
            }

            @Override
            public FieldBaseTypes getTargetType() {
                return FieldBaseTypes.DOUBLE;
            }
        });

        supportedTypeConverters.put("boolean", new TypeConverter<Boolean>() {
            @Override
            public Boolean convert(final String input) {
                if (StringUtils.isEmpty(input)) {
                    return null;
                }
                return Boolean.valueOf(input.trim());
            }

            @Override
            public FieldBaseTypes getTargetType() {
                return FieldBaseTypes.BOOLEAN;
            }
        });
    }

    /**
     * Prohibit outside instantiation.
     */
    private Grok() {
        super();
    }

    /**
     * Creates a matcher that applies the compiled regular expression of this
     * grok to the given input.
     * 
     * @param input
     *            the character sequence to match
     * @return a new {@link GrokMatcher} built from this grok and the regex
     *         matcher for the input
     */
    public GrokMatcher matcher(final CharSequence input) {
        final Matcher regexMatcher = regexPattern.matcher(input);
        return new GrokMatcher(this, regexMatcher);
    }

    /**
     * Returns the mapping from attribute name to the index of the capturing
     * group holding its value. The internal map itself is returned, not a
     * copy.
     * 
     * @return the attribute name to group index mapping
     */
    public LinkedHashMap<String, Integer> getGroupNames() {
        return groupNames;
    }

    /**
     * Returns the predicates keyed by an integer, presumably the capturing
     * group index.
     * <p>
     * NOTE(review): the backing field is not assigned anywhere in the code
     * visible here, so this may return null unless it is populated elsewhere
     * - confirm before relying on a non-null result.
     * 
     * @return the groupPredicates, possibly null
     */
    public HashMap<Integer, GrokPredicate> getGroupPredicates() {
        return groupPredicates;
    }

    /**
     * Returns the grok pattern text this instance was compiled from.
     * 
     * @return the grokPattern
     */
    public String getGrokPattern() {
        return grokPattern;
    }

    /**
     * Returns the converters registered for this grok, keyed by the index of
     * the capturing group they apply to. The internal map itself is returned,
     * not a copy.
     * 
     * @return the type converters by group index
     */
    protected Map<Integer, TypeConverter<Object>> getTypeConverters() {
        return typeConverters;
    }

    /**
     * Compiles a grok pattern and generates an internal standard pattern
     * representation for it.
     * <p>
     * Every {@code %{NAME}}, {@code %{NAME:attribute}} or
     * {@code %{NAME:attribute:type}} reference is replaced by the regular
     * expression of the referenced grok from the registry. A reference with
     * an attribute is wrapped into a capturing group registered under the
     * attribute name; attributes and type converters of the referenced grok
     * are carried over with their group indexes shifted accordingly. Named
     * groups written directly in the pattern are registered as well.
     * 
     * @param registry
     *            Groks registry for predefined types
     * @param pattern
     *            the grok pattern
     * @param flags
     *            flags corresponding to {@link Pattern#flags()}
     * @return a compiled grok pattern
     * @throws GrokException
     *             if the pattern references a grok name that is not present
     *             in the registry
     */
    @SuppressWarnings("unchecked")
    public static Grok compile(final GroksRegistry registry, final String pattern, final int flags)
            throws GrokException {
        final Grok g = new Grok();
        g.grokPattern = pattern;
        g.typeConverters = new HashMap<>();

        final StringBuilder compiledPattern = new StringBuilder();
        final Matcher m = PATTERN_SUBGROK.matcher(pattern);
        int lastPos = 0;
        while (m.find()) {
            // Copy the plain regex text located between the previous
            // reference and this one.
            compiledPattern.append(pattern, lastPos, m.start());
            lastPos = m.end();

            // Number of capturing groups opened so far; the groups of the
            // referenced grok are numbered relative to this offset.
            int groupsCount = PatternHelper.countOpenParens(compiledPattern.toString(), compiledPattern.length());

            final String subGrokName = m.group(1);
            final String subGrokAttr = m.group(2);
            String subGrokType = m.group(3);
            final Grok subGrok = registry.getGroks().get(subGrokName);
            if (subGrok == null) {
                throw new GrokException("No predefined Grok pattern for name '" + subGrokName
                        + "' found used in pattern: " + pattern);
            }

            if (subGrokAttr != null) {
                // Wrap the referenced expression into its own capturing group.
                compiledPattern.append("(");
                groupsCount++;
                g.groupNames.put(subGrokAttr, groupsCount);
            }
            if (subGrokType != null) {
                // Type names are matched case-insensitively. Lower-casing
                // must not depend on the default locale, otherwise e.g. "INT"
                // would not resolve under a Turkish locale.
                subGrokType = subGrokType.toLowerCase(java.util.Locale.ROOT);
                final TypeConverter<? extends Object> converter = supportedTypeConverters.get(subGrokType);
                if (converter != null) {
                    g.typeConverters.put(groupsCount, (TypeConverter<Object>) converter);
                } else {
                    LOGGER.warn("Conversion type {} not support in grok pattern: {}", subGrokType, m.group(0));
                }
            }
            compiledPattern.append(subGrok.regexPattern.pattern());
            if (subGrokAttr != null) {
                compiledPattern.append(")");
            }

            // Carry over attributes and converters of the referenced grok,
            // shifted by the number of groups preceding it.
            for (final Entry<String, Integer> subGroup : subGrok.groupNames.entrySet()) {
                final int subGrokGroup = subGroup.getValue();
                g.groupNames.put(subGroup.getKey(), groupsCount + subGrokGroup);
                final TypeConverter<Object> subConverter = subGrok.typeConverters.get(subGrokGroup);
                if (subConverter != null) {
                    g.typeConverters.put(groupsCount + subGrokGroup, subConverter);
                }
            }
        }
        // Remainder of the pattern after the last reference.
        compiledPattern.append(pattern, lastPos, pattern.length());

        // The named-regexp pattern resolves named groups and provides the
        // underlying standard regex pattern.
        final com.google.code.regexp.Pattern namedPattern = com.google.code.regexp.Pattern
                .compile(compiledPattern.toString(), flags);
        g.regexPattern = namedPattern.pattern();
        for (final String name : namedPattern.groupInfo().keySet()) {
            // The reported group index is incremented by one before it is
            // stored with the other group numbers.
            g.groupNames.put(name, namedPattern.groupInfo().get(name).get(0).groupIndex() + 1);
        }

        // Order groups by occurrence
        final List<Entry<String, Integer>> groups = new ArrayList<>(g.groupNames.entrySet());
        Collections.sort(groups, new Comparator<Entry<String, Integer>>() {
            @Override
            public int compare(final Entry<String, Integer> o1, final Entry<String, Integer> o2) {
                return o1.getValue().compareTo(o2.getValue());
            }
        });
        g.groupNames.clear();
        for (final Entry<String, Integer> entry : groups) {
            g.groupNames.put(entry.getKey(), entry.getValue());
        }
        LOGGER.debug("Compiled grok: {}", g);
        return g;
    }

    /**
     * Compiles a grok pattern and generates an internal standard pattern
     * representation for it. Delegates to
     * {@link #compile(GroksRegistry, String, int)} with the flags value 0.
     * 
     * @param registry
     *            Groks registry for predefined types
     * @param pattern
     *            the grok pattern
     * @return a compiled grok pattern
     * @throws GrokException
     *             propagated from the delegate, e.g. when the pattern
     *             references a grok name missing in the registry
     */
    public static Grok compile(final GroksRegistry registry, final String pattern) throws GrokException {
        return compile(registry, pattern, 0);
    }

    /**
     * Renders the grok pattern, the compiled regex pattern and the group
     * names for diagnostic output.
     */
    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder("Grok [grokPattern=");
        text.append(grokPattern);
        text.append(", regexPattern=").append(regexPattern);
        text.append(", groupNames=").append(groupNames);
        return text.append("]").toString();
    }

    /**
     * Lists the attributes of this grok together with their field types, in
     * the iteration order of the group names. An attribute whose group has a
     * type converter gets the converter's target type; all other attributes
     * are reported as strings.
     * 
     * @return attribute name to field type mapping
     */
    @Override
    public LinkedHashMap<String, FieldBaseTypes> getFieldTypes() throws FormatException {
        final Map<Integer, TypeConverter<Object>> converters = getTypeConverters();
        final LinkedHashMap<String, FieldBaseTypes> result = new LinkedHashMap<>();
        for (final Entry<String, Integer> group : groupNames.entrySet()) {
            final int index = group.getValue();
            final TypeConverter<Object> converter = converters.get(index);
            final FieldBaseTypes type = converter == null ? FieldBaseTypes.STRING : converter.getTargetType();
            result.put(group.getKey(), type);
        }
        return result;
    }
}