org.graylog.collector.file.splitters.PatternChunkSplitter.java Source code

Introduction

Here is the source code for org.graylog.collector.file.splitters.PatternChunkSplitter.java
Source

/**
 * This file is part of Graylog.
 *
 * Graylog is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Graylog is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Graylog.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.graylog.collector.file.splitters;

import com.google.common.collect.AbstractIterator;
import io.netty.buffer.ByteBuf;

import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Content splitter that cuts buffered text into chunks at the positions where a
 * regular expression matches.
 *
 * <p>Every match of the pattern marks the <em>start</em> of a chunk. A chunk is only
 * emitted once the start of the following chunk has been seen, unless the caller
 * explicitly asks for the remaining data to be flushed.
 */
public class PatternChunkSplitter extends ContentSplitter {

    /** Pattern whose matches mark the beginning of each chunk. */
    private final Pattern pattern;

    /**
     * Creates a splitter for the given regular expression.
     *
     * @param pattern regular expression marking the start of a chunk; it is compiled
     *                with {@link Pattern#MULTILINE}
     * @throws java.util.regex.PatternSyntaxException if the expression is not valid
     */
    public PatternChunkSplitter(String pattern) {
        this.pattern = Pattern.compile(pattern, Pattern.MULTILINE);
    }

    /**
     * Lazily splits the readable content of {@code buffer} into chunks.
     *
     * <p>Each call to {@link Iterable#iterator()} decodes the bytes that are readable at
     * that moment and matches the pattern against the decoded text. While iterating, the
     * bytes belonging to every returned chunk are consumed from {@code buffer}, and
     * already-read bytes are discarded from it after each step.
     *
     * @param buffer               buffer holding the raw content; it is consumed while iterating
     * @param charset              charset used to decode the buffer and to compute how many
     *                             bytes a returned chunk occupies
     * @param includeRemainingData if {@code true}, content after the last chunk boundary is
     *                             returned as a final chunk; if {@code false}, it is left in
     *                             the buffer
     * @return an iterable over the chunks found in the buffer
     */
    @Override
    public Iterable<String> split(final ByteBuf buffer, final Charset charset, final boolean includeRemainingData) {
        return new Iterable<String>() {
            @Override
            public Iterator<String> iterator() {
                return new AbstractIterator<String>() {
                    // TODO Might throw an exception if multibyte charset is used and buffer is not complete.
                    //      Use CharsetDecoder to create a CharBuffer and match on that!
                    // NOTE: this decoding runs when the iterator is created, i.e. outside of the
                    //       try/catch in computeNext().
                    private final String inputAsString = buffer.toString(charset);
                    final Matcher matcher = pattern.matcher(inputAsString);
                    // Character index in inputAsString at which the next chunk starts.
                    private int positionInString = 0;

                    @Override
                    protected String computeNext() {
                        try {
                            if (!buffer.isReadable()) {
                                return endOfData();
                            }
                            if (matcher.find()) {
                                // Despite its name this is a character offset into inputAsString.
                                int firstByte = matcher.start();
                                if (firstByte == 0) {
                                    // The input begins with our pattern, so this match only marks the
                                    // start of the first chunk. Look for the next match to find its end.
                                    if (matcher.find()) {
                                        firstByte = matcher.start();
                                    } else {
                                        if (!includeRemainingData) {
                                            // couldn't find the end of the entry (i.e. there wasn't a next line yet)
                                            return endOfData();
                                        } else {
                                            // couldn't find another line, but we are asked to finish up, include everything that remains
                                            return getRemainingContent();
                                        }
                                    }
                                }
                                if (firstByte == 0) {
                                    // still haven't found a non-zero length string, keep waiting for more data.
                                    return endOfData();
                                }
                                final String substring = inputAsString.substring(positionInString, firstByte);
                                positionInString = firstByte;
                                // Consume exactly the bytes that make up the returned chunk.
                                buffer.skipBytes(substring.getBytes(charset).length); // TODO performance
                                return substring;
                            } else {
                                if (includeRemainingData) {
                                    return getRemainingContent();
                                }
                                return endOfData();
                            }
                        } catch (IllegalStateException e) {
                            // A decoding failure reported this way presumably means the buffer ended
                            // with an incomplete encoding of a character (the cause would then carry
                            // the CharacterCodingException). Stop here and wait for more data.
                            return endOfData();
                        } finally {
                            // Drop the bytes consumed so far so the buffer does not keep them around.
                            buffer.discardReadBytes();
                        }
                    }

                    /**
                     * Returns everything that is still readable in the buffer as one string and
                     * marks those bytes as consumed.
                     */
                    private String getRemainingContent() {
                        // Decode in place instead of copying via buffer.readBytes(int): that call
                        // allocates a new reference-counted buffer which would have to be released
                        // explicitly and was leaked before.
                        final String remaining = buffer.toString(charset);
                        buffer.skipBytes(buffer.readableBytes());
                        return remaining;
                    }
                };
            }
        };
    }
}